src/patches/suse-2.6.27.25/patches.xen/xen3-patch-2.6.25
1 From: kernel.org
2 Subject: 2.6.25
3 Patch-mainline: 2.6.25
4
5 Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
6
7 Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches.py
8
9 --- sle11-2009-05-14.orig/arch/x86/Kconfig 2009-02-16 16:18:36.000000000 +0100
10 +++ sle11-2009-05-14/arch/x86/Kconfig 2009-03-16 16:33:40.000000000 +0100
11 @@ -27,7 +27,7 @@ config X86
12 select HAVE_KRETPROBES
13 select HAVE_DYNAMIC_FTRACE
14 select HAVE_FTRACE
15 - select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
16 + select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) && !XEN
17 select HAVE_ARCH_KGDB if !X86_VOYAGER
18 select HAVE_ARCH_TRACEHOOK
19 select HAVE_GENERIC_DMA_COHERENT if X86_32
20 @@ -211,14 +211,12 @@ config X86_TRAMPOLINE
21 default y
22
23 config X86_NO_TSS
24 - bool
25 + def_bool y
26 depends on XEN
27 - default y
28
29 config X86_NO_IDT
30 - bool
31 + def_bool y
32 depends on XEN
33 - default y
34
35 config KTIME_SCALAR
36 def_bool X86_32
37 @@ -728,9 +726,8 @@ config X86_VISWS_APIC
38 depends on X86_32 && X86_VISWS
39
40 config X86_XEN_GENAPIC
41 - bool
42 + def_bool y
43 depends on X86_64_XEN
44 - default y
45
46 config X86_MCE
47 bool "Machine Check Exception"
48 @@ -1117,7 +1114,7 @@ config ARCH_DISCONTIGMEM_DEFAULT
49
50 config ARCH_SPARSEMEM_DEFAULT
51 def_bool y
52 - depends on X86_64
53 + depends on X86_64 && !X86_64_XEN
54
55 config ARCH_SPARSEMEM_ENABLE
56 def_bool y
57 @@ -1747,10 +1744,10 @@ config PCI_MMCONFIG
58 depends on X86_64 && PCI && ACPI
59
60 config XEN_PCIDEV_FRONTEND
61 - bool "Xen PCI Frontend" if X86_64
62 + def_bool y
63 + prompt "Xen PCI Frontend" if X86_64
64 depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64)
65 select HOTPLUG
66 - default y
67 help
68 The PCI device frontend driver allows the kernel to import arbitrary
69 PCI devices from a PCI backend to support PCI driver domains.
70 @@ -1758,7 +1755,6 @@ config XEN_PCIDEV_FRONTEND
71 config XEN_PCIDEV_FE_DEBUG
72 bool "Xen PCI Frontend Debugging"
73 depends on XEN_PCIDEV_FRONTEND
74 - default n
75 help
76 Enables some debug statements within the PCI Frontend.
77
78 --- sle11-2009-05-14.orig/arch/x86/Kconfig.debug 2009-02-02 09:40:56.000000000 +0100
79 +++ sle11-2009-05-14/arch/x86/Kconfig.debug 2009-03-16 16:33:40.000000000 +0100
80 @@ -279,6 +279,7 @@ config DEBUG_BOOT_PARAMS
81 bool "Debug boot parameters"
82 depends on DEBUG_KERNEL
83 depends on DEBUG_FS
84 + depends on !XEN
85 help
86 This option will cause struct boot_params to be exported via debugfs.
87
88 --- sle11-2009-05-14.orig/arch/x86/ia32/ia32entry-xen.S 2009-02-16 16:18:36.000000000 +0100
89 +++ sle11-2009-05-14/arch/x86/ia32/ia32entry-xen.S 2009-03-16 16:33:40.000000000 +0100
90 @@ -12,7 +12,6 @@
91 #include <asm/ia32_unistd.h>
92 #include <asm/thread_info.h>
93 #include <asm/segment.h>
94 -#include <asm/vsyscall32.h>
95 #include <asm/irqflags.h>
96 #include <linux/linkage.h>
97
98 @@ -99,10 +98,11 @@ ENTRY(ia32_sysenter_target)
99 CFI_RESTORE rcx
100 movl %ebp,%ebp /* zero extension */
101 movl %eax,%eax
102 + movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
103 movl $__USER32_DS,40(%rsp)
104 movq %rbp,32(%rsp)
105 movl $__USER32_CS,16(%rsp)
106 - movl $VSYSCALL32_SYSEXIT,8(%rsp)
107 + movq %r10,8(%rsp)
108 movq %rax,(%rsp)
109 cld
110 SAVE_ARGS 0,0,1
111 @@ -582,8 +582,8 @@ ia32_sys_call_table:
112 .quad compat_sys_futex /* 240 */
113 .quad compat_sys_sched_setaffinity
114 .quad compat_sys_sched_getaffinity
115 - .quad sys32_set_thread_area
116 - .quad sys32_get_thread_area
117 + .quad sys_set_thread_area
118 + .quad sys_get_thread_area
119 .quad compat_sys_io_setup /* 245 */
120 .quad sys_io_destroy
121 .quad compat_sys_io_getevents
122 @@ -661,7 +661,9 @@ ia32_sys_call_table:
123 .quad sys_epoll_pwait
124 .quad compat_sys_utimensat /* 320 */
125 .quad compat_sys_signalfd
126 - .quad compat_sys_timerfd
127 + .quad sys_timerfd_create
128 .quad sys_eventfd
129 .quad sys32_fallocate
130 + .quad compat_sys_timerfd_settime /* 325 */
131 + .quad compat_sys_timerfd_gettime
132 ia32_syscall_end:
133 --- sle11-2009-05-14.orig/arch/x86/kernel/Makefile 2009-02-16 16:18:36.000000000 +0100
134 +++ sle11-2009-05-14/arch/x86/kernel/Makefile 2009-03-16 16:33:40.000000000 +0100
135 @@ -120,11 +120,10 @@ ifeq ($(CONFIG_X86_64),y)
136
137 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
138
139 + obj-$(CONFIG_XEN) += nmi_64.o
140 time_64-$(CONFIG_XEN) += time_32.o
141 pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o
142 endif
143
144 disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
145 smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
146 -disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += mpparse_64.o
147 -%/head_64.o %/head_64.s: asflags-$(CONFIG_XEN) :=
148 --- sle11-2009-05-14.orig/arch/x86/kernel/acpi/boot.c 2008-12-01 11:11:08.000000000 +0100
149 +++ sle11-2009-05-14/arch/x86/kernel/acpi/boot.c 2009-03-16 16:33:40.000000000 +0100
150 @@ -133,6 +133,9 @@ char *__init __acpi_map_table(unsigned l
151 #ifndef CONFIG_XEN
152 if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
153 return __va(phys);
154 +#else
155 + if (phys + size <= (NR_FIX_ISAMAPS << PAGE_SHIFT))
156 + return isa_bus_to_virt(phys);
157 #endif
158
159 offset = phys & (PAGE_SIZE - 1);
160 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
161 +++ sle11-2009-05-14/arch/x86/kernel/acpi/sleep-xen.c 2009-03-16 16:33:40.000000000 +0100
162 @@ -0,0 +1,95 @@
163 +/*
164 + * sleep.c - x86-specific ACPI sleep support.
165 + *
166 + * Copyright (C) 2001-2003 Patrick Mochel
167 + * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
168 + */
169 +
170 +#include <linux/acpi.h>
171 +#include <linux/bootmem.h>
172 +#include <linux/dmi.h>
173 +#include <linux/cpumask.h>
174 +
175 +#include <asm/smp.h>
176 +
177 +#ifndef CONFIG_ACPI_PV_SLEEP
178 +/* address in low memory of the wakeup routine. */
179 +unsigned long acpi_wakeup_address = 0;
180 +unsigned long acpi_realmode_flags;
181 +extern char wakeup_start, wakeup_end;
182 +
183 +extern unsigned long acpi_copy_wakeup_routine(unsigned long);
184 +#endif
185 +
186 +/**
187 + * acpi_save_state_mem - save kernel state
188 + *
189 + * Create an identity mapped page table and copy the wakeup routine to
190 + * low memory.
191 + */
192 +int acpi_save_state_mem(void)
193 +{
194 +#ifndef CONFIG_ACPI_PV_SLEEP
195 + if (!acpi_wakeup_address) {
196 + printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
197 + return -ENOMEM;
198 + }
199 + memcpy((void *)acpi_wakeup_address, &wakeup_start,
200 + &wakeup_end - &wakeup_start);
201 + acpi_copy_wakeup_routine(acpi_wakeup_address);
202 +#endif
203 +
204 + return 0;
205 +}
206 +
207 +/*
208 + * acpi_restore_state - undo effects of acpi_save_state_mem
209 + */
210 +void acpi_restore_state_mem(void)
211 +{
212 +}
213 +
214 +
215 +/**
216 + * acpi_reserve_bootmem - do _very_ early ACPI initialisation
217 + *
218 + * We allocate a page from the first 1MB of memory for the wakeup
219 + * routine for when we come back from a sleep state. The
220 + * runtime allocator allows specification of <16MB pages, but not
221 + * <1MB pages.
222 + */
223 +void __init acpi_reserve_bootmem(void)
224 +{
225 +#ifndef CONFIG_ACPI_PV_SLEEP
226 + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
227 + printk(KERN_ERR
228 + "ACPI: Wakeup code way too big, S3 disabled.\n");
229 + return;
230 + }
231 +
232 + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
233 + if (!acpi_wakeup_address)
234 + printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
235 +#endif
236 +}
237 +
238 +
239 +#ifndef CONFIG_ACPI_PV_SLEEP
240 +static int __init acpi_sleep_setup(char *str)
241 +{
242 + while ((str != NULL) && (*str != '\0')) {
243 + if (strncmp(str, "s3_bios", 7) == 0)
244 + acpi_realmode_flags |= 1;
245 + if (strncmp(str, "s3_mode", 7) == 0)
246 + acpi_realmode_flags |= 2;
247 + if (strncmp(str, "s3_beep", 7) == 0)
248 + acpi_realmode_flags |= 4;
249 + str = strchr(str, ',');
250 + if (str != NULL)
251 + str += strspn(str, ", \t");
252 + }
253 + return 1;
254 +}
255 +
256 +__setup("acpi_sleep=", acpi_sleep_setup);
257 +#endif /* CONFIG_ACPI_PV_SLEEP */
258 --- sle11-2009-05-14.orig/arch/x86/kernel/acpi/sleep_32-xen.c 2009-02-16 16:18:36.000000000 +0100
259 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
260 @@ -1,117 +0,0 @@
261 -/*
262 - * sleep.c - x86-specific ACPI sleep support.
263 - *
264 - * Copyright (C) 2001-2003 Patrick Mochel
265 - * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
266 - */
267 -
268 -#include <linux/acpi.h>
269 -#include <linux/bootmem.h>
270 -#include <linux/dmi.h>
271 -#include <linux/cpumask.h>
272 -
273 -#include <asm/smp.h>
274 -
275 -#ifndef CONFIG_ACPI_PV_SLEEP
276 -/* address in low memory of the wakeup routine. */
277 -unsigned long acpi_wakeup_address = 0;
278 -unsigned long acpi_realmode_flags;
279 -extern char wakeup_start, wakeup_end;
280 -
281 -extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
282 -#endif
283 -
284 -/**
285 - * acpi_save_state_mem - save kernel state
286 - *
287 - * Create an identity mapped page table and copy the wakeup routine to
288 - * low memory.
289 - */
290 -int acpi_save_state_mem(void)
291 -{
292 -#ifndef CONFIG_ACPI_PV_SLEEP
293 - if (!acpi_wakeup_address)
294 - return 1;
295 - memcpy((void *)acpi_wakeup_address, &wakeup_start,
296 - &wakeup_end - &wakeup_start);
297 - acpi_copy_wakeup_routine(acpi_wakeup_address);
298 -#endif
299 - return 0;
300 -}
301 -
302 -/*
303 - * acpi_restore_state - undo effects of acpi_save_state_mem
304 - */
305 -void acpi_restore_state_mem(void)
306 -{
307 -}
308 -
309 -/**
310 - * acpi_reserve_bootmem - do _very_ early ACPI initialisation
311 - *
312 - * We allocate a page from the first 1MB of memory for the wakeup
313 - * routine for when we come back from a sleep state. The
314 - * runtime allocator allows specification of <16MB pages, but not
315 - * <1MB pages.
316 - */
317 -void __init acpi_reserve_bootmem(void)
318 -{
319 -#ifndef CONFIG_ACPI_PV_SLEEP
320 - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
321 - printk(KERN_ERR
322 - "ACPI: Wakeup code way too big, S3 disabled.\n");
323 - return;
324 - }
325 -
326 - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
327 - if (!acpi_wakeup_address)
328 - printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
329 -#endif
330 -}
331 -
332 -#ifndef CONFIG_ACPI_PV_SLEEP
333 -static int __init acpi_sleep_setup(char *str)
334 -{
335 - while ((str != NULL) && (*str != '\0')) {
336 - if (strncmp(str, "s3_bios", 7) == 0)
337 - acpi_realmode_flags |= 1;
338 - if (strncmp(str, "s3_mode", 7) == 0)
339 - acpi_realmode_flags |= 2;
340 - if (strncmp(str, "s3_beep", 7) == 0)
341 - acpi_realmode_flags |= 4;
342 - str = strchr(str, ',');
343 - if (str != NULL)
344 - str += strspn(str, ", \t");
345 - }
346 - return 1;
347 -}
348 -
349 -__setup("acpi_sleep=", acpi_sleep_setup);
350 -
351 -/* Ouch, we want to delete this. We already have better version in userspace, in
352 - s2ram from suspend.sf.net project */
353 -static __init int reset_videomode_after_s3(const struct dmi_system_id *d)
354 -{
355 - acpi_realmode_flags |= 2;
356 - return 0;
357 -}
358 -
359 -static __initdata struct dmi_system_id acpisleep_dmi_table[] = {
360 - { /* Reset video mode after returning from ACPI S3 sleep */
361 - .callback = reset_videomode_after_s3,
362 - .ident = "Toshiba Satellite 4030cdt",
363 - .matches = {
364 - DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
365 - },
366 - },
367 - {}
368 -};
369 -
370 -static int __init acpisleep_dmi_init(void)
371 -{
372 - dmi_check_system(acpisleep_dmi_table);
373 - return 0;
374 -}
375 -
376 -core_initcall(acpisleep_dmi_init);
377 -#endif /* CONFIG_ACPI_PV_SLEEP */
378 --- sle11-2009-05-14.orig/arch/x86/kernel/acpi/sleep_64-xen.c 2009-02-16 16:18:36.000000000 +0100
379 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
380 @@ -1,125 +0,0 @@
381 -/*
382 - * acpi.c - Architecture-Specific Low-Level ACPI Support
383 - *
384 - * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
385 - * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
386 - * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
387 - * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
388 - * Copyright (C) 2003 Pavel Machek, SuSE Labs
389 - *
390 - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
391 - *
392 - * This program is free software; you can redistribute it and/or modify
393 - * it under the terms of the GNU General Public License as published by
394 - * the Free Software Foundation; either version 2 of the License, or
395 - * (at your option) any later version.
396 - *
397 - * This program is distributed in the hope that it will be useful,
398 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
399 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
400 - * GNU General Public License for more details.
401 - *
402 - * You should have received a copy of the GNU General Public License
403 - * along with this program; if not, write to the Free Software
404 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
405 - *
406 - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
407 - */
408 -
409 -#include <linux/kernel.h>
410 -#include <linux/init.h>
411 -#include <linux/types.h>
412 -#include <linux/stddef.h>
413 -#include <linux/slab.h>
414 -#include <linux/pci.h>
415 -#include <linux/bootmem.h>
416 -#include <linux/acpi.h>
417 -#include <linux/cpumask.h>
418 -
419 -#include <asm/mpspec.h>
420 -#include <asm/io.h>
421 -#include <asm/apic.h>
422 -#include <asm/apicdef.h>
423 -#include <asm/page.h>
424 -#include <asm/pgtable.h>
425 -#include <asm/pgalloc.h>
426 -#include <asm/io_apic.h>
427 -#include <asm/proto.h>
428 -#include <asm/tlbflush.h>
429 -
430 -/* --------------------------------------------------------------------------
431 - Low-Level Sleep Support
432 - -------------------------------------------------------------------------- */
433 -
434 -#ifndef CONFIG_ACPI_PV_SLEEP
435 -/* address in low memory of the wakeup routine. */
436 -unsigned long acpi_wakeup_address = 0;
437 -unsigned long acpi_realmode_flags;
438 -extern char wakeup_start, wakeup_end;
439 -
440 -extern unsigned long acpi_copy_wakeup_routine(unsigned long);
441 -#endif
442 -
443 -/**
444 - * acpi_save_state_mem - save kernel state
445 - *
446 - * Create an identity mapped page table and copy the wakeup routine to
447 - * low memory.
448 - */
449 -int acpi_save_state_mem(void)
450 -{
451 -#ifndef CONFIG_ACPI_PV_SLEEP
452 - memcpy((void *)acpi_wakeup_address, &wakeup_start,
453 - &wakeup_end - &wakeup_start);
454 - acpi_copy_wakeup_routine(acpi_wakeup_address);
455 -#endif
456 - return 0;
457 -}
458 -
459 -/*
460 - * acpi_restore_state
461 - */
462 -void acpi_restore_state_mem(void)
463 -{
464 -}
465 -
466 -/**
467 - * acpi_reserve_bootmem - do _very_ early ACPI initialisation
468 - *
469 - * We allocate a page in low memory for the wakeup
470 - * routine for when we come back from a sleep state. The
471 - * runtime allocator allows specification of <16M pages, but not
472 - * <1M pages.
473 - */
474 -void __init acpi_reserve_bootmem(void)
475 -{
476 -#ifndef CONFIG_ACPI_PV_SLEEP
477 - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
478 - if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
479 - printk(KERN_CRIT
480 - "ACPI: Wakeup code way too big, will crash on attempt"
481 - " to suspend\n");
482 -#endif
483 -}
484 -
485 -#ifndef CONFIG_ACPI_PV_SLEEP
486 -static int __init acpi_sleep_setup(char *str)
487 -{
488 - while ((str != NULL) && (*str != '\0')) {
489 - if (strncmp(str, "s3_bios", 7) == 0)
490 - acpi_realmode_flags |= 1;
491 - if (strncmp(str, "s3_mode", 7) == 0)
492 - acpi_realmode_flags |= 2;
493 - if (strncmp(str, "s3_beep", 7) == 0)
494 - acpi_realmode_flags |= 4;
495 - str = strchr(str, ',');
496 - if (str != NULL)
497 - str += strspn(str, ", \t");
498 - }
499 -
500 - return 1;
501 -}
502 -
503 -__setup("acpi_sleep=", acpi_sleep_setup);
504 -#endif /* CONFIG_ACPI_PV_SLEEP */
505 -
506 --- sle11-2009-05-14.orig/arch/x86/kernel/apic_32-xen.c 2008-12-15 11:27:22.000000000 +0100
507 +++ sle11-2009-05-14/arch/x86/kernel/apic_32-xen.c 2009-03-16 16:33:40.000000000 +0100
508 @@ -86,7 +86,7 @@ int setup_profiling_timer(unsigned int m
509 * This initializes the IO-APIC and APIC hardware if this is
510 * a UP kernel.
511 */
512 -int __init APIC_init_uniprocessor (void)
513 +int __init APIC_init_uniprocessor(void)
514 {
515 #ifdef CONFIG_X86_IO_APIC
516 if (smp_found_config)
517 --- sle11-2009-05-14.orig/arch/x86/kernel/apic_64-xen.c 2009-02-16 16:18:36.000000000 +0100
518 +++ sle11-2009-05-14/arch/x86/kernel/apic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
519 @@ -34,34 +34,17 @@
520 #include <asm/hpet.h>
521 #include <asm/idle.h>
522
523 -int apic_verbosity;
524 +int disable_apic;
525
526 /*
527 - * 'what should we do if we get a hw irq event on an illegal vector'.
528 - * each architecture has to answer this themselves.
529 + * Debug level, exported for io_apic.c
530 */
531 -void ack_bad_irq(unsigned int irq)
532 -{
533 - printk("unexpected IRQ trap at irq %02x\n", irq);
534 - /*
535 - * Currently unexpected vectors happen only on SMP and APIC.
536 - * We _must_ ack these because every local APIC has only N
537 - * irq slots per priority level, and a 'hanging, unacked' IRQ
538 - * holds up an irq slot - in excessive cases (when multiple
539 - * unexpected vectors occur) that might lock up the APIC
540 - * completely.
541 - * But don't ack when the APIC is disabled. -AK
542 - */
543 - if (!disable_apic)
544 - ack_APIC_irq();
545 -}
546 -
547 -int setup_profiling_timer(unsigned int multiplier)
548 -{
549 - return -EINVAL;
550 -}
551 +int apic_verbosity;
552
553 -void smp_local_timer_interrupt(void)
554 +/*
555 + * The guts of the apic timer interrupt
556 + */
557 +static void local_apic_timer_interrupt(void)
558 {
559 #ifndef CONFIG_XEN
560 int cpu = smp_processor_id();
561 @@ -121,11 +104,34 @@ void smp_apic_timer_interrupt(struct pt_
562 */
563 exit_idle();
564 irq_enter();
565 - smp_local_timer_interrupt();
566 + local_apic_timer_interrupt();
567 irq_exit();
568 set_irq_regs(old_regs);
569 }
570
571 +int setup_profiling_timer(unsigned int multiplier)
572 +{
573 + return -EINVAL;
574 +}
575 +
576 +/*
577 + * This initializes the IO-APIC and APIC hardware if this is
578 + * a UP kernel.
579 + */
580 +int __init APIC_init_uniprocessor(void)
581 +{
582 +#ifdef CONFIG_X86_IO_APIC
583 + if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
584 + setup_IO_APIC();
585 +#endif
586 +
587 + return 1;
588 +}
589 +
590 +/*
591 + * Local APIC interrupts
592 + */
593 +
594 /*
595 * This interrupt should _never_ happen with our APIC/SMP architecture
596 */
597 @@ -150,7 +156,6 @@ asmlinkage void smp_spurious_interrupt(v
598 /*
599 * This interrupt should never happen with our APIC/SMP architecture
600 */
601 -
602 asmlinkage void smp_error_interrupt(void)
603 {
604 unsigned int v, v1;
605 @@ -178,19 +183,3 @@ asmlinkage void smp_error_interrupt(void
606 smp_processor_id(), v , v1);
607 irq_exit();
608 }
609 -
610 -int disable_apic;
611 -
612 -/*
613 - * This initializes the IO-APIC and APIC hardware if this is
614 - * a UP kernel.
615 - */
616 -int __init APIC_init_uniprocessor (void)
617 -{
618 -#ifdef CONFIG_X86_IO_APIC
619 - if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
620 - setup_IO_APIC();
621 -#endif
622 -
623 - return 1;
624 -}
625 --- sle11-2009-05-14.orig/arch/x86/kernel/asm-offsets_32.c 2009-02-16 16:17:21.000000000 +0100
626 +++ sle11-2009-05-14/arch/x86/kernel/asm-offsets_32.c 2009-03-16 16:33:40.000000000 +0100
627 @@ -23,8 +23,10 @@
628 #include <xen/interface/xen.h>
629 #endif
630
631 +#ifdef CONFIG_LGUEST_GUEST
632 #include <linux/lguest.h>
633 #include "../../../drivers/lguest/lg.h"
634 +#endif
635
636 /* workaround for a warning with -Wmissing-prototypes */
637 void foo(void);
638 --- sle11-2009-05-14.orig/arch/x86/kernel/cpu/common-xen.c 2009-02-16 16:18:36.000000000 +0100
639 +++ sle11-2009-05-14/arch/x86/kernel/cpu/common-xen.c 2009-03-16 16:33:40.000000000 +0100
640 @@ -27,45 +27,50 @@
641 #include "cpu.h"
642
643 DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
644 - [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
645 - [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
646 - [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
647 - [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
648 + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
649 + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
650 + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
651 + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
652 #ifndef CONFIG_XEN
653 /*
654 * Segments used for calling PnP BIOS have byte granularity.
655 * They code segments and data segments have fixed 64k limits,
656 * the transfer segment sizes are set at run time.
657 */
658 - [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
659 - [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
660 - [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
661 - [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
662 - [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
663 + /* 32-bit code */
664 + [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
665 + /* 16-bit code */
666 + [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
667 + /* 16-bit data */
668 + [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
669 + /* 16-bit data */
670 + [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
671 + /* 16-bit data */
672 + [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
673 /*
674 * The APM segments have byte granularity and their bases
675 * are set at run time. All have 64k limits.
676 */
677 - [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
678 + /* 32-bit code */
679 + [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
680 /* 16-bit code */
681 - [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
682 - [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
683 + [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
684 + /* data */
685 + [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
686
687 - [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
688 + [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
689 #endif
690 - [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
691 + [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
692 } };
693 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
694
695 +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
696 +
697 static int cachesize_override __cpuinitdata = -1;
698 -static int disable_x86_fxsr __cpuinitdata;
699 static int disable_x86_serial_nr __cpuinitdata = 1;
700 -static int disable_x86_sep __cpuinitdata;
701
702 struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
703
704 -extern int disable_pse;
705 -
706 static void __cpuinit default_init(struct cpuinfo_x86 * c)
707 {
708 /* Not much we can do here... */
709 @@ -214,16 +219,8 @@ static void __cpuinit get_cpu_vendor(str
710
711 static int __init x86_fxsr_setup(char * s)
712 {
713 - /* Tell all the other CPUs to not use it... */
714 - disable_x86_fxsr = 1;
715 -
716 - /*
717 - * ... and clear the bits early in the boot_cpu_data
718 - * so that the bootup process doesn't try to do this
719 - * either.
720 - */
721 - clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
722 - clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
723 + setup_clear_cpu_cap(X86_FEATURE_FXSR);
724 + setup_clear_cpu_cap(X86_FEATURE_XMM);
725 return 1;
726 }
727 __setup("nofxsr", x86_fxsr_setup);
728 @@ -231,7 +228,7 @@ __setup("nofxsr", x86_fxsr_setup);
729
730 static int __init x86_sep_setup(char * s)
731 {
732 - disable_x86_sep = 1;
733 + setup_clear_cpu_cap(X86_FEATURE_SEP);
734 return 1;
735 }
736 __setup("nosep", x86_sep_setup);
737 @@ -268,10 +265,10 @@ static int __cpuinit have_cpuid_p(void)
738 void __init cpu_detect(struct cpuinfo_x86 *c)
739 {
740 /* Get vendor name */
741 - cpuid(0x00000000, &c->cpuid_level,
742 - (int *)&c->x86_vendor_id[0],
743 - (int *)&c->x86_vendor_id[8],
744 - (int *)&c->x86_vendor_id[4]);
745 + cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
746 + (unsigned int *)&c->x86_vendor_id[0],
747 + (unsigned int *)&c->x86_vendor_id[8],
748 + (unsigned int *)&c->x86_vendor_id[4]);
749
750 c->x86 = 4;
751 if (c->cpuid_level >= 0x00000001) {
752 @@ -284,9 +281,38 @@ void __init cpu_detect(struct cpuinfo_x8
753 if (c->x86 >= 0x6)
754 c->x86_model += ((tfms >> 16) & 0xF) << 4;
755 c->x86_mask = tfms & 15;
756 - if (cap0 & (1<<19))
757 + if (cap0 & (1<<19)) {
758 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
759 + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
760 + }
761 + }
762 +}
763 +static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
764 +{
765 + u32 tfms, xlvl;
766 + unsigned int ebx;
767 +
768 + memset(&c->x86_capability, 0, sizeof c->x86_capability);
769 + if (have_cpuid_p()) {
770 + /* Intel-defined flags: level 0x00000001 */
771 + if (c->cpuid_level >= 0x00000001) {
772 + u32 capability, excap;
773 + cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
774 + c->x86_capability[0] = capability;
775 + c->x86_capability[4] = excap;
776 + }
777 +
778 + /* AMD-defined flags: level 0x80000001 */
779 + xlvl = cpuid_eax(0x80000000);
780 + if ((xlvl & 0xffff0000) == 0x80000000) {
781 + if (xlvl >= 0x80000001) {
782 + c->x86_capability[1] = cpuid_edx(0x80000001);
783 + c->x86_capability[6] = cpuid_ecx(0x80000001);
784 + }
785 + }
786 +
787 }
788 +
789 }
790
791 /* Do minimum CPU detection early.
792 @@ -300,6 +326,7 @@ static void __init early_cpu_detect(void
793 struct cpuinfo_x86 *c = &boot_cpu_data;
794
795 c->x86_cache_alignment = 32;
796 + c->x86_clflush_size = 32;
797
798 if (!have_cpuid_p())
799 return;
800 @@ -307,19 +334,30 @@ static void __init early_cpu_detect(void
801 cpu_detect(c);
802
803 get_cpu_vendor(c, 1);
804 +
805 + switch (c->x86_vendor) {
806 + case X86_VENDOR_AMD:
807 + early_init_amd(c);
808 + break;
809 + case X86_VENDOR_INTEL:
810 + early_init_intel(c);
811 + break;
812 + }
813 +
814 + early_get_cap(c);
815 }
816
817 static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
818 {
819 u32 tfms, xlvl;
820 - int ebx;
821 + unsigned int ebx;
822
823 if (have_cpuid_p()) {
824 /* Get vendor name */
825 - cpuid(0x00000000, &c->cpuid_level,
826 - (int *)&c->x86_vendor_id[0],
827 - (int *)&c->x86_vendor_id[8],
828 - (int *)&c->x86_vendor_id[4]);
829 + cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
830 + (unsigned int *)&c->x86_vendor_id[0],
831 + (unsigned int *)&c->x86_vendor_id[8],
832 + (unsigned int *)&c->x86_vendor_id[4]);
833
834 get_cpu_vendor(c, 0);
835 /* Initialize the standard set of capabilities */
836 @@ -364,8 +402,6 @@ static void __cpuinit generic_identify(s
837 init_scattered_cpuid_features(c);
838 }
839
840 - early_intel_workaround(c);
841 -
842 #ifdef CONFIG_X86_HT
843 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
844 #endif
845 @@ -399,7 +435,7 @@ __setup("serialnumber", x86_serial_nr_se
846 /*
847 * This does the hard work of actually picking apart the CPU stuff...
848 */
849 -static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
850 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
851 {
852 int i;
853
854 @@ -425,20 +461,9 @@ static void __cpuinit identify_cpu(struc
855
856 generic_identify(c);
857
858 - printk(KERN_DEBUG "CPU: After generic identify, caps:");
859 - for (i = 0; i < NCAPINTS; i++)
860 - printk(" %08lx", c->x86_capability[i]);
861 - printk("\n");
862 -
863 - if (this_cpu->c_identify) {
864 + if (this_cpu->c_identify)
865 this_cpu->c_identify(c);
866
867 - printk(KERN_DEBUG "CPU: After vendor identify, caps:");
868 - for (i = 0; i < NCAPINTS; i++)
869 - printk(" %08lx", c->x86_capability[i]);
870 - printk("\n");
871 - }
872 -
873 /*
874 * Vendor-specific initialization. In this section we
875 * canonicalize the feature flags, meaning if there are
876 @@ -460,23 +485,6 @@ static void __cpuinit identify_cpu(struc
877 * we do "generic changes."
878 */
879
880 - /* TSC disabled? */
881 - if ( tsc_disable )
882 - clear_bit(X86_FEATURE_TSC, c->x86_capability);
883 -
884 - /* FXSR disabled? */
885 - if (disable_x86_fxsr) {
886 - clear_bit(X86_FEATURE_FXSR, c->x86_capability);
887 - clear_bit(X86_FEATURE_XMM, c->x86_capability);
888 - }
889 -
890 - /* SEP disabled? */
891 - if (disable_x86_sep)
892 - clear_bit(X86_FEATURE_SEP, c->x86_capability);
893 -
894 - if (disable_pse)
895 - clear_bit(X86_FEATURE_PSE, c->x86_capability);
896 -
897 /* If the model name is still unset, do table lookup. */
898 if ( !c->x86_model_id[0] ) {
899 char *p;
900 @@ -489,13 +497,6 @@ static void __cpuinit identify_cpu(struc
901 c->x86, c->x86_model);
902 }
903
904 - /* Now the feature flags better reflect actual CPU features! */
905 -
906 - printk(KERN_DEBUG "CPU: After all inits, caps:");
907 - for (i = 0; i < NCAPINTS; i++)
908 - printk(" %08lx", c->x86_capability[i]);
909 - printk("\n");
910 -
911 /*
912 * On SMP, boot_cpu_data holds the common feature set between
913 * all CPUs; so make sure that we indicate which features are
914 @@ -508,8 +509,14 @@ static void __cpuinit identify_cpu(struc
915 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
916 }
917
918 + /* Clear all flags overriden by options */
919 + for (i = 0; i < NCAPINTS; i++)
920 + c->x86_capability[i] &= ~cleared_cpu_caps[i];
921 +
922 /* Init Machine Check Exception if available. */
923 mcheck_init(c);
924 +
925 + select_idle_routine(c);
926 }
927
928 void __init identify_boot_cpu(void)
929 @@ -517,7 +524,6 @@ void __init identify_boot_cpu(void)
930 identify_cpu(&boot_cpu_data);
931 sysenter_setup();
932 enable_sep_cpu();
933 - mtrr_bp_init();
934 }
935
936 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
937 @@ -574,6 +580,13 @@ void __cpuinit detect_ht(struct cpuinfo_
938 }
939 #endif
940
941 +static __init int setup_noclflush(char *arg)
942 +{
943 + setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
944 + return 1;
945 +}
946 +__setup("noclflush", setup_noclflush);
947 +
948 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
949 {
950 char *vendor = NULL;
951 @@ -597,6 +610,17 @@ void __cpuinit print_cpu_info(struct cpu
952 printk("\n");
953 }
954
955 +static __init int setup_disablecpuid(char *arg)
956 +{
957 + int bit;
958 + if (get_option(&arg, &bit) && bit < NCAPINTS*32)
959 + setup_clear_cpu_cap(bit);
960 + else
961 + return 0;
962 + return 1;
963 +}
964 +__setup("clearcpuid=", setup_disablecpuid);
965 +
966 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
967
968 /* This is hacky. :)
969 @@ -606,16 +630,6 @@ cpumask_t cpu_initialized __cpuinitdata
970 * They will insert themselves into the cpu_devs structure.
971 * Then, when cpu_init() is called, we can just iterate over that array.
972 */
973 -
974 -extern int intel_cpu_init(void);
975 -extern int cyrix_init_cpu(void);
976 -extern int nsc_init_cpu(void);
977 -extern int amd_init_cpu(void);
978 -extern int centaur_init_cpu(void);
979 -extern int transmeta_init_cpu(void);
980 -extern int nexgen_init_cpu(void);
981 -extern int umc_init_cpu(void);
982 -
983 void __init early_cpu_init(void)
984 {
985 intel_cpu_init();
986 @@ -627,21 +641,13 @@ void __init early_cpu_init(void)
987 nexgen_init_cpu();
988 umc_init_cpu();
989 early_cpu_detect();
990 -
991 -#ifdef CONFIG_DEBUG_PAGEALLOC
992 - /* pse is not compatible with on-the-fly unmapping,
993 - * disable it even if the cpus claim to support it.
994 - */
995 - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
996 - disable_pse = 1;
997 -#endif
998 }
999
1000 /* Make sure %fs is initialized properly in idle threads */
1001 -struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
1002 +struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
1003 {
1004 memset(regs, 0, sizeof(struct pt_regs));
1005 - regs->xfs = __KERNEL_PERCPU;
1006 + regs->fs = __KERNEL_PERCPU;
1007 return regs;
1008 }
1009
1010 @@ -649,7 +655,7 @@ struct pt_regs * __devinit idle_regs(str
1011 * it's on the real one. */
1012 void switch_to_new_gdt(void)
1013 {
1014 - struct Xgt_desc_struct gdt_descr;
1015 + struct desc_ptr gdt_descr;
1016 unsigned long va, frames[16];
1017 int f;
1018
1019 @@ -692,12 +698,6 @@ void __cpuinit cpu_init(void)
1020
1021 if (cpu_has_vme || cpu_has_de)
1022 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1023 - if (tsc_disable && cpu_has_tsc) {
1024 - printk(KERN_NOTICE "Disabling TSC...\n");
1025 - /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
1026 - clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
1027 - set_in_cr4(X86_CR4_TSD);
1028 - }
1029
1030 switch_to_new_gdt();
1031
1032 @@ -710,7 +710,7 @@ void __cpuinit cpu_init(void)
1033 BUG();
1034 enter_lazy_tlb(&init_mm, curr);
1035
1036 - load_esp0(t, thread);
1037 + load_sp0(t, thread);
1038
1039 load_LDT(&init_mm.context);
1040
1041 --- sle11-2009-05-14.orig/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-02-16 16:17:21.000000000 +0100
1042 +++ sle11-2009-05-14/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-03-16 16:33:40.000000000 +0100
1043 @@ -33,7 +33,7 @@ struct mtrr_ops generic_mtrr_ops = {
1044
1045 struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
1046 unsigned int num_var_ranges;
1047 -unsigned int *usage_table;
1048 +unsigned int mtrr_usage_table[MAX_VAR_RANGES];
1049
1050 static void __init set_num_var_ranges(void)
1051 {
1052 @@ -52,17 +52,12 @@ static void __init init_table(void)
1053 int i, max;
1054
1055 max = num_var_ranges;
1056 - if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
1057 - == NULL) {
1058 - printk(KERN_ERR "mtrr: could not allocate\n");
1059 - return;
1060 - }
1061 for (i = 0; i < max; i++)
1062 - usage_table[i] = 0;
1063 + mtrr_usage_table[i] = 0;
1064 }
1065
1066 int mtrr_add_page(unsigned long base, unsigned long size,
1067 - unsigned int type, char increment)
1068 + unsigned int type, bool increment)
1069 {
1070 int error;
1071 struct xen_platform_op op;
1072 @@ -81,7 +76,7 @@ int mtrr_add_page(unsigned long base, un
1073 }
1074
1075 if (increment)
1076 - ++usage_table[op.u.add_memtype.reg];
1077 + ++mtrr_usage_table[op.u.add_memtype.reg];
1078
1079 mutex_unlock(&mtrr_mutex);
1080
1081 @@ -103,7 +98,7 @@ static int mtrr_check(unsigned long base
1082
1083 int
1084 mtrr_add(unsigned long base, unsigned long size, unsigned int type,
1085 - char increment)
1086 + bool increment)
1087 {
1088 if (mtrr_check(base, size))
1089 return -EINVAL;
1090 @@ -136,11 +131,11 @@ int mtrr_del_page(int reg, unsigned long
1091 goto out;
1092 }
1093 }
1094 - if (usage_table[reg] < 1) {
1095 + if (mtrr_usage_table[reg] < 1) {
1096 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
1097 goto out;
1098 }
1099 - if (--usage_table[reg] < 1) {
1100 + if (--mtrr_usage_table[reg] < 1) {
1101 op.cmd = XENPF_del_memtype;
1102 op.u.del_memtype.handle = 0;
1103 op.u.del_memtype.reg = reg;
1104 --- sle11-2009-05-14.orig/arch/x86/kernel/e820_32-xen.c 2009-02-16 16:18:36.000000000 +0100
1105 +++ sle11-2009-05-14/arch/x86/kernel/e820_32-xen.c 2009-03-16 16:33:40.000000000 +0100
1106 @@ -7,7 +7,6 @@
1107 #include <linux/kexec.h>
1108 #include <linux/module.h>
1109 #include <linux/mm.h>
1110 -#include <linux/efi.h>
1111 #include <linux/pfn.h>
1112 #include <linux/uaccess.h>
1113 #include <linux/suspend.h>
1114 @@ -18,11 +17,6 @@
1115 #include <asm/setup.h>
1116 #include <xen/interface/memory.h>
1117
1118 -#ifdef CONFIG_EFI
1119 -int efi_enabled = 0;
1120 -EXPORT_SYMBOL(efi_enabled);
1121 -#endif
1122 -
1123 struct e820map e820;
1124 struct change_member {
1125 struct e820entry *pbios; /* pointer to original bios entry */
1126 @@ -38,26 +32,6 @@ unsigned long pci_mem_start = 0x10000000
1127 EXPORT_SYMBOL(pci_mem_start);
1128 #endif
1129 extern int user_defined_memmap;
1130 -struct resource data_resource = {
1131 - .name = "Kernel data",
1132 - .start = 0,
1133 - .end = 0,
1134 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1135 -};
1136 -
1137 -struct resource code_resource = {
1138 - .name = "Kernel code",
1139 - .start = 0,
1140 - .end = 0,
1141 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1142 -};
1143 -
1144 -struct resource bss_resource = {
1145 - .name = "Kernel bss",
1146 - .start = 0,
1147 - .end = 0,
1148 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1149 -};
1150
1151 static struct resource system_rom_resource = {
1152 .name = "System ROM",
1153 @@ -112,60 +86,6 @@ static struct resource video_rom_resourc
1154 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
1155 };
1156
1157 -static struct resource video_ram_resource = {
1158 - .name = "Video RAM area",
1159 - .start = 0xa0000,
1160 - .end = 0xbffff,
1161 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1162 -};
1163 -
1164 -static struct resource standard_io_resources[] = { {
1165 - .name = "dma1",
1166 - .start = 0x0000,
1167 - .end = 0x001f,
1168 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1169 -}, {
1170 - .name = "pic1",
1171 - .start = 0x0020,
1172 - .end = 0x0021,
1173 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1174 -}, {
1175 - .name = "timer0",
1176 - .start = 0x0040,
1177 - .end = 0x0043,
1178 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1179 -}, {
1180 - .name = "timer1",
1181 - .start = 0x0050,
1182 - .end = 0x0053,
1183 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1184 -}, {
1185 - .name = "keyboard",
1186 - .start = 0x0060,
1187 - .end = 0x006f,
1188 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1189 -}, {
1190 - .name = "dma page reg",
1191 - .start = 0x0080,
1192 - .end = 0x008f,
1193 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1194 -}, {
1195 - .name = "pic2",
1196 - .start = 0x00a0,
1197 - .end = 0x00a1,
1198 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1199 -}, {
1200 - .name = "dma2",
1201 - .start = 0x00c0,
1202 - .end = 0x00df,
1203 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1204 -}, {
1205 - .name = "fpu",
1206 - .start = 0x00f0,
1207 - .end = 0x00ff,
1208 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1209 -} };
1210 -
1211 #define ROMSIGNATURE 0xaa55
1212
1213 static int __init romsignature(const unsigned char *rom)
1214 @@ -272,10 +192,9 @@ static struct e820map machine_e820;
1215 * Request address space for all standard RAM and ROM resources
1216 * and also for regions reported as reserved by the e820.
1217 */
1218 -static void __init
1219 -legacy_init_iomem_resources(struct resource *code_resource,
1220 - struct resource *data_resource,
1221 - struct resource *bss_resource)
1222 +void __init init_iomem_resources(struct resource *code_resource,
1223 + struct resource *data_resource,
1224 + struct resource *bss_resource)
1225 {
1226 int i;
1227
1228 @@ -324,39 +243,6 @@ legacy_init_iomem_resources(struct resou
1229
1230 #undef e820
1231
1232 -/*
1233 - * Request address space for all standard resources
1234 - *
1235 - * This is called just before pcibios_init(), which is also a
1236 - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
1237 - */
1238 -static int __init request_standard_resources(void)
1239 -{
1240 - int i;
1241 -
1242 - /* Nothing to do if not running in dom0. */
1243 - if (!is_initial_xendomain())
1244 - return 0;
1245 -
1246 - printk("Setting up standard PCI resources\n");
1247 - if (efi_enabled)
1248 - efi_initialize_iomem_resources(&code_resource,
1249 - &data_resource, &bss_resource);
1250 - else
1251 - legacy_init_iomem_resources(&code_resource,
1252 - &data_resource, &bss_resource);
1253 -
1254 - /* EFI systems may still have VGA */
1255 - request_resource(&iomem_resource, &video_ram_resource);
1256 -
1257 - /* request I/O space for devices used on all i[345]86 PCs */
1258 - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
1259 - request_resource(&ioport_resource, &standard_io_resources[i]);
1260 - return 0;
1261 -}
1262 -
1263 -subsys_initcall(request_standard_resources);
1264 -
1265 #if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
1266 /**
1267 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
1268 @@ -393,19 +279,17 @@ void __init add_memory_region(unsigned l
1269 {
1270 int x;
1271
1272 - if (!efi_enabled) {
1273 - x = e820.nr_map;
1274 -
1275 - if (x == E820MAX) {
1276 - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1277 - return;
1278 - }
1279 + x = e820.nr_map;
1280
1281 - e820.map[x].addr = start;
1282 - e820.map[x].size = size;
1283 - e820.map[x].type = type;
1284 - e820.nr_map++;
1285 + if (x == E820MAX) {
1286 + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1287 + return;
1288 }
1289 +
1290 + e820.map[x].addr = start;
1291 + e820.map[x].size = size;
1292 + e820.map[x].type = type;
1293 + e820.nr_map++;
1294 } /* add_memory_region */
1295
1296 /*
1297 @@ -642,29 +526,6 @@ int __init copy_e820_map(struct e820entr
1298 }
1299
1300 /*
1301 - * Callback for efi_memory_walk.
1302 - */
1303 -static int __init
1304 -efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
1305 -{
1306 - unsigned long *max_pfn = arg, pfn;
1307 -
1308 - if (start < end) {
1309 - pfn = PFN_UP(end -1);
1310 - if (pfn > *max_pfn)
1311 - *max_pfn = pfn;
1312 - }
1313 - return 0;
1314 -}
1315 -
1316 -static int __init
1317 -efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
1318 -{
1319 - memory_present(0, PFN_UP(start), PFN_DOWN(end));
1320 - return 0;
1321 -}
1322 -
1323 -/*
1324 * Find the highest page frame number we have available
1325 */
1326 void __init find_max_pfn(void)
1327 @@ -672,11 +533,6 @@ void __init find_max_pfn(void)
1328 int i;
1329
1330 max_pfn = 0;
1331 - if (efi_enabled) {
1332 - efi_memmap_walk(efi_find_max_pfn, &max_pfn);
1333 - efi_memmap_walk(efi_memory_present_wrapper, NULL);
1334 - return;
1335 - }
1336
1337 for (i = 0; i < e820.nr_map; i++) {
1338 unsigned long start, end;
1339 @@ -694,34 +550,12 @@ void __init find_max_pfn(void)
1340 }
1341
1342 /*
1343 - * Free all available memory for boot time allocation. Used
1344 - * as a callback function by efi_memory_walk()
1345 - */
1346 -
1347 -static int __init
1348 -free_available_memory(unsigned long start, unsigned long end, void *arg)
1349 -{
1350 - /* check max_low_pfn */
1351 - if (start >= (max_low_pfn << PAGE_SHIFT))
1352 - return 0;
1353 - if (end >= (max_low_pfn << PAGE_SHIFT))
1354 - end = max_low_pfn << PAGE_SHIFT;
1355 - if (start < end)
1356 - free_bootmem(start, end - start);
1357 -
1358 - return 0;
1359 -}
1360 -/*
1361 * Register fully available low RAM pages with the bootmem allocator.
1362 */
1363 void __init register_bootmem_low_pages(unsigned long max_low_pfn)
1364 {
1365 int i;
1366
1367 - if (efi_enabled) {
1368 - efi_memmap_walk(free_available_memory, NULL);
1369 - return;
1370 - }
1371 for (i = 0; i < e820.nr_map; i++) {
1372 unsigned long curr_pfn, last_pfn, size;
1373 /*
1374 @@ -855,56 +689,12 @@ void __init print_memory_map(char *who)
1375 }
1376 }
1377
1378 -static __init __always_inline void efi_limit_regions(unsigned long long size)
1379 -{
1380 - unsigned long long current_addr = 0;
1381 - efi_memory_desc_t *md, *next_md;
1382 - void *p, *p1;
1383 - int i, j;
1384 -
1385 - j = 0;
1386 - p1 = memmap.map;
1387 - for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
1388 - md = p;
1389 - next_md = p1;
1390 - current_addr = md->phys_addr +
1391 - PFN_PHYS(md->num_pages);
1392 - if (is_available_memory(md)) {
1393 - if (md->phys_addr >= size) continue;
1394 - memcpy(next_md, md, memmap.desc_size);
1395 - if (current_addr >= size) {
1396 - next_md->num_pages -=
1397 - PFN_UP(current_addr-size);
1398 - }
1399 - p1 += memmap.desc_size;
1400 - next_md = p1;
1401 - j++;
1402 - } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
1403 - EFI_MEMORY_RUNTIME) {
1404 - /* In order to make runtime services
1405 - * available we have to include runtime
1406 - * memory regions in memory map */
1407 - memcpy(next_md, md, memmap.desc_size);
1408 - p1 += memmap.desc_size;
1409 - next_md = p1;
1410 - j++;
1411 - }
1412 - }
1413 - memmap.nr_map = j;
1414 - memmap.map_end = memmap.map +
1415 - (memmap.nr_map * memmap.desc_size);
1416 -}
1417 -
1418 void __init limit_regions(unsigned long long size)
1419 {
1420 unsigned long long current_addr = 0;
1421 int i;
1422
1423 print_memory_map("limit_regions start");
1424 - if (efi_enabled) {
1425 - efi_limit_regions(size);
1426 - return;
1427 - }
1428 for (i = 0; i < e820.nr_map; i++) {
1429 current_addr = e820.map[i].addr + e820.map[i].size;
1430 if (current_addr < size)
1431 @@ -1056,3 +846,44 @@ static int __init parse_memmap(char *arg
1432 return 0;
1433 }
1434 early_param("memmap", parse_memmap);
1435 +
1436 +#ifndef CONFIG_XEN
1437 +void __init update_memory_range(u64 start, u64 size, unsigned old_type,
1438 + unsigned new_type)
1439 +{
1440 + int i;
1441 +
1442 + BUG_ON(old_type == new_type);
1443 +
1444 + for (i = 0; i < e820.nr_map; i++) {
1445 + struct e820entry *ei = &e820.map[i];
1446 + u64 final_start, final_end;
1447 + if (ei->type != old_type)
1448 + continue;
1449 + /* totally covered? */
1450 + if (ei->addr >= start && ei->size <= size) {
1451 + ei->type = new_type;
1452 + continue;
1453 + }
1454 + /* partially covered */
1455 + final_start = max(start, ei->addr);
1456 + final_end = min(start + size, ei->addr + ei->size);
1457 + if (final_start >= final_end)
1458 + continue;
1459 + add_memory_region(final_start, final_end - final_start,
1460 + new_type);
1461 + }
1462 +}
1463 +
1464 +void __init update_e820(void)
1465 +{
1466 + u8 nr_map;
1467 +
1468 + nr_map = e820.nr_map;
1469 + if (sanitize_e820_map(e820.map, &nr_map))
1470 + return;
1471 + e820.nr_map = nr_map;
1472 + printk(KERN_INFO "modified physical RAM map:\n");
1473 + print_memory_map("modified");
1474 +}
1475 +#endif
1476 --- sle11-2009-05-14.orig/arch/x86/kernel/e820_64-xen.c 2009-02-16 16:18:36.000000000 +0100
1477 +++ sle11-2009-05-14/arch/x86/kernel/e820_64-xen.c 2009-03-16 16:33:40.000000000 +0100
1478 @@ -1,4 +1,4 @@
1479 -/*
1480 +/*
1481 * Handle the memory map.
1482 * The functions here do the job until bootmem takes over.
1483 *
1484 @@ -26,6 +26,7 @@
1485 #include <asm/proto.h>
1486 #include <asm/setup.h>
1487 #include <asm/sections.h>
1488 +#include <asm/kdebug.h>
1489 #include <xen/interface/memory.h>
1490
1491 struct e820map e820 __initdata;
1492 @@ -33,96 +34,103 @@ struct e820map e820 __initdata;
1493 struct e820map machine_e820;
1494 #endif
1495
1496 -/*
1497 +/*
1498 * PFN of last memory page.
1499 */
1500 -unsigned long end_pfn;
1501 -EXPORT_SYMBOL(end_pfn);
1502 +unsigned long end_pfn;
1503
1504 -/*
1505 +/*
1506 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
1507 * The direct mapping extends to end_pfn_map, so that we can directly access
1508 * apertures, ACPI and other tables without having to play with fixmaps.
1509 - */
1510 -unsigned long end_pfn_map;
1511 + */
1512 +unsigned long end_pfn_map;
1513
1514 -/*
1515 +/*
1516 * Last pfn which the user wants to use.
1517 */
1518 static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
1519
1520 -extern struct resource code_resource, data_resource, bss_resource;
1521 -
1522 -/* Check for some hardcoded bad areas that early boot is not allowed to touch */
1523 -static inline int bad_addr(unsigned long *addrp, unsigned long size)
1524 -{
1525 - unsigned long addr = *addrp, last = addr + size;
1526 +/*
1527 + * Early reserved memory areas.
1528 + */
1529 +#define MAX_EARLY_RES 20
1530
1531 +struct early_res {
1532 + unsigned long start, end;
1533 + char name[16];
1534 +};
1535 +static struct early_res early_res[MAX_EARLY_RES] __initdata = {
1536 #ifndef CONFIG_XEN
1537 - /* various gunk below that needed for SMP startup */
1538 - if (addr < 0x8000) {
1539 - *addrp = PAGE_ALIGN(0x8000);
1540 - return 1;
1541 - }
1542 -
1543 - /* direct mapping tables of the kernel */
1544 - if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
1545 - *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
1546 - return 1;
1547 - }
1548 -
1549 - /* initrd */
1550 -#ifdef CONFIG_BLK_DEV_INITRD
1551 - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
1552 - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
1553 - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
1554 - unsigned long ramdisk_end = ramdisk_image+ramdisk_size;
1555 -
1556 - if (last >= ramdisk_image && addr < ramdisk_end) {
1557 - *addrp = PAGE_ALIGN(ramdisk_end);
1558 - return 1;
1559 - }
1560 - }
1561 + { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
1562 +#ifdef CONFIG_SMP
1563 + { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" },
1564 #endif
1565 - /* kernel code */
1566 - if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
1567 - *addrp = PAGE_ALIGN(__pa_symbol(&_end));
1568 - return 1;
1569 - }
1570 +#endif
1571 + {}
1572 +};
1573
1574 - if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
1575 - *addrp = PAGE_ALIGN(ebda_addr + ebda_size);
1576 - return 1;
1577 +void __init reserve_early(unsigned long start, unsigned long end, char *name)
1578 +{
1579 + int i;
1580 + struct early_res *r;
1581 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1582 + r = &early_res[i];
1583 + if (end > r->start && start < r->end)
1584 + panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
1585 + start, end - 1, name?name:"", r->start, r->end - 1, r->name);
1586 }
1587 + if (i >= MAX_EARLY_RES)
1588 + panic("Too many early reservations");
1589 + r = &early_res[i];
1590 + r->start = start;
1591 + r->end = end;
1592 + if (name)
1593 + strncpy(r->name, name, sizeof(r->name) - 1);
1594 +}
1595
1596 -#ifdef CONFIG_NUMA
1597 - /* NUMA memory to node map */
1598 - if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
1599 - *addrp = nodemap_addr + nodemap_size;
1600 - return 1;
1601 +void __init early_res_to_bootmem(void)
1602 +{
1603 + int i;
1604 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1605 + struct early_res *r = &early_res[i];
1606 + printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i,
1607 + r->start, r->end - 1, r->name);
1608 + reserve_bootmem_generic(r->start, r->end - r->start);
1609 }
1610 -#endif
1611 - /* XXX ramdisk image here? */
1612 -#else
1613 - if (last < (table_end<<PAGE_SHIFT)) {
1614 - *addrp = table_end << PAGE_SHIFT;
1615 - return 1;
1616 +}
1617 +
1618 +/* Check for already reserved areas */
1619 +static inline int bad_addr(unsigned long *addrp, unsigned long size)
1620 +{
1621 + int i;
1622 + unsigned long addr = *addrp, last;
1623 + int changed = 0;
1624 +again:
1625 + last = addr + size;
1626 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1627 + struct early_res *r = &early_res[i];
1628 + if (last >= r->start && addr < r->end) {
1629 + *addrp = addr = r->end;
1630 + changed = 1;
1631 + goto again;
1632 + }
1633 }
1634 -#endif
1635 - return 0;
1636 -}
1637 + return changed;
1638 +}
1639
1640 /*
1641 * This function checks if any part of the range <start,end> is mapped
1642 * with type.
1643 */
1644 -int e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
1645 -{
1646 +int
1647 +e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
1648 +{
1649 int i;
1650
1651 #ifndef CONFIG_XEN
1652 - for (i = 0; i < e820.nr_map; i++) {
1653 - struct e820entry *ei = &e820.map[i];
1654 + for (i = 0; i < e820.nr_map; i++) {
1655 + struct e820entry *ei = &e820.map[i];
1656 #else
1657 if (!is_initial_xendomain())
1658 return 0;
1659 @@ -130,12 +138,12 @@ int e820_any_mapped(unsigned long start,
1660 const struct e820entry *ei = &machine_e820.map[i];
1661 #endif
1662
1663 - if (type && ei->type != type)
1664 + if (type && ei->type != type)
1665 continue;
1666 if (ei->addr >= end || ei->addr + ei->size <= start)
1667 - continue;
1668 - return 1;
1669 - }
1670 + continue;
1671 + return 1;
1672 + }
1673 return 0;
1674 }
1675 EXPORT_SYMBOL_GPL(e820_any_mapped);
1676 @@ -146,7 +154,8 @@ EXPORT_SYMBOL_GPL(e820_any_mapped);
1677 * Note: this function only works correct if the e820 table is sorted and
1678 * not-overlapping, which is the case
1679 */
1680 -int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
1681 +int __init e820_all_mapped(unsigned long start, unsigned long end,
1682 + unsigned type)
1683 {
1684 int i;
1685
1686 @@ -171,65 +180,77 @@ int __init e820_all_mapped(unsigned long
1687 */
1688 if (ei->addr <= start)
1689 start = ei->addr + ei->size;
1690 - /* if start is now at or beyond end, we're done, full coverage */
1691 + /*
1692 + * if start is now at or beyond end, we're done, full
1693 + * coverage
1694 + */
1695 if (start >= end)
1696 - return 1; /* we're done */
1697 + return 1;
1698 }
1699 return 0;
1700 }
1701
1702 -/*
1703 - * Find a free area in a specific range.
1704 - */
1705 -unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
1706 -{
1707 - int i;
1708 - for (i = 0; i < e820.nr_map; i++) {
1709 - struct e820entry *ei = &e820.map[i];
1710 - unsigned long addr = ei->addr, last;
1711 - if (ei->type != E820_RAM)
1712 - continue;
1713 - if (addr < start)
1714 +/*
1715 + * Find a free area with specified alignment in a specific range.
1716 + */
1717 +unsigned long __init find_e820_area(unsigned long start, unsigned long end,
1718 + unsigned size, unsigned long align)
1719 +{
1720 + int i;
1721 + unsigned long mask = ~(align - 1);
1722 +
1723 + for (i = 0; i < e820.nr_map; i++) {
1724 + struct e820entry *ei = &e820.map[i];
1725 + unsigned long addr = ei->addr, last;
1726 +
1727 + if (ei->type != E820_RAM)
1728 + continue;
1729 + if (addr < start)
1730 addr = start;
1731 - if (addr > ei->addr + ei->size)
1732 - continue;
1733 + if (addr > ei->addr + ei->size)
1734 + continue;
1735 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
1736 ;
1737 - last = PAGE_ALIGN(addr) + size;
1738 + addr = (addr + align - 1) & mask;
1739 + last = addr + size;
1740 if (last > ei->addr + ei->size)
1741 continue;
1742 - if (last > end)
1743 + if (last > end)
1744 continue;
1745 - return addr;
1746 - }
1747 - return -1UL;
1748 -}
1749 + return addr;
1750 + }
1751 + return -1UL;
1752 +}
1753
1754 /*
1755 * Find the highest page frame number we have available
1756 */
1757 unsigned long __init e820_end_of_ram(void)
1758 {
1759 - unsigned long end_pfn = 0;
1760 + unsigned long end_pfn;
1761 +
1762 end_pfn = find_max_pfn_with_active_regions();
1763 -
1764 - if (end_pfn > end_pfn_map)
1765 +
1766 + if (end_pfn > end_pfn_map)
1767 end_pfn_map = end_pfn;
1768 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
1769 end_pfn_map = MAXMEM>>PAGE_SHIFT;
1770 if (end_pfn > end_user_pfn)
1771 end_pfn = end_user_pfn;
1772 - if (end_pfn > end_pfn_map)
1773 - end_pfn = end_pfn_map;
1774 + if (end_pfn > end_pfn_map)
1775 + end_pfn = end_pfn_map;
1776
1777 - printk("end_pfn_map = %lu\n", end_pfn_map);
1778 - return end_pfn;
1779 + printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map);
1780 + return end_pfn;
1781 }
1782
1783 /*
1784 * Mark e820 reserved areas as busy for the resource manager.
1785 */
1786 -void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
1787 +void __init e820_reserve_resources(struct e820entry *e820, int nr_map,
1788 + struct resource *code_resource,
1789 + struct resource *data_resource,
1790 + struct resource *bss_resource)
1791 {
1792 int i;
1793 for (i = 0; i < nr_map; i++) {
1794 @@ -247,14 +268,14 @@ void __init e820_reserve_resources(struc
1795 request_resource(&iomem_resource, res);
1796 if (e820[i].type == E820_RAM) {
1797 /*
1798 - * We don't know which RAM region contains kernel data,
1799 - * so we try it repeatedly and let the resource manager
1800 - * test it.
1801 + * We don't know which RAM region contains kernel data,
1802 + * so we try it repeatedly and let the resource manager
1803 + * test it.
1804 */
1805 #ifndef CONFIG_XEN
1806 - request_resource(res, &code_resource);
1807 - request_resource(res, &data_resource);
1808 - request_resource(res, &bss_resource);
1809 + request_resource(res, code_resource);
1810 + request_resource(res, data_resource);
1811 + request_resource(res, bss_resource);
1812 #endif
1813 #ifdef CONFIG_KEXEC
1814 if (crashk_res.start != crashk_res.end)
1815 @@ -357,9 +378,9 @@ e820_register_active_regions(int nid, un
1816 add_active_range(nid, ei_startpfn, ei_endpfn);
1817 }
1818
1819 -/*
1820 +/*
1821 * Add a memory region to the kernel e820 map.
1822 - */
1823 + */
1824 void __init add_memory_region(unsigned long start, unsigned long size, int type)
1825 {
1826 int x = e820.nr_map;
1827 @@ -384,9 +405,7 @@ unsigned long __init e820_hole_size(unsi
1828 {
1829 unsigned long start_pfn = start >> PAGE_SHIFT;
1830 unsigned long end_pfn = end >> PAGE_SHIFT;
1831 - unsigned long ei_startpfn;
1832 - unsigned long ei_endpfn;
1833 - unsigned long ram = 0;
1834 + unsigned long ei_startpfn, ei_endpfn, ram = 0;
1835 int i;
1836
1837 for (i = 0; i < e820.nr_map; i++) {
1838 @@ -398,28 +417,31 @@ unsigned long __init e820_hole_size(unsi
1839 return end - start - (ram << PAGE_SHIFT);
1840 }
1841
1842 -void __init e820_print_map(char *who)
1843 +static void __init e820_print_map(char *who)
1844 {
1845 int i;
1846
1847 for (i = 0; i < e820.nr_map; i++) {
1848 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
1849 - (unsigned long long) e820.map[i].addr,
1850 - (unsigned long long) (e820.map[i].addr + e820.map[i].size));
1851 + (unsigned long long) e820.map[i].addr,
1852 + (unsigned long long)
1853 + (e820.map[i].addr + e820.map[i].size));
1854 switch (e820.map[i].type) {
1855 - case E820_RAM: printk("(usable)\n");
1856 - break;
1857 + case E820_RAM:
1858 + printk(KERN_CONT "(usable)\n");
1859 + break;
1860 case E820_RESERVED:
1861 - printk("(reserved)\n");
1862 - break;
1863 + printk(KERN_CONT "(reserved)\n");
1864 + break;
1865 case E820_ACPI:
1866 - printk("(ACPI data)\n");
1867 - break;
1868 + printk(KERN_CONT "(ACPI data)\n");
1869 + break;
1870 case E820_NVS:
1871 - printk("(ACPI NVS)\n");
1872 - break;
1873 - default: printk("type %u\n", e820.map[i].type);
1874 - break;
1875 + printk(KERN_CONT "(ACPI NVS)\n");
1876 + break;
1877 + default:
1878 + printk(KERN_CONT "type %u\n", e820.map[i].type);
1879 + break;
1880 }
1881 }
1882 }
1883 @@ -427,11 +449,11 @@ void __init e820_print_map(char *who)
1884 /*
1885 * Sanitize the BIOS e820 map.
1886 *
1887 - * Some e820 responses include overlapping entries. The following
1888 + * Some e820 responses include overlapping entries. The following
1889 * replaces the original e820 map with a new one, removing overlaps.
1890 *
1891 */
1892 -static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
1893 +static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
1894 {
1895 struct change_member {
1896 struct e820entry *pbios; /* pointer to original bios entry */
1897 @@ -451,7 +473,8 @@ static int __init sanitize_e820_map(stru
1898 int i;
1899
1900 /*
1901 - Visually we're performing the following (1,2,3,4 = memory types)...
1902 + Visually we're performing the following
1903 + (1,2,3,4 = memory types)...
1904
1905 Sample memory map (w/overlaps):
1906 ____22__________________
1907 @@ -493,22 +516,23 @@ static int __init sanitize_e820_map(stru
1908 old_nr = *pnr_map;
1909
1910 /* bail out if we find any unreasonable addresses in bios map */
1911 - for (i=0; i<old_nr; i++)
1912 + for (i = 0; i < old_nr; i++)
1913 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
1914 return -1;
1915
1916 /* create pointers for initial change-point information (for sorting) */
1917 - for (i=0; i < 2*old_nr; i++)
1918 + for (i = 0; i < 2 * old_nr; i++)
1919 change_point[i] = &change_point_list[i];
1920
1921 /* record all known change-points (starting and ending addresses),
1922 omitting those that are for empty memory regions */
1923 chgidx = 0;
1924 - for (i=0; i < old_nr; i++) {
1925 + for (i = 0; i < old_nr; i++) {
1926 if (biosmap[i].size != 0) {
1927 change_point[chgidx]->addr = biosmap[i].addr;
1928 change_point[chgidx++]->pbios = &biosmap[i];
1929 - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
1930 + change_point[chgidx]->addr = biosmap[i].addr +
1931 + biosmap[i].size;
1932 change_point[chgidx++]->pbios = &biosmap[i];
1933 }
1934 }
1935 @@ -518,75 +542,106 @@ static int __init sanitize_e820_map(stru
1936 still_changing = 1;
1937 while (still_changing) {
1938 still_changing = 0;
1939 - for (i=1; i < chg_nr; i++) {
1940 - /* if <current_addr> > <last_addr>, swap */
1941 - /* or, if current=<start_addr> & last=<end_addr>, swap */
1942 - if ((change_point[i]->addr < change_point[i-1]->addr) ||
1943 - ((change_point[i]->addr == change_point[i-1]->addr) &&
1944 - (change_point[i]->addr == change_point[i]->pbios->addr) &&
1945 - (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
1946 - )
1947 - {
1948 + for (i = 1; i < chg_nr; i++) {
1949 + unsigned long long curaddr, lastaddr;
1950 + unsigned long long curpbaddr, lastpbaddr;
1951 +
1952 + curaddr = change_point[i]->addr;
1953 + lastaddr = change_point[i - 1]->addr;
1954 + curpbaddr = change_point[i]->pbios->addr;
1955 + lastpbaddr = change_point[i - 1]->pbios->addr;
1956 +
1957 + /*
1958 + * swap entries, when:
1959 + *
1960 + * curaddr > lastaddr or
1961 + * curaddr == lastaddr and curaddr == curpbaddr and
1962 + * lastaddr != lastpbaddr
1963 + */
1964 + if (curaddr < lastaddr ||
1965 + (curaddr == lastaddr && curaddr == curpbaddr &&
1966 + lastaddr != lastpbaddr)) {
1967 change_tmp = change_point[i];
1968 change_point[i] = change_point[i-1];
1969 change_point[i-1] = change_tmp;
1970 - still_changing=1;
1971 + still_changing = 1;
1972 }
1973 }
1974 }
1975
1976 /* create a new bios memory map, removing overlaps */
1977 - overlap_entries=0; /* number of entries in the overlap table */
1978 - new_bios_entry=0; /* index for creating new bios map entries */
1979 + overlap_entries = 0; /* number of entries in the overlap table */
1980 + new_bios_entry = 0; /* index for creating new bios map entries */
1981 last_type = 0; /* start with undefined memory type */
1982 last_addr = 0; /* start with 0 as last starting address */
1983 +
1984 /* loop through change-points, determining affect on the new bios map */
1985 - for (chgidx=0; chgidx < chg_nr; chgidx++)
1986 - {
1987 + for (chgidx = 0; chgidx < chg_nr; chgidx++) {
1988 /* keep track of all overlapping bios entries */
1989 - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
1990 - {
1991 - /* add map entry to overlap list (> 1 entry implies an overlap) */
1992 - overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
1993 - }
1994 - else
1995 - {
1996 - /* remove entry from list (order independent, so swap with last) */
1997 - for (i=0; i<overlap_entries; i++)
1998 - {
1999 - if (overlap_list[i] == change_point[chgidx]->pbios)
2000 - overlap_list[i] = overlap_list[overlap_entries-1];
2001 + if (change_point[chgidx]->addr ==
2002 + change_point[chgidx]->pbios->addr) {
2003 + /*
2004 + * add map entry to overlap list (> 1 entry
2005 + * implies an overlap)
2006 + */
2007 + overlap_list[overlap_entries++] =
2008 + change_point[chgidx]->pbios;
2009 + } else {
2010 + /*
2011 + * remove entry from list (order independent,
2012 + * so swap with last)
2013 + */
2014 + for (i = 0; i < overlap_entries; i++) {
2015 + if (overlap_list[i] ==
2016 + change_point[chgidx]->pbios)
2017 + overlap_list[i] =
2018 + overlap_list[overlap_entries-1];
2019 }
2020 overlap_entries--;
2021 }
2022 - /* if there are overlapping entries, decide which "type" to use */
2023 - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
2024 + /*
2025 + * if there are overlapping entries, decide which
2026 + * "type" to use (larger value takes precedence --
2027 + * 1=usable, 2,3,4,4+=unusable)
2028 + */
2029 current_type = 0;
2030 - for (i=0; i<overlap_entries; i++)
2031 + for (i = 0; i < overlap_entries; i++)
2032 if (overlap_list[i]->type > current_type)
2033 current_type = overlap_list[i]->type;
2034 - /* continue building up new bios map based on this information */
2035 + /*
2036 + * continue building up new bios map based on this
2037 + * information
2038 + */
2039 if (current_type != last_type) {
2040 if (last_type != 0) {
2041 new_bios[new_bios_entry].size =
2042 change_point[chgidx]->addr - last_addr;
2043 - /* move forward only if the new size was non-zero */
2044 + /*
2045 + * move forward only if the new size
2046 + * was non-zero
2047 + */
2048 if (new_bios[new_bios_entry].size != 0)
2049 + /*
2050 + * no more space left for new
2051 + * bios entries ?
2052 + */
2053 if (++new_bios_entry >= E820MAX)
2054 - break; /* no more space left for new bios entries */
2055 + break;
2056 }
2057 if (current_type != 0) {
2058 - new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
2059 + new_bios[new_bios_entry].addr =
2060 + change_point[chgidx]->addr;
2061 new_bios[new_bios_entry].type = current_type;
2062 - last_addr=change_point[chgidx]->addr;
2063 + last_addr = change_point[chgidx]->addr;
2064 }
2065 last_type = current_type;
2066 }
2067 }
2068 - new_nr = new_bios_entry; /* retain count for new bios entries */
2069 + /* retain count for new bios entries */
2070 + new_nr = new_bios_entry;
2071
2072 /* copy new bios mapping into original location */
2073 - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
2074 + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
2075 *pnr_map = new_nr;
2076
2077 return 0;
2078 @@ -601,7 +656,7 @@ static int __init sanitize_e820_map(stru
2079 * will have given us a memory map that we can use to properly
2080 * set up memory. If we aren't, we'll fake a memory map.
2081 */
2082 -static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
2083 +static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
2084 {
2085 #ifndef CONFIG_XEN
2086 /* Only one memory region (or negative)? Ignore it */
2087 @@ -622,7 +677,7 @@ static int __init copy_e820_map(struct e
2088 return -1;
2089
2090 add_memory_region(start, size, type);
2091 - } while (biosmap++,--nr_map);
2092 + } while (biosmap++, --nr_map);
2093
2094 #ifdef CONFIG_XEN
2095 if (is_initial_xendomain()) {
2096 @@ -641,15 +696,17 @@ static int __init copy_e820_map(struct e
2097 return 0;
2098 }
2099
2100 -void early_panic(char *msg)
2101 +static void early_panic(char *msg)
2102 {
2103 early_printk(msg);
2104 panic(msg);
2105 }
2106
2107 -#ifndef CONFIG_XEN
2108 -void __init setup_memory_region(void)
2109 +/* We're not void only for x86 32-bit compat */
2110 +char * __init machine_specific_memory_setup(void)
2111 {
2112 +#ifndef CONFIG_XEN
2113 + char *who = "BIOS-e820";
2114 /*
2115 * Try to copy the BIOS-supplied E820-map.
2116 *
2117 @@ -659,14 +716,8 @@ void __init setup_memory_region(void)
2118 sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
2119 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
2120 early_panic("Cannot find a valid memory map");
2121 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
2122 - e820_print_map("BIOS-e820");
2123 -}
2124 -
2125 #else /* CONFIG_XEN */
2126 -
2127 -void __init setup_memory_region(void)
2128 -{
2129 + char *who = "Xen";
2130 int rc;
2131 struct xen_memory_map memmap;
2132 /*
2133 @@ -694,11 +745,13 @@ void __init setup_memory_region(void)
2134
2135 if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
2136 early_panic("Cannot find a valid memory map");
2137 -
2138 +#endif
2139 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
2140 - e820_print_map("Xen");
2141 + e820_print_map(who);
2142 +
2143 + /* In case someone cares... */
2144 + return who;
2145 }
2146 -#endif
2147
2148 static int __init parse_memopt(char *p)
2149 {
2150 @@ -709,7 +762,7 @@ static int __init parse_memopt(char *p)
2151 if (!p)
2152 return -EINVAL;
2153 end_user_pfn = memparse(p, &p);
2154 - end_user_pfn >>= PAGE_SHIFT;
2155 + end_user_pfn >>= PAGE_SHIFT;
2156
2157 end = end_user_pfn<<PAGE_SHIFT;
2158 i = e820.nr_map-1;
2159 @@ -727,7 +780,7 @@ static int __init parse_memopt(char *p)
2160 }
2161
2162 return 0;
2163 -}
2164 +}
2165 early_param("mem", parse_memopt);
2166
2167 static int userdef __initdata;
2168 @@ -739,9 +792,9 @@ static int __init parse_memmap_opt(char
2169
2170 if (!strcmp(p, "exactmap")) {
2171 #ifdef CONFIG_CRASH_DUMP
2172 - /* If we are doing a crash dump, we
2173 - * still need to know the real mem
2174 - * size before original memory map is
2175 + /*
2176 + * If we are doing a crash dump, we still need to know
2177 + * the real mem size before original memory map is
2178 * reset.
2179 */
2180 e820_register_active_regions(0, 0, -1UL);
2181 @@ -758,6 +811,8 @@ static int __init parse_memmap_opt(char
2182 mem_size = memparse(p, &p);
2183 if (p == oldp)
2184 return -EINVAL;
2185 +
2186 + userdef = 1;
2187 if (*p == '@') {
2188 start_at = memparse(p+1, &p);
2189 add_memory_region(start_at, mem_size, E820_RAM);
2190 @@ -777,11 +832,58 @@ early_param("memmap", parse_memmap_opt);
2191 void __init finish_e820_parsing(void)
2192 {
2193 if (userdef) {
2194 + char nr = e820.nr_map;
2195 +
2196 + if (sanitize_e820_map(e820.map, &nr) < 0)
2197 + early_panic("Invalid user supplied memory map");
2198 + e820.nr_map = nr;
2199 +
2200 printk(KERN_INFO "user-defined physical RAM map:\n");
2201 e820_print_map("user");
2202 }
2203 }
2204
2205 +#ifndef CONFIG_XEN
2206 +void __init update_memory_range(u64 start, u64 size, unsigned old_type,
2207 + unsigned new_type)
2208 +{
2209 + int i;
2210 +
2211 + BUG_ON(old_type == new_type);
2212 +
2213 + for (i = 0; i < e820.nr_map; i++) {
2214 + struct e820entry *ei = &e820.map[i];
2215 + u64 final_start, final_end;
2216 + if (ei->type != old_type)
2217 + continue;
2218 + /* totally covered? */
2219 + if (ei->addr >= start && ei->size <= size) {
2220 + ei->type = new_type;
2221 + continue;
2222 + }
2223 + /* partially covered */
2224 + final_start = max(start, ei->addr);
2225 + final_end = min(start + size, ei->addr + ei->size);
2226 + if (final_start >= final_end)
2227 + continue;
2228 + add_memory_region(final_start, final_end - final_start,
2229 + new_type);
2230 + }
2231 +}
2232 +
2233 +void __init update_e820(void)
2234 +{
2235 + u8 nr_map;
2236 +
2237 + nr_map = e820.nr_map;
2238 + if (sanitize_e820_map(e820.map, &nr_map))
2239 + return;
2240 + e820.nr_map = nr_map;
2241 + printk(KERN_INFO "modified physical RAM map:\n");
2242 + e820_print_map("modified");
2243 +}
2244 +#endif
2245 +
2246 unsigned long pci_mem_start = 0xaeedbabe;
2247 EXPORT_SYMBOL(pci_mem_start);
2248
2249 @@ -825,8 +927,10 @@ __init void e820_setup_gap(struct e820en
2250
2251 if (!found) {
2252 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
2253 - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
2254 - KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
2255 + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
2256 + "address range\n"
2257 + KERN_ERR "PCI: Unassigned devices with 32bit resource "
2258 + "registers may break!\n");
2259 }
2260
2261 /*
2262 @@ -839,8 +943,9 @@ __init void e820_setup_gap(struct e820en
2263 /* Fun with two's complement */
2264 pci_mem_start = (gapstart + round) & -round;
2265
2266 - printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2267 - pci_mem_start, gapstart, gapsize);
2268 + printk(KERN_INFO
2269 + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2270 + pci_mem_start, gapstart, gapsize);
2271 }
2272
2273 int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
2274 --- sle11-2009-05-14.orig/arch/x86/kernel/early_printk-xen.c 2009-02-16 16:18:36.000000000 +0100
2275 +++ sle11-2009-05-14/arch/x86/kernel/early_printk-xen.c 2009-03-16 16:33:40.000000000 +0100
2276 @@ -222,7 +222,7 @@ static struct console simnow_console = {
2277 };
2278
2279 /* Direct interface for emergencies */
2280 -struct console *early_console = &early_vga_console;
2281 +static struct console *early_console = &early_vga_console;
2282 static int early_console_initialized = 0;
2283
2284 void early_printk(const char *fmt, ...)
2285 --- sle11-2009-05-14.orig/arch/x86/kernel/entry_32-xen.S 2009-05-14 11:18:18.000000000 +0200
2286 +++ sle11-2009-05-14/arch/x86/kernel/entry_32-xen.S 2009-05-14 11:18:32.000000000 +0200
2287 @@ -59,7 +59,7 @@
2288 * for paravirtualization. The following will never clobber any registers:
2289 * INTERRUPT_RETURN (aka. "iret")
2290 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
2291 - * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
2292 + * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
2293 *
2294 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
2295 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
2296 @@ -282,16 +282,21 @@ END(resume_kernel)
2297 #endif
2298 CFI_ENDPROC
2299
2300 + .macro test_tif ti_reg # system call tracing in operation / emulation
2301 + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2302 + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
2303 + .endm
2304 +
2305 /* SYSENTER_RETURN points to after the "sysenter" instruction in
2306 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
2307
2308 # sysenter call handler stub
2309 -ENTRY(sysenter_entry)
2310 +ENTRY(ia32_sysenter_target)
2311 CFI_STARTPROC simple
2312 CFI_SIGNAL_FRAME
2313 CFI_DEF_CFA esp, 0
2314 CFI_REGISTER esp, ebp
2315 - movl SYSENTER_stack_esp0(%esp),%esp
2316 + movl SYSENTER_stack_sp0(%esp),%esp
2317 sysenter_past_esp:
2318 /*
2319 * No need to follow this irqs on/off section: the syscall
2320 @@ -334,9 +339,7 @@ sysenter_past_esp:
2321 CFI_ADJUST_CFA_OFFSET 4
2322 SAVE_ALL
2323 GET_THREAD_INFO(%ebp)
2324 -
2325 - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2326 - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2327 + test_tif %ebp
2328 jnz syscall_trace_entry
2329 cmpl $(nr_syscalls), %eax
2330 jae syscall_badsys
2331 @@ -354,7 +357,7 @@ sysenter_past_esp:
2332 xorl %ebp,%ebp
2333 TRACE_IRQS_ON
2334 1: mov PT_FS(%esp), %fs
2335 - ENABLE_INTERRUPTS_SYSEXIT
2336 + ENABLE_INTERRUPTS_SYSCALL_RET
2337 CFI_ENDPROC
2338 .pushsection .fixup,"ax"
2339 2: movl $0,PT_FS(%esp)
2340 @@ -363,10 +366,10 @@ sysenter_past_esp:
2341 .align 4
2342 .long 1b,2b
2343 .popsection
2344 -ENDPROC(sysenter_entry)
2345 +ENDPROC(ia32_sysenter_target)
2346
2347 # pv sysenter call handler stub
2348 -ENTRY(sysenter_entry_pv)
2349 +ENTRY(ia32pv_sysenter_target)
2350 RING0_INT_FRAME
2351 movl $__USER_DS,16(%esp)
2352 movl %ebp,12(%esp)
2353 @@ -389,7 +392,7 @@ ENTRY(sysenter_entry_pv)
2354 .previous
2355 /* fall through */
2356 CFI_ENDPROC
2357 -ENDPROC(sysenter_entry_pv)
2358 +ENDPROC(ia32pv_sysenter_target)
2359
2360 # system call handler stub
2361 ENTRY(system_call)
2362 @@ -398,9 +401,7 @@ ENTRY(system_call)
2363 CFI_ADJUST_CFA_OFFSET 4
2364 SAVE_ALL
2365 GET_THREAD_INFO(%ebp)
2366 - # system call tracing in operation / emulation
2367 - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2368 - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2369 + test_tif %ebp
2370 jnz syscall_trace_entry
2371 cmpl $(nr_syscalls), %eax
2372 jae syscall_badsys
2373 @@ -452,7 +453,8 @@ restore_nocheck_notrace:
2374 RESTORE_REGS
2375 addl $4, %esp # skip orig_eax/error_code
2376 CFI_ADJUST_CFA_OFFSET -4
2377 -1: INTERRUPT_RETURN
2378 +irq_return:
2379 + INTERRUPT_RETURN
2380 .section .fixup,"ax"
2381 iret_exc:
2382 pushl $0 # no error code
2383 @@ -461,7 +463,7 @@ iret_exc:
2384 .previous
2385 .section __ex_table,"a"
2386 .align 4
2387 - .long 1b,iret_exc
2388 + .long irq_return,iret_exc
2389 .previous
2390
2391 CFI_RESTORE_STATE
2392 @@ -657,7 +659,7 @@ END(syscall_badsys)
2393 * Build the entry stubs and pointer table with
2394 * some assembler magic.
2395 */
2396 -.data
2397 +.section .rodata,"a"
2398 ENTRY(interrupt)
2399 .text
2400
2401 @@ -963,7 +965,7 @@ END(device_not_available)
2402 * that sets up the real kernel stack. Check here, since we can't
2403 * allow the wrong stack to be used.
2404 *
2405 - * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
2406 + * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have
2407 * already pushed 3 words if it hits on the sysenter instruction:
2408 * eflags, cs and eip.
2409 *
2410 @@ -975,7 +977,7 @@ END(device_not_available)
2411 cmpw $__KERNEL_CS,4(%esp); \
2412 jne ok; \
2413 label: \
2414 - movl SYSENTER_stack_esp0+offset(%esp),%esp; \
2415 + movl SYSENTER_stack_sp0+offset(%esp),%esp; \
2416 CFI_DEF_CFA esp, 0; \
2417 CFI_UNDEFINED eip; \
2418 pushfl; \
2419 @@ -990,7 +992,7 @@ label: \
2420 KPROBE_ENTRY(debug)
2421 RING0_INT_FRAME
2422 #ifndef CONFIG_XEN
2423 - cmpl $sysenter_entry,(%esp)
2424 + cmpl $ia32_sysenter_target,(%esp)
2425 jne debug_stack_correct
2426 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
2427 debug_stack_correct:
2428 @@ -1023,7 +1025,7 @@ KPROBE_ENTRY(nmi)
2429 popl %eax
2430 CFI_ADJUST_CFA_OFFSET -4
2431 je nmi_espfix_stack
2432 - cmpl $sysenter_entry,(%esp)
2433 + cmpl $ia32_sysenter_target,(%esp)
2434 je nmi_stack_fixup
2435 pushl %eax
2436 CFI_ADJUST_CFA_OFFSET 4
2437 @@ -1036,7 +1038,7 @@ KPROBE_ENTRY(nmi)
2438 popl %eax
2439 CFI_ADJUST_CFA_OFFSET -4
2440 jae nmi_stack_correct
2441 - cmpl $sysenter_entry,12(%esp)
2442 + cmpl $ia32_sysenter_target,12(%esp)
2443 je nmi_debug_stack_check
2444 nmi_stack_correct:
2445 /* We have a RING0_INT_FRAME here */
2446 @@ -1089,12 +1091,8 @@ nmi_espfix_stack:
2447 RESTORE_REGS
2448 lss 12+4(%esp), %esp # back to espfix stack
2449 CFI_ADJUST_CFA_OFFSET -24
2450 -1: INTERRUPT_RETURN
2451 + jmp irq_return
2452 CFI_ENDPROC
2453 -.section __ex_table,"a"
2454 - .align 4
2455 - .long 1b,iret_exc
2456 -.previous
2457 #else
2458 KPROBE_ENTRY(nmi)
2459 RING0_INT_FRAME
2460 @@ -1112,17 +1110,17 @@ KPROBE_END(nmi)
2461
2462 #ifdef CONFIG_PARAVIRT
2463 ENTRY(native_iret)
2464 -1: iret
2465 + iret
2466 .section __ex_table,"a"
2467 .align 4
2468 - .long 1b,iret_exc
2469 + .long native_iret, iret_exc
2470 .previous
2471 END(native_iret)
2472
2473 -ENTRY(native_irq_enable_sysexit)
2474 +ENTRY(native_irq_enable_syscall_ret)
2475 sti
2476 sysexit
2477 -END(native_irq_enable_sysexit)
2478 +END(native_irq_enable_syscall_ret)
2479 #endif
2480
2481 KPROBE_ENTRY(int3)
2482 @@ -1271,7 +1269,144 @@ ENTRY(kernel_thread_helper)
2483 CFI_ENDPROC
2484 ENDPROC(kernel_thread_helper)
2485
2486 +#include <asm/alternative-asm.h>
2487 +
2488 + # pv syscall call handler stub
2489 +ENTRY(ia32pv_cstar_target)
2490 + RING0_INT_FRAME
2491 + movl $__USER_DS,16(%esp)
2492 + movl %ebp,%ecx
2493 + movl $__USER_CS,4(%esp)
2494 + movl 12(%esp),%ebp
2495 + pushl %eax # save orig_eax
2496 + CFI_ADJUST_CFA_OFFSET 4
2497 +/*
2498 + * Load the potential sixth argument from user stack.
2499 + * Careful about security.
2500 + */
2501 + cmpl $__PAGE_OFFSET-4,%ebp
2502 + CFI_REMEMBER_STATE
2503 + ja cstar_fault
2504 +1: movl (%ebp),%ebp
2505 +.section __ex_table,"a"
2506 + .align 4
2507 + .long 1b,cstar_fault
2508 +.previous
2509 + SAVE_ALL
2510 + GET_THREAD_INFO(%ebp)
2511 + test_tif %ebp
2512 + jnz cstar_trace_entry
2513 + cmpl $nr_syscalls,%eax
2514 + jae cstar_badsys
2515 +.Lcstar_call:
2516 + btl %eax,cstar_special
2517 + jc .Lcstar_special
2518 + call *cstar_call_table(,%eax,4)
2519 + movl %eax,PT_EAX(%esp) # store the return value
2520 +.Lcstar_exit:
2521 + movl PT_ECX(%esp),%ecx
2522 + movl %ecx,PT_EBP(%esp) # put user EBP back in place
2523 + jmp syscall_exit
2524 +.Lcstar_special:
2525 + movl PT_ECX(%esp),%ecx
2526 + movl %ecx,PT_EBP(%esp) # put user EBP back in place
2527 + jmp syscall_call
2528 +cstar_set_tif:
2529 + movl $cstar_clear_tif,(%esp) # replace return address
2530 + LOCK_PREFIX
2531 + orl $_TIF_CSTAR,TI_flags(%ebp)
2532 + jmp *sys_call_table(,%eax,4)
2533 +cstar_clear_tif:
2534 + movl %eax,PT_EAX(%esp) # store the return value
2535 + LOCK_PREFIX
2536 + andl $~_TIF_CSTAR,TI_flags(%ebp)
2537 + jmp .Lcstar_exit
2538 +cstar_trace_entry:
2539 + movl $-ENOSYS,PT_EAX(%esp)
2540 + cmpl $nr_syscalls,%eax
2541 + jae 1f
2542 + btl %eax,cstar_special
2543 + jc .Lcstar_trace_special
2544 +1: movl %esp,%eax
2545 + xorl %edx,%edx
2546 + LOCK_PREFIX
2547 + orl $_TIF_CSTAR,TI_flags(%ebp)
2548 + call do_syscall_trace
2549 + LOCK_PREFIX
2550 + andl $~_TIF_CSTAR,TI_flags(%ebp)
2551 + testl %eax,%eax
2552 + jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU,
2553 + # so must skip actual syscall
2554 + movl PT_ORIG_EAX(%esp),%eax
2555 + cmpl $nr_syscalls,%eax
2556 + jb .Lcstar_call
2557 + jmp .Lcstar_exit
2558 +.Lcstar_trace_special:
2559 + movl PT_ECX(%esp),%ecx
2560 + movl %esp,%eax
2561 + xorl %edx,%edx
2562 + movl %ecx,PT_EBP(%esp) # put user EBP back in place
2563 + call do_syscall_trace
2564 + testl %eax,%eax
2565 + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
2566 + # so must skip actual syscall
2567 + movl PT_ORIG_EAX(%esp),%eax
2568 + cmpl $nr_syscalls,%eax
2569 + jb syscall_call
2570 + jmp syscall_exit
2571 +cstar_badsys:
2572 + movl $-ENOSYS,PT_EAX(%esp)
2573 +.Lcstar_resume:
2574 + movl PT_ECX(%esp),%ecx
2575 + movl %ecx,PT_EBP(%esp) # put user EBP back in place
2576 + jmp resume_userspace
2577 + CFI_RESTORE_STATE
2578 +cstar_fault:
2579 + movl $-EFAULT,%eax
2580 + SAVE_ALL
2581 + GET_THREAD_INFO(%ebp)
2582 + jmp .Lcstar_resume
2583 + CFI_ENDPROC
2584 +ENDPROC(ia32pv_cstar_target)
2585 +
2586 +ENTRY(cstar_ret_from_fork)
2587 + CFI_STARTPROC
2588 + movl PT_ECX(%esp),%ecx
2589 + GET_THREAD_INFO(%ebp)
2590 + movl %ecx,PT_EBP(%esp) # put user EBP back in place
2591 + LOCK_PREFIX
2592 + andl $~_TIF_CSTAR,TI_flags(%ebp)
2593 + jmp ret_from_fork
2594 + CFI_ENDPROC
2595 +END(ret_from_fork)
2596 +
2597 .section .rodata,"a"
2598 #include "syscall_table_32.S"
2599
2600 syscall_table_size=(.-sys_call_table)
2601 +
2602 +#include <asm/unistd.h>
2603 +cstar_special:
2604 +nr=0
2605 +mask=0
2606 +.rept nr_syscalls+31
2607 + .irp n, __NR_sigreturn, __NR_rt_sigreturn
2608 + .if nr == \n
2609 + mask = mask | (1 << (\n & 31))
2610 + .endif
2611 + .endr
2612 + nr = nr + 1
2613 + .if (nr & 31) == 0
2614 + .long mask
2615 + mask = 0
2616 + .endif
2617 +.endr
2618 +#define sys_call_table cstar_call_table
2619 +#define sys_fork cstar_set_tif
2620 +#define sys_clone cstar_set_tif
2621 +#define sys_vfork cstar_set_tif
2622 +#include "syscall_table_32.S"
2623 +#undef sys_call_table
2624 +#undef sys_fork
2625 +#undef sys_clone
2626 +#undef sys_vfork
2627 --- sle11-2009-05-14.orig/arch/x86/kernel/entry_64-xen.S 2009-02-16 16:18:36.000000000 +0100
2628 +++ sle11-2009-05-14/arch/x86/kernel/entry_64-xen.S 2009-03-16 16:33:40.000000000 +0100
2629 @@ -54,17 +54,22 @@
2630 #include <asm/page.h>
2631 #include <asm/irqflags.h>
2632 #include <asm/errno.h>
2633 -#include <xen/interface/arch-x86_64.h>
2634 +#include <xen/interface/xen.h>
2635 #include <xen/interface/features.h>
2636
2637 -#include "xen_entry_64.S"
2638 -
2639 .code64
2640
2641 #ifndef CONFIG_PREEMPT
2642 #define retint_kernel retint_restore_args
2643 #endif
2644
2645 +#ifdef CONFIG_PARAVIRT
2646 +ENTRY(native_irq_enable_syscall_ret)
2647 + movq %gs:pda_oldrsp,%rsp
2648 + swapgs
2649 + sysretq
2650 +#endif /* CONFIG_PARAVIRT */
2651 +
2652
2653 .macro TRACE_IRQS_IRETQ offset=ARGOFFSET
2654 #ifdef CONFIG_TRACE_IRQFLAGS
2655 @@ -277,7 +282,7 @@ ret_from_sys_call:
2656 sysret_check:
2657 LOCKDEP_SYS_EXIT
2658 GET_THREAD_INFO(%rcx)
2659 - XEN_BLOCK_EVENTS(%rsi)
2660 + DISABLE_INTERRUPTS(CLBR_NONE)
2661 TRACE_IRQS_OFF
2662 movl threadinfo_flags(%rcx),%edx
2663 andl %edi,%edx
2664 @@ -287,7 +292,7 @@ sysret_check:
2665 * sysretq will re-enable interrupts:
2666 */
2667 TRACE_IRQS_ON
2668 - XEN_UNBLOCK_EVENTS(%rsi)
2669 + ENABLE_INTERRUPTS(CLBR_NONE)
2670 RESTORE_ARGS 0,8,0
2671 HYPERVISOR_IRET VGCF_IN_SYSCALL
2672
2673 @@ -298,7 +303,7 @@ sysret_careful:
2674 bt $TIF_NEED_RESCHED,%edx
2675 jnc sysret_signal
2676 TRACE_IRQS_ON
2677 - XEN_UNBLOCK_EVENTS(%rsi)
2678 + ENABLE_INTERRUPTS(CLBR_NONE)
2679 pushq %rdi
2680 CFI_ADJUST_CFA_OFFSET 8
2681 call schedule
2682 @@ -309,9 +314,8 @@ sysret_careful:
2683 /* Handle a signal */
2684 sysret_signal:
2685 TRACE_IRQS_ON
2686 -/* sti */
2687 - XEN_UNBLOCK_EVENTS(%rsi)
2688 - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2689 + ENABLE_INTERRUPTS(CLBR_NONE)
2690 + testl $_TIF_DO_NOTIFY_MASK,%edx
2691 jz 1f
2692
2693 /* Really a signal */
2694 @@ -323,7 +327,7 @@ sysret_signal:
2695 1: movl $_TIF_NEED_RESCHED,%edi
2696 /* Use IRET because user could have changed frame. This
2697 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
2698 - XEN_BLOCK_EVENTS(%rsi)
2699 + DISABLE_INTERRUPTS(CLBR_NONE)
2700 TRACE_IRQS_OFF
2701 jmp int_with_check
2702
2703 @@ -355,7 +359,7 @@ tracesys:
2704 */
2705 .globl int_ret_from_sys_call
2706 int_ret_from_sys_call:
2707 - XEN_BLOCK_EVENTS(%rsi)
2708 + DISABLE_INTERRUPTS(CLBR_NONE)
2709 TRACE_IRQS_OFF
2710 testb $3,CS-ARGOFFSET(%rsp)
2711 jnz 1f
2712 @@ -381,22 +385,20 @@ int_careful:
2713 bt $TIF_NEED_RESCHED,%edx
2714 jnc int_very_careful
2715 TRACE_IRQS_ON
2716 -/* sti */
2717 - XEN_UNBLOCK_EVENTS(%rsi)
2718 + ENABLE_INTERRUPTS(CLBR_NONE)
2719 pushq %rdi
2720 CFI_ADJUST_CFA_OFFSET 8
2721 call schedule
2722 popq %rdi
2723 CFI_ADJUST_CFA_OFFSET -8
2724 - XEN_BLOCK_EVENTS(%rsi)
2725 + DISABLE_INTERRUPTS(CLBR_NONE)
2726 TRACE_IRQS_OFF
2727 jmp int_with_check
2728
2729 /* handle signals and tracing -- both require a full stack frame */
2730 int_very_careful:
2731 TRACE_IRQS_ON
2732 -/* sti */
2733 - XEN_UNBLOCK_EVENTS(%rsi)
2734 + ENABLE_INTERRUPTS(CLBR_NONE)
2735 SAVE_REST
2736 /* Check for syscall exit trace */
2737 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
2738 @@ -411,7 +413,7 @@ int_very_careful:
2739 jmp int_restore_rest
2740
2741 int_signal:
2742 - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2743 + testl $_TIF_DO_NOTIFY_MASK,%edx
2744 jz 1f
2745 movq %rsp,%rdi # &ptregs -> arg1
2746 xorl %esi,%esi # oldset -> arg2
2747 @@ -419,7 +421,7 @@ int_signal:
2748 1: movl $_TIF_NEED_RESCHED,%edi
2749 int_restore_rest:
2750 RESTORE_REST
2751 - XEN_BLOCK_EVENTS(%rsi)
2752 + DISABLE_INTERRUPTS(CLBR_NONE)
2753 TRACE_IRQS_OFF
2754 jmp int_with_check
2755 CFI_ENDPROC
2756 @@ -474,6 +476,7 @@ ENTRY(stub_execve)
2757 CFI_REGISTER rip, r11
2758 SAVE_REST
2759 FIXUP_TOP_OF_STACK %r11
2760 + movq %rsp, %rcx
2761 call sys_execve
2762 RESTORE_TOP_OF_STACK %r11
2763 movq %rax,RAX(%rsp)
2764 @@ -526,11 +529,10 @@ retint_check:
2765 retint_restore_args: /* return to kernel space */
2766 movl EFLAGS-REST_SKIP(%rsp), %eax
2767 shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
2768 - XEN_GET_VCPU_INFO(%rsi)
2769 + GET_VCPU_INFO
2770 andb evtchn_upcall_mask(%rsi),%al
2771 andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
2772 jnz restore_all_enable_events # != 0 => enable event delivery
2773 - XEN_PUT_VCPU_INFO(%rsi)
2774
2775 RESTORE_ARGS 0,8,0
2776 HYPERVISOR_IRET 0
2777 @@ -541,31 +543,29 @@ retint_careful:
2778 bt $TIF_NEED_RESCHED,%edx
2779 jnc retint_signal
2780 TRACE_IRQS_ON
2781 - XEN_UNBLOCK_EVENTS(%rsi)
2782 -/* sti */
2783 + ENABLE_INTERRUPTS(CLBR_NONE)
2784 pushq %rdi
2785 CFI_ADJUST_CFA_OFFSET 8
2786 call schedule
2787 popq %rdi
2788 CFI_ADJUST_CFA_OFFSET -8
2789 GET_THREAD_INFO(%rcx)
2790 - XEN_BLOCK_EVENTS(%rsi)
2791 -/* cli */
2792 + DISABLE_INTERRUPTS(CLBR_NONE)
2793 TRACE_IRQS_OFF
2794 jmp retint_check
2795
2796 retint_signal:
2797 - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2798 + testl $_TIF_DO_NOTIFY_MASK,%edx
2799 jz retint_restore_args
2800 TRACE_IRQS_ON
2801 - XEN_UNBLOCK_EVENTS(%rsi)
2802 + ENABLE_INTERRUPTS(CLBR_NONE)
2803 SAVE_REST
2804 movq $-1,ORIG_RAX(%rsp)
2805 xorl %esi,%esi # oldset
2806 movq %rsp,%rdi # &pt_regs
2807 call do_notify_resume
2808 RESTORE_REST
2809 - XEN_BLOCK_EVENTS(%rsi)
2810 + DISABLE_INTERRUPTS(CLBR_NONE)
2811 TRACE_IRQS_OFF
2812 movl $_TIF_NEED_RESCHED,%edi
2813 GET_THREAD_INFO(%rcx)
2814 @@ -702,7 +702,7 @@ END(spurious_interrupt)
2815 rdmsr
2816 testl %edx,%edx
2817 js 1f
2818 - swapgs
2819 + SWAPGS
2820 xorl %ebx,%ebx
2821 1:
2822 #endif
2823 @@ -719,8 +719,7 @@ END(spurious_interrupt)
2824 .if \ist
2825 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
2826 .endif
2827 -/* cli */
2828 - XEN_BLOCK_EVENTS(%rsi)
2829 + DISABLE_INTERRUPTS(CLBR_NONE)
2830 .if \irqtrace
2831 TRACE_IRQS_OFF
2832 .endif
2833 @@ -749,10 +748,10 @@ paranoid_swapgs\trace:
2834 .if \trace
2835 TRACE_IRQS_IRETQ 0
2836 .endif
2837 - swapgs
2838 + SWAPGS_UNSAFE_STACK
2839 paranoid_restore\trace:
2840 RESTORE_ALL 8
2841 - iretq
2842 + jmp irq_return
2843 paranoid_userspace\trace:
2844 GET_THREAD_INFO(%rcx)
2845 movl threadinfo_flags(%rcx),%ebx
2846 @@ -767,11 +766,11 @@ paranoid_userspace\trace:
2847 .if \trace
2848 TRACE_IRQS_ON
2849 .endif
2850 - sti
2851 + ENABLE_INTERRUPTS(CLBR_NONE)
2852 xorl %esi,%esi /* arg2: oldset */
2853 movq %rsp,%rdi /* arg1: &pt_regs */
2854 call do_notify_resume
2855 - cli
2856 + DISABLE_INTERRUPTS(CLBR_NONE)
2857 .if \trace
2858 TRACE_IRQS_OFF
2859 .endif
2860 @@ -780,9 +779,9 @@ paranoid_schedule\trace:
2861 .if \trace
2862 TRACE_IRQS_ON
2863 .endif
2864 - sti
2865 + ENABLE_INTERRUPTS(CLBR_ANY)
2866 call schedule
2867 - cli
2868 + DISABLE_INTERRUPTS(CLBR_ANY)
2869 .if \trace
2870 TRACE_IRQS_OFF
2871 .endif
2872 @@ -846,8 +845,7 @@ error_call_handler:
2873 call *%rax
2874 error_exit:
2875 RESTORE_REST
2876 -/* cli */
2877 - XEN_BLOCK_EVENTS(%rsi)
2878 + DISABLE_INTERRUPTS(CLBR_NONE)
2879 TRACE_IRQS_OFF
2880 GET_THREAD_INFO(%rcx)
2881 testb $3,CS-ARGOFFSET(%rsp)
2882 @@ -875,7 +873,7 @@ error_kernelspace:
2883 iret run with kernel gs again, so don't set the user space flag.
2884 B stepping K8s sometimes report an truncated RIP for IRET
2885 exceptions returning to compat mode. Check for these here too. */
2886 - leaq iret_label(%rip),%rbp
2887 + leaq irq_return(%rip),%rbp
2888 cmpq %rbp,RIP(%rsp)
2889 je error_swapgs
2890 movl %ebp,%ebp /* zero extend */
2891 @@ -930,19 +928,17 @@ END(do_hypervisor_callback)
2892 restore_all_enable_events:
2893 CFI_DEFAULT_STACK adj=1
2894 TRACE_IRQS_ON
2895 - XEN_UNBLOCK_EVENTS(%rsi) # %rsi is already set up...
2896 + __ENABLE_INTERRUPTS
2897
2898 scrit: /**** START OF CRITICAL REGION ****/
2899 - XEN_TEST_PENDING(%rsi)
2900 + __TEST_PENDING
2901 CFI_REMEMBER_STATE
2902 jnz 14f # process more events if necessary...
2903 - XEN_PUT_VCPU_INFO(%rsi)
2904 RESTORE_ARGS 0,8,0
2905 HYPERVISOR_IRET 0
2906
2907 CFI_RESTORE_STATE
2908 -14: XEN_LOCKED_BLOCK_EVENTS(%rsi)
2909 - XEN_PUT_VCPU_INFO(%rsi)
2910 +14: __DISABLE_INTERRUPTS
2911 SAVE_REST
2912 movq %rsp,%rdi # set the argument again
2913 jmp 11b
2914 @@ -1086,15 +1082,16 @@ ENDPROC(child_rip)
2915 * rdi: name, rsi: argv, rdx: envp
2916 *
2917 * We want to fallback into:
2918 - * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
2919 + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
2920 *
2921 * do_sys_execve asm fallback arguments:
2922 - * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
2923 + * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
2924 */
2925 ENTRY(kernel_execve)
2926 CFI_STARTPROC
2927 FAKE_STACK_FRAME $0
2928 SAVE_ALL
2929 + movq %rsp,%rcx
2930 call sys_execve
2931 movq %rax, RAX(%rsp)
2932 RESTORE_REST
2933 @@ -1144,7 +1141,7 @@ do_nmi_callback:
2934 call do_nmi
2935 orl $NMI_MASK,EFLAGS(%rsp)
2936 RESTORE_REST
2937 - XEN_BLOCK_EVENTS(%rsi)
2938 + DISABLE_INTERRUPTS(CLBR_NONE)
2939 TRACE_IRQS_OFF
2940 GET_THREAD_INFO(%rcx)
2941 jmp retint_restore_args
2942 --- sle11-2009-05-14.orig/arch/x86/kernel/fixup.c 2009-05-14 10:56:29.000000000 +0200
2943 +++ sle11-2009-05-14/arch/x86/kernel/fixup.c 2009-03-16 16:33:40.000000000 +0100
2944 @@ -36,7 +36,7 @@
2945
2946 #define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
2947
2948 -fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
2949 +void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
2950 {
2951 static unsigned long printed = 0;
2952 char info[100];
2953 --- sle11-2009-05-14.orig/arch/x86/kernel/genapic_64-xen.c 2009-02-16 16:18:36.000000000 +0100
2954 +++ sle11-2009-05-14/arch/x86/kernel/genapic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
2955 @@ -24,20 +24,13 @@
2956 #include <acpi/acpi_bus.h>
2957 #endif
2958
2959 -/*
2960 - * which logical CPU number maps to which CPU (physical APIC ID)
2961 - *
2962 - * The following static array is used during kernel startup
2963 - * and the x86_cpu_to_apicid_ptr contains the address of the
2964 - * array during this time. Is it zeroed when the per_cpu
2965 - * data area is removed.
2966 - */
2967 +/* which logical CPU number maps to which CPU (physical APIC ID) */
2968 #ifndef CONFIG_XEN
2969 -u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata
2970 +u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
2971 = { [0 ... NR_CPUS-1] = BAD_APICID };
2972 -void *x86_cpu_to_apicid_ptr;
2973 +void *x86_cpu_to_apicid_early_ptr;
2974 #endif
2975 -DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
2976 +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
2977 EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
2978
2979 #ifndef CONFIG_XEN
2980 --- sle11-2009-05-14.orig/arch/x86/kernel/head64-xen.c 2009-02-16 16:18:36.000000000 +0100
2981 +++ sle11-2009-05-14/arch/x86/kernel/head64-xen.c 2009-03-16 16:33:40.000000000 +0100
2982 @@ -16,6 +16,7 @@
2983 #include <linux/kernel.h>
2984 #include <linux/string.h>
2985 #include <linux/percpu.h>
2986 +#include <linux/start_kernel.h>
2987 #include <linux/module.h>
2988
2989 #include <asm/processor.h>
2990 @@ -26,6 +27,8 @@
2991 #include <asm/pgtable.h>
2992 #include <asm/tlbflush.h>
2993 #include <asm/sections.h>
2994 +#include <asm/kdebug.h>
2995 +#include <asm/e820.h>
2996
2997 unsigned long start_pfn;
2998
2999 @@ -34,7 +37,7 @@ static void __init zap_identity_mappings
3000 {
3001 pgd_t *pgd = pgd_offset_k(0UL);
3002 pgd_clear(pgd);
3003 - __flush_tlb();
3004 + __flush_tlb_all();
3005 }
3006
3007 /* Don't add a printk in there. printk relies on the PDA which is not initialized
3008 @@ -72,6 +75,37 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
3009 unsigned int machine_to_phys_order;
3010 EXPORT_SYMBOL(machine_to_phys_order);
3011
3012 +#define EBDA_ADDR_POINTER 0x40E
3013 +
3014 +static __init void reserve_ebda(void)
3015 +{
3016 +#ifndef CONFIG_XEN
3017 + unsigned ebda_addr, ebda_size;
3018 +
3019 + /*
3020 + * there is a real-mode segmented pointer pointing to the
3021 + * 4K EBDA area at 0x40E
3022 + */
3023 + ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
3024 + ebda_addr <<= 4;
3025 +
3026 + if (!ebda_addr)
3027 + return;
3028 +
3029 + ebda_size = *(unsigned short *)__va(ebda_addr);
3030 +
3031 + /* Round EBDA up to pages */
3032 + if (ebda_size == 0)
3033 + ebda_size = 1;
3034 + ebda_size <<= 10;
3035 + ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
3036 + if (ebda_size > 64*1024)
3037 + ebda_size = 64*1024;
3038 +
3039 + reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA");
3040 +#endif
3041 +}
3042 +
3043 void __init x86_64_start_kernel(char * real_mode_data)
3044 {
3045 struct xen_machphys_mapping mapping;
3046 @@ -103,8 +137,16 @@ void __init x86_64_start_kernel(char * r
3047 /* Make NULL pointers segfault */
3048 zap_identity_mappings();
3049
3050 - for (i = 0; i < IDT_ENTRIES; i++)
3051 + /* Cleanup the over mapped high alias */
3052 + cleanup_highmap();
3053 +
3054 + for (i = 0; i < IDT_ENTRIES; i++) {
3055 +#ifdef CONFIG_EARLY_PRINTK
3056 + set_intr_gate(i, &early_idt_handlers[i]);
3057 +#else
3058 set_intr_gate(i, early_idt_handler);
3059 +#endif
3060 + }
3061 load_idt((const struct desc_ptr *)&idt_descr);
3062 #endif
3063
3064 @@ -115,8 +157,19 @@ void __init x86_64_start_kernel(char * r
3065
3066 pda_init(0);
3067 copy_bootdata(__va(real_mode_data));
3068 -#ifdef CONFIG_SMP
3069 - cpu_set(0, cpu_online_map);
3070 -#endif
3071 +
3072 + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
3073 +
3074 + reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
3075 + start_pfn << PAGE_SHIFT, "Xen provided");
3076 +
3077 + reserve_ebda();
3078 +
3079 + /*
3080 + * At this point everything still needed from the boot loader
3081 + * or BIOS or kernel text should be early reserved or marked not
3082 + * RAM in e820. All other memory is free game.
3083 + */
3084 +
3085 start_kernel();
3086 }
3087 --- sle11-2009-05-14.orig/arch/x86/kernel/head_32-xen.S 2009-02-16 16:17:21.000000000 +0100
3088 +++ sle11-2009-05-14/arch/x86/kernel/head_32-xen.S 2009-03-16 16:33:40.000000000 +0100
3089 @@ -3,6 +3,7 @@
3090 .text
3091 #include <linux/elfnote.h>
3092 #include <linux/threads.h>
3093 +#include <linux/init.h>
3094 #include <linux/linkage.h>
3095 #include <asm/segment.h>
3096 #include <asm/page.h>
3097 @@ -88,7 +89,7 @@ ENTRY(_stext)
3098 */
3099 .section ".bss.page_aligned","wa"
3100 .align PAGE_SIZE_asm
3101 -ENTRY(swapper_pg_pmd)
3102 +ENTRY(swapper_pg_fixmap)
3103 .fill 1024,4,0
3104 ENTRY(empty_zero_page)
3105 .fill 4096,1,0
3106 --- sle11-2009-05-14.orig/arch/x86/kernel/init_task-xen.c 2009-02-16 16:18:36.000000000 +0100
3107 +++ sle11-2009-05-14/arch/x86/kernel/init_task-xen.c 2009-03-16 16:33:40.000000000 +0100
3108 @@ -19,7 +19,7 @@ static struct sighand_struct init_sighan
3109 #endif
3110 struct mm_struct init_mm = INIT_MM(init_mm);
3111 #undef swapper_pg_dir
3112 -EXPORT_SYMBOL(init_mm);
3113 +EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
3114
3115 /*
3116 * Initial thread structure.
3117 --- sle11-2009-05-14.orig/arch/x86/kernel/io_apic_32-xen.c 2009-02-16 16:18:36.000000000 +0100
3118 +++ sle11-2009-05-14/arch/x86/kernel/io_apic_32-xen.c 2009-03-16 16:33:40.000000000 +0100
3119 @@ -35,6 +35,7 @@
3120 #include <linux/htirq.h>
3121 #include <linux/freezer.h>
3122 #include <linux/kthread.h>
3123 +#include <linux/jiffies.h> /* time_after() */
3124
3125 #include <asm/io.h>
3126 #include <asm/smp.h>
3127 @@ -48,8 +49,6 @@
3128 #include <mach_apic.h>
3129 #include <mach_apicdef.h>
3130
3131 -#include "io_ports.h"
3132 -
3133 #ifdef CONFIG_XEN
3134 #include <xen/interface/xen.h>
3135 #include <xen/interface/physdev.h>
3136 @@ -400,7 +399,7 @@ static void set_ioapic_affinity_irq(unsi
3137 # include <asm/processor.h> /* kernel_thread() */
3138 # include <linux/kernel_stat.h> /* kstat */
3139 # include <linux/slab.h> /* kmalloc() */
3140 -# include <linux/timer.h> /* time_after() */
3141 +# include <linux/timer.h>
3142
3143 #define IRQBALANCE_CHECK_ARCH -999
3144 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
3145 @@ -777,7 +776,7 @@ late_initcall(balanced_irq_init);
3146 #endif
3147
3148 #ifndef CONFIG_SMP
3149 -void fastcall send_IPI_self(int vector)
3150 +void send_IPI_self(int vector)
3151 {
3152 #ifndef CONFIG_XEN
3153 unsigned int cfg;
3154 @@ -1959,7 +1958,7 @@ static int __init timer_irq_works(void)
3155 * might have cached one ExtINT interrupt. Finally, at
3156 * least one tick may be lost due to delays.
3157 */
3158 - if (jiffies - t1 > 4)
3159 + if (time_after(jiffies, t1 + 4))
3160 return 1;
3161
3162 return 0;
3163 @@ -2142,7 +2141,7 @@ static struct irq_chip lapic_chip __read
3164 .eoi = ack_apic,
3165 };
3166
3167 -static void setup_nmi (void)
3168 +static void __init setup_nmi(void)
3169 {
3170 /*
3171 * Dirty trick to enable the NMI watchdog ...
3172 @@ -2155,7 +2154,7 @@ static void setup_nmi (void)
3173 */
3174 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
3175
3176 - on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
3177 + enable_NMI_through_LVT0();
3178
3179 apic_printk(APIC_VERBOSE, " done.\n");
3180 }
3181 @@ -2479,7 +2478,7 @@ static int ioapic_resume(struct sys_devi
3182 }
3183
3184 static struct sysdev_class ioapic_sysdev_class = {
3185 - set_kset_name("ioapic"),
3186 + .name = "ioapic",
3187 .suspend = ioapic_suspend,
3188 .resume = ioapic_resume,
3189 };
3190 --- sle11-2009-05-14.orig/arch/x86/kernel/io_apic_64-xen.c 2009-02-16 16:18:36.000000000 +0100
3191 +++ sle11-2009-05-14/arch/x86/kernel/io_apic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
3192 @@ -32,9 +32,11 @@
3193 #include <linux/msi.h>
3194 #include <linux/htirq.h>
3195 #include <linux/dmar.h>
3196 +#include <linux/jiffies.h>
3197 #ifdef CONFIG_ACPI
3198 #include <acpi/acpi_bus.h>
3199 #endif
3200 +#include <linux/bootmem.h>
3201
3202 #include <asm/idle.h>
3203 #include <asm/io.h>
3204 @@ -1064,7 +1066,7 @@ void __apicdebuginit print_local_APIC(vo
3205 v = apic_read(APIC_LVR);
3206 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
3207 ver = GET_APIC_VERSION(v);
3208 - maxlvt = get_maxlvt();
3209 + maxlvt = lapic_get_maxlvt();
3210
3211 v = apic_read(APIC_TASKPRI);
3212 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
3213 @@ -1165,7 +1167,7 @@ void __apicdebuginit print_PIC(void)
3214 }
3215 #endif /* !CONFIG_XEN */
3216
3217 -static void __init enable_IO_APIC(void)
3218 +void __init enable_IO_APIC(void)
3219 {
3220 union IO_APIC_reg_01 reg_01;
3221 #ifndef CONFIG_XEN
3222 @@ -1299,7 +1301,7 @@ static int __init timer_irq_works(void)
3223 */
3224
3225 /* jiffies wrap? */
3226 - if (jiffies - t1 > 4)
3227 + if (time_after(jiffies, t1 + 4))
3228 return 1;
3229 return 0;
3230 }
3231 @@ -1412,7 +1414,7 @@ static void irq_complete_move(unsigned i
3232 if (likely(!cfg->move_in_progress))
3233 return;
3234
3235 - vector = ~get_irq_regs()->orig_rax;
3236 + vector = ~get_irq_regs()->orig_ax;
3237 me = smp_processor_id();
3238 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
3239 cpumask_t cleanup_mask;
3240 @@ -1439,7 +1441,7 @@ static void ack_apic_level(unsigned int
3241 int do_unmask_irq = 0;
3242
3243 irq_complete_move(irq);
3244 -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
3245 +#ifdef CONFIG_GENERIC_PENDING_IRQ
3246 /* If we are moving the irq we need to mask it */
3247 if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
3248 do_unmask_irq = 1;
3249 @@ -1570,7 +1572,7 @@ static struct hw_interrupt_type lapic_ir
3250 .end = end_lapic_irq,
3251 };
3252
3253 -static void setup_nmi (void)
3254 +static void __init setup_nmi(void)
3255 {
3256 /*
3257 * Dirty trick to enable the NMI watchdog ...
3258 @@ -1583,7 +1585,7 @@ static void setup_nmi (void)
3259 */
3260 printk(KERN_INFO "activating NMI Watchdog ...");
3261
3262 - enable_NMI_through_LVT0(NULL);
3263 + enable_NMI_through_LVT0();
3264
3265 printk(" done.\n");
3266 }
3267 @@ -1659,7 +1661,7 @@ static inline void unlock_ExtINT_logic(v
3268 *
3269 * FIXME: really need to revamp this for modern platforms only.
3270 */
3271 -static inline void check_timer(void)
3272 +static inline void __init check_timer(void)
3273 {
3274 struct irq_cfg *cfg = irq_cfg + 0;
3275 int apic1, pin1, apic2, pin2;
3276 @@ -1863,7 +1865,7 @@ static int ioapic_resume(struct sys_devi
3277 }
3278
3279 static struct sysdev_class ioapic_sysdev_class = {
3280 - set_kset_name("ioapic"),
3281 + .name = "ioapic",
3282 .suspend = ioapic_suspend,
3283 .resume = ioapic_resume,
3284 };
3285 @@ -2303,5 +2305,93 @@ void __init setup_ioapic_dest(void)
3286 }
3287 }
3288 #endif
3289 -#endif /* !CONFIG_XEN */
3290
3291 +#define IOAPIC_RESOURCE_NAME_SIZE 11
3292 +
3293 +static struct resource *ioapic_resources;
3294 +
3295 +static struct resource * __init ioapic_setup_resources(void)
3296 +{
3297 + unsigned long n;
3298 + struct resource *res;
3299 + char *mem;
3300 + int i;
3301 +
3302 + if (nr_ioapics <= 0)
3303 + return NULL;
3304 +
3305 + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
3306 + n *= nr_ioapics;
3307 +
3308 + mem = alloc_bootmem(n);
3309 + res = (void *)mem;
3310 +
3311 + if (mem != NULL) {
3312 + memset(mem, 0, n);
3313 + mem += sizeof(struct resource) * nr_ioapics;
3314 +
3315 + for (i = 0; i < nr_ioapics; i++) {
3316 + res[i].name = mem;
3317 + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
3318 + sprintf(mem, "IOAPIC %u", i);
3319 + mem += IOAPIC_RESOURCE_NAME_SIZE;
3320 + }
3321 + }
3322 +
3323 + ioapic_resources = res;
3324 +
3325 + return res;
3326 +}
3327 +
3328 +void __init ioapic_init_mappings(void)
3329 +{
3330 + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
3331 + struct resource *ioapic_res;
3332 + int i;
3333 +
3334 + ioapic_res = ioapic_setup_resources();
3335 + for (i = 0; i < nr_ioapics; i++) {
3336 + if (smp_found_config) {
3337 + ioapic_phys = mp_ioapics[i].mpc_apicaddr;
3338 + } else {
3339 + ioapic_phys = (unsigned long)
3340 + alloc_bootmem_pages(PAGE_SIZE);
3341 + ioapic_phys = __pa(ioapic_phys);
3342 + }
3343 + set_fixmap_nocache(idx, ioapic_phys);
3344 + apic_printk(APIC_VERBOSE,
3345 + "mapped IOAPIC to %016lx (%016lx)\n",
3346 + __fix_to_virt(idx), ioapic_phys);
3347 + idx++;
3348 +
3349 + if (ioapic_res != NULL) {
3350 + ioapic_res->start = ioapic_phys;
3351 + ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
3352 + ioapic_res++;
3353 + }
3354 + }
3355 +}
3356 +
3357 +static int __init ioapic_insert_resources(void)
3358 +{
3359 + int i;
3360 + struct resource *r = ioapic_resources;
3361 +
3362 + if (!r) {
3363 + printk(KERN_ERR
3364 + "IO APIC resources could be not be allocated.\n");
3365 + return -1;
3366 + }
3367 +
3368 + for (i = 0; i < nr_ioapics; i++) {
3369 + insert_resource(&iomem_resource, r);
3370 + r++;
3371 + }
3372 +
3373 + return 0;
3374 +}
3375 +
3376 +/* Insert the IO APIC resources after PCI initialization has occured to handle
3377 + * IO APICS that are mapped in on a BAR in PCI space. */
3378 +late_initcall(ioapic_insert_resources);
3379 +#endif /* !CONFIG_XEN */
3380 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
3381 +++ sle11-2009-05-14/arch/x86/kernel/ioport-xen.c 2009-03-16 16:33:40.000000000 +0100
3382 @@ -0,0 +1,112 @@
3383 +/*
3384 + * This contains the io-permission bitmap code - written by obz, with changes
3385 + * by Linus. 32/64 bits code unification by Miguel Botón.
3386 + */
3387 +
3388 +#include <linux/sched.h>
3389 +#include <linux/kernel.h>
3390 +#include <linux/capability.h>
3391 +#include <linux/errno.h>
3392 +#include <linux/types.h>
3393 +#include <linux/ioport.h>
3394 +#include <linux/smp.h>
3395 +#include <linux/stddef.h>
3396 +#include <linux/slab.h>
3397 +#include <linux/thread_info.h>
3398 +#include <linux/syscalls.h>
3399 +#include <xen/interface/physdev.h>
3400 +
3401 +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
3402 +static void set_bitmap(unsigned long *bitmap, unsigned int base,
3403 + unsigned int extent, int new_value)
3404 +{
3405 + unsigned int i;
3406 +
3407 + for (i = base; i < base + extent; i++) {
3408 + if (new_value)
3409 + __set_bit(i, bitmap);
3410 + else
3411 + __clear_bit(i, bitmap);
3412 + }
3413 +}
3414 +
3415 +/*
3416 + * this changes the io permissions bitmap in the current task.
3417 + */
3418 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3419 +{
3420 + struct thread_struct * t = &current->thread;
3421 + struct physdev_set_iobitmap set_iobitmap;
3422 +
3423 + if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3424 + return -EINVAL;
3425 + if (turn_on && !capable(CAP_SYS_RAWIO))
3426 + return -EPERM;
3427 +
3428 + /*
3429 + * If it's the first ioperm() call in this thread's lifetime, set the
3430 + * IO bitmap up. ioperm() is much less timing critical than clone(),
3431 + * this is why we delay this operation until now:
3432 + */
3433 + if (!t->io_bitmap_ptr) {
3434 + unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3435 +
3436 + if (!bitmap)
3437 + return -ENOMEM;
3438 +
3439 + memset(bitmap, 0xff, IO_BITMAP_BYTES);
3440 + t->io_bitmap_ptr = bitmap;
3441 + set_thread_flag(TIF_IO_BITMAP);
3442 +
3443 + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3444 + set_iobitmap.nr_ports = IO_BITMAP_BITS;
3445 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3446 + &set_iobitmap));
3447 + }
3448 +
3449 + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3450 +
3451 + return 0;
3452 +}
3453 +
3454 +/*
3455 + * sys_iopl has to be used when you want to access the IO ports
3456 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3457 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
3458 + */
3459 +static int do_iopl(unsigned int level, struct thread_struct *t)
3460 +{
3461 + unsigned int old = t->iopl >> 12;
3462 +
3463 + if (level > 3)
3464 + return -EINVAL;
3465 + /* Trying to gain more privileges? */
3466 + if (level > old) {
3467 + if (!capable(CAP_SYS_RAWIO))
3468 + return -EPERM;
3469 + }
3470 +
3471 + return 0;
3472 +}
3473 +
3474 +#ifdef CONFIG_X86_32
3475 +asmlinkage long sys_iopl(unsigned long regsp)
3476 +{
3477 + struct pt_regs *regs = (struct pt_regs *)&regsp;
3478 + unsigned int level = regs->bx;
3479 +#else
3480 +asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
3481 +{
3482 +#endif
3483 + struct thread_struct *t = &current->thread;
3484 + int rc;
3485 +
3486 + rc = do_iopl(level, t);
3487 + if (rc < 0)
3488 + goto out;
3489 +
3490 + t->iopl = level << 12;
3491 + set_iopl_mask(t->iopl);
3492 +out:
3493 + return rc;
3494 +}
3495 --- sle11-2009-05-14.orig/arch/x86/kernel/ioport_32-xen.c 2009-02-16 16:18:36.000000000 +0100
3496 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
3497 @@ -1,121 +0,0 @@
3498 -/*
3499 - * This contains the io-permission bitmap code - written by obz, with changes
3500 - * by Linus.
3501 - */
3502 -
3503 -#include <linux/sched.h>
3504 -#include <linux/kernel.h>
3505 -#include <linux/capability.h>
3506 -#include <linux/errno.h>
3507 -#include <linux/types.h>
3508 -#include <linux/ioport.h>
3509 -#include <linux/smp.h>
3510 -#include <linux/stddef.h>
3511 -#include <linux/slab.h>
3512 -#include <linux/thread_info.h>
3513 -#include <linux/syscalls.h>
3514 -#include <xen/interface/physdev.h>
3515 -
3516 -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
3517 -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
3518 -{
3519 - unsigned long mask;
3520 - unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
3521 - unsigned int low_index = base & (BITS_PER_LONG-1);
3522 - int length = low_index + extent;
3523 -
3524 - if (low_index != 0) {
3525 - mask = (~0UL << low_index);
3526 - if (length < BITS_PER_LONG)
3527 - mask &= ~(~0UL << length);
3528 - if (new_value)
3529 - *bitmap_base++ |= mask;
3530 - else
3531 - *bitmap_base++ &= ~mask;
3532 - length -= BITS_PER_LONG;
3533 - }
3534 -
3535 - mask = (new_value ? ~0UL : 0UL);
3536 - while (length >= BITS_PER_LONG) {
3537 - *bitmap_base++ = mask;
3538 - length -= BITS_PER_LONG;
3539 - }
3540 -
3541 - if (length > 0) {
3542 - mask = ~(~0UL << length);
3543 - if (new_value)
3544 - *bitmap_base++ |= mask;
3545 - else
3546 - *bitmap_base++ &= ~mask;
3547 - }
3548 -}
3549 -
3550 -
3551 -/*
3552 - * this changes the io permissions bitmap in the current task.
3553 - */
3554 -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3555 -{
3556 - struct thread_struct * t = &current->thread;
3557 - unsigned long *bitmap;
3558 - struct physdev_set_iobitmap set_iobitmap;
3559 -
3560 - if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3561 - return -EINVAL;
3562 - if (turn_on && !capable(CAP_SYS_RAWIO))
3563 - return -EPERM;
3564 -
3565 - /*
3566 - * If it's the first ioperm() call in this thread's lifetime, set the
3567 - * IO bitmap up. ioperm() is much less timing critical than clone(),
3568 - * this is why we delay this operation until now:
3569 - */
3570 - if (!t->io_bitmap_ptr) {
3571 - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3572 - if (!bitmap)
3573 - return -ENOMEM;
3574 -
3575 - memset(bitmap, 0xff, IO_BITMAP_BYTES);
3576 - t->io_bitmap_ptr = bitmap;
3577 - set_thread_flag(TIF_IO_BITMAP);
3578 -
3579 - set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3580 - set_iobitmap.nr_ports = IO_BITMAP_BITS;
3581 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3582 - &set_iobitmap));
3583 - }
3584 -
3585 - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3586 -
3587 - return 0;
3588 -}
3589 -
3590 -/*
3591 - * sys_iopl has to be used when you want to access the IO ports
3592 - * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3593 - * you'd need 8kB of bitmaps/process, which is a bit excessive.
3594 - *
3595 - * Here we just change the eflags value on the stack: we allow
3596 - * only the super-user to do it. This depends on the stack-layout
3597 - * on system-call entry - see also fork() and the signal handling
3598 - * code.
3599 - */
3600 -
3601 -asmlinkage long sys_iopl(unsigned long unused)
3602 -{
3603 - volatile struct pt_regs * regs = (struct pt_regs *) &unused;
3604 - unsigned int level = regs->ebx;
3605 - struct thread_struct *t = &current->thread;
3606 - unsigned int old = (t->iopl >> 12) & 3;
3607 -
3608 - if (level > 3)
3609 - return -EINVAL;
3610 - /* Trying to gain more privileges? */
3611 - if (level > old) {
3612 - if (!capable(CAP_SYS_RAWIO))
3613 - return -EPERM;
3614 - }
3615 - t->iopl = level << 12;
3616 - set_iopl_mask(t->iopl);
3617 - return 0;
3618 -}
3619 --- sle11-2009-05-14.orig/arch/x86/kernel/ioport_64-xen.c 2009-02-16 16:18:36.000000000 +0100
3620 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
3621 @@ -1,99 +0,0 @@
3622 -/*
3623 - * This contains the io-permission bitmap code - written by obz, with changes
3624 - * by Linus.
3625 - */
3626 -
3627 -#include <linux/sched.h>
3628 -#include <linux/kernel.h>
3629 -#include <linux/capability.h>
3630 -#include <linux/errno.h>
3631 -#include <linux/types.h>
3632 -#include <linux/ioport.h>
3633 -#include <linux/mm.h>
3634 -#include <linux/smp.h>
3635 -#include <linux/stddef.h>
3636 -#include <linux/slab.h>
3637 -#include <linux/thread_info.h>
3638 -#include <linux/syscalls.h>
3639 -#include <xen/interface/physdev.h>
3640 -
3641 -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
3642 -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
3643 -{
3644 - int i;
3645 -
3646 - if (new_value)
3647 - for (i = base; i < base + extent; i++)
3648 - __set_bit(i, bitmap);
3649 - else
3650 - for (i = base; i < base + extent; i++)
3651 - clear_bit(i, bitmap);
3652 -}
3653 -
3654 -/*
3655 - * this changes the io permissions bitmap in the current task.
3656 - */
3657 -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3658 -{
3659 - struct thread_struct * t = &current->thread;
3660 - unsigned long *bitmap;
3661 - struct physdev_set_iobitmap set_iobitmap;
3662 -
3663 - if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3664 - return -EINVAL;
3665 - if (turn_on && !capable(CAP_SYS_RAWIO))
3666 - return -EPERM;
3667 -
3668 - /*
3669 - * If it's the first ioperm() call in this thread's lifetime, set the
3670 - * IO bitmap up. ioperm() is much less timing critical than clone(),
3671 - * this is why we delay this operation until now:
3672 - */
3673 - if (!t->io_bitmap_ptr) {
3674 - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3675 - if (!bitmap)
3676 - return -ENOMEM;
3677 -
3678 - memset(bitmap, 0xff, IO_BITMAP_BYTES);
3679 - t->io_bitmap_ptr = bitmap;
3680 - set_thread_flag(TIF_IO_BITMAP);
3681 -
3682 - set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3683 - set_iobitmap.nr_ports = IO_BITMAP_BITS;
3684 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3685 - &set_iobitmap));
3686 - }
3687 -
3688 - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3689 -
3690 - return 0;
3691 -}
3692 -
3693 -/*
3694 - * sys_iopl has to be used when you want to access the IO ports
3695 - * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3696 - * you'd need 8kB of bitmaps/process, which is a bit excessive.
3697 - *
3698 - */
3699 -
3700 -asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs)
3701 -{
3702 - unsigned int old_iopl = current->thread.iopl;
3703 - struct physdev_set_iopl set_iopl;
3704 -
3705 - if (new_iopl > 3)
3706 - return -EINVAL;
3707 -
3708 - /* Need "raw I/O" privileges for direct port access. */
3709 - if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO))
3710 - return -EPERM;
3711 -
3712 - /* Change our version of the privilege levels. */
3713 - current->thread.iopl = new_iopl;
3714 -
3715 - /* Force the change at ring 0. */
3716 - set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl;
3717 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
3718 -
3719 - return 0;
3720 -}
3721 --- sle11-2009-05-14.orig/arch/x86/kernel/irq_32-xen.c 2009-02-16 16:18:36.000000000 +0100
3722 +++ sle11-2009-05-14/arch/x86/kernel/irq_32-xen.c 2009-03-16 16:33:40.000000000 +0100
3723 @@ -66,11 +66,11 @@ static union irq_ctx *softirq_ctx[NR_CPU
3724 * SMP cross-CPU interrupts have their own specific
3725 * handlers).
3726 */
3727 -fastcall unsigned int do_IRQ(struct pt_regs *regs)
3728 +unsigned int do_IRQ(struct pt_regs *regs)
3729 {
3730 struct pt_regs *old_regs;
3731 /* high bit used in ret_from_ code */
3732 - int irq = ~regs->orig_eax;
3733 + int irq = ~regs->orig_ax;
3734 struct irq_desc *desc = irq_desc + irq;
3735 #ifdef CONFIG_4KSTACKS
3736 union irq_ctx *curctx, *irqctx;
3737 @@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_r
3738 #ifdef CONFIG_DEBUG_STACKOVERFLOW
3739 /* Debugging check for stack overflow: is there less than 1KB free? */
3740 {
3741 - long esp;
3742 + long sp;
3743
3744 __asm__ __volatile__("andl %%esp,%0" :
3745 - "=r" (esp) : "0" (THREAD_SIZE - 1));
3746 - if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
3747 + "=r" (sp) : "0" (THREAD_SIZE - 1));
3748 + if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
3749 printk("do_IRQ: stack overflow: %ld\n",
3750 - esp - sizeof(struct thread_info));
3751 + sp - sizeof(struct thread_info));
3752 dump_stack();
3753 }
3754 }
3755 @@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_r
3756 * current stack (which is the irq stack already after all)
3757 */
3758 if (curctx != irqctx) {
3759 - int arg1, arg2, ebx;
3760 + int arg1, arg2, bx;
3761
3762 /* build the stack frame on the IRQ stack */
3763 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
3764 @@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_r
3765 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
3766
3767 asm volatile(
3768 - " xchgl %%ebx,%%esp \n"
3769 - " call *%%edi \n"
3770 - " movl %%ebx,%%esp \n"
3771 - : "=a" (arg1), "=d" (arg2), "=b" (ebx)
3772 + " xchgl %%ebx,%%esp \n"
3773 + " call *%%edi \n"
3774 + " movl %%ebx,%%esp \n"
3775 + : "=a" (arg1), "=d" (arg2), "=b" (bx)
3776 : "0" (irq), "1" (desc), "2" (isp),
3777 "D" (desc->handle_irq)
3778 : "memory", "cc"
3779 --- sle11-2009-05-14.orig/arch/x86/kernel/irq_64-xen.c 2009-02-16 16:18:36.000000000 +0100
3780 +++ sle11-2009-05-14/arch/x86/kernel/irq_64-xen.c 2009-03-16 16:33:40.000000000 +0100
3781 @@ -20,6 +20,28 @@
3782
3783 atomic_t irq_err_count;
3784
3785 +/*
3786 + * 'what should we do if we get a hw irq event on an illegal vector'.
3787 + * each architecture has to answer this themselves.
3788 + */
3789 +void ack_bad_irq(unsigned int irq)
3790 +{
3791 + printk(KERN_WARNING "unexpected IRQ trap at irq %02x\n", irq);
3792 +#ifdef CONFIG_X86_LOCAL_APIC
3793 + /*
3794 + * Currently unexpected vectors happen only on SMP and APIC.
3795 + * We _must_ ack these because every local APIC has only N
3796 + * irq slots per priority level, and a 'hanging, unacked' IRQ
3797 + * holds up an irq slot - in excessive cases (when multiple
3798 + * unexpected vectors occur) that might lock up the APIC
3799 + * completely.
3800 + * But don't ack when the APIC is disabled. -AK
3801 + */
3802 + if (!disable_apic)
3803 + ack_APIC_irq();
3804 +#endif
3805 +}
3806 +
3807 #ifdef CONFIG_DEBUG_STACKOVERFLOW
3808 /*
3809 * Probabilistic stack overflow check:
3810 @@ -33,11 +55,11 @@ static inline void stack_overflow_check(
3811 u64 curbase = (u64)task_stack_page(current);
3812 static unsigned long warned = -60*HZ;
3813
3814 - if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
3815 - regs->rsp < curbase + sizeof(struct thread_info) + 128 &&
3816 + if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
3817 + regs->sp < curbase + sizeof(struct thread_info) + 128 &&
3818 time_after(jiffies, warned + 60*HZ)) {
3819 - printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
3820 - current->comm, curbase, regs->rsp);
3821 + printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
3822 + current->comm, curbase, regs->sp);
3823 show_stack(NULL,NULL);
3824 warned = jiffies;
3825 }
3826 @@ -150,7 +172,7 @@ asmlinkage unsigned int do_IRQ(struct pt
3827 struct pt_regs *old_regs = set_irq_regs(regs);
3828
3829 /* high bit used in ret_from_ code */
3830 - unsigned irq = ~regs->orig_rax;
3831 + unsigned irq = ~regs->orig_ax;
3832
3833 /*exit_idle();*/
3834 /*irq_enter();*/
3835 @@ -251,14 +273,3 @@ asmlinkage void do_softirq(void)
3836 }
3837 local_irq_restore(flags);
3838 }
3839 -
3840 -#ifndef CONFIG_X86_LOCAL_APIC
3841 -/*
3842 - * 'what should we do if we get a hw irq event on an illegal vector'.
3843 - * each architecture has to answer this themselves.
3844 - */
3845 -void ack_bad_irq(unsigned int irq)
3846 -{
3847 - printk("unexpected IRQ trap at irq %02x\n", irq);
3848 -}
3849 -#endif
3850 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
3851 +++ sle11-2009-05-14/arch/x86/kernel/ldt-xen.c 2009-03-16 16:33:40.000000000 +0100
3852 @@ -0,0 +1,272 @@
3853 +/*
3854 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
3855 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
3856 + * Copyright (C) 2002 Andi Kleen
3857 + *
3858 + * This handles calls from both 32bit and 64bit mode.
3859 + */
3860 +
3861 +#include <linux/errno.h>
3862 +#include <linux/sched.h>
3863 +#include <linux/string.h>
3864 +#include <linux/mm.h>
3865 +#include <linux/smp.h>
3866 +#include <linux/vmalloc.h>
3867 +
3868 +#include <asm/uaccess.h>
3869 +#include <asm/system.h>
3870 +#include <asm/ldt.h>
3871 +#include <asm/desc.h>
3872 +#include <asm/mmu_context.h>
3873 +
3874 +#ifdef CONFIG_SMP
3875 +static void flush_ldt(void *null)
3876 +{
3877 + if (current->active_mm)
3878 + load_LDT(&current->active_mm->context);
3879 +}
3880 +#endif
3881 +
3882 +static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
3883 +{
3884 + void *oldldt, *newldt;
3885 + int oldsize;
3886 +
3887 + if (mincount <= pc->size)
3888 + return 0;
3889 + oldsize = pc->size;
3890 + mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
3891 + (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
3892 + if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
3893 + newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
3894 + else
3895 + newldt = (void *)__get_free_page(GFP_KERNEL);
3896 +
3897 + if (!newldt)
3898 + return -ENOMEM;
3899 +
3900 + if (oldsize)
3901 + memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
3902 + oldldt = pc->ldt;
3903 + memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
3904 + (mincount - oldsize) * LDT_ENTRY_SIZE);
3905 +
3906 +#ifdef CONFIG_X86_64
3907 + /* CHECKME: Do we really need this ? */
3908 + wmb();
3909 +#endif
3910 + pc->ldt = newldt;
3911 + wmb();
3912 + pc->size = mincount;
3913 + wmb();
3914 +
3915 + if (reload) {
3916 +#ifdef CONFIG_SMP
3917 + cpumask_t mask;
3918 +
3919 + preempt_disable();
3920 +#endif
3921 + make_pages_readonly(newldt,
3922 + (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE,
3923 + XENFEAT_writable_descriptor_tables);
3924 + load_LDT(pc);
3925 +#ifdef CONFIG_SMP
3926 + mask = cpumask_of_cpu(smp_processor_id());
3927 + if (!cpus_equal(current->mm->cpu_vm_mask, mask))
3928 + smp_call_function(flush_ldt, NULL, 1, 1);
3929 + preempt_enable();
3930 +#endif
3931 + }
3932 + if (oldsize) {
3933 + make_pages_writable(oldldt,
3934 + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
3935 + XENFEAT_writable_descriptor_tables);
3936 + if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
3937 + vfree(oldldt);
3938 + else
3939 + put_page(virt_to_page(oldldt));
3940 + }
3941 + return 0;
3942 +}
3943 +
3944 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
3945 +{
3946 + int err = alloc_ldt(new, old->size, 0);
3947 +
3948 + if (err < 0)
3949 + return err;
3950 + memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
3951 + make_pages_readonly(new->ldt,
3952 + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
3953 + XENFEAT_writable_descriptor_tables);
3954 + return 0;
3955 +}
3956 +
3957 +/*
3958 + * we do not have to muck with descriptors here, that is
3959 + * done in switch_mm() as needed.
3960 + */
3961 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
3962 +{
3963 + struct mm_struct *old_mm;
3964 + int retval = 0;
3965 +
3966 + memset(&mm->context, 0, sizeof(mm->context));
3967 + mutex_init(&mm->context.lock);
3968 + old_mm = current->mm;
3969 + if (old_mm)
3970 + mm->context.vdso = old_mm->context.vdso;
3971 + if (old_mm && old_mm->context.size > 0) {
3972 + mutex_lock(&old_mm->context.lock);
3973 + retval = copy_ldt(&mm->context, &old_mm->context);
3974 + mutex_unlock(&old_mm->context.lock);
3975 + }
3976 + return retval;
3977 +}
3978 +
3979 +/*
3980 + * No need to lock the MM as we are the last user
3981 + *
3982 + * 64bit: Don't touch the LDT register - we're already in the next thread.
3983 + */
3984 +void destroy_context(struct mm_struct *mm)
3985 +{
3986 + if (mm->context.size) {
3987 + /* CHECKME: Can this ever happen ? */
3988 + if (mm == current->active_mm)
3989 + clear_LDT();
3990 + make_pages_writable(mm->context.ldt,
3991 + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
3992 + XENFEAT_writable_descriptor_tables);
3993 + if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
3994 + vfree(mm->context.ldt);
3995 + else
3996 + put_page(virt_to_page(mm->context.ldt));
3997 + mm->context.size = 0;
3998 + }
3999 +}
4000 +
4001 +static int read_ldt(void __user *ptr, unsigned long bytecount)
4002 +{
4003 + int err;
4004 + unsigned long size;
4005 + struct mm_struct *mm = current->mm;
4006 +
4007 + if (!mm->context.size)
4008 + return 0;
4009 + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
4010 + bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
4011 +
4012 + mutex_lock(&mm->context.lock);
4013 + size = mm->context.size * LDT_ENTRY_SIZE;
4014 + if (size > bytecount)
4015 + size = bytecount;
4016 +
4017 + err = 0;
4018 + if (copy_to_user(ptr, mm->context.ldt, size))
4019 + err = -EFAULT;
4020 + mutex_unlock(&mm->context.lock);
4021 + if (err < 0)
4022 + goto error_return;
4023 + if (size != bytecount) {
4024 + /* zero-fill the rest */
4025 + if (clear_user(ptr + size, bytecount - size) != 0) {
4026 + err = -EFAULT;
4027 + goto error_return;
4028 + }
4029 + }
4030 + return bytecount;
4031 +error_return:
4032 + return err;
4033 +}
4034 +
4035 +static int read_default_ldt(void __user *ptr, unsigned long bytecount)
4036 +{
4037 + /* CHECKME: Can we use _one_ random number ? */
4038 +#ifdef CONFIG_X86_32
4039 + unsigned long size = 5 * sizeof(struct desc_struct);
4040 +#else
4041 + unsigned long size = 128;
4042 +#endif
4043 + if (bytecount > size)
4044 + bytecount = size;
4045 + if (clear_user(ptr, bytecount))
4046 + return -EFAULT;
4047 + return bytecount;
4048 +}
4049 +
4050 +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
4051 +{
4052 + struct mm_struct *mm = current->mm;
4053 + struct desc_struct ldt;
4054 + int error;
4055 + struct user_desc ldt_info;
4056 +
4057 + error = -EINVAL;
4058 + if (bytecount != sizeof(ldt_info))
4059 + goto out;
4060 + error = -EFAULT;
4061 + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
4062 + goto out;
4063 +
4064 + error = -EINVAL;
4065 + if (ldt_info.entry_number >= LDT_ENTRIES)
4066 + goto out;
4067 + if (ldt_info.contents == 3) {
4068 + if (oldmode)
4069 + goto out;
4070 + if (ldt_info.seg_not_present == 0)
4071 + goto out;
4072 + }
4073 +
4074 + mutex_lock(&mm->context.lock);
4075 + if (ldt_info.entry_number >= mm->context.size) {
4076 + error = alloc_ldt(&current->mm->context,
4077 + ldt_info.entry_number + 1, 1);
4078 + if (error < 0)
4079 + goto out_unlock;
4080 + }
4081 +
4082 + /* Allow LDTs to be cleared by the user. */
4083 + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4084 + if (oldmode || LDT_empty(&ldt_info)) {
4085 + memset(&ldt, 0, sizeof(ldt));
4086 + goto install;
4087 + }
4088 + }
4089 +
4090 + fill_ldt(&ldt, &ldt_info);
4091 + if (oldmode)
4092 + ldt.avl = 0;
4093 +
4094 + /* Install the new entry ... */
4095 +install:
4096 + error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
4097 +
4098 +out_unlock:
4099 + mutex_unlock(&mm->context.lock);
4100 +out:
4101 + return error;
4102 +}
4103 +
4104 +asmlinkage int sys_modify_ldt(int func, void __user *ptr,
4105 + unsigned long bytecount)
4106 +{
4107 + int ret = -ENOSYS;
4108 +
4109 + switch (func) {
4110 + case 0:
4111 + ret = read_ldt(ptr, bytecount);
4112 + break;
4113 + case 1:
4114 + ret = write_ldt(ptr, bytecount, 1);
4115 + break;
4116 + case 2:
4117 + ret = read_default_ldt(ptr, bytecount);
4118 + break;
4119 + case 0x11:
4120 + ret = write_ldt(ptr, bytecount, 0);
4121 + break;
4122 + }
4123 + return ret;
4124 +}
4125 --- sle11-2009-05-14.orig/arch/x86/kernel/ldt_32-xen.c 2009-02-16 16:18:36.000000000 +0100
4126 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
4127 @@ -1,265 +0,0 @@
4128 -/*
4129 - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
4130 - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4131 - */
4132 -
4133 -#include <linux/errno.h>
4134 -#include <linux/sched.h>
4135 -#include <linux/string.h>
4136 -#include <linux/mm.h>
4137 -#include <linux/smp.h>
4138 -#include <linux/vmalloc.h>
4139 -#include <linux/slab.h>
4140 -
4141 -#include <asm/uaccess.h>
4142 -#include <asm/system.h>
4143 -#include <asm/ldt.h>
4144 -#include <asm/desc.h>
4145 -#include <asm/mmu_context.h>
4146 -
4147 -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
4148 -static void flush_ldt(void *null)
4149 -{
4150 - if (current->active_mm)
4151 - load_LDT(&current->active_mm->context);
4152 -}
4153 -#endif
4154 -
4155 -static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
4156 -{
4157 - void *oldldt;
4158 - void *newldt;
4159 - int oldsize;
4160 -
4161 - if (mincount <= pc->size)
4162 - return 0;
4163 - oldsize = pc->size;
4164 - mincount = (mincount+511)&(~511);
4165 - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
4166 - newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
4167 - else
4168 - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
4169 -
4170 - if (!newldt)
4171 - return -ENOMEM;
4172 -
4173 - if (oldsize)
4174 - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
4175 - oldldt = pc->ldt;
4176 - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
4177 - pc->ldt = newldt;
4178 - wmb();
4179 - pc->size = mincount;
4180 - wmb();
4181 -
4182 - if (reload) {
4183 -#ifdef CONFIG_SMP
4184 - cpumask_t mask;
4185 - preempt_disable();
4186 -#endif
4187 - make_pages_readonly(
4188 - pc->ldt,
4189 - (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4190 - XENFEAT_writable_descriptor_tables);
4191 - load_LDT(pc);
4192 -#ifdef CONFIG_SMP
4193 - mask = cpumask_of_cpu(smp_processor_id());
4194 - if (!cpus_equal(current->mm->cpu_vm_mask, mask))
4195 - smp_call_function(flush_ldt, NULL, 1, 1);
4196 - preempt_enable();
4197 -#endif
4198 - }
4199 - if (oldsize) {
4200 - make_pages_writable(
4201 - oldldt,
4202 - (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
4203 - XENFEAT_writable_descriptor_tables);
4204 - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
4205 - vfree(oldldt);
4206 - else
4207 - kfree(oldldt);
4208 - }
4209 - return 0;
4210 -}
4211 -
4212 -static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
4213 -{
4214 - int err = alloc_ldt(new, old->size, 0);
4215 - if (err < 0)
4216 - return err;
4217 - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
4218 - make_pages_readonly(
4219 - new->ldt,
4220 - (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4221 - XENFEAT_writable_descriptor_tables);
4222 - return 0;
4223 -}
4224 -
4225 -/*
4226 - * we do not have to muck with descriptors here, that is
4227 - * done in switch_mm() as needed.
4228 - */
4229 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
4230 -{
4231 - struct mm_struct * old_mm;
4232 - int retval = 0;
4233 -
4234 - mutex_init(&mm->context.lock);
4235 - mm->context.size = 0;
4236 - mm->context.has_foreign_mappings = 0;
4237 - old_mm = current->mm;
4238 - if (old_mm && old_mm->context.size > 0) {
4239 - mutex_lock(&old_mm->context.lock);
4240 - retval = copy_ldt(&mm->context, &old_mm->context);
4241 - mutex_unlock(&old_mm->context.lock);
4242 - }
4243 - return retval;
4244 -}
4245 -
4246 -/*
4247 - * No need to lock the MM as we are the last user
4248 - */
4249 -void destroy_context(struct mm_struct *mm)
4250 -{
4251 - if (mm->context.size) {
4252 - if (mm == current->active_mm)
4253 - clear_LDT();
4254 - make_pages_writable(
4255 - mm->context.ldt,
4256 - (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4257 - XENFEAT_writable_descriptor_tables);
4258 - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
4259 - vfree(mm->context.ldt);
4260 - else
4261 - kfree(mm->context.ldt);
4262 - mm->context.size = 0;
4263 - }
4264 -}
4265 -
4266 -static int read_ldt(void __user * ptr, unsigned long bytecount)
4267 -{
4268 - int err;
4269 - unsigned long size;
4270 - struct mm_struct * mm = current->mm;
4271 -
4272 - if (!mm->context.size)
4273 - return 0;
4274 - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
4275 - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
4276 -
4277 - mutex_lock(&mm->context.lock);
4278 - size = mm->context.size*LDT_ENTRY_SIZE;
4279 - if (size > bytecount)
4280 - size = bytecount;
4281 -
4282 - err = 0;
4283 - if (copy_to_user(ptr, mm->context.ldt, size))
4284 - err = -EFAULT;
4285 - mutex_unlock(&mm->context.lock);
4286 - if (err < 0)
4287 - goto error_return;
4288 - if (size != bytecount) {
4289 - /* zero-fill the rest */
4290 - if (clear_user(ptr+size, bytecount-size) != 0) {
4291 - err = -EFAULT;
4292 - goto error_return;
4293 - }
4294 - }
4295 - return bytecount;
4296 -error_return:
4297 - return err;
4298 -}
4299 -
4300 -static int read_default_ldt(void __user * ptr, unsigned long bytecount)
4301 -{
4302 - int err;
4303 - unsigned long size;
4304 -
4305 - err = 0;
4306 - size = 5*sizeof(struct desc_struct);
4307 - if (size > bytecount)
4308 - size = bytecount;
4309 -
4310 - err = size;
4311 - if (clear_user(ptr, size))
4312 - err = -EFAULT;
4313 -
4314 - return err;
4315 -}
4316 -
4317 -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
4318 -{
4319 - struct mm_struct * mm = current->mm;
4320 - __u32 entry_1, entry_2;
4321 - int error;
4322 - struct user_desc ldt_info;
4323 -
4324 - error = -EINVAL;
4325 - if (bytecount != sizeof(ldt_info))
4326 - goto out;
4327 - error = -EFAULT;
4328 - if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
4329 - goto out;
4330 -
4331 - error = -EINVAL;
4332 - if (ldt_info.entry_number >= LDT_ENTRIES)
4333 - goto out;
4334 - if (ldt_info.contents == 3) {
4335 - if (oldmode)
4336 - goto out;
4337 - if (ldt_info.seg_not_present == 0)
4338 - goto out;
4339 - }
4340 -
4341 - mutex_lock(&mm->context.lock);
4342 - if (ldt_info.entry_number >= mm->context.size) {
4343 - error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
4344 - if (error < 0)
4345 - goto out_unlock;
4346 - }
4347 -
4348 - /* Allow LDTs to be cleared by the user. */
4349 - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4350 - if (oldmode || LDT_empty(&ldt_info)) {
4351 - entry_1 = 0;
4352 - entry_2 = 0;
4353 - goto install;
4354 - }
4355 - }
4356 -
4357 - entry_1 = LDT_entry_a(&ldt_info);
4358 - entry_2 = LDT_entry_b(&ldt_info);
4359 - if (oldmode)
4360 - entry_2 &= ~(1 << 20);
4361 -
4362 - /* Install the new entry ... */
4363 -install:
4364 - error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
4365 - entry_1, entry_2);
4366 -
4367 -out_unlock:
4368 - mutex_unlock(&mm->context.lock);
4369 -out:
4370 - return error;
4371 -}
4372 -
4373 -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
4374 -{
4375 - int ret = -ENOSYS;
4376 -
4377 - switch (func) {
4378 - case 0:
4379 - ret = read_ldt(ptr, bytecount);
4380 - break;
4381 - case 1:
4382 - ret = write_ldt(ptr, bytecount, 1);
4383 - break;
4384 - case 2:
4385 - ret = read_default_ldt(ptr, bytecount);
4386 - break;
4387 - case 0x11:
4388 - ret = write_ldt(ptr, bytecount, 0);
4389 - break;
4390 - }
4391 - return ret;
4392 -}
4393 --- sle11-2009-05-14.orig/arch/x86/kernel/ldt_64-xen.c 2009-02-16 16:18:36.000000000 +0100
4394 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
4395 @@ -1,271 +0,0 @@
4396 -/*
4397 - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
4398 - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4399 - * Copyright (C) 2002 Andi Kleen
4400 - *
4401 - * This handles calls from both 32bit and 64bit mode.
4402 - */
4403 -
4404 -#include <linux/errno.h>
4405 -#include <linux/sched.h>
4406 -#include <linux/string.h>
4407 -#include <linux/mm.h>
4408 -#include <linux/smp.h>
4409 -#include <linux/vmalloc.h>
4410 -#include <linux/slab.h>
4411 -
4412 -#include <asm/uaccess.h>
4413 -#include <asm/system.h>
4414 -#include <asm/ldt.h>
4415 -#include <asm/desc.h>
4416 -#include <asm/proto.h>
4417 -#include <asm/pgalloc.h>
4418 -
4419 -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
4420 -static void flush_ldt(void *null)
4421 -{
4422 - if (current->active_mm)
4423 - load_LDT(&current->active_mm->context);
4424 -}
4425 -#endif
4426 -
4427 -static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
4428 -{
4429 - void *oldldt;
4430 - void *newldt;
4431 - unsigned oldsize;
4432 -
4433 - if (mincount <= (unsigned)pc->size)
4434 - return 0;
4435 - oldsize = pc->size;
4436 - mincount = (mincount+511)&(~511);
4437 - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
4438 - newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
4439 - else
4440 - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
4441 -
4442 - if (!newldt)
4443 - return -ENOMEM;
4444 -
4445 - if (oldsize)
4446 - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
4447 - oldldt = pc->ldt;
4448 - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
4449 - wmb();
4450 - pc->ldt = newldt;
4451 - wmb();
4452 - pc->size = mincount;
4453 - wmb();
4454 - if (reload) {
4455 -#ifdef CONFIG_SMP
4456 - cpumask_t mask;
4457 -
4458 - preempt_disable();
4459 -#endif
4460 - make_pages_readonly(
4461 - pc->ldt,
4462 - (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4463 - XENFEAT_writable_descriptor_tables);
4464 - load_LDT(pc);
4465 -#ifdef CONFIG_SMP
4466 - mask = cpumask_of_cpu(smp_processor_id());
4467 - if (!cpus_equal(current->mm->cpu_vm_mask, mask))
4468 - smp_call_function(flush_ldt, NULL, 1, 1);
4469 - preempt_enable();
4470 -#endif
4471 - }
4472 - if (oldsize) {
4473 - make_pages_writable(
4474 - oldldt,
4475 - (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
4476 - XENFEAT_writable_descriptor_tables);
4477 - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
4478 - vfree(oldldt);
4479 - else
4480 - kfree(oldldt);
4481 - }
4482 - return 0;
4483 -}
4484 -
4485 -static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
4486 -{
4487 - int err = alloc_ldt(new, old->size, 0);
4488 - if (err < 0)
4489 - return err;
4490 - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
4491 - make_pages_readonly(
4492 - new->ldt,
4493 - (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4494 - XENFEAT_writable_descriptor_tables);
4495 - return 0;
4496 -}
4497 -
4498 -/*
4499 - * we do not have to muck with descriptors here, that is
4500 - * done in switch_mm() as needed.
4501 - */
4502 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
4503 -{
4504 - struct mm_struct * old_mm;
4505 - int retval = 0;
4506 -
4507 - memset(&mm->context, 0, sizeof(mm->context));
4508 - mutex_init(&mm->context.lock);
4509 - old_mm = current->mm;
4510 - if (old_mm)
4511 - mm->context.vdso = old_mm->context.vdso;
4512 - if (old_mm && old_mm->context.size > 0) {
4513 - mutex_lock(&old_mm->context.lock);
4514 - retval = copy_ldt(&mm->context, &old_mm->context);
4515 - mutex_unlock(&old_mm->context.lock);
4516 - }
4517 - return retval;
4518 -}
4519 -
4520 -/*
4521 - *
4522 - * Don't touch the LDT register - we're already in the next thread.
4523 - */
4524 -void destroy_context(struct mm_struct *mm)
4525 -{
4526 - if (mm->context.size) {
4527 - if (mm == current->active_mm)
4528 - clear_LDT();
4529 - make_pages_writable(
4530 - mm->context.ldt,
4531 - (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4532 - XENFEAT_writable_descriptor_tables);
4533 - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
4534 - vfree(mm->context.ldt);
4535 - else
4536 - kfree(mm->context.ldt);
4537 - mm->context.size = 0;
4538 - }
4539 -}
4540 -
4541 -static int read_ldt(void __user * ptr, unsigned long bytecount)
4542 -{
4543 - int err;
4544 - unsigned long size;
4545 - struct mm_struct * mm = current->mm;
4546 -
4547 - if (!mm->context.size)
4548 - return 0;
4549 - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
4550 - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
4551 -
4552 - mutex_lock(&mm->context.lock);
4553 - size = mm->context.size*LDT_ENTRY_SIZE;
4554 - if (size > bytecount)
4555 - size = bytecount;
4556 -
4557 - err = 0;
4558 - if (copy_to_user(ptr, mm->context.ldt, size))
4559 - err = -EFAULT;
4560 - mutex_unlock(&mm->context.lock);
4561 - if (err < 0)
4562 - goto error_return;
4563 - if (size != bytecount) {
4564 - /* zero-fill the rest */
4565 - if (clear_user(ptr+size, bytecount-size) != 0) {
4566 - err = -EFAULT;
4567 - goto error_return;
4568 - }
4569 - }
4570 - return bytecount;
4571 -error_return:
4572 - return err;
4573 -}
4574 -
4575 -static int read_default_ldt(void __user * ptr, unsigned long bytecount)
4576 -{
4577 - /* Arbitrary number */
4578 - /* x86-64 default LDT is all zeros */
4579 - if (bytecount > 128)
4580 - bytecount = 128;
4581 - if (clear_user(ptr, bytecount))
4582 - return -EFAULT;
4583 - return bytecount;
4584 -}
4585 -
4586 -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
4587 -{
4588 - struct task_struct *me = current;
4589 - struct mm_struct * mm = me->mm;
4590 - __u32 entry_1, entry_2, *lp;
4591 - unsigned long mach_lp;
4592 - int error;
4593 - struct user_desc ldt_info;
4594 -
4595 - error = -EINVAL;
4596 -
4597 - if (bytecount != sizeof(ldt_info))
4598 - goto out;
4599 - error = -EFAULT;
4600 - if (copy_from_user(&ldt_info, ptr, bytecount))
4601 - goto out;
4602 -
4603 - error = -EINVAL;
4604 - if (ldt_info.entry_number >= LDT_ENTRIES)
4605 - goto out;
4606 - if (ldt_info.contents == 3) {
4607 - if (oldmode)
4608 - goto out;
4609 - if (ldt_info.seg_not_present == 0)
4610 - goto out;
4611 - }
4612 -
4613 - mutex_lock(&mm->context.lock);
4614 - if (ldt_info.entry_number >= (unsigned)mm->context.size) {
4615 - error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
4616 - if (error < 0)
4617 - goto out_unlock;
4618 - }
4619 -
4620 - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
4621 - mach_lp = arbitrary_virt_to_machine(lp);
4622 -
4623 - /* Allow LDTs to be cleared by the user. */
4624 - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4625 - if (oldmode || LDT_empty(&ldt_info)) {
4626 - entry_1 = 0;
4627 - entry_2 = 0;
4628 - goto install;
4629 - }
4630 - }
4631 -
4632 - entry_1 = LDT_entry_a(&ldt_info);
4633 - entry_2 = LDT_entry_b(&ldt_info);
4634 - if (oldmode)
4635 - entry_2 &= ~(1 << 20);
4636 -
4637 - /* Install the new entry ... */
4638 -install:
4639 - error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
4640 -
4641 -out_unlock:
4642 - mutex_unlock(&mm->context.lock);
4643 -out:
4644 - return error;
4645 -}
4646 -
4647 -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
4648 -{
4649 - int ret = -ENOSYS;
4650 -
4651 - switch (func) {
4652 - case 0:
4653 - ret = read_ldt(ptr, bytecount);
4654 - break;
4655 - case 1:
4656 - ret = write_ldt(ptr, bytecount, 1);
4657 - break;
4658 - case 2:
4659 - ret = read_default_ldt(ptr, bytecount);
4660 - break;
4661 - case 0x11:
4662 - ret = write_ldt(ptr, bytecount, 0);
4663 - break;
4664 - }
4665 - return ret;
4666 -}
4667 --- sle11-2009-05-14.orig/arch/x86/kernel/machine_kexec_64.c 2008-11-25 12:35:54.000000000 +0100
4668 +++ sle11-2009-05-14/arch/x86/kernel/machine_kexec_64.c 2009-03-16 16:33:40.000000000 +0100
4669 @@ -300,7 +300,9 @@ void machine_kexec(struct kimage *image)
4670
4671 void arch_crash_save_vmcoreinfo(void)
4672 {
4673 +#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */
4674 VMCOREINFO_SYMBOL(phys_base);
4675 +#endif
4676 VMCOREINFO_SYMBOL(init_level4_pgt);
4677
4678 #ifdef CONFIG_NUMA
4679 --- sle11-2009-05-14.orig/arch/x86/kernel/microcode-xen.c 2009-02-16 16:17:21.000000000 +0100
4680 +++ sle11-2009-05-14/arch/x86/kernel/microcode-xen.c 2009-03-16 16:33:40.000000000 +0100
4681 @@ -167,7 +167,7 @@ static int request_microcode(void)
4682 }
4683
4684 op.cmd = XENPF_microcode_update;
4685 - set_xen_guest_handle(op.u.microcode.data, (void *)firmware->data);
4686 + set_xen_guest_handle(op.u.microcode.data, firmware->data);
4687 op.u.microcode.length = firmware->size;
4688 error = HYPERVISOR_platform_op(&op);
4689
4690 --- sle11-2009-05-14.orig/arch/x86/kernel/mpparse_32-xen.c 2009-02-16 16:18:36.000000000 +0100
4691 +++ sle11-2009-05-14/arch/x86/kernel/mpparse_32-xen.c 2009-03-16 16:33:40.000000000 +0100
4692 @@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0;
4693 /* Processor that is doing the boot up */
4694 unsigned int boot_cpu_physical_apicid = -1U;
4695 /* Internal processor count */
4696 -unsigned int __cpuinitdata num_processors;
4697 +unsigned int num_processors;
4698
4699 /* Bitmask of physically existing CPUs */
4700 physid_mask_t phys_cpu_present_map;
4701 @@ -265,7 +265,7 @@ static void __init MP_ioapic_info (struc
4702 if (!(m->mpc_flags & MPC_APIC_USABLE))
4703 return;
4704
4705 - printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
4706 + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
4707 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
4708 if (nr_ioapics >= MAX_IO_APICS) {
4709 printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
4710 @@ -412,9 +412,9 @@ static int __init smp_read_mpc(struct mp
4711
4712 mps_oem_check(mpc, oem, str);
4713
4714 - printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
4715 + printk("APIC at: 0x%X\n", mpc->mpc_lapic);
4716
4717 - /*
4718 + /*
4719 * Save the local APIC address (it might be non-default) -- but only
4720 * if we're not using ACPI.
4721 */
4722 @@ -728,7 +728,7 @@ static int __init smp_scan_config (unsig
4723 unsigned long *bp = isa_bus_to_virt(base);
4724 struct intel_mp_floating *mpf;
4725
4726 - Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
4727 + printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
4728 if (sizeof(*mpf) != 16)
4729 printk("Error: MPF size\n");
4730
4731 @@ -742,9 +742,10 @@ static int __init smp_scan_config (unsig
4732
4733 smp_found_config = 1;
4734 #ifndef CONFIG_XEN
4735 - printk(KERN_INFO "found SMP MP-table at %08lx\n",
4736 - virt_to_phys(mpf));
4737 - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
4738 + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
4739 + mpf, virt_to_phys(mpf));
4740 + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
4741 + BOOTMEM_DEFAULT);
4742 if (mpf->mpf_physptr) {
4743 /*
4744 * We cannot access to MPC table to compute
4745 @@ -759,11 +760,12 @@ static int __init smp_scan_config (unsig
4746 unsigned long end = max_low_pfn * PAGE_SIZE;
4747 if (mpf->mpf_physptr + size > end)
4748 size = end - mpf->mpf_physptr;
4749 - reserve_bootmem(mpf->mpf_physptr, size);
4750 + reserve_bootmem(mpf->mpf_physptr, size,
4751 + BOOTMEM_DEFAULT);
4752 }
4753 #else
4754 - printk(KERN_INFO "found SMP MP-table at %08lx\n",
4755 - ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
4756 + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
4757 + mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
4758 #endif
4759
4760 mpf_found = mpf;
4761 @@ -940,14 +942,14 @@ void __init mp_register_ioapic(u8 id, u3
4762 */
4763 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
4764 mp_ioapic_routing[idx].gsi_base = gsi_base;
4765 - mp_ioapic_routing[idx].gsi_end = gsi_base +
4766 + mp_ioapic_routing[idx].gsi_end = gsi_base +
4767 io_apic_get_redir_entries(idx);
4768
4769 - printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
4770 - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
4771 - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
4772 - mp_ioapic_routing[idx].gsi_base,
4773 - mp_ioapic_routing[idx].gsi_end);
4774 + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4775 + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
4776 + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
4777 + mp_ioapic_routing[idx].gsi_base,
4778 + mp_ioapic_routing[idx].gsi_end);
4779 }
4780
4781 void __init
4782 @@ -1063,15 +1065,16 @@ void __init mp_config_acpi_legacy_irqs (
4783 }
4784
4785 #define MAX_GSI_NUM 4096
4786 +#define IRQ_COMPRESSION_START 64
4787
4788 int mp_register_gsi(u32 gsi, int triggering, int polarity)
4789 {
4790 int ioapic = -1;
4791 int ioapic_pin = 0;
4792 int idx, bit = 0;
4793 - static int pci_irq = 16;
4794 + static int pci_irq = IRQ_COMPRESSION_START;
4795 /*
4796 - * Mapping between Global System Interrups, which
4797 + * Mapping between Global System Interrupts, which
4798 * represent all possible interrupts, and IRQs
4799 * assigned to actual devices.
4800 */
4801 @@ -1108,12 +1111,16 @@ int mp_register_gsi(u32 gsi, int trigger
4802 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
4803 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
4804 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
4805 - return gsi_to_irq[gsi];
4806 + return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
4807 }
4808
4809 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
4810
4811 - if (triggering == ACPI_LEVEL_SENSITIVE) {
4812 + /*
4813 + * For GSI >= 64, use IRQ compression
4814 + */
4815 + if ((gsi >= IRQ_COMPRESSION_START)
4816 + && (triggering == ACPI_LEVEL_SENSITIVE)) {
4817 /*
4818 * For PCI devices assign IRQs in order, avoiding gaps
4819 * due to unused I/O APIC pins.
4820 --- sle11-2009-05-14.orig/arch/x86/kernel/mpparse_64-xen.c 2009-02-16 16:18:36.000000000 +0100
4821 +++ sle11-2009-05-14/arch/x86/kernel/mpparse_64-xen.c 2009-03-16 16:33:40.000000000 +0100
4822 @@ -60,14 +60,20 @@ unsigned int boot_cpu_id = -1U;
4823 EXPORT_SYMBOL(boot_cpu_id);
4824
4825 /* Internal processor count */
4826 -unsigned int num_processors __cpuinitdata = 0;
4827 +unsigned int num_processors;
4828
4829 unsigned disabled_cpus __cpuinitdata;
4830
4831 /* Bitmask of physically existing CPUs */
4832 physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
4833
4834 -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
4835 +#ifndef CONFIG_XEN
4836 +u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
4837 + = { [0 ... NR_CPUS-1] = BAD_APICID };
4838 +void *x86_bios_cpu_apicid_early_ptr;
4839 +#endif
4840 +DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
4841 +EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
4842
4843
4844 /*
4845 @@ -119,24 +125,22 @@ static void __cpuinit MP_processor_info(
4846 physid_set(m->mpc_apicid, phys_cpu_present_map);
4847 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
4848 /*
4849 - * bios_cpu_apicid is required to have processors listed
4850 + * x86_bios_cpu_apicid is required to have processors listed
4851 * in same order as logical cpu numbers. Hence the first
4852 * entry is BSP, and so on.
4853 */
4854 cpu = 0;
4855 }
4856 - bios_cpu_apicid[cpu] = m->mpc_apicid;
4857 - /*
4858 - * We get called early in the the start_kernel initialization
4859 - * process when the per_cpu data area is not yet setup, so we
4860 - * use a static array that is removed after the per_cpu data
4861 - * area is created.
4862 - */
4863 - if (x86_cpu_to_apicid_ptr) {
4864 - u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
4865 - x86_cpu_to_apicid[cpu] = m->mpc_apicid;
4866 + /* are we being called early in kernel startup? */
4867 + if (x86_cpu_to_apicid_early_ptr) {
4868 + u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
4869 + u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
4870 +
4871 + cpu_to_apicid[cpu] = m->mpc_apicid;
4872 + bios_cpu_apicid[cpu] = m->mpc_apicid;
4873 } else {
4874 per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
4875 + per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
4876 }
4877
4878 cpu_set(cpu, cpu_possible_map);
4879 --- sle11-2009-05-14.orig/arch/x86/kernel/pci-dma-xen.c 2009-02-16 16:18:36.000000000 +0100
4880 +++ sle11-2009-05-14/arch/x86/kernel/pci-dma-xen.c 2009-03-16 16:33:40.000000000 +0100
4881 @@ -434,3 +434,23 @@ dma_sync_single_for_device(struct device
4882 swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
4883 }
4884 EXPORT_SYMBOL(dma_sync_single_for_device);
4885 +
4886 +void
4887 +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
4888 + enum dma_data_direction direction)
4889 +{
4890 + if (swiotlb)
4891 + swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
4892 + flush_write_buffers();
4893 +}
4894 +EXPORT_SYMBOL(dma_sync_sg_for_cpu);
4895 +
4896 +void
4897 +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
4898 + enum dma_data_direction direction)
4899 +{
4900 + if (swiotlb)
4901 + swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
4902 + flush_write_buffers();
4903 +}
4904 +EXPORT_SYMBOL(dma_sync_sg_for_device);
4905 --- sle11-2009-05-14.orig/arch/x86/kernel/process_32-xen.c 2009-02-16 16:18:36.000000000 +0100
4906 +++ sle11-2009-05-14/arch/x86/kernel/process_32-xen.c 2009-03-16 16:33:40.000000000 +0100
4907 @@ -23,7 +23,6 @@
4908 #include <linux/slab.h>
4909 #include <linux/vmalloc.h>
4910 #include <linux/user.h>
4911 -#include <linux/a.out.h>
4912 #include <linux/interrupt.h>
4913 #include <linux/utsname.h>
4914 #include <linux/delay.h>
4915 @@ -59,8 +58,10 @@
4916
4917 #include <asm/tlbflush.h>
4918 #include <asm/cpu.h>
4919 +#include <asm/kdebug.h>
4920
4921 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
4922 +asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
4923
4924 static int hlt_counter;
4925
4926 @@ -78,7 +79,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
4927 */
4928 unsigned long thread_saved_pc(struct task_struct *tsk)
4929 {
4930 - return ((unsigned long *)tsk->thread.esp)[3];
4931 + return ((unsigned long *)tsk->thread.sp)[3];
4932 }
4933
4934 /*
4935 @@ -86,7 +87,6 @@ unsigned long thread_saved_pc(struct tas
4936 */
4937 void (*pm_idle)(void);
4938 EXPORT_SYMBOL(pm_idle);
4939 -static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
4940
4941 void disable_hlt(void)
4942 {
4943 @@ -107,7 +107,7 @@ EXPORT_SYMBOL(enable_hlt);
4944 * to poll the ->work.need_resched flag instead of waiting for the
4945 * cross-CPU IPI to arrive. Use this option with caution.
4946 */
4947 -static void poll_idle (void)
4948 +static void poll_idle(void)
4949 {
4950 cpu_relax();
4951 }
4952 @@ -122,10 +122,19 @@ static void xen_idle(void)
4953 smp_mb();
4954
4955 local_irq_disable();
4956 - if (!need_resched())
4957 + if (!need_resched()) {
4958 + ktime_t t0, t1;
4959 + u64 t0n, t1n;
4960 +
4961 + t0 = ktime_get();
4962 + t0n = ktime_to_ns(t0);
4963 safe_halt(); /* enables interrupts racelessly */
4964 - else
4965 - local_irq_enable();
4966 + local_irq_disable();
4967 + t1 = ktime_get();
4968 + t1n = ktime_to_ns(t1);
4969 + sched_clock_idle_wakeup_event(t1n - t0n);
4970 + }
4971 + local_irq_enable();
4972 current_thread_info()->status |= TS_POLLING;
4973 }
4974 #ifdef CONFIG_APM_MODULE
4975 @@ -168,13 +177,13 @@ void cpu_idle(void)
4976 while (!need_resched()) {
4977 void (*idle)(void);
4978
4979 - if (__get_cpu_var(cpu_idle_state))
4980 - __get_cpu_var(cpu_idle_state) = 0;
4981 -
4982 check_pgt_cache();
4983 rmb();
4984 idle = xen_idle; /* no alternatives */
4985
4986 + if (rcu_pending(cpu))
4987 + rcu_check_callbacks(cpu, 0);
4988 +
4989 if (cpu_is_offline(cpu))
4990 play_dead();
4991
4992 @@ -192,40 +201,19 @@ static void do_nothing(void *unused)
4993 {
4994 }
4995
4996 +/*
4997 + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
4998 + * pm_idle and update to new pm_idle value. Required while changing pm_idle
4999 + * handler on SMP systems.
5000 + *
5001 + * Caller must have changed pm_idle to the new value before the call. Old
5002 + * pm_idle value will not be used by any CPU after the return of this function.
5003 + */
5004 void cpu_idle_wait(void)
5005 {
5006 - unsigned int cpu, this_cpu = get_cpu();
5007 - cpumask_t map, tmp = current->cpus_allowed;
5008 -
5009 - set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5010 - put_cpu();
5011 -
5012 - cpus_clear(map);
5013 - for_each_online_cpu(cpu) {
5014 - per_cpu(cpu_idle_state, cpu) = 1;
5015 - cpu_set(cpu, map);
5016 - }
5017 -
5018 - __get_cpu_var(cpu_idle_state) = 0;
5019 -
5020 - wmb();
5021 - do {
5022 - ssleep(1);
5023 - for_each_online_cpu(cpu) {
5024 - if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
5025 - cpu_clear(cpu, map);
5026 - }
5027 - cpus_and(map, map, cpu_online_map);
5028 - /*
5029 - * We waited 1 sec, if a CPU still did not call idle
5030 - * it may be because it is in idle and not waking up
5031 - * because it has nothing to do.
5032 - * Give all the remaining CPUS a kick.
5033 - */
5034 - smp_call_function_mask(map, do_nothing, 0, 0);
5035 - } while (!cpus_empty(map));
5036 -
5037 - set_cpus_allowed(current, tmp);
5038 + smp_mb();
5039 + /* kick all the CPUs so that they exit out of pm_idle */
5040 + smp_call_function(do_nothing, NULL, 0, 1);
5041 }
5042 EXPORT_SYMBOL_GPL(cpu_idle_wait);
5043
5044 @@ -251,15 +239,15 @@ void __show_registers(struct pt_regs *re
5045 {
5046 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
5047 unsigned long d0, d1, d2, d3, d6, d7;
5048 - unsigned long esp;
5049 + unsigned long sp;
5050 unsigned short ss, gs;
5051
5052 if (user_mode_vm(regs)) {
5053 - esp = regs->esp;
5054 - ss = regs->xss & 0xffff;
5055 + sp = regs->sp;
5056 + ss = regs->ss & 0xffff;
5057 savesegment(gs, gs);
5058 } else {
5059 - esp = (unsigned long) (&regs->esp);
5060 + sp = (unsigned long) (&regs->sp);
5061 savesegment(ss, ss);
5062 savesegment(gs, gs);
5063 }
5064 @@ -272,17 +260,17 @@ void __show_registers(struct pt_regs *re
5065 init_utsname()->version);
5066
5067 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
5068 - 0xffff & regs->xcs, regs->eip, regs->eflags,
5069 + 0xffff & regs->cs, regs->ip, regs->flags,
5070 smp_processor_id());
5071 - print_symbol("EIP is at %s\n", regs->eip);
5072 + print_symbol("EIP is at %s\n", regs->ip);
5073
5074 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
5075 - regs->eax, regs->ebx, regs->ecx, regs->edx);
5076 + regs->ax, regs->bx, regs->cx, regs->dx);
5077 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
5078 - regs->esi, regs->edi, regs->ebp, esp);
5079 + regs->si, regs->di, regs->bp, sp);
5080 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
5081 - regs->xds & 0xffff, regs->xes & 0xffff,
5082 - regs->xfs & 0xffff, gs, ss);
5083 + regs->ds & 0xffff, regs->es & 0xffff,
5084 + regs->fs & 0xffff, gs, ss);
5085
5086 if (!all)
5087 return;
5088 @@ -310,12 +298,12 @@ void __show_registers(struct pt_regs *re
5089 void show_regs(struct pt_regs *regs)
5090 {
5091 __show_registers(regs, 1);
5092 - show_trace(NULL, regs, &regs->esp);
5093 + show_trace(NULL, regs, &regs->sp, regs->bp);
5094 }
5095
5096 /*
5097 - * This gets run with %ebx containing the
5098 - * function to call, and %edx containing
5099 + * This gets run with %bx containing the
5100 + * function to call, and %dx containing
5101 * the "args".
5102 */
5103 extern void kernel_thread_helper(void);
5104 @@ -329,16 +317,16 @@ int kernel_thread(int (*fn)(void *), voi
5105
5106 memset(&regs, 0, sizeof(regs));
5107
5108 - regs.ebx = (unsigned long) fn;
5109 - regs.edx = (unsigned long) arg;
5110 + regs.bx = (unsigned long) fn;
5111 + regs.dx = (unsigned long) arg;
5112
5113 - regs.xds = __USER_DS;
5114 - regs.xes = __USER_DS;
5115 - regs.xfs = __KERNEL_PERCPU;
5116 - regs.orig_eax = -1;
5117 - regs.eip = (unsigned long) kernel_thread_helper;
5118 - regs.xcs = __KERNEL_CS | get_kernel_rpl();
5119 - regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
5120 + regs.ds = __USER_DS;
5121 + regs.es = __USER_DS;
5122 + regs.fs = __KERNEL_PERCPU;
5123 + regs.orig_ax = -1;
5124 + regs.ip = (unsigned long) kernel_thread_helper;
5125 + regs.cs = __KERNEL_CS | get_kernel_rpl();
5126 + regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
5127
5128 /* Ok, create the new process.. */
5129 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
5130 @@ -368,7 +356,12 @@ void flush_thread(void)
5131 {
5132 struct task_struct *tsk = current;
5133
5134 - memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
5135 + tsk->thread.debugreg0 = 0;
5136 + tsk->thread.debugreg1 = 0;
5137 + tsk->thread.debugreg2 = 0;
5138 + tsk->thread.debugreg3 = 0;
5139 + tsk->thread.debugreg6 = 0;
5140 + tsk->thread.debugreg7 = 0;
5141 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5142 clear_tsk_thread_flag(tsk, TIF_DEBUG);
5143 /*
5144 @@ -393,7 +386,7 @@ void prepare_to_copy(struct task_struct
5145 unlazy_fpu(tsk);
5146 }
5147
5148 -int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
5149 +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
5150 unsigned long unused,
5151 struct task_struct * p, struct pt_regs * regs)
5152 {
5153 @@ -403,17 +396,19 @@ int copy_thread(int nr, unsigned long cl
5154
5155 childregs = task_pt_regs(p);
5156 *childregs = *regs;
5157 - childregs->eax = 0;
5158 - childregs->esp = esp;
5159 + childregs->ax = 0;
5160 + childregs->sp = sp;
5161
5162 - p->thread.esp = (unsigned long) childregs;
5163 - p->thread.esp0 = (unsigned long) (childregs+1);
5164 + p->thread.sp = (unsigned long) childregs;
5165 + p->thread.sp0 = (unsigned long) (childregs+1);
5166
5167 - p->thread.eip = (unsigned long) ret_from_fork;
5168 + p->thread.ip = (unsigned long) ret_from_fork;
5169
5170 - savesegment(gs,p->thread.gs);
5171 + savesegment(gs, p->thread.gs);
5172
5173 tsk = current;
5174 + if (test_tsk_thread_flag(tsk, TIF_CSTAR))
5175 + p->thread.ip = (unsigned long) cstar_ret_from_fork;
5176 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
5177 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
5178 IO_BITMAP_BYTES, GFP_KERNEL);
5179 @@ -424,34 +419,17 @@ int copy_thread(int nr, unsigned long cl
5180 set_tsk_thread_flag(p, TIF_IO_BITMAP);
5181 }
5182
5183 + err = 0;
5184 +
5185 /*
5186 * Set a new TLS for the child thread?
5187 */
5188 - if (clone_flags & CLONE_SETTLS) {
5189 - struct desc_struct *desc;
5190 - struct user_desc info;
5191 - int idx;
5192 -
5193 - err = -EFAULT;
5194 - if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
5195 - goto out;
5196 - err = -EINVAL;
5197 - if (LDT_empty(&info))
5198 - goto out;
5199 -
5200 - idx = info.entry_number;
5201 - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5202 - goto out;
5203 -
5204 - desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
5205 - desc->a = LDT_entry_a(&info);
5206 - desc->b = LDT_entry_b(&info);
5207 - }
5208 + if (clone_flags & CLONE_SETTLS)
5209 + err = do_set_thread_area(p, -1,
5210 + (struct user_desc __user *)childregs->si, 0);
5211
5212 p->thread.iopl = current->thread.iopl;
5213
5214 - err = 0;
5215 - out:
5216 if (err && p->thread.io_bitmap_ptr) {
5217 kfree(p->thread.io_bitmap_ptr);
5218 p->thread.io_bitmap_max = 0;
5219 @@ -459,67 +437,8 @@ int copy_thread(int nr, unsigned long cl
5220 return err;
5221 }
5222
5223 -/*
5224 - * fill in the user structure for a core dump..
5225 - */
5226 -void dump_thread(struct pt_regs * regs, struct user * dump)
5227 -{
5228 - int i;
5229 -
5230 -/* changed the size calculations - should hopefully work better. lbt */
5231 - dump->magic = CMAGIC;
5232 - dump->start_code = 0;
5233 - dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
5234 - dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
5235 - dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
5236 - dump->u_dsize -= dump->u_tsize;
5237 - dump->u_ssize = 0;
5238 - for (i = 0; i < 8; i++)
5239 - dump->u_debugreg[i] = current->thread.debugreg[i];
5240 -
5241 - if (dump->start_stack < TASK_SIZE)
5242 - dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
5243 -
5244 - dump->regs.ebx = regs->ebx;
5245 - dump->regs.ecx = regs->ecx;
5246 - dump->regs.edx = regs->edx;
5247 - dump->regs.esi = regs->esi;
5248 - dump->regs.edi = regs->edi;
5249 - dump->regs.ebp = regs->ebp;
5250 - dump->regs.eax = regs->eax;
5251 - dump->regs.ds = regs->xds;
5252 - dump->regs.es = regs->xes;
5253 - dump->regs.fs = regs->xfs;
5254 - savesegment(gs,dump->regs.gs);
5255 - dump->regs.orig_eax = regs->orig_eax;
5256 - dump->regs.eip = regs->eip;
5257 - dump->regs.cs = regs->xcs;
5258 - dump->regs.eflags = regs->eflags;
5259 - dump->regs.esp = regs->esp;
5260 - dump->regs.ss = regs->xss;
5261 -
5262 - dump->u_fpvalid = dump_fpu (regs, &dump->i387);
5263 -}
5264 -EXPORT_SYMBOL(dump_thread);
5265 -
5266 -/*
5267 - * Capture the user space registers if the task is not running (in user space)
5268 - */
5269 -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
5270 -{
5271 - struct pt_regs ptregs = *task_pt_regs(tsk);
5272 - ptregs.xcs &= 0xffff;
5273 - ptregs.xds &= 0xffff;
5274 - ptregs.xes &= 0xffff;
5275 - ptregs.xss &= 0xffff;
5276 -
5277 - elf_core_copy_regs(regs, &ptregs);
5278 -
5279 - return 1;
5280 -}
5281 -
5282 #ifdef CONFIG_SECCOMP
5283 -void hard_disable_TSC(void)
5284 +static void hard_disable_TSC(void)
5285 {
5286 write_cr4(read_cr4() | X86_CR4_TSD);
5287 }
5288 @@ -534,7 +453,7 @@ void disable_TSC(void)
5289 hard_disable_TSC();
5290 preempt_enable();
5291 }
5292 -void hard_enable_TSC(void)
5293 +static void hard_enable_TSC(void)
5294 {
5295 write_cr4(read_cr4() & ~X86_CR4_TSD);
5296 }
5297 @@ -543,18 +462,32 @@ void hard_enable_TSC(void)
5298 static noinline void
5299 __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
5300 {
5301 - struct thread_struct *next;
5302 + struct thread_struct *prev, *next;
5303 + unsigned long debugctl;
5304
5305 + prev = &prev_p->thread;
5306 next = &next_p->thread;
5307
5308 + debugctl = prev->debugctlmsr;
5309 + if (next->ds_area_msr != prev->ds_area_msr) {
5310 + /* we clear debugctl to make sure DS
5311 + * is not in use when we change it */
5312 + debugctl = 0;
5313 + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
5314 + wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
5315 + }
5316 +
5317 + if (next->debugctlmsr != debugctl)
5318 + wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
5319 +
5320 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
5321 - set_debugreg(next->debugreg[0], 0);
5322 - set_debugreg(next->debugreg[1], 1);
5323 - set_debugreg(next->debugreg[2], 2);
5324 - set_debugreg(next->debugreg[3], 3);
5325 + set_debugreg(next->debugreg0, 0);
5326 + set_debugreg(next->debugreg1, 1);
5327 + set_debugreg(next->debugreg2, 2);
5328 + set_debugreg(next->debugreg3, 3);
5329 /* no 4 and 5 */
5330 - set_debugreg(next->debugreg[6], 6);
5331 - set_debugreg(next->debugreg[7], 7);
5332 + set_debugreg(next->debugreg6, 6);
5333 + set_debugreg(next->debugreg7, 7);
5334 }
5335
5336 #ifdef CONFIG_SECCOMP
5337 @@ -567,6 +500,14 @@ __switch_to_xtra(struct task_struct *pre
5338 hard_enable_TSC();
5339 }
5340 #endif
5341 +
5342 +#ifdef X86_BTS
5343 + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
5344 + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
5345 +
5346 + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
5347 + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
5348 +#endif
5349 }
5350
5351 /*
5352 @@ -592,11 +533,11 @@ __switch_to_xtra(struct task_struct *pre
5353 * More important, however, is the fact that this allows us much
5354 * more flexibility.
5355 *
5356 - * The return value (in %eax) will be the "prev" task after
5357 + * The return value (in %ax) will be the "prev" task after
5358 * the task-switch, and shows up in ret_from_fork in entry.S,
5359 * for example.
5360 */
5361 -struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
5362 +struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
5363 {
5364 struct thread_struct *prev = &prev_p->thread,
5365 *next = &next_p->thread;
5366 @@ -632,12 +573,12 @@ struct task_struct fastcall * __switch_t
5367 #endif
5368
5369 /*
5370 - * Reload esp0.
5371 - * This is load_esp0(tss, next) with a multicall.
5372 + * Reload sp0.
5373 + * This is load_sp0(tss, next) with a multicall.
5374 */
5375 mcl->op = __HYPERVISOR_stack_switch;
5376 mcl->args[0] = __KERNEL_DS;
5377 - mcl->args[1] = next->esp0;
5378 + mcl->args[1] = next->sp0;
5379 mcl++;
5380
5381 /*
5382 @@ -734,7 +675,7 @@ struct task_struct fastcall * __switch_t
5383
5384 asmlinkage int sys_fork(struct pt_regs regs)
5385 {
5386 - return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
5387 + return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
5388 }
5389
5390 asmlinkage int sys_clone(struct pt_regs regs)
5391 @@ -743,12 +684,12 @@ asmlinkage int sys_clone(struct pt_regs
5392 unsigned long newsp;
5393 int __user *parent_tidptr, *child_tidptr;
5394
5395 - clone_flags = regs.ebx;
5396 - newsp = regs.ecx;
5397 - parent_tidptr = (int __user *)regs.edx;
5398 - child_tidptr = (int __user *)regs.edi;
5399 + clone_flags = regs.bx;
5400 + newsp = regs.cx;
5401 + parent_tidptr = (int __user *)regs.dx;
5402 + child_tidptr = (int __user *)regs.di;
5403 if (!newsp)
5404 - newsp = regs.esp;
5405 + newsp = regs.sp;
5406 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
5407 }
5408
5409 @@ -764,7 +705,7 @@ asmlinkage int sys_clone(struct pt_regs
5410 */
5411 asmlinkage int sys_vfork(struct pt_regs regs)
5412 {
5413 - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
5414 + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
5415 }
5416
5417 /*
5418 @@ -775,18 +716,15 @@ asmlinkage int sys_execve(struct pt_regs
5419 int error;
5420 char * filename;
5421
5422 - filename = getname((char __user *) regs.ebx);
5423 + filename = getname((char __user *) regs.bx);
5424 error = PTR_ERR(filename);
5425 if (IS_ERR(filename))
5426 goto out;
5427 error = do_execve(filename,
5428 - (char __user * __user *) regs.ecx,
5429 - (char __user * __user *) regs.edx,
5430 + (char __user * __user *) regs.cx,
5431 + (char __user * __user *) regs.dx,
5432 &regs);
5433 if (error == 0) {
5434 - task_lock(current);
5435 - current->ptrace &= ~PT_DTRACE;
5436 - task_unlock(current);
5437 /* Make sure we don't return using sysenter.. */
5438 set_thread_flag(TIF_IRET);
5439 }
5440 @@ -800,145 +738,37 @@ out:
5441
5442 unsigned long get_wchan(struct task_struct *p)
5443 {
5444 - unsigned long ebp, esp, eip;
5445 + unsigned long bp, sp, ip;
5446 unsigned long stack_page;
5447 int count = 0;
5448 if (!p || p == current || p->state == TASK_RUNNING)
5449 return 0;
5450 stack_page = (unsigned long)task_stack_page(p);
5451 - esp = p->thread.esp;
5452 - if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
5453 + sp = p->thread.sp;
5454 + if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
5455 return 0;
5456 - /* include/asm-i386/system.h:switch_to() pushes ebp last. */
5457 - ebp = *(unsigned long *) esp;
5458 + /* include/asm-i386/system.h:switch_to() pushes bp last. */
5459 + bp = *(unsigned long *) sp;
5460 do {
5461 - if (ebp < stack_page || ebp > top_ebp+stack_page)
5462 + if (bp < stack_page || bp > top_ebp+stack_page)
5463 return 0;
5464 - eip = *(unsigned long *) (ebp+4);
5465 - if (!in_sched_functions(eip))
5466 - return eip;
5467 - ebp = *(unsigned long *) ebp;
5468 + ip = *(unsigned long *) (bp+4);
5469 + if (!in_sched_functions(ip))
5470 + return ip;
5471 + bp = *(unsigned long *) bp;
5472 } while (count++ < 16);
5473 return 0;
5474 }
5475
5476 -/*
5477 - * sys_alloc_thread_area: get a yet unused TLS descriptor index.
5478 - */
5479 -static int get_free_idx(void)
5480 -{
5481 - struct thread_struct *t = &current->thread;
5482 - int idx;
5483 -
5484 - for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
5485 - if (desc_empty(t->tls_array + idx))
5486 - return idx + GDT_ENTRY_TLS_MIN;
5487 - return -ESRCH;
5488 -}
5489 -
5490 -/*
5491 - * Set a given TLS descriptor:
5492 - */
5493 -asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
5494 -{
5495 - struct thread_struct *t = &current->thread;
5496 - struct user_desc info;
5497 - struct desc_struct *desc;
5498 - int cpu, idx;
5499 -
5500 - if (copy_from_user(&info, u_info, sizeof(info)))
5501 - return -EFAULT;
5502 - idx = info.entry_number;
5503 -
5504 - /*
5505 - * index -1 means the kernel should try to find and
5506 - * allocate an empty descriptor:
5507 - */
5508 - if (idx == -1) {
5509 - idx = get_free_idx();
5510 - if (idx < 0)
5511 - return idx;
5512 - if (put_user(idx, &u_info->entry_number))
5513 - return -EFAULT;
5514 - }
5515 -
5516 - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5517 - return -EINVAL;
5518 -
5519 - desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
5520 -
5521 - /*
5522 - * We must not get preempted while modifying the TLS.
5523 - */
5524 - cpu = get_cpu();
5525 -
5526 - if (LDT_empty(&info)) {
5527 - desc->a = 0;
5528 - desc->b = 0;
5529 - } else {
5530 - desc->a = LDT_entry_a(&info);
5531 - desc->b = LDT_entry_b(&info);
5532 - }
5533 - load_TLS(t, cpu);
5534 -
5535 - put_cpu();
5536 -
5537 - return 0;
5538 -}
5539 -
5540 -/*
5541 - * Get the current Thread-Local Storage area:
5542 - */
5543 -
5544 -#define GET_BASE(desc) ( \
5545 - (((desc)->a >> 16) & 0x0000ffff) | \
5546 - (((desc)->b << 16) & 0x00ff0000) | \
5547 - ( (desc)->b & 0xff000000) )
5548 -
5549 -#define GET_LIMIT(desc) ( \
5550 - ((desc)->a & 0x0ffff) | \
5551 - ((desc)->b & 0xf0000) )
5552 -
5553 -#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
5554 -#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
5555 -#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
5556 -#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
5557 -#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
5558 -#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
5559 -
5560 -asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
5561 -{
5562 - struct user_desc info;
5563 - struct desc_struct *desc;
5564 - int idx;
5565 -
5566 - if (get_user(idx, &u_info->entry_number))
5567 - return -EFAULT;
5568 - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5569 - return -EINVAL;
5570 -
5571 - memset(&info, 0, sizeof(info));
5572 -
5573 - desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
5574 -
5575 - info.entry_number = idx;
5576 - info.base_addr = GET_BASE(desc);
5577 - info.limit = GET_LIMIT(desc);
5578 - info.seg_32bit = GET_32BIT(desc);
5579 - info.contents = GET_CONTENTS(desc);
5580 - info.read_exec_only = !GET_WRITABLE(desc);
5581 - info.limit_in_pages = GET_LIMIT_PAGES(desc);
5582 - info.seg_not_present = !GET_PRESENT(desc);
5583 - info.useable = GET_USEABLE(desc);
5584 -
5585 - if (copy_to_user(u_info, &info, sizeof(info)))
5586 - return -EFAULT;
5587 - return 0;
5588 -}
5589 -
5590 unsigned long arch_align_stack(unsigned long sp)
5591 {
5592 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
5593 sp -= get_random_int() % 8192;
5594 return sp & ~0xf;
5595 }
5596 +
5597 +unsigned long arch_randomize_brk(struct mm_struct *mm)
5598 +{
5599 + unsigned long range_end = mm->brk + 0x02000000;
5600 + return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
5601 +}
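
Illustration (not part of the patch): the arch_randomize_brk() helper added at the end of this file (and again in the 64-bit file below) picks a page-aligned heap start within 32 MiB (0x02000000) above the unrandomized brk, falling back to mm->brk when no random value is available. A minimal user-space sketch of that behaviour follows; the struct, the rand()-based randomize_range() stand-in and main() are illustrative assumptions, not kernel code.

/* User-space model of the brk randomization added above.
 * Assumption: randomize_range() is approximated with rand(); the real
 * kernel helper returns a page-aligned value in [start, end-len) or 0. */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define PAGE_MASK (~0xfffUL)

struct mm_struct { unsigned long brk; };        /* stand-in for the kernel type */

static unsigned long randomize_range(unsigned long start, unsigned long end,
                                     unsigned long len)
{
        unsigned long range;

        if (end <= start + len)
                return 0;
        range = end - len - start;
        return (start + (unsigned long)rand() % range) & PAGE_MASK;
}

static unsigned long arch_randomize_brk(struct mm_struct *mm)
{
        unsigned long range_end = mm->brk + 0x02000000;  /* brk + 32 MiB */
        return randomize_range(mm->brk, range_end, 0) ? : mm->brk;  /* GNU ?: as in the hunk */
}

int main(void)
{
        struct mm_struct mm = { .brk = 0x08100000UL };

        srand(time(NULL));
        printf("brk 0x%lx -> randomized 0x%lx\n",
               mm.brk, arch_randomize_brk(&mm));
        return 0;
}
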
5602 --- sle11-2009-05-14.orig/arch/x86/kernel/process_64-xen.c 2009-02-16 16:18:36.000000000 +0100
5603 +++ sle11-2009-05-14/arch/x86/kernel/process_64-xen.c 2009-03-16 16:33:40.000000000 +0100
5604 @@ -3,7 +3,7 @@
5605 *
5606 * Pentium III FXSR, SSE support
5607 * Gareth Hughes <gareth@valinux.com>, May 2000
5608 - *
5609 + *
5610 * X86-64 port
5611 * Andi Kleen.
5612 *
5613 @@ -22,19 +22,18 @@
5614 #include <linux/cpu.h>
5615 #include <linux/errno.h>
5616 #include <linux/sched.h>
5617 +#include <linux/fs.h>
5618 #include <linux/kernel.h>
5619 #include <linux/mm.h>
5620 -#include <linux/fs.h>
5621 #include <linux/elfcore.h>
5622 #include <linux/smp.h>
5623 #include <linux/slab.h>
5624 #include <linux/user.h>
5625 -#include <linux/module.h>
5626 -#include <linux/a.out.h>
5627 #include <linux/interrupt.h>
5628 +#include <linux/utsname.h>
5629 #include <linux/delay.h>
5630 +#include <linux/module.h>
5631 #include <linux/ptrace.h>
5632 -#include <linux/utsname.h>
5633 #include <linux/random.h>
5634 #include <linux/notifier.h>
5635 #include <linux/kprobes.h>
5636 @@ -73,7 +72,6 @@ EXPORT_SYMBOL(boot_option_idle_override)
5637 */
5638 void (*pm_idle)(void);
5639 EXPORT_SYMBOL(pm_idle);
5640 -static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
5641
5642 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
5643
5644 @@ -81,13 +79,6 @@ void idle_notifier_register(struct notif
5645 {
5646 atomic_notifier_chain_register(&idle_notifier, n);
5647 }
5648 -EXPORT_SYMBOL_GPL(idle_notifier_register);
5649 -
5650 -void idle_notifier_unregister(struct notifier_block *n)
5651 -{
5652 - atomic_notifier_chain_unregister(&idle_notifier, n);
5653 -}
5654 -EXPORT_SYMBOL(idle_notifier_unregister);
5655
5656 void enter_idle(void)
5657 {
5658 @@ -116,7 +107,7 @@ void exit_idle(void)
5659 * to poll the ->need_resched flag instead of waiting for the
5660 * cross-CPU IPI to arrive. Use this option with caution.
5661 */
5662 -static void poll_idle (void)
5663 +static void poll_idle(void)
5664 {
5665 local_irq_enable();
5666 cpu_relax();
5667 @@ -131,10 +122,19 @@ static void xen_idle(void)
5668 */
5669 smp_mb();
5670 local_irq_disable();
5671 - if (!need_resched())
5672 - safe_halt();
5673 - else
5674 - local_irq_enable();
5675 + if (!need_resched()) {
5676 + ktime_t t0, t1;
5677 + u64 t0n, t1n;
5678 +
5679 + t0 = ktime_get();
5680 + t0n = ktime_to_ns(t0);
5681 + safe_halt(); /* enables interrupts racelessly */
5682 + local_irq_disable();
5683 + t1 = ktime_get();
5684 + t1n = ktime_to_ns(t1);
5685 + sched_clock_idle_wakeup_event(t1n - t0n);
5686 + }
5687 + local_irq_enable();
5688 current_thread_info()->status |= TS_POLLING;
5689 }
5690
5691 @@ -161,19 +161,15 @@ static inline void play_dead(void)
5692 * low exit latency (ie sit in a loop waiting for
5693 * somebody to say that they'd like to reschedule)
5694 */
5695 -void cpu_idle (void)
5696 +void cpu_idle(void)
5697 {
5698 current_thread_info()->status |= TS_POLLING;
5699 /* endless idle loop with no priority at all */
5700 while (1) {
5701 + tick_nohz_stop_sched_tick();
5702 while (!need_resched()) {
5703 void (*idle)(void);
5704
5705 - if (__get_cpu_var(cpu_idle_state))
5706 - __get_cpu_var(cpu_idle_state) = 0;
5707 -
5708 - tick_nohz_stop_sched_tick();
5709 -
5710 rmb();
5711 idle = xen_idle; /* no alternatives */
5712 if (cpu_is_offline(smp_processor_id()))
5713 @@ -203,49 +199,27 @@ static void do_nothing(void *unused)
5714 {
5715 }
5716
5717 +/*
5718 + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
5719 + * pm_idle and update to new pm_idle value. Required while changing pm_idle
5720 + * handler on SMP systems.
5721 + *
5722 + * Caller must have changed pm_idle to the new value before the call. Old
5723 + * pm_idle value will not be used by any CPU after the return of this function.
5724 + */
5725 void cpu_idle_wait(void)
5726 {
5727 - unsigned int cpu, this_cpu = get_cpu();
5728 - cpumask_t map, tmp = current->cpus_allowed;
5729 -
5730 - set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5731 - put_cpu();
5732 -
5733 - cpus_clear(map);
5734 - for_each_online_cpu(cpu) {
5735 - per_cpu(cpu_idle_state, cpu) = 1;
5736 - cpu_set(cpu, map);
5737 - }
5738 -
5739 - __get_cpu_var(cpu_idle_state) = 0;
5740 -
5741 - wmb();
5742 - do {
5743 - ssleep(1);
5744 - for_each_online_cpu(cpu) {
5745 - if (cpu_isset(cpu, map) &&
5746 - !per_cpu(cpu_idle_state, cpu))
5747 - cpu_clear(cpu, map);
5748 - }
5749 - cpus_and(map, map, cpu_online_map);
5750 - /*
5751 - * We waited 1 sec, if a CPU still did not call idle
5752 - * it may be because it is in idle and not waking up
5753 - * because it has nothing to do.
5754 - * Give all the remaining CPUS a kick.
5755 - */
5756 - smp_call_function_mask(map, do_nothing, 0, 0);
5757 - } while (!cpus_empty(map));
5758 -
5759 - set_cpus_allowed(current, tmp);
5760 + smp_mb();
5761 + /* kick all the CPUs so that they exit out of pm_idle */
5762 + smp_call_function(do_nothing, NULL, 0, 1);
5763 }
5764 EXPORT_SYMBOL_GPL(cpu_idle_wait);
5765
5766 -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
5767 +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
5768 {
5769 }
5770
5771 -static int __init idle_setup (char *str)
5772 +static int __init idle_setup(char *str)
5773 {
5774 if (!strcmp(str, "poll")) {
5775 printk("using polling idle threads.\n");
5776 @@ -260,13 +234,13 @@ static int __init idle_setup (char *str)
5777 }
5778 early_param("idle", idle_setup);
5779
5780 -/* Prints also some state that isn't saved in the pt_regs */
5781 +/* Prints also some state that isn't saved in the pt_regs */
5782 void __show_regs(struct pt_regs * regs)
5783 {
5784 unsigned long fs, gs, shadowgs;
5785 unsigned long d0, d1, d2, d3, d6, d7;
5786 - unsigned int fsindex,gsindex;
5787 - unsigned int ds,cs,es;
5788 + unsigned int fsindex, gsindex;
5789 + unsigned int ds, cs, es;
5790
5791 printk("\n");
5792 print_modules();
5793 @@ -275,16 +249,16 @@ void __show_regs(struct pt_regs * regs)
5794 init_utsname()->release,
5795 (int)strcspn(init_utsname()->version, " "),
5796 init_utsname()->version);
5797 - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
5798 - printk_address(regs->rip);
5799 - printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
5800 - regs->eflags);
5801 + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
5802 + printk_address(regs->ip, 1);
5803 + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
5804 + regs->flags);
5805 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
5806 - regs->rax, regs->rbx, regs->rcx);
5807 + regs->ax, regs->bx, regs->cx);
5808 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
5809 - regs->rdx, regs->rsi, regs->rdi);
5810 + regs->dx, regs->si, regs->di);
5811 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
5812 - regs->rbp, regs->r8, regs->r9);
5813 + regs->bp, regs->r8, regs->r9);
5814 printk("R10: %016lx R11: %016lx R12: %016lx\n",
5815 regs->r10, regs->r11, regs->r12);
5816 printk("R13: %016lx R14: %016lx R15: %016lx\n",
5817 @@ -318,7 +292,7 @@ void show_regs(struct pt_regs *regs)
5818 {
5819 printk("CPU %d:", smp_processor_id());
5820 __show_regs(regs);
5821 - show_trace(NULL, regs, (void *)(regs + 1));
5822 + show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
5823 }
5824
5825 /*
5826 @@ -329,7 +303,7 @@ void exit_thread(void)
5827 struct task_struct *me = current;
5828 struct thread_struct *t = &me->thread;
5829
5830 - if (me->thread.io_bitmap_ptr) {
5831 + if (me->thread.io_bitmap_ptr) {
5832 #ifndef CONFIG_X86_NO_TSS
5833 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
5834 #endif
5835 @@ -382,7 +356,7 @@ void flush_thread(void)
5836 tsk->thread.debugreg3 = 0;
5837 tsk->thread.debugreg6 = 0;
5838 tsk->thread.debugreg7 = 0;
5839 - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5840 + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5841 /*
5842 * Forget coprocessor state..
5843 */
5844 @@ -405,26 +379,21 @@ void release_thread(struct task_struct *
5845
5846 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
5847 {
5848 - struct user_desc ud = {
5849 + struct user_desc ud = {
5850 .base_addr = addr,
5851 .limit = 0xfffff,
5852 .seg_32bit = 1,
5853 .limit_in_pages = 1,
5854 .useable = 1,
5855 };
5856 - struct n_desc_struct *desc = (void *)t->thread.tls_array;
5857 + struct desc_struct *desc = t->thread.tls_array;
5858 desc += tls;
5859 - desc->a = LDT_entry_a(&ud);
5860 - desc->b = LDT_entry_b(&ud);
5861 + fill_ldt(desc, &ud);
5862 }
5863
5864 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
5865 {
5866 - struct desc_struct *desc = (void *)t->thread.tls_array;
5867 - desc += tls;
5868 - return desc->base0 |
5869 - (((u32)desc->base1) << 16) |
5870 - (((u32)desc->base2) << 24);
5871 + return get_desc_base(&t->thread.tls_array[tls]);
5872 }
5873
5874 /*
5875 @@ -436,7 +405,7 @@ void prepare_to_copy(struct task_struct
5876 unlazy_fpu(tsk);
5877 }
5878
5879 -int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
5880 +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
5881 unsigned long unused,
5882 struct task_struct * p, struct pt_regs * regs)
5883 {
5884 @@ -448,14 +417,14 @@ int copy_thread(int nr, unsigned long cl
5885 (THREAD_SIZE + task_stack_page(p))) - 1;
5886 *childregs = *regs;
5887
5888 - childregs->rax = 0;
5889 - childregs->rsp = rsp;
5890 - if (rsp == ~0UL)
5891 - childregs->rsp = (unsigned long)childregs;
5892 -
5893 - p->thread.rsp = (unsigned long) childregs;
5894 - p->thread.rsp0 = (unsigned long) (childregs+1);
5895 - p->thread.userrsp = me->thread.userrsp;
5896 + childregs->ax = 0;
5897 + childregs->sp = sp;
5898 + if (sp == ~0UL)
5899 + childregs->sp = (unsigned long)childregs;
5900 +
5901 + p->thread.sp = (unsigned long) childregs;
5902 + p->thread.sp0 = (unsigned long) (childregs+1);
5903 + p->thread.usersp = me->thread.usersp;
5904
5905 set_tsk_thread_flag(p, TIF_FORK);
5906
5907 @@ -476,7 +445,7 @@ int copy_thread(int nr, unsigned long cl
5908 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
5909 IO_BITMAP_BYTES);
5910 set_tsk_thread_flag(p, TIF_IO_BITMAP);
5911 - }
5912 + }
5913
5914 /*
5915 * Set a new TLS for the child thread?
5916 @@ -484,7 +453,8 @@ int copy_thread(int nr, unsigned long cl
5917 if (clone_flags & CLONE_SETTLS) {
5918 #ifdef CONFIG_IA32_EMULATION
5919 if (test_thread_flag(TIF_IA32))
5920 - err = ia32_child_tls(p, childregs);
5921 + err = do_set_thread_area(p, -1,
5922 + (struct user_desc __user *)childregs->si, 0);
5923 else
5924 #endif
5925 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
5926 @@ -502,26 +472,32 @@ out:
5927 return err;
5928 }
5929
5930 -static inline void __save_init_fpu( struct task_struct *tsk )
5931 -{
5932 - asm volatile( "rex64 ; fxsave %0 ; fnclex"
5933 - : "=m" (tsk->thread.i387.fxsave));
5934 - tsk->thread_info->status &= ~TS_USEDFPU;
5935 -}
5936 -
5937 /*
5938 * This special macro can be used to load a debugging register
5939 */
5940 -#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
5941 +#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
5942
5943 static inline void __switch_to_xtra(struct task_struct *prev_p,
5944 - struct task_struct *next_p)
5945 + struct task_struct *next_p)
5946 {
5947 struct thread_struct *prev, *next;
5948 + unsigned long debugctl;
5949
5950 prev = &prev_p->thread,
5951 next = &next_p->thread;
5952
5953 + debugctl = prev->debugctlmsr;
5954 + if (next->ds_area_msr != prev->ds_area_msr) {
5955 + /* we clear debugctl to make sure DS
5956 + * is not in use when we change it */
5957 + debugctl = 0;
5958 + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
5959 + wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
5960 + }
5961 +
5962 + if (next->debugctlmsr != debugctl)
5963 + wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
5964 +
5965 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
5966 loaddebug(next, 0);
5967 loaddebug(next, 1);
5968 @@ -531,12 +507,20 @@ static inline void __switch_to_xtra(stru
5969 loaddebug(next, 6);
5970 loaddebug(next, 7);
5971 }
5972 +
5973 +#ifdef X86_BTS
5974 + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
5975 + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
5976 +
5977 + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
5978 + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
5979 +#endif
5980 }
5981
5982 /*
5983 * switch_to(x,y) should switch tasks from x to y.
5984 *
5985 - * This could still be optimized:
5986 + * This could still be optimized:
5987 * - fold all the options into a flag word and test it with a single test.
5988 * - could test fs/gs bitsliced
5989 *
5990 @@ -547,7 +531,7 @@ __switch_to(struct task_struct *prev_p,
5991 {
5992 struct thread_struct *prev = &prev_p->thread,
5993 *next = &next_p->thread;
5994 - int cpu = smp_processor_id();
5995 + int cpu = smp_processor_id();
5996 #ifndef CONFIG_X86_NO_TSS
5997 struct tss_struct *tss = &per_cpu(init_tss, cpu);
5998 #endif
5999 @@ -581,11 +565,12 @@ __switch_to(struct task_struct *prev_p,
6000 prev_p->fpu_counter = 0;
6001
6002 /*
6003 - * Reload esp0, LDT and the page table pointer:
6004 + * Reload sp0.
6005 + * This is load_sp0(tss, next) with a multicall.
6006 */
6007 mcl->op = __HYPERVISOR_stack_switch;
6008 mcl->args[0] = __KERNEL_DS;
6009 - mcl->args[1] = next->rsp0;
6010 + mcl->args[1] = next->sp0;
6011 mcl++;
6012
6013 /*
6014 @@ -593,11 +578,12 @@ __switch_to(struct task_struct *prev_p,
6015 * This is load_TLS(next, cpu) with multicalls.
6016 */
6017 #define C(i) do { \
6018 - if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \
6019 + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
6020 + next->tls_array[i].b != prev->tls_array[i].b)) { \
6021 mcl->op = __HYPERVISOR_update_descriptor; \
6022 mcl->args[0] = virt_to_machine( \
6023 - &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \
6024 - mcl->args[1] = next->tls_array[i]; \
6025 + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
6026 + mcl->args[1] = *(u64 *)&next->tls_array[i]; \
6027 mcl++; \
6028 } \
6029 } while (0)
6030 @@ -605,7 +591,7 @@ __switch_to(struct task_struct *prev_p,
6031 #undef C
6032
6033 if (unlikely(prev->iopl != next->iopl)) {
6034 - iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
6035 + iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
6036 #if CONFIG_XEN_COMPAT > 0x030002
6037 mcl->op = __HYPERVISOR_physdev_op;
6038 mcl->args[0] = PHYSDEVOP_set_iopl;
6039 @@ -669,8 +655,8 @@ __switch_to(struct task_struct *prev_p,
6040 /*
6041 * Switch the PDA context.
6042 */
6043 - prev->userrsp = read_pda(oldrsp);
6044 - write_pda(oldrsp, next->userrsp);
6045 + prev->usersp = read_pda(oldrsp);
6046 + write_pda(oldrsp, next->usersp);
6047 write_pda(pcurrent, next_p);
6048 write_pda(kernelstack,
6049 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
6050 @@ -687,7 +673,8 @@ __switch_to(struct task_struct *prev_p,
6051 /*
6052 * Now maybe reload the debug registers
6053 */
6054 - if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
6055 + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
6056 + task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
6057 __switch_to_xtra(prev_p, next_p);
6058
6059 /* If the task has used fpu the last 5 timeslices, just do a full
6060 @@ -702,23 +689,18 @@ __switch_to(struct task_struct *prev_p,
6061 /*
6062 * sys_execve() executes a new program.
6063 */
6064 -asmlinkage
6065 +asmlinkage
6066 long sys_execve(char __user *name, char __user * __user *argv,
6067 - char __user * __user *envp, struct pt_regs regs)
6068 + char __user * __user *envp, struct pt_regs *regs)
6069 {
6070 long error;
6071 char * filename;
6072
6073 filename = getname(name);
6074 error = PTR_ERR(filename);
6075 - if (IS_ERR(filename))
6076 + if (IS_ERR(filename))
6077 return error;
6078 - error = do_execve(filename, argv, envp, &regs);
6079 - if (error == 0) {
6080 - task_lock(current);
6081 - current->ptrace &= ~PT_DTRACE;
6082 - task_unlock(current);
6083 - }
6084 + error = do_execve(filename, argv, envp, regs);
6085 putname(filename);
6086 return error;
6087 }
6088 @@ -728,18 +710,18 @@ void set_personality_64bit(void)
6089 /* inherit personality from parent */
6090
6091 /* Make sure to be in 64bit mode */
6092 - clear_thread_flag(TIF_IA32);
6093 + clear_thread_flag(TIF_IA32);
6094
6095 /* TBD: overwrites user setup. Should have two bits.
6096 But 64bit processes have always behaved this way,
6097 so it's not too bad. The main problem is just that
6098 - 32bit childs are affected again. */
6099 + 32bit childs are affected again. */
6100 current->personality &= ~READ_IMPLIES_EXEC;
6101 }
6102
6103 asmlinkage long sys_fork(struct pt_regs *regs)
6104 {
6105 - return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
6106 + return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
6107 }
6108
6109 asmlinkage long
6110 @@ -747,7 +729,7 @@ sys_clone(unsigned long clone_flags, uns
6111 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
6112 {
6113 if (!newsp)
6114 - newsp = regs->rsp;
6115 + newsp = regs->sp;
6116 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
6117 }
6118
6119 @@ -763,29 +745,29 @@ sys_clone(unsigned long clone_flags, uns
6120 */
6121 asmlinkage long sys_vfork(struct pt_regs *regs)
6122 {
6123 - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
6124 + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
6125 NULL, NULL);
6126 }
6127
6128 unsigned long get_wchan(struct task_struct *p)
6129 {
6130 unsigned long stack;
6131 - u64 fp,rip;
6132 + u64 fp,ip;
6133 int count = 0;
6134
6135 if (!p || p == current || p->state==TASK_RUNNING)
6136 return 0;
6137 stack = (unsigned long)task_stack_page(p);
6138 - if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
6139 + if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
6140 return 0;
6141 - fp = *(u64 *)(p->thread.rsp);
6142 + fp = *(u64 *)(p->thread.sp);
6143 do {
6144 if (fp < (unsigned long)stack ||
6145 fp > (unsigned long)stack+THREAD_SIZE)
6146 return 0;
6147 - rip = *(u64 *)(fp+8);
6148 - if (!in_sched_functions(rip))
6149 - return rip;
6150 + ip = *(u64 *)(fp+8);
6151 + if (!in_sched_functions(ip))
6152 + return ip;
6153 fp = *(u64 *)fp;
6154 } while (count++ < 16);
6155 return 0;
6156 @@ -827,19 +809,19 @@ long do_arch_prctl(struct task_struct *t
6157 /* Not strictly needed for fs, but do it for symmetry
6158 with gs */
6159 if (addr >= TASK_SIZE_OF(task))
6160 - return -EPERM;
6161 + return -EPERM;
6162 cpu = get_cpu();
6163 - /* handle small bases via the GDT because that's faster to
6164 + /* handle small bases via the GDT because that's faster to
6165 switch. */
6166 - if (addr <= 0xffffffff) {
6167 + if (addr <= 0xffffffff) {
6168 set_32bit_tls(task, FS_TLS, addr);
6169 - if (doit) {
6170 - load_TLS(&task->thread, cpu);
6171 + if (doit) {
6172 + load_TLS(&task->thread, cpu);
6173 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
6174 }
6175 task->thread.fsindex = FS_TLS_SEL;
6176 task->thread.fs = 0;
6177 - } else {
6178 + } else {
6179 task->thread.fsindex = 0;
6180 task->thread.fs = addr;
6181 if (doit) {
6182 @@ -852,24 +834,24 @@ long do_arch_prctl(struct task_struct *t
6183 }
6184 put_cpu();
6185 break;
6186 - case ARCH_GET_FS: {
6187 - unsigned long base;
6188 + case ARCH_GET_FS: {
6189 + unsigned long base;
6190 if (task->thread.fsindex == FS_TLS_SEL)
6191 base = read_32bit_tls(task, FS_TLS);
6192 else if (doit)
6193 rdmsrl(MSR_FS_BASE, base);
6194 else
6195 base = task->thread.fs;
6196 - ret = put_user(base, (unsigned long __user *)addr);
6197 - break;
6198 + ret = put_user(base, (unsigned long __user *)addr);
6199 + break;
6200 }
6201 - case ARCH_GET_GS: {
6202 + case ARCH_GET_GS: {
6203 unsigned long base;
6204 unsigned gsindex;
6205 if (task->thread.gsindex == GS_TLS_SEL)
6206 base = read_32bit_tls(task, GS_TLS);
6207 else if (doit) {
6208 - asm("movl %%gs,%0" : "=r" (gsindex));
6209 + asm("movl %%gs,%0" : "=r" (gsindex));
6210 if (gsindex)
6211 rdmsrl(MSR_KERNEL_GS_BASE, base);
6212 else
6213 @@ -877,40 +859,21 @@ long do_arch_prctl(struct task_struct *t
6214 }
6215 else
6216 base = task->thread.gs;
6217 - ret = put_user(base, (unsigned long __user *)addr);
6218 + ret = put_user(base, (unsigned long __user *)addr);
6219 break;
6220 }
6221
6222 default:
6223 ret = -EINVAL;
6224 break;
6225 - }
6226 + }
6227
6228 - return ret;
6229 -}
6230 + return ret;
6231 +}
6232
6233 long sys_arch_prctl(int code, unsigned long addr)
6234 {
6235 return do_arch_prctl(current, code, addr);
6236 -}
6237 -
6238 -/*
6239 - * Capture the user space registers if the task is not running (in user space)
6240 - */
6241 -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
6242 -{
6243 - struct pt_regs *pp, ptregs;
6244 -
6245 - pp = task_pt_regs(tsk);
6246 -
6247 - ptregs = *pp;
6248 - ptregs.cs &= 0xffff;
6249 - ptregs.ss &= 0xffff;
6250 -
6251 - elf_core_copy_regs(regs, &ptregs);
6252 -
6253 - boot_option_idle_override = 1;
6254 - return 1;
6255 }
6256
6257 unsigned long arch_align_stack(unsigned long sp)
6258 @@ -919,3 +882,9 @@ unsigned long arch_align_stack(unsigned
6259 sp -= get_random_int() % 8192;
6260 return sp & ~0xf;
6261 }
6262 +
6263 +unsigned long arch_randomize_brk(struct mm_struct *mm)
6264 +{
6265 + unsigned long range_end = mm->brk + 0x02000000;
6266 + return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
6267 +}
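
Illustration (not part of the patch): the 32-bit __switch_to_xtra() hunk earlier and the 64-bit hunk that ends here program the branch-trace MSRs in the same order — clear IA32_DEBUGCTL before repointing IA32_DS_AREA, then restore the incoming task's DEBUGCTL only if it differs from what is currently loaded. The sketch below models just that decision logic in user space; wrmsrl() is stubbed and the reduced thread_struct and main() driver are assumptions for illustration.

/* User-space model of the DEBUGCTL/DS_AREA switch ordering used by the
 * __switch_to_xtra() hunks. wrmsrl() is a stub; only the ordering and the
 * "skip the redundant write" logic are modelled. */
#include <stdio.h>

#define MSR_IA32_DEBUGCTLMSR 0x1d9
#define MSR_IA32_DS_AREA     0x600

struct thread_struct {                  /* reduced stand-in */
        unsigned long debugctlmsr;
        unsigned long ds_area_msr;
};

static void wrmsrl(unsigned int msr, unsigned long val)
{
        printf("wrmsr 0x%x <- 0x%lx\n", msr, val);      /* stub */
}

static void switch_ds_and_debugctl(struct thread_struct *prev,
                                   struct thread_struct *next)
{
        unsigned long debugctl = prev->debugctlmsr;

        if (next->ds_area_msr != prev->ds_area_msr) {
                /* quiesce DS before repointing its buffer */
                debugctl = 0;
                wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
                wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
        }
        if (next->debugctlmsr != debugctl)
                wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
}

int main(void)
{
        struct thread_struct prev = { .debugctlmsr = 0x1, .ds_area_msr = 0x1000 };
        struct thread_struct next = { .debugctlmsr = 0x1, .ds_area_msr = 0x2000 };

        switch_ds_and_debugctl(&prev, &next);   /* expect: clear, repoint, restore */
        return 0;
}
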
6268 --- sle11-2009-05-14.orig/arch/x86/kernel/quirks-xen.c 2009-02-16 16:18:36.000000000 +0100
6269 +++ sle11-2009-05-14/arch/x86/kernel/quirks-xen.c 2009-03-16 16:33:40.000000000 +0100
6270 @@ -9,7 +9,7 @@
6271 static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
6272 {
6273 u8 config, rev;
6274 - u32 word;
6275 + u16 word;
6276
6277 /* BIOS may enable hardware IRQ balancing for
6278 * E7520/E7320/E7525(revision ID 0x9 and below)
6279 @@ -24,14 +24,17 @@ static void __devinit quirk_intel_irqbal
6280 pci_read_config_byte(dev, 0xf4, &config);
6281 pci_write_config_byte(dev, 0xf4, config|0x2);
6282
6283 - /* read xTPR register */
6284 - raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
6285 + /*
6286 + * read xTPR register. We may not have a pci_dev for device 8
6287 + * because it might be hidden until the above write.
6288 + */
6289 + pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word);
6290
6291 if (!(word & (1 << 13))) {
6292 struct xen_platform_op op;
6293
6294 - printk(KERN_INFO "Intel E7520/7320/7525 detected. "
6295 - "Disabling irq balancing and affinity\n");
6296 + dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
6297 + "disabling irq balancing and affinity\n");
6298 op.cmd = XENPF_platform_quirk;
6299 op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
6300 WARN_ON(HYPERVISOR_platform_op(&op));
6301 @@ -102,14 +105,16 @@ static void ich_force_enable_hpet(struct
6302 pci_read_config_dword(dev, 0xF0, &rcba);
6303 rcba &= 0xFFFFC000;
6304 if (rcba == 0) {
6305 - printk(KERN_DEBUG "RCBA disabled. Cannot force enable HPET\n");
6306 + dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; "
6307 + "cannot force enable HPET\n");
6308 return;
6309 }
6310
6311 /* use bits 31:14, 16 kB aligned */
6312 rcba_base = ioremap_nocache(rcba, 0x4000);
6313 if (rcba_base == NULL) {
6314 - printk(KERN_DEBUG "ioremap failed. Cannot force enable HPET\n");
6315 + dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
6316 + "cannot force enable HPET\n");
6317 return;
6318 }
6319
6320 @@ -120,8 +125,8 @@ static void ich_force_enable_hpet(struct
6321 /* HPET is enabled in HPTC. Just not reported by BIOS */
6322 val = val & 0x3;
6323 force_hpet_address = 0xFED00000 | (val << 12);
6324 - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6325 - force_hpet_address);
6326 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6327 + "0x%lx\n", force_hpet_address);
6328 iounmap(rcba_base);
6329 return;
6330 }
6331 @@ -140,11 +145,12 @@ static void ich_force_enable_hpet(struct
6332 if (err) {
6333 force_hpet_address = 0;
6334 iounmap(rcba_base);
6335 - printk(KERN_DEBUG "Failed to force enable HPET\n");
6336 + dev_printk(KERN_DEBUG, &dev->dev,
6337 + "Failed to force enable HPET\n");
6338 } else {
6339 force_hpet_resume_type = ICH_FORCE_HPET_RESUME;
6340 - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6341 - force_hpet_address);
6342 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6343 + "0x%lx\n", force_hpet_address);
6344 }
6345 }
6346
6347 @@ -160,6 +166,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
6348 ich_force_enable_hpet);
6349 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
6350 ich_force_enable_hpet);
6351 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
6352 + ich_force_enable_hpet);
6353
6354
6355 static struct pci_dev *cached_dev;
6356 @@ -204,8 +212,8 @@ static void old_ich_force_enable_hpet(st
6357 if (val & 0x4) {
6358 val &= 0x3;
6359 force_hpet_address = 0xFED00000 | (val << 12);
6360 - printk(KERN_DEBUG "HPET at base address 0x%lx\n",
6361 - force_hpet_address);
6362 + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
6363 + force_hpet_address);
6364 return;
6365 }
6366
6367 @@ -225,14 +233,14 @@ static void old_ich_force_enable_hpet(st
6368 /* HPET is enabled in HPTC. Just not reported by BIOS */
6369 val &= 0x3;
6370 force_hpet_address = 0xFED00000 | (val << 12);
6371 - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6372 - force_hpet_address);
6373 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6374 + "0x%lx\n", force_hpet_address);
6375 cached_dev = dev;
6376 force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME;
6377 return;
6378 }
6379
6380 - printk(KERN_DEBUG "Failed to force enable HPET\n");
6381 + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
6382 }
6383
6384 /*
6385 @@ -290,8 +298,8 @@ static void vt8237_force_enable_hpet(str
6386 */
6387 if (val & 0x80) {
6388 force_hpet_address = (val & ~0x3ff);
6389 - printk(KERN_DEBUG "HPET at base address 0x%lx\n",
6390 - force_hpet_address);
6391 + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
6392 + force_hpet_address);
6393 return;
6394 }
6395
6396 @@ -305,14 +313,14 @@ static void vt8237_force_enable_hpet(str
6397 pci_read_config_dword(dev, 0x68, &val);
6398 if (val & 0x80) {
6399 force_hpet_address = (val & ~0x3ff);
6400 - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6401 - force_hpet_address);
6402 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6403 + "0x%lx\n", force_hpet_address);
6404 cached_dev = dev;
6405 force_hpet_resume_type = VT8237_FORCE_HPET_RESUME;
6406 return;
6407 }
6408
6409 - printk(KERN_DEBUG "Failed to force enable HPET\n");
6410 + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
6411 }
6412
6413 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
6414 @@ -340,7 +348,7 @@ static void nvidia_force_enable_hpet(str
6415 pci_read_config_dword(dev, 0x44, &val);
6416 force_hpet_address = val & 0xfffffffe;
6417 force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME;
6418 - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6419 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
6420 force_hpet_address);
6421 cached_dev = dev;
6422 return;
6423 @@ -353,6 +361,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N
6424 nvidia_force_enable_hpet);
6425
6426 /* LPC bridges */
6427 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260,
6428 + nvidia_force_enable_hpet);
6429 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360,
6430 nvidia_force_enable_hpet);
6431 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361,
6432 @@ -373,19 +383,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N
6433 void force_hpet_resume(void)
6434 {
6435 switch (force_hpet_resume_type) {
6436 - case ICH_FORCE_HPET_RESUME:
6437 - return ich_force_hpet_resume();
6438 -
6439 - case OLD_ICH_FORCE_HPET_RESUME:
6440 - return old_ich_force_hpet_resume();
6441 -
6442 - case VT8237_FORCE_HPET_RESUME:
6443 - return vt8237_force_hpet_resume();
6444 -
6445 - case NVIDIA_FORCE_HPET_RESUME:
6446 - return nvidia_force_hpet_resume();
6447 -
6448 - default:
6449 + case ICH_FORCE_HPET_RESUME:
6450 + ich_force_hpet_resume();
6451 + return;
6452 + case OLD_ICH_FORCE_HPET_RESUME:
6453 + old_ich_force_hpet_resume();
6454 + return;
6455 + case VT8237_FORCE_HPET_RESUME:
6456 + vt8237_force_hpet_resume();
6457 + return;
6458 + case NVIDIA_FORCE_HPET_RESUME:
6459 + nvidia_force_hpet_resume();
6460 + return;
6461 + default:
6462 break;
6463 }
6464 }
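
Illustration (not part of the patch): in the ICH HPET quirks above, the forced base address is derived from the two low bits of the HPTC value via val &= 0x3; force_hpet_address = 0xFED00000 | (val << 12), i.e. one of the four 4 KiB-spaced windows 0xFED00000–0xFED03000. A stand-alone check of that arithmetic (not kernel code):

/* Worked example of the ICH HPET address selection used in the hunks above. */
#include <stdio.h>

int main(void)
{
        for (unsigned long val = 0; val < 4; val++)
                printf("HPTC[1:0]=%lu -> HPET at 0x%lx\n",
                       val, 0xFED00000UL | (val << 12));
        /* prints 0xfed00000, 0xfed01000, 0xfed02000, 0xfed03000 */
        return 0;
}
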
6465 --- sle11-2009-05-14.orig/arch/x86/kernel/rtc.c 2009-05-14 10:56:29.000000000 +0200
6466 +++ sle11-2009-05-14/arch/x86/kernel/rtc.c 2009-03-16 16:33:40.000000000 +0100
6467 @@ -181,6 +181,10 @@ unsigned long read_persistent_clock(void
6468 {
6469 unsigned long retval, flags;
6470
6471 +#ifdef CONFIG_XEN
6472 + if (!is_initial_xendomain())
6473 + return xen_read_persistent_clock();
6474 +#endif
6475 spin_lock_irqsave(&rtc_lock, flags);
6476 retval = get_wallclock();
6477 spin_unlock_irqrestore(&rtc_lock, flags);
6478 @@ -190,6 +194,10 @@ unsigned long read_persistent_clock(void
6479
6480 int update_persistent_clock(struct timespec now)
6481 {
6482 +#ifdef CONFIG_XEN
6483 + if (xen_update_persistent_clock() < 0 || xen_independent_wallclock())
6484 + return 0;
6485 +#endif
6486 return set_rtc_mmss(now.tv_sec);
6487 }
6488
6489 --- sle11-2009-05-14.orig/arch/x86/kernel/setup64-xen.c 2009-02-16 16:18:36.000000000 +0100
6490 +++ sle11-2009-05-14/arch/x86/kernel/setup64-xen.c 2009-03-16 16:33:40.000000000 +0100
6491 @@ -31,7 +31,11 @@
6492 #include <asm/hypervisor.h>
6493 #endif
6494
6495 +#ifndef CONFIG_DEBUG_BOOT_PARAMS
6496 struct boot_params __initdata boot_params;
6497 +#else
6498 +struct boot_params boot_params;
6499 +#endif
6500
6501 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
6502
6503 @@ -47,6 +51,7 @@ char boot_cpu_stack[IRQSTACKSIZE] __attr
6504
6505 unsigned long __supported_pte_mask __read_mostly = ~0UL;
6506 EXPORT_SYMBOL(__supported_pte_mask);
6507 +
6508 static int do_not_nx __cpuinitdata = 0;
6509
6510 /* noexec=on|off
6511 @@ -90,6 +95,45 @@ static int __init nonx32_setup(char *str
6512 __setup("noexec32=", nonx32_setup);
6513
6514 /*
6515 + * Copy data used in early init routines from the initial arrays to the
6516 + * per cpu data areas. These arrays then become expendable and the
6517 + * *_early_ptr's are zeroed indicating that the static arrays are gone.
6518 + */
6519 +static void __init setup_per_cpu_maps(void)
6520 +{
6521 +#ifndef CONFIG_XEN
6522 + int cpu;
6523 +
6524 + for_each_possible_cpu(cpu) {
6525 +#ifdef CONFIG_SMP
6526 + if (per_cpu_offset(cpu)) {
6527 +#endif
6528 + per_cpu(x86_cpu_to_apicid, cpu) =
6529 + x86_cpu_to_apicid_init[cpu];
6530 + per_cpu(x86_bios_cpu_apicid, cpu) =
6531 + x86_bios_cpu_apicid_init[cpu];
6532 +#ifdef CONFIG_NUMA
6533 + per_cpu(x86_cpu_to_node_map, cpu) =
6534 + x86_cpu_to_node_map_init[cpu];
6535 +#endif
6536 +#ifdef CONFIG_SMP
6537 + }
6538 + else
6539 + printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
6540 + cpu);
6541 +#endif
6542 + }
6543 +
6544 + /* indicate the early static arrays will soon be gone */
6545 + x86_cpu_to_apicid_early_ptr = NULL;
6546 + x86_bios_cpu_apicid_early_ptr = NULL;
6547 +#ifdef CONFIG_NUMA
6548 + x86_cpu_to_node_map_early_ptr = NULL;
6549 +#endif
6550 +#endif
6551 +}
6552 +
6553 +/*
6554 * Great future plan:
6555 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
6556 * Always point %gs to its beginning
6557 @@ -109,19 +153,24 @@ void __init setup_per_cpu_areas(void)
6558 printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
6559 for_each_cpu_mask (i, cpu_possible_map) {
6560 char *ptr;
6561 +#ifndef CONFIG_NEED_MULTIPLE_NODES
6562 + ptr = alloc_bootmem_pages(size);
6563 +#else
6564 + int node = early_cpu_to_node(i);
6565
6566 - if (!NODE_DATA(cpu_to_node(i))) {
6567 - printk("cpu with no node %d, num_online_nodes %d\n",
6568 - i, num_online_nodes());
6569 + if (!node_online(node) || !NODE_DATA(node))
6570 ptr = alloc_bootmem_pages(size);
6571 - } else {
6572 - ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
6573 - }
6574 + else
6575 + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
6576 +#endif
6577 if (!ptr)
6578 panic("Cannot allocate cpu data for CPU %d\n", i);
6579 cpu_pda(i)->data_offset = ptr - __per_cpu_start;
6580 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
6581 }
6582 +
6583 + /* setup percpu data maps early */
6584 + setup_per_cpu_maps();
6585 }
6586
6587 #ifdef CONFIG_XEN
6588 @@ -224,7 +273,8 @@ void syscall_init(void)
6589 wrmsrl(MSR_CSTAR, ignore_sysret);
6590
6591 /* Flags to clear on syscall */
6592 - wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
6593 + wrmsrl(MSR_SYSCALL_MASK,
6594 + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
6595 #endif
6596 #ifdef CONFIG_IA32_EMULATION
6597 syscall32_cpu_init ();
6598 @@ -303,7 +353,7 @@ void __cpuinit cpu_init (void)
6599 */
6600 #ifndef CONFIG_XEN
6601 if (cpu)
6602 - memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
6603 + memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
6604 #endif
6605
6606 cpu_gdt_descr[cpu].size = GDT_SIZE;
6607 @@ -334,10 +384,10 @@ void __cpuinit cpu_init (void)
6608 v, cpu);
6609 }
6610 estacks += PAGE_SIZE << order[v];
6611 - orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
6612 + orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
6613 }
6614
6615 - t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
6616 + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
6617 /*
6618 * <= is required because the CPU will access up to
6619 * 8 bits beyond the end of the IO permission bitmap.
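
Illustration (not part of the patch): setup_per_cpu_maps() in the hunk above follows a common boot-time pattern — data gathered into static __initdata arrays before the per-CPU areas exist is copied into per-CPU variables once those areas are allocated, and the *_early_ptr aliases are then NULLed so later code stops using the soon-to-be-freed static copies. A minimal stand-alone model of that hand-over follows; the array names, sizes and the cpu_apicid() reader are invented for illustration.

/* Stand-alone model of the "early static array -> per-cpu copy" hand-over
 * performed by setup_per_cpu_maps(). Names below are illustrative only. */
#include <stdio.h>
#include <stddef.h>

#define NR_CPUS 4

/* boot-time data, filled in before the per-CPU areas exist */
static unsigned short cpu_to_apicid_init[NR_CPUS] = { 0, 2, 4, 6 };
static unsigned short *cpu_to_apicid_early_ptr = cpu_to_apicid_init;

/* stand-in for the real per-CPU storage */
static unsigned short per_cpu_apicid[NR_CPUS];

static void setup_per_cpu_maps(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                per_cpu_apicid[cpu] = cpu_to_apicid_init[cpu];

        /* signal that the early static array must no longer be used */
        cpu_to_apicid_early_ptr = NULL;
}

static unsigned int cpu_apicid(int cpu)
{
        /* readers prefer the early pointer while it is still valid */
        if (cpu_to_apicid_early_ptr)
                return cpu_to_apicid_early_ptr[cpu];
        return per_cpu_apicid[cpu];
}

int main(void)
{
        printf("before: cpu1 -> apicid %u\n", cpu_apicid(1));
        setup_per_cpu_maps();
        printf("after:  cpu1 -> apicid %u\n", cpu_apicid(1));
        return 0;
}
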
6620 --- sle11-2009-05-14.orig/arch/x86/kernel/setup_32-xen.c 2009-02-16 16:18:36.000000000 +0100
6621 +++ sle11-2009-05-14/arch/x86/kernel/setup_32-xen.c 2009-03-16 16:33:40.000000000 +0100
6622 @@ -47,9 +47,12 @@
6623 #include <linux/crash_dump.h>
6624 #include <linux/dmi.h>
6625 #include <linux/pfn.h>
6626 +#include <linux/pci.h>
6627 +#include <linux/init_ohci1394_dma.h>
6628
6629 #include <video/edid.h>
6630
6631 +#include <asm/mtrr.h>
6632 #include <asm/apic.h>
6633 #include <asm/e820.h>
6634 #include <asm/mpspec.h>
6635 @@ -79,14 +82,83 @@ static struct notifier_block xen_panic_b
6636 xen_panic_event, NULL, 0 /* try to go last */
6637 };
6638
6639 -int disable_pse __cpuinitdata = 0;
6640 -
6641 /*
6642 * Machine setup..
6643 */
6644 -extern struct resource code_resource;
6645 -extern struct resource data_resource;
6646 -extern struct resource bss_resource;
6647 +static struct resource data_resource = {
6648 + .name = "Kernel data",
6649 + .start = 0,
6650 + .end = 0,
6651 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6652 +};
6653 +
6654 +static struct resource code_resource = {
6655 + .name = "Kernel code",
6656 + .start = 0,
6657 + .end = 0,
6658 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6659 +};
6660 +
6661 +static struct resource bss_resource = {
6662 + .name = "Kernel bss",
6663 + .start = 0,
6664 + .end = 0,
6665 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6666 +};
6667 +
6668 +static struct resource video_ram_resource = {
6669 + .name = "Video RAM area",
6670 + .start = 0xa0000,
6671 + .end = 0xbffff,
6672 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6673 +};
6674 +
6675 +static struct resource standard_io_resources[] = { {
6676 + .name = "dma1",
6677 + .start = 0x0000,
6678 + .end = 0x001f,
6679 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6680 +}, {
6681 + .name = "pic1",
6682 + .start = 0x0020,
6683 + .end = 0x0021,
6684 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6685 +}, {
6686 + .name = "timer0",
6687 + .start = 0x0040,
6688 + .end = 0x0043,
6689 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6690 +}, {
6691 + .name = "timer1",
6692 + .start = 0x0050,
6693 + .end = 0x0053,
6694 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6695 +}, {
6696 + .name = "keyboard",
6697 + .start = 0x0060,
6698 + .end = 0x006f,
6699 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6700 +}, {
6701 + .name = "dma page reg",
6702 + .start = 0x0080,
6703 + .end = 0x008f,
6704 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6705 +}, {
6706 + .name = "pic2",
6707 + .start = 0x00a0,
6708 + .end = 0x00a1,
6709 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6710 +}, {
6711 + .name = "dma2",
6712 + .start = 0x00c0,
6713 + .end = 0x00df,
6714 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6715 +}, {
6716 + .name = "fpu",
6717 + .start = 0x00f0,
6718 + .end = 0x00ff,
6719 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6720 +} };
6721
6722 /* cpu data as detected by the assembly code in head.S */
6723 struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
6724 @@ -94,13 +166,16 @@ struct cpuinfo_x86 new_cpu_data __cpuini
6725 struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
6726 EXPORT_SYMBOL(boot_cpu_data);
6727
6728 +#ifndef CONFIG_X86_PAE
6729 unsigned long mmu_cr4_features;
6730 +#else
6731 +unsigned long mmu_cr4_features = X86_CR4_PAE;
6732 +#endif
6733
6734 /* for MCA, but anyone else can use it if they want */
6735 unsigned int machine_id;
6736 unsigned int machine_submodel_id;
6737 unsigned int BIOS_revision;
6738 -unsigned int mca_pentium_flag;
6739
6740 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
6741 int bootloader_type;
6742 @@ -131,13 +206,17 @@ extern int root_mountflags;
6743
6744 unsigned long saved_videomode;
6745
6746 -#define RAMDISK_IMAGE_START_MASK 0x07FF
6747 +#define RAMDISK_IMAGE_START_MASK 0x07FF
6748 #define RAMDISK_PROMPT_FLAG 0x8000
6749 -#define RAMDISK_LOAD_FLAG 0x4000
6750 +#define RAMDISK_LOAD_FLAG 0x4000
6751
6752 static char __initdata command_line[COMMAND_LINE_SIZE];
6753
6754 +#ifndef CONFIG_DEBUG_BOOT_PARAMS
6755 struct boot_params __initdata boot_params;
6756 +#else
6757 +struct boot_params boot_params;
6758 +#endif
6759
6760 /*
6761 * Point at the empty zero page to start with. We map the real shared_info
6762 @@ -198,8 +277,7 @@ static int __init parse_mem(char *arg)
6763 return -EINVAL;
6764
6765 if (strcmp(arg, "nopentium") == 0) {
6766 - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
6767 - disable_pse = 1;
6768 + setup_clear_cpu_cap(X86_FEATURE_PSE);
6769 } else {
6770 /* If the user specifies memory size, we
6771 * limit the BIOS-provided memory map to
6772 @@ -208,7 +286,7 @@ static int __init parse_mem(char *arg)
6773 * trim the existing memory map.
6774 */
6775 unsigned long long mem_size;
6776 -
6777 +
6778 mem_size = memparse(arg, &arg);
6779 limit_regions(mem_size);
6780 user_defined_memmap = 1;
6781 @@ -350,7 +428,7 @@ static void __init reserve_ebda_region(v
6782 unsigned int addr;
6783 addr = get_bios_ebda();
6784 if (addr)
6785 - reserve_bootmem(addr, PAGE_SIZE);
6786 + reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT);
6787 }
6788 #endif
6789
6790 @@ -365,8 +443,6 @@ static unsigned long __init setup_memory
6791 min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
6792 xen_start_info->nr_pt_frames;
6793
6794 - find_max_pfn();
6795 -
6796 max_low_pfn = find_max_low_pfn();
6797
6798 #ifdef CONFIG_HIGHMEM
6799 @@ -447,7 +523,8 @@ static void __init reserve_crashkernel(v
6800 (unsigned long)(total_mem >> 20));
6801 crashk_res.start = crash_base;
6802 crashk_res.end = crash_base + crash_size - 1;
6803 - reserve_bootmem(crash_base, crash_size);
6804 + reserve_bootmem(crash_base, crash_size,
6805 + BOOTMEM_DEFAULT);
6806 } else
6807 printk(KERN_INFO "crashkernel reservation failed - "
6808 "you have to specify a base address\n");
6809 @@ -461,6 +538,99 @@ static inline void __init reserve_crashk
6810 {}
6811 #endif
6812
6813 +#ifdef CONFIG_BLK_DEV_INITRD
6814 +
6815 +static bool do_relocate_initrd = false;
6816 +
6817 +static void __init reserve_initrd(void)
6818 +{
6819 + unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
6820 + unsigned long ramdisk_size = xen_start_info->mod_len;
6821 + unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
6822 + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6823 + unsigned long ramdisk_here;
6824 +
6825 + initrd_start = 0;
6826 +
6827 + if (!xen_start_info->mod_start || !ramdisk_size)
6828 + return; /* No initrd provided by bootloader */
6829 +
6830 + if (ramdisk_end < ramdisk_image) {
6831 + printk(KERN_ERR "initrd wraps around end of memory, "
6832 + "disabling initrd\n");
6833 + return;
6834 + }
6835 + if (ramdisk_size >= end_of_lowmem/2) {
6836 + printk(KERN_ERR "initrd too large to handle, "
6837 + "disabling initrd\n");
6838 + return;
6839 + }
6840 + if (ramdisk_end <= end_of_lowmem) {
6841 + /* All in lowmem, easy case */
6842 + reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
6843 + initrd_start = ramdisk_image + PAGE_OFFSET;
6844 + initrd_end = initrd_start+ramdisk_size;
6845 + return;
6846 + }
6847 +
6848 + /* We need to move the initrd down into lowmem */
6849 + ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
6850 +
6851 + /* Note: this includes all the lowmem currently occupied by
6852 + the initrd, we rely on that fact to keep the data intact. */
6853 + reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
6854 + initrd_start = ramdisk_here + PAGE_OFFSET;
6855 + initrd_end = initrd_start + ramdisk_size;
6856 +
6857 + do_relocate_initrd = true;
6858 +}
6859 +
6860 +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
6861 +
6862 +static void __init relocate_initrd(void)
6863 +{
6864 + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
6865 + unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
6866 + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6867 + unsigned long ramdisk_here;
6868 + unsigned long slop, clen, mapaddr;
6869 + char *p, *q;
6870 +
6871 + if (!do_relocate_initrd)
6872 + return;
6873 +
6874 + ramdisk_here = initrd_start - PAGE_OFFSET;
6875 +
6876 + q = (char *)initrd_start;
6877 +
6878 + /* Copy any lowmem portion of the initrd */
6879 + if (ramdisk_image < end_of_lowmem) {
6880 + clen = end_of_lowmem - ramdisk_image;
6881 + p = (char *)__va(ramdisk_image);
6882 + memcpy(q, p, clen);
6883 + q += clen;
6884 + ramdisk_image += clen;
6885 + ramdisk_size -= clen;
6886 + }
6887 +
6888 + /* Copy the highmem portion of the initrd */
6889 + while (ramdisk_size) {
6890 + slop = ramdisk_image & ~PAGE_MASK;
6891 + clen = ramdisk_size;
6892 + if (clen > MAX_MAP_CHUNK-slop)
6893 + clen = MAX_MAP_CHUNK-slop;
6894 + mapaddr = ramdisk_image & PAGE_MASK;
6895 + p = early_ioremap(mapaddr, clen+slop);
6896 + memcpy(q, p+slop, clen);
6897 + early_iounmap(p, clen+slop);
6898 + q += clen;
6899 + ramdisk_image += clen;
6900 + ramdisk_size -= clen;
6901 + }
6902 +}
6903 +
6904 +#endif /* CONFIG_BLK_DEV_INITRD */
6905 +
6906 void __init setup_bootmem_allocator(void)
6907 {
6908 unsigned long bootmap_size;
6909 @@ -478,14 +648,15 @@ void __init setup_bootmem_allocator(void
6910 * bootmem allocator with an invalid RAM area.
6911 */
6912 reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
6913 - bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
6914 + bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
6915 + BOOTMEM_DEFAULT);
6916
6917 #ifndef CONFIG_XEN
6918 /*
6919 * reserve physical page 0 - it's a special BIOS page on many boxes,
6920 * enabling clean reboots, SMP operation, laptop functions.
6921 */
6922 - reserve_bootmem(0, PAGE_SIZE);
6923 + reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
6924
6925 /* reserve EBDA region, it's a 4K region */
6926 reserve_ebda_region();
6927 @@ -495,7 +666,7 @@ void __init setup_bootmem_allocator(void
6928 unless you have no PS/2 mouse plugged in. */
6929 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
6930 boot_cpu_data.x86 == 6)
6931 - reserve_bootmem(0xa0000 - 4096, 4096);
6932 + reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT);
6933
6934 #ifdef CONFIG_SMP
6935 /*
6936 @@ -503,7 +674,7 @@ void __init setup_bootmem_allocator(void
6937 * FIXME: Don't need the extra page at 4K, but need to fix
6938 * trampoline before removing it. (see the GDT stuff)
6939 */
6940 - reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
6941 + reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
6942 #endif
6943 #ifdef CONFIG_ACPI_SLEEP
6944 /*
6945 @@ -511,29 +682,12 @@ void __init setup_bootmem_allocator(void
6946 */
6947 acpi_reserve_bootmem();
6948 #endif
6949 - numa_kva_reserve();
6950 #endif /* !CONFIG_XEN */
6951
6952 #ifdef CONFIG_BLK_DEV_INITRD
6953 - if (xen_start_info->mod_start) {
6954 - unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
6955 - unsigned long ramdisk_size = xen_start_info->mod_len;
6956 - unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
6957 - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6958 -
6959 - if (ramdisk_end <= end_of_lowmem) {
6960 - /*reserve_bootmem(ramdisk_image, ramdisk_size);*/
6961 - initrd_start = ramdisk_image + PAGE_OFFSET;
6962 - initrd_end = initrd_start+ramdisk_size;
6963 - initrd_below_start_ok = 1;
6964 - } else {
6965 - printk(KERN_ERR "initrd extends beyond end of memory "
6966 - "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
6967 - ramdisk_end, end_of_lowmem);
6968 - initrd_start = 0;
6969 - }
6970 - }
6971 + reserve_initrd();
6972 #endif
6973 + numa_kva_reserve();
6974 reserve_crashkernel();
6975 }
6976
6977 @@ -600,20 +754,14 @@ void __init setup_arch(char **cmdline_p)
6978 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
6979 pre_setup_arch_hook();
6980 early_cpu_init();
6981 + early_ioremap_init();
6982 #ifdef CONFIG_SMP
6983 prefill_possible_map();
6984 #endif
6985
6986 - /*
6987 - * FIXME: This isn't an official loader_type right
6988 - * now but does currently work with elilo.
6989 - * If we were configured as an EFI kernel, check to make
6990 - * sure that we were loaded correctly from elilo and that
6991 - * the system table is valid. If not, then initialize normally.
6992 - */
6993 #ifdef CONFIG_EFI
6994 - if ((boot_params.hdr.type_of_loader == 0x50) &&
6995 - boot_params.efi_info.efi_systab)
6996 + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
6997 + "EL32", 4))
6998 efi_enabled = 1;
6999 #endif
7000
7001 @@ -653,12 +801,9 @@ void __init setup_arch(char **cmdline_p)
7002 #endif
7003
7004 ARCH_SETUP
7005 - if (efi_enabled)
7006 - efi_init();
7007 - else {
7008 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
7009 - print_memory_map(memory_setup());
7010 - }
7011 +
7012 + printk(KERN_INFO "BIOS-provided physical RAM map:\n");
7013 + print_memory_map(memory_setup());
7014
7015 copy_edd();
7016
7017 @@ -691,6 +836,17 @@ void __init setup_arch(char **cmdline_p)
7018 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
7019 *cmdline_p = command_line;
7020
7021 + if (efi_enabled)
7022 + efi_init();
7023 +
7024 + /* update e820 for memory not covered by WB MTRRs */
7025 + find_max_pfn();
7026 + mtrr_bp_init();
7027 +#ifndef CONFIG_XEN
7028 + if (mtrr_trim_uncached_memory(max_pfn))
7029 + find_max_pfn();
7030 +#endif
7031 +
7032 max_low_pfn = setup_memory();
7033
7034 #ifdef CONFIG_VMI
7035 @@ -715,6 +871,16 @@ void __init setup_arch(char **cmdline_p)
7036 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
7037 #endif
7038 paging_init();
7039 +
7040 + /*
7041 + * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
7042 + */
7043 +
7044 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
7045 + if (init_ohci1394_dma_early)
7046 + init_ohci1394_dma_on_all_controllers();
7047 +#endif
7048 +
7049 remapped_pgdat_init();
7050 sparse_init();
7051 zone_sizes_init();
7052 @@ -800,16 +966,20 @@ void __init setup_arch(char **cmdline_p)
7053 * NOTE: at this point the bootmem allocator is fully available.
7054 */
7055
7056 +#ifdef CONFIG_BLK_DEV_INITRD
7057 + relocate_initrd();
7058 +#endif
7059 +
7060 paravirt_post_allocator_init();
7061
7062 if (is_initial_xendomain())
7063 dmi_scan_machine();
7064
7065 + io_delay_init();
7066 +
7067 #ifdef CONFIG_X86_GENERICARCH
7068 generic_apic_probe();
7069 -#endif
7070 - if (efi_enabled)
7071 - efi_map_memmap();
7072 +#endif
7073
7074 set_iopl.iopl = 1;
7075 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
7076 @@ -827,7 +997,7 @@ void __init setup_arch(char **cmdline_p)
7077 acpi_boot_table_init();
7078 #endif
7079
7080 -#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
7081 +#ifndef CONFIG_XEN
7082 early_quirks();
7083 #endif
7084
7085 @@ -873,3 +1043,30 @@ xen_panic_event(struct notifier_block *t
7086 /* we're never actually going to get here... */
7087 return NOTIFY_DONE;
7088 }
7089 +
7090 +/*
7091 + * Request address space for all standard resources
7092 + *
7093 + * This is called just before pcibios_init(), which is also a
7094 + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
7095 + */
7096 +static int __init request_standard_resources(void)
7097 +{
7098 + int i;
7099 +
7100 + /* Nothing to do if not running in dom0. */
7101 + if (!is_initial_xendomain())
7102 + return 0;
7103 +
7104 + printk(KERN_INFO "Setting up standard PCI resources\n");
7105 + init_iomem_resources(&code_resource, &data_resource, &bss_resource);
7106 +
7107 + request_resource(&iomem_resource, &video_ram_resource);
7108 +
7109 + /* request I/O space for devices used on all i[345]86 PCs */
7110 + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
7111 + request_resource(&ioport_resource, &standard_io_resources[i]);
7112 + return 0;
7113 +}
7114 +
7115 +subsys_initcall(request_standard_resources);
7116 --- sle11-2009-05-14.orig/arch/x86/kernel/setup_64-xen.c 2009-02-16 16:18:36.000000000 +0100
7117 +++ sle11-2009-05-14/arch/x86/kernel/setup_64-xen.c 2009-03-16 16:33:40.000000000 +0100
7118 @@ -15,7 +15,6 @@
7119 #include <linux/ptrace.h>
7120 #include <linux/slab.h>
7121 #include <linux/user.h>
7122 -#include <linux/a.out.h>
7123 #include <linux/screen_info.h>
7124 #include <linux/ioport.h>
7125 #include <linux/delay.h>
7126 @@ -30,6 +29,7 @@
7127 #include <linux/crash_dump.h>
7128 #include <linux/root_dev.h>
7129 #include <linux/pci.h>
7130 +#include <linux/efi.h>
7131 #include <linux/acpi.h>
7132 #include <linux/kallsyms.h>
7133 #include <linux/edd.h>
7134 @@ -39,10 +39,13 @@
7135 #include <linux/dmi.h>
7136 #include <linux/dma-mapping.h>
7137 #include <linux/ctype.h>
7138 +#include <linux/uaccess.h>
7139 +#include <linux/init_ohci1394_dma.h>
7140
7141 #include <asm/mtrr.h>
7142 #include <asm/uaccess.h>
7143 #include <asm/system.h>
7144 +#include <asm/vsyscall.h>
7145 #include <asm/io.h>
7146 #include <asm/smp.h>
7147 #include <asm/msr.h>
7148 @@ -50,6 +53,7 @@
7149 #include <video/edid.h>
7150 #include <asm/e820.h>
7151 #include <asm/dma.h>
7152 +#include <asm/gart.h>
7153 #include <asm/mpspec.h>
7154 #include <asm/mmu_context.h>
7155 #include <asm/proto.h>
7156 @@ -59,6 +63,9 @@
7157 #include <asm/sections.h>
7158 #include <asm/dmi.h>
7159 #include <asm/cacheflush.h>
7160 +#include <asm/mce.h>
7161 +#include <asm/ds.h>
7162 +#include <asm/topology.h>
7163 #ifdef CONFIG_XEN
7164 #include <linux/percpu.h>
7165 #include <xen/interface/physdev.h>
7166 @@ -108,6 +115,8 @@ EXPORT_SYMBOL(xen_start_info);
7167 struct cpuinfo_x86 boot_cpu_data __read_mostly;
7168 EXPORT_SYMBOL(boot_cpu_data);
7169
7170 +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
7171 +
7172 unsigned long mmu_cr4_features;
7173
7174 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
7175 @@ -117,7 +126,7 @@ unsigned long saved_video_mode;
7176
7177 int force_mwait __cpuinitdata;
7178
7179 -/*
7180 +/*
7181 * Early DMI memory
7182 */
7183 int dmi_alloc_index;
7184 @@ -163,25 +172,27 @@ struct resource standard_io_resources[]
7185
7186 #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
7187
7188 -struct resource data_resource = {
7189 +static struct resource data_resource = {
7190 .name = "Kernel data",
7191 .start = 0,
7192 .end = 0,
7193 .flags = IORESOURCE_RAM,
7194 };
7195 -struct resource code_resource = {
7196 +static struct resource code_resource = {
7197 .name = "Kernel code",
7198 .start = 0,
7199 .end = 0,
7200 .flags = IORESOURCE_RAM,
7201 };
7202 -struct resource bss_resource = {
7203 +static struct resource bss_resource = {
7204 .name = "Kernel bss",
7205 .start = 0,
7206 .end = 0,
7207 .flags = IORESOURCE_RAM,
7208 };
7209
7210 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
7211 +
7212 #ifdef CONFIG_PROC_VMCORE
7213 /* elfcorehdr= specifies the location of elf core header
7214 * stored by the crashed kernel. This option will be passed
7215 @@ -205,9 +216,10 @@ contig_initmem_init(unsigned long start_
7216 unsigned long bootmap_size, bootmap;
7217
7218 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
7219 - bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
7220 + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
7221 + PAGE_SIZE);
7222 if (bootmap == -1L)
7223 - panic("Cannot find bootmem map of size %ld\n",bootmap_size);
7224 + panic("Cannot find bootmem map of size %ld\n", bootmap_size);
7225 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
7226 e820_register_active_regions(0, start_pfn, end_pfn);
7227 #ifdef CONFIG_XEN
7228 @@ -215,8 +227,8 @@ contig_initmem_init(unsigned long start_
7229 #else
7230 free_bootmem_with_active_regions(0, end_pfn);
7231 #endif
7232 - reserve_bootmem(bootmap, bootmap_size);
7233 -}
7234 + reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
7235 +}
7236 #endif
7237
7238 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
7239 @@ -249,27 +261,35 @@ static inline void copy_edd(void)
7240 #ifndef CONFIG_XEN
7241 static void __init reserve_crashkernel(void)
7242 {
7243 - unsigned long long free_mem;
7244 + unsigned long long total_mem;
7245 unsigned long long crash_size, crash_base;
7246 int ret;
7247
7248 - free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
7249 + total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
7250
7251 - ret = parse_crashkernel(boot_command_line, free_mem,
7252 + ret = parse_crashkernel(boot_command_line, total_mem,
7253 &crash_size, &crash_base);
7254 if (ret == 0 && crash_size) {
7255 - if (crash_base > 0) {
7256 - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
7257 - "for crashkernel (System RAM: %ldMB)\n",
7258 - (unsigned long)(crash_size >> 20),
7259 - (unsigned long)(crash_base >> 20),
7260 - (unsigned long)(free_mem >> 20));
7261 - crashk_res.start = crash_base;
7262 - crashk_res.end = crash_base + crash_size - 1;
7263 - reserve_bootmem(crash_base, crash_size);
7264 - } else
7265 + if (crash_base <= 0) {
7266 printk(KERN_INFO "crashkernel reservation failed - "
7267 "you have to specify a base address\n");
7268 + return;
7269 + }
7270 +
7271 + if (reserve_bootmem(crash_base, crash_size,
7272 + BOOTMEM_EXCLUSIVE) < 0) {
7273 + printk(KERN_INFO "crashkernel reservation failed - "
7274 + "memory is in use\n");
7275 + return;
7276 + }
7277 +
7278 + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
7279 + "for crashkernel (System RAM: %ldMB)\n",
7280 + (unsigned long)(crash_size >> 20),
7281 + (unsigned long)(crash_base >> 20),
7282 + (unsigned long)(total_mem >> 20));
7283 + crashk_res.start = crash_base;
7284 + crashk_res.end = crash_base + crash_size - 1;
7285 }
7286 }
7287 #else
7288 @@ -280,37 +300,21 @@ static inline void __init reserve_crashk
7289 {}
7290 #endif
7291
7292 -#ifndef CONFIG_XEN
7293 -#define EBDA_ADDR_POINTER 0x40E
7294 -
7295 -unsigned __initdata ebda_addr;
7296 -unsigned __initdata ebda_size;
7297 -
7298 -static void discover_ebda(void)
7299 +/* Overridden in paravirt.c if CONFIG_PARAVIRT */
7300 +void __attribute__((weak)) __init memory_setup(void)
7301 {
7302 - /*
7303 - * there is a real-mode segmented pointer pointing to the
7304 - * 4K EBDA area at 0x40E
7305 - */
7306 - ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
7307 - ebda_addr <<= 4;
7308 -
7309 - ebda_size = *(unsigned short *)__va(ebda_addr);
7310 -
7311 - /* Round EBDA up to pages */
7312 - if (ebda_size == 0)
7313 - ebda_size = 1;
7314 - ebda_size <<= 10;
7315 - ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
7316 - if (ebda_size > 64*1024)
7317 - ebda_size = 64*1024;
7318 + machine_specific_memory_setup();
7319 }
7320 -#else
7321 -#define discover_ebda() ((void)0)
7322 -#endif
7323
7324 +/*
7325 + * setup_arch - architecture-specific boot-time initializations
7326 + *
7327 + * Note: On x86_64, fixmaps are ready for use even before this is called.
7328 + */
7329 void __init setup_arch(char **cmdline_p)
7330 {
7331 + unsigned i;
7332 +
7333 #ifdef CONFIG_XEN
7334 extern struct e820map machine_e820;
7335
7336 @@ -319,6 +323,11 @@ void __init setup_arch(char **cmdline_p)
7337 /* Register a call for panic conditions. */
7338 atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
7339
7340 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
7341 + VMASST_TYPE_writable_pagetables));
7342 +
7343 + early_ioremap_init();
7344 +
7345 ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
7346 screen_info = boot_params.screen_info;
7347
7348 @@ -335,11 +344,6 @@ void __init setup_arch(char **cmdline_p)
7349 screen_info.orig_video_isVGA = 0;
7350
7351 copy_edid();
7352 -
7353 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
7354 - VMASST_TYPE_writable_pagetables));
7355 -
7356 - ARCH_SETUP
7357 #else
7358 printk(KERN_INFO "Command line: %s\n", boot_command_line);
7359
7360 @@ -355,7 +359,15 @@ void __init setup_arch(char **cmdline_p)
7361 rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
7362 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
7363 #endif
7364 - setup_memory_region();
7365 +#ifdef CONFIG_EFI
7366 + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
7367 + "EL64", 4))
7368 + efi_enabled = 1;
7369 +#endif
7370 +
7371 + ARCH_SETUP
7372 +
7373 + memory_setup();
7374 copy_edd();
7375
7376 if (!boot_params.hdr.root_flags)
7377 @@ -379,28 +391,51 @@ void __init setup_arch(char **cmdline_p)
7378
7379 parse_early_param();
7380
7381 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
7382 + if (init_ohci1394_dma_early)
7383 + init_ohci1394_dma_on_all_controllers();
7384 +#endif
7385 +
7386 finish_e820_parsing();
7387
7388 + early_gart_iommu_check();
7389 +
7390 e820_register_active_regions(0, 0, -1UL);
7391 /*
7392 * partially used pages are not usable - thus
7393 * we are rounding upwards:
7394 */
7395 end_pfn = e820_end_of_ram();
7396 + /* update e820 for memory not covered by WB MTRRs */
7397 + mtrr_bp_init();
7398 +#ifndef CONFIG_XEN
7399 + if (mtrr_trim_uncached_memory(end_pfn)) {
7400 + e820_register_active_regions(0, 0, -1UL);
7401 + end_pfn = e820_end_of_ram();
7402 + }
7403 +#endif
7404 +
7405 num_physpages = end_pfn;
7406 + max_mapnr = end_pfn;
7407
7408 check_efer();
7409
7410 - discover_ebda();
7411 -
7412 init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
7413 + if (efi_enabled)
7414 + efi_init();
7415
7416 if (is_initial_xendomain())
7417 dmi_scan_machine();
7418
7419 + io_delay_init();
7420 +
7421 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
7422 - /* setup to use the static apicid table during kernel startup */
7423 - x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init;
7424 + /* setup to use the early static init tables during kernel startup */
7425 + x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
7426 + x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
7427 +#ifdef CONFIG_NUMA
7428 + x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
7429 +#endif
7430 #endif
7431
7432 /* How many end-of-memory variables you have, grandma! */
7433 @@ -419,54 +454,25 @@ void __init setup_arch(char **cmdline_p)
7434 #endif
7435
7436 #ifdef CONFIG_NUMA
7437 - numa_initmem_init(0, end_pfn);
7438 + numa_initmem_init(0, end_pfn);
7439 #else
7440 contig_initmem_init(0, end_pfn);
7441 #endif
7442
7443 -#ifdef CONFIG_XEN
7444 - /*
7445 - * Reserve kernel, physmap, start info, initial page tables, and
7446 - * direct mapping.
7447 - */
7448 - reserve_bootmem_generic(__pa_symbol(&_text),
7449 - (table_end << PAGE_SHIFT) - __pa_symbol(&_text));
7450 -#else
7451 - /* Reserve direct mapping */
7452 - reserve_bootmem_generic(table_start << PAGE_SHIFT,
7453 - (table_end - table_start) << PAGE_SHIFT);
7454 -
7455 - /* reserve kernel */
7456 - reserve_bootmem_generic(__pa_symbol(&_text),
7457 - __pa_symbol(&_end) - __pa_symbol(&_text));
7458 + early_res_to_bootmem();
7459
7460 +#ifndef CONFIG_XEN
7461 +#ifdef CONFIG_ACPI_SLEEP
7462 /*
7463 - * reserve physical page 0 - it's a special BIOS page on many boxes,
7464 - * enabling clean reboots, SMP operation, laptop functions.
7465 + * Reserve low memory region for sleep support.
7466 */
7467 - reserve_bootmem_generic(0, PAGE_SIZE);
7468 -
7469 - /* reserve ebda region */
7470 - if (ebda_addr)
7471 - reserve_bootmem_generic(ebda_addr, ebda_size);
7472 -#ifdef CONFIG_NUMA
7473 - /* reserve nodemap region */
7474 - if (nodemap_addr)
7475 - reserve_bootmem_generic(nodemap_addr, nodemap_size);
7476 + acpi_reserve_bootmem();
7477 #endif
7478
7479 -#ifdef CONFIG_SMP
7480 - /* Reserve SMP trampoline */
7481 - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
7482 -#endif
7483 + if (efi_enabled)
7484 + efi_reserve_bootmem();
7485 #endif
7486
7487 -#ifdef CONFIG_ACPI_SLEEP
7488 - /*
7489 - * Reserve low memory region for sleep support.
7490 - */
7491 - acpi_reserve_bootmem();
7492 -#endif
7493 #ifdef CONFIG_BLK_DEV_INITRD
7494 #ifdef CONFIG_XEN
7495 if (xen_start_info->mod_start) {
7496 @@ -490,6 +496,8 @@ void __init setup_arch(char **cmdline_p)
7497 initrd_below_start_ok = 1;
7498 #endif
7499 } else {
7500 + /* Assumes everything on node 0 */
7501 + free_bootmem(ramdisk_image, ramdisk_size);
7502 printk(KERN_ERR "initrd extends beyond end of memory "
7503 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
7504 ramdisk_end, end_of_mem);
7505 @@ -499,10 +507,11 @@ void __init setup_arch(char **cmdline_p)
7506 #endif
7507 reserve_crashkernel();
7508 paging_init();
7509 + map_vsyscall();
7510 #ifdef CONFIG_X86_LOCAL_APIC
7511 /*
7512 - * Find and reserve possible boot-time SMP configuration:
7513 - */
7514 + * Find and reserve possible boot-time SMP configuration:
7515 + */
7516 find_smp_config();
7517 #endif
7518 #ifdef CONFIG_XEN
7519 @@ -590,16 +599,10 @@ void __init setup_arch(char **cmdline_p)
7520 #endif
7521 #endif
7522
7523 -#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
7524 +#ifndef CONFIG_XEN
7525 early_quirks();
7526 #endif
7527
7528 - /*
7529 - * set this early, so we dont allocate cpu0
7530 - * if MADT list doesnt list BSP first
7531 - * mpparse.c/MP_processor_info() allocates logical cpu numbers.
7532 - */
7533 - cpu_set(0, cpu_present_map);
7534 #ifdef CONFIG_ACPI
7535 /*
7536 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
7537 @@ -623,6 +626,7 @@ void __init setup_arch(char **cmdline_p)
7538 get_smp_config();
7539 #ifndef CONFIG_XEN
7540 init_apic_mappings();
7541 + ioapic_init_mappings();
7542 #endif
7543 #endif
7544 #if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
7545 @@ -634,18 +638,17 @@ void __init setup_arch(char **cmdline_p)
7546 */
7547 #ifdef CONFIG_XEN
7548 if (is_initial_xendomain())
7549 - e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
7550 + e820_reserve_resources(machine_e820.map, machine_e820.nr_map,
7551 + &code_resource, &data_resource, &bss_resource);
7552 #else
7553 - e820_reserve_resources(e820.map, e820.nr_map);
7554 + e820_reserve_resources(e820.map, e820.nr_map,
7555 + &code_resource, &data_resource, &bss_resource);
7556 e820_mark_nosave_regions();
7557 #endif
7558
7559 - {
7560 - unsigned i;
7561 /* request I/O space for devices used on all i[345]86 PCs */
7562 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
7563 request_resource(&ioport_resource, &standard_io_resources[i]);
7564 - }
7565
7566 #ifdef CONFIG_XEN
7567 if (is_initial_xendomain())
7568 @@ -679,7 +682,8 @@ void __init setup_arch(char **cmdline_p)
7569
7570 #ifdef CONFIG_VT
7571 #if defined(CONFIG_VGA_CONSOLE)
7572 - conswitchp = &vga_con;
7573 + if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
7574 + conswitchp = &vga_con;
7575 #elif defined(CONFIG_DUMMY_CONSOLE)
7576 conswitchp = &dummy_con;
7577 #endif
7578 @@ -723,9 +727,10 @@ static void __cpuinit display_cacheinfo(
7579
7580 if (n >= 0x80000005) {
7581 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
7582 - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
7583 - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
7584 - c->x86_cache_size=(ecx>>24)+(edx>>24);
7585 + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
7586 + "D cache %dK (%d bytes/line)\n",
7587 + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
7588 + c->x86_cache_size = (ecx>>24) + (edx>>24);
7589 /* On K8 L1 TLB is inclusive, so don't count it */
7590 c->x86_tlbsize = 0;
7591 }
7592 @@ -739,27 +744,25 @@ static void __cpuinit display_cacheinfo(
7593 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
7594 c->x86_cache_size, ecx & 0xFF);
7595 }
7596 -
7597 - if (n >= 0x80000007)
7598 - cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
7599 if (n >= 0x80000008) {
7600 - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
7601 + cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
7602 c->x86_virt_bits = (eax >> 8) & 0xff;
7603 c->x86_phys_bits = eax & 0xff;
7604 }
7605 }
7606
7607 #ifdef CONFIG_NUMA
7608 -static int nearby_node(int apicid)
7609 +static int __cpuinit nearby_node(int apicid)
7610 {
7611 - int i;
7612 + int i, node;
7613 +
7614 for (i = apicid - 1; i >= 0; i--) {
7615 - int node = apicid_to_node[i];
7616 + node = apicid_to_node[i];
7617 if (node != NUMA_NO_NODE && node_online(node))
7618 return node;
7619 }
7620 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
7621 - int node = apicid_to_node[i];
7622 + node = apicid_to_node[i];
7623 if (node != NUMA_NO_NODE && node_online(node))
7624 return node;
7625 }
7626 @@ -771,7 +774,7 @@ static int nearby_node(int apicid)
7627 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
7628 * Assumes number of cores is a power of two.
7629 */
7630 -static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
7631 +static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
7632 {
7633 #ifdef CONFIG_SMP
7634 unsigned bits;
7635 @@ -780,7 +783,54 @@ static void __init amd_detect_cmp(struct
7636 int node = 0;
7637 unsigned apicid = hard_smp_processor_id();
7638 #endif
7639 - unsigned ecx = cpuid_ecx(0x80000008);
7640 + bits = c->x86_coreid_bits;
7641 +
7642 + /* Low order bits define the core id (index of core in socket) */
7643 + c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
7644 + /* Convert the APIC ID into the socket ID */
7645 + c->phys_proc_id = phys_pkg_id(bits);
7646 +
7647 +#ifdef CONFIG_NUMA
7648 + node = c->phys_proc_id;
7649 + if (apicid_to_node[apicid] != NUMA_NO_NODE)
7650 + node = apicid_to_node[apicid];
7651 + if (!node_online(node)) {
7652 + /* Two possibilities here:
7653 + - The CPU is missing memory and no node was created.
7654 + In that case try picking one from a nearby CPU
7655 + - The APIC IDs differ from the HyperTransport node IDs
7656 + which the K8 northbridge parsing fills in.
7657 + Assume they are all increased by a constant offset,
7658 + but in the same order as the HT nodeids.
7659 + If that doesn't result in a usable node fall back to the
7660 + path for the previous case. */
7661 +
7662 + int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
7663 +
7664 + if (ht_nodeid >= 0 &&
7665 + apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
7666 + node = apicid_to_node[ht_nodeid];
7667 + /* Pick a nearby node */
7668 + if (!node_online(node))
7669 + node = nearby_node(apicid);
7670 + }
7671 + numa_set_node(cpu, node);
7672 +
7673 + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
7674 +#endif
7675 +#endif
7676 +}
7677 +
7678 +static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
7679 +{
7680 +#ifdef CONFIG_SMP
7681 + unsigned bits, ecx;
7682 +
7683 + /* Multi core CPU? */
7684 + if (c->extended_cpuid_level < 0x80000008)
7685 + return;
7686 +
7687 + ecx = cpuid_ecx(0x80000008);
7688
7689 c->x86_max_cores = (ecx & 0xff) + 1;
7690
7691 @@ -793,37 +843,8 @@ static void __init amd_detect_cmp(struct
7692 bits++;
7693 }
7694
7695 - /* Low order bits define the core id (index of core in socket) */
7696 - c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
7697 - /* Convert the APIC ID into the socket ID */
7698 - c->phys_proc_id = phys_pkg_id(bits);
7699 -
7700 -#ifdef CONFIG_NUMA
7701 - node = c->phys_proc_id;
7702 - if (apicid_to_node[apicid] != NUMA_NO_NODE)
7703 - node = apicid_to_node[apicid];
7704 - if (!node_online(node)) {
7705 - /* Two possibilities here:
7706 - - The CPU is missing memory and no node was created.
7707 - In that case try picking one from a nearby CPU
7708 - - The APIC IDs differ from the HyperTransport node IDs
7709 - which the K8 northbridge parsing fills in.
7710 - Assume they are all increased by a constant offset,
7711 - but in the same order as the HT nodeids.
7712 - If that doesn't result in a usable node fall back to the
7713 - path for the previous case. */
7714 - int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
7715 - if (ht_nodeid >= 0 &&
7716 - apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
7717 - node = apicid_to_node[ht_nodeid];
7718 - /* Pick a nearby node */
7719 - if (!node_online(node))
7720 - node = nearby_node(apicid);
7721 - }
7722 - numa_set_node(cpu, node);
7723 + c->x86_coreid_bits = bits;
7724
7725 - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
7726 -#endif
7727 #endif
7728 }
7729
7730 @@ -840,8 +861,8 @@ static void __init amd_detect_cmp(struct
7731 /* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
7732 static __cpuinit int amd_apic_timer_broken(void)
7733 {
7734 - u32 lo, hi;
7735 - u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
7736 + u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
7737 +
7738 switch (eax & CPUID_XFAM) {
7739 case CPUID_XFAM_K8:
7740 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
7741 @@ -860,6 +881,15 @@ static __cpuinit int amd_apic_timer_brok
7742 }
7743 #endif
7744
7745 +static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
7746 +{
7747 + early_init_amd_mc(c);
7748 +
7749 + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
7750 + if (c->x86_power & (1<<8))
7751 + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
7752 +}
7753 +
7754 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
7755 {
7756 unsigned level;
7757 @@ -870,7 +900,7 @@ static void __cpuinit init_amd(struct cp
7758 /*
7759 * Disable TLB flush filter by setting HWCR.FFDIS on K8
7760 * bit 6 of msr C001_0015
7761 - *
7762 + *
7763 * Errata 63 for SH-B3 steppings
7764 * Errata 122 for all steppings (F+ have it disabled by default)
7765 */
7766 @@ -883,35 +913,32 @@ static void __cpuinit init_amd(struct cp
7767
7768 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
7769 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
7770 - clear_bit(0*32+31, &c->x86_capability);
7771 -
7772 + clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
7773 +
7774 /* On C+ stepping K8 rep microcode works well for copy/memset */
7775 level = cpuid_eax(1);
7776 - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
7777 - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7778 + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
7779 + level >= 0x0f58))
7780 + set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7781 if (c->x86 == 0x10 || c->x86 == 0x11)
7782 - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7783 + set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7784
7785 /* Enable workaround for FXSAVE leak */
7786 if (c->x86 >= 6)
7787 - set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
7788 + set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
7789
7790 level = get_model_name(c);
7791 if (!level) {
7792 - switch (c->x86) {
7793 + switch (c->x86) {
7794 case 15:
7795 /* Should distinguish Models here, but this is only
7796 a fallback anyways. */
7797 strcpy(c->x86_model_id, "Hammer");
7798 - break;
7799 - }
7800 - }
7801 + break;
7802 + }
7803 + }
7804 display_cacheinfo(c);
7805
7806 - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
7807 - if (c->x86_power & (1<<8))
7808 - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
7809 -
7810 /* Multi core CPU? */
7811 if (c->extended_cpuid_level >= 0x80000008)
7812 amd_detect_cmp(c);
7813 @@ -923,14 +950,10 @@ static void __cpuinit init_amd(struct cp
7814 num_cache_leaves = 3;
7815
7816 if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
7817 - set_bit(X86_FEATURE_K8, &c->x86_capability);
7818 -
7819 - /* RDTSC can be speculated around */
7820 - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7821 + set_cpu_cap(c, X86_FEATURE_K8);
7822
7823 - /* Family 10 doesn't support C states in MWAIT so don't use it */
7824 - if (c->x86 == 0x10 && !force_mwait)
7825 - clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
7826 + /* MFENCE stops RDTSC speculation */
7827 + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
7828
7829 #ifndef CONFIG_XEN
7830 if (amd_apic_timer_broken())
7831 @@ -938,28 +961,29 @@ static void __cpuinit init_amd(struct cp
7832 #endif
7833 }
7834
7835 -static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
7836 +void __cpuinit detect_ht(struct cpuinfo_x86 *c)
7837 {
7838 #ifdef CONFIG_SMP
7839 - u32 eax, ebx, ecx, edx;
7840 - int index_msb, core_bits;
7841 + u32 eax, ebx, ecx, edx;
7842 + int index_msb, core_bits;
7843
7844 cpuid(1, &eax, &ebx, &ecx, &edx);
7845
7846
7847 if (!cpu_has(c, X86_FEATURE_HT))
7848 return;
7849 - if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
7850 + if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
7851 goto out;
7852
7853 smp_num_siblings = (ebx & 0xff0000) >> 16;
7854
7855 if (smp_num_siblings == 1) {
7856 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
7857 - } else if (smp_num_siblings > 1 ) {
7858 + } else if (smp_num_siblings > 1) {
7859
7860 if (smp_num_siblings > NR_CPUS) {
7861 - printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
7862 + printk(KERN_WARNING "CPU: Unsupported number of "
7863 + "siblings %d", smp_num_siblings);
7864 smp_num_siblings = 1;
7865 return;
7866 }
7867 @@ -969,7 +993,7 @@ static void __cpuinit detect_ht(struct c
7868
7869 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
7870
7871 - index_msb = get_count_order(smp_num_siblings) ;
7872 + index_msb = get_count_order(smp_num_siblings);
7873
7874 core_bits = get_count_order(c->x86_max_cores);
7875
7876 @@ -978,8 +1002,10 @@ static void __cpuinit detect_ht(struct c
7877 }
7878 out:
7879 if ((c->x86_max_cores * smp_num_siblings) > 1) {
7880 - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
7881 - printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id);
7882 + printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
7883 + c->phys_proc_id);
7884 + printk(KERN_INFO "CPU: Processor Core ID: %d\n",
7885 + c->cpu_core_id);
7886 }
7887
7888 #endif
7889 @@ -1003,7 +1029,7 @@ static int __cpuinit intel_num_cpu_cores
7890 return 1;
7891 }
7892
7893 -static void srat_detect_node(void)
7894 +static void __cpuinit srat_detect_node(void)
7895 {
7896 #ifdef CONFIG_NUMA
7897 unsigned node;
7898 @@ -1013,7 +1039,7 @@ static void srat_detect_node(void)
7899 /* Don't do the funky fallback heuristics the AMD version employs
7900 for now. */
7901 node = apicid_to_node[apicid];
7902 - if (node == NUMA_NO_NODE)
7903 + if (node == NUMA_NO_NODE || !node_online(node))
7904 node = first_node(node_online_map);
7905 numa_set_node(cpu, node);
7906
7907 @@ -1021,28 +1047,39 @@ static void srat_detect_node(void)
7908 #endif
7909 }
7910
7911 +static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
7912 +{
7913 + if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
7914 + (c->x86 == 0x6 && c->x86_model >= 0x0e))
7915 + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
7916 +}
7917 +
7918 static void __cpuinit init_intel(struct cpuinfo_x86 *c)
7919 {
7920 /* Cache sizes */
7921 unsigned n;
7922
7923 init_intel_cacheinfo(c);
7924 - if (c->cpuid_level > 9 ) {
7925 + if (c->cpuid_level > 9) {
7926 unsigned eax = cpuid_eax(10);
7927 /* Check for version and the number of counters */
7928 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
7929 - set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
7930 + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
7931 }
7932
7933 if (cpu_has_ds) {
7934 unsigned int l1, l2;
7935 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
7936 if (!(l1 & (1<<11)))
7937 - set_bit(X86_FEATURE_BTS, c->x86_capability);
7938 + set_cpu_cap(c, X86_FEATURE_BTS);
7939 if (!(l1 & (1<<12)))
7940 - set_bit(X86_FEATURE_PEBS, c->x86_capability);
7941 + set_cpu_cap(c, X86_FEATURE_PEBS);
7942 }
7943
7944 +
7945 + if (cpu_has_bts)
7946 + ds_init_intel(c);
7947 +
7948 n = c->extended_cpuid_level;
7949 if (n >= 0x80000008) {
7950 unsigned eax = cpuid_eax(0x80000008);
7951 @@ -1059,14 +1096,11 @@ static void __cpuinit init_intel(struct
7952 c->x86_cache_alignment = c->x86_clflush_size * 2;
7953 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
7954 (c->x86 == 0x6 && c->x86_model >= 0x0e))
7955 - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
7956 + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
7957 if (c->x86 == 6)
7958 - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7959 - if (c->x86 == 15)
7960 - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7961 - else
7962 - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7963 - c->x86_max_cores = intel_num_cpu_cores(c);
7964 + set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7965 + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
7966 + c->x86_max_cores = intel_num_cpu_cores(c);
7967
7968 srat_detect_node();
7969 }
7970 @@ -1083,18 +1117,12 @@ static void __cpuinit get_cpu_vendor(str
7971 c->x86_vendor = X86_VENDOR_UNKNOWN;
7972 }
7973
7974 -struct cpu_model_info {
7975 - int vendor;
7976 - int family;
7977 - char *model_names[16];
7978 -};
7979 -
7980 /* Do some early cpuid on the boot CPU to get some parameter that are
7981 needed before check_bugs. Everything advanced is in identify_cpu
7982 below. */
7983 -void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
7984 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
7985 {
7986 - u32 tfms;
7987 + u32 tfms, xlvl;
7988
7989 c->loops_per_jiffy = loops_per_jiffy;
7990 c->x86_cache_size = -1;
7991 @@ -1105,6 +1133,7 @@ void __cpuinit early_identify_cpu(struct
7992 c->x86_clflush_size = 64;
7993 c->x86_cache_alignment = c->x86_clflush_size;
7994 c->x86_max_cores = 1;
7995 + c->x86_coreid_bits = 0;
7996 c->extended_cpuid_level = 0;
7997 memset(&c->x86_capability, 0, sizeof c->x86_capability);
7998
7999 @@ -1113,7 +1142,7 @@ void __cpuinit early_identify_cpu(struct
8000 (unsigned int *)&c->x86_vendor_id[0],
8001 (unsigned int *)&c->x86_vendor_id[8],
8002 (unsigned int *)&c->x86_vendor_id[4]);
8003 -
8004 +
8005 get_cpu_vendor(c);
8006
8007 /* Initialize the standard set of capabilities */
8008 @@ -1131,7 +1160,7 @@ void __cpuinit early_identify_cpu(struct
8009 c->x86 += (tfms >> 20) & 0xff;
8010 if (c->x86 >= 0x6)
8011 c->x86_model += ((tfms >> 16) & 0xF) << 4;
8012 - if (c->x86_capability[0] & (1<<19))
8013 + if (c->x86_capability[0] & (1<<19))
8014 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
8015 } else {
8016 /* Have CPUID level 0 only - unheard of */
8017 @@ -1141,18 +1170,6 @@ void __cpuinit early_identify_cpu(struct
8018 #ifdef CONFIG_SMP
8019 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
8020 #endif
8021 -}
8022 -
8023 -/*
8024 - * This does the hard work of actually picking apart the CPU stuff...
8025 - */
8026 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
8027 -{
8028 - int i;
8029 - u32 xlvl;
8030 -
8031 - early_identify_cpu(c);
8032 -
8033 /* AMD-defined flags: level 0x80000001 */
8034 xlvl = cpuid_eax(0x80000000);
8035 c->extended_cpuid_level = xlvl;
8036 @@ -1173,6 +1190,30 @@ void __cpuinit identify_cpu(struct cpuin
8037 c->x86_capability[2] = cpuid_edx(0x80860001);
8038 }
8039
8040 + c->extended_cpuid_level = cpuid_eax(0x80000000);
8041 + if (c->extended_cpuid_level >= 0x80000007)
8042 + c->x86_power = cpuid_edx(0x80000007);
8043 +
8044 + switch (c->x86_vendor) {
8045 + case X86_VENDOR_AMD:
8046 + early_init_amd(c);
8047 + break;
8048 + case X86_VENDOR_INTEL:
8049 + early_init_intel(c);
8050 + break;
8051 + }
8052 +
8053 +}
8054 +
8055 +/*
8056 + * This does the hard work of actually picking apart the CPU stuff...
8057 + */
8058 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
8059 +{
8060 + int i;
8061 +
8062 + early_identify_cpu(c);
8063 +
8064 init_scattered_cpuid_features(c);
8065
8066 c->apicid = phys_pkg_id(0);
8067 @@ -1202,8 +1243,7 @@ void __cpuinit identify_cpu(struct cpuin
8068 break;
8069 }
8070
8071 - select_idle_routine(c);
8072 - detect_ht(c);
8073 + detect_ht(c);
8074
8075 /*
8076 * On SMP, boot_cpu_data holds the common feature set between
8077 @@ -1213,31 +1253,55 @@ void __cpuinit identify_cpu(struct cpuin
8078 */
8079 if (c != &boot_cpu_data) {
8080 /* AND the already accumulated flags with these */
8081 - for (i = 0 ; i < NCAPINTS ; i++)
8082 + for (i = 0; i < NCAPINTS; i++)
8083 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
8084 }
8085
8086 + /* Clear all flags overriden by options */
8087 + for (i = 0; i < NCAPINTS; i++)
8088 + c->x86_capability[i] &= ~cleared_cpu_caps[i];
8089 +
8090 #ifdef CONFIG_X86_MCE
8091 mcheck_init(c);
8092 #endif
8093 + select_idle_routine(c);
8094 +
8095 if (c != &boot_cpu_data)
8096 mtrr_ap_init();
8097 #ifdef CONFIG_NUMA
8098 numa_add_cpu(smp_processor_id());
8099 #endif
8100 +
8101 }
8102 -
8103 +
8104 +static __init int setup_noclflush(char *arg)
8105 +{
8106 + setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
8107 + return 1;
8108 +}
8109 +__setup("noclflush", setup_noclflush);
8110
8111 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
8112 {
8113 if (c->x86_model_id[0])
8114 - printk("%s", c->x86_model_id);
8115 + printk(KERN_CONT "%s", c->x86_model_id);
8116 +
8117 + if (c->x86_mask || c->cpuid_level >= 0)
8118 + printk(KERN_CONT " stepping %02x\n", c->x86_mask);
8119 + else
8120 + printk(KERN_CONT "\n");
8121 +}
8122
8123 - if (c->x86_mask || c->cpuid_level >= 0)
8124 - printk(" stepping %02x\n", c->x86_mask);
8125 +static __init int setup_disablecpuid(char *arg)
8126 +{
8127 + int bit;
8128 + if (get_option(&arg, &bit) && bit < NCAPINTS*32)
8129 + setup_clear_cpu_cap(bit);
8130 else
8131 - printk("\n");
8132 + return 0;
8133 + return 1;
8134 }
8135 +__setup("clearcpuid=", setup_disablecpuid);
8136
8137 /*
8138 * Get CPU information for use by the procfs.
8139 @@ -1246,116 +1310,41 @@ void __cpuinit print_cpu_info(struct cpu
8140 static int show_cpuinfo(struct seq_file *m, void *v)
8141 {
8142 struct cpuinfo_x86 *c = v;
8143 - int cpu = 0;
8144 -
8145 - /*
8146 - * These flag bits must match the definitions in <asm/cpufeature.h>.
8147 - * NULL means this bit is undefined or reserved; either way it doesn't
8148 - * have meaning as far as Linux is concerned. Note that it's important
8149 - * to realize there is a difference between this table and CPUID -- if
8150 - * applications want to get the raw CPUID data, they should access
8151 - * /dev/cpu/<cpu_nr>/cpuid instead.
8152 - */
8153 - static const char *const x86_cap_flags[] = {
8154 - /* Intel-defined */
8155 - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
8156 - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
8157 - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
8158 - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
8159 -
8160 - /* AMD-defined */
8161 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8162 - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
8163 - NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
8164 - NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
8165 - "3dnowext", "3dnow",
8166 -
8167 - /* Transmeta-defined */
8168 - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
8169 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8170 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8171 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8172 -
8173 - /* Other (Linux-defined) */
8174 - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
8175 - NULL, NULL, NULL, NULL,
8176 - "constant_tsc", "up", NULL, "arch_perfmon",
8177 - "pebs", "bts", NULL, "sync_rdtsc",
8178 - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8179 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8180 -
8181 - /* Intel-defined (#2) */
8182 - "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
8183 - "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
8184 - NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
8185 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8186 -
8187 - /* VIA/Cyrix/Centaur-defined */
8188 - NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
8189 - "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
8190 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8191 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8192 -
8193 - /* AMD-defined (#2) */
8194 - "lahf_lm", "cmp_legacy", "svm", "extapic",
8195 - "cr8_legacy", "abm", "sse4a", "misalignsse",
8196 - "3dnowprefetch", "osvw", "ibs", "sse5",
8197 - "skinit", "wdt", NULL, NULL,
8198 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8199 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8200 -
8201 - /* Auxiliary (Linux-defined) */
8202 - "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8203 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8204 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8205 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8206 - };
8207 - static const char *const x86_power_flags[] = {
8208 - "ts", /* temperature sensor */
8209 - "fid", /* frequency id control */
8210 - "vid", /* voltage id control */
8211 - "ttp", /* thermal trip */
8212 - "tm",
8213 - "stc",
8214 - "100mhzsteps",
8215 - "hwpstate",
8216 - "", /* tsc invariant mapped to constant_tsc */
8217 - /* nothing */
8218 - };
8219 -
8220 + int cpu = 0, i;
8221
8222 #ifdef CONFIG_SMP
8223 cpu = c->cpu_index;
8224 #endif
8225
8226 - seq_printf(m,"processor\t: %u\n"
8227 - "vendor_id\t: %s\n"
8228 - "cpu family\t: %d\n"
8229 - "model\t\t: %d\n"
8230 - "model name\t: %s\n",
8231 - (unsigned)cpu,
8232 - c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
8233 - c->x86,
8234 - (int)c->x86_model,
8235 - c->x86_model_id[0] ? c->x86_model_id : "unknown");
8236 -
8237 + seq_printf(m, "processor\t: %u\n"
8238 + "vendor_id\t: %s\n"
8239 + "cpu family\t: %d\n"
8240 + "model\t\t: %d\n"
8241 + "model name\t: %s\n",
8242 + (unsigned)cpu,
8243 + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
8244 + c->x86,
8245 + (int)c->x86_model,
8246 + c->x86_model_id[0] ? c->x86_model_id : "unknown");
8247 +
8248 if (c->x86_mask || c->cpuid_level >= 0)
8249 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
8250 else
8251 seq_printf(m, "stepping\t: unknown\n");
8252 -
8253 - if (cpu_has(c,X86_FEATURE_TSC)) {
8254 +
8255 + if (cpu_has(c, X86_FEATURE_TSC)) {
8256 unsigned int freq = cpufreq_quick_get((unsigned)cpu);
8257 +
8258 if (!freq)
8259 freq = cpu_khz;
8260 seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
8261 - freq / 1000, (freq % 1000));
8262 + freq / 1000, (freq % 1000));
8263 }
8264
8265 /* Cache size */
8266 - if (c->x86_cache_size >= 0)
8267 + if (c->x86_cache_size >= 0)
8268 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
8269 -
8270 +
8271 #ifdef CONFIG_SMP
8272 if (smp_num_siblings * c->x86_max_cores > 1) {
8273 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
8274 @@ -1364,48 +1353,43 @@ static int show_cpuinfo(struct seq_file
8275 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
8276 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
8277 }
8278 -#endif
8279 +#endif
8280
8281 seq_printf(m,
8282 - "fpu\t\t: yes\n"
8283 - "fpu_exception\t: yes\n"
8284 - "cpuid level\t: %d\n"
8285 - "wp\t\t: yes\n"
8286 - "flags\t\t:",
8287 + "fpu\t\t: yes\n"
8288 + "fpu_exception\t: yes\n"
8289 + "cpuid level\t: %d\n"
8290 + "wp\t\t: yes\n"
8291 + "flags\t\t:",
8292 c->cpuid_level);
8293
8294 - {
8295 - int i;
8296 - for ( i = 0 ; i < 32*NCAPINTS ; i++ )
8297 - if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
8298 - seq_printf(m, " %s", x86_cap_flags[i]);
8299 - }
8300 -
8301 + for (i = 0; i < 32*NCAPINTS; i++)
8302 + if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
8303 + seq_printf(m, " %s", x86_cap_flags[i]);
8304 +
8305 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
8306 c->loops_per_jiffy/(500000/HZ),
8307 (c->loops_per_jiffy/(5000/HZ)) % 100);
8308
8309 - if (c->x86_tlbsize > 0)
8310 + if (c->x86_tlbsize > 0)
8311 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
8312 seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
8313 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
8314
8315 - seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
8316 + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
8317 c->x86_phys_bits, c->x86_virt_bits);
8318
8319 seq_printf(m, "power management:");
8320 - {
8321 - unsigned i;
8322 - for (i = 0; i < 32; i++)
8323 - if (c->x86_power & (1 << i)) {
8324 - if (i < ARRAY_SIZE(x86_power_flags) &&
8325 - x86_power_flags[i])
8326 - seq_printf(m, "%s%s",
8327 - x86_power_flags[i][0]?" ":"",
8328 - x86_power_flags[i]);
8329 - else
8330 - seq_printf(m, " [%d]", i);
8331 - }
8332 + for (i = 0; i < 32; i++) {
8333 + if (c->x86_power & (1 << i)) {
8334 + if (i < ARRAY_SIZE(x86_power_flags) &&
8335 + x86_power_flags[i])
8336 + seq_printf(m, "%s%s",
8337 + x86_power_flags[i][0]?" ":"",
8338 + x86_power_flags[i]);
8339 + else
8340 + seq_printf(m, " [%d]", i);
8341 + }
8342 }
8343
8344 seq_printf(m, "\n\n");
8345 @@ -1432,8 +1416,8 @@ static void c_stop(struct seq_file *m, v
8346 {
8347 }
8348
8349 -struct seq_operations cpuinfo_op = {
8350 - .start =c_start,
8351 +const struct seq_operations cpuinfo_op = {
8352 + .start = c_start,
8353 .next = c_next,
8354 .stop = c_stop,
8355 .show = show_cpuinfo,
8356 --- sle11-2009-05-14.orig/arch/x86/kernel/smp_32-xen.c 2009-02-16 16:18:36.000000000 +0100
8357 +++ sle11-2009-05-14/arch/x86/kernel/smp_32-xen.c 2009-03-16 16:33:40.000000000 +0100
8358 @@ -168,7 +168,7 @@ void __send_IPI_shortcut(unsigned int sh
8359 }
8360 }
8361
8362 -void fastcall send_IPI_self(int vector)
8363 +void send_IPI_self(int vector)
8364 {
8365 __send_IPI_shortcut(APIC_DEST_SELF, vector);
8366 }
8367 @@ -224,13 +224,14 @@ static DEFINE_SPINLOCK(tlbstate_lock);
8368 * We need to reload %cr3 since the page tables may be going
8369 * away from under us..
8370 */
8371 -void leave_mm(unsigned long cpu)
8372 +void leave_mm(int cpu)
8373 {
8374 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
8375 BUG();
8376 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
8377 load_cr3(swapper_pg_dir);
8378 }
8379 +EXPORT_SYMBOL_GPL(leave_mm);
8380
8381 /*
8382 *
8383 --- sle11-2009-05-14.orig/arch/x86/kernel/smp_64-xen.c 2009-02-16 16:18:36.000000000 +0100
8384 +++ sle11-2009-05-14/arch/x86/kernel/smp_64-xen.c 2009-03-16 16:33:40.000000000 +0100
8385 @@ -33,7 +33,7 @@
8386
8387 #ifndef CONFIG_XEN
8388 /*
8389 - * Smarter SMP flushing macros.
8390 + * Smarter SMP flushing macros.
8391 * c/o Linus Torvalds.
8392 *
8393 * These mean you can really definitely utterly forget about
8394 @@ -41,15 +41,15 @@
8395 *
8396 * Optimizations Manfred Spraul <manfred@colorfullife.com>
8397 *
8398 - * More scalable flush, from Andi Kleen
8399 + * More scalable flush, from Andi Kleen
8400 *
8401 - * To avoid global state use 8 different call vectors.
8402 - * Each CPU uses a specific vector to trigger flushes on other
8403 - * CPUs. Depending on the received vector the target CPUs look into
8404 + * To avoid global state use 8 different call vectors.
8405 + * Each CPU uses a specific vector to trigger flushes on other
8406 + * CPUs. Depending on the received vector the target CPUs look into
8407 * the right per cpu variable for the flush data.
8408 *
8409 - * With more than 8 CPUs they are hashed to the 8 available
8410 - * vectors. The limited global vector space forces us to this right now.
8411 + * With more than 8 CPUs they are hashed to the 8 available
8412 + * vectors. The limited global vector space forces us to this right now.
8413 * In future when interrupts are split into per CPU domains this could be
8414 * fixed, at the cost of triggering multiple IPIs in some cases.
8415 */
8416 @@ -59,7 +59,6 @@ union smp_flush_state {
8417 cpumask_t flush_cpumask;
8418 struct mm_struct *flush_mm;
8419 unsigned long flush_va;
8420 -#define FLUSH_ALL -1ULL
8421 spinlock_t tlbstate_lock;
8422 };
8423 char pad[SMP_CACHE_BYTES];
8424 @@ -71,16 +70,17 @@ union smp_flush_state {
8425 static DEFINE_PER_CPU(union smp_flush_state, flush_state);
8426
8427 /*
8428 - * We cannot call mmdrop() because we are in interrupt context,
8429 + * We cannot call mmdrop() because we are in interrupt context,
8430 * instead update mm->cpu_vm_mask.
8431 */
8432 -static inline void leave_mm(unsigned long cpu)
8433 +void leave_mm(int cpu)
8434 {
8435 if (read_pda(mmu_state) == TLBSTATE_OK)
8436 BUG();
8437 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
8438 load_cr3(swapper_pg_dir);
8439 }
8440 +EXPORT_SYMBOL_GPL(leave_mm);
8441
8442 /*
8443 *
8444 @@ -89,25 +89,25 @@ static inline void leave_mm(unsigned lon
8445 * 1) switch_mm() either 1a) or 1b)
8446 * 1a) thread switch to a different mm
8447 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
8448 - * Stop ipi delivery for the old mm. This is not synchronized with
8449 - * the other cpus, but smp_invalidate_interrupt ignore flush ipis
8450 - * for the wrong mm, and in the worst case we perform a superfluous
8451 - * tlb flush.
8452 + * Stop ipi delivery for the old mm. This is not synchronized with
8453 + * the other cpus, but smp_invalidate_interrupt ignore flush ipis
8454 + * for the wrong mm, and in the worst case we perform a superfluous
8455 + * tlb flush.
8456 * 1a2) set cpu mmu_state to TLBSTATE_OK
8457 - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
8458 + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
8459 * was in lazy tlb mode.
8460 * 1a3) update cpu active_mm
8461 - * Now cpu0 accepts tlb flushes for the new mm.
8462 + * Now cpu0 accepts tlb flushes for the new mm.
8463 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
8464 - * Now the other cpus will send tlb flush ipis.
8465 + * Now the other cpus will send tlb flush ipis.
8466 * 1a4) change cr3.
8467 * 1b) thread switch without mm change
8468 * cpu active_mm is correct, cpu0 already handles
8469 * flush ipis.
8470 * 1b1) set cpu mmu_state to TLBSTATE_OK
8471 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
8472 - * Atomically set the bit [other cpus will start sending flush ipis],
8473 - * and test the bit.
8474 + * Atomically set the bit [other cpus will start sending flush ipis],
8475 + * and test the bit.
8476 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
8477 * 2) switch %%esp, ie current
8478 *
8479 @@ -141,12 +141,12 @@ asmlinkage void smp_invalidate_interrupt
8480 * orig_rax contains the negated interrupt vector.
8481 * Use that to determine where the sender put the data.
8482 */
8483 - sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
8484 + sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
8485 f = &per_cpu(flush_state, sender);
8486
8487 if (!cpu_isset(cpu, f->flush_cpumask))
8488 goto out;
8489 - /*
8490 + /*
8491 * This was a BUG() but until someone can quote me the
8492 * line from the intel manual that guarantees an IPI to
8493 * multiple CPUs is retried _only_ on the erroring CPUs
8494 @@ -154,10 +154,10 @@ asmlinkage void smp_invalidate_interrupt
8495 *
8496 * BUG();
8497 */
8498 -
8499 +
8500 if (f->flush_mm == read_pda(active_mm)) {
8501 if (read_pda(mmu_state) == TLBSTATE_OK) {
8502 - if (f->flush_va == FLUSH_ALL)
8503 + if (f->flush_va == TLB_FLUSH_ALL)
8504 local_flush_tlb();
8505 else
8506 __flush_tlb_one(f->flush_va);
8507 @@ -170,19 +170,22 @@ out:
8508 add_pda(irq_tlb_count, 1);
8509 }
8510
8511 -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
8512 - unsigned long va)
8513 +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
8514 + unsigned long va)
8515 {
8516 int sender;
8517 union smp_flush_state *f;
8518 + cpumask_t cpumask = *cpumaskp;
8519
8520 /* Caller has disabled preemption */
8521 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
8522 f = &per_cpu(flush_state, sender);
8523
8524 - /* Could avoid this lock when
8525 - num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
8526 - probably not worth checking this for a cache-hot lock. */
8527 + /*
8528 + * Could avoid this lock when
8529 + * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
8530 + * probably not worth checking this for a cache-hot lock.
8531 + */
8532 spin_lock(&f->tlbstate_lock);
8533
8534 f->flush_mm = mm;
8535 @@ -206,14 +209,14 @@ static void flush_tlb_others(cpumask_t c
8536 int __cpuinit init_smp_flush(void)
8537 {
8538 int i;
8539 +
8540 for_each_cpu_mask(i, cpu_possible_map) {
8541 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
8542 }
8543 return 0;
8544 }
8545 -
8546 core_initcall(init_smp_flush);
8547 -
8548 +
8549 void flush_tlb_current_task(void)
8550 {
8551 struct mm_struct *mm = current->mm;
8552 @@ -225,10 +228,9 @@ void flush_tlb_current_task(void)
8553
8554 local_flush_tlb();
8555 if (!cpus_empty(cpu_mask))
8556 - flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
8557 + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
8558 preempt_enable();
8559 }
8560 -EXPORT_SYMBOL(flush_tlb_current_task);
8561
8562 void flush_tlb_mm (struct mm_struct * mm)
8563 {
8564 @@ -245,11 +247,10 @@ void flush_tlb_mm (struct mm_struct * mm
8565 leave_mm(smp_processor_id());
8566 }
8567 if (!cpus_empty(cpu_mask))
8568 - flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
8569 + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
8570
8571 preempt_enable();
8572 }
8573 -EXPORT_SYMBOL(flush_tlb_mm);
8574
8575 void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
8576 {
8577 @@ -263,8 +264,8 @@ void flush_tlb_page(struct vm_area_struc
8578 if (current->active_mm == mm) {
8579 if(current->mm)
8580 __flush_tlb_one(va);
8581 - else
8582 - leave_mm(smp_processor_id());
8583 + else
8584 + leave_mm(smp_processor_id());
8585 }
8586
8587 if (!cpus_empty(cpu_mask))
8588 @@ -272,7 +273,6 @@ void flush_tlb_page(struct vm_area_struc
8589
8590 preempt_enable();
8591 }
8592 -EXPORT_SYMBOL(flush_tlb_page);
8593
8594 static void do_flush_tlb_all(void* info)
8595 {
8596 @@ -330,11 +330,9 @@ void unlock_ipi_call_lock(void)
8597 * this function sends a 'generic call function' IPI to all other CPU
8598 * of the system defined in the mask.
8599 */
8600 -
8601 -static int
8602 -__smp_call_function_mask(cpumask_t mask,
8603 - void (*func)(void *), void *info,
8604 - int wait)
8605 +static int __smp_call_function_mask(cpumask_t mask,
8606 + void (*func)(void *), void *info,
8607 + int wait)
8608 {
8609 struct call_data_struct data;
8610 cpumask_t allbutself;
8611 @@ -422,11 +420,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
8612 */
8613
8614 int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
8615 - int nonatomic, int wait)
8616 + int nonatomic, int wait)
8617 {
8618 /* prevent preemption and reschedule on another processor */
8619 - int ret;
8620 - int me = get_cpu();
8621 + int ret, me = get_cpu();
8622
8623 /* Can deadlock when called with interrupts disabled */
8624 WARN_ON(irqs_disabled());
8625 @@ -476,9 +473,9 @@ static void stop_this_cpu(void *dummy)
8626 */
8627 cpu_clear(smp_processor_id(), cpu_online_map);
8628 disable_all_local_evtchn();
8629 - for (;;)
8630 + for (;;)
8631 halt();
8632 -}
8633 +}
8634
8635 void smp_send_stop(void)
8636 {
8637 --- sle11-2009-05-14.orig/arch/x86/kernel/time_32-xen.c 2009-03-24 10:12:35.000000000 +0100
8638 +++ sle11-2009-05-14/arch/x86/kernel/time_32-xen.c 2009-03-24 10:12:48.000000000 +0100
8639 @@ -28,21 +28,9 @@
8640 * serialize accesses to xtime/lost_ticks).
8641 */
8642
8643 -#include <linux/errno.h>
8644 -#include <linux/sched.h>
8645 -#include <linux/kernel.h>
8646 -#include <linux/param.h>
8647 -#include <linux/string.h>
8648 -#include <linux/mm.h>
8649 +#include <linux/init.h>
8650 #include <linux/interrupt.h>
8651 #include <linux/time.h>
8652 -#include <linux/delay.h>
8653 -#include <linux/init.h>
8654 -#include <linux/smp.h>
8655 -#include <linux/module.h>
8656 -#include <linux/sysdev.h>
8657 -#include <linux/bcd.h>
8658 -#include <linux/efi.h>
8659 #include <linux/mca.h>
8660 #include <linux/sysctl.h>
8661 #include <linux/percpu.h>
8662 @@ -50,26 +38,10 @@
8663 #include <linux/posix-timers.h>
8664 #include <linux/cpufreq.h>
8665 #include <linux/clocksource.h>
8666 +#include <linux/sysdev.h>
8667
8668 -#include <asm/io.h>
8669 -#include <asm/smp.h>
8670 -#include <asm/irq.h>
8671 -#include <asm/msr.h>
8672 #include <asm/delay.h>
8673 -#include <asm/mpspec.h>
8674 -#include <asm/uaccess.h>
8675 -#include <asm/processor.h>
8676 -#include <asm/timer.h>
8677 #include <asm/time.h>
8678 -#include <asm/sections.h>
8679 -
8680 -#include "mach_time.h"
8681 -
8682 -#include <linux/timex.h>
8683 -
8684 -#include <asm/hpet.h>
8685 -
8686 -#include <asm/arch_hooks.h>
8687
8688 #include <xen/evtchn.h>
8689 #include <xen/sysctl.h>
8690 @@ -89,9 +61,6 @@ volatile unsigned long __jiffies __secti
8691 unsigned int cpu_khz; /* Detected as we calibrate the TSC */
8692 EXPORT_SYMBOL(cpu_khz);
8693
8694 -DEFINE_SPINLOCK(rtc_lock);
8695 -EXPORT_SYMBOL(rtc_lock);
8696 -
8697 /* These are peridically updated in shared_info, and then copied here. */
8698 struct shadow_time_info {
8699 u64 tsc_timestamp; /* TSC at last update of time vals. */
8700 @@ -154,6 +123,11 @@ static int __init __independent_wallcloc
8701 }
8702 __setup("independent_wallclock", __independent_wallclock);
8703
8704 +int xen_independent_wallclock(void)
8705 +{
8706 + return independent_wallclock;
8707 +}
8708 +
8709 /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
8710 static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
8711 static int __init __permitted_clock_jitter(char *str)
8712 @@ -223,7 +197,6 @@ static inline u64 get64(volatile u64 *pt
8713 return cmpxchg64(ptr, 0, 0);
8714 #else
8715 return *ptr;
8716 -#define cmpxchg64 cmpxchg
8717 #endif
8718 }
8719
8720 @@ -233,7 +206,6 @@ static inline u64 get64_local(volatile u
8721 return cmpxchg64_local(ptr, 0, 0);
8722 #else
8723 return *ptr;
8724 -#define cmpxchg64_local cmpxchg_local
8725 #endif
8726 }
8727
8728 @@ -339,35 +311,6 @@ static inline int time_values_up_to_date
8729 return (dst->version == src->version);
8730 }
8731
8732 -/*
8733 - * This is a special lock that is owned by the CPU and holds the index
8734 - * register we are working with. It is required for NMI access to the
8735 - * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
8736 - */
8737 -volatile unsigned long cmos_lock = 0;
8738 -EXPORT_SYMBOL(cmos_lock);
8739 -
8740 -/* Routines for accessing the CMOS RAM/RTC. */
8741 -unsigned char rtc_cmos_read(unsigned char addr)
8742 -{
8743 - unsigned char val;
8744 - lock_cmos_prefix(addr);
8745 - outb_p(addr, RTC_PORT(0));
8746 - val = inb_p(RTC_PORT(1));
8747 - lock_cmos_suffix(addr);
8748 - return val;
8749 -}
8750 -EXPORT_SYMBOL(rtc_cmos_read);
8751 -
8752 -void rtc_cmos_write(unsigned char val, unsigned char addr)
8753 -{
8754 - lock_cmos_prefix(addr);
8755 - outb_p(addr, RTC_PORT(0));
8756 - outb_p(val, RTC_PORT(1));
8757 - lock_cmos_suffix(addr);
8758 -}
8759 -EXPORT_SYMBOL(rtc_cmos_write);
8760 -
8761 static void sync_xen_wallclock(unsigned long dummy);
8762 static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
8763 static void sync_xen_wallclock(unsigned long dummy)
8764 @@ -376,7 +319,8 @@ static void sync_xen_wallclock(unsigned
8765 s64 nsec;
8766 struct xen_platform_op op;
8767
8768 - if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
8769 + BUG_ON(!is_initial_xendomain());
8770 + if (!ntp_synced() || independent_wallclock)
8771 return;
8772
8773 write_seqlock_irq(&xtime_lock);
8774 @@ -399,23 +343,6 @@ static void sync_xen_wallclock(unsigned
8775 mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
8776 }
8777
8778 -static int set_rtc_mmss(unsigned long nowtime)
8779 -{
8780 - int retval;
8781 - unsigned long flags;
8782 -
8783 - if (independent_wallclock || !is_initial_xendomain())
8784 - return 0;
8785 -
8786 - /* gets recalled with irq locally disabled */
8787 - /* XXX - does irqsave resolve this? -johnstul */
8788 - spin_lock_irqsave(&rtc_lock, flags);
8789 - retval = set_wallclock(nowtime);
8790 - spin_unlock_irqrestore(&rtc_lock, flags);
8791 -
8792 - return retval;
8793 -}
8794 -
8795 static unsigned long long local_clock(void)
8796 {
8797 unsigned int cpu = get_cpu();
8798 @@ -498,28 +425,24 @@ unsigned long profile_pc(struct pt_regs
8799
8800 #if defined(CONFIG_SMP) || defined(__x86_64__)
8801 # ifdef __i386__
8802 - if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs)
8803 + if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs)
8804 # else
8805 if (!user_mode(regs)
8806 # endif
8807 && in_lock_functions(pc)) {
8808 # ifdef CONFIG_FRAME_POINTER
8809 -# ifdef __i386__
8810 - return ((unsigned long *)regs->ebp)[1];
8811 -# else
8812 - return ((unsigned long *)regs->rbp)[1];
8813 -# endif
8814 + return ((unsigned long *)regs->bp)[1];
8815 # else
8816 # ifdef __i386__
8817 - unsigned long *sp = (unsigned long *)&regs->esp;
8818 + unsigned long *sp = (unsigned long *)&regs->sp;
8819 # else
8820 - unsigned long *sp = (unsigned long *)regs->rsp;
8821 + unsigned long *sp = (unsigned long *)regs->sp;
8822 # endif
8823
8824 /* Return address is either directly at stack pointer
8825 - or above a saved eflags. Eflags has bits 22-31 zero,
8826 + or above a saved flags. Eflags has bits 22-31 zero,
8827 kernel addresses don't. */
8828 - if (sp[0] >> 22)
8829 + if (sp[0] >> 22)
8830 return sp[0];
8831 if (sp[1] >> 22)
8832 return sp[1];
8833 @@ -748,25 +671,32 @@ static void init_missing_ticks_accountin
8834 runstate->time[RUNSTATE_offline];
8835 }
8836
8837 -/* not static: needed by APM */
8838 -unsigned long read_persistent_clock(void)
8839 +unsigned long xen_read_persistent_clock(void)
8840 {
8841 - unsigned long retval;
8842 - unsigned long flags;
8843 -
8844 - spin_lock_irqsave(&rtc_lock, flags);
8845 + const shared_info_t *s = HYPERVISOR_shared_info;
8846 + u32 version, sec, nsec;
8847 + u64 delta;
8848
8849 - retval = get_wallclock();
8850 + do {
8851 + version = s->wc_version;
8852 + rmb();
8853 + sec = s->wc_sec;
8854 + nsec = s->wc_nsec;
8855 + rmb();
8856 + } while ((s->wc_version & 1) | (version ^ s->wc_version));
8857
8858 - spin_unlock_irqrestore(&rtc_lock, flags);
8859 + delta = local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
8860 + do_div(delta, NSEC_PER_SEC);
8861
8862 - return retval;
8863 + return delta;
8864 }
8865
8866 -int update_persistent_clock(struct timespec now)
8867 +int xen_update_persistent_clock(void)
8868 {
8869 + if (!is_initial_xendomain())
8870 + return -1;
8871 mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
8872 - return set_rtc_mmss(now.tv_sec);
8873 + return 0;
8874 }
8875
8876 extern void (*late_time_init)(void);
8877 --- sle11-2009-05-14.orig/arch/x86/kernel/traps_32-xen.c 2009-02-16 16:18:36.000000000 +0100
8878 +++ sle11-2009-05-14/arch/x86/kernel/traps_32-xen.c 2009-03-16 16:33:40.000000000 +0100
8879 @@ -79,7 +79,8 @@ char ignore_fpu_irq = 0;
8880 * F0 0F bug workaround.. We have a special link segment
8881 * for this.
8882 */
8883 -struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
8884 +gate_desc idt_table[256]
8885 + __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
8886 #endif
8887
8888 asmlinkage void divide_error(void);
8889 @@ -109,6 +110,34 @@ asmlinkage void machine_check(void);
8890 int kstack_depth_to_print = 24;
8891 static unsigned int code_bytes = 64;
8892
8893 +void printk_address(unsigned long address, int reliable)
8894 +{
8895 +#ifdef CONFIG_KALLSYMS
8896 + unsigned long offset = 0, symsize;
8897 + const char *symname;
8898 + char *modname;
8899 + char *delim = ":";
8900 + char namebuf[128];
8901 + char reliab[4] = "";
8902 +
8903 + symname = kallsyms_lookup(address, &symsize, &offset,
8904 + &modname, namebuf);
8905 + if (!symname) {
8906 + printk(" [<%08lx>]\n", address);
8907 + return;
8908 + }
8909 + if (!reliable)
8910 + strcpy(reliab, "? ");
8911 +
8912 + if (!modname)
8913 + modname = delim = "";
8914 + printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
8915 + address, reliab, delim, modname, delim, symname, offset, symsize);
8916 +#else
8917 + printk(" [<%08lx>]\n", address);
8918 +#endif
8919 +}
8920 +
8921 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
8922 {
8923 return p > (void *)tinfo &&
8924 @@ -122,48 +151,35 @@ struct stack_frame {
8925 };
8926
8927 static inline unsigned long print_context_stack(struct thread_info *tinfo,
8928 - unsigned long *stack, unsigned long ebp,
8929 + unsigned long *stack, unsigned long bp,
8930 const struct stacktrace_ops *ops, void *data)
8931 {
8932 -#ifdef CONFIG_FRAME_POINTER
8933 - struct stack_frame *frame = (struct stack_frame *)ebp;
8934 - while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
8935 - struct stack_frame *next;
8936 - unsigned long addr;
8937 + struct stack_frame *frame = (struct stack_frame *)bp;
8938
8939 - addr = frame->return_address;
8940 - ops->address(data, addr);
8941 - /*
8942 - * break out of recursive entries (such as
8943 - * end_of_stack_stop_unwind_function). Also,
8944 - * we can never allow a frame pointer to
8945 - * move downwards!
8946 - */
8947 - next = frame->next_frame;
8948 - if (next <= frame)
8949 - break;
8950 - frame = next;
8951 - }
8952 -#else
8953 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
8954 unsigned long addr;
8955
8956 - addr = *stack++;
8957 - if (__kernel_text_address(addr))
8958 - ops->address(data, addr);
8959 + addr = *stack;
8960 + if (__kernel_text_address(addr)) {
8961 + if ((unsigned long) stack == bp + 4) {
8962 + ops->address(data, addr, 1);
8963 + frame = frame->next_frame;
8964 + bp = (unsigned long) frame;
8965 + } else {
8966 + ops->address(data, addr, bp == 0);
8967 + }
8968 + }
8969 + stack++;
8970 }
8971 -#endif
8972 - return ebp;
8973 + return bp;
8974 }
8975
8976 #define MSG(msg) ops->warning(data, msg)
8977
8978 void dump_trace(struct task_struct *task, struct pt_regs *regs,
8979 - unsigned long *stack,
8980 + unsigned long *stack, unsigned long bp,
8981 const struct stacktrace_ops *ops, void *data)
8982 {
8983 - unsigned long ebp = 0;
8984 -
8985 if (!task)
8986 task = current;
8987
8988 @@ -171,17 +187,17 @@ void dump_trace(struct task_struct *task
8989 unsigned long dummy;
8990 stack = &dummy;
8991 if (task != current)
8992 - stack = (unsigned long *)task->thread.esp;
8993 + stack = (unsigned long *)task->thread.sp;
8994 }
8995
8996 #ifdef CONFIG_FRAME_POINTER
8997 - if (!ebp) {
8998 + if (!bp) {
8999 if (task == current) {
9000 - /* Grab ebp right from our regs */
9001 - asm ("movl %%ebp, %0" : "=r" (ebp) : );
9002 + /* Grab bp right from our regs */
9003 + asm ("movl %%ebp, %0" : "=r" (bp) : );
9004 } else {
9005 - /* ebp is the last reg pushed by switch_to */
9006 - ebp = *(unsigned long *) task->thread.esp;
9007 + /* bp is the last reg pushed by switch_to */
9008 + bp = *(unsigned long *) task->thread.sp;
9009 }
9010 }
9011 #endif
9012 @@ -190,7 +206,7 @@ void dump_trace(struct task_struct *task
9013 struct thread_info *context;
9014 context = (struct thread_info *)
9015 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
9016 - ebp = print_context_stack(context, stack, ebp, ops, data);
9017 + bp = print_context_stack(context, stack, bp, ops, data);
9018 /* Should be after the line below, but somewhere
9019 in early boot context comes out corrupted and we
9020 can't reference it -AK */
9021 @@ -225,9 +241,11 @@ static int print_trace_stack(void *data,
9022 /*
9023 * Print one address/symbol entries per line.
9024 */
9025 -static void print_trace_address(void *data, unsigned long addr)
9026 +static void print_trace_address(void *data, unsigned long addr, int reliable)
9027 {
9028 printk("%s [<%08lx>] ", (char *)data, addr);
9029 + if (!reliable)
9030 + printk("? ");
9031 print_symbol("%s\n", addr);
9032 touch_nmi_watchdog();
9033 }
9034 @@ -241,32 +259,32 @@ static const struct stacktrace_ops print
9035
9036 static void
9037 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
9038 - unsigned long * stack, char *log_lvl)
9039 + unsigned long *stack, unsigned long bp, char *log_lvl)
9040 {
9041 - dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
9042 + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
9043 printk("%s =======================\n", log_lvl);
9044 }
9045
9046 void show_trace(struct task_struct *task, struct pt_regs *regs,
9047 - unsigned long * stack)
9048 + unsigned long *stack, unsigned long bp)
9049 {
9050 - show_trace_log_lvl(task, regs, stack, "");
9051 + show_trace_log_lvl(task, regs, stack, bp, "");
9052 }
9053
9054 static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
9055 - unsigned long *esp, char *log_lvl)
9056 + unsigned long *sp, unsigned long bp, char *log_lvl)
9057 {
9058 unsigned long *stack;
9059 int i;
9060
9061 - if (esp == NULL) {
9062 + if (sp == NULL) {
9063 if (task)
9064 - esp = (unsigned long*)task->thread.esp;
9065 + sp = (unsigned long*)task->thread.sp;
9066 else
9067 - esp = (unsigned long *)&esp;
9068 + sp = (unsigned long *)&sp;
9069 }
9070
9071 - stack = esp;
9072 + stack = sp;
9073 for(i = 0; i < kstack_depth_to_print; i++) {
9074 if (kstack_end(stack))
9075 break;
9076 @@ -275,13 +293,13 @@ static void show_stack_log_lvl(struct ta
9077 printk("%08lx ", *stack++);
9078 }
9079 printk("\n%sCall Trace:\n", log_lvl);
9080 - show_trace_log_lvl(task, regs, esp, log_lvl);
9081 + show_trace_log_lvl(task, regs, sp, bp, log_lvl);
9082 }
9083
9084 -void show_stack(struct task_struct *task, unsigned long *esp)
9085 +void show_stack(struct task_struct *task, unsigned long *sp)
9086 {
9087 printk(" ");
9088 - show_stack_log_lvl(task, NULL, esp, "");
9089 + show_stack_log_lvl(task, NULL, sp, 0, "");
9090 }
9091
9092 /*
9093 @@ -290,13 +308,19 @@ void show_stack(struct task_struct *task
9094 void dump_stack(void)
9095 {
9096 unsigned long stack;
9097 + unsigned long bp = 0;
9098 +
9099 +#ifdef CONFIG_FRAME_POINTER
9100 + if (!bp)
9101 + asm("movl %%ebp, %0" : "=r" (bp):);
9102 +#endif
9103
9104 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
9105 current->pid, current->comm, print_tainted(),
9106 init_utsname()->release,
9107 (int)strcspn(init_utsname()->version, " "),
9108 init_utsname()->version);
9109 - show_trace(current, NULL, &stack);
9110 + show_trace(current, NULL, &stack, bp);
9111 }
9112
9113 EXPORT_SYMBOL(dump_stack);
9114 @@ -315,30 +339,30 @@ void show_registers(struct pt_regs *regs
9115 * time of the fault..
9116 */
9117 if (!user_mode_vm(regs)) {
9118 - u8 *eip;
9119 + u8 *ip;
9120 unsigned int code_prologue = code_bytes * 43 / 64;
9121 unsigned int code_len = code_bytes;
9122 unsigned char c;
9123
9124 printk("\n" KERN_EMERG "Stack: ");
9125 - show_stack_log_lvl(NULL, regs, &regs->esp, KERN_EMERG);
9126 + show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
9127
9128 printk(KERN_EMERG "Code: ");
9129
9130 - eip = (u8 *)regs->eip - code_prologue;
9131 - if (eip < (u8 *)PAGE_OFFSET ||
9132 - probe_kernel_address(eip, c)) {
9133 + ip = (u8 *)regs->ip - code_prologue;
9134 + if (ip < (u8 *)PAGE_OFFSET ||
9135 + probe_kernel_address(ip, c)) {
9136 /* try starting at EIP */
9137 - eip = (u8 *)regs->eip;
9138 + ip = (u8 *)regs->ip;
9139 code_len = code_len - code_prologue + 1;
9140 }
9141 - for (i = 0; i < code_len; i++, eip++) {
9142 - if (eip < (u8 *)PAGE_OFFSET ||
9143 - probe_kernel_address(eip, c)) {
9144 + for (i = 0; i < code_len; i++, ip++) {
9145 + if (ip < (u8 *)PAGE_OFFSET ||
9146 + probe_kernel_address(ip, c)) {
9147 printk(" Bad EIP value.");
9148 break;
9149 }
9150 - if (eip == (u8 *)regs->eip)
9151 + if (ip == (u8 *)regs->ip)
9152 printk("<%02x> ", c);
9153 else
9154 printk("%02x ", c);
9155 @@ -347,18 +371,57 @@ void show_registers(struct pt_regs *regs
9156 printk("\n");
9157 }
9158
9159 -int is_valid_bugaddr(unsigned long eip)
9160 +int is_valid_bugaddr(unsigned long ip)
9161 {
9162 unsigned short ud2;
9163
9164 - if (eip < PAGE_OFFSET)
9165 + if (ip < PAGE_OFFSET)
9166 return 0;
9167 - if (probe_kernel_address((unsigned short *)eip, ud2))
9168 + if (probe_kernel_address((unsigned short *)ip, ud2))
9169 return 0;
9170
9171 return ud2 == 0x0b0f;
9172 }
9173
9174 +static int die_counter;
9175 +
9176 +int __kprobes __die(const char * str, struct pt_regs * regs, long err)
9177 +{
9178 + unsigned long sp;
9179 + unsigned short ss;
9180 +
9181 + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
9182 +#ifdef CONFIG_PREEMPT
9183 + printk("PREEMPT ");
9184 +#endif
9185 +#ifdef CONFIG_SMP
9186 + printk("SMP ");
9187 +#endif
9188 +#ifdef CONFIG_DEBUG_PAGEALLOC
9189 + printk("DEBUG_PAGEALLOC");
9190 +#endif
9191 + printk("\n");
9192 +
9193 + if (notify_die(DIE_OOPS, str, regs, err,
9194 + current->thread.trap_no, SIGSEGV) !=
9195 + NOTIFY_STOP) {
9196 + show_registers(regs);
9197 + /* Executive summary in case the oops scrolled away */
9198 + sp = (unsigned long) (&regs->sp);
9199 + savesegment(ss, ss);
9200 + if (user_mode(regs)) {
9201 + sp = regs->sp;
9202 + ss = regs->ss & 0xffff;
9203 + }
9204 + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
9205 + print_symbol("%s", regs->ip);
9206 + printk(" SS:ESP %04x:%08lx\n", ss, sp);
9207 + return 0;
9208 + } else {
9209 + return 1;
9210 + }
9211 +}
9212 +
9213 /*
9214 * This is gone through when something in the kernel has done something bad and
9215 * is about to be terminated.
9216 @@ -374,7 +437,6 @@ void die(const char * str, struct pt_reg
9217 .lock_owner = -1,
9218 .lock_owner_depth = 0
9219 };
9220 - static int die_counter;
9221 unsigned long flags;
9222
9223 oops_enter();
9224 @@ -390,43 +452,13 @@ void die(const char * str, struct pt_reg
9225 raw_local_irq_save(flags);
9226
9227 if (++die.lock_owner_depth < 3) {
9228 - unsigned long esp;
9229 - unsigned short ss;
9230 -
9231 - report_bug(regs->eip, regs);
9232 -
9233 - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff,
9234 - ++die_counter);
9235 -#ifdef CONFIG_PREEMPT
9236 - printk("PREEMPT ");
9237 -#endif
9238 -#ifdef CONFIG_SMP
9239 - printk("SMP ");
9240 -#endif
9241 -#ifdef CONFIG_DEBUG_PAGEALLOC
9242 - printk("DEBUG_PAGEALLOC");
9243 -#endif
9244 - printk("\n");
9245 + report_bug(regs->ip, regs);
9246
9247 - if (notify_die(DIE_OOPS, str, regs, err,
9248 - current->thread.trap_no, SIGSEGV) !=
9249 - NOTIFY_STOP) {
9250 - show_registers(regs);
9251 - /* Executive summary in case the oops scrolled away */
9252 - esp = (unsigned long) (&regs->esp);
9253 - savesegment(ss, ss);
9254 - if (user_mode(regs)) {
9255 - esp = regs->esp;
9256 - ss = regs->xss & 0xffff;
9257 - }
9258 - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
9259 - print_symbol("%s", regs->eip);
9260 - printk(" SS:ESP %04x:%08lx\n", ss, esp);
9261 - }
9262 - else
9263 + if (__die(str, regs, err))
9264 regs = NULL;
9265 - } else
9266 + } else {
9267 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
9268 + }
9269
9270 bust_spinlocks(0);
9271 die.lock_owner = -1;
9272 @@ -462,7 +494,7 @@ static void __kprobes do_trap(int trapnr
9273 {
9274 struct task_struct *tsk = current;
9275
9276 - if (regs->eflags & VM_MASK) {
9277 + if (regs->flags & VM_MASK) {
9278 if (vm86)
9279 goto vm86_trap;
9280 goto trap_signal;
9281 @@ -508,7 +540,7 @@ static void __kprobes do_trap(int trapnr
9282 }
9283
9284 #define DO_ERROR(trapnr, signr, str, name) \
9285 -fastcall void do_##name(struct pt_regs * regs, long error_code) \
9286 +void do_##name(struct pt_regs * regs, long error_code) \
9287 { \
9288 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
9289 == NOTIFY_STOP) \
9290 @@ -517,7 +549,7 @@ fastcall void do_##name(struct pt_regs *
9291 }
9292
9293 #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
9294 -fastcall void do_##name(struct pt_regs * regs, long error_code) \
9295 +void do_##name(struct pt_regs * regs, long error_code) \
9296 { \
9297 siginfo_t info; \
9298 if (irq) \
9299 @@ -533,7 +565,7 @@ fastcall void do_##name(struct pt_regs *
9300 }
9301
9302 #define DO_VM86_ERROR(trapnr, signr, str, name) \
9303 -fastcall void do_##name(struct pt_regs * regs, long error_code) \
9304 +void do_##name(struct pt_regs * regs, long error_code) \
9305 { \
9306 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
9307 == NOTIFY_STOP) \
9308 @@ -542,7 +574,7 @@ fastcall void do_##name(struct pt_regs *
9309 }
9310
9311 #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
9312 -fastcall void do_##name(struct pt_regs * regs, long error_code) \
9313 +void do_##name(struct pt_regs * regs, long error_code) \
9314 { \
9315 siginfo_t info; \
9316 info.si_signo = signr; \
9317 @@ -556,13 +588,13 @@ fastcall void do_##name(struct pt_regs *
9318 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
9319 }
9320
9321 -DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
9322 +DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
9323 #ifndef CONFIG_KPROBES
9324 DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
9325 #endif
9326 DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
9327 DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
9328 -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0)
9329 +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
9330 DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
9331 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
9332 DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
9333 @@ -570,10 +602,10 @@ DO_ERROR(12, SIGBUS, "stack segment", s
9334 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
9335 DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
9336
9337 -fastcall void __kprobes do_general_protection(struct pt_regs * regs,
9338 +void __kprobes do_general_protection(struct pt_regs * regs,
9339 long error_code)
9340 {
9341 - if (regs->eflags & VM_MASK)
9342 + if (regs->flags & VM_MASK)
9343 goto gp_in_vm86;
9344
9345 if (!user_mode(regs))
9346 @@ -582,11 +614,14 @@ fastcall void __kprobes do_general_prote
9347 current->thread.error_code = error_code;
9348 current->thread.trap_no = 13;
9349 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
9350 - printk_ratelimit())
9351 + printk_ratelimit()) {
9352 printk(KERN_INFO
9353 - "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
9354 + "%s[%d] general protection ip:%lx sp:%lx error:%lx",
9355 current->comm, task_pid_nr(current),
9356 - regs->eip, regs->esp, error_code);
9357 + regs->ip, regs->sp, error_code);
9358 + print_vma_addr(" in ", regs->ip);
9359 + printk("\n");
9360 + }
9361
9362 force_sig(SIGSEGV, current);
9363 return;
9364 @@ -675,8 +710,8 @@ void __kprobes die_nmi(struct pt_regs *r
9365 */
9366 bust_spinlocks(1);
9367 printk(KERN_EMERG "%s", msg);
9368 - printk(" on CPU%d, eip %08lx, registers:\n",
9369 - smp_processor_id(), regs->eip);
9370 + printk(" on CPU%d, ip %08lx, registers:\n",
9371 + smp_processor_id(), regs->ip);
9372 show_registers(regs);
9373 console_silent();
9374 spin_unlock(&nmi_print_lock);
9375 @@ -733,7 +768,7 @@ static __kprobes void default_do_nmi(str
9376
9377 static int ignore_nmis;
9378
9379 -fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
9380 +__kprobes void do_nmi(struct pt_regs * regs, long error_code)
9381 {
9382 int cpu;
9383
9384 @@ -762,7 +797,7 @@ void restart_nmi(void)
9385 }
9386
9387 #ifdef CONFIG_KPROBES
9388 -fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
9389 +void __kprobes do_int3(struct pt_regs *regs, long error_code)
9390 {
9391 trace_hardirqs_fixup();
9392
9393 @@ -798,7 +833,7 @@ fastcall void __kprobes do_int3(struct p
9394 * find every occurrence of the TF bit that could be saved away even
9395 * by user code)
9396 */
9397 -fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
9398 +void __kprobes do_debug(struct pt_regs * regs, long error_code)
9399 {
9400 unsigned int condition;
9401 struct task_struct *tsk = current;
9402 @@ -807,24 +842,30 @@ fastcall void __kprobes do_debug(struct
9403
9404 get_debugreg(condition, 6);
9405
9406 + /*
9407 + * The processor cleared BTF, so don't mark that we need it set.
9408 + */
9409 + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
9410 + tsk->thread.debugctlmsr = 0;
9411 +
9412 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
9413 SIGTRAP) == NOTIFY_STOP)
9414 return;
9415 /* It's safe to allow irq's after DR6 has been saved */
9416 - if (regs->eflags & X86_EFLAGS_IF)
9417 + if (regs->flags & X86_EFLAGS_IF)
9418 local_irq_enable();
9419
9420 /* Mask out spurious debug traps due to lazy DR7 setting */
9421 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
9422 - if (!tsk->thread.debugreg[7])
9423 + if (!tsk->thread.debugreg7)
9424 goto clear_dr7;
9425 }
9426
9427 - if (regs->eflags & VM_MASK)
9428 + if (regs->flags & VM_MASK)
9429 goto debug_vm86;
9430
9431 /* Save debug status register where ptrace can see it */
9432 - tsk->thread.debugreg[6] = condition;
9433 + tsk->thread.debugreg6 = condition;
9434
9435 /*
9436 * Single-stepping through TF: make sure we ignore any events in
9437 @@ -856,7 +897,7 @@ debug_vm86:
9438
9439 clear_TF_reenable:
9440 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
9441 - regs->eflags &= ~TF_MASK;
9442 + regs->flags &= ~TF_MASK;
9443 return;
9444 }
9445
9446 @@ -865,7 +906,7 @@ clear_TF_reenable:
9447 * the correct behaviour even in the presence of the asynchronous
9448 * IRQ13 behaviour
9449 */
9450 -void math_error(void __user *eip)
9451 +void math_error(void __user *ip)
9452 {
9453 struct task_struct * task;
9454 siginfo_t info;
9455 @@ -881,7 +922,7 @@ void math_error(void __user *eip)
9456 info.si_signo = SIGFPE;
9457 info.si_errno = 0;
9458 info.si_code = __SI_FAULT;
9459 - info.si_addr = eip;
9460 + info.si_addr = ip;
9461 /*
9462 * (~cwd & swd) will mask out exceptions that are not set to unmasked
9463 * status. 0x3f is the exception bits in these regs, 0x200 is the
9464 @@ -924,13 +965,13 @@ void math_error(void __user *eip)
9465 force_sig_info(SIGFPE, &info, task);
9466 }
9467
9468 -fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
9469 +void do_coprocessor_error(struct pt_regs * regs, long error_code)
9470 {
9471 ignore_fpu_irq = 1;
9472 - math_error((void __user *)regs->eip);
9473 + math_error((void __user *)regs->ip);
9474 }
9475
9476 -static void simd_math_error(void __user *eip)
9477 +static void simd_math_error(void __user *ip)
9478 {
9479 struct task_struct * task;
9480 siginfo_t info;
9481 @@ -946,7 +987,7 @@ static void simd_math_error(void __user
9482 info.si_signo = SIGFPE;
9483 info.si_errno = 0;
9484 info.si_code = __SI_FAULT;
9485 - info.si_addr = eip;
9486 + info.si_addr = ip;
9487 /*
9488 * The SIMD FPU exceptions are handled a little differently, as there
9489 * is only a single status/control register. Thus, to determine which
9490 @@ -978,19 +1019,19 @@ static void simd_math_error(void __user
9491 force_sig_info(SIGFPE, &info, task);
9492 }
9493
9494 -fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
9495 +void do_simd_coprocessor_error(struct pt_regs * regs,
9496 long error_code)
9497 {
9498 if (cpu_has_xmm) {
9499 /* Handle SIMD FPU exceptions on PIII+ processors. */
9500 ignore_fpu_irq = 1;
9501 - simd_math_error((void __user *)regs->eip);
9502 + simd_math_error((void __user *)regs->ip);
9503 } else {
9504 /*
9505 * Handle strange cache flush from user space exception
9506 * in all other cases. This is undocumented behaviour.
9507 */
9508 - if (regs->eflags & VM_MASK) {
9509 + if (regs->flags & VM_MASK) {
9510 handle_vm86_fault((struct kernel_vm86_regs *)regs,
9511 error_code);
9512 return;
9513 @@ -1003,7 +1044,7 @@ fastcall void do_simd_coprocessor_error(
9514 }
9515
9516 #ifndef CONFIG_XEN
9517 -fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
9518 +void do_spurious_interrupt_bug(struct pt_regs * regs,
9519 long error_code)
9520 {
9521 #if 0
9522 @@ -1012,7 +1053,7 @@ fastcall void do_spurious_interrupt_bug(
9523 #endif
9524 }
9525
9526 -fastcall unsigned long patch_espfix_desc(unsigned long uesp,
9527 +unsigned long patch_espfix_desc(unsigned long uesp,
9528 unsigned long kesp)
9529 {
9530 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
9531 @@ -1072,7 +1113,7 @@ asmlinkage void math_emulate(long arg)
9532 * NB. All these are "trap gates" (i.e. events_mask isn't set) except
9533 * for those that specify <dpl>|4 in the second field.
9534 */
9535 -static trap_info_t __cpuinitdata trap_table[] = {
9536 +static const trap_info_t __cpuinitconst trap_table[] = {
9537 { 0, 0, __KERNEL_CS, (unsigned long)divide_error },
9538 { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
9539 { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
9540 @@ -1105,17 +1146,12 @@ void __init trap_init(void)
9541 if (ret)
9542 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
9543
9544 + /*
9545 + * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
9546 + * Generate a build-time error if the alignment is wrong.
9547 + */
9548 + BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
9549 if (cpu_has_fxsr) {
9550 - /*
9551 - * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
9552 - * Generates a compile-time "error: zero width for bit-field" if
9553 - * the alignment is wrong.
9554 - */
9555 - struct fxsrAlignAssert {
9556 - int _:!(offsetof(struct task_struct,
9557 - thread.i387.fxsave) & 15);
9558 - };
9559 -
9560 printk(KERN_INFO "Enabling fast FPU save and restore... ");
9561 set_in_cr4(X86_CR4_OSFXSR);
9562 printk("done.\n");
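
The traps_32-xen.c hunks above fold the two CONFIG_FRAME_POINTER variants of print_context_stack() into one walker that scans every stack word but only marks an address as reliable when it sits in the return-address slot of the current %ebp frame (bp + 4); everything else is printed with a leading "?". A small user-space C sketch of that reliable/unreliable distinction, run over a fabricated stack image rather than a live kernel stack (the helper names and the fake text range are invented for the example):

#include <stdio.h>

struct stack_frame {
    struct stack_frame *next_frame;   /* saved caller frame pointer */
    unsigned long return_address;     /* word just above it */
};

static int fake_is_text(unsigned long addr)
{
    return addr >= 0x08048000UL && addr < 0x08100000UL;   /* pretend text range */
}

/* Scan every word like print_context_stack(): report each word that looks
 * like a text address, but only the one sitting at bp + sizeof(long) (the
 * return-address slot of the current frame) is reliable; anything else is
 * treated as a leftover value and gets the "?" marker. */
static void walk(unsigned long *stack, unsigned long *end, unsigned long bp,
                 int (*is_text)(unsigned long))
{
    struct stack_frame *frame = (struct stack_frame *)bp;

    for (; stack < end; stack++) {
        unsigned long addr = *stack;

        if (!is_text(addr))
            continue;
        if ((unsigned long)stack == bp + sizeof(long)) {
            printf(" [<%08lx>]\n", addr);       /* reliable: on the bp chain */
            frame = frame->next_frame;
            bp = (unsigned long)frame;
        } else {
            printf(" [<%08lx>] ?\n", addr);     /* stale text pointer */
        }
    }
}

int main(void)
{
    unsigned long stack[8] = { 0 };

    stack[2] = (unsigned long)&stack[6];   /* saved bp of the caller */
    stack[3] = 0x08048123UL;               /* its return address (reliable) */
    stack[4] = 0x08049abcUL;               /* stale text pointer -> "?" */
    stack[7] = 0x0804a456UL;               /* caller's return address (reliable) */

    walk(stack, stack + 8, (unsigned long)&stack[2], fake_is_text);
    return 0;
}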
9563 --- sle11-2009-05-14.orig/arch/x86/kernel/traps_64-xen.c 2009-02-16 16:18:36.000000000 +0100
9564 +++ sle11-2009-05-14/arch/x86/kernel/traps_64-xen.c 2009-03-16 16:33:40.000000000 +0100
9565 @@ -74,38 +74,41 @@ asmlinkage void alignment_check(void);
9566 asmlinkage void machine_check(void);
9567 asmlinkage void spurious_interrupt_bug(void);
9568
9569 +static unsigned int code_bytes = 64;
9570 +
9571 static inline void conditional_sti(struct pt_regs *regs)
9572 {
9573 - if (regs->eflags & X86_EFLAGS_IF)
9574 + if (regs->flags & X86_EFLAGS_IF)
9575 local_irq_enable();
9576 }
9577
9578 static inline void preempt_conditional_sti(struct pt_regs *regs)
9579 {
9580 - preempt_disable();
9581 - if (regs->eflags & X86_EFLAGS_IF)
9582 + inc_preempt_count();
9583 + if (regs->flags & X86_EFLAGS_IF)
9584 local_irq_enable();
9585 }
9586
9587 static inline void preempt_conditional_cli(struct pt_regs *regs)
9588 {
9589 - if (regs->eflags & X86_EFLAGS_IF)
9590 + if (regs->flags & X86_EFLAGS_IF)
9591 local_irq_disable();
9592 /* Make sure to not schedule here because we could be running
9593 on an exception stack. */
9594 - preempt_enable_no_resched();
9595 + dec_preempt_count();
9596 }
9597
9598 int kstack_depth_to_print = 12;
9599
9600 -#ifdef CONFIG_KALLSYMS
9601 -void printk_address(unsigned long address)
9602 +void printk_address(unsigned long address, int reliable)
9603 {
9604 +#ifdef CONFIG_KALLSYMS
9605 unsigned long offset = 0, symsize;
9606 const char *symname;
9607 char *modname;
9608 char *delim = ":";
9609 - char namebuf[128];
9610 + char namebuf[KSYM_NAME_LEN];
9611 + char reliab[4] = "";
9612
9613 symname = kallsyms_lookup(address, &symsize, &offset,
9614 &modname, namebuf);
9615 @@ -113,17 +116,17 @@ void printk_address(unsigned long addres
9616 printk(" [<%016lx>]\n", address);
9617 return;
9618 }
9619 + if (!reliable)
9620 + strcpy(reliab, "? ");
9621 +
9622 if (!modname)
9623 - modname = delim = "";
9624 - printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
9625 - address, delim, modname, delim, symname, offset, symsize);
9626 -}
9627 + modname = delim = "";
9628 + printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
9629 + address, reliab, delim, modname, delim, symname, offset, symsize);
9630 #else
9631 -void printk_address(unsigned long address)
9632 -{
9633 printk(" [<%016lx>]\n", address);
9634 -}
9635 #endif
9636 +}
9637
9638 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
9639 unsigned *usedp, char **idp)
9640 @@ -210,14 +213,53 @@ static unsigned long *in_exception_stack
9641 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
9642 */
9643
9644 -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
9645 +static inline int valid_stack_ptr(struct thread_info *tinfo,
9646 + void *p, unsigned int size, void *end)
9647 {
9648 - void *t = (void *)tinfo;
9649 - return p > t && p < t + THREAD_SIZE - 3;
9650 + void *t = tinfo;
9651 + if (end) {
9652 + if (p < end && p >= (end-THREAD_SIZE))
9653 + return 1;
9654 + else
9655 + return 0;
9656 + }
9657 + return p > t && p < t + THREAD_SIZE - size;
9658 +}
9659 +
9660 +/* The form of the top of the frame on the stack */
9661 +struct stack_frame {
9662 + struct stack_frame *next_frame;
9663 + unsigned long return_address;
9664 +};
9665 +
9666 +
9667 +static inline unsigned long print_context_stack(struct thread_info *tinfo,
9668 + unsigned long *stack, unsigned long bp,
9669 + const struct stacktrace_ops *ops, void *data,
9670 + unsigned long *end)
9671 +{
9672 + struct stack_frame *frame = (struct stack_frame *)bp;
9673 +
9674 + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
9675 + unsigned long addr;
9676 +
9677 + addr = *stack;
9678 + if (__kernel_text_address(addr)) {
9679 + if ((unsigned long) stack == bp + 8) {
9680 + ops->address(data, addr, 1);
9681 + frame = frame->next_frame;
9682 + bp = (unsigned long) frame;
9683 + } else {
9684 + ops->address(data, addr, bp == 0);
9685 + }
9686 + }
9687 + stack++;
9688 + }
9689 + return bp;
9690 }
9691
9692 void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
9693 - unsigned long *stack,
9694 + unsigned long *stack, unsigned long bp,
9695 const struct stacktrace_ops *ops, void *data)
9696 {
9697 const unsigned cpu = get_cpu();
9698 @@ -227,36 +269,28 @@ void dump_trace(struct task_struct *tsk,
9699
9700 if (!tsk)
9701 tsk = current;
9702 + tinfo = task_thread_info(tsk);
9703
9704 if (!stack) {
9705 unsigned long dummy;
9706 stack = &dummy;
9707 if (tsk && tsk != current)
9708 - stack = (unsigned long *)tsk->thread.rsp;
9709 + stack = (unsigned long *)tsk->thread.sp;
9710 }
9711
9712 - /*
9713 - * Print function call entries within a stack. 'cond' is the
9714 - * "end of stackframe" condition, that the 'stack++'
9715 - * iteration will eventually trigger.
9716 - */
9717 -#define HANDLE_STACK(cond) \
9718 - do while (cond) { \
9719 - unsigned long addr = *stack++; \
9720 - /* Use unlocked access here because except for NMIs \
9721 - we should be already protected against module unloads */ \
9722 - if (__kernel_text_address(addr)) { \
9723 - /* \
9724 - * If the address is either in the text segment of the \
9725 - * kernel, or in the region which contains vmalloc'ed \
9726 - * memory, it *may* be the address of a calling \
9727 - * routine; if so, print it so that someone tracing \
9728 - * down the cause of the crash will be able to figure \
9729 - * out the call path that was taken. \
9730 - */ \
9731 - ops->address(data, addr); \
9732 - } \
9733 - } while (0)
9734 +#ifdef CONFIG_FRAME_POINTER
9735 + if (!bp) {
9736 + if (tsk == current) {
9737 + /* Grab bp right from our regs */
9738 + asm("movq %%rbp, %0" : "=r" (bp):);
9739 + } else {
9740 + /* bp is the last reg pushed by switch_to */
9741 + bp = *(unsigned long *) tsk->thread.sp;
9742 + }
9743 + }
9744 +#endif
9745 +
9746 +
9747
9748 /*
9749 * Print function call entries in all stacks, starting at the
9750 @@ -272,7 +306,9 @@ void dump_trace(struct task_struct *tsk,
9751 if (estack_end) {
9752 if (ops->stack(data, id) < 0)
9753 break;
9754 - HANDLE_STACK (stack < estack_end);
9755 +
9756 + bp = print_context_stack(tinfo, stack, bp, ops,
9757 + data, estack_end);
9758 ops->stack(data, "<EOE>");
9759 /*
9760 * We link to the next stack via the
9761 @@ -290,7 +326,8 @@ void dump_trace(struct task_struct *tsk,
9762 if (stack >= irqstack && stack < irqstack_end) {
9763 if (ops->stack(data, "IRQ") < 0)
9764 break;
9765 - HANDLE_STACK (stack < irqstack_end);
9766 + bp = print_context_stack(tinfo, stack, bp,
9767 + ops, data, irqstack_end);
9768 /*
9769 * We link to the next stack (which would be
9770 * the process stack normally) the last
9771 @@ -308,9 +345,7 @@ void dump_trace(struct task_struct *tsk,
9772 /*
9773 * This handles the process stack:
9774 */
9775 - tinfo = task_thread_info(tsk);
9776 - HANDLE_STACK (valid_stack_ptr(tinfo, stack));
9777 -#undef HANDLE_STACK
9778 + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
9779 put_cpu();
9780 }
9781 EXPORT_SYMBOL(dump_trace);
9782 @@ -333,10 +368,10 @@ static int print_trace_stack(void *data,
9783 return 0;
9784 }
9785
9786 -static void print_trace_address(void *data, unsigned long addr)
9787 +static void print_trace_address(void *data, unsigned long addr, int reliable)
9788 {
9789 touch_nmi_watchdog();
9790 - printk_address(addr);
9791 + printk_address(addr, reliable);
9792 }
9793
9794 static const struct stacktrace_ops print_trace_ops = {
9795 @@ -347,15 +382,17 @@ static const struct stacktrace_ops print
9796 };
9797
9798 void
9799 -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
9800 +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
9801 + unsigned long bp)
9802 {
9803 printk("\nCall Trace:\n");
9804 - dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
9805 + dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
9806 printk("\n");
9807 }
9808
9809 static void
9810 -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
9811 +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
9812 + unsigned long bp)
9813 {
9814 unsigned long *stack;
9815 int i;
9816 @@ -366,14 +403,14 @@ _show_stack(struct task_struct *tsk, str
9817 // debugging aid: "show_stack(NULL, NULL);" prints the
9818 // back trace for this cpu.
9819
9820 - if (rsp == NULL) {
9821 + if (sp == NULL) {
9822 if (tsk)
9823 - rsp = (unsigned long *)tsk->thread.rsp;
9824 + sp = (unsigned long *)tsk->thread.sp;
9825 else
9826 - rsp = (unsigned long *)&rsp;
9827 + sp = (unsigned long *)&sp;
9828 }
9829
9830 - stack = rsp;
9831 + stack = sp;
9832 for(i=0; i < kstack_depth_to_print; i++) {
9833 if (stack >= irqstack && stack <= irqstack_end) {
9834 if (stack == irqstack_end) {
9835 @@ -389,12 +426,12 @@ _show_stack(struct task_struct *tsk, str
9836 printk(" %016lx", *stack++);
9837 touch_nmi_watchdog();
9838 }
9839 - show_trace(tsk, regs, rsp);
9840 + show_trace(tsk, regs, sp, bp);
9841 }
9842
9843 -void show_stack(struct task_struct *tsk, unsigned long * rsp)
9844 +void show_stack(struct task_struct *tsk, unsigned long * sp)
9845 {
9846 - _show_stack(tsk, NULL, rsp);
9847 + _show_stack(tsk, NULL, sp, 0);
9848 }
9849
9850 /*
9851 @@ -403,13 +440,19 @@ void show_stack(struct task_struct *tsk,
9852 void dump_stack(void)
9853 {
9854 unsigned long dummy;
9855 + unsigned long bp = 0;
9856 +
9857 +#ifdef CONFIG_FRAME_POINTER
9858 + if (!bp)
9859 + asm("movq %%rbp, %0" : "=r" (bp):);
9860 +#endif
9861
9862 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
9863 current->pid, current->comm, print_tainted(),
9864 init_utsname()->release,
9865 (int)strcspn(init_utsname()->version, " "),
9866 init_utsname()->version);
9867 - show_trace(NULL, NULL, &dummy);
9868 + show_trace(NULL, NULL, &dummy, bp);
9869 }
9870
9871 EXPORT_SYMBOL(dump_stack);
9872 @@ -417,12 +460,15 @@ EXPORT_SYMBOL(dump_stack);
9873 void show_registers(struct pt_regs *regs)
9874 {
9875 int i;
9876 - int in_kernel = !user_mode(regs);
9877 - unsigned long rsp;
9878 + unsigned long sp;
9879 const int cpu = smp_processor_id();
9880 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
9881 + u8 *ip;
9882 + unsigned int code_prologue = code_bytes * 43 / 64;
9883 + unsigned int code_len = code_bytes;
9884
9885 - rsp = regs->rsp;
9886 + sp = regs->sp;
9887 + ip = (u8 *) regs->ip - code_prologue;
9888 printk("CPU %d ", cpu);
9889 __show_regs(regs);
9890 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
9891 @@ -432,45 +478,43 @@ void show_registers(struct pt_regs *regs
9892 * When in-kernel, we also print out the stack and code at the
9893 * time of the fault..
9894 */
9895 - if (in_kernel) {
9896 + if (!user_mode(regs)) {
9897 + unsigned char c;
9898 printk("Stack: ");
9899 - _show_stack(NULL, regs, (unsigned long*)rsp);
9900 + _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
9901 + printk("\n");
9902
9903 - printk("\nCode: ");
9904 - if (regs->rip < PAGE_OFFSET)
9905 - goto bad;
9906 -
9907 - for (i=0; i<20; i++) {
9908 - unsigned char c;
9909 - if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
9910 -bad:
9911 + printk(KERN_EMERG "Code: ");
9912 + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
9913 + /* try starting at RIP */
9914 + ip = (u8 *) regs->ip;
9915 + code_len = code_len - code_prologue + 1;
9916 + }
9917 + for (i = 0; i < code_len; i++, ip++) {
9918 + if (ip < (u8 *)PAGE_OFFSET ||
9919 + probe_kernel_address(ip, c)) {
9920 printk(" Bad RIP value.");
9921 break;
9922 }
9923 - printk("%02x ", c);
9924 + if (ip == (u8 *)regs->ip)
9925 + printk("<%02x> ", c);
9926 + else
9927 + printk("%02x ", c);
9928 }
9929 }
9930 printk("\n");
9931 }
9932
9933 -int is_valid_bugaddr(unsigned long rip)
9934 +int is_valid_bugaddr(unsigned long ip)
9935 {
9936 unsigned short ud2;
9937
9938 - if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2)))
9939 + if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
9940 return 0;
9941
9942 return ud2 == 0x0b0f;
9943 }
9944
9945 -#ifdef CONFIG_BUG
9946 -void out_of_line_bug(void)
9947 -{
9948 - BUG();
9949 -}
9950 -EXPORT_SYMBOL(out_of_line_bug);
9951 -#endif
9952 -
9953 static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
9954 static int die_owner = -1;
9955 static unsigned int die_nest_count;
9956 @@ -498,7 +542,7 @@ unsigned __kprobes long oops_begin(void)
9957 return flags;
9958 }
9959
9960 -void __kprobes oops_end(unsigned long flags)
9961 +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
9962 {
9963 die_owner = -1;
9964 bust_spinlocks(0);
9965 @@ -507,12 +551,17 @@ void __kprobes oops_end(unsigned long fl
9966 /* Nest count reaches zero, release the lock. */
9967 __raw_spin_unlock(&die_lock);
9968 raw_local_irq_restore(flags);
9969 + if (!regs) {
9970 + oops_exit();
9971 + return;
9972 + }
9973 if (panic_on_oops)
9974 panic("Fatal exception");
9975 oops_exit();
9976 + do_exit(signr);
9977 }
9978
9979 -void __kprobes __die(const char * str, struct pt_regs * regs, long err)
9980 +int __kprobes __die(const char * str, struct pt_regs * regs, long err)
9981 {
9982 static int die_counter;
9983 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
9984 @@ -526,15 +575,17 @@ void __kprobes __die(const char * str, s
9985 printk("DEBUG_PAGEALLOC");
9986 #endif
9987 printk("\n");
9988 - notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
9989 + if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
9990 + return 1;
9991 show_registers(regs);
9992 add_taint(TAINT_DIE);
9993 /* Executive summary in case the oops scrolled away */
9994 printk(KERN_ALERT "RIP ");
9995 - printk_address(regs->rip);
9996 - printk(" RSP <%016lx>\n", regs->rsp);
9997 + printk_address(regs->ip, 1);
9998 + printk(" RSP <%016lx>\n", regs->sp);
9999 if (kexec_should_crash(current))
10000 crash_kexec(regs);
10001 + return 0;
10002 }
10003
10004 void die(const char * str, struct pt_regs * regs, long err)
10005 @@ -542,11 +593,11 @@ void die(const char * str, struct pt_reg
10006 unsigned long flags = oops_begin();
10007
10008 if (!user_mode(regs))
10009 - report_bug(regs->rip, regs);
10010 + report_bug(regs->ip, regs);
10011
10012 - __die(str, regs, err);
10013 - oops_end(flags);
10014 - do_exit(SIGSEGV);
10015 + if (__die(str, regs, err))
10016 + regs = NULL;
10017 + oops_end(flags, regs, SIGSEGV);
10018 }
10019
10020 #if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL)
10021 @@ -564,10 +615,10 @@ void __kprobes die_nmi(char *str, struct
10022 crash_kexec(regs);
10023 if (do_panic || panic_on_oops)
10024 panic("Non maskable interrupt");
10025 - oops_end(flags);
10026 + oops_end(flags, NULL, SIGBUS);
10027 nmi_exit();
10028 local_irq_enable();
10029 - do_exit(SIGSEGV);
10030 + do_exit(SIGBUS);
10031 }
10032 #endif
10033
10034 @@ -592,11 +643,14 @@ static void __kprobes do_trap(int trapnr
10035 tsk->thread.trap_no = trapnr;
10036
10037 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
10038 - printk_ratelimit())
10039 + printk_ratelimit()) {
10040 printk(KERN_INFO
10041 - "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
10042 + "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
10043 tsk->comm, tsk->pid, str,
10044 - regs->rip, regs->rsp, error_code);
10045 + regs->ip, regs->sp, error_code);
10046 + print_vma_addr(" in ", regs->ip);
10047 + printk("\n");
10048 + }
10049
10050 if (info)
10051 force_sig_info(signr, info, tsk);
10052 @@ -606,19 +660,12 @@ static void __kprobes do_trap(int trapnr
10053 }
10054
10055
10056 - /* kernel trap */
10057 - {
10058 - const struct exception_table_entry *fixup;
10059 - fixup = search_exception_tables(regs->rip);
10060 - if (fixup)
10061 - regs->rip = fixup->fixup;
10062 - else {
10063 - tsk->thread.error_code = error_code;
10064 - tsk->thread.trap_no = trapnr;
10065 - die(str, regs, error_code);
10066 - }
10067 - return;
10068 + if (!fixup_exception(regs)) {
10069 + tsk->thread.error_code = error_code;
10070 + tsk->thread.trap_no = trapnr;
10071 + die(str, regs, error_code);
10072 }
10073 + return;
10074 }
10075
10076 #define DO_ERROR(trapnr, signr, str, name) \
10077 @@ -647,10 +694,10 @@ asmlinkage void do_##name(struct pt_regs
10078 do_trap(trapnr, signr, str, regs, error_code, &info); \
10079 }
10080
10081 -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
10082 +DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
10083 DO_ERROR( 4, SIGSEGV, "overflow", overflow)
10084 DO_ERROR( 5, SIGSEGV, "bounds", bounds)
10085 -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
10086 +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
10087 DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
10088 DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
10089 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
10090 @@ -698,32 +745,28 @@ asmlinkage void __kprobes do_general_pro
10091 tsk->thread.trap_no = 13;
10092
10093 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
10094 - printk_ratelimit())
10095 + printk_ratelimit()) {
10096 printk(KERN_INFO
10097 - "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
10098 + "%s[%d] general protection ip:%lx sp:%lx error:%lx",
10099 tsk->comm, tsk->pid,
10100 - regs->rip, regs->rsp, error_code);
10101 + regs->ip, regs->sp, error_code);
10102 + print_vma_addr(" in ", regs->ip);
10103 + printk("\n");
10104 + }
10105
10106 force_sig(SIGSEGV, tsk);
10107 return;
10108 }
10109
10110 - /* kernel gp */
10111 - {
10112 - const struct exception_table_entry *fixup;
10113 - fixup = search_exception_tables(regs->rip);
10114 - if (fixup) {
10115 - regs->rip = fixup->fixup;
10116 - return;
10117 - }
10118 + if (fixup_exception(regs))
10119 + return;
10120
10121 - tsk->thread.error_code = error_code;
10122 - tsk->thread.trap_no = 13;
10123 - if (notify_die(DIE_GPF, "general protection fault", regs,
10124 - error_code, 13, SIGSEGV) == NOTIFY_STOP)
10125 - return;
10126 - die("general protection fault", regs, error_code);
10127 - }
10128 + tsk->thread.error_code = error_code;
10129 + tsk->thread.trap_no = 13;
10130 + if (notify_die(DIE_GPF, "general protection fault", regs,
10131 + error_code, 13, SIGSEGV) == NOTIFY_STOP)
10132 + return;
10133 + die("general protection fault", regs, error_code);
10134 }
10135
10136 static __kprobes void
10137 @@ -833,15 +876,15 @@ asmlinkage __kprobes struct pt_regs *syn
10138 {
10139 struct pt_regs *regs = eregs;
10140 /* Did already sync */
10141 - if (eregs == (struct pt_regs *)eregs->rsp)
10142 + if (eregs == (struct pt_regs *)eregs->sp)
10143 ;
10144 /* Exception from user space */
10145 else if (user_mode(eregs))
10146 regs = task_pt_regs(current);
10147 /* Exception from kernel and interrupts are enabled. Move to
10148 kernel process stack. */
10149 - else if (eregs->eflags & X86_EFLAGS_IF)
10150 - regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
10151 + else if (eregs->flags & X86_EFLAGS_IF)
10152 + regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
10153 if (eregs != regs)
10154 *regs = *eregs;
10155 return regs;
10156 @@ -859,6 +902,12 @@ asmlinkage void __kprobes do_debug(struc
10157
10158 get_debugreg(condition, 6);
10159
10160 + /*
10161 + * The processor cleared BTF, so don't mark that we need it set.
10162 + */
10163 + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
10164 + tsk->thread.debugctlmsr = 0;
10165 +
10166 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
10167 SIGTRAP) == NOTIFY_STOP)
10168 return;
10169 @@ -874,27 +923,14 @@ asmlinkage void __kprobes do_debug(struc
10170
10171 tsk->thread.debugreg6 = condition;
10172
10173 - /* Mask out spurious TF errors due to lazy TF clearing */
10174 +
10175 + /*
10176 + * Single-stepping through TF: make sure we ignore any events in
10177 + * kernel space (but re-enable TF when returning to user mode).
10178 + */
10179 if (condition & DR_STEP) {
10180 - /*
10181 - * The TF error should be masked out only if the current
10182 - * process is not traced and if the TRAP flag has been set
10183 - * previously by a tracing process (condition detected by
10184 - * the PT_DTRACE flag); remember that the i386 TRAP flag
10185 - * can be modified by the process itself in user mode,
10186 - * allowing programs to debug themselves without the ptrace()
10187 - * interface.
10188 - */
10189 if (!user_mode(regs))
10190 goto clear_TF_reenable;
10191 - /*
10192 - * Was the TF flag set by a debugger? If so, clear it now,
10193 - * so that register information is correct.
10194 - */
10195 - if (tsk->ptrace & PT_DTRACE) {
10196 - regs->eflags &= ~TF_MASK;
10197 - tsk->ptrace &= ~PT_DTRACE;
10198 - }
10199 }
10200
10201 /* Ok, finally something we can handle */
10202 @@ -903,7 +939,7 @@ asmlinkage void __kprobes do_debug(struc
10203 info.si_signo = SIGTRAP;
10204 info.si_errno = 0;
10205 info.si_code = TRAP_BRKPT;
10206 - info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
10207 + info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
10208 force_sig_info(SIGTRAP, &info, tsk);
10209
10210 clear_dr7:
10211 @@ -913,18 +949,15 @@ clear_dr7:
10212
10213 clear_TF_reenable:
10214 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
10215 - regs->eflags &= ~TF_MASK;
10216 + regs->flags &= ~X86_EFLAGS_TF;
10217 preempt_conditional_cli(regs);
10218 }
10219
10220 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
10221 {
10222 - const struct exception_table_entry *fixup;
10223 - fixup = search_exception_tables(regs->rip);
10224 - if (fixup) {
10225 - regs->rip = fixup->fixup;
10226 + if (fixup_exception(regs))
10227 return 1;
10228 - }
10229 +
10230 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
10231 /* Illegal floating point operation in the kernel */
10232 current->thread.trap_no = trapnr;
10233 @@ -939,7 +972,7 @@ static int kernel_math_error(struct pt_r
10234 */
10235 asmlinkage void do_coprocessor_error(struct pt_regs *regs)
10236 {
10237 - void __user *rip = (void __user *)(regs->rip);
10238 + void __user *ip = (void __user *)(regs->ip);
10239 struct task_struct * task;
10240 siginfo_t info;
10241 unsigned short cwd, swd;
10242 @@ -959,7 +992,7 @@ asmlinkage void do_coprocessor_error(str
10243 info.si_signo = SIGFPE;
10244 info.si_errno = 0;
10245 info.si_code = __SI_FAULT;
10246 - info.si_addr = rip;
10247 + info.si_addr = ip;
10248 /*
10249 * (~cwd & swd) will mask out exceptions that are not set to unmasked
10250 * status. 0x3f is the exception bits in these regs, 0x200 is the
10251 @@ -1008,7 +1041,7 @@ asmlinkage void bad_intr(void)
10252
10253 asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
10254 {
10255 - void __user *rip = (void __user *)(regs->rip);
10256 + void __user *ip = (void __user *)(regs->ip);
10257 struct task_struct * task;
10258 siginfo_t info;
10259 unsigned short mxcsr;
10260 @@ -1028,7 +1061,7 @@ asmlinkage void do_simd_coprocessor_erro
10261 info.si_signo = SIGFPE;
10262 info.si_errno = 0;
10263 info.si_code = __SI_FAULT;
10264 - info.si_addr = rip;
10265 + info.si_addr = ip;
10266 /*
10267 * The SIMD FPU exceptions are handled a little differently, as there
10268 * is only a single status/control register. Thus, to determine which
10269 @@ -1092,13 +1125,14 @@ asmlinkage void math_state_restore(void)
10270 task_thread_info(me)->status |= TS_USEDFPU;
10271 me->fpu_counter++;
10272 }
10273 +EXPORT_SYMBOL_GPL(math_state_restore);
10274
10275
10276 /*
10277 * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
10278 * specify <dpl>|4 in the second field.
10279 */
10280 -static trap_info_t __cpuinitdata trap_table[] = {
10281 +static const trap_info_t __cpuinitconst trap_table[] = {
10282 { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error },
10283 { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
10284 { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
10285 @@ -1169,3 +1203,14 @@ static int __init kstack_setup(char *s)
10286 return 0;
10287 }
10288 early_param("kstack", kstack_setup);
10289 +
10290 +
10291 +static int __init code_bytes_setup(char *s)
10292 +{
10293 + code_bytes = simple_strtoul(s, NULL, 0);
10294 + if (code_bytes > 8192)
10295 + code_bytes = 8192;
10296 +
10297 + return 1;
10298 +}
10299 +__setup("code_bytes=", code_bytes_setup);
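
Both trap files now share the same "Code:" dump shape: print code_bytes bytes of machine code around the faulting instruction pointer, start the window 43/64 of its length before the fault, and bracket the faulting byte itself. A stand-alone C sketch of just that formatting, with the PAGE_OFFSET and probe_kernel_address() safety checks left out and a local buffer standing in for kernel text:

#include <stdio.h>

static void dump_code(const unsigned char *fault_ip, unsigned int code_bytes)
{
    unsigned int code_prologue = code_bytes * 43 / 64;
    const unsigned char *ip = fault_ip - code_prologue;
    unsigned int i;

    printf("Code: ");
    for (i = 0; i < code_bytes; i++, ip++) {
        if (ip == fault_ip)
            printf("<%02x> ", *ip);   /* the byte the fault points at */
        else
            printf("%02x ", *ip);
    }
    printf("\n");
}

int main(void)
{
    unsigned char text[64];
    unsigned int i;

    for (i = 0; i < sizeof(text); i++)
        text[i] = (unsigned char)i;

    dump_code(&text[48], 16);   /* 16-byte window, "fault" at offset 48 */
    return 0;
}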
10300 --- sle11-2009-05-14.orig/arch/x86/kernel/vsyscall_64-xen.c 2009-02-16 16:18:36.000000000 +0100
10301 +++ sle11-2009-05-14/arch/x86/kernel/vsyscall_64-xen.c 2009-03-16 16:33:40.000000000 +0100
10302 @@ -43,12 +43,7 @@
10303 #include <asm/vgtod.h>
10304
10305 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
10306 -#define __syscall_clobber "r11","rcx","memory"
10307 -#define __pa_vsymbol(x) \
10308 - ({unsigned long v; \
10309 - extern char __vsyscall_0; \
10310 - asm("" : "=r" (v) : "0" (x)); \
10311 - ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); })
10312 +#define __syscall_clobber "r11","cx","memory"
10313
10314 /*
10315 * vsyscall_gtod_data contains data that is :
10316 @@ -102,7 +97,7 @@ static __always_inline void do_get_tz(st
10317 static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
10318 {
10319 int ret;
10320 - asm volatile("vsysc2: syscall"
10321 + asm volatile("syscall"
10322 : "=a" (ret)
10323 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
10324 : __syscall_clobber );
10325 @@ -112,7 +107,7 @@ static __always_inline int gettimeofday(
10326 static __always_inline long time_syscall(long *t)
10327 {
10328 long secs;
10329 - asm volatile("vsysc1: syscall"
10330 + asm volatile("syscall"
10331 : "=a" (secs)
10332 : "0" (__NR_time),"D" (t) : __syscall_clobber);
10333 return secs;
10334 @@ -190,7 +185,7 @@ time_t __vsyscall(1) vtime(time_t *t)
10335 long __vsyscall(2)
10336 vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
10337 {
10338 - unsigned int dummy, p;
10339 + unsigned int p;
10340 unsigned long j = 0;
10341
10342 /* Fast cache - only recompute value once per jiffies and avoid
10343 @@ -205,7 +200,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s
10344 p = tcache->blob[1];
10345 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
10346 /* Load per CPU data from RDTSCP */
10347 - rdtscp(dummy, dummy, p);
10348 + native_read_tscp(&p);
10349 } else {
10350 /* Load per CPU data from GDT */
10351 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
10352 @@ -228,42 +223,11 @@ long __vsyscall(3) venosys_1(void)
10353
10354 #ifdef CONFIG_SYSCTL
10355
10356 -#define SYSCALL 0x050f
10357 -#define NOP2 0x9090
10358 -
10359 -/*
10360 - * NOP out syscall in vsyscall page when not needed.
10361 - */
10362 -static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
10363 - void __user *buffer, size_t *lenp, loff_t *ppos)
10364 +static int
10365 +vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
10366 + void __user *buffer, size_t *lenp, loff_t *ppos)
10367 {
10368 - extern u16 vsysc1, vsysc2;
10369 - u16 __iomem *map1;
10370 - u16 __iomem *map2;
10371 - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
10372 - if (!write)
10373 - return ret;
10374 - /* gcc has some trouble with __va(__pa()), so just do it this
10375 - way. */
10376 - map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
10377 - if (!map1)
10378 - return -ENOMEM;
10379 - map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
10380 - if (!map2) {
10381 - ret = -ENOMEM;
10382 - goto out;
10383 - }
10384 - if (!vsyscall_gtod_data.sysctl_enabled) {
10385 - writew(SYSCALL, map1);
10386 - writew(SYSCALL, map2);
10387 - } else {
10388 - writew(NOP2, map1);
10389 - writew(NOP2, map2);
10390 - }
10391 - iounmap(map2);
10392 -out:
10393 - iounmap(map1);
10394 - return ret;
10395 + return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
10396 }
10397
10398 static ctl_table kernel_table2[] = {
10399 @@ -279,7 +243,6 @@ static ctl_table kernel_root_table2[] =
10400 .child = kernel_table2 },
10401 {}
10402 };
10403 -
10404 #endif
10405
10406 /* Assume __initcall executes before all user space. Hopefully kmod
10407 @@ -301,7 +264,7 @@ static void __cpuinit vsyscall_set_cpu(i
10408 d |= cpu;
10409 d |= (node & 0xf) << 12;
10410 d |= (node >> 4) << 48;
10411 - if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt(cpu)
10412 + if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
10413 + GDT_ENTRY_PER_CPU),
10414 d))
10415 BUG();
10416 @@ -322,7 +285,7 @@ cpu_vsyscall_notifier(struct notifier_bl
10417 return NOTIFY_DONE;
10418 }
10419
10420 -static void __init map_vsyscall(void)
10421 +void __init map_vsyscall(void)
10422 {
10423 extern char __vsyscall_0;
10424 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
10425 @@ -338,7 +301,6 @@ static int __init vsyscall_init(void)
10426 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
10427 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
10428 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
10429 - map_vsyscall();
10430 #ifdef CONFIG_XEN
10431 vsyscall_gtod_data.sysctl_enabled = 0; /* disable vgettimeofay() */
10432 if (boot_cpu_has(X86_FEATURE_RDTSCP))
10433 --- sle11-2009-05-14.orig/arch/x86/kernel/xen_entry_64.S 2009-05-14 10:56:29.000000000 +0200
10434 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
10435 @@ -1,36 +0,0 @@
10436 -/*
10437 - * Copied from arch/xen/i386/kernel/entry.S
10438 - */
10439 -/* Offsets into shared_info_t. */
10440 -#define evtchn_upcall_pending /* 0 */
10441 -#define evtchn_upcall_mask 1
10442 -
10443 -#define sizeof_vcpu_shift 6
10444 -
10445 -#ifdef CONFIG_SMP
10446 -//#define preempt_disable(reg) incl threadinfo_preempt_count(reg)
10447 -//#define preempt_enable(reg) decl threadinfo_preempt_count(reg)
10448 -#define preempt_disable(reg)
10449 -#define preempt_enable(reg)
10450 -#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \
10451 - movq %gs:pda_cpunumber,reg ; \
10452 - shl $32, reg ; \
10453 - shr $32-sizeof_vcpu_shift,reg ; \
10454 - addq HYPERVISOR_shared_info,reg
10455 -#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \
10456 -#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
10457 -#else
10458 -#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
10459 -#define XEN_PUT_VCPU_INFO(reg)
10460 -#define XEN_PUT_VCPU_INFO_fixup
10461 -#endif
10462 -
10463 -#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg)
10464 -#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
10465 -#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
10466 - XEN_LOCKED_BLOCK_EVENTS(reg) ; \
10467 - XEN_PUT_VCPU_INFO(reg)
10468 -#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
10469 - XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \
10470 - XEN_PUT_VCPU_INFO(reg)
10471 -#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg)
10472 --- sle11-2009-05-14.orig/arch/x86/mach-xen/setup.c 2009-02-16 16:17:21.000000000 +0100
10473 +++ sle11-2009-05-14/arch/x86/mach-xen/setup.c 2009-03-16 16:33:40.000000000 +0100
10474 @@ -161,15 +161,12 @@ void __init machine_specific_arch_setup(
10475
10476 /* Do an early initialization of the fixmap area */
10477 {
10478 - extern pte_t swapper_pg_pmd[PTRS_PER_PTE];
10479 + extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
10480 unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
10481 - pgd_t *pgd = (pgd_t *)xen_start_info->pt_base;
10482 - pud_t *pud = pud_offset(pgd + pgd_index(addr), addr);
10483 + pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
10484 pmd_t *pmd = pmd_offset(pud, addr);
10485
10486 - swapper_pg_dir = pgd;
10487 - init_mm.pgd = pgd;
10488 - make_lowmem_page_readonly(swapper_pg_pmd, XENFEAT_writable_page_tables);
10489 - set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_pmd) | _PAGE_TABLE));
10490 + make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
10491 + set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
10492 }
10493 }
10494 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
10495 +++ sle11-2009-05-14/arch/x86/mm/fault-xen.c 2009-03-16 16:33:40.000000000 +0100
10496 @@ -0,0 +1,1025 @@
10497 +/*
10498 + * Copyright (C) 1995 Linus Torvalds
10499 + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
10500 + */
10501 +
10502 +#include <linux/signal.h>
10503 +#include <linux/sched.h>
10504 +#include <linux/kernel.h>
10505 +#include <linux/errno.h>
10506 +#include <linux/string.h>
10507 +#include <linux/types.h>
10508 +#include <linux/ptrace.h>
10509 +#include <linux/mman.h>
10510 +#include <linux/mm.h>
10511 +#include <linux/smp.h>
10512 +#include <linux/interrupt.h>
10513 +#include <linux/init.h>
10514 +#include <linux/tty.h>
10515 +#include <linux/vt_kern.h> /* For unblank_screen() */
10516 +#include <linux/compiler.h>
10517 +#include <linux/highmem.h>
10518 +#include <linux/bootmem.h> /* for max_low_pfn */
10519 +#include <linux/vmalloc.h>
10520 +#include <linux/module.h>
10521 +#include <linux/kprobes.h>
10522 +#include <linux/uaccess.h>
10523 +#include <linux/kdebug.h>
10524 +
10525 +#include <asm/system.h>
10526 +#include <asm/desc.h>
10527 +#include <asm/segment.h>
10528 +#include <asm/pgalloc.h>
10529 +#include <asm/smp.h>
10530 +#include <asm/tlbflush.h>
10531 +#include <asm/proto.h>
10532 +#include <asm-generic/sections.h>
10533 +
10534 +/*
10535 + * Page fault error code bits
10536 + * bit 0 == 0 means no page found, 1 means protection fault
10537 + * bit 1 == 0 means read, 1 means write
10538 + * bit 2 == 0 means kernel, 1 means user-mode
10539 + * bit 3 == 1 means use of reserved bit detected
10540 + * bit 4 == 1 means fault was an instruction fetch
10541 + */
10542 +#define PF_PROT (1<<0)
10543 +#define PF_WRITE (1<<1)
10544 +#define PF_USER (1<<2)
10545 +#define PF_RSVD (1<<3)
10546 +#define PF_INSTR (1<<4)
10547 +
10548 +static inline int notify_page_fault(struct pt_regs *regs)
10549 +{
10550 +#ifdef CONFIG_KPROBES
10551 + int ret = 0;
10552 +
10553 + /* kprobe_running() needs smp_processor_id() */
10554 +#ifdef CONFIG_X86_32
10555 + if (!user_mode_vm(regs)) {
10556 +#else
10557 + if (!user_mode(regs)) {
10558 +#endif
10559 + preempt_disable();
10560 + if (kprobe_running() && kprobe_fault_handler(regs, 14))
10561 + ret = 1;
10562 + preempt_enable();
10563 + }
10564 +
10565 + return ret;
10566 +#else
10567 + return 0;
10568 +#endif
10569 +}
10570 +
10571 +/*
10572 + * X86_32
10573 + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
10574 + * Check that here and ignore it.
10575 + *
10576 + * X86_64
10577 + * Sometimes the CPU reports invalid exceptions on prefetch.
10578 + * Check that here and ignore it.
10579 + *
10580 + * Opcode checker based on code by Richard Brunner
10581 + */
10582 +static int is_prefetch(struct pt_regs *regs, unsigned long addr,
10583 + unsigned long error_code)
10584 +{
10585 + unsigned char *instr;
10586 + int scan_more = 1;
10587 + int prefetch = 0;
10588 + unsigned char *max_instr;
10589 +
10590 + /*
10591 + * If it was an exec (instruction fetch) fault on an NX page, then
10592 + * do not ignore the fault:
10593 + */
10594 + if (error_code & PF_INSTR)
10595 + return 0;
10596 +
10597 + instr = (unsigned char *)convert_ip_to_linear(current, regs);
10598 + max_instr = instr + 15;
10599 +
10600 + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
10601 + return 0;
10602 +
10603 + while (scan_more && instr < max_instr) {
10604 + unsigned char opcode;
10605 + unsigned char instr_hi;
10606 + unsigned char instr_lo;
10607 +
10608 + if (probe_kernel_address(instr, opcode))
10609 + break;
10610 +
10611 + instr_hi = opcode & 0xf0;
10612 + instr_lo = opcode & 0x0f;
10613 + instr++;
10614 +
10615 + switch (instr_hi) {
10616 + case 0x20:
10617 + case 0x30:
10618 + /*
10619 + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
10620 + * In X86_64 long mode, the CPU will signal invalid
10621 + * opcode if some of these prefixes are present so
10622 + * X86_64 will never get here anyway
10623 + */
10624 + scan_more = ((instr_lo & 7) == 0x6);
10625 + break;
10626 +#ifdef CONFIG_X86_64
10627 + case 0x40:
10628 + /*
10629 + * In AMD64 long mode 0x40..0x4F are valid REX prefixes
10630 + * Need to figure out under what instruction mode the
10631 + * instruction was issued. Could check the LDT for lm,
10632 + * but for now it's good enough to assume that long
10633 + * mode only uses well known segments or kernel.
10634 + */
10635 + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
10636 + break;
10637 +#endif
10638 + case 0x60:
10639 + /* 0x64 thru 0x67 are valid prefixes in all modes. */
10640 + scan_more = (instr_lo & 0xC) == 0x4;
10641 + break;
10642 + case 0xF0:
10643 + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
10644 + scan_more = !instr_lo || (instr_lo>>1) == 1;
10645 + break;
10646 + case 0x00:
10647 + /* Prefetch instruction is 0x0F0D or 0x0F18 */
10648 + scan_more = 0;
10649 +
10650 + if (probe_kernel_address(instr, opcode))
10651 + break;
10652 + prefetch = (instr_lo == 0xF) &&
10653 + (opcode == 0x0D || opcode == 0x18);
10654 + break;
10655 + default:
10656 + scan_more = 0;
10657 + break;
10658 + }
10659 + }
10660 + return prefetch;
10661 +}
10662 +
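The opcode walk in is_prefetch() above skips legal instruction prefixes and then looks for the two-byte AMD prefetch opcodes 0x0F 0x0D and 0x0F 0x18, so that bogus faults raised on prefetches can be ignored. Below is a simplified, self-contained user-space sketch of the same scan; it assumes 64-bit mode for the REX case, omits the user/kernel segment checks, and looks_like_prefetch is an invented name:

#include <stdio.h>
#include <stddef.h>

static int looks_like_prefetch(const unsigned char *instr, size_t len)
{
        size_t i = 0;

        while (i < len) {
                unsigned char op = instr[i++];

                switch (op & 0xf0) {
                case 0x20: case 0x30:
                        if ((op & 7) == 6)      /* 0x26/0x2E/0x36/0x3E segment overrides */
                                continue;
                        return 0;
                case 0x40:                      /* REX prefixes; valid in 64-bit mode only */
                        continue;
                case 0x60:
                        if ((op & 0x0c) == 0x04)        /* 0x64..0x67 prefixes */
                                continue;
                        return 0;
                case 0xf0:
                        if (op == 0xf0 || op == 0xf2 || op == 0xf3)     /* LOCK/REPNE/REP */
                                continue;
                        return 0;
                case 0x00:                      /* possible 0x0F two-byte opcode escape */
                        if ((op & 0x0f) == 0x0f && i < len)
                                return instr[i] == 0x0d || instr[i] == 0x18;
                        return 0;
                default:
                        return 0;
                }
        }
        return 0;
}

int main(void)
{
        const unsigned char prefetchnta[] = { 0x0f, 0x18, 0x06 };      /* prefetchnta (%rsi) */
        const unsigned char mov[]         = { 0x89, 0xd8 };            /* mov %ebx,%eax */

        printf("%d %d\n", looks_like_prefetch(prefetchnta, sizeof(prefetchnta)),
                          looks_like_prefetch(mov, sizeof(mov)));      /* prints "1 0" */
        return 0;
}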
10663 +static void force_sig_info_fault(int si_signo, int si_code,
10664 + unsigned long address, struct task_struct *tsk)
10665 +{
10666 + siginfo_t info;
10667 +
10668 + info.si_signo = si_signo;
10669 + info.si_errno = 0;
10670 + info.si_code = si_code;
10671 + info.si_addr = (void __user *)address;
10672 + force_sig_info(si_signo, &info, tsk);
10673 +}
10674 +
10675 +#ifdef CONFIG_X86_64
10676 +static int bad_address(void *p)
10677 +{
10678 + unsigned long dummy;
10679 + return probe_kernel_address((unsigned long *)p, dummy);
10680 +}
10681 +#endif
10682 +
10683 +static void dump_pagetable(unsigned long address)
10684 +{
10685 +#ifdef CONFIG_X86_32
10686 + __typeof__(pte_val(__pte(0))) page;
10687 +
10688 + page = read_cr3();
10689 + page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
10690 +#ifdef CONFIG_X86_PAE
10691 + printk("*pdpt = %016Lx ", page);
10692 + if ((page & _PAGE_PRESENT)
10693 + && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn) {
10694 + page = mfn_to_pfn(page >> PAGE_SHIFT);
10695 + page <<= PAGE_SHIFT;
10696 + page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
10697 + & (PTRS_PER_PMD - 1)];
10698 + printk(KERN_CONT "*pde = %016Lx ", page);
10699 + page &= ~_PAGE_NX;
10700 + }
10701 +#else
10702 + printk("*pde = %08lx ", page);
10703 +#endif
10704 +
10705 + /*
10706 + * We must not directly access the pte in the highpte
10707 + * case if the page table is located in highmem.
10708 + * And let's rather not kmap-atomic the pte, just in case
10709 + * it's allocated already.
10710 + */
10711 + if ((page & _PAGE_PRESENT)
10712 + && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn
10713 + && !(page & _PAGE_PSE)) {
10714 + page = mfn_to_pfn(page >> PAGE_SHIFT);
10715 + page <<= PAGE_SHIFT;
10716 + page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
10717 + & (PTRS_PER_PTE - 1)];
10718 + printk(KERN_CONT "*pte = %0*Lx ", sizeof(page)*2, (u64)page);
10719 + }
10720 +
10721 + printk(KERN_CONT "\n");
10722 +#else /* CONFIG_X86_64 */
10723 + pgd_t *pgd;
10724 + pud_t *pud;
10725 + pmd_t *pmd;
10726 + pte_t *pte;
10727 +
10728 + pgd = (pgd_t *)read_cr3();
10729 +
10730 + pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
10731 + pgd += pgd_index(address);
10732 + if (bad_address(pgd)) goto bad;
10733 + printk("PGD %lx ", pgd_val(*pgd));
10734 + if (!pgd_present(*pgd)) goto ret;
10735 +
10736 + pud = pud_offset(pgd, address);
10737 + if (bad_address(pud)) goto bad;
10738 + printk(KERN_CONT "PUD %lx ", pud_val(*pud));
10739 + if (!pud_present(*pud) || pud_large(*pud))
10740 + goto ret;
10741 +
10742 + pmd = pmd_offset(pud, address);
10743 + if (bad_address(pmd)) goto bad;
10744 + printk(KERN_CONT "PMD %lx ", pmd_val(*pmd));
10745 + if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
10746 +
10747 + pte = pte_offset_kernel(pmd, address);
10748 + if (bad_address(pte)) goto bad;
10749 + printk(KERN_CONT "PTE %lx", pte_val(*pte));
10750 +ret:
10751 + printk(KERN_CONT "\n");
10752 + return;
10753 +bad:
10754 + printk("BAD\n");
10755 +#endif
10756 +}
10757 +
10758 +#ifdef CONFIG_X86_32
10759 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
10760 +{
10761 + unsigned index = pgd_index(address);
10762 + pgd_t *pgd_k;
10763 + pud_t *pud, *pud_k;
10764 + pmd_t *pmd, *pmd_k;
10765 +
10766 + pgd += index;
10767 + pgd_k = init_mm.pgd + index;
10768 +
10769 + if (!pgd_present(*pgd_k))
10770 + return NULL;
10771 +
10772 + /*
10773 + * set_pgd(pgd, *pgd_k); here would be useless on PAE
10774 + * and redundant with the set_pmd() on non-PAE. As would
10775 + * set_pud.
10776 + */
10777 +
10778 + pud = pud_offset(pgd, address);
10779 + pud_k = pud_offset(pgd_k, address);
10780 + if (!pud_present(*pud_k))
10781 + return NULL;
10782 +
10783 + pmd = pmd_offset(pud, address);
10784 + pmd_k = pmd_offset(pud_k, address);
10785 + if (!pmd_present(*pmd_k))
10786 + return NULL;
10787 + if (!pmd_present(*pmd)) {
10788 + bool lazy = x86_read_percpu(xen_lazy_mmu);
10789 +
10790 + x86_write_percpu(xen_lazy_mmu, false);
10791 +#if CONFIG_XEN_COMPAT > 0x030002
10792 + set_pmd(pmd, *pmd_k);
10793 +#else
10794 + /*
10795 + * When running on older Xen we must launder *pmd_k through
10796 + * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
10797 + */
10798 + set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
10799 +#endif
10800 + x86_write_percpu(xen_lazy_mmu, lazy);
10801 + } else
10802 + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
10803 + return pmd_k;
10804 +}
10805 +#endif
10806 +
10807 +#ifdef CONFIG_X86_64
10808 +static const char errata93_warning[] =
10809 +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
10810 +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
10811 +KERN_ERR "******* Please consider a BIOS update.\n"
10812 +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
10813 +#endif
10814 +
10815 +/* Workaround for K8 erratum #93 & buggy BIOS.
10816 + BIOS SMM functions are required to use a specific workaround
10817 + to avoid corruption of the 64bit RIP register on C stepping K8.
10818 +   A lot of BIOSes that didn't get tested properly miss this.
10819 +   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
10820 +   Try to work around it here.
10821 +   Note that we only handle kernel faults here.
10822 +   Does nothing for X86_32.
10823 + */
10824 +static int is_errata93(struct pt_regs *regs, unsigned long address)
10825 +{
10826 +#ifdef CONFIG_X86_64
10827 + static int warned;
10828 + if (address != regs->ip)
10829 + return 0;
10830 + if ((address >> 32) != 0)
10831 + return 0;
10832 + address |= 0xffffffffUL << 32;
10833 + if ((address >= (u64)_stext && address <= (u64)_etext) ||
10834 + (address >= MODULES_VADDR && address <= MODULES_END)) {
10835 + if (!warned) {
10836 + printk(errata93_warning);
10837 + warned = 1;
10838 + }
10839 + regs->ip = address;
10840 + return 1;
10841 + }
10842 +#endif
10843 + return 0;
10844 +}
10845 +
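Concretely, the erratum #93 fix-up above ORs the lost upper 32 bits back into the faulting address and then checks whether the result lands in kernel text or the module range. A tiny sketch of that arithmetic (the address value is hypothetical, and a 64-bit build where unsigned long is 64 bits is assumed):

#include <stdio.h>

int main(void)
{
        unsigned long truncated = 0x80123456UL;                 /* hypothetical RIP with the upper bits lost */
        unsigned long repaired  = truncated | (0xffffffffUL << 32);

        printf("%#lx -> %#lx\n", truncated, repaired);          /* 0x80123456 -> 0xffffffff80123456 */
        return 0;
}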
10846 +/*
10847 + * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal
10848 + * addresses >4GB. We catch this in the page fault handler because these
10849 + * addresses are not reachable. Just detect this case and return. Any code
10850 + * segment in LDT is compatibility mode.
10851 + */
10852 +static int is_errata100(struct pt_regs *regs, unsigned long address)
10853 +{
10854 +#ifdef CONFIG_X86_64
10855 + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
10856 + (address >> 32))
10857 + return 1;
10858 +#endif
10859 + return 0;
10860 +}
10861 +
10862 +void do_invalid_op(struct pt_regs *, unsigned long);
10863 +
10864 +static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
10865 +{
10866 +#ifdef CONFIG_X86_F00F_BUG
10867 + unsigned long nr;
10868 + /*
10869 + * Pentium F0 0F C7 C8 bug workaround.
10870 + */
10871 + if (boot_cpu_data.f00f_bug) {
10872 + nr = (address - idt_descr.address) >> 3;
10873 +
10874 + if (nr == 6) {
10875 + do_invalid_op(regs, 0);
10876 + return 1;
10877 + }
10878 + }
10879 +#endif
10880 + return 0;
10881 +}
10882 +
10883 +static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
10884 + unsigned long address)
10885 +{
10886 +#ifdef CONFIG_X86_32
10887 + if (!oops_may_print())
10888 + return;
10889 +#endif
10890 +
10891 +#ifdef CONFIG_X86_PAE
10892 + if (error_code & PF_INSTR) {
10893 + unsigned int level;
10894 + pte_t *pte = lookup_address(address, &level);
10895 +
10896 + if (pte && pte_present(*pte) && !pte_exec(*pte))
10897 + printk(KERN_CRIT "kernel tried to execute "
10898 + "NX-protected page - exploit attempt? "
10899 + "(uid: %d)\n", current->uid);
10900 + }
10901 +#endif
10902 +
10903 + printk(KERN_ALERT "BUG: unable to handle kernel ");
10904 + if (address < PAGE_SIZE)
10905 + printk(KERN_CONT "NULL pointer dereference");
10906 + else
10907 + printk(KERN_CONT "paging request");
10908 +#ifdef CONFIG_X86_32
10909 + printk(KERN_CONT " at %08lx\n", address);
10910 +#else
10911 + printk(KERN_CONT " at %016lx\n", address);
10912 +#endif
10913 + printk(KERN_ALERT "IP:");
10914 + printk_address(regs->ip, 1);
10915 + dump_pagetable(address);
10916 +}
10917 +
10918 +#ifdef CONFIG_X86_64
10919 +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
10920 + unsigned long error_code)
10921 +{
10922 + unsigned long flags = oops_begin();
10923 + struct task_struct *tsk;
10924 +
10925 + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
10926 + current->comm, address);
10927 + dump_pagetable(address);
10928 + tsk = current;
10929 + tsk->thread.cr2 = address;
10930 + tsk->thread.trap_no = 14;
10931 + tsk->thread.error_code = error_code;
10932 + if (__die("Bad pagetable", regs, error_code))
10933 + regs = NULL;
10934 + oops_end(flags, regs, SIGKILL);
10935 +}
10936 +#endif
10937 +
10938 +static int spurious_fault_check(unsigned long error_code, pte_t *pte)
10939 +{
10940 + if ((error_code & PF_WRITE) && !pte_write(*pte))
10941 + return 0;
10942 + if ((error_code & PF_INSTR) && !pte_exec(*pte))
10943 + return 0;
10944 +
10945 + return 1;
10946 +}
10947 +
10948 +/*
10949 + * Handle a spurious fault caused by a stale TLB entry. This allows
10950 + * us to lazily refresh the TLB when increasing the permissions of a
10951 + * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
10952 + * expensive since that implies doing a full cross-processor TLB
10953 + * flush, even if no stale TLB entries exist on other processors.
10954 + * There are no security implications to leaving a stale TLB when
10955 + * increasing the permissions on a page.
10956 + */
10957 +static int spurious_fault(unsigned long address,
10958 + unsigned long error_code)
10959 +{
10960 + pgd_t *pgd;
10961 + pud_t *pud;
10962 + pmd_t *pmd;
10963 + pte_t *pte;
10964 +
10965 + /* Reserved-bit violation or user access to kernel space? */
10966 + if (error_code & (PF_USER | PF_RSVD))
10967 + return 0;
10968 +
10969 + pgd = init_mm.pgd + pgd_index(address);
10970 + if (!pgd_present(*pgd))
10971 + return 0;
10972 +
10973 + pud = pud_offset(pgd, address);
10974 + if (!pud_present(*pud))
10975 + return 0;
10976 +
10977 + if (pud_large(*pud))
10978 + return spurious_fault_check(error_code, (pte_t *) pud);
10979 +
10980 + pmd = pmd_offset(pud, address);
10981 + if (!pmd_present(*pmd))
10982 + return 0;
10983 +
10984 + if (pmd_large(*pmd))
10985 + return spurious_fault_check(error_code, (pte_t *) pmd);
10986 +
10987 + pte = pte_offset_kernel(pmd, address);
10988 + if (!pte_present(*pte))
10989 + return 0;
10990 +
10991 + return spurious_fault_check(error_code, pte);
10992 +}
10993 +
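As the comment above spurious_fault() explains, a fault is spurious when the live page-table entry already grants the access and only the TLB holds a stale, more restrictive copy. A self-contained sketch of the permission test (the F_*/E_* flag values are invented for illustration and are not the kernel's PF_* or _PAGE_* bits):

#include <stdio.h>

#define F_WRITE_FAULT (1 << 1)  /* the faulting access was a write */
#define F_INSTR_FAULT (1 << 4)  /* the faulting access was an instruction fetch */
#define E_WRITABLE    (1 << 0)  /* the live entry allows writes */
#define E_EXECUTABLE  (1 << 1)  /* the live entry allows execution */

static int fault_is_spurious(unsigned long error_code, unsigned long entry)
{
        if ((error_code & F_WRITE_FAULT) && !(entry & E_WRITABLE))
                return 0;
        if ((error_code & F_INSTR_FAULT) && !(entry & E_EXECUTABLE))
                return 0;
        return 1;               /* the entry already allows the access: only the TLB was stale */
}

int main(void)
{
        printf("%d\n", fault_is_spurious(F_WRITE_FAULT, E_WRITABLE));   /* 1: RO->RW upgrade already visible */
        printf("%d\n", fault_is_spurious(F_WRITE_FAULT, 0));            /* 0: genuine write-protection fault */
        return 0;
}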
10994 +/*
10995 + * X86_32
10996 + * Handle a fault on the vmalloc or module mapping area
10997 + *
10998 + * X86_64
10999 + * Handle a fault on the vmalloc area
11000 + *
11001 + * This assumes no large pages in there.
11002 + */
11003 +static int vmalloc_fault(unsigned long address)
11004 +{
11005 +#ifdef CONFIG_X86_32
11006 + unsigned long pgd_paddr;
11007 + pmd_t *pmd_k;
11008 + pte_t *pte_k;
11009 + /*
11010 + * Synchronize this task's top level page-table
11011 + * with the 'reference' page table.
11012 + *
11013 + * Do _not_ use "current" here. We might be inside
11014 + * an interrupt in the middle of a task switch..
11015 + */
11016 + pgd_paddr = read_cr3();
11017 + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
11018 + if (!pmd_k)
11019 + return -1;
11020 + pte_k = pte_offset_kernel(pmd_k, address);
11021 + if (!pte_present(*pte_k))
11022 + return -1;
11023 + return 0;
11024 +#else
11025 + pgd_t *pgd, *pgd_ref;
11026 + pud_t *pud, *pud_ref;
11027 + pmd_t *pmd, *pmd_ref;
11028 + pte_t *pte, *pte_ref;
11029 +
11030 + /* Make sure we are in vmalloc area */
11031 + if (!(address >= VMALLOC_START && address < VMALLOC_END))
11032 + return -1;
11033 +
11034 + /* Copy kernel mappings over when needed. This can also
11035 + happen within a race in page table update. In the latter
11036 + case just flush. */
11037 +
11038 + /* On Xen the line below does not always work. Needs investigating! */
11039 + /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
11040 + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
11041 + pgd += pgd_index(address);
11042 + pgd_ref = pgd_offset_k(address);
11043 + if (pgd_none(*pgd_ref))
11044 + return -1;
11045 + if (pgd_none(*pgd))
11046 + set_pgd(pgd, *pgd_ref);
11047 + else
11048 + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
11049 +
11050 + /* Below here mismatches are bugs because these lower tables
11051 + are shared */
11052 +
11053 + pud = pud_offset(pgd, address);
11054 + pud_ref = pud_offset(pgd_ref, address);
11055 + if (pud_none(*pud_ref))
11056 + return -1;
11057 + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
11058 + BUG();
11059 + pmd = pmd_offset(pud, address);
11060 + pmd_ref = pmd_offset(pud_ref, address);
11061 + if (pmd_none(*pmd_ref))
11062 + return -1;
11063 + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
11064 + BUG();
11065 + pte_ref = pte_offset_kernel(pmd_ref, address);
11066 + if (!pte_present(*pte_ref))
11067 + return -1;
11068 + pte = pte_offset_kernel(pmd, address);
11069 + /* Don't use pte_page here, because the mappings can point
11070 + outside mem_map, and the NUMA hash lookup cannot handle
11071 + that. */
11072 + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
11073 + BUG();
11074 + return 0;
11075 +#endif
11076 +}
11077 +
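vmalloc_fault() exists because kernel mappings created in the reference page table (init_mm.pgd) are copied into a task's own top-level table only on first touch. A conceptual sketch of that lazy propagation, with plain arrays standing in for page directories (sync_on_fault and the array names are invented):

#include <stdio.h>

#define ENTRIES 8

static unsigned long reference_pgd[ENTRIES];   /* stands in for init_mm.pgd */
static unsigned long process_pgd[ENTRIES];     /* stands in for one task's page directory */

static int sync_on_fault(unsigned long *pgd, const unsigned long *ref, int idx)
{
        if (!ref[idx])
                return -1;                     /* not in the reference table either: a real fault */
        if (!pgd[idx])
                pgd[idx] = ref[idx];           /* copy the missing top-level entry on demand */
        return 0;
}

int main(void)
{
        int ret;

        reference_pgd[3] = 0xabc;              /* a "vmalloc" mapping created only globally */

        ret = sync_on_fault(process_pgd, reference_pgd, 3);
        printf("%d %#lx\n", ret, process_pgd[3]);                       /* 0 0xabc */
        printf("%d\n", sync_on_fault(process_pgd, reference_pgd, 5));   /* -1: unmapped everywhere */
        return 0;
}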
11078 +int show_unhandled_signals = 1;
11079 +
11080 +/*
11081 + * This routine handles page faults. It determines the address,
11082 + * and the problem, and then passes it off to one of the appropriate
11083 + * routines.
11084 + */
11085 +#ifdef CONFIG_X86_64
11086 +asmlinkage
11087 +#endif
11088 +void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
11089 +{
11090 + struct task_struct *tsk;
11091 + struct mm_struct *mm;
11092 + struct vm_area_struct *vma;
11093 + unsigned long address;
11094 + int write, si_code;
11095 + int fault;
11096 +#ifdef CONFIG_X86_64
11097 + unsigned long flags;
11098 +#endif
11099 +
11100 + /*
11101 + * We can fault from pretty much anywhere, with unknown IRQ state.
11102 + */
11103 + trace_hardirqs_fixup();
11104 +
11105 + /* Set the "privileged fault" bit to something sane. */
11106 + if (user_mode_vm(regs))
11107 + error_code |= PF_USER;
11108 + else
11109 + error_code &= ~PF_USER;
11110 +
11111 + tsk = current;
11112 + mm = tsk->mm;
11113 + prefetchw(&mm->mmap_sem);
11114 +
11115 + /* get the address */
11116 + address = read_cr2();
11117 +
11118 + si_code = SEGV_MAPERR;
11119 +
11120 + if (notify_page_fault(regs))
11121 + return;
11122 +
11123 + /*
11124 + * We fault-in kernel-space virtual memory on-demand. The
11125 + * 'reference' page table is init_mm.pgd.
11126 + *
11127 + * NOTE! We MUST NOT take any locks for this case. We may
11128 + * be in an interrupt or a critical region, and should
11129 + * only copy the information from the master page table,
11130 + * nothing more.
11131 + *
11132 + * This verifies that the fault happens in kernel space
11133 + * (error_code & 4) == 0, and that the fault was not a
11134 + * protection error (error_code & 9) == 0.
11135 + */
11136 +#ifdef CONFIG_X86_32
11137 + if (unlikely(address >= TASK_SIZE)) {
11138 +#else
11139 + if (unlikely(address >= TASK_SIZE64)) {
11140 +#endif
11141 + /* Faults in hypervisor area can never be patched up. */
11142 +#if defined(CONFIG_X86_XEN)
11143 + if (address >= hypervisor_virt_start)
11144 + goto bad_area_nosemaphore;
11145 +#elif defined(CONFIG_X86_64_XEN)
11146 + if (address >= HYPERVISOR_VIRT_START
11147 + && address < HYPERVISOR_VIRT_END)
11148 + goto bad_area_nosemaphore;
11149 +#endif
11150 + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
11151 + vmalloc_fault(address) >= 0)
11152 + return;
11153 +
11154 + /* Can handle a stale RO->RW TLB */
11155 + if (spurious_fault(address, error_code))
11156 + return;
11157 +
11158 + /*
11159 + * Don't take the mm semaphore here. If we fixup a prefetch
11160 + * fault we could otherwise deadlock.
11161 + */
11162 + goto bad_area_nosemaphore;
11163 + }
11164 +
11165 +
11166 +#ifdef CONFIG_X86_32
11167 + /* It's safe to allow irq's after cr2 has been saved and the vmalloc
11168 + fault has been handled. */
11169 + if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
11170 + local_irq_enable();
11171 +
11172 + /*
11173 + * If we're in an interrupt, have no user context or are running in an
11174 + * atomic region then we must not take the fault.
11175 + */
11176 + if (in_atomic() || !mm)
11177 + goto bad_area_nosemaphore;
11178 +#else /* CONFIG_X86_64 */
11179 + if (likely(regs->flags & X86_EFLAGS_IF))
11180 + local_irq_enable();
11181 +
11182 + if (unlikely(error_code & PF_RSVD))
11183 + pgtable_bad(address, regs, error_code);
11184 +
11185 + /*
11186 + * If we're in an interrupt, have no user context or are running in an
11187 + * atomic region then we must not take the fault.
11188 + */
11189 + if (unlikely(in_atomic() || !mm))
11190 + goto bad_area_nosemaphore;
11191 +
11192 + /*
11193 + * User-mode registers count as a user access even for any
11194 + * potential system fault or CPU buglet.
11195 + */
11196 + if (user_mode_vm(regs))
11197 + error_code |= PF_USER;
11198 +again:
11199 +#endif
11200 + /* When running in the kernel we expect faults to occur only to
11201 + * addresses in user space. All other faults represent errors in the
11202 + * kernel and should generate an OOPS. Unfortunately, in the case of an
11203 + * erroneous fault occurring in a code path which already holds mmap_sem
11204 + * we will deadlock attempting to validate the fault against the
11205 + * address space. Luckily the kernel only validly references user
11206 + * space from well defined areas of code, which are listed in the
11207 + * exceptions table.
11208 + *
11209 + * As the vast majority of faults will be valid we will only perform
11210 + * the source reference check when there is a possibility of a deadlock.
11211 + * Attempt to lock the address space, if we cannot we then validate the
11212 + * source. If this is invalid we can skip the address space check,
11213 + * thus avoiding the deadlock.
11214 + */
11215 + if (!down_read_trylock(&mm->mmap_sem)) {
11216 + if ((error_code & PF_USER) == 0 &&
11217 + !search_exception_tables(regs->ip))
11218 + goto bad_area_nosemaphore;
11219 + down_read(&mm->mmap_sem);
11220 + }
11221 +
11222 + vma = find_vma(mm, address);
11223 + if (!vma)
11224 + goto bad_area;
11225 + if (vma->vm_start <= address)
11226 + goto good_area;
11227 + if (!(vma->vm_flags & VM_GROWSDOWN))
11228 + goto bad_area;
11229 + if (error_code & PF_USER) {
11230 + /*
11231 + * Accessing the stack below %sp is always a bug.
11232 + * The large cushion allows instructions like enter
11233 + * and pusha to work. ("enter $65535,$31" pushes
11234 + * 32 pointers and then decrements %sp by 65535.)
11235 + */
11236 + if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
11237 + goto bad_area;
11238 + }
11239 + if (expand_stack(vma, address))
11240 + goto bad_area;
11241 +/*
11242 + * Ok, we have a good vm_area for this memory access, so
11243 + * we can handle it..
11244 + */
11245 +good_area:
11246 + si_code = SEGV_ACCERR;
11247 + write = 0;
11248 + switch (error_code & (PF_PROT|PF_WRITE)) {
11249 + default: /* 3: write, present */
11250 + /* fall through */
11251 + case PF_WRITE: /* write, not present */
11252 + if (!(vma->vm_flags & VM_WRITE))
11253 + goto bad_area;
11254 + write++;
11255 + break;
11256 + case PF_PROT: /* read, present */
11257 + goto bad_area;
11258 + case 0: /* read, not present */
11259 + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
11260 + goto bad_area;
11261 + }
11262 +
11263 +#ifdef CONFIG_X86_32
11264 +survive:
11265 +#endif
11266 + /*
11267 + * If for any reason at all we couldn't handle the fault,
11268 + * make sure we exit gracefully rather than endlessly redo
11269 + * the fault.
11270 + */
11271 + fault = handle_mm_fault(mm, vma, address, write);
11272 + if (unlikely(fault & VM_FAULT_ERROR)) {
11273 + if (fault & VM_FAULT_OOM)
11274 + goto out_of_memory;
11275 + else if (fault & VM_FAULT_SIGBUS)
11276 + goto do_sigbus;
11277 + BUG();
11278 + }
11279 + if (fault & VM_FAULT_MAJOR)
11280 + tsk->maj_flt++;
11281 + else
11282 + tsk->min_flt++;
11283 +
11284 +#ifdef CONFIG_X86_32
11285 + /*
11286 + * Did it hit the DOS screen memory VA from vm86 mode?
11287 + */
11288 + if (v8086_mode(regs)) {
11289 + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
11290 + if (bit < 32)
11291 + tsk->thread.screen_bitmap |= 1 << bit;
11292 + }
11293 +#endif
11294 + up_read(&mm->mmap_sem);
11295 + return;
11296 +
11297 +/*
11298 + * Something tried to access memory that isn't in our memory map..
11299 + * Fix it, but check if it's kernel or user first..
11300 + */
11301 +bad_area:
11302 + up_read(&mm->mmap_sem);
11303 +
11304 +bad_area_nosemaphore:
11305 + /* User mode accesses just cause a SIGSEGV */
11306 + if (error_code & PF_USER) {
11307 + /*
11308 + * It's possible to have interrupts off here.
11309 + */
11310 + local_irq_enable();
11311 +
11312 + /*
11313 + * Valid to do another page fault here because this one came
11314 + * from user space.
11315 + */
11316 + if (is_prefetch(regs, address, error_code))
11317 + return;
11318 +
11319 + if (is_errata100(regs, address))
11320 + return;
11321 +
11322 + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
11323 + printk_ratelimit()) {
11324 + printk(
11325 +#ifdef CONFIG_X86_32
11326 + "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
11327 +#else
11328 + "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
11329 +#endif
11330 + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
11331 + tsk->comm, task_pid_nr(tsk), address, regs->ip,
11332 + regs->sp, error_code);
11333 + print_vma_addr(" in ", regs->ip);
11334 + printk("\n");
11335 + }
11336 +
11337 + tsk->thread.cr2 = address;
11338 + /* Kernel addresses are always protection faults */
11339 + tsk->thread.error_code = error_code | (address >= TASK_SIZE);
11340 + tsk->thread.trap_no = 14;
11341 + force_sig_info_fault(SIGSEGV, si_code, address, tsk);
11342 + return;
11343 + }
11344 +
11345 + if (is_f00f_bug(regs, address))
11346 + return;
11347 +
11348 +no_context:
11349 + /* Are we prepared to handle this kernel fault? */
11350 + if (fixup_exception(regs))
11351 + return;
11352 +
11353 + /*
11354 + * X86_32
11355 + * Valid to do another page fault here, because if this fault
11356 + * had been triggered by is_prefetch fixup_exception would have
11357 + * handled it.
11358 + *
11359 + * X86_64
11360 + * Hall of shame of CPU/BIOS bugs.
11361 + */
11362 + if (is_prefetch(regs, address, error_code))
11363 + return;
11364 +
11365 + if (is_errata93(regs, address))
11366 + return;
11367 +
11368 +/*
11369 + * Oops. The kernel tried to access some bad page. We'll have to
11370 + * terminate things with extreme prejudice.
11371 + */
11372 +#ifdef CONFIG_X86_32
11373 + bust_spinlocks(1);
11374 +#else
11375 + flags = oops_begin();
11376 +#endif
11377 +
11378 + show_fault_oops(regs, error_code, address);
11379 +
11380 + tsk->thread.cr2 = address;
11381 + tsk->thread.trap_no = 14;
11382 + tsk->thread.error_code = error_code;
11383 +
11384 +#ifdef CONFIG_X86_32
11385 + die("Oops", regs, error_code);
11386 + bust_spinlocks(0);
11387 + do_exit(SIGKILL);
11388 +#else
11389 + if (__die("Oops", regs, error_code))
11390 + regs = NULL;
11391 + /* Executive summary in case the body of the oops scrolled away */
11392 + printk(KERN_EMERG "CR2: %016lx\n", address);
11393 + oops_end(flags, regs, SIGKILL);
11394 +#endif
11395 +
11396 +/*
11397 + * We ran out of memory, or some other thing happened to us that made
11398 + * us unable to handle the page fault gracefully.
11399 + */
11400 +out_of_memory:
11401 + up_read(&mm->mmap_sem);
11402 + if (is_global_init(tsk)) {
11403 + yield();
11404 +#ifdef CONFIG_X86_32
11405 + down_read(&mm->mmap_sem);
11406 + goto survive;
11407 +#else
11408 + goto again;
11409 +#endif
11410 + }
11411 +
11412 + printk("VM: killing process %s\n", tsk->comm);
11413 + if (error_code & PF_USER)
11414 + do_group_exit(SIGKILL);
11415 + goto no_context;
11416 +
11417 +do_sigbus:
11418 + up_read(&mm->mmap_sem);
11419 +
11420 + /* Kernel mode? Handle exceptions or die */
11421 + if (!(error_code & PF_USER))
11422 + goto no_context;
11423 +#ifdef CONFIG_X86_32
11424 + /* User space => ok to do another page fault */
11425 + if (is_prefetch(regs, address, error_code))
11426 + return;
11427 +#endif
11428 + tsk->thread.cr2 = address;
11429 + tsk->thread.error_code = error_code;
11430 + tsk->thread.trap_no = 14;
11431 + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
11432 +}
11433 +
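The long comment above the down_read_trylock() call in do_page_fault() describes how the handler avoids self-deadlock on mmap_sem: try the lock first, and fall back to a blocking acquisition only when the fault came from user space or the faulting kernel instruction has an exception-table fixup. A rough analogue using POSIX rwlocks, with that condition collapsed into a single predicate (mmap_lock, fault_source_is_safe and lock_for_fault are invented names; build with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t mmap_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Stand-in for "came from user space or has an exception-table fixup". */
static int fault_source_is_safe(void)
{
        return 1;
}

static int lock_for_fault(void)
{
        if (pthread_rwlock_tryrdlock(&mmap_lock) == 0)
                return 0;                       /* fast path: the lock was free */
        if (!fault_source_is_safe())
                return -1;                      /* could deadlock: treat as a bad area */
        pthread_rwlock_rdlock(&mmap_lock);      /* known-safe source: OK to block */
        return 0;
}

int main(void)
{
        if (lock_for_fault() == 0) {
                puts("fault handled under the read lock");
                pthread_rwlock_unlock(&mmap_lock);
        }
        return 0;
}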
11434 +DEFINE_SPINLOCK(pgd_lock);
11435 +LIST_HEAD(pgd_list);
11436 +
11437 +void vmalloc_sync_all(void)
11438 +{
11439 +#ifdef CONFIG_X86_32
11440 + /*
11441 + * Note that races in the updates of insync and start aren't
11442 + * problematic: insync can only get set bits added, and updates to
11443 + * start are only improving performance (without affecting correctness
11444 + * if undone).
11445 + * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
11446 + * This change works just fine with 2-level paging too.
11447 + */
11448 +#define sync_index(a) ((a) >> PMD_SHIFT)
11449 + static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
11450 + static unsigned long start = TASK_SIZE;
11451 + unsigned long address;
11452 +
11453 + if (SHARED_KERNEL_PMD)
11454 + return;
11455 +
11456 + BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
11457 + for (address = start;
11458 + address < hypervisor_virt_start;
11459 + address += PMD_SIZE) {
11460 + if (!test_bit(sync_index(address), insync)) {
11461 + unsigned long flags;
11462 + struct page *page;
11463 +
11464 + spin_lock_irqsave(&pgd_lock, flags);
11465 + /* XEN: failure path assumes non-empty pgd_list. */
11466 + if (unlikely(list_empty(&pgd_list))) {
11467 + spin_unlock_irqrestore(&pgd_lock, flags);
11468 + return;
11469 + }
11470 + list_for_each_entry(page, &pgd_list, lru) {
11471 + if (!vmalloc_sync_one(page_address(page),
11472 + address))
11473 + break;
11474 + }
11475 + spin_unlock_irqrestore(&pgd_lock, flags);
11476 + if (!page)
11477 + set_bit(sync_index(address), insync);
11478 + }
11479 + if (address == start && test_bit(sync_index(address), insync))
11480 + start = address + PMD_SIZE;
11481 + }
11482 +#else /* CONFIG_X86_64 */
11483 + /*
11484 + * Note that races in the updates of insync and start aren't
11485 + * problematic: insync can only get set bits added, and updates to
11486 + * start are only improving performance (without affecting correctness
11487 + * if undone).
11488 + */
11489 + static DECLARE_BITMAP(insync, PTRS_PER_PGD);
11490 + static unsigned long start = VMALLOC_START & PGDIR_MASK;
11491 + unsigned long address;
11492 +
11493 + for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
11494 + if (!test_bit(pgd_index(address), insync)) {
11495 + const pgd_t *pgd_ref = pgd_offset_k(address);
11496 + unsigned long flags;
11497 + struct page *page;
11498 +
11499 + if (pgd_none(*pgd_ref))
11500 + continue;
11501 + spin_lock_irqsave(&pgd_lock, flags);
11502 + list_for_each_entry(page, &pgd_list, lru) {
11503 + pgd_t *pgd;
11504 + pgd = (pgd_t *)page_address(page) + pgd_index(address);
11505 + if (pgd_none(*pgd))
11506 + set_pgd(pgd, *pgd_ref);
11507 + else
11508 + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
11509 + }
11510 + spin_unlock_irqrestore(&pgd_lock, flags);
11511 + set_bit(pgd_index(address), insync);
11512 + }
11513 + if (address == start)
11514 + start = address + PGDIR_SIZE;
11515 + }
11516 + /* Check that there is no need to do the same for the modules area. */
11517 + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
11518 + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
11519 + (__START_KERNEL & PGDIR_MASK)));
11520 +#endif
11521 +}
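The two hunks that follow remove the old, separate fault_32-xen.c and fault_64-xen.c handlers, which the unified fault-xen.c above replaces. Before that, a small sketch of the "insync" bookkeeping used by vmalloc_sync_all(): once an address range has been propagated to every page table on pgd_list, its bit is set and later calls skip it. The array-based SLOTS model below is an invented stand-in for the kernel's bitmap and list walk:

#include <stdio.h>

#define SLOTS 16

static unsigned char insync[SLOTS];            /* stand-in for the DECLARE_BITMAP above */

/* Stand-in for walking pgd_list and calling vmalloc_sync_one() on each entry. */
static void sync_slot(int idx)
{
        printf("syncing slot %d\n", idx);
}

static void sync_all(void)
{
        int idx;

        for (idx = 0; idx < SLOTS; idx++) {
                if (insync[idx])
                        continue;              /* already propagated everywhere */
                sync_slot(idx);
                insync[idx] = 1;
        }
}

int main(void)
{
        sync_all();     /* first call does the work */
        sync_all();     /* second call finds every bit set and does nothing */
        return 0;
}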
11522 --- sle11-2009-05-14.orig/arch/x86/mm/fault_32-xen.c 2009-02-16 16:18:36.000000000 +0100
11523 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
11524 @@ -1,757 +0,0 @@
11525 -/*
11526 - * linux/arch/i386/mm/fault.c
11527 - *
11528 - * Copyright (C) 1995 Linus Torvalds
11529 - */
11530 -
11531 -#include <linux/signal.h>
11532 -#include <linux/sched.h>
11533 -#include <linux/kernel.h>
11534 -#include <linux/errno.h>
11535 -#include <linux/string.h>
11536 -#include <linux/types.h>
11537 -#include <linux/ptrace.h>
11538 -#include <linux/mman.h>
11539 -#include <linux/mm.h>
11540 -#include <linux/smp.h>
11541 -#include <linux/interrupt.h>
11542 -#include <linux/init.h>
11543 -#include <linux/tty.h>
11544 -#include <linux/vt_kern.h> /* For unblank_screen() */
11545 -#include <linux/highmem.h>
11546 -#include <linux/bootmem.h> /* for max_low_pfn */
11547 -#include <linux/vmalloc.h>
11548 -#include <linux/module.h>
11549 -#include <linux/kprobes.h>
11550 -#include <linux/uaccess.h>
11551 -#include <linux/kdebug.h>
11552 -#include <linux/kprobes.h>
11553 -
11554 -#include <asm/system.h>
11555 -#include <asm/desc.h>
11556 -#include <asm/segment.h>
11557 -
11558 -extern void die(const char *,struct pt_regs *,long);
11559 -
11560 -#ifdef CONFIG_KPROBES
11561 -static inline int notify_page_fault(struct pt_regs *regs)
11562 -{
11563 - int ret = 0;
11564 -
11565 - /* kprobe_running() needs smp_processor_id() */
11566 - if (!user_mode_vm(regs)) {
11567 - preempt_disable();
11568 - if (kprobe_running() && kprobe_fault_handler(regs, 14))
11569 - ret = 1;
11570 - preempt_enable();
11571 - }
11572 -
11573 - return ret;
11574 -}
11575 -#else
11576 -static inline int notify_page_fault(struct pt_regs *regs)
11577 -{
11578 - return 0;
11579 -}
11580 -#endif
11581 -
11582 -/*
11583 - * Return EIP plus the CS segment base. The segment limit is also
11584 - * adjusted, clamped to the kernel/user address space (whichever is
11585 - * appropriate), and returned in *eip_limit.
11586 - *
11587 - * The segment is checked, because it might have been changed by another
11588 - * task between the original faulting instruction and here.
11589 - *
11590 - * If CS is no longer a valid code segment, or if EIP is beyond the
11591 - * limit, or if it is a kernel address when CS is not a kernel segment,
11592 - * then the returned value will be greater than *eip_limit.
11593 - *
11594 - * This is slow, but is very rarely executed.
11595 - */
11596 -static inline unsigned long get_segment_eip(struct pt_regs *regs,
11597 - unsigned long *eip_limit)
11598 -{
11599 - unsigned long eip = regs->eip;
11600 - unsigned seg = regs->xcs & 0xffff;
11601 - u32 seg_ar, seg_limit, base, *desc;
11602 -
11603 - /* Unlikely, but must come before segment checks. */
11604 - if (unlikely(regs->eflags & VM_MASK)) {
11605 - base = seg << 4;
11606 - *eip_limit = base + 0xffff;
11607 - return base + (eip & 0xffff);
11608 - }
11609 -
11610 - /* The standard kernel/user address space limit. */
11611 - *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
11612 -
11613 - /* By far the most common cases. */
11614 - if (likely(SEGMENT_IS_FLAT_CODE(seg)))
11615 - return eip;
11616 -
11617 - /* Check the segment exists, is within the current LDT/GDT size,
11618 - that kernel/user (ring 0..3) has the appropriate privilege,
11619 - that it's a code segment, and get the limit. */
11620 - __asm__ ("larl %3,%0; lsll %3,%1"
11621 - : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
11622 - if ((~seg_ar & 0x9800) || eip > seg_limit) {
11623 - *eip_limit = 0;
11624 - return 1; /* So that returned eip > *eip_limit. */
11625 - }
11626 -
11627 - /* Get the GDT/LDT descriptor base.
11628 - When you look for races in this code remember that
11629 - LDT and other horrors are only used in user space. */
11630 - if (seg & (1<<2)) {
11631 - /* Must lock the LDT while reading it. */
11632 - mutex_lock(&current->mm->context.lock);
11633 - desc = current->mm->context.ldt;
11634 - desc = (void *)desc + (seg & ~7);
11635 - } else {
11636 - /* Must disable preemption while reading the GDT. */
11637 - desc = (u32 *)get_cpu_gdt_table(get_cpu());
11638 - desc = (void *)desc + (seg & ~7);
11639 - }
11640 -
11641 - /* Decode the code segment base from the descriptor */
11642 - base = get_desc_base((unsigned long *)desc);
11643 -
11644 - if (seg & (1<<2)) {
11645 - mutex_unlock(&current->mm->context.lock);
11646 - } else
11647 - put_cpu();
11648 -
11649 - /* Adjust EIP and segment limit, and clamp at the kernel limit.
11650 - It's legitimate for segments to wrap at 0xffffffff. */
11651 - seg_limit += base;
11652 - if (seg_limit < *eip_limit && seg_limit >= base)
11653 - *eip_limit = seg_limit;
11654 - return eip + base;
11655 -}
11656 -
11657 -/*
11658 - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
11659 - * Check that here and ignore it.
11660 - */
11661 -static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
11662 -{
11663 - unsigned long limit;
11664 - unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
11665 - int scan_more = 1;
11666 - int prefetch = 0;
11667 - int i;
11668 -
11669 - for (i = 0; scan_more && i < 15; i++) {
11670 - unsigned char opcode;
11671 - unsigned char instr_hi;
11672 - unsigned char instr_lo;
11673 -
11674 - if (instr > (unsigned char *)limit)
11675 - break;
11676 - if (probe_kernel_address(instr, opcode))
11677 - break;
11678 -
11679 - instr_hi = opcode & 0xf0;
11680 - instr_lo = opcode & 0x0f;
11681 - instr++;
11682 -
11683 - switch (instr_hi) {
11684 - case 0x20:
11685 - case 0x30:
11686 - /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
11687 - scan_more = ((instr_lo & 7) == 0x6);
11688 - break;
11689 -
11690 - case 0x60:
11691 - /* 0x64 thru 0x67 are valid prefixes in all modes. */
11692 - scan_more = (instr_lo & 0xC) == 0x4;
11693 - break;
11694 - case 0xF0:
11695 - /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
11696 - scan_more = !instr_lo || (instr_lo>>1) == 1;
11697 - break;
11698 - case 0x00:
11699 - /* Prefetch instruction is 0x0F0D or 0x0F18 */
11700 - scan_more = 0;
11701 - if (instr > (unsigned char *)limit)
11702 - break;
11703 - if (probe_kernel_address(instr, opcode))
11704 - break;
11705 - prefetch = (instr_lo == 0xF) &&
11706 - (opcode == 0x0D || opcode == 0x18);
11707 - break;
11708 - default:
11709 - scan_more = 0;
11710 - break;
11711 - }
11712 - }
11713 - return prefetch;
11714 -}
11715 -
11716 -static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
11717 - unsigned long error_code)
11718 -{
11719 - if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
11720 - boot_cpu_data.x86 >= 6)) {
11721 - /* Catch an obscure case of prefetch inside an NX page. */
11722 - if (nx_enabled && (error_code & 16))
11723 - return 0;
11724 - return __is_prefetch(regs, addr);
11725 - }
11726 - return 0;
11727 -}
11728 -
11729 -static noinline void force_sig_info_fault(int si_signo, int si_code,
11730 - unsigned long address, struct task_struct *tsk)
11731 -{
11732 - siginfo_t info;
11733 -
11734 - info.si_signo = si_signo;
11735 - info.si_errno = 0;
11736 - info.si_code = si_code;
11737 - info.si_addr = (void __user *)address;
11738 - force_sig_info(si_signo, &info, tsk);
11739 -}
11740 -
11741 -fastcall void do_invalid_op(struct pt_regs *, unsigned long);
11742 -
11743 -#ifdef CONFIG_X86_PAE
11744 -static void dump_fault_path(unsigned long address)
11745 -{
11746 - unsigned long *p, page;
11747 - unsigned long mfn;
11748 -
11749 - page = read_cr3();
11750 - p = (unsigned long *)__va(page);
11751 - p += (address >> 30) * 2;
11752 - printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
11753 - if (p[0] & _PAGE_PRESENT) {
11754 - mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
11755 - page = mfn_to_pfn(mfn) << PAGE_SHIFT;
11756 - p = (unsigned long *)__va(page);
11757 - address &= 0x3fffffff;
11758 - p += (address >> 21) * 2;
11759 - printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n",
11760 - page, p[1], p[0]);
11761 - mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
11762 -#ifdef CONFIG_HIGHPTE
11763 - if (mfn_to_pfn(mfn) >= highstart_pfn)
11764 - return;
11765 -#endif
11766 - if ((p[0] & _PAGE_PRESENT) && !(p[0] & _PAGE_PSE)) {
11767 - page = mfn_to_pfn(mfn) << PAGE_SHIFT;
11768 - p = (unsigned long *) __va(page);
11769 - address &= 0x001fffff;
11770 - p += (address >> 12) * 2;
11771 - printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
11772 - page, p[1], p[0]);
11773 - }
11774 - }
11775 -}
11776 -#else
11777 -static void dump_fault_path(unsigned long address)
11778 -{
11779 - unsigned long page;
11780 -
11781 - page = read_cr3();
11782 - page = ((unsigned long *) __va(page))[address >> PGDIR_SHIFT];
11783 - printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
11784 - machine_to_phys(page));
11785 - /*
11786 - * We must not directly access the pte in the highpte
11787 - * case if the page table is located in highmem.
11788 - * And lets rather not kmap-atomic the pte, just in case
11789 - * it's allocated already.
11790 - */
11791 - if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn
11792 - && (page & _PAGE_PRESENT)
11793 - && !(page & _PAGE_PSE)) {
11794 - page = machine_to_phys(page & PAGE_MASK);
11795 - page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT)
11796 - & (PTRS_PER_PTE - 1)];
11797 - printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
11798 - machine_to_phys(page));
11799 - }
11800 -}
11801 -#endif
11802 -
11803 -static int spurious_fault(struct pt_regs *regs,
11804 - unsigned long address,
11805 - unsigned long error_code)
11806 -{
11807 - pgd_t *pgd;
11808 - pud_t *pud;
11809 - pmd_t *pmd;
11810 - pte_t *pte;
11811 -
11812 - /* Reserved-bit violation or user access to kernel space? */
11813 - if (error_code & 0x0c)
11814 - return 0;
11815 -
11816 - pgd = init_mm.pgd + pgd_index(address);
11817 - if (!pgd_present(*pgd))
11818 - return 0;
11819 -
11820 - pud = pud_offset(pgd, address);
11821 - if (!pud_present(*pud))
11822 - return 0;
11823 -
11824 - pmd = pmd_offset(pud, address);
11825 - if (!pmd_present(*pmd))
11826 - return 0;
11827 -
11828 - pte = pte_offset_kernel(pmd, address);
11829 - if (!pte_present(*pte))
11830 - return 0;
11831 - if ((error_code & 0x02) && !pte_write(*pte))
11832 - return 0;
11833 -#ifdef CONFIG_X86_PAE
11834 - if ((error_code & 0x10) && (__pte_val(*pte) & _PAGE_NX))
11835 - return 0;
11836 -#endif
11837 -
11838 - return 1;
11839 -}
11840 -
11841 -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
11842 -{
11843 - unsigned index = pgd_index(address);
11844 - pgd_t *pgd_k;
11845 - pud_t *pud, *pud_k;
11846 - pmd_t *pmd, *pmd_k;
11847 -
11848 - pgd += index;
11849 - pgd_k = init_mm.pgd + index;
11850 -
11851 - if (!pgd_present(*pgd_k))
11852 - return NULL;
11853 -
11854 - /*
11855 - * set_pgd(pgd, *pgd_k); here would be useless on PAE
11856 - * and redundant with the set_pmd() on non-PAE. As would
11857 - * set_pud.
11858 - */
11859 -
11860 - pud = pud_offset(pgd, address);
11861 - pud_k = pud_offset(pgd_k, address);
11862 - if (!pud_present(*pud_k))
11863 - return NULL;
11864 -
11865 - pmd = pmd_offset(pud, address);
11866 - pmd_k = pmd_offset(pud_k, address);
11867 - if (!pmd_present(*pmd_k))
11868 - return NULL;
11869 - if (!pmd_present(*pmd)) {
11870 - bool lazy = x86_read_percpu(xen_lazy_mmu);
11871 -
11872 - x86_write_percpu(xen_lazy_mmu, false);
11873 -#if CONFIG_XEN_COMPAT > 0x030002
11874 - set_pmd(pmd, *pmd_k);
11875 -#else
11876 - /*
11877 - * When running on older Xen we must launder *pmd_k through
11878 - * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
11879 - */
11880 - set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
11881 -#endif
11882 - x86_write_percpu(xen_lazy_mmu, lazy);
11883 - } else
11884 - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
11885 - return pmd_k;
11886 -}
11887 -
11888 -/*
11889 - * Handle a fault on the vmalloc or module mapping area
11890 - *
11891 - * This assumes no large pages in there.
11892 - */
11893 -static inline int vmalloc_fault(unsigned long address)
11894 -{
11895 - unsigned long pgd_paddr;
11896 - pmd_t *pmd_k;
11897 - pte_t *pte_k;
11898 - /*
11899 - * Synchronize this task's top level page-table
11900 - * with the 'reference' page table.
11901 - *
11902 - * Do _not_ use "current" here. We might be inside
11903 - * an interrupt in the middle of a task switch..
11904 - */
11905 - pgd_paddr = read_cr3();
11906 - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
11907 - if (!pmd_k)
11908 - return -1;
11909 - pte_k = pte_offset_kernel(pmd_k, address);
11910 - if (!pte_present(*pte_k))
11911 - return -1;
11912 - return 0;
11913 -}
11914 -
11915 -int show_unhandled_signals = 1;
11916 -
11917 -/*
11918 - * This routine handles page faults. It determines the address,
11919 - * and the problem, and then passes it off to one of the appropriate
11920 - * routines.
11921 - *
11922 - * error_code:
11923 - * bit 0 == 0 means no page found, 1 means protection fault
11924 - * bit 1 == 0 means read, 1 means write
11925 - * bit 2 == 0 means kernel, 1 means user-mode
11926 - * bit 3 == 1 means use of reserved bit detected
11927 - * bit 4 == 1 means fault was an instruction fetch
11928 - */
11929 -fastcall void __kprobes do_page_fault(struct pt_regs *regs,
11930 - unsigned long error_code)
11931 -{
11932 - struct task_struct *tsk;
11933 - struct mm_struct *mm;
11934 - struct vm_area_struct * vma;
11935 - unsigned long address;
11936 - int write, si_code;
11937 - int fault;
11938 -
11939 - /*
11940 - * We can fault from pretty much anywhere, with unknown IRQ state.
11941 - */
11942 - trace_hardirqs_fixup();
11943 -
11944 - /* get the address */
11945 - address = read_cr2();
11946 -
11947 - /* Set the "privileged fault" bit to something sane. */
11948 - error_code &= ~4;
11949 - error_code |= (regs->xcs & 2) << 1;
11950 - if (regs->eflags & X86_EFLAGS_VM)
11951 - error_code |= 4;
11952 -
11953 - tsk = current;
11954 -
11955 - si_code = SEGV_MAPERR;
11956 -
11957 - /*
11958 - * We fault-in kernel-space virtual memory on-demand. The
11959 - * 'reference' page table is init_mm.pgd.
11960 - *
11961 - * NOTE! We MUST NOT take any locks for this case. We may
11962 - * be in an interrupt or a critical region, and should
11963 - * only copy the information from the master page table,
11964 - * nothing more.
11965 - *
11966 - * This verifies that the fault happens in kernel space
11967 - * (error_code & 4) == 0, and that the fault was not a
11968 - * protection error (error_code & 9) == 0.
11969 - */
11970 - if (unlikely(address >= TASK_SIZE)) {
11971 -#ifdef CONFIG_XEN
11972 - /* Faults in hypervisor area can never be patched up. */
11973 - if (address >= hypervisor_virt_start)
11974 - goto bad_area_nosemaphore;
11975 -#endif
11976 - if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
11977 - return;
11978 - /* Can take a spurious fault if mapping changes R/O -> R/W. */
11979 - if (spurious_fault(regs, address, error_code))
11980 - return;
11981 - if (notify_page_fault(regs))
11982 - return;
11983 - /*
11984 - * Don't take the mm semaphore here. If we fixup a prefetch
11985 - * fault we could otherwise deadlock.
11986 - */
11987 - goto bad_area_nosemaphore;
11988 - }
11989 -
11990 - if (notify_page_fault(regs))
11991 - return;
11992 -
11993 - /* It's safe to allow irq's after cr2 has been saved and the vmalloc
11994 - fault has been handled. */
11995 - if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
11996 - local_irq_enable();
11997 -
11998 - mm = tsk->mm;
11999 -
12000 - /*
12001 - * If we're in an interrupt, have no user context or are running in an
12002 - * atomic region then we must not take the fault..
12003 - */
12004 - if (in_atomic() || !mm)
12005 - goto bad_area_nosemaphore;
12006 -
12007 - /* When running in the kernel we expect faults to occur only to
12008 - * addresses in user space. All other faults represent errors in the
12009 - * kernel and should generate an OOPS. Unfortunately, in the case of an
12010 - * erroneous fault occurring in a code path which already holds mmap_sem
12011 - * we will deadlock attempting to validate the fault against the
12012 - * address space. Luckily the kernel only validly references user
12013 - * space from well defined areas of code, which are listed in the
12014 - * exceptions table.
12015 - *
12016 - * As the vast majority of faults will be valid we will only perform
12017 - * the source reference check when there is a possibility of a deadlock.
12018 - * Attempt to lock the address space, if we cannot we then validate the
12019 - * source. If this is invalid we can skip the address space check,
12020 - * thus avoiding the deadlock.
12021 - */
12022 - if (!down_read_trylock(&mm->mmap_sem)) {
12023 - if ((error_code & 4) == 0 &&
12024 - !search_exception_tables(regs->eip))
12025 - goto bad_area_nosemaphore;
12026 - down_read(&mm->mmap_sem);
12027 - }
12028 -
12029 - vma = find_vma(mm, address);
12030 - if (!vma)
12031 - goto bad_area;
12032 - if (vma->vm_start <= address)
12033 - goto good_area;
12034 - if (!(vma->vm_flags & VM_GROWSDOWN))
12035 - goto bad_area;
12036 - if (error_code & 4) {
12037 - /*
12038 - * Accessing the stack below %esp is always a bug.
12039 - * The large cushion allows instructions like enter
12040 - * and pusha to work. ("enter $65535,$31" pushes
12041 - * 32 pointers and then decrements %esp by 65535.)
12042 - */
12043 - if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
12044 - goto bad_area;
12045 - }
12046 - if (expand_stack(vma, address))
12047 - goto bad_area;
12048 -/*
12049 - * Ok, we have a good vm_area for this memory access, so
12050 - * we can handle it..
12051 - */
12052 -good_area:
12053 - si_code = SEGV_ACCERR;
12054 - write = 0;
12055 - switch (error_code & 3) {
12056 - default: /* 3: write, present */
12057 - /* fall through */
12058 - case 2: /* write, not present */
12059 - if (!(vma->vm_flags & VM_WRITE))
12060 - goto bad_area;
12061 - write++;
12062 - break;
12063 - case 1: /* read, present */
12064 - goto bad_area;
12065 - case 0: /* read, not present */
12066 - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
12067 - goto bad_area;
12068 - }
12069 -
12070 - survive:
12071 - /*
12072 - * If for any reason at all we couldn't handle the fault,
12073 - * make sure we exit gracefully rather than endlessly redo
12074 - * the fault.
12075 - */
12076 - fault = handle_mm_fault(mm, vma, address, write);
12077 - if (unlikely(fault & VM_FAULT_ERROR)) {
12078 - if (fault & VM_FAULT_OOM)
12079 - goto out_of_memory;
12080 - else if (fault & VM_FAULT_SIGBUS)
12081 - goto do_sigbus;
12082 - BUG();
12083 - }
12084 - if (fault & VM_FAULT_MAJOR)
12085 - tsk->maj_flt++;
12086 - else
12087 - tsk->min_flt++;
12088 -
12089 - /*
12090 - * Did it hit the DOS screen memory VA from vm86 mode?
12091 - */
12092 - if (regs->eflags & VM_MASK) {
12093 - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
12094 - if (bit < 32)
12095 - tsk->thread.screen_bitmap |= 1 << bit;
12096 - }
12097 - up_read(&mm->mmap_sem);
12098 - return;
12099 -
12100 -/*
12101 - * Something tried to access memory that isn't in our memory map..
12102 - * Fix it, but check if it's kernel or user first..
12103 - */
12104 -bad_area:
12105 - up_read(&mm->mmap_sem);
12106 -
12107 -bad_area_nosemaphore:
12108 - /* User mode accesses just cause a SIGSEGV */
12109 - if (error_code & 4) {
12110 - /*
12111 - * It's possible to have interrupts off here.
12112 - */
12113 - local_irq_enable();
12114 -
12115 - /*
12116 - * Valid to do another page fault here because this one came
12117 - * from user space.
12118 - */
12119 - if (is_prefetch(regs, address, error_code))
12120 - return;
12121 -
12122 - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
12123 - printk_ratelimit()) {
12124 - printk("%s%s[%d]: segfault at %08lx eip %08lx "
12125 - "esp %08lx error %lx\n",
12126 - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
12127 - tsk->comm, task_pid_nr(tsk), address, regs->eip,
12128 - regs->esp, error_code);
12129 - }
12130 - tsk->thread.cr2 = address;
12131 - /* Kernel addresses are always protection faults */
12132 - tsk->thread.error_code = error_code | (address >= TASK_SIZE);
12133 - tsk->thread.trap_no = 14;
12134 - force_sig_info_fault(SIGSEGV, si_code, address, tsk);
12135 - return;
12136 - }
12137 -
12138 -#ifdef CONFIG_X86_F00F_BUG
12139 - /*
12140 - * Pentium F0 0F C7 C8 bug workaround.
12141 - */
12142 - if (boot_cpu_data.f00f_bug) {
12143 - unsigned long nr;
12144 -
12145 - nr = (address - idt_descr.address) >> 3;
12146 -
12147 - if (nr == 6) {
12148 - do_invalid_op(regs, 0);
12149 - return;
12150 - }
12151 - }
12152 -#endif
12153 -
12154 -no_context:
12155 - /* Are we prepared to handle this kernel fault? */
12156 - if (fixup_exception(regs))
12157 - return;
12158 -
12159 - /*
12160 - * Valid to do another page fault here, because if this fault
12161 - * had been triggered by is_prefetch fixup_exception would have
12162 - * handled it.
12163 - */
12164 - if (is_prefetch(regs, address, error_code))
12165 - return;
12166 -
12167 -/*
12168 - * Oops. The kernel tried to access some bad page. We'll have to
12169 - * terminate things with extreme prejudice.
12170 - */
12171 -
12172 - bust_spinlocks(1);
12173 -
12174 - if (oops_may_print()) {
12175 -#ifdef CONFIG_X86_PAE
12176 - if (error_code & 16) {
12177 - pte_t *pte = lookup_address(address);
12178 -
12179 - if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
12180 - printk(KERN_CRIT "kernel tried to execute "
12181 - "NX-protected page - exploit attempt? "
12182 - "(uid: %d)\n", current->uid);
12183 - }
12184 -#endif
12185 - if (address < PAGE_SIZE)
12186 - printk(KERN_ALERT "BUG: unable to handle kernel NULL "
12187 - "pointer dereference");
12188 - else
12189 - printk(KERN_ALERT "BUG: unable to handle kernel paging"
12190 - " request");
12191 - printk(" at virtual address %08lx\n",address);
12192 - printk(KERN_ALERT "printing eip: %08lx\n", regs->eip);
12193 - dump_fault_path(address);
12194 - }
12195 - tsk->thread.cr2 = address;
12196 - tsk->thread.trap_no = 14;
12197 - tsk->thread.error_code = error_code;
12198 - die("Oops", regs, error_code);
12199 - bust_spinlocks(0);
12200 - do_exit(SIGKILL);
12201 -
12202 -/*
12203 - * We ran out of memory, or some other thing happened to us that made
12204 - * us unable to handle the page fault gracefully.
12205 - */
12206 -out_of_memory:
12207 - up_read(&mm->mmap_sem);
12208 - if (is_global_init(tsk)) {
12209 - yield();
12210 - down_read(&mm->mmap_sem);
12211 - goto survive;
12212 - }
12213 - printk("VM: killing process %s\n", tsk->comm);
12214 - if (error_code & 4)
12215 - do_group_exit(SIGKILL);
12216 - goto no_context;
12217 -
12218 -do_sigbus:
12219 - up_read(&mm->mmap_sem);
12220 -
12221 - /* Kernel mode? Handle exceptions or die */
12222 - if (!(error_code & 4))
12223 - goto no_context;
12224 -
12225 - /* User space => ok to do another page fault */
12226 - if (is_prefetch(regs, address, error_code))
12227 - return;
12228 -
12229 - tsk->thread.cr2 = address;
12230 - tsk->thread.error_code = error_code;
12231 - tsk->thread.trap_no = 14;
12232 - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
12233 -}
12234 -
12235 -void vmalloc_sync_all(void)
12236 -{
12237 - /*
12238 - * Note that races in the updates of insync and start aren't
12239 - * problematic: insync can only get set bits added, and updates to
12240 - * start are only improving performance (without affecting correctness
12241 - * if undone).
12242 - * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
12243 - * This change works just fine with 2-level paging too.
12244 - */
12245 -#define sync_index(a) ((a) >> PMD_SHIFT)
12246 - static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
12247 - static unsigned long start = TASK_SIZE;
12248 - unsigned long address;
12249 -
12250 - if (SHARED_KERNEL_PMD)
12251 - return;
12252 -
12253 - BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
12254 - for (address = start;
12255 - address >= TASK_SIZE && address < hypervisor_virt_start;
12256 - address += 1UL << PMD_SHIFT) {
12257 - if (!test_bit(sync_index(address), insync)) {
12258 - unsigned long flags;
12259 - struct page *page;
12260 -
12261 - spin_lock_irqsave(&pgd_lock, flags);
12262 - /* XEN: failure path assumes non-empty pgd_list. */
12263 - if (unlikely(!pgd_list)) {
12264 - spin_unlock_irqrestore(&pgd_lock, flags);
12265 - return;
12266 - }
12267 - for (page = pgd_list; page; page =
12268 - (struct page *)page->index)
12269 - if (!vmalloc_sync_one(page_address(page),
12270 - address)) {
12271 - BUG_ON(page != pgd_list);
12272 - break;
12273 - }
12274 - spin_unlock_irqrestore(&pgd_lock, flags);
12275 - if (!page)
12276 - set_bit(sync_index(address), insync);
12277 - }
12278 - if (address == start && test_bit(sync_index(address), insync))
12279 - start = address + (1UL << PMD_SHIFT);
12280 - }
12281 -}
12282 --- sle11-2009-05-14.orig/arch/x86/mm/fault_64-xen.c 2009-02-16 16:18:36.000000000 +0100
12283 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12284 @@ -1,686 +0,0 @@
12285 -/*
12286 - * linux/arch/x86-64/mm/fault.c
12287 - *
12288 - * Copyright (C) 1995 Linus Torvalds
12289 - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
12290 - */
12291 -
12292 -#include <linux/signal.h>
12293 -#include <linux/sched.h>
12294 -#include <linux/kernel.h>
12295 -#include <linux/errno.h>
12296 -#include <linux/string.h>
12297 -#include <linux/types.h>
12298 -#include <linux/ptrace.h>
12299 -#include <linux/mman.h>
12300 -#include <linux/mm.h>
12301 -#include <linux/smp.h>
12302 -#include <linux/interrupt.h>
12303 -#include <linux/init.h>
12304 -#include <linux/tty.h>
12305 -#include <linux/vt_kern.h> /* For unblank_screen() */
12306 -#include <linux/compiler.h>
12307 -#include <linux/vmalloc.h>
12308 -#include <linux/module.h>
12309 -#include <linux/kprobes.h>
12310 -#include <linux/uaccess.h>
12311 -#include <linux/kdebug.h>
12312 -#include <linux/kprobes.h>
12313 -
12314 -#include <asm/system.h>
12315 -#include <asm/pgalloc.h>
12316 -#include <asm/smp.h>
12317 -#include <asm/tlbflush.h>
12318 -#include <asm/proto.h>
12319 -#include <asm-generic/sections.h>
12320 -
12321 -/* Page fault error code bits */
12322 -#define PF_PROT (1<<0) /* or no page found */
12323 -#define PF_WRITE (1<<1)
12324 -#define PF_USER (1<<2)
12325 -#define PF_RSVD (1<<3)
12326 -#define PF_INSTR (1<<4)
12327 -
12328 -#ifdef CONFIG_KPROBES
12329 -static inline int notify_page_fault(struct pt_regs *regs)
12330 -{
12331 - int ret = 0;
12332 -
12333 - /* kprobe_running() needs smp_processor_id() */
12334 - if (!user_mode(regs)) {
12335 - preempt_disable();
12336 - if (kprobe_running() && kprobe_fault_handler(regs, 14))
12337 - ret = 1;
12338 - preempt_enable();
12339 - }
12340 -
12341 - return ret;
12342 -}
12343 -#else
12344 -static inline int notify_page_fault(struct pt_regs *regs)
12345 -{
12346 - return 0;
12347 -}
12348 -#endif
12349 -
12350 -/* Sometimes the CPU reports invalid exceptions on prefetch.
12351 - Check that here and ignore.
12352 - Opcode checker based on code by Richard Brunner */
12353 -static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
12354 - unsigned long error_code)
12355 -{
12356 - unsigned char *instr;
12357 - int scan_more = 1;
12358 - int prefetch = 0;
12359 - unsigned char *max_instr;
12360 -
12361 - /* If it was a exec fault ignore */
12362 - if (error_code & PF_INSTR)
12363 - return 0;
12364 -
12365 - instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
12366 - max_instr = instr + 15;
12367 -
12368 - if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
12369 - return 0;
12370 -
12371 - while (scan_more && instr < max_instr) {
12372 - unsigned char opcode;
12373 - unsigned char instr_hi;
12374 - unsigned char instr_lo;
12375 -
12376 - if (probe_kernel_address(instr, opcode))
12377 - break;
12378 -
12379 - instr_hi = opcode & 0xf0;
12380 - instr_lo = opcode & 0x0f;
12381 - instr++;
12382 -
12383 - switch (instr_hi) {
12384 - case 0x20:
12385 - case 0x30:
12386 - /* Values 0x26,0x2E,0x36,0x3E are valid x86
12387 - prefixes. In long mode, the CPU will signal
12388 - invalid opcode if some of these prefixes are
12389 - present so we will never get here anyway */
12390 - scan_more = ((instr_lo & 7) == 0x6);
12391 - break;
12392 -
12393 - case 0x40:
12394 - /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
12395 - Need to figure out under what instruction mode the
12396 - instruction was issued ... */
12397 - /* Could check the LDT for lm, but for now it's good
12398 - enough to assume that long mode only uses well known
12399 - segments or kernel. */
12400 - scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
12401 - break;
12402 -
12403 - case 0x60:
12404 - /* 0x64 thru 0x67 are valid prefixes in all modes. */
12405 - scan_more = (instr_lo & 0xC) == 0x4;
12406 - break;
12407 - case 0xF0:
12408 - /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
12409 - scan_more = !instr_lo || (instr_lo>>1) == 1;
12410 - break;
12411 - case 0x00:
12412 - /* Prefetch instruction is 0x0F0D or 0x0F18 */
12413 - scan_more = 0;
12414 - if (probe_kernel_address(instr, opcode))
12415 - break;
12416 - prefetch = (instr_lo == 0xF) &&
12417 - (opcode == 0x0D || opcode == 0x18);
12418 - break;
12419 - default:
12420 - scan_more = 0;
12421 - break;
12422 - }
12423 - }
12424 - return prefetch;
12425 -}
12426 -
12427 -static int bad_address(void *p)
12428 -{
12429 - unsigned long dummy;
12430 - return probe_kernel_address((unsigned long *)p, dummy);
12431 -}
12432 -
12433 -void dump_pagetable(unsigned long address)
12434 -{
12435 - pgd_t *pgd;
12436 - pud_t *pud;
12437 - pmd_t *pmd;
12438 - pte_t *pte;
12439 -
12440 - pgd = (pgd_t *)read_cr3();
12441 -
12442 - pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
12443 - pgd += pgd_index(address);
12444 - if (bad_address(pgd)) goto bad;
12445 - printk("PGD %lx ", pgd_val(*pgd));
12446 - if (!pgd_present(*pgd)) goto ret;
12447 -
12448 - pud = pud_offset(pgd, address);
12449 - if (bad_address(pud)) goto bad;
12450 - printk("PUD %lx ", pud_val(*pud));
12451 - if (!pud_present(*pud)) goto ret;
12452 -
12453 - pmd = pmd_offset(pud, address);
12454 - if (bad_address(pmd)) goto bad;
12455 - printk("PMD %lx ", pmd_val(*pmd));
12456 - if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
12457 -
12458 - pte = pte_offset_kernel(pmd, address);
12459 - if (bad_address(pte)) goto bad;
12460 - printk("PTE %lx", pte_val(*pte));
12461 -ret:
12462 - printk("\n");
12463 - return;
12464 -bad:
12465 - printk("BAD\n");
12466 -}
12467 -
12468 -static const char errata93_warning[] =
12469 -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
12470 -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
12471 -KERN_ERR "******* Please consider a BIOS update.\n"
12472 -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
12473 -
12474 -/* Workaround for K8 erratum #93 & buggy BIOS.
12475 - BIOS SMM functions are required to use a specific workaround
12476 - to avoid corruption of the 64bit RIP register on C stepping K8.
12477 - A lot of BIOS that didn't get tested properly miss this.
12478 - The OS sees this as a page fault with the upper 32bits of RIP cleared.
12479 - Try to work around it here.
12480 - Note we only handle faults in kernel here. */
12481 -
12482 -static int is_errata93(struct pt_regs *regs, unsigned long address)
12483 -{
12484 - static int warned;
12485 - if (address != regs->rip)
12486 - return 0;
12487 - if ((address >> 32) != 0)
12488 - return 0;
12489 - address |= 0xffffffffUL << 32;
12490 - if ((address >= (u64)_stext && address <= (u64)_etext) ||
12491 - (address >= MODULES_VADDR && address <= MODULES_END)) {
12492 - if (!warned) {
12493 - printk(errata93_warning);
12494 - warned = 1;
12495 - }
12496 - regs->rip = address;
12497 - return 1;
12498 - }
12499 - return 0;
12500 -}
12501 -
12502 -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
12503 - unsigned long error_code)
12504 -{
12505 - unsigned long flags = oops_begin();
12506 - struct task_struct *tsk;
12507 -
12508 - printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
12509 - current->comm, address);
12510 - dump_pagetable(address);
12511 - tsk = current;
12512 - tsk->thread.cr2 = address;
12513 - tsk->thread.trap_no = 14;
12514 - tsk->thread.error_code = error_code;
12515 - __die("Bad pagetable", regs, error_code);
12516 - oops_end(flags);
12517 - do_exit(SIGKILL);
12518 -}
12519 -
12520 -/*
12521 - * Handle a fault on the vmalloc area
12522 - *
12523 - * This assumes no large pages in there.
12524 - */
12525 -static int vmalloc_fault(unsigned long address)
12526 -{
12527 - pgd_t *pgd, *pgd_ref;
12528 - pud_t *pud, *pud_ref;
12529 - pmd_t *pmd, *pmd_ref;
12530 - pte_t *pte, *pte_ref;
12531 -
12532 - /* Copy kernel mappings over when needed. This can also
12533 - happen within a race in page table update. In the later
12534 - case just flush. */
12535 -
12536 - /* On Xen the line below does not always work. Needs investigating! */
12537 - /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
12538 - pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
12539 - pgd += pgd_index(address);
12540 - pgd_ref = pgd_offset_k(address);
12541 - if (pgd_none(*pgd_ref))
12542 - return -1;
12543 - if (pgd_none(*pgd))
12544 - set_pgd(pgd, *pgd_ref);
12545 - else
12546 - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
12547 -
12548 - /* Below here mismatches are bugs because these lower tables
12549 - are shared */
12550 -
12551 - pud = pud_offset(pgd, address);
12552 - pud_ref = pud_offset(pgd_ref, address);
12553 - if (pud_none(*pud_ref))
12554 - return -1;
12555 - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
12556 - BUG();
12557 - pmd = pmd_offset(pud, address);
12558 - pmd_ref = pmd_offset(pud_ref, address);
12559 - if (pmd_none(*pmd_ref))
12560 - return -1;
12561 - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
12562 - BUG();
12563 - pte_ref = pte_offset_kernel(pmd_ref, address);
12564 - if (!pte_present(*pte_ref))
12565 - return -1;
12566 - pte = pte_offset_kernel(pmd, address);
12567 - /* Don't use pte_page here, because the mappings can point
12568 - outside mem_map, and the NUMA hash lookup cannot handle
12569 - that. */
12570 - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
12571 - BUG();
12572 - return 0;
12573 -}
12574 -
12575 -int show_unhandled_signals = 1;
12576 -
12577 -
12578 -#define MEM_VERBOSE 1
12579 -
12580 -#ifdef MEM_VERBOSE
12581 -#define MEM_LOG(_f, _a...) \
12582 - printk("fault.c:[%d]-> " _f "\n", \
12583 - __LINE__ , ## _a )
12584 -#else
12585 -#define MEM_LOG(_f, _a...) ((void)0)
12586 -#endif
12587 -
12588 -static int spurious_fault(struct pt_regs *regs,
12589 - unsigned long address,
12590 - unsigned long error_code)
12591 -{
12592 - pgd_t *pgd;
12593 - pud_t *pud;
12594 - pmd_t *pmd;
12595 - pte_t *pte;
12596 -
12597 -#ifdef CONFIG_XEN
12598 - /* Faults in hypervisor area are never spurious. */
12599 - if ((address >= HYPERVISOR_VIRT_START) &&
12600 - (address < HYPERVISOR_VIRT_END))
12601 - return 0;
12602 -#endif
12603 -
12604 - /* Reserved-bit violation or user access to kernel space? */
12605 - if (error_code & (PF_RSVD|PF_USER))
12606 - return 0;
12607 -
12608 - pgd = init_mm.pgd + pgd_index(address);
12609 - if (!pgd_present(*pgd))
12610 - return 0;
12611 -
12612 - pud = pud_offset(pgd, address);
12613 - if (!pud_present(*pud))
12614 - return 0;
12615 -
12616 - pmd = pmd_offset(pud, address);
12617 - if (!pmd_present(*pmd))
12618 - return 0;
12619 -
12620 - pte = pte_offset_kernel(pmd, address);
12621 - if (!pte_present(*pte))
12622 - return 0;
12623 - if ((error_code & PF_WRITE) && !pte_write(*pte))
12624 - return 0;
12625 - if ((error_code & PF_INSTR) && (__pte_val(*pte) & _PAGE_NX))
12626 - return 0;
12627 -
12628 - return 1;
12629 -}
12630 -
12631 -/*
12632 - * This routine handles page faults. It determines the address,
12633 - * and the problem, and then passes it off to one of the appropriate
12634 - * routines.
12635 - */
12636 -asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
12637 - unsigned long error_code)
12638 -{
12639 - struct task_struct *tsk;
12640 - struct mm_struct *mm;
12641 - struct vm_area_struct * vma;
12642 - unsigned long address;
12643 - const struct exception_table_entry *fixup;
12644 - int write, fault;
12645 - unsigned long flags;
12646 - siginfo_t info;
12647 -
12648 - if (!user_mode(regs))
12649 - error_code &= ~PF_USER; /* means kernel */
12650 -
12651 - /*
12652 - * We can fault from pretty much anywhere, with unknown IRQ state.
12653 - */
12654 - trace_hardirqs_fixup();
12655 -
12656 - tsk = current;
12657 - mm = tsk->mm;
12658 - prefetchw(&mm->mmap_sem);
12659 -
12660 - /* get the address */
12661 - address = read_cr2();
12662 -
12663 - info.si_code = SEGV_MAPERR;
12664 -
12665 -
12666 - /*
12667 - * We fault-in kernel-space virtual memory on-demand. The
12668 - * 'reference' page table is init_mm.pgd.
12669 - *
12670 - * NOTE! We MUST NOT take any locks for this case. We may
12671 - * be in an interrupt or a critical region, and should
12672 - * only copy the information from the master page table,
12673 - * nothing more.
12674 - *
12675 - * This verifies that the fault happens in kernel space
12676 - * (error_code & 4) == 0, and that the fault was not a
12677 - * protection error (error_code & 9) == 0.
12678 - */
12679 - if (unlikely(address >= TASK_SIZE64)) {
12680 - /*
12681 - * Don't check for the module range here: its PML4
12682 - * is always initialized because it's shared with the main
12683 - * kernel text. Only vmalloc may need PML4 syncups.
12684 - */
12685 - if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
12686 - ((address >= VMALLOC_START && address < VMALLOC_END))) {
12687 - if (vmalloc_fault(address) >= 0)
12688 - return;
12689 - }
12690 - /* Can take a spurious fault if mapping changes R/O -> R/W. */
12691 - if (spurious_fault(regs, address, error_code))
12692 - return;
12693 - if (notify_page_fault(regs))
12694 - return;
12695 - /*
12696 - * Don't take the mm semaphore here. If we fixup a prefetch
12697 - * fault we could otherwise deadlock.
12698 - */
12699 - goto bad_area_nosemaphore;
12700 - }
12701 -
12702 - if (notify_page_fault(regs))
12703 - return;
12704 -
12705 - if (likely(regs->eflags & X86_EFLAGS_IF))
12706 - local_irq_enable();
12707 -
12708 - if (unlikely(error_code & PF_RSVD))
12709 - pgtable_bad(address, regs, error_code);
12710 -
12711 - /*
12712 - * If we're in an interrupt or have no user
12713 - * context, we must not take the fault..
12714 - */
12715 - if (unlikely(in_atomic() || !mm))
12716 - goto bad_area_nosemaphore;
12717 -
12718 - /*
12719 - * User-mode registers count as a user access even for any
12720 - * potential system fault or CPU buglet.
12721 - */
12722 - if (user_mode_vm(regs))
12723 - error_code |= PF_USER;
12724 -
12725 - again:
12726 - /* When running in the kernel we expect faults to occur only to
12727 - * addresses in user space. All other faults represent errors in the
12728 - * kernel and should generate an OOPS. Unfortunately, in the case of an
12729 - * erroneous fault occurring in a code path which already holds mmap_sem
12730 - * we will deadlock attempting to validate the fault against the
12731 - * address space. Luckily the kernel only validly references user
12732 - * space from well defined areas of code, which are listed in the
12733 - * exceptions table.
12734 - *
12735 - * As the vast majority of faults will be valid we will only perform
12736 - * the source reference check when there is a possibility of a deadlock.
12737 - * Attempt to lock the address space, if we cannot we then validate the
12738 - * source. If this is invalid we can skip the address space check,
12739 - * thus avoiding the deadlock.
12740 - */
12741 - if (!down_read_trylock(&mm->mmap_sem)) {
12742 - if ((error_code & PF_USER) == 0 &&
12743 - !search_exception_tables(regs->rip))
12744 - goto bad_area_nosemaphore;
12745 - down_read(&mm->mmap_sem);
12746 - }
12747 -
12748 - vma = find_vma(mm, address);
12749 - if (!vma)
12750 - goto bad_area;
12751 - if (likely(vma->vm_start <= address))
12752 - goto good_area;
12753 - if (!(vma->vm_flags & VM_GROWSDOWN))
12754 - goto bad_area;
12755 - if (error_code & 4) {
12756 - /* Allow userspace just enough access below the stack pointer
12757 - * to let the 'enter' instruction work.
12758 - */
12759 - if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
12760 - goto bad_area;
12761 - }
12762 - if (expand_stack(vma, address))
12763 - goto bad_area;
12764 -/*
12765 - * Ok, we have a good vm_area for this memory access, so
12766 - * we can handle it..
12767 - */
12768 -good_area:
12769 - info.si_code = SEGV_ACCERR;
12770 - write = 0;
12771 - switch (error_code & (PF_PROT|PF_WRITE)) {
12772 - default: /* 3: write, present */
12773 - /* fall through */
12774 - case PF_WRITE: /* write, not present */
12775 - if (!(vma->vm_flags & VM_WRITE))
12776 - goto bad_area;
12777 - write++;
12778 - break;
12779 - case PF_PROT: /* read, present */
12780 - goto bad_area;
12781 - case 0: /* read, not present */
12782 - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
12783 - goto bad_area;
12784 - }
12785 -
12786 - /*
12787 - * If for any reason at all we couldn't handle the fault,
12788 - * make sure we exit gracefully rather than endlessly redo
12789 - * the fault.
12790 - */
12791 - fault = handle_mm_fault(mm, vma, address, write);
12792 - if (unlikely(fault & VM_FAULT_ERROR)) {
12793 - if (fault & VM_FAULT_OOM)
12794 - goto out_of_memory;
12795 - else if (fault & VM_FAULT_SIGBUS)
12796 - goto do_sigbus;
12797 - BUG();
12798 - }
12799 - if (fault & VM_FAULT_MAJOR)
12800 - tsk->maj_flt++;
12801 - else
12802 - tsk->min_flt++;
12803 - up_read(&mm->mmap_sem);
12804 - return;
12805 -
12806 -/*
12807 - * Something tried to access memory that isn't in our memory map..
12808 - * Fix it, but check if it's kernel or user first..
12809 - */
12810 -bad_area:
12811 - up_read(&mm->mmap_sem);
12812 -
12813 -bad_area_nosemaphore:
12814 - /* User mode accesses just cause a SIGSEGV */
12815 - if (error_code & PF_USER) {
12816 -
12817 - /*
12818 - * It's possible to have interrupts off here.
12819 - */
12820 - local_irq_enable();
12821 -
12822 - if (is_prefetch(regs, address, error_code))
12823 - return;
12824 -
12825 - /* Work around K8 erratum #100 K8 in compat mode
12826 - occasionally jumps to illegal addresses >4GB. We
12827 - catch this here in the page fault handler because
12828 - these addresses are not reachable. Just detect this
12829 - case and return. Any code segment in LDT is
12830 - compatibility mode. */
12831 - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
12832 - (address >> 32))
12833 - return;
12834 -
12835 - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
12836 - printk_ratelimit()) {
12837 - printk(
12838 - "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n",
12839 - tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
12840 - tsk->comm, tsk->pid, address, regs->rip,
12841 - regs->rsp, error_code);
12842 - }
12843 -
12844 - tsk->thread.cr2 = address;
12845 - /* Kernel addresses are always protection faults */
12846 - tsk->thread.error_code = error_code | (address >= TASK_SIZE);
12847 - tsk->thread.trap_no = 14;
12848 - info.si_signo = SIGSEGV;
12849 - info.si_errno = 0;
12850 - /* info.si_code has been set above */
12851 - info.si_addr = (void __user *)address;
12852 - force_sig_info(SIGSEGV, &info, tsk);
12853 - return;
12854 - }
12855 -
12856 -no_context:
12857 -
12858 - /* Are we prepared to handle this kernel fault? */
12859 - fixup = search_exception_tables(regs->rip);
12860 - if (fixup) {
12861 - regs->rip = fixup->fixup;
12862 - return;
12863 - }
12864 -
12865 - /*
12866 - * Hall of shame of CPU/BIOS bugs.
12867 - */
12868 -
12869 - if (is_prefetch(regs, address, error_code))
12870 - return;
12871 -
12872 - if (is_errata93(regs, address))
12873 - return;
12874 -
12875 -/*
12876 - * Oops. The kernel tried to access some bad page. We'll have to
12877 - * terminate things with extreme prejudice.
12878 - */
12879 -
12880 - flags = oops_begin();
12881 -
12882 - if (address < PAGE_SIZE)
12883 - printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
12884 - else
12885 - printk(KERN_ALERT "Unable to handle kernel paging request");
12886 - printk(" at %016lx RIP: \n" KERN_ALERT,address);
12887 - printk_address(regs->rip);
12888 - dump_pagetable(address);
12889 - tsk->thread.cr2 = address;
12890 - tsk->thread.trap_no = 14;
12891 - tsk->thread.error_code = error_code;
12892 - __die("Oops", regs, error_code);
12893 - /* Executive summary in case the body of the oops scrolled away */
12894 - printk(KERN_EMERG "CR2: %016lx\n", address);
12895 - oops_end(flags);
12896 - do_exit(SIGKILL);
12897 -
12898 -/*
12899 - * We ran out of memory, or some other thing happened to us that made
12900 - * us unable to handle the page fault gracefully.
12901 - */
12902 -out_of_memory:
12903 - up_read(&mm->mmap_sem);
12904 - if (is_global_init(current)) {
12905 - yield();
12906 - goto again;
12907 - }
12908 - printk("VM: killing process %s\n", tsk->comm);
12909 - if (error_code & 4)
12910 - do_group_exit(SIGKILL);
12911 - goto no_context;
12912 -
12913 -do_sigbus:
12914 - up_read(&mm->mmap_sem);
12915 -
12916 - /* Kernel mode? Handle exceptions or die */
12917 - if (!(error_code & PF_USER))
12918 - goto no_context;
12919 -
12920 - tsk->thread.cr2 = address;
12921 - tsk->thread.error_code = error_code;
12922 - tsk->thread.trap_no = 14;
12923 - info.si_signo = SIGBUS;
12924 - info.si_errno = 0;
12925 - info.si_code = BUS_ADRERR;
12926 - info.si_addr = (void __user *)address;
12927 - force_sig_info(SIGBUS, &info, tsk);
12928 - return;
12929 -}
12930 -
12931 -DEFINE_SPINLOCK(pgd_lock);
12932 -LIST_HEAD(pgd_list);
12933 -
12934 -void vmalloc_sync_all(void)
12935 -{
12936 - /* Note that races in the updates of insync and start aren't
12937 - problematic:
12938 - insync can only get set bits added, and updates to start are only
12939 - improving performance (without affecting correctness if undone). */
12940 - static DECLARE_BITMAP(insync, PTRS_PER_PGD);
12941 - static unsigned long start = VMALLOC_START & PGDIR_MASK;
12942 - unsigned long address;
12943 -
12944 - for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
12945 - if (!test_bit(pgd_index(address), insync)) {
12946 - const pgd_t *pgd_ref = pgd_offset_k(address);
12947 - struct page *page;
12948 -
12949 - if (pgd_none(*pgd_ref))
12950 - continue;
12951 - spin_lock(&pgd_lock);
12952 - list_for_each_entry(page, &pgd_list, lru) {
12953 - pgd_t *pgd;
12954 - pgd = (pgd_t *)page_address(page) + pgd_index(address);
12955 - if (pgd_none(*pgd))
12956 - set_pgd(pgd, *pgd_ref);
12957 - else
12958 - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
12959 - }
12960 - spin_unlock(&pgd_lock);
12961 - set_bit(pgd_index(address), insync);
12962 - }
12963 - if (address == start)
12964 - start = address + PGDIR_SIZE;
12965 - }
12966 - /* Check that there is no need to do the same for the modules area. */
12967 - BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
12968 - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
12969 - (__START_KERNEL & PGDIR_MASK)));
12970 -}
12971 --- sle11-2009-05-14.orig/arch/x86/mm/highmem_32-xen.c 2009-02-16 16:17:21.000000000 +0100
12972 +++ sle11-2009-05-14/arch/x86/mm/highmem_32-xen.c 2009-03-16 16:33:40.000000000 +0100
12973 @@ -18,6 +18,49 @@ void kunmap(struct page *page)
12974 kunmap_high(page);
12975 }
12976
12977 +static void debug_kmap_atomic_prot(enum km_type type)
12978 +{
12979 +#ifdef CONFIG_DEBUG_HIGHMEM
12980 + static unsigned warn_count = 10;
12981 +
12982 + if (unlikely(warn_count == 0))
12983 + return;
12984 +
12985 + if (unlikely(in_interrupt())) {
12986 + if (in_irq()) {
12987 + if (type != KM_IRQ0 && type != KM_IRQ1 &&
12988 + type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
12989 + type != KM_BOUNCE_READ) {
12990 + WARN_ON(1);
12991 + warn_count--;
12992 + }
12993 + } else if (!irqs_disabled()) { /* softirq */
12994 + if (type != KM_IRQ0 && type != KM_IRQ1 &&
12995 + type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
12996 + type != KM_SKB_SUNRPC_DATA &&
12997 + type != KM_SKB_DATA_SOFTIRQ &&
12998 + type != KM_BOUNCE_READ) {
12999 + WARN_ON(1);
13000 + warn_count--;
13001 + }
13002 + }
13003 + }
13004 +
13005 + if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
13006 + type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
13007 + if (!irqs_disabled()) {
13008 + WARN_ON(1);
13009 + warn_count--;
13010 + }
13011 + } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
13012 + if (irq_count() == 0 && !irqs_disabled()) {
13013 + WARN_ON(1);
13014 + warn_count--;
13015 + }
13016 + }
13017 +#endif
13018 +}
13019 +
13020 /*
13021 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
13022 * no global lock is needed and because the kmap code must perform a global TLB
13023 @@ -37,6 +80,8 @@ void *kmap_atomic_prot(struct page *page
13024 if (!PageHighMem(page))
13025 return page_address(page);
13026
13027 + debug_kmap_atomic_prot(type);
13028 +
13029 idx = type + KM_TYPE_NR*smp_processor_id();
13030 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
13031 BUG_ON(!pte_none(*(kmap_pte-idx)));
13032 --- sle11-2009-05-14.orig/arch/x86/mm/hypervisor.c 2009-05-06 10:23:43.000000000 +0200
13033 +++ sle11-2009-05-14/arch/x86/mm/hypervisor.c 2009-05-14 11:18:39.000000000 +0200
13034 @@ -869,15 +869,11 @@ int xen_limit_pages_to_max_mfn(
13035 }
13036 EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn);
13037
13038 -#ifdef __i386__
13039 -int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
13040 +int write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
13041 {
13042 - __u32 *lp = (__u32 *)((char *)ldt + entry * 8);
13043 - maddr_t mach_lp = arbitrary_virt_to_machine(lp);
13044 - return HYPERVISOR_update_descriptor(
13045 - mach_lp, (u64)entry_a | ((u64)entry_b<<32));
13046 + maddr_t mach_lp = arbitrary_virt_to_machine(ldt + entry);
13047 + return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
13048 }
13049 -#endif
13050
13051 #define MAX_BATCHED_FULL_PTES 32
13052
13053 --- sle11-2009-05-14.orig/arch/x86/mm/init_32-xen.c 2009-02-16 16:18:36.000000000 +0100
13054 +++ sle11-2009-05-14/arch/x86/mm/init_32-xen.c 2009-03-16 16:33:40.000000000 +0100
13055 @@ -27,13 +27,13 @@
13056 #include <linux/bootmem.h>
13057 #include <linux/slab.h>
13058 #include <linux/proc_fs.h>
13059 -#include <linux/efi.h>
13060 #include <linux/memory_hotplug.h>
13061 #include <linux/initrd.h>
13062 #include <linux/cpumask.h>
13063 #include <linux/dma-mapping.h>
13064 #include <linux/scatterlist.h>
13065
13066 +#include <asm/asm.h>
13067 #include <asm/processor.h>
13068 #include <asm/system.h>
13069 #include <asm/uaccess.h>
13070 @@ -42,18 +42,22 @@
13071 #include <asm/fixmap.h>
13072 #include <asm/e820.h>
13073 #include <asm/apic.h>
13074 +#include <asm/bugs.h>
13075 #include <asm/tlb.h>
13076 #include <asm/tlbflush.h>
13077 +#include <asm/pgalloc.h>
13078 #include <asm/sections.h>
13079 #include <asm/hypervisor.h>
13080 #include <asm/swiotlb.h>
13081 +#include <asm/setup.h>
13082 +#include <asm/cacheflush.h>
13083
13084 unsigned int __VMALLOC_RESERVE = 128 << 20;
13085
13086 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
13087 unsigned long highstart_pfn, highend_pfn;
13088
13089 -static int noinline do_test_wp_bit(void);
13090 +static noinline int do_test_wp_bit(void);
13091
13092 /*
13093 * Creates a middle page table and puts a pointer to it in the
13094 @@ -64,17 +68,16 @@ static pmd_t * __init one_md_table_init(
13095 {
13096 pud_t *pud;
13097 pmd_t *pmd_table;
13098 -
13099 +
13100 #ifdef CONFIG_X86_PAE
13101 if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
13102 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
13103
13104 - paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
13105 + paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
13106 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
13107 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
13108 pud = pud_offset(pgd, 0);
13109 - if (pmd_table != pmd_offset(pud, 0))
13110 - BUG();
13111 + BUG_ON(pmd_table != pmd_offset(pud, 0));
13112 }
13113 #endif
13114 pud = pud_offset(pgd, 0);
13115 @@ -85,7 +88,7 @@ static pmd_t * __init one_md_table_init(
13116
13117 /*
13118 * Create a page table and place a pointer to it in a middle page
13119 - * directory entry.
13120 + * directory entry:
13121 */
13122 static pte_t * __init one_page_table_init(pmd_t *pmd)
13123 {
13124 @@ -99,9 +102,10 @@ static pte_t * __init one_page_table_ini
13125 #ifdef CONFIG_DEBUG_PAGEALLOC
13126 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
13127 #endif
13128 - if (!page_table)
13129 + if (!page_table) {
13130 page_table =
13131 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
13132 + }
13133
13134 paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
13135 make_lowmem_page_readonly(page_table,
13136 @@ -114,22 +118,21 @@ static pte_t * __init one_page_table_ini
13137 }
13138
13139 /*
13140 - * This function initializes a certain range of kernel virtual memory
13141 + * This function initializes a certain range of kernel virtual memory
13142 * with new bootmem page tables, everywhere page tables are missing in
13143 * the given range.
13144 - */
13145 -
13146 -/*
13147 - * NOTE: The pagetables are allocated contiguous on the physical space
13148 - * so we can cache the place of the first one and move around without
13149 + *
13150 + * NOTE: The pagetables are allocated contiguous on the physical space
13151 + * so we can cache the place of the first one and move around without
13152 * checking the pgd every time.
13153 */
13154 -static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
13155 +static void __init
13156 +page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
13157 {
13158 - pgd_t *pgd;
13159 - pmd_t *pmd;
13160 int pgd_idx, pmd_idx;
13161 unsigned long vaddr;
13162 + pgd_t *pgd;
13163 + pmd_t *pmd;
13164
13165 vaddr = start;
13166 pgd_idx = pgd_index(vaddr);
13167 @@ -139,7 +142,8 @@ static void __init page_table_range_init
13168 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
13169 pmd = one_md_table_init(pgd);
13170 pmd = pmd + pmd_index(vaddr);
13171 - for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
13172 + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
13173 + pmd++, pmd_idx++) {
13174 if (vaddr < hypervisor_virt_start)
13175 one_page_table_init(pmd);
13176
13177 @@ -157,17 +161,17 @@ static inline int is_kernel_text(unsigne
13178 }
13179
13180 /*
13181 - * This maps the physical memory to kernel virtual address space, a total
13182 - * of max_low_pfn pages, by creating page tables starting from address
13183 - * PAGE_OFFSET.
13184 + * This maps the physical memory to kernel virtual address space, a total
13185 + * of max_low_pfn pages, by creating page tables starting from address
13186 + * PAGE_OFFSET:
13187 */
13188 static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
13189 {
13190 + int pgd_idx, pmd_idx, pte_ofs;
13191 unsigned long pfn;
13192 pgd_t *pgd;
13193 pmd_t *pmd;
13194 pte_t *pte;
13195 - int pgd_idx, pmd_idx, pte_ofs;
13196
13197 unsigned long max_ram_pfn = xen_start_info->nr_pages;
13198 if (max_ram_pfn > max_low_pfn)
13199 @@ -195,36 +199,49 @@ static void __init kernel_physical_mappi
13200 if (pfn >= max_low_pfn)
13201 continue;
13202 pmd += pmd_idx;
13203 - for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
13204 - unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
13205 - if (address >= hypervisor_virt_start)
13206 + for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
13207 + pmd++, pmd_idx++) {
13208 + unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
13209 +
13210 + if (addr >= hypervisor_virt_start)
13211 continue;
13212
13213 - /* Map with big pages if possible, otherwise create normal page tables. */
13214 + /*
13215 + * Map with big pages if possible, otherwise
13216 + * create normal page tables:
13217 + */
13218 if (cpu_has_pse) {
13219 - unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
13220 - if (is_kernel_text(address) || is_kernel_text(address2))
13221 - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
13222 - else
13223 - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
13224 + unsigned int addr2;
13225 + pgprot_t prot = PAGE_KERNEL_LARGE;
13226 +
13227 + addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
13228 + PAGE_OFFSET + PAGE_SIZE-1;
13229 +
13230 + if (is_kernel_text(addr) ||
13231 + is_kernel_text(addr2))
13232 + prot = PAGE_KERNEL_LARGE_EXEC;
13233 +
13234 + set_pmd(pmd, pfn_pmd(pfn, prot));
13235
13236 pfn += PTRS_PER_PTE;
13237 - } else {
13238 - pte = one_page_table_init(pmd);
13239 + continue;
13240 + }
13241 + pte = one_page_table_init(pmd);
13242 +
13243 + for (pte += pte_ofs;
13244 + pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
13245 + pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
13246 + pgprot_t prot = PAGE_KERNEL;
13247 +
13248 + /* XEN: Only map initial RAM allocation. */
13249 + if ((pfn >= max_ram_pfn) || pte_present(*pte))
13250 + continue;
13251 + if (is_kernel_text(addr))
13252 + prot = PAGE_KERNEL_EXEC;
13253
13254 - for (pte += pte_ofs;
13255 - pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
13256 - pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
13257 - /* XEN: Only map initial RAM allocation. */
13258 - if ((pfn >= max_ram_pfn) || pte_present(*pte))
13259 - continue;
13260 - if (is_kernel_text(address))
13261 - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
13262 - else
13263 - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
13264 - }
13265 - pte_ofs = 0;
13266 + set_pte(pte, pfn_pte(pfn, prot));
13267 }
13268 + pte_ofs = 0;
13269 }
13270 pmd_idx = 0;
13271 }
13272 @@ -245,57 +262,23 @@ static inline int page_kills_ppro(unsign
13273
13274 #endif
13275
13276 -int page_is_ram(unsigned long pagenr)
13277 -{
13278 - int i;
13279 - unsigned long addr, end;
13280 -
13281 - if (efi_enabled) {
13282 - efi_memory_desc_t *md;
13283 - void *p;
13284 -
13285 - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
13286 - md = p;
13287 - if (!is_available_memory(md))
13288 - continue;
13289 - addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
13290 - end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
13291 -
13292 - if ((pagenr >= addr) && (pagenr < end))
13293 - return 1;
13294 - }
13295 - return 0;
13296 - }
13297 -
13298 - for (i = 0; i < e820.nr_map; i++) {
13299 -
13300 - if (e820.map[i].type != E820_RAM) /* not usable memory */
13301 - continue;
13302 - /*
13303 - * !!!FIXME!!! Some BIOSen report areas as RAM that
13304 - * are not. Notably the 640->1Mb area. We need a sanity
13305 - * check here.
13306 - */
13307 - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
13308 - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
13309 - if ((pagenr >= addr) && (pagenr < end))
13310 - return 1;
13311 - }
13312 - return 0;
13313 -}
13314 -
13315 #ifdef CONFIG_HIGHMEM
13316 pte_t *kmap_pte;
13317 pgprot_t kmap_prot;
13318
13319 -#define kmap_get_fixmap_pte(vaddr) \
13320 - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
13321 +static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
13322 +{
13323 + return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
13324 + vaddr), vaddr), vaddr);
13325 +}
13326
13327 static void __init kmap_init(void)
13328 {
13329 unsigned long kmap_vstart;
13330
13331 - /* cache the first kmap pte */
13332 + /*
13333 + * Cache the first kmap pte:
13334 + */
13335 kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
13336 kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
13337
13338 @@ -304,11 +287,11 @@ static void __init kmap_init(void)
13339
13340 static void __init permanent_kmaps_init(pgd_t *pgd_base)
13341 {
13342 + unsigned long vaddr;
13343 pgd_t *pgd;
13344 pud_t *pud;
13345 pmd_t *pmd;
13346 pte_t *pte;
13347 - unsigned long vaddr;
13348
13349 vaddr = PKMAP_BASE;
13350 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
13351 @@ -317,7 +300,7 @@ static void __init permanent_kmaps_init(
13352 pud = pud_offset(pgd, vaddr);
13353 pmd = pmd_offset(pud, vaddr);
13354 pte = pte_offset_kernel(pmd, vaddr);
13355 - pkmap_page_table = pte;
13356 + pkmap_page_table = pte;
13357 }
13358
13359 static void __meminit free_new_highpage(struct page *page, int pfn)
13360 @@ -337,7 +320,8 @@ void __init add_one_highpage_init(struct
13361 SetPageReserved(page);
13362 }
13363
13364 -static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
13365 +static int __meminit
13366 +add_one_highpage_hotplug(struct page *page, unsigned long pfn)
13367 {
13368 free_new_highpage(page, pfn);
13369 totalram_pages++;
13370 @@ -345,6 +329,7 @@ static int __meminit add_one_highpage_ho
13371 max_mapnr = max(pfn, max_mapnr);
13372 #endif
13373 num_physpages++;
13374 +
13375 return 0;
13376 }
13377
13378 @@ -352,7 +337,7 @@ static int __meminit add_one_highpage_ho
13379 * Not currently handling the NUMA case.
13380 * Assuming single node and all memory that
13381 * has been added dynamically that would be
13382 - * onlined here is in HIGHMEM
13383 + * onlined here is in HIGHMEM.
13384 */
13385 void __meminit online_page(struct page *page)
13386 {
13387 @@ -360,13 +345,11 @@ void __meminit online_page(struct page *
13388 add_one_highpage_hotplug(page, page_to_pfn(page));
13389 }
13390
13391 -
13392 -#ifdef CONFIG_NUMA
13393 -extern void set_highmem_pages_init(int);
13394 -#else
13395 +#ifndef CONFIG_NUMA
13396 static void __init set_highmem_pages_init(int bad_ppro)
13397 {
13398 int pfn;
13399 +
13400 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
13401 /*
13402 * Holes under sparsemem might not have no mem_map[]:
13403 @@ -376,23 +359,18 @@ static void __init set_highmem_pages_ini
13404 }
13405 totalram_pages += totalhigh_pages;
13406 }
13407 -#endif /* CONFIG_FLATMEM */
13408 +#endif /* !CONFIG_NUMA */
13409
13410 #else
13411 -#define kmap_init() do { } while (0)
13412 -#define permanent_kmaps_init(pgd_base) do { } while (0)
13413 -#define set_highmem_pages_init(bad_ppro) do { } while (0)
13414 +# define kmap_init() do { } while (0)
13415 +# define permanent_kmaps_init(pgd_base) do { } while (0)
13416 +# define set_highmem_pages_init(bad_ppro) do { } while (0)
13417 #endif /* CONFIG_HIGHMEM */
13418
13419 -unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
13420 +pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
13421 EXPORT_SYMBOL(__PAGE_KERNEL);
13422 -unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
13423
13424 -#ifdef CONFIG_NUMA
13425 -extern void __init remap_numa_kva(void);
13426 -#else
13427 -#define remap_numa_kva() do {} while (0)
13428 -#endif
13429 +pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
13430
13431 pgd_t *swapper_pg_dir;
13432
13433 @@ -410,9 +388,8 @@ static void __init xen_pagetable_setup_d
13434 * the boot process.
13435 *
13436 * If we're booting on native hardware, this will be a pagetable
13437 - * constructed in arch/i386/kernel/head.S, and not running in PAE mode
13438 - * (even if we'll end up running in PAE). The root of the pagetable
13439 - * will be swapper_pg_dir.
13440 + * constructed in arch/x86/kernel/head_32.S. The root of the
13441 + * pagetable will be swapper_pg_dir.
13442 *
13443 * If we're booting paravirtualized under a hypervisor, then there are
13444 * more options: we may already be running PAE, and the pagetable may
13445 @@ -424,10 +401,10 @@ static void __init xen_pagetable_setup_d
13446 * be partially populated, and so it avoids stomping on any existing
13447 * mappings.
13448 */
13449 -static void __init pagetable_init (void)
13450 +static void __init pagetable_init(void)
13451 {
13452 - unsigned long vaddr, end;
13453 pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
13454 + unsigned long vaddr, end;
13455
13456 xen_pagetable_setup_start(pgd_base);
13457
13458 @@ -449,34 +426,36 @@ static void __init pagetable_init (void)
13459 * Fixed mappings, only the page table structure has to be
13460 * created - mappings will be set by set_fixmap():
13461 */
13462 + early_ioremap_clear();
13463 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
13464 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
13465 page_table_range_init(vaddr, end, pgd_base);
13466 + early_ioremap_reset();
13467
13468 permanent_kmaps_init(pgd_base);
13469
13470 xen_pagetable_setup_done(pgd_base);
13471 }
13472
13473 -#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI)
13474 +#if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
13475 /*
13476 - * Swap suspend & friends need this for resume because things like the intel-agp
13477 + * ACPI suspend needs this for resume, because things like the intel-agp
13478 * driver might have split up a kernel 4MB mapping.
13479 */
13480 -char __nosavedata swsusp_pg_dir[PAGE_SIZE]
13481 - __attribute__ ((aligned (PAGE_SIZE)));
13482 +char swsusp_pg_dir[PAGE_SIZE]
13483 + __attribute__ ((aligned(PAGE_SIZE)));
13484
13485 static inline void save_pg_dir(void)
13486 {
13487 memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
13488 }
13489 -#else
13490 +#else /* !CONFIG_ACPI_SLEEP */
13491 static inline void save_pg_dir(void)
13492 {
13493 }
13494 -#endif
13495 +#endif /* !CONFIG_ACPI_SLEEP */
13496
13497 -void zap_low_mappings (void)
13498 +void zap_low_mappings(void)
13499 {
13500 int i;
13501
13502 @@ -488,22 +467,24 @@ void zap_low_mappings (void)
13503 * Note that "pgd_clear()" doesn't do it for
13504 * us, because pgd_clear() is a no-op on i386.
13505 */
13506 - for (i = 0; i < USER_PTRS_PER_PGD; i++)
13507 + for (i = 0; i < USER_PTRS_PER_PGD; i++) {
13508 #if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
13509 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
13510 #else
13511 set_pgd(swapper_pg_dir+i, __pgd(0));
13512 #endif
13513 + }
13514 flush_tlb_all();
13515 }
13516
13517 -int nx_enabled = 0;
13518 +int nx_enabled;
13519 +
13520 +pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
13521 +EXPORT_SYMBOL_GPL(__supported_pte_mask);
13522
13523 #ifdef CONFIG_X86_PAE
13524
13525 -static int disable_nx __initdata = 0;
13526 -u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
13527 -EXPORT_SYMBOL_GPL(__supported_pte_mask);
13528 +static int disable_nx __initdata;
13529
13530 /*
13531 * noexec = on|off
13532 @@ -520,11 +501,14 @@ static int __init noexec_setup(char *str
13533 __supported_pte_mask |= _PAGE_NX;
13534 disable_nx = 0;
13535 }
13536 - } else if (!strcmp(str,"off")) {
13537 - disable_nx = 1;
13538 - __supported_pte_mask &= ~_PAGE_NX;
13539 - } else
13540 - return -EINVAL;
13541 + } else {
13542 + if (!strcmp(str, "off")) {
13543 + disable_nx = 1;
13544 + __supported_pte_mask &= ~_PAGE_NX;
13545 + } else {
13546 + return -EINVAL;
13547 + }
13548 + }
13549
13550 return 0;
13551 }
13552 @@ -536,6 +520,7 @@ static void __init set_nx(void)
13553
13554 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
13555 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
13556 +
13557 if ((v[3] & (1 << 20)) && !disable_nx) {
13558 rdmsr(MSR_EFER, l, h);
13559 l |= EFER_NX;
13560 @@ -545,35 +530,6 @@ static void __init set_nx(void)
13561 }
13562 }
13563 }
13564 -
13565 -/*
13566 - * Enables/disables executability of a given kernel page and
13567 - * returns the previous setting.
13568 - */
13569 -int __init set_kernel_exec(unsigned long vaddr, int enable)
13570 -{
13571 - pte_t *pte;
13572 - int ret = 1;
13573 -
13574 - if (!nx_enabled)
13575 - goto out;
13576 -
13577 - pte = lookup_address(vaddr);
13578 - BUG_ON(!pte);
13579 -
13580 - if (!pte_exec_kernel(*pte))
13581 - ret = 0;
13582 -
13583 - if (enable)
13584 - pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
13585 - else
13586 - pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
13587 - pte_update_defer(&init_mm, vaddr, pte);
13588 - __flush_tlb_all();
13589 -out:
13590 - return ret;
13591 -}
13592 -
13593 #endif
13594
13595 /*
13596 @@ -590,21 +546,10 @@ void __init paging_init(void)
13597 #ifdef CONFIG_X86_PAE
13598 set_nx();
13599 if (nx_enabled)
13600 - printk("NX (Execute Disable) protection: active\n");
13601 + printk(KERN_INFO "NX (Execute Disable) protection: active\n");
13602 #endif
13603 -
13604 pagetable_init();
13605
13606 -#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
13607 - /*
13608 - * We will bail out later - printk doesn't work right now so
13609 - * the user would just see a hanging kernel.
13610 - * when running as xen domain we are already in PAE mode at
13611 - * this point.
13612 - */
13613 - if (cpu_has_pae)
13614 - set_in_cr4(X86_CR4_PAE);
13615 -#endif
13616 __flush_tlb_all();
13617
13618 kmap_init();
13619 @@ -631,10 +576,10 @@ void __init paging_init(void)
13620 * used to involve black magic jumps to work around some nasty CPU bugs,
13621 * but fortunately the switch to using exceptions got rid of all that.
13622 */
13623 -
13624 static void __init test_wp_bit(void)
13625 {
13626 - printk("Checking if this processor honours the WP bit even in supervisor mode... ");
13627 + printk(KERN_INFO
13628 + "Checking if this processor honours the WP bit even in supervisor mode...");
13629
13630 /* Any page-aligned address will do, the test is non-destructive */
13631 __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
13632 @@ -642,23 +587,22 @@ static void __init test_wp_bit(void)
13633 clear_fixmap(FIX_WP_TEST);
13634
13635 if (!boot_cpu_data.wp_works_ok) {
13636 - printk("No.\n");
13637 + printk(KERN_CONT "No.\n");
13638 #ifdef CONFIG_X86_WP_WORKS_OK
13639 - panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
13640 + panic(
13641 + "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
13642 #endif
13643 } else {
13644 - printk("Ok.\n");
13645 + printk(KERN_CONT "Ok.\n");
13646 }
13647 }
13648
13649 -static struct kcore_list kcore_mem, kcore_vmalloc;
13650 +static struct kcore_list kcore_mem, kcore_vmalloc;
13651
13652 void __init mem_init(void)
13653 {
13654 - extern int ppro_with_ram_bug(void);
13655 int codesize, reservedpages, datasize, initsize;
13656 - int tmp;
13657 - int bad_ppro;
13658 + int tmp, bad_ppro;
13659 unsigned long pfn;
13660
13661 #if defined(CONFIG_SWIOTLB)
13662 @@ -668,19 +612,19 @@ void __init mem_init(void)
13663 #ifdef CONFIG_FLATMEM
13664 BUG_ON(!mem_map);
13665 #endif
13666 -
13667 bad_ppro = ppro_with_ram_bug();
13668
13669 #ifdef CONFIG_HIGHMEM
13670 /* check that fixmap and pkmap do not overlap */
13671 - if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
13672 - printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
13673 + if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
13674 + printk(KERN_ERR
13675 + "fixmap and kmap areas overlap - this will crash\n");
13676 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
13677 - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
13678 + PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
13679 + FIXADDR_START);
13680 BUG();
13681 }
13682 #endif
13683 -
13684 /* this will put all low memory onto the freelists */
13685 totalram_pages += free_all_bootmem();
13686 /* XEN: init and count low-mem pages outside initial allocation. */
13687 @@ -693,7 +637,7 @@ void __init mem_init(void)
13688 reservedpages = 0;
13689 for (tmp = 0; tmp < max_low_pfn; tmp++)
13690 /*
13691 - * Only count reserved RAM pages
13692 + * Only count reserved RAM pages:
13693 */
13694 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
13695 reservedpages++;
13696 @@ -704,11 +648,12 @@ void __init mem_init(void)
13697 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
13698 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
13699
13700 - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
13701 - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
13702 + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
13703 + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
13704 VMALLOC_END-VMALLOC_START);
13705
13706 - printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
13707 + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
13708 + "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
13709 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
13710 num_physpages << (PAGE_SHIFT-10),
13711 codesize >> 10,
13712 @@ -719,54 +664,53 @@ void __init mem_init(void)
13713 );
13714
13715 #if 1 /* double-sanity-check paranoia */
13716 - printk("virtual kernel memory layout:\n"
13717 - " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13718 + printk(KERN_INFO "virtual kernel memory layout:\n"
13719 + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13720 #ifdef CONFIG_HIGHMEM
13721 - " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13722 + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13723 #endif
13724 - " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
13725 - " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
13726 - " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
13727 - " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
13728 - " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
13729 - FIXADDR_START, FIXADDR_TOP,
13730 - (FIXADDR_TOP - FIXADDR_START) >> 10,
13731 + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
13732 + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
13733 + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
13734 + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
13735 + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
13736 + FIXADDR_START, FIXADDR_TOP,
13737 + (FIXADDR_TOP - FIXADDR_START) >> 10,
13738
13739 #ifdef CONFIG_HIGHMEM
13740 - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
13741 - (LAST_PKMAP*PAGE_SIZE) >> 10,
13742 + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
13743 + (LAST_PKMAP*PAGE_SIZE) >> 10,
13744 #endif
13745
13746 - VMALLOC_START, VMALLOC_END,
13747 - (VMALLOC_END - VMALLOC_START) >> 20,
13748 + VMALLOC_START, VMALLOC_END,
13749 + (VMALLOC_END - VMALLOC_START) >> 20,
13750
13751 - (unsigned long)__va(0), (unsigned long)high_memory,
13752 - ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
13753 + (unsigned long)__va(0), (unsigned long)high_memory,
13754 + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
13755
13756 - (unsigned long)&__init_begin, (unsigned long)&__init_end,
13757 - ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
13758 + (unsigned long)&__init_begin, (unsigned long)&__init_end,
13759 + ((unsigned long)&__init_end -
13760 + (unsigned long)&__init_begin) >> 10,
13761
13762 - (unsigned long)&_etext, (unsigned long)&_edata,
13763 - ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
13764 + (unsigned long)&_etext, (unsigned long)&_edata,
13765 + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
13766
13767 - (unsigned long)&_text, (unsigned long)&_etext,
13768 - ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
13769 + (unsigned long)&_text, (unsigned long)&_etext,
13770 + ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
13771
13772 #ifdef CONFIG_HIGHMEM
13773 - BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
13774 - BUG_ON(VMALLOC_END > PKMAP_BASE);
13775 + BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
13776 + BUG_ON(VMALLOC_END > PKMAP_BASE);
13777 #endif
13778 - BUG_ON(VMALLOC_START > VMALLOC_END);
13779 - BUG_ON((unsigned long)high_memory > VMALLOC_START);
13780 + BUG_ON(VMALLOC_START > VMALLOC_END);
13781 + BUG_ON((unsigned long)high_memory > VMALLOC_START);
13782 #endif /* double-sanity-check paranoia */
13783
13784 -#ifdef CONFIG_X86_PAE
13785 - if (!cpu_has_pae)
13786 - panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
13787 -#endif
13788 if (boot_cpu_data.wp_works_ok < 0)
13789 test_wp_bit();
13790
13791 + cpa_init();
13792 +
13793 /*
13794 * Subtle. SMP is doing it's boot stuff late (because it has to
13795 * fork idle threads) - but it also needs low mappings for the
13796 @@ -790,49 +734,35 @@ int arch_add_memory(int nid, u64 start,
13797
13798 return __add_pages(zone, start_pfn, nr_pages);
13799 }
13800 -
13801 #endif
13802
13803 -struct kmem_cache *pmd_cache;
13804 -
13805 -void __init pgtable_cache_init(void)
13806 -{
13807 - if (PTRS_PER_PMD > 1)
13808 - pmd_cache = kmem_cache_create("pmd",
13809 - PTRS_PER_PMD*sizeof(pmd_t),
13810 - PTRS_PER_PMD*sizeof(pmd_t),
13811 - SLAB_PANIC,
13812 - pmd_ctor);
13813 -}
13814 -
13815 /*
13816 * This function cannot be __init, since exceptions don't work in that
13817 * section. Put this after the callers, so that it cannot be inlined.
13818 */
13819 -static int noinline do_test_wp_bit(void)
13820 +static noinline int do_test_wp_bit(void)
13821 {
13822 char tmp_reg;
13823 int flag;
13824
13825 __asm__ __volatile__(
13826 - " movb %0,%1 \n"
13827 - "1: movb %1,%0 \n"
13828 - " xorl %2,%2 \n"
13829 + " movb %0, %1 \n"
13830 + "1: movb %1, %0 \n"
13831 + " xorl %2, %2 \n"
13832 "2: \n"
13833 - ".section __ex_table,\"a\"\n"
13834 - " .align 4 \n"
13835 - " .long 1b,2b \n"
13836 - ".previous \n"
13837 + _ASM_EXTABLE(1b,2b)
13838 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
13839 "=q" (tmp_reg),
13840 "=r" (flag)
13841 :"2" (1)
13842 :"memory");
13843 -
13844 +
13845 return flag;
13846 }
13847
13848 #ifdef CONFIG_DEBUG_RODATA
13849 +const int rodata_test_data = 0xC3;
13850 +EXPORT_SYMBOL_GPL(rodata_test_data);
13851
13852 void mark_rodata_ro(void)
13853 {
13854 @@ -845,32 +775,58 @@ void mark_rodata_ro(void)
13855 if (num_possible_cpus() <= 1)
13856 #endif
13857 {
13858 - change_page_attr(virt_to_page(start),
13859 - size >> PAGE_SHIFT, PAGE_KERNEL_RX);
13860 - printk("Write protecting the kernel text: %luk\n", size >> 10);
13861 + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
13862 + printk(KERN_INFO "Write protecting the kernel text: %luk\n",
13863 + size >> 10);
13864 +
13865 +#ifdef CONFIG_CPA_DEBUG
13866 + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
13867 + start, start+size);
13868 + set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
13869 +
13870 + printk(KERN_INFO "Testing CPA: write protecting again\n");
13871 + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
13872 +#endif
13873 }
13874 #endif
13875 start += size;
13876 size = (unsigned long)__end_rodata - start;
13877 - change_page_attr(virt_to_page(start),
13878 - size >> PAGE_SHIFT, PAGE_KERNEL_RO);
13879 - printk("Write protecting the kernel read-only data: %luk\n",
13880 - size >> 10);
13881 + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
13882 + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
13883 + size >> 10);
13884 + rodata_test();
13885 +
13886 +#ifdef CONFIG_CPA_DEBUG
13887 + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
13888 + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
13889
13890 - /*
13891 - * change_page_attr() requires a global_flush_tlb() call after it.
13892 - * We do this after the printk so that if something went wrong in the
13893 - * change, the printk gets out at least to give a better debug hint
13894 - * of who is the culprit.
13895 - */
13896 - global_flush_tlb();
13897 + printk(KERN_INFO "Testing CPA: write protecting again\n");
13898 + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
13899 +#endif
13900 }
13901 #endif
13902
13903 void free_init_pages(char *what, unsigned long begin, unsigned long end)
13904 {
13905 +#ifdef CONFIG_DEBUG_PAGEALLOC
13906 + /*
13907 + * If debugging page accesses then do not free this memory but
13908 + * mark them not present - any buggy init-section access will
13909 + * create a kernel page fault:
13910 + */
13911 + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
13912 + begin, PAGE_ALIGN(end));
13913 + set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
13914 +#else
13915 unsigned long addr;
13916
13917 + /*
13918 + * We just marked the kernel text read only above, now that
13919 + * we are going to free part of that, we need to make that
13920 + * writeable first.
13921 + */
13922 + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
13923 +
13924 for (addr = begin; addr < end; addr += PAGE_SIZE) {
13925 ClearPageReserved(virt_to_page(addr));
13926 init_page_count(virt_to_page(addr));
13927 @@ -879,6 +835,7 @@ void free_init_pages(char *what, unsigne
13928 totalram_pages++;
13929 }
13930 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
13931 +#endif
13932 }
13933
13934 void free_initmem(void)
13935 @@ -894,4 +851,3 @@ void free_initrd_mem(unsigned long start
13936 free_init_pages("initrd memory", start, end);
13937 }
13938 #endif
13939 -
13940 --- sle11-2009-05-14.orig/arch/x86/mm/init_64-xen.c 2009-02-16 16:18:36.000000000 +0100
13941 +++ sle11-2009-05-14/arch/x86/mm/init_64-xen.c 2009-03-16 16:33:40.000000000 +0100
13942 @@ -46,14 +46,13 @@
13943 #include <asm/proto.h>
13944 #include <asm/smp.h>
13945 #include <asm/sections.h>
13946 +#include <asm/kdebug.h>
13947 +#include <asm/numa.h>
13948 +#include <asm/cacheflush.h>
13949
13950 #include <xen/features.h>
13951
13952 -#ifndef Dprintk
13953 -#define Dprintk(x...)
13954 -#endif
13955 -
13956 -const struct dma_mapping_ops* dma_ops;
13957 +const struct dma_mapping_ops *dma_ops;
13958 EXPORT_SYMBOL(dma_ops);
13959
13960 #if CONFIG_XEN_COMPAT <= 0x030002
13961 @@ -80,7 +79,21 @@ extern pte_t level1_fixmap_pgt[PTRS_PER_
13962 (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
13963 __START_KERNEL_map)))
13964
13965 -static void __meminit early_make_page_readonly(void *va, unsigned int feature)
13966 +pmd_t *__init early_get_pmd(unsigned long va)
13967 +{
13968 + unsigned long addr;
13969 + unsigned long *page = (unsigned long *)init_level4_pgt;
13970 +
13971 + addr = page[pgd_index(va)];
13972 + addr_to_page(addr, page);
13973 +
13974 + addr = page[pud_index(va)];
13975 + addr_to_page(addr, page);
13976 +
13977 + return (pmd_t *)&page[pmd_index(va)];
13978 +}
13979 +
13980 +void __meminit early_make_page_readonly(void *va, unsigned int feature)
13981 {
13982 unsigned long addr, _va = (unsigned long)va;
13983 pte_t pte, *ptep;
13984 @@ -107,76 +120,6 @@ static void __meminit early_make_page_re
13985 BUG();
13986 }
13987
13988 -static void __make_page_readonly(void *va)
13989 -{
13990 - pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
13991 - unsigned long addr = (unsigned long) va;
13992 -
13993 - pgd = pgd_offset_k(addr);
13994 - pud = pud_offset(pgd, addr);
13995 - pmd = pmd_offset(pud, addr);
13996 - ptep = pte_offset_kernel(pmd, addr);
13997 -
13998 - pte.pte = ptep->pte & ~_PAGE_RW;
13999 - if (HYPERVISOR_update_va_mapping(addr, pte, 0))
14000 - xen_l1_entry_update(ptep, pte); /* fallback */
14001 -
14002 - if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
14003 - __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT));
14004 -}
14005 -
14006 -static void __make_page_writable(void *va)
14007 -{
14008 - pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
14009 - unsigned long addr = (unsigned long) va;
14010 -
14011 - pgd = pgd_offset_k(addr);
14012 - pud = pud_offset(pgd, addr);
14013 - pmd = pmd_offset(pud, addr);
14014 - ptep = pte_offset_kernel(pmd, addr);
14015 -
14016 - pte.pte = ptep->pte | _PAGE_RW;
14017 - if (HYPERVISOR_update_va_mapping(addr, pte, 0))
14018 - xen_l1_entry_update(ptep, pte); /* fallback */
14019 -
14020 - if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
14021 - __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT));
14022 -}
14023 -
14024 -void make_page_readonly(void *va, unsigned int feature)
14025 -{
14026 - if (!xen_feature(feature))
14027 - __make_page_readonly(va);
14028 -}
14029 -
14030 -void make_page_writable(void *va, unsigned int feature)
14031 -{
14032 - if (!xen_feature(feature))
14033 - __make_page_writable(va);
14034 -}
14035 -
14036 -void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
14037 -{
14038 - if (xen_feature(feature))
14039 - return;
14040 -
14041 - while (nr-- != 0) {
14042 - __make_page_readonly(va);
14043 - va = (void*)((unsigned long)va + PAGE_SIZE);
14044 - }
14045 -}
14046 -
14047 -void make_pages_writable(void *va, unsigned nr, unsigned int feature)
14048 -{
14049 - if (xen_feature(feature))
14050 - return;
14051 -
14052 - while (nr-- != 0) {
14053 - __make_page_writable(va);
14054 - va = (void*)((unsigned long)va + PAGE_SIZE);
14055 - }
14056 -}
14057 -
14058 /*
14059 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
14060 * physical space so we can cache the place of the first one and move
14061 @@ -187,22 +130,26 @@ void show_mem(void)
14062 {
14063 long i, total = 0, reserved = 0;
14064 long shared = 0, cached = 0;
14065 - pg_data_t *pgdat;
14066 struct page *page;
14067 + pg_data_t *pgdat;
14068
14069 printk(KERN_INFO "Mem-info:\n");
14070 show_free_areas();
14071 - printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
14072 + printk(KERN_INFO "Free swap: %6ldkB\n",
14073 + nr_swap_pages << (PAGE_SHIFT-10));
14074
14075 for_each_online_pgdat(pgdat) {
14076 - for (i = 0; i < pgdat->node_spanned_pages; ++i) {
14077 - /* this loop can take a while with 256 GB and 4k pages
14078 - so update the NMI watchdog */
14079 - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
14080 + for (i = 0; i < pgdat->node_spanned_pages; ++i) {
14081 + /*
14082 + * This loop can take a while with 256 GB and
14083 + * 4k pages so defer the NMI watchdog:
14084 + */
14085 + if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
14086 touch_nmi_watchdog();
14087 - }
14088 +
14089 if (!pfn_valid(pgdat->node_start_pfn + i))
14090 continue;
14091 +
14092 page = pfn_to_page(pgdat->node_start_pfn + i);
14093 total++;
14094 if (PageReserved(page))
14095 @@ -211,58 +158,67 @@ void show_mem(void)
14096 cached++;
14097 else if (page_count(page))
14098 shared += page_count(page) - 1;
14099 - }
14100 + }
14101 }
14102 - printk(KERN_INFO "%lu pages of RAM\n", total);
14103 - printk(KERN_INFO "%lu reserved pages\n",reserved);
14104 - printk(KERN_INFO "%lu pages shared\n",shared);
14105 - printk(KERN_INFO "%lu pages swap cached\n",cached);
14106 + printk(KERN_INFO "%lu pages of RAM\n", total);
14107 + printk(KERN_INFO "%lu reserved pages\n", reserved);
14108 + printk(KERN_INFO "%lu pages shared\n", shared);
14109 + printk(KERN_INFO "%lu pages swap cached\n", cached);
14110 }
14111
14112 +static unsigned long __meminitdata table_start;
14113 +static unsigned long __meminitdata table_end;
14114
14115 static __init void *spp_getpage(void)
14116 -{
14117 +{
14118 void *ptr;
14119 +
14120 if (after_bootmem)
14121 - ptr = (void *) get_zeroed_page(GFP_ATOMIC);
14122 + ptr = (void *) get_zeroed_page(GFP_ATOMIC);
14123 else if (start_pfn < table_end) {
14124 ptr = __va(start_pfn << PAGE_SHIFT);
14125 start_pfn++;
14126 memset(ptr, 0, PAGE_SIZE);
14127 } else
14128 ptr = alloc_bootmem_pages(PAGE_SIZE);
14129 - if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
14130 - panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
14131
14132 - Dprintk("spp_getpage %p\n", ptr);
14133 + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
14134 + panic("set_pte_phys: cannot allocate page data %s\n",
14135 + after_bootmem ? "after bootmem" : "");
14136 + }
14137 +
14138 + pr_debug("spp_getpage %p\n", ptr);
14139 +
14140 return ptr;
14141 -}
14142 +}
14143
14144 #define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
14145 #define pud_offset_u(address) (level3_user_pgt + pud_index(address))
14146
14147 -static __init void set_pte_phys(unsigned long vaddr,
14148 - unsigned long phys, pgprot_t prot, int user_mode)
14149 +static __init void
14150 +set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
14151 {
14152 pgd_t *pgd;
14153 pud_t *pud;
14154 pmd_t *pmd;
14155 pte_t *pte, new_pte;
14156
14157 - Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
14158 + pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
14159
14160 pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
14161 if (pgd_none(*pgd)) {
14162 - printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
14163 + printk(KERN_ERR
14164 + "PGD FIXMAP MISSING, it should be setup in head.S!\n");
14165 return;
14166 }
14167 pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
14168 if (pud_none(*pud)) {
14169 - pmd = (pmd_t *) spp_getpage();
14170 + pmd = (pmd_t *) spp_getpage();
14171 make_page_readonly(pmd, XENFEAT_writable_page_tables);
14172 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
14173 if (pmd != pmd_offset(pud, 0)) {
14174 - printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
14175 + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
14176 + pmd, pmd_offset(pud, 0));
14177 return;
14178 }
14179 }
14180 @@ -272,7 +228,7 @@ static __init void set_pte_phys(unsigned
14181 make_page_readonly(pte, XENFEAT_writable_page_tables);
14182 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
14183 if (pte != pte_offset_kernel(pmd, 0)) {
14184 - printk("PAGETABLE BUG #02!\n");
14185 + printk(KERN_ERR "PAGETABLE BUG #02!\n");
14186 return;
14187 }
14188 }
14189 @@ -294,30 +250,30 @@ static __init void set_pte_phys(unsigned
14190 __flush_tlb_one(vaddr);
14191 }
14192
14193 -static __init void set_pte_phys_ma(unsigned long vaddr,
14194 - unsigned long phys, pgprot_t prot)
14195 +static __init void
14196 +set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
14197 {
14198 pgd_t *pgd;
14199 pud_t *pud;
14200 pmd_t *pmd;
14201 pte_t *pte, new_pte;
14202
14203 - Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
14204 + pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
14205
14206 pgd = pgd_offset_k(vaddr);
14207 if (pgd_none(*pgd)) {
14208 - printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
14209 + printk(KERN_ERR
14210 + "PGD FIXMAP MISSING, it should be setup in head.S!\n");
14211 return;
14212 }
14213 pud = pud_offset(pgd, vaddr);
14214 if (pud_none(*pud)) {
14215 -
14216 - pmd = (pmd_t *) spp_getpage();
14217 + pmd = (pmd_t *) spp_getpage();
14218 make_page_readonly(pmd, XENFEAT_writable_page_tables);
14219 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
14220 if (pmd != pmd_offset(pud, 0)) {
14221 - printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
14222 - return;
14223 + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
14224 + pmd, pmd_offset(pud, 0));
14225 }
14226 }
14227 pmd = pmd_offset(pud, vaddr);
14228 @@ -326,7 +282,7 @@ static __init void set_pte_phys_ma(unsig
14229 make_page_readonly(pte, XENFEAT_writable_page_tables);
14230 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
14231 if (pte != pte_offset_kernel(pmd, 0)) {
14232 - printk("PAGETABLE BUG #02!\n");
14233 + printk(KERN_ERR "PAGETABLE BUG #02!\n");
14234 return;
14235 }
14236 }
14237 @@ -350,14 +306,44 @@ static __init void set_pte_phys_ma(unsig
14238 __flush_tlb_one(vaddr);
14239 }
14240
14241 +#ifndef CONFIG_XEN
14242 +/*
14243 + * The head.S code sets up the kernel high mapping:
14244 + *
14245 + * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
14246 + *
14247 + * phys_addr holds the negative offset to the kernel, which is added
14248 + * to the compile time generated pmds. This results in invalid pmds up
14249 + * to the point where we hit the physaddr 0 mapping.
14250 + *
14251 + * We limit the mappings to the region from _text to _end. _end is
14252 + * rounded up to the 2MB boundary. This catches the invalid pmds as
14253 + * well, as they are located before _text:
14254 + */
14255 +void __init cleanup_highmap(void)
14256 +{
14257 + unsigned long vaddr = __START_KERNEL_map;
14258 + unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
14259 + pmd_t *pmd = level2_kernel_pgt;
14260 + pmd_t *last_pmd = pmd + PTRS_PER_PMD;
14261 +
14262 + for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
14263 + if (!pmd_present(*pmd))
14264 + continue;
14265 + if (vaddr < (unsigned long) _text || vaddr > end)
14266 + set_pmd(pmd, __pmd(0));
14267 + }
14268 +}
14269 +#endif
14270 +
14271 /* NOTE: this is meant to be run only at boot */
14272 -void __init
14273 -__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
14274 +void __init
14275 +__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
14276 {
14277 unsigned long address = __fix_to_virt(idx);
14278
14279 if (idx >= __end_of_fixed_addresses) {
14280 - printk("Invalid __set_fixmap\n");
14281 + printk(KERN_ERR "Invalid __set_fixmap\n");
14282 return;
14283 }
14284 switch (idx) {
14285 @@ -375,16 +361,14 @@ __set_fixmap (enum fixed_addresses idx,
14286 }
14287 }
14288
14289 -unsigned long __meminitdata table_start, table_end;
14290 -
14291 static __meminit void *alloc_static_page(unsigned long *phys)
14292 {
14293 unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
14294
14295 if (after_bootmem) {
14296 void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
14297 -
14298 *phys = __pa(adr);
14299 +
14300 return adr;
14301 }
14302
14303 @@ -396,7 +380,7 @@ static __meminit void *alloc_static_page
14304
14305 #define PTE_SIZE PAGE_SIZE
14306
14307 -static inline int make_readonly(unsigned long paddr)
14308 +static inline int __meminit make_readonly(unsigned long paddr)
14309 {
14310 extern char __vsyscall_0;
14311 int readonly = 0;
14312 @@ -430,33 +414,38 @@ static inline int make_readonly(unsigned
14313 /* Must run before zap_low_mappings */
14314 __meminit void *early_ioremap(unsigned long addr, unsigned long size)
14315 {
14316 - unsigned long vaddr;
14317 pmd_t *pmd, *last_pmd;
14318 + unsigned long vaddr;
14319 int i, pmds;
14320
14321 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
14322 vaddr = __START_KERNEL_map;
14323 pmd = level2_kernel_pgt;
14324 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
14325 +
14326 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
14327 for (i = 0; i < pmds; i++) {
14328 if (pmd_present(pmd[i]))
14329 - goto next;
14330 + goto continue_outer_loop;
14331 }
14332 vaddr += addr & ~PMD_MASK;
14333 addr &= PMD_MASK;
14334 +
14335 for (i = 0; i < pmds; i++, addr += PMD_SIZE)
14336 - set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
14337 - __flush_tlb();
14338 + set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
14339 + __flush_tlb_all();
14340 +
14341 return (void *)vaddr;
14342 - next:
14343 +continue_outer_loop:
14344 ;
14345 }
14346 printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
14347 return NULL;
14348 }
14349
14350 -/* To avoid virtual aliases later */
14351 +/*
14352 + * To avoid virtual aliases later:
14353 + */
14354 __meminit void early_iounmap(void *addr, unsigned long size)
14355 {
14356 unsigned long vaddr;
14357 @@ -466,9 +455,11 @@ __meminit void early_iounmap(void *addr,
14358 vaddr = (unsigned long)addr;
14359 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
14360 pmd = level2_kernel_pgt + pmd_index(vaddr);
14361 +
14362 for (i = 0; i < pmds; i++)
14363 pmd_clear(pmd + i);
14364 - __flush_tlb();
14365 +
14366 + __flush_tlb_all();
14367 }
14368 #endif
14369
14370 @@ -517,18 +508,19 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
14371 static void __meminit
14372 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
14373 {
14374 - pmd_t *pmd = pmd_offset(pud,0);
14375 + pmd_t *pmd = pmd_offset(pud, 0);
14376 spin_lock(&init_mm.page_table_lock);
14377 phys_pmd_init(pmd, address, end);
14378 spin_unlock(&init_mm.page_table_lock);
14379 __flush_tlb_all();
14380 }
14381
14382 -static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
14383 -{
14384 +static void __meminit
14385 +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
14386 +{
14387 int i = pud_index(addr);
14388
14389 - for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
14390 + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
14391 unsigned long pmd_phys;
14392 pud_t *pud = pud_page + pud_index(addr);
14393 pmd_t *pmd;
14394 @@ -550,8 +542,8 @@ static void __meminit phys_pud_init(pud_
14395
14396 early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
14397 }
14398 - __flush_tlb();
14399 -}
14400 + __flush_tlb_all();
14401 +}
14402
14403 void __init xen_init_pt(void)
14404 {
14405 @@ -632,6 +624,7 @@ void __init xen_init_pt(void)
14406 static void __init extend_init_mapping(unsigned long tables_space)
14407 {
14408 unsigned long va = __START_KERNEL_map;
14409 + unsigned long start = start_pfn;
14410 unsigned long phys, addr, *pte_page;
14411 pmd_t *pmd;
14412 pte_t *pte, new_pte;
14413 @@ -682,6 +675,10 @@ static void __init extend_init_mapping(u
14414 BUG();
14415 va += PAGE_SIZE;
14416 }
14417 +
14418 + if (start_pfn > start)
14419 + reserve_early(start << PAGE_SHIFT,
14420 + start_pfn << PAGE_SHIFT, "INITMAP");
14421 }
14422
14423 static void __init find_early_table_space(unsigned long end)
14424 @@ -706,7 +703,7 @@ static void __init find_early_table_spac
14425 (table_start << PAGE_SHIFT) + tables);
14426 }
14427
14428 -static void xen_finish_init_mapping(void)
14429 +static void __init xen_finish_init_mapping(void)
14430 {
14431 unsigned long i, start, end;
14432
14433 @@ -738,13 +735,6 @@ static void xen_finish_init_mapping(void
14434 /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
14435 table_end = ~0UL;
14436
14437 - /*
14438 - * Prefetch pte's for the bt_ioremap() area. It gets used before the
14439 - * boot-time allocator is online, so allocate-on-demand would fail.
14440 - */
14441 - for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
14442 - __set_fixmap(i, 0, __pgprot(0));
14443 -
14444 /* Switch to the real shared_info page, and clear the dummy page. */
14445 set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
14446 HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
14447 @@ -764,20 +754,23 @@ static void xen_finish_init_mapping(void
14448 table_end = start_pfn;
14449 }
14450
14451 -/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
14452 - This runs before bootmem is initialized and gets pages directly from the
14453 - physical memory. To access them they are temporarily mapped. */
14454 +/*
14455 + * Setup the direct mapping of the physical memory at PAGE_OFFSET.
14456 + * This runs before bootmem is initialized and gets pages directly from
14457 + * the physical memory. To access them they are temporarily mapped.
14458 + */
14459 void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
14460 -{
14461 +{
14462 unsigned long next;
14463
14464 - Dprintk("init_memory_mapping\n");
14465 + pr_debug("init_memory_mapping\n");
14466
14467 - /*
14468 + /*
14469 * Find space for the kernel direct mapping tables.
14470 - * Later we should allocate these tables in the local node of the memory
14471 - * mapped. Unfortunately this is done currently before the nodes are
14472 - * discovered.
14473 + *
14474 + * Later we should allocate these tables in the local node of the
14475 + * memory mapped. Unfortunately this is done currently before the
14476 + * nodes are discovered.
14477 */
14478 if (!after_bootmem)
14479 find_early_table_space(end);
14480 @@ -786,8 +779,8 @@ void __init_refok init_memory_mapping(un
14481 end = (unsigned long)__va(end);
14482
14483 for (; start < end; start = next) {
14484 - unsigned long pud_phys;
14485 pgd_t *pgd = pgd_offset_k(start);
14486 + unsigned long pud_phys;
14487 pud_t *pud;
14488
14489 if (after_bootmem)
14490 @@ -795,8 +788,8 @@ void __init_refok init_memory_mapping(un
14491 else
14492 pud = alloc_static_page(&pud_phys);
14493 next = start + PGDIR_SIZE;
14494 - if (next > end)
14495 - next = end;
14496 + if (next > end)
14497 + next = end;
14498 phys_pud_init(pud, __pa(start), __pa(next));
14499 if (!after_bootmem) {
14500 early_make_page_readonly(pud, XENFEAT_writable_page_tables);
14501 @@ -810,12 +803,17 @@ void __init_refok init_memory_mapping(un
14502 }
14503
14504 __flush_tlb_all();
14505 +
14506 + if (!after_bootmem)
14507 + reserve_early(table_start << PAGE_SHIFT,
14508 + table_end << PAGE_SHIFT, "PGTABLE");
14509 }
14510
14511 #ifndef CONFIG_NUMA
14512 void __init paging_init(void)
14513 {
14514 unsigned long max_zone_pfns[MAX_NR_ZONES];
14515 +
14516 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
14517 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
14518 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
14519 @@ -829,40 +827,6 @@ void __init paging_init(void)
14520 }
14521 #endif
14522
14523 -/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
14524 - from the CPU leading to inconsistent cache lines. address and size
14525 - must be aligned to 2MB boundaries.
14526 - Does nothing when the mapping doesn't exist. */
14527 -void __init clear_kernel_mapping(unsigned long address, unsigned long size)
14528 -{
14529 - unsigned long end = address + size;
14530 -
14531 - BUG_ON(address & ~LARGE_PAGE_MASK);
14532 - BUG_ON(size & ~LARGE_PAGE_MASK);
14533 -
14534 - for (; address < end; address += LARGE_PAGE_SIZE) {
14535 - pgd_t *pgd = pgd_offset_k(address);
14536 - pud_t *pud;
14537 - pmd_t *pmd;
14538 - if (pgd_none(*pgd))
14539 - continue;
14540 - pud = pud_offset(pgd, address);
14541 - if (pud_none(*pud))
14542 - continue;
14543 - pmd = pmd_offset(pud, address);
14544 - if (!pmd || pmd_none(*pmd))
14545 - continue;
14546 - if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) {
14547 - /* Could handle this, but it should not happen currently. */
14548 - printk(KERN_ERR
14549 - "clear_kernel_mapping: mapping has been split. will leak memory\n");
14550 - pmd_ERROR(*pmd);
14551 - }
14552 - set_pmd(pmd, __pmd(0));
14553 - }
14554 - __flush_tlb_all();
14555 -}
14556 -
14557 /*
14558 * Memory hotplug specific functions
14559 */
14560 @@ -888,16 +852,12 @@ int arch_add_memory(int nid, u64 start,
14561 unsigned long nr_pages = size >> PAGE_SHIFT;
14562 int ret;
14563
14564 - init_memory_mapping(start, (start + size -1));
14565 + init_memory_mapping(start, start + size-1);
14566
14567 ret = __add_pages(zone, start_pfn, nr_pages);
14568 - if (ret)
14569 - goto error;
14570 + WARN_ON(1);
14571
14572 return ret;
14573 -error:
14574 - printk("%s: Problem encountered in __add_pages!\n", __func__);
14575 - return ret;
14576 }
14577 EXPORT_SYMBOL_GPL(arch_add_memory);
14578
14579 @@ -911,36 +871,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to
14580
14581 #endif /* CONFIG_MEMORY_HOTPLUG */
14582
14583 -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
14584 -/*
14585 - * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
14586 - * just online the pages.
14587 - */
14588 -int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
14589 -{
14590 - int err = -EIO;
14591 - unsigned long pfn;
14592 - unsigned long total = 0, mem = 0;
14593 - for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
14594 - if (pfn_valid(pfn)) {
14595 - online_page(pfn_to_page(pfn));
14596 - err = 0;
14597 - mem++;
14598 - }
14599 - total++;
14600 - }
14601 - if (!err) {
14602 - z->spanned_pages += total;
14603 - z->present_pages += mem;
14604 - z->zone_pgdat->node_spanned_pages += total;
14605 - z->zone_pgdat->node_present_pages += mem;
14606 - }
14607 - return err;
14608 -}
14609 -#endif
14610 -
14611 -static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
14612 - kcore_vsyscall;
14613 +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
14614 + kcore_modules, kcore_vsyscall;
14615
14616 void __init mem_init(void)
14617 {
14618 @@ -949,8 +881,7 @@ void __init mem_init(void)
14619
14620 pci_iommu_alloc();
14621
14622 - /* clear the zero-page */
14623 - memset(empty_zero_page, 0, PAGE_SIZE);
14624 + /* clear_bss() already clear the empty_zero_page */
14625
14626 reservedpages = 0;
14627
14628 @@ -968,7 +899,6 @@ void __init mem_init(void)
14629 }
14630 reservedpages = end_pfn - totalram_pages -
14631 absent_pages_in_range(0, end_pfn);
14632 -
14633 after_bootmem = 1;
14634
14635 codesize = (unsigned long) &_etext - (unsigned long) &_text;
14636 @@ -976,46 +906,64 @@ void __init mem_init(void)
14637 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
14638
14639 /* Register memory areas for /proc/kcore */
14640 - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
14641 - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
14642 + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
14643 + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
14644 VMALLOC_END-VMALLOC_START);
14645 kclist_add(&kcore_kernel, &_stext, _end - _stext);
14646 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
14647 - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
14648 + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
14649 VSYSCALL_END - VSYSCALL_START);
14650
14651 - printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
14652 + printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
14653 + "%ldk reserved, %ldk data, %ldk init)\n",
14654 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
14655 end_pfn << (PAGE_SHIFT-10),
14656 codesize >> 10,
14657 reservedpages << (PAGE_SHIFT-10),
14658 datasize >> 10,
14659 initsize >> 10);
14660 +
14661 + cpa_init();
14662 }
14663
14664 void free_init_pages(char *what, unsigned long begin, unsigned long end)
14665 {
14666 - unsigned long addr;
14667 + unsigned long addr = begin;
14668
14669 - if (begin >= end)
14670 + if (addr >= end)
14671 return;
14672
14673 + /*
14674 + * If debugging page accesses then do not free this memory but
14675 + * mark them not present - any buggy init-section access will
14676 + * create a kernel page fault:
14677 + */
14678 +#ifdef CONFIG_DEBUG_PAGEALLOC
14679 + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
14680 + begin, PAGE_ALIGN(end));
14681 + set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
14682 +#else
14683 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
14684 - for (addr = begin; addr < end; addr += PAGE_SIZE) {
14685 +
14686 + for (; addr < end; addr += PAGE_SIZE) {
14687 ClearPageReserved(virt_to_page(addr));
14688 init_page_count(virt_to_page(addr));
14689 memset((void *)(addr & ~(PAGE_SIZE-1)),
14690 POISON_FREE_INITMEM, PAGE_SIZE);
14691 if (addr >= __START_KERNEL_map) {
14692 /* make_readonly() reports all kernel addresses. */
14693 - __make_page_writable(__va(__pa(addr)));
14694 - change_page_attr_addr(addr, 1, __pgprot(0));
14695 + if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)),
14696 + pfn_pte(__pa(addr) >> PAGE_SHIFT,
14697 + PAGE_KERNEL),
14698 + 0))
14699 + BUG();
14700 + if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
14701 + BUG();
14702 }
14703 free_page(addr);
14704 totalram_pages++;
14705 }
14706 - if (addr > __START_KERNEL_map)
14707 - global_flush_tlb();
14708 +#endif
14709 }
14710
14711 void free_initmem(void)
14712 @@ -1026,6 +974,8 @@ void free_initmem(void)
14713 }
14714
14715 #ifdef CONFIG_DEBUG_RODATA
14716 +const int rodata_test_data = 0xC3;
14717 +EXPORT_SYMBOL_GPL(rodata_test_data);
14718
14719 void mark_rodata_ro(void)
14720 {
14721 @@ -1047,18 +997,27 @@ void mark_rodata_ro(void)
14722 if (end <= start)
14723 return;
14724
14725 - change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
14726
14727 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
14728 (end - start) >> 10);
14729 + set_memory_ro(start, (end - start) >> PAGE_SHIFT);
14730
14731 /*
14732 - * change_page_attr_addr() requires a global_flush_tlb() call after it.
14733 - * We do this after the printk so that if something went wrong in the
14734 - * change, the printk gets out at least to give a better debug hint
14735 - * of who is the culprit.
14736 + * The rodata section (but not the kernel text!) should also be
14737 + * not-executable.
14738 */
14739 - global_flush_tlb();
14740 + start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
14741 + set_memory_nx(start, (end - start) >> PAGE_SHIFT);
14742 +
14743 + rodata_test();
14744 +
14745 +#ifdef CONFIG_CPA_DEBUG
14746 + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
14747 + set_memory_rw(start, (end-start) >> PAGE_SHIFT);
14748 +
14749 + printk(KERN_INFO "Testing CPA: again\n");
14750 + set_memory_ro(start, (end-start) >> PAGE_SHIFT);
14751 +#endif
14752 }
14753 #endif
14754
14755 @@ -1069,17 +1028,21 @@ void free_initrd_mem(unsigned long start
14756 }
14757 #endif
14758
14759 -void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
14760 -{
14761 +void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
14762 +{
14763 #ifdef CONFIG_NUMA
14764 int nid = phys_to_nid(phys);
14765 #endif
14766 unsigned long pfn = phys >> PAGE_SHIFT;
14767 +
14768 if (pfn >= end_pfn) {
14769 - /* This can happen with kdump kernels when accessing firmware
14770 - tables. */
14771 + /*
14772 + * This can happen with kdump kernels when accessing
14773 + * firmware tables:
14774 + */
14775 if (pfn < end_pfn_map)
14776 return;
14777 +
14778 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
14779 phys, len);
14780 return;
14781 @@ -1087,9 +1050,9 @@ void __init reserve_bootmem_generic(unsi
14782
14783 /* Should check here against the e820 map to avoid double free */
14784 #ifdef CONFIG_NUMA
14785 - reserve_bootmem_node(NODE_DATA(nid), phys, len);
14786 -#else
14787 - reserve_bootmem(phys, len);
14788 + reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
14789 +#else
14790 + reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
14791 #endif
14792 #ifndef CONFIG_XEN
14793 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
14794 @@ -1099,46 +1062,49 @@ void __init reserve_bootmem_generic(unsi
14795 #endif
14796 }
14797
14798 -int kern_addr_valid(unsigned long addr)
14799 -{
14800 +int kern_addr_valid(unsigned long addr)
14801 +{
14802 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
14803 - pgd_t *pgd;
14804 - pud_t *pud;
14805 - pmd_t *pmd;
14806 - pte_t *pte;
14807 + pgd_t *pgd;
14808 + pud_t *pud;
14809 + pmd_t *pmd;
14810 + pte_t *pte;
14811
14812 if (above != 0 && above != -1UL)
14813 - return 0;
14814 -
14815 + return 0;
14816 +
14817 pgd = pgd_offset_k(addr);
14818 if (pgd_none(*pgd))
14819 return 0;
14820
14821 pud = pud_offset(pgd, addr);
14822 if (pud_none(*pud))
14823 - return 0;
14824 + return 0;
14825
14826 pmd = pmd_offset(pud, addr);
14827 if (pmd_none(*pmd))
14828 return 0;
14829 +
14830 if (pmd_large(*pmd))
14831 return pfn_valid(pmd_pfn(*pmd));
14832
14833 pte = pte_offset_kernel(pmd, addr);
14834 if (pte_none(*pte))
14835 return 0;
14836 +
14837 return pfn_valid(pte_pfn(*pte));
14838 }
14839
14840 -/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
14841 - covers the 64bit vsyscall page now. 32bit has a real VMA now and does
14842 - not need special handling anymore. */
14843 -
14844 +/*
14845 + * A pseudo VMA to allow ptrace access for the vsyscall page. This only
14846 + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
14847 + * not need special handling anymore:
14848 + */
14849 static struct vm_area_struct gate_vma = {
14850 - .vm_start = VSYSCALL_START,
14851 - .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
14852 - .vm_page_prot = PAGE_READONLY_EXEC,
14853 - .vm_flags = VM_READ | VM_EXEC
14854 + .vm_start = VSYSCALL_START,
14855 + .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
14856 + .vm_page_prot = PAGE_READONLY_EXEC,
14857 + .vm_flags = VM_READ | VM_EXEC
14858 };
14859
14860 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
14861 @@ -1153,14 +1119,17 @@ struct vm_area_struct *get_gate_vma(stru
14862 int in_gate_area(struct task_struct *task, unsigned long addr)
14863 {
14864 struct vm_area_struct *vma = get_gate_vma(task);
14865 +
14866 if (!vma)
14867 return 0;
14868 +
14869 return (addr >= vma->vm_start) && (addr < vma->vm_end);
14870 }
14871
14872 -/* Use this when you have no reliable task/vma, typically from interrupt
14873 - * context. It is less reliable than using the task's vma and may give
14874 - * false positives.
14875 +/*
14876 + * Use this when you have no reliable task/vma, typically from interrupt
14877 + * context. It is less reliable than using the task's vma and may give
14878 + * false positives:
14879 */
14880 int in_gate_area_no_task(unsigned long addr)
14881 {
14882 @@ -1180,8 +1149,8 @@ const char *arch_vma_name(struct vm_area
14883 /*
14884 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
14885 */
14886 -int __meminit vmemmap_populate(struct page *start_page,
14887 - unsigned long size, int node)
14888 +int __meminit
14889 +vmemmap_populate(struct page *start_page, unsigned long size, int node)
14890 {
14891 unsigned long addr = (unsigned long)start_page;
14892 unsigned long end = (unsigned long)(start_page + size);
14893 @@ -1196,6 +1165,7 @@ int __meminit vmemmap_populate(struct pa
14894 pgd = vmemmap_pgd_populate(addr, node);
14895 if (!pgd)
14896 return -ENOMEM;
14897 +
14898 pud = vmemmap_pud_populate(pgd, addr, node);
14899 if (!pud)
14900 return -ENOMEM;
14901 @@ -1203,20 +1173,22 @@ int __meminit vmemmap_populate(struct pa
14902 pmd = pmd_offset(pud, addr);
14903 if (pmd_none(*pmd)) {
14904 pte_t entry;
14905 - void *p = vmemmap_alloc_block(PMD_SIZE, node);
14906 + void *p;
14907 +
14908 + p = vmemmap_alloc_block(PMD_SIZE, node);
14909 if (!p)
14910 return -ENOMEM;
14911
14912 - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
14913 - mk_pte_huge(entry);
14914 - set_pmd(pmd, __pmd(pte_val(entry)));
14915 + entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
14916 + PAGE_KERNEL_LARGE);
14917 + set_pmd(pmd, __pmd_ma(__pte_val(entry)));
14918
14919 printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
14920 addr, addr + PMD_SIZE - 1, p, node);
14921 - } else
14922 + } else {
14923 vmemmap_verify((pte_t *)pmd, node, addr, next);
14924 + }
14925 }
14926 -
14927 return 0;
14928 }
14929 #endif
14930 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
14931 +++ sle11-2009-05-14/arch/x86/mm/ioremap-xen.c 2009-03-16 16:33:40.000000000 +0100
14932 @@ -0,0 +1,687 @@
14933 +/*
14934 + * Re-map IO memory to kernel address space so that we can access it.
14935 + * This is needed for high PCI addresses that aren't mapped in the
14936 + * 640k-1MB IO memory area on PC's
14937 + *
14938 + * (C) Copyright 1995 1996 Linus Torvalds
14939 + */
14940 +
14941 +#include <linux/bootmem.h>
14942 +#include <linux/init.h>
14943 +#include <linux/io.h>
14944 +#include <linux/module.h>
14945 +#include <linux/pfn.h>
14946 +#include <linux/slab.h>
14947 +#include <linux/vmalloc.h>
14948 +
14949 +#include <asm/cacheflush.h>
14950 +#include <asm/e820.h>
14951 +#include <asm/fixmap.h>
14952 +#include <asm/pgtable.h>
14953 +#include <asm/tlbflush.h>
14954 +#include <asm/pgalloc.h>
14955 +
14956 +enum ioremap_mode {
14957 + IOR_MODE_UNCACHED,
14958 + IOR_MODE_CACHED,
14959 +};
14960 +
14961 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
14962 +
14963 +unsigned long __phys_addr(unsigned long x)
14964 +{
14965 + if (x >= __START_KERNEL_map)
14966 + return x - __START_KERNEL_map + phys_base;
14967 + return x - PAGE_OFFSET;
14968 +}
14969 +EXPORT_SYMBOL(__phys_addr);
14970 +
14971 +#endif
14972 +
14973 +static int direct_remap_area_pte_fn(pte_t *pte,
14974 + struct page *pmd_page,
14975 + unsigned long address,
14976 + void *data)
14977 +{
14978 + mmu_update_t **v = (mmu_update_t **)data;
14979 +
14980 + BUG_ON(!pte_none(*pte));
14981 +
14982 + (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
14983 + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
14984 + (*v)++;
14985 +
14986 + return 0;
14987 +}
14988 +
14989 +static int __direct_remap_pfn_range(struct mm_struct *mm,
14990 + unsigned long address,
14991 + unsigned long mfn,
14992 + unsigned long size,
14993 + pgprot_t prot,
14994 + domid_t domid)
14995 +{
14996 + int rc;
14997 + unsigned long i, start_address;
14998 + mmu_update_t *u, *v, *w;
14999 +
15000 + u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
15001 + if (u == NULL)
15002 + return -ENOMEM;
15003 +
15004 + start_address = address;
15005 +
15006 + flush_cache_all();
15007 +
15008 + for (i = 0; i < size; i += PAGE_SIZE) {
15009 + if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
15010 + /* Flush a full batch after filling in the PTE ptrs. */
15011 + rc = apply_to_page_range(mm, start_address,
15012 + address - start_address,
15013 + direct_remap_area_pte_fn, &w);
15014 + if (rc)
15015 + goto out;
15016 + rc = -EFAULT;
15017 + if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
15018 + goto out;
15019 + v = w = u;
15020 + start_address = address;
15021 + }
15022 +
15023 + /*
15024 + * Fill in the machine address: PTE ptr is done later by
15025 + * apply_to_page_range().
15026 + */
15027 + v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
15028 +
15029 + mfn++;
15030 + address += PAGE_SIZE;
15031 + v++;
15032 + }
15033 +
15034 + if (v != u) {
15035 + /* Final batch. */
15036 + rc = apply_to_page_range(mm, start_address,
15037 + address - start_address,
15038 + direct_remap_area_pte_fn, &w);
15039 + if (rc)
15040 + goto out;
15041 + rc = -EFAULT;
15042 + if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
15043 + goto out;
15044 + }
15045 +
15046 + rc = 0;
15047 +
15048 + out:
15049 + flush_tlb_all();
15050 +
15051 + free_page((unsigned long)u);
15052 +
15053 + return rc;
15054 +}
15055 +
15056 +int direct_remap_pfn_range(struct vm_area_struct *vma,
15057 + unsigned long address,
15058 + unsigned long mfn,
15059 + unsigned long size,
15060 + pgprot_t prot,
15061 + domid_t domid)
15062 +{
15063 + if (xen_feature(XENFEAT_auto_translated_physmap))
15064 + return remap_pfn_range(vma, address, mfn, size, prot);
15065 +
15066 + if (domid == DOMID_SELF)
15067 + return -EINVAL;
15068 +
15069 + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
15070 +
15071 + vma->vm_mm->context.has_foreign_mappings = 1;
15072 +
15073 + return __direct_remap_pfn_range(
15074 + vma->vm_mm, address, mfn, size, prot, domid);
15075 +}
15076 +EXPORT_SYMBOL(direct_remap_pfn_range);
15077 +
15078 +int direct_kernel_remap_pfn_range(unsigned long address,
15079 + unsigned long mfn,
15080 + unsigned long size,
15081 + pgprot_t prot,
15082 + domid_t domid)
15083 +{
15084 + return __direct_remap_pfn_range(
15085 + &init_mm, address, mfn, size, prot, domid);
15086 +}
15087 +EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
15088 +
15089 +static int lookup_pte_fn(
15090 + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15091 +{
15092 + uint64_t *ptep = (uint64_t *)data;
15093 + if (ptep)
15094 + *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15095 + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15096 + return 0;
15097 +}
15098 +
15099 +int create_lookup_pte_addr(struct mm_struct *mm,
15100 + unsigned long address,
15101 + uint64_t *ptep)
15102 +{
15103 + return apply_to_page_range(mm, address, PAGE_SIZE,
15104 + lookup_pte_fn, ptep);
15105 +}
15106 +
15107 +EXPORT_SYMBOL(create_lookup_pte_addr);
15108 +
15109 +static int noop_fn(
15110 + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15111 +{
15112 + return 0;
15113 +}
15114 +
15115 +int touch_pte_range(struct mm_struct *mm,
15116 + unsigned long address,
15117 + unsigned long size)
15118 +{
15119 + return apply_to_page_range(mm, address, size, noop_fn, NULL);
15120 +}
15121 +
15122 +EXPORT_SYMBOL(touch_pte_range);
15123 +
15124 +#ifdef CONFIG_X86_32
15125 +int page_is_ram(unsigned long pagenr)
15126 +{
15127 + unsigned long addr, end;
15128 + int i;
15129 +
15130 +#ifndef CONFIG_XEN
15131 + /*
15132 + * A special case is the first 4Kb of memory;
15133 + * This is a BIOS owned area, not kernel ram, but generally
15134 + * not listed as such in the E820 table.
15135 + */
15136 + if (pagenr == 0)
15137 + return 0;
15138 +
15139 + /*
15140 + * Second special case: Some BIOSen report the PC BIOS
15141 + * area (640->1Mb) as ram even though it is not.
15142 + */
15143 + if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
15144 + pagenr < (BIOS_END >> PAGE_SHIFT))
15145 + return 0;
15146 +#endif
15147 +
15148 + for (i = 0; i < e820.nr_map; i++) {
15149 + /*
15150 + * Not usable memory:
15151 + */
15152 + if (e820.map[i].type != E820_RAM)
15153 + continue;
15154 + addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
15155 + end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
15156 +
15157 +
15158 + if ((pagenr >= addr) && (pagenr < end))
15159 + return 1;
15160 + }
15161 + return 0;
15162 +}
15163 +#endif
15164 +
15165 +/*
15166 + * Fix up the linear direct mapping of the kernel to avoid cache attribute
15167 + * conflicts.
15168 + */
15169 +static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
15170 + enum ioremap_mode mode)
15171 +{
15172 + unsigned long nrpages = size >> PAGE_SHIFT;
15173 + int err;
15174 +
15175 + switch (mode) {
15176 + case IOR_MODE_UNCACHED:
15177 + default:
15178 + err = set_memory_uc(vaddr, nrpages);
15179 + break;
15180 + case IOR_MODE_CACHED:
15181 + err = set_memory_wb(vaddr, nrpages);
15182 + break;
15183 + }
15184 +
15185 + return err;
15186 +}
15187 +
15188 +/*
15189 + * Remap an arbitrary physical address space into the kernel virtual
15190 + * address space. Needed when the kernel wants to access high addresses
15191 + * directly.
15192 + *
15193 + * NOTE! We need to allow non-page-aligned mappings too: we will obviously
15194 + * have to convert them into an offset in a page-aligned mapping, but the
15195 + * caller shouldn't need to know that small detail.
15196 + */
15197 +static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
15198 + enum ioremap_mode mode)
15199 +{
15200 + unsigned long mfn, offset, last_addr, vaddr;
15201 + struct vm_struct *area;
15202 + pgprot_t prot;
15203 + domid_t domid = DOMID_IO;
15204 +
15205 + /* Don't allow wraparound or zero size */
15206 + last_addr = phys_addr + size - 1;
15207 + if (!size || last_addr < phys_addr)
15208 + return NULL;
15209 +
15210 + /*
15211 + * Don't remap the low PCI/ISA area, it's always mapped..
15212 + */
15213 + if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
15214 + return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
15215 +
15216 + /*
15217 + * Don't allow anybody to remap normal RAM that we're using..
15218 + */
15219 + for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) {
15220 + unsigned long pfn = mfn_to_local_pfn(mfn);
15221 +
15222 + if (pfn >= max_pfn)
15223 + continue;
15224 +
15225 + domid = DOMID_SELF;
15226 +
15227 + if (pfn >= max_pfn_mapped) /* bogus */
15228 + continue;
15229 +
15230 + if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
15231 + return NULL;
15232 + }
15233 +
15234 + switch (mode) {
15235 + case IOR_MODE_UNCACHED:
15236 + default:
15237 + /*
15238 + * FIXME: we will use UC MINUS for now, as video fb drivers
15239 + * depend on it. Upcoming ioremap_wc() will fix this behavior.
15240 + */
15241 + prot = PAGE_KERNEL_UC_MINUS;
15242 + break;
15243 + case IOR_MODE_CACHED:
15244 + prot = PAGE_KERNEL;
15245 + break;
15246 + }
15247 +
15248 + /*
15249 + * Mappings have to be page-aligned
15250 + */
15251 + offset = phys_addr & ~PAGE_MASK;
15252 + phys_addr &= PAGE_MASK;
15253 + size = PAGE_ALIGN(last_addr+1) - phys_addr;
15254 +
15255 + /*
15256 + * Ok, go for it..
15257 + */
15258 + area = get_vm_area(size, VM_IOREMAP | (mode << 20));
15259 + if (!area)
15260 + return NULL;
15261 + area->phys_addr = phys_addr;
15262 + vaddr = (unsigned long) area->addr;
15263 + if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
15264 + size, prot, domid)) {
15265 + free_vm_area(area);
15266 + return NULL;
15267 + }
15268 +
15269 + if (ioremap_change_attr(vaddr, size, mode) < 0) {
15270 + iounmap((void __iomem *) vaddr);
15271 + return NULL;
15272 + }
15273 +
15274 + return (void __iomem *) (vaddr + offset);
15275 +}
15276 +
15277 +/**
15278 + * ioremap_nocache - map bus memory into CPU space
15279 + * @offset: bus address of the memory
15280 + * @size: size of the resource to map
15281 + *
15282 + * ioremap_nocache performs a platform specific sequence of operations to
15283 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
15284 + * writew/writel functions and the other mmio helpers. The returned
15285 + * address is not guaranteed to be usable directly as a virtual
15286 + * address.
15287 + *
15288 + * This version of ioremap ensures that the memory is marked uncachable
15289 + * on the CPU as well as honouring existing caching rules from things like
15290 + * the PCI bus. Note that there are other caches and buffers on many
15291 + * busses. In particular driver authors should read up on PCI writes
15292 + *
15293 + * It's useful if some control registers are in such an area and
15294 + * write combining or read caching is not desirable:
15295 + *
15296 + * Must be freed with iounmap.
15297 + */
15298 +void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
15299 +{
15300 + return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
15301 +}
15302 +EXPORT_SYMBOL(ioremap_nocache);
15303 +
15304 +void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
15305 +{
15306 + return __ioremap(phys_addr, size, IOR_MODE_CACHED);
15307 +}
15308 +EXPORT_SYMBOL(ioremap_cache);
15309 +
15310 +/**
15311 + * iounmap - Free a IO remapping
15312 + * @addr: virtual address from ioremap_*
15313 + *
15314 + * Caller must ensure there is only one unmapping for the same pointer.
15315 + */
15316 +void iounmap(volatile void __iomem *addr)
15317 +{
15318 + struct vm_struct *p, *o;
15319 +
15320 + if ((void __force *)addr <= high_memory)
15321 + return;
15322 +
15323 + /*
15324 + * __ioremap special-cases the PCI/ISA range by not instantiating a
15325 + * vm_area and by simply returning an address into the kernel mapping
15326 + * of ISA space. So handle that here.
15327 + */
15328 + if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
15329 + return;
15330 +
15331 + addr = (volatile void __iomem *)
15332 + (PAGE_MASK & (unsigned long __force)addr);
15333 +
15334 + /* Use the vm area unlocked, assuming the caller
15335 + ensures there isn't another iounmap for the same address
15336 + in parallel. Reuse of the virtual address is prevented by
15337 + leaving it in the global lists until we're done with it.
15338 + cpa takes care of the direct mappings. */
15339 + read_lock(&vmlist_lock);
15340 + for (p = vmlist; p; p = p->next) {
15341 + if (p->addr == addr)
15342 + break;
15343 + }
15344 + read_unlock(&vmlist_lock);
15345 +
15346 + if (!p) {
15347 + printk(KERN_ERR "iounmap: bad address %p\n", addr);
15348 + dump_stack();
15349 + return;
15350 + }
15351 +
15352 + if ((p->flags >> 20) != IOR_MODE_CACHED) {
15353 + unsigned long n = get_vm_area_size(p) >> PAGE_SHIFT;
15354 + unsigned long mfn = p->phys_addr;
15355 + unsigned long va = (unsigned long)addr;
15356 +
15357 + for (; n > 0; n--, mfn++, va += PAGE_SIZE)
15358 + if (mfn_to_local_pfn(mfn) < max_pfn)
15359 + set_memory_wb(va, 1);
15360 + }
15361 +
15362 + /* Finally remove it */
15363 + o = remove_vm_area((void *)addr);
15364 + BUG_ON(p != o || o == NULL);
15365 + kfree(p);
15366 +}
15367 +EXPORT_SYMBOL(iounmap);
15368 +
15369 +int __initdata early_ioremap_debug;
15370 +
15371 +static int __init early_ioremap_debug_setup(char *str)
15372 +{
15373 + early_ioremap_debug = 1;
15374 +
15375 + return 0;
15376 +}
15377 +early_param("early_ioremap_debug", early_ioremap_debug_setup);
15378 +
15379 +static __initdata int after_paging_init;
15380 +static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
15381 + __attribute__((aligned(PAGE_SIZE)));
15382 +
15383 +#ifdef CONFIG_X86_32
15384 +static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
15385 +{
15386 + /* Don't assume we're using swapper_pg_dir at this point */
15387 + pgd_t *base = __va(read_cr3());
15388 + pgd_t *pgd = &base[pgd_index(addr)];
15389 + pud_t *pud = pud_offset(pgd, addr);
15390 + pmd_t *pmd = pmd_offset(pud, addr);
15391 +
15392 + return pmd;
15393 +}
15394 +#else
15395 +#define early_ioremap_pmd early_get_pmd
15396 +#define make_lowmem_page_readonly early_make_page_readonly
15397 +#define make_lowmem_page_writable make_page_writable
15398 +#endif
15399 +
15400 +static inline pte_t * __init early_ioremap_pte(unsigned long addr)
15401 +{
15402 + return &bm_pte[pte_index(addr)];
15403 +}
15404 +
15405 +void __init early_ioremap_init(void)
15406 +{
15407 + pmd_t *pmd;
15408 +
15409 + if (early_ioremap_debug)
15410 + printk(KERN_INFO "early_ioremap_init()\n");
15411 +
15412 + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
15413 + memset(bm_pte, 0, sizeof(bm_pte));
15414 + make_lowmem_page_readonly(bm_pte, XENFEAT_writable_page_tables);
15415 + pmd_populate_kernel(&init_mm, pmd, bm_pte);
15416 +
15417 + /*
15418 + * The boot-ioremap range spans multiple pmds, for which
15419 + * we are not prepared:
15420 + */
15421 + if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
15422 + WARN_ON(1);
15423 + printk(KERN_WARNING "pmd %p != %p\n",
15424 + pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
15425 + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
15426 + fix_to_virt(FIX_BTMAP_BEGIN));
15427 + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
15428 + fix_to_virt(FIX_BTMAP_END));
15429 +
15430 + printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
15431 + printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n",
15432 + FIX_BTMAP_BEGIN);
15433 + }
15434 +}
15435 +
15436 +#ifdef CONFIG_X86_32
15437 +void __init early_ioremap_clear(void)
15438 +{
15439 + pmd_t *pmd;
15440 +
15441 + if (early_ioremap_debug)
15442 + printk(KERN_INFO "early_ioremap_clear()\n");
15443 +
15444 + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
15445 + pmd_clear(pmd);
15446 + make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables);
15447 + /* paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); */
15448 + __flush_tlb_all();
15449 +}
15450 +
15451 +void __init early_ioremap_reset(void)
15452 +{
15453 + enum fixed_addresses idx;
15454 + unsigned long addr, phys;
15455 + pte_t *pte;
15456 +
15457 + after_paging_init = 1;
15458 + for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
15459 + addr = fix_to_virt(idx);
15460 + pte = early_ioremap_pte(addr);
15461 + if (pte_present(*pte)) {
15462 + phys = __pte_val(*pte) & PAGE_MASK;
15463 + set_fixmap(idx, phys);
15464 + }
15465 + }
15466 +}
15467 +#endif /* CONFIG_X86_32 */
15468 +
15469 +static void __init __early_set_fixmap(enum fixed_addresses idx,
15470 + unsigned long phys, pgprot_t flags)
15471 +{
15472 + unsigned long addr = __fix_to_virt(idx);
15473 + pte_t *pte;
15474 +
15475 + if (idx >= __end_of_fixed_addresses) {
15476 + BUG();
15477 + return;
15478 + }
15479 + pte = early_ioremap_pte(addr);
15480 + if (pgprot_val(flags))
15481 + set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
15482 + else
15483 + pte_clear(NULL, addr, pte);
15484 + __flush_tlb_one(addr);
15485 +}
15486 +
15487 +static inline void __init early_set_fixmap(enum fixed_addresses idx,
15488 + unsigned long phys)
15489 +{
15490 + if (after_paging_init)
15491 + set_fixmap(idx, phys);
15492 + else
15493 + __early_set_fixmap(idx, phys, PAGE_KERNEL);
15494 +}
15495 +
15496 +static inline void __init early_clear_fixmap(enum fixed_addresses idx)
15497 +{
15498 + if (after_paging_init)
15499 + clear_fixmap(idx);
15500 + else
15501 + __early_set_fixmap(idx, 0, __pgprot(0));
15502 +}
15503 +
15504 +
15505 +int __initdata early_ioremap_nested;
15506 +
15507 +static int __init check_early_ioremap_leak(void)
15508 +{
15509 + if (!early_ioremap_nested)
15510 + return 0;
15511 +
15512 + printk(KERN_WARNING
15513 + "Debug warning: early ioremap leak of %d areas detected.\n",
15514 + early_ioremap_nested);
15515 + printk(KERN_WARNING
15516 + "please boot with early_ioremap_debug and report the dmesg.\n");
15517 + WARN_ON(1);
15518 +
15519 + return 1;
15520 +}
15521 +late_initcall(check_early_ioremap_leak);
15522 +
15523 +void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
15524 +{
15525 + unsigned long offset, last_addr;
15526 + unsigned int nrpages, nesting;
15527 + enum fixed_addresses idx0, idx;
15528 +
15529 + WARN_ON(system_state != SYSTEM_BOOTING);
15530 +
15531 + nesting = early_ioremap_nested;
15532 + if (early_ioremap_debug) {
15533 + printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
15534 + phys_addr, size, nesting);
15535 + dump_stack();
15536 + }
15537 +
15538 + /* Don't allow wraparound or zero size */
15539 + last_addr = phys_addr + size - 1;
15540 + if (!size || last_addr < phys_addr) {
15541 + WARN_ON(1);
15542 + return NULL;
15543 + }
15544 +
15545 + if (nesting >= FIX_BTMAPS_NESTING) {
15546 + WARN_ON(1);
15547 + return NULL;
15548 + }
15549 + early_ioremap_nested++;
15550 + /*
15551 + * Mappings have to be page-aligned
15552 + */
15553 + offset = phys_addr & ~PAGE_MASK;
15554 + phys_addr &= PAGE_MASK;
15555 + size = PAGE_ALIGN(last_addr) - phys_addr;
15556 +
15557 + /*
15558 + * Mappings have to fit in the FIX_BTMAP area.
15559 + */
15560 + nrpages = size >> PAGE_SHIFT;
15561 + if (nrpages > NR_FIX_BTMAPS) {
15562 + WARN_ON(1);
15563 + return NULL;
15564 + }
15565 +
15566 + /*
15567 + * Ok, go for it..
15568 + */
15569 + idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
15570 + idx = idx0;
15571 + while (nrpages > 0) {
15572 + early_set_fixmap(idx, phys_addr);
15573 + phys_addr += PAGE_SIZE;
15574 + --idx;
15575 + --nrpages;
15576 + }
15577 + if (early_ioremap_debug)
15578 + printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
15579 +
15580 + return (void *) (offset + fix_to_virt(idx0));
15581 +}
15582 +
15583 +void __init early_iounmap(void *addr, unsigned long size)
15584 +{
15585 + unsigned long virt_addr;
15586 + unsigned long offset;
15587 + unsigned int nrpages;
15588 + enum fixed_addresses idx;
15589 + unsigned int nesting;
15590 +
15591 + nesting = --early_ioremap_nested;
15592 + WARN_ON(nesting < 0);
15593 +
15594 + if (early_ioremap_debug) {
15595 + printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
15596 + size, nesting);
15597 + dump_stack();
15598 + }
15599 +
15600 + virt_addr = (unsigned long)addr;
15601 + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
15602 + WARN_ON(1);
15603 + return;
15604 + }
15605 + offset = virt_addr & ~PAGE_MASK;
15606 + nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
15607 +
15608 + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
15609 + while (nrpages > 0) {
15610 + early_clear_fixmap(idx);
15611 + --idx;
15612 + --nrpages;
15613 + }
15614 +}
15615 +
15616 +void __this_fixmap_does_not_exist(void)
15617 +{
15618 + WARN_ON(1);
15619 +}
15620 --- sle11-2009-05-14.orig/arch/x86/mm/ioremap_32-xen.c 2009-02-16 16:17:21.000000000 +0100
15621 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
15622 @@ -1,445 +0,0 @@
15623 -/*
15624 - * arch/i386/mm/ioremap.c
15625 - *
15626 - * Re-map IO memory to kernel address space so that we can access it.
15627 - * This is needed for high PCI addresses that aren't mapped in the
15628 - * 640k-1MB IO memory area on PC's
15629 - *
15630 - * (C) Copyright 1995 1996 Linus Torvalds
15631 - */
15632 -
15633 -#include <linux/vmalloc.h>
15634 -#include <linux/init.h>
15635 -#include <linux/slab.h>
15636 -#include <linux/module.h>
15637 -#include <linux/io.h>
15638 -#include <linux/sched.h>
15639 -#include <asm/fixmap.h>
15640 -#include <asm/cacheflush.h>
15641 -#include <asm/tlbflush.h>
15642 -#include <asm/pgtable.h>
15643 -#include <asm/pgalloc.h>
15644 -
15645 -#define ISA_START_ADDRESS 0x0
15646 -#define ISA_END_ADDRESS 0x100000
15647 -
15648 -static int direct_remap_area_pte_fn(pte_t *pte,
15649 - struct page *pmd_page,
15650 - unsigned long address,
15651 - void *data)
15652 -{
15653 - mmu_update_t **v = (mmu_update_t **)data;
15654 -
15655 - BUG_ON(!pte_none(*pte));
15656 -
15657 - (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15658 - PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15659 - (*v)++;
15660 -
15661 - return 0;
15662 -}
15663 -
15664 -static int __direct_remap_pfn_range(struct mm_struct *mm,
15665 - unsigned long address,
15666 - unsigned long mfn,
15667 - unsigned long size,
15668 - pgprot_t prot,
15669 - domid_t domid)
15670 -{
15671 - int rc;
15672 - unsigned long i, start_address;
15673 - mmu_update_t *u, *v, *w;
15674 -
15675 - u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
15676 - if (u == NULL)
15677 - return -ENOMEM;
15678 -
15679 - start_address = address;
15680 -
15681 - flush_cache_all();
15682 -
15683 - for (i = 0; i < size; i += PAGE_SIZE) {
15684 - if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
15685 - /* Flush a full batch after filling in the PTE ptrs. */
15686 - rc = apply_to_page_range(mm, start_address,
15687 - address - start_address,
15688 - direct_remap_area_pte_fn, &w);
15689 - if (rc)
15690 - goto out;
15691 - rc = -EFAULT;
15692 - if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
15693 - goto out;
15694 - v = w = u;
15695 - start_address = address;
15696 - }
15697 -
15698 - /*
15699 - * Fill in the machine address: PTE ptr is done later by
15700 - * apply_to_page_range().
15701 - */
15702 - v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
15703 -
15704 - mfn++;
15705 - address += PAGE_SIZE;
15706 - v++;
15707 - }
15708 -
15709 - if (v != u) {
15710 - /* Final batch. */
15711 - rc = apply_to_page_range(mm, start_address,
15712 - address - start_address,
15713 - direct_remap_area_pte_fn, &w);
15714 - if (rc)
15715 - goto out;
15716 - rc = -EFAULT;
15717 - if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
15718 - goto out;
15719 - }
15720 -
15721 - rc = 0;
15722 -
15723 - out:
15724 - flush_tlb_all();
15725 -
15726 - free_page((unsigned long)u);
15727 -
15728 - return rc;
15729 -}
15730 -
15731 -int direct_remap_pfn_range(struct vm_area_struct *vma,
15732 - unsigned long address,
15733 - unsigned long mfn,
15734 - unsigned long size,
15735 - pgprot_t prot,
15736 - domid_t domid)
15737 -{
15738 - if (xen_feature(XENFEAT_auto_translated_physmap))
15739 - return remap_pfn_range(vma, address, mfn, size, prot);
15740 -
15741 - if (domid == DOMID_SELF)
15742 - return -EINVAL;
15743 -
15744 - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
15745 -
15746 - vma->vm_mm->context.has_foreign_mappings = 1;
15747 -
15748 - return __direct_remap_pfn_range(
15749 - vma->vm_mm, address, mfn, size, prot, domid);
15750 -}
15751 -EXPORT_SYMBOL(direct_remap_pfn_range);
15752 -
15753 -int direct_kernel_remap_pfn_range(unsigned long address,
15754 - unsigned long mfn,
15755 - unsigned long size,
15756 - pgprot_t prot,
15757 - domid_t domid)
15758 -{
15759 - return __direct_remap_pfn_range(
15760 - &init_mm, address, mfn, size, prot, domid);
15761 -}
15762 -EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
15763 -
15764 -static int lookup_pte_fn(
15765 - pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15766 -{
15767 - uint64_t *ptep = (uint64_t *)data;
15768 - if (ptep)
15769 - *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15770 - PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15771 - return 0;
15772 -}
15773 -
15774 -int create_lookup_pte_addr(struct mm_struct *mm,
15775 - unsigned long address,
15776 - uint64_t *ptep)
15777 -{
15778 - return apply_to_page_range(mm, address, PAGE_SIZE,
15779 - lookup_pte_fn, ptep);
15780 -}
15781 -
15782 -EXPORT_SYMBOL(create_lookup_pte_addr);
15783 -
15784 -static int noop_fn(
15785 - pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15786 -{
15787 - return 0;
15788 -}
15789 -
15790 -int touch_pte_range(struct mm_struct *mm,
15791 - unsigned long address,
15792 - unsigned long size)
15793 -{
15794 - return apply_to_page_range(mm, address, size, noop_fn, NULL);
15795 -}
15796 -
15797 -EXPORT_SYMBOL(touch_pte_range);
15798 -
15799 -/*
15800 - * Does @address reside within a non-highmem page that is local to this virtual
15801 - * machine (i.e., not an I/O page, nor a memory page belonging to another VM).
15802 - * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
15803 - * why this works.
15804 - */
15805 -static inline int is_local_lowmem(unsigned long address)
15806 -{
15807 - extern unsigned long max_low_pfn;
15808 - return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
15809 -}
15810 -
15811 -/*
15812 - * Generic mapping function (not visible outside):
15813 - */
15814 -
15815 -/*
15816 - * Remap an arbitrary physical address space into the kernel virtual
15817 - * address space. Needed when the kernel wants to access high addresses
15818 - * directly.
15819 - *
15820 - * NOTE! We need to allow non-page-aligned mappings too: we will obviously
15821 - * have to convert them into an offset in a page-aligned mapping, but the
15822 - * caller shouldn't need to know that small detail.
15823 - */
15824 -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
15825 -{
15826 - void __iomem * addr;
15827 - struct vm_struct * area;
15828 - unsigned long offset, last_addr;
15829 - pgprot_t prot;
15830 - domid_t domid = DOMID_IO;
15831 -
15832 - /* Don't allow wraparound or zero size */
15833 - last_addr = phys_addr + size - 1;
15834 - if (!size || last_addr < phys_addr)
15835 - return NULL;
15836 -
15837 - /*
15838 - * Don't remap the low PCI/ISA area, it's always mapped..
15839 - */
15840 - if (is_initial_xendomain() &&
15841 - phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
15842 - return (void __iomem *) isa_bus_to_virt(phys_addr);
15843 -
15844 - /*
15845 - * Don't allow anybody to remap normal RAM that we're using..
15846 - */
15847 - if (is_local_lowmem(phys_addr)) {
15848 - char *t_addr, *t_end;
15849 - struct page *page;
15850 -
15851 - t_addr = bus_to_virt(phys_addr);
15852 - t_end = t_addr + (size - 1);
15853 -
15854 - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
15855 - if(!PageReserved(page))
15856 - return NULL;
15857 -
15858 - domid = DOMID_SELF;
15859 - }
15860 -
15861 - prot = __pgprot(_KERNPG_TABLE | flags);
15862 -
15863 - /*
15864 - * Mappings have to be page-aligned
15865 - */
15866 - offset = phys_addr & ~PAGE_MASK;
15867 - phys_addr &= PAGE_MASK;
15868 - size = PAGE_ALIGN(last_addr+1) - phys_addr;
15869 -
15870 - /*
15871 - * Ok, go for it..
15872 - */
15873 - area = get_vm_area(size, VM_IOREMAP | (flags << 20));
15874 - if (!area)
15875 - return NULL;
15876 - area->phys_addr = phys_addr;
15877 - addr = (void __iomem *) area->addr;
15878 - if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
15879 - phys_addr>>PAGE_SHIFT,
15880 - size, prot, domid)) {
15881 - vunmap((void __force *) addr);
15882 - return NULL;
15883 - }
15884 - return (void __iomem *) (offset + (char __iomem *)addr);
15885 -}
15886 -EXPORT_SYMBOL(__ioremap);
15887 -
15888 -/**
15889 - * ioremap_nocache - map bus memory into CPU space
15890 - * @offset: bus address of the memory
15891 - * @size: size of the resource to map
15892 - *
15893 - * ioremap_nocache performs a platform specific sequence of operations to
15894 - * make bus memory CPU accessible via the readb/readw/readl/writeb/
15895 - * writew/writel functions and the other mmio helpers. The returned
15896 - * address is not guaranteed to be usable directly as a virtual
15897 - * address.
15898 - *
15899 - * This version of ioremap ensures that the memory is marked uncachable
15900 - * on the CPU as well as honouring existing caching rules from things like
15901 - * the PCI bus. Note that there are other caches and buffers on many
15902 - * busses. In particular driver authors should read up on PCI writes
15903 - *
15904 - * It's useful if some control registers are in such an area and
15905 - * write combining or read caching is not desirable:
15906 - *
15907 - * Must be freed with iounmap.
15908 - */
15909 -
15910 -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
15911 -{
15912 - unsigned long last_addr;
15913 - void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
15914 - if (!p)
15915 - return p;
15916 -
15917 - /* Guaranteed to be > phys_addr, as per __ioremap() */
15918 - last_addr = phys_addr + size - 1;
15919 -
15920 - if (is_local_lowmem(last_addr)) {
15921 - struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
15922 - unsigned long npages;
15923 -
15924 - phys_addr &= PAGE_MASK;
15925 -
15926 - /* This might overflow and become zero.. */
15927 - last_addr = PAGE_ALIGN(last_addr);
15928 -
15929 - /* .. but that's ok, because modulo-2**n arithmetic will make
15930 - * the page-aligned "last - first" come out right.
15931 - */
15932 - npages = (last_addr - phys_addr) >> PAGE_SHIFT;
15933 -
15934 - if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
15935 - iounmap(p);
15936 - p = NULL;
15937 - }
15938 - global_flush_tlb();
15939 - }
15940 -
15941 - return p;
15942 -}
15943 -EXPORT_SYMBOL(ioremap_nocache);
15944 -
15945 -/**
15946 - * iounmap - Free a IO remapping
15947 - * @addr: virtual address from ioremap_*
15948 - *
15949 - * Caller must ensure there is only one unmapping for the same pointer.
15950 - */
15951 -void iounmap(volatile void __iomem *addr)
15952 -{
15953 - struct vm_struct *p, *o;
15954 -
15955 - if ((void __force *)addr <= high_memory)
15956 - return;
15957 -
15958 - /*
15959 - * __ioremap special-cases the PCI/ISA range by not instantiating a
15960 - * vm_area and by simply returning an address into the kernel mapping
15961 - * of ISA space. So handle that here.
15962 - */
15963 - if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
15964 - return;
15965 -
15966 - addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
15967 -
15968 - /* Use the vm area unlocked, assuming the caller
15969 - ensures there isn't another iounmap for the same address
15970 - in parallel. Reuse of the virtual address is prevented by
15971 - leaving it in the global lists until we're done with it.
15972 - cpa takes care of the direct mappings. */
15973 - read_lock(&vmlist_lock);
15974 - for (p = vmlist; p; p = p->next) {
15975 - if (p->addr == addr)
15976 - break;
15977 - }
15978 - read_unlock(&vmlist_lock);
15979 -
15980 - if (!p) {
15981 - printk("iounmap: bad address %p\n", addr);
15982 - dump_stack();
15983 - return;
15984 - }
15985 -
15986 - /* Reset the direct mapping. Can block */
15987 - if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
15988 - change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
15989 - get_vm_area_size(p) >> PAGE_SHIFT,
15990 - PAGE_KERNEL);
15991 - global_flush_tlb();
15992 - }
15993 -
15994 - /* Finally remove it */
15995 - o = remove_vm_area((void *)addr);
15996 - BUG_ON(p != o || o == NULL);
15997 - kfree(p);
15998 -}
15999 -EXPORT_SYMBOL(iounmap);
16000 -
16001 -void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
16002 -{
16003 - unsigned long offset, last_addr;
16004 - unsigned int nrpages;
16005 - enum fixed_addresses idx;
16006 -
16007 - /* Don't allow wraparound or zero size */
16008 - last_addr = phys_addr + size - 1;
16009 - if (!size || last_addr < phys_addr)
16010 - return NULL;
16011 -
16012 - /*
16013 - * Don't remap the low PCI/ISA area, it's always mapped..
16014 - */
16015 - if (is_initial_xendomain() &&
16016 - phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
16017 - return isa_bus_to_virt(phys_addr);
16018 -
16019 - /*
16020 - * Mappings have to be page-aligned
16021 - */
16022 - offset = phys_addr & ~PAGE_MASK;
16023 - phys_addr &= PAGE_MASK;
16024 - size = PAGE_ALIGN(last_addr) - phys_addr;
16025 -
16026 - /*
16027 - * Mappings have to fit in the FIX_BTMAP area.
16028 - */
16029 - nrpages = size >> PAGE_SHIFT;
16030 - if (nrpages > NR_FIX_BTMAPS)
16031 - return NULL;
16032 -
16033 - /*
16034 - * Ok, go for it..
16035 - */
16036 - idx = FIX_BTMAP_BEGIN;
16037 - while (nrpages > 0) {
16038 - set_fixmap(idx, phys_addr);
16039 - phys_addr += PAGE_SIZE;
16040 - --idx;
16041 - --nrpages;
16042 - }
16043 - return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
16044 -}
16045 -
16046 -void __init bt_iounmap(void *addr, unsigned long size)
16047 -{
16048 - unsigned long virt_addr;
16049 - unsigned long offset;
16050 - unsigned int nrpages;
16051 - enum fixed_addresses idx;
16052 -
16053 - virt_addr = (unsigned long)addr;
16054 - if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
16055 - return;
16056 - if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
16057 - return;
16058 - offset = virt_addr & ~PAGE_MASK;
16059 - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
16060 -
16061 - idx = FIX_BTMAP_BEGIN;
16062 - while (nrpages > 0) {
16063 - clear_fixmap(idx);
16064 - --idx;
16065 - --nrpages;
16066 - }
16067 -}
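The __ioremap()/ioremap_nocache()/iounmap() trio removed above is the standard interface drivers use to obtain and release an uncached MMIO mapping. A minimal usage sketch follows; the register window, its size, and the function name are made up for illustration and are not part of this patch:

	#include <linux/io.h>

	#define DEMO_MMIO_PHYS	0xfebf0000UL	/* hypothetical bus address */
	#define DEMO_MMIO_SIZE	0x1000UL

	static int demo_map_registers(void)
	{
		void __iomem *regs = ioremap_nocache(DEMO_MMIO_PHYS, DEMO_MMIO_SIZE);

		if (!regs)
			return -ENOMEM;

		writel(0x1, regs + 0x00);	/* uncached MMIO write */
		(void)readl(regs + 0x04);	/* uncached MMIO read */

		iounmap(regs);
		return 0;
	}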
16068 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
16069 +++ sle11-2009-05-14/arch/x86/mm/pageattr-xen.c 2009-03-16 16:37:14.000000000 +0100
16070 @@ -0,0 +1,1413 @@
16071 +/*
16072 + * Copyright 2002 Andi Kleen, SuSE Labs.
16073 + * Thanks to Ben LaHaise for precious feedback.
16074 + */
16075 +#include <linux/highmem.h>
16076 +#include <linux/bootmem.h>
16077 +#include <linux/module.h>
16078 +#include <linux/sched.h>
16079 +#include <linux/slab.h>
16080 +#include <linux/mm.h>
16081 +#include <linux/interrupt.h>
16082 +
16083 +#include <asm/e820.h>
16084 +#include <asm/processor.h>
16085 +#include <asm/tlbflush.h>
16086 +#include <asm/sections.h>
16087 +#include <asm/uaccess.h>
16088 +#include <asm/pgalloc.h>
16089 +#include <asm/proto.h>
16090 +#include <asm/mmu_context.h>
16091 +
16092 +#ifndef CONFIG_X86_64
16093 +#define TASK_SIZE64 TASK_SIZE
16094 +#endif
16095 +
16096 +static void _pin_lock(struct mm_struct *mm, int lock) {
16097 + if (lock)
16098 + spin_lock(&mm->page_table_lock);
16099 +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
16100 + /* While mm->page_table_lock protects us against insertions and
16101 + * removals of higher level page table pages, it doesn't protect
16102 + * against updates of pte-s. Such updates, however, require the
16103 + * pte pages to be in consistent state (unpinned+writable or
16104 + * pinned+readonly). The pinning and attribute changes, however
16105 + * cannot be done atomically, which is why such updates must be
16106 + * prevented from happening concurrently.
16107 + * Note that no pte lock can ever elsewhere be acquired nesting
16108 + * with an already acquired one in the same mm, or with the mm's
16109 + * page_table_lock already acquired, as that would break in the
16110 + * non-split case (where all these are actually resolving to the
16111 + * one page_table_lock). Thus acquiring all of them here is not
16112 + * going to result in dead locks, and the order of acquires
16113 + * doesn't matter.
16114 + */
16115 + {
16116 + pgd_t *pgd = mm->pgd;
16117 + unsigned g;
16118 +
16119 + for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
16120 + pud_t *pud;
16121 + unsigned u;
16122 +
16123 + if (pgd_none(*pgd))
16124 + continue;
16125 + pud = pud_offset(pgd, 0);
16126 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16127 + pmd_t *pmd;
16128 + unsigned m;
16129 +
16130 + if (pud_none(*pud))
16131 + continue;
16132 + pmd = pmd_offset(pud, 0);
16133 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16134 + spinlock_t *ptl;
16135 +
16136 + if (pmd_none(*pmd))
16137 + continue;
16138 + ptl = pte_lockptr(0, pmd);
16139 + if (lock)
16140 + spin_lock(ptl);
16141 + else
16142 + spin_unlock(ptl);
16143 + }
16144 + }
16145 + }
16146 + }
16147 +#endif
16148 + if (!lock)
16149 + spin_unlock(&mm->page_table_lock);
16150 +}
16151 +#define pin_lock(mm) _pin_lock(mm, 1)
16152 +#define pin_unlock(mm) _pin_lock(mm, 0)
16153 +
16154 +#define PIN_BATCH sizeof(void *)
16155 +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
16156 +
16157 +static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
16158 + unsigned int cpu, unsigned int seq)
16159 +{
16160 + unsigned long pfn = page_to_pfn(page);
16161 +
16162 + if (PageHighMem(page)) {
16163 + if (pgprot_val(flags) & _PAGE_RW)
16164 + ClearPagePinned(page);
16165 + else
16166 + SetPagePinned(page);
16167 + } else {
16168 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
16169 + (unsigned long)__va(pfn << PAGE_SHIFT),
16170 + pfn_pte(pfn, flags), 0);
16171 + if (unlikely(++seq == PIN_BATCH)) {
16172 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
16173 + PIN_BATCH, NULL)))
16174 + BUG();
16175 + seq = 0;
16176 + }
16177 + }
16178 +
16179 + return seq;
16180 +}
16181 +
16182 +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
16183 +{
16184 + pgd_t *pgd = pgd_base;
16185 + pud_t *pud;
16186 + pmd_t *pmd;
16187 + int g,u,m;
16188 + unsigned int cpu, seq;
16189 + multicall_entry_t *mcl;
16190 +
16191 + if (xen_feature(XENFEAT_auto_translated_physmap))
16192 + return;
16193 +
16194 + cpu = get_cpu();
16195 +
16196 + /*
16197 + * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
16198 + * may not be the 'current' task's pagetables (e.g., current may be
16199 + * 32-bit, but the pagetables may be for a 64-bit task).
16200 + * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
16201 + * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
16202 + */
16203 + for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
16204 + if (pgd_none(*pgd))
16205 + continue;
16206 + pud = pud_offset(pgd, 0);
16207 + if (PTRS_PER_PUD > 1) /* not folded */
16208 + seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
16209 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16210 + if (pud_none(*pud))
16211 + continue;
16212 + pmd = pmd_offset(pud, 0);
16213 + if (PTRS_PER_PMD > 1) /* not folded */
16214 + seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
16215 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16216 + if (pmd_none(*pmd))
16217 + continue;
16218 + seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
16219 + }
16220 + }
16221 + }
16222 +
16223 + mcl = per_cpu(pb_mcl, cpu);
16224 +#ifdef CONFIG_X86_64
16225 + if (unlikely(seq > PIN_BATCH - 2)) {
16226 + if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
16227 + BUG();
16228 + seq = 0;
16229 + }
16230 + MULTI_update_va_mapping(mcl + seq,
16231 + (unsigned long)__user_pgd(pgd_base),
16232 + pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
16233 + 0);
16234 + MULTI_update_va_mapping(mcl + seq + 1,
16235 + (unsigned long)pgd_base,
16236 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16237 + UVMF_TLB_FLUSH);
16238 + if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
16239 + BUG();
16240 +#else
16241 + if (likely(seq != 0)) {
16242 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
16243 + (unsigned long)pgd_base,
16244 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16245 + UVMF_TLB_FLUSH);
16246 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
16247 + seq + 1, NULL)))
16248 + BUG();
16249 + } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
16250 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16251 + UVMF_TLB_FLUSH))
16252 + BUG();
16253 +#endif
16254 +
16255 + put_cpu();
16256 +}
16257 +
16258 +static void __pgd_pin(pgd_t *pgd)
16259 +{
16260 + pgd_walk(pgd, PAGE_KERNEL_RO);
16261 + kmap_flush_unused();
16262 + xen_pgd_pin(__pa(pgd)); /* kernel */
16263 +#ifdef CONFIG_X86_64
16264 + xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
16265 +#endif
16266 + SetPagePinned(virt_to_page(pgd));
16267 +}
16268 +
16269 +static void __pgd_unpin(pgd_t *pgd)
16270 +{
16271 + xen_pgd_unpin(__pa(pgd));
16272 +#ifdef CONFIG_X86_64
16273 + xen_pgd_unpin(__pa(__user_pgd(pgd)));
16274 +#endif
16275 + pgd_walk(pgd, PAGE_KERNEL);
16276 + ClearPagePinned(virt_to_page(pgd));
16277 +}
16278 +
16279 +void pgd_test_and_unpin(pgd_t *pgd)
16280 +{
16281 + if (PagePinned(virt_to_page(pgd)))
16282 + __pgd_unpin(pgd);
16283 +}
16284 +
16285 +void mm_pin(struct mm_struct *mm)
16286 +{
16287 + if (xen_feature(XENFEAT_writable_page_tables))
16288 + return;
16289 +
16290 + pin_lock(mm);
16291 + __pgd_pin(mm->pgd);
16292 + pin_unlock(mm);
16293 +}
16294 +
16295 +void mm_unpin(struct mm_struct *mm)
16296 +{
16297 + if (xen_feature(XENFEAT_writable_page_tables))
16298 + return;
16299 +
16300 + pin_lock(mm);
16301 + __pgd_unpin(mm->pgd);
16302 + pin_unlock(mm);
16303 +}
16304 +
16305 +void mm_pin_all(void)
16306 +{
16307 + struct page *page;
16308 + unsigned long flags;
16309 +
16310 + if (xen_feature(XENFEAT_writable_page_tables))
16311 + return;
16312 +
16313 + /*
16314 + * Allow uninterrupted access to the pgd_list. Also protects
16315 + * __pgd_pin() by disabling preemption.
16316 + * All other CPUs must be at a safe point (e.g., in stop_machine
16317 + * or offlined entirely).
16318 + */
16319 + spin_lock_irqsave(&pgd_lock, flags);
16320 + list_for_each_entry(page, &pgd_list, lru) {
16321 + if (!PagePinned(page))
16322 + __pgd_pin((pgd_t *)page_address(page));
16323 + }
16324 + spin_unlock_irqrestore(&pgd_lock, flags);
16325 +}
16326 +
16327 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
16328 +{
16329 + if (!PagePinned(virt_to_page(mm->pgd)))
16330 + mm_pin(mm);
16331 +}
16332 +
16333 +void arch_exit_mmap(struct mm_struct *mm)
16334 +{
16335 + struct task_struct *tsk = current;
16336 +
16337 + task_lock(tsk);
16338 +
16339 + /*
16340 + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
16341 + * *much* faster this way, as no tlb flushes means bigger wrpt batches.
16342 + */
16343 + if (tsk->active_mm == mm) {
16344 + tsk->active_mm = &init_mm;
16345 + atomic_inc(&init_mm.mm_count);
16346 +
16347 + switch_mm(mm, &init_mm, tsk);
16348 +
16349 + atomic_dec(&mm->mm_count);
16350 + BUG_ON(atomic_read(&mm->mm_count) == 0);
16351 + }
16352 +
16353 + task_unlock(tsk);
16354 +
16355 + if (PagePinned(virt_to_page(mm->pgd))
16356 + && atomic_read(&mm->mm_count) == 1
16357 + && !mm->context.has_foreign_mappings)
16358 + mm_unpin(mm);
16359 +}
16360 +
16361 +static void _pte_free(struct page *page, unsigned int order)
16362 +{
16363 + BUG_ON(order);
16364 + __pte_free(page);
16365 +}
16366 +
16367 +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
16368 +{
16369 + struct page *pte;
16370 +
16371 +#ifdef CONFIG_HIGHPTE
16372 + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
16373 +#else
16374 + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
16375 +#endif
16376 + if (pte) {
16377 + pgtable_page_ctor(pte);
16378 + SetPageForeign(pte, _pte_free);
16379 + init_page_count(pte);
16380 + }
16381 + return pte;
16382 +}
16383 +
16384 +void __pte_free(pgtable_t pte)
16385 +{
16386 + if (!PageHighMem(pte)) {
16387 + unsigned long va = (unsigned long)page_address(pte);
16388 + unsigned int level;
16389 + pte_t *ptep = lookup_address(va, &level);
16390 +
16391 + BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
16392 + if (!pte_write(*ptep)
16393 + && HYPERVISOR_update_va_mapping(va,
16394 + mk_pte(pte, PAGE_KERNEL),
16395 + 0))
16396 + BUG();
16397 + } else
16398 +#ifdef CONFIG_HIGHPTE
16399 + ClearPagePinned(pte);
16400 +#else
16401 + BUG();
16402 +#endif
16403 +
16404 + ClearPageForeign(pte);
16405 + init_page_count(pte);
16406 + pgtable_page_dtor(pte);
16407 + __free_page(pte);
16408 +}
16409 +
16410 +#if PAGETABLE_LEVELS >= 3
16411 +static void _pmd_free(struct page *page, unsigned int order)
16412 +{
16413 + BUG_ON(order);
16414 + __pmd_free(page);
16415 +}
16416 +
16417 +pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
16418 +{
16419 + struct page *pmd;
16420 +
16421 + pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
16422 + if (!pmd)
16423 + return NULL;
16424 + SetPageForeign(pmd, _pmd_free);
16425 + init_page_count(pmd);
16426 + return page_address(pmd);
16427 +}
16428 +
16429 +void __pmd_free(pgtable_t pmd)
16430 +{
16431 + unsigned long va = (unsigned long)page_address(pmd);
16432 + unsigned int level;
16433 + pte_t *ptep = lookup_address(va, &level);
16434 +
16435 + BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
16436 + if (!pte_write(*ptep)
16437 + && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
16438 + BUG();
16439 +
16440 + ClearPageForeign(pmd);
16441 + init_page_count(pmd);
16442 + __free_page(pmd);
16443 +}
16444 +#endif
16445 +
16446 +/* blktap and gntdev need this, as otherwise they would implicitly (and
16447 + * needlessly, as they never use it) reference init_mm. */
16448 +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
16449 + unsigned long addr, pte_t *ptep, int full)
16450 +{
16451 + return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
16452 +}
16453 +EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
16454 +
16455 +/*
16456 + * The current flushing context - we pass it instead of 5 arguments:
16457 + */
16458 +struct cpa_data {
16459 + unsigned long vaddr;
16460 + pgprot_t mask_set;
16461 + pgprot_t mask_clr;
16462 + int numpages;
16463 + int flushtlb;
16464 + unsigned long pfn;
16465 +};
16466 +
16467 +#ifdef CONFIG_X86_64
16468 +
16469 +static inline unsigned long highmap_start_pfn(void)
16470 +{
16471 + return __pa(_text) >> PAGE_SHIFT;
16472 +}
16473 +
16474 +static inline unsigned long highmap_end_pfn(void)
16475 +{
16476 + return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
16477 +}
16478 +
16479 +#endif
16480 +
16481 +#ifdef CONFIG_DEBUG_PAGEALLOC
16482 +# define debug_pagealloc 1
16483 +#else
16484 +# define debug_pagealloc 0
16485 +#endif
16486 +
16487 +static inline int
16488 +within(unsigned long addr, unsigned long start, unsigned long end)
16489 +{
16490 + return addr >= start && addr < end;
16491 +}
16492 +
16493 +/*
16494 + * Flushing functions
16495 + */
16496 +
16497 +/**
16498 + * clflush_cache_range - flush a cache range with clflush
16499 + * @addr: virtual start address
16500 + * @size: number of bytes to flush
16501 + *
16502 + * clflush is an unordered instruction which needs fencing with mfence
16503 + * to avoid ordering issues.
16504 + */
16505 +void clflush_cache_range(void *vaddr, unsigned int size)
16506 +{
16507 + void *vend = vaddr + size - 1;
16508 +
16509 + mb();
16510 +
16511 + for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
16512 + clflush(vaddr);
16513 + /*
16514 + * Flush any possible final partial cacheline:
16515 + */
16516 + clflush(vend);
16517 +
16518 + mb();
16519 +}
16520 +
16521 +static void __cpa_flush_all(void *arg)
16522 +{
16523 + unsigned long cache = (unsigned long)arg;
16524 +
16525 + /*
16526 + * Flush all to work around Errata in early athlons regarding
16527 + * large page flushing.
16528 + */
16529 + __flush_tlb_all();
16530 +
16531 + if (cache && boot_cpu_data.x86_model >= 4)
16532 + wbinvd();
16533 +}
16534 +
16535 +static void cpa_flush_all(unsigned long cache)
16536 +{
16537 + BUG_ON(irqs_disabled());
16538 +
16539 + on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
16540 +}
16541 +
16542 +static void __cpa_flush_range(void *arg)
16543 +{
16544 + /*
16545 + * We could optimize that further and do individual per page
16546 + * tlb invalidates for a low number of pages. Caveat: we must
16547 + * flush the high aliases on 64bit as well.
16548 + */
16549 + __flush_tlb_all();
16550 +}
16551 +
16552 +static void cpa_flush_range(unsigned long start, int numpages, int cache)
16553 +{
16554 + unsigned int i, level;
16555 + unsigned long addr;
16556 +
16557 + BUG_ON(irqs_disabled());
16558 + WARN_ON(PAGE_ALIGN(start) != start);
16559 +
16560 + on_each_cpu(__cpa_flush_range, NULL, 1, 1);
16561 +
16562 + if (!cache)
16563 + return;
16564 +
16565 + /*
16566 + * We only need to flush on one CPU,
16567 + * clflush is a MESI-coherent instruction that
16568 + * will cause all other CPUs to flush the same
16569 + * cachelines:
16570 + */
16571 + for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
16572 + pte_t *pte = lookup_address(addr, &level);
16573 +
16574 + /*
16575 + * Only flush present addresses:
16576 + */
16577 + if (pte && (__pte_val(*pte) & _PAGE_PRESENT))
16578 + clflush_cache_range((void *) addr, PAGE_SIZE);
16579 + }
16580 +}
16581 +
16582 +/*
16583 + * Certain areas of memory on x86 require very specific protection flags,
16584 + * for example the BIOS area or kernel text. Callers don't always get this
16585 + * right (again, ioremap() on BIOS memory is not uncommon) so this function
16586 + * checks and fixes these known static required protection bits.
16587 + */
16588 +static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
16589 + unsigned long pfn)
16590 +{
16591 + pgprot_t forbidden = __pgprot(0);
16592 +
16593 +#ifndef CONFIG_XEN
16594 + /*
16595 + * The BIOS area between 640k and 1Mb needs to be executable for
16596 + * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
16597 + */
16598 + if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
16599 + pgprot_val(forbidden) |= _PAGE_NX;
16600 +#endif
16601 +
16602 + /*
16603 + * The kernel text needs to be executable for obvious reasons.
16604 + * Does not cover __inittext since that is gone later on. On
16605 + * 64bit we do not enforce !NX on the low mapping.
16606 + */
16607 + if (within(address, (unsigned long)_text, (unsigned long)_etext))
16608 + pgprot_val(forbidden) |= _PAGE_NX;
16609 +
16610 + /*
16611 + * The .rodata section needs to be read-only. Using the pfn
16612 + * catches all aliases.
16613 + */
16614 + if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
16615 + __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
16616 + pgprot_val(forbidden) |= _PAGE_RW;
16617 +
16618 + prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
16619 +
16620 + return prot;
16621 +}
16622 +
16623 +/*
16624 + * Lookup the page table entry for a virtual address. Return a pointer
16625 + * to the entry and the level of the mapping.
16626 + *
16627 + * Note: We return pud and pmd either when the entry is marked large
16628 + * or when the present bit is not set. Otherwise we would return a
16629 + * pointer to a nonexistent mapping.
16630 + */
16631 +pte_t *lookup_address(unsigned long address, unsigned int *level)
16632 +{
16633 + pgd_t *pgd = pgd_offset_k(address);
16634 + pud_t *pud;
16635 + pmd_t *pmd;
16636 +
16637 + *level = PG_LEVEL_NONE;
16638 +
16639 + if (pgd_none(*pgd))
16640 + return NULL;
16641 +
16642 + pud = pud_offset(pgd, address);
16643 + if (pud_none(*pud))
16644 + return NULL;
16645 +
16646 + *level = PG_LEVEL_1G;
16647 + if (pud_large(*pud) || !pud_present(*pud))
16648 + return (pte_t *)pud;
16649 +
16650 + pmd = pmd_offset(pud, address);
16651 + if (pmd_none(*pmd))
16652 + return NULL;
16653 +
16654 + *level = PG_LEVEL_2M;
16655 + if (pmd_large(*pmd) || !pmd_present(*pmd))
16656 + return (pte_t *)pmd;
16657 +
16658 + *level = PG_LEVEL_4K;
16659 +
16660 + return pte_offset_kernel(pmd, address);
16661 +}
16662 +
16663 +/*
16664 + * Set the new pmd in all the pgds we know about:
16665 + */
16666 +static void __set_pmd_pte(pte_t *kpte, unsigned long address,
16667 + unsigned int level, pte_t pte)
16668 +{
16669 + /* change init_mm */
16670 + switch(level) {
16671 + case PG_LEVEL_2M:
16672 + xen_l2_entry_update((pmd_t *)kpte, __pmd_ma(__pte_val(pte)));
16673 + break;
16674 +#ifdef CONFIG_X86_64
16675 + case PG_LEVEL_1G:
16676 + xen_l3_entry_update((pud_t *)kpte, __pud_ma(__pte_val(pte)));
16677 + break;
16678 +#endif
16679 + default:
16680 + BUG();
16681 + }
16682 +#ifdef CONFIG_X86_32
16683 + if (!SHARED_KERNEL_PMD) {
16684 + struct page *page;
16685 +
16686 + list_for_each_entry(page, &pgd_list, lru) {
16687 + pgd_t *pgd;
16688 + pud_t *pud;
16689 + pmd_t *pmd;
16690 +
16691 + pgd = (pgd_t *)page_address(page) + pgd_index(address);
16692 + pud = pud_offset(pgd, address);
16693 + pmd = pmd_offset(pud, address);
16694 + xen_l2_entry_update(pmd, __pmd_ma(__pte_val(pte)));
16695 + }
16696 + }
16697 +#endif
16698 +}
16699 +
16700 +static int
16701 +try_preserve_large_page(pte_t *kpte, unsigned long address,
16702 + struct cpa_data *cpa)
16703 +{
16704 + unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
16705 + pte_t new_pte, old_pte, *tmp;
16706 + pgprot_t old_prot, new_prot;
16707 + int i, do_split = 1;
16708 + unsigned int level;
16709 +
16710 + spin_lock_irqsave(&pgd_lock, flags);
16711 + /*
16712 + * Check for races, another CPU might have split this page
16713 + * up already:
16714 + */
16715 + tmp = lookup_address(address, &level);
16716 + if (tmp != kpte)
16717 + goto out_unlock;
16718 +
16719 + switch (level) {
16720 + case PG_LEVEL_2M:
16721 + psize = PMD_PAGE_SIZE;
16722 + pmask = PMD_PAGE_MASK;
16723 + break;
16724 +#ifdef CONFIG_X86_64
16725 + case PG_LEVEL_1G:
16726 + psize = PUD_PAGE_SIZE;
16727 + pmask = PUD_PAGE_MASK;
16728 + break;
16729 +#endif
16730 + default:
16731 + do_split = -EINVAL;
16732 + goto out_unlock;
16733 + }
16734 +
16735 + /*
16736 + * Calculate the number of pages, which fit into this large
16737 + * page starting at address:
16738 + */
16739 + nextpage_addr = (address + psize) & pmask;
16740 + numpages = (nextpage_addr - address) >> PAGE_SHIFT;
16741 + if (numpages < cpa->numpages)
16742 + cpa->numpages = numpages;
16743 +
16744 + /*
16745 + * We are safe now. Check whether the new pgprot is the same:
16746 + */
16747 + old_pte = *kpte;
16748 + old_prot = new_prot = pte_pgprot(old_pte);
16749 +
16750 + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
16751 + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
16752 +
16753 + /*
16754 + * old_pte points to the large page base address. So we need
16755 + * to add the offset of the virtual address:
16756 + */
16757 + pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
16758 + cpa->pfn = pfn;
16759 +
16760 + new_prot = static_protections(new_prot, address, pfn);
16761 +
16762 + /*
16763 + * We need to check the full range, whether
16764 + * static_protection() requires a different pgprot for one of
16765 + * the pages in the range we try to preserve:
16766 + */
16767 + if (pfn < max_mapnr) {
16768 + addr = address + PAGE_SIZE;
16769 + for (i = 1; i < cpa->numpages && ++pfn < max_mapnr;
16770 + i++, addr += PAGE_SIZE) {
16771 + pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
16772 +
16773 + if (pgprot_val(chk_prot) != pgprot_val(new_prot))
16774 + goto out_unlock;
16775 + }
16776 + }
16777 +
16778 + /*
16779 + * If there are no changes, return. maxpages has been updated
16780 + * above:
16781 + */
16782 + if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
16783 + do_split = 0;
16784 + goto out_unlock;
16785 + }
16786 +
16787 + /*
16788 + * We need to change the attributes. Check, whether we can
16789 + * change the large page in one go. We request a split, when
16790 + * the address is not aligned and the number of pages is
16791 + * smaller than the number of pages in the large page. Note
16792 + * that we limited the number of possible pages already to
16793 + * the number of pages in the large page.
16794 + */
16795 + if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
16796 + /*
16797 + * The address is aligned and the number of pages
16798 + * covers the full page.
16799 + */
16800 + new_pte = pfn_pte_ma(__pte_mfn(old_pte), canon_pgprot(new_prot));
16801 + __set_pmd_pte(kpte, address, level, new_pte);
16802 + cpa->flushtlb = 1;
16803 + do_split = 0;
16804 + }
16805 +
16806 +out_unlock:
16807 + spin_unlock_irqrestore(&pgd_lock, flags);
16808 +
16809 + return do_split;
16810 +}
16811 +
16812 +static LIST_HEAD(page_pool);
16813 +static unsigned long pool_size, pool_pages, pool_low;
16814 +static unsigned long pool_used, pool_failed;
16815 +
16816 +static void cpa_fill_pool(struct page **ret)
16817 +{
16818 + gfp_t gfp = GFP_KERNEL;
16819 + unsigned long flags;
16820 + struct page *p;
16821 +
16822 + /*
16823 + * Avoid recursion (on debug-pagealloc) and also signal
16824 + * our priority to get to these pagetables:
16825 + */
16826 + if (current->flags & PF_MEMALLOC)
16827 + return;
16828 + current->flags |= PF_MEMALLOC;
16829 +
16830 + /*
16831 + * Allocate atomically from atomic contexts:
16832 + */
16833 + if (in_atomic() || irqs_disabled() || debug_pagealloc)
16834 + gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
16835 +
16836 + while (pool_pages < pool_size || (ret && !*ret)) {
16837 + p = alloc_pages(gfp, 0);
16838 + if (!p) {
16839 + pool_failed++;
16840 + break;
16841 + }
16842 + /*
16843 + * If the call site needs a page right now, provide it:
16844 + */
16845 + if (ret && !*ret) {
16846 + *ret = p;
16847 + continue;
16848 + }
16849 + spin_lock_irqsave(&pgd_lock, flags);
16850 + list_add(&p->lru, &page_pool);
16851 + pool_pages++;
16852 + spin_unlock_irqrestore(&pgd_lock, flags);
16853 + }
16854 +
16855 + current->flags &= ~PF_MEMALLOC;
16856 +}
16857 +
16858 +#define SHIFT_MB (20 - PAGE_SHIFT)
16859 +#define ROUND_MB_GB ((1 << 10) - 1)
16860 +#define SHIFT_MB_GB 10
16861 +#define POOL_PAGES_PER_GB 16
16862 +
16863 +void __init cpa_init(void)
16864 +{
16865 + struct sysinfo si;
16866 + unsigned long gb;
16867 +
16868 + si_meminfo(&si);
16869 + /*
16870 + * Calculate the number of pool pages:
16871 + *
16872 + * Convert totalram (nr of pages) to MiB and round to the next
16873 + * GiB. Shift MiB to GiB and multiply the result by
16874 + * POOL_PAGES_PER_GB:
16875 + */
16876 + if (debug_pagealloc) {
16877 + gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
16878 + pool_size = POOL_PAGES_PER_GB * gb;
16879 + } else {
16880 + pool_size = 1;
16881 + }
16882 + pool_low = pool_size;
16883 +
16884 + cpa_fill_pool(NULL);
16885 + printk(KERN_DEBUG
16886 + "CPA: page pool initialized %lu of %lu pages preallocated\n",
16887 + pool_pages, pool_size);
16888 +}
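For a concrete (arbitrary) example: assuming 4 KiB pages, SHIFT_MB is 20 - 12 = 8. A 2 GiB machine has si.totalram of roughly 524288 pages, so (524288 >> 8) = 2048 MiB, (2048 + 1023) >> 10 rounds up to 2 GiB, and pool_size = POOL_PAGES_PER_GB * 2 = 32 pages when debug_pagealloc is set. Without CONFIG_DEBUG_PAGEALLOC the pool is deliberately kept at a single page.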
16889 +
16890 +static int split_large_page(pte_t *kpte, unsigned long address)
16891 +{
16892 + unsigned long flags, mfn, mfninc = 1;
16893 + unsigned int i, level;
16894 + pte_t *pbase, *tmp;
16895 + pgprot_t ref_prot;
16896 + struct page *base;
16897 +
16898 + /*
16899 + * Get a page from the pool. The pool list is protected by the
16900 + * pgd_lock, which we have to take anyway for the split
16901 + * operation:
16902 + */
16903 + spin_lock_irqsave(&pgd_lock, flags);
16904 + if (list_empty(&page_pool)) {
16905 + spin_unlock_irqrestore(&pgd_lock, flags);
16906 + base = NULL;
16907 + cpa_fill_pool(&base);
16908 + if (!base)
16909 + return -ENOMEM;
16910 + spin_lock_irqsave(&pgd_lock, flags);
16911 + } else {
16912 + base = list_first_entry(&page_pool, struct page, lru);
16913 + list_del(&base->lru);
16914 + pool_pages--;
16915 +
16916 + if (pool_pages < pool_low)
16917 + pool_low = pool_pages;
16918 + }
16919 +
16920 + /*
16921 + * Check for races, another CPU might have split this page
16922 + * up for us already:
16923 + */
16924 + tmp = lookup_address(address, &level);
16925 + if (tmp != kpte)
16926 + goto out_unlock;
16927 +
16928 + pbase = (pte_t *)page_address(base);
16929 +#ifdef CONFIG_X86_32
16930 + paravirt_alloc_pt(&init_mm, page_to_pfn(base));
16931 +#endif
16932 + ref_prot = pte_pgprot(pte_clrhuge(*kpte));
16933 +
16934 +#ifdef CONFIG_X86_64
16935 + if (level == PG_LEVEL_1G) {
16936 + mfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
16937 + pgprot_val(ref_prot) |= _PAGE_PSE;
16938 + }
16939 +#endif
16940 +
16941 + /*
16942 + * Get the target mfn from the original entry:
16943 + */
16944 + mfn = __pte_mfn(*kpte);
16945 + for (i = 0; i < PTRS_PER_PTE; i++, mfn += mfninc)
16946 + set_pte(&pbase[i], pfn_pte_ma(mfn, ref_prot));
16947 +
16948 + /*
16949 + * Install the new, split up pagetable. Important details here:
16950 + *
16951 + * On Intel the NX bit of all levels must be cleared to make a
16952 + * page executable. See section 4.13.2 of Intel 64 and IA-32
16953 + * Architectures Software Developer's Manual.
16954 + *
16955 + * Mark the entry present. The current mapping might be
16956 + * set to not present, which we preserved above.
16957 + */
16958 + if (!xen_feature(XENFEAT_writable_page_tables) &&
16959 + HYPERVISOR_update_va_mapping((unsigned long)pbase,
16960 + mk_pte(base, PAGE_KERNEL_RO), 0))
16961 + BUG();
16962 + ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
16963 + pgprot_val(ref_prot) |= _PAGE_PRESENT;
16964 + __set_pmd_pte(kpte, address, level, mk_pte(base, ref_prot));
16965 + base = NULL;
16966 +
16967 +out_unlock:
16968 + /*
16969 + * If we dropped out via the lookup_address check under
16970 + * pgd_lock then stick the page back into the pool:
16971 + */
16972 + if (base) {
16973 + list_add(&base->lru, &page_pool);
16974 + pool_pages++;
16975 + } else
16976 + pool_used++;
16977 + spin_unlock_irqrestore(&pgd_lock, flags);
16978 +
16979 + return 0;
16980 +}
16981 +
16982 +static int __change_page_attr(struct cpa_data *cpa, int primary)
16983 +{
16984 + unsigned long address = cpa->vaddr;
16985 + int do_split, err;
16986 + unsigned int level;
16987 + pte_t *kpte, old_pte;
16988 +
16989 +repeat:
16990 + kpte = lookup_address(address, &level);
16991 + if (!kpte)
16992 + return primary ? -EINVAL : 0;
16993 +
16994 + old_pte = *kpte;
16995 + if (!__pte_val(old_pte)) {
16996 + if (!primary)
16997 + return 0;
16998 + printk(KERN_WARNING "CPA: called for zero pte. "
16999 + "vaddr = %lx cpa->vaddr = %lx\n", address,
17000 + cpa->vaddr);
17001 + WARN_ON(1);
17002 + return -EINVAL;
17003 + }
17004 +
17005 + if (level == PG_LEVEL_4K) {
17006 + pte_t new_pte;
17007 + pgprot_t new_prot = pte_pgprot(old_pte);
17008 + unsigned long mfn = __pte_mfn(old_pte);
17009 +
17010 + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
17011 + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
17012 +
17013 + new_prot = static_protections(new_prot, address,
17014 + mfn_to_local_pfn(mfn));
17015 +
17016 + /*
17017 + * We need to keep the mfn from the existing PTE,
17018 + * after all we're only going to change its attributes,
17019 + * not the memory it points to
17020 + */
17021 + new_pte = pfn_pte_ma(mfn, canon_pgprot(new_prot));
17022 + cpa->pfn = mfn_to_local_pfn(mfn);
17023 + /*
17024 + * Do we really change anything ?
17025 + */
17026 + if (__pte_val(old_pte) != __pte_val(new_pte)) {
17027 + set_pte_atomic(kpte, new_pte);
17028 + cpa->flushtlb = 1;
17029 + }
17030 + cpa->numpages = 1;
17031 + return 0;
17032 + }
17033 +
17034 + /*
17035 + * Check, whether we can keep the large page intact
17036 + * and just change the pte:
17037 + */
17038 + do_split = try_preserve_large_page(kpte, address, cpa);
17039 + /*
17040 + * When the range fits into the existing large page,
17041 + * return. cpa->numpages and cpa->flushtlb have been updated in
17042 + * try_preserve_large_page():
17043 + */
17044 + if (do_split <= 0)
17045 + return do_split;
17046 +
17047 + /*
17048 + * We have to split the large page:
17049 + */
17050 + err = split_large_page(kpte, address);
17051 + if (!err) {
17052 + cpa->flushtlb = 1;
17053 + goto repeat;
17054 + }
17055 +
17056 + return err;
17057 +}
17058 +
17059 +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
17060 +
17061 +static int cpa_process_alias(struct cpa_data *cpa)
17062 +{
17063 + struct cpa_data alias_cpa;
17064 + int ret = 0;
17065 +
17066 + if (cpa->pfn > max_pfn_mapped)
17067 + return 0;
17068 +
17069 + /*
17070 + * No need to redo, when the primary call touched the direct
17071 + * mapping already:
17072 + */
17073 + if (!within(cpa->vaddr, PAGE_OFFSET,
17074 + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
17075 +
17076 + alias_cpa = *cpa;
17077 + alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
17078 +
17079 + ret = __change_page_attr_set_clr(&alias_cpa, 0);
17080 + }
17081 +
17082 +#ifdef CONFIG_X86_64
17083 + if (ret)
17084 + return ret;
17085 + /*
17086 + * No need to redo, when the primary call touched the high
17087 + * mapping already:
17088 + */
17089 + if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
17090 + return 0;
17091 +
17092 + /*
17093 + * If the physical address is inside the kernel map, we need
17094 + * to touch the high mapped kernel as well:
17095 + */
17096 + if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
17097 + return 0;
17098 +
17099 + alias_cpa = *cpa;
17100 + alias_cpa.vaddr =
17101 + (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map;
17102 +
17103 + /*
17104 + * The high mapping range is imprecise, so ignore the return value.
17105 + */
17106 + __change_page_attr_set_clr(&alias_cpa, 0);
17107 +#endif
17108 + return ret;
17109 +}
17110 +
17111 +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
17112 +{
17113 + int ret, numpages = cpa->numpages;
17114 +
17115 + while (numpages) {
17116 + /*
17117 + * Store the remaining nr of pages for the large page
17118 + * preservation check.
17119 + */
17120 + cpa->numpages = numpages;
17121 +
17122 + ret = __change_page_attr(cpa, checkalias);
17123 + if (ret)
17124 + return ret;
17125 +
17126 + if (checkalias) {
17127 + ret = cpa_process_alias(cpa);
17128 + if (ret)
17129 + return ret;
17130 + }
17131 +
17132 + /*
17133 + * Adjust the number of pages with the result of the
17134 + * CPA operation. Either a large page has been
17135 + * preserved or a single page update happened.
17136 + */
17137 + BUG_ON(cpa->numpages > numpages);
17138 + numpages -= cpa->numpages;
17139 + cpa->vaddr += cpa->numpages * PAGE_SIZE;
17140 + }
17141 + return 0;
17142 +}
17143 +
17144 +static inline int cache_attr(pgprot_t attr)
17145 +{
17146 + return pgprot_val(attr) &
17147 + (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
17148 +}
17149 +
17150 +static int change_page_attr_set_clr(unsigned long addr, int numpages,
17151 + pgprot_t mask_set, pgprot_t mask_clr)
17152 +{
17153 + struct cpa_data cpa;
17154 + int ret, cache, checkalias;
17155 +
17156 + /*
17157 + * Check, if we are requested to change a not supported
17158 + * feature:
17159 + */
17160 + mask_set = canon_pgprot(mask_set);
17161 + mask_clr = canon_pgprot(mask_clr);
17162 + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
17163 + return 0;
17164 +
17165 + /* Ensure we are PAGE_SIZE aligned */
17166 + if (addr & ~PAGE_MASK) {
17167 + addr &= PAGE_MASK;
17168 + /*
17169 + * People should not be passing in unaligned addresses:
17170 + */
17171 + WARN_ON_ONCE(1);
17172 + }
17173 +
17174 + cpa.vaddr = addr;
17175 + cpa.numpages = numpages;
17176 + cpa.mask_set = mask_set;
17177 + cpa.mask_clr = mask_clr;
17178 + cpa.flushtlb = 0;
17179 +
17180 + /* No alias checking for _NX bit modifications */
17181 + checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
17182 +
17183 + ret = __change_page_attr_set_clr(&cpa, checkalias);
17184 +
17185 + /*
17186 + * Check whether we really changed something:
17187 + */
17188 + if (!cpa.flushtlb)
17189 + goto out;
17190 +
17191 + /*
17192 + * No need to flush, when we did not set any of the caching
17193 + * attributes:
17194 + */
17195 + cache = cache_attr(mask_set);
17196 +
17197 + /*
17198 + * On success we use clflush, when the CPU supports it, to
17199 + * avoid the wbinvd. If the CPU does not support it, and in the
17200 + * error case, we fall back to cpa_flush_all() (which uses
17201 + * wbinvd):
17202 + */
17203 + if (!ret && cpu_has_clflush)
17204 + cpa_flush_range(addr, numpages, cache);
17205 + else
17206 + cpa_flush_all(cache);
17207 +
17208 +out:
17209 + cpa_fill_pool(NULL);
17210 +
17211 + return ret;
17212 +}
17213 +
17214 +static inline int change_page_attr_set(unsigned long addr, int numpages,
17215 + pgprot_t mask)
17216 +{
17217 + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
17218 +}
17219 +
17220 +static inline int change_page_attr_clear(unsigned long addr, int numpages,
17221 + pgprot_t mask)
17222 +{
17223 + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
17224 +}
17225 +
17226 +int set_memory_uc(unsigned long addr, int numpages)
17227 +{
17228 + return change_page_attr_set(addr, numpages,
17229 + __pgprot(_PAGE_PCD));
17230 +}
17231 +EXPORT_SYMBOL(set_memory_uc);
17232 +
17233 +int set_memory_wb(unsigned long addr, int numpages)
17234 +{
17235 + return change_page_attr_clear(addr, numpages,
17236 + __pgprot(_PAGE_PCD | _PAGE_PWT));
17237 +}
17238 +EXPORT_SYMBOL(set_memory_wb);
17239 +
17240 +int set_memory_x(unsigned long addr, int numpages)
17241 +{
17242 + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
17243 +}
17244 +EXPORT_SYMBOL(set_memory_x);
17245 +
17246 +int set_memory_nx(unsigned long addr, int numpages)
17247 +{
17248 + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
17249 +}
17250 +EXPORT_SYMBOL(set_memory_nx);
17251 +
17252 +int set_memory_ro(unsigned long addr, int numpages)
17253 +{
17254 + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
17255 +}
17256 +
17257 +int set_memory_rw(unsigned long addr, int numpages)
17258 +{
17259 + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
17260 +}
17261 +
17262 +int set_memory_np(unsigned long addr, int numpages)
17263 +{
17264 + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
17265 +}
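The set_memory_uc()/set_memory_wb() pair exported above is what drivers call to flip the caching attributes of a kernel-mapped buffer and to restore them before the pages go back to the allocator. A sketch of that pattern — the helper names are invented for illustration and error handling is kept minimal:

	static void *demo_alloc_uncached(unsigned int order)
	{
		struct page *page = alloc_pages(GFP_KERNEL, order);
		unsigned long addr;

		if (!page)
			return NULL;
		addr = (unsigned long)page_address(page);
		if (set_memory_uc(addr, 1 << order)) {
			__free_pages(page, order);
			return NULL;
		}
		return (void *)addr;
	}

	static void demo_free_uncached(void *buf, unsigned int order)
	{
		set_memory_wb((unsigned long)buf, 1 << order);	/* restore caching first */
		free_pages((unsigned long)buf, order);
	}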
17266 +
17267 +int set_pages_uc(struct page *page, int numpages)
17268 +{
17269 + unsigned long addr = (unsigned long)page_address(page);
17270 +
17271 + return set_memory_uc(addr, numpages);
17272 +}
17273 +EXPORT_SYMBOL(set_pages_uc);
17274 +
17275 +int set_pages_wb(struct page *page, int numpages)
17276 +{
17277 + unsigned long addr = (unsigned long)page_address(page);
17278 +
17279 + return set_memory_wb(addr, numpages);
17280 +}
17281 +EXPORT_SYMBOL(set_pages_wb);
17282 +
17283 +int set_pages_x(struct page *page, int numpages)
17284 +{
17285 + unsigned long addr = (unsigned long)page_address(page);
17286 +
17287 + return set_memory_x(addr, numpages);
17288 +}
17289 +EXPORT_SYMBOL(set_pages_x);
17290 +
17291 +int set_pages_nx(struct page *page, int numpages)
17292 +{
17293 + unsigned long addr = (unsigned long)page_address(page);
17294 +
17295 + return set_memory_nx(addr, numpages);
17296 +}
17297 +EXPORT_SYMBOL(set_pages_nx);
17298 +
17299 +int set_pages_ro(struct page *page, int numpages)
17300 +{
17301 + unsigned long addr = (unsigned long)page_address(page);
17302 +
17303 + return set_memory_ro(addr, numpages);
17304 +}
17305 +
17306 +int set_pages_rw(struct page *page, int numpages)
17307 +{
17308 + unsigned long addr = (unsigned long)page_address(page);
17309 +
17310 + return set_memory_rw(addr, numpages);
17311 +}
17312 +
17313 +#ifdef CONFIG_DEBUG_PAGEALLOC
17314 +
17315 +static int __set_pages_p(struct page *page, int numpages)
17316 +{
17317 + struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
17318 + .numpages = numpages,
17319 + .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
17320 + .mask_clr = __pgprot(0)};
17321 +
17322 + return __change_page_attr_set_clr(&cpa, 1);
17323 +}
17324 +
17325 +static int __set_pages_np(struct page *page, int numpages)
17326 +{
17327 + struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
17328 + .numpages = numpages,
17329 + .mask_set = __pgprot(0),
17330 + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
17331 +
17332 + return __change_page_attr_set_clr(&cpa, 1);
17333 +}
17334 +
17335 +void kernel_map_pages(struct page *page, int numpages, int enable)
17336 +{
17337 + if (PageHighMem(page))
17338 + return;
17339 + if (!enable) {
17340 + debug_check_no_locks_freed(page_address(page),
17341 + numpages * PAGE_SIZE);
17342 + }
17343 +
17344 + /*
17345 + * If page allocator is not up yet then do not call c_p_a():
17346 + */
17347 + if (!debug_pagealloc_enabled)
17348 + return;
17349 +
17350 + /*
17351 + * The return value is ignored as the calls cannot fail.
17352 + * Large pages are kept enabled at boot time, and are
17353 + * split up quickly with DEBUG_PAGEALLOC. If a splitup
17354 + * fails here (due to temporary memory shortage) no damage
17355 + * is done because we just keep the largepage intact up
17356 + * to the next attempt when it will likely be split up:
17357 + */
17358 + if (enable)
17359 + __set_pages_p(page, numpages);
17360 + else
17361 + __set_pages_np(page, numpages);
17362 +
17363 + /*
17364 + * We should perform an IPI and flush all tlbs,
17365 + * but that can deadlock, so flush only the current cpu:
17366 + */
17367 + __flush_tlb_all();
17368 +
17369 + /*
17370 + * Try to refill the page pool here. We can do this only after
17371 + * the tlb flush.
17372 + */
17373 + cpa_fill_pool(NULL);
17374 +}
17375 +
17376 +#ifdef CONFIG_HIBERNATION
17377 +
17378 +bool kernel_page_present(struct page *page)
17379 +{
17380 + unsigned int level;
17381 + pte_t *pte;
17382 +
17383 + if (PageHighMem(page))
17384 + return false;
17385 +
17386 + pte = lookup_address((unsigned long)page_address(page), &level);
17387 + return (__pte_val(*pte) & _PAGE_PRESENT);
17388 +}
17389 +
17390 +#endif /* CONFIG_HIBERNATION */
17391 +
17392 +#endif /* CONFIG_DEBUG_PAGEALLOC */
17393 +
17394 +static inline int in_secondary_range(unsigned long va)
17395 +{
17396 +#ifdef CONFIG_X86_64
17397 + return va >= VMALLOC_START && va < VMALLOC_END;
17398 +#else
17399 + return va >= (unsigned long)high_memory;
17400 +#endif
17401 +}
17402 +
17403 +static void __make_page_readonly(unsigned long va)
17404 +{
17405 + pte_t *pte;
17406 + unsigned int level;
17407 +
17408 + pte = lookup_address(va, &level);
17409 + BUG_ON(!pte || level != PG_LEVEL_4K);
17410 + if (HYPERVISOR_update_va_mapping(va, pte_wrprotect(*pte), 0))
17411 + BUG();
17412 + if (in_secondary_range(va)) {
17413 + unsigned long pfn = pte_pfn(*pte);
17414 +
17415 +#ifdef CONFIG_HIGHMEM
17416 + if (pfn >= highstart_pfn)
17417 + kmap_flush_unused(); /* flush stale writable kmaps */
17418 + else
17419 +#endif
17420 + __make_page_readonly((unsigned long)__va(pfn << PAGE_SHIFT));
17421 + }
17422 +}
17423 +
17424 +static void __make_page_writable(unsigned long va)
17425 +{
17426 + pte_t *pte;
17427 + unsigned int level;
17428 +
17429 + pte = lookup_address(va, &level);
17430 + BUG_ON(!pte || level != PG_LEVEL_4K);
17431 + if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
17432 + BUG();
17433 + if (in_secondary_range(va)) {
17434 + unsigned long pfn = pte_pfn(*pte);
17435 +
17436 +#ifdef CONFIG_HIGHMEM
17437 + if (pfn < highstart_pfn)
17438 +#endif
17439 + __make_page_writable((unsigned long)__va(pfn << PAGE_SHIFT));
17440 + }
17441 +}
17442 +
17443 +void make_page_readonly(void *va, unsigned int feature)
17444 +{
17445 + if (!xen_feature(feature))
17446 + __make_page_readonly((unsigned long)va);
17447 +}
17448 +
17449 +void make_page_writable(void *va, unsigned int feature)
17450 +{
17451 + if (!xen_feature(feature))
17452 + __make_page_writable((unsigned long)va);
17453 +}
17454 +
17455 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
17456 +{
17457 + unsigned long addr;
17458 +
17459 + if (xen_feature(feature))
17460 + return;
17461 +
17462 + for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
17463 + __make_page_readonly(addr);
17464 +}
17465 +
17466 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
17467 +{
17468 + unsigned long addr;
17469 +
17470 + if (xen_feature(feature))
17471 + return;
17472 +
17473 + for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
17474 + __make_page_writable(addr);
17475 +}
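make_page_readonly()/make_page_writable() take a XENFEAT_* feature bit precisely so that they collapse to no-ops when the hypervisor already tolerates writable page tables; callers bracket the lifetime of a page-table page with them. A sketch of that call pattern (the wrapper function below is hypothetical):

	static void demo_wrap_pte_page(pte_t *pte_page_va)
	{
		/* Xen only accepts read-only page-table pages. */
		make_page_readonly(pte_page_va, XENFEAT_writable_page_tables);

		/* ... page is now usable as a page table ... */

		/* Make it ordinary memory again before freeing it. */
		make_page_writable(pte_page_va, XENFEAT_writable_page_tables);
	}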
17476 +
17477 +/*
17478 + * The testcases use internal knowledge of the implementation that shouldn't
17479 + * be exposed to the rest of the kernel. Include these directly here.
17480 + */
17481 +#ifdef CONFIG_CPA_DEBUG
17482 +#include "pageattr-test.c"
17483 +#endif
17484 --- sle11-2009-05-14.orig/arch/x86/mm/pageattr_64-xen.c 2009-02-16 16:18:36.000000000 +0100
17485 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
17486 @@ -1,542 +0,0 @@
17487 -/*
17488 - * Copyright 2002 Andi Kleen, SuSE Labs.
17489 - * Thanks to Ben LaHaise for precious feedback.
17490 - */
17491 -
17492 -#include <linux/mm.h>
17493 -#include <linux/sched.h>
17494 -#include <linux/highmem.h>
17495 -#include <linux/module.h>
17496 -#include <linux/slab.h>
17497 -#include <asm/uaccess.h>
17498 -#include <asm/processor.h>
17499 -#include <asm/tlbflush.h>
17500 -#include <asm/io.h>
17501 -
17502 -#ifdef CONFIG_XEN
17503 -#include <asm/pgalloc.h>
17504 -#include <asm/mmu_context.h>
17505 -
17506 -static void _pin_lock(struct mm_struct *mm, int lock) {
17507 - if (lock)
17508 - spin_lock(&mm->page_table_lock);
17509 -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
17510 - /* While mm->page_table_lock protects us against insertions and
17511 - * removals of higher level page table pages, it doesn't protect
17512 - * against updates of pte-s. Such updates, however, require the
17513 - * pte pages to be in consistent state (unpinned+writable or
17514 - * pinned+readonly). The pinning and attribute changes, however
17515 - * cannot be done atomically, which is why such updates must be
17516 - * prevented from happening concurrently.
17517 - * Note that no pte lock can ever elsewhere be acquired nesting
17518 - * with an already acquired one in the same mm, or with the mm's
17519 - * page_table_lock already acquired, as that would break in the
17520 - * non-split case (where all these are actually resolving to the
17521 - * one page_table_lock). Thus acquiring all of them here is not
17522 - * going to result in dead locks, and the order of acquires
17523 - * doesn't matter.
17524 - */
17525 - {
17526 - pgd_t *pgd = mm->pgd;
17527 - unsigned g;
17528 -
17529 - for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
17530 - pud_t *pud;
17531 - unsigned u;
17532 -
17533 - if (pgd_none(*pgd))
17534 - continue;
17535 - pud = pud_offset(pgd, 0);
17536 - for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
17537 - pmd_t *pmd;
17538 - unsigned m;
17539 -
17540 - if (pud_none(*pud))
17541 - continue;
17542 - pmd = pmd_offset(pud, 0);
17543 - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
17544 - spinlock_t *ptl;
17545 -
17546 - if (pmd_none(*pmd))
17547 - continue;
17548 - ptl = pte_lockptr(0, pmd);
17549 - if (lock)
17550 - spin_lock(ptl);
17551 - else
17552 - spin_unlock(ptl);
17553 - }
17554 - }
17555 - }
17556 - }
17557 -#endif
17558 - if (!lock)
17559 - spin_unlock(&mm->page_table_lock);
17560 -}
17561 -#define pin_lock(mm) _pin_lock(mm, 1)
17562 -#define pin_unlock(mm) _pin_lock(mm, 0)
17563 -
17564 -#define PIN_BATCH 8
17565 -static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
17566 -
17567 -static inline unsigned int pgd_walk_set_prot(void *pt, pgprot_t flags,
17568 - unsigned int cpu, unsigned int seq)
17569 -{
17570 - struct page *page = virt_to_page(pt);
17571 - unsigned long pfn = page_to_pfn(page);
17572 -
17573 - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
17574 - (unsigned long)__va(pfn << PAGE_SHIFT),
17575 - pfn_pte(pfn, flags), 0);
17576 - if (unlikely(++seq == PIN_BATCH)) {
17577 - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
17578 - PIN_BATCH, NULL)))
17579 - BUG();
17580 - seq = 0;
17581 - }
17582 -
17583 - return seq;
17584 -}
17585 -
17586 -static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
17587 -{
17588 - pgd_t *pgd = pgd_base;
17589 - pud_t *pud;
17590 - pmd_t *pmd;
17591 - pte_t *pte;
17592 - int g,u,m;
17593 - unsigned int cpu, seq;
17594 - multicall_entry_t *mcl;
17595 -
17596 - cpu = get_cpu();
17597 -
17598 - /*
17599 - * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
17600 - * be the 'current' task's pagetables (e.g., current may be 32-bit,
17601 - * but the pagetables may be for a 64-bit task).
17602 - * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
17603 - * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
17604 - */
17605 - for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
17606 - if (pgd_none(*pgd))
17607 - continue;
17608 - pud = pud_offset(pgd, 0);
17609 - if (PTRS_PER_PUD > 1) /* not folded */
17610 - seq = pgd_walk_set_prot(pud,flags,cpu,seq);
17611 - for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
17612 - if (pud_none(*pud))
17613 - continue;
17614 - pmd = pmd_offset(pud, 0);
17615 - if (PTRS_PER_PMD > 1) /* not folded */
17616 - seq = pgd_walk_set_prot(pmd,flags,cpu,seq);
17617 - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
17618 - if (pmd_none(*pmd))
17619 - continue;
17620 - pte = pte_offset_kernel(pmd,0);
17621 - seq = pgd_walk_set_prot(pte,flags,cpu,seq);
17622 - }
17623 - }
17624 - }
17625 -
17626 - mcl = per_cpu(pb_mcl, cpu);
17627 - if (unlikely(seq > PIN_BATCH - 2)) {
17628 - if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
17629 - BUG();
17630 - seq = 0;
17631 - }
17632 - MULTI_update_va_mapping(mcl + seq,
17633 - (unsigned long)__user_pgd(pgd_base),
17634 - pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
17635 - 0);
17636 - MULTI_update_va_mapping(mcl + seq + 1,
17637 - (unsigned long)pgd_base,
17638 - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
17639 - UVMF_TLB_FLUSH);
17640 - if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
17641 - BUG();
17642 -
17643 - put_cpu();
17644 -}
17645 -
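pgd_walk() above queues one MMU update per page-table page into a per-CPU buffer of PIN_BATCH multicall entries, flushes the buffer each time it fills, and issues a final flush that also covers the user and kernel pgd pages. The batch-and-flush shape of that loop is shown in the minimal userspace sketch below; the op structure, the BATCH size and flush_batch() are illustrative stand-ins, not the Xen multicall interface.

#include <stdio.h>

#define BATCH 8                 /* stands in for PIN_BATCH */

struct op { int arg; };         /* stands in for multicall_entry_t */

static struct op batch[BATCH];

/* Submit everything queued so far in one go (models HYPERVISOR_multicall_check). */
static void flush_batch(unsigned int n)
{
    for (unsigned int i = 0; i < n; i++)
        printf("apply op %d\n", batch[i].arg);
}

/* Queue one op; flush and restart the batch when it is full. */
static unsigned int queue_op(unsigned int seq, int arg)
{
    batch[seq].arg = arg;
    if (++seq == BATCH) {
        flush_batch(BATCH);
        seq = 0;
    }
    return seq;
}

int main(void)
{
    unsigned int seq = 0;

    for (int i = 0; i < 20; i++)        /* models visiting the page-table pages */
        seq = queue_op(seq, i);

    flush_batch(seq);                   /* final partial batch, like the trailing pgd updates */
    return 0;
}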
17646 -static void __pgd_pin(pgd_t *pgd)
17647 -{
17648 - pgd_walk(pgd, PAGE_KERNEL_RO);
17649 - xen_pgd_pin(__pa(pgd)); /* kernel */
17650 - xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
17651 - SetPagePinned(virt_to_page(pgd));
17652 -}
17653 -
17654 -static void __pgd_unpin(pgd_t *pgd)
17655 -{
17656 - xen_pgd_unpin(__pa(pgd));
17657 - xen_pgd_unpin(__pa(__user_pgd(pgd)));
17658 - pgd_walk(pgd, PAGE_KERNEL);
17659 - ClearPagePinned(virt_to_page(pgd));
17660 -}
17661 -
17662 -void pgd_test_and_unpin(pgd_t *pgd)
17663 -{
17664 - if (PagePinned(virt_to_page(pgd)))
17665 - __pgd_unpin(pgd);
17666 -}
17667 -
17668 -void mm_pin(struct mm_struct *mm)
17669 -{
17670 - if (xen_feature(XENFEAT_writable_page_tables))
17671 - return;
17672 -
17673 - pin_lock(mm);
17674 - __pgd_pin(mm->pgd);
17675 - pin_unlock(mm);
17676 -}
17677 -
17678 -void mm_unpin(struct mm_struct *mm)
17679 -{
17680 - if (xen_feature(XENFEAT_writable_page_tables))
17681 - return;
17682 -
17683 - pin_lock(mm);
17684 - __pgd_unpin(mm->pgd);
17685 - pin_unlock(mm);
17686 -}
17687 -
17688 -void mm_pin_all(void)
17689 -{
17690 - struct page *page;
17691 - unsigned long flags;
17692 -
17693 - if (xen_feature(XENFEAT_writable_page_tables))
17694 - return;
17695 -
17696 - /*
17697 - * Allow uninterrupted access to the pgd_list. Also protects
17698 - * __pgd_pin() by disabling preemption.
17699 - * All other CPUs must be at a safe point (e.g., in stop_machine
17700 - * or offlined entirely).
17701 - */
17702 - spin_lock_irqsave(&pgd_lock, flags);
17703 - list_for_each_entry(page, &pgd_list, lru) {
17704 - if (!PagePinned(page))
17705 - __pgd_pin((pgd_t *)page_address(page));
17706 - }
17707 - spin_unlock_irqrestore(&pgd_lock, flags);
17708 -}
17709 -
17710 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
17711 -{
17712 - if (!PagePinned(virt_to_page(mm->pgd)))
17713 - mm_pin(mm);
17714 -}
17715 -
17716 -void arch_exit_mmap(struct mm_struct *mm)
17717 -{
17718 - struct task_struct *tsk = current;
17719 -
17720 - task_lock(tsk);
17721 -
17722 - /*
17723 - * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
17724 - * *much* faster this way, as no tlb flushes means bigger wrpt batches.
17725 - */
17726 - if (tsk->active_mm == mm) {
17727 - tsk->active_mm = &init_mm;
17728 - atomic_inc(&init_mm.mm_count);
17729 -
17730 - switch_mm(mm, &init_mm, tsk);
17731 -
17732 - atomic_dec(&mm->mm_count);
17733 - BUG_ON(atomic_read(&mm->mm_count) == 0);
17734 - }
17735 -
17736 - task_unlock(tsk);
17737 -
17738 - if (PagePinned(virt_to_page(mm->pgd))
17739 - && (atomic_read(&mm->mm_count) == 1)
17740 - && !mm->context.has_foreign_mappings)
17741 - mm_unpin(mm);
17742 -}
17743 -
17744 -static void _pte_free(struct page *page, unsigned int order)
17745 -{
17746 - BUG_ON(order);
17747 - pte_free(page);
17748 -}
17749 -
17750 -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
17751 -{
17752 - struct page *pte;
17753 -
17754 - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
17755 - if (pte) {
17756 - SetPageForeign(pte, _pte_free);
17757 - init_page_count(pte);
17758 - }
17759 - return pte;
17760 -}
17761 -
17762 -void pte_free(struct page *pte)
17763 -{
17764 - unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
17765 -
17766 - if (!pte_write(*virt_to_ptep(va)))
17767 - if (HYPERVISOR_update_va_mapping(
17768 - va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0))
17769 - BUG();
17770 -
17771 - ClearPageForeign(pte);
17772 - init_page_count(pte);
17773 -
17774 - __free_page(pte);
17775 -}
17776 -#endif /* CONFIG_XEN */
17777 -
17778 -pte_t *lookup_address(unsigned long address)
17779 -{
17780 - pgd_t *pgd = pgd_offset_k(address);
17781 - pud_t *pud;
17782 - pmd_t *pmd;
17783 - pte_t *pte;
17784 - if (pgd_none(*pgd))
17785 - return NULL;
17786 - pud = pud_offset(pgd, address);
17787 - if (!pud_present(*pud))
17788 - return NULL;
17789 - pmd = pmd_offset(pud, address);
17790 - if (!pmd_present(*pmd))
17791 - return NULL;
17792 - if (pmd_large(*pmd))
17793 - return (pte_t *)pmd;
17794 - pte = pte_offset_kernel(pmd, address);
17795 - if (pte && !pte_present(*pte))
17796 - pte = NULL;
17797 - return pte;
17798 -}
17799 -
17800 -static struct page *split_large_page(unsigned long address, pgprot_t prot,
17801 - pgprot_t ref_prot)
17802 -{
17803 - int i;
17804 - unsigned long addr;
17805 - struct page *base = alloc_pages(GFP_KERNEL, 0);
17806 - pte_t *pbase;
17807 - if (!base)
17808 - return NULL;
17809 - /*
17810 - * page_private is used to track the number of entries in
17811 - * the page table page have non standard attributes.
17812 - */
17813 - SetPagePrivate(base);
17814 - page_private(base) = 0;
17815 -
17816 - address = __pa(address);
17817 - addr = address & LARGE_PAGE_MASK;
17818 - pbase = (pte_t *)page_address(base);
17819 - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
17820 - pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
17821 - addr == address ? prot : ref_prot);
17822 - }
17823 - return base;
17824 -}
17825 -
17826 -void clflush_cache_range(void *adr, int size)
17827 -{
17828 - int i;
17829 - for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size)
17830 - clflush(adr+i);
17831 -}
17832 -
17833 -static void flush_kernel_map(void *arg)
17834 -{
17835 - struct list_head *l = (struct list_head *)arg;
17836 - struct page *pg;
17837 -
17838 - /* When clflush is available always use it because it is
17839 - much cheaper than WBINVD. */
17840 - /* clflush is still broken. Disable for now. */
17841 - if (1 || !cpu_has_clflush)
17842 - asm volatile("wbinvd" ::: "memory");
17843 - else list_for_each_entry(pg, l, lru) {
17844 - void *adr = page_address(pg);
17845 - clflush_cache_range(adr, PAGE_SIZE);
17846 - }
17847 - __flush_tlb_all();
17848 -}
17849 -
17850 -static inline void flush_map(struct list_head *l)
17851 -{
17852 - on_each_cpu(flush_kernel_map, l, 1, 1);
17853 -}
17854 -
17855 -static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
17856 -
17857 -static inline void save_page(struct page *fpage)
17858 -{
17859 - if (!test_and_set_bit(PG_arch_1, &fpage->flags))
17860 - list_add(&fpage->lru, &deferred_pages);
17861 -}
17862 -
17863 -/*
17864 - * No more special protections in this 2/4MB area - revert to a
17865 - * large page again.
17866 - */
17867 -static void revert_page(unsigned long address, pgprot_t ref_prot)
17868 -{
17869 - pgd_t *pgd;
17870 - pud_t *pud;
17871 - pmd_t *pmd;
17872 - pte_t large_pte;
17873 - unsigned long pfn;
17874 -
17875 - pgd = pgd_offset_k(address);
17876 - BUG_ON(pgd_none(*pgd));
17877 - pud = pud_offset(pgd,address);
17878 - BUG_ON(pud_none(*pud));
17879 - pmd = pmd_offset(pud, address);
17880 - BUG_ON(__pmd_val(*pmd) & _PAGE_PSE);
17881 - pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
17882 - large_pte = pfn_pte(pfn, ref_prot);
17883 - large_pte = pte_mkhuge(large_pte);
17884 - set_pte((pte_t *)pmd, large_pte);
17885 -}
17886 -
17887 -static int
17888 -__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
17889 - pgprot_t ref_prot)
17890 -{
17891 - pte_t *kpte;
17892 - struct page *kpte_page;
17893 - pgprot_t ref_prot2;
17894 -
17895 - kpte = lookup_address(address);
17896 - if (!kpte) return 0;
17897 - kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
17898 - BUG_ON(PageLRU(kpte_page));
17899 - BUG_ON(PageCompound(kpte_page));
17900 - if (pgprot_val(prot) != pgprot_val(ref_prot)) {
17901 - if (!pte_huge(*kpte)) {
17902 - set_pte(kpte, pfn_pte(pfn, prot));
17903 - } else {
17904 - /*
17905 - * split_large_page will take the reference for this
17906 - * change_page_attr on the split page.
17907 - */
17908 - struct page *split;
17909 - ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
17910 - split = split_large_page(address, prot, ref_prot2);
17911 - if (!split)
17912 - return -ENOMEM;
17913 - pgprot_val(ref_prot2) &= ~_PAGE_NX;
17914 - set_pte(kpte, mk_pte(split, ref_prot2));
17915 - kpte_page = split;
17916 - }
17917 - page_private(kpte_page)++;
17918 - } else if (!pte_huge(*kpte)) {
17919 - set_pte(kpte, pfn_pte(pfn, ref_prot));
17920 - BUG_ON(page_private(kpte_page) == 0);
17921 - page_private(kpte_page)--;
17922 - } else
17923 - BUG();
17924 -
17925 - /* on x86-64 the direct mapping set at boot is not using 4k pages */
17926 - /*
17927 - * ..., but the XEN guest kernels (currently) do:
17928 - * If the pte was reserved, it means it was created at boot
17929 - * time (not via split_large_page) and in turn we must not
17930 - * replace it with a large page.
17931 - */
17932 -#ifndef CONFIG_XEN
17933 - BUG_ON(PageReserved(kpte_page));
17934 -#else
17935 - if (PageReserved(kpte_page))
17936 - return 0;
17937 -#endif
17938 -
17939 - save_page(kpte_page);
17940 - if (page_private(kpte_page) == 0)
17941 - revert_page(address, ref_prot);
17942 - return 0;
17943 -}
17944 -
17945 -/*
17946 - * Change the page attributes of an page in the linear mapping.
17947 - *
17948 - * This should be used when a page is mapped with a different caching policy
17949 - * than write-back somewhere - some CPUs do not like it when mappings with
17950 - * different caching policies exist. This changes the page attributes of the
17951 - * in kernel linear mapping too.
17952 - *
17953 - * The caller needs to ensure that there are no conflicting mappings elsewhere.
17954 - * This function only deals with the kernel linear map.
17955 - *
17956 - * Caller must call global_flush_tlb() after this.
17957 - */
17958 -int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
17959 -{
17960 - int err = 0, kernel_map = 0;
17961 - int i;
17962 -
17963 - if (address >= __START_KERNEL_map
17964 - && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
17965 - address = (unsigned long)__va(__pa(address));
17966 - kernel_map = 1;
17967 - }
17968 -
17969 - down_write(&init_mm.mmap_sem);
17970 - for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
17971 - unsigned long pfn = __pa(address) >> PAGE_SHIFT;
17972 -
17973 - if (!kernel_map || pte_present(pfn_pte(0, prot))) {
17974 - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
17975 - if (err)
17976 - break;
17977 - }
17978 - /* Handle kernel mapping too which aliases part of the
17979 - * lowmem */
17980 - if (__pa(address) < KERNEL_TEXT_SIZE) {
17981 - unsigned long addr2;
17982 - pgprot_t prot2;
17983 - addr2 = __START_KERNEL_map + __pa(address);
17984 - /* Make sure the kernel mappings stay executable */
17985 - prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
17986 - err = __change_page_attr(addr2, pfn, prot2,
17987 - PAGE_KERNEL_EXEC);
17988 - }
17989 - }
17990 - up_write(&init_mm.mmap_sem);
17991 - return err;
17992 -}
17993 -
17994 -/* Don't call this for MMIO areas that may not have a mem_map entry */
17995 -int change_page_attr(struct page *page, int numpages, pgprot_t prot)
17996 -{
17997 - unsigned long addr = (unsigned long)page_address(page);
17998 - return change_page_attr_addr(addr, numpages, prot);
17999 -}
18000 -
18001 -void global_flush_tlb(void)
18002 -{
18003 - struct page *pg, *next;
18004 - struct list_head l;
18005 -
18006 - /*
18007 - * Write-protect the semaphore, to exclude two contexts
18008 - * doing a list_replace_init() call in parallel and to
18009 - * exclude new additions to the deferred_pages list:
18010 - */
18011 - down_write(&init_mm.mmap_sem);
18012 - list_replace_init(&deferred_pages, &l);
18013 - up_write(&init_mm.mmap_sem);
18014 -
18015 - flush_map(&l);
18016 -
18017 - list_for_each_entry_safe(pg, next, &l, lru) {
18018 - list_del(&pg->lru);
18019 - clear_bit(PG_arch_1, &pg->flags);
18020 - if (page_private(pg) != 0)
18021 - continue;
18022 - ClearPagePrivate(pg);
18023 - __free_page(pg);
18024 - }
18025 -}
18026 -
18027 -EXPORT_SYMBOL(change_page_attr);
18028 -EXPORT_SYMBOL(global_flush_tlb);
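change_page_attr() above parks every touched pte page on the deferred_pages list, and global_flush_tlb() later splices that list out under init_mm.mmap_sem, flushes caches and TLBs once for the whole batch, and frees the pages whose private count has returned to zero. A minimal userspace sketch of that splice-then-process pattern follows; the struct name, the refs field and the printed messages are made up for illustration.

#include <stdio.h>
#include <stdlib.h>

struct deferred {
    struct deferred *next;
    int refs;                   /* models page_private(): entries still carrying special attributes */
};

static struct deferred *deferred_pages;     /* models the deferred_pages list */

static void save_page(struct deferred *d)   /* queue a page for the next global flush */
{
    d->next = deferred_pages;
    deferred_pages = d;
}

static void global_flush(void)
{
    /* Splice the list out first (the kernel does this with list_replace_init
     * under init_mm.mmap_sem), then work on the private copy. */
    struct deferred *l = deferred_pages;
    deferred_pages = NULL;

    printf("flush caches/TLBs once for the whole batch\n");

    while (l) {
        struct deferred *next = l->next;
        if (l->refs == 0) {             /* all entries back to default: page can go */
            printf("freeing %p\n", (void *)l);
            free(l);
        }                               /* otherwise leave it alone, as the kernel does */
        l = next;
    }
}

int main(void)
{
    for (int i = 0; i < 3; i++) {
        struct deferred *d = calloc(1, sizeof(*d));
        if (!d)
            return 1;
        d->refs = i % 2;                /* pretend some pages still hold special entries */
        save_page(d);
    }
    global_flush();
    return 0;
}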
18029 --- sle11-2009-05-14.orig/arch/x86/mm/pgtable_32-xen.c 2009-02-16 16:18:36.000000000 +0100
18030 +++ sle11-2009-05-14/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:33:40.000000000 +0100
18031 @@ -29,8 +29,6 @@
18032 #include <xen/features.h>
18033 #include <asm/hypervisor.h>
18034
18035 -static void pgd_test_and_unpin(pgd_t *pgd);
18036 -
18037 void show_mem(void)
18038 {
18039 int total = 0, reserved = 0;
18040 @@ -167,53 +165,6 @@ pte_t *pte_alloc_one_kernel(struct mm_st
18041 return pte;
18042 }
18043
18044 -static void _pte_free(struct page *page, unsigned int order)
18045 -{
18046 - BUG_ON(order);
18047 - pte_free(page);
18048 -}
18049 -
18050 -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
18051 -{
18052 - struct page *pte;
18053 -
18054 -#ifdef CONFIG_HIGHPTE
18055 - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
18056 -#else
18057 - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
18058 -#endif
18059 - if (pte) {
18060 - SetPageForeign(pte, _pte_free);
18061 - init_page_count(pte);
18062 - }
18063 - return pte;
18064 -}
18065 -
18066 -void pte_free(struct page *pte)
18067 -{
18068 - unsigned long pfn = page_to_pfn(pte);
18069 -
18070 - if (!PageHighMem(pte)) {
18071 - unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT);
18072 -
18073 - if (!pte_write(*virt_to_ptep(va)))
18074 - if (HYPERVISOR_update_va_mapping(
18075 - va, pfn_pte(pfn, PAGE_KERNEL), 0))
18076 - BUG();
18077 - } else
18078 - ClearPagePinned(pte);
18079 -
18080 - ClearPageForeign(pte);
18081 - init_page_count(pte);
18082 -
18083 - __free_page(pte);
18084 -}
18085 -
18086 -void pmd_ctor(struct kmem_cache *cache, void *pmd)
18087 -{
18088 - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18089 -}
18090 -
18091 /*
18092 * List of all pgd's needed for non-PAE so it can invalidate entries
18093 * in both cached and uncached pgd's; not needed for PAE since the
18094 @@ -224,224 +175,191 @@ void pmd_ctor(struct kmem_cache *cache,
18095 * vmalloc faults work because attached pagetables are never freed.
18096 * -- wli
18097 */
18098 -DEFINE_SPINLOCK(pgd_lock);
18099 -struct page *pgd_list;
18100 -
18101 static inline void pgd_list_add(pgd_t *pgd)
18102 {
18103 struct page *page = virt_to_page(pgd);
18104 - page->index = (unsigned long)pgd_list;
18105 - if (pgd_list)
18106 - set_page_private(pgd_list, (unsigned long)&page->index);
18107 - pgd_list = page;
18108 - set_page_private(page, (unsigned long)&pgd_list);
18109 +
18110 + list_add(&page->lru, &pgd_list);
18111 }
18112
18113 static inline void pgd_list_del(pgd_t *pgd)
18114 {
18115 - struct page *next, **pprev, *page = virt_to_page(pgd);
18116 - next = (struct page *)page->index;
18117 - pprev = (struct page **)page_private(page);
18118 - *pprev = next;
18119 - if (next)
18120 - set_page_private(next, (unsigned long)pprev);
18121 -}
18122 + struct page *page = virt_to_page(pgd);
18123
18124 + list_del(&page->lru);
18125 +}
18126
18127 +#define UNSHARED_PTRS_PER_PGD \
18128 + (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
18129
18130 -#if (PTRS_PER_PMD == 1)
18131 -/* Non-PAE pgd constructor */
18132 -static void pgd_ctor(void *pgd)
18133 +static void pgd_ctor(void *p)
18134 {
18135 + pgd_t *pgd = p;
18136 unsigned long flags;
18137
18138 - /* !PAE, no pagetable sharing */
18139 + pgd_test_and_unpin(pgd);
18140 +
18141 + /* Clear usermode parts of PGD */
18142 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
18143
18144 spin_lock_irqsave(&pgd_lock, flags);
18145
18146 - /* must happen under lock */
18147 - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
18148 - swapper_pg_dir + USER_PTRS_PER_PGD,
18149 - KERNEL_PGD_PTRS);
18150 -
18151 - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
18152 - __pa(swapper_pg_dir) >> PAGE_SHIFT,
18153 - USER_PTRS_PER_PGD,
18154 - KERNEL_PGD_PTRS);
18155 - pgd_list_add(pgd);
18156 - spin_unlock_irqrestore(&pgd_lock, flags);
18157 -}
18158 -#else /* PTRS_PER_PMD > 1 */
18159 -/* PAE pgd constructor */
18160 -static void pgd_ctor(void *pgd)
18161 -{
18162 - /* PAE, kernel PMD may be shared */
18163 -
18164 - if (SHARED_KERNEL_PMD) {
18165 - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
18166 + /* If the pgd points to a shared pagetable level (either the
18167 + ptes in non-PAE, or shared PMD in PAE), then just copy the
18168 + references from swapper_pg_dir. */
18169 + if (PAGETABLE_LEVELS == 2 ||
18170 + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
18171 + clone_pgd_range(pgd + USER_PTRS_PER_PGD,
18172 swapper_pg_dir + USER_PTRS_PER_PGD,
18173 KERNEL_PGD_PTRS);
18174 - } else {
18175 - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
18176 + paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
18177 + __pa(swapper_pg_dir) >> PAGE_SHIFT,
18178 + USER_PTRS_PER_PGD,
18179 + KERNEL_PGD_PTRS);
18180 }
18181 +
18182 + /* list required to sync kernel mapping updates */
18183 + if (PAGETABLE_LEVELS == 2)
18184 + pgd_list_add(pgd);
18185 +
18186 + spin_unlock_irqrestore(&pgd_lock, flags);
18187 }
18188 -#endif /* PTRS_PER_PMD */
18189
18190 static void pgd_dtor(void *pgd)
18191 {
18192 unsigned long flags; /* can be called from interrupt context */
18193
18194 - if (SHARED_KERNEL_PMD)
18195 - return;
18196 -
18197 - paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
18198 - spin_lock_irqsave(&pgd_lock, flags);
18199 - pgd_list_del(pgd);
18200 - spin_unlock_irqrestore(&pgd_lock, flags);
18201 + if (!SHARED_KERNEL_PMD) {
18202 + spin_lock_irqsave(&pgd_lock, flags);
18203 + pgd_list_del(pgd);
18204 + spin_unlock_irqrestore(&pgd_lock, flags);
18205 + }
18206
18207 pgd_test_and_unpin(pgd);
18208 }
18209
18210 -#define UNSHARED_PTRS_PER_PGD \
18211 - (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
18212 -
18213 -/* If we allocate a pmd for part of the kernel address space, then
18214 - make sure its initialized with the appropriate kernel mappings.
18215 - Otherwise use a cached zeroed pmd. */
18216 -static pmd_t *pmd_cache_alloc(int idx)
18217 +#ifdef CONFIG_X86_PAE
18218 +/*
18219 + * Mop up any pmd pages which may still be attached to the pgd.
18220 + * Normally they will be freed by munmap/exit_mmap, but any pmd we
18221 + * preallocate which never got a corresponding vma will need to be
18222 + * freed manually.
18223 + */
18224 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
18225 {
18226 - pmd_t *pmd;
18227 + int i;
18228
18229 - if (idx >= USER_PTRS_PER_PGD) {
18230 - pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
18231 + for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
18232 + pgd_t pgd = pgdp[i];
18233
18234 -#ifndef CONFIG_XEN
18235 - if (pmd)
18236 - memcpy(pmd,
18237 - (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
18238 - sizeof(pmd_t) * PTRS_PER_PMD);
18239 -#endif
18240 - } else
18241 - pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
18242 + if (__pgd_val(pgd) != 0) {
18243 + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
18244
18245 - return pmd;
18246 -}
18247 + pgdp[i] = xen_make_pgd(0);
18248
18249 -static void pmd_cache_free(pmd_t *pmd, int idx)
18250 -{
18251 - if (idx >= USER_PTRS_PER_PGD) {
18252 - make_lowmem_page_writable(pmd, XENFEAT_writable_page_tables);
18253 - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18254 - free_page((unsigned long)pmd);
18255 - } else
18256 - kmem_cache_free(pmd_cache, pmd);
18257 + paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
18258 + pmd_free(mm, pmd);
18259 + }
18260 + }
18261 }
18262
18263 -pgd_t *pgd_alloc(struct mm_struct *mm)
18264 +/*
18265 + * In PAE mode, we need to do a cr3 reload (=tlb flush) when
18266 + * updating the top-level pagetable entries to guarantee the
18267 + * processor notices the update. Since this is expensive, and
18268 + * all 4 top-level entries are used almost immediately in a
18269 + * new process's life, we just pre-populate them here.
18270 + *
18271 + * Also, if we're in a paravirt environment where the kernel pmd is
18272 + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
18273 + * and initialize the kernel pmds here.
18274 + */
18275 +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
18276 {
18277 + pud_t *pud;
18278 + pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
18279 + unsigned long addr, flags;
18280 int i;
18281 - pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
18282 - pmd_t **pmds = NULL;
18283 - unsigned long flags;
18284 -
18285 - pgd_test_and_unpin(pgd);
18286 -
18287 - if (PTRS_PER_PMD == 1 || !pgd)
18288 - return pgd;
18289 -
18290 -#ifdef CONFIG_XEN
18291 - if (!SHARED_KERNEL_PMD) {
18292 - /*
18293 - * We can race save/restore (if we sleep during a GFP_KERNEL memory
18294 - * allocation). We therefore store virtual addresses of pmds as they
18295 - * do not change across save/restore, and poke the machine addresses
18296 - * into the pgdir under the pgd_lock.
18297 - */
18298 - pmds = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
18299 - if (!pmds) {
18300 - quicklist_free(0, pgd_dtor, pgd);
18301 - return NULL;
18302 - }
18303 - }
18304 -#endif
18305
18306 - /* Allocate pmds, remember virtual addresses. */
18307 - for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
18308 - pmd_t *pmd = pmd_cache_alloc(i);
18309 -
18310 - if (!pmd)
18311 + /*
18312 + * We can race save/restore (if we sleep during a GFP_KERNEL memory
18313 + * allocation). We therefore store virtual addresses of pmds as they
18314 + * do not change across save/restore, and poke the machine addresses
18315 + * into the pgdir under the pgd_lock.
18316 + */
18317 + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
18318 + pmds[i] = pmd_alloc_one(mm, addr);
18319 + if (!pmds[i])
18320 goto out_oom;
18321 -
18322 - paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
18323 - if (pmds)
18324 - pmds[i] = pmd;
18325 - else
18326 - set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
18327 }
18328
18329 -#ifdef CONFIG_XEN
18330 - if (SHARED_KERNEL_PMD)
18331 - return pgd;
18332 -
18333 spin_lock_irqsave(&pgd_lock, flags);
18334
18335 /* Protect against save/restore: move below 4GB under pgd_lock. */
18336 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
18337 - int rc = xen_create_contiguous_region(
18338 - (unsigned long)pgd, 0, 32);
18339 - if (rc) {
18340 - spin_unlock_irqrestore(&pgd_lock, flags);
18341 - goto out_oom;
18342 - }
18343 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
18344 + && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
18345 + spin_unlock_irqrestore(&pgd_lock, flags);
18346 +out_oom:
18347 + while (i--)
18348 + pmd_free(mm, pmds[i]);
18349 + return 0;
18350 }
18351
18352 /* Copy kernel pmd contents and write-protect the new pmds. */
18353 - for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
18354 - memcpy(pmds[i],
18355 - (void *)pgd_page_vaddr(swapper_pg_dir[i]),
18356 - sizeof(pmd_t) * PTRS_PER_PMD);
18357 - make_lowmem_page_readonly(
18358 - pmds[i], XENFEAT_writable_page_tables);
18359 - }
18360 + pud = pud_offset(pgd, 0);
18361 + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
18362 + i++, pud++, addr += PUD_SIZE) {
18363 + if (i >= USER_PTRS_PER_PGD) {
18364 + memcpy(pmds[i],
18365 + (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
18366 + sizeof(pmd_t) * PTRS_PER_PMD);
18367 + make_lowmem_page_readonly(
18368 + pmds[i], XENFEAT_writable_page_tables);
18369 + }
18370
18371 - /* It is safe to poke machine addresses of pmds under the pmd_lock. */
18372 - for (i = 0; i < PTRS_PER_PGD; i++)
18373 - set_pgd(&pgd[i], __pgd(1 + __pa(pmds[i])));
18374 + /* It is safe to poke machine addresses of pmds under the pgd_lock. */
18375 + pud_populate(mm, pud, pmds[i]);
18376 + }
18377
18378 - /* Ensure this pgd gets picked up and pinned on save/restore. */
18379 + /* List required to sync kernel mapping updates and
18380 + * to pin/unpin on save/restore. */
18381 pgd_list_add(pgd);
18382
18383 spin_unlock_irqrestore(&pgd_lock, flags);
18384
18385 - kfree(pmds);
18386 -#endif
18387 + return 1;
18388 +}
18389 +#else /* !CONFIG_X86_PAE */
18390 +/* No need to prepopulate any pagetable entries in non-PAE modes. */
18391 +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
18392 +{
18393 + return 1;
18394 +}
18395
18396 - return pgd;
18397 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
18398 +{
18399 +}
18400 +#endif /* CONFIG_X86_PAE */
18401
18402 -out_oom:
18403 - if (!pmds) {
18404 - for (i--; i >= 0; i--) {
18405 - pgd_t pgdent = pgd[i];
18406 - void* pmd = (void *)__va(pgd_val(pgdent)-1);
18407 - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18408 - pmd_cache_free(pmd, i);
18409 - }
18410 - } else {
18411 - for (i--; i >= 0; i--) {
18412 - paravirt_release_pd(__pa(pmds[i]) >> PAGE_SHIFT);
18413 - pmd_cache_free(pmds[i], i);
18414 - }
18415 - kfree(pmds);
18416 +pgd_t *pgd_alloc(struct mm_struct *mm)
18417 +{
18418 + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
18419 +
18420 + /* so that alloc_pd can use it */
18421 + mm->pgd = pgd;
18422 + if (pgd)
18423 + pgd_ctor(pgd);
18424 +
18425 + if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
18426 + free_page((unsigned long)pgd);
18427 + pgd = NULL;
18428 }
18429 - quicklist_free(0, pgd_dtor, pgd);
18430 - return NULL;
18431 +
18432 + return pgd;
18433 }
18434
18435 -void pgd_free(pgd_t *pgd)
18436 +void pgd_free(struct mm_struct *mm, pgd_t *pgd)
18437 {
18438 - int i;
18439 -
18440 /*
18441 * After this the pgd should not be pinned for the duration of this
18442 * function's execution. We should never sleep and thus never race:
18443 @@ -450,39 +368,43 @@ void pgd_free(pgd_t *pgd)
18444 * 2. The machine addresses in PGD entries will not become invalid
18445 * due to a concurrent save/restore.
18446 */
18447 - pgd_test_and_unpin(pgd);
18448 + pgd_dtor(pgd);
18449
18450 - /* in the PAE case user pgd entries are overwritten before usage */
18451 - if (PTRS_PER_PMD > 1) {
18452 - for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
18453 - pgd_t pgdent = pgd[i];
18454 - void* pmd = (void *)__va(pgd_val(pgdent)-1);
18455 - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18456 - pmd_cache_free(pmd, i);
18457 - }
18458 + if (PTRS_PER_PMD > 1 && !xen_feature(XENFEAT_pae_pgdir_above_4gb))
18459 + xen_destroy_contiguous_region((unsigned long)pgd, 0);
18460
18461 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
18462 - xen_destroy_contiguous_region((unsigned long)pgd, 0);
18463 - }
18464 + pgd_mop_up_pmds(mm, pgd);
18465 + free_page((unsigned long)pgd);
18466 +}
18467
18468 - /* in the non-PAE case, free_pgtables() clears user pgd entries */
18469 - quicklist_free(0, pgd_dtor, pgd);
18470 +void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
18471 +{
18472 + pgtable_page_dtor(pte);
18473 + paravirt_release_pt(page_to_pfn(pte));
18474 + tlb_remove_page(tlb, pte);
18475 }
18476
18477 -void check_pgt_cache(void)
18478 +#ifdef CONFIG_X86_PAE
18479 +
18480 +void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
18481 {
18482 - quicklist_trim(0, pgd_dtor, 25, 16);
18483 + paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18484 + tlb_remove_page(tlb, virt_to_page(pmd));
18485 }
18486
18487 +#endif
18488 +
18489 void make_lowmem_page_readonly(void *va, unsigned int feature)
18490 {
18491 pte_t *pte;
18492 + unsigned int level;
18493 int rc;
18494
18495 if (xen_feature(feature))
18496 return;
18497
18498 - pte = virt_to_ptep(va);
18499 + pte = lookup_address((unsigned long)va, &level);
18500 + BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
18501 rc = HYPERVISOR_update_va_mapping(
18502 (unsigned long)va, pte_wrprotect(*pte), 0);
18503 BUG_ON(rc);
18504 @@ -491,313 +413,15 @@ void make_lowmem_page_readonly(void *va,
18505 void make_lowmem_page_writable(void *va, unsigned int feature)
18506 {
18507 pte_t *pte;
18508 + unsigned int level;
18509 int rc;
18510
18511 if (xen_feature(feature))
18512 return;
18513
18514 - pte = virt_to_ptep(va);
18515 + pte = lookup_address((unsigned long)va, &level);
18516 + BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
18517 rc = HYPERVISOR_update_va_mapping(
18518 (unsigned long)va, pte_mkwrite(*pte), 0);
18519 BUG_ON(rc);
18520 }
18521 -
18522 -void make_page_readonly(void *va, unsigned int feature)
18523 -{
18524 - pte_t *pte;
18525 - int rc;
18526 -
18527 - if (xen_feature(feature))
18528 - return;
18529 -
18530 - pte = virt_to_ptep(va);
18531 - rc = HYPERVISOR_update_va_mapping(
18532 - (unsigned long)va, pte_wrprotect(*pte), 0);
18533 - if (rc) /* fallback? */
18534 - xen_l1_entry_update(pte, pte_wrprotect(*pte));
18535 - if ((unsigned long)va >= (unsigned long)high_memory) {
18536 - unsigned long pfn = pte_pfn(*pte);
18537 -#ifdef CONFIG_HIGHMEM
18538 - if (pfn >= highstart_pfn)
18539 - kmap_flush_unused(); /* flush stale writable kmaps */
18540 - else
18541 -#endif
18542 - make_lowmem_page_readonly(
18543 - phys_to_virt(pfn << PAGE_SHIFT), feature);
18544 - }
18545 -}
18546 -
18547 -void make_page_writable(void *va, unsigned int feature)
18548 -{
18549 - pte_t *pte;
18550 - int rc;
18551 -
18552 - if (xen_feature(feature))
18553 - return;
18554 -
18555 - pte = virt_to_ptep(va);
18556 - rc = HYPERVISOR_update_va_mapping(
18557 - (unsigned long)va, pte_mkwrite(*pte), 0);
18558 - if (rc) /* fallback? */
18559 - xen_l1_entry_update(pte, pte_mkwrite(*pte));
18560 - if ((unsigned long)va >= (unsigned long)high_memory) {
18561 - unsigned long pfn = pte_pfn(*pte);
18562 -#ifdef CONFIG_HIGHMEM
18563 - if (pfn < highstart_pfn)
18564 -#endif
18565 - make_lowmem_page_writable(
18566 - phys_to_virt(pfn << PAGE_SHIFT), feature);
18567 - }
18568 -}
18569 -
18570 -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
18571 -{
18572 - if (xen_feature(feature))
18573 - return;
18574 -
18575 - while (nr-- != 0) {
18576 - make_page_readonly(va, feature);
18577 - va = (void *)((unsigned long)va + PAGE_SIZE);
18578 - }
18579 -}
18580 -
18581 -void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
18582 -{
18583 - if (xen_feature(feature))
18584 - return;
18585 -
18586 - while (nr-- != 0) {
18587 - make_page_writable(va, feature);
18588 - va = (void *)((unsigned long)va + PAGE_SIZE);
18589 - }
18590 -}
18591 -
18592 -static void _pin_lock(struct mm_struct *mm, int lock) {
18593 - if (lock)
18594 - spin_lock(&mm->page_table_lock);
18595 -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
18596 - /* While mm->page_table_lock protects us against insertions and
18597 - * removals of higher level page table pages, it doesn't protect
18598 - * against updates of pte-s. Such updates, however, require the
18599 - * pte pages to be in consistent state (unpinned+writable or
18600 - * pinned+readonly). The pinning and attribute changes, however
18601 - * cannot be done atomically, which is why such updates must be
18602 - * prevented from happening concurrently.
18603 - * Note that no pte lock can ever elsewhere be acquired nesting
18604 - * with an already acquired one in the same mm, or with the mm's
18605 - * page_table_lock already acquired, as that would break in the
18606 - * non-split case (where all these are actually resolving to the
18607 - * one page_table_lock). Thus acquiring all of them here is not
18608 - * going to result in dead locks, and the order of acquires
18609 - * doesn't matter.
18610 - */
18611 - {
18612 - pgd_t *pgd = mm->pgd;
18613 - unsigned g;
18614 -
18615 - for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
18616 - pud_t *pud;
18617 - unsigned u;
18618 -
18619 - if (pgd_none(*pgd))
18620 - continue;
18621 - pud = pud_offset(pgd, 0);
18622 - for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
18623 - pmd_t *pmd;
18624 - unsigned m;
18625 -
18626 - if (pud_none(*pud))
18627 - continue;
18628 - pmd = pmd_offset(pud, 0);
18629 - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
18630 - spinlock_t *ptl;
18631 -
18632 - if (pmd_none(*pmd))
18633 - continue;
18634 - ptl = pte_lockptr(0, pmd);
18635 - if (lock)
18636 - spin_lock(ptl);
18637 - else
18638 - spin_unlock(ptl);
18639 - }
18640 - }
18641 - }
18642 - }
18643 -#endif
18644 - if (!lock)
18645 - spin_unlock(&mm->page_table_lock);
18646 -}
18647 -#define pin_lock(mm) _pin_lock(mm, 1)
18648 -#define pin_unlock(mm) _pin_lock(mm, 0)
18649 -
18650 -#define PIN_BATCH 4
18651 -static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
18652 -
18653 -static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
18654 - unsigned int cpu, unsigned seq)
18655 -{
18656 - unsigned long pfn = page_to_pfn(page);
18657 -
18658 - if (PageHighMem(page)) {
18659 - if (pgprot_val(flags) & _PAGE_RW)
18660 - ClearPagePinned(page);
18661 - else
18662 - SetPagePinned(page);
18663 - } else {
18664 - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
18665 - (unsigned long)__va(pfn << PAGE_SHIFT),
18666 - pfn_pte(pfn, flags), 0);
18667 - if (unlikely(++seq == PIN_BATCH)) {
18668 - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
18669 - PIN_BATCH, NULL)))
18670 - BUG();
18671 - seq = 0;
18672 - }
18673 - }
18674 -
18675 - return seq;
18676 -}
18677 -
18678 -static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
18679 -{
18680 - pgd_t *pgd = pgd_base;
18681 - pud_t *pud;
18682 - pmd_t *pmd;
18683 - int g, u, m;
18684 - unsigned int cpu, seq;
18685 -
18686 - if (xen_feature(XENFEAT_auto_translated_physmap))
18687 - return;
18688 -
18689 - cpu = get_cpu();
18690 -
18691 - for (g = 0, seq = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
18692 - if (pgd_none(*pgd))
18693 - continue;
18694 - pud = pud_offset(pgd, 0);
18695 - if (PTRS_PER_PUD > 1) /* not folded */
18696 - seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
18697 - for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
18698 - if (pud_none(*pud))
18699 - continue;
18700 - pmd = pmd_offset(pud, 0);
18701 - if (PTRS_PER_PMD > 1) /* not folded */
18702 - seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
18703 - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
18704 - if (pmd_none(*pmd))
18705 - continue;
18706 - seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
18707 - }
18708 - }
18709 - }
18710 -
18711 - if (likely(seq != 0)) {
18712 - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
18713 - (unsigned long)pgd_base,
18714 - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
18715 - UVMF_TLB_FLUSH);
18716 - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
18717 - seq + 1, NULL)))
18718 - BUG();
18719 - } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
18720 - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
18721 - UVMF_TLB_FLUSH))
18722 - BUG();
18723 -
18724 - put_cpu();
18725 -}
18726 -
18727 -static void __pgd_pin(pgd_t *pgd)
18728 -{
18729 - pgd_walk(pgd, PAGE_KERNEL_RO);
18730 - kmap_flush_unused();
18731 - xen_pgd_pin(__pa(pgd));
18732 - SetPagePinned(virt_to_page(pgd));
18733 -}
18734 -
18735 -static void __pgd_unpin(pgd_t *pgd)
18736 -{
18737 - xen_pgd_unpin(__pa(pgd));
18738 - pgd_walk(pgd, PAGE_KERNEL);
18739 - ClearPagePinned(virt_to_page(pgd));
18740 -}
18741 -
18742 -static void pgd_test_and_unpin(pgd_t *pgd)
18743 -{
18744 - if (PagePinned(virt_to_page(pgd)))
18745 - __pgd_unpin(pgd);
18746 -}
18747 -
18748 -void mm_pin(struct mm_struct *mm)
18749 -{
18750 - if (xen_feature(XENFEAT_writable_page_tables))
18751 - return;
18752 - pin_lock(mm);
18753 - __pgd_pin(mm->pgd);
18754 - pin_unlock(mm);
18755 -}
18756 -
18757 -void mm_unpin(struct mm_struct *mm)
18758 -{
18759 - if (xen_feature(XENFEAT_writable_page_tables))
18760 - return;
18761 - pin_lock(mm);
18762 - __pgd_unpin(mm->pgd);
18763 - pin_unlock(mm);
18764 -}
18765 -
18766 -void mm_pin_all(void)
18767 -{
18768 - struct page *page;
18769 - unsigned long flags;
18770 -
18771 - if (xen_feature(XENFEAT_writable_page_tables))
18772 - return;
18773 -
18774 - /*
18775 - * Allow uninterrupted access to the pgd_list. Also protects
18776 - * __pgd_pin() by disabling preemption.
18777 - * All other CPUs must be at a safe point (e.g., in stop_machine
18778 - * or offlined entirely).
18779 - */
18780 - spin_lock_irqsave(&pgd_lock, flags);
18781 - for (page = pgd_list; page; page = (struct page *)page->index) {
18782 - if (!PagePinned(page))
18783 - __pgd_pin((pgd_t *)page_address(page));
18784 - }
18785 - spin_unlock_irqrestore(&pgd_lock, flags);
18786 -}
18787 -
18788 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
18789 -{
18790 - if (!PagePinned(virt_to_page(mm->pgd)))
18791 - mm_pin(mm);
18792 -}
18793 -
18794 -void arch_exit_mmap(struct mm_struct *mm)
18795 -{
18796 - struct task_struct *tsk = current;
18797 -
18798 - task_lock(tsk);
18799 -
18800 - /*
18801 - * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
18802 - * *much* faster this way, as no tlb flushes means bigger wrpt batches.
18803 - */
18804 - if (tsk->active_mm == mm) {
18805 - tsk->active_mm = &init_mm;
18806 - atomic_inc(&init_mm.mm_count);
18807 -
18808 - switch_mm(mm, &init_mm, tsk);
18809 -
18810 - atomic_dec(&mm->mm_count);
18811 - BUG_ON(atomic_read(&mm->mm_count) == 0);
18812 - }
18813 -
18814 - task_unlock(tsk);
18815 -
18816 - if (PagePinned(virt_to_page(mm->pgd)) &&
18817 - (atomic_read(&mm->mm_count) == 1) &&
18818 - !mm->context.has_foreign_mappings)
18819 - mm_unpin(mm);
18820 -}
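On the 32-bit side the patch replaces the hand-rolled pgd_list, which was chained through page->index and page_private(), with the kernel's generic struct list_head, so pgd_list_add()/pgd_list_del() become plain list_add()/list_del() and mm_pin_all() can iterate with list_for_each_entry(). The sketch below is a compact userspace model of such an intrusive doubly linked list; it only mirrors the idea, with illustrative names rather than the kernel implementation.

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *prev, *next; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }

static void list_add(struct list_head *new, struct list_head *head)
{
    new->next = head->next;
    new->prev = head;
    head->next->prev = new;
    head->next = new;
}

static void list_del(struct list_head *entry)
{
    entry->prev->next = entry->next;
    entry->next->prev = entry->prev;
    entry->next = entry->prev = NULL;
}

/* The object embeds the list node, just as struct page embeds page->lru. */
struct pgd_page {
    int id;
    struct list_head lru;
};

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
    static struct list_head pgd_list = LIST_HEAD_INIT(pgd_list);
    struct pgd_page a = { .id = 1 }, b = { .id = 2 };

    list_add(&a.lru, &pgd_list);
    list_add(&b.lru, &pgd_list);

    for (struct list_head *p = pgd_list.next; p != &pgd_list; p = p->next)
        printf("pgd page %d\n", container_of(p, struct pgd_page, lru)->id);

    list_del(&a.lru);
    return 0;
}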
18821 --- sle11-2009-05-14.orig/arch/x86/pci/irq-xen.c 2009-02-16 16:18:36.000000000 +0100
18822 +++ sle11-2009-05-14/arch/x86/pci/irq-xen.c 2009-03-16 16:33:40.000000000 +0100
18823 @@ -204,6 +204,7 @@ static int pirq_ali_get(struct pci_dev *
18824 {
18825 static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
18826
18827 + WARN_ON_ONCE(pirq >= 16);
18828 return irqmap[read_config_nybble(router, 0x48, pirq-1)];
18829 }
18830
18831 @@ -211,7 +212,8 @@ static int pirq_ali_set(struct pci_dev *
18832 {
18833 static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
18834 unsigned int val = irqmap[irq];
18835 -
18836 +
18837 + WARN_ON_ONCE(pirq >= 16);
18838 if (val) {
18839 write_config_nybble(router, 0x48, pirq-1, val);
18840 return 1;
18841 @@ -261,12 +263,16 @@ static int pirq_via_set(struct pci_dev *
18842 static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18843 {
18844 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18845 +
18846 + WARN_ON_ONCE(pirq >= 5);
18847 return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
18848 }
18849
18850 static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18851 {
18852 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18853 +
18854 + WARN_ON_ONCE(pirq >= 5);
18855 write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
18856 return 1;
18857 }
18858 @@ -279,12 +285,16 @@ static int pirq_via586_set(struct pci_de
18859 static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18860 {
18861 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
18862 +
18863 + WARN_ON_ONCE(pirq >= 4);
18864 return read_config_nybble(router,0x43, pirqmap[pirq-1]);
18865 }
18866
18867 static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18868 {
18869 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
18870 +
18871 + WARN_ON_ONCE(pirq >= 4);
18872 write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
18873 return 1;
18874 }
18875 @@ -423,6 +433,7 @@ static int pirq_sis_set(struct pci_dev *
18876
18877 static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18878 {
18879 + WARN_ON_ONCE(pirq >= 9);
18880 if (pirq > 8) {
18881 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
18882 return 0;
18883 @@ -432,6 +443,7 @@ static int pirq_vlsi_get(struct pci_dev
18884
18885 static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18886 {
18887 + WARN_ON_ONCE(pirq >= 9);
18888 if (pirq > 8) {
18889 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
18890 return 0;
18891 @@ -453,14 +465,14 @@ static int pirq_vlsi_set(struct pci_dev
18892 */
18893 static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18894 {
18895 - outb_p(pirq, 0xc00);
18896 + outb(pirq, 0xc00);
18897 return inb(0xc01) & 0xf;
18898 }
18899
18900 static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18901 {
18902 - outb_p(pirq, 0xc00);
18903 - outb_p(irq, 0xc01);
18904 + outb(pirq, 0xc00);
18905 + outb(irq, 0xc01);
18906 return 1;
18907 }
18908
18909 @@ -575,6 +587,10 @@ static __init int intel_router_probe(str
18910 case PCI_DEVICE_ID_INTEL_ICH9_4:
18911 case PCI_DEVICE_ID_INTEL_ICH9_5:
18912 case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
18913 + case PCI_DEVICE_ID_INTEL_ICH10_0:
18914 + case PCI_DEVICE_ID_INTEL_ICH10_1:
18915 + case PCI_DEVICE_ID_INTEL_ICH10_2:
18916 + case PCI_DEVICE_ID_INTEL_ICH10_3:
18917 r->name = "PIIX/ICH";
18918 r->get = pirq_piix_get;
18919 r->set = pirq_piix_set;
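The pirq_*_get()/pirq_*_set() routines above index small per-router tables with pirq-1, and the patch places WARN_ON_ONCE() range checks in front of those lookups; the routers themselves keep one IRQ per 4-bit nybble of a configuration byte, which is what read_config_nybble()/write_config_nybble() encapsulate. A standalone sketch of the nybble packing plus a range check is given below; the assert() stands in for WARN_ON_ONCE() and the map values are invented.

#include <assert.h>
#include <stdio.h>

/* One byte holds two 4-bit IRQ values: even index -> low nybble, odd -> high nybble. */
static unsigned char config[4];

static unsigned int read_nybble(unsigned int idx)
{
    unsigned char b = config[idx / 2];
    return (idx & 1) ? (b >> 4) : (b & 0x0f);
}

static void write_nybble(unsigned int idx, unsigned int val)
{
    unsigned char *b = &config[idx / 2];
    if (idx & 1)
        *b = (*b & 0x0f) | ((val & 0x0f) << 4);
    else
        *b = (*b & 0xf0) | (val & 0x0f);
}

int main(void)
{
    /* Illustrative map, in the spirit of the irqmap[]/pirqmap[] tables above. */
    static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
    unsigned int pirq = 3;

    assert(pirq >= 1 && pirq <= 4);      /* range check before indexing, like the added WARN_ON_ONCE() */
    write_nybble(pirqmap[pirq - 1], 11); /* route PIRQ3 to IRQ 11 */
    printf("PIRQ%u -> IRQ %u\n", pirq, read_nybble(pirqmap[pirq - 1]));
    return 0;
}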
18920 --- sle11-2009-05-14.orig/arch/x86/vdso/Makefile 2008-11-25 12:35:54.000000000 +0100
18921 +++ sle11-2009-05-14/arch/x86/vdso/Makefile 2009-03-16 16:33:40.000000000 +0100
18922 @@ -66,6 +66,7 @@ vdso32.so-$(VDSO32-y) += int80
18923 vdso32.so-$(CONFIG_COMPAT) += syscall
18924 vdso32.so-$(VDSO32-y) += sysenter
18925 xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
18926 +xen-vdso32-$(CONFIG_X86_32) += syscall
18927 vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y)
18928
18929 vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
18930 --- sle11-2009-05-14.orig/arch/x86/vdso/vdso32/syscall.S 2009-05-14 10:56:29.000000000 +0200
18931 +++ sle11-2009-05-14/arch/x86/vdso/vdso32/syscall.S 2009-03-16 16:33:40.000000000 +0100
18932 @@ -19,8 +19,10 @@ __kernel_vsyscall:
18933 .Lpush_ebp:
18934 movl %ecx, %ebp
18935 syscall
18936 +#ifndef CONFIG_XEN
18937 movl $__USER32_DS, %ecx
18938 movl %ecx, %ss
18939 +#endif
18940 movl %ebp, %ecx
18941 popl %ebp
18942 .Lpop_ebp:
18943 --- sle11-2009-05-14.orig/arch/x86/vdso/vdso32.S 2009-05-14 10:56:29.000000000 +0200
18944 +++ sle11-2009-05-14/arch/x86/vdso/vdso32.S 2009-03-16 16:33:40.000000000 +0100
18945 @@ -19,4 +19,16 @@ vdso32_sysenter_start:
18946 .incbin "arch/x86/vdso/vdso32-sysenter.so"
18947 vdso32_sysenter_end:
18948
18949 +#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
18950 + .globl vdso32_int80_start, vdso32_int80_end
18951 +vdso32_int80_start:
18952 + .incbin "arch/x86/vdso/vdso32-int80.so"
18953 +vdso32_int80_end:
18954 +#elif defined(CONFIG_X86_XEN)
18955 + .globl vdso32_syscall_start, vdso32_syscall_end
18956 +vdso32_syscall_start:
18957 + .incbin "arch/x86/vdso/vdso32-syscall.so"
18958 +vdso32_syscall_end:
18959 +#endif
18960 +
18961 __FINIT
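vdso32.S embeds each prebuilt DSO image with .incbin and brackets it with global start/end labels, so C code can compute the image length as the difference of the two symbols and copy the selected variant into the syscall page, as sysenter_setup() does in the new setup file further down. The sketch that follows mimics this with ordinary arrays standing in for the linker-visible symbols; the array contents and the have_sysenter flag are placeholders.

#include <stdio.h>
#include <string.h>

/* In the kernel these bounds come from `extern const char vdso32_..._start, ..._end;`
 * emitted by the .globl/.incbin pairs; here plain arrays play that role. */
static const char image_default[]  = "default vdso image";
static const char image_sysenter[] = "sysenter vdso image";

static char syscall_page[64];       /* models the zeroed page sysenter_setup() fills */

int main(void)
{
    int have_sysenter = 1;          /* models vdso32_sysenter() / the SEP feature check */

    const char *start = have_sysenter ? image_sysenter : image_default;
    size_t len = have_sysenter ? sizeof(image_sysenter) : sizeof(image_default);

    memcpy(syscall_page, start, len);   /* like memcpy(syscall_page, vsyscall, vsyscall_len) */
    printf("installed: %s\n", syscall_page);
    return 0;
}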
18962 --- sle11-2009-05-14.orig/arch/x86/vdso/vdso32-setup.c 2008-11-25 12:35:53.000000000 +0100
18963 +++ sle11-2009-05-14/arch/x86/vdso/vdso32-setup.c 2009-03-16 16:33:40.000000000 +0100
18964 @@ -26,10 +26,6 @@
18965 #include <asm/vdso.h>
18966 #include <asm/proto.h>
18967
18968 -#ifdef CONFIG_XEN
18969 -#include <xen/interface/callback.h>
18970 -#endif
18971 -
18972 enum {
18973 VDSO_DISABLED = 0,
18974 VDSO_ENABLED = 1,
18975 @@ -229,7 +225,6 @@ static inline void map_compat_vdso(int m
18976
18977 void enable_sep_cpu(void)
18978 {
18979 -#ifndef CONFIG_XEN
18980 int cpu = get_cpu();
18981 struct tss_struct *tss = &per_cpu(init_tss, cpu);
18982
18983 @@ -244,35 +239,6 @@ void enable_sep_cpu(void)
18984 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
18985 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
18986 put_cpu();
18987 -#else
18988 - extern asmlinkage void ia32pv_sysenter_target(void);
18989 - static struct callback_register sysenter = {
18990 - .type = CALLBACKTYPE_sysenter,
18991 - .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
18992 - };
18993 -
18994 - if (!boot_cpu_has(X86_FEATURE_SEP))
18995 - return;
18996 -
18997 - get_cpu();
18998 -
18999 - if (xen_feature(XENFEAT_supervisor_mode_kernel))
19000 - sysenter.address.eip = (unsigned long)ia32_sysenter_target;
19001 -
19002 - switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
19003 - case 0:
19004 - break;
19005 -#if CONFIG_XEN_COMPAT < 0x030200
19006 - case -ENOSYS:
19007 - sysenter.type = CALLBACKTYPE_sysenter_deprecated;
19008 - if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
19009 - break;
19010 -#endif
19011 - default:
19012 - clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
19013 - break;
19014 - }
19015 -#endif
19016 }
19017
19018 static struct vm_area_struct gate_vma;
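The Xen flavour of enable_sep_cpu() removed here (and re-added in the new vdso32-setup-xen.c that follows) registers the sysenter entry point with the hypervisor and, when built for compatibility with hypervisors older than 3.2, retries with the deprecated callback type on -ENOSYS before giving up and clearing the SEP capability bit. That control flow is modelled in the short C sketch below; register_callback(), the cb_type enum and the sep_enabled flag are placeholders, not the real hypercall interface.

#include <errno.h>
#include <stdio.h>
#include <stdbool.h>

enum cb_type { CB_SYSENTER, CB_SYSENTER_DEPRECATED };

/* Placeholder for HYPERVISOR_callback_op(CALLBACKOP_register, ...). */
static int register_callback(enum cb_type type)
{
    return (type == CB_SYSENTER) ? -ENOSYS : 0;   /* pretend only the old type is supported */
}

static bool sep_enabled = true;     /* models the X86_FEATURE_SEP capability bit */

static void enable_sep(void)
{
    switch (register_callback(CB_SYSENTER)) {
    case 0:
        break;                                    /* new-style registration worked */
    case -ENOSYS:                                 /* old hypervisor: try the deprecated type */
        if (register_callback(CB_SYSENTER_DEPRECATED) == 0)
            break;
        /* fall through */
    default:
        sep_enabled = false;                      /* models setup_clear_cpu_cap(X86_FEATURE_SEP) */
        break;
    }
}

int main(void)
{
    enable_sep();
    printf("sysenter %s\n", sep_enabled ? "enabled" : "disabled");
    return 0;
}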
19019 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
19020 +++ sle11-2009-05-14/arch/x86/vdso/vdso32-setup-xen.c 2009-03-16 16:33:40.000000000 +0100
19021 @@ -0,0 +1,506 @@
19022 +/*
19023 + * (C) Copyright 2002 Linus Torvalds
19024 + * Portions based on the vdso-randomization code from exec-shield:
19025 + * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
19026 + *
19027 + * This file contains the needed initializations to support sysenter.
19028 + */
19029 +
19030 +#include <linux/init.h>
19031 +#include <linux/smp.h>
19032 +#include <linux/thread_info.h>
19033 +#include <linux/sched.h>
19034 +#include <linux/gfp.h>
19035 +#include <linux/string.h>
19036 +#include <linux/elf.h>
19037 +#include <linux/mm.h>
19038 +#include <linux/err.h>
19039 +#include <linux/module.h>
19040 +
19041 +#include <asm/cpufeature.h>
19042 +#include <asm/msr.h>
19043 +#include <asm/pgtable.h>
19044 +#include <asm/unistd.h>
19045 +#include <asm/elf.h>
19046 +#include <asm/tlbflush.h>
19047 +#include <asm/vdso.h>
19048 +#include <asm/proto.h>
19049 +
19050 +#include <xen/interface/callback.h>
19051 +
19052 +enum {
19053 + VDSO_DISABLED = 0,
19054 + VDSO_ENABLED = 1,
19055 + VDSO_COMPAT = 2,
19056 +};
19057 +
19058 +#ifdef CONFIG_COMPAT_VDSO
19059 +#define VDSO_DEFAULT VDSO_COMPAT
19060 +#else
19061 +#define VDSO_DEFAULT VDSO_ENABLED
19062 +#endif
19063 +
19064 +#ifdef CONFIG_X86_64
19065 +#define vdso_enabled sysctl_vsyscall32
19066 +#define arch_setup_additional_pages syscall32_setup_pages
19067 +#endif
19068 +
19069 +/*
19070 + * This is the difference between the prelinked addresses in the vDSO images
19071 + * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO
19072 + * in the user address space.
19073 + */
19074 +#define VDSO_ADDR_ADJUST (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK)
19075 +
19076 +/*
19077 + * Should the kernel map a VDSO page into processes and pass its
19078 + * address down to glibc upon exec()?
19079 + */
19080 +unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
19081 +
19082 +static int __init vdso_setup(char *s)
19083 +{
19084 + vdso_enabled = simple_strtoul(s, NULL, 0);
19085 +
19086 + return 1;
19087 +}
19088 +
19089 +/*
19090 + * For consistency, the argument vdso32=[012] affects the 32-bit vDSO
19091 + * behavior on both 64-bit and 32-bit kernels.
19092 + * On 32-bit kernels, vdso=[012] means the same thing.
19093 + */
19094 +__setup("vdso32=", vdso_setup);
19095 +
19096 +#ifdef CONFIG_X86_32
19097 +__setup_param("vdso=", vdso32_setup, vdso_setup, 0);
19098 +
19099 +EXPORT_SYMBOL_GPL(vdso_enabled);
19100 +#endif
19101 +
19102 +static __init void reloc_symtab(Elf32_Ehdr *ehdr,
19103 + unsigned offset, unsigned size)
19104 +{
19105 + Elf32_Sym *sym = (void *)ehdr + offset;
19106 + unsigned nsym = size / sizeof(*sym);
19107 + unsigned i;
19108 +
19109 + for(i = 0; i < nsym; i++, sym++) {
19110 + if (sym->st_shndx == SHN_UNDEF ||
19111 + sym->st_shndx == SHN_ABS)
19112 + continue; /* skip */
19113 +
19114 + if (sym->st_shndx > SHN_LORESERVE) {
19115 + printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
19116 + sym->st_shndx);
19117 + continue;
19118 + }
19119 +
19120 + switch(ELF_ST_TYPE(sym->st_info)) {
19121 + case STT_OBJECT:
19122 + case STT_FUNC:
19123 + case STT_SECTION:
19124 + case STT_FILE:
19125 + sym->st_value += VDSO_ADDR_ADJUST;
19126 + }
19127 + }
19128 +}
19129 +
19130 +static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
19131 +{
19132 + Elf32_Dyn *dyn = (void *)ehdr + offset;
19133 +
19134 + for(; dyn->d_tag != DT_NULL; dyn++)
19135 + switch(dyn->d_tag) {
19136 + case DT_PLTGOT:
19137 + case DT_HASH:
19138 + case DT_STRTAB:
19139 + case DT_SYMTAB:
19140 + case DT_RELA:
19141 + case DT_INIT:
19142 + case DT_FINI:
19143 + case DT_REL:
19144 + case DT_DEBUG:
19145 + case DT_JMPREL:
19146 + case DT_VERSYM:
19147 + case DT_VERDEF:
19148 + case DT_VERNEED:
19149 + case DT_ADDRRNGLO ... DT_ADDRRNGHI:
19150 + /* definitely pointers needing relocation */
19151 + dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
19152 + break;
19153 +
19154 + case DT_ENCODING ... OLD_DT_LOOS-1:
19155 + case DT_LOOS ... DT_HIOS-1:
19156 + /* Tags above DT_ENCODING are pointers if
19157 + they're even */
19158 + if (dyn->d_tag >= DT_ENCODING &&
19159 + (dyn->d_tag & 1) == 0)
19160 + dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
19161 + break;
19162 +
19163 + case DT_VERDEFNUM:
19164 + case DT_VERNEEDNUM:
19165 + case DT_FLAGS_1:
19166 + case DT_RELACOUNT:
19167 + case DT_RELCOUNT:
19168 + case DT_VALRNGLO ... DT_VALRNGHI:
19169 + /* definitely not pointers */
19170 + break;
19171 +
19172 + case OLD_DT_LOOS ... DT_LOOS-1:
19173 + case DT_HIOS ... DT_VALRNGLO-1:
19174 + default:
19175 + if (dyn->d_tag > DT_ENCODING)
19176 + printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
19177 + dyn->d_tag);
19178 + break;
19179 + }
19180 +}
19181 +
19182 +static __init void relocate_vdso(Elf32_Ehdr *ehdr)
19183 +{
19184 + Elf32_Phdr *phdr;
19185 + Elf32_Shdr *shdr;
19186 + int i;
19187 +
19188 + BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
19189 + !elf_check_arch_ia32(ehdr) ||
19190 + ehdr->e_type != ET_DYN);
19191 +
19192 + ehdr->e_entry += VDSO_ADDR_ADJUST;
19193 +
19194 + /* rebase phdrs */
19195 + phdr = (void *)ehdr + ehdr->e_phoff;
19196 + for (i = 0; i < ehdr->e_phnum; i++) {
19197 + phdr[i].p_vaddr += VDSO_ADDR_ADJUST;
19198 +
19199 + /* relocate dynamic stuff */
19200 + if (phdr[i].p_type == PT_DYNAMIC)
19201 + reloc_dyn(ehdr, phdr[i].p_offset);
19202 + }
19203 +
19204 + /* rebase sections */
19205 + shdr = (void *)ehdr + ehdr->e_shoff;
19206 + for(i = 0; i < ehdr->e_shnum; i++) {
19207 + if (!(shdr[i].sh_flags & SHF_ALLOC))
19208 + continue;
19209 +
19210 + shdr[i].sh_addr += VDSO_ADDR_ADJUST;
19211 +
19212 + if (shdr[i].sh_type == SHT_SYMTAB ||
19213 + shdr[i].sh_type == SHT_DYNSYM)
19214 + reloc_symtab(ehdr, shdr[i].sh_offset,
19215 + shdr[i].sh_size);
19216 + }
19217 +}
19218 +
19219 +/*
19220 + * These symbols are defined by vdso32.S to mark the bounds
19221 + * of the ELF DSO images included therein.
19222 + */
19223 +extern const char vdso32_default_start, vdso32_default_end;
19224 +extern const char vdso32_sysenter_start, vdso32_sysenter_end;
19225 +static struct page *vdso32_pages[1];
19226 +
19227 +#ifdef CONFIG_X86_64
19228 +
19229 +#if CONFIG_XEN_COMPAT < 0x030200
19230 +static int use_int80 = 1;
19231 +#endif
19232 +static int use_sysenter __read_mostly = -1;
19233 +
19234 +#define vdso32_sysenter() (use_sysenter > 0)
19235 +
19236 +/* May not be __init: called during resume */
19237 +void syscall32_cpu_init(void)
19238 +{
19239 + static const struct callback_register cstar = {
19240 + .type = CALLBACKTYPE_syscall32,
19241 + .address = (unsigned long)ia32_cstar_target
19242 + };
19243 + static const struct callback_register sysenter = {
19244 + .type = CALLBACKTYPE_sysenter,
19245 + .address = (unsigned long)ia32_sysenter_target
19246 + };
19247 +
19248 + if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
19249 + (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
19250 +#if CONFIG_XEN_COMPAT < 0x030200
19251 + return;
19252 + use_int80 = 0;
19253 +#else
19254 + BUG();
19255 +#endif
19256 +
19257 + if (use_sysenter < 0)
19258 + use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
19259 +}
19260 +
19261 +#define compat_uses_vma 1
19262 +
19263 +static inline void map_compat_vdso(int map)
19264 +{
19265 +}
19266 +
19267 +#else /* CONFIG_X86_32 */
19268 +
19269 +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
19270 +
19271 +extern asmlinkage void ia32pv_cstar_target(void);
19272 +static const struct callback_register __cpuinitconst cstar = {
19273 + .type = CALLBACKTYPE_syscall32,
19274 + .address = { __KERNEL_CS, (unsigned long)ia32pv_cstar_target },
19275 +};
19276 +
19277 +void __cpuinit enable_sep_cpu(void)
19278 +{
19279 + extern asmlinkage void ia32pv_sysenter_target(void);
19280 + static struct callback_register __cpuinitdata sysenter = {
19281 + .type = CALLBACKTYPE_sysenter,
19282 + .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
19283 + };
19284 +
19285 + if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
19286 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
19287 + BUG();
19288 + return;
19289 + }
19290 +
19291 + if (!boot_cpu_has(X86_FEATURE_SEP))
19292 + return;
19293 +
19294 + if (xen_feature(XENFEAT_supervisor_mode_kernel))
19295 + sysenter.address.eip = (unsigned long)ia32_sysenter_target;
19296 +
19297 + switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
19298 + case 0:
19299 + break;
19300 +#if CONFIG_XEN_COMPAT < 0x030200
19301 + case -ENOSYS:
19302 + sysenter.type = CALLBACKTYPE_sysenter_deprecated;
19303 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
19304 + break;
19305 +#endif
19306 + default:
19307 + setup_clear_cpu_cap(X86_FEATURE_SEP);
19308 + break;
19309 + }
19310 +}
19311 +
19312 +static struct vm_area_struct gate_vma;
19313 +
19314 +static int __init gate_vma_init(void)
19315 +{
19316 + gate_vma.vm_mm = NULL;
19317 + gate_vma.vm_start = FIXADDR_USER_START;
19318 + gate_vma.vm_end = FIXADDR_USER_END;
19319 + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
19320 + gate_vma.vm_page_prot = __P101;
19321 + /*
19322 + * Make sure the vDSO gets into every core dump.
19323 + * Dumping its contents makes post-mortem fully interpretable later
19324 + * without matching up the same kernel and hardware config to see
19325 + * what PC values meant.
19326 + */
19327 + gate_vma.vm_flags |= VM_ALWAYSDUMP;
19328 + return 0;
19329 +}
19330 +
19331 +#define compat_uses_vma 0
19332 +
19333 +static void map_compat_vdso(int map)
19334 +{
19335 + static int vdso_mapped;
19336 +
19337 + if (map == vdso_mapped)
19338 + return;
19339 +
19340 + vdso_mapped = map;
19341 +
19342 + __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT,
19343 + map ? PAGE_READONLY_EXEC : PAGE_NONE);
19344 +
19345 + /* flush stray tlbs */
19346 + flush_tlb_all();
19347 +}
19348 +
19349 +#endif /* CONFIG_X86_64 */
19350 +
19351 +int __init sysenter_setup(void)
19352 +{
19353 + void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
19354 + const void *vsyscall;
19355 + size_t vsyscall_len;
19356 +
19357 + vdso32_pages[0] = virt_to_page(syscall_page);
19358 +
19359 +#ifdef CONFIG_X86_32
19360 + gate_vma_init();
19361 +
19362 + printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
19363 +#endif
19364 +
19365 +#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
19366 + if (use_int80) {
19367 + extern const char vdso32_int80_start, vdso32_int80_end;
19368 +
19369 + vsyscall = &vdso32_int80_start;
19370 + vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
19371 + } else
19372 +#elif defined(CONFIG_X86_32)
19373 + if (boot_cpu_has(X86_FEATURE_SYSCALL)
19374 + && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
19375 + || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
19376 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
19377 + barrier(); /* until clear_bit()'s constraints are correct ... */
19378 + if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
19379 + extern const char vdso32_syscall_start, vdso32_syscall_end;
19380 +
19381 + vsyscall = &vdso32_syscall_start;
19382 + vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
19383 + } else
19384 +#endif
19385 + if (!vdso32_sysenter()) {
19386 + vsyscall = &vdso32_default_start;
19387 + vsyscall_len = &vdso32_default_end - &vdso32_default_start;
19388 + } else {
19389 + vsyscall = &vdso32_sysenter_start;
19390 + vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
19391 + }
19392 +
19393 + memcpy(syscall_page, vsyscall, vsyscall_len);
19394 + relocate_vdso(syscall_page);
19395 +
19396 + return 0;
19397 +}
19398 +
19399 +/* Setup a VMA at program startup for the vsyscall page */
19400 +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
19401 +{
19402 + struct mm_struct *mm = current->mm;
19403 + unsigned long addr;
19404 + int ret = 0;
19405 + bool compat;
19406 +
19407 + down_write(&mm->mmap_sem);
19408 +
19409 + /* Test compat mode once here, in case someone
19410 + changes it via sysctl */
19411 + compat = (vdso_enabled == VDSO_COMPAT);
19412 +
19413 + map_compat_vdso(compat);
19414 +
19415 + if (compat)
19416 + addr = VDSO_HIGH_BASE;
19417 + else {
19418 + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
19419 + if (IS_ERR_VALUE(addr)) {
19420 + ret = addr;
19421 + goto up_fail;
19422 + }
19423 + }
19424 +
19425 + if (compat_uses_vma || !compat) {
19426 + /*
19427 + * MAYWRITE to allow gdb to COW and set breakpoints
19428 + *
19429 + * Make sure the vDSO gets into every core dump.
19430 + * Dumping its contents makes post-mortem fully
19431 + * interpretable later without matching up the same
19432 + * kernel and hardware config to see what PC values
19433 + * meant.
19434 + */
19435 + ret = install_special_mapping(mm, addr, PAGE_SIZE,
19436 + VM_READ|VM_EXEC|
19437 + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
19438 + VM_ALWAYSDUMP,
19439 + vdso32_pages);
19440 +
19441 + if (ret)
19442 + goto up_fail;
19443 + }
19444 +
19445 + current->mm->context.vdso = (void *)addr;
19446 + current_thread_info()->sysenter_return =
19447 + VDSO32_SYMBOL(addr, SYSENTER_RETURN);
19448 +
19449 + up_fail:
19450 + up_write(&mm->mmap_sem);
19451 +
19452 + return ret;
19453 +}
19454 +
19455 +#ifdef CONFIG_X86_64
19456 +
19457 +/*
19458 + * This must be done early in case we have an initrd containing 32-bit
19459 + * binaries (e.g., hotplug). This could be pushed upstream.
19460 + */
19461 +core_initcall(sysenter_setup);
19462 +
19463 +#ifdef CONFIG_SYSCTL
19464 +/* Register vsyscall32 into the ABI table */
19465 +#include <linux/sysctl.h>
19466 +
19467 +static ctl_table abi_table2[] = {
19468 + {
19469 + .procname = "vsyscall32",
19470 + .data = &sysctl_vsyscall32,
19471 + .maxlen = sizeof(int),
19472 + .mode = 0644,
19473 + .proc_handler = proc_dointvec
19474 + },
19475 + {}
19476 +};
19477 +
19478 +static ctl_table abi_root_table2[] = {
19479 + {
19480 + .ctl_name = CTL_ABI,
19481 + .procname = "abi",
19482 + .mode = 0555,
19483 + .child = abi_table2
19484 + },
19485 + {}
19486 +};
19487 +
19488 +static __init int ia32_binfmt_init(void)
19489 +{
19490 + register_sysctl_table(abi_root_table2);
19491 + return 0;
19492 +}
19493 +__initcall(ia32_binfmt_init);
19494 +#endif
19495 +
19496 +#else /* CONFIG_X86_32 */
19497 +
19498 +const char *arch_vma_name(struct vm_area_struct *vma)
19499 +{
19500 + if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
19501 + return "[vdso]";
19502 + return NULL;
19503 +}
19504 +
19505 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
19506 +{
19507 + struct mm_struct *mm = tsk->mm;
19508 +
19509 + /* Check to see if this task was created in compat vdso mode */
19510 + if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
19511 + return &gate_vma;
19512 + return NULL;
19513 +}
19514 +
19515 +int in_gate_area(struct task_struct *task, unsigned long addr)
19516 +{
19517 + const struct vm_area_struct *vma = get_gate_vma(task);
19518 +
19519 + return vma && addr >= vma->vm_start && addr < vma->vm_end;
19520 +}
19521 +
19522 +int in_gate_area_no_task(unsigned long addr)
19523 +{
19524 + return 0;
19525 +}
19526 +
19527 +#endif /* CONFIG_X86_64 */
19528 --- sle11-2009-05-14.orig/drivers/pci/msi-xen.c 2009-02-16 16:18:36.000000000 +0100
19529 +++ sle11-2009-05-14/drivers/pci/msi-xen.c 2009-03-16 16:33:40.000000000 +0100
19530 @@ -43,6 +43,53 @@ struct msi_pirq_entry {
19531 int entry_nr;
19532 };
19533
19534 +/* Arch hooks */
19535 +
19536 +int __attribute__ ((weak))
19537 +arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
19538 +{
19539 + return 0;
19540 +}
19541 +
19542 +#ifndef CONFIG_XEN
19543 +int __attribute__ ((weak))
19544 +arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
19545 +{
19546 + return 0;
19547 +}
19548 +
19549 +int __attribute__ ((weak))
19550 +arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
19551 +{
19552 + struct msi_desc *entry;
19553 + int ret;
19554 +
19555 + list_for_each_entry(entry, &dev->msi_list, list) {
19556 + ret = arch_setup_msi_irq(dev, entry);
19557 + if (ret)
19558 + return ret;
19559 + }
19560 +
19561 + return 0;
19562 +}
19563 +
19564 +void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
19565 +{
19566 + return;
19567 +}
19568 +
19569 +void __attribute__ ((weak))
19570 +arch_teardown_msi_irqs(struct pci_dev *dev)
19571 +{
19572 + struct msi_desc *entry;
19573 +
19574 + list_for_each_entry(entry, &dev->msi_list, list) {
19575 + if (entry->irq != 0)
19576 + arch_teardown_msi_irq(entry->irq);
19577 + }
19578 +}
19579 +#endif
19580 +
19581 static void msi_set_enable(struct pci_dev *dev, int enable)
19582 {
19583 int pos;
19584 @@ -270,7 +317,6 @@ static void pci_intx_for_msi(struct pci_
19585 pci_intx(dev, enable);
19586 }
19587
19588 -#ifdef CONFIG_PM
19589 static void __pci_restore_msi_state(struct pci_dev *dev)
19590 {
19591 int pirq;
19592 @@ -328,7 +374,7 @@ void pci_restore_msi_state(struct pci_de
19593 __pci_restore_msi_state(dev);
19594 __pci_restore_msix_state(dev);
19595 }
19596 -#endif /* CONFIG_PM */
19597 +EXPORT_SYMBOL_GPL(pci_restore_msi_state);
19598
19599 /**
19600 * msi_capability_init - configure device's MSI capability structure
19601 @@ -755,51 +801,3 @@ void pci_msi_init_pci_dev(struct pci_dev
19602 INIT_LIST_HEAD(&dev->msi_list);
19603 #endif
19604 }
19605 -
19606 -
19607 -/* Arch hooks */
19608 -
19609 -int __attribute__ ((weak))
19610 -arch_msi_check_device(struct pci_dev* dev, int nvec, int type)
19611 -{
19612 - return 0;
19613 -}
19614 -
19615 -#ifndef CONFIG_XEN
19616 -int __attribute__ ((weak))
19617 -arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
19618 -{
19619 - return 0;
19620 -}
19621 -
19622 -int __attribute__ ((weak))
19623 -arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
19624 -{
19625 - struct msi_desc *entry;
19626 - int ret;
19627 -
19628 - list_for_each_entry(entry, &dev->msi_list, list) {
19629 - ret = arch_setup_msi_irq(dev, entry);
19630 - if (ret)
19631 - return ret;
19632 - }
19633 -
19634 - return 0;
19635 -}
19636 -
19637 -void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
19638 -{
19639 - return;
19640 -}
19641 -
19642 -void __attribute__ ((weak))
19643 -arch_teardown_msi_irqs(struct pci_dev *dev)
19644 -{
19645 - struct msi_desc *entry;
19646 -
19647 - list_for_each_entry(entry, &dev->msi_list, list) {
19648 - if (entry->irq != 0)
19649 - arch_teardown_msi_irq(entry->irq);
19650 - }
19651 -}
19652 -#endif
19653 --- sle11-2009-05-14.orig/drivers/pci/pci.c 2009-05-14 10:56:29.000000000 +0200
19654 +++ sle11-2009-05-14/drivers/pci/pci.c 2009-03-16 16:33:40.000000000 +0100
19655 @@ -353,7 +353,12 @@ pci_find_parent_resource(const struct pc
19656 * Restore the BAR values for a given device, so as to make it
19657 * accessible by its driver.
19658 */
19659 +#ifndef CONFIG_XEN
19660 static void
19661 +#else
19662 +EXPORT_SYMBOL_GPL(pci_restore_bars);
19663 +void
19664 +#endif
19665 pci_restore_bars(struct pci_dev *dev)
19666 {
19667 int i, numres;
19668 --- sle11-2009-05-14.orig/drivers/xen/balloon/sysfs.c 2009-03-04 11:25:55.000000000 +0100
19669 +++ sle11-2009-05-14/drivers/xen/balloon/sysfs.c 2009-03-16 16:33:40.000000000 +0100
19670 @@ -108,7 +108,7 @@ static struct attribute_group balloon_in
19671 };
19672
19673 static struct sysdev_class balloon_sysdev_class = {
19674 - set_kset_name(BALLOON_CLASS_NAME),
19675 + .name = BALLOON_CLASS_NAME,
19676 };
19677
19678 static struct sys_device balloon_sysdev;
19679 --- sle11-2009-05-14.orig/drivers/xen/blkback/blkback.c 2009-02-16 16:18:36.000000000 +0100
19680 +++ sle11-2009-05-14/drivers/xen/blkback/blkback.c 2009-03-16 16:33:40.000000000 +0100
19681 @@ -148,7 +148,7 @@ static void unplug_queue(blkif_t *blkif)
19682 return;
19683 if (blkif->plug->unplug_fn)
19684 blkif->plug->unplug_fn(blkif->plug);
19685 - blk_put_queue(blkif->plug);
19686 + kobject_put(&blkif->plug->kobj);
19687 blkif->plug = NULL;
19688 }
19689
19690 @@ -159,7 +159,8 @@ static void plug_queue(blkif_t *blkif, s
19691 if (q == blkif->plug)
19692 return;
19693 unplug_queue(blkif);
19694 - blk_get_queue(q);
19695 + WARN_ON(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags));
19696 + kobject_get(&q->kobj);
19697 blkif->plug = q;
19698 }
19699
19700 --- sle11-2009-05-14.orig/drivers/xen/blkfront/blkfront.c 2009-02-16 16:18:36.000000000 +0100
19701 +++ sle11-2009-05-14/drivers/xen/blkfront/blkfront.c 2009-03-24 10:12:53.000000000 +0100
19702 @@ -713,7 +713,6 @@ static irqreturn_t blkif_int(int irq, vo
19703 RING_IDX i, rp;
19704 unsigned long flags;
19705 struct blkfront_info *info = (struct blkfront_info *)dev_id;
19706 - int uptodate;
19707
19708 spin_lock_irqsave(&blkif_io_lock, flags);
19709
19710 @@ -738,13 +737,13 @@ static irqreturn_t blkif_int(int irq, vo
19711
19712 ADD_ID_TO_FREELIST(info, id);
19713
19714 - uptodate = (bret->status == BLKIF_RSP_OKAY);
19715 + ret = bret->status == BLKIF_RSP_OKAY ? 0 : -EIO;
19716 switch (bret->operation) {
19717 case BLKIF_OP_WRITE_BARRIER:
19718 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
19719 printk("blkfront: %s: write barrier op failed\n",
19720 info->gd->disk_name);
19721 - uptodate = -EOPNOTSUPP;
19722 + ret = -EOPNOTSUPP;
19723 info->feature_barrier = 0;
19724 xlvbd_barrier(info);
19725 }
19726 @@ -755,10 +754,8 @@ static irqreturn_t blkif_int(int irq, vo
19727 DPRINTK("Bad return from blkdev data "
19728 "request: %x\n", bret->status);
19729
19730 - ret = end_that_request_first(req, uptodate,
19731 - req->hard_nr_sectors);
19732 + ret = __blk_end_request(req, ret, blk_rq_bytes(req));
19733 BUG_ON(ret);
19734 - end_that_request_last(req, uptodate);
19735 break;
19736 default:
19737 BUG();
19738 --- sle11-2009-05-14.orig/drivers/xen/blktap/blktap.c 2009-04-20 11:37:50.000000000 +0200
19739 +++ sle11-2009-05-14/drivers/xen/blktap/blktap.c 2009-04-20 11:38:54.000000000 +0200
19740 @@ -331,8 +331,8 @@ static pte_t blktap_clear_pte(struct vm_
19741 * if vm_file is NULL (meaning mmap failed and we have nothing to do)
19742 */
19743 if (uvaddr < uvstart || vma->vm_file == NULL)
19744 - return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
19745 - ptep, is_fullmm);
19746 + return xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
19747 + is_fullmm);
19748
19749 info = vma->vm_file->private_data;
19750 priv = vma->vm_private_data;
19751 @@ -379,8 +379,8 @@ static pte_t blktap_clear_pte(struct vm_
19752 BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap));
19753
19754 /* USING SHADOW PAGE TABLES. */
19755 - copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
19756 - is_fullmm);
19757 + copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
19758 + is_fullmm);
19759 }
19760
19761 if (count) {
19762 --- sle11-2009-05-14.orig/drivers/xen/core/Makefile 2009-05-14 10:56:29.000000000 +0200
19763 +++ sle11-2009-05-14/drivers/xen/core/Makefile 2009-03-16 16:33:40.000000000 +0100
19764 @@ -10,5 +10,6 @@ obj-$(CONFIG_SYS_HYPERVISOR) += hypervis
19765 obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
19766 obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
19767 obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o
19768 +obj-$(CONFIG_X86_SMP) += spinlock.o
19769 obj-$(CONFIG_KEXEC) += machine_kexec.o
19770 obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
19771 --- sle11-2009-05-14.orig/drivers/xen/core/evtchn.c 2009-03-04 11:25:55.000000000 +0100
19772 +++ sle11-2009-05-14/drivers/xen/core/evtchn.c 2009-03-16 16:33:40.000000000 +0100
19773 @@ -194,7 +194,7 @@ static inline unsigned int cpu_from_evtc
19774
19775 /* Upcall to generic IRQ layer. */
19776 #ifdef CONFIG_X86
19777 -extern fastcall unsigned int do_IRQ(struct pt_regs *regs);
19778 +extern unsigned int do_IRQ(struct pt_regs *regs);
19779 void __init xen_init_IRQ(void);
19780 void __init init_IRQ(void)
19781 {
19782 @@ -203,13 +203,11 @@ void __init init_IRQ(void)
19783 }
19784 #if defined (__i386__)
19785 static inline void exit_idle(void) {}
19786 -#define IRQ_REG orig_eax
19787 #elif defined (__x86_64__)
19788 #include <asm/idle.h>
19789 -#define IRQ_REG orig_rax
19790 #endif
19791 #define do_IRQ(irq, regs) do { \
19792 - (regs)->IRQ_REG = ~(irq); \
19793 + (regs)->orig_ax = ~(irq); \
19794 do_IRQ((regs)); \
19795 } while (0)
19796 #endif
19797 @@ -670,13 +668,12 @@ static void set_affinity_irq(unsigned in
19798 int resend_irq_on_evtchn(unsigned int irq)
19799 {
19800 int masked, evtchn = evtchn_from_irq(irq);
19801 - shared_info_t *s = HYPERVISOR_shared_info;
19802
19803 if (!VALID_EVTCHN(evtchn))
19804 return 1;
19805
19806 masked = test_and_set_evtchn_mask(evtchn);
19807 - synch_set_bit(evtchn, s->evtchn_pending);
19808 + set_evtchn(evtchn);
19809 if (!masked)
19810 unmask_evtchn(evtchn);
19811
19812 @@ -969,6 +966,43 @@ void disable_all_local_evtchn(void)
19813 synch_set_bit(i, &s->evtchn_mask[0]);
19814 }
19815
19816 +/* Clear an irq's pending state, in preparation for polling on it. */
19817 +void xen_clear_irq_pending(int irq)
19818 +{
19819 + int evtchn = evtchn_from_irq(irq);
19820 +
19821 + if (VALID_EVTCHN(evtchn))
19822 + clear_evtchn(evtchn);
19823 +}
19824 +
19825 +/* Set an irq's pending state, to avoid blocking on it. */
19826 +void xen_set_irq_pending(int irq)
19827 +{
19828 + int evtchn = evtchn_from_irq(irq);
19829 +
19830 + if (VALID_EVTCHN(evtchn))
19831 + set_evtchn(evtchn);
19832 +}
19833 +
19834 +/* Test an irq's pending state. */
19835 +int xen_test_irq_pending(int irq)
19836 +{
19837 + int evtchn = evtchn_from_irq(irq);
19838 +
19839 + return VALID_EVTCHN(evtchn) && test_evtchn(evtchn);
19840 +}
19841 +
19842 +/* Poll waiting for an irq to become pending. In the usual case, the
19843 + irq will be disabled so it won't deliver an interrupt. */
19844 +void xen_poll_irq(int irq)
19845 +{
19846 + evtchn_port_t evtchn = evtchn_from_irq(irq);
19847 +
19848 + if (VALID_EVTCHN(evtchn)
19849 + && HYPERVISOR_poll_no_timeout(&evtchn, 1))
19850 + BUG();
19851 +}
19852 +
19853 static void restore_cpu_virqs(unsigned int cpu)
19854 {
19855 struct evtchn_bind_virq bind_virq;
19856 @@ -1022,8 +1056,8 @@ static void restore_cpu_ipis(unsigned in
19857 bind_evtchn_to_cpu(evtchn, cpu);
19858
19859 /* Ready for use. */
19860 - unmask_evtchn(evtchn);
19861 -
19862 + if (!(irq_desc[irq].status & IRQ_DISABLED))
19863 + unmask_evtchn(evtchn);
19864 }
19865 }
19866
19867 --- sle11-2009-05-14.orig/drivers/xen/core/hypervisor_sysfs.c 2008-12-15 11:27:22.000000000 +0100
19868 +++ sle11-2009-05-14/drivers/xen/core/hypervisor_sysfs.c 2009-03-16 16:33:40.000000000 +0100
19869 @@ -50,7 +50,7 @@ static int __init hypervisor_subsys_init
19870 if (!is_running_on_xen())
19871 return -ENODEV;
19872
19873 - hypervisor_subsys.kobj.ktype = &hyp_sysfs_kobj_type;
19874 + hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
19875 return 0;
19876 }
19877
19878 --- sle11-2009-05-14.orig/drivers/xen/core/smpboot.c 2009-02-16 16:18:36.000000000 +0100
19879 +++ sle11-2009-05-14/drivers/xen/core/smpboot.c 2009-03-16 16:33:40.000000000 +0100
19880 @@ -135,6 +135,10 @@ static int __cpuinit xen_smp_intr_init(u
19881 goto fail;
19882 per_cpu(callfunc_irq, cpu) = rc;
19883
19884 + rc = xen_spinlock_init(cpu);
19885 + if (rc < 0)
19886 + goto fail;
19887 +
19888 if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0))
19889 goto fail;
19890
19891 @@ -145,6 +149,7 @@ static int __cpuinit xen_smp_intr_init(u
19892 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
19893 if (per_cpu(callfunc_irq, cpu) >= 0)
19894 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
19895 + xen_spinlock_cleanup(cpu);
19896 return rc;
19897 }
19898
19899 @@ -156,6 +161,7 @@ static void xen_smp_intr_exit(unsigned i
19900
19901 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
19902 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
19903 + xen_spinlock_cleanup(cpu);
19904 }
19905 #endif
19906
19907 @@ -208,36 +214,25 @@ static void __cpuinit cpu_initialize_con
19908 smp_trap_init(ctxt.trap_ctxt);
19909
19910 ctxt.ldt_ents = 0;
19911 - ctxt.gdt_ents = GDT_SIZE / 8;
19912 -
19913 -#ifdef __i386__
19914 ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu));
19915 + ctxt.gdt_ents = GDT_SIZE / 8;
19916
19917 ctxt.user_regs.cs = __KERNEL_CS;
19918 - ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
19919 + ctxt.user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
19920
19921 ctxt.kernel_ss = __KERNEL_DS;
19922 - ctxt.kernel_sp = idle->thread.esp0;
19923 + ctxt.kernel_sp = idle->thread.sp0;
19924
19925 - ctxt.event_callback_cs = __KERNEL_CS;
19926 ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
19927 - ctxt.failsafe_callback_cs = __KERNEL_CS;
19928 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
19929 +#ifdef __i386__
19930 + ctxt.event_callback_cs = __KERNEL_CS;
19931 + ctxt.failsafe_callback_cs = __KERNEL_CS;
19932
19933 ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
19934
19935 ctxt.user_regs.fs = __KERNEL_PERCPU;
19936 #else /* __x86_64__ */
19937 - ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[cpu].address);
19938 -
19939 - ctxt.user_regs.cs = __KERNEL_CS;
19940 - ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
19941 -
19942 - ctxt.kernel_ss = __KERNEL_DS;
19943 - ctxt.kernel_sp = idle->thread.rsp0;
19944 -
19945 - ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
19946 - ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
19947 ctxt.syscall_callback_eip = (unsigned long)system_call;
19948
19949 ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
19950 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
19951 +++ sle11-2009-05-14/drivers/xen/core/spinlock.c 2009-03-16 16:33:40.000000000 +0100
19952 @@ -0,0 +1,161 @@
19953 +/*
19954 + * Xen spinlock functions
19955 + *
19956 + * See arch/x86/xen/smp.c for copyright and credits for derived
19957 + * portions of this file.
19958 + */
19959 +
19960 +#include <linux/init.h>
19961 +#include <linux/irq.h>
19962 +#include <linux/kernel.h>
19963 +#include <linux/kernel_stat.h>
19964 +#include <linux/module.h>
19965 +#include <xen/evtchn.h>
19966 +
19967 +extern irqreturn_t smp_reschedule_interrupt(int, void *);
19968 +
19969 +static DEFINE_PER_CPU(int, spinlock_irq) = -1;
19970 +static char spinlock_name[NR_CPUS][15];
19971 +
19972 +struct spinning {
19973 + raw_spinlock_t *lock;
19974 + unsigned int ticket;
19975 + struct spinning *prev;
19976 +};
19977 +static DEFINE_PER_CPU(struct spinning *, spinning);
19978 +/*
19979 + * Protect removal of objects: Addition can be done lockless, and even
19980 + * removal itself doesn't need protection - what needs to be prevented is
19981 + * removed objects going out of scope (as they're allocated on the stack).
19982 + */
19983 +static DEFINE_PER_CPU(raw_rwlock_t, spinning_rm_lock) = __RAW_RW_LOCK_UNLOCKED;
19984 +
19985 +int __cpuinit xen_spinlock_init(unsigned int cpu)
19986 +{
19987 + int rc;
19988 +
19989 + sprintf(spinlock_name[cpu], "spinlock%u", cpu);
19990 + rc = bind_ipi_to_irqhandler(SPIN_UNLOCK_VECTOR,
19991 + cpu,
19992 + smp_reschedule_interrupt,
19993 + IRQF_DISABLED|IRQF_NOBALANCING,
19994 + spinlock_name[cpu],
19995 + NULL);
19996 + if (rc < 0)
19997 + return rc;
19998 +
19999 + disable_irq(rc); /* make sure it's never delivered */
20000 + per_cpu(spinlock_irq, cpu) = rc;
20001 +
20002 + return 0;
20003 +}
20004 +
20005 +void __cpuinit xen_spinlock_cleanup(unsigned int cpu)
20006 +{
20007 + if (per_cpu(spinlock_irq, cpu) >= 0)
20008 + unbind_from_irqhandler(per_cpu(spinlock_irq, cpu), NULL);
20009 + per_cpu(spinlock_irq, cpu) = -1;
20010 +}
20011 +
20012 +int xen_spin_wait(raw_spinlock_t *lock, unsigned int token)
20013 +{
20014 + int rc = 0, irq = __get_cpu_var(spinlock_irq);
20015 + raw_rwlock_t *rm_lock;
20016 + unsigned long flags;
20017 + struct spinning spinning;
20018 +
20019 + /* If kicker interrupt not initialized yet, just spin. */
20020 + if (unlikely(irq < 0))
20021 + return 0;
20022 +
20023 + token >>= TICKET_SHIFT;
20024 +
20025 + /* announce we're spinning */
20026 + spinning.ticket = token;
20027 + spinning.lock = lock;
20028 + spinning.prev = __get_cpu_var(spinning);
20029 + smp_wmb();
20030 + __get_cpu_var(spinning) = &spinning;
20031 +
20032 + /* clear pending */
20033 + xen_clear_irq_pending(irq);
20034 +
20035 + do {
20036 + /* Check again to make sure it didn't become free while
20037 + * we weren't looking. */
20038 + if ((lock->slock & ((1U << TICKET_SHIFT) - 1)) == token) {
20039 + /* If we interrupted another spinlock while it was
20040 + * blocking, make sure it doesn't block (again)
20041 + * without rechecking the lock. */
20042 + if (spinning.prev)
20043 + xen_set_irq_pending(irq);
20044 + rc = 1;
20045 + break;
20046 + }
20047 +
20048 + /* block until irq becomes pending */
20049 + xen_poll_irq(irq);
20050 + } while (!xen_test_irq_pending(irq));
20051 +
20052 + /* Leave the irq pending so that any interrupted blocker will
20053 + * re-check. */
20054 + kstat_this_cpu.irqs[irq] += !rc;
20055 +
20056 + /* announce we're done */
20057 + __get_cpu_var(spinning) = spinning.prev;
20058 + rm_lock = &__get_cpu_var(spinning_rm_lock);
20059 + raw_local_irq_save(flags);
20060 + __raw_write_lock(rm_lock);
20061 + __raw_write_unlock(rm_lock);
20062 + raw_local_irq_restore(flags);
20063 +
20064 + return rc;
20065 +}
20066 +EXPORT_SYMBOL(xen_spin_wait);
20067 +
20068 +unsigned int xen_spin_adjust(raw_spinlock_t *lock, unsigned int token)
20069 +{
20070 +	return token; /* TODO */
20071 +}
20072 +EXPORT_SYMBOL(xen_spin_adjust);
20073 +
20074 +int xen_spin_wait_flags(raw_spinlock_t *lock, unsigned int *token,
20075 + unsigned int flags)
20076 +{
20077 +	return xen_spin_wait(lock, *token); /* TODO */
20078 +}
20079 +EXPORT_SYMBOL(xen_spin_wait_flags);
20080 +
20081 +void xen_spin_kick(raw_spinlock_t *lock, unsigned int token)
20082 +{
20083 + unsigned int cpu;
20084 +
20085 + token &= (1U << TICKET_SHIFT) - 1;
20086 + for_each_online_cpu(cpu) {
20087 + raw_rwlock_t *rm_lock;
20088 + unsigned long flags;
20089 + struct spinning *spinning;
20090 +
20091 + if (cpu == raw_smp_processor_id())
20092 + continue;
20093 +
20094 + rm_lock = &per_cpu(spinning_rm_lock, cpu);
20095 + raw_local_irq_save(flags);
20096 + __raw_read_lock(rm_lock);
20097 +
20098 + spinning = per_cpu(spinning, cpu);
20099 + smp_rmb();
20100 + if (spinning
20101 + && (spinning->lock != lock || spinning->ticket != token))
20102 + spinning = NULL;
20103 +
20104 + __raw_read_unlock(rm_lock);
20105 + raw_local_irq_restore(flags);
20106 +
20107 + if (unlikely(spinning)) {
20108 + notify_remote_via_irq(per_cpu(spinlock_irq, cpu));
20109 + return;
20110 + }
20111 + }
20112 +}
20113 +EXPORT_SYMBOL(xen_spin_kick);
20114 --- sle11-2009-05-14.orig/drivers/xen/core/xen_sysfs.c 2008-12-15 11:27:22.000000000 +0100
20115 +++ sle11-2009-05-14/drivers/xen/core/xen_sysfs.c 2009-03-16 16:33:40.000000000 +0100
20116 @@ -29,12 +29,12 @@ HYPERVISOR_ATTR_RO(type);
20117
20118 static int __init xen_sysfs_type_init(void)
20119 {
20120 - return sysfs_create_file(&hypervisor_subsys.kobj, &type_attr.attr);
20121 + return sysfs_create_file(hypervisor_kobj, &type_attr.attr);
20122 }
20123
20124 static void xen_sysfs_type_destroy(void)
20125 {
20126 - sysfs_remove_file(&hypervisor_subsys.kobj, &type_attr.attr);
20127 + sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
20128 }
20129
20130 /* xen version attributes */
20131 @@ -90,13 +90,12 @@ static struct attribute_group version_gr
20132
20133 static int __init xen_sysfs_version_init(void)
20134 {
20135 - return sysfs_create_group(&hypervisor_subsys.kobj,
20136 - &version_group);
20137 + return sysfs_create_group(hypervisor_kobj, &version_group);
20138 }
20139
20140 static void xen_sysfs_version_destroy(void)
20141 {
20142 - sysfs_remove_group(&hypervisor_subsys.kobj, &version_group);
20143 + sysfs_remove_group(hypervisor_kobj, &version_group);
20144 }
20145
20146 /* UUID */
20147 @@ -126,12 +125,12 @@ HYPERVISOR_ATTR_RO(uuid);
20148
20149 static int __init xen_sysfs_uuid_init(void)
20150 {
20151 - return sysfs_create_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
20152 + return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);
20153 }
20154
20155 static void xen_sysfs_uuid_destroy(void)
20156 {
20157 - sysfs_remove_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
20158 + sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
20159 }
20160
20161 /* xen compilation attributes */
20162 @@ -204,14 +203,12 @@ static struct attribute_group xen_compil
20163
20164 int __init static xen_compilation_init(void)
20165 {
20166 - return sysfs_create_group(&hypervisor_subsys.kobj,
20167 - &xen_compilation_group);
20168 + return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
20169 }
20170
20171 static void xen_compilation_destroy(void)
20172 {
20173 - sysfs_remove_group(&hypervisor_subsys.kobj,
20174 - &xen_compilation_group);
20175 + sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
20176 }
20177
20178 /* xen properties info */
20179 @@ -325,14 +322,12 @@ static struct attribute_group xen_proper
20180
20181 static int __init xen_properties_init(void)
20182 {
20183 - return sysfs_create_group(&hypervisor_subsys.kobj,
20184 - &xen_properties_group);
20185 + return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
20186 }
20187
20188 static void xen_properties_destroy(void)
20189 {
20190 - sysfs_remove_group(&hypervisor_subsys.kobj,
20191 - &xen_properties_group);
20192 + sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
20193 }
20194
20195 #ifdef CONFIG_KEXEC
20196 @@ -350,13 +345,12 @@ HYPERVISOR_ATTR_RO(vmcoreinfo);
20197
20198 static int __init xen_sysfs_vmcoreinfo_init(void)
20199 {
20200 - return sysfs_create_file(&hypervisor_subsys.kobj,
20201 - &vmcoreinfo_attr.attr);
20202 + return sysfs_create_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
20203 }
20204
20205 static void xen_sysfs_vmcoreinfo_destroy(void)
20206 {
20207 - sysfs_remove_file(&hypervisor_subsys.kobj, &vmcoreinfo_attr.attr);
20208 + sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
20209 }
20210
20211 #endif
20212 --- sle11-2009-05-14.orig/drivers/xen/gntdev/gntdev.c 2009-03-04 11:28:34.000000000 +0100
20213 +++ sle11-2009-05-14/drivers/xen/gntdev/gntdev.c 2009-03-16 16:33:40.000000000 +0100
20214 @@ -782,7 +782,7 @@ static pte_t gntdev_clear_pte(struct vm_
20215 op.status);
20216 } else {
20217 /* USING SHADOW PAGE TABLES. */
20218 - copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
20219 + copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
20220 }
20221
20222 /* Finally, we unmap the grant from kernel space. */
20223 @@ -810,7 +810,7 @@ static pte_t gntdev_clear_pte(struct vm_
20224 >> PAGE_SHIFT, INVALID_P2M_ENTRY);
20225
20226 } else {
20227 - copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
20228 + copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
20229 }
20230
20231 return copy;
20232 --- sle11-2009-05-14.orig/drivers/xen/scsifront/scsifront.c 2009-02-16 16:18:36.000000000 +0100
20233 +++ sle11-2009-05-14/drivers/xen/scsifront/scsifront.c 2009-03-16 16:33:40.000000000 +0100
20234 @@ -260,19 +260,19 @@ static int map_data_for_request(struct v
20235 return -ENOMEM;
20236 }
20237
20238 - if (sc->use_sg) {
20239 + if (scsi_bufflen(sc)) {
20240 /* quoted scsi_lib.c/scsi_req_map_sg . */
20241 - struct scatterlist *sg, *sgl = (struct scatterlist *)sc->request_buffer;
20242 - unsigned int data_len = sc->request_bufflen;
20243 + struct scatterlist *sg, *sgl = scsi_sglist(sc);
20244 + unsigned int data_len = scsi_bufflen(sc);
20245
20246 - nr_pages = (sc->request_bufflen + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
20247 + nr_pages = (data_len + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
20248 if (nr_pages > VSCSIIF_SG_TABLESIZE) {
20249 printk(KERN_ERR "scsifront: Unable to map request_buffer for command!\n");
20250 ref_cnt = (-E2BIG);
20251 goto big_to_sg;
20252 }
20253
20254 - for_each_sg (sgl, sg, sc->use_sg, i) {
20255 + for_each_sg (sgl, sg, scsi_sg_count(sc), i) {
20256 page = sg_page(sg);
20257 off = sg->offset;
20258 len = sg->length;
20259 @@ -306,45 +306,6 @@ static int map_data_for_request(struct v
20260 ref_cnt++;
20261 }
20262 }
20263 - } else if (sc->request_bufflen) {
20264 - unsigned long end = ((unsigned long)sc->request_buffer
20265 - + sc->request_bufflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
20266 - unsigned long start = (unsigned long)sc->request_buffer >> PAGE_SHIFT;
20267 -
20268 - page = virt_to_page(sc->request_buffer);
20269 - nr_pages = end - start;
20270 - len = sc->request_bufflen;
20271 -
20272 - if (nr_pages > VSCSIIF_SG_TABLESIZE) {
20273 - ref_cnt = (-E2BIG);
20274 - goto big_to_sg;
20275 - }
20276 -
20277 - buffer_pfn = page_to_phys(page) >> PAGE_SHIFT;
20278 -
20279 - off = offset_in_page((unsigned long)sc->request_buffer);
20280 - for (i = 0; i < nr_pages; i++) {
20281 - bytes = PAGE_SIZE - off;
20282 -
20283 - if (bytes > len)
20284 - bytes = len;
20285 -
20286 - ref = gnttab_claim_grant_reference(&gref_head);
20287 - BUG_ON(ref == -ENOSPC);
20288 -
20289 - gnttab_grant_foreign_access_ref(ref, info->dev->otherend_id,
20290 - buffer_pfn, write);
20291 -
20292 - info->shadow[id].gref[i] = ref;
20293 - ring_req->seg[i].gref = ref;
20294 - ring_req->seg[i].offset = (uint16_t)off;
20295 - ring_req->seg[i].length = (uint16_t)bytes;
20296 -
20297 - buffer_pfn++;
20298 - len -= bytes;
20299 - off = 0;
20300 - ref_cnt++;
20301 - }
20302 }
20303
20304 big_to_sg:
20305 --- sle11-2009-05-14.orig/drivers/xen/xenoprof/xenoprofile.c 2009-03-11 15:39:38.000000000 +0100
20306 +++ sle11-2009-05-14/drivers/xen/xenoprof/xenoprofile.c 2009-03-16 16:33:40.000000000 +0100
20307 @@ -78,7 +78,7 @@ static int xenoprof_resume(struct sys_de
20308
20309
20310 static struct sysdev_class oprofile_sysclass = {
20311 - set_kset_name("oprofile"),
20312 + .name = "oprofile",
20313 .resume = xenoprof_resume,
20314 .suspend = xenoprof_suspend
20315 };
20316 --- sle11-2009-05-14.orig/include/asm-x86/e820.h 2009-05-14 10:56:29.000000000 +0200
20317 +++ sle11-2009-05-14/include/asm-x86/e820.h 2009-03-16 16:33:40.000000000 +0100
20318 @@ -127,7 +127,11 @@ extern char *memory_setup(void);
20319 #endif /* __KERNEL__ */
20320 #endif /* __ASSEMBLY__ */
20321
20322 +#ifndef CONFIG_XEN
20323 #define ISA_START_ADDRESS 0xa0000
20324 +#else
20325 +#define ISA_START_ADDRESS 0
20326 +#endif
20327 #define ISA_END_ADDRESS 0x100000
20328 #define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS)
20329
20330 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/agp.h 2009-02-16 16:18:36.000000000 +0100
20331 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/agp.h 2009-03-16 16:33:40.000000000 +0100
20332 @@ -13,18 +13,13 @@
20333 * page. This avoids data corruption on some CPUs.
20334 */
20335
20336 -/*
20337 - * Caller's responsibility to call global_flush_tlb() for performance
20338 - * reasons
20339 - */
20340 #define map_page_into_agp(page) ( \
20341 xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \
20342 - ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE))
20343 + ?: set_pages_uc(page, 1))
20344 #define unmap_page_from_agp(page) ( \
20345 xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \
20346 /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
20347 - change_page_attr(page, 1, PAGE_KERNEL))
20348 -#define flush_agp_mappings() global_flush_tlb()
20349 + set_pages_wb(page, 1))
20350
20351 /*
20352 * Could use CLFLUSH here if the cpu supports it. But then it would
20353 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/desc.h 2009-02-16 16:18:36.000000000 +0100
20354 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/desc.h 2009-03-16 16:33:40.000000000 +0100
20355 @@ -1,5 +1,404 @@
20356 +#ifndef _ASM_DESC_H_
20357 +#define _ASM_DESC_H_
20358 +
20359 +#ifndef __ASSEMBLY__
20360 +#include <asm/desc_defs.h>
20361 +#include <asm/ldt.h>
20362 +#include <asm/mmu.h>
20363 +#include <linux/smp.h>
20364 +
20365 +static inline void fill_ldt(struct desc_struct *desc,
20366 + const struct user_desc *info)
20367 +{
20368 + desc->limit0 = info->limit & 0x0ffff;
20369 + desc->base0 = info->base_addr & 0x0000ffff;
20370 +
20371 + desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
20372 + desc->type = (info->read_exec_only ^ 1) << 1;
20373 + desc->type |= info->contents << 2;
20374 + desc->s = 1;
20375 + desc->dpl = 0x3;
20376 + desc->p = info->seg_not_present ^ 1;
20377 + desc->limit = (info->limit & 0xf0000) >> 16;
20378 + desc->avl = info->useable;
20379 + desc->d = info->seg_32bit;
20380 + desc->g = info->limit_in_pages;
20381 + desc->base2 = (info->base_addr & 0xff000000) >> 24;
20382 +}
20383 +
20384 +#ifndef CONFIG_X86_NO_IDT
20385 +extern struct desc_ptr idt_descr;
20386 +extern gate_desc idt_table[];
20387 +#endif
20388 +
20389 +#ifdef CONFIG_X86_64
20390 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
20391 +extern struct desc_ptr cpu_gdt_descr[];
20392 +/* the cpu gdt accessor */
20393 +#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
20394 +
20395 +static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
20396 + unsigned dpl, unsigned ist, unsigned seg)
20397 +{
20398 + gate->offset_low = PTR_LOW(func);
20399 + gate->segment = __KERNEL_CS;
20400 + gate->ist = ist;
20401 + gate->p = 1;
20402 + gate->dpl = dpl;
20403 + gate->zero0 = 0;
20404 + gate->zero1 = 0;
20405 + gate->type = type;
20406 + gate->offset_middle = PTR_MIDDLE(func);
20407 + gate->offset_high = PTR_HIGH(func);
20408 +}
20409 +
20410 +#else
20411 +struct gdt_page {
20412 + struct desc_struct gdt[GDT_ENTRIES];
20413 +} __attribute__((aligned(PAGE_SIZE)));
20414 +DECLARE_PER_CPU(struct gdt_page, gdt_page);
20415 +
20416 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
20417 +{
20418 + return per_cpu(gdt_page, cpu).gdt;
20419 +}
20420 +
20421 +static inline void pack_gate(gate_desc *gate, unsigned char type,
20422 + unsigned long base, unsigned dpl, unsigned flags, unsigned short seg)
20423 +
20424 +{
20425 + gate->a = (seg << 16) | (base & 0xffff);
20426 + gate->b = (base & 0xffff0000) |
20427 + (((0x80 | type | (dpl << 5)) & 0xff) << 8);
20428 +}
20429 +
20430 +#endif
20431 +
20432 +static inline int desc_empty(const void *ptr)
20433 +{
20434 + const u32 *desc = ptr;
20435 + return !(desc[0] | desc[1]);
20436 +}
20437 +
20438 +#ifndef CONFIG_XEN
20439 +#define load_TR_desc() native_load_tr_desc()
20440 +#define load_gdt(dtr) native_load_gdt(dtr)
20441 +#define load_idt(dtr) native_load_idt(dtr)
20442 +#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
20443 +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
20444 +
20445 +#define store_gdt(dtr) native_store_gdt(dtr)
20446 +#define store_idt(dtr) native_store_idt(dtr)
20447 +#define store_tr(tr) (tr = native_store_tr())
20448 +#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
20449 +
20450 +#define load_TLS(t, cpu) native_load_tls(t, cpu)
20451 +#define set_ldt native_set_ldt
20452 +
20453 +#define write_ldt_entry(dt, entry, desc) \
20454 + native_write_ldt_entry(dt, entry, desc)
20455 +#define write_gdt_entry(dt, entry, desc, type) \
20456 + native_write_gdt_entry(dt, entry, desc, type)
20457 +#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
20458 +
20459 +static inline void native_write_idt_entry(gate_desc *idt, int entry,
20460 + const gate_desc *gate)
20461 +{
20462 + memcpy(&idt[entry], gate, sizeof(*gate));
20463 +}
20464 +
20465 +static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
20466 + const void *desc)
20467 +{
20468 + memcpy(&ldt[entry], desc, 8);
20469 +}
20470 +
20471 +static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
20472 + const void *desc, int type)
20473 +{
20474 + unsigned int size;
20475 + switch (type) {
20476 + case DESC_TSS:
20477 + size = sizeof(tss_desc);
20478 + break;
20479 + case DESC_LDT:
20480 + size = sizeof(ldt_desc);
20481 + break;
20482 + default:
20483 + size = sizeof(struct desc_struct);
20484 + break;
20485 + }
20486 + memcpy(&gdt[entry], desc, size);
20487 +}
20488 +#endif
20489 +
20490 +static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
20491 + unsigned long limit, unsigned char type,
20492 + unsigned char flags)
20493 +{
20494 + desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
20495 + desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
20496 + (limit & 0x000f0000) | ((type & 0xff) << 8) |
20497 + ((flags & 0xf) << 20);
20498 + desc->p = 1;
20499 +}
20500 +
20501 +
20502 +#ifndef CONFIG_XEN
20503 +static inline void set_tssldt_descriptor(void *d, unsigned long addr,
20504 + unsigned type, unsigned size)
20505 +{
20506 +#ifdef CONFIG_X86_64
20507 + struct ldttss_desc64 *desc = d;
20508 + memset(desc, 0, sizeof(*desc));
20509 + desc->limit0 = size & 0xFFFF;
20510 + desc->base0 = PTR_LOW(addr);
20511 + desc->base1 = PTR_MIDDLE(addr) & 0xFF;
20512 + desc->type = type;
20513 + desc->p = 1;
20514 + desc->limit1 = (size >> 16) & 0xF;
20515 + desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
20516 + desc->base3 = PTR_HIGH(addr);
20517 +#else
20518 +
20519 + pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
20520 +#endif
20521 +}
20522 +
20523 +static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
20524 +{
20525 + struct desc_struct *d = get_cpu_gdt_table(cpu);
20526 + tss_desc tss;
20527 +
20528 + /*
20529 + * sizeof(unsigned long) coming from an extra "long" at the end
20530 + * of the iobitmap. See tss_struct definition in processor.h
20531 + *
20532 + * -1? seg base+limit should be pointing to the address of the
20533 + * last valid byte
20534 + */
20535 + set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
20536 + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
20537 + write_gdt_entry(d, entry, &tss, DESC_TSS);
20538 +}
20539 +
20540 +#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
20541 +
20542 +static inline void native_set_ldt(const void *addr, unsigned int entries)
20543 +{
20544 + if (likely(entries == 0))
20545 + __asm__ __volatile__("lldt %w0"::"q" (0));
20546 + else {
20547 + unsigned cpu = smp_processor_id();
20548 + ldt_desc ldt;
20549 +
20550 + set_tssldt_descriptor(&ldt, (unsigned long)addr,
20551 + DESC_LDT, entries * sizeof(ldt) - 1);
20552 + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
20553 + &ldt, DESC_LDT);
20554 + __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
20555 + }
20556 +}
20557 +
20558 +static inline void native_load_tr_desc(void)
20559 +{
20560 + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
20561 +}
20562 +
20563 +static inline void native_load_gdt(const struct desc_ptr *dtr)
20564 +{
20565 + asm volatile("lgdt %0"::"m" (*dtr));
20566 +}
20567 +
20568 +static inline void native_load_idt(const struct desc_ptr *dtr)
20569 +{
20570 + asm volatile("lidt %0"::"m" (*dtr));
20571 +}
20572 +
20573 +static inline void native_store_gdt(struct desc_ptr *dtr)
20574 +{
20575 + asm volatile("sgdt %0":"=m" (*dtr));
20576 +}
20577 +
20578 +static inline void native_store_idt(struct desc_ptr *dtr)
20579 +{
20580 + asm volatile("sidt %0":"=m" (*dtr));
20581 +}
20582 +
20583 +static inline unsigned long native_store_tr(void)
20584 +{
20585 + unsigned long tr;
20586 + asm volatile("str %0":"=r" (tr));
20587 + return tr;
20588 +}
20589 +
20590 +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
20591 +{
20592 + unsigned int i;
20593 + struct desc_struct *gdt = get_cpu_gdt_table(cpu);
20594 +
20595 + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20596 + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
20597 +}
20598 +#else
20599 +#define load_TLS(t, cpu) xen_load_tls(t, cpu)
20600 +#define set_ldt xen_set_ldt
20601 +
20602 +extern int write_ldt_entry(struct desc_struct *ldt, int entry,
20603 + const void *desc);
20604 +extern int write_gdt_entry(struct desc_struct *gdt, int entry,
20605 + const void *desc, int type);
20606 +
20607 +static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
20608 +{
20609 + unsigned int i;
20610 + struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
20611 +
20612 + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20613 + if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
20614 + *(u64 *)&t->tls_array[i]))
20615 + BUG();
20616 +}
20617 +#endif
20618 +
20619 +#define _LDT_empty(info) (\
20620 + (info)->base_addr == 0 && \
20621 + (info)->limit == 0 && \
20622 + (info)->contents == 0 && \
20623 + (info)->read_exec_only == 1 && \
20624 + (info)->seg_32bit == 0 && \
20625 + (info)->limit_in_pages == 0 && \
20626 + (info)->seg_not_present == 1 && \
20627 + (info)->useable == 0)
20628 +
20629 +#ifdef CONFIG_X86_64
20630 +#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
20631 +#else
20632 +#define LDT_empty(info) (_LDT_empty(info))
20633 +#endif
20634 +
20635 +static inline void clear_LDT(void)
20636 +{
20637 + set_ldt(NULL, 0);
20638 +}
20639 +
20640 +/*
20641 + * load one particular LDT into the current CPU
20642 + */
20643 +static inline void load_LDT_nolock(mm_context_t *pc)
20644 +{
20645 + set_ldt(pc->ldt, pc->size);
20646 +}
20647 +
20648 +static inline void load_LDT(mm_context_t *pc)
20649 +{
20650 + preempt_disable();
20651 + load_LDT_nolock(pc);
20652 + preempt_enable();
20653 +}
20654 +
20655 +static inline unsigned long get_desc_base(const struct desc_struct *desc)
20656 +{
20657 + return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
20658 +}
20659 +
20660 +static inline unsigned long get_desc_limit(const struct desc_struct *desc)
20661 +{
20662 + return desc->limit0 | (desc->limit << 16);
20663 +}
20664 +
20665 +#ifndef CONFIG_X86_NO_IDT
20666 +static inline void _set_gate(int gate, unsigned type, void *addr,
20667 + unsigned dpl, unsigned ist, unsigned seg)
20668 +{
20669 + gate_desc s;
20670 + pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
20671 + /*
20672 + * does not need to be atomic because it is only done once at
20673 + * setup time
20674 + */
20675 + write_idt_entry(idt_table, gate, &s);
20676 +}
20677 +
20678 +/*
20679 + * This needs to use 'idt_table' rather than 'idt', and
20680 + * thus use the _nonmapped_ version of the IDT, as the
20681 + * Pentium F0 0F bugfix can have resulted in the mapped
20682 + * IDT being write-protected.
20683 + */
20684 +static inline void set_intr_gate(unsigned int n, void *addr)
20685 +{
20686 + BUG_ON((unsigned)n > 0xFF);
20687 + _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
20688 +}
20689 +
20690 +/*
20691 + * This routine sets up an interrupt gate at descriptor privilege level 3.
20692 + */
20693 +static inline void set_system_intr_gate(unsigned int n, void *addr)
20694 +{
20695 + BUG_ON((unsigned)n > 0xFF);
20696 + _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
20697 +}
20698 +
20699 +static inline void set_trap_gate(unsigned int n, void *addr)
20700 +{
20701 + BUG_ON((unsigned)n > 0xFF);
20702 + _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
20703 +}
20704 +
20705 +static inline void set_system_gate(unsigned int n, void *addr)
20706 +{
20707 + BUG_ON((unsigned)n > 0xFF);
20708 #ifdef CONFIG_X86_32
20709 -# include "desc_32.h"
20710 + _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
20711 +#else
20712 + _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
20713 +#endif
20714 +}
20715 +
20716 +static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
20717 +{
20718 + BUG_ON((unsigned)n > 0xFF);
20719 + _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
20720 +}
20721 +
20722 +static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
20723 +{
20724 + BUG_ON((unsigned)n > 0xFF);
20725 + _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
20726 +}
20727 +
20728 +static inline void set_system_gate_ist(int n, void *addr, unsigned ist)
20729 +{
20730 + BUG_ON((unsigned)n > 0xFF);
20731 + _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
20732 +}
20733 +#endif
20734 +
20735 #else
20736 -# include "desc_64.h"
20737 +/*
20738 + * GET_DESC_BASE reads the descriptor base of the specified segment.
20739 + *
20740 + * Args:
20741 + * idx - descriptor index
20742 + * gdt - GDT pointer
20743 + * base - 32bit register to which the base will be written
20744 + * lo_w - lo word of the "base" register
20745 + * lo_b - lo byte of the "base" register
20746 + * hi_b - hi byte of the low word of the "base" register
20747 + *
20748 + * Example:
20749 + * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
20750 + * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
20751 + */
20752 +#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
20753 + movb idx*8+4(gdt), lo_b; \
20754 + movb idx*8+7(gdt), hi_b; \
20755 + shll $16, base; \
20756 + movw idx*8+2(gdt), lo_w;
20757 +
20758 +
20759 +#endif /* __ASSEMBLY__ */
20760 +
20761 #endif
20762 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/desc_32.h 2008-12-15 11:27:22.000000000 +0100
20763 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
20764 @@ -1,262 +0,0 @@
20765 -#ifndef __ARCH_DESC_H
20766 -#define __ARCH_DESC_H
20767 -
20768 -#include <asm/ldt.h>
20769 -#include <asm/segment.h>
20770 -
20771 -#ifndef __ASSEMBLY__
20772 -
20773 -#include <linux/preempt.h>
20774 -#include <linux/smp.h>
20775 -
20776 -#include <asm/mmu.h>
20777 -
20778 -struct Xgt_desc_struct {
20779 - unsigned short size;
20780 - unsigned long address __attribute__((packed));
20781 - unsigned short pad;
20782 -} __attribute__ ((packed));
20783 -
20784 -struct gdt_page
20785 -{
20786 - struct desc_struct gdt[GDT_ENTRIES];
20787 -} __attribute__((aligned(PAGE_SIZE)));
20788 -DECLARE_PER_CPU(struct gdt_page, gdt_page);
20789 -
20790 -static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
20791 -{
20792 - return per_cpu(gdt_page, cpu).gdt;
20793 -}
20794 -
20795 -extern struct Xgt_desc_struct idt_descr;
20796 -extern struct desc_struct idt_table[];
20797 -extern void set_intr_gate(unsigned int irq, void * addr);
20798 -
20799 -static inline void pack_descriptor(__u32 *a, __u32 *b,
20800 - unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
20801 -{
20802 - *a = ((base & 0xffff) << 16) | (limit & 0xffff);
20803 - *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
20804 - (limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20);
20805 -}
20806 -
20807 -static inline void pack_gate(__u32 *a, __u32 *b,
20808 - unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
20809 -{
20810 - *a = (seg << 16) | (base & 0xffff);
20811 - *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
20812 -}
20813 -
20814 -#define DESCTYPE_LDT 0x82 /* present, system, DPL-0, LDT */
20815 -#define DESCTYPE_TSS 0x89 /* present, system, DPL-0, 32-bit TSS */
20816 -#define DESCTYPE_TASK 0x85 /* present, system, DPL-0, task gate */
20817 -#define DESCTYPE_INT 0x8e /* present, system, DPL-0, interrupt gate */
20818 -#define DESCTYPE_TRAP 0x8f /* present, system, DPL-0, trap gate */
20819 -#define DESCTYPE_DPL3 0x60 /* DPL-3 */
20820 -#define DESCTYPE_S 0x10 /* !system */
20821 -
20822 -#ifndef CONFIG_XEN
20823 -#define load_TR_desc() native_load_tr_desc()
20824 -#define load_gdt(dtr) native_load_gdt(dtr)
20825 -#define load_idt(dtr) native_load_idt(dtr)
20826 -#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
20827 -#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
20828 -
20829 -#define store_gdt(dtr) native_store_gdt(dtr)
20830 -#define store_idt(dtr) native_store_idt(dtr)
20831 -#define store_tr(tr) (tr = native_store_tr())
20832 -#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
20833 -
20834 -#define load_TLS(t, cpu) native_load_tls(t, cpu)
20835 -#define set_ldt native_set_ldt
20836 -
20837 -#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20838 -#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20839 -#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20840 -
20841 -static inline void write_dt_entry(struct desc_struct *dt,
20842 - int entry, u32 entry_low, u32 entry_high)
20843 -{
20844 - dt[entry].a = entry_low;
20845 - dt[entry].b = entry_high;
20846 -}
20847 -
20848 -static inline void native_set_ldt(const void *addr, unsigned int entries)
20849 -{
20850 - if (likely(entries == 0))
20851 - __asm__ __volatile__("lldt %w0"::"q" (0));
20852 - else {
20853 - unsigned cpu = smp_processor_id();
20854 - __u32 a, b;
20855 -
20856 - pack_descriptor(&a, &b, (unsigned long)addr,
20857 - entries * sizeof(struct desc_struct) - 1,
20858 - DESCTYPE_LDT, 0);
20859 - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
20860 - __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
20861 - }
20862 -}
20863 -
20864 -
20865 -static inline void native_load_tr_desc(void)
20866 -{
20867 - asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
20868 -}
20869 -
20870 -static inline void native_load_gdt(const struct Xgt_desc_struct *dtr)
20871 -{
20872 - asm volatile("lgdt %0"::"m" (*dtr));
20873 -}
20874 -
20875 -static inline void native_load_idt(const struct Xgt_desc_struct *dtr)
20876 -{
20877 - asm volatile("lidt %0"::"m" (*dtr));
20878 -}
20879 -
20880 -static inline void native_store_gdt(struct Xgt_desc_struct *dtr)
20881 -{
20882 - asm ("sgdt %0":"=m" (*dtr));
20883 -}
20884 -
20885 -static inline void native_store_idt(struct Xgt_desc_struct *dtr)
20886 -{
20887 - asm ("sidt %0":"=m" (*dtr));
20888 -}
20889 -
20890 -static inline unsigned long native_store_tr(void)
20891 -{
20892 - unsigned long tr;
20893 - asm ("str %0":"=r" (tr));
20894 - return tr;
20895 -}
20896 -
20897 -static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
20898 -{
20899 - unsigned int i;
20900 - struct desc_struct *gdt = get_cpu_gdt_table(cpu);
20901 -
20902 - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20903 - gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
20904 -}
20905 -#else
20906 -#define load_TLS(t, cpu) xen_load_tls(t, cpu)
20907 -#define set_ldt xen_set_ldt
20908 -
20909 -extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
20910 -extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
20911 -
20912 -static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
20913 -{
20914 - unsigned int i;
20915 - struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
20916 -
20917 - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20918 - if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
20919 - *(u64 *)&t->tls_array[i]))
20920 - BUG();
20921 -}
20922 -#endif
20923 -
20924 -#ifndef CONFIG_X86_NO_IDT
20925 -static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
20926 -{
20927 - __u32 a, b;
20928 - pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
20929 - write_idt_entry(idt_table, gate, a, b);
20930 -}
20931 -#endif
20932 -
20933 -#ifndef CONFIG_X86_NO_TSS
20934 -static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
20935 -{
20936 - __u32 a, b;
20937 - pack_descriptor(&a, &b, (unsigned long)addr,
20938 - offsetof(struct tss_struct, __cacheline_filler) - 1,
20939 - DESCTYPE_TSS, 0);
20940 - write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
20941 -}
20942 -#endif
20943 -
20944 -
20945 -#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
20946 -
20947 -#define LDT_entry_a(info) \
20948 - ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
20949 -
20950 -#define LDT_entry_b(info) \
20951 - (((info)->base_addr & 0xff000000) | \
20952 - (((info)->base_addr & 0x00ff0000) >> 16) | \
20953 - ((info)->limit & 0xf0000) | \
20954 - (((info)->read_exec_only ^ 1) << 9) | \
20955 - ((info)->contents << 10) | \
20956 - (((info)->seg_not_present ^ 1) << 15) | \
20957 - ((info)->seg_32bit << 22) | \
20958 - ((info)->limit_in_pages << 23) | \
20959 - ((info)->useable << 20) | \
20960 - 0x7000)
20961 -
20962 -#define LDT_empty(info) (\
20963 - (info)->base_addr == 0 && \
20964 - (info)->limit == 0 && \
20965 - (info)->contents == 0 && \
20966 - (info)->read_exec_only == 1 && \
20967 - (info)->seg_32bit == 0 && \
20968 - (info)->limit_in_pages == 0 && \
20969 - (info)->seg_not_present == 1 && \
20970 - (info)->useable == 0 )
20971 -
20972 -static inline void clear_LDT(void)
20973 -{
20974 - set_ldt(NULL, 0);
20975 -}
20976 -
20977 -/*
20978 - * load one particular LDT into the current CPU
20979 - */
20980 -static inline void load_LDT_nolock(mm_context_t *pc)
20981 -{
20982 - set_ldt(pc->ldt, pc->size);
20983 -}
20984 -
20985 -static inline void load_LDT(mm_context_t *pc)
20986 -{
20987 - preempt_disable();
20988 - load_LDT_nolock(pc);
20989 - preempt_enable();
20990 -}
20991 -
20992 -static inline unsigned long get_desc_base(unsigned long *desc)
20993 -{
20994 - unsigned long base;
20995 - base = ((desc[0] >> 16) & 0x0000ffff) |
20996 - ((desc[1] << 16) & 0x00ff0000) |
20997 - (desc[1] & 0xff000000);
20998 - return base;
20999 -}
21000 -
21001 -#else /* __ASSEMBLY__ */
21002 -
21003 -/*
21004 - * GET_DESC_BASE reads the descriptor base of the specified segment.
21005 - *
21006 - * Args:
21007 - * idx - descriptor index
21008 - * gdt - GDT pointer
21009 - * base - 32bit register to which the base will be written
21010 - * lo_w - lo word of the "base" register
21011 - * lo_b - lo byte of the "base" register
21012 - * hi_b - hi byte of the low word of the "base" register
21013 - *
21014 - * Example:
21015 - * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
21016 - * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
21017 - */
21018 -#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
21019 - movb idx*8+4(gdt), lo_b; \
21020 - movb idx*8+7(gdt), hi_b; \
21021 - shll $16, base; \
21022 - movw idx*8+2(gdt), lo_w;
21023 -
21024 -#endif /* !__ASSEMBLY__ */
21025 -
21026 -#endif
21027 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/desc_64.h 2009-02-16 16:18:36.000000000 +0100
21028 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
21029 @@ -1,228 +0,0 @@
21030 -/* Written 2000 by Andi Kleen */
21031 -#ifndef __ARCH_DESC_H
21032 -#define __ARCH_DESC_H
21033 -
21034 -#include <linux/threads.h>
21035 -#include <asm/ldt.h>
21036 -
21037 -#ifndef __ASSEMBLY__
21038 -
21039 -#include <linux/string.h>
21040 -#include <linux/smp.h>
21041 -#include <asm/desc_defs.h>
21042 -
21043 -#include <asm/segment.h>
21044 -#include <asm/mmu.h>
21045 -
21046 -extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
21047 -
21048 -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
21049 -
21050 -#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
21051 -#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
21052 -
21053 -static inline void clear_LDT(void)
21054 -{
21055 - int cpu = get_cpu();
21056 -
21057 - /*
21058 - * NB. We load the default_ldt for lcall7/27 handling on demand, as
21059 - * it slows down context switching. Noone uses it anyway.
21060 - */
21061 - cpu = cpu; /* XXX avoid compiler warning */
21062 - xen_set_ldt(NULL, 0);
21063 - put_cpu();
21064 -}
21065 -
21066 -#ifndef CONFIG_X86_NO_TSS
21067 -static inline unsigned long __store_tr(void)
21068 -{
21069 - unsigned long tr;
21070 -
21071 - asm volatile ("str %w0":"=r" (tr));
21072 - return tr;
21073 -}
21074 -
21075 -#define store_tr(tr) (tr) = __store_tr()
21076 -#endif
21077 -
21078 -/*
21079 - * This is the ldt that every process will get unless we need
21080 - * something other than this.
21081 - */
21082 -extern struct desc_struct default_ldt[];
21083 -#ifndef CONFIG_X86_NO_IDT
21084 -extern struct gate_struct idt_table[];
21085 -#endif
21086 -extern struct desc_ptr cpu_gdt_descr[];
21087 -
21088 -/* the cpu gdt accessor */
21089 -#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
21090 -
21091 -#ifndef CONFIG_XEN
21092 -static inline void load_gdt(const struct desc_ptr *ptr)
21093 -{
21094 - asm volatile("lgdt %w0"::"m" (*ptr));
21095 -}
21096 -
21097 -static inline void store_gdt(struct desc_ptr *ptr)
21098 -{
21099 - asm("sgdt %w0":"=m" (*ptr));
21100 -}
21101 -#endif
21102 -
21103 -static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)
21104 -{
21105 - struct gate_struct s;
21106 - s.offset_low = PTR_LOW(func);
21107 - s.segment = __KERNEL_CS;
21108 - s.ist = ist;
21109 - s.p = 1;
21110 - s.dpl = dpl;
21111 - s.zero0 = 0;
21112 - s.zero1 = 0;
21113 - s.type = type;
21114 - s.offset_middle = PTR_MIDDLE(func);
21115 - s.offset_high = PTR_HIGH(func);
21116 - /* does not need to be atomic because it is only done once at setup time */
21117 - memcpy(adr, &s, 16);
21118 -}
21119 -
21120 -#ifndef CONFIG_X86_NO_IDT
21121 -static inline void set_intr_gate(int nr, void *func)
21122 -{
21123 - BUG_ON((unsigned)nr > 0xFF);
21124 - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
21125 -}
21126 -
21127 -static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
21128 -{
21129 - BUG_ON((unsigned)nr > 0xFF);
21130 - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
21131 -}
21132 -
21133 -static inline void set_system_gate(int nr, void *func)
21134 -{
21135 - BUG_ON((unsigned)nr > 0xFF);
21136 - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
21137 -}
21138 -
21139 -static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
21140 -{
21141 - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
21142 -}
21143 -
21144 -static inline void load_idt(const struct desc_ptr *ptr)
21145 -{
21146 - asm volatile("lidt %w0"::"m" (*ptr));
21147 -}
21148 -
21149 -static inline void store_idt(struct desc_ptr *dtr)
21150 -{
21151 - asm("sidt %w0":"=m" (*dtr));
21152 -}
21153 -#endif
21154 -
21155 -static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type,
21156 - unsigned size)
21157 -{
21158 - struct ldttss_desc d;
21159 - memset(&d,0,sizeof(d));
21160 - d.limit0 = size & 0xFFFF;
21161 - d.base0 = PTR_LOW(tss);
21162 - d.base1 = PTR_MIDDLE(tss) & 0xFF;
21163 - d.type = type;
21164 - d.p = 1;
21165 - d.limit1 = (size >> 16) & 0xF;
21166 - d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF;
21167 - d.base3 = PTR_HIGH(tss);
21168 - memcpy(ptr, &d, 16);
21169 -}
21170 -
21171 -#ifndef CONFIG_X86_NO_TSS
21172 -static inline void set_tss_desc(unsigned cpu, void *addr)
21173 -{
21174 - /*
21175 - * sizeof(unsigned long) coming from an extra "long" at the end
21176 - * of the iobitmap. See tss_struct definition in processor.h
21177 - *
21178 - * -1? seg base+limit should be pointing to the address of the
21179 - * last valid byte
21180 - */
21181 - set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS],
21182 - (unsigned long)addr, DESC_TSS,
21183 - IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
21184 -}
21185 -#endif
21186 -
21187 -static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
21188 -{
21189 - set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
21190 - DESC_LDT, size * 8 - 1);
21191 -}
21192 -
21193 -#define LDT_entry_a(info) \
21194 - ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
21195 -/* Don't allow setting of the lm bit. It is useless anyways because
21196 - 64bit system calls require __USER_CS. */
21197 -#define LDT_entry_b(info) \
21198 - (((info)->base_addr & 0xff000000) | \
21199 - (((info)->base_addr & 0x00ff0000) >> 16) | \
21200 - ((info)->limit & 0xf0000) | \
21201 - (((info)->read_exec_only ^ 1) << 9) | \
21202 - ((info)->contents << 10) | \
21203 - (((info)->seg_not_present ^ 1) << 15) | \
21204 - ((info)->seg_32bit << 22) | \
21205 - ((info)->limit_in_pages << 23) | \
21206 - ((info)->useable << 20) | \
21207 - /* ((info)->lm << 21) | */ \
21208 - 0x7000)
21209 -
21210 -#define LDT_empty(info) (\
21211 - (info)->base_addr == 0 && \
21212 - (info)->limit == 0 && \
21213 - (info)->contents == 0 && \
21214 - (info)->read_exec_only == 1 && \
21215 - (info)->seg_32bit == 0 && \
21216 - (info)->limit_in_pages == 0 && \
21217 - (info)->seg_not_present == 1 && \
21218 - (info)->useable == 0 && \
21219 - (info)->lm == 0)
21220 -
21221 -static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
21222 -{
21223 - unsigned int i;
21224 - u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
21225 -
21226 - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
21227 - if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
21228 - t->tls_array[i]))
21229 - BUG();
21230 -}
21231 -
21232 -/*
21233 - * load one particular LDT into the current CPU
21234 - */
21235 -static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
21236 -{
21237 - void *segments = pc->ldt;
21238 - int count = pc->size;
21239 -
21240 - if (likely(!count))
21241 - segments = NULL;
21242 -
21243 - xen_set_ldt(segments, count);
21244 -}
21245 -
21246 -static inline void load_LDT(mm_context_t *pc)
21247 -{
21248 - int cpu = get_cpu();
21249 - load_LDT_nolock(pc, cpu);
21250 - put_cpu();
21251 -}
21252 -
21253 -extern struct desc_ptr idt_descr;
21254 -
21255 -#endif /* !__ASSEMBLY__ */
21256 -
21257 -#endif
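
Aside (illustrative, not part of the generated patch): the desc_64.h removed above packs a user segment descriptor into two 32-bit GDT/LDT words via LDT_entry_a()/LDT_entry_b(). The standalone sketch below reproduces that arithmetic so the bit layout is easier to follow; struct fake_user_desc is a reduced stand-in for the kernel's struct user_desc, and the example base/limit values are made up.

/* Build with: gcc -std=c99 -o ldt_demo ldt_demo.c */
#include <stdio.h>
#include <stdint.h>

struct fake_user_desc {
    uint32_t base_addr;
    uint32_t limit;
    unsigned seg_32bit:1, contents:2, read_exec_only:1;
    unsigned limit_in_pages:1, seg_not_present:1, useable:1;
};

static uint32_t ldt_entry_a(const struct fake_user_desc *i)
{
    /* low word: base[15:0] in the high half, limit[15:0] in the low half */
    return ((i->base_addr & 0x0000ffff) << 16) | (i->limit & 0x0ffff);
}

static uint32_t ldt_entry_b(const struct fake_user_desc *i)
{
    /* high word: scattered base/limit bits plus the attribute flags */
    return (i->base_addr & 0xff000000) |
           ((i->base_addr & 0x00ff0000) >> 16) |
           (i->limit & 0xf0000) |
           ((i->read_exec_only ^ 1) << 9) |
           (i->contents << 10) |
           ((i->seg_not_present ^ 1) << 15) |
           (i->seg_32bit << 22) |
           (i->limit_in_pages << 23) |
           (i->useable << 20) |
           0x7000;   /* S=1 (code/data), DPL=3; present bit set above */
}

int main(void)
{
    struct fake_user_desc d = {
        .base_addr = 0x12345678, .limit = 0xfffff,   /* assumed example */
        .seg_32bit = 1, .limit_in_pages = 1,
    };
    printf("descriptor words: %08x %08x\n",
           (unsigned)ldt_entry_a(&d), (unsigned)ldt_entry_b(&d));
    return 0;
}
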
21258 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2009-02-16 16:18:36.000000000 +0100
21259 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2009-03-16 16:33:40.000000000 +0100
21260 @@ -84,23 +84,13 @@ dma_sync_single_range_for_device(struct
21261 dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
21262 }
21263
21264 -static inline void
21265 +extern void
21266 dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
21267 - enum dma_data_direction direction)
21268 -{
21269 - if (swiotlb)
21270 - swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
21271 - flush_write_buffers();
21272 -}
21273 + enum dma_data_direction direction);
21274
21275 -static inline void
21276 +extern void
21277 dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
21278 - enum dma_data_direction direction)
21279 -{
21280 - if (swiotlb)
21281 - swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
21282 - flush_write_buffers();
21283 -}
21284 + enum dma_data_direction direction);
21285
21286 extern int
21287 dma_mapping_error(dma_addr_t dma_addr);
21288 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-02-16 16:17:21.000000000 +0100
21289 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-03-16 16:33:40.000000000 +0100
21290 @@ -64,7 +64,7 @@ enum fixed_addresses {
21291 #endif
21292 #ifdef CONFIG_X86_VISWS_APIC
21293 FIX_CO_CPU, /* Cobalt timer */
21294 - FIX_CO_APIC, /* Cobalt APIC Redirection Table */
21295 + FIX_CO_APIC, /* Cobalt APIC Redirection Table */
21296 FIX_LI_PCIA, /* Lithium PCI Bridge A */
21297 FIX_LI_PCIB, /* Lithium PCI Bridge B */
21298 #endif
21299 @@ -73,7 +73,7 @@ enum fixed_addresses {
21300 #endif
21301 #ifdef CONFIG_X86_CYCLONE_TIMER
21302 FIX_CYCLONE_TIMER, /*cyclone timer register*/
21303 -#endif
21304 +#endif
21305 #ifdef CONFIG_HIGHMEM
21306 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
21307 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
21308 @@ -93,11 +93,23 @@ enum fixed_addresses {
21309 FIX_ISAMAP_END,
21310 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
21311 __end_of_permanent_fixed_addresses,
21312 - /* temporary boot-time mappings, used before ioremap() is functional */
21313 -#define NR_FIX_BTMAPS 16
21314 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
21315 - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
21316 + /*
21317 + * 256 temporary boot-time mappings, used by early_ioremap(),
21318 + * before ioremap() is functional.
21319 + *
21320 + * We round it up to the next 512 pages boundary so that we
21321 + * can have a single pgd entry and a single pte table:
21322 + */
21323 +#define NR_FIX_BTMAPS 64
21324 +#define FIX_BTMAPS_NESTING 4
21325 + FIX_BTMAP_END =
21326 + __end_of_permanent_fixed_addresses + 512 -
21327 + (__end_of_permanent_fixed_addresses & 511),
21328 + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
21329 FIX_WP_TEST,
21330 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
21331 + FIX_OHCI1394_BASE,
21332 +#endif
21333 __end_of_fixed_addresses
21334 };
21335
21336 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-02-16 16:17:21.000000000 +0100
21337 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-03-16 16:33:40.000000000 +0100
21338 @@ -15,6 +15,7 @@
21339 #include <asm/apicdef.h>
21340 #include <asm/page.h>
21341 #include <asm/vsyscall.h>
21342 +#include <asm/efi.h>
21343 #include <asm/acpi.h>
21344
21345 /*
21346 @@ -46,6 +47,10 @@ enum fixed_addresses {
21347 FIX_IO_APIC_BASE_0,
21348 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
21349 #endif
21350 +#ifdef CONFIG_EFI
21351 + FIX_EFI_IO_MAP_LAST_PAGE,
21352 + FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1,
21353 +#endif
21354 #ifdef CONFIG_ACPI
21355 FIX_ACPI_BEGIN,
21356 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
21357 @@ -55,10 +60,22 @@ enum fixed_addresses {
21358 FIX_ISAMAP_END,
21359 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
21360 __end_of_permanent_fixed_addresses,
21361 - /* temporary boot-time mappings, used before ioremap() is functional */
21362 -#define NR_FIX_BTMAPS 16
21363 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
21364 - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
21365 + /*
21366 + * 256 temporary boot-time mappings, used by early_ioremap(),
21367 + * before ioremap() is functional.
21368 + *
21369 + * We round it up to the next 512 pages boundary so that we
21370 + * can have a single pgd entry and a single pte table:
21371 + */
21372 +#define NR_FIX_BTMAPS 64
21373 +#define FIX_BTMAPS_NESTING 4
21374 + FIX_BTMAP_END =
21375 + __end_of_permanent_fixed_addresses + 512 -
21376 + (__end_of_permanent_fixed_addresses & 511),
21377 + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
21378 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
21379 + FIX_OHCI1394_BASE,
21380 +#endif
21381 __end_of_fixed_addresses
21382 };
21383
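
Aside (illustrative, not part of the generated patch): both fixmap hunks above enlarge the early_ioremap() area to NR_FIX_BTMAPS * FIX_BTMAPS_NESTING slots and round FIX_BTMAP_END up to a 512-entry boundary so the boot-time mappings fit a single pte table. The snippet below only evaluates that arithmetic; the starting value 37 for __end_of_permanent_fixed_addresses is an assumed example.

#include <stdio.h>

int main(void)
{
    unsigned int end = 37;                        /* assumed example value */
    unsigned int btmap_end = end + 512 - (end & 511);   /* next 512 boundary */
    unsigned int btmap_begin = btmap_end + 64 * 4 - 1;  /* NR_FIX_BTMAPS * FIX_BTMAPS_NESTING */

    printf("end=%u btmap_end=%u btmap_begin=%u\n", end, btmap_end, btmap_begin);
    return 0;
}
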
21384 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/highmem.h 2009-02-16 16:17:21.000000000 +0100
21385 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/highmem.h 2009-03-16 16:33:40.000000000 +0100
21386 @@ -37,11 +37,6 @@ extern pte_t *pkmap_page_table;
21387 * easily, subsequent pte tables have to be allocated in one physical
21388 * chunk of RAM.
21389 */
21390 -#ifdef CONFIG_X86_PAE
21391 -#define LAST_PKMAP 512
21392 -#else
21393 -#define LAST_PKMAP 1024
21394 -#endif
21395 /*
21396 * Ordering is:
21397 *
21398 @@ -57,13 +52,12 @@ extern pte_t *pkmap_page_table;
21399 * VMALLOC_START
21400 * high_memory
21401 */
21402 -#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
21403 #define LAST_PKMAP_MASK (LAST_PKMAP-1)
21404 #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
21405 #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
21406
21407 -extern void * FASTCALL(kmap_high(struct page *page));
21408 -extern void FASTCALL(kunmap_high(struct page *page));
21409 +extern void *kmap_high(struct page *page);
21410 +extern void kunmap_high(struct page *page);
21411
21412 void *kmap(struct page *page);
21413 void kunmap(struct page *page);
21414 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/hypervisor.h 2009-02-16 16:18:36.000000000 +0100
21415 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/hypervisor.h 2009-03-16 16:33:40.000000000 +0100
21416 @@ -264,6 +264,25 @@ HYPERVISOR_poll(
21417 return rc;
21418 }
21419
21420 +static inline int __must_check
21421 +HYPERVISOR_poll_no_timeout(
21422 + evtchn_port_t *ports, unsigned int nr_ports)
21423 +{
21424 + int rc;
21425 + struct sched_poll sched_poll = {
21426 + .nr_ports = nr_ports
21427 + };
21428 + set_xen_guest_handle(sched_poll.ports, ports);
21429 +
21430 + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
21431 +#if CONFIG_XEN_COMPAT <= 0x030002
21432 + if (rc == -ENOSYS)
21433 + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
21434 +#endif
21435 +
21436 + return rc;
21437 +}
21438 +
21439 #ifdef CONFIG_XEN
21440
21441 static inline void
21442 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/irqflags.h 2009-02-16 16:18:36.000000000 +0100
21443 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/irqflags.h 2009-03-16 16:33:40.000000000 +0100
21444 @@ -1,5 +1,247 @@
21445 -#ifdef CONFIG_X86_32
21446 -# include "irqflags_32.h"
21447 +#ifndef _X86_IRQFLAGS_H_
21448 +#define _X86_IRQFLAGS_H_
21449 +
21450 +#include <asm/processor-flags.h>
21451 +
21452 +#ifndef __ASSEMBLY__
21453 +/*
21454 + * The use of 'barrier' in the following reflects their use as local-lock
21455 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
21456 + * critical operations are executed. All critical operations must complete
21457 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
21458 + * includes these barriers, for example.
21459 + */
21460 +
21461 +#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
21462 +
21463 +#define xen_restore_fl(f) \
21464 +do { \
21465 + vcpu_info_t *_vcpu; \
21466 + barrier(); \
21467 + _vcpu = current_vcpu_info(); \
21468 + if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
21469 + barrier(); /* unmask then check (avoid races) */\
21470 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
21471 + force_evtchn_callback(); \
21472 + } \
21473 +} while (0)
21474 +
21475 +#define xen_irq_disable() \
21476 +do { \
21477 + current_vcpu_info()->evtchn_upcall_mask = 1; \
21478 + barrier(); \
21479 +} while (0)
21480 +
21481 +#define xen_irq_enable() \
21482 +do { \
21483 + vcpu_info_t *_vcpu; \
21484 + barrier(); \
21485 + _vcpu = current_vcpu_info(); \
21486 + _vcpu->evtchn_upcall_mask = 0; \
21487 + barrier(); /* unmask then check (avoid races) */ \
21488 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
21489 + force_evtchn_callback(); \
21490 +} while (0)
21491 +
21492 +void xen_safe_halt(void);
21493 +
21494 +void xen_halt(void);
21495 +
21496 +#define __raw_local_save_flags() xen_save_fl()
21497 +
21498 +#define raw_local_irq_restore(flags) xen_restore_fl(flags)
21499 +
21500 +#define raw_local_irq_disable() xen_irq_disable()
21501 +
21502 +#define raw_local_irq_enable() xen_irq_enable()
21503 +
21504 +/*
21505 + * Used in the idle loop; sti takes one instruction cycle
21506 + * to complete:
21507 + */
21508 +static inline void raw_safe_halt(void)
21509 +{
21510 + xen_safe_halt();
21511 +}
21512 +
21513 +/*
21514 + * Used when interrupts are already enabled or to
21515 + * shutdown the processor:
21516 + */
21517 +static inline void halt(void)
21518 +{
21519 + xen_halt();
21520 +}
21521 +
21522 +/*
21523 + * For spinlocks, etc:
21524 + */
21525 +#define __raw_local_irq_save() \
21526 +({ \
21527 + unsigned long flags = __raw_local_save_flags(); \
21528 + \
21529 + raw_local_irq_disable(); \
21530 + \
21531 + flags; \
21532 +})
21533 #else
21534 -# include "irqflags_64.h"
21535 +
21536 +/* Offsets into shared_info_t. */
21537 +#define evtchn_upcall_pending /* 0 */
21538 +#define evtchn_upcall_mask 1
21539 +
21540 +#define sizeof_vcpu_shift 6
21541 +
21542 +#ifdef CONFIG_X86_64
21543 +# define __REG_si %rsi
21544 +# define __CPU_num %gs:pda_cpunumber
21545 +#else
21546 +# define __REG_si %esi
21547 +# define __CPU_num TI_cpu(%ebp)
21548 +#endif
21549 +
21550 +#ifdef CONFIG_SMP
21551 +#define GET_VCPU_INFO movl __CPU_num,%esi ; \
21552 + shl $sizeof_vcpu_shift,%esi ; \
21553 + add HYPERVISOR_shared_info,__REG_si
21554 +#else
21555 +#define GET_VCPU_INFO mov HYPERVISOR_shared_info,__REG_si
21556 +#endif
21557 +
21558 +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(__REG_si)
21559 +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(__REG_si)
21560 +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(__REG_si)
21561 +#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
21562 + __DISABLE_INTERRUPTS
21563 +#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
21564 + __ENABLE_INTERRUPTS
21565 +
21566 +#ifndef CONFIG_X86_64
21567 +#define INTERRUPT_RETURN iret
21568 +#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \
21569 +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
21570 + __TEST_PENDING ; \
21571 + jnz 14f /* process more events if necessary... */ ; \
21572 + movl PT_ESI(%esp), %esi ; \
21573 + sysexit ; \
21574 +14: __DISABLE_INTERRUPTS ; \
21575 + TRACE_IRQS_OFF ; \
21576 +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
21577 + push %esp ; \
21578 + call evtchn_do_upcall ; \
21579 + add $4,%esp ; \
21580 + jmp ret_from_intr
21581 +#endif
21582 +
21583 +
21584 +#endif /* __ASSEMBLY__ */
21585 +
21586 +#ifndef __ASSEMBLY__
21587 +#define raw_local_save_flags(flags) \
21588 + do { (flags) = __raw_local_save_flags(); } while (0)
21589 +
21590 +#define raw_local_irq_save(flags) \
21591 + do { (flags) = __raw_local_irq_save(); } while (0)
21592 +
21593 +static inline int raw_irqs_disabled_flags(unsigned long flags)
21594 +{
21595 + return (flags != 0);
21596 +}
21597 +
21598 +#define raw_irqs_disabled() \
21599 +({ \
21600 + unsigned long flags = __raw_local_save_flags(); \
21601 + \
21602 + raw_irqs_disabled_flags(flags); \
21603 +})
21604 +
21605 +/*
21606 + * makes the traced hardirq state match with the machine state
21607 + *
21608 + * should be a rarely used function, only in places where its
21609 + * otherwise impossible to know the irq state, like in traps.
21610 + */
21611 +static inline void trace_hardirqs_fixup_flags(unsigned long flags)
21612 +{
21613 + if (raw_irqs_disabled_flags(flags))
21614 + trace_hardirqs_off();
21615 + else
21616 + trace_hardirqs_on();
21617 +}
21618 +
21619 +#define trace_hardirqs_fixup() \
21620 + trace_hardirqs_fixup_flags(__raw_local_save_flags())
21621 +
21622 +#else
21623 +
21624 +#ifdef CONFIG_X86_64
21625 +/*
21626 + * Currently paravirt can't handle swapgs nicely when we
21627 + * don't have a stack we can rely on (such as a user space
21628 + * stack). So we either find a way around these or just fault
21629 + * and emulate if a guest tries to call swapgs directly.
21630 + *
21631 + * Either way, this is a good way to document that we don't
21632 + * have a reliable stack. x86_64 only.
21633 + */
21634 +#define SWAPGS_UNSAFE_STACK swapgs
21635 +#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
21636 +#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
21637 +#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
21638 +#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
21639 + TRACE_IRQS_ON; \
21640 + ENABLE_INTERRUPTS(CLBR_NONE); \
21641 + SAVE_REST; \
21642 + LOCKDEP_SYS_EXIT; \
21643 + RESTORE_REST; \
21644 + __DISABLE_INTERRUPTS; \
21645 + TRACE_IRQS_OFF;
21646 +
21647 +#else
21648 +#define ARCH_TRACE_IRQS_ON \
21649 + pushl %eax; \
21650 + pushl %ecx; \
21651 + pushl %edx; \
21652 + call trace_hardirqs_on; \
21653 + popl %edx; \
21654 + popl %ecx; \
21655 + popl %eax;
21656 +
21657 +#define ARCH_TRACE_IRQS_OFF \
21658 + pushl %eax; \
21659 + pushl %ecx; \
21660 + pushl %edx; \
21661 + call trace_hardirqs_off; \
21662 + popl %edx; \
21663 + popl %ecx; \
21664 + popl %eax;
21665 +
21666 +#define ARCH_LOCKDEP_SYS_EXIT \
21667 + pushl %eax; \
21668 + pushl %ecx; \
21669 + pushl %edx; \
21670 + call lockdep_sys_exit; \
21671 + popl %edx; \
21672 + popl %ecx; \
21673 + popl %eax;
21674 +
21675 +#define ARCH_LOCKDEP_SYS_EXIT_IRQ
21676 +#endif
21677 +
21678 +#ifdef CONFIG_TRACE_IRQFLAGS
21679 +# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
21680 +# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
21681 +#else
21682 +# define TRACE_IRQS_ON
21683 +# define TRACE_IRQS_OFF
21684 +#endif
21685 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
21686 +# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
21687 +# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
21688 +# else
21689 +# define LOCKDEP_SYS_EXIT
21690 +# define LOCKDEP_SYS_EXIT_IRQ
21691 +# endif
21692 +
21693 +#endif /* __ASSEMBLY__ */
21694 #endif
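
Aside (illustrative, not part of the generated patch): the unified irqflags.h above replaces native cli/sti with reads and writes of the per-VCPU evtchn_upcall_mask, re-checking evtchn_upcall_pending after every unmask. The userspace model below shows only that ordering; struct fake_vcpu_info, irq_save() and irq_restore() are stand-in names, and the compiler barriers of the real macros are omitted in this single-threaded sketch.

#include <stdio.h>

struct fake_vcpu_info {
    unsigned char evtchn_upcall_mask;     /* 1 = "interrupts" disabled */
    unsigned char evtchn_upcall_pending;  /* event arrived while masked */
};

static struct fake_vcpu_info vcpu;

static void force_evtchn_callback(void)
{
    /* The real code traps into the hypervisor to deliver the upcall;
     * here we only note that it would happen. */
    printf("delivering pending event callback\n");
    vcpu.evtchn_upcall_pending = 0;
}

static unsigned long irq_save(void)           /* like __raw_local_irq_save() */
{
    unsigned long flags = vcpu.evtchn_upcall_mask;
    vcpu.evtchn_upcall_mask = 1;              /* disable */
    return flags;
}

static void irq_restore(unsigned long flags)  /* like xen_restore_fl() */
{
    vcpu.evtchn_upcall_mask = (unsigned char)flags;
    if (flags == 0 && vcpu.evtchn_upcall_pending)
        force_evtchn_callback();              /* unmask, then re-check */
}

int main(void)
{
    unsigned long flags = irq_save();
    vcpu.evtchn_upcall_pending = 1;           /* event arrives while masked */
    irq_restore(flags);                       /* triggers the callback */
    return 0;
}
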
21695 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/irqflags_32.h 2009-02-16 16:18:36.000000000 +0100
21696 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
21697 @@ -1,212 +0,0 @@
21698 -/*
21699 - * include/asm-i386/irqflags.h
21700 - *
21701 - * IRQ flags handling
21702 - *
21703 - * This file gets included from lowlevel asm headers too, to provide
21704 - * wrapped versions of the local_irq_*() APIs, based on the
21705 - * raw_local_irq_*() functions from the lowlevel headers.
21706 - */
21707 -#ifndef _ASM_IRQFLAGS_H
21708 -#define _ASM_IRQFLAGS_H
21709 -
21710 -#ifndef __ASSEMBLY__
21711 -#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
21712 -
21713 -#define xen_restore_fl(f) \
21714 -do { \
21715 - vcpu_info_t *_vcpu; \
21716 - barrier(); \
21717 - _vcpu = current_vcpu_info(); \
21718 - if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
21719 - barrier(); /* unmask then check (avoid races) */\
21720 - if (unlikely(_vcpu->evtchn_upcall_pending)) \
21721 - force_evtchn_callback(); \
21722 - } \
21723 -} while (0)
21724 -
21725 -#define xen_irq_disable() \
21726 -do { \
21727 - current_vcpu_info()->evtchn_upcall_mask = 1; \
21728 - barrier(); \
21729 -} while (0)
21730 -
21731 -#define xen_irq_enable() \
21732 -do { \
21733 - vcpu_info_t *_vcpu; \
21734 - barrier(); \
21735 - _vcpu = current_vcpu_info(); \
21736 - _vcpu->evtchn_upcall_mask = 0; \
21737 - barrier(); /* unmask then check (avoid races) */ \
21738 - if (unlikely(_vcpu->evtchn_upcall_pending)) \
21739 - force_evtchn_callback(); \
21740 -} while (0)
21741 -
21742 -void xen_safe_halt(void);
21743 -
21744 -void xen_halt(void);
21745 -
21746 -/*
21747 - * The use of 'barrier' in the following reflects their use as local-lock
21748 - * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
21749 - * critical operations are executed. All critical operations must complete
21750 - * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
21751 - * includes these barriers, for example.
21752 - */
21753 -
21754 -#define __raw_local_save_flags() xen_save_fl()
21755 -
21756 -#define raw_local_irq_restore(flags) xen_restore_fl(flags)
21757 -
21758 -#define raw_local_irq_disable() xen_irq_disable()
21759 -
21760 -#define raw_local_irq_enable() xen_irq_enable()
21761 -
21762 -/*
21763 - * Used in the idle loop; sti takes one instruction cycle
21764 - * to complete:
21765 - */
21766 -static inline void raw_safe_halt(void)
21767 -{
21768 - xen_safe_halt();
21769 -}
21770 -
21771 -/*
21772 - * Used when interrupts are already enabled or to
21773 - * shutdown the processor:
21774 - */
21775 -static inline void halt(void)
21776 -{
21777 - xen_halt();
21778 -}
21779 -
21780 -/*
21781 - * For spinlocks, etc:
21782 - */
21783 -#define __raw_local_irq_save() \
21784 -({ \
21785 - unsigned long flags = __raw_local_save_flags(); \
21786 - \
21787 - raw_local_irq_disable(); \
21788 - \
21789 - flags; \
21790 -})
21791 -
21792 -#else
21793 -/* Offsets into shared_info_t. */
21794 -#define evtchn_upcall_pending /* 0 */
21795 -#define evtchn_upcall_mask 1
21796 -
21797 -#define sizeof_vcpu_shift 6
21798 -
21799 -#ifdef CONFIG_SMP
21800 -#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
21801 - shl $sizeof_vcpu_shift,%esi ; \
21802 - addl HYPERVISOR_shared_info,%esi
21803 -#else
21804 -#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
21805 -#endif
21806 -
21807 -#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
21808 -#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
21809 -#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
21810 -#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
21811 - __DISABLE_INTERRUPTS
21812 -#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
21813 - __ENABLE_INTERRUPTS
21814 -#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
21815 -sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
21816 - __TEST_PENDING ; \
21817 - jnz 14f /* process more events if necessary... */ ; \
21818 - movl PT_ESI(%esp), %esi ; \
21819 - sysexit ; \
21820 -14: __DISABLE_INTERRUPTS ; \
21821 - TRACE_IRQS_OFF ; \
21822 -sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
21823 - push %esp ; \
21824 - call evtchn_do_upcall ; \
21825 - add $4,%esp ; \
21826 - jmp ret_from_intr
21827 -#define INTERRUPT_RETURN iret
21828 -#endif /* __ASSEMBLY__ */
21829 -
21830 -#ifndef __ASSEMBLY__
21831 -#define raw_local_save_flags(flags) \
21832 - do { (flags) = __raw_local_save_flags(); } while (0)
21833 -
21834 -#define raw_local_irq_save(flags) \
21835 - do { (flags) = __raw_local_irq_save(); } while (0)
21836 -
21837 -static inline int raw_irqs_disabled_flags(unsigned long flags)
21838 -{
21839 - return (flags != 0);
21840 -}
21841 -
21842 -#define raw_irqs_disabled() \
21843 -({ \
21844 - unsigned long flags = __raw_local_save_flags(); \
21845 - \
21846 - raw_irqs_disabled_flags(flags); \
21847 -})
21848 -
21849 -/*
21850 - * makes the traced hardirq state match with the machine state
21851 - *
21852 - * should be a rarely used function, only in places where its
21853 - * otherwise impossible to know the irq state, like in traps.
21854 - */
21855 -static inline void trace_hardirqs_fixup_flags(unsigned long flags)
21856 -{
21857 - if (raw_irqs_disabled_flags(flags))
21858 - trace_hardirqs_off();
21859 - else
21860 - trace_hardirqs_on();
21861 -}
21862 -
21863 -#define trace_hardirqs_fixup() \
21864 - trace_hardirqs_fixup_flags(__raw_local_save_flags())
21865 -#endif /* __ASSEMBLY__ */
21866 -
21867 -/*
21868 - * Do the CPU's IRQ-state tracing from assembly code. We call a
21869 - * C function, so save all the C-clobbered registers:
21870 - */
21871 -#ifdef CONFIG_TRACE_IRQFLAGS
21872 -
21873 -# define TRACE_IRQS_ON \
21874 - pushl %eax; \
21875 - pushl %ecx; \
21876 - pushl %edx; \
21877 - call trace_hardirqs_on; \
21878 - popl %edx; \
21879 - popl %ecx; \
21880 - popl %eax;
21881 -
21882 -# define TRACE_IRQS_OFF \
21883 - pushl %eax; \
21884 - pushl %ecx; \
21885 - pushl %edx; \
21886 - call trace_hardirqs_off; \
21887 - popl %edx; \
21888 - popl %ecx; \
21889 - popl %eax;
21890 -
21891 -#else
21892 -# define TRACE_IRQS_ON
21893 -# define TRACE_IRQS_OFF
21894 -#endif
21895 -
21896 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
21897 -# define LOCKDEP_SYS_EXIT \
21898 - pushl %eax; \
21899 - pushl %ecx; \
21900 - pushl %edx; \
21901 - call lockdep_sys_exit; \
21902 - popl %edx; \
21903 - popl %ecx; \
21904 - popl %eax;
21905 -#else
21906 -# define LOCKDEP_SYS_EXIT
21907 -#endif
21908 -
21909 -#endif
21910 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/irqflags_64.h 2009-02-16 16:18:36.000000000 +0100
21911 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
21912 @@ -1,178 +0,0 @@
21913 -/*
21914 - * include/asm-x86_64/irqflags.h
21915 - *
21916 - * IRQ flags handling
21917 - *
21918 - * This file gets included from lowlevel asm headers too, to provide
21919 - * wrapped versions of the local_irq_*() APIs, based on the
21920 - * raw_local_irq_*() functions from the lowlevel headers.
21921 - */
21922 -#ifndef _ASM_IRQFLAGS_H
21923 -#define _ASM_IRQFLAGS_H
21924 -#include <asm/processor-flags.h>
21925 -
21926 -#ifndef __ASSEMBLY__
21927 -/*
21928 - * Interrupt control:
21929 - */
21930 -
21931 -/*
21932 - * The use of 'barrier' in the following reflects their use as local-lock
21933 - * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
21934 - * critical operations are executed. All critical operations must complete
21935 - * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
21936 - * includes these barriers, for example.
21937 - */
21938 -
21939 -#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
21940 -
21941 -#define raw_local_save_flags(flags) \
21942 - do { (flags) = __raw_local_save_flags(); } while (0)
21943 -
21944 -#define raw_local_irq_restore(x) \
21945 -do { \
21946 - vcpu_info_t *_vcpu; \
21947 - barrier(); \
21948 - _vcpu = current_vcpu_info(); \
21949 - if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
21950 - barrier(); /* unmask then check (avoid races) */ \
21951 - if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
21952 - force_evtchn_callback(); \
21953 - } \
21954 -} while (0)
21955 -
21956 -#ifdef CONFIG_X86_VSMP
21957 -
21958 -/*
21959 - * Interrupt control for the VSMP architecture:
21960 - */
21961 -
21962 -static inline void raw_local_irq_disable(void)
21963 -{
21964 - unsigned long flags = __raw_local_save_flags();
21965 -
21966 - raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
21967 -}
21968 -
21969 -static inline void raw_local_irq_enable(void)
21970 -{
21971 - unsigned long flags = __raw_local_save_flags();
21972 -
21973 - raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
21974 -}
21975 -
21976 -static inline int raw_irqs_disabled_flags(unsigned long flags)
21977 -{
21978 - return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
21979 -}
21980 -
21981 -#else /* CONFIG_X86_VSMP */
21982 -
21983 -#define raw_local_irq_disable() \
21984 -do { \
21985 - current_vcpu_info()->evtchn_upcall_mask = 1; \
21986 - barrier(); \
21987 -} while (0)
21988 -
21989 -#define raw_local_irq_enable() \
21990 -do { \
21991 - vcpu_info_t *_vcpu; \
21992 - barrier(); \
21993 - _vcpu = current_vcpu_info(); \
21994 - _vcpu->evtchn_upcall_mask = 0; \
21995 - barrier(); /* unmask then check (avoid races) */ \
21996 - if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
21997 - force_evtchn_callback(); \
21998 -} while (0)
21999 -
22000 -static inline int raw_irqs_disabled_flags(unsigned long flags)
22001 -{
22002 - return (flags != 0);
22003 -}
22004 -
22005 -#endif
22006 -
22007 -/*
22008 - * For spinlocks, etc.:
22009 - */
22010 -
22011 -#define __raw_local_irq_save() \
22012 -({ \
22013 - unsigned long flags = __raw_local_save_flags(); \
22014 - \
22015 - raw_local_irq_disable(); \
22016 - \
22017 - flags; \
22018 -})
22019 -
22020 -#define raw_local_irq_save(flags) \
22021 - do { (flags) = __raw_local_irq_save(); } while (0)
22022 -
22023 -#define raw_irqs_disabled() \
22024 -({ \
22025 - unsigned long flags = __raw_local_save_flags(); \
22026 - \
22027 - raw_irqs_disabled_flags(flags); \
22028 -})
22029 -
22030 -/*
22031 - * makes the traced hardirq state match with the machine state
22032 - *
22033 - * should be a rarely used function, only in places where its
22034 - * otherwise impossible to know the irq state, like in traps.
22035 - */
22036 -static inline void trace_hardirqs_fixup_flags(unsigned long flags)
22037 -{
22038 - if (raw_irqs_disabled_flags(flags))
22039 - trace_hardirqs_off();
22040 - else
22041 - trace_hardirqs_on();
22042 -}
22043 -
22044 -#define trace_hardirqs_fixup() \
22045 - trace_hardirqs_fixup_flags(__raw_local_save_flags())
22046 -/*
22047 - * Used in the idle loop; sti takes one instruction cycle
22048 - * to complete:
22049 - */
22050 -void xen_safe_halt(void);
22051 -static inline void raw_safe_halt(void)
22052 -{
22053 - xen_safe_halt();
22054 -}
22055 -
22056 -/*
22057 - * Used when interrupts are already enabled or to
22058 - * shutdown the processor:
22059 - */
22060 -void xen_halt(void);
22061 -static inline void halt(void)
22062 -{
22063 - xen_halt();
22064 -}
22065 -
22066 -#else /* __ASSEMBLY__: */
22067 -# ifdef CONFIG_TRACE_IRQFLAGS
22068 -# define TRACE_IRQS_ON call trace_hardirqs_on_thunk
22069 -# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk
22070 -# else
22071 -# define TRACE_IRQS_ON
22072 -# define TRACE_IRQS_OFF
22073 -# endif
22074 -# ifdef CONFIG_DEBUG_LOCK_ALLOC
22075 -# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
22076 -# define LOCKDEP_SYS_EXIT_IRQ \
22077 - TRACE_IRQS_ON; \
22078 - sti; \
22079 - SAVE_REST; \
22080 - LOCKDEP_SYS_EXIT; \
22081 - RESTORE_REST; \
22082 - cli; \
22083 - TRACE_IRQS_OFF;
22084 -# else
22085 -# define LOCKDEP_SYS_EXIT
22086 -# define LOCKDEP_SYS_EXIT_IRQ
22087 -# endif
22088 -#endif
22089 -
22090 -#endif
22091 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/maddr_32.h 2009-02-16 16:17:21.000000000 +0100
22092 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/maddr_32.h 2009-03-16 16:33:40.000000000 +0100
22093 @@ -1,6 +1,7 @@
22094 #ifndef _I386_MADDR_H
22095 #define _I386_MADDR_H
22096
22097 +#include <asm/bug.h>
22098 #include <xen/features.h>
22099 #include <xen/interface/xen.h>
22100
22101 @@ -151,25 +152,9 @@ static inline paddr_t pte_machine_to_phy
22102 phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
22103 return phys;
22104 }
22105 -#endif
22106 -
22107 -#ifdef CONFIG_X86_PAE
22108 -#define __pte_ma(x) ((pte_t) { (x), (maddr_t)(x) >> 32 } )
22109 -extern unsigned long long __supported_pte_mask;
22110 -static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
22111 -{
22112 - pte_t pte;
22113 -
22114 - pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
22115 - (pgprot_val(pgprot) >> 32);
22116 - pte.pte_high &= (__supported_pte_mask >> 32);
22117 - pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
22118 - __supported_pte_mask;
22119 - return pte;
22120 -}
22121 #else
22122 -#define __pte_ma(x) ((pte_t) { (x) } )
22123 -#define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
22124 +#define pte_phys_to_machine phys_to_machine
22125 +#define pte_machine_to_phys machine_to_phys
22126 #endif
22127
22128 #else /* !CONFIG_XEN */
22129 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/maddr_64.h 2009-05-14 10:56:29.000000000 +0200
22130 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/maddr_64.h 2009-03-16 16:33:40.000000000 +0100
22131 @@ -1,6 +1,7 @@
22132 #ifndef _X86_64_MADDR_H
22133 #define _X86_64_MADDR_H
22134
22135 +#include <asm/bug.h>
22136 #include <xen/features.h>
22137 #include <xen/interface/xen.h>
22138
22139 @@ -16,6 +17,7 @@ typedef unsigned long maddr_t;
22140 #ifdef CONFIG_XEN
22141
22142 extern unsigned long *phys_to_machine_mapping;
22143 +extern unsigned long max_mapnr;
22144
22145 #undef machine_to_phys_mapping
22146 extern unsigned long *machine_to_phys_mapping;
22147 @@ -25,7 +27,7 @@ static inline unsigned long pfn_to_mfn(u
22148 {
22149 if (xen_feature(XENFEAT_auto_translated_physmap))
22150 return pfn;
22151 - BUG_ON(end_pfn && pfn >= end_pfn);
22152 + BUG_ON(max_mapnr && pfn >= max_mapnr);
22153 return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
22154 }
22155
22156 @@ -33,7 +35,7 @@ static inline int phys_to_machine_mappin
22157 {
22158 if (xen_feature(XENFEAT_auto_translated_physmap))
22159 return 1;
22160 - BUG_ON(end_pfn && pfn >= end_pfn);
22161 + BUG_ON(max_mapnr && pfn >= max_mapnr);
22162 return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
22163 }
22164
22165 @@ -45,7 +47,7 @@ static inline unsigned long mfn_to_pfn(u
22166 return mfn;
22167
22168 if (unlikely((mfn >> machine_to_phys_order) != 0))
22169 - return end_pfn;
22170 + return max_mapnr;
22171
22172 /* The array access can fail (e.g., device space beyond end of RAM). */
22173 asm (
22174 @@ -60,7 +62,7 @@ static inline unsigned long mfn_to_pfn(u
22175 " .quad 1b,3b\n"
22176 ".previous"
22177 : "=r" (pfn)
22178 - : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) );
22179 + : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
22180
22181 return pfn;
22182 }
22183 @@ -88,16 +90,16 @@ static inline unsigned long mfn_to_pfn(u
22184 static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
22185 {
22186 unsigned long pfn = mfn_to_pfn(mfn);
22187 - if ((pfn < end_pfn)
22188 + if ((pfn < max_mapnr)
22189 && !xen_feature(XENFEAT_auto_translated_physmap)
22190 && (phys_to_machine_mapping[pfn] != mfn))
22191 - return end_pfn; /* force !pfn_valid() */
22192 + return max_mapnr; /* force !pfn_valid() */
22193 return pfn;
22194 }
22195
22196 static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
22197 {
22198 - BUG_ON(end_pfn && pfn >= end_pfn);
22199 + BUG_ON(max_mapnr && pfn >= max_mapnr);
22200 if (xen_feature(XENFEAT_auto_translated_physmap)) {
22201 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
22202 return;
22203 @@ -135,9 +137,6 @@ static inline paddr_t pte_machine_to_phy
22204 return phys;
22205 }
22206
22207 -#define __pte_ma(x) ((pte_t) { (x) } )
22208 -#define pfn_pte_ma(pfn, prot) __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
22209 -
22210 #else /* !CONFIG_XEN */
22211
22212 #define pfn_to_mfn(pfn) (pfn)
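
Aside (illustrative, not part of the generated patch): the maddr_64.h hunk above re-bases the pfn sanity checks on max_mapnr instead of end_pfn. The model below shows the shape of those checks against a tiny fake phys_to_machine_mapping table; the table contents and the FOREIGN_FRAME_BIT value used here are assumptions made for the example, not the real p2m data.

#include <assert.h>
#include <stdio.h>

#define FOREIGN_FRAME_BIT (1ULL << 63)   /* assumed: top bit marks foreign frames */
#define INVALID_P2M_ENTRY (~0ULL)

static unsigned long long p2m[] = { 100, 101, INVALID_P2M_ENTRY, 103 };
static unsigned long max_mapnr = sizeof(p2m) / sizeof(p2m[0]);

static unsigned long long pfn_to_mfn(unsigned long pfn)
{
    assert(pfn < max_mapnr);             /* stands in for the BUG_ON() */
    return p2m[pfn] & ~FOREIGN_FRAME_BIT;
}

static int phys_to_machine_mapping_valid(unsigned long pfn)
{
    assert(pfn < max_mapnr);
    return p2m[pfn] != INVALID_P2M_ENTRY;
}

int main(void)
{
    printf("pfn 1 -> mfn %llu (valid=%d)\n",
           pfn_to_mfn(1), phys_to_machine_mapping_valid(1));
    printf("pfn 2 valid=%d\n", phys_to_machine_mapping_valid(2));
    return 0;
}
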
22213 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-02-16 16:17:21.000000000 +0100
22214 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-03-16 16:33:40.000000000 +0100
22215 @@ -51,8 +51,6 @@ static inline void __prepare_arch_switch
22216 : : "r" (0) );
22217 }
22218
22219 -void leave_mm(unsigned long cpu);
22220 -
22221 static inline void switch_mm(struct mm_struct *prev,
22222 struct mm_struct *next,
22223 struct task_struct *tsk)
22224 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-02-16 16:17:21.000000000 +0100
22225 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-03-16 16:33:40.000000000 +0100
22226 @@ -62,12 +62,6 @@ extern void mm_pin(struct mm_struct *mm)
22227 extern void mm_unpin(struct mm_struct *mm);
22228 void mm_pin_all(void);
22229
22230 -static inline void load_cr3(pgd_t *pgd)
22231 -{
22232 - asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
22233 - "memory");
22234 -}
22235 -
22236 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
22237 struct task_struct *tsk)
22238 {
22239 @@ -97,7 +91,7 @@ static inline void switch_mm(struct mm_s
22240 op++;
22241
22242 if (unlikely(next->context.ldt != prev->context.ldt)) {
22243 - /* load_LDT_nolock(&next->context, cpu) */
22244 + /* load_LDT_nolock(&next->context) */
22245 op->cmd = MMUEXT_SET_LDT;
22246 op->arg1.linear_addr = (unsigned long)next->context.ldt;
22247 op->arg2.nr_ents = next->context.size;
22248 @@ -110,7 +104,7 @@ static inline void switch_mm(struct mm_s
22249 else {
22250 write_pda(mmu_state, TLBSTATE_OK);
22251 if (read_pda(active_mm) != next)
22252 - out_of_line_bug();
22253 + BUG();
22254 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
22255 /* We were in lazy tlb mode and leave_mm disabled
22256 * tlb flush IPI delivery. We must reload CR3
22257 @@ -118,7 +112,7 @@ static inline void switch_mm(struct mm_s
22258 */
22259 load_cr3(next->pgd);
22260 xen_new_user_pt(__pa(__user_pgd(next->pgd)));
22261 - load_LDT_nolock(&next->context, cpu);
22262 + load_LDT_nolock(&next->context);
22263 }
22264 }
22265 #endif
22266 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/page.h 2009-02-16 16:18:36.000000000 +0100
22267 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/page.h 2009-03-16 16:33:40.000000000 +0100
22268 @@ -1,13 +1,231 @@
22269 +#ifndef _ASM_X86_PAGE_H
22270 +#define _ASM_X86_PAGE_H
22271 +
22272 +#include <linux/const.h>
22273 +
22274 +/* PAGE_SHIFT determines the page size */
22275 +#define PAGE_SHIFT 12
22276 +#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
22277 +#define PAGE_MASK (~(PAGE_SIZE-1))
22278 +
22279 #ifdef __KERNEL__
22280 -# ifdef CONFIG_X86_32
22281 -# include "page_32.h"
22282 -# else
22283 -# include "page_64.h"
22284 -# endif
22285 +
22286 +/*
22287 + * Need to repeat this here in order to not include pgtable.h (which in turn
22288 + * depends on definitions made here), but to be able to use the symbolics
22289 + * below. The preprocessor will warn if the two definitions aren't identical.
22290 + */
22291 +#define _PAGE_BIT_PRESENT 0
22292 +#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
22293 +#define _PAGE_BIT_IO 9
22294 +#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
22295 +
22296 +#define PHYSICAL_PAGE_MASK (~(_AT(phys_addr_t, PAGE_SIZE) - 1) & __PHYSICAL_MASK)
22297 +#define PTE_MASK _AT(pteval_t, PHYSICAL_PAGE_MASK)
22298 +
22299 +#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
22300 +#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
22301 +
22302 +#define HPAGE_SHIFT PMD_SHIFT
22303 +#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
22304 +#define HPAGE_MASK (~(HPAGE_SIZE - 1))
22305 +#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
22306 +
22307 +/* to align the pointer to the (next) page boundary */
22308 +#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
22309 +
22310 +#define __PHYSICAL_MASK _AT(phys_addr_t, (_AC(1,ULL) << __PHYSICAL_MASK_SHIFT) - 1)
22311 +#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
22312 +
22313 +#ifndef __ASSEMBLY__
22314 +#include <linux/types.h>
22315 +#endif
22316 +
22317 +#ifdef CONFIG_X86_64
22318 +#include <asm/page_64.h>
22319 +#define max_pfn_mapped end_pfn_map
22320 +#else
22321 +#include <asm/page_32.h>
22322 +#define max_pfn_mapped max_low_pfn
22323 +#endif /* CONFIG_X86_64 */
22324 +
22325 +#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
22326 +
22327 +#define VM_DATA_DEFAULT_FLAGS \
22328 + (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
22329 + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
22330 +
22331 +
22332 +#ifndef __ASSEMBLY__
22333 +
22334 +extern int page_is_ram(unsigned long pagenr);
22335 +
22336 +struct page;
22337 +
22338 +static inline void clear_user_page(void *page, unsigned long vaddr,
22339 + struct page *pg)
22340 +{
22341 + clear_page(page);
22342 +}
22343 +
22344 +static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
22345 + struct page *topage)
22346 +{
22347 + copy_page(to, from);
22348 +}
22349 +
22350 +#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
22351 + alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
22352 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
22353 +
22354 +typedef struct { pgprotval_t pgprot; } pgprot_t;
22355 +
22356 +#define pgprot_val(x) ((x).pgprot)
22357 +#define __pgprot(x) ((pgprot_t) { (x) } )
22358 +
22359 +#include <asm/maddr.h>
22360 +
22361 +typedef struct { pgdval_t pgd; } pgd_t;
22362 +
22363 +#define __pgd_ma(x) ((pgd_t) { (x) } )
22364 +static inline pgd_t xen_make_pgd(pgdval_t val)
22365 +{
22366 + if (val & _PAGE_PRESENT)
22367 + val = pte_phys_to_machine(val);
22368 + return (pgd_t) { val };
22369 +}
22370 +
22371 +#define __pgd_val(x) ((x).pgd)
22372 +static inline pgdval_t xen_pgd_val(pgd_t pgd)
22373 +{
22374 + pgdval_t ret = __pgd_val(pgd);
22375 +#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002
22376 + if (ret)
22377 + ret = machine_to_phys(ret) | _PAGE_PRESENT;
22378 +#else
22379 + if (ret & _PAGE_PRESENT)
22380 + ret = pte_machine_to_phys(ret);
22381 +#endif
22382 + return ret;
22383 +}
22384 +
22385 +#if PAGETABLE_LEVELS >= 3
22386 +#if PAGETABLE_LEVELS == 4
22387 +typedef struct { pudval_t pud; } pud_t;
22388 +
22389 +#define __pud_ma(x) ((pud_t) { (x) } )
22390 +static inline pud_t xen_make_pud(pudval_t val)
22391 +{
22392 + if (val & _PAGE_PRESENT)
22393 + val = pte_phys_to_machine(val);
22394 + return (pud_t) { val };
22395 +}
22396 +
22397 +#define __pud_val(x) ((x).pud)
22398 +static inline pudval_t xen_pud_val(pud_t pud)
22399 +{
22400 + pudval_t ret = __pud_val(pud);
22401 + if (ret & _PAGE_PRESENT)
22402 + ret = pte_machine_to_phys(ret);
22403 + return ret;
22404 +}
22405 +#else /* PAGETABLE_LEVELS == 3 */
22406 +#include <asm-generic/pgtable-nopud.h>
22407 +
22408 +#define __pud_val(x) __pgd_val((x).pgd)
22409 +static inline pudval_t xen_pud_val(pud_t pud)
22410 +{
22411 + return xen_pgd_val(pud.pgd);
22412 +}
22413 +#endif /* PAGETABLE_LEVELS == 4 */
22414 +
22415 +typedef struct { pmdval_t pmd; } pmd_t;
22416 +
22417 +#define __pmd_ma(x) ((pmd_t) { (x) } )
22418 +static inline pmd_t xen_make_pmd(pmdval_t val)
22419 +{
22420 + if (val & _PAGE_PRESENT)
22421 + val = pte_phys_to_machine(val);
22422 + return (pmd_t) { val };
22423 +}
22424 +
22425 +#define __pmd_val(x) ((x).pmd)
22426 +static inline pmdval_t xen_pmd_val(pmd_t pmd)
22427 +{
22428 + pmdval_t ret = __pmd_val(pmd);
22429 +#if CONFIG_XEN_COMPAT <= 0x030002
22430 + if (ret)
22431 + ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
22432 #else
22433 -# ifdef __i386__
22434 -# include "page_32.h"
22435 -# else
22436 -# include "page_64.h"
22437 -# endif
22438 + if (ret & _PAGE_PRESENT)
22439 + ret = pte_machine_to_phys(ret);
22440 +#endif
22441 + return ret;
22442 +}
22443 +#else /* PAGETABLE_LEVELS == 2 */
22444 +#include <asm-generic/pgtable-nopmd.h>
22445 +
22446 +#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } )
22447 +#define __pmd_val(x) __pgd_val((x).pud.pgd)
22448 +static inline pmdval_t xen_pmd_val(pmd_t pmd)
22449 +{
22450 + return xen_pgd_val(pmd.pud.pgd);
22451 +}
22452 +#endif /* PAGETABLE_LEVELS >= 3 */
22453 +
22454 +#define __pte_ma(x) ((pte_t) { .pte = (x) } )
22455 +static inline pte_t xen_make_pte(pteval_t val)
22456 +{
22457 + if ((val & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
22458 + val = pte_phys_to_machine(val);
22459 + return (pte_t) { .pte = val };
22460 +}
22461 +
22462 +#define __pte_val(x) ((x).pte)
22463 +static inline pteval_t xen_pte_val(pte_t pte)
22464 +{
22465 + pteval_t ret = __pte_val(pte);
22466 + if ((pte.pte_low & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
22467 + ret = pte_machine_to_phys(ret);
22468 + return ret;
22469 +}
22470 +
22471 +#define pgd_val(x) xen_pgd_val(x)
22472 +#define __pgd(x) xen_make_pgd(x)
22473 +
22474 +#ifndef __PAGETABLE_PUD_FOLDED
22475 +#define pud_val(x) xen_pud_val(x)
22476 +#define __pud(x) xen_make_pud(x)
22477 +#endif
22478 +
22479 +#ifndef __PAGETABLE_PMD_FOLDED
22480 +#define pmd_val(x) xen_pmd_val(x)
22481 +#define __pmd(x) xen_make_pmd(x)
22482 #endif
22483 +
22484 +#define pte_val(x) xen_pte_val(x)
22485 +#define __pte(x) xen_make_pte(x)
22486 +
22487 +#define __pa(x) __phys_addr((unsigned long)(x))
22488 +/* __pa_symbol should be used for C visible symbols.
22489 + This seems to be the official gcc blessed way to do such arithmetic. */
22490 +#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
22491 +
22492 +#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
22493 +
22494 +#define __boot_va(x) __va(x)
22495 +#define __boot_pa(x) __pa(x)
22496 +
22497 +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
22498 +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
22499 +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
22500 +
22501 +#endif /* __ASSEMBLY__ */
22502 +
22503 +#include <asm-generic/memory_model.h>
22504 +#include <asm-generic/page.h>
22505 +
22506 +#define __HAVE_ARCH_GATE_AREA 1
22507 +
22508 +#endif /* __KERNEL__ */
22509 +#endif /* _ASM_X86_PAGE_H */
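
Aside (illustrative, not part of the generated patch): the new page.h above converts page-table entries between physical and machine form only when _PAGE_PRESENT is set and _PAGE_IO is clear. The standalone model below round-trips a pte through that rule; pte_phys_to_machine()/pte_machine_to_phys() are replaced by a fixed-offset stub, and OFFSET is an arbitrary page-aligned value, not a real p2m/m2p lookup.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t pteval_t;

#define _PAGE_PRESENT (1ULL << 0)
#define _PAGE_IO      (1ULL << 9)
#define OFFSET        (0x1000ULL << 12)  /* assumed page-aligned phys->machine shift */

static pteval_t pte_phys_to_machine(pteval_t v) { return v + OFFSET; }
static pteval_t pte_machine_to_phys(pteval_t v) { return v - OFFSET; }

static pteval_t make_pte(pteval_t val)           /* like xen_make_pte() */
{
    if ((val & (_PAGE_PRESENT | _PAGE_IO)) == _PAGE_PRESENT)
        val = pte_phys_to_machine(val);
    return val;
}

static pteval_t pte_val(pteval_t pte)            /* like xen_pte_val() */
{
    if ((pte & (_PAGE_PRESENT | _PAGE_IO)) == _PAGE_PRESENT)
        pte = pte_machine_to_phys(pte);
    return pte;
}

int main(void)
{
    pteval_t phys = (0x1234ULL << 12) | _PAGE_PRESENT;
    pteval_t machine = make_pte(phys);
    printf("phys %#llx -> machine %#llx -> back %#llx\n",
           (unsigned long long)phys, (unsigned long long)machine,
           (unsigned long long)pte_val(machine));
    return 0;
}
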
22510 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/page_64.h 2009-02-16 16:18:36.000000000 +0100
22511 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/page_64.h 2009-03-16 16:33:40.000000000 +0100
22512 @@ -1,37 +1,9 @@
22513 #ifndef _X86_64_PAGE_H
22514 #define _X86_64_PAGE_H
22515
22516 -/* #include <linux/string.h> */
22517 -#ifndef __ASSEMBLY__
22518 -#include <linux/kernel.h>
22519 -#include <linux/types.h>
22520 -#include <asm/bug.h>
22521 -#endif
22522 -#include <linux/const.h>
22523 -#include <xen/interface/xen.h>
22524 -
22525 -/*
22526 - * Need to repeat this here in order to not include pgtable.h (which in turn
22527 - * depends on definitions made here), but to be able to use the symbolic
22528 - * below. The preprocessor will warn if the two definitions aren't identical.
22529 - */
22530 -#define _PAGE_PRESENT 0x001
22531 -#define _PAGE_IO 0x200
22532 -
22533 -/* PAGE_SHIFT determines the page size */
22534 -#define PAGE_SHIFT 12
22535 -#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
22536 -#define PAGE_MASK (~(PAGE_SIZE-1))
22537 -
22538 -/* See Documentation/x86_64/mm.txt for a description of the memory map. */
22539 -#define __PHYSICAL_MASK_SHIFT 46
22540 -#define __PHYSICAL_MASK ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
22541 -#define __VIRTUAL_MASK_SHIFT 48
22542 -#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
22543 -
22544 -#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
22545 +#define PAGETABLE_LEVELS 4
22546
22547 -#define THREAD_ORDER 1
22548 +#define THREAD_ORDER 1
22549 #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
22550 #define CURRENT_MASK (~(THREAD_SIZE-1))
22551
22552 @@ -51,106 +23,10 @@
22553 #define MCE_STACK 5
22554 #define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
22555
22556 -#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
22557 -#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT)
22558 -
22559 -#define HPAGE_SHIFT PMD_SHIFT
22560 -#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
22561 -#define HPAGE_MASK (~(HPAGE_SIZE - 1))
22562 -#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
22563 -
22564 -#ifdef __KERNEL__
22565 -#ifndef __ASSEMBLY__
22566 -
22567 -extern unsigned long end_pfn;
22568 -
22569 -#include <asm/maddr.h>
22570 -
22571 -void clear_page(void *);
22572 -void copy_page(void *, void *);
22573 -
22574 -#define clear_user_page(page, vaddr, pg) clear_page(page)
22575 -#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
22576 -
22577 -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
22578 - alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
22579 -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
22580 -
22581 -/*
22582 - * These are used to make use of C type-checking..
22583 - */
22584 -typedef struct { unsigned long pte; } pte_t;
22585 -typedef struct { unsigned long pmd; } pmd_t;
22586 -typedef struct { unsigned long pud; } pud_t;
22587 -typedef struct { unsigned long pgd; } pgd_t;
22588 -#define PTE_MASK PHYSICAL_PAGE_MASK
22589 -
22590 -typedef struct { unsigned long pgprot; } pgprot_t;
22591 -
22592 -#define __pte_val(x) ((x).pte)
22593 -#define pte_val(x) ((__pte_val(x) & (_PAGE_PRESENT|_PAGE_IO)) \
22594 - == _PAGE_PRESENT ? \
22595 - pte_machine_to_phys(__pte_val(x)) : \
22596 - __pte_val(x))
22597 -
22598 -#define __pmd_val(x) ((x).pmd)
22599 -static inline unsigned long pmd_val(pmd_t x)
22600 -{
22601 - unsigned long ret = __pmd_val(x);
22602 -#if CONFIG_XEN_COMPAT <= 0x030002
22603 - if (ret) ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
22604 -#else
22605 - if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22606 -#endif
22607 - return ret;
22608 -}
22609 -
22610 -#define __pud_val(x) ((x).pud)
22611 -static inline unsigned long pud_val(pud_t x)
22612 -{
22613 - unsigned long ret = __pud_val(x);
22614 - if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22615 - return ret;
22616 -}
22617 -
22618 -#define __pgd_val(x) ((x).pgd)
22619 -static inline unsigned long pgd_val(pgd_t x)
22620 -{
22621 - unsigned long ret = __pgd_val(x);
22622 - if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22623 - return ret;
22624 -}
22625 -
22626 -#define pgprot_val(x) ((x).pgprot)
22627 -
22628 -static inline pte_t __pte(unsigned long x)
22629 -{
22630 - if ((x & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
22631 - x = pte_phys_to_machine(x);
22632 - return ((pte_t) { (x) });
22633 -}
22634 -
22635 -static inline pmd_t __pmd(unsigned long x)
22636 -{
22637 - if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22638 - return ((pmd_t) { (x) });
22639 -}
22640 -
22641 -static inline pud_t __pud(unsigned long x)
22642 -{
22643 - if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22644 - return ((pud_t) { (x) });
22645 -}
22646 -
22647 -static inline pgd_t __pgd(unsigned long x)
22648 -{
22649 - if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22650 - return ((pgd_t) { (x) });
22651 -}
22652 -
22653 -#define __pgprot(x) ((pgprot_t) { (x) } )
22654 +#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
22655 +#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
22656
22657 -#endif /* !__ASSEMBLY__ */
22658 +#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
22659
22660 #define __PHYSICAL_START CONFIG_PHYSICAL_START
22661 #define __KERNEL_ALIGN 0x200000
22662 @@ -166,52 +42,58 @@ static inline pgd_t __pgd(unsigned long
22663
22664 #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
22665 #define __START_KERNEL_map _AC(0xffffffff80000000, UL)
22666 -#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
22667
22668 #if CONFIG_XEN_COMPAT <= 0x030002
22669 #undef LOAD_OFFSET
22670 #define LOAD_OFFSET 0
22671 #endif
22672
22673 -/* to align the pointer to the (next) page boundary */
22674 -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
22675 -
22676 -#define KERNEL_TEXT_SIZE (40*1024*1024)
22677 -#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL)
22678 +/* See Documentation/x86_64/mm.txt for a description of the memory map. */
22679 +#define __PHYSICAL_MASK_SHIFT 46
22680 +#define __VIRTUAL_MASK_SHIFT 48
22681
22682 -#define PAGE_OFFSET __PAGE_OFFSET
22683 +/*
22684 + * Kernel image size is limited to 128 MB (see level2_kernel_pgt in
22685 + * arch/x86/kernel/head_64.S), and it is mapped here:
22686 + */
22687 +#define KERNEL_IMAGE_SIZE (128*1024*1024)
22688 +#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
22689
22690 #ifndef __ASSEMBLY__
22691 +void clear_page(void *page);
22692 +void copy_page(void *to, void *from);
22693 +
22694 +extern unsigned long end_pfn;
22695 +extern unsigned long end_pfn_map;
22696 +
22697 static inline unsigned long __phys_addr(unsigned long x)
22698 {
22699 - return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : PAGE_OFFSET);
22700 + return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : __PAGE_OFFSET);
22701 }
22702 -#endif
22703
22704 -#define __pa(x) __phys_addr((unsigned long)(x))
22705 -#define __pa_symbol(x) __phys_addr((unsigned long)(x))
22706 +#define __phys_reloc_hide(x) (x)
22707
22708 -#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
22709 -#define __boot_va(x) __va(x)
22710 -#define __boot_pa(x) __pa(x)
22711 -#ifdef CONFIG_FLATMEM
22712 -#define pfn_valid(pfn) ((pfn) < end_pfn)
22713 -#endif
22714 +/*
22715 + * These are used to make use of C type-checking..
22716 + */
22717 +typedef unsigned long pteval_t;
22718 +typedef unsigned long pmdval_t;
22719 +typedef unsigned long pudval_t;
22720 +typedef unsigned long pgdval_t;
22721 +typedef unsigned long pgprotval_t;
22722 +typedef unsigned long phys_addr_t;
22723
22724 -#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
22725 -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
22726 -#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
22727 -
22728 -#define VM_DATA_DEFAULT_FLAGS \
22729 - (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
22730 - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
22731 +typedef struct page *pgtable_t;
22732 +
22733 +typedef union { pteval_t pte; unsigned int pte_low; } pte_t;
22734
22735 -#define __HAVE_ARCH_GATE_AREA 1
22736 #define vmemmap ((struct page *)VMEMMAP_START)
22737
22738 -#include <asm-generic/memory_model.h>
22739 -#include <asm-generic/page.h>
22740 +#endif /* !__ASSEMBLY__ */
22741 +
22742 +#ifdef CONFIG_FLATMEM
22743 +#define pfn_valid(pfn) ((pfn) < max_mapnr)
22744 +#endif
22745
22746 -#endif /* __KERNEL__ */
22747
22748 #endif /* _X86_64_PAGE_H */
22749 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pci.h 2009-02-16 16:18:36.000000000 +0100
22750 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pci.h 2009-03-16 16:33:40.000000000 +0100
22751 @@ -71,6 +71,7 @@ extern int pci_mmap_page_range(struct pc
22752
22753
22754 #ifdef CONFIG_PCI
22755 +extern void early_quirks(void);
22756 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
22757 enum pci_dma_burst_strategy *strat,
22758 unsigned long *strategy_parameter)
22759 @@ -78,9 +79,10 @@ static inline void pci_dma_burst_advice(
22760 *strat = PCI_DMA_BURST_INFINITY;
22761 *strategy_parameter = ~0UL;
22762 }
22763 +#else
22764 +static inline void early_quirks(void) { }
22765 #endif
22766
22767 -
22768 #endif /* __KERNEL__ */
22769
22770 #ifdef CONFIG_X86_32
22771 @@ -95,6 +97,19 @@ static inline void pci_dma_burst_advice(
22772 /* generic pci stuff */
22773 #include <asm-generic/pci.h>
22774
22775 +#ifdef CONFIG_NUMA
22776 +/* Returns the node based on pci bus */
22777 +static inline int __pcibus_to_node(struct pci_bus *bus)
22778 +{
22779 + struct pci_sysdata *sd = bus->sysdata;
22780 +
22781 + return sd->node;
22782 +}
22783
22784 +static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
22785 +{
22786 + return node_to_cpumask(__pcibus_to_node(bus));
22787 +}
22788 +#endif
22789
22790 #endif
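
Aside (illustrative, not part of the generated patch): the NUMA helper added to pci.h above simply dereferences the node stored in the bus's pci_sysdata. The reduced structs below keep only that field and are not the kernel definitions.

#include <stdio.h>

struct pci_sysdata { int node; };
struct pci_bus    { void *sysdata; };

static int pcibus_to_node(struct pci_bus *bus)
{
    struct pci_sysdata *sd = bus->sysdata;
    return sd->node;
}

int main(void)
{
    struct pci_sysdata sd = { .node = 1 };
    struct pci_bus bus = { .sysdata = &sd };
    printf("bus is on NUMA node %d\n", pcibus_to_node(&bus));
    return 0;
}
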
22791 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgalloc_32.h 2009-02-16 16:17:21.000000000 +0100
22792 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pgalloc_32.h 2009-03-16 16:33:40.000000000 +0100
22793 @@ -3,69 +3,109 @@
22794
22795 #include <linux/threads.h>
22796 #include <linux/mm.h> /* for struct page */
22797 +#include <linux/pagemap.h>
22798 +#include <asm/tlb.h>
22799 +#include <asm-generic/tlb.h>
22800 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
22801
22802 #define paravirt_alloc_pt(mm, pfn) do { } while (0)
22803 -#define paravirt_alloc_pd(pfn) do { } while (0)
22804 -#define paravirt_alloc_pd(pfn) do { } while (0)
22805 +#define paravirt_alloc_pd(mm, pfn) do { } while (0)
22806 #define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
22807 #define paravirt_release_pt(pfn) do { } while (0)
22808 #define paravirt_release_pd(pfn) do { } while (0)
22809
22810 -#define pmd_populate_kernel(mm, pmd, pte) \
22811 -do { \
22812 - paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT); \
22813 - set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \
22814 -} while (0)
22815 -
22816 -#define pmd_populate(mm, pmd, pte) \
22817 -do { \
22818 - unsigned long pfn = page_to_pfn(pte); \
22819 - paravirt_alloc_pt(mm, pfn); \
22820 - if (PagePinned(virt_to_page((mm)->pgd))) { \
22821 - if (!PageHighMem(pte)) \
22822 - BUG_ON(HYPERVISOR_update_va_mapping( \
22823 - (unsigned long)__va(pfn << PAGE_SHIFT), \
22824 - pfn_pte(pfn, PAGE_KERNEL_RO), 0)); \
22825 - else if (!test_and_set_bit(PG_pinned, &pte->flags)) \
22826 - kmap_flush_unused(); \
22827 - set_pmd(pmd, \
22828 - __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT))); \
22829 - } else \
22830 - *(pmd) = __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT)); \
22831 -} while (0)
22832 +static inline void pmd_populate_kernel(struct mm_struct *mm,
22833 + pmd_t *pmd, pte_t *pte)
22834 +{
22835 + paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
22836 + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
22837 +}
22838 +
22839 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
22840 +{
22841 + unsigned long pfn = page_to_pfn(pte);
22842 +
22843 + paravirt_alloc_pt(mm, pfn);
22844 + if (PagePinned(virt_to_page(mm->pgd))) {
22845 + if (!PageHighMem(pte))
22846 + BUG_ON(HYPERVISOR_update_va_mapping(
22847 + (unsigned long)__va(pfn << PAGE_SHIFT),
22848 + pfn_pte(pfn, PAGE_KERNEL_RO), 0));
22849 + else if (!test_and_set_bit(PG_pinned, &pte->flags))
22850 + kmap_flush_unused();
22851 + set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
22852 + } else
22853 + *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
22854 +}
22855 +#define pmd_pgtable(pmd) pmd_page(pmd)
22856
22857 /*
22858 * Allocate and free page tables.
22859 */
22860 +extern void pgd_test_and_unpin(pgd_t *);
22861 extern pgd_t *pgd_alloc(struct mm_struct *);
22862 -extern void pgd_free(pgd_t *pgd);
22863 +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
22864
22865 extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
22866 -extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
22867 +extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
22868
22869 -static inline void pte_free_kernel(pte_t *pte)
22870 +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
22871 {
22872 make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
22873 free_page((unsigned long)pte);
22874 }
22875
22876 -extern void pte_free(struct page *pte);
22877 +extern void __pte_free(pgtable_t);
22878 +static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
22879 +{
22880 + __pte_free(pte);
22881 +}
22882 +
22883
22884 -#define __pte_free_tlb(tlb,pte) \
22885 -do { \
22886 - paravirt_release_pt(page_to_pfn(pte)); \
22887 - tlb_remove_page((tlb),(pte)); \
22888 -} while (0)
22889 +extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
22890
22891 #ifdef CONFIG_X86_PAE
22892 /*
22893 * In the PAE case we free the pmds as part of the pgd.
22894 */
22895 -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); })
22896 -#define pmd_free(x) do { } while (0)
22897 -#define __pmd_free_tlb(tlb,x) do { } while (0)
22898 -#define pud_populate(mm, pmd, pte) BUG()
22899 -#endif
22900 +extern pmd_t *pmd_alloc_one(struct mm_struct *, unsigned long);
22901 +
22902 +extern void __pmd_free(pgtable_t);
22903 +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
22904 +{
22905 + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
22906 + __pmd_free(virt_to_page(pmd));
22907 +}
22908 +
22909 +extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
22910 +
22911 +static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
22912 +{
22913 + struct page *page = virt_to_page(pmd);
22914 + unsigned long pfn = page_to_pfn(page);
22915 +
22916 + paravirt_alloc_pd(mm, pfn);
22917 +
22918 + /* Note: almost everything apart from _PAGE_PRESENT is
22919 + reserved at the pmd (PDPT) level. */
22920 + if (PagePinned(virt_to_page(mm->pgd))) {
22921 + BUG_ON(PageHighMem(page));
22922 + BUG_ON(HYPERVISOR_update_va_mapping(
22923 + (unsigned long)__va(pfn << PAGE_SHIFT),
22924 + pfn_pte(pfn, PAGE_KERNEL_RO), 0));
22925 + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
22926 + } else
22927 + *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
22928 +
22929 + /*
22930 + * According to Intel App note "TLBs, Paging-Structure Caches,
22931 + * and Their Invalidation", April 2007, document 317080-001,
22932 + * section 8.1: in PAE mode we explicitly have to flush the
22933 + * TLB via cr3 if the top-level pgd is changed...
22934 + */
22935 + if (mm == current->active_mm)
22936 + xen_tlb_flush();
22937 +}
22938 +#endif /* CONFIG_X86_PAE */
22939
22940 #endif /* _I386_PGALLOC_H */
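
In the rewritten pmd_populate() above, the pfn is widened with (pmdval_t)pfn before the shift. Under PAE the entry is 64 bits wide, so shifting a 32-bit pfn left by PAGE_SHIFT without widening would silently drop physical-address bits above 4 GiB. A standalone sketch of the difference, using made-up demo values rather than anything from this tree:

/* build: cc -std=c99 -Wall widen_sketch.c && ./a.out */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT  12
#define _PAGE_TABLE 0x067u      /* PRESENT|RW|USER|ACCESSED|DIRTY */

typedef uint64_t pmdval_t;      /* PAE: page-table entries are 64 bits wide */

int main(void)
{
    uint32_t pfn = 0x123456;    /* frame above the 4 GiB mark: 0x123456000 */

    /* Wrong: the shift is evaluated in 32 bits, the high bits are lost. */
    pmdval_t bad  = (pmdval_t)(pfn << PAGE_SHIFT) | _PAGE_TABLE;

    /* Right: widen first, which is what (pmdval_t)pfn << PAGE_SHIFT does. */
    pmdval_t good = ((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE;

    printf("truncated entry: %#llx\n", (unsigned long long)bad);
    printf("correct entry:   %#llx\n", (unsigned long long)good);
    return 0;
}
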
22941 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgalloc_64.h 2009-02-16 16:18:36.000000000 +0100
22942 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pgalloc_64.h 2009-03-16 16:33:40.000000000 +0100
22943 @@ -6,30 +6,13 @@
22944 #include <linux/mm.h>
22945 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
22946
22947 -#include <xen/features.h>
22948 -void make_page_readonly(void *va, unsigned int feature);
22949 -void make_page_writable(void *va, unsigned int feature);
22950 -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
22951 -void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
22952 +pmd_t *early_get_pmd(unsigned long va);
22953 +void early_make_page_readonly(void *va, unsigned int feature);
22954
22955 #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
22956
22957 -static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
22958 -{
22959 - set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
22960 -}
22961 -
22962 -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
22963 -{
22964 - if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
22965 - BUG_ON(HYPERVISOR_update_va_mapping(
22966 - (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
22967 - pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
22968 - set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
22969 - } else {
22970 - *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
22971 - }
22972 -}
22973 +#define pmd_populate_kernel(mm, pmd, pte) \
22974 + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
22975
22976 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
22977 {
22978 @@ -63,53 +46,58 @@ static inline void pgd_populate(struct m
22979 }
22980 }
22981
22982 -extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr);
22983 -extern void pte_free(struct page *pte);
22984 +#define pmd_pgtable(pmd) pmd_page(pmd)
22985
22986 -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
22987 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
22988 {
22989 - struct page *pg;
22990 -
22991 - pg = pte_alloc_one(mm, addr);
22992 - return pg ? page_address(pg) : NULL;
22993 + if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
22994 + BUG_ON(HYPERVISOR_update_va_mapping(
22995 + (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
22996 + pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
22997 + set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
22998 + } else {
22999 + *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
23000 + }
23001 }
23002
23003 -static inline void pmd_free(pmd_t *pmd)
23004 +extern void __pmd_free(pgtable_t);
23005 +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
23006 {
23007 BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
23008 - pte_free(virt_to_page(pmd));
23009 + __pmd_free(virt_to_page(pmd));
23010 }
23011
23012 +extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
23013 +
23014 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
23015 {
23016 - struct page *pg;
23017 -
23018 - pg = pte_alloc_one(mm, addr);
23019 - return pg ? page_address(pg) : NULL;
23020 + return (pud_t *)pmd_alloc_one(mm, addr);
23021 }
23022
23023 -static inline void pud_free(pud_t *pud)
23024 +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
23025 {
23026 BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
23027 - pte_free(virt_to_page(pud));
23028 + __pmd_free(virt_to_page(pud));
23029 }
23030
23031 static inline void pgd_list_add(pgd_t *pgd)
23032 {
23033 struct page *page = virt_to_page(pgd);
23034 + unsigned long flags;
23035
23036 - spin_lock(&pgd_lock);
23037 + spin_lock_irqsave(&pgd_lock, flags);
23038 list_add(&page->lru, &pgd_list);
23039 - spin_unlock(&pgd_lock);
23040 + spin_unlock_irqrestore(&pgd_lock, flags);
23041 }
23042
23043 static inline void pgd_list_del(pgd_t *pgd)
23044 {
23045 struct page *page = virt_to_page(pgd);
23046 + unsigned long flags;
23047
23048 - spin_lock(&pgd_lock);
23049 + spin_lock_irqsave(&pgd_lock, flags);
23050 list_del(&page->lru);
23051 - spin_unlock(&pgd_lock);
23052 + spin_unlock_irqrestore(&pgd_lock, flags);
23053 }
23054
23055 extern void pgd_test_and_unpin(pgd_t *);
23056 @@ -145,7 +133,7 @@ static inline pgd_t *pgd_alloc(struct mm
23057 return pgd;
23058 }
23059
23060 -static inline void pgd_free(pgd_t *pgd)
23061 +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
23062 {
23063 pgd_test_and_unpin(pgd);
23064 pgd_list_del(pgd);
23065 @@ -161,17 +149,30 @@ static inline pte_t *pte_alloc_one_kerne
23066 return pte;
23067 }
23068
23069 +extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
23070 +
23071 /* Should really implement gc for free page table pages. This could be
23072 done with a reference count in struct page. */
23073
23074 -static inline void pte_free_kernel(pte_t *pte)
23075 +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
23076 {
23077 BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
23078 make_page_writable(pte, XENFEAT_writable_page_tables);
23079 free_page((unsigned long)pte);
23080 }
23081
23082 -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
23083 +extern void __pte_free(pgtable_t);
23084 +static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
23085 +{
23086 + __pte_free(pte);
23087 +}
23088 +
23089 +#define __pte_free_tlb(tlb,pte) \
23090 +do { \
23091 + pgtable_page_dtor((pte)); \
23092 + tlb_remove_page((tlb), (pte)); \
23093 +} while (0)
23094 +
23095 #define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
23096 #define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
23097
23098 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgtable.h 2009-02-16 16:18:36.000000000 +0100
23099 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pgtable.h 2009-03-16 16:33:40.000000000 +0100
23100 @@ -1,5 +1,467 @@
23101 +#ifndef _ASM_X86_PGTABLE_H
23102 +#define _ASM_X86_PGTABLE_H
23103 +
23104 +#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
23105 +#define FIRST_USER_ADDRESS 0
23106 +
23107 +#define _PAGE_BIT_PRESENT 0
23108 +#define _PAGE_BIT_RW 1
23109 +#define _PAGE_BIT_USER 2
23110 +#define _PAGE_BIT_PWT 3
23111 +#define _PAGE_BIT_PCD 4
23112 +#define _PAGE_BIT_ACCESSED 5
23113 +#define _PAGE_BIT_DIRTY 6
23114 +#define _PAGE_BIT_FILE 6
23115 +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
23116 +#define _PAGE_BIT_PAT 7 /* on 4KB pages */
23117 +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
23118 +#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and
23119 + * has no associated page struct. */
23120 +#define _PAGE_BIT_UNUSED2 10 /* available for programmer */
23121 +#define _PAGE_BIT_UNUSED3 11
23122 +#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23123 +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
23124 +
23125 +/*
23126 + * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
23127 + * sign-extended value on 32-bit with all 1's in the upper word,
23128 + * which preserves the upper pte values on 64-bit ptes:
23129 + */
23130 +#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
23131 +#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
23132 +#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
23133 +#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
23134 +#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
23135 +#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
23136 +#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
23137 +#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
23138 +#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
23139 +#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
23140 +#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
23141 +#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
23142 +#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT)
23143 +#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
23144 +
23145 +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
23146 +#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
23147 +#else
23148 +#define _PAGE_NX 0
23149 +#endif
23150 +
23151 +/* If _PAGE_PRESENT is clear, we use these: */
23152 +#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, saved PTE; unset:swap */
23153 +#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
23154 + pte_present gives true */
23155 +
23156 +#ifndef __ASSEMBLY__
23157 +#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
23158 +extern unsigned int __kernel_page_user;
23159 +#else
23160 +#define __kernel_page_user 0
23161 +#endif
23162 +#endif
23163 +
23164 +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
23165 +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
23166 +
23167 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
23168 +
23169 +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
23170 +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23171 +
23172 +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
23173 +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23174 +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23175 +#define PAGE_COPY PAGE_COPY_NOEXEC
23176 +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23177 +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23178 +
23179 +#ifdef CONFIG_X86_32
23180 +#define _PAGE_KERNEL_EXEC \
23181 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
23182 +#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
23183 +
23184 +#ifndef __ASSEMBLY__
23185 +extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
23186 +#endif /* __ASSEMBLY__ */
23187 +#else
23188 +#define __PAGE_KERNEL_EXEC \
23189 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
23190 +#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
23191 +#endif
23192 +
23193 +#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
23194 +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
23195 +#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
23196 +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
23197 +#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
23198 +#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
23199 +#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
23200 +#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
23201 +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
23202 +
23203 +/*
23204 + * We don't support GLOBAL page in xenolinux64
23205 + */
23206 +#define MAKE_GLOBAL(x) __pgprot((x))
23207 +
23208 +#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
23209 +#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
23210 +#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
23211 +#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
23212 +#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
23213 +#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
23214 +#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
23215 +#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
23216 +#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
23217 +#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
23218 +#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
23219 +
23220 +/* xwr */
23221 +#define __P000 PAGE_NONE
23222 +#define __P001 PAGE_READONLY
23223 +#define __P010 PAGE_COPY
23224 +#define __P011 PAGE_COPY
23225 +#define __P100 PAGE_READONLY_EXEC
23226 +#define __P101 PAGE_READONLY_EXEC
23227 +#define __P110 PAGE_COPY_EXEC
23228 +#define __P111 PAGE_COPY_EXEC
23229 +
23230 +#define __S000 PAGE_NONE
23231 +#define __S001 PAGE_READONLY
23232 +#define __S010 PAGE_SHARED
23233 +#define __S011 PAGE_SHARED
23234 +#define __S100 PAGE_READONLY_EXEC
23235 +#define __S101 PAGE_READONLY_EXEC
23236 +#define __S110 PAGE_SHARED_EXEC
23237 +#define __S111 PAGE_SHARED_EXEC
23238 +
23239 +#ifndef __ASSEMBLY__
23240 +
23241 +/*
23242 + * ZERO_PAGE is a global shared page that is always zero: used
23243 + * for zero-mapped memory areas etc..
23244 + */
23245 +extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
23246 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
23247 +
23248 +extern spinlock_t pgd_lock;
23249 +extern struct list_head pgd_list;
23250 +
23251 +/*
23252 + * The following only work if pte_present() is true.
23253 + * Undefined behaviour if not..
23254 + */
23255 +static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
23256 +static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
23257 +static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
23258 +static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
23259 +static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
23260 +static inline int pte_global(pte_t pte) { return 0; }
23261 +static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); }
23262 +
23263 +static inline int pmd_large(pmd_t pte) {
23264 + return (__pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
23265 + (_PAGE_PSE|_PAGE_PRESENT);
23266 +}
23267 +
23268 +static inline pte_t pte_mkclean(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); }
23269 +static inline pte_t pte_mkold(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); }
23270 +static inline pte_t pte_wrprotect(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); }
23271 +static inline pte_t pte_mkexec(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); }
23272 +static inline pte_t pte_mkdirty(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); }
23273 +static inline pte_t pte_mkyoung(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); }
23274 +static inline pte_t pte_mkwrite(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_RW); }
23275 +static inline pte_t pte_mkhuge(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_PSE); }
23276 +static inline pte_t pte_clrhuge(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); }
23277 +static inline pte_t pte_mkglobal(pte_t pte) { return pte; }
23278 +static inline pte_t pte_clrglobal(pte_t pte) { return pte; }
23279 +
23280 +extern pteval_t __supported_pte_mask;
23281 +
23282 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
23283 +{
23284 + return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
23285 + pgprot_val(pgprot)) & __supported_pte_mask);
23286 +}
23287 +
23288 +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
23289 +{
23290 + return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) |
23291 + pgprot_val(pgprot)) & __supported_pte_mask);
23292 +}
23293 +
23294 +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
23295 +{
23296 + return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
23297 + pgprot_val(pgprot)) & __supported_pte_mask);
23298 +}
23299 +
23300 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
23301 +{
23302 + pteval_t val = pte_val(pte);
23303 +
23304 + val &= _PAGE_CHG_MASK;
23305 + val |= pgprot_val(newprot) & __supported_pte_mask;
23306 +
23307 + return __pte(val);
23308 +}
23309 +
23310 +#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX))
23311 +
23312 +#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
23313 +
23314 +#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
23315 +#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
23316 +
23317 +#define set_pte_atomic(ptep, pte) \
23318 + xen_set_pte_atomic(ptep, pte)
23319 +
23320 +#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
23321 +
23322 +#ifndef __PAGETABLE_PUD_FOLDED
23323 +#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd)
23324 +#define pgd_clear(pgd) xen_pgd_clear(pgd)
23325 +#endif
23326 +
23327 +#ifndef set_pud
23328 +# define set_pud(pudp, pud) xen_set_pud(pudp, pud)
23329 +#endif
23330 +
23331 +#ifndef __PAGETABLE_PMD_FOLDED
23332 +#define pud_clear(pud) xen_pud_clear(pud)
23333 +#endif
23334 +
23335 +#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
23336 +#define pmd_clear(pmd) xen_pmd_clear(pmd)
23337 +
23338 +#define pte_update(mm, addr, ptep) do { } while (0)
23339 +#define pte_update_defer(mm, addr, ptep) do { } while (0)
23340 +
23341 +#endif /* __ASSEMBLY__ */
23342 +
23343 #ifdef CONFIG_X86_32
23344 # include "pgtable_32.h"
23345 #else
23346 # include "pgtable_64.h"
23347 #endif
23348 +
23349 +#ifndef __ASSEMBLY__
23350 +
23351 +enum {
23352 + PG_LEVEL_NONE,
23353 + PG_LEVEL_4K,
23354 + PG_LEVEL_2M,
23355 + PG_LEVEL_1G,
23356 +};
23357 +
23358 +/*
23359 + * Helper function that returns the kernel pagetable entry controlling
23360 + * the virtual address 'address'. NULL means no pagetable entry present.
23361 + * NOTE: the return type is pte_t but if the pmd is PSE then we return it
23362 + * as a pte too.
23363 + */
23364 +extern pte_t *lookup_address(unsigned long address, unsigned int *level);
23365 +
23366 +/* local pte updates need not use xchg for locking */
23367 +static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
23368 +{
23369 + xen_set_pte(ptep, __pte(0));
23370 + return res;
23371 +}
23372 +
23373 +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
23374 + pte_t *ptep , pte_t pte)
23375 +{
23376 + if ((mm != current->mm && mm != &init_mm) ||
23377 + HYPERVISOR_update_va_mapping(addr, pte, 0))
23378 + xen_set_pte(ptep, pte);
23379 +}
23380 +
23381 +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr,
23382 + pte_t *ptep)
23383 +{
23384 + if ((mm != current->mm && mm != &init_mm)
23385 + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
23386 + __xen_pte_clear(ptep);
23387 +}
23388 +
23389 +#ifndef CONFIG_PARAVIRT
23390 +/*
23391 + * Rules for using pte_update - it must be called after any PTE update which
23392 + * has not been done using the set_pte / clear_pte interfaces. It is used by
23393 + * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
23394 + * updates should either be sets, clears, or set_pte_atomic for P->P
23395 + * transitions, which means this hook should only be called for user PTEs.
23396 + * This hook implies a P->P protection or access change has taken place, which
23397 + * requires a subsequent TLB flush. The notification can optionally be delayed
23398 + * until the TLB flush event by using the pte_update_defer form of the
23399 + * interface, but care must be taken to assure that the flush happens while
23400 + * still holding the same page table lock so that the shadow and primary pages
23401 + * do not become out of sync on SMP.
23402 + */
23403 +#define pte_update(mm, addr, ptep) do { } while (0)
23404 +#define pte_update_defer(mm, addr, ptep) do { } while (0)
23405 +#endif
23406 +
23407 +/*
23408 + * We only update the dirty/accessed state if we set
23409 + * the dirty bit by hand in the kernel, since the hardware
23410 + * will do the accessed bit for us, and we don't want to
23411 + * race with other CPU's that might be updating the dirty
23412 + * bit at the same time.
23413 + */
23414 +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
23415 +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
23416 +({ \
23417 + int __changed = !pte_same(*(ptep), entry); \
23418 + if (__changed && (dirty)) { \
23419 + if ( likely((vma)->vm_mm == current->mm) ) { \
23420 + BUG_ON(HYPERVISOR_update_va_mapping(address, \
23421 + entry, \
23422 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
23423 + UVMF_INVLPG|UVMF_MULTI)); \
23424 + } else { \
23425 + xen_l1_entry_update(ptep, entry); \
23426 + flush_tlb_page(vma, address); \
23427 + } \
23428 + } \
23429 + __changed; \
23430 +})
23431 +
23432 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
23433 +#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
23434 + int __ret = 0; \
23435 + if (pte_young(*(ptep))) \
23436 + __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
23437 + &(ptep)->pte); \
23438 + if (__ret) \
23439 + pte_update((vma)->vm_mm, addr, ptep); \
23440 + __ret; \
23441 +})
23442 +
23443 +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
23444 +#define ptep_clear_flush_young(vma, address, ptep) \
23445 +({ \
23446 + pte_t __pte = *(ptep); \
23447 + int __young = pte_young(__pte); \
23448 + __pte = pte_mkold(__pte); \
23449 + if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
23450 + (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
23451 + else if (__young) \
23452 + (ptep)->pte_low = __pte.pte_low; \
23453 + __young; \
23454 +})
23455 +
23456 +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
23457 +#define ptep_clear_flush(vma, addr, ptep) \
23458 +({ \
23459 + pte_t *__ptep = (ptep); \
23460 + pte_t __res = *__ptep; \
23461 + if (!pte_none(__res) && \
23462 + ((vma)->vm_mm != current->mm || \
23463 + HYPERVISOR_update_va_mapping(addr, __pte(0), \
23464 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
23465 + UVMF_INVLPG|UVMF_MULTI))) { \
23466 + __xen_pte_clear(__ptep); \
23467 + flush_tlb_page(vma, addr); \
23468 + } \
23469 + __res; \
23470 +})
23471 +
23472 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
23473 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23474 +{
23475 + pte_t pte = *ptep;
23476 + if (!pte_none(pte)
23477 + && (mm != &init_mm
23478 + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
23479 + pte = xen_ptep_get_and_clear(ptep, pte);
23480 + pte_update(mm, addr, ptep);
23481 + }
23482 + return pte;
23483 +}
23484 +
23485 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
23486 +#define ptep_get_and_clear_full(mm, addr, ptep, full) \
23487 + ((full) ? ({ \
23488 + pte_t *__ptep = (ptep); \
23489 + pte_t __res = *__ptep; \
23490 + if (!PagePinned(virt_to_page((mm)->pgd))) \
23491 + __xen_pte_clear(__ptep); \
23492 + else if (!pte_none(__res)) \
23493 + xen_l1_entry_update(__ptep, __pte(0)); \
23494 + __res; \
23495 + }) : \
23496 + ptep_get_and_clear(mm, addr, ptep))
23497 +
23498 +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
23499 +
23500 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
23501 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23502 +{
23503 + pte_t pte = *ptep;
23504 + if (pte_write(pte))
23505 + set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
23506 +}
23507 +
23508 +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
23509 + xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
23510 +
23511 +#define arbitrary_virt_to_machine(va) \
23512 +({ \
23513 + unsigned int __lvl; \
23514 + pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl); \
23515 + BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\
23516 + (((maddr_t)pte_mfn(*__ptep) << PAGE_SHIFT) \
23517 + | ((unsigned long)(va) & (PAGE_SIZE - 1))); \
23518 +})
23519 +
23520 +#ifdef CONFIG_HIGHPTE
23521 +#include <asm/io.h>
23522 +struct page *kmap_atomic_to_page(void *);
23523 +#define ptep_to_machine(ptep) \
23524 +({ \
23525 + pte_t *__ptep = (ptep); \
23526 + page_to_phys(kmap_atomic_to_page(__ptep)) \
23527 + | ((unsigned long)__ptep & (PAGE_SIZE - 1)); \
23528 +})
23529 +#else
23530 +#define ptep_to_machine(ptep) virt_to_machine(ptep)
23531 +#endif
23532 +
23533 +#include <asm-generic/pgtable.h>
23534 +
23535 +#include <xen/features.h>
23536 +void make_page_readonly(void *va, unsigned int feature);
23537 +void make_page_writable(void *va, unsigned int feature);
23538 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
23539 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
23540 +
23541 +struct vm_area_struct;
23542 +
23543 +int direct_remap_pfn_range(struct vm_area_struct *vma,
23544 + unsigned long address,
23545 + unsigned long mfn,
23546 + unsigned long size,
23547 + pgprot_t prot,
23548 + domid_t domid);
23549 +int direct_kernel_remap_pfn_range(unsigned long address,
23550 + unsigned long mfn,
23551 + unsigned long size,
23552 + pgprot_t prot,
23553 + domid_t domid);
23554 +int create_lookup_pte_addr(struct mm_struct *mm,
23555 + unsigned long address,
23556 + uint64_t *ptep);
23557 +int touch_pte_range(struct mm_struct *mm,
23558 + unsigned long address,
23559 + unsigned long size);
23560 +
23561 +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
23562 + unsigned long addr, unsigned long end, pgprot_t newprot,
23563 + int dirty_accountable);
23564 +
23565 +#endif /* __ASSEMBLY__ */
23566 +
23567 +#endif /* _ASM_X86_PGTABLE_H */
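
pte_modify() in the new common pgtable.h keeps the bits covered by _PAGE_CHG_MASK (the frame number plus accessed/dirty, and in this Xen header also _PAGE_IO) and replaces only the protection bits from the new pgprot, clamped by __supported_pte_mask. The sketch below reproduces that masking in userspace with simplified constants: PTE_MASK is an illustrative value, _PAGE_IO is omitted, and NX is assumed supported.

/* build: cc -std=c99 -Wall pte_modify_sketch.c && ./a.out */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t pteval_t;

#define PAGE_SHIFT      12
#define _PAGE_PRESENT   0x001ULL
#define _PAGE_RW        0x002ULL
#define _PAGE_USER      0x004ULL
#define _PAGE_ACCESSED  0x020ULL
#define _PAGE_DIRTY     0x040ULL
#define _PAGE_NX        (1ULL << 63)

#define PTE_MASK        0x000ffffffffff000ULL        /* frame bits (illustrative) */
#define _PAGE_CHG_MASK  (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)

static const pteval_t supported_pte_mask = ~0ULL;    /* assume NX is available */

static pteval_t pte_modify(pteval_t pte, pteval_t newprot)
{
    pteval_t val = pte & _PAGE_CHG_MASK;             /* keep pfn + A/D bits  */
    val |= newprot & supported_pte_mask;             /* take new protections */
    return val;
}

int main(void)
{
    pteval_t pte = (0x1234ULL << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_RW |
                   _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY;
    pteval_t ro  = pte_modify(pte, _PAGE_PRESENT | _PAGE_USER | _PAGE_NX);

    printf("before: %#llx\n", (unsigned long long)pte);
    printf("after:  %#llx (write bit gone, pfn and A/D kept)\n",
           (unsigned long long)ro);
    return 0;
}
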
23568 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-02-16 16:17:21.000000000 +0100
23569 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-03-16 16:33:40.000000000 +0100
23570 @@ -18,16 +18,18 @@
23571 printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
23572 &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
23573
23574 -#define pud_none(pud) 0
23575 -#define pud_bad(pud) 0
23576 -#define pud_present(pud) 1
23577
23578 -/*
23579 - * All present pages with !NX bit are kernel-executable:
23580 - */
23581 -static inline int pte_exec_kernel(pte_t pte)
23582 +static inline int pud_none(pud_t pud)
23583 +{
23584 + return __pud_val(pud) == 0;
23585 +}
23586 +static inline int pud_bad(pud_t pud)
23587 {
23588 - return !(__pte_val(pte) & _PAGE_NX);
23589 + return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
23590 +}
23591 +static inline int pud_present(pud_t pud)
23592 +{
23593 + return __pud_val(pud) & _PAGE_PRESENT;
23594 }
23595
23596 /* Rules for using set_pte: the pte being assigned *must* be
23597 @@ -44,14 +46,6 @@ static inline void xen_set_pte(pte_t *pt
23598 ptep->pte_low = pte.pte_low;
23599 }
23600
23601 -static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
23602 - pte_t *ptep , pte_t pte)
23603 -{
23604 - if ((mm != current->mm && mm != &init_mm) ||
23605 - HYPERVISOR_update_va_mapping(addr, pte, 0))
23606 - xen_set_pte(ptep, pte);
23607 -}
23608 -
23609 static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
23610 {
23611 set_64bit((unsigned long long *)(ptep),__pte_val(pte));
23612 @@ -70,14 +64,11 @@ static inline void xen_set_pud(pud_t *pu
23613 * entry, so clear the bottom half first and enforce ordering with a compiler
23614 * barrier.
23615 */
23616 -static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23617 +static inline void __xen_pte_clear(pte_t *ptep)
23618 {
23619 - if ((mm != current->mm && mm != &init_mm)
23620 - || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
23621 - ptep->pte_low = 0;
23622 - smp_wmb();
23623 - ptep->pte_high = 0;
23624 - }
23625 + ptep->pte_low = 0;
23626 + smp_wmb();
23627 + ptep->pte_high = 0;
23628 }
23629
23630 static inline void xen_pmd_clear(pmd_t *pmd)
23631 @@ -85,21 +76,25 @@ static inline void xen_pmd_clear(pmd_t *
23632 xen_l2_entry_update(pmd, __pmd(0));
23633 }
23634
23635 -#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
23636 -#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
23637 -#define set_pte_atomic(ptep, pte) xen_set_pte_atomic(ptep, pte)
23638 -#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
23639 -#define set_pud(pudp, pud) xen_set_pud(pudp, pud)
23640 -#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
23641 -#define pmd_clear(pmd) xen_pmd_clear(pmd)
23642 +static inline void pud_clear(pud_t *pudp)
23643 +{
23644 + pgdval_t pgd;
23645 +
23646 + set_pud(pudp, __pud(0));
23647
23648 -/*
23649 - * Pentium-II erratum A13: in PAE mode we explicitly have to flush
23650 - * the TLB via cr3 if the top-level pgd is changed...
23651 - * We do not let the generic code free and clear pgd entries due to
23652 - * this erratum.
23653 - */
23654 -static inline void pud_clear (pud_t * pud) { }
23655 + /*
23656 + * According to Intel App note "TLBs, Paging-Structure Caches,
23657 + * and Their Invalidation", April 2007, document 317080-001,
23658 + * section 8.1: in PAE mode we explicitly have to flush the
23659 + * TLB via cr3 if the top-level pgd is changed...
23660 + *
23661 + * Make sure the pud entry we're updating is within the
23662 + * current pgd to avoid unnecessary TLB flushes.
23663 + */
23664 + pgd = read_cr3();
23665 + if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
23666 + xen_tlb_flush();
23667 +}
23668
23669 #define pud_page(pud) \
23670 ((struct page *) __va(pud_val(pud) & PAGE_MASK))
23671 @@ -128,24 +123,6 @@ static inline pte_t xen_ptep_get_and_cle
23672 #define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
23673 #endif
23674
23675 -#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
23676 -#define ptep_clear_flush(vma, addr, ptep) \
23677 -({ \
23678 - pte_t *__ptep = (ptep); \
23679 - pte_t __res = *__ptep; \
23680 - if (!pte_none(__res) && \
23681 - ((vma)->vm_mm != current->mm || \
23682 - HYPERVISOR_update_va_mapping(addr, __pte(0), \
23683 - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
23684 - UVMF_INVLPG|UVMF_MULTI))) { \
23685 - __ptep->pte_low = 0; \
23686 - smp_wmb(); \
23687 - __ptep->pte_high = 0; \
23688 - flush_tlb_page(vma, addr); \
23689 - } \
23690 - __res; \
23691 -})
23692 -
23693 #define __HAVE_ARCH_PTE_SAME
23694 static inline int pte_same(pte_t a, pte_t b)
23695 {
23696 @@ -168,26 +145,12 @@ static inline int pte_none(pte_t pte)
23697 mfn_to_local_pfn(__pte_mfn(_pte)) : \
23698 __pte_mfn(_pte))
23699
23700 -extern unsigned long long __supported_pte_mask;
23701 -
23702 -static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
23703 -{
23704 - return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
23705 - pgprot_val(pgprot)) & __supported_pte_mask);
23706 -}
23707 -
23708 -static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
23709 -{
23710 - return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
23711 - pgprot_val(pgprot)) & __supported_pte_mask);
23712 -}
23713 -
23714 /*
23715 * Bits 0, 6 and 7 are taken in the low part of the pte,
23716 * put the 32 bits of offset into the high part.
23717 */
23718 #define pte_to_pgoff(pte) ((pte).pte_high)
23719 -#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
23720 +#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
23721 #define PTE_FILE_MAX_BITS 32
23722
23723 /* Encode and de-code a swap entry */
23724 @@ -195,8 +158,6 @@ static inline pmd_t pfn_pmd(unsigned lon
23725 #define __swp_offset(x) ((x).val >> 5)
23726 #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
23727 #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
23728 -#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val })
23729 -
23730 -#define __pmd_free_tlb(tlb, x) do { } while (0)
23731 +#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
23732
23733 #endif /* _I386_PGTABLE_3LEVEL_H */
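
The new __xen_pte_clear() zeroes pte_low first, issues smp_wmb(), and only then zeroes pte_high: the present bit lives in the low word, so a concurrent walk never sees a half-cleared entry that still looks present while its high half is stale. The userspace sketch below mirrors that ordering; the compiler/CPU fence is GCC's builtin standing in for smp_wmb(), and the x86 little-endian two-half layout is assumed.

/* build: cc -std=c11 -Wall pae_clear_sketch.c && ./a.out */
#include <stdint.h>
#include <stdio.h>

/* 64-bit PAE entry accessed as two 32-bit halves, as in pgtable-3level.h
 * (little-endian: pte_low holds the present bit). */
typedef union {
    uint64_t pte;
    struct { uint32_t pte_low, pte_high; };
} pte_t;

static void pte_clear_ordered(volatile pte_t *ptep)
{
    ptep->pte_low = 0;                        /* present bit is in the low word */
    __atomic_thread_fence(__ATOMIC_RELEASE);  /* stand-in for smp_wmb()         */
    ptep->pte_high = 0;                       /* high half last: no window where
                                                 the entry is present but torn  */
}

int main(void)
{
    pte_t e = { .pte = 0x0000000812345067ULL };
    pte_clear_ordered(&e);
    printf("cleared: %#llx\n", (unsigned long long)e.pte);
    return 0;
}
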
23734 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-02-16 16:18:36.000000000 +0100
23735 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-03-16 16:33:40.000000000 +0100
23736 @@ -1,8 +1,6 @@
23737 #ifndef _I386_PGTABLE_H
23738 #define _I386_PGTABLE_H
23739
23740 -#include <asm/hypervisor.h>
23741 -
23742 /*
23743 * The Linux memory management assumes a three-level page table setup. On
23744 * the i386, we use that, but "fold" the mid level into the top-level page
23745 @@ -25,20 +23,10 @@
23746
23747 struct vm_area_struct;
23748
23749 -/*
23750 - * ZERO_PAGE is a global shared page that is always zero: used
23751 - * for zero-mapped memory areas etc..
23752 - */
23753 -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
23754 -extern unsigned long empty_zero_page[1024];
23755 extern pgd_t *swapper_pg_dir;
23756 -extern struct kmem_cache *pmd_cache;
23757 -extern spinlock_t pgd_lock;
23758 -extern struct page *pgd_list;
23759 -void check_pgt_cache(void);
23760
23761 -void pmd_ctor(struct kmem_cache *, void *);
23762 -void pgtable_cache_init(void);
23763 +static inline void pgtable_cache_init(void) { }
23764 +static inline void check_pgt_cache(void) { }
23765 void paging_init(void);
23766
23767
23768 @@ -58,16 +46,9 @@ void paging_init(void);
23769 #define PGDIR_SIZE (1UL << PGDIR_SHIFT)
23770 #define PGDIR_MASK (~(PGDIR_SIZE-1))
23771
23772 -#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
23773 -#define FIRST_USER_ADDRESS 0
23774 -
23775 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
23776 #define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
23777
23778 -#define TWOLEVEL_PGDIR_SHIFT 22
23779 -#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
23780 -#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
23781 -
23782 /* Just any arbitrary offset to the start of the vmalloc VM area: the
23783 * current 8MB value just means that there will be a 8MB "hole" after the
23784 * physical memory until the kernel virtual memory starts. That means that
23785 @@ -78,121 +59,19 @@ void paging_init(void);
23786 #define VMALLOC_OFFSET (8*1024*1024)
23787 #define VMALLOC_START (((unsigned long) high_memory + \
23788 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
23789 -#ifdef CONFIG_HIGHMEM
23790 -# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
23791 -#else
23792 -# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
23793 -#endif
23794 -
23795 -/*
23796 - * _PAGE_PSE set in the page directory entry just means that
23797 - * the page directory entry points directly to a 4MB-aligned block of
23798 - * memory.
23799 - */
23800 -#define _PAGE_BIT_PRESENT 0
23801 -#define _PAGE_BIT_RW 1
23802 -#define _PAGE_BIT_USER 2
23803 -#define _PAGE_BIT_PWT 3
23804 -#define _PAGE_BIT_PCD 4
23805 -#define _PAGE_BIT_ACCESSED 5
23806 -#define _PAGE_BIT_DIRTY 6
23807 -#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */
23808 -#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
23809 -/*#define _PAGE_BIT_UNUSED1 9*/ /* available for programmer */
23810 -#define _PAGE_BIT_UNUSED2 10
23811 -#define _PAGE_BIT_UNUSED3 11
23812 -#define _PAGE_BIT_NX 63
23813 -
23814 -#define _PAGE_PRESENT 0x001
23815 -#define _PAGE_RW 0x002
23816 -#define _PAGE_USER 0x004
23817 -#define _PAGE_PWT 0x008
23818 -#define _PAGE_PCD 0x010
23819 -#define _PAGE_ACCESSED 0x020
23820 -#define _PAGE_DIRTY 0x040
23821 -#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. */
23822 -#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */
23823 -/*#define _PAGE_UNUSED1 0x200*/ /* available for programmer */
23824 -#define _PAGE_UNUSED2 0x400
23825 -#define _PAGE_UNUSED3 0x800
23826 -
23827 -/* If _PAGE_PRESENT is clear, we use these: */
23828 -#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
23829 -#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE;
23830 - pte_present gives true */
23831 #ifdef CONFIG_X86_PAE
23832 -#define _PAGE_NX (1ULL<<_PAGE_BIT_NX)
23833 +#define LAST_PKMAP 512
23834 #else
23835 -#define _PAGE_NX 0
23836 +#define LAST_PKMAP 1024
23837 #endif
23838
23839 -/* Mapped page is I/O or foreign and has no associated page struct. */
23840 -#define _PAGE_IO 0x200
23841 +#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
23842
23843 -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
23844 -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
23845 -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
23846 -
23847 -#define PAGE_NONE \
23848 - __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
23849 -#define PAGE_SHARED \
23850 - __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
23851 -
23852 -#define PAGE_SHARED_EXEC \
23853 - __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
23854 -#define PAGE_COPY_NOEXEC \
23855 - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23856 -#define PAGE_COPY_EXEC \
23857 - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23858 -#define PAGE_COPY \
23859 - PAGE_COPY_NOEXEC
23860 -#define PAGE_READONLY \
23861 - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23862 -#define PAGE_READONLY_EXEC \
23863 - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23864 -
23865 -#define _PAGE_KERNEL \
23866 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
23867 -#define _PAGE_KERNEL_EXEC \
23868 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
23869 -
23870 -extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
23871 -#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
23872 -#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
23873 -#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
23874 -#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
23875 -#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
23876 -
23877 -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
23878 -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
23879 -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
23880 -#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
23881 -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
23882 -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
23883 -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
23884 -
23885 -/*
23886 - * The i386 can't do page protection for execute, and considers that
23887 - * the same are read. Also, write permissions imply read permissions.
23888 - * This is the closest we can get..
23889 - */
23890 -#define __P000 PAGE_NONE
23891 -#define __P001 PAGE_READONLY
23892 -#define __P010 PAGE_COPY
23893 -#define __P011 PAGE_COPY
23894 -#define __P100 PAGE_READONLY_EXEC
23895 -#define __P101 PAGE_READONLY_EXEC
23896 -#define __P110 PAGE_COPY_EXEC
23897 -#define __P111 PAGE_COPY_EXEC
23898 -
23899 -#define __S000 PAGE_NONE
23900 -#define __S001 PAGE_READONLY
23901 -#define __S010 PAGE_SHARED
23902 -#define __S011 PAGE_SHARED
23903 -#define __S100 PAGE_READONLY_EXEC
23904 -#define __S101 PAGE_READONLY_EXEC
23905 -#define __S110 PAGE_SHARED_EXEC
23906 -#define __S111 PAGE_SHARED_EXEC
23907 +#ifdef CONFIG_HIGHMEM
23908 +# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
23909 +#else
23910 +# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
23911 +#endif
23912
23913 /*
23914 * Define this if things work differently on an i386 and an i486:
23915 @@ -221,28 +100,6 @@ extern unsigned long pg0[];
23916
23917 #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
23918
23919 -/*
23920 - * The following only work if pte_present() is true.
23921 - * Undefined behaviour if not..
23922 - */
23923 -static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; }
23924 -static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; }
23925 -static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; }
23926 -static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; }
23927 -
23928 -/*
23929 - * The following only works if pte_present() is not true.
23930 - */
23931 -static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; }
23932 -
23933 -static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
23934 -static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
23935 -static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; }
23936 -static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; }
23937 -static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
23938 -static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
23939 -static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; }
23940 -
23941 #ifdef CONFIG_X86_PAE
23942 # include <asm/pgtable-3level.h>
23943 #else
23944 @@ -250,111 +107,6 @@ static inline pte_t pte_mkhuge(pte_t pte
23945 #endif
23946
23947 /*
23948 - * Rules for using pte_update - it must be called after any PTE update which
23949 - * has not been done using the set_pte / clear_pte interfaces. It is used by
23950 - * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
23951 - * updates should either be sets, clears, or set_pte_atomic for P->P
23952 - * transitions, which means this hook should only be called for user PTEs.
23953 - * This hook implies a P->P protection or access change has taken place, which
23954 - * requires a subsequent TLB flush. The notification can optionally be delayed
23955 - * until the TLB flush event by using the pte_update_defer form of the
23956 - * interface, but care must be taken to assure that the flush happens while
23957 - * still holding the same page table lock so that the shadow and primary pages
23958 - * do not become out of sync on SMP.
23959 - */
23960 -#define pte_update(mm, addr, ptep) do { } while (0)
23961 -#define pte_update_defer(mm, addr, ptep) do { } while (0)
23962 -
23963 -/* local pte updates need not use xchg for locking */
23964 -static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
23965 -{
23966 - xen_set_pte(ptep, __pte(0));
23967 - return res;
23968 -}
23969 -
23970 -/*
23971 - * We only update the dirty/accessed state if we set
23972 - * the dirty bit by hand in the kernel, since the hardware
23973 - * will do the accessed bit for us, and we don't want to
23974 - * race with other CPU's that might be updating the dirty
23975 - * bit at the same time.
23976 - */
23977 -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
23978 -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
23979 -({ \
23980 - int __changed = !pte_same(*(ptep), entry); \
23981 - if (__changed && (dirty)) { \
23982 - if ( likely((vma)->vm_mm == current->mm) ) { \
23983 - BUG_ON(HYPERVISOR_update_va_mapping(address, \
23984 - entry, \
23985 - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
23986 - UVMF_INVLPG|UVMF_MULTI)); \
23987 - } else { \
23988 - xen_l1_entry_update(ptep, entry); \
23989 - flush_tlb_page(vma, address); \
23990 - } \
23991 - } \
23992 - __changed; \
23993 -})
23994 -
23995 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
23996 -#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
23997 - int __ret = 0; \
23998 - if (pte_young(*(ptep))) \
23999 - __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
24000 - &(ptep)->pte_low); \
24001 - if (__ret) \
24002 - pte_update((vma)->vm_mm, addr, ptep); \
24003 - __ret; \
24004 -})
24005 -
24006 -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
24007 -#define ptep_clear_flush_young(vma, address, ptep) \
24008 -({ \
24009 - pte_t __pte = *(ptep); \
24010 - int __young = pte_young(__pte); \
24011 - __pte = pte_mkold(__pte); \
24012 - if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
24013 - (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
24014 - else if (__young) \
24015 - (ptep)->pte_low = __pte.pte_low; \
24016 - __young; \
24017 -})
24018 -
24019 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
24020 -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24021 -{
24022 - pte_t pte = *ptep;
24023 - if (!pte_none(pte)
24024 - && (mm != &init_mm
24025 - || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
24026 - pte = xen_ptep_get_and_clear(ptep, pte);
24027 - pte_update(mm, addr, ptep);
24028 - }
24029 - return pte;
24030 -}
24031 -
24032 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
24033 -#define ptep_get_and_clear_full(mm, addr, ptep, full) \
24034 - ((full) ? ({ \
24035 - pte_t __res = *(ptep); \
24036 - if (PagePinned(virt_to_page((mm)->pgd))) \
24037 - xen_l1_entry_update(ptep, __pte(0)); \
24038 - else \
24039 - *(ptep) = __pte(0); \
24040 - __res; \
24041 - }) : \
24042 - ptep_get_and_clear(mm, addr, ptep))
24043 -
24044 -#define __HAVE_ARCH_PTEP_SET_WRPROTECT
24045 -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24046 -{
24047 - pte_t pte = *ptep;
24048 - if (pte_write(pte))
24049 - set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
24050 -}
24051 -
24052 -/*
24053 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
24054 *
24055 * dst - pointer to pgd range anwhere on a pgd page
24056 @@ -383,26 +135,6 @@ static inline void clone_pgd_range(pgd_t
24057
24058 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
24059
24060 -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
24061 -{
24062 - /*
24063 - * Since this might change the present bit (which controls whether
24064 - * a pte_t object has undergone p2m translation), we must use
24065 - * pte_val() on the input pte and __pte() for the return value.
24066 - */
24067 - paddr_t pteval = pte_val(pte);
24068 -
24069 - pteval &= _PAGE_CHG_MASK;
24070 - pteval |= pgprot_val(newprot);
24071 -#ifdef CONFIG_X86_PAE
24072 - pteval &= __supported_pte_mask;
24073 -#endif
24074 - return __pte(pteval);
24075 -}
24076 -
24077 -#define pmd_large(pmd) \
24078 -((__pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
24079 -
24080 /*
24081 * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
24082 *
24083 @@ -424,6 +156,8 @@ static inline pte_t pte_modify(pte_t pte
24084 */
24085 #define pgd_offset_k(address) pgd_offset(&init_mm, address)
24086
24087 +static inline int pud_large(pud_t pud) { return 0; }
24088 +
24089 /*
24090 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
24091 *
24092 @@ -449,26 +183,6 @@ static inline pte_t pte_modify(pte_t pte
24093 #define pmd_page_vaddr(pmd) \
24094 ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
24095
24096 -/*
24097 - * Helper function that returns the kernel pagetable entry controlling
24098 - * the virtual address 'address'. NULL means no pagetable entry present.
24099 - * NOTE: the return type is pte_t but if the pmd is PSE then we return it
24100 - * as a pte too.
24101 - */
24102 -extern pte_t *lookup_address(unsigned long address);
24103 -
24104 -/*
24105 - * Make a given kernel text page executable/non-executable.
24106 - * Returns the previous executability setting of that page (which
24107 - * is used to restore the previous state). Used by the SMP bootup code.
24108 - * NOTE: this is an __init function for security reasons.
24109 - */
24110 -#ifdef CONFIG_X86_PAE
24111 - extern int set_kernel_exec(unsigned long vaddr, int enable);
24112 -#else
24113 - static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
24114 -#endif
24115 -
24116 #if defined(CONFIG_HIGHPTE)
24117 #define pte_offset_map(dir, address) \
24118 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
24119 @@ -496,72 +210,22 @@ extern pte_t *lookup_address(unsigned lo
24120 */
24121 #define update_mmu_cache(vma,address,pte) do { } while (0)
24122
24123 -#include <xen/features.h>
24124 void make_lowmem_page_readonly(void *va, unsigned int feature);
24125 void make_lowmem_page_writable(void *va, unsigned int feature);
24126 -void make_page_readonly(void *va, unsigned int feature);
24127 -void make_page_writable(void *va, unsigned int feature);
24128 -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
24129 -void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
24130 -
24131 -#define virt_to_ptep(va) \
24132 -({ \
24133 - pte_t *__ptep = lookup_address((unsigned long)(va)); \
24134 - BUG_ON(!__ptep || !pte_present(*__ptep)); \
24135 - __ptep; \
24136 -})
24137 -
24138 -#define arbitrary_virt_to_machine(va) \
24139 - (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
24140 - | ((unsigned long)(va) & (PAGE_SIZE - 1)))
24141 -
24142 -#ifdef CONFIG_HIGHPTE
24143 -#include <asm/io.h>
24144 -struct page *kmap_atomic_to_page(void *);
24145 -#define ptep_to_machine(ptep) \
24146 -({ \
24147 - pte_t *__ptep = (ptep); \
24148 - page_to_phys(kmap_atomic_to_page(__ptep)) \
24149 - | ((unsigned long)__ptep & (PAGE_SIZE - 1)); \
24150 -})
24151 -#else
24152 -#define ptep_to_machine(ptep) virt_to_machine(ptep)
24153 -#endif
24154
24155 #endif /* !__ASSEMBLY__ */
24156
24157 +/*
24158 + * kern_addr_valid() is (1) for FLATMEM and (0) for
24159 + * SPARSEMEM and DISCONTIGMEM
24160 + */
24161 #ifdef CONFIG_FLATMEM
24162 #define kern_addr_valid(addr) (1)
24163 -#endif /* CONFIG_FLATMEM */
24164 -
24165 -int direct_remap_pfn_range(struct vm_area_struct *vma,
24166 - unsigned long address,
24167 - unsigned long mfn,
24168 - unsigned long size,
24169 - pgprot_t prot,
24170 - domid_t domid);
24171 -int direct_kernel_remap_pfn_range(unsigned long address,
24172 - unsigned long mfn,
24173 - unsigned long size,
24174 - pgprot_t prot,
24175 - domid_t domid);
24176 -int create_lookup_pte_addr(struct mm_struct *mm,
24177 - unsigned long address,
24178 - uint64_t *ptep);
24179 -int touch_pte_range(struct mm_struct *mm,
24180 - unsigned long address,
24181 - unsigned long size);
24182 -
24183 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
24184 - unsigned long addr, unsigned long end, pgprot_t newprot,
24185 - int dirty_accountable);
24186 -
24187 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
24188 - xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
24189 +#else
24190 +#define kern_addr_valid(kaddr) (0)
24191 +#endif
24192
24193 #define io_remap_pfn_range(vma,from,pfn,size,prot) \
24194 direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
24195
24196 -#include <asm-generic/pgtable.h>
24197 -
24198 #endif /* _I386_PGTABLE_H */
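
The rewritten pgtable_32.h computes PKMAP_BASE as the start of the persistent-kmap window: room for LAST_PKMAP + 1 pages below FIXADDR_BOOT_START, rounded down to a PMD boundary with & PMD_MASK, where LAST_PKMAP is 512 under PAE (2 MiB PMDs) and 1024 otherwise (4 MiB "PMDs" in 2-level paging). A small sketch of that arithmetic follows; the FIXADDR_BOOT_START value and the PMD sizes are assumed example constants, not values taken from this tree.

/* build: cc -std=c99 -Wall pkmap_sketch.c && ./a.out  (add -DPAE to compare) */
#include <stdio.h>

#define PAGE_SIZE          4096UL
#define FIXADDR_BOOT_START 0xfffd6000UL        /* illustrative value only */

#ifdef PAE
# define LAST_PKMAP 512
# define PMD_MASK   (~(2UL*1024*1024 - 1))     /* 2 MiB PMD coverage under PAE */
#else
# define LAST_PKMAP 1024
# define PMD_MASK   (~(4UL*1024*1024 - 1))     /* 4 MiB page-dir coverage      */
#endif

#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)

int main(void)
{
    printf("LAST_PKMAP  = %d pages\n", LAST_PKMAP);
    printf("PKMAP_BASE  = %#lx\n", (unsigned long)PKMAP_BASE);
    printf("window size = %lu KiB below FIXADDR_BOOT_START\n",
           (FIXADDR_BOOT_START - PKMAP_BASE) / 1024);
    return 0;
}
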
24199 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-02-16 16:18:36.000000000 +0100
24200 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-03-16 16:33:40.000000000 +0100
24201 @@ -13,49 +13,26 @@
24202 #include <linux/threads.h>
24203 #include <linux/sched.h>
24204 #include <asm/pda.h>
24205 -#ifdef CONFIG_XEN
24206 -#include <asm/hypervisor.h>
24207
24208 +#ifdef CONFIG_XEN
24209 extern pud_t level3_user_pgt[512];
24210
24211 extern void xen_init_pt(void);
24212 -
24213 -extern pte_t *lookup_address(unsigned long address);
24214 -
24215 -#define virt_to_ptep(va) \
24216 -({ \
24217 - pte_t *__ptep = lookup_address((unsigned long)(va)); \
24218 - BUG_ON(!__ptep || !pte_present(*__ptep)); \
24219 - __ptep; \
24220 -})
24221 -
24222 -#define arbitrary_virt_to_machine(va) \
24223 - (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
24224 - | ((unsigned long)(va) & (PAGE_SIZE - 1)))
24225 -
24226 -#define ptep_to_machine(ptep) virt_to_machine(ptep)
24227 #endif
24228
24229 extern pud_t level3_kernel_pgt[512];
24230 extern pud_t level3_ident_pgt[512];
24231 extern pmd_t level2_kernel_pgt[512];
24232 extern pgd_t init_level4_pgt[];
24233 -extern unsigned long __supported_pte_mask;
24234
24235 #define swapper_pg_dir init_level4_pgt
24236
24237 extern void paging_init(void);
24238 -extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
24239 -
24240 -/*
24241 - * ZERO_PAGE is a global shared page that is always zero: used
24242 - * for zero-mapped memory areas etc..
24243 - */
24244 -extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
24245 -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
24246
24247 #endif /* !__ASSEMBLY__ */
24248
24249 +#define SHARED_KERNEL_PMD 1
24250 +
24251 /*
24252 * PGDIR_SHIFT determines what a top-level page table entry can map
24253 */
24254 @@ -98,31 +75,63 @@ extern unsigned long empty_zero_page[PAG
24255 #define pgd_none(x) (!__pgd_val(x))
24256 #define pud_none(x) (!__pud_val(x))
24257
24258 -static inline void set_pte(pte_t *dst, pte_t val)
24259 +struct mm_struct;
24260 +
24261 +#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
24262 +
24263 +static inline void xen_set_pte(pte_t *ptep, pte_t pte)
24264 +{
24265 + *ptep = pte;
24266 +}
24267 +
24268 +static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
24269 {
24270 - *dst = val;
24271 + xen_set_pte(ptep, pte);
24272 }
24273
24274 -#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
24275 -#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
24276 -#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
24277 +#ifdef CONFIG_SMP
24278 +static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret)
24279 +{
24280 + return __pte_ma(xchg(&xp->pte, 0));
24281 +}
24282 +#else
24283 +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
24284 +#endif
24285
24286 -static inline void pud_clear (pud_t * pud)
24287 +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
24288 {
24289 - set_pud(pud, __pud(0));
24290 + xen_l2_entry_update(pmdp, pmd);
24291 +}
24292 +
24293 +static inline void xen_pmd_clear(pmd_t *pmd)
24294 +{
24295 + xen_set_pmd(pmd, xen_make_pmd(0));
24296 +}
24297 +
24298 +static inline void xen_set_pud(pud_t *pudp, pud_t pud)
24299 +{
24300 + xen_l3_entry_update(pudp, pud);
24301 +}
24302 +
24303 +static inline void xen_pud_clear(pud_t *pud)
24304 +{
24305 + xen_set_pud(pud, xen_make_pud(0));
24306 }
24307
24308 #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
24309
24310 -static inline void pgd_clear (pgd_t * pgd)
24311 +static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
24312 {
24313 - set_pgd(pgd, __pgd(0));
24314 - set_pgd(__user_pgd(pgd), __pgd(0));
24315 + xen_l4_entry_update(pgdp, pgd);
24316 }
24317
24318 -#define pte_same(a, b) ((a).pte == (b).pte)
24319 +static inline void xen_pgd_clear(pgd_t * pgd)
24320 +{
24321 + xen_set_pgd(pgd, xen_make_pgd(0));
24322 + xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0));
24323 +}
24324
24325 -#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
24326 +#define pte_same(a, b) ((a).pte == (b).pte)
24327
24328 #endif /* !__ASSEMBLY__ */
24329
24330 @@ -133,8 +142,6 @@ static inline void pgd_clear (pgd_t * pg
24331 #define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
24332 #define PGDIR_MASK (~(PGDIR_SIZE-1))
24333
24334 -#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
24335 -#define FIRST_USER_ADDRESS 0
24336
24337 #define MAXMEM _AC(0x3fffffffffff, UL)
24338 #define VMALLOC_START _AC(0xffffc20000000000, UL)
24339 @@ -144,105 +151,6 @@ static inline void pgd_clear (pgd_t * pg
24340 #define MODULES_END _AC(0xfffffffffff00000, UL)
24341 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
24342
24343 -#define _PAGE_BIT_PRESENT 0
24344 -#define _PAGE_BIT_RW 1
24345 -#define _PAGE_BIT_USER 2
24346 -#define _PAGE_BIT_PWT 3
24347 -#define _PAGE_BIT_PCD 4
24348 -#define _PAGE_BIT_ACCESSED 5
24349 -#define _PAGE_BIT_DIRTY 6
24350 -#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
24351 -#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
24352 -#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
24353 -
24354 -#define _PAGE_PRESENT 0x001
24355 -#define _PAGE_RW 0x002
24356 -#define _PAGE_USER 0x004
24357 -#define _PAGE_PWT 0x008
24358 -#define _PAGE_PCD 0x010
24359 -#define _PAGE_ACCESSED 0x020
24360 -#define _PAGE_DIRTY 0x040
24361 -#define _PAGE_PSE 0x080 /* 2MB page */
24362 -#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
24363 -#define _PAGE_GLOBAL 0x100 /* Global TLB entry */
24364 -
24365 -#define _PAGE_PROTNONE 0x080 /* If not present */
24366 -#define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX)
24367 -
24368 -/* Mapped page is I/O or foreign and has no associated page struct. */
24369 -#define _PAGE_IO 0x200
24370 -
24371 -#ifndef __ASSEMBLY__
24372 -#if CONFIG_XEN_COMPAT <= 0x030002
24373 -extern unsigned int __kernel_page_user;
24374 -#else
24375 -#define __kernel_page_user 0
24376 -#endif
24377 -#endif
24378 -
24379 -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
24380 -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
24381 -
24382 -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
24383 -
24384 -#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
24385 -#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24386 -#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
24387 -#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24388 -#define PAGE_COPY PAGE_COPY_NOEXEC
24389 -#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24390 -#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24391 -#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24392 -#define __PAGE_KERNEL \
24393 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24394 -#define __PAGE_KERNEL_EXEC \
24395 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
24396 -#define __PAGE_KERNEL_NOCACHE \
24397 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24398 -#define __PAGE_KERNEL_RO \
24399 - (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24400 -#define __PAGE_KERNEL_VSYSCALL \
24401 - (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24402 -#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
24403 - (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
24404 -#define __PAGE_KERNEL_LARGE \
24405 - (__PAGE_KERNEL | _PAGE_PSE)
24406 -#define __PAGE_KERNEL_LARGE_EXEC \
24407 - (__PAGE_KERNEL_EXEC | _PAGE_PSE)
24408 -
24409 -/*
24410 - * We don't support GLOBAL page in xenolinux64
24411 - */
24412 -#define MAKE_GLOBAL(x) __pgprot((x))
24413 -
24414 -#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
24415 -#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
24416 -#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
24417 -#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
24418 -#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
24419 -#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
24420 -#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
24421 -#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
24422 -
24423 -/* xwr */
24424 -#define __P000 PAGE_NONE
24425 -#define __P001 PAGE_READONLY
24426 -#define __P010 PAGE_COPY
24427 -#define __P011 PAGE_COPY
24428 -#define __P100 PAGE_READONLY_EXEC
24429 -#define __P101 PAGE_READONLY_EXEC
24430 -#define __P110 PAGE_COPY_EXEC
24431 -#define __P111 PAGE_COPY_EXEC
24432 -
24433 -#define __S000 PAGE_NONE
24434 -#define __S001 PAGE_READONLY
24435 -#define __S010 PAGE_SHARED
24436 -#define __S011 PAGE_SHARED
24437 -#define __S100 PAGE_READONLY_EXEC
24438 -#define __S101 PAGE_READONLY_EXEC
24439 -#define __S110 PAGE_SHARED_EXEC
24440 -#define __S111 PAGE_SHARED_EXEC
24441 -
24442 #ifndef __ASSEMBLY__
24443
24444 static inline unsigned long pgd_bad(pgd_t pgd)
24445 @@ -260,119 +168,26 @@ static inline unsigned long pmd_bad(pmd_
24446 return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
24447 }
24448
24449 -#define set_pte_at(_mm,addr,ptep,pteval) do { \
24450 - if (((_mm) != current->mm && (_mm) != &init_mm) || \
24451 - HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
24452 - set_pte((ptep), (pteval)); \
24453 -} while (0)
24454 -
24455 #define pte_none(x) (!(x).pte)
24456 #define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
24457 -#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
24458
24459 -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
24460 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this right? */
24461
24462 #define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
24463 #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
24464 __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
24465 -#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? end_pfn : \
24466 +#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \
24467 (_pte).pte & _PAGE_PRESENT ? \
24468 mfn_to_local_pfn(__pte_mfn(_pte)) : \
24469 __pte_mfn(_pte))
24470
24471 #define pte_page(x) pfn_to_page(pte_pfn(x))
24472
24473 -static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
24474 -{
24475 - unsigned long pte = page_nr << PAGE_SHIFT;
24476 - pte |= pgprot_val(pgprot);
24477 - pte &= __supported_pte_mask;
24478 - return __pte(pte);
24479 -}
24480 -
24481 -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24482 -{
24483 - pte_t pte = *ptep;
24484 - if (!pte_none(pte)) {
24485 - if ((mm != &init_mm) ||
24486 - HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
24487 - pte = __pte_ma(xchg(&ptep->pte, 0));
24488 - }
24489 - return pte;
24490 -}
24491 -
24492 -static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
24493 -{
24494 - if (full) {
24495 - pte_t pte = *ptep;
24496 - if (PagePinned(virt_to_page(mm->pgd)))
24497 - xen_l1_entry_update(ptep, __pte(0));
24498 - else
24499 - *ptep = __pte(0);
24500 - return pte;
24501 - }
24502 - return ptep_get_and_clear(mm, addr, ptep);
24503 -}
24504 -
24505 -#define ptep_clear_flush(vma, addr, ptep) \
24506 -({ \
24507 - pte_t *__ptep = (ptep); \
24508 - pte_t __res = *__ptep; \
24509 - if (!pte_none(__res) && \
24510 - ((vma)->vm_mm != current->mm || \
24511 - HYPERVISOR_update_va_mapping(addr, __pte(0), \
24512 - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24513 - UVMF_INVLPG|UVMF_MULTI))) { \
24514 - __ptep->pte = 0; \
24515 - flush_tlb_page(vma, addr); \
24516 - } \
24517 - __res; \
24518 -})
24519 -
24520 -/*
24521 - * The following only work if pte_present() is true.
24522 - * Undefined behaviour if not..
24523 - */
24524 -#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
24525 -static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
24526 -static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
24527 -static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
24528 -static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
24529 -static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
24530 -
24531 -static inline pte_t pte_mkclean(pte_t pte) { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
24532 -static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
24533 -static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; }
24534 -static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) &= ~_PAGE_NX; return pte; }
24535 -static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
24536 -static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
24537 -static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; }
24538 -static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; }
24539 -static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; }
24540 -
24541 -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
24542 -{
24543 - if (!pte_young(*ptep))
24544 - return 0;
24545 - return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
24546 -}
24547 -
24548 -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24549 -{
24550 - pte_t pte = *ptep;
24551 - if (pte_write(pte))
24552 - set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
24553 -}
24554 -
24555 /*
24556 * Macro to mark a page protection value as "uncacheable".
24557 */
24558 #define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
24559
24560 -static inline int pmd_large(pmd_t pte) {
24561 - return (__pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE;
24562 -}
24563 -
24564
24565 /*
24566 * Conversion functions: convert a page and protection to a page entry,
24567 @@ -388,6 +203,7 @@ static inline int pmd_large(pmd_t pte) {
24568 #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
24569 #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
24570 #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
24571 +static inline int pgd_large(pgd_t pgd) { return 0; }
24572 #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
24573
24574 /* PUD - Level3 access */
24575 @@ -398,6 +214,12 @@ static inline int pmd_large(pmd_t pte) {
24576 #define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
24577 #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
24578
24579 +static inline int pud_large(pud_t pte)
24580 +{
24581 + return (__pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
24582 + (_PAGE_PSE|_PAGE_PRESENT);
24583 +}
24584 +
24585 /* PMD - Level 2 access */
24586 #define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
24587 #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
24588 @@ -413,36 +235,18 @@ static inline int pmd_large(pmd_t pte) {
24589 #else
24590 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
24591 #endif
24592 -#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
24593 #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
24594 #define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
24595
24596 #define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
24597 -#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
24598 +#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE })
24599 #define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
24600
24601 /* PTE - Level 1 access. */
24602
24603 /* page, protection -> pte */
24604 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
24605 -#define mk_pte_huge(entry) (__pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
24606
24607 -/* Change flags of a PTE */
24608 -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
24609 -{
24610 - /*
24611 - * Since this might change the present bit (which controls whether
24612 - * a pte_t object has undergone p2m translation), we must use
24613 - * pte_val() on the input pte and __pte() for the return value.
24614 - */
24615 - unsigned long pteval = pte_val(pte);
24616 -
24617 - pteval &= _PAGE_CHG_MASK;
24618 - pteval |= pgprot_val(newprot);
24619 - pteval &= __supported_pte_mask;
24620 - return __pte(pteval);
24621 -}
24622 -
24623 #define pte_index(address) \
24624 (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
24625 #define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
24626 @@ -456,101 +260,21 @@ static inline pte_t pte_modify(pte_t pte
24627
24628 #define update_mmu_cache(vma,address,pte) do { } while (0)
24629
24630 -/*
24631 - * Rules for using ptep_establish: the pte MUST be a user pte, and
24632 - * must be a present->present transition.
24633 - */
24634 -#define __HAVE_ARCH_PTEP_ESTABLISH
24635 -#define ptep_establish(vma, address, ptep, pteval) \
24636 - do { \
24637 - if ( likely((vma)->vm_mm == current->mm) ) { \
24638 - BUG_ON(HYPERVISOR_update_va_mapping(address, \
24639 - pteval, \
24640 - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24641 - UVMF_INVLPG|UVMF_MULTI)); \
24642 - } else { \
24643 - xen_l1_entry_update(ptep, pteval); \
24644 - flush_tlb_page(vma, address); \
24645 - } \
24646 - } while (0)
24647 -
24648 -/* We only update the dirty/accessed state if we set
24649 - * the dirty bit by hand in the kernel, since the hardware
24650 - * will do the accessed bit for us, and we don't want to
24651 - * race with other CPU's that might be updating the dirty
24652 - * bit at the same time. */
24653 -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
24654 -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
24655 -({ \
24656 - int __changed = !pte_same(*(ptep), entry); \
24657 - if (__changed && (dirty)) \
24658 - ptep_establish(vma, address, ptep, entry); \
24659 - __changed; \
24660 -})
24661 -
24662 -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
24663 -#define ptep_clear_flush_young(vma, address, ptep) \
24664 -({ \
24665 - pte_t __pte = *(ptep); \
24666 - int __young = pte_young(__pte); \
24667 - __pte = pte_mkold(__pte); \
24668 - if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
24669 - (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
24670 - else if (__young) \
24671 - set_pte(ptep, __pte); \
24672 - __young; \
24673 -})
24674 -
24675 /* Encode and de-code a swap entry */
24676 #define __swp_type(x) (((x).val >> 1) & 0x3f)
24677 #define __swp_offset(x) ((x).val >> 8)
24678 #define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
24679 #define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
24680 -#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
24681 -
24682 -extern spinlock_t pgd_lock;
24683 -extern struct list_head pgd_list;
24684 +#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
24685
24686 extern int kern_addr_valid(unsigned long addr);
24687 -
24688 -#define DOMID_LOCAL (0xFFFFU)
24689 -
24690 -struct vm_area_struct;
24691 -
24692 -int direct_remap_pfn_range(struct vm_area_struct *vma,
24693 - unsigned long address,
24694 - unsigned long mfn,
24695 - unsigned long size,
24696 - pgprot_t prot,
24697 - domid_t domid);
24698 -
24699 -int direct_kernel_remap_pfn_range(unsigned long address,
24700 - unsigned long mfn,
24701 - unsigned long size,
24702 - pgprot_t prot,
24703 - domid_t domid);
24704 -
24705 -int create_lookup_pte_addr(struct mm_struct *mm,
24706 - unsigned long address,
24707 - uint64_t *ptep);
24708 -
24709 -int touch_pte_range(struct mm_struct *mm,
24710 - unsigned long address,
24711 - unsigned long size);
24712 -
24713 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
24714 - unsigned long addr, unsigned long end, pgprot_t newprot,
24715 - int dirty_accountable);
24716 -
24717 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
24718 - xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
24719 -
24720 -pte_t *lookup_address(unsigned long addr);
24721 +extern void cleanup_highmap(void);
24722
24723 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
24724 direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
24725
24726 #define HAVE_ARCH_UNMAPPED_AREA
24727 +#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
24728
24729 #define pgtable_cache_init() do { } while (0)
24730 #define check_pgt_cache() do { } while (0)
24731 @@ -563,13 +287,7 @@ pte_t *lookup_address(unsigned long addr
24732 #define kc_offset_to_vaddr(o) \
24733 (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
24734
24735 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
24736 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
24737 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
24738 -#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
24739 -#define __HAVE_ARCH_PTEP_SET_WRPROTECT
24740 #define __HAVE_ARCH_PTE_SAME
24741 -#include <asm-generic/pgtable.h>
24742 #endif /* !__ASSEMBLY__ */
24743
24744 #endif /* _X86_64_PGTABLE_H */
24745 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/processor.h 2009-02-16 16:18:36.000000000 +0100
24746 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/processor.h 2009-03-16 16:33:40.000000000 +0100
24747 @@ -1,5 +1,793 @@
24748 +#ifndef __ASM_X86_PROCESSOR_H
24749 +#define __ASM_X86_PROCESSOR_H
24750 +
24751 +#include <asm/processor-flags.h>
24752 +
24753 +/* migration helpers, for KVM - will be removed in 2.6.25: */
24754 +#include <asm/vm86.h>
24755 +#define Xgt_desc_struct desc_ptr
24756 +
24757 +/* Forward declaration, a strange C thing */
24758 +struct task_struct;
24759 +struct mm_struct;
24760 +
24761 +#include <asm/vm86.h>
24762 +#include <asm/math_emu.h>
24763 +#include <asm/segment.h>
24764 +#include <asm/types.h>
24765 +#include <asm/sigcontext.h>
24766 +#include <asm/current.h>
24767 +#include <asm/cpufeature.h>
24768 +#include <asm/system.h>
24769 +#include <asm/page.h>
24770 +#include <asm/percpu.h>
24771 +#include <asm/msr.h>
24772 +#include <asm/desc_defs.h>
24773 +#include <asm/nops.h>
24774 +#include <linux/personality.h>
24775 +#include <linux/cpumask.h>
24776 +#include <linux/cache.h>
24777 +#include <linux/threads.h>
24778 +#include <linux/init.h>
24779 +#include <xen/interface/physdev.h>
24780 +
24781 +/*
24782 + * Default implementation of macro that returns current
24783 + * instruction pointer ("program counter").
24784 + */
24785 +static inline void *current_text_addr(void)
24786 +{
24787 + void *pc;
24788 + asm volatile("mov $1f,%0\n1:":"=r" (pc));
24789 + return pc;
24790 +}
24791 +
24792 +#ifdef CONFIG_X86_VSMP
24793 +#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
24794 +#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
24795 +#else
24796 +#define ARCH_MIN_TASKALIGN 16
24797 +#define ARCH_MIN_MMSTRUCT_ALIGN 0
24798 +#endif
24799 +
24800 +/*
24801 + * CPU type and hardware bug flags. Kept separately for each CPU.
24802 + * Members of this structure are referenced in head.S, so think twice
24803 + * before touching them. [mj]
24804 + */
24805 +
24806 +struct cpuinfo_x86 {
24807 + __u8 x86; /* CPU family */
24808 + __u8 x86_vendor; /* CPU vendor */
24809 + __u8 x86_model;
24810 + __u8 x86_mask;
24811 +#ifdef CONFIG_X86_32
24812 + char wp_works_ok; /* It doesn't on 386's */
24813 + char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
24814 + char hard_math;
24815 + char rfu;
24816 + char fdiv_bug;
24817 + char f00f_bug;
24818 + char coma_bug;
24819 + char pad0;
24820 +#else
24821 + /* number of 4K pages in DTLB/ITLB combined(in pages)*/
24822 + int x86_tlbsize;
24823 + __u8 x86_virt_bits, x86_phys_bits;
24824 + /* cpuid returned core id bits */
24825 + __u8 x86_coreid_bits;
24826 + /* Max extended CPUID function supported */
24827 + __u32 extended_cpuid_level;
24828 +#endif
24829 + int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
24830 + __u32 x86_capability[NCAPINTS];
24831 + char x86_vendor_id[16];
24832 + char x86_model_id[64];
24833 + int x86_cache_size; /* in KB - valid for CPUS which support this
24834 + call */
24835 + int x86_cache_alignment; /* In bytes */
24836 + int x86_power;
24837 + unsigned long loops_per_jiffy;
24838 +#ifdef CONFIG_SMP
24839 + cpumask_t llc_shared_map; /* cpus sharing the last level cache */
24840 +#endif
24841 + u16 x86_max_cores; /* cpuid returned max cores value */
24842 + u16 apicid;
24843 + u16 x86_clflush_size;
24844 +#ifdef CONFIG_SMP
24845 + u16 booted_cores; /* number of cores as seen by OS */
24846 + u16 phys_proc_id; /* Physical processor id. */
24847 + u16 cpu_core_id; /* Core id */
24848 + u16 cpu_index; /* index into per_cpu list */
24849 +#endif
24850 +} __attribute__((__aligned__(SMP_CACHE_BYTES)));
24851 +
24852 +#define X86_VENDOR_INTEL 0
24853 +#define X86_VENDOR_CYRIX 1
24854 +#define X86_VENDOR_AMD 2
24855 +#define X86_VENDOR_UMC 3
24856 +#define X86_VENDOR_NEXGEN 4
24857 +#define X86_VENDOR_CENTAUR 5
24858 +#define X86_VENDOR_TRANSMETA 7
24859 +#define X86_VENDOR_NSC 8
24860 +#define X86_VENDOR_NUM 9
24861 +#define X86_VENDOR_UNKNOWN 0xff
24862 +
24863 +/*
24864 + * capabilities of CPUs
24865 + */
24866 +extern struct cpuinfo_x86 boot_cpu_data;
24867 +extern struct cpuinfo_x86 new_cpu_data;
24868 +extern __u32 cleared_cpu_caps[NCAPINTS];
24869 +
24870 +#ifdef CONFIG_SMP
24871 +DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
24872 +#define cpu_data(cpu) per_cpu(cpu_info, cpu)
24873 +#define current_cpu_data cpu_data(smp_processor_id())
24874 +#else
24875 +#define cpu_data(cpu) boot_cpu_data
24876 +#define current_cpu_data boot_cpu_data
24877 +#endif
24878 +
24879 +void cpu_detect(struct cpuinfo_x86 *c);
24880 +
24881 +extern void identify_cpu(struct cpuinfo_x86 *);
24882 +extern void identify_boot_cpu(void);
24883 +extern void identify_secondary_cpu(struct cpuinfo_x86 *);
24884 +extern void print_cpu_info(struct cpuinfo_x86 *);
24885 +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
24886 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
24887 +extern unsigned short num_cache_leaves;
24888 +
24889 +#if defined(CONFIG_X86_HT) || defined(CONFIG_X86_64)
24890 +extern void detect_ht(struct cpuinfo_x86 *c);
24891 +#else
24892 +static inline void detect_ht(struct cpuinfo_x86 *c) {}
24893 +#endif
24894 +
24895 +static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
24896 + unsigned int *ecx, unsigned int *edx)
24897 +{
24898 + /* ecx is often an input as well as an output. */
24899 + __asm__(XEN_CPUID
24900 + : "=a" (*eax),
24901 + "=b" (*ebx),
24902 + "=c" (*ecx),
24903 + "=d" (*edx)
24904 + : "0" (*eax), "2" (*ecx));
24905 +}
24906 +
24907 +static inline void load_cr3(pgd_t *pgdir)
24908 +{
24909 + write_cr3(__pa(pgdir));
24910 +}
24911 +
24912 +#ifndef CONFIG_X86_NO_TSS
24913 +#ifdef CONFIG_X86_32
24914 +/* This is the TSS defined by the hardware. */
24915 +struct x86_hw_tss {
24916 + unsigned short back_link, __blh;
24917 + unsigned long sp0;
24918 + unsigned short ss0, __ss0h;
24919 + unsigned long sp1;
24920 + unsigned short ss1, __ss1h; /* ss1 caches MSR_IA32_SYSENTER_CS */
24921 + unsigned long sp2;
24922 + unsigned short ss2, __ss2h;
24923 + unsigned long __cr3;
24924 + unsigned long ip;
24925 + unsigned long flags;
24926 + unsigned long ax, cx, dx, bx;
24927 + unsigned long sp, bp, si, di;
24928 + unsigned short es, __esh;
24929 + unsigned short cs, __csh;
24930 + unsigned short ss, __ssh;
24931 + unsigned short ds, __dsh;
24932 + unsigned short fs, __fsh;
24933 + unsigned short gs, __gsh;
24934 + unsigned short ldt, __ldth;
24935 + unsigned short trace, io_bitmap_base;
24936 +} __attribute__((packed));
24937 +extern struct tss_struct doublefault_tss;
24938 +#else
24939 +struct x86_hw_tss {
24940 + u32 reserved1;
24941 + u64 sp0;
24942 + u64 sp1;
24943 + u64 sp2;
24944 + u64 reserved2;
24945 + u64 ist[7];
24946 + u32 reserved3;
24947 + u32 reserved4;
24948 + u16 reserved5;
24949 + u16 io_bitmap_base;
24950 +} __attribute__((packed)) ____cacheline_aligned;
24951 +#endif
24952 +#endif /* CONFIG_X86_NO_TSS */
24953 +
24954 +/*
24955 + * Size of io_bitmap.
24956 + */
24957 +#define IO_BITMAP_BITS 65536
24958 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
24959 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
24960 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
24961 +#define INVALID_IO_BITMAP_OFFSET 0x8000
24962 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
24963 +
24964 +#ifndef CONFIG_X86_NO_TSS
24965 +struct tss_struct {
24966 + struct x86_hw_tss x86_tss;
24967 +
24968 + /*
24969 + * The extra 1 is there because the CPU will access an
24970 + * additional byte beyond the end of the IO permission
24971 + * bitmap. The extra byte must be all 1 bits, and must
24972 + * be within the limit.
24973 + */
24974 + unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
24975 + /*
24976 + * Cache the current maximum and the last task that used the bitmap:
24977 + */
24978 + unsigned long io_bitmap_max;
24979 + struct thread_struct *io_bitmap_owner;
24980 + /*
24981 + * pads the TSS to be cacheline-aligned (size is 0x100)
24982 + */
24983 + unsigned long __cacheline_filler[35];
24984 + /*
24985 + * .. and then another 0x100 bytes for emergency kernel stack
24986 + */
24987 + unsigned long stack[64];
24988 +} __attribute__((packed));
24989 +
24990 +DECLARE_PER_CPU(struct tss_struct, init_tss);
24991 +
24992 +/* Save the original ist values for checking stack pointers during debugging */
24993 +struct orig_ist {
24994 + unsigned long ist[7];
24995 +};
24996 +#endif /* CONFIG_X86_NO_TSS */
24997 +
24998 +#define MXCSR_DEFAULT 0x1f80
24999 +
25000 +struct i387_fsave_struct {
25001 + u32 cwd;
25002 + u32 swd;
25003 + u32 twd;
25004 + u32 fip;
25005 + u32 fcs;
25006 + u32 foo;
25007 + u32 fos;
25008 + u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
25009 + u32 status; /* software status information */
25010 +};
25011 +
25012 +struct i387_fxsave_struct {
25013 + u16 cwd;
25014 + u16 swd;
25015 + u16 twd;
25016 + u16 fop;
25017 + union {
25018 + struct {
25019 + u64 rip;
25020 + u64 rdp;
25021 + };
25022 + struct {
25023 + u32 fip;
25024 + u32 fcs;
25025 + u32 foo;
25026 + u32 fos;
25027 + };
25028 + };
25029 + u32 mxcsr;
25030 + u32 mxcsr_mask;
25031 + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
25032 + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
25033 + u32 padding[24];
25034 +} __attribute__((aligned(16)));
25035 +
25036 +struct i387_soft_struct {
25037 + u32 cwd;
25038 + u32 swd;
25039 + u32 twd;
25040 + u32 fip;
25041 + u32 fcs;
25042 + u32 foo;
25043 + u32 fos;
25044 + u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
25045 + u8 ftop, changed, lookahead, no_update, rm, alimit;
25046 + struct info *info;
25047 + u32 entry_eip;
25048 +};
25049 +
25050 +union i387_union {
25051 + struct i387_fsave_struct fsave;
25052 + struct i387_fxsave_struct fxsave;
25053 + struct i387_soft_struct soft;
25054 +};
25055 +
25056 +#ifdef CONFIG_X86_32
25057 +DECLARE_PER_CPU(u8, cpu_llc_id);
25058 +#elif !defined(CONFIG_X86_NO_TSS)
25059 +DECLARE_PER_CPU(struct orig_ist, orig_ist);
25060 +#endif
25061 +
25062 +extern void print_cpu_info(struct cpuinfo_x86 *);
25063 +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
25064 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
25065 +extern unsigned short num_cache_leaves;
25066 +
25067 +struct thread_struct {
25068 +/* cached TLS descriptors. */
25069 + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
25070 + unsigned long sp0;
25071 + unsigned long sp;
25072 +#ifdef CONFIG_X86_32
25073 + unsigned long sysenter_cs;
25074 +#else
25075 + unsigned long usersp; /* Copy from PDA */
25076 + unsigned short es, ds, fsindex, gsindex;
25077 +#endif
25078 + unsigned long ip;
25079 + unsigned long fs;
25080 + unsigned long gs;
25081 +/* Hardware debugging registers */
25082 + unsigned long debugreg0;
25083 + unsigned long debugreg1;
25084 + unsigned long debugreg2;
25085 + unsigned long debugreg3;
25086 + unsigned long debugreg6;
25087 + unsigned long debugreg7;
25088 +/* fault info */
25089 + unsigned long cr2, trap_no, error_code;
25090 +/* floating point info */
25091 + union i387_union i387 __attribute__((aligned(16)));
25092 +#ifdef CONFIG_X86_32
25093 +/* virtual 86 mode info */
25094 + struct vm86_struct __user *vm86_info;
25095 + unsigned long screen_bitmap;
25096 + unsigned long v86flags, v86mask, saved_sp0;
25097 + unsigned int saved_fs, saved_gs;
25098 +#endif
25099 +/* IO permissions */
25100 + unsigned long *io_bitmap_ptr;
25101 + unsigned long iopl;
25102 +/* max allowed port in the bitmap, in bytes: */
25103 + unsigned io_bitmap_max;
25104 +/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
25105 + unsigned long debugctlmsr;
25106 +/* Debug Store - if not 0 points to a DS Save Area configuration;
25107 + * goes into MSR_IA32_DS_AREA */
25108 + unsigned long ds_area_msr;
25109 +};
25110 +
25111 +static inline unsigned long xen_get_debugreg(int regno)
25112 +{
25113 + return HYPERVISOR_get_debugreg(regno);
25114 +}
25115 +
25116 +static inline void xen_set_debugreg(int regno, unsigned long value)
25117 +{
25118 + WARN_ON(HYPERVISOR_set_debugreg(regno, value));
25119 +}
25120 +
25121 +/*
25122 + * Set IOPL bits in EFLAGS from given mask
25123 + */
25124 +static inline void xen_set_iopl_mask(unsigned mask)
25125 +{
25126 + struct physdev_set_iopl set_iopl;
25127 +
25128 + /* Force the change at ring 0. */
25129 + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
25130 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
25131 +}
25132 +
25133 +#ifndef CONFIG_X86_NO_TSS
25134 +static inline void native_load_sp0(struct tss_struct *tss,
25135 + struct thread_struct *thread)
25136 +{
25137 + tss->x86_tss.sp0 = thread->sp0;
25138 +#ifdef CONFIG_X86_32
25139 + /* Only happens when SEP is enabled, no need to test "SEP"arately */
25140 + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
25141 + tss->x86_tss.ss1 = thread->sysenter_cs;
25142 + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
25143 + }
25144 +#endif
25145 +}
25146 +#else
25147 +#define xen_load_sp0(tss, thread) do { \
25148 + if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \
25149 + BUG(); \
25150 +} while (0)
25151 +#endif
25152 +
25153 +#define __cpuid xen_cpuid
25154 +#define paravirt_enabled() 0
25155 +
25156 +/*
25157 + * These special macros can be used to get or set a debugging register
25158 + */
25159 +#define get_debugreg(var, register) \
25160 + (var) = xen_get_debugreg(register)
25161 +#define set_debugreg(value, register) \
25162 + xen_set_debugreg(register, value)
25163 +
25164 +#define load_sp0 xen_load_sp0
25165 +
25166 +#define set_iopl_mask xen_set_iopl_mask
25167 +
25168 +/*
25169 + * Save the cr4 feature set we're using (ie
25170 + * Pentium 4MB enable and PPro Global page
25171 + * enable), so that any CPU's that boot up
25172 + * after us can get the correct flags.
25173 + */
25174 +extern unsigned long mmu_cr4_features;
25175 +
25176 +static inline void set_in_cr4(unsigned long mask)
25177 +{
25178 + unsigned cr4;
25179 + mmu_cr4_features |= mask;
25180 + cr4 = read_cr4();
25181 + cr4 |= mask;
25182 + write_cr4(cr4);
25183 +}
25184 +
25185 +static inline void clear_in_cr4(unsigned long mask)
25186 +{
25187 + unsigned cr4;
25188 + mmu_cr4_features &= ~mask;
25189 + cr4 = read_cr4();
25190 + cr4 &= ~mask;
25191 + write_cr4(cr4);
25192 +}
25193 +
25194 +struct microcode_header {
25195 + unsigned int hdrver;
25196 + unsigned int rev;
25197 + unsigned int date;
25198 + unsigned int sig;
25199 + unsigned int cksum;
25200 + unsigned int ldrver;
25201 + unsigned int pf;
25202 + unsigned int datasize;
25203 + unsigned int totalsize;
25204 + unsigned int reserved[3];
25205 +};
25206 +
25207 +struct microcode {
25208 + struct microcode_header hdr;
25209 + unsigned int bits[0];
25210 +};
25211 +
25212 +typedef struct microcode microcode_t;
25213 +typedef struct microcode_header microcode_header_t;
25214 +
25215 +/* microcode format is extended from prescott processors */
25216 +struct extended_signature {
25217 + unsigned int sig;
25218 + unsigned int pf;
25219 + unsigned int cksum;
25220 +};
25221 +
25222 +struct extended_sigtable {
25223 + unsigned int count;
25224 + unsigned int cksum;
25225 + unsigned int reserved[3];
25226 + struct extended_signature sigs[0];
25227 +};
25228 +
25229 +typedef struct {
25230 + unsigned long seg;
25231 +} mm_segment_t;
25232 +
25233 +
25234 +/*
25235 + * create a kernel thread without removing it from tasklists
25236 + */
25237 +extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
25238 +
25239 +/* Free all resources held by a thread. */
25240 +extern void release_thread(struct task_struct *);
25241 +
25242 +/* Prepare to copy thread state - unlazy all lazy status */
25243 +extern void prepare_to_copy(struct task_struct *tsk);
25244 +
25245 +unsigned long get_wchan(struct task_struct *p);
25246 +
25247 +/*
25248 + * Generic CPUID function
25249 + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
25250 + * resulting in stale register contents being returned.
25251 + */
25252 +static inline void cpuid(unsigned int op,
25253 + unsigned int *eax, unsigned int *ebx,
25254 + unsigned int *ecx, unsigned int *edx)
25255 +{
25256 + *eax = op;
25257 + *ecx = 0;
25258 + __cpuid(eax, ebx, ecx, edx);
25259 +}
25260 +
25261 +/* Some CPUID calls want 'count' to be placed in ecx */
25262 +static inline void cpuid_count(unsigned int op, int count,
25263 + unsigned int *eax, unsigned int *ebx,
25264 + unsigned int *ecx, unsigned int *edx)
25265 +{
25266 + *eax = op;
25267 + *ecx = count;
25268 + __cpuid(eax, ebx, ecx, edx);
25269 +}
25270 +
25271 +/*
25272 + * CPUID functions returning a single datum
25273 + */
25274 +static inline unsigned int cpuid_eax(unsigned int op)
25275 +{
25276 + unsigned int eax, ebx, ecx, edx;
25277 +
25278 + cpuid(op, &eax, &ebx, &ecx, &edx);
25279 + return eax;
25280 +}
25281 +static inline unsigned int cpuid_ebx(unsigned int op)
25282 +{
25283 + unsigned int eax, ebx, ecx, edx;
25284 +
25285 + cpuid(op, &eax, &ebx, &ecx, &edx);
25286 + return ebx;
25287 +}
25288 +static inline unsigned int cpuid_ecx(unsigned int op)
25289 +{
25290 + unsigned int eax, ebx, ecx, edx;
25291 +
25292 + cpuid(op, &eax, &ebx, &ecx, &edx);
25293 + return ecx;
25294 +}
25295 +static inline unsigned int cpuid_edx(unsigned int op)
25296 +{
25297 + unsigned int eax, ebx, ecx, edx;
25298 +
25299 + cpuid(op, &eax, &ebx, &ecx, &edx);
25300 + return edx;
25301 +}
25302 +
25303 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
25304 +static inline void rep_nop(void)
25305 +{
25306 + __asm__ __volatile__("rep;nop": : :"memory");
25307 +}
25308 +
25309 +/* Stop speculative execution */
25310 +static inline void sync_core(void)
25311 +{
25312 + int tmp;
25313 + asm volatile("cpuid" : "=a" (tmp) : "0" (1)
25314 + : "ebx", "ecx", "edx", "memory");
25315 +}
25316 +
25317 +#define cpu_relax() rep_nop()
25318 +
25319 +static inline void __monitor(const void *eax, unsigned long ecx,
25320 + unsigned long edx)
25321 +{
25322 + /* "monitor %eax,%ecx,%edx;" */
25323 + asm volatile(
25324 + ".byte 0x0f,0x01,0xc8;"
25325 + : :"a" (eax), "c" (ecx), "d"(edx));
25326 +}
25327 +
25328 +static inline void __mwait(unsigned long eax, unsigned long ecx)
25329 +{
25330 + /* "mwait %eax,%ecx;" */
25331 + asm volatile(
25332 + ".byte 0x0f,0x01,0xc9;"
25333 + : :"a" (eax), "c" (ecx));
25334 +}
25335 +
25336 +static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
25337 +{
25338 + /* "mwait %eax,%ecx;" */
25339 + asm volatile(
25340 + "sti; .byte 0x0f,0x01,0xc9;"
25341 + : :"a" (eax), "c" (ecx));
25342 +}
25343 +
25344 +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
25345 +
25346 +extern int force_mwait;
25347 +
25348 +extern void select_idle_routine(const struct cpuinfo_x86 *c);
25349 +
25350 +extern unsigned long boot_option_idle_override;
25351 +
25352 +extern void enable_sep_cpu(void);
25353 +extern int sysenter_setup(void);
25354 +
25355 +/* Defined in head.S */
25356 +extern struct desc_ptr early_gdt_descr;
25357 +
25358 +extern void cpu_set_gdt(int);
25359 +extern void switch_to_new_gdt(void);
25360 +extern void cpu_init(void);
25361 +extern void init_gdt(int cpu);
25362 +
25363 +/* from system description table in BIOS. Mostly for MCA use, but
25364 + * others may find it useful. */
25365 +extern unsigned int machine_id;
25366 +extern unsigned int machine_submodel_id;
25367 +extern unsigned int BIOS_revision;
25368 +
25369 +/* Boot loader type from the setup header */
25370 +extern int bootloader_type;
25371 +
25372 +extern char ignore_fpu_irq;
25373 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
25374 +
25375 +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
25376 +#define ARCH_HAS_PREFETCHW
25377 +#define ARCH_HAS_SPINLOCK_PREFETCH
25378 +
25379 +#ifdef CONFIG_X86_32
25380 +#define BASE_PREFETCH ASM_NOP4
25381 +#define ARCH_HAS_PREFETCH
25382 +#else
25383 +#define BASE_PREFETCH "prefetcht0 (%1)"
25384 +#endif
25385 +
25386 +/* Prefetch instructions for Pentium III and AMD Athlon */
25387 +/* It's not worth caring about 3dnow! prefetches for the K6
25388 + because they are microcoded there and very slow.
25389 + However, we don't currently do prefetches for pre-XP Athlons;
25390 + that should be fixed. */
25391 +static inline void prefetch(const void *x)
25392 +{
25393 + alternative_input(BASE_PREFETCH,
25394 + "prefetchnta (%1)",
25395 + X86_FEATURE_XMM,
25396 + "r" (x));
25397 +}
25398 +
25399 +/* 3dnow! prefetch to get an exclusive cache line. Useful for
25400 + spinlocks to avoid one state transition in the cache coherency protocol. */
25401 +static inline void prefetchw(const void *x)
25402 +{
25403 + alternative_input(BASE_PREFETCH,
25404 + "prefetchw (%1)",
25405 + X86_FEATURE_3DNOW,
25406 + "r" (x));
25407 +}
25408 +
25409 +#define spin_lock_prefetch(x) prefetchw(x)
25410 #ifdef CONFIG_X86_32
25411 -# include "processor_32.h"
25412 +/*
25413 + * User space process size: 3GB (default).
25414 + */
25415 +#define TASK_SIZE (PAGE_OFFSET)
25416 +#define STACK_TOP TASK_SIZE
25417 +#define STACK_TOP_MAX STACK_TOP
25418 +
25419 +#define INIT_THREAD { \
25420 + .sp0 = sizeof(init_stack) + (long)&init_stack, \
25421 + .vm86_info = NULL, \
25422 + .sysenter_cs = __KERNEL_CS, \
25423 + .io_bitmap_ptr = NULL, \
25424 + .fs = __KERNEL_PERCPU, \
25425 +}
25426 +
25427 +/*
25428 + * Note that the .io_bitmap member must be extra-big. This is because
25429 + * the CPU will access an additional byte beyond the end of the IO
25430 + * permission bitmap. The extra byte must be all 1 bits, and must
25431 + * be within the limit.
25432 + */
25433 +#define INIT_TSS { \
25434 + .x86_tss = { \
25435 + .sp0 = sizeof(init_stack) + (long)&init_stack, \
25436 + .ss0 = __KERNEL_DS, \
25437 + .ss1 = __KERNEL_CS, \
25438 + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
25439 + }, \
25440 + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
25441 +}
25442 +
25443 +#define start_thread(regs, new_eip, new_esp) do { \
25444 + __asm__("movl %0,%%gs": :"r" (0)); \
25445 + regs->fs = 0; \
25446 + set_fs(USER_DS); \
25447 + regs->ds = __USER_DS; \
25448 + regs->es = __USER_DS; \
25449 + regs->ss = __USER_DS; \
25450 + regs->cs = __USER_CS; \
25451 + regs->ip = new_eip; \
25452 + regs->sp = new_esp; \
25453 +} while (0)
25454 +
25455 +
25456 +extern unsigned long thread_saved_pc(struct task_struct *tsk);
25457 +
25458 +#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
25459 +#define KSTK_TOP(info) \
25460 +({ \
25461 + unsigned long *__ptr = (unsigned long *)(info); \
25462 + (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
25463 +})
25464 +
25465 +/*
25466 + * The below -8 is to reserve 8 bytes on top of the ring0 stack.
25467 + * This is necessary to guarantee that the entire "struct pt_regs"
25468 + * is accessible even if the CPU hasn't stored the SS/ESP registers
25469 + * on the stack (interrupt gate does not save these registers
25470 + * when switching to the same priv ring).
25471 + * Therefore beware: accessing the ss/esp fields of the
25472 + * "struct pt_regs" is possible, but they may contain the
25473 + * completely wrong values.
25474 + */
25475 +#define task_pt_regs(task) \
25476 +({ \
25477 + struct pt_regs *__regs__; \
25478 + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
25479 + __regs__ - 1; \
25480 +})
25481 +
25482 +#define KSTK_ESP(task) (task_pt_regs(task)->sp)
25483 +
25484 #else
25485 -# include "processor_64.h"
25486 +/*
25487 + * User space process size. 47bits minus one guard page.
25488 + */
25489 +#define TASK_SIZE64 (0x800000000000UL - 4096)
25490 +
25491 +/* This decides where the kernel will search for a free chunk of vm
25492 + * space during mmap's.
25493 + */
25494 +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
25495 + 0xc0000000 : 0xFFFFe000)
25496 +
25497 +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
25498 + IA32_PAGE_OFFSET : TASK_SIZE64)
25499 +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
25500 + IA32_PAGE_OFFSET : TASK_SIZE64)
25501 +
25502 +#define STACK_TOP TASK_SIZE
25503 +#define STACK_TOP_MAX TASK_SIZE64
25504 +
25505 +#define INIT_THREAD { \
25506 + .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
25507 +}
25508 +
25509 +#define INIT_TSS { \
25510 + .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
25511 +}
25512 +
25513 +#define start_thread(regs, new_rip, new_rsp) do { \
25514 + asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
25515 + load_gs_index(0); \
25516 + (regs)->ip = (new_rip); \
25517 + (regs)->sp = (new_rsp); \
25518 + write_pda(oldrsp, (new_rsp)); \
25519 + (regs)->cs = __USER_CS; \
25520 + (regs)->ss = __USER_DS; \
25521 + (regs)->flags = 0x200; \
25522 + set_fs(USER_DS); \
25523 +} while (0)
25524 +
25525 +/*
25526 + * Return saved PC of a blocked thread.
25527 + * What is this good for? It will always be the scheduler or ret_from_fork.
25528 + */
25529 +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
25530 +
25531 +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
25532 +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
25533 +#endif /* CONFIG_X86_64 */
25534 +
25535 +/* This decides where the kernel will search for a free chunk of vm
25536 + * space during mmap's.
25537 + */
25538 +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
25539 +
25540 +#define KSTK_EIP(task) (task_pt_regs(task)->ip)
25541 +
25542 #endif
25543 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/processor_32.h 2009-02-16 16:18:36.000000000 +0100
25544 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
25545 @@ -1,751 +0,0 @@
25546 -/*
25547 - * include/asm-i386/processor.h
25548 - *
25549 - * Copyright (C) 1994 Linus Torvalds
25550 - */
25551 -
25552 -#ifndef __ASM_I386_PROCESSOR_H
25553 -#define __ASM_I386_PROCESSOR_H
25554 -
25555 -#include <asm/vm86.h>
25556 -#include <asm/math_emu.h>
25557 -#include <asm/segment.h>
25558 -#include <asm/page.h>
25559 -#include <asm/types.h>
25560 -#include <asm/sigcontext.h>
25561 -#include <asm/cpufeature.h>
25562 -#include <asm/msr.h>
25563 -#include <asm/system.h>
25564 -#include <linux/cache.h>
25565 -#include <linux/threads.h>
25566 -#include <asm/percpu.h>
25567 -#include <linux/cpumask.h>
25568 -#include <linux/init.h>
25569 -#include <asm/processor-flags.h>
25570 -#include <xen/interface/physdev.h>
25571 -
25572 -/* flag for disabling the tsc */
25573 -#define tsc_disable 0
25574 -
25575 -struct desc_struct {
25576 - unsigned long a,b;
25577 -};
25578 -
25579 -#define desc_empty(desc) \
25580 - (!((desc)->a | (desc)->b))
25581 -
25582 -#define desc_equal(desc1, desc2) \
25583 - (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
25584 -/*
25585 - * Default implementation of macro that returns current
25586 - * instruction pointer ("program counter").
25587 - */
25588 -#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
25589 -
25590 -/*
25591 - * CPU type and hardware bug flags. Kept separately for each CPU.
25592 - * Members of this structure are referenced in head.S, so think twice
25593 - * before touching them. [mj]
25594 - */
25595 -
25596 -struct cpuinfo_x86 {
25597 - __u8 x86; /* CPU family */
25598 - __u8 x86_vendor; /* CPU vendor */
25599 - __u8 x86_model;
25600 - __u8 x86_mask;
25601 - char wp_works_ok; /* It doesn't on 386's */
25602 - char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
25603 - char hard_math;
25604 - char rfu;
25605 - int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
25606 - unsigned long x86_capability[NCAPINTS];
25607 - char x86_vendor_id[16];
25608 - char x86_model_id[64];
25609 - int x86_cache_size; /* in KB - valid for CPUS which support this
25610 - call */
25611 - int x86_cache_alignment; /* In bytes */
25612 - char fdiv_bug;
25613 - char f00f_bug;
25614 - char coma_bug;
25615 - char pad0;
25616 - int x86_power;
25617 - unsigned long loops_per_jiffy;
25618 -#ifdef CONFIG_SMP
25619 - cpumask_t llc_shared_map; /* cpus sharing the last level cache */
25620 -#endif
25621 - unsigned char x86_max_cores; /* cpuid returned max cores value */
25622 - unsigned char apicid;
25623 - unsigned short x86_clflush_size;
25624 -#ifdef CONFIG_SMP
25625 - unsigned char booted_cores; /* number of cores as seen by OS */
25626 - __u8 phys_proc_id; /* Physical processor id. */
25627 - __u8 cpu_core_id; /* Core id */
25628 - __u8 cpu_index; /* index into per_cpu list */
25629 -#endif
25630 -} __attribute__((__aligned__(SMP_CACHE_BYTES)));
25631 -
25632 -#define X86_VENDOR_INTEL 0
25633 -#define X86_VENDOR_CYRIX 1
25634 -#define X86_VENDOR_AMD 2
25635 -#define X86_VENDOR_UMC 3
25636 -#define X86_VENDOR_NEXGEN 4
25637 -#define X86_VENDOR_CENTAUR 5
25638 -#define X86_VENDOR_TRANSMETA 7
25639 -#define X86_VENDOR_NSC 8
25640 -#define X86_VENDOR_NUM 9
25641 -#define X86_VENDOR_UNKNOWN 0xff
25642 -
25643 -/*
25644 - * capabilities of CPUs
25645 - */
25646 -
25647 -extern struct cpuinfo_x86 boot_cpu_data;
25648 -extern struct cpuinfo_x86 new_cpu_data;
25649 -#ifndef CONFIG_X86_NO_TSS
25650 -extern struct tss_struct doublefault_tss;
25651 -DECLARE_PER_CPU(struct tss_struct, init_tss);
25652 -#endif
25653 -
25654 -#ifdef CONFIG_SMP
25655 -DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
25656 -#define cpu_data(cpu) per_cpu(cpu_info, cpu)
25657 -#define current_cpu_data cpu_data(smp_processor_id())
25658 -#else
25659 -#define cpu_data(cpu) boot_cpu_data
25660 -#define current_cpu_data boot_cpu_data
25661 -#endif
25662 -
25663 -/*
25664 - * the following now lives in the per cpu area:
25665 - * extern int cpu_llc_id[NR_CPUS];
25666 - */
25667 -DECLARE_PER_CPU(u8, cpu_llc_id);
25668 -extern char ignore_fpu_irq;
25669 -
25670 -void __init cpu_detect(struct cpuinfo_x86 *c);
25671 -
25672 -extern void identify_boot_cpu(void);
25673 -extern void identify_secondary_cpu(struct cpuinfo_x86 *);
25674 -extern void print_cpu_info(struct cpuinfo_x86 *);
25675 -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
25676 -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
25677 -extern unsigned short num_cache_leaves;
25678 -
25679 -#ifdef CONFIG_X86_HT
25680 -extern void detect_ht(struct cpuinfo_x86 *c);
25681 -#else
25682 -static inline void detect_ht(struct cpuinfo_x86 *c) {}
25683 -#endif
25684 -
25685 -static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
25686 - unsigned int *ecx, unsigned int *edx)
25687 -{
25688 - /* ecx is often an input as well as an output. */
25689 - __asm__(XEN_CPUID
25690 - : "=a" (*eax),
25691 - "=b" (*ebx),
25692 - "=c" (*ecx),
25693 - "=d" (*edx)
25694 - : "0" (*eax), "2" (*ecx));
25695 -}
25696 -
25697 -#define load_cr3(pgdir) write_cr3(__pa(pgdir))
25698 -
25699 -/*
25700 - * Save the cr4 feature set we're using (ie
25701 - * Pentium 4MB enable and PPro Global page
25702 - * enable), so that any CPU's that boot up
25703 - * after us can get the correct flags.
25704 - */
25705 -extern unsigned long mmu_cr4_features;
25706 -
25707 -static inline void set_in_cr4 (unsigned long mask)
25708 -{
25709 - unsigned cr4;
25710 - mmu_cr4_features |= mask;
25711 - cr4 = read_cr4();
25712 - cr4 |= mask;
25713 - write_cr4(cr4);
25714 -}
25715 -
25716 -static inline void clear_in_cr4 (unsigned long mask)
25717 -{
25718 - unsigned cr4;
25719 - mmu_cr4_features &= ~mask;
25720 - cr4 = read_cr4();
25721 - cr4 &= ~mask;
25722 - write_cr4(cr4);
25723 -}
25724 -
25725 -/* Stop speculative execution */
25726 -static inline void sync_core(void)
25727 -{
25728 - int tmp;
25729 - asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
25730 -}
25731 -
25732 -static inline void __monitor(const void *eax, unsigned long ecx,
25733 - unsigned long edx)
25734 -{
25735 - /* "monitor %eax,%ecx,%edx;" */
25736 - asm volatile(
25737 - ".byte 0x0f,0x01,0xc8;"
25738 - : :"a" (eax), "c" (ecx), "d"(edx));
25739 -}
25740 -
25741 -static inline void __mwait(unsigned long eax, unsigned long ecx)
25742 -{
25743 - /* "mwait %eax,%ecx;" */
25744 - asm volatile(
25745 - ".byte 0x0f,0x01,0xc9;"
25746 - : :"a" (eax), "c" (ecx));
25747 -}
25748 -
25749 -extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
25750 -
25751 -/* from system description table in BIOS. Mostly for MCA use, but
25752 -others may find it useful. */
25753 -extern unsigned int machine_id;
25754 -extern unsigned int machine_submodel_id;
25755 -extern unsigned int BIOS_revision;
25756 -extern unsigned int mca_pentium_flag;
25757 -
25758 -/* Boot loader type from the setup header */
25759 -extern int bootloader_type;
25760 -
25761 -/*
25762 - * User space process size: 3GB (default).
25763 - */
25764 -#define TASK_SIZE (PAGE_OFFSET)
25765 -
25766 -/* This decides where the kernel will search for a free chunk of vm
25767 - * space during mmap's.
25768 - */
25769 -#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
25770 -
25771 -#define HAVE_ARCH_PICK_MMAP_LAYOUT
25772 -
25773 -extern void hard_disable_TSC(void);
25774 -extern void disable_TSC(void);
25775 -extern void hard_enable_TSC(void);
25776 -
25777 -/*
25778 - * Size of io_bitmap.
25779 - */
25780 -#define IO_BITMAP_BITS 65536
25781 -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
25782 -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
25783 -#ifndef CONFIG_X86_NO_TSS
25784 -#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
25785 -#endif
25786 -#define INVALID_IO_BITMAP_OFFSET 0x8000
25787 -#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
25788 -
25789 -struct i387_fsave_struct {
25790 - long cwd;
25791 - long swd;
25792 - long twd;
25793 - long fip;
25794 - long fcs;
25795 - long foo;
25796 - long fos;
25797 - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
25798 - long status; /* software status information */
25799 -};
25800 -
25801 -struct i387_fxsave_struct {
25802 - unsigned short cwd;
25803 - unsigned short swd;
25804 - unsigned short twd;
25805 - unsigned short fop;
25806 - long fip;
25807 - long fcs;
25808 - long foo;
25809 - long fos;
25810 - long mxcsr;
25811 - long mxcsr_mask;
25812 - long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
25813 - long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
25814 - long padding[56];
25815 -} __attribute__ ((aligned (16)));
25816 -
25817 -struct i387_soft_struct {
25818 - long cwd;
25819 - long swd;
25820 - long twd;
25821 - long fip;
25822 - long fcs;
25823 - long foo;
25824 - long fos;
25825 - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
25826 - unsigned char ftop, changed, lookahead, no_update, rm, alimit;
25827 - struct info *info;
25828 - unsigned long entry_eip;
25829 -};
25830 -
25831 -union i387_union {
25832 - struct i387_fsave_struct fsave;
25833 - struct i387_fxsave_struct fxsave;
25834 - struct i387_soft_struct soft;
25835 -};
25836 -
25837 -typedef struct {
25838 - unsigned long seg;
25839 -} mm_segment_t;
25840 -
25841 -struct thread_struct;
25842 -
25843 -#ifndef CONFIG_X86_NO_TSS
25844 -/* This is the TSS defined by the hardware. */
25845 -struct i386_hw_tss {
25846 - unsigned short back_link,__blh;
25847 - unsigned long esp0;
25848 - unsigned short ss0,__ss0h;
25849 - unsigned long esp1;
25850 - unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
25851 - unsigned long esp2;
25852 - unsigned short ss2,__ss2h;
25853 - unsigned long __cr3;
25854 - unsigned long eip;
25855 - unsigned long eflags;
25856 - unsigned long eax,ecx,edx,ebx;
25857 - unsigned long esp;
25858 - unsigned long ebp;
25859 - unsigned long esi;
25860 - unsigned long edi;
25861 - unsigned short es, __esh;
25862 - unsigned short cs, __csh;
25863 - unsigned short ss, __ssh;
25864 - unsigned short ds, __dsh;
25865 - unsigned short fs, __fsh;
25866 - unsigned short gs, __gsh;
25867 - unsigned short ldt, __ldth;
25868 - unsigned short trace, io_bitmap_base;
25869 -} __attribute__((packed));
25870 -
25871 -struct tss_struct {
25872 - struct i386_hw_tss x86_tss;
25873 -
25874 - /*
25875 - * The extra 1 is there because the CPU will access an
25876 - * additional byte beyond the end of the IO permission
25877 - * bitmap. The extra byte must be all 1 bits, and must
25878 - * be within the limit.
25879 - */
25880 - unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
25881 - /*
25882 - * Cache the current maximum and the last task that used the bitmap:
25883 - */
25884 - unsigned long io_bitmap_max;
25885 - struct thread_struct *io_bitmap_owner;
25886 - /*
25887 - * pads the TSS to be cacheline-aligned (size is 0x100)
25888 - */
25889 - unsigned long __cacheline_filler[35];
25890 - /*
25891 - * .. and then another 0x100 bytes for emergency kernel stack
25892 - */
25893 - unsigned long stack[64];
25894 -} __attribute__((packed));
25895 -#endif
25896 -
25897 -#define ARCH_MIN_TASKALIGN 16
25898 -
25899 -struct thread_struct {
25900 -/* cached TLS descriptors. */
25901 - struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
25902 - unsigned long esp0;
25903 - unsigned long sysenter_cs;
25904 - unsigned long eip;
25905 - unsigned long esp;
25906 - unsigned long fs;
25907 - unsigned long gs;
25908 -/* Hardware debugging registers */
25909 - unsigned long debugreg[8]; /* %%db0-7 debug registers */
25910 -/* fault info */
25911 - unsigned long cr2, trap_no, error_code;
25912 -/* floating point info */
25913 - union i387_union i387;
25914 -/* virtual 86 mode info */
25915 - struct vm86_struct __user * vm86_info;
25916 - unsigned long screen_bitmap;
25917 - unsigned long v86flags, v86mask, saved_esp0;
25918 - unsigned int saved_fs, saved_gs;
25919 -/* IO permissions */
25920 - unsigned long *io_bitmap_ptr;
25921 - unsigned long iopl;
25922 -/* max allowed port in the bitmap, in bytes: */
25923 - unsigned long io_bitmap_max;
25924 -};
25925 -
25926 -#define INIT_THREAD { \
25927 - .esp0 = sizeof(init_stack) + (long)&init_stack, \
25928 - .vm86_info = NULL, \
25929 - .sysenter_cs = __KERNEL_CS, \
25930 - .io_bitmap_ptr = NULL, \
25931 - .fs = __KERNEL_PERCPU, \
25932 -}
25933 -
25934 -/*
25935 - * Note that the .io_bitmap member must be extra-big. This is because
25936 - * the CPU will access an additional byte beyond the end of the IO
25937 - * permission bitmap. The extra byte must be all 1 bits, and must
25938 - * be within the limit.
25939 - */
25940 -#define INIT_TSS { \
25941 - .x86_tss = { \
25942 - .esp0 = sizeof(init_stack) + (long)&init_stack, \
25943 - .ss0 = __KERNEL_DS, \
25944 - .ss1 = __KERNEL_CS, \
25945 - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
25946 - }, \
25947 - .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
25948 -}
25949 -
25950 -#define start_thread(regs, new_eip, new_esp) do { \
25951 - __asm__("movl %0,%%gs": :"r" (0)); \
25952 - regs->xfs = 0; \
25953 - set_fs(USER_DS); \
25954 - regs->xds = __USER_DS; \
25955 - regs->xes = __USER_DS; \
25956 - regs->xss = __USER_DS; \
25957 - regs->xcs = __USER_CS; \
25958 - regs->eip = new_eip; \
25959 - regs->esp = new_esp; \
25960 -} while (0)
25961 -
25962 -/* Forward declaration, a strange C thing */
25963 -struct task_struct;
25964 -struct mm_struct;
25965 -
25966 -/* Free all resources held by a thread. */
25967 -extern void release_thread(struct task_struct *);
25968 -
25969 -/* Prepare to copy thread state - unlazy all lazy status */
25970 -extern void prepare_to_copy(struct task_struct *tsk);
25971 -
25972 -/*
25973 - * create a kernel thread without removing it from tasklists
25974 - */
25975 -extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
25976 -
25977 -extern unsigned long thread_saved_pc(struct task_struct *tsk);
25978 -void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack);
25979 -
25980 -unsigned long get_wchan(struct task_struct *p);
25981 -
25982 -#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
25983 -#define KSTK_TOP(info) \
25984 -({ \
25985 - unsigned long *__ptr = (unsigned long *)(info); \
25986 - (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
25987 -})
25988 -
25989 -/*
25990 - * The below -8 is to reserve 8 bytes on top of the ring0 stack.
25991 - * This is necessary to guarantee that the entire "struct pt_regs"
25992 - * is accessable even if the CPU haven't stored the SS/ESP registers
25993 - * on the stack (interrupt gate does not save these registers
25994 - * when switching to the same priv ring).
25995 - * Therefore beware: accessing the xss/esp fields of the
25996 - * "struct pt_regs" is possible, but they may contain the
25997 - * completely wrong values.
25998 - */
25999 -#define task_pt_regs(task) \
26000 -({ \
26001 - struct pt_regs *__regs__; \
26002 - __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
26003 - __regs__ - 1; \
26004 -})
26005 -
26006 -#define KSTK_EIP(task) (task_pt_regs(task)->eip)
26007 -#define KSTK_ESP(task) (task_pt_regs(task)->esp)
26008 -
26009 -
26010 -struct microcode_header {
26011 - unsigned int hdrver;
26012 - unsigned int rev;
26013 - unsigned int date;
26014 - unsigned int sig;
26015 - unsigned int cksum;
26016 - unsigned int ldrver;
26017 - unsigned int pf;
26018 - unsigned int datasize;
26019 - unsigned int totalsize;
26020 - unsigned int reserved[3];
26021 -};
26022 -
26023 -struct microcode {
26024 - struct microcode_header hdr;
26025 - unsigned int bits[0];
26026 -};
26027 -
26028 -typedef struct microcode microcode_t;
26029 -typedef struct microcode_header microcode_header_t;
26030 -
26031 -/* microcode format is extended from prescott processors */
26032 -struct extended_signature {
26033 - unsigned int sig;
26034 - unsigned int pf;
26035 - unsigned int cksum;
26036 -};
26037 -
26038 -struct extended_sigtable {
26039 - unsigned int count;
26040 - unsigned int cksum;
26041 - unsigned int reserved[3];
26042 - struct extended_signature sigs[0];
26043 -};
26044 -
26045 -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
26046 -static inline void rep_nop(void)
26047 -{
26048 - __asm__ __volatile__("rep;nop": : :"memory");
26049 -}
26050 -
26051 -#define cpu_relax() rep_nop()
26052 -
26053 -#ifndef CONFIG_X86_NO_TSS
26054 -static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread)
26055 -{
26056 - tss->x86_tss.esp0 = thread->esp0;
26057 - /* This can only happen when SEP is enabled, no need to test "SEP"arately */
26058 - if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
26059 - tss->x86_tss.ss1 = thread->sysenter_cs;
26060 - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
26061 - }
26062 -}
26063 -#else
26064 -#define xen_load_esp0(tss, thread) do { \
26065 - if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
26066 - BUG(); \
26067 -} while (0)
26068 -#endif
26069 -
26070 -
26071 -static inline unsigned long xen_get_debugreg(int regno)
26072 -{
26073 - return HYPERVISOR_get_debugreg(regno);
26074 -}
26075 -
26076 -static inline void xen_set_debugreg(int regno, unsigned long value)
26077 -{
26078 - WARN_ON(HYPERVISOR_set_debugreg(regno, value));
26079 -}
26080 -
26081 -/*
26082 - * Set IOPL bits in EFLAGS from given mask
26083 - */
26084 -static inline void xen_set_iopl_mask(unsigned mask)
26085 -{
26086 - struct physdev_set_iopl set_iopl;
26087 -
26088 - /* Force the change at ring 0. */
26089 - set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
26090 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
26091 -}
26092 -
26093 -
26094 -#define paravirt_enabled() 0
26095 -#define __cpuid xen_cpuid
26096 -
26097 -#define load_esp0 xen_load_esp0
26098 -
26099 -/*
26100 - * These special macros can be used to get or set a debugging register
26101 - */
26102 -#define get_debugreg(var, register) \
26103 - (var) = xen_get_debugreg(register)
26104 -#define set_debugreg(value, register) \
26105 - xen_set_debugreg(register, value)
26106 -
26107 -#define set_iopl_mask xen_set_iopl_mask
26108 -
26109 -/*
26110 - * Generic CPUID function
26111 - * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
26112 - * resulting in stale register contents being returned.
26113 - */
26114 -static inline void cpuid(unsigned int op,
26115 - unsigned int *eax, unsigned int *ebx,
26116 - unsigned int *ecx, unsigned int *edx)
26117 -{
26118 - *eax = op;
26119 - *ecx = 0;
26120 - __cpuid(eax, ebx, ecx, edx);
26121 -}
26122 -
26123 -/* Some CPUID calls want 'count' to be placed in ecx */
26124 -static inline void cpuid_count(unsigned int op, int count,
26125 - unsigned int *eax, unsigned int *ebx,
26126 - unsigned int *ecx, unsigned int *edx)
26127 -{
26128 - *eax = op;
26129 - *ecx = count;
26130 - __cpuid(eax, ebx, ecx, edx);
26131 -}
26132 -
26133 -/*
26134 - * CPUID functions returning a single datum
26135 - */
26136 -static inline unsigned int cpuid_eax(unsigned int op)
26137 -{
26138 - unsigned int eax, ebx, ecx, edx;
26139 -
26140 - cpuid(op, &eax, &ebx, &ecx, &edx);
26141 - return eax;
26142 -}
26143 -static inline unsigned int cpuid_ebx(unsigned int op)
26144 -{
26145 - unsigned int eax, ebx, ecx, edx;
26146 -
26147 - cpuid(op, &eax, &ebx, &ecx, &edx);
26148 - return ebx;
26149 -}
26150 -static inline unsigned int cpuid_ecx(unsigned int op)
26151 -{
26152 - unsigned int eax, ebx, ecx, edx;
26153 -
26154 - cpuid(op, &eax, &ebx, &ecx, &edx);
26155 - return ecx;
26156 -}
26157 -static inline unsigned int cpuid_edx(unsigned int op)
26158 -{
26159 - unsigned int eax, ebx, ecx, edx;
26160 -
26161 - cpuid(op, &eax, &ebx, &ecx, &edx);
26162 - return edx;
26163 -}
26164 -
26165 -/* generic versions from gas */
26166 -#define GENERIC_NOP1 ".byte 0x90\n"
26167 -#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
26168 -#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
26169 -#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
26170 -#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
26171 -#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
26172 -#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
26173 -#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
26174 -
26175 -/* Opteron nops */
26176 -#define K8_NOP1 GENERIC_NOP1
26177 -#define K8_NOP2 ".byte 0x66,0x90\n"
26178 -#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
26179 -#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
26180 -#define K8_NOP5 K8_NOP3 K8_NOP2
26181 -#define K8_NOP6 K8_NOP3 K8_NOP3
26182 -#define K8_NOP7 K8_NOP4 K8_NOP3
26183 -#define K8_NOP8 K8_NOP4 K8_NOP4
26184 -
26185 -/* K7 nops */
26186 -/* uses eax dependencies (arbitary choice) */
26187 -#define K7_NOP1 GENERIC_NOP1
26188 -#define K7_NOP2 ".byte 0x8b,0xc0\n"
26189 -#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
26190 -#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
26191 -#define K7_NOP5 K7_NOP4 ASM_NOP1
26192 -#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
26193 -#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
26194 -#define K7_NOP8 K7_NOP7 ASM_NOP1
26195 -
26196 -/* P6 nops */
26197 -/* uses eax dependencies (Intel-recommended choice) */
26198 -#define P6_NOP1 GENERIC_NOP1
26199 -#define P6_NOP2 ".byte 0x66,0x90\n"
26200 -#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
26201 -#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
26202 -#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
26203 -#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
26204 -#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
26205 -#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
26206 -
26207 -#ifdef CONFIG_MK8
26208 -#define ASM_NOP1 K8_NOP1
26209 -#define ASM_NOP2 K8_NOP2
26210 -#define ASM_NOP3 K8_NOP3
26211 -#define ASM_NOP4 K8_NOP4
26212 -#define ASM_NOP5 K8_NOP5
26213 -#define ASM_NOP6 K8_NOP6
26214 -#define ASM_NOP7 K8_NOP7
26215 -#define ASM_NOP8 K8_NOP8
26216 -#elif defined(CONFIG_MK7)
26217 -#define ASM_NOP1 K7_NOP1
26218 -#define ASM_NOP2 K7_NOP2
26219 -#define ASM_NOP3 K7_NOP3
26220 -#define ASM_NOP4 K7_NOP4
26221 -#define ASM_NOP5 K7_NOP5
26222 -#define ASM_NOP6 K7_NOP6
26223 -#define ASM_NOP7 K7_NOP7
26224 -#define ASM_NOP8 K7_NOP8
26225 -#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \
26226 - defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \
26227 - defined(CONFIG_MCORE2) || defined(CONFIG_PENTIUM4)
26228 -#define ASM_NOP1 P6_NOP1
26229 -#define ASM_NOP2 P6_NOP2
26230 -#define ASM_NOP3 P6_NOP3
26231 -#define ASM_NOP4 P6_NOP4
26232 -#define ASM_NOP5 P6_NOP5
26233 -#define ASM_NOP6 P6_NOP6
26234 -#define ASM_NOP7 P6_NOP7
26235 -#define ASM_NOP8 P6_NOP8
26236 -#else
26237 -#define ASM_NOP1 GENERIC_NOP1
26238 -#define ASM_NOP2 GENERIC_NOP2
26239 -#define ASM_NOP3 GENERIC_NOP3
26240 -#define ASM_NOP4 GENERIC_NOP4
26241 -#define ASM_NOP5 GENERIC_NOP5
26242 -#define ASM_NOP6 GENERIC_NOP6
26243 -#define ASM_NOP7 GENERIC_NOP7
26244 -#define ASM_NOP8 GENERIC_NOP8
26245 -#endif
26246 -
26247 -#define ASM_NOP_MAX 8
26248 -
26249 -/* Prefetch instructions for Pentium III and AMD Athlon */
26250 -/* It's not worth to care about 3dnow! prefetches for the K6
26251 - because they are microcoded there and very slow.
26252 - However we don't do prefetches for pre XP Athlons currently
26253 - That should be fixed. */
26254 -#define ARCH_HAS_PREFETCH
26255 -static inline void prefetch(const void *x)
26256 -{
26257 - alternative_input(ASM_NOP4,
26258 - "prefetchnta (%1)",
26259 - X86_FEATURE_XMM,
26260 - "r" (x));
26261 -}
26262 -
26263 -#define ARCH_HAS_PREFETCH
26264 -#define ARCH_HAS_PREFETCHW
26265 -#define ARCH_HAS_SPINLOCK_PREFETCH
26266 -
26267 -/* 3dnow! prefetch to get an exclusive cache line. Useful for
26268 - spinlocks to avoid one state transition in the cache coherency protocol. */
26269 -static inline void prefetchw(const void *x)
26270 -{
26271 - alternative_input(ASM_NOP4,
26272 - "prefetchw (%1)",
26273 - X86_FEATURE_3DNOW,
26274 - "r" (x));
26275 -}
26276 -#define spin_lock_prefetch(x) prefetchw(x)
26277 -
26278 -extern void select_idle_routine(const struct cpuinfo_x86 *c);
26279 -
26280 -#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
26281 -
26282 -extern unsigned long boot_option_idle_override;
26283 -extern void enable_sep_cpu(void);
26284 -extern int sysenter_setup(void);
26285 -
26286 -/* Defined in head.S */
26287 -extern struct Xgt_desc_struct early_gdt_descr;
26288 -
26289 -extern void cpu_set_gdt(int);
26290 -extern void switch_to_new_gdt(void);
26291 -extern void cpu_init(void);
26292 -extern void init_gdt(int cpu);
26293 -
26294 -extern int force_mwait;
26295 -
26296 -#endif /* __ASM_I386_PROCESSOR_H */
26297 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/processor_64.h 2009-02-16 16:18:36.000000000 +0100
26298 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26299 @@ -1,461 +0,0 @@
26300 -/*
26301 - * include/asm-x86_64/processor.h
26302 - *
26303 - * Copyright (C) 1994 Linus Torvalds
26304 - */
26305 -
26306 -#ifndef __ASM_X86_64_PROCESSOR_H
26307 -#define __ASM_X86_64_PROCESSOR_H
26308 -
26309 -#include <asm/segment.h>
26310 -#include <asm/page.h>
26311 -#include <asm/types.h>
26312 -#include <asm/sigcontext.h>
26313 -#include <asm/cpufeature.h>
26314 -#include <linux/threads.h>
26315 -#include <asm/msr.h>
26316 -#include <asm/current.h>
26317 -#include <asm/system.h>
26318 -#include <asm/mmsegment.h>
26319 -#include <asm/percpu.h>
26320 -#include <linux/personality.h>
26321 -#include <linux/cpumask.h>
26322 -#include <asm/processor-flags.h>
26323 -
26324 -#define TF_MASK 0x00000100
26325 -#define IF_MASK 0x00000200
26326 -#define IOPL_MASK 0x00003000
26327 -#define NT_MASK 0x00004000
26328 -#define VM_MASK 0x00020000
26329 -#define AC_MASK 0x00040000
26330 -#define VIF_MASK 0x00080000 /* virtual interrupt flag */
26331 -#define VIP_MASK 0x00100000 /* virtual interrupt pending */
26332 -#define ID_MASK 0x00200000
26333 -
26334 -#define desc_empty(desc) \
26335 - (!((desc)->a | (desc)->b))
26336 -
26337 -#define desc_equal(desc1, desc2) \
26338 - (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
26339 -
26340 -/*
26341 - * Default implementation of macro that returns current
26342 - * instruction pointer ("program counter").
26343 - */
26344 -#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
26345 -
26346 -/*
26347 - * CPU type and hardware bug flags. Kept separately for each CPU.
26348 - */
26349 -
26350 -struct cpuinfo_x86 {
26351 - __u8 x86; /* CPU family */
26352 - __u8 x86_vendor; /* CPU vendor */
26353 - __u8 x86_model;
26354 - __u8 x86_mask;
26355 - int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
26356 - __u32 x86_capability[NCAPINTS];
26357 - char x86_vendor_id[16];
26358 - char x86_model_id[64];
26359 - int x86_cache_size; /* in KB */
26360 - int x86_clflush_size;
26361 - int x86_cache_alignment;
26362 - int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined(in pages)*/
26363 - __u8 x86_virt_bits, x86_phys_bits;
26364 - __u8 x86_max_cores; /* cpuid returned max cores value */
26365 - __u32 x86_power;
26366 - __u32 extended_cpuid_level; /* Max extended CPUID function supported */
26367 - unsigned long loops_per_jiffy;
26368 -#ifdef CONFIG_SMP
26369 - cpumask_t llc_shared_map; /* cpus sharing the last level cache */
26370 -#endif
26371 - __u8 apicid;
26372 -#ifdef CONFIG_SMP
26373 - __u8 booted_cores; /* number of cores as seen by OS */
26374 - __u8 phys_proc_id; /* Physical Processor id. */
26375 - __u8 cpu_core_id; /* Core id. */
26376 - __u8 cpu_index; /* index into per_cpu list */
26377 -#endif
26378 -} ____cacheline_aligned;
26379 -
26380 -#define X86_VENDOR_INTEL 0
26381 -#define X86_VENDOR_CYRIX 1
26382 -#define X86_VENDOR_AMD 2
26383 -#define X86_VENDOR_UMC 3
26384 -#define X86_VENDOR_NEXGEN 4
26385 -#define X86_VENDOR_CENTAUR 5
26386 -#define X86_VENDOR_TRANSMETA 7
26387 -#define X86_VENDOR_NUM 8
26388 -#define X86_VENDOR_UNKNOWN 0xff
26389 -
26390 -#ifdef CONFIG_SMP
26391 -DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
26392 -#define cpu_data(cpu) per_cpu(cpu_info, cpu)
26393 -#define current_cpu_data cpu_data(smp_processor_id())
26394 -#else
26395 -#define cpu_data(cpu) boot_cpu_data
26396 -#define current_cpu_data boot_cpu_data
26397 -#endif
26398 -
26399 -extern char ignore_irq13;
26400 -
26401 -extern void identify_cpu(struct cpuinfo_x86 *);
26402 -extern void print_cpu_info(struct cpuinfo_x86 *);
26403 -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
26404 -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
26405 -extern unsigned short num_cache_leaves;
26406 -
26407 -/*
26408 - * Save the cr4 feature set we're using (ie
26409 - * Pentium 4MB enable and PPro Global page
26410 - * enable), so that any CPU's that boot up
26411 - * after us can get the correct flags.
26412 - */
26413 -extern unsigned long mmu_cr4_features;
26414 -
26415 -static inline void set_in_cr4 (unsigned long mask)
26416 -{
26417 - mmu_cr4_features |= mask;
26418 - __asm__("movq %%cr4,%%rax\n\t"
26419 - "orq %0,%%rax\n\t"
26420 - "movq %%rax,%%cr4\n"
26421 - : : "irg" (mask)
26422 - :"ax");
26423 -}
26424 -
26425 -static inline void clear_in_cr4 (unsigned long mask)
26426 -{
26427 - mmu_cr4_features &= ~mask;
26428 - __asm__("movq %%cr4,%%rax\n\t"
26429 - "andq %0,%%rax\n\t"
26430 - "movq %%rax,%%cr4\n"
26431 - : : "irg" (~mask)
26432 - :"ax");
26433 -}
26434 -
26435 -
26436 -/*
26437 - * User space process size. 47bits minus one guard page.
26438 - */
26439 -#define TASK_SIZE64 (0x800000000000UL - 4096)
26440 -
26441 -/* This decides where the kernel will search for a free chunk of vm
26442 - * space during mmap's.
26443 - */
26444 -#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
26445 -
26446 -#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
26447 -#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
26448 -
26449 -#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3)
26450 -
26451 -/*
26452 - * Size of io_bitmap.
26453 - */
26454 -#define IO_BITMAP_BITS 65536
26455 -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
26456 -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
26457 -#ifndef CONFIG_X86_NO_TSS
26458 -#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
26459 -#endif
26460 -#define INVALID_IO_BITMAP_OFFSET 0x8000
26461 -
26462 -struct i387_fxsave_struct {
26463 - u16 cwd;
26464 - u16 swd;
26465 - u16 twd;
26466 - u16 fop;
26467 - u64 rip;
26468 - u64 rdp;
26469 - u32 mxcsr;
26470 - u32 mxcsr_mask;
26471 - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
26472 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
26473 - u32 padding[24];
26474 -} __attribute__ ((aligned (16)));
26475 -
26476 -union i387_union {
26477 - struct i387_fxsave_struct fxsave;
26478 -};
26479 -
26480 -#ifndef CONFIG_X86_NO_TSS
26481 -struct tss_struct {
26482 - u32 reserved1;
26483 - u64 rsp0;
26484 - u64 rsp1;
26485 - u64 rsp2;
26486 - u64 reserved2;
26487 - u64 ist[7];
26488 - u32 reserved3;
26489 - u32 reserved4;
26490 - u16 reserved5;
26491 - u16 io_bitmap_base;
26492 - /*
26493 - * The extra 1 is there because the CPU will access an
26494 - * additional byte beyond the end of the IO permission
26495 - * bitmap. The extra byte must be all 1 bits, and must
26496 - * be within the limit. Thus we have:
26497 - *
26498 - * 128 bytes, the bitmap itself, for ports 0..0x3ff
26499 - * 8 bytes, for an extra "long" of ~0UL
26500 - */
26501 - unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
26502 -} __attribute__((packed)) ____cacheline_aligned;
26503 -
26504 -DECLARE_PER_CPU(struct tss_struct,init_tss);
26505 -#endif
26506 -
26507 -
26508 -extern struct cpuinfo_x86 boot_cpu_data;
26509 -#ifndef CONFIG_X86_NO_TSS
26510 -/* Save the original ist values for checking stack pointers during debugging */
26511 -struct orig_ist {
26512 - unsigned long ist[7];
26513 -};
26514 -DECLARE_PER_CPU(struct orig_ist, orig_ist);
26515 -#endif
26516 -
26517 -#ifdef CONFIG_X86_VSMP
26518 -#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
26519 -#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
26520 -#else
26521 -#define ARCH_MIN_TASKALIGN 16
26522 -#define ARCH_MIN_MMSTRUCT_ALIGN 0
26523 -#endif
26524 -
26525 -struct thread_struct {
26526 - unsigned long rsp0;
26527 - unsigned long rsp;
26528 - unsigned long userrsp; /* Copy from PDA */
26529 - unsigned long fs;
26530 - unsigned long gs;
26531 - unsigned short es, ds, fsindex, gsindex;
26532 -/* Hardware debugging registers */
26533 - unsigned long debugreg0;
26534 - unsigned long debugreg1;
26535 - unsigned long debugreg2;
26536 - unsigned long debugreg3;
26537 - unsigned long debugreg6;
26538 - unsigned long debugreg7;
26539 -/* fault info */
26540 - unsigned long cr2, trap_no, error_code;
26541 -/* floating point info */
26542 - union i387_union i387 __attribute__((aligned(16)));
26543 -/* IO permissions. the bitmap could be moved into the GDT, that would make
26544 - switch faster for a limited number of ioperm using tasks. -AK */
26545 - int ioperm;
26546 - unsigned long *io_bitmap_ptr;
26547 - unsigned io_bitmap_max;
26548 -/* cached TLS descriptors. */
26549 - u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
26550 - unsigned int iopl;
26551 -} __attribute__((aligned(16)));
26552 -
26553 -#define INIT_THREAD { \
26554 - .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
26555 -}
26556 -
26557 -#ifndef CONFIG_X86_NO_TSS
26558 -#define INIT_TSS { \
26559 - .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
26560 -}
26561 -#endif
26562 -
26563 -#define INIT_MMAP \
26564 -{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
26565 -
26566 -#define start_thread(regs,new_rip,new_rsp) do { \
26567 - asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
26568 - load_gs_index(0); \
26569 - (regs)->rip = (new_rip); \
26570 - (regs)->rsp = (new_rsp); \
26571 - write_pda(oldrsp, (new_rsp)); \
26572 - (regs)->cs = __USER_CS; \
26573 - (regs)->ss = __USER_DS; \
26574 - (regs)->eflags = 0x200; \
26575 - set_fs(USER_DS); \
26576 -} while(0)
26577 -
26578 -#define get_debugreg(var, register) \
26579 - var = HYPERVISOR_get_debugreg(register)
26580 -#define set_debugreg(value, register) do { \
26581 - if (HYPERVISOR_set_debugreg(register, value)) \
26582 - BUG(); \
26583 -} while (0)
26584 -
26585 -struct task_struct;
26586 -struct mm_struct;
26587 -
26588 -/* Free all resources held by a thread. */
26589 -extern void release_thread(struct task_struct *);
26590 -
26591 -/* Prepare to copy thread state - unlazy all lazy status */
26592 -extern void prepare_to_copy(struct task_struct *tsk);
26593 -
26594 -/*
26595 - * create a kernel thread without removing it from tasklists
26596 - */
26597 -extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
26598 -
26599 -/*
26600 - * Return saved PC of a blocked thread.
26601 - * What is this good for? it will be always the scheduler or ret_from_fork.
26602 - */
26603 -#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
26604 -
26605 -extern unsigned long get_wchan(struct task_struct *p);
26606 -#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
26607 -#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
26608 -#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
26609 -
26610 -
26611 -struct microcode_header {
26612 - unsigned int hdrver;
26613 - unsigned int rev;
26614 - unsigned int date;
26615 - unsigned int sig;
26616 - unsigned int cksum;
26617 - unsigned int ldrver;
26618 - unsigned int pf;
26619 - unsigned int datasize;
26620 - unsigned int totalsize;
26621 - unsigned int reserved[3];
26622 -};
26623 -
26624 -struct microcode {
26625 - struct microcode_header hdr;
26626 - unsigned int bits[0];
26627 -};
26628 -
26629 -typedef struct microcode microcode_t;
26630 -typedef struct microcode_header microcode_header_t;
26631 -
26632 -/* microcode format is extended from prescott processors */
26633 -struct extended_signature {
26634 - unsigned int sig;
26635 - unsigned int pf;
26636 - unsigned int cksum;
26637 -};
26638 -
26639 -struct extended_sigtable {
26640 - unsigned int count;
26641 - unsigned int cksum;
26642 - unsigned int reserved[3];
26643 - struct extended_signature sigs[0];
26644 -};
26645 -
26646 -
26647 -#if defined(CONFIG_MPSC) || defined(CONFIG_MCORE2)
26648 -#define ASM_NOP1 P6_NOP1
26649 -#define ASM_NOP2 P6_NOP2
26650 -#define ASM_NOP3 P6_NOP3
26651 -#define ASM_NOP4 P6_NOP4
26652 -#define ASM_NOP5 P6_NOP5
26653 -#define ASM_NOP6 P6_NOP6
26654 -#define ASM_NOP7 P6_NOP7
26655 -#define ASM_NOP8 P6_NOP8
26656 -#else
26657 -#define ASM_NOP1 K8_NOP1
26658 -#define ASM_NOP2 K8_NOP2
26659 -#define ASM_NOP3 K8_NOP3
26660 -#define ASM_NOP4 K8_NOP4
26661 -#define ASM_NOP5 K8_NOP5
26662 -#define ASM_NOP6 K8_NOP6
26663 -#define ASM_NOP7 K8_NOP7
26664 -#define ASM_NOP8 K8_NOP8
26665 -#endif
26666 -
26667 -/* Opteron nops */
26668 -#define K8_NOP1 ".byte 0x90\n"
26669 -#define K8_NOP2 ".byte 0x66,0x90\n"
26670 -#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
26671 -#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
26672 -#define K8_NOP5 K8_NOP3 K8_NOP2
26673 -#define K8_NOP6 K8_NOP3 K8_NOP3
26674 -#define K8_NOP7 K8_NOP4 K8_NOP3
26675 -#define K8_NOP8 K8_NOP4 K8_NOP4
26676 -
26677 -/* P6 nops */
26678 -/* uses eax dependencies (Intel-recommended choice) */
26679 -#define P6_NOP1 ".byte 0x90\n"
26680 -#define P6_NOP2 ".byte 0x66,0x90\n"
26681 -#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
26682 -#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
26683 -#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
26684 -#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
26685 -#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
26686 -#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
26687 -
26688 -#define ASM_NOP_MAX 8
26689 -
26690 -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
26691 -static inline void rep_nop(void)
26692 -{
26693 - __asm__ __volatile__("rep;nop": : :"memory");
26694 -}
26695 -
26696 -/* Stop speculative execution */
26697 -static inline void sync_core(void)
26698 -{
26699 - int tmp;
26700 - asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
26701 -}
26702 -
26703 -#define ARCH_HAS_PREFETCHW 1
26704 -static inline void prefetchw(void *x)
26705 -{
26706 - alternative_input("prefetcht0 (%1)",
26707 - "prefetchw (%1)",
26708 - X86_FEATURE_3DNOW,
26709 - "r" (x));
26710 -}
26711 -
26712 -#define ARCH_HAS_SPINLOCK_PREFETCH 1
26713 -
26714 -#define spin_lock_prefetch(x) prefetchw(x)
26715 -
26716 -#define cpu_relax() rep_nop()
26717 -
26718 -static inline void __monitor(const void *eax, unsigned long ecx,
26719 - unsigned long edx)
26720 -{
26721 - /* "monitor %eax,%ecx,%edx;" */
26722 - asm volatile(
26723 - ".byte 0x0f,0x01,0xc8;"
26724 - : :"a" (eax), "c" (ecx), "d"(edx));
26725 -}
26726 -
26727 -static inline void __mwait(unsigned long eax, unsigned long ecx)
26728 -{
26729 - /* "mwait %eax,%ecx;" */
26730 - asm volatile(
26731 - ".byte 0x0f,0x01,0xc9;"
26732 - : :"a" (eax), "c" (ecx));
26733 -}
26734 -
26735 -static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
26736 -{
26737 - /* "mwait %eax,%ecx;" */
26738 - asm volatile(
26739 - "sti; .byte 0x0f,0x01,0xc9;"
26740 - : :"a" (eax), "c" (ecx));
26741 -}
26742 -
26743 -extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
26744 -
26745 -#define stack_current() \
26746 -({ \
26747 - struct thread_info *ti; \
26748 - asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
26749 - ti->task; \
26750 -})
26751 -
26752 -#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
26753 -
26754 -extern unsigned long boot_option_idle_override;
26755 -/* Boot loader type from the setup header */
26756 -extern int bootloader_type;
26757 -
26758 -#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
26759 -
26760 -#endif /* __ASM_X86_64_PROCESSOR_H */
26761 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/segment.h 2009-02-16 16:18:36.000000000 +0100
26762 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/segment.h 2009-03-16 16:33:40.000000000 +0100
26763 @@ -1,5 +1,204 @@
26764 +#ifndef _ASM_X86_SEGMENT_H_
26765 +#define _ASM_X86_SEGMENT_H_
26766 +
26767 +/* Simple and small GDT entries for booting only */
26768 +
26769 +#define GDT_ENTRY_BOOT_CS 2
26770 +#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
26771 +
26772 +#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
26773 +#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
26774 +
26775 +#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2)
26776 +#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8)
26777 +
26778 #ifdef CONFIG_X86_32
26779 -# include "segment_32.h"
26780 +/*
26781 + * The layout of the per-CPU GDT under Linux:
26782 + *
26783 + * 0 - null
26784 + * 1 - reserved
26785 + * 2 - reserved
26786 + * 3 - reserved
26787 + *
26788 + * 4 - unused <==== new cacheline
26789 + * 5 - unused
26790 + *
26791 + * ------- start of TLS (Thread-Local Storage) segments:
26792 + *
26793 + * 6 - TLS segment #1 [ glibc's TLS segment ]
26794 + * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
26795 + * 8 - TLS segment #3
26796 + * 9 - reserved
26797 + * 10 - reserved
26798 + * 11 - reserved
26799 + *
26800 + * ------- start of kernel segments:
26801 + *
26802 + * 12 - kernel code segment <==== new cacheline
26803 + * 13 - kernel data segment
26804 + * 14 - default user CS
26805 + * 15 - default user DS
26806 + * 16 - TSS
26807 + * 17 - LDT
26808 + * 18 - PNPBIOS support (16->32 gate)
26809 + * 19 - PNPBIOS support
26810 + * 20 - PNPBIOS support
26811 + * 21 - PNPBIOS support
26812 + * 22 - PNPBIOS support
26813 + * 23 - APM BIOS support
26814 + * 24 - APM BIOS support
26815 + * 25 - APM BIOS support
26816 + *
26817 + * 26 - ESPFIX small SS
26818 + * 27 - per-cpu [ offset to per-cpu data area ]
26819 + * 28 - unused
26820 + * 29 - unused
26821 + * 30 - unused
26822 + * 31 - TSS for double fault handler
26823 + */
26824 +#define GDT_ENTRY_TLS_MIN 6
26825 +#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
26826 +
26827 +#define GDT_ENTRY_DEFAULT_USER_CS 14
26828 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
26829 +
26830 +#define GDT_ENTRY_DEFAULT_USER_DS 15
26831 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
26832 +
26833 +#define GDT_ENTRY_KERNEL_BASE 12
26834 +
26835 +#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
26836 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
26837 +
26838 +#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
26839 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
26840 +
26841 +#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
26842 +#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
26843 +
26844 +#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
26845 +#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
26846 +
26847 +#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
26848 +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
26849 +
26850 +#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
26851 +#ifdef CONFIG_SMP
26852 +#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
26853 #else
26854 -# include "../../segment_64.h"
26855 +#define __KERNEL_PERCPU 0
26856 +#endif
26857 +
26858 +#define GDT_ENTRY_DOUBLEFAULT_TSS 31
26859 +
26860 +/*
26861 + * The GDT has 32 entries
26862 + */
26863 +#define GDT_ENTRIES 32
26864 +
26865 +/* The PnP BIOS entries in the GDT */
26866 +#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
26867 +#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
26868 +#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
26869 +#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
26870 +#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
26871 +
26872 +/* The PnP BIOS selectors */
26873 +#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
26874 +#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
26875 +#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
26876 +#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
26877 +#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
26878 +
26879 +/* Bottom two bits of selector give the ring privilege level */
26880 +#define SEGMENT_RPL_MASK 0x3
26881 +/* Bit 2 is table indicator (LDT/GDT) */
26882 +#define SEGMENT_TI_MASK 0x4
26883 +
26884 +/* User mode is privilege level 3 */
26885 +#define USER_RPL 0x3
26886 +/* LDT segment has TI set, GDT has it cleared */
26887 +#define SEGMENT_LDT 0x4
26888 +#define SEGMENT_GDT 0x0
26889 +
26890 +/*
26891 + * Matching rules for certain types of segments.
26892 + */
26893 +
26894 +/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
26895 +#define SEGMENT_IS_KERNEL_CODE(x) (((x) & ~3) == GDT_ENTRY_KERNEL_CS * 8 \
26896 + || ((x) & ~3) == (FLAT_KERNEL_CS & ~3))
26897 +
26898 +/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
26899 +#define SEGMENT_IS_FLAT_CODE(x) (((x) & ~0x13) == GDT_ENTRY_KERNEL_CS * 8 \
26900 + || ((x) & ~3) == (FLAT_KERNEL_CS & ~3) \
26901 + || ((x) & ~3) == (FLAT_USER_CS & ~3))
26902 +
26903 +/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
26904 +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
26905 +
26906 +#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
26907 +
26908 +#else
26909 +#include <asm/cache.h>
26910 +
26911 +#define __KERNEL_CS 0x10
26912 +#define __KERNEL_DS 0x18
26913 +
26914 +#define __KERNEL32_CS 0x08
26915 +
26916 +/*
26917 + * we cannot use the same code segment descriptor for user and kernel
26918 + * -- not even in the long flat mode, because of different DPL /kkeil
26919 + * The segment offset needs to contain a RPL. Grr. -AK
26920 + * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
26921 + */
26922 +
26923 +#define __USER32_CS 0x23 /* 4*8+3 */
26924 +#define __USER_DS 0x2b /* 5*8+3 */
26925 +#define __USER_CS 0x33 /* 6*8+3 */
26926 +#define __USER32_DS __USER_DS
26927 +
26928 +#define GDT_ENTRY_TSS 8 /* needs two entries */
26929 +#define GDT_ENTRY_LDT 10 /* needs two entries */
26930 +#define GDT_ENTRY_TLS_MIN 12
26931 +#define GDT_ENTRY_TLS_MAX 14
26932 +
26933 +#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
26934 +#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
26935 +
26936 +/* TLS indexes for 64bit - hardcoded in arch_prctl */
26937 +#define FS_TLS 0
26938 +#define GS_TLS 1
26939 +
26940 +#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
26941 +#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
26942 +
26943 +#define GDT_ENTRIES 16
26944 +
26945 +#endif
26946 +
26947 +/* User mode is privilege level 3 */
26948 +#define USER_RPL 0x3
26949 +/* LDT segment has TI set, GDT has it cleared */
26950 +#define SEGMENT_LDT 0x4
26951 +#define SEGMENT_GDT 0x0
26952 +
26953 +/* Bottom two bits of selector give the ring privilege level */
26954 +#define SEGMENT_RPL_MASK 0x3
26955 +/* Bit 2 is table indicator (LDT/GDT) */
26956 +#define SEGMENT_TI_MASK 0x4
26957 +
26958 +#define IDT_ENTRIES 256
26959 +#define GDT_SIZE (GDT_ENTRIES * 8)
26960 +#define GDT_ENTRY_TLS_ENTRIES 3
26961 +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
26962 +
26963 +#ifdef __KERNEL__
26964 +#ifndef __ASSEMBLY__
26965 +extern const char early_idt_handlers[IDT_ENTRIES][10];
26966 +#endif
26967 +#endif
26968 +
26969 #endif
26970 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/segment_32.h 2008-12-15 11:27:22.000000000 +0100
26971 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26972 @@ -1,150 +0,0 @@
26973 -#ifndef _ASM_SEGMENT_H
26974 -#define _ASM_SEGMENT_H
26975 -
26976 -/*
26977 - * The layout of the per-CPU GDT under Linux:
26978 - *
26979 - * 0 - null
26980 - * 1 - reserved
26981 - * 2 - reserved
26982 - * 3 - reserved
26983 - *
26984 - * 4 - unused <==== new cacheline
26985 - * 5 - unused
26986 - *
26987 - * ------- start of TLS (Thread-Local Storage) segments:
26988 - *
26989 - * 6 - TLS segment #1 [ glibc's TLS segment ]
26990 - * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
26991 - * 8 - TLS segment #3
26992 - * 9 - reserved
26993 - * 10 - reserved
26994 - * 11 - reserved
26995 - *
26996 - * ------- start of kernel segments:
26997 - *
26998 - * 12 - kernel code segment <==== new cacheline
26999 - * 13 - kernel data segment
27000 - * 14 - default user CS
27001 - * 15 - default user DS
27002 - * 16 - TSS
27003 - * 17 - LDT
27004 - * 18 - PNPBIOS support (16->32 gate)
27005 - * 19 - PNPBIOS support
27006 - * 20 - PNPBIOS support
27007 - * 21 - PNPBIOS support
27008 - * 22 - PNPBIOS support
27009 - * 23 - APM BIOS support
27010 - * 24 - APM BIOS support
27011 - * 25 - APM BIOS support
27012 - *
27013 - * 26 - ESPFIX small SS
27014 - * 27 - per-cpu [ offset to per-cpu data area ]
27015 - * 28 - unused
27016 - * 29 - unused
27017 - * 30 - unused
27018 - * 31 - TSS for double fault handler
27019 - */
27020 -#define GDT_ENTRY_TLS_ENTRIES 3
27021 -#define GDT_ENTRY_TLS_MIN 6
27022 -#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
27023 -
27024 -#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
27025 -
27026 -#define GDT_ENTRY_DEFAULT_USER_CS 14
27027 -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
27028 -
27029 -#define GDT_ENTRY_DEFAULT_USER_DS 15
27030 -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
27031 -
27032 -#define GDT_ENTRY_KERNEL_BASE 12
27033 -
27034 -#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
27035 -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
27036 -
27037 -#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
27038 -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
27039 -
27040 -#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
27041 -#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
27042 -
27043 -#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
27044 -#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
27045 -
27046 -#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
27047 -#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
27048 -
27049 -#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
27050 -#ifdef CONFIG_SMP
27051 -#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
27052 -#else
27053 -#define __KERNEL_PERCPU 0
27054 -#endif
27055 -
27056 -#define GDT_ENTRY_DOUBLEFAULT_TSS 31
27057 -
27058 -/*
27059 - * The GDT has 32 entries
27060 - */
27061 -#define GDT_ENTRIES 32
27062 -#define GDT_SIZE (GDT_ENTRIES * 8)
27063 -
27064 -/* Simple and small GDT entries for booting only */
27065 -
27066 -#define GDT_ENTRY_BOOT_CS 2
27067 -#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
27068 -
27069 -#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
27070 -#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
27071 -
27072 -/* The PnP BIOS entries in the GDT */
27073 -#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
27074 -#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
27075 -#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
27076 -#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
27077 -#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
27078 -
27079 -/* The PnP BIOS selectors */
27080 -#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
27081 -#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
27082 -#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
27083 -#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
27084 -#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
27085 -
27086 -/*
27087 - * The interrupt descriptor table has room for 256 idt's,
27088 - * the global descriptor table is dependent on the number
27089 - * of tasks we can have..
27090 - */
27091 -#define IDT_ENTRIES 256
27092 -
27093 -/* Bottom two bits of selector give the ring privilege level */
27094 -#define SEGMENT_RPL_MASK 0x3
27095 -/* Bit 2 is table indicator (LDT/GDT) */
27096 -#define SEGMENT_TI_MASK 0x4
27097 -
27098 -/* User mode is privilege level 3 */
27099 -#define USER_RPL 0x3
27100 -/* LDT segment has TI set, GDT has it cleared */
27101 -#define SEGMENT_LDT 0x4
27102 -#define SEGMENT_GDT 0x0
27103 -
27104 -#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
27105 -
27106 -/*
27107 - * Matching rules for certain types of segments.
27108 - */
27109 -
27110 -/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
27111 -#define SEGMENT_IS_KERNEL_CODE(x) (((x) & ~3) == GDT_ENTRY_KERNEL_CS * 8 \
27112 - || ((x) & ~3) == (FLAT_KERNEL_CS & ~3))
27113 -
27114 -/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
27115 -#define SEGMENT_IS_FLAT_CODE(x) (((x) & ~0x13) == GDT_ENTRY_KERNEL_CS * 8 \
27116 - || ((x) & ~3) == (FLAT_KERNEL_CS & ~3) \
27117 - || ((x) & ~3) == (FLAT_USER_CS & ~3))
27118 -
27119 -/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
27120 -#define SEGMENT_IS_PNP_CODE(x) (((x) & ~0x0b) == GDT_ENTRY_PNPBIOS_BASE * 8)
27121 -
27122 -#endif
27123 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/smp_32.h 2009-02-16 16:18:36.000000000 +0100
27124 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/smp_32.h 2009-03-16 16:33:40.000000000 +0100
27125 @@ -1,56 +1,51 @@
27126 #ifndef __ASM_SMP_H
27127 #define __ASM_SMP_H
27128
27129 +#ifndef __ASSEMBLY__
27130 +#include <linux/cpumask.h>
27131 +#include <linux/init.h>
27132 +
27133 /*
27134 * We need the APIC definitions automatically as part of 'smp.h'
27135 */
27136 -#ifndef __ASSEMBLY__
27137 -#include <linux/kernel.h>
27138 -#include <linux/threads.h>
27139 -#include <linux/cpumask.h>
27140 +#ifdef CONFIG_X86_LOCAL_APIC
27141 +# include <asm/mpspec.h>
27142 +# include <asm/apic.h>
27143 +# ifdef CONFIG_X86_IO_APIC
27144 +# include <asm/io_apic.h>
27145 +# endif
27146 #endif
27147
27148 -#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__)
27149 -#include <linux/bitops.h>
27150 -#include <asm/mpspec.h>
27151 -#include <asm/apic.h>
27152 -#ifdef CONFIG_X86_IO_APIC
27153 -#include <asm/io_apic.h>
27154 -#endif
27155 -#endif
27156 +#define cpu_callout_map cpu_possible_map
27157 +#define cpu_callin_map cpu_possible_map
27158
27159 -#define BAD_APICID 0xFFu
27160 -#ifdef CONFIG_SMP
27161 -#ifndef __ASSEMBLY__
27162 +extern int smp_num_siblings;
27163 +extern unsigned int num_processors;
27164
27165 -/*
27166 - * Private routines/data
27167 - */
27168 -
27169 extern void smp_alloc_memory(void);
27170 -extern int pic_mode;
27171 -extern int smp_num_siblings;
27172 -DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27173 -DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27174 +extern void lock_ipi_call_lock(void);
27175 +extern void unlock_ipi_call_lock(void);
27176
27177 extern void (*mtrr_hook) (void);
27178 extern void zap_low_mappings (void);
27179 -extern void lock_ipi_call_lock(void);
27180 -extern void unlock_ipi_call_lock(void);
27181
27182 -#define MAX_APICID 256
27183 -extern u8 __initdata x86_cpu_to_apicid_init[];
27184 -extern void *x86_cpu_to_apicid_ptr;
27185 +DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27186 +DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27187 +DECLARE_PER_CPU(u8, cpu_llc_id);
27188 DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
27189
27190 -#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27191 -
27192 #ifdef CONFIG_HOTPLUG_CPU
27193 extern void cpu_exit_clear(void);
27194 extern void cpu_uninit(void);
27195 #endif
27196
27197 +#ifdef CONFIG_SMP
27198 +
27199 #ifndef CONFIG_XEN
27200 +
27201 +/* Globals due to paravirt */
27202 +extern void set_cpu_sibling_map(int cpu);
27203 +
27204 struct smp_ops
27205 {
27206 void (*smp_prepare_boot_cpu)(void);
27207 @@ -104,11 +99,11 @@ void native_smp_prepare_cpus(unsigned in
27208 int native_cpu_up(unsigned int cpunum);
27209 void native_smp_cpus_done(unsigned int max_cpus);
27210
27211 -#define startup_ipi_hook(phys_apicid, start_eip, start_esp) \
27212 -do { } while (0)
27213 -
27214 -#else
27215 +#ifndef CONFIG_PARAVIRT
27216 +#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
27217 +#endif
27218
27219 +#else /* CONFIG_XEN */
27220
27221 void xen_smp_send_stop(void);
27222 void xen_smp_send_reschedule(int cpu);
27223 @@ -120,7 +115,12 @@ int xen_smp_call_function_mask(cpumask_t
27224 #define smp_send_reschedule xen_smp_send_reschedule
27225 #define smp_call_function_mask xen_smp_call_function_mask
27226
27227 -#endif
27228 +extern void prefill_possible_map(void);
27229 +
27230 +#endif /* CONFIG_XEN */
27231 +
27232 +extern int __cpu_disable(void);
27233 +extern void __cpu_die(unsigned int cpu);
27234
27235 /*
27236 * This function is needed by all SMP systems. It must _always_ be valid
27237 @@ -130,64 +130,49 @@ int xen_smp_call_function_mask(cpumask_t
27238 DECLARE_PER_CPU(int, cpu_number);
27239 #define raw_smp_processor_id() (x86_read_percpu(cpu_number))
27240
27241 -extern cpumask_t cpu_possible_map;
27242 -#define cpu_callin_map cpu_possible_map
27243 +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27244 +
27245 +#define safe_smp_processor_id() smp_processor_id()
27246
27247 /* We don't mark CPUs online until __cpu_up(), so we need another measure */
27248 static inline int num_booting_cpus(void)
27249 {
27250 - return cpus_weight(cpu_possible_map);
27251 + return cpus_weight(cpu_callout_map);
27252 }
27253
27254 -#define safe_smp_processor_id() smp_processor_id()
27255 -extern int __cpu_disable(void);
27256 -extern void __cpu_die(unsigned int cpu);
27257 -extern void prefill_possible_map(void);
27258 -extern unsigned int num_processors;
27259 -
27260 -#endif /* !__ASSEMBLY__ */
27261 -
27262 #else /* CONFIG_SMP */
27263
27264 #define safe_smp_processor_id() 0
27265 #define cpu_physical_id(cpu) boot_cpu_physical_apicid
27266
27267 -#define NO_PROC_ID 0xFF /* No processor magic marker */
27268 -
27269 -#endif /* CONFIG_SMP */
27270 -
27271 -#ifndef __ASSEMBLY__
27272 +#endif /* !CONFIG_SMP */
27273
27274 #ifdef CONFIG_X86_LOCAL_APIC
27275
27276 -#ifdef APIC_DEFINITION
27277 +static __inline int logical_smp_processor_id(void)
27278 +{
27279 + /* we don't want to mark this access volatile - bad code generation */
27280 + return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
27281 +}
27282 +
27283 +# ifdef APIC_DEFINITION
27284 extern int hard_smp_processor_id(void);
27285 -#else
27286 -#include <mach_apicdef.h>
27287 +# else
27288 +# include <mach_apicdef.h>
27289 static inline int hard_smp_processor_id(void)
27290 {
27291 /* we don't want to mark this access volatile - bad code generation */
27292 - return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
27293 + return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
27294 }
27295 -#endif /* APIC_DEFINITION */
27296 +# endif /* APIC_DEFINITION */
27297
27298 #else /* CONFIG_X86_LOCAL_APIC */
27299
27300 -#ifndef CONFIG_SMP
27301 -#define hard_smp_processor_id() 0
27302 -#endif
27303 +# ifndef CONFIG_SMP
27304 +# define hard_smp_processor_id() 0
27305 +# endif
27306
27307 #endif /* CONFIG_X86_LOCAL_APIC */
27308
27309 -extern u8 apicid_2_node[];
27310 -
27311 -#ifdef CONFIG_X86_LOCAL_APIC
27312 -static __inline int logical_smp_processor_id(void)
27313 -{
27314 - /* we don't want to mark this access volatile - bad code generation */
27315 - return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
27316 -}
27317 -#endif
27318 -#endif
27319 -
27320 +#endif /* !ASSEMBLY */
27321 #endif
27322 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/smp_64.h 2009-02-16 16:18:36.000000000 +0100
27323 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/smp_64.h 2009-03-16 16:33:40.000000000 +0100
27324 @@ -1,139 +1,103 @@
27325 #ifndef __ASM_SMP_H
27326 #define __ASM_SMP_H
27327
27328 -/*
27329 - * We need the APIC definitions automatically as part of 'smp.h'
27330 - */
27331 -#include <linux/threads.h>
27332 #include <linux/cpumask.h>
27333 -#include <linux/bitops.h>
27334 #include <linux/init.h>
27335 -extern int disable_apic;
27336
27337 #ifdef CONFIG_X86_LOCAL_APIC
27338 -#include <asm/mpspec.h>
27339 +/*
27340 + * We need the APIC definitions automatically as part of 'smp.h'
27341 + */
27342 #include <asm/apic.h>
27343 #ifdef CONFIG_X86_IO_APIC
27344 #include <asm/io_apic.h>
27345 #endif
27346 -#include <asm/thread_info.h>
27347 +#include <asm/mpspec.h>
27348 #endif
27349 -
27350 -#ifdef CONFIG_SMP
27351 -
27352 #include <asm/pda.h>
27353 +#include <asm/thread_info.h>
27354
27355 -struct pt_regs;
27356 -
27357 -extern cpumask_t cpu_present_mask;
27358 -extern cpumask_t cpu_possible_map;
27359 -extern cpumask_t cpu_online_map;
27360 extern cpumask_t cpu_initialized;
27361
27362 -/*
27363 - * Private routines/data
27364 - */
27365 -
27366 +extern int smp_num_siblings;
27367 +extern unsigned int num_processors;
27368 +
27369 extern void smp_alloc_memory(void);
27370 -extern volatile unsigned long smp_invalidate_needed;
27371 extern void lock_ipi_call_lock(void);
27372 extern void unlock_ipi_call_lock(void);
27373 -extern int smp_num_siblings;
27374 -extern void smp_send_reschedule(int cpu);
27375 +
27376 extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
27377 void *info, int wait);
27378
27379 -/*
27380 - * cpu_sibling_map and cpu_core_map now live
27381 - * in the per cpu area
27382 - *
27383 - * extern cpumask_t cpu_sibling_map[NR_CPUS];
27384 - * extern cpumask_t cpu_core_map[NR_CPUS];
27385 - */
27386 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27387 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27388 -DECLARE_PER_CPU(u8, cpu_llc_id);
27389 -
27390 -#define SMP_TRAMPOLINE_BASE 0x6000
27391 +DECLARE_PER_CPU(u16, cpu_llc_id);
27392 +DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
27393 +DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
27394
27395 -/*
27396 - * On x86 all CPUs are mapped 1:1 to the APIC space.
27397 - * This simplifies scheduling and IPI sending and
27398 - * compresses data structures.
27399 - */
27400 -
27401 -static inline int num_booting_cpus(void)
27402 +#ifdef CONFIG_X86_LOCAL_APIC
27403 +static inline int cpu_present_to_apicid(int mps_cpu)
27404 {
27405 - return cpus_weight(cpu_possible_map);
27406 + if (cpu_present(mps_cpu))
27407 + return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
27408 + else
27409 + return BAD_APICID;
27410 }
27411 +#endif
27412
27413 -#define raw_smp_processor_id() read_pda(cpunumber)
27414 +#ifdef CONFIG_SMP
27415 +
27416 +#define SMP_TRAMPOLINE_BASE 0x6000
27417
27418 extern int __cpu_disable(void);
27419 extern void __cpu_die(unsigned int cpu);
27420 extern void prefill_possible_map(void);
27421 -extern unsigned num_processors;
27422 extern unsigned __cpuinitdata disabled_cpus;
27423
27424 -#define NO_PROC_ID 0xFF /* No processor magic marker */
27425 -
27426 -#endif /* CONFIG_SMP */
27427 +#define raw_smp_processor_id() read_pda(cpunumber)
27428 +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27429
27430 -#define safe_smp_processor_id() smp_processor_id()
27431 -
27432 -#ifdef CONFIG_X86_LOCAL_APIC
27433 -static inline int hard_smp_processor_id(void)
27434 -{
27435 - /* we don't want to mark this access volatile - bad code generation */
27436 - return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
27437 -}
27438 -#endif
27439 +#define stack_smp_processor_id() \
27440 + ({ \
27441 + struct thread_info *ti; \
27442 + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
27443 + ti->cpu; \
27444 +})
27445
27446 /*
27447 - * Some lowlevel functions might want to know about
27448 - * the real APIC ID <-> CPU # mapping.
27449 + * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies
27450 + * scheduling and IPI sending and compresses data structures.
27451 */
27452 -extern u8 __initdata x86_cpu_to_apicid_init[];
27453 -extern void *x86_cpu_to_apicid_ptr;
27454 -DECLARE_PER_CPU(u8, x86_cpu_to_apicid); /* physical ID */
27455 -extern u8 bios_cpu_apicid[];
27456 -
27457 -#ifdef CONFIG_X86_LOCAL_APIC
27458 -static inline int cpu_present_to_apicid(int mps_cpu)
27459 +static inline int num_booting_cpus(void)
27460 {
27461 - if (mps_cpu < NR_CPUS)
27462 - return (int)bios_cpu_apicid[mps_cpu];
27463 - else
27464 - return BAD_APICID;
27465 + return cpus_weight(cpu_possible_map);
27466 }
27467 -#endif
27468
27469 -#ifndef CONFIG_SMP
27470 +extern void smp_send_reschedule(int cpu);
27471 +
27472 +#else /* CONFIG_SMP */
27473 +
27474 +extern unsigned int boot_cpu_id;
27475 +#define cpu_physical_id(cpu) boot_cpu_id
27476 #define stack_smp_processor_id() 0
27477 -#define cpu_logical_map(x) (x)
27478 -#else
27479 -#include <asm/thread_info.h>
27480 -#define stack_smp_processor_id() \
27481 -({ \
27482 - struct thread_info *ti; \
27483 - __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
27484 - ti->cpu; \
27485 -})
27486 -#endif
27487 +
27488 +#endif /* !CONFIG_SMP */
27489 +
27490 +#define safe_smp_processor_id() smp_processor_id()
27491
27492 #ifdef CONFIG_X86_LOCAL_APIC
27493 static __inline int logical_smp_processor_id(void)
27494 {
27495 /* we don't want to mark this access volatile - bad code generation */
27496 - return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
27497 + return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
27498 +}
27499 +
27500 +static inline int hard_smp_processor_id(void)
27501 +{
27502 + /* we don't want to mark this access volatile - bad code generation */
27503 + return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
27504 }
27505 #endif
27506
27507 -#ifdef CONFIG_SMP
27508 -#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27509 -#else
27510 -extern unsigned int boot_cpu_id;
27511 -#define cpu_physical_id(cpu) boot_cpu_id
27512 -#endif /* !CONFIG_SMP */
27513 #endif
27514
27515 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
27516 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/spinlock.h 2009-03-16 16:33:40.000000000 +0100
27517 @@ -0,0 +1,333 @@
27518 +#ifndef _X86_SPINLOCK_H_
27519 +#define _X86_SPINLOCK_H_
27520 +
27521 +#include <asm/atomic.h>
27522 +#include <asm/rwlock.h>
27523 +#include <asm/page.h>
27524 +#include <asm/processor.h>
27525 +#include <linux/compiler.h>
27526 +
27527 +/*
27528 + * Your basic SMP spinlocks, allowing only a single CPU anywhere
27529 + *
27530 + * Simple spin lock operations. There are two variants, one clears IRQ's
27531 + * on the local processor, one does not.
27532 + *
27533 + * These are fair FIFO ticket locks, which are currently limited to 256
27534 + * CPUs.
27535 + *
27536 + * (the type definitions are in asm/spinlock_types.h)
27537 + */
27538 +
27539 +#ifdef CONFIG_X86_32
27540 +# define LOCK_PTR_REG "a"
27541 +# define REG_PTR_MODE "k"
27542 +#else
27543 +# define LOCK_PTR_REG "D"
27544 +# define REG_PTR_MODE "q"
27545 +#endif
27546 +
27547 +#if defined(CONFIG_X86_32) && \
27548 + (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
27549 +/*
27550 + * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
27551 + * (PPro errata 66, 92)
27552 + */
27553 +# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
27554 +#else
27555 +# define UNLOCK_LOCK_PREFIX
27556 +#endif
27557 +
27558 +int xen_spinlock_init(unsigned int cpu);
27559 +void xen_spinlock_cleanup(unsigned int cpu);
27560 +extern int xen_spin_wait(raw_spinlock_t *, unsigned int token);
27561 +extern int xen_spin_wait_flags(raw_spinlock_t *, unsigned int *token,
27562 + unsigned int flags);
27563 +extern unsigned int xen_spin_adjust(raw_spinlock_t *, unsigned int token);
27564 +extern void xen_spin_kick(raw_spinlock_t *, unsigned int token);
27565 +
27566 +/*
27567 + * Ticket locks are conceptually two parts, one indicating the current head of
27568 + * the queue, and the other indicating the current tail. The lock is acquired
27569 + * by atomically noting the tail and incrementing it by one (thus adding
27570 + * ourself to the queue and noting our position), then waiting until the head
27571 + * becomes equal to the the initial value of the tail.
27572 + *
27573 + * We use an xadd covering *both* parts of the lock, to increment the tail and
27574 + * also load the position of the head, which takes care of memory ordering
27575 + * issues and should be optimal for the uncontended case. Note the tail must be
27576 + * in the high part, because a wide xadd increment of the low part would carry
27577 + * up and contaminate the high part.
27578 + *
27579 + * With fewer than 2^8 possible CPUs, we can use x86's partial registers to
27580 + * save some instructions and make the code more elegant. There really isn't
27581 + * much between them in performance though, especially as locks are out of line.
27582 + */
27583 +#if (NR_CPUS < 256)
27584 +#define TICKET_SHIFT 8
27585 +#define __raw_spin_lock_preamble \
27586 + asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
27587 + "cmpb %h0, %b0\n\t" \
27588 + "sete %1" \
27589 + : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
27590 + : "0" (0x0100) \
27591 + : "memory", "cc")
27592 +#define __raw_spin_lock_body \
27593 + asm("1:\t" \
27594 + "cmpb %h0, %b0\n\t" \
27595 + "je 2f\n\t" \
27596 + "decl %1\n\t" \
27597 + "jz 2f\n\t" \
27598 + "rep ; nop\n\t" \
27599 + "movb %2, %b0\n\t" \
27600 + /* don't need lfence here, because loads are in-order */ \
27601 + "jmp 1b\n" \
27602 + "2:" \
27603 + : "+Q" (token), "+g" (count) \
27604 + : "m" (lock->slock) \
27605 + : "memory", "cc")
27606 +
27607 +
27608 +static inline int __raw_spin_trylock(raw_spinlock_t *lock)
27609 +{
27610 + int tmp, new;
27611 +
27612 + asm("movzwl %2, %0\n\t"
27613 + "cmpb %h0, %b0\n\t"
27614 + "leal 0x100(%" REG_PTR_MODE "0), %1\n\t"
27615 + "jne 1f\n\t"
27616 + LOCK_PREFIX "cmpxchgw %w1, %2\n\t"
27617 + "1:\t"
27618 + "sete %b1\n\t"
27619 + "movzbl %b1, %0\n\t"
27620 + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
27621 + :
27622 + : "memory", "cc");
27623 +
27624 + return tmp;
27625 +}
27626 +
27627 +static inline void __raw_spin_unlock(raw_spinlock_t *lock)
27628 +{
27629 + unsigned int token;
27630 + unsigned char kick;
27631 +
27632 + asm(UNLOCK_LOCK_PREFIX "incb %2\n\t"
27633 + "movzwl %2, %0\n\t"
27634 + "cmpb %h0, %b0\n\t"
27635 + "setne %1"
27636 + : "=&Q" (token), "=qm" (kick), "+m" (lock->slock)
27637 + :
27638 + : "memory", "cc");
27639 + if (kick)
27640 + xen_spin_kick(lock, token);
27641 +}
27642 +#else
27643 +#define TICKET_SHIFT 16
27644 +#define __raw_spin_lock_preamble \
27645 + do { \
27646 + unsigned int tmp; \
27647 + asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
27648 + "shldl $16, %0, %3\n\t" \
27649 + "cmpw %w3, %w0\n\t" \
27650 + "sete %1"
27651 + : "=&r" (token), "=qm" (free), "+m" (lock->slock), \
27652 + "=&g" (tmp) \
27653 + : "0" (0x00010000) \
27654 + : "memory", "cc"); \
27655 + } while (0)
27656 +#define __raw_spin_lock_body \
27657 + do { \
27658 + unsigned int tmp; \
27659 + asm("shldl $16, %0, %2\n" \
27660 + "1:\t" \
27661 + "cmpw %w2, %w0\n\t" \
27662 + "je 2f\n\t" \
27663 + "decl %1\n\t" \
27664 + "jz 2f\n\t" \
27665 + "rep ; nop\n\t" \
27666 + "movw %3, %w0\n\t" \
27667 + /* don't need lfence here, because loads are in-order */ \
27668 + "jmp 1b\n" \
27669 + "2:" \
27670 + : "+r" (token), "+g" (count), "=&g" (tmp) \
27671 + : "m" (lock->slock) \
27672 + : "memory", "cc"); \
27673 + } while (0)
27674 +
27675 +static inline int __raw_spin_trylock(raw_spinlock_t *lock)
27676 +{
27677 + int tmp;
27678 + int new;
27679 +
27680 + asm("movl %2, %0\n\t"
27681 + "movl %0, %1\n\t"
27682 + "roll $16, %0\n\t"
27683 + "cmpl %0, %1\n\t"
27684 + "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t"
27685 + "jne 1f\n\t"
27686 + LOCK_PREFIX "cmpxchgl %1, %2\n"
27687 + "1:\t"
27688 + "sete %b1\n\t"
27689 + "movzbl %b1, %0\n\t"
27690 + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
27691 + :
27692 + : "memory", "cc");
27693 +
27694 + return tmp;
27695 +}
27696 +
27697 +static inline void __raw_spin_unlock(raw_spinlock_t *lock)
27698 +{
27699 + unsigned int token, tmp;
27700 + bool kick;
27701 +
27702 + asm(UNLOCK_LOCK_PREFIX "incw %2\n\t"
27703 + "movl %2, %0\n\t"
27704 + "shldl $16, %0, %3\n\t"
27705 + "cmpw %w3, %w0\n\t"
27706 + "setne %1"
27707 + : "=&r" (token), "=qm" (kick), "+m" (lock->slock), "=&r" (tmp)
27708 + :
27709 + : "memory", "cc");
27710 + if (kick)
27711 + xen_spin_kick(lock, token);
27712 +}
27713 +#endif
27714 +
27715 +static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
27716 +{
27717 + int tmp = *(volatile signed int *)(&(lock)->slock);
27718 +
27719 + return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
27720 +}
27721 +
27722 +static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
27723 +{
27724 + int tmp = *(volatile signed int *)(&(lock)->slock);
27725 +
27726 + return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
27727 +}
27728 +
27729 +static inline void __raw_spin_lock(raw_spinlock_t *lock)
27730 +{
27731 + unsigned int token, count;
27732 + bool free;
27733 +
27734 + __raw_spin_lock_preamble;
27735 + if (unlikely(!free))
27736 + token = xen_spin_adjust(lock, token);
27737 + do {
27738 + count = 1 << 10;
27739 + __raw_spin_lock_body;
27740 + } while (unlikely(!count) && !xen_spin_wait(lock, token));
27741 +}
27742 +
27743 +static inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
27744 + unsigned long flags)
27745 +{
27746 + unsigned int token, count;
27747 + bool free;
27748 +
27749 + __raw_spin_lock_preamble;
27750 + if (unlikely(!free))
27751 + token = xen_spin_adjust(lock, token);
27752 + do {
27753 + count = 1 << 10;
27754 + __raw_spin_lock_body;
27755 + } while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags));
27756 +}
27757 +
27758 +static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
27759 +{
27760 + while (__raw_spin_is_locked(lock))
27761 + cpu_relax();
27762 +}
27763 +
27764 +/*
27765 + * Read-write spinlocks, allowing multiple readers
27766 + * but only one writer.
27767 + *
27768 + * NOTE! it is quite common to have readers in interrupts
27769 + * but no interrupt writers. For those circumstances we
27770 + * can "mix" irq-safe locks - any writer needs to get a
27771 + * irq-safe write-lock, but readers can get non-irqsafe
27772 + * read-locks.
27773 + *
27774 + * On x86, we implement read-write locks as a 32-bit counter
27775 + * with the high bit (sign) being the "contended" bit.
27776 + */
27777 +
27778 +/**
27779 + * read_can_lock - would read_trylock() succeed?
27780 + * @lock: the rwlock in question.
27781 + */
27782 +static inline int __raw_read_can_lock(raw_rwlock_t *lock)
27783 +{
27784 + return (int)(lock)->lock > 0;
27785 +}
27786 +
27787 +/**
27788 + * write_can_lock - would write_trylock() succeed?
27789 + * @lock: the rwlock in question.
27790 + */
27791 +static inline int __raw_write_can_lock(raw_rwlock_t *lock)
27792 +{
27793 + return (lock)->lock == RW_LOCK_BIAS;
27794 +}
27795 +
27796 +static inline void __raw_read_lock(raw_rwlock_t *rw)
27797 +{
27798 + asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
27799 + "jns 1f\n"
27800 + "call __read_lock_failed\n\t"
27801 + "1:\n"
27802 + ::LOCK_PTR_REG (rw) : "memory");
27803 +}
27804 +
27805 +static inline void __raw_write_lock(raw_rwlock_t *rw)
27806 +{
27807 + asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
27808 + "jz 1f\n"
27809 + "call __write_lock_failed\n\t"
27810 + "1:\n"
27811 + ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
27812 +}
27813 +
27814 +static inline int __raw_read_trylock(raw_rwlock_t *lock)
27815 +{
27816 + atomic_t *count = (atomic_t *)lock;
27817 +
27818 + atomic_dec(count);
27819 + if (atomic_read(count) >= 0)
27820 + return 1;
27821 + atomic_inc(count);
27822 + return 0;
27823 +}
27824 +
27825 +static inline int __raw_write_trylock(raw_rwlock_t *lock)
27826 +{
27827 + atomic_t *count = (atomic_t *)lock;
27828 +
27829 + if (atomic_sub_and_test(RW_LOCK_BIAS, count))
27830 + return 1;
27831 + atomic_add(RW_LOCK_BIAS, count);
27832 + return 0;
27833 +}
27834 +
27835 +static inline void __raw_read_unlock(raw_rwlock_t *rw)
27836 +{
27837 + asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
27838 +}
27839 +
27840 +static inline void __raw_write_unlock(raw_rwlock_t *rw)
27841 +{
27842 + asm volatile(LOCK_PREFIX "addl %1, %0"
27843 + : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
27844 +}
27845 +
27846 +#define _raw_spin_relax(lock) cpu_relax()
27847 +#define _raw_read_relax(lock) cpu_relax()
27848 +#define _raw_write_relax(lock) cpu_relax()
27849 +
27850 +#endif
27851 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/system.h 2009-02-16 16:18:36.000000000 +0100
27852 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/system.h 2009-03-16 16:33:40.000000000 +0100
27853 @@ -1,5 +1,393 @@
27854 +#ifndef _ASM_X86_SYSTEM_H_
27855 +#define _ASM_X86_SYSTEM_H_
27856 +
27857 +#include <asm/asm.h>
27858 +#include <asm/segment.h>
27859 +#include <asm/cpufeature.h>
27860 +#include <asm/cmpxchg.h>
27861 +#include <asm/nops.h>
27862 +#include <asm/hypervisor.h>
27863 +
27864 +#include <linux/kernel.h>
27865 +#include <linux/irqflags.h>
27866 +
27867 +/* entries in ARCH_DLINFO: */
27868 +#ifdef CONFIG_IA32_EMULATION
27869 +# define AT_VECTOR_SIZE_ARCH 2
27870 +#else
27871 +# define AT_VECTOR_SIZE_ARCH 1
27872 +#endif
27873 +
27874 +#ifdef CONFIG_X86_32
27875 +
27876 +struct task_struct; /* one of the stranger aspects of C forward declarations */
27877 +struct task_struct *__switch_to(struct task_struct *prev,
27878 + struct task_struct *next);
27879 +
27880 +/*
27881 + * Saving eflags is important. It switches not only IOPL between tasks,
27882 + * it also protects other tasks from NT leaking through sysenter etc.
27883 + */
27884 +#define switch_to(prev, next, last) do { \
27885 + unsigned long esi, edi; \
27886 + asm volatile("pushfl\n\t" /* Save flags */ \
27887 + "pushl %%ebp\n\t" \
27888 + "movl %%esp,%0\n\t" /* save ESP */ \
27889 + "movl %5,%%esp\n\t" /* restore ESP */ \
27890 + "movl $1f,%1\n\t" /* save EIP */ \
27891 + "pushl %6\n\t" /* restore EIP */ \
27892 + "jmp __switch_to\n" \
27893 + "1:\t" \
27894 + "popl %%ebp\n\t" \
27895 + "popfl" \
27896 + :"=m" (prev->thread.sp), "=m" (prev->thread.ip), \
27897 + "=a" (last), "=S" (esi), "=D" (edi) \
27898 + :"m" (next->thread.sp), "m" (next->thread.ip), \
27899 + "2" (prev), "d" (next)); \
27900 +} while (0)
27901 +
27902 +/*
27903 + * disable hlt during certain critical i/o operations
27904 + */
27905 +#define HAVE_DISABLE_HLT
27906 +#else
27907 +#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
27908 +#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
27909 +
27910 +/* frame pointer must be last for get_wchan */
27911 +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
27912 +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
27913 +
27914 +#define __EXTRA_CLOBBER \
27915 + , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
27916 + "r12", "r13", "r14", "r15"
27917 +
27918 +/* Save restore flags to clear handle leaking NT */
27919 +#define switch_to(prev, next, last) \
27920 + asm volatile(SAVE_CONTEXT \
27921 + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
27922 + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
27923 + "call __switch_to\n\t" \
27924 + ".globl thread_return\n" \
27925 + "thread_return:\n\t" \
27926 + "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
27927 + "movq %P[thread_info](%%rsi),%%r8\n\t" \
27928 + LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
27929 + "movq %%rax,%%rdi\n\t" \
27930 + "jc ret_from_fork\n\t" \
27931 + RESTORE_CONTEXT \
27932 + : "=a" (last) \
27933 + : [next] "S" (next), [prev] "D" (prev), \
27934 + [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
27935 + [ti_flags] "i" (offsetof(struct thread_info, flags)), \
27936 + [tif_fork] "i" (TIF_FORK), \
27937 + [thread_info] "i" (offsetof(struct task_struct, stack)), \
27938 + [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
27939 + : "memory", "cc" __EXTRA_CLOBBER)
27940 +#endif
27941 +
27942 +#ifdef __KERNEL__
27943 +#define _set_base(addr, base) do { unsigned long __pr; \
27944 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
27945 + "rorl $16,%%edx\n\t" \
27946 + "movb %%dl,%2\n\t" \
27947 + "movb %%dh,%3" \
27948 + :"=&d" (__pr) \
27949 + :"m" (*((addr)+2)), \
27950 + "m" (*((addr)+4)), \
27951 + "m" (*((addr)+7)), \
27952 + "0" (base) \
27953 + ); } while (0)
27954 +
27955 +#define _set_limit(addr, limit) do { unsigned long __lr; \
27956 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
27957 + "rorl $16,%%edx\n\t" \
27958 + "movb %2,%%dh\n\t" \
27959 + "andb $0xf0,%%dh\n\t" \
27960 + "orb %%dh,%%dl\n\t" \
27961 + "movb %%dl,%2" \
27962 + :"=&d" (__lr) \
27963 + :"m" (*(addr)), \
27964 + "m" (*((addr)+6)), \
27965 + "0" (limit) \
27966 + ); } while (0)
27967 +
27968 +#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
27969 +#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
27970 +
27971 +extern void load_gs_index(unsigned);
27972 +
27973 +/*
27974 + * Load a segment. Fall back on loading the zero
27975 + * segment if something goes wrong..
27976 + */
27977 +#define loadsegment(seg, value) \
27978 + asm volatile("\n" \
27979 + "1:\t" \
27980 + "movl %k0,%%" #seg "\n" \
27981 + "2:\n" \
27982 + ".section .fixup,\"ax\"\n" \
27983 + "3:\t" \
27984 + "movl %k1, %%" #seg "\n\t" \
27985 + "jmp 2b\n" \
27986 + ".previous\n" \
27987 + _ASM_EXTABLE(1b,3b) \
27988 + : :"r" (value), "r" (0))
27989 +
27990 +
27991 +/*
27992 + * Save a segment register away
27993 + */
27994 +#define savesegment(seg, value) \
27995 + asm volatile("mov %%" #seg ",%0":"=rm" (value))
27996 +
27997 +static inline unsigned long get_limit(unsigned long segment)
27998 +{
27999 + unsigned long __limit;
28000 + __asm__("lsll %1,%0"
28001 + :"=r" (__limit):"r" (segment));
28002 + return __limit+1;
28003 +}
28004 +
28005 +static inline void xen_clts(void)
28006 +{
28007 + HYPERVISOR_fpu_taskswitch(0);
28008 +}
28009 +
28010 +static inline void xen_stts(void)
28011 +{
28012 + HYPERVISOR_fpu_taskswitch(1);
28013 +}
28014 +
28015 +/*
28016 + * Volatile isn't enough to prevent the compiler from reordering the
28017 + * read/write functions for the control registers and messing everything up.
28018 + * A memory clobber would solve the problem, but would prevent reordering of
28019 + * all loads stores around it, which can hurt performance. Solution is to
28020 + * use a variable and mimic reads and writes to it to enforce serialization
28021 + */
28022 +static unsigned long __force_order;
28023 +
28024 +static inline unsigned long xen_read_cr0(void)
28025 +{
28026 + unsigned long val;
28027 + asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order));
28028 + return val;
28029 +}
28030 +
28031 +static inline void xen_write_cr0(unsigned long val)
28032 +{
28033 + asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order));
28034 +}
28035 +
28036 +#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
28037 +#define xen_write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val)))
28038 +
28039 +static inline unsigned long xen_read_cr3(void)
28040 +{
28041 + unsigned long val;
28042 + asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order));
28043 +#ifdef CONFIG_X86_32
28044 + return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
28045 +#else
28046 + return machine_to_phys(val);
28047 +#endif
28048 +}
28049 +
28050 +static inline void xen_write_cr3(unsigned long val)
28051 +{
28052 +#ifdef CONFIG_X86_32
28053 + val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
28054 +#else
28055 + val = phys_to_machine(val);
28056 +#endif
28057 + asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order));
28058 +}
28059 +
28060 +static inline unsigned long xen_read_cr4(void)
28061 +{
28062 + unsigned long val;
28063 + asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order));
28064 + return val;
28065 +}
28066 +
28067 +#define xen_read_cr4_safe() xen_read_cr4()
28068 +
28069 +static inline void xen_write_cr4(unsigned long val)
28070 +{
28071 + asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order));
28072 +}
28073 +
28074 +#ifdef CONFIG_X86_64
28075 +static inline unsigned long xen_read_cr8(void)
28076 +{
28077 + return 0;
28078 +}
28079 +
28080 +static inline void xen_write_cr8(unsigned long val)
28081 +{
28082 + BUG_ON(val);
28083 +}
28084 +#endif
28085 +
28086 +static inline void xen_wbinvd(void)
28087 +{
28088 + asm volatile("wbinvd": : :"memory");
28089 +}
28090 +#define read_cr0() (xen_read_cr0())
28091 +#define write_cr0(x) (xen_write_cr0(x))
28092 +#define read_cr2() (xen_read_cr2())
28093 +#define write_cr2(x) (xen_write_cr2(x))
28094 +#define read_cr3() (xen_read_cr3())
28095 +#define write_cr3(x) (xen_write_cr3(x))
28096 +#define read_cr4() (xen_read_cr4())
28097 +#define read_cr4_safe() (xen_read_cr4_safe())
28098 +#define write_cr4(x) (xen_write_cr4(x))
28099 +#define wbinvd() (xen_wbinvd())
28100 +#ifdef CONFIG_X86_64
28101 +#define read_cr8() (xen_read_cr8())
28102 +#define write_cr8(x) (xen_write_cr8(x))
28103 +#endif
28104 +
28105 +/* Clear the 'TS' bit */
28106 +#define clts() (xen_clts())
28107 +#define stts() (xen_stts())
28108 +
28109 +#endif /* __KERNEL__ */
28110 +
28111 +static inline void clflush(volatile void *__p)
28112 +{
28113 + asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
28114 +}
28115 +
28116 +#define nop() __asm__ __volatile__ ("nop")
28117 +
28118 +void disable_hlt(void);
28119 +void enable_hlt(void);
28120 +
28121 +extern int es7000_plat;
28122 +void cpu_idle_wait(void);
28123 +
28124 +extern unsigned long arch_align_stack(unsigned long sp);
28125 +extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28126 +
28127 +void default_idle(void);
28128 +
28129 +/*
28130 + * Force strict CPU ordering.
28131 + * And yes, this is required on UP too when we're talking
28132 + * to devices.
28133 + */
28134 #ifdef CONFIG_X86_32
28135 -# include "system_32.h"
28136 +/*
28137 + * For now, "wmb()" doesn't actually do anything, as all
28138 + * Intel CPU's follow what Intel calls a *Processor Order*,
28139 + * in which all writes are seen in the program order even
28140 + * outside the CPU.
28141 + *
28142 + * I expect future Intel CPU's to have a weaker ordering,
28143 + * but I'd also expect them to finally get their act together
28144 + * and add some real memory barriers if so.
28145 + *
28146 + * Some non intel clones support out of order store. wmb() ceases to be a
28147 + * nop for these.
28148 + */
28149 +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
28150 +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
28151 +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
28152 +#else
28153 +#define mb() asm volatile("mfence":::"memory")
28154 +#define rmb() asm volatile("lfence":::"memory")
28155 +#define wmb() asm volatile("sfence" ::: "memory")
28156 +#endif
28157 +
28158 +/**
28159 + * read_barrier_depends - Flush all pending reads that subsequents reads
28160 + * depend on.
28161 + *
28162 + * No data-dependent reads from memory-like regions are ever reordered
28163 + * over this barrier. All reads preceding this primitive are guaranteed
28164 + * to access memory (but not necessarily other CPUs' caches) before any
28165 + * reads following this primitive that depend on the data return by
28166 + * any of the preceding reads. This primitive is much lighter weight than
28167 + * rmb() on most CPUs, and is never heavier weight than is
28168 + * rmb().
28169 + *
28170 + * These ordering constraints are respected by both the local CPU
28171 + * and the compiler.
28172 + *
28173 + * Ordering is not guaranteed by anything other than these primitives,
28174 + * not even by data dependencies. See the documentation for
28175 + * memory_barrier() for examples and URLs to more information.
28176 + *
28177 + * For example, the following code would force ordering (the initial
28178 + * value of "a" is zero, "b" is one, and "p" is "&a"):
28179 + *
28180 + * <programlisting>
28181 + * CPU 0 CPU 1
28182 + *
28183 + * b = 2;
28184 + * memory_barrier();
28185 + * p = &b; q = p;
28186 + * read_barrier_depends();
28187 + * d = *q;
28188 + * </programlisting>
28189 + *
28190 + * because the read of "*q" depends on the read of "p" and these
28191 + * two reads are separated by a read_barrier_depends(). However,
28192 + * the following code, with the same initial values for "a" and "b":
28193 + *
28194 + * <programlisting>
28195 + * CPU 0 CPU 1
28196 + *
28197 + * a = 2;
28198 + * memory_barrier();
28199 + * b = 3; y = b;
28200 + * read_barrier_depends();
28201 + * x = a;
28202 + * </programlisting>
28203 + *
28204 + * does not enforce ordering, since there is no data dependency between
28205 + * the read of "a" and the read of "b". Therefore, on some CPUs, such
28206 + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
28207 + * in cases like this where there are no data dependencies.
28208 + **/
28209 +
28210 +#define read_barrier_depends() do { } while (0)
28211 +
28212 +#ifdef CONFIG_SMP
28213 +#define smp_mb() mb()
28214 +#ifdef CONFIG_X86_PPRO_FENCE
28215 +# define smp_rmb() rmb()
28216 #else
28217 -# include "system_64.h"
28218 +# define smp_rmb() barrier()
28219 +#endif
28220 +#ifdef CONFIG_X86_OOSTORE
28221 +# define smp_wmb() wmb()
28222 +#else
28223 +# define smp_wmb() barrier()
28224 +#endif
28225 +#define smp_read_barrier_depends() read_barrier_depends()
28226 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
28227 +#else
28228 +#define smp_mb() barrier()
28229 +#define smp_rmb() barrier()
28230 +#define smp_wmb() barrier()
28231 +#define smp_read_barrier_depends() do { } while (0)
28232 +#define set_mb(var, value) do { var = value; barrier(); } while (0)
28233 +#endif
28234 +
28235 +/*
28236 + * Stop RDTSC speculation. This is needed when you need to use RDTSC
28237 + * (or get_cycles or vread that possibly accesses the TSC) in a defined
28238 + * code region.
28239 + *
28240 + * (Could use an alternative three way for this if there was one.)
28241 + */
28242 +static inline void rdtsc_barrier(void)
28243 +{
28244 + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
28245 + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
28246 +}
28247 +
28248 #endif
28249 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/system_32.h 2009-02-16 16:18:36.000000000 +0100
28250 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
28251 @@ -1,312 +0,0 @@
28252 -#ifndef __ASM_SYSTEM_H
28253 -#define __ASM_SYSTEM_H
28254 -
28255 -#include <linux/kernel.h>
28256 -#include <asm/segment.h>
28257 -#include <asm/cpufeature.h>
28258 -#include <asm/cmpxchg.h>
28259 -#include <asm/synch_bitops.h>
28260 -#include <asm/hypervisor.h>
28261 -
28262 -#ifdef __KERNEL__
28263 -#define AT_VECTOR_SIZE_ARCH 2 /* entries in ARCH_DLINFO */
28264 -
28265 -struct task_struct; /* one of the stranger aspects of C forward declarations.. */
28266 -extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
28267 -
28268 -/*
28269 - * Saving eflags is important. It switches not only IOPL between tasks,
28270 - * it also protects other tasks from NT leaking through sysenter etc.
28271 - */
28272 -#define switch_to(prev,next,last) do { \
28273 - unsigned long esi,edi; \
28274 - asm volatile("pushfl\n\t" /* Save flags */ \
28275 - "pushl %%ebp\n\t" \
28276 - "movl %%esp,%0\n\t" /* save ESP */ \
28277 - "movl %5,%%esp\n\t" /* restore ESP */ \
28278 - "movl $1f,%1\n\t" /* save EIP */ \
28279 - "pushl %6\n\t" /* restore EIP */ \
28280 - "jmp __switch_to\n" \
28281 - "1:\t" \
28282 - "popl %%ebp\n\t" \
28283 - "popfl" \
28284 - :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \
28285 - "=a" (last),"=S" (esi),"=D" (edi) \
28286 - :"m" (next->thread.esp),"m" (next->thread.eip), \
28287 - "2" (prev), "d" (next)); \
28288 -} while (0)
28289 -
28290 -#define _set_base(addr,base) do { unsigned long __pr; \
28291 -__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28292 - "rorl $16,%%edx\n\t" \
28293 - "movb %%dl,%2\n\t" \
28294 - "movb %%dh,%3" \
28295 - :"=&d" (__pr) \
28296 - :"m" (*((addr)+2)), \
28297 - "m" (*((addr)+4)), \
28298 - "m" (*((addr)+7)), \
28299 - "0" (base) \
28300 - ); } while(0)
28301 -
28302 -#define _set_limit(addr,limit) do { unsigned long __lr; \
28303 -__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28304 - "rorl $16,%%edx\n\t" \
28305 - "movb %2,%%dh\n\t" \
28306 - "andb $0xf0,%%dh\n\t" \
28307 - "orb %%dh,%%dl\n\t" \
28308 - "movb %%dl,%2" \
28309 - :"=&d" (__lr) \
28310 - :"m" (*(addr)), \
28311 - "m" (*((addr)+6)), \
28312 - "0" (limit) \
28313 - ); } while(0)
28314 -
28315 -#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
28316 -#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
28317 -
28318 -/*
28319 - * Load a segment. Fall back on loading the zero
28320 - * segment if something goes wrong..
28321 - */
28322 -#define loadsegment(seg,value) \
28323 - asm volatile("\n" \
28324 - "1:\t" \
28325 - "mov %0,%%" #seg "\n" \
28326 - "2:\n" \
28327 - ".section .fixup,\"ax\"\n" \
28328 - "3:\t" \
28329 - "pushl $0\n\t" \
28330 - "popl %%" #seg "\n\t" \
28331 - "jmp 2b\n" \
28332 - ".previous\n" \
28333 - ".section __ex_table,\"a\"\n\t" \
28334 - ".align 4\n\t" \
28335 - ".long 1b,3b\n" \
28336 - ".previous" \
28337 - : :"rm" (value))
28338 -
28339 -/*
28340 - * Save a segment register away
28341 - */
28342 -#define savesegment(seg, value) \
28343 - asm volatile("mov %%" #seg ",%0":"=rm" (value))
28344 -
28345 -static inline void xen_clts(void)
28346 -{
28347 - HYPERVISOR_fpu_taskswitch(0);
28348 -}
28349 -
28350 -static inline unsigned long xen_read_cr0(void)
28351 -{
28352 - unsigned long val;
28353 - asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
28354 - return val;
28355 -}
28356 -
28357 -static inline void xen_write_cr0(unsigned long val)
28358 -{
28359 - asm volatile("movl %0,%%cr0": :"r" (val));
28360 -}
28361 -
28362 -#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
28363 -
28364 -static inline void xen_write_cr2(unsigned long val)
28365 -{
28366 - asm volatile("movl %0,%%cr2": :"r" (val));
28367 -}
28368 -
28369 -static inline unsigned long xen_read_cr3(void)
28370 -{
28371 - unsigned long val;
28372 - asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
28373 - return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
28374 -}
28375 -
28376 -static inline void xen_write_cr3(unsigned long val)
28377 -{
28378 - val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
28379 - asm volatile("movl %0,%%cr3": :"r" (val));
28380 -}
28381 -
28382 -static inline unsigned long xen_read_cr4(void)
28383 -{
28384 - unsigned long val;
28385 - asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
28386 - return val;
28387 -}
28388 -
28389 -static inline unsigned long xen_read_cr4_safe(void)
28390 -{
28391 - unsigned long val;
28392 - /* This could fault if %cr4 does not exist */
28393 - asm volatile("1: movl %%cr4, %0 \n"
28394 - "2: \n"
28395 - ".section __ex_table,\"a\" \n"
28396 - ".long 1b,2b \n"
28397 - ".previous \n"
28398 - : "=r" (val): "0" (0));
28399 - return val;
28400 -}
28401 -
28402 -static inline void xen_write_cr4(unsigned long val)
28403 -{
28404 - asm volatile("movl %0,%%cr4": :"r" (val));
28405 -}
28406 -
28407 -static inline void xen_wbinvd(void)
28408 -{
28409 - asm volatile("wbinvd": : :"memory");
28410 -}
28411 -
28412 -static inline void clflush(volatile void *__p)
28413 -{
28414 - asm volatile("clflush %0" : "+m" (*(char __force *)__p));
28415 -}
28416 -
28417 -#define read_cr0() (xen_read_cr0())
28418 -#define write_cr0(x) (xen_write_cr0(x))
28419 -#define read_cr2() (xen_read_cr2())
28420 -#define write_cr2(x) (xen_write_cr2(x))
28421 -#define read_cr3() (xen_read_cr3())
28422 -#define write_cr3(x) (xen_write_cr3(x))
28423 -#define read_cr4() (xen_read_cr4())
28424 -#define read_cr4_safe() (xen_read_cr4_safe())
28425 -#define write_cr4(x) (xen_write_cr4(x))
28426 -#define wbinvd() (xen_wbinvd())
28427 -
28428 -/* Clear the 'TS' bit */
28429 -#define clts() (xen_clts())
28430 -
28431 -/* Set the 'TS' bit */
28432 -#define stts() (HYPERVISOR_fpu_taskswitch(1))
28433 -
28434 -#endif /* __KERNEL__ */
28435 -
28436 -static inline unsigned long get_limit(unsigned long segment)
28437 -{
28438 - unsigned long __limit;
28439 - __asm__("lsll %1,%0"
28440 - :"=r" (__limit):"r" (segment));
28441 - return __limit+1;
28442 -}
28443 -
28444 -#define nop() __asm__ __volatile__ ("nop")
28445 -
28446 -/*
28447 - * Force strict CPU ordering.
28448 - * And yes, this is required on UP too when we're talking
28449 - * to devices.
28450 - *
28451 - * For now, "wmb()" doesn't actually do anything, as all
28452 - * Intel CPU's follow what Intel calls a *Processor Order*,
28453 - * in which all writes are seen in the program order even
28454 - * outside the CPU.
28455 - *
28456 - * I expect future Intel CPU's to have a weaker ordering,
28457 - * but I'd also expect them to finally get their act together
28458 - * and add some real memory barriers if so.
28459 - *
28460 - * Some non intel clones support out of order store. wmb() ceases to be a
28461 - * nop for these.
28462 - */
28463 -
28464 -
28465 -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
28466 -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
28467 -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
28468 -
28469 -/**
28470 - * read_barrier_depends - Flush all pending reads that subsequents reads
28471 - * depend on.
28472 - *
28473 - * No data-dependent reads from memory-like regions are ever reordered
28474 - * over this barrier. All reads preceding this primitive are guaranteed
28475 - * to access memory (but not necessarily other CPUs' caches) before any
28476 - * reads following this primitive that depend on the data return by
28477 - * any of the preceding reads. This primitive is much lighter weight than
28478 - * rmb() on most CPUs, and is never heavier weight than is
28479 - * rmb().
28480 - *
28481 - * These ordering constraints are respected by both the local CPU
28482 - * and the compiler.
28483 - *
28484 - * Ordering is not guaranteed by anything other than these primitives,
28485 - * not even by data dependencies. See the documentation for
28486 - * memory_barrier() for examples and URLs to more information.
28487 - *
28488 - * For example, the following code would force ordering (the initial
28489 - * value of "a" is zero, "b" is one, and "p" is "&a"):
28490 - *
28491 - * <programlisting>
28492 - * CPU 0 CPU 1
28493 - *
28494 - * b = 2;
28495 - * memory_barrier();
28496 - * p = &b; q = p;
28497 - * read_barrier_depends();
28498 - * d = *q;
28499 - * </programlisting>
28500 - *
28501 - * because the read of "*q" depends on the read of "p" and these
28502 - * two reads are separated by a read_barrier_depends(). However,
28503 - * the following code, with the same initial values for "a" and "b":
28504 - *
28505 - * <programlisting>
28506 - * CPU 0 CPU 1
28507 - *
28508 - * a = 2;
28509 - * memory_barrier();
28510 - * b = 3; y = b;
28511 - * read_barrier_depends();
28512 - * x = a;
28513 - * </programlisting>
28514 - *
28515 - * does not enforce ordering, since there is no data dependency between
28516 - * the read of "a" and the read of "b". Therefore, on some CPUs, such
28517 - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
28518 - * in cases like this where there are no data dependencies.
28519 - **/
28520 -
28521 -#define read_barrier_depends() do { } while(0)
28522 -
28523 -#ifdef CONFIG_SMP
28524 -#define smp_mb() mb()
28525 -#ifdef CONFIG_X86_PPRO_FENCE
28526 -# define smp_rmb() rmb()
28527 -#else
28528 -# define smp_rmb() barrier()
28529 -#endif
28530 -#ifdef CONFIG_X86_OOSTORE
28531 -# define smp_wmb() wmb()
28532 -#else
28533 -# define smp_wmb() barrier()
28534 -#endif
28535 -#define smp_read_barrier_depends() read_barrier_depends()
28536 -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
28537 -#else
28538 -#define smp_mb() barrier()
28539 -#define smp_rmb() barrier()
28540 -#define smp_wmb() barrier()
28541 -#define smp_read_barrier_depends() do { } while(0)
28542 -#define set_mb(var, value) do { var = value; barrier(); } while (0)
28543 -#endif
28544 -
28545 -#include <linux/irqflags.h>
28546 -
28547 -/*
28548 - * disable hlt during certain critical i/o operations
28549 - */
28550 -#define HAVE_DISABLE_HLT
28551 -void disable_hlt(void);
28552 -void enable_hlt(void);
28553 -
28554 -extern int es7000_plat;
28555 -void cpu_idle_wait(void);
28556 -
28557 -extern unsigned long arch_align_stack(unsigned long sp);
28558 -extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28559 -
28560 -void default_idle(void);
28561 -void __show_registers(struct pt_regs *, int all);
28562 -
28563 -#endif
28564 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/system_64.h 2009-02-16 16:18:36.000000000 +0100
28565 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/system_64.h 2009-03-16 16:33:40.000000000 +0100
28566 @@ -1,122 +1,9 @@
28567 #ifndef __ASM_SYSTEM_H
28568 #define __ASM_SYSTEM_H
28569
28570 -#include <linux/kernel.h>
28571 #include <asm/segment.h>
28572 #include <asm/cmpxchg.h>
28573
28574 -#include <asm/synch_bitops.h>
28575 -#include <asm/hypervisor.h>
28576 -#include <xen/interface/arch-x86_64.h>
28577 -
28578 -#ifdef __KERNEL__
28579 -
28580 -/* entries in ARCH_DLINFO: */
28581 -#ifdef CONFIG_IA32_EMULATION
28582 -# define AT_VECTOR_SIZE_ARCH 2
28583 -#else
28584 -# define AT_VECTOR_SIZE_ARCH 1
28585 -#endif
28586 -
28587 -#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
28588 -#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
28589 -
28590 -/* frame pointer must be last for get_wchan */
28591 -#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
28592 -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\n\t"
28593 -
28594 -#define __EXTRA_CLOBBER \
28595 - ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
28596 -
28597 -/* Save restore flags to clear handle leaking NT */
28598 -#define switch_to(prev,next,last) \
28599 - asm volatile(SAVE_CONTEXT \
28600 - "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
28601 - "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
28602 - "call __switch_to\n\t" \
28603 - ".globl thread_return\n" \
28604 - "thread_return:\n\t" \
28605 - "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
28606 - "movq %P[thread_info](%%rsi),%%r8\n\t" \
28607 - LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
28608 - "movq %%rax,%%rdi\n\t" \
28609 - "jc ret_from_fork\n\t" \
28610 - RESTORE_CONTEXT \
28611 - : "=a" (last) \
28612 - : [next] "S" (next), [prev] "D" (prev), \
28613 - [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
28614 - [ti_flags] "i" (offsetof(struct thread_info, flags)),\
28615 - [tif_fork] "i" (TIF_FORK), \
28616 - [thread_info] "i" (offsetof(struct task_struct, stack)), \
28617 - [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
28618 - : "memory", "cc" __EXTRA_CLOBBER)
28619 -
28620 -extern void load_gs_index(unsigned);
28621 -
28622 -/*
28623 - * Load a segment. Fall back on loading the zero
28624 - * segment if something goes wrong..
28625 - */
28626 -#define loadsegment(seg,value) \
28627 - asm volatile("\n" \
28628 - "1:\t" \
28629 - "movl %k0,%%" #seg "\n" \
28630 - "2:\n" \
28631 - ".section .fixup,\"ax\"\n" \
28632 - "3:\t" \
28633 - "movl %1,%%" #seg "\n\t" \
28634 - "jmp 2b\n" \
28635 - ".previous\n" \
28636 - ".section __ex_table,\"a\"\n\t" \
28637 - ".align 8\n\t" \
28638 - ".quad 1b,3b\n" \
28639 - ".previous" \
28640 - : :"r" (value), "r" (0))
28641 -
28642 -/*
28643 - * Clear and set 'TS' bit respectively
28644 - */
28645 -#define clts() (HYPERVISOR_fpu_taskswitch(0))
28646 -
28647 -static inline unsigned long read_cr0(void)
28648 -{
28649 - unsigned long cr0;
28650 - asm volatile("movq %%cr0,%0" : "=r" (cr0));
28651 - return cr0;
28652 -}
28653 -
28654 -static inline void write_cr0(unsigned long val)
28655 -{
28656 - asm volatile("movq %0,%%cr0" :: "r" (val));
28657 -}
28658 -
28659 -#define read_cr2() current_vcpu_info()->arch.cr2
28660 -
28661 -#define write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val)))
28662 -
28663 -#define read_cr3() ({ \
28664 - unsigned long __dummy; \
28665 - asm volatile("movq %%cr3,%0" : "=r" (__dummy)); \
28666 - machine_to_phys(__dummy); \
28667 -})
28668 -
28669 -static inline void write_cr3(unsigned long val)
28670 -{
28671 - val = phys_to_machine(val);
28672 - asm volatile("movq %0,%%cr3" :: "r" (val) : "memory");
28673 -}
28674 -
28675 -static inline unsigned long read_cr4(void)
28676 -{
28677 - unsigned long cr4;
28678 - asm volatile("movq %%cr4,%0" : "=r" (cr4));
28679 - return cr4;
28680 -}
28681 -
28682 -static inline void write_cr4(unsigned long val)
28683 -{
28684 - asm volatile("movq %0,%%cr4" :: "r" (val) : "memory");
28685 -}
28686
28687 static inline unsigned long read_cr8(void)
28688 {
28689 @@ -128,52 +15,6 @@ static inline void write_cr8(unsigned lo
28690 BUG_ON(val);
28691 }
28692
28693 -#define stts() (HYPERVISOR_fpu_taskswitch(1))
28694 -
28695 -#define wbinvd() \
28696 - __asm__ __volatile__ ("wbinvd": : :"memory")
28697 -
28698 -#endif /* __KERNEL__ */
28699 -
28700 -static inline void clflush(volatile void *__p)
28701 -{
28702 - asm volatile("clflush %0" : "+m" (*(char __force *)__p));
28703 -}
28704 -
28705 -#define nop() __asm__ __volatile__ ("nop")
28706 -
28707 -#ifdef CONFIG_SMP
28708 -#define smp_mb() mb()
28709 -#define smp_rmb() barrier()
28710 -#define smp_wmb() barrier()
28711 -#define smp_read_barrier_depends() do {} while(0)
28712 -#else
28713 -#define smp_mb() barrier()
28714 -#define smp_rmb() barrier()
28715 -#define smp_wmb() barrier()
28716 -#define smp_read_barrier_depends() do {} while(0)
28717 -#endif
28718 -
28719 -
28720 -/*
28721 - * Force strict CPU ordering.
28722 - * And yes, this is required on UP too when we're talking
28723 - * to devices.
28724 - */
28725 -#define mb() asm volatile("mfence":::"memory")
28726 -#define rmb() asm volatile("lfence":::"memory")
28727 -#define wmb() asm volatile("sfence" ::: "memory")
28728 -
28729 -#define read_barrier_depends() do {} while(0)
28730 -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
28731 -
28732 -#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
28733 -
28734 #include <linux/irqflags.h>
28735
28736 -void cpu_idle_wait(void);
28737 -
28738 -extern unsigned long arch_align_stack(unsigned long sp);
28739 -extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28740 -
28741 #endif
28742 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/tlbflush.h 2009-02-16 16:18:36.000000000 +0100
28743 +++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/tlbflush.h 2009-03-16 16:33:40.000000000 +0100
28744 @@ -1,5 +1,106 @@
28745 +#ifndef _ASM_X86_TLBFLUSH_H
28746 +#define _ASM_X86_TLBFLUSH_H
28747 +
28748 +#include <linux/mm.h>
28749 +#include <linux/sched.h>
28750 +
28751 +#include <asm/processor.h>
28752 +#include <asm/system.h>
28753 +
28754 +#define __flush_tlb() xen_tlb_flush()
28755 +#define __flush_tlb_global() xen_tlb_flush()
28756 +#define __flush_tlb_single(addr) xen_invlpg(addr)
28757 +#define __flush_tlb_all() xen_tlb_flush()
28758 +#define __flush_tlb_one(addr) xen_invlpg(addr)
28759 +
28760 #ifdef CONFIG_X86_32
28761 -# include "tlbflush_32.h"
28762 +# define TLB_FLUSH_ALL 0xffffffff
28763 #else
28764 -# include "tlbflush_64.h"
28765 +# define TLB_FLUSH_ALL -1ULL
28766 #endif
28767 +
28768 +/*
28769 + * TLB flushing:
28770 + *
28771 + * - flush_tlb() flushes the current mm struct TLBs
28772 + * - flush_tlb_all() flushes all processes TLBs
28773 + * - flush_tlb_mm(mm) flushes the specified mm context TLB's
28774 + * - flush_tlb_page(vma, vmaddr) flushes one page
28775 + * - flush_tlb_range(vma, start, end) flushes a range of pages
28776 + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
28777 + *
28778 + * ..but the i386 has somewhat limited tlb flushing capabilities,
28779 + * and page-granular flushes are available only on i486 and up.
28780 + *
28781 + * x86-64 can only flush individual pages or full VMs. For a range flush
28782 + * we always do the full VM. Might be worth trying if for a small
28783 + * range a few INVLPGs in a row are a win.
28784 + */
28785 +
28786 +#ifndef CONFIG_SMP
28787 +
28788 +#define flush_tlb() __flush_tlb()
28789 +#define flush_tlb_all() __flush_tlb_all()
28790 +#define local_flush_tlb() __flush_tlb()
28791 +
28792 +static inline void flush_tlb_mm(struct mm_struct *mm)
28793 +{
28794 + if (mm == current->active_mm)
28795 + __flush_tlb();
28796 +}
28797 +
28798 +static inline void flush_tlb_page(struct vm_area_struct *vma,
28799 + unsigned long addr)
28800 +{
28801 + if (vma->vm_mm == current->active_mm)
28802 + __flush_tlb_one(addr);
28803 +}
28804 +
28805 +static inline void flush_tlb_range(struct vm_area_struct *vma,
28806 + unsigned long start, unsigned long end)
28807 +{
28808 + if (vma->vm_mm == current->active_mm)
28809 + __flush_tlb();
28810 +}
28811 +
28812 +#else /* SMP */
28813 +
28814 +#include <asm/smp.h>
28815 +
28816 +#define local_flush_tlb() __flush_tlb()
28817 +
28818 +#define flush_tlb_all xen_tlb_flush_all
28819 +#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
28820 +#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
28821 +#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
28822 +
28823 +#define flush_tlb() flush_tlb_current_task()
28824 +
28825 +static inline void flush_tlb_range(struct vm_area_struct *vma,
28826 + unsigned long start, unsigned long end)
28827 +{
28828 + flush_tlb_mm(vma->vm_mm);
28829 +}
28830 +
28831 +#define TLBSTATE_OK 1
28832 +#define TLBSTATE_LAZY 2
28833 +
28834 +#ifdef CONFIG_X86_32
28835 +struct tlb_state
28836 +{
28837 + struct mm_struct *active_mm;
28838 + int state;
28839 + char __cacheline_padding[L1_CACHE_BYTES-8];
28840 +};
28841 +DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
28842 +#endif
28843 +
28844 +#endif /* SMP */
28845 +
28846 +static inline void flush_tlb_kernel_range(unsigned long start,
28847 + unsigned long end)
28848 +{
28849 + flush_tlb_all();
28850 +}
28851 +
28852 +#endif /* _ASM_X86_TLBFLUSH_H */
28853 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/tlbflush_32.h 2009-02-16 16:18:36.000000000 +0100
28854 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
28855 @@ -1,99 +0,0 @@
28856 -#ifndef _I386_TLBFLUSH_H
28857 -#define _I386_TLBFLUSH_H
28858 -
28859 -#include <linux/mm.h>
28860 -#include <asm/processor.h>
28861 -
28862 -#define __flush_tlb() xen_tlb_flush()
28863 -#define __flush_tlb_global() xen_tlb_flush()
28864 -#define __flush_tlb_all() xen_tlb_flush()
28865 -
28866 -#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
28867 -
28868 -#define __flush_tlb_single(addr) xen_invlpg(addr)
28869 -
28870 -#define __flush_tlb_one(addr) __flush_tlb_single(addr)
28871 -
28872 -/*
28873 - * TLB flushing:
28874 - *
28875 - * - flush_tlb() flushes the current mm struct TLBs
28876 - * - flush_tlb_all() flushes all processes TLBs
28877 - * - flush_tlb_mm(mm) flushes the specified mm context TLB's
28878 - * - flush_tlb_page(vma, vmaddr) flushes one page
28879 - * - flush_tlb_range(vma, start, end) flushes a range of pages
28880 - * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
28881 - *
28882 - * ..but the i386 has somewhat limited tlb flushing capabilities,
28883 - * and page-granular flushes are available only on i486 and up.
28884 - */
28885 -
28886 -#define TLB_FLUSH_ALL 0xffffffff
28887 -
28888 -
28889 -#ifndef CONFIG_SMP
28890 -
28891 -#include <linux/sched.h>
28892 -
28893 -#define flush_tlb() __flush_tlb()
28894 -#define flush_tlb_all() __flush_tlb_all()
28895 -#define local_flush_tlb() __flush_tlb()
28896 -
28897 -static inline void flush_tlb_mm(struct mm_struct *mm)
28898 -{
28899 - if (mm == current->active_mm)
28900 - __flush_tlb();
28901 -}
28902 -
28903 -static inline void flush_tlb_page(struct vm_area_struct *vma,
28904 - unsigned long addr)
28905 -{
28906 - if (vma->vm_mm == current->active_mm)
28907 - __flush_tlb_one(addr);
28908 -}
28909 -
28910 -static inline void flush_tlb_range(struct vm_area_struct *vma,
28911 - unsigned long start, unsigned long end)
28912 -{
28913 - if (vma->vm_mm == current->active_mm)
28914 - __flush_tlb();
28915 -}
28916 -
28917 -#else /* SMP */
28918 -
28919 -#include <asm/smp.h>
28920 -
28921 -#define local_flush_tlb() \
28922 - __flush_tlb()
28923 -
28924 -#define flush_tlb_all xen_tlb_flush_all
28925 -#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
28926 -#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
28927 -#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
28928 -
28929 -#define flush_tlb() flush_tlb_current_task()
28930 -
28931 -static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
28932 -{
28933 - flush_tlb_mm(vma->vm_mm);
28934 -}
28935 -
28936 -#define TLBSTATE_OK 1
28937 -#define TLBSTATE_LAZY 2
28938 -
28939 -struct tlb_state
28940 -{
28941 - struct mm_struct *active_mm;
28942 - int state;
28943 - char __cacheline_padding[L1_CACHE_BYTES-8];
28944 -};
28945 -DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
28946 -#endif /* SMP */
28947 -
28948 -static inline void flush_tlb_kernel_range(unsigned long start,
28949 - unsigned long end)
28950 -{
28951 - flush_tlb_all();
28952 -}
28953 -
28954 -#endif /* _I386_TLBFLUSH_H */
28955 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/tlbflush_64.h 2009-02-16 16:18:36.000000000 +0100
28956 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
28957 @@ -1,97 +0,0 @@
28958 -#ifndef _X8664_TLBFLUSH_H
28959 -#define _X8664_TLBFLUSH_H
28960 -
28961 -#include <linux/mm.h>
28962 -#include <linux/sched.h>
28963 -#include <asm/processor.h>
28964 -#include <asm/system.h>
28965 -
28966 -#define __flush_tlb() xen_tlb_flush()
28967 -
28968 -/*
28969 - * Global pages have to be flushed a bit differently. Not a real
28970 - * performance problem because this does not happen often.
28971 - */
28972 -#define __flush_tlb_global() xen_tlb_flush()
28973 -
28974 -#define __flush_tlb_all() __flush_tlb_global()
28975 -
28976 -#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
28977 -
28978 -
28979 -/*
28980 - * TLB flushing:
28981 - *
28982 - * - flush_tlb() flushes the current mm struct TLBs
28983 - * - flush_tlb_all() flushes all processes TLBs
28984 - * - flush_tlb_mm(mm) flushes the specified mm context TLB's
28985 - * - flush_tlb_page(vma, vmaddr) flushes one page
28986 - * - flush_tlb_range(vma, start, end) flushes a range of pages
28987 - * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
28988 - *
28989 - * x86-64 can only flush individual pages or full VMs. For a range flush
28990 - * we always do the full VM. Might be worth trying if for a small
28991 - * range a few INVLPGs in a row are a win.
28992 - */
28993 -
28994 -#ifndef CONFIG_SMP
28995 -
28996 -#define flush_tlb() __flush_tlb()
28997 -#define flush_tlb_all() __flush_tlb_all()
28998 -#define local_flush_tlb() __flush_tlb()
28999 -
29000 -static inline void flush_tlb_mm(struct mm_struct *mm)
29001 -{
29002 - if (mm == current->active_mm)
29003 - __flush_tlb();
29004 -}
29005 -
29006 -static inline void flush_tlb_page(struct vm_area_struct *vma,
29007 - unsigned long addr)
29008 -{
29009 - if (vma->vm_mm == current->active_mm)
29010 - __flush_tlb_one(addr);
29011 -}
29012 -
29013 -static inline void flush_tlb_range(struct vm_area_struct *vma,
29014 - unsigned long start, unsigned long end)
29015 -{
29016 - if (vma->vm_mm == current->active_mm)
29017 - __flush_tlb();
29018 -}
29019 -
29020 -#else
29021 -
29022 -#include <asm/smp.h>
29023 -
29024 -#define local_flush_tlb() \
29025 - __flush_tlb()
29026 -
29027 -#define flush_tlb_all xen_tlb_flush_all
29028 -#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
29029 -#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
29030 -#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
29031 -
29032 -#define flush_tlb() flush_tlb_current_task()
29033 -
29034 -static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
29035 -{
29036 - flush_tlb_mm(vma->vm_mm);
29037 -}
29038 -
29039 -#define TLBSTATE_OK 1
29040 -#define TLBSTATE_LAZY 2
29041 -
29042 -/* Roughly an IPI every 20MB with 4k pages for freeing page table
29043 - ranges. Cost is about 42k of memory for each CPU. */
29044 -#define ARCH_FREE_PTE_NR 5350
29045 -
29046 -#endif
29047 -
29048 -static inline void flush_tlb_kernel_range(unsigned long start,
29049 - unsigned long end)
29050 -{
29051 - flush_tlb_all();
29052 -}
29053 -
29054 -#endif /* _X8664_TLBFLUSH_H */
29055 --- sle11-2009-05-14.orig/include/asm-x86/mach-xen/irq_vectors.h 2009-05-14 10:56:29.000000000 +0200
29056 +++ sle11-2009-05-14/include/asm-x86/mach-xen/irq_vectors.h 2009-03-16 16:33:40.000000000 +0100
29057 @@ -82,7 +82,8 @@
29058
29059 #define RESCHEDULE_VECTOR 0
29060 #define CALL_FUNCTION_VECTOR 1
29061 -#define NR_IPIS 2
29062 +#define SPIN_UNLOCK_VECTOR 2
29063 +#define NR_IPIS 3
29064
29065 /*
29066 * The maximum number of vectors supported by i386 processors
29067 --- sle11-2009-05-14.orig/include/asm-x86/mmu.h 2009-02-16 16:18:36.000000000 +0100
29068 +++ sle11-2009-05-14/include/asm-x86/mmu.h 2009-03-16 16:33:40.000000000 +0100
29069 @@ -23,7 +23,7 @@ typedef struct {
29070 void *vdso;
29071 } mm_context_t;
29072
29073 -#ifdef CONFIG_SMP
29074 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
29075 void leave_mm(int cpu);
29076 #else
29077 static inline void leave_mm(int cpu)
29078 --- sle11-2009-05-14.orig/include/asm-x86/ptrace.h 2009-05-14 10:56:29.000000000 +0200
29079 +++ sle11-2009-05-14/include/asm-x86/ptrace.h 2009-03-16 16:33:40.000000000 +0100
29080 @@ -249,7 +249,9 @@ extern void user_enable_single_step(stru
29081 extern void user_disable_single_step(struct task_struct *);
29082
29083 extern void user_enable_block_step(struct task_struct *);
29084 -#ifdef CONFIG_X86_DEBUGCTLMSR
29085 +#if defined(CONFIG_XEN)
29086 +#define arch_has_block_step() (0)
29087 +#elif defined(CONFIG_X86_DEBUGCTLMSR)
29088 #define arch_has_block_step() (1)
29089 #else
29090 #define arch_has_block_step() (boot_cpu_data.x86 >= 6)
29091 --- sle11-2009-05-14.orig/include/asm-x86/thread_info.h 2009-02-16 16:17:21.000000000 +0100
29092 +++ sle11-2009-05-14/include/asm-x86/thread_info.h 2009-03-16 16:33:40.000000000 +0100
29093 @@ -94,6 +94,9 @@ struct thread_info {
29094 #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
29095 #define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
29096 #define TIF_PERFMON_CTXSW 28 /* perfmon needs ctxsw calls */
29097 +#ifdef CONFIG_X86_XEN
29098 +#define TIF_CSTAR 31 /* cstar-based syscall (special handling) */
29099 +#endif
29100
29101 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
29102 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
29103 @@ -118,6 +121,7 @@ struct thread_info {
29104 #define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
29105 #define _TIF_PERFMON_WORK (1 << TIF_PERFMON_WORK)
29106 #define _TIF_PERFMON_CTXSW (1 << TIF_PERFMON_CTXSW)
29107 +#define _TIF_CSTAR (1 << TIF_CSTAR)
29108
29109 /* work to do in syscall_trace_enter() */
29110 #define _TIF_WORK_SYSCALL_ENTRY \
29111 @@ -147,12 +151,12 @@ struct thread_info {
29112 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
29113 _TIF_NOTSC|_TIF_PERFMON_CTXSW)
29114
29115 -#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
29116 -#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
29117 #else
29118 -#define _TIF_WORK_CTXSW_NEXT (_TIF_NOTSC | _TIF_DEBUG)
29119 -#define _TIF_WORK_CTXSW_PREV (_TIF_NOTSC)
29120 +#define _TIF_WORK_CTXSW (_TIF_NOTSC \
29121 + /*todo | _TIF_DEBUGCTLMSR | _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS*/)
29122 #endif
29123 +#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
29124 +#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
29125
29126 #define PREEMPT_ACTIVE 0x10000000
29127
29128 --- sle11-2009-05-14.orig/include/asm-x86/time.h 2009-05-14 10:56:29.000000000 +0200
29129 +++ sle11-2009-05-14/include/asm-x86/time.h 2009-03-16 16:33:40.000000000 +0100
29130 @@ -58,4 +58,10 @@ static inline int native_set_wallclock(u
29131
29132 extern unsigned long __init calibrate_cpu(void);
29133
29134 +#ifdef CONFIG_XEN
29135 +extern int xen_independent_wallclock(void);
29136 +extern unsigned long xen_read_persistent_clock(void);
29137 +extern int xen_update_persistent_clock(void);
29138 +#endif
29139 +
29140 #endif
29141 --- sle11-2009-05-14.orig/include/linux/page-flags.h 2009-02-16 16:17:21.000000000 +0100
29142 +++ sle11-2009-05-14/include/linux/page-flags.h 2009-03-16 16:33:40.000000000 +0100
29143 @@ -102,8 +102,8 @@ enum pageflags {
29144 PG_foreign, /* Page is owned by foreign allocator. */
29145 PG_pinned, /* Cannot alias with PG_owner_priv_1 since
29146 * bad_page() checks include this bit.
29147 - * Also cannot use PG_arch_1 since that now
29148 - * has a different purpose on x86. */
29149 + * Should not use PG_arch_1 as that may have
29150 + * a different purpose elsewhere. */
29151 #endif
29152 __NR_PAGEFLAGS,
29153
29154 --- sle11-2009-05-14.orig/include/linux/pci.h 2008-12-15 11:27:22.000000000 +0100
29155 +++ sle11-2009-05-14/include/linux/pci.h 2009-03-16 16:33:40.000000000 +0100
29156 @@ -644,6 +644,9 @@ int pcie_set_readrq(struct pci_dev *dev,
29157 void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno);
29158 int __must_check pci_assign_resource(struct pci_dev *dev, int i);
29159 int pci_select_bars(struct pci_dev *dev, unsigned long flags);
29160 +#ifdef CONFIG_XEN
29161 +void pci_restore_bars(struct pci_dev *);
29162 +#endif
29163
29164 /* ROM control related routines */
29165 void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size);
29166 --- sle11-2009-05-14.orig/include/xen/evtchn.h 2009-03-04 11:28:34.000000000 +0100
29167 +++ sle11-2009-05-14/include/xen/evtchn.h 2009-03-16 16:33:40.000000000 +0100
29168 @@ -130,12 +130,37 @@ static inline void clear_evtchn(int port
29169 synch_clear_bit(port, s->evtchn_pending);
29170 }
29171
29172 +static inline void set_evtchn(int port)
29173 +{
29174 + shared_info_t *s = HYPERVISOR_shared_info;
29175 + synch_set_bit(port, s->evtchn_pending);
29176 +}
29177 +
29178 +static inline int test_evtchn(int port)
29179 +{
29180 + shared_info_t *s = HYPERVISOR_shared_info;
29181 + return synch_test_bit(port, s->evtchn_pending);
29182 +}
29183 +
29184 static inline void notify_remote_via_evtchn(int port)
29185 {
29186 struct evtchn_send send = { .port = port };
29187 VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send));
29188 }
29189
29190 +/* Clear an irq's pending state, in preparation for polling on it. */
29191 +void xen_clear_irq_pending(int irq);
29192 +
29193 +/* Set an irq's pending state, to avoid blocking on it. */
29194 +void xen_set_irq_pending(int irq);
29195 +
29196 +/* Test an irq's pending state. */
29197 +int xen_test_irq_pending(int irq);
29198 +
29199 +/* Poll waiting for an irq to become pending. In the usual case, the
29200 + irq will be disabled so it won't deliver an interrupt. */
29201 +void xen_poll_irq(int irq);
29202 +
29203 /*
29204 * Use these to access the event channel underlying the IRQ handle returned
29205 * by bind_*_to_irqhandler().
29206 --- sle11-2009-05-14.orig/kernel/sysctl_check.c 2009-02-16 16:18:36.000000000 +0100
29207 +++ sle11-2009-05-14/kernel/sysctl_check.c 2009-03-16 16:33:40.000000000 +0100
29208 @@ -899,7 +899,7 @@ static const struct trans_ctl_table tran
29209 };
29210
29211 #ifdef CONFIG_XEN
29212 -static struct trans_ctl_table trans_xen_table[] = {
29213 +static const struct trans_ctl_table trans_xen_table[] = {
29214 { CTL_XEN_INDEPENDENT_WALLCLOCK, "independent_wallclock" },
29215 { CTL_XEN_PERMITTED_CLOCK_JITTER, "permitted_clock_jitter" },
29216 {}
29217 --- sle11-2009-05-14.orig/lib/swiotlb-xen.c 2009-02-16 16:18:36.000000000 +0100
29218 +++ sle11-2009-05-14/lib/swiotlb-xen.c 2009-03-16 16:33:40.000000000 +0100
29219 @@ -30,7 +30,6 @@
29220 #include <asm/gnttab_dma.h>
29221
29222 int swiotlb;
29223 -EXPORT_SYMBOL(swiotlb);
29224
29225 #define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
29226
29227 @@ -289,6 +288,15 @@ __sync_single(struct phys_addr buffer, c
29228 }
29229 }
29230
29231 +static inline unsigned int is_span_boundary(unsigned int index,
29232 + unsigned int nslots,
29233 + unsigned long offset_slots,
29234 + unsigned long max_slots)
29235 +{
29236 + unsigned long offset = (offset_slots + index) & (max_slots - 1);
29237 + return offset + nslots > max_slots;
29238 +}
29239 +
29240 /*
29241 * Allocates bounce buffer and returns its kernel virtual address.
29242 */
29243 @@ -300,6 +308,15 @@ map_single(struct device *hwdev, struct
29244 unsigned int nslots, stride, index, wrap;
29245 struct phys_addr slot_buf;
29246 int i;
29247 + unsigned long mask;
29248 + unsigned long offset_slots;
29249 + unsigned long max_slots;
29250 +
29251 + mask = dma_get_seg_boundary(hwdev);
29252 + offset_slots = -IO_TLB_SEGSIZE;
29253 + max_slots = mask + 1
29254 + ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
29255 + : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
29256
29257 /*
29258 * For mappings greater than a page, we limit the stride (and
29259 @@ -319,12 +336,21 @@ map_single(struct device *hwdev, struct
29260 */
29261 spin_lock_irqsave(&io_tlb_lock, flags);
29262 {
29263 - wrap = index = ALIGN(io_tlb_index, stride);
29264 -
29265 + index = ALIGN(io_tlb_index, stride);
29266 if (index >= iotlb_nslabs)
29267 - wrap = index = 0;
29268 + index = 0;
29269 + wrap = index;
29270
29271 do {
29272 + while (is_span_boundary(index, nslots, offset_slots,
29273 + max_slots)) {
29274 + index += stride;
29275 + if (index >= iotlb_nslabs)
29276 + index = 0;
29277 + if (index == wrap)
29278 + goto not_found;
29279 + }
29280 +
29281 /*
29282 * If we find a slot that indicates we have 'nslots'
29283 * number of contiguous buffers, we allocate the
29284 @@ -359,6 +385,7 @@ map_single(struct device *hwdev, struct
29285 index = 0;
29286 } while (index != wrap);
29287
29288 + not_found:
29289 spin_unlock_irqrestore(&io_tlb_lock, flags);
29290 return NULL;
29291 }