src/patches/60034_xen3-patch-2.6.25.patch1

   1 From: kernel.org
   2 Subject: 2.6.25
   3 Patch-mainline: 2.6.25
   4
   5 Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
   6
   7 Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches.py
   8
   9 ---
  10  arch/x86/Kconfig                              |   18
  11  arch/x86/Kconfig.debug                        |    1
  12  arch/x86/ia32/ia32entry-xen.S                 |   12
  13  arch/x86/kernel/Makefile                      |    3
  14  arch/x86/kernel/acpi/boot.c                   |    3
  15  arch/x86/kernel/acpi/sleep-xen.c              |   95 +
  16  arch/x86/kernel/acpi/sleep_32-xen.c           |  117 --
  17  arch/x86/kernel/acpi/sleep_64-xen.c           |  125 --
  18  arch/x86/kernel/apic_32-xen.c                 |    2
  19  arch/x86/kernel/apic_64-xen.c                 |   73 -
  20  arch/x86/kernel/asm-offsets_32.c              |    2
  21  arch/x86/kernel/cpu/common-xen.c              |  214 +--
  22  arch/x86/kernel/cpu/mtrr/main-xen.c           |   19
  23  arch/x86/kernel/e820_32-xen.c                 |  275 -----
  24  arch/x86/kernel/e820_64-xen.c                 |  485 +++++---
  25  arch/x86/kernel/early_printk-xen.c            |    2
  26  arch/x86/kernel/entry_32-xen.S                |  195 +++
  27  arch/x86/kernel/entry_64-xen.S                |   91 -
  28  arch/x86/kernel/fixup.c                       |    2
  29  arch/x86/kernel/genapic_64-xen.c              |   15
  30  arch/x86/kernel/head64-xen.c                  |   63 +
  31  arch/x86/kernel/head_32-xen.S                 |    3
  32  arch/x86/kernel/init_task-xen.c               |    2
  33  arch/x86/kernel/io_apic_32-xen.c              |   15
  34  arch/x86/kernel/io_apic_64-xen.c              |  110 +-
  35  arch/x86/kernel/ioport-xen.c                  |  112 ++
  36  arch/x86/kernel/ioport_32-xen.c               |  121 --
  37  arch/x86/kernel/ioport_64-xen.c               |   99 -
  38  arch/x86/kernel/irq_32-xen.c                  |   22
  39  arch/x86/kernel/irq_64-xen.c                  |   43
  40  arch/x86/kernel/ldt-xen.c                     |  272 +++++
  41  arch/x86/kernel/ldt_32-xen.c                  |  265 ----
  42  arch/x86/kernel/ldt_64-xen.c                  |  271 ----
  43  arch/x86/kernel/machine_kexec_64.c            |    2
  44  arch/x86/kernel/microcode-xen.c               |    2
  45  arch/x86/kernel/mpparse_32-xen.c              |   49
  46  arch/x86/kernel/mpparse_64-xen.c              |   30
  47  arch/x86/kernel/pci-dma-xen.c                 |   20
  48  arch/x86/kernel/process_32-xen.c              |  438 ++------
  49  arch/x86/kernel/process_64-xen.c              |  303 ++---
  50  arch/x86/kernel/quirks-xen.c                  |   82 -
  51  arch/x86/kernel/rtc.c                         |    8
  52  arch/x86/kernel/setup64-xen.c                 |   70 +
  53  arch/x86/kernel/setup_32-xen.c                |  311 ++++-
  54  arch/x86/kernel/setup_64-xen.c                |  686 ++++++------
  55  arch/x86/kernel/smp_32-xen.c                  |    5
  56  arch/x86/kernel/smp_64-xen.c                  |   91 -
  57  arch/x86/kernel/time_32-xen.c                 |  136 --
  58  arch/x86/kernel/traps_32-xen.c                |  320 +++--
  59  arch/x86/kernel/traps_64-xen.c                |  371 +++---
  60  arch/x86/kernel/vsyscall_64-xen.c             |   60 -
  61  arch/x86/kernel/xen_entry_64.S                |   36
  62  arch/x86/mach-xen/setup.c                     |   11
  63  arch/x86/mm/fault-xen.c                       | 1026 ++++++++++++++++++
  64  arch/x86/mm/fault_32-xen.c                    |  757 -------------
  65  arch/x86/mm/fault_64-xen.c                    |  686 ------------
  66  arch/x86/mm/highmem_32-xen.c                  |   45
  67  arch/x86/mm/hypervisor.c                      |   10
  68  arch/x86/mm/init_32-xen.c                     |  464 +++-----
  69  arch/x86/mm/init_64-xen.c                     |  517 ++++-----
  70  arch/x86/mm/ioremap-xen.c                     |  685 ++++++++++++
  71  arch/x86/mm/ioremap_32-xen.c                  |  445 --------
  72  arch/x86/mm/pageattr-xen.c                    | 1412 ++++++++++++++++++++++++++
  73  arch/x86/mm/pageattr_64-xen.c                 |  542 ---------
  74  arch/x86/mm/pgtable_32-xen.c                  |  672 ++----------
  75  arch/x86/pci/irq-xen.c                        |   24
  76  arch/x86/vdso/Makefile                        |    1
  77  arch/x86/vdso/vdso32-setup-xen.c              |  506 +++++++++
  78  arch/x86/vdso/vdso32-setup.c                  |   34
  79  arch/x86/vdso/vdso32.S                        |   12
  80  arch/x86/vdso/vdso32/syscall.S                |    2
  81  drivers/pci/msi-xen.c                         |   98 -
  82  drivers/pci/pci.c                             |    5
  83  drivers/xen/balloon/sysfs.c                   |    2
  84  drivers/xen/blkback/blkback.c                 |    5
  85  drivers/xen/blkfront/blkfront.c               |    9
  86  drivers/xen/blktap/blktap.c                   |    8
  87  drivers/xen/core/Makefile                     |    1
  88  drivers/xen/core/evtchn.c                     |   46
  89  drivers/xen/core/hypervisor_sysfs.c           |    2
  90  drivers/xen/core/smpboot.c                    |   29
  91  drivers/xen/core/spinlock.c                   |  161 ++
  92  drivers/xen/core/xen_sysfs.c                  |   30
  93  drivers/xen/gntdev/gntdev.c                   |    4
  94  drivers/xen/scsifront/scsifront.c             |   49
  95  drivers/xen/xenoprof/xenoprofile.c            |    2
  96  include/asm-x86/mach-xen/asm/agp.h            |    9
  97  include/asm-x86/mach-xen/asm/desc.h           |  403 +++++++
  98  include/asm-x86/mach-xen/asm/desc_32.h        |  262 ----
  99  include/asm-x86/mach-xen/asm/desc_64.h        |  228 ----
 100  include/asm-x86/mach-xen/asm/dma-mapping_32.h |   18
 101  include/asm-x86/mach-xen/asm/fixmap_32.h      |   24
 102  include/asm-x86/mach-xen/asm/fixmap_64.h      |   25
 103  include/asm-x86/mach-xen/asm/highmem.h        |   10
 104  include/asm-x86/mach-xen/asm/hypervisor.h     |   19
 105  include/asm-x86/mach-xen/asm/io_32.h          |   69 -
 106  include/asm-x86/mach-xen/asm/io_64.h          |   62 -
 107  include/asm-x86/mach-xen/asm/irqflags.h       |  248 ++++
 108  include/asm-x86/mach-xen/asm/irqflags_32.h    |  212 ---
 109  include/asm-x86/mach-xen/asm/irqflags_64.h    |  178 ---
 110  include/asm-x86/mach-xen/asm/maddr_32.h       |   21
 111  include/asm-x86/mach-xen/asm/maddr_64.h       |   19
 112  include/asm-x86/mach-xen/asm/mmu_context_32.h |    2
 113  include/asm-x86/mach-xen/asm/mmu_context_64.h |   12
 114  include/asm-x86/mach-xen/asm/page.h           |  238 ++++
 115  include/asm-x86/mach-xen/asm/page_64.h        |  196 ---
 116  include/asm-x86/mach-xen/asm/pci.h            |   17
 117  include/asm-x86/mach-xen/asm/pci_64.h         |    1
 118  include/asm-x86/mach-xen/asm/pgalloc_32.h     |  116 +-
 119  include/asm-x86/mach-xen/asm/pgalloc_64.h     |   87 -
 120  include/asm-x86/mach-xen/asm/pgtable-3level.h |  107 -
 121  include/asm-x86/mach-xen/asm/pgtable.h        |  449 ++++++++
 122  include/asm-x86/mach-xen/asm/pgtable_32.h     |  361 ------
 123  include/asm-x86/mach-xen/asm/pgtable_64.h     |  400 +------
 124  include/asm-x86/mach-xen/asm/processor.h      |  792 ++++++++++++++
 125  include/asm-x86/mach-xen/asm/processor_32.h   |  751 -------------
 126  include/asm-x86/mach-xen/asm/processor_64.h   |  461 --------
 127  include/asm-x86/mach-xen/asm/segment.h        |  203 +++
 128  include/asm-x86/mach-xen/asm/segment_32.h     |  150 --
 129  include/asm-x86/mach-xen/asm/smp_32.h         |  125 +-
 130  include/asm-x86/mach-xen/asm/smp_64.h         |  138 --
 131  include/asm-x86/mach-xen/asm/spinlock.h       |  333 ++++++
 132  include/asm-x86/mach-xen/asm/system.h         |  392 +++++++
 133  include/asm-x86/mach-xen/asm/system_32.h      |  312 -----
 134  include/asm-x86/mach-xen/asm/system_64.h      |  159 --
 135  include/asm-x86/mach-xen/asm/tlbflush.h       |  105 +
 136  include/asm-x86/mach-xen/asm/tlbflush_32.h    |   99 -
 137  include/asm-x86/mach-xen/asm/tlbflush_64.h    |   97 -
 138  include/asm-x86/mach-xen/irq_vectors.h        |    3
 139  include/asm-x86/mmu.h                         |    2
 140  include/asm-x86/ptrace.h                      |    4
 141  include/asm-x86/thread_info.h                 |   12
 142  include/asm-x86/time.h                        |    6
 143  include/linux/page-flags.h                    |    4
 144  include/linux/pci.h                           |    3
 145  include/xen/evtchn.h                          |   25
 146  kernel/sysctl_check.c                         |    2
 147  lib/swiotlb-xen.c                             |   35
 148  138 files changed, 11322 insertions(+), 11153 deletions(-)
 149
 150 --- a/arch/x86/ia32/ia32entry-xen.S
 151 +++ b/arch/x86/ia32/ia32entry-xen.S
 152 @@ -12,7 +12,6 @@
 153  #include <asm/ia32_unistd.h>
 154  #include <asm/thread_info.h>
 155  #include <asm/segment.h>
 156 -#include <asm/vsyscall32.h>
 157  #include <asm/irqflags.h>
 158  #include <linux/linkage.h>
 159
 160 @@ -99,10 +98,11 @@ ENTRY(ia32_sysenter_target)
 161         CFI_RESTORE     rcx
 162         movl    %ebp,%ebp               /* zero extension */
 163         movl    %eax,%eax
 164 +       movl    48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
 165         movl    $__USER32_DS,40(%rsp)
 166         movq    %rbp,32(%rsp)
 167         movl    $__USER32_CS,16(%rsp)
 168 -       movl    $VSYSCALL32_SYSEXIT,8(%rsp)
 169 +       movq    %r10,8(%rsp)
 170         movq    %rax,(%rsp)
 171         cld
 172         SAVE_ARGS 0,0,1
 173 @@ -582,8 +582,8 @@ ia32_sys_call_table:
 174         .quad compat_sys_futex          /* 240 */
 175         .quad compat_sys_sched_setaffinity
 176         .quad compat_sys_sched_getaffinity
 177 -       .quad sys32_set_thread_area
 178 -       .quad sys32_get_thread_area
 179 +       .quad sys_set_thread_area
 180 +       .quad sys_get_thread_area
 181         .quad compat_sys_io_setup       /* 245 */
 182         .quad sys_io_destroy
 183         .quad compat_sys_io_getevents
 184 @@ -661,7 +661,9 @@ ia32_sys_call_table:
 185         .quad sys_epoll_pwait
 186         .quad compat_sys_utimensat      /* 320 */
 187         .quad compat_sys_signalfd
 188 -       .quad compat_sys_timerfd
 189 +       .quad sys_timerfd_create
 190         .quad sys_eventfd
 191         .quad sys32_fallocate
 192 +       .quad compat_sys_timerfd_settime        /* 325 */
 193 +       .quad compat_sys_timerfd_gettime
 194  ia32_syscall_end:
 195 --- a/arch/x86/Kconfig
 196 +++ b/arch/x86/Kconfig
 197 @@ -27,7 +27,7 @@ config X86
 198         select HAVE_KRETPROBES
 199         select HAVE_DYNAMIC_FTRACE
 200         select HAVE_FTRACE
 201 -       select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 202 +       select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) && !XEN
 203         select HAVE_ARCH_KGDB if !X86_VOYAGER
 204         select HAVE_ARCH_TRACEHOOK
 205         select HAVE_GENERIC_DMA_COHERENT if X86_32
 206 @@ -208,14 +208,12 @@ config X86_TRAMPOLINE
 207         default y
 208
 209  config X86_NO_TSS
 210 -       bool
 211 +       def_bool y
 212         depends on XEN
 213 -       default y
 214
 215  config X86_NO_IDT
 216 -       bool
 217 +       def_bool y
 218         depends on XEN
 219 -       default y
 220
 221  config KTIME_SCALAR
 222         def_bool X86_32
 223 @@ -724,9 +722,8 @@ config X86_VISWS_APIC
 224         depends on X86_32 && X86_VISWS
 225
 226  config X86_XEN_GENAPIC
 227 -       bool
 228 +       def_bool y
 229         depends on X86_64_XEN
 230 -       default y
 231
 232  config X86_MCE
 233         bool "Machine Check Exception"
 234 @@ -1113,7 +1110,7 @@ config ARCH_DISCONTIGMEM_DEFAULT
 235
 236  config ARCH_SPARSEMEM_DEFAULT
 237         def_bool y
 238 -       depends on X86_64
 239 +       depends on X86_64 && !X86_64_XEN
 240
 241  config ARCH_SPARSEMEM_ENABLE
 242         def_bool y
 243 @@ -1743,10 +1740,10 @@ config PCI_MMCONFIG
 244         depends on X86_64 && PCI && ACPI
 245
 246  config XEN_PCIDEV_FRONTEND
 247 -       bool "Xen PCI Frontend" if X86_64
 248 +       def_bool y
 249 +       prompt "Xen PCI Frontend" if X86_64
 250         depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64)
 251         select HOTPLUG
 252 -       default y
 253         help
 254           The PCI device frontend driver allows the kernel to import arbitrary
 255           PCI devices from a PCI backend to support PCI driver domains.
 256 @@ -1754,7 +1751,6 @@ config XEN_PCIDEV_FRONTEND
 257  config XEN_PCIDEV_FE_DEBUG
 258         bool "Xen PCI Frontend Debugging"
 259         depends on XEN_PCIDEV_FRONTEND
 260 -       default n
 261         help
 262           Enables some debug statements within the PCI Frontend.
 263
 264 --- a/arch/x86/Kconfig.debug
 265 +++ b/arch/x86/Kconfig.debug
 266 @@ -266,6 +266,7 @@ config DEBUG_BOOT_PARAMS
 267         bool "Debug boot parameters"
 268         depends on DEBUG_KERNEL
 269         depends on DEBUG_FS
 270 +       depends on !XEN
 271         help
 272           This option will cause struct boot_params to be exported via debugfs.
 273
 274 --- a/arch/x86/kernel/acpi/boot.c
 275 +++ b/arch/x86/kernel/acpi/boot.c
 276 @@ -133,6 +133,9 @@ char *__init __acpi_map_table(unsigned l
 277  #ifndef CONFIG_XEN
 278         if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
 279                 return __va(phys);
 280 +#else
 281 +       if (phys + size <= (NR_FIX_ISAMAPS << PAGE_SHIFT))
 282 +               return isa_bus_to_virt(phys);
 283  #endif
 284
 285         offset = phys & (PAGE_SIZE - 1);
 286 --- a/arch/x86/kernel/acpi/sleep_32-xen.c
 287 +++ /dev/null
 288 @@ -1,117 +0,0 @@
 289 -/*
 290 - * sleep.c - x86-specific ACPI sleep support.
 291 - *
 292 - *  Copyright (C) 2001-2003 Patrick Mochel
 293 - *  Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
 294 - */
 295 -
 296 -#include <linux/acpi.h>
 297 -#include <linux/bootmem.h>
 298 -#include <linux/dmi.h>
 299 -#include <linux/cpumask.h>
 300 -
 301 -#include <asm/smp.h>
 302 -
 303 -#ifndef CONFIG_ACPI_PV_SLEEP
 304 -/* address in low memory of the wakeup routine. */
 305 -unsigned long acpi_wakeup_address = 0;
 306 -unsigned long acpi_realmode_flags;
 307 -extern char wakeup_start, wakeup_end;
 308 -
 309 -extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
 310 -#endif
 311 -
 312 -/**
 313 - * acpi_save_state_mem - save kernel state
 314 - *
 315 - * Create an identity mapped page table and copy the wakeup routine to
 316 - * low memory.
 317 - */
 318 -int acpi_save_state_mem(void)
 319 -{
 320 -#ifndef CONFIG_ACPI_PV_SLEEP
 321 -       if (!acpi_wakeup_address)
 322 -               return 1;
 323 -       memcpy((void *)acpi_wakeup_address, &wakeup_start,
 324 -              &wakeup_end - &wakeup_start);
 325 -       acpi_copy_wakeup_routine(acpi_wakeup_address);
 326 -#endif
 327 -       return 0;
 328 -}
 329 -
 330 -/*
 331 - * acpi_restore_state - undo effects of acpi_save_state_mem
 332 - */
 333 -void acpi_restore_state_mem(void)
 334 -{
 335 -}
 336 -
 337 -/**
 338 - * acpi_reserve_bootmem - do _very_ early ACPI initialisation
 339 - *
 340 - * We allocate a page from the first 1MB of memory for the wakeup
 341 - * routine for when we come back from a sleep state. The
 342 - * runtime allocator allows specification of <16MB pages, but not
 343 - * <1MB pages.
 344 - */
 345 -void __init acpi_reserve_bootmem(void)
 346 -{
 347 -#ifndef CONFIG_ACPI_PV_SLEEP
 348 -       if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
 349 -               printk(KERN_ERR
 350 -                      "ACPI: Wakeup code way too big, S3 disabled.\n");
 351 -               return;
 352 -       }
 353 -
 354 -       acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
 355 -       if (!acpi_wakeup_address)
 356 -               printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
 357 -#endif
 358 -}
 359 -
 360 -#ifndef CONFIG_ACPI_PV_SLEEP
 361 -static int __init acpi_sleep_setup(char *str)
 362 -{
 363 -       while ((str != NULL) && (*str != '\0')) {
 364 -               if (strncmp(str, "s3_bios", 7) == 0)
 365 -                       acpi_realmode_flags |= 1;
 366 -               if (strncmp(str, "s3_mode", 7) == 0)
 367 -                       acpi_realmode_flags |= 2;
 368 -               if (strncmp(str, "s3_beep", 7) == 0)
 369 -                       acpi_realmode_flags |= 4;
 370 -               str = strchr(str, ',');
 371 -               if (str != NULL)
 372 -                       str += strspn(str, ", \t");
 373 -       }
 374 -       return 1;
 375 -}
 376 -
 377 -__setup("acpi_sleep=", acpi_sleep_setup);
 378 -
 379 -/* Ouch, we want to delete this. We already have better version in userspace, in
 380 -   s2ram from suspend.sf.net project */
 381 -static __init int reset_videomode_after_s3(const struct dmi_system_id *d)
 382 -{
 383 -       acpi_realmode_flags |= 2;
 384 -       return 0;
 385 -}
 386 -
 387 -static __initdata struct dmi_system_id acpisleep_dmi_table[] = {
 388 -       {                       /* Reset video mode after returning from ACPI S3 sleep */
 389 -        .callback = reset_videomode_after_s3,
 390 -        .ident = "Toshiba Satellite 4030cdt",
 391 -        .matches = {
 392 -                    DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
 393 -                    },
 394 -        },
 395 -       {}
 396 -};
 397 -
 398 -static int __init acpisleep_dmi_init(void)
 399 -{
 400 -       dmi_check_system(acpisleep_dmi_table);
 401 -       return 0;
 402 -}
 403 -
 404 -core_initcall(acpisleep_dmi_init);
 405 -#endif /* CONFIG_ACPI_PV_SLEEP */
 406 --- a/arch/x86/kernel/acpi/sleep_64-xen.c
 407 +++ /dev/null
 408 @@ -1,125 +0,0 @@
 409 -/*
 410 - *  acpi.c - Architecture-Specific Low-Level ACPI Support
 411 - *
 412 - *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
 413 - *  Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
 414 - *  Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
 415 - *  Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
 416 - *  Copyright (C) 2003 Pavel Machek, SuSE Labs
 417 - *
 418 - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 419 - *
 420 - *  This program is free software; you can redistribute it and/or modify
 421 - *  it under the terms of the GNU General Public License as published by
 422 - *  the Free Software Foundation; either version 2 of the License, or
 423 - *  (at your option) any later version.
 424 - *
 425 - *  This program is distributed in the hope that it will be useful,
 426 - *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 427 - *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 428 - *  GNU General Public License for more details.
 429 - *
 430 - *  You should have received a copy of the GNU General Public License
 431 - *  along with this program; if not, write to the Free Software
 432 - *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 433 - *
 434 - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 435 - */
 436 -
 437 -#include <linux/kernel.h>
 438 -#include <linux/init.h>
 439 -#include <linux/types.h>
 440 -#include <linux/stddef.h>
 441 -#include <linux/slab.h>
 442 -#include <linux/pci.h>
 443 -#include <linux/bootmem.h>
 444 -#include <linux/acpi.h>
 445 -#include <linux/cpumask.h>
 446 -
 447 -#include <asm/mpspec.h>
 448 -#include <asm/io.h>
 449 -#include <asm/apic.h>
 450 -#include <asm/apicdef.h>
 451 -#include <asm/page.h>
 452 -#include <asm/pgtable.h>
 453 -#include <asm/pgalloc.h>
 454 -#include <asm/io_apic.h>
 455 -#include <asm/proto.h>
 456 -#include <asm/tlbflush.h>
 457 -
 458 -/* --------------------------------------------------------------------------
 459 -                              Low-Level Sleep Support
 460 -   -------------------------------------------------------------------------- */
 461 -
 462 -#ifndef CONFIG_ACPI_PV_SLEEP
 463 -/* address in low memory of the wakeup routine. */
 464 -unsigned long acpi_wakeup_address = 0;
 465 -unsigned long acpi_realmode_flags;
 466 -extern char wakeup_start, wakeup_end;
 467 -
 468 -extern unsigned long acpi_copy_wakeup_routine(unsigned long);
 469 -#endif
 470 -
 471 -/**
 472 - * acpi_save_state_mem - save kernel state
 473 - *
 474 - * Create an identity mapped page table and copy the wakeup routine to
 475 - * low memory.
 476 - */
 477 -int acpi_save_state_mem(void)
 478 -{
 479 -#ifndef CONFIG_ACPI_PV_SLEEP
 480 -       memcpy((void *)acpi_wakeup_address, &wakeup_start,
 481 -              &wakeup_end - &wakeup_start);
 482 -       acpi_copy_wakeup_routine(acpi_wakeup_address);
 483 -#endif
 484 -       return 0;
 485 -}
 486 -
 487 -/*
 488 - * acpi_restore_state
 489 - */
 490 -void acpi_restore_state_mem(void)
 491 -{
 492 -}
 493 -
 494 -/**
 495 - * acpi_reserve_bootmem - do _very_ early ACPI initialisation
 496 - *
 497 - * We allocate a page in low memory for the wakeup
 498 - * routine for when we come back from a sleep state. The
 499 - * runtime allocator allows specification of <16M pages, but not
 500 - * <1M pages.
 501 - */
 502 -void __init acpi_reserve_bootmem(void)
 503 -{
 504 -#ifndef CONFIG_ACPI_PV_SLEEP
 505 -       acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
 506 -       if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
 507 -               printk(KERN_CRIT
 508 -                      "ACPI: Wakeup code way too big, will crash on attempt"
 509 -                      " to suspend\n");
 510 -#endif
 511 -}
 512 -
 513 -#ifndef CONFIG_ACPI_PV_SLEEP
 514 -static int __init acpi_sleep_setup(char *str)
 515 -{
 516 -       while ((str != NULL) && (*str != '\0')) {
 517 -               if (strncmp(str, "s3_bios", 7) == 0)
 518 -                       acpi_realmode_flags |= 1;
 519 -               if (strncmp(str, "s3_mode", 7) == 0)
 520 -                       acpi_realmode_flags |= 2;
 521 -               if (strncmp(str, "s3_beep", 7) == 0)
 522 -                       acpi_realmode_flags |= 4;
 523 -               str = strchr(str, ',');
 524 -               if (str != NULL)
 525 -                       str += strspn(str, ", \t");
 526 -       }
 527 -
 528 -       return 1;
 529 -}
 530 -
 531 -__setup("acpi_sleep=", acpi_sleep_setup);
 532 -#endif                         /* CONFIG_ACPI_PV_SLEEP */
 533 -
 534 --- /dev/null
 535 +++ b/arch/x86/kernel/acpi/sleep-xen.c
 536 @@ -0,0 +1,95 @@
 537 +/*
 538 + * sleep.c - x86-specific ACPI sleep support.
 539 + *
 540 + *  Copyright (C) 2001-2003 Patrick Mochel
 541 + *  Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
 542 + */
 543 +
 544 +#include <linux/acpi.h>
 545 +#include <linux/bootmem.h>
 546 +#include <linux/dmi.h>
 547 +#include <linux/cpumask.h>
 548 +
 549 +#include <asm/smp.h>
 550 +
 551 +#ifndef CONFIG_ACPI_PV_SLEEP
 552 +/* address in low memory of the wakeup routine. */
 553 +unsigned long acpi_wakeup_address = 0;
 554 +unsigned long acpi_realmode_flags;
 555 +extern char wakeup_start, wakeup_end;
 556 +
 557 +extern unsigned long acpi_copy_wakeup_routine(unsigned long);
 558 +#endif
 559 +
 560 +/**
 561 + * acpi_save_state_mem - save kernel state
 562 + *
 563 + * Create an identity mapped page table and copy the wakeup routine to
 564 + * low memory.
 565 + */
 566 +int acpi_save_state_mem(void)
 567 +{
 568 +#ifndef CONFIG_ACPI_PV_SLEEP
 569 +       if (!acpi_wakeup_address) {
 570 +               printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
 571 +               return -ENOMEM;
 572 +       }
 573 +       memcpy((void *)acpi_wakeup_address, &wakeup_start,
 574 +              &wakeup_end - &wakeup_start);
 575 +       acpi_copy_wakeup_routine(acpi_wakeup_address);
 576 +#endif
 577 +
 578 +       return 0;
 579 +}
 580 +
 581 +/*
 582 + * acpi_restore_state_mem - undo effects of acpi_save_state_mem
 583 + */
 584 +void acpi_restore_state_mem(void)
 585 +{
 586 +}
 587 +
 588 +
 589 +/**
 590 + * acpi_reserve_bootmem - do _very_ early ACPI initialisation
 591 + *
 592 + * We allocate two pages from the first 1MB of memory for the wakeup
 593 + * routine for when we come back from a sleep state. The
 594 + * runtime allocator allows specification of <16MB pages, but not
 595 + * <1MB pages.
 596 + */
 597 +void __init acpi_reserve_bootmem(void)
 598 +{
 599 +#ifndef CONFIG_ACPI_PV_SLEEP
 600 +       if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
 601 +               printk(KERN_ERR
 602 +                      "ACPI: Wakeup code way too big, S3 disabled.\n");
 603 +               return;
 604 +       }
 605 +
 606 +       acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
 607 +       if (!acpi_wakeup_address)
 608 +               printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
 609 +#endif
 610 +}
 611 +
 612 +
 613 +#ifndef CONFIG_ACPI_PV_SLEEP
 614 +static int __init acpi_sleep_setup(char *str)
 615 +{
 616 +       while ((str != NULL) && (*str != '\0')) {
 617 +               if (strncmp(str, "s3_bios", 7) == 0)
 618 +                       acpi_realmode_flags |= 1;
 619 +               if (strncmp(str, "s3_mode", 7) == 0)
 620 +                       acpi_realmode_flags |= 2;
 621 +               if (strncmp(str, "s3_beep", 7) == 0)
 622 +                       acpi_realmode_flags |= 4;
 623 +               str = strchr(str, ',');
 624 +               if (str != NULL)
 625 +                       str += strspn(str, ", \t");
 626 +       }
 627 +       return 1;
 628 +}
 629 +
 630 +__setup("acpi_sleep=", acpi_sleep_setup);
 631 +#endif /* CONFIG_ACPI_PV_SLEEP */
 632 --- a/arch/x86/kernel/apic_32-xen.c
 633 +++ b/arch/x86/kernel/apic_32-xen.c
 634 @@ -86,7 +86,7 @@ int setup_profiling_timer(unsigned int m
 635   * This initializes the IO-APIC and APIC hardware if this is
 636   * a UP kernel.
 637   */
 638 -int __init APIC_init_uniprocessor (void)
 639 +int __init APIC_init_uniprocessor(void)
 640  {
 641  #ifdef CONFIG_X86_IO_APIC
 642         if (smp_found_config)
 643 --- a/arch/x86/kernel/apic_64-xen.c
 644 +++ b/arch/x86/kernel/apic_64-xen.c
 645 @@ -34,34 +34,17 @@
 646  #include <asm/hpet.h>
 647  #include <asm/idle.h>
 648
 649 -int apic_verbosity;
 650 +int disable_apic;
 651
 652  /*
 653 - * 'what should we do if we get a hw irq event on an illegal vector'.
 654 - * each architecture has to answer this themselves.
 655 + * Debug level, exported for io_apic.c
 656   */
 657 -void ack_bad_irq(unsigned int irq)
 658 -{
 659 -       printk("unexpected IRQ trap at irq %02x\n", irq);
 660 -       /*
 661 -        * Currently unexpected vectors happen only on SMP and APIC.
 662 -        * We _must_ ack these because every local APIC has only N
 663 -        * irq slots per priority level, and a 'hanging, unacked' IRQ
 664 -        * holds up an irq slot - in excessive cases (when multiple
 665 -        * unexpected vectors occur) that might lock up the APIC
 666 -        * completely.
 667 -        * But don't ack when the APIC is disabled. -AK
 668 -        */
 669 -       if (!disable_apic)
 670 -               ack_APIC_irq();
 671 -}
 672 -
 673 -int setup_profiling_timer(unsigned int multiplier)
 674 -{
 675 -       return -EINVAL;
 676 -}
 677 +int apic_verbosity;
 678
 679 -void smp_local_timer_interrupt(void)
 680 +/*
 681 + * The guts of the apic timer interrupt
 682 + */
 683 +static void local_apic_timer_interrupt(void)
 684  {
 685  #ifndef CONFIG_XEN
 686         int cpu = smp_processor_id();
 687 @@ -121,11 +104,34 @@ void smp_apic_timer_interrupt(struct pt_
 688          */
 689         exit_idle();
 690         irq_enter();
 691 -       smp_local_timer_interrupt();
 692 +       local_apic_timer_interrupt();
 693         irq_exit();
 694         set_irq_regs(old_regs);
 695  }
 696
 697 +int setup_profiling_timer(unsigned int multiplier)
 698 +{
 699 +       return -EINVAL;
 700 +}
 701 +
 702 +/*
 703 + * This initializes the IO-APIC and APIC hardware if this is
 704 + * a UP kernel.
 705 + */
 706 +int __init APIC_init_uniprocessor(void)
 707 +{
 708 +#ifdef CONFIG_X86_IO_APIC
 709 +       if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
 710 +               setup_IO_APIC();
 711 +#endif
 712 +
 713 +       return 1;
 714 +}
 715 +
 716 +/*
 717 + * Local APIC interrupts
 718 + */
 719 +
 720  /*
 721   * This interrupt should _never_ happen with our APIC/SMP architecture
 722   */
 723 @@ -150,7 +156,6 @@ asmlinkage void smp_spurious_interrupt(v
 724  /*
 725   * This interrupt should never happen with our APIC/SMP architecture
 726   */
 727 -
 728  asmlinkage void smp_error_interrupt(void)
 729  {
 730         unsigned int v, v1;
 731 @@ -178,19 +183,3 @@ asmlinkage void smp_error_interrupt(void
 732                 smp_processor_id(), v , v1);
 733         irq_exit();
 734  }
 735 -
 736 -int disable_apic;
 737 -
 738 -/*
 739 - * This initializes the IO-APIC and APIC hardware if this is
 740 - * a UP kernel.
 741 - */
 742 -int __init APIC_init_uniprocessor (void)
 743 -{
 744 -#ifdef CONFIG_X86_IO_APIC
 745 -       if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
 746 -               setup_IO_APIC();
 747 -#endif
 748 -
 749 -       return 1;
 750 -}
 751 --- a/arch/x86/kernel/asm-offsets_32.c
 752 +++ b/arch/x86/kernel/asm-offsets_32.c
 753 @@ -23,8 +23,10 @@
 754  #include <xen/interface/xen.h>
 755  #endif
 756
 757 +#ifdef CONFIG_LGUEST_GUEST
 758  #include <linux/lguest.h>
 759  #include "../../../drivers/lguest/lg.h"
 760 +#endif
 761
 762  /* workaround for a warning with -Wmissing-prototypes */
 763  void foo(void);
 764 --- a/arch/x86/kernel/cpu/common-xen.c
 765 +++ b/arch/x86/kernel/cpu/common-xen.c
 766 @@ -27,45 +27,50 @@
 767  #include "cpu.h"
 768
 769  DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
 770 -       [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
 771 -       [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
 772 -       [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
 773 -       [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
 774 +       [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
 775 +       [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
 776 +       [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
 777 +       [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
 778  #ifndef CONFIG_XEN
 779         /*
 780          * Segments used for calling PnP BIOS have byte granularity.
 781          * They code segments and data segments have fixed 64k limits,
 782          * the transfer segment sizes are set at run time.
 783          */
 784 -       [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
 785 -       [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
 786 -       [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
 787 -       [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
 788 -       [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
 789 +       /* 32-bit code */
 790 +       [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
 791 +       /* 16-bit code */
 792 +       [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
 793 +       /* 16-bit data */
 794 +       [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
 795 +       /* 16-bit data */
 796 +       [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
 797 +       /* 16-bit data */
 798 +       [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
 799         /*
 800          * The APM segments have byte granularity and their bases
 801          * are set at run time.  All have 64k limits.
 802          */
 803 -       [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
 804 +       /* 32-bit code */
 805 +       [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
 806         /* 16-bit code */
 807 -       [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
 808 -       [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
 809 +       [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
 810 +       /* data */
 811 +       [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
 812
 813 -       [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
 814 +       [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
 815  #endif
 816 -       [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
 817 +       [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
 818  } };
 819  EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 820
 821 +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
 822 +
 823  static int cachesize_override __cpuinitdata = -1;
 824 -static int disable_x86_fxsr __cpuinitdata;
 825  static int disable_x86_serial_nr __cpuinitdata = 1;
 826 -static int disable_x86_sep __cpuinitdata;
 827
 828  struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
 829
 830 -extern int disable_pse;
 831 -
 832  static void __cpuinit default_init(struct cpuinfo_x86 * c)
 833  {
 834         /* Not much we can do here... */
 835 @@ -214,16 +219,8 @@ static void __cpuinit get_cpu_vendor(str
 836
 837  static int __init x86_fxsr_setup(char * s)
 838  {
 839 -       /* Tell all the other CPUs to not use it... */
 840 -       disable_x86_fxsr = 1;
 841 -
 842 -       /*
 843 -        * ... and clear the bits early in the boot_cpu_data
 844 -        * so that the bootup process doesn't try to do this
 845 -        * either.
 846 -        */
 847 -       clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
 848 -       clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
 849 +       setup_clear_cpu_cap(X86_FEATURE_FXSR);
 850 +       setup_clear_cpu_cap(X86_FEATURE_XMM);
 851         return 1;
 852  }
 853  __setup("nofxsr", x86_fxsr_setup);
 854 @@ -231,7 +228,7 @@ __setup("nofxsr", x86_fxsr_setup);
 855
 856  static int __init x86_sep_setup(char * s)
 857  {
 858 -       disable_x86_sep = 1;
 859 +       setup_clear_cpu_cap(X86_FEATURE_SEP);
 860         return 1;
 861  }
 862  __setup("nosep", x86_sep_setup);
 863 @@ -268,10 +265,10 @@ static int __cpuinit have_cpuid_p(void)
 864  void __init cpu_detect(struct cpuinfo_x86 *c)
 865  {
 866         /* Get vendor name */
 867 -       cpuid(0x00000000, &c->cpuid_level,
 868 -             (int *)&c->x86_vendor_id[0],
 869 -             (int *)&c->x86_vendor_id[8],
 870 -             (int *)&c->x86_vendor_id[4]);
 871 +       cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
 872 +             (unsigned int *)&c->x86_vendor_id[0],
 873 +             (unsigned int *)&c->x86_vendor_id[8],
 874 +             (unsigned int *)&c->x86_vendor_id[4]);
 875
 876         c->x86 = 4;
 877         if (c->cpuid_level >= 0x00000001) {
 878 @@ -284,9 +281,38 @@ void __init cpu_detect(struct cpuinfo_x8
 879                 if (c->x86 >= 0x6)
 880                         c->x86_model += ((tfms >> 16) & 0xF) << 4;
 881                 c->x86_mask = tfms & 15;
 882 -               if (cap0 & (1<<19))
 883 +               if (cap0 & (1<<19)) {
 884                         c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
 885 +                       c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
 886 +               }
 887 +       }
 888 +}
 889 +static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
 890 +{
 891 +       u32 tfms, xlvl;
 892 +       unsigned int ebx;
 893 +
 894 +       memset(&c->x86_capability, 0, sizeof c->x86_capability);
 895 +       if (have_cpuid_p()) {
 896 +               /* Intel-defined flags: level 0x00000001 */
 897 +               if (c->cpuid_level >= 0x00000001) {
 898 +                       u32 capability, excap;
 899 +                       cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
 900 +                       c->x86_capability[0] = capability;
 901 +                       c->x86_capability[4] = excap;
 902 +               }
 903 +
 904 +               /* AMD-defined flags: level 0x80000001 */
 905 +               xlvl = cpuid_eax(0x80000000);
 906 +               if ((xlvl & 0xffff0000) == 0x80000000) {
 907 +                       if (xlvl >= 0x80000001) {
 908 +                               c->x86_capability[1] = cpuid_edx(0x80000001);
 909 +                               c->x86_capability[6] = cpuid_ecx(0x80000001);
 910 +                       }
 911 +               }
 912 +
 913         }
 914 +
 915  }
 916
 917  /* Do minimum CPU detection early.
 918 @@ -300,6 +326,7 @@ static void __init early_cpu_detect(void
 919         struct cpuinfo_x86 *c = &boot_cpu_data;
 920
 921         c->x86_cache_alignment = 32;
 922 +       c->x86_clflush_size = 32;
 923
 924         if (!have_cpuid_p())
 925                 return;
 926 @@ -307,19 +334,30 @@ static void __init early_cpu_detect(void
 927         cpu_detect(c);
 928
 929         get_cpu_vendor(c, 1);
 930 +
 931 +       switch (c->x86_vendor) {
 932 +       case X86_VENDOR_AMD:
 933 +               early_init_amd(c);
 934 +               break;
 935 +       case X86_VENDOR_INTEL:
 936 +               early_init_intel(c);
 937 +               break;
 938 +       }
 939 +
 940 +       early_get_cap(c);
 941  }
 942
 943  static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 944  {
 945         u32 tfms, xlvl;
 946 -       int ebx;
 947 +       unsigned int ebx;
 948
 949         if (have_cpuid_p()) {
 950                 /* Get vendor name */
 951 -               cpuid(0x00000000, &c->cpuid_level,
 952 -                     (int *)&c->x86_vendor_id[0],
 953 -                     (int *)&c->x86_vendor_id[8],
 954 -                     (int *)&c->x86_vendor_id[4]);
 955 +               cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
 956 +                     (unsigned int *)&c->x86_vendor_id[0],
 957 +                     (unsigned int *)&c->x86_vendor_id[8],
 958 +                     (unsigned int *)&c->x86_vendor_id[4]);
 959
 960                 get_cpu_vendor(c, 0);
 961                 /* Initialize the standard set of capabilities */
 962 @@ -364,8 +402,6 @@ static void __cpuinit generic_identify(s
 963                 init_scattered_cpuid_features(c);
 964         }
 965
 966 -       early_intel_workaround(c);
 967 -
 968  #ifdef CONFIG_X86_HT
 969         c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
 970  #endif
 971 @@ -399,7 +435,7 @@ __setup("serialnumber", x86_serial_nr_se
 972  /*
 973   * This does the hard work of actually picking apart the CPU stuff...
 974   */
 975 -static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 976 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 977  {
 978         int i;
 979
 980 @@ -425,20 +461,9 @@ static void __cpuinit identify_cpu(struc
 981
 982         generic_identify(c);
 983
 984 -       printk(KERN_DEBUG "CPU: After generic identify, caps:");
 985 -       for (i = 0; i < NCAPINTS; i++)
 986 -               printk(" %08lx", c->x86_capability[i]);
 987 -       printk("\n");
 988 -
 989 -       if (this_cpu->c_identify) {
 990 +       if (this_cpu->c_identify)
 991                 this_cpu->c_identify(c);
 992
 993 -               printk(KERN_DEBUG "CPU: After vendor identify, caps:");
 994 -               for (i = 0; i < NCAPINTS; i++)
 995 -                       printk(" %08lx", c->x86_capability[i]);
 996 -               printk("\n");
 997 -       }
 998 -
 999         /*
1000          * Vendor-specific initialization.  In this section we
1001          * canonicalize the feature flags, meaning if there are
1002 @@ -460,23 +485,6 @@ static void __cpuinit identify_cpu(struc
1003          * we do "generic changes."
1004          */
1005
1006 -       /* TSC disabled? */
1007 -       if ( tsc_disable )
1008 -               clear_bit(X86_FEATURE_TSC, c->x86_capability);
1009 -
1010 -       /* FXSR disabled? */
1011 -       if (disable_x86_fxsr) {
1012 -               clear_bit(X86_FEATURE_FXSR, c->x86_capability);
1013 -               clear_bit(X86_FEATURE_XMM, c->x86_capability);
1014 -       }
1015 -
1016 -       /* SEP disabled? */
1017 -       if (disable_x86_sep)
1018 -               clear_bit(X86_FEATURE_SEP, c->x86_capability);
1019 -
1020 -       if (disable_pse)
1021 -               clear_bit(X86_FEATURE_PSE, c->x86_capability);
1022 -
1023         /* If the model name is still unset, do table lookup. */
1024         if ( !c->x86_model_id[0] ) {
1025                 char *p;
1026 @@ -489,13 +497,6 @@ static void __cpuinit identify_cpu(struc
1027                                 c->x86, c->x86_model);
1028         }
1029
1030 -       /* Now the feature flags better reflect actual CPU features! */
1031 -
1032 -       printk(KERN_DEBUG "CPU: After all inits, caps:");
1033 -       for (i = 0; i < NCAPINTS; i++)
1034 -               printk(" %08lx", c->x86_capability[i]);
1035 -       printk("\n");
1036 -
1037         /*
1038          * On SMP, boot_cpu_data holds the common feature set between
1039          * all CPUs; so make sure that we indicate which features are
1040 @@ -508,8 +509,14 @@ static void __cpuinit identify_cpu(struc
1041                         boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
1042         }
1043
1044 +       /* Clear all flags overriden by options */
1045 +       for (i = 0; i < NCAPINTS; i++)
1046 +               c->x86_capability[i] &= ~cleared_cpu_caps[i];
1047 +
1048         /* Init Machine Check Exception if available. */
1049         mcheck_init(c);
1050 +
1051 +       select_idle_routine(c);
1052  }
1053
1054  void __init identify_boot_cpu(void)
1055 @@ -517,7 +524,6 @@ void __init identify_boot_cpu(void)
1056         identify_cpu(&boot_cpu_data);
1057         sysenter_setup();
1058         enable_sep_cpu();
1059 -       mtrr_bp_init();
1060  }
1061
1062  void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
1063 @@ -574,6 +580,13 @@ void __cpuinit detect_ht(struct cpuinfo_
1064  }
1065  #endif
1066
1067 +static __init int setup_noclflush(char *arg)
1068 +{
1069 +       setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
1070 +       return 1;
1071 +}
1072 +__setup("noclflush", setup_noclflush);
1073 +
1074  void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1075  {
1076         char *vendor = NULL;
1077 @@ -597,6 +610,17 @@ void __cpuinit print_cpu_info(struct cpu
1078                 printk("\n");
1079  }
1080
1081 +static __init int setup_disablecpuid(char *arg)
1082 +{
1083 +       int bit;
1084 +       if (get_option(&arg, &bit) && bit < NCAPINTS*32)
1085 +               setup_clear_cpu_cap(bit);
1086 +       else
1087 +               return 0;
1088 +       return 1;
1089 +}
1090 +__setup("clearcpuid=", setup_disablecpuid);
1091 +
1092  cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
1093
1094  /* This is hacky. :)
1095 @@ -606,16 +630,6 @@ cpumask_t cpu_initialized __cpuinitdata
1096   * They will insert themselves into the cpu_devs structure.
1097   * Then, when cpu_init() is called, we can just iterate over that array.
1098   */
1099 -
1100 -extern int intel_cpu_init(void);
1101 -extern int cyrix_init_cpu(void);
1102 -extern int nsc_init_cpu(void);
1103 -extern int amd_init_cpu(void);
1104 -extern int centaur_init_cpu(void);
1105 -extern int transmeta_init_cpu(void);
1106 -extern int nexgen_init_cpu(void);
1107 -extern int umc_init_cpu(void);
1108 -
1109  void __init early_cpu_init(void)
1110  {
1111         intel_cpu_init();
1112 @@ -627,21 +641,13 @@ void __init early_cpu_init(void)
1113         nexgen_init_cpu();
1114         umc_init_cpu();
1115         early_cpu_detect();
1116 -
1117 -#ifdef CONFIG_DEBUG_PAGEALLOC
1118 -       /* pse is not compatible with on-the-fly unmapping,
1119 -        * disable it even if the cpus claim to support it.
1120 -        */
1121 -       clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
1122 -       disable_pse = 1;
1123 -#endif
1124  }
1125
1126  /* Make sure %fs is initialized properly in idle threads */
1127 -struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
1128 +struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
1129  {
1130         memset(regs, 0, sizeof(struct pt_regs));
1131 -       regs->xfs = __KERNEL_PERCPU;
1132 +       regs->fs = __KERNEL_PERCPU;
1133         return regs;
1134  }
1135
1136 @@ -649,7 +655,7 @@ struct pt_regs * __devinit idle_regs(str
1137   * it's on the real one. */
1138  void switch_to_new_gdt(void)
1139  {
1140 -       struct Xgt_desc_struct gdt_descr;
1141 +       struct desc_ptr gdt_descr;
1142         unsigned long va, frames[16];
1143         int f;
1144
1145 @@ -692,12 +698,6 @@ void __cpuinit cpu_init(void)
1146
1147         if (cpu_has_vme || cpu_has_de)
1148                 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1149 -       if (tsc_disable && cpu_has_tsc) {
1150 -               printk(KERN_NOTICE "Disabling TSC...\n");
1151 -               /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
1152 -               clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
1153 -               set_in_cr4(X86_CR4_TSD);
1154 -       }
1155
1156         switch_to_new_gdt();
1157
1158 @@ -710,7 +710,7 @@ void __cpuinit cpu_init(void)
1159                 BUG();
1160         enter_lazy_tlb(&init_mm, curr);
1161
1162 -       load_esp0(t, thread);
1163 +       load_sp0(t, thread);
1164
1165         load_LDT(&init_mm.context);
1166
1167 --- a/arch/x86/kernel/cpu/mtrr/main-xen.c
1168 +++ b/arch/x86/kernel/cpu/mtrr/main-xen.c
1169 @@ -33,7 +33,7 @@ struct mtrr_ops generic_mtrr_ops = {
1170
1171  struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
1172  unsigned int num_var_ranges;
1173 -unsigned int *usage_table;
1174 +unsigned int mtrr_usage_table[MAX_VAR_RANGES];
1175
1176  static void __init set_num_var_ranges(void)
1177  {
1178 @@ -52,17 +52,12 @@ static void __init init_table(void)
1179         int i, max;
1180
1181         max = num_var_ranges;
1182 -       if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
1183 -           == NULL) {
1184 -               printk(KERN_ERR "mtrr: could not allocate\n");
1185 -               return;
1186 -       }
1187         for (i = 0; i < max; i++)
1188 -               usage_table[i] = 0;
1189 +               mtrr_usage_table[i] = 0;
1190  }
1191
1192  int mtrr_add_page(unsigned long base, unsigned long size,
1193 -                 unsigned int type, char increment)
1194 +                 unsigned int type, bool increment)
1195  {
1196         int error;
1197         struct xen_platform_op op;
1198 @@ -81,7 +76,7 @@ int mtrr_add_page(unsigned long base, un
1199         }
1200
1201         if (increment)
1202 -               ++usage_table[op.u.add_memtype.reg];
1203 +               ++mtrr_usage_table[op.u.add_memtype.reg];
1204
1205         mutex_unlock(&mtrr_mutex);
1206
1207 @@ -103,7 +98,7 @@ static int mtrr_check(unsigned long base
1208
1209  int
1210  mtrr_add(unsigned long base, unsigned long size, unsigned int type,
1211 -        char increment)
1212 +        bool increment)
1213  {
1214         if (mtrr_check(base, size))
1215                 return -EINVAL;
1216 @@ -136,11 +131,11 @@ int mtrr_del_page(int reg, unsigned long
1217                         goto out;
1218                 }
1219         }
1220 -       if (usage_table[reg] < 1) {
1221 +       if (mtrr_usage_table[reg] < 1) {
1222                 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
1223                 goto out;
1224         }
1225 -       if (--usage_table[reg] < 1) {
1226 +       if (--mtrr_usage_table[reg] < 1) {
1227                 op.cmd = XENPF_del_memtype;
1228                 op.u.del_memtype.handle = 0;
1229                 op.u.del_memtype.reg    = reg;
1230 --- a/arch/x86/kernel/e820_32-xen.c
1231 +++ b/arch/x86/kernel/e820_32-xen.c
1232 @@ -7,7 +7,6 @@
1233  #include <linux/kexec.h>
1234  #include <linux/module.h>
1235  #include <linux/mm.h>
1236 -#include <linux/efi.h>
1237  #include <linux/pfn.h>
1238  #include <linux/uaccess.h>
1239  #include <linux/suspend.h>
1240 @@ -18,11 +17,6 @@
1241  #include <asm/setup.h>
1242  #include <xen/interface/memory.h>
1243
1244 -#ifdef CONFIG_EFI
1245 -int efi_enabled = 0;
1246 -EXPORT_SYMBOL(efi_enabled);
1247 -#endif
1248 -
1249  struct e820map e820;
1250  struct change_member {
1251         struct e820entry *pbios; /* pointer to original bios entry */
1252 @@ -38,26 +32,6 @@ unsigned long pci_mem_start = 0x10000000
1253  EXPORT_SYMBOL(pci_mem_start);
1254  #endif
1255  extern int user_defined_memmap;
1256 -struct resource data_resource = {
1257 -       .name   = "Kernel data",
1258 -       .start  = 0,
1259 -       .end    = 0,
1260 -       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
1261 -};
1262 -
1263 -struct resource code_resource = {
1264 -       .name   = "Kernel code",
1265 -       .start  = 0,
1266 -       .end    = 0,
1267 -       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
1268 -};
1269 -
1270 -struct resource bss_resource = {
1271 -       .name   = "Kernel bss",
1272 -       .start  = 0,
1273 -       .end    = 0,
1274 -       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
1275 -};
1276
1277  static struct resource system_rom_resource = {
1278         .name   = "System ROM",
1279 @@ -112,60 +86,6 @@ static struct resource video_rom_resourc
1280         .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
1281  };
1282
1283 -static struct resource video_ram_resource = {
1284 -       .name   = "Video RAM area",
1285 -       .start  = 0xa0000,
1286 -       .end    = 0xbffff,
1287 -       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
1288 -};
1289 -
1290 -static struct resource standard_io_resources[] = { {
1291 -       .name   = "dma1",
1292 -       .start  = 0x0000,
1293 -       .end    = 0x001f,
1294 -       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
1295 -}, {
1296 -       .name   = "pic1",
1297 -       .start  = 0x0020,
1298 -       .end    = 0x0021,
1299 -       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
1300 -}, {
1301 -       .name   = "timer0",
1302 -       .start  = 0x0040,
1303 -       .end    = 0x0043,
1304 -       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
1305 -}, {
1306 -       .name   = "timer1",
1307 -       .start  = 0x0050,
1308 -       .end    = 0x0053,
1309 -       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
1310 -}, {
1311 -       .name   = "keyboard",
1312 -       .start  = 0x0060,
1313 -       .end    = 0x006f,
1314 -       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
1315 -}, {
1316 -       .name   = "dma page reg",
1317 -       .start  = 0x0080,
1318 -       .end    = 0x008f,
1319 -       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
1320 -}, {
1321 -       .name   = "pic2",
1322 -       .start  = 0x00a0,
1323 -       .end    = 0x00a1,
1324 -       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
1325 -}, {
1326 -       .name   = "dma2",
1327 -       .start  = 0x00c0,
1328 -       .end    = 0x00df,
1329 -       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
1330 -}, {
1331 -       .name   = "fpu",
1332 -       .start  = 0x00f0,
1333 -       .end    = 0x00ff,
1334 -       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
1335 -} };
1336 -
1337  #define ROMSIGNATURE 0xaa55
1338
1339  static int __init romsignature(const unsigned char *rom)
1340 @@ -272,10 +192,9 @@ static struct e820map machine_e820;
1341   * Request address space for all standard RAM and ROM resources
1342   * and also for regions reported as reserved by the e820.
1343   */
1344 -static void __init
1345 -legacy_init_iomem_resources(struct resource *code_resource,
1346 -                           struct resource *data_resource,
1347 -                           struct resource *bss_resource)
1348 +void __init init_iomem_resources(struct resource *code_resource,
1349 +               struct resource *data_resource,
1350 +               struct resource *bss_resource)
1351  {
1352         int i;
1353
1354 @@ -324,39 +243,6 @@ legacy_init_iomem_resources(struct resou
1355
1356  #undef e820
1357
1358 -/*
1359 - * Request address space for all standard resources
1360 - *
1361 - * This is called just before pcibios_init(), which is also a
1362 - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
1363 - */
1364 -static int __init request_standard_resources(void)
1365 -{
1366 -       int i;
1367 -
1368 -       /* Nothing to do if not running in dom0. */
1369 -       if (!is_initial_xendomain())
1370 -               return 0;
1371 -
1372 -       printk("Setting up standard PCI resources\n");
1373 -       if (efi_enabled)
1374 -               efi_initialize_iomem_resources(&code_resource,
1375 -                               &data_resource, &bss_resource);
1376 -       else
1377 -               legacy_init_iomem_resources(&code_resource,
1378 -                               &data_resource, &bss_resource);
1379 -
1380 -       /* EFI systems may still have VGA */
1381 -       request_resource(&iomem_resource, &video_ram_resource);
1382 -
1383 -       /* request I/O space for devices used on all i[345]86 PCs */
1384 -       for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
1385 -               request_resource(&ioport_resource, &standard_io_resources[i]);
1386 -       return 0;
1387 -}
1388 -
1389 -subsys_initcall(request_standard_resources);
1390 -
1391  #if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
1392  /**
1393   * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
1394 @@ -393,19 +279,17 @@ void __init add_memory_region(unsigned l
1395  {
1396         int x;
1397
1398 -       if (!efi_enabled) {
1399 -                       x = e820.nr_map;
1400 -
1401 -               if (x == E820MAX) {
1402 -                   printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1403 -                   return;
1404 -               }
1405 +       x = e820.nr_map;
1406
1407 -               e820.map[x].addr = start;
1408 -               e820.map[x].size = size;
1409 -               e820.map[x].type = type;
1410 -               e820.nr_map++;
1411 +       if (x == E820MAX) {
1412 +               printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1413 +               return;
1414         }
1415 +
1416 +       e820.map[x].addr = start;
1417 +       e820.map[x].size = size;
1418 +       e820.map[x].type = type;
1419 +       e820.nr_map++;
1420  } /* add_memory_region */
1421
1422  /*
1423 @@ -642,29 +526,6 @@ int __init copy_e820_map(struct e820entr
1424  }
1425
1426  /*
1427 - * Callback for efi_memory_walk.
1428 - */
1429 -static int __init
1430 -efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
1431 -{
1432 -       unsigned long *max_pfn = arg, pfn;
1433 -
1434 -       if (start < end) {
1435 -               pfn = PFN_UP(end -1);
1436 -               if (pfn > *max_pfn)
1437 -                       *max_pfn = pfn;
1438 -       }
1439 -       return 0;
1440 -}
1441 -
1442 -static int __init
1443 -efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
1444 -{
1445 -       memory_present(0, PFN_UP(start), PFN_DOWN(end));
1446 -       return 0;
1447 -}
1448 -
1449 -/*
1450   * Find the highest page frame number we have available
1451   */
1452  void __init find_max_pfn(void)
1453 @@ -672,11 +533,6 @@ void __init find_max_pfn(void)
1454         int i;
1455
1456         max_pfn = 0;
1457 -       if (efi_enabled) {
1458 -               efi_memmap_walk(efi_find_max_pfn, &max_pfn);
1459 -               efi_memmap_walk(efi_memory_present_wrapper, NULL);
1460 -               return;
1461 -       }
1462
1463         for (i = 0; i < e820.nr_map; i++) {
1464                 unsigned long start, end;
1465 @@ -694,34 +550,12 @@ void __init find_max_pfn(void)
1466  }
1467
1468  /*
1469 - * Free all available memory for boot time allocation.  Used
1470 - * as a callback function by efi_memory_walk()
1471 - */
1472 -
1473 -static int __init
1474 -free_available_memory(unsigned long start, unsigned long end, void *arg)
1475 -{
1476 -       /* check max_low_pfn */
1477 -       if (start >= (max_low_pfn << PAGE_SHIFT))
1478 -               return 0;
1479 -       if (end >= (max_low_pfn << PAGE_SHIFT))
1480 -               end = max_low_pfn << PAGE_SHIFT;
1481 -       if (start < end)
1482 -               free_bootmem(start, end - start);
1483 -
1484 -       return 0;
1485 -}
1486 -/*
1487   * Register fully available low RAM pages with the bootmem allocator.
1488   */
1489  void __init register_bootmem_low_pages(unsigned long max_low_pfn)
1490  {
1491         int i;
1492
1493 -       if (efi_enabled) {
1494 -               efi_memmap_walk(free_available_memory, NULL);
1495 -               return;
1496 -       }
1497         for (i = 0; i < e820.nr_map; i++) {
1498                 unsigned long curr_pfn, last_pfn, size;
1499                 /*
1500 @@ -855,56 +689,12 @@ void __init print_memory_map(char *who)
1501         }
1502  }
1503
1504 -static __init __always_inline void efi_limit_regions(unsigned long long size)
1505 -{
1506 -       unsigned long long current_addr = 0;
1507 -       efi_memory_desc_t *md, *next_md;
1508 -       void *p, *p1;
1509 -       int i, j;
1510 -
1511 -       j = 0;
1512 -       p1 = memmap.map;
1513 -       for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
1514 -               md = p;
1515 -               next_md = p1;
1516 -               current_addr = md->phys_addr +
1517 -                       PFN_PHYS(md->num_pages);
1518 -               if (is_available_memory(md)) {
1519 -                       if (md->phys_addr >= size) continue;
1520 -                       memcpy(next_md, md, memmap.desc_size);
1521 -                       if (current_addr >= size) {
1522 -                               next_md->num_pages -=
1523 -                                       PFN_UP(current_addr-size);
1524 -                       }
1525 -                       p1 += memmap.desc_size;
1526 -                       next_md = p1;
1527 -                       j++;
1528 -               } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
1529 -                          EFI_MEMORY_RUNTIME) {
1530 -                       /* In order to make runtime services
1531 -                        * available we have to include runtime
1532 -                        * memory regions in memory map */
1533 -                       memcpy(next_md, md, memmap.desc_size);
1534 -                       p1 += memmap.desc_size;
1535 -                       next_md = p1;
1536 -                       j++;
1537 -               }
1538 -       }
1539 -       memmap.nr_map = j;
1540 -       memmap.map_end = memmap.map +
1541 -               (memmap.nr_map * memmap.desc_size);
1542 -}
1543 -
1544  void __init limit_regions(unsigned long long size)
1545  {
1546         unsigned long long current_addr = 0;
1547         int i;
1548
1549         print_memory_map("limit_regions start");
1550 -       if (efi_enabled) {
1551 -               efi_limit_regions(size);
1552 -               return;
1553 -       }
1554         for (i = 0; i < e820.nr_map; i++) {
1555                 current_addr = e820.map[i].addr + e820.map[i].size;
1556                 if (current_addr < size)
1557 @@ -1056,3 +846,44 @@ static int __init parse_memmap(char *arg
1558         return 0;
1559  }
1560  early_param("memmap", parse_memmap);
1561 +
1562 +#ifndef CONFIG_XEN
1563 +void __init update_memory_range(u64 start, u64 size, unsigned old_type,
1564 +                               unsigned new_type)
1565 +{
1566 +       int i;
1567 +
1568 +       BUG_ON(old_type == new_type);
1569 +
1570 +       for (i = 0; i < e820.nr_map; i++) {
1571 +               struct e820entry *ei = &e820.map[i];
1572 +               u64 final_start, final_end;
1573 +               if (ei->type != old_type)
1574 +                       continue;
1575 +               /* totally covered? */
1576 +               if (ei->addr >= start && ei->size <= size) {
1577 +                       ei->type = new_type;
1578 +                       continue;
1579 +               }
1580 +               /* partially covered */
1581 +               final_start = max(start, ei->addr);
1582 +               final_end = min(start + size, ei->addr + ei->size);
1583 +               if (final_start >= final_end)
1584 +                       continue;
1585 +               add_memory_region(final_start, final_end - final_start,
1586 +                                        new_type);
1587 +       }
1588 +}
1589 +
1590 +void __init update_e820(void)
1591 +{
1592 +       u8 nr_map;
1593 +
1594 +       nr_map = e820.nr_map;
1595 +       if (sanitize_e820_map(e820.map, &nr_map))
1596 +               return;
1597 +       e820.nr_map = nr_map;
1598 +       printk(KERN_INFO "modified physical RAM map:\n");
1599 +       print_memory_map("modified");
1600 +}
1601 +#endif
1602 --- a/arch/x86/kernel/e820_64-xen.c
1603 +++ b/arch/x86/kernel/e820_64-xen.c
1604 @@ -1,4 +1,4 @@
1605 -/*
1606 +/*
1607   * Handle the memory map.
1608   * The functions here do the job until bootmem takes over.
1609   *
1610 @@ -26,6 +26,7 @@
1611  #include <asm/proto.h>
1612  #include <asm/setup.h>
1613  #include <asm/sections.h>
1614 +#include <asm/kdebug.h>
1615  #include <xen/interface/memory.h>
1616
1617  struct e820map e820 __initdata;
1618 @@ -33,96 +34,103 @@ struct e820map e820 __initdata;
1619  struct e820map machine_e820;
1620  #endif
1621
1622 -/*
1623 +/*
1624   * PFN of last memory page.
1625   */
1626 -unsigned long end_pfn;
1627 -EXPORT_SYMBOL(end_pfn);
1628 +unsigned long end_pfn;
1629
1630 -/*
1631 +/*
1632   * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
1633   * The direct mapping extends to end_pfn_map, so that we can directly access
1634   * apertures, ACPI and other tables without having to play with fixmaps.
1635 - */
1636 -unsigned long end_pfn_map;
1637 + */
1638 +unsigned long end_pfn_map;
1639
1640 -/*
1641 +/*
1642   * Last pfn which the user wants to use.
1643   */
1644  static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
1645
1646 -extern struct resource code_resource, data_resource, bss_resource;
1647 -
1648 -/* Check for some hardcoded bad areas that early boot is not allowed to touch */
1649 -static inline int bad_addr(unsigned long *addrp, unsigned long size)
1650 -{
1651 -       unsigned long addr = *addrp, last = addr + size;
1652 +/*
1653 + * Early reserved memory areas.
1654 + */
1655 +#define MAX_EARLY_RES 20
1656
1657 +struct early_res {
1658 +       unsigned long start, end;
1659 +       char name[16];
1660 +};
1661 +static struct early_res early_res[MAX_EARLY_RES] __initdata = {
1662  #ifndef CONFIG_XEN
1663 -       /* various gunk below that needed for SMP startup */
1664 -       if (addr < 0x8000) {
1665 -               *addrp = PAGE_ALIGN(0x8000);
1666 -               return 1;
1667 -       }
1668 -
1669 -       /* direct mapping tables of the kernel */
1670 -       if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
1671 -               *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
1672 -               return 1;
1673 -       }
1674 -
1675 -       /* initrd */
1676 -#ifdef CONFIG_BLK_DEV_INITRD
1677 -       if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
1678 -               unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
1679 -               unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
1680 -               unsigned long ramdisk_end   = ramdisk_image+ramdisk_size;
1681 -
1682 -               if (last >= ramdisk_image && addr < ramdisk_end) {
1683 -                       *addrp = PAGE_ALIGN(ramdisk_end);
1684 -                       return 1;
1685 -               }
1686 -       }
1687 +       { 0, PAGE_SIZE, "BIOS data page" },                     /* BIOS data page */
1688 +#ifdef CONFIG_SMP
1689 +       { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" },
1690  #endif
1691 -       /* kernel code */
1692 -       if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
1693 -               *addrp = PAGE_ALIGN(__pa_symbol(&_end));
1694 -               return 1;
1695 -       }
1696 +#endif
1697 +       {}
1698 +};
1699
1700 -       if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
1701 -               *addrp = PAGE_ALIGN(ebda_addr + ebda_size);
1702 -               return 1;
1703 +void __init reserve_early(unsigned long start, unsigned long end, char *name)
1704 +{
1705 +       int i;
1706 +       struct early_res *r;
1707 +       for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1708 +               r = &early_res[i];
1709 +               if (end > r->start && start < r->end)
1710 +                       panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
1711 +                             start, end - 1, name?name:"", r->start, r->end - 1, r->name);
1712         }
1713 +       if (i >= MAX_EARLY_RES)
1714 +               panic("Too many early reservations");
1715 +       r = &early_res[i];
1716 +       r->start = start;
1717 +       r->end = end;
1718 +       if (name)
1719 +               strncpy(r->name, name, sizeof(r->name) - 1);
1720 +}
1721
1722 -#ifdef CONFIG_NUMA
1723 -       /* NUMA memory to node map */
1724 -       if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
1725 -               *addrp = nodemap_addr + nodemap_size;
1726 -               return 1;
1727 +void __init early_res_to_bootmem(void)
1728 +{
1729 +       int i;
1730 +       for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1731 +               struct early_res *r = &early_res[i];
1732 +               printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i,
1733 +                       r->start, r->end - 1, r->name);
1734 +               reserve_bootmem_generic(r->start, r->end - r->start);
1735         }
1736 -#endif
1737 -       /* XXX ramdisk image here? */
1738 -#else
1739 -       if (last < (table_end<<PAGE_SHIFT)) {
1740 -               *addrp = table_end << PAGE_SHIFT;
1741 -               return 1;
1742 +}
1743 +
1744 +/* Check for already reserved areas */
1745 +static inline int bad_addr(unsigned long *addrp, unsigned long size)
1746 +{
1747 +       int i;
1748 +       unsigned long addr = *addrp, last;
1749 +       int changed = 0;
1750 +again:
1751 +       last = addr + size;
1752 +       for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1753 +               struct early_res *r = &early_res[i];
1754 +               if (last >= r->start && addr < r->end) {
1755 +                       *addrp = addr = r->end;
1756 +                       changed = 1;
1757 +                       goto again;
1758 +               }
1759         }
1760 -#endif
1761 -       return 0;
1762 -}
1763 +       return changed;
1764 +}
1765
1766  /*
1767   * This function checks if any part of the range <start,end> is mapped
1768   * with type.
1769   */
1770 -int e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
1771 -{
1772 +int
1773 +e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
1774 +{
1775         int i;
1776
1777  #ifndef CONFIG_XEN
1778 -       for (i = 0; i < e820.nr_map; i++) {
1779 -               struct e820entry *ei = &e820.map[i];
1780 +       for (i = 0; i < e820.nr_map; i++) {
1781 +               struct e820entry *ei = &e820.map[i];
1782  #else
1783         if (!is_initial_xendomain())
1784                 return 0;
1785 @@ -130,12 +138,12 @@ int e820_any_mapped(unsigned long start,
1786                 const struct e820entry *ei = &machine_e820.map[i];
1787  #endif
1788
1789 -               if (type && ei->type != type)
1790 +               if (type && ei->type != type)
1791                         continue;
1792                 if (ei->addr >= end || ei->addr + ei->size <= start)
1793 -                       continue;
1794 -               return 1;
1795 -       }
1796 +                       continue;
1797 +               return 1;
1798 +       }
1799         return 0;
1800  }
1801  EXPORT_SYMBOL_GPL(e820_any_mapped);
1802 @@ -146,7 +154,8 @@ EXPORT_SYMBOL_GPL(e820_any_mapped);
1803   * Note: this function only works correct if the e820 table is sorted and
1804   * not-overlapping, which is the case
1805   */
1806 -int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
1807 +int __init e820_all_mapped(unsigned long start, unsigned long end,
1808 +                          unsigned type)
1809  {
1810         int i;
1811
1812 @@ -171,65 +180,77 @@ int __init e820_all_mapped(unsigned long
1813                  */
1814                 if (ei->addr <= start)
1815                         start = ei->addr + ei->size;
1816 -               /* if start is now at or beyond end, we're done, full coverage */
1817 +               /*
1818 +                * if start is now at or beyond end, we're done, full
1819 +                * coverage
1820 +                */
1821                 if (start >= end)
1822 -                       return 1; /* we're done */
1823 +                       return 1;
1824         }
1825         return 0;
1826  }
1827
1828 -/*
1829 - * Find a free area in a specific range.
1830 - */
1831 -unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
1832 -{
1833 -       int i;
1834 -       for (i = 0; i < e820.nr_map; i++) {
1835 -               struct e820entry *ei = &e820.map[i];
1836 -               unsigned long addr = ei->addr, last;
1837 -               if (ei->type != E820_RAM)
1838 -                       continue;
1839 -               if (addr < start)
1840 +/*
1841 + * Find a free area with specified alignment in a specific range.
1842 + */
1843 +unsigned long __init find_e820_area(unsigned long start, unsigned long end,
1844 +                                   unsigned size, unsigned long align)
1845 +{
1846 +       int i;
1847 +       unsigned long mask = ~(align - 1);
1848 +
1849 +       for (i = 0; i < e820.nr_map; i++) {
1850 +               struct e820entry *ei = &e820.map[i];
1851 +               unsigned long addr = ei->addr, last;
1852 +
1853 +               if (ei->type != E820_RAM)
1854 +                       continue;
1855 +               if (addr < start)
1856                         addr = start;
1857 -               if (addr > ei->addr + ei->size)
1858 -                       continue;
1859 +               if (addr > ei->addr + ei->size)
1860 +                       continue;
1861                 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
1862                         ;
1863 -               last = PAGE_ALIGN(addr) + size;
1864 +               addr = (addr + align - 1) & mask;
1865 +               last = addr + size;
1866                 if (last > ei->addr + ei->size)
1867                         continue;
1868 -               if (last > end)
1869 +               if (last > end)
1870                         continue;
1871 -               return addr;
1872 -       }
1873 -       return -1UL;
1874 -}
1875 +               return addr;
1876 +       }
1877 +       return -1UL;
1878 +}
1879
1880  /*
1881   * Find the highest page frame number we have available
1882   */
1883  unsigned long __init e820_end_of_ram(void)
1884  {
1885 -       unsigned long end_pfn = 0;
1886 +       unsigned long end_pfn;
1887 +
1888         end_pfn = find_max_pfn_with_active_regions();
1889 -
1890 -       if (end_pfn > end_pfn_map)
1891 +
1892 +       if (end_pfn > end_pfn_map)
1893                 end_pfn_map = end_pfn;
1894         if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
1895                 end_pfn_map = MAXMEM>>PAGE_SHIFT;
1896         if (end_pfn > end_user_pfn)
1897                 end_pfn = end_user_pfn;
1898 -       if (end_pfn > end_pfn_map)
1899 -               end_pfn = end_pfn_map;
1900 +       if (end_pfn > end_pfn_map)
1901 +               end_pfn = end_pfn_map;
1902
1903 -       printk("end_pfn_map = %lu\n", end_pfn_map);
1904 -       return end_pfn;
1905 +       printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map);
1906 +       return end_pfn;
1907  }
1908
1909  /*
1910   * Mark e820 reserved areas as busy for the resource manager.
1911   */
1912 -void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
1913 +void __init e820_reserve_resources(struct e820entry *e820, int nr_map,
1914 +                                  struct resource *code_resource,
1915 +                                  struct resource *data_resource,
1916 +                                  struct resource *bss_resource)
1917  {
1918         int i;
1919         for (i = 0; i < nr_map; i++) {
1920 @@ -247,14 +268,14 @@ void __init e820_reserve_resources(struc
1921                 request_resource(&iomem_resource, res);
1922                 if (e820[i].type == E820_RAM) {
1923                         /*
1924 -                        *  We don't know which RAM region contains kernel data,
1925 -                        *  so we try it repeatedly and let the resource manager
1926 -                        *  test it.
1927 +                        * We don't know which RAM region contains kernel data,
1928 +                        * so we try it repeatedly and let the resource manager
1929 +                        * test it.
1930                          */
1931  #ifndef CONFIG_XEN
1932 -                       request_resource(res, &code_resource);
1933 -                       request_resource(res, &data_resource);
1934 -                       request_resource(res, &bss_resource);
1935 +                       request_resource(res, code_resource);
1936 +                       request_resource(res, data_resource);
1937 +                       request_resource(res, bss_resource);
1938  #endif
1939  #ifdef CONFIG_KEXEC
1940                         if (crashk_res.start != crashk_res.end)
1941 @@ -357,9 +378,9 @@ e820_register_active_regions(int nid, un
1942                         add_active_range(nid, ei_startpfn, ei_endpfn);
1943  }
1944
1945 -/*
1946 +/*
1947   * Add a memory region to the kernel e820 map.
1948 - */
1949 + */
1950  void __init add_memory_region(unsigned long start, unsigned long size, int type)
1951  {
1952         int x = e820.nr_map;
1953 @@ -384,9 +405,7 @@ unsigned long __init e820_hole_size(unsi
1954  {
1955         unsigned long start_pfn = start >> PAGE_SHIFT;
1956         unsigned long end_pfn = end >> PAGE_SHIFT;
1957 -       unsigned long ei_startpfn;
1958 -       unsigned long ei_endpfn;
1959 -       unsigned long ram = 0;
1960 +       unsigned long ei_startpfn, ei_endpfn, ram = 0;
1961         int i;
1962
1963         for (i = 0; i < e820.nr_map; i++) {
1964 @@ -398,28 +417,31 @@ unsigned long __init e820_hole_size(unsi
1965         return end - start - (ram << PAGE_SHIFT);
1966  }
1967
1968 -void __init e820_print_map(char *who)
1969 +static void __init e820_print_map(char *who)
1970  {
1971         int i;
1972
1973         for (i = 0; i < e820.nr_map; i++) {
1974                 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
1975 -                       (unsigned long long) e820.map[i].addr,
1976 -                       (unsigned long long) (e820.map[i].addr + e820.map[i].size));
1977 +                      (unsigned long long) e820.map[i].addr,
1978 +                      (unsigned long long)
1979 +                      (e820.map[i].addr + e820.map[i].size));
1980                 switch (e820.map[i].type) {
1981 -               case E820_RAM:  printk("(usable)\n");
1982 -                               break;
1983 +               case E820_RAM:
1984 +                       printk(KERN_CONT "(usable)\n");
1985 +                       break;
1986                 case E820_RESERVED:
1987 -                               printk("(reserved)\n");
1988 -                               break;
1989 +                       printk(KERN_CONT "(reserved)\n");
1990 +                       break;
1991                 case E820_ACPI:
1992 -                               printk("(ACPI data)\n");
1993 -                               break;
1994 +                       printk(KERN_CONT "(ACPI data)\n");
1995 +                       break;
1996                 case E820_NVS:
1997 -                               printk("(ACPI NVS)\n");
1998 -                               break;
1999 -               default:        printk("type %u\n", e820.map[i].type);
2000 -                               break;
2001 +                       printk(KERN_CONT "(ACPI NVS)\n");
2002 +                       break;
2003 +               default:
2004 +                       printk(KERN_CONT "type %u\n", e820.map[i].type);
2005 +                       break;
2006                 }
2007         }
2008  }
2009 @@ -427,11 +449,11 @@ void __init e820_print_map(char *who)
2010  /*
2011   * Sanitize the BIOS e820 map.
2012   *
2013 - * Some e820 responses include overlapping entries.  The following
2014 + * Some e820 responses include overlapping entries. The following
2015   * replaces the original e820 map with a new one, removing overlaps.
2016   *
2017   */
2018 -static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
2019 +static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
2020  {
2021         struct change_member {
2022                 struct e820entry *pbios; /* pointer to original bios entry */
2023 @@ -451,7 +473,8 @@ static int __init sanitize_e820_map(stru
2024         int i;
2025
2026         /*
2027 -               Visually we're performing the following (1,2,3,4 = memory types)...
2028 +               Visually we're performing the following
2029 +               (1,2,3,4 = memory types)...
2030
2031                 Sample memory map (w/overlaps):
2032                    ____22__________________
2033 @@ -493,22 +516,23 @@ static int __init sanitize_e820_map(stru
2034         old_nr = *pnr_map;
2035
2036         /* bail out if we find any unreasonable addresses in bios map */
2037 -       for (i=0; i<old_nr; i++)
2038 +       for (i = 0; i < old_nr; i++)
2039                 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
2040                         return -1;
2041
2042         /* create pointers for initial change-point information (for sorting) */
2043 -       for (i=0; i < 2*old_nr; i++)
2044 +       for (i = 0; i < 2 * old_nr; i++)
2045                 change_point[i] = &change_point_list[i];
2046
2047         /* record all known change-points (starting and ending addresses),
2048            omitting those that are for empty memory regions */
2049         chgidx = 0;
2050 -       for (i=0; i < old_nr; i++)      {
2051 +       for (i = 0; i < old_nr; i++)    {
2052                 if (biosmap[i].size != 0) {
2053                         change_point[chgidx]->addr = biosmap[i].addr;
2054                         change_point[chgidx++]->pbios = &biosmap[i];
2055 -                       change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
2056 +                       change_point[chgidx]->addr = biosmap[i].addr +
2057 +                               biosmap[i].size;
2058                         change_point[chgidx++]->pbios = &biosmap[i];
2059                 }
2060         }
2061 @@ -518,75 +542,106 @@ static int __init sanitize_e820_map(stru
2062         still_changing = 1;
2063         while (still_changing)  {
2064                 still_changing = 0;
2065 -               for (i=1; i < chg_nr; i++)  {
2066 -                       /* if <current_addr> > <last_addr>, swap */
2067 -                       /* or, if current=<start_addr> & last=<end_addr>, swap */
2068 -                       if ((change_point[i]->addr < change_point[i-1]->addr) ||
2069 -                               ((change_point[i]->addr == change_point[i-1]->addr) &&
2070 -                                (change_point[i]->addr == change_point[i]->pbios->addr) &&
2071 -                                (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
2072 -                          )
2073 -                       {
2074 +               for (i = 1; i < chg_nr; i++)  {
2075 +                       unsigned long long curaddr, lastaddr;
2076 +                       unsigned long long curpbaddr, lastpbaddr;
2077 +
2078 +                       curaddr = change_point[i]->addr;
2079 +                       lastaddr = change_point[i - 1]->addr;
2080 +                       curpbaddr = change_point[i]->pbios->addr;
2081 +                       lastpbaddr = change_point[i - 1]->pbios->addr;
2082 +
2083 +                       /*
2084 +                        * swap entries, when:
2085 +                        *
2086 +                        * curaddr < lastaddr or
2087 +                        * curaddr == lastaddr and curaddr == curpbaddr and
2088 +                        * lastaddr != lastpbaddr
2089 +                        */
2090 +                       if (curaddr < lastaddr ||
2091 +                           (curaddr == lastaddr && curaddr == curpbaddr &&
2092 +                            lastaddr != lastpbaddr)) {
2093                                 change_tmp = change_point[i];
2094                                 change_point[i] = change_point[i-1];
2095                                 change_point[i-1] = change_tmp;
2096 -                               still_changing=1;
2097 +                               still_changing = 1;
2098                         }
2099                 }
2100         }
2101
2102         /* create a new bios memory map, removing overlaps */
2103 -       overlap_entries=0;       /* number of entries in the overlap table */
2104 -       new_bios_entry=0;        /* index for creating new bios map entries */
2105 +       overlap_entries = 0;     /* number of entries in the overlap table */
2106 +       new_bios_entry = 0;      /* index for creating new bios map entries */
2107         last_type = 0;           /* start with undefined memory type */
2108         last_addr = 0;           /* start with 0 as last starting address */
2109 +
2110         /* loop through change-points, determining affect on the new bios map */
2111 -       for (chgidx=0; chgidx < chg_nr; chgidx++)
2112 -       {
2113 +       for (chgidx = 0; chgidx < chg_nr; chgidx++) {
2114                 /* keep track of all overlapping bios entries */
2115 -               if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
2116 -               {
2117 -                       /* add map entry to overlap list (> 1 entry implies an overlap) */
2118 -                       overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
2119 -               }
2120 -               else
2121 -               {
2122 -                       /* remove entry from list (order independent, so swap with last) */
2123 -                       for (i=0; i<overlap_entries; i++)
2124 -                       {
2125 -                               if (overlap_list[i] == change_point[chgidx]->pbios)
2126 -                                       overlap_list[i] = overlap_list[overlap_entries-1];
2127 +               if (change_point[chgidx]->addr ==
2128 +                   change_point[chgidx]->pbios->addr) {
2129 +                       /*
2130 +                        * add map entry to overlap list (> 1 entry
2131 +                        * implies an overlap)
2132 +                        */
2133 +                       overlap_list[overlap_entries++] =
2134 +                               change_point[chgidx]->pbios;
2135 +               } else {
2136 +                       /*
2137 +                        * remove entry from list (order independent,
2138 +                        * so swap with last)
2139 +                        */
2140 +                       for (i = 0; i < overlap_entries; i++) {
2141 +                               if (overlap_list[i] ==
2142 +                                   change_point[chgidx]->pbios)
2143 +                                       overlap_list[i] =
2144 +                                               overlap_list[overlap_entries-1];
2145                         }
2146                         overlap_entries--;
2147                 }
2148 -               /* if there are overlapping entries, decide which "type" to use */
2149 -               /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
2150 +               /*
2151 +                * if there are overlapping entries, decide which
2152 +                * "type" to use (larger value takes precedence --
2153 +                * 1=usable, 2,3,4,4+=unusable)
2154 +                */
2155                 current_type = 0;
2156 -               for (i=0; i<overlap_entries; i++)
2157 +               for (i = 0; i < overlap_entries; i++)
2158                         if (overlap_list[i]->type > current_type)
2159                                 current_type = overlap_list[i]->type;
2160 -               /* continue building up new bios map based on this information */
2161 +               /*
2162 +                * continue building up new bios map based on this
2163 +                * information
2164 +                */
2165                 if (current_type != last_type)  {
2166                         if (last_type != 0)      {
2167                                 new_bios[new_bios_entry].size =
2168                                         change_point[chgidx]->addr - last_addr;
2169 -                               /* move forward only if the new size was non-zero */
2170 +                               /*
2171 +                                * move forward only if the new size
2172 +                                * was non-zero
2173 +                                */
2174                                 if (new_bios[new_bios_entry].size != 0)
2175 +                                       /*
2176 +                                        * no more space left for new
2177 +                                        * bios entries ?
2178 +                                        */
2179                                         if (++new_bios_entry >= E820MAX)
2180 -                                               break;  /* no more space left for new bios entries */
2181 +                                               break;
2182                         }
2183                         if (current_type != 0)  {
2184 -                               new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
2185 +                               new_bios[new_bios_entry].addr =
2186 +                                       change_point[chgidx]->addr;
2187                                 new_bios[new_bios_entry].type = current_type;
2188 -                               last_addr=change_point[chgidx]->addr;
2189 +                               last_addr = change_point[chgidx]->addr;
2190                         }
2191                         last_type = current_type;
2192                 }
2193         }
2194 -       new_nr = new_bios_entry;   /* retain count for new bios entries */
2195 +       /* retain count for new bios entries */
2196 +       new_nr = new_bios_entry;
2197
2198         /* copy new bios mapping into original location */
2199 -       memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
2200 +       memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
2201         *pnr_map = new_nr;
2202
2203         return 0;
2204 @@ -601,7 +656,7 @@ static int __init sanitize_e820_map(stru
2205   * will have given us a memory map that we can use to properly
2206   * set up memory.  If we aren't, we'll fake a memory map.
2207   */
2208 -static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
2209 +static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
2210  {
2211  #ifndef CONFIG_XEN
2212         /* Only one memory region (or negative)? Ignore it */
2213 @@ -622,7 +677,7 @@ static int __init copy_e820_map(struct e
2214                         return -1;
2215
2216                 add_memory_region(start, size, type);
2217 -       } while (biosmap++,--nr_map);
2218 +       } while (biosmap++, --nr_map);
2219
2220  #ifdef CONFIG_XEN
2221         if (is_initial_xendomain()) {
2222 @@ -641,15 +696,17 @@ static int __init copy_e820_map(struct e
2223         return 0;
2224  }
2225
2226 -void early_panic(char *msg)
2227 +static void early_panic(char *msg)
2228  {
2229         early_printk(msg);
2230         panic(msg);
2231  }
2232
2233 -#ifndef CONFIG_XEN
2234 -void __init setup_memory_region(void)
2235 +/* We're not void only for x86 32-bit compat */
2236 +char * __init machine_specific_memory_setup(void)
2237  {
2238 +#ifndef CONFIG_XEN
2239 +       char *who = "BIOS-e820";
2240         /*
2241          * Try to copy the BIOS-supplied E820-map.
2242          *
2243 @@ -659,14 +716,8 @@ void __init setup_memory_region(void)
2244         sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
2245         if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
2246                 early_panic("Cannot find a valid memory map");
2247 -       printk(KERN_INFO "BIOS-provided physical RAM map:\n");
2248 -       e820_print_map("BIOS-e820");
2249 -}
2250 -
2251  #else  /* CONFIG_XEN */
2252 -
2253 -void __init setup_memory_region(void)
2254 -{
2255 +       char *who = "Xen";
2256         int rc;
2257         struct xen_memory_map memmap;
2258         /*
2259 @@ -694,11 +745,13 @@ void __init setup_memory_region(void)
2260
2261         if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
2262                 early_panic("Cannot find a valid memory map");
2263 -
2264 +#endif
2265         printk(KERN_INFO "BIOS-provided physical RAM map:\n");
2266 -       e820_print_map("Xen");
2267 +       e820_print_map(who);
2268 +
2269 +       /* In case someone cares... */
2270 +       return who;
2271  }
2272 -#endif
2273
2274  static int __init parse_memopt(char *p)
2275  {
2276 @@ -709,7 +762,7 @@ static int __init parse_memopt(char *p)
2277         if (!p)
2278                 return -EINVAL;
2279         end_user_pfn = memparse(p, &p);
2280 -       end_user_pfn >>= PAGE_SHIFT;
2281 +       end_user_pfn >>= PAGE_SHIFT;
2282
2283         end = end_user_pfn<<PAGE_SHIFT;
2284         i = e820.nr_map-1;
2285 @@ -727,7 +780,7 @@ static int __init parse_memopt(char *p)
2286         }
2287
2288         return 0;
2289 -}
2290 +}
2291  early_param("mem", parse_memopt);
2292
2293  static int userdef __initdata;
2294 @@ -739,9 +792,9 @@ static int __init parse_memmap_opt(char
2295
2296         if (!strcmp(p, "exactmap")) {
2297  #ifdef CONFIG_CRASH_DUMP
2298 -               /* If we are doing a crash dump, we
2299 -                * still need to know the real mem
2300 -                * size before original memory map is
2301 +               /*
2302 +                * If we are doing a crash dump, we still need to know
2303 +                * the real mem size before original memory map is
2304                  * reset.
2305                  */
2306                 e820_register_active_regions(0, 0, -1UL);
2307 @@ -758,6 +811,8 @@ static int __init parse_memmap_opt(char
2308         mem_size = memparse(p, &p);
2309         if (p == oldp)
2310                 return -EINVAL;
2311 +
2312 +       userdef = 1;
2313         if (*p == '@') {
2314                 start_at = memparse(p+1, &p);
2315                 add_memory_region(start_at, mem_size, E820_RAM);
2316 @@ -777,11 +832,58 @@ early_param("memmap", parse_memmap_opt);
2317  void __init finish_e820_parsing(void)
2318  {
2319         if (userdef) {
2320 +               char nr = e820.nr_map;
2321 +
2322 +               if (sanitize_e820_map(e820.map, &nr) < 0)
2323 +                       early_panic("Invalid user supplied memory map");
2324 +               e820.nr_map = nr;
2325 +
2326                 printk(KERN_INFO "user-defined physical RAM map:\n");
2327                 e820_print_map("user");
2328         }
2329  }
2330
2331 +#ifndef CONFIG_XEN
2332 +void __init update_memory_range(u64 start, u64 size, unsigned old_type,
2333 +                               unsigned new_type)
2334 +{
2335 +       int i;
2336 +       /* Retype old_type e820 ranges inside [start, start + size) */
2337 +       BUG_ON(old_type == new_type);
2338 +
2339 +       for (i = 0; i < e820.nr_map; i++) {
2340 +               struct e820entry *ei = &e820.map[i];
2341 +               u64 final_start, final_end;
2342 +               if (ei->type != old_type)
2343 +                       continue;
2344 +               /* totally covered? compare end addresses, not just sizes */
2345 +               if (ei->addr >= start && ei->addr + ei->size <= start + size) {
2346 +                       ei->type = new_type;
2347 +                       continue;
2348 +               }
2349 +               /* partially covered: add the overlap as a new_type entry */
2350 +               final_start = max(start, ei->addr);
2351 +               final_end = min(start + size, ei->addr + ei->size);
2352 +               if (final_start >= final_end)
2353 +                       continue;       /* no intersection with this entry */
2354 +               add_memory_region(final_start, final_end - final_start,
2355 +                                        new_type);
2356 +       }
2357 +}
2358 +
2359 +void __init update_e820(void)
2360 +{
2361 +       u8 nr_map;
2362 +       /* Re-sanitize e820.map in place, then log the resulting map. */
2363 +       nr_map = e820.nr_map;
2364 +       if (sanitize_e820_map(e820.map, &nr_map))
2365 +               return;         /* non-zero result: leave e820.nr_map as is */
2366 +       e820.nr_map = nr_map;
2367 +       printk(KERN_INFO "modified physical RAM map:\n");
2368 +       e820_print_map("modified");
2369 +}
2370 +#endif
2371 +
2372  unsigned long pci_mem_start = 0xaeedbabe;
2373  EXPORT_SYMBOL(pci_mem_start);
2374
2375 @@ -825,8 +927,10 @@ __init void e820_setup_gap(struct e820en
2376
2377         if (!found) {
2378                 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
2379 -               printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
2380 -                      KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
2381 +               printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
2382 +                      "address range\n"
2383 +                      KERN_ERR "PCI: Unassigned devices with 32bit resource "
2384 +                      "registers may break!\n");
2385         }
2386
2387         /*
2388 @@ -839,8 +943,9 @@ __init void e820_setup_gap(struct e820en
2389         /* Fun with two's complement */
2390         pci_mem_start = (gapstart + round) & -round;
2391
2392 -       printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2393 -               pci_mem_start, gapstart, gapsize);
2394 +       printk(KERN_INFO
2395 +              "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2396 +              pci_mem_start, gapstart, gapsize);
2397  }
2398
2399  int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
2400 --- a/arch/x86/kernel/early_printk-xen.c
2401 +++ b/arch/x86/kernel/early_printk-xen.c
2402 @@ -222,7 +222,7 @@ static struct console simnow_console = {
2403  };
2404
2405  /* Direct interface for emergencies */
2406 -struct console *early_console = &early_vga_console;
2407 +static struct console *early_console = &early_vga_console;
2408  static int early_console_initialized = 0;
2409
2410  void early_printk(const char *fmt, ...)
2411 --- a/arch/x86/kernel/entry_32-xen.S
2412 +++ b/arch/x86/kernel/entry_32-xen.S
2413 @@ -59,7 +59,7 @@
2414   * for paravirtualization.  The following will never clobber any registers:
2415   *   INTERRUPT_RETURN (aka. "iret")
2416   *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
2417 - *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
2418 + *   ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
2419   *
2420   * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
2421   * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
2422 @@ -282,16 +282,21 @@ END(resume_kernel)
2423  #endif
2424         CFI_ENDPROC
2425
2426 +       .macro test_tif ti_reg          # system call tracing in operation / emulation
2427 +       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2428 +       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
2429 +       .endm
2430 +
2431  /* SYSENTER_RETURN points to after the "sysenter" instruction in
2432     the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
2433
2434         # sysenter call handler stub
2435 -ENTRY(sysenter_entry)
2436 +ENTRY(ia32_sysenter_target)
2437         CFI_STARTPROC simple
2438         CFI_SIGNAL_FRAME
2439         CFI_DEF_CFA esp, 0
2440         CFI_REGISTER esp, ebp
2441 -       movl SYSENTER_stack_esp0(%esp),%esp
2442 +       movl SYSENTER_stack_sp0(%esp),%esp
2443  sysenter_past_esp:
2444         /*
2445          * No need to follow this irqs on/off section: the syscall
2446 @@ -334,9 +339,7 @@ sysenter_past_esp:
2447         CFI_ADJUST_CFA_OFFSET 4
2448         SAVE_ALL
2449         GET_THREAD_INFO(%ebp)
2450 -
2451 -       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2452 -       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2453 +       test_tif %ebp
2454         jnz syscall_trace_entry
2455         cmpl $(nr_syscalls), %eax
2456         jae syscall_badsys
2457 @@ -354,7 +357,7 @@ sysenter_past_esp:
2458         xorl %ebp,%ebp
2459         TRACE_IRQS_ON
2460  1:     mov  PT_FS(%esp), %fs
2461 -       ENABLE_INTERRUPTS_SYSEXIT
2462 +       ENABLE_INTERRUPTS_SYSCALL_RET
2463         CFI_ENDPROC
2464  .pushsection .fixup,"ax"
2465  2:     movl $0,PT_FS(%esp)
2466 @@ -363,10 +366,10 @@ sysenter_past_esp:
2467         .align 4
2468         .long 1b,2b
2469  .popsection
2470 -ENDPROC(sysenter_entry)
2471 +ENDPROC(ia32_sysenter_target)
2472
2473         # pv sysenter call handler stub
2474 -ENTRY(sysenter_entry_pv)
2475 +ENTRY(ia32pv_sysenter_target)
2476         RING0_INT_FRAME
2477         movl $__USER_DS,16(%esp)
2478         movl %ebp,12(%esp)
2479 @@ -389,7 +392,7 @@ ENTRY(sysenter_entry_pv)
2480  .previous
2481         /* fall through */
2482         CFI_ENDPROC
2483 -ENDPROC(sysenter_entry_pv)
2484 +ENDPROC(ia32pv_sysenter_target)
2485
2486         # system call handler stub
2487  ENTRY(system_call)
2488 @@ -398,9 +401,7 @@ ENTRY(system_call)
2489         CFI_ADJUST_CFA_OFFSET 4
2490         SAVE_ALL
2491         GET_THREAD_INFO(%ebp)
2492 -                                       # system call tracing in operation / emulation
2493 -       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2494 -       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2495 +       test_tif %ebp
2496         jnz syscall_trace_entry
2497         cmpl $(nr_syscalls), %eax
2498         jae syscall_badsys
2499 @@ -452,7 +453,8 @@ restore_nocheck_notrace:
2500         RESTORE_REGS
2501         addl $4, %esp                   # skip orig_eax/error_code
2502         CFI_ADJUST_CFA_OFFSET -4
2503 -1:     INTERRUPT_RETURN
2504 +irq_return:
2505 +       INTERRUPT_RETURN
2506  .section .fixup,"ax"
2507  iret_exc:
2508         pushl $0                        # no error code
2509 @@ -461,7 +463,7 @@ iret_exc:
2510  .previous
2511  .section __ex_table,"a"
2512         .align 4
2513 -       .long 1b,iret_exc
2514 +       .long irq_return,iret_exc
2515  .previous
2516
2517         CFI_RESTORE_STATE
2518 @@ -657,7 +659,7 @@ END(syscall_badsys)
2519   * Build the entry stubs and pointer table with
2520   * some assembler magic.
2521   */
2522 -.data
2523 +.section .rodata,"a"
2524  ENTRY(interrupt)
2525  .text
2526
2527 @@ -959,7 +961,7 @@ END(device_not_available)
2528   * that sets up the real kernel stack. Check here, since we can't
2529   * allow the wrong stack to be used.
2530   *
2531 - * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
2532 + * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have
2533   * already pushed 3 words if it hits on the sysenter instruction:
2534   * eflags, cs and eip.
2535   *
2536 @@ -971,7 +973,7 @@ END(device_not_available)
2537         cmpw $__KERNEL_CS,4(%esp);              \
2538         jne ok;                                 \
2539  label:                                         \
2540 -       movl SYSENTER_stack_esp0+offset(%esp),%esp;     \
2541 +       movl SYSENTER_stack_sp0+offset(%esp),%esp;      \
2542         CFI_DEF_CFA esp, 0;                     \
2543         CFI_UNDEFINED eip;                      \
2544         pushfl;                                 \
2545 @@ -986,7 +988,7 @@ label:                                              \
2546  KPROBE_ENTRY(debug)
2547         RING0_INT_FRAME
2548  #ifndef CONFIG_XEN
2549 -       cmpl $sysenter_entry,(%esp)
2550 +       cmpl $ia32_sysenter_target,(%esp)
2551         jne debug_stack_correct
2552         FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
2553  debug_stack_correct:
2554 @@ -1019,7 +1021,7 @@ KPROBE_ENTRY(nmi)
2555         popl %eax
2556         CFI_ADJUST_CFA_OFFSET -4
2557         je nmi_espfix_stack
2558 -       cmpl $sysenter_entry,(%esp)
2559 +       cmpl $ia32_sysenter_target,(%esp)
2560         je nmi_stack_fixup
2561         pushl %eax
2562         CFI_ADJUST_CFA_OFFSET 4
2563 @@ -1032,7 +1034,7 @@ KPROBE_ENTRY(nmi)
2564         popl %eax
2565         CFI_ADJUST_CFA_OFFSET -4
2566         jae nmi_stack_correct
2567 -       cmpl $sysenter_entry,12(%esp)
2568 +       cmpl $ia32_sysenter_target,12(%esp)
2569         je nmi_debug_stack_check
2570  nmi_stack_correct:
2571         /* We have a RING0_INT_FRAME here */
2572 @@ -1085,12 +1087,8 @@ nmi_espfix_stack:
2573         RESTORE_REGS
2574         lss 12+4(%esp), %esp            # back to espfix stack
2575         CFI_ADJUST_CFA_OFFSET -24
2576 -1:     INTERRUPT_RETURN
2577 +       jmp irq_return
2578         CFI_ENDPROC
2579 -.section __ex_table,"a"
2580 -       .align 4
2581 -       .long 1b,iret_exc
2582 -.previous
2583  #else
2584  KPROBE_ENTRY(nmi)
2585         RING0_INT_FRAME
2586 @@ -1108,17 +1106,17 @@ KPROBE_END(nmi)
2587
2588  #ifdef CONFIG_PARAVIRT
2589  ENTRY(native_iret)
2590 -1:     iret
2591 +       iret
2592  .section __ex_table,"a"
2593         .align 4
2594 -       .long 1b,iret_exc
2595 +       .long native_iret, iret_exc
2596  .previous
2597  END(native_iret)
2598
2599 -ENTRY(native_irq_enable_sysexit)
2600 +ENTRY(native_irq_enable_syscall_ret)
2601         sti
2602         sysexit
2603 -END(native_irq_enable_sysexit)
2604 +END(native_irq_enable_syscall_ret)
2605  #endif
2606
2607  KPROBE_ENTRY(int3)
2608 @@ -1267,7 +1265,144 @@ ENTRY(kernel_thread_helper)
2609         CFI_ENDPROC
2610  ENDPROC(kernel_thread_helper)
2611
2612 +#include <asm/alternative-asm.h>
2613 +
2614 +       # pv syscall call handler stub
2615 +ENTRY(ia32pv_cstar_target)
2616 +       RING0_INT_FRAME
2617 +       movl $__USER_DS,16(%esp)
2618 +       movl %ebp,%ecx
2619 +       movl $__USER_CS,4(%esp)
2620 +       movl 12(%esp),%ebp
2621 +       pushl %eax                      # save orig_eax
2622 +       CFI_ADJUST_CFA_OFFSET 4
2623 +/*
2624 + * Load the potential sixth argument from user stack.
2625 + * Careful about security.
2626 + */
2627 +       cmpl $__PAGE_OFFSET-4,%ebp
2628 +       CFI_REMEMBER_STATE
2629 +       ja cstar_fault
2630 +1:     movl (%ebp),%ebp
2631 +.section __ex_table,"a"
2632 +       .align 4
2633 +       .long 1b,cstar_fault
2634 +.previous
2635 +       SAVE_ALL
2636 +       GET_THREAD_INFO(%ebp)
2637 +       test_tif %ebp
2638 +       jnz cstar_trace_entry
2639 +       cmpl $nr_syscalls,%eax
2640 +       jae cstar_badsys
2641 +.Lcstar_call:
2642 +       btl %eax,cstar_special
2643 +       jc .Lcstar_special
2644 +       call *cstar_call_table(,%eax,4)
2645 +       movl %eax,PT_EAX(%esp)          # store the return value
2646 +.Lcstar_exit:
2647 +       movl PT_ECX(%esp),%ecx
2648 +       movl %ecx,PT_EBP(%esp)          # put user EBP back in place
2649 +       jmp syscall_exit
2650 +.Lcstar_special:
2651 +       movl PT_ECX(%esp),%ecx
2652 +       movl %ecx,PT_EBP(%esp)          # put user EBP back in place
2653 +       jmp syscall_call
2654 +cstar_set_tif:
2655 +       movl $cstar_clear_tif,(%esp)    # replace return address
2656 +       LOCK_PREFIX
2657 +       orl $_TIF_CSTAR,TI_flags(%ebp)
2658 +       jmp *sys_call_table(,%eax,4)
2659 +cstar_clear_tif:
2660 +       movl %eax,PT_EAX(%esp)          # store the return value
2661 +       LOCK_PREFIX
2662 +       andl $~_TIF_CSTAR,TI_flags(%ebp)
2663 +       jmp .Lcstar_exit
2664 +cstar_trace_entry:
2665 +       movl $-ENOSYS,PT_EAX(%esp)
2666 +       cmpl $nr_syscalls,%eax
2667 +       jae 1f
2668 +       btl %eax,cstar_special
2669 +       jc .Lcstar_trace_special
2670 +1:     movl %esp,%eax
2671 +       xorl %edx,%edx
2672 +       LOCK_PREFIX
2673 +       orl $_TIF_CSTAR,TI_flags(%ebp)
2674 +       call do_syscall_trace
2675 +       LOCK_PREFIX
2676 +       andl $~_TIF_CSTAR,TI_flags(%ebp)
2677 +       testl %eax,%eax
2678 +       jne .Lcstar_resume              # ret != 0 -> running under PTRACE_SYSEMU,
2679 +                                       # so must skip actual syscall
2680 +       movl PT_ORIG_EAX(%esp),%eax
2681 +       cmpl $nr_syscalls,%eax
2682 +       jb .Lcstar_call
2683 +       jmp .Lcstar_exit
2684 +.Lcstar_trace_special:
2685 +       movl PT_ECX(%esp),%ecx
2686 +       movl %esp,%eax
2687 +       xorl %edx,%edx
2688 +       movl %ecx,PT_EBP(%esp)          # put user EBP back in place
2689 +       call do_syscall_trace
2690 +       testl %eax,%eax
2691 +       jne resume_userspace            # ret != 0 -> running under PTRACE_SYSEMU,
2692 +                                       # so must skip actual syscall
2693 +       movl PT_ORIG_EAX(%esp),%eax
2694 +       cmpl $nr_syscalls,%eax
2695 +       jb syscall_call
2696 +       jmp syscall_exit
2697 +cstar_badsys:
2698 +       movl $-ENOSYS,PT_EAX(%esp)
2699 +.Lcstar_resume:
2700 +       movl PT_ECX(%esp),%ecx
2701 +       movl %ecx,PT_EBP(%esp)          # put user EBP back in place
2702 +       jmp resume_userspace
2703 +       CFI_RESTORE_STATE
2704 +cstar_fault:
2705 +       movl $-EFAULT,%eax
2706 +       SAVE_ALL
2707 +       GET_THREAD_INFO(%ebp)
2708 +       jmp .Lcstar_resume
2709 +       CFI_ENDPROC
2710 +ENDPROC(ia32pv_cstar_target)
2711 +
2712 +ENTRY(cstar_ret_from_fork)
2713 +       CFI_STARTPROC
2714 +       movl PT_ECX(%esp),%ecx
2715 +       GET_THREAD_INFO(%ebp)
2716 +       movl %ecx,PT_EBP(%esp)          # put user EBP back in place
2717 +       LOCK_PREFIX
2718 +       andl $~_TIF_CSTAR,TI_flags(%ebp) # flag is set in cstar_set_tif
2719 +       jmp ret_from_fork
2720 +       CFI_ENDPROC
2721 +END(cstar_ret_from_fork)
2722 +
2723  .section .rodata,"a"
2724  #include "syscall_table_32.S"
2725
2726  syscall_table_size=(.-sys_call_table)
2727 +
2728 +#include <asm/unistd.h>
2729 +cstar_special:
2730 +nr=0
2731 +mask=0
2732 +.rept nr_syscalls+31
2733 + .irp n, __NR_sigreturn, __NR_rt_sigreturn
2734 +  .if nr == \n
2735 +   mask = mask | (1 << (\n & 31))
2736 +  .endif
2737 + .endr
2738 + nr = nr + 1
2739 + .if (nr & 31) == 0
2740 +  .long mask
2741 +  mask = 0
2742 + .endif
2743 +.endr
2744 +#define        sys_call_table cstar_call_table
2745 +#define        sys_fork cstar_set_tif
2746 +#define        sys_clone cstar_set_tif
2747 +#define        sys_vfork cstar_set_tif
2748 +#include "syscall_table_32.S"
2749 +#undef sys_call_table
2750 +#undef sys_fork
2751 +#undef sys_clone
2752 +#undef sys_vfork
2753 --- a/arch/x86/kernel/entry_64-xen.S
2754 +++ b/arch/x86/kernel/entry_64-xen.S
2755 @@ -54,17 +54,22 @@
2756  #include <asm/page.h>
2757  #include <asm/irqflags.h>
2758  #include <asm/errno.h>
2759 -#include <xen/interface/arch-x86_64.h>
2760 +#include <xen/interface/xen.h>
2761  #include <xen/interface/features.h>
2762
2763 -#include "xen_entry_64.S"
2764 -
2765         .code64
2766
2767  #ifndef CONFIG_PREEMPT
2768  #define retint_kernel retint_restore_args
2769  #endif
2770
2771 +#ifdef CONFIG_PARAVIRT
2772 +ENTRY(native_irq_enable_syscall_ret)
2773 +       movq    %gs:pda_oldrsp,%rsp
2774 +       swapgs
2775 +       sysretq
2776 +#endif /* CONFIG_PARAVIRT */
2777 +
2778
2779  .macro TRACE_IRQS_IRETQ offset=ARGOFFSET
2780  #ifdef CONFIG_TRACE_IRQFLAGS
2781 @@ -277,7 +282,7 @@ ret_from_sys_call:
2782  sysret_check:
2783         LOCKDEP_SYS_EXIT
2784         GET_THREAD_INFO(%rcx)
2785 -        XEN_BLOCK_EVENTS(%rsi)
2786 +       DISABLE_INTERRUPTS(CLBR_NONE)
2787         TRACE_IRQS_OFF
2788         movl threadinfo_flags(%rcx),%edx
2789         andl %edi,%edx
2790 @@ -287,7 +292,7 @@ sysret_check:
2791          * sysretq will re-enable interrupts:
2792          */
2793         TRACE_IRQS_ON
2794 -        XEN_UNBLOCK_EVENTS(%rsi)
2795 +       ENABLE_INTERRUPTS(CLBR_NONE)
2796         RESTORE_ARGS 0,8,0
2797          HYPERVISOR_IRET VGCF_IN_SYSCALL
2798
2799 @@ -298,7 +303,7 @@ sysret_careful:
2800         bt $TIF_NEED_RESCHED,%edx
2801         jnc sysret_signal
2802         TRACE_IRQS_ON
2803 -       XEN_UNBLOCK_EVENTS(%rsi)
2804 +       ENABLE_INTERRUPTS(CLBR_NONE)
2805         pushq %rdi
2806         CFI_ADJUST_CFA_OFFSET 8
2807         call schedule
2808 @@ -309,9 +314,8 @@ sysret_careful:
2809         /* Handle a signal */
2810  sysret_signal:
2811         TRACE_IRQS_ON
2812 -/*     sti */
2813 -        XEN_UNBLOCK_EVENTS(%rsi)
2814 -       testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2815 +       ENABLE_INTERRUPTS(CLBR_NONE)
2816 +       testl $_TIF_DO_NOTIFY_MASK,%edx
2817         jz    1f
2818
2819         /* Really a signal */
2820 @@ -323,7 +327,7 @@ sysret_signal:
2821  1:     movl $_TIF_NEED_RESCHED,%edi
2822         /* Use IRET because user could have changed frame. This
2823            works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
2824 -       XEN_BLOCK_EVENTS(%rsi)
2825 +       DISABLE_INTERRUPTS(CLBR_NONE)
2826         TRACE_IRQS_OFF
2827         jmp int_with_check
2828
2829 @@ -355,7 +359,7 @@ tracesys:
2830   */
2831         .globl int_ret_from_sys_call
2832  int_ret_from_sys_call:
2833 -        XEN_BLOCK_EVENTS(%rsi)
2834 +       DISABLE_INTERRUPTS(CLBR_NONE)
2835         TRACE_IRQS_OFF
2836         testb $3,CS-ARGOFFSET(%rsp)
2837          jnz 1f
2838 @@ -381,22 +385,20 @@ int_careful:
2839         bt $TIF_NEED_RESCHED,%edx
2840         jnc  int_very_careful
2841         TRACE_IRQS_ON
2842 -/*     sti */
2843 -        XEN_UNBLOCK_EVENTS(%rsi)
2844 +       ENABLE_INTERRUPTS(CLBR_NONE)
2845         pushq %rdi
2846         CFI_ADJUST_CFA_OFFSET 8
2847         call schedule
2848         popq %rdi
2849         CFI_ADJUST_CFA_OFFSET -8
2850 -       XEN_BLOCK_EVENTS(%rsi)
2851 +       DISABLE_INTERRUPTS(CLBR_NONE)
2852         TRACE_IRQS_OFF
2853         jmp int_with_check
2854
2855         /* handle signals and tracing -- both require a full stack frame */
2856  int_very_careful:
2857         TRACE_IRQS_ON
2858 -/*     sti */
2859 -        XEN_UNBLOCK_EVENTS(%rsi)
2860 +       ENABLE_INTERRUPTS(CLBR_NONE)
2861         SAVE_REST
2862         /* Check for syscall exit trace */
2863         testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
2864 @@ -411,7 +413,7 @@ int_very_careful:
2865         jmp int_restore_rest
2866
2867  int_signal:
2868 -       testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2869 +       testl $_TIF_DO_NOTIFY_MASK,%edx
2870         jz 1f
2871         movq %rsp,%rdi          # &ptregs -> arg1
2872         xorl %esi,%esi          # oldset -> arg2
2873 @@ -419,7 +421,7 @@ int_signal:
2874  1:     movl $_TIF_NEED_RESCHED,%edi
2875  int_restore_rest:
2876         RESTORE_REST
2877 -       XEN_BLOCK_EVENTS(%rsi)
2878 +       DISABLE_INTERRUPTS(CLBR_NONE)
2879         TRACE_IRQS_OFF
2880         jmp int_with_check
2881         CFI_ENDPROC
2882 @@ -474,6 +476,7 @@ ENTRY(stub_execve)
2883         CFI_REGISTER rip, r11
2884         SAVE_REST
2885         FIXUP_TOP_OF_STACK %r11
2886 +       movq %rsp, %rcx
2887         call sys_execve
2888         RESTORE_TOP_OF_STACK %r11
2889         movq %rax,RAX(%rsp)
2890 @@ -526,11 +529,10 @@ retint_check:
2891  retint_restore_args:   /* return to kernel space */
2892         movl EFLAGS-REST_SKIP(%rsp), %eax
2893         shr $9, %eax                    # EAX[0] == IRET_EFLAGS.IF
2894 -       XEN_GET_VCPU_INFO(%rsi)
2895 +       GET_VCPU_INFO
2896         andb evtchn_upcall_mask(%rsi),%al
2897         andb $1,%al                     # EAX[0] == IRET_EFLAGS.IF & event_mask
2898         jnz restore_all_enable_events   #        != 0 => enable event delivery
2899 -       XEN_PUT_VCPU_INFO(%rsi)
2900
2901         RESTORE_ARGS 0,8,0
2902         HYPERVISOR_IRET 0
2903 @@ -541,31 +543,29 @@ retint_careful:
2904         bt    $TIF_NEED_RESCHED,%edx
2905         jnc   retint_signal
2906         TRACE_IRQS_ON
2907 -       XEN_UNBLOCK_EVENTS(%rsi)
2908 -/*     sti */
2909 +       ENABLE_INTERRUPTS(CLBR_NONE)
2910         pushq %rdi
2911         CFI_ADJUST_CFA_OFFSET   8
2912         call  schedule
2913         popq %rdi
2914         CFI_ADJUST_CFA_OFFSET   -8
2915         GET_THREAD_INFO(%rcx)
2916 -       XEN_BLOCK_EVENTS(%rsi)
2917 -/*     cli */
2918 +       DISABLE_INTERRUPTS(CLBR_NONE)
2919         TRACE_IRQS_OFF
2920         jmp retint_check
2921
2922  retint_signal:
2923 -       testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2924 +       testl $_TIF_DO_NOTIFY_MASK,%edx
2925         jz    retint_restore_args
2926         TRACE_IRQS_ON
2927 -        XEN_UNBLOCK_EVENTS(%rsi)
2928 +       ENABLE_INTERRUPTS(CLBR_NONE)
2929         SAVE_REST
2930         movq $-1,ORIG_RAX(%rsp)
2931         xorl %esi,%esi          # oldset
2932         movq %rsp,%rdi          # &pt_regs
2933         call do_notify_resume
2934         RESTORE_REST
2935 -        XEN_BLOCK_EVENTS(%rsi)
2936 +       DISABLE_INTERRUPTS(CLBR_NONE)
2937         TRACE_IRQS_OFF
2938         movl $_TIF_NEED_RESCHED,%edi
2939         GET_THREAD_INFO(%rcx)
2940 @@ -702,7 +702,7 @@ END(spurious_interrupt)
2941         rdmsr
2942         testl %edx,%edx
2943         js    1f
2944 -       swapgs
2945 +       SWAPGS
2946         xorl  %ebx,%ebx
2947  1:
2948  #endif
2949 @@ -719,8 +719,7 @@ END(spurious_interrupt)
2950         .if \ist
2951         addq    $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
2952         .endif
2953 -/*     cli */
2954 -       XEN_BLOCK_EVENTS(%rsi)
2955 +       DISABLE_INTERRUPTS(CLBR_NONE)
2956         .if \irqtrace
2957         TRACE_IRQS_OFF
2958         .endif
2959 @@ -749,10 +748,10 @@ paranoid_swapgs\trace:
2960         .if \trace
2961         TRACE_IRQS_IRETQ 0
2962         .endif
2963 -       swapgs
2964 +       SWAPGS_UNSAFE_STACK
2965  paranoid_restore\trace:
2966         RESTORE_ALL 8
2967 -       iretq
2968 +       jmp irq_return
2969  paranoid_userspace\trace:
2970         GET_THREAD_INFO(%rcx)
2971         movl threadinfo_flags(%rcx),%ebx
2972 @@ -767,11 +766,11 @@ paranoid_userspace\trace:
2973         .if \trace
2974         TRACE_IRQS_ON
2975         .endif
2976 -       sti
2977 +       ENABLE_INTERRUPTS(CLBR_NONE)
2978         xorl %esi,%esi                  /* arg2: oldset */
2979         movq %rsp,%rdi                  /* arg1: &pt_regs */
2980         call do_notify_resume
2981 -       cli
2982 +       DISABLE_INTERRUPTS(CLBR_NONE)
2983         .if \trace
2984         TRACE_IRQS_OFF
2985         .endif
2986 @@ -780,9 +779,9 @@ paranoid_schedule\trace:
2987         .if \trace
2988         TRACE_IRQS_ON
2989         .endif
2990 -       sti
2991 +       ENABLE_INTERRUPTS(CLBR_ANY)
2992         call schedule
2993 -       cli
2994 +       DISABLE_INTERRUPTS(CLBR_ANY)
2995         .if \trace
2996         TRACE_IRQS_OFF
2997         .endif
2998 @@ -846,8 +845,7 @@ error_call_handler:
2999         call *%rax
3000  error_exit:
3001         RESTORE_REST
3002 -/*     cli */
3003 -       XEN_BLOCK_EVENTS(%rsi)
3004 +       DISABLE_INTERRUPTS(CLBR_NONE)
3005         TRACE_IRQS_OFF
3006         GET_THREAD_INFO(%rcx)
3007         testb $3,CS-ARGOFFSET(%rsp)
3008 @@ -875,7 +873,7 @@ error_kernelspace:
3009            iret run with kernel gs again, so don't set the user space flag.
3010            B stepping K8s sometimes report an truncated RIP for IRET
3011            exceptions returning to compat mode. Check for these here too. */
3012 -       leaq iret_label(%rip),%rbp
3013 +       leaq irq_return(%rip),%rbp
3014         cmpq %rbp,RIP(%rsp)
3015         je   error_swapgs
3016         movl %ebp,%ebp  /* zero extend */
3017 @@ -930,19 +928,17 @@ END(do_hypervisor_callback)
3018  restore_all_enable_events:
3019         CFI_DEFAULT_STACK adj=1
3020         TRACE_IRQS_ON
3021 -       XEN_UNBLOCK_EVENTS(%rsi)        # %rsi is already set up...
3022 +       __ENABLE_INTERRUPTS
3023
3024  scrit: /**** START OF CRITICAL REGION ****/
3025 -       XEN_TEST_PENDING(%rsi)
3026 +       __TEST_PENDING
3027         CFI_REMEMBER_STATE
3028         jnz  14f                        # process more events if necessary...
3029 -       XEN_PUT_VCPU_INFO(%rsi)
3030          RESTORE_ARGS 0,8,0
3031          HYPERVISOR_IRET 0
3032
3033         CFI_RESTORE_STATE
3034 -14:    XEN_LOCKED_BLOCK_EVENTS(%rsi)
3035 -       XEN_PUT_VCPU_INFO(%rsi)
3036 +14:    __DISABLE_INTERRUPTS
3037         SAVE_REST
3038          movq %rsp,%rdi                  # set the argument again
3039         jmp  11b
3040 @@ -1086,15 +1082,16 @@ ENDPROC(child_rip)
3041   *     rdi: name, rsi: argv, rdx: envp
3042   *
3043   * We want to fallback into:
3044 - *     extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
3045 + *     extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
3046   *
3047   * do_sys_execve asm fallback arguments:
3048 - *     rdi: name, rsi: argv, rdx: envp, fake frame on the stack
3049 + *     rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
3050   */
3051  ENTRY(kernel_execve)
3052         CFI_STARTPROC
3053         FAKE_STACK_FRAME $0
3054         SAVE_ALL
3055 +       movq %rsp,%rcx
3056         call sys_execve
3057         movq %rax, RAX(%rsp)
3058         RESTORE_REST
3059 @@ -1144,7 +1141,7 @@ do_nmi_callback:
3060         call do_nmi
3061         orl  $NMI_MASK,EFLAGS(%rsp)
3062         RESTORE_REST
3063 -       XEN_BLOCK_EVENTS(%rsi)
3064 +       DISABLE_INTERRUPTS(CLBR_NONE)
3065         TRACE_IRQS_OFF
3066         GET_THREAD_INFO(%rcx)
3067         jmp  retint_restore_args
3068 --- a/arch/x86/kernel/fixup.c
3069 +++ b/arch/x86/kernel/fixup.c
3070 @@ -36,7 +36,7 @@
3071
3072  #define DP(_f, _args...) printk(KERN_ALERT "  " _f "\n" , ## _args )
3073
3074 -fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
3075 +void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
3076  {
3077         static unsigned long printed = 0;
3078         char info[100];
3079 --- a/arch/x86/kernel/genapic_64-xen.c
3080 +++ b/arch/x86/kernel/genapic_64-xen.c
3081 @@ -24,20 +24,13 @@
3082  #include <acpi/acpi_bus.h>
3083  #endif
3084
3085 -/*
3086 - * which logical CPU number maps to which CPU (physical APIC ID)
3087 - *
3088 - * The following static array is used during kernel startup
3089 - * and the x86_cpu_to_apicid_ptr contains the address of the
3090 - * array during this time.  Is it zeroed when the per_cpu
3091 - * data area is removed.
3092 - */
3093 +/* which logical CPU number maps to which CPU (physical APIC ID) */
3094  #ifndef CONFIG_XEN
3095 -u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata
3096 +u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
3097                                         = { [0 ... NR_CPUS-1] = BAD_APICID };
3098 -void *x86_cpu_to_apicid_ptr;
3099 +void *x86_cpu_to_apicid_early_ptr;
3100  #endif
3101 -DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
3102 +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
3103  EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
3104
3105  #ifndef CONFIG_XEN
3106 --- a/arch/x86/kernel/head_32-xen.S
3107 +++ b/arch/x86/kernel/head_32-xen.S
3108 @@ -3,6 +3,7 @@
3109  .text
3110  #include <linux/elfnote.h>
3111  #include <linux/threads.h>
3112 +#include <linux/init.h>
3113  #include <linux/linkage.h>
3114  #include <asm/segment.h>
3115  #include <asm/page.h>
3116 @@ -88,7 +89,7 @@ ENTRY(_stext)
3117   */
3118  .section ".bss.page_aligned","wa"
3119         .align PAGE_SIZE_asm
3120 -ENTRY(swapper_pg_pmd)
3121 +ENTRY(swapper_pg_fixmap)
3122         .fill 1024,4,0
3123  ENTRY(empty_zero_page)
3124         .fill 4096,1,0
3125 --- a/arch/x86/kernel/head64-xen.c
3126 +++ b/arch/x86/kernel/head64-xen.c
3127 @@ -16,6 +16,7 @@
3128  #include <linux/kernel.h>
3129  #include <linux/string.h>
3130  #include <linux/percpu.h>
3131 +#include <linux/start_kernel.h>
3132  #include <linux/module.h>
3133
3134  #include <asm/processor.h>
3135 @@ -26,6 +27,8 @@
3136  #include <asm/pgtable.h>
3137  #include <asm/tlbflush.h>
3138  #include <asm/sections.h>
3139 +#include <asm/kdebug.h>
3140 +#include <asm/e820.h>
3141
3142  unsigned long start_pfn;
3143
3144 @@ -34,7 +37,7 @@ static void __init zap_identity_mappings
3145  {
3146         pgd_t *pgd = pgd_offset_k(0UL);
3147         pgd_clear(pgd);
3148 -       __flush_tlb();
3149 +       __flush_tlb_all();
3150  }
3151
3152  /* Don't add a printk in there. printk relies on the PDA which is not initialized
3153 @@ -72,6 +75,37 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
3154  unsigned int machine_to_phys_order;
3155  EXPORT_SYMBOL(machine_to_phys_order);
3156
3157 +#define EBDA_ADDR_POINTER 0x40E
3158 +
3159 +static __init void reserve_ebda(void)
3160 +{
3161 +#ifndef CONFIG_XEN
3162 +       unsigned ebda_addr, ebda_size;
3163 +
3164 +       /*
3165 +        * there is a real-mode segmented pointer pointing to the
3166 +        * 4K EBDA area at 0x40E
3167 +        */
3168 +       ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
3169 +       ebda_addr <<= 4;
3170 +
3171 +       if (!ebda_addr)
3172 +               return;
3173 +
3174 +       ebda_size = *(unsigned short *)__va(ebda_addr);
3175 +
3176 +       /* Round EBDA up to pages */
3177 +       if (ebda_size == 0)
3178 +               ebda_size = 1;
3179 +       ebda_size <<= 10;
3180 +       ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
3181 +       if (ebda_size > 64*1024)
3182 +               ebda_size = 64*1024;
3183 +
3184 +       reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA");
3185 +#endif
3186 +}
3187 +
3188  void __init x86_64_start_kernel(char * real_mode_data)
3189  {
3190         struct xen_machphys_mapping mapping;
3191 @@ -103,8 +137,16 @@ void __init x86_64_start_kernel(char * r
3192         /* Make NULL pointers segfault */
3193         zap_identity_mappings();
3194
3195 -       for (i = 0; i < IDT_ENTRIES; i++)
3196 +       /* Cleanup the over mapped high alias */
3197 +       cleanup_highmap();
3198 +
3199 +       for (i = 0; i < IDT_ENTRIES; i++) {
3200 +#ifdef CONFIG_EARLY_PRINTK
3201 +               set_intr_gate(i, &early_idt_handlers[i]);
3202 +#else
3203                 set_intr_gate(i, early_idt_handler);
3204 +#endif
3205 +       }
3206         load_idt((const struct desc_ptr *)&idt_descr);
3207  #endif
3208
3209 @@ -115,8 +157,19 @@ void __init x86_64_start_kernel(char * r
3210
3211         pda_init(0);
3212         copy_bootdata(__va(real_mode_data));
3213 -#ifdef CONFIG_SMP
3214 -       cpu_set(0, cpu_online_map);
3215 -#endif
3216 +
3217 +       reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
3218 +
3219 +       reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
3220 +                     start_pfn << PAGE_SHIFT, "Xen provided");
3221 +
3222 +       reserve_ebda();
3223 +
3224 +       /*
3225 +        * At this point everything still needed from the boot loader
3226 +        * or BIOS or kernel text should be early reserved or marked not
3227 +        * RAM in e820. All other memory is free game.
3228 +        */
3229 +
3230         start_kernel();
3231  }
3232 --- a/arch/x86/kernel/init_task-xen.c
3233 +++ b/arch/x86/kernel/init_task-xen.c
3234 @@ -19,7 +19,7 @@ static struct sighand_struct init_sighan
3235  #endif
3236  struct mm_struct init_mm = INIT_MM(init_mm);
3237  #undef swapper_pg_dir
3238 -EXPORT_SYMBOL(init_mm);
3239 +EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
3240
3241  /*
3242   * Initial thread structure.
3243 --- a/arch/x86/kernel/io_apic_32-xen.c
3244 +++ b/arch/x86/kernel/io_apic_32-xen.c
3245 @@ -35,6 +35,7 @@
3246  #include <linux/htirq.h>
3247  #include <linux/freezer.h>
3248  #include <linux/kthread.h>
3249 +#include <linux/jiffies.h>     /* time_after() */
3250
3251  #include <asm/io.h>
3252  #include <asm/smp.h>
3253 @@ -48,8 +49,6 @@
3254  #include <mach_apic.h>
3255  #include <mach_apicdef.h>
3256
3257 -#include "io_ports.h"
3258 -
3259  #ifdef CONFIG_XEN
3260  #include <xen/interface/xen.h>
3261  #include <xen/interface/physdev.h>
3262 @@ -400,7 +399,7 @@ static void set_ioapic_affinity_irq(unsi
3263  # include <asm/processor.h>    /* kernel_thread() */
3264  # include <linux/kernel_stat.h>        /* kstat */
3265  # include <linux/slab.h>               /* kmalloc() */
3266 -# include <linux/timer.h>      /* time_after() */
3267 +# include <linux/timer.h>
3268
3269  #define IRQBALANCE_CHECK_ARCH -999
3270  #define MAX_BALANCED_IRQ_INTERVAL      (5*HZ)
3271 @@ -777,7 +776,7 @@ late_initcall(balanced_irq_init);
3272  #endif
3273
3274  #ifndef CONFIG_SMP
3275 -void fastcall send_IPI_self(int vector)
3276 +void send_IPI_self(int vector)
3277  {
3278  #ifndef CONFIG_XEN
3279         unsigned int cfg;
3280 @@ -1959,7 +1958,7 @@ static int __init timer_irq_works(void)
3281          * might have cached one ExtINT interrupt.  Finally, at
3282          * least one tick may be lost due to delays.
3283          */
3284 -       if (jiffies - t1 > 4)
3285 +       if (time_after(jiffies, t1 + 4))
3286                 return 1;
3287
3288         return 0;
3289 @@ -2142,7 +2141,7 @@ static struct irq_chip lapic_chip __read
3290         .eoi            = ack_apic,
3291  };
3292
3293 -static void setup_nmi (void)
3294 +static void __init setup_nmi(void)
3295  {
3296         /*
3297          * Dirty trick to enable the NMI watchdog ...
3298 @@ -2155,7 +2154,7 @@ static void setup_nmi (void)
3299          */
3300         apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
3301
3302 -       on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
3303 +       enable_NMI_through_LVT0();
3304
3305         apic_printk(APIC_VERBOSE, " done.\n");
3306  }
3307 @@ -2479,7 +2478,7 @@ static int ioapic_resume(struct sys_devi
3308  }
3309
3310  static struct sysdev_class ioapic_sysdev_class = {
3311 -       set_kset_name("ioapic"),
3312 +       .name = "ioapic",
3313         .suspend = ioapic_suspend,
3314         .resume = ioapic_resume,
3315  };
3316 --- a/arch/x86/kernel/io_apic_64-xen.c
3317 +++ b/arch/x86/kernel/io_apic_64-xen.c
3318 @@ -32,9 +32,11 @@
3319  #include <linux/msi.h>
3320  #include <linux/htirq.h>
3321  #include <linux/dmar.h>
3322 +#include <linux/jiffies.h>
3323  #ifdef CONFIG_ACPI
3324  #include <acpi/acpi_bus.h>
3325  #endif
3326 +#include <linux/bootmem.h>
3327
3328  #include <asm/idle.h>
3329  #include <asm/io.h>
3330 @@ -1064,7 +1066,7 @@ void __apicdebuginit print_local_APIC(vo
3331         v = apic_read(APIC_LVR);
3332         printk(KERN_INFO "... APIC VERSION: %08x\n", v);
3333         ver = GET_APIC_VERSION(v);
3334 -       maxlvt = get_maxlvt();
3335 +       maxlvt = lapic_get_maxlvt();
3336
3337         v = apic_read(APIC_TASKPRI);
3338         printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
3339 @@ -1165,7 +1167,7 @@ void __apicdebuginit print_PIC(void)
3340  }
3341  #endif /* !CONFIG_XEN */
3342
3343 -static void __init enable_IO_APIC(void)
3344 +void __init enable_IO_APIC(void)
3345  {
3346         union IO_APIC_reg_01 reg_01;
3347  #ifndef CONFIG_XEN
3348 @@ -1299,7 +1301,7 @@ static int __init timer_irq_works(void)
3349          */
3350
3351         /* jiffies wrap? */
3352 -       if (jiffies - t1 > 4)
3353 +       if (time_after(jiffies, t1 + 4))
3354                 return 1;
3355         return 0;
3356  }
3357 @@ -1412,7 +1414,7 @@ static void irq_complete_move(unsigned i
3358         if (likely(!cfg->move_in_progress))
3359                 return;
3360
3361 -       vector = ~get_irq_regs()->orig_rax;
3362 +       vector = ~get_irq_regs()->orig_ax;
3363         me = smp_processor_id();
3364         if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
3365                 cpumask_t cleanup_mask;
3366 @@ -1439,7 +1441,7 @@ static void ack_apic_level(unsigned int
3367         int do_unmask_irq = 0;
3368
3369         irq_complete_move(irq);
3370 -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
3371 +#ifdef CONFIG_GENERIC_PENDING_IRQ
3372         /* If we are moving the irq we need to mask it */
3373         if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
3374                 do_unmask_irq = 1;
3375 @@ -1570,7 +1572,7 @@ static struct hw_interrupt_type lapic_ir
3376         .end = end_lapic_irq,
3377  };
3378
3379 -static void setup_nmi (void)
3380 +static void __init setup_nmi(void)
3381  {
3382         /*
3383          * Dirty trick to enable the NMI watchdog ...
3384 @@ -1583,7 +1585,7 @@ static void setup_nmi (void)
3385          */
3386         printk(KERN_INFO "activating NMI Watchdog ...");
3387
3388 -       enable_NMI_through_LVT0(NULL);
3389 +       enable_NMI_through_LVT0();
3390
3391         printk(" done.\n");
3392  }
3393 @@ -1659,7 +1661,7 @@ static inline void unlock_ExtINT_logic(v
3394   *
3395   * FIXME: really need to revamp this for modern platforms only.
3396   */
3397 -static inline void check_timer(void)
3398 +static inline void __init check_timer(void)
3399  {
3400         struct irq_cfg *cfg = irq_cfg + 0;
3401         int apic1, pin1, apic2, pin2;
3402 @@ -1863,7 +1865,7 @@ static int ioapic_resume(struct sys_devi
3403  }
3404
3405  static struct sysdev_class ioapic_sysdev_class = {
3406 -       set_kset_name("ioapic"),
3407 +       .name = "ioapic",
3408         .suspend = ioapic_suspend,
3409         .resume = ioapic_resume,
3410  };
3411 @@ -2303,5 +2305,93 @@ void __init setup_ioapic_dest(void)
3412         }
3413  }
3414  #endif
3415 -#endif /* !CONFIG_XEN */
3416
3417 +#define IOAPIC_RESOURCE_NAME_SIZE 11
3418 +
3419 +static struct resource *ioapic_resources;
3420 +
3421 +static struct resource * __init ioapic_setup_resources(void)
3422 +{
3423 +       unsigned long n;
3424 +       struct resource *res;
3425 +       char *mem;
3426 +       int i;
3427 +
3428 +       if (nr_ioapics <= 0)
3429 +               return NULL;
3430 +
3431 +       n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
3432 +       n *= nr_ioapics;
3433 +
3434 +       mem = alloc_bootmem(n);
3435 +       res = (void *)mem;
3436 +
3437 +       if (mem != NULL) {
3438 +               memset(mem, 0, n);
3439 +               mem += sizeof(struct resource) * nr_ioapics;
3440 +
3441 +               for (i = 0; i < nr_ioapics; i++) {
3442 +                       res[i].name = mem;
3443 +                       res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
3444 +                       sprintf(mem,  "IOAPIC %u", i);
3445 +                       mem += IOAPIC_RESOURCE_NAME_SIZE;
3446 +               }
3447 +       }
3448 +
3449 +       ioapic_resources = res;
3450 +
3451 +       return res;
3452 +}
3453 +
3454 +void __init ioapic_init_mappings(void)
3455 +{
3456 +       unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
3457 +       struct resource *ioapic_res;
3458 +       int i;
3459 +
3460 +       ioapic_res = ioapic_setup_resources();
3461 +       for (i = 0; i < nr_ioapics; i++) {
3462 +               if (smp_found_config) {
3463 +                       ioapic_phys = mp_ioapics[i].mpc_apicaddr;
3464 +               } else {
3465 +                       ioapic_phys = (unsigned long)
3466 +                               alloc_bootmem_pages(PAGE_SIZE);
3467 +                       ioapic_phys = __pa(ioapic_phys);
3468 +               }
3469 +               set_fixmap_nocache(idx, ioapic_phys);
3470 +               apic_printk(APIC_VERBOSE,
3471 +                           "mapped IOAPIC to %016lx (%016lx)\n",
3472 +                           __fix_to_virt(idx), ioapic_phys);
3473 +               idx++;
3474 +
3475 +               if (ioapic_res != NULL) {
3476 +                       ioapic_res->start = ioapic_phys;
3477 +                       ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
3478 +                       ioapic_res++;
3479 +               }
3480 +       }
3481 +}
3482 +
3483 +static int __init ioapic_insert_resources(void)
3484 +{
3485 +       int i;
3486 +       struct resource *r = ioapic_resources;
3487 +
3488 +       if (!r) {
3489 +               printk(KERN_ERR
3490 +                      "IO APIC resources could be not be allocated.\n");
3491 +               return -1;
3492 +       }
3493 +
3494 +       for (i = 0; i < nr_ioapics; i++) {
3495 +               insert_resource(&iomem_resource, r);
3496 +               r++;
3497 +       }
3498 +
3499 +       return 0;
3500 +}
3501 +
3502 +/* Insert the IO APIC resources after PCI initialization has occurred to handle
3503 + * IO APICs that are mapped in on a BAR in PCI space. */
3504 +late_initcall(ioapic_insert_resources);
3505 +#endif /* !CONFIG_XEN */
3506 --- a/arch/x86/kernel/ioport_32-xen.c
3507 +++ /dev/null
3508 @@ -1,121 +0,0 @@
3509 -/*
3510 - * This contains the io-permission bitmap code - written by obz, with changes
3511 - * by Linus.
3512 - */
3513 -
3514 -#include <linux/sched.h>
3515 -#include <linux/kernel.h>
3516 -#include <linux/capability.h>
3517 -#include <linux/errno.h>
3518 -#include <linux/types.h>
3519 -#include <linux/ioport.h>
3520 -#include <linux/smp.h>
3521 -#include <linux/stddef.h>
3522 -#include <linux/slab.h>
3523 -#include <linux/thread_info.h>
3524 -#include <linux/syscalls.h>
3525 -#include <xen/interface/physdev.h>
3526 -
3527 -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
3528 -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
3529 -{
3530 -       unsigned long mask;
3531 -       unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
3532 -       unsigned int low_index = base & (BITS_PER_LONG-1);
3533 -       int length = low_index + extent;
3534 -
3535 -       if (low_index != 0) {
3536 -               mask = (~0UL << low_index);
3537 -               if (length < BITS_PER_LONG)
3538 -                       mask &= ~(~0UL << length);
3539 -               if (new_value)
3540 -                       *bitmap_base++ |= mask;
3541 -               else
3542 -                       *bitmap_base++ &= ~mask;
3543 -               length -= BITS_PER_LONG;
3544 -       }
3545 -
3546 -       mask = (new_value ? ~0UL : 0UL);
3547 -       while (length >= BITS_PER_LONG) {
3548 -               *bitmap_base++ = mask;
3549 -               length -= BITS_PER_LONG;
3550 -       }
3551 -
3552 -       if (length > 0) {
3553 -               mask = ~(~0UL << length);
3554 -               if (new_value)
3555 -                       *bitmap_base++ |= mask;
3556 -               else
3557 -                       *bitmap_base++ &= ~mask;
3558 -       }
3559 -}
3560 -
3561 -
3562 -/*
3563 - * this changes the io permissions bitmap in the current task.
3564 - */
3565 -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3566 -{
3567 -       struct thread_struct * t = &current->thread;
3568 -       unsigned long *bitmap;
3569 -       struct physdev_set_iobitmap set_iobitmap;
3570 -
3571 -       if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3572 -               return -EINVAL;
3573 -       if (turn_on && !capable(CAP_SYS_RAWIO))
3574 -               return -EPERM;
3575 -
3576 -       /*
3577 -        * If it's the first ioperm() call in this thread's lifetime, set the
3578 -        * IO bitmap up. ioperm() is much less timing critical than clone(),
3579 -        * this is why we delay this operation until now:
3580 -        */
3581 -       if (!t->io_bitmap_ptr) {
3582 -               bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3583 -               if (!bitmap)
3584 -                       return -ENOMEM;
3585 -
3586 -               memset(bitmap, 0xff, IO_BITMAP_BYTES);
3587 -               t->io_bitmap_ptr = bitmap;
3588 -               set_thread_flag(TIF_IO_BITMAP);
3589 -
3590 -               set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3591 -               set_iobitmap.nr_ports = IO_BITMAP_BITS;
3592 -               WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3593 -                                             &set_iobitmap));
3594 -       }
3595 -
3596 -       set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3597 -
3598 -       return 0;
3599 -}
3600 -
3601 -/*
3602 - * sys_iopl has to be used when you want to access the IO ports
3603 - * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3604 - * you'd need 8kB of bitmaps/process, which is a bit excessive.
3605 - *
3606 - * Here we just change the eflags value on the stack: we allow
3607 - * only the super-user to do it. This depends on the stack-layout
3608 - * on system-call entry - see also fork() and the signal handling
3609 - * code.
3610 - */
3611 -
3612 -asmlinkage long sys_iopl(unsigned long unused)
3613 -{
3614 -       volatile struct pt_regs * regs = (struct pt_regs *) &unused;
3615 -       unsigned int level = regs->ebx;
3616 -       struct thread_struct *t = &current->thread;
3617 -       unsigned int old = (t->iopl >> 12) & 3;
3618 -
3619 -       if (level > 3)
3620 -               return -EINVAL;
3621 -       /* Trying to gain more privileges? */
3622 -       if (level > old) {
3623 -               if (!capable(CAP_SYS_RAWIO))
3624 -                       return -EPERM;
3625 -       }
3626 -       t->iopl = level << 12;
3627 -       set_iopl_mask(t->iopl);
3628 -       return 0;
3629 -}
3630 --- a/arch/x86/kernel/ioport_64-xen.c
3631 +++ /dev/null
3632 @@ -1,99 +0,0 @@
3633 -/*
3634 - * This contains the io-permission bitmap code - written by obz, with changes
3635 - * by Linus.
3636 - */
3637 -
3638 -#include <linux/sched.h>
3639 -#include <linux/kernel.h>
3640 -#include <linux/capability.h>
3641 -#include <linux/errno.h>
3642 -#include <linux/types.h>
3643 -#include <linux/ioport.h>
3644 -#include <linux/mm.h>
3645 -#include <linux/smp.h>
3646 -#include <linux/stddef.h>
3647 -#include <linux/slab.h>
3648 -#include <linux/thread_info.h>
3649 -#include <linux/syscalls.h>
3650 -#include <xen/interface/physdev.h>
3651 -
3652 -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
3653 -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
3654 -{
3655 -       int i;
3656 -
3657 -       if (new_value)
3658 -               for (i = base; i < base + extent; i++)
3659 -                       __set_bit(i, bitmap);
3660 -       else
3661 -               for (i = base; i < base + extent; i++)
3662 -                       clear_bit(i, bitmap);
3663 -}
3664 -
3665 -/*
3666 - * this changes the io permissions bitmap in the current task.
3667 - */
3668 -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3669 -{
3670 -       struct thread_struct * t = &current->thread;
3671 -       unsigned long *bitmap;
3672 -       struct physdev_set_iobitmap set_iobitmap;
3673 -
3674 -       if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3675 -               return -EINVAL;
3676 -       if (turn_on && !capable(CAP_SYS_RAWIO))
3677 -               return -EPERM;
3678 -
3679 -       /*
3680 -        * If it's the first ioperm() call in this thread's lifetime, set the
3681 -        * IO bitmap up. ioperm() is much less timing critical than clone(),
3682 -        * this is why we delay this operation until now:
3683 -        */
3684 -       if (!t->io_bitmap_ptr) {
3685 -               bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3686 -               if (!bitmap)
3687 -                       return -ENOMEM;
3688 -
3689 -               memset(bitmap, 0xff, IO_BITMAP_BYTES);
3690 -               t->io_bitmap_ptr = bitmap;
3691 -               set_thread_flag(TIF_IO_BITMAP);
3692 -
3693 -               set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3694 -               set_iobitmap.nr_ports = IO_BITMAP_BITS;
3695 -               WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3696 -                                             &set_iobitmap));
3697 -       }
3698 -
3699 -       set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3700 -
3701 -       return 0;
3702 -}
3703 -
3704 -/*
3705 - * sys_iopl has to be used when you want to access the IO ports
3706 - * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3707 - * you'd need 8kB of bitmaps/process, which is a bit excessive.
3708 - *
3709 - */
3710 -
3711 -asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs)
3712 -{
3713 -       unsigned int old_iopl = current->thread.iopl;
3714 -       struct physdev_set_iopl set_iopl;
3715 -
3716 -       if (new_iopl > 3)
3717 -               return -EINVAL;
3718 -
3719 -       /* Need "raw I/O" privileges for direct port access. */
3720 -       if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO))
3721 -               return -EPERM;
3722 -
3723 -       /* Change our version of the privilege levels. */
3724 -       current->thread.iopl = new_iopl;
3725 -
3726 -       /* Force the change at ring 0. */
3727 -       set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl;
3728 -       WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
3729 -
3730 -       return 0;
3731 -}
3732 --- /dev/null
3733 +++ b/arch/x86/kernel/ioport-xen.c
3734 @@ -0,0 +1,112 @@
3735 +/*
3736 + * This contains the io-permission bitmap code - written by obz, with changes
3737 + * by Linus. 32/64 bits code unification by Miguel Botón.
3738 + */
3739 +
3740 +#include <linux/sched.h>
3741 +#include <linux/kernel.h>
3742 +#include <linux/capability.h>
3743 +#include <linux/errno.h>
3744 +#include <linux/types.h>
3745 +#include <linux/ioport.h>
3746 +#include <linux/smp.h>
3747 +#include <linux/stddef.h>
3748 +#include <linux/slab.h>
3749 +#include <linux/thread_info.h>
3750 +#include <linux/syscalls.h>
3751 +#include <xen/interface/physdev.h>
3752 +
3753 +/* Set EXTENT bits starting at BASE in BITMAP to value NEW_VALUE. */
3754 +static void set_bitmap(unsigned long *bitmap, unsigned int base,
3755 +                      unsigned int extent, int new_value)
3756 +{
3757 +       unsigned int i;
3758 +
3759 +       for (i = base; i < base + extent; i++) {
3760 +               if (new_value)
3761 +                       __set_bit(i, bitmap);
3762 +               else
3763 +                       __clear_bit(i, bitmap);
3764 +       }
3765 +}
3766 +
3767 +/*
3768 + * this changes the io permissions bitmap in the current task.
3769 + */
3770 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3771 +{
3772 +       struct thread_struct * t = &current->thread;
3773 +       struct physdev_set_iobitmap set_iobitmap;
3774 +
3775 +       if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3776 +               return -EINVAL;
3777 +       if (turn_on && !capable(CAP_SYS_RAWIO))
3778 +               return -EPERM;
3779 +
3780 +       /*
3781 +        * If it's the first ioperm() call in this thread's lifetime, set the
3782 +        * IO bitmap up. ioperm() is much less timing critical than clone(),
3783 +        * this is why we delay this operation until now:
3784 +        */
3785 +       if (!t->io_bitmap_ptr) {
3786 +               unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3787 +
3788 +               if (!bitmap)
3789 +                       return -ENOMEM;
3790 +
3791 +               memset(bitmap, 0xff, IO_BITMAP_BYTES);
3792 +               t->io_bitmap_ptr = bitmap;
3793 +               set_thread_flag(TIF_IO_BITMAP);
3794 +
3795 +               set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3796 +               set_iobitmap.nr_ports = IO_BITMAP_BITS;
3797 +               WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3798 +                                             &set_iobitmap));
3799 +       }
3800 +
3801 +       set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3802 +
3803 +       return 0;
3804 +}
3805 +
3806 +/*
3807 + * sys_iopl has to be used when you want to access the IO ports
3808 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3809 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
3810 + */
3811 +static int do_iopl(unsigned int level, struct thread_struct *t)
3812 +{
3813 +       unsigned int old = t->iopl >> 12;
3814 +
3815 +       if (level > 3)
3816 +               return -EINVAL;
3817 +       /* Trying to gain more privileges? */
3818 +       if (level > old) {
3819 +               if (!capable(CAP_SYS_RAWIO))
3820 +                       return -EPERM;
3821 +       }
3822 +
3823 +       return 0;
3824 +}
3825 +
3826 +#ifdef CONFIG_X86_32
3827 +asmlinkage long sys_iopl(unsigned long regsp)
3828 +{
3829 +       struct pt_regs *regs = (struct pt_regs *)&regsp;
3830 +       unsigned int level = regs->bx;
3831 +#else
3832 +asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
3833 +{
3834 +#endif
3835 +       struct thread_struct *t = &current->thread;
3836 +       int rc;
3837 +
3838 +       rc = do_iopl(level, t);
3839 +       if (rc < 0)
3840 +               goto out;
3841 +
3842 +       t->iopl = level << 12;
3843 +       set_iopl_mask(t->iopl);
3844 +out:
3845 +       return rc;
3846 +}
3847 --- a/arch/x86/kernel/irq_32-xen.c
3848 +++ b/arch/x86/kernel/irq_32-xen.c
3849 @@ -66,11 +66,11 @@ static union irq_ctx *softirq_ctx[NR_CPU
3850   * SMP cross-CPU interrupts have their own specific
3851   * handlers).
3852   */
3853 -fastcall unsigned int do_IRQ(struct pt_regs *regs)
3854 +unsigned int do_IRQ(struct pt_regs *regs)
3855  {
3856         struct pt_regs *old_regs;
3857         /* high bit used in ret_from_ code */
3858 -       int irq = ~regs->orig_eax;
3859 +       int irq = ~regs->orig_ax;
3860         struct irq_desc *desc = irq_desc + irq;
3861  #ifdef CONFIG_4KSTACKS
3862         union irq_ctx *curctx, *irqctx;
3863 @@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_r
3864  #ifdef CONFIG_DEBUG_STACKOVERFLOW
3865         /* Debugging check for stack overflow: is there less than 1KB free? */
3866         {
3867 -               long esp;
3868 +               long sp;
3869
3870                 __asm__ __volatile__("andl %%esp,%0" :
3871 -                                       "=r" (esp) : "0" (THREAD_SIZE - 1));
3872 -               if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
3873 +                                       "=r" (sp) : "0" (THREAD_SIZE - 1));
3874 +               if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
3875                         printk("do_IRQ: stack overflow: %ld\n",
3876 -                               esp - sizeof(struct thread_info));
3877 +                               sp - sizeof(struct thread_info));
3878                         dump_stack();
3879                 }
3880         }
3881 @@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_r
3882          * current stack (which is the irq stack already after all)
3883          */
3884         if (curctx != irqctx) {
3885 -               int arg1, arg2, ebx;
3886 +               int arg1, arg2, bx;
3887
3888                 /* build the stack frame on the IRQ stack */
3889                 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
3890 @@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_r
3891                         (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
3892
3893                 asm volatile(
3894 -                       "       xchgl  %%ebx,%%esp      \n"
3895 -                       "       call   *%%edi           \n"
3896 -                       "       movl   %%ebx,%%esp      \n"
3897 -                       : "=a" (arg1), "=d" (arg2), "=b" (ebx)
3898 +                       "       xchgl  %%ebx,%%esp    \n"
3899 +                       "       call   *%%edi         \n"
3900 +                       "       movl   %%ebx,%%esp    \n"
3901 +                       : "=a" (arg1), "=d" (arg2), "=b" (bx)
3902                         :  "0" (irq),   "1" (desc),  "2" (isp),
3903                            "D" (desc->handle_irq)
3904                         : "memory", "cc"
3905 --- a/arch/x86/kernel/irq_64-xen.c
3906 +++ b/arch/x86/kernel/irq_64-xen.c
3907 @@ -20,6 +20,28 @@
3908
3909  atomic_t irq_err_count;
3910
3911 +/*
3912 + * 'what should we do if we get a hw irq event on an illegal vector'.
3913 + * each architecture has to answer this themselves.
3914 + */
3915 +void ack_bad_irq(unsigned int irq)
3916 +{
3917 +       printk(KERN_WARNING "unexpected IRQ trap at irq %02x\n", irq);
3918 +#ifdef CONFIG_X86_LOCAL_APIC
3919 +       /*
3920 +        * Currently unexpected vectors happen only on SMP and APIC.
3921 +        * We _must_ ack these because every local APIC has only N
3922 +        * irq slots per priority level, and a 'hanging, unacked' IRQ
3923 +        * holds up an irq slot - in excessive cases (when multiple
3924 +        * unexpected vectors occur) that might lock up the APIC
3925 +        * completely.
3926 +        * But don't ack when the APIC is disabled. -AK
3927 +        */
3928 +       if (!disable_apic)
3929 +               ack_APIC_irq();
3930 +#endif
3931 +}
3932 +
3933  #ifdef CONFIG_DEBUG_STACKOVERFLOW
3934  /*
3935   * Probabilistic stack overflow check:
3936 @@ -33,11 +55,11 @@ static inline void stack_overflow_check(
3937         u64 curbase = (u64)task_stack_page(current);
3938         static unsigned long warned = -60*HZ;
3939
3940 -       if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
3941 -           regs->rsp <  curbase + sizeof(struct thread_info) + 128 &&
3942 +       if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
3943 +           regs->sp <  curbase + sizeof(struct thread_info) + 128 &&
3944             time_after(jiffies, warned + 60*HZ)) {
3945 -               printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
3946 -                      current->comm, curbase, regs->rsp);
3947 +               printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
3948 +                      current->comm, curbase, regs->sp);
3949                 show_stack(NULL,NULL);
3950                 warned = jiffies;
3951         }
3952 @@ -150,7 +172,7 @@ asmlinkage unsigned int do_IRQ(struct pt
3953         struct pt_regs *old_regs = set_irq_regs(regs);
3954
3955         /* high bit used in ret_from_ code  */
3956 -       unsigned irq = ~regs->orig_rax;
3957 +       unsigned irq = ~regs->orig_ax;
3958
3959         /*exit_idle();*/
3960         /*irq_enter();*/
3961 @@ -251,14 +273,3 @@ asmlinkage void do_softirq(void)
3962         }
3963         local_irq_restore(flags);
3964  }
3965 -
3966 -#ifndef CONFIG_X86_LOCAL_APIC
3967 -/*
3968 - * 'what should we do if we get a hw irq event on an illegal vector'.
3969 - * each architecture has to answer this themselves.
3970 - */
3971 -void ack_bad_irq(unsigned int irq)
3972 -{
3973 -        printk("unexpected IRQ trap at irq %02x\n", irq);
3974 -}
3975 -#endif
3976 --- a/arch/x86/kernel/ldt_32-xen.c
3977 +++ /dev/null
3978 @@ -1,265 +0,0 @@
3979 -/*
3980 - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
3981 - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
3982 - */
3983 -
3984 -#include <linux/errno.h>
3985 -#include <linux/sched.h>
3986 -#include <linux/string.h>
3987 -#include <linux/mm.h>
3988 -#include <linux/smp.h>
3989 -#include <linux/vmalloc.h>
3990 -#include <linux/slab.h>
3991 -
3992 -#include <asm/uaccess.h>
3993 -#include <asm/system.h>
3994 -#include <asm/ldt.h>
3995 -#include <asm/desc.h>
3996 -#include <asm/mmu_context.h>
3997 -
3998 -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
3999 -static void flush_ldt(void *null)
4000 -{
4001 -       if (current->active_mm)
4002 -               load_LDT(&current->active_mm->context);
4003 -}
4004 -#endif
4005 -
4006 -static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
4007 -{
4008 -       void *oldldt;
4009 -       void *newldt;
4010 -       int oldsize;
4011 -
4012 -       if (mincount <= pc->size)
4013 -               return 0;
4014 -       oldsize = pc->size;
4015 -       mincount = (mincount+511)&(~511);
4016 -       if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
4017 -               newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
4018 -       else
4019 -               newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
4020 -
4021 -       if (!newldt)
4022 -               return -ENOMEM;
4023 -
4024 -       if (oldsize)
4025 -               memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
4026 -       oldldt = pc->ldt;
4027 -       memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
4028 -       pc->ldt = newldt;
4029 -       wmb();
4030 -       pc->size = mincount;
4031 -       wmb();
4032 -
4033 -       if (reload) {
4034 -#ifdef CONFIG_SMP
4035 -               cpumask_t mask;
4036 -               preempt_disable();
4037 -#endif
4038 -               make_pages_readonly(
4039 -                       pc->ldt,
4040 -                       (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4041 -                       XENFEAT_writable_descriptor_tables);
4042 -               load_LDT(pc);
4043 -#ifdef CONFIG_SMP
4044 -               mask = cpumask_of_cpu(smp_processor_id());
4045 -               if (!cpus_equal(current->mm->cpu_vm_mask, mask))
4046 -                       smp_call_function(flush_ldt, NULL, 1, 1);
4047 -               preempt_enable();
4048 -#endif
4049 -       }
4050 -       if (oldsize) {
4051 -               make_pages_writable(
4052 -                       oldldt,
4053 -                       (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
4054 -                       XENFEAT_writable_descriptor_tables);
4055 -               if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
4056 -                       vfree(oldldt);
4057 -               else
4058 -                       kfree(oldldt);
4059 -       }
4060 -       return 0;
4061 -}
4062 -
4063 -static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
4064 -{
4065 -       int err = alloc_ldt(new, old->size, 0);
4066 -       if (err < 0)
4067 -               return err;
4068 -       memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
4069 -       make_pages_readonly(
4070 -               new->ldt,
4071 -               (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4072 -               XENFEAT_writable_descriptor_tables);
4073 -       return 0;
4074 -}
4075 -
4076 -/*
4077 - * we do not have to muck with descriptors here, that is
4078 - * done in switch_mm() as needed.
4079 - */
4080 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
4081 -{
4082 -       struct mm_struct * old_mm;
4083 -       int retval = 0;
4084 -
4085 -       mutex_init(&mm->context.lock);
4086 -       mm->context.size = 0;
4087 -       mm->context.has_foreign_mappings = 0;
4088 -       old_mm = current->mm;
4089 -       if (old_mm && old_mm->context.size > 0) {
4090 -               mutex_lock(&old_mm->context.lock);
4091 -               retval = copy_ldt(&mm->context, &old_mm->context);
4092 -               mutex_unlock(&old_mm->context.lock);
4093 -       }
4094 -       return retval;
4095 -}
4096 -
4097 -/*
4098 - * No need to lock the MM as we are the last user
4099 - */
4100 -void destroy_context(struct mm_struct *mm)
4101 -{
4102 -       if (mm->context.size) {
4103 -               if (mm == current->active_mm)
4104 -                       clear_LDT();
4105 -               make_pages_writable(
4106 -                       mm->context.ldt,
4107 -                       (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4108 -                       XENFEAT_writable_descriptor_tables);
4109 -               if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
4110 -                       vfree(mm->context.ldt);
4111 -               else
4112 -                       kfree(mm->context.ldt);
4113 -               mm->context.size = 0;
4114 -       }
4115 -}
4116 -
4117 -static int read_ldt(void __user * ptr, unsigned long bytecount)
4118 -{
4119 -       int err;
4120 -       unsigned long size;
4121 -       struct mm_struct * mm = current->mm;
4122 -
4123 -       if (!mm->context.size)
4124 -               return 0;
4125 -       if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
4126 -               bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
4127 -
4128 -       mutex_lock(&mm->context.lock);
4129 -       size = mm->context.size*LDT_ENTRY_SIZE;
4130 -       if (size > bytecount)
4131 -               size = bytecount;
4132 -
4133 -       err = 0;
4134 -       if (copy_to_user(ptr, mm->context.ldt, size))
4135 -               err = -EFAULT;
4136 -       mutex_unlock(&mm->context.lock);
4137 -       if (err < 0)
4138 -               goto error_return;
4139 -       if (size != bytecount) {
4140 -               /* zero-fill the rest */
4141 -               if (clear_user(ptr+size, bytecount-size) != 0) {
4142 -                       err = -EFAULT;
4143 -                       goto error_return;
4144 -               }
4145 -       }
4146 -       return bytecount;
4147 -error_return:
4148 -       return err;
4149 -}
4150 -
4151 -static int read_default_ldt(void __user * ptr, unsigned long bytecount)
4152 -{
4153 -       int err;
4154 -       unsigned long size;
4155 -
4156 -       err = 0;
4157 -       size = 5*sizeof(struct desc_struct);
4158 -       if (size > bytecount)
4159 -               size = bytecount;
4160 -
4161 -       err = size;
4162 -       if (clear_user(ptr, size))
4163 -               err = -EFAULT;
4164 -
4165 -       return err;
4166 -}
4167 -
4168 -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
4169 -{
4170 -       struct mm_struct * mm = current->mm;
4171 -       __u32 entry_1, entry_2;
4172 -       int error;
4173 -       struct user_desc ldt_info;
4174 -
4175 -       error = -EINVAL;
4176 -       if (bytecount != sizeof(ldt_info))
4177 -               goto out;
4178 -       error = -EFAULT;
4179 -       if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
4180 -               goto out;
4181 -
4182 -       error = -EINVAL;
4183 -       if (ldt_info.entry_number >= LDT_ENTRIES)
4184 -               goto out;
4185 -       if (ldt_info.contents == 3) {
4186 -               if (oldmode)
4187 -                       goto out;
4188 -               if (ldt_info.seg_not_present == 0)
4189 -                       goto out;
4190 -       }
4191 -
4192 -       mutex_lock(&mm->context.lock);
4193 -       if (ldt_info.entry_number >= mm->context.size) {
4194 -               error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
4195 -               if (error < 0)
4196 -                       goto out_unlock;
4197 -       }
4198 -
4199 -       /* Allow LDTs to be cleared by the user. */
4200 -       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4201 -               if (oldmode || LDT_empty(&ldt_info)) {
4202 -                       entry_1 = 0;
4203 -                       entry_2 = 0;
4204 -                       goto install;
4205 -               }
4206 -       }
4207 -
4208 -       entry_1 = LDT_entry_a(&ldt_info);
4209 -       entry_2 = LDT_entry_b(&ldt_info);
4210 -       if (oldmode)
4211 -               entry_2 &= ~(1 << 20);
4212 -
4213 -       /* Install the new entry ...  */
4214 -install:
4215 -       error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
4216 -                               entry_1, entry_2);
4217 -
4218 -out_unlock:
4219 -       mutex_unlock(&mm->context.lock);
4220 -out:
4221 -       return error;
4222 -}
4223 -
4224 -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
4225 -{
4226 -       int ret = -ENOSYS;
4227 -
4228 -       switch (func) {
4229 -       case 0:
4230 -               ret = read_ldt(ptr, bytecount);
4231 -               break;
4232 -       case 1:
4233 -               ret = write_ldt(ptr, bytecount, 1);
4234 -               break;
4235 -       case 2:
4236 -               ret = read_default_ldt(ptr, bytecount);
4237 -               break;
4238 -       case 0x11:
4239 -               ret = write_ldt(ptr, bytecount, 0);
4240 -               break;
4241 -       }
4242 -       return ret;
4243 -}
4244 --- a/arch/x86/kernel/ldt_64-xen.c
4245 +++ /dev/null
4246 @@ -1,271 +0,0 @@
4247 -/*
4248 - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
4249 - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4250 - * Copyright (C) 2002 Andi Kleen
4251 - *
4252 - * This handles calls from both 32bit and 64bit mode.
4253 - */
4254 -
4255 -#include <linux/errno.h>
4256 -#include <linux/sched.h>
4257 -#include <linux/string.h>
4258 -#include <linux/mm.h>
4259 -#include <linux/smp.h>
4260 -#include <linux/vmalloc.h>
4261 -#include <linux/slab.h>
4262 -
4263 -#include <asm/uaccess.h>
4264 -#include <asm/system.h>
4265 -#include <asm/ldt.h>
4266 -#include <asm/desc.h>
4267 -#include <asm/proto.h>
4268 -#include <asm/pgalloc.h>
4269 -
4270 -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
4271 -static void flush_ldt(void *null)
4272 -{
4273 -       if (current->active_mm)
4274 -               load_LDT(&current->active_mm->context);
4275 -}
4276 -#endif
4277 -
4278 -static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
4279 -{
4280 -       void *oldldt;
4281 -       void *newldt;
4282 -       unsigned oldsize;
4283 -
4284 -       if (mincount <= (unsigned)pc->size)
4285 -               return 0;
4286 -       oldsize = pc->size;
4287 -       mincount = (mincount+511)&(~511);
4288 -       if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
4289 -               newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
4290 -       else
4291 -               newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
4292 -
4293 -       if (!newldt)
4294 -               return -ENOMEM;
4295 -
4296 -       if (oldsize)
4297 -               memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
4298 -       oldldt = pc->ldt;
4299 -       memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
4300 -       wmb();
4301 -       pc->ldt = newldt;
4302 -       wmb();
4303 -       pc->size = mincount;
4304 -       wmb();
4305 -       if (reload) {
4306 -#ifdef CONFIG_SMP
4307 -               cpumask_t mask;
4308 -
4309 -               preempt_disable();
4310 -#endif
4311 -               make_pages_readonly(
4312 -                       pc->ldt,
4313 -                       (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4314 -                       XENFEAT_writable_descriptor_tables);
4315 -               load_LDT(pc);
4316 -#ifdef CONFIG_SMP
4317 -               mask = cpumask_of_cpu(smp_processor_id());
4318 -               if (!cpus_equal(current->mm->cpu_vm_mask, mask))
4319 -                       smp_call_function(flush_ldt, NULL, 1, 1);
4320 -               preempt_enable();
4321 -#endif
4322 -       }
4323 -       if (oldsize) {
4324 -               make_pages_writable(
4325 -                       oldldt,
4326 -                       (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
4327 -                       XENFEAT_writable_descriptor_tables);
4328 -               if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
4329 -                       vfree(oldldt);
4330 -               else
4331 -                       kfree(oldldt);
4332 -       }
4333 -       return 0;
4334 -}
4335 -
4336 -static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
4337 -{
4338 -       int err = alloc_ldt(new, old->size, 0);
4339 -       if (err < 0)
4340 -               return err;
4341 -       memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
4342 -       make_pages_readonly(
4343 -               new->ldt,
4344 -               (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4345 -               XENFEAT_writable_descriptor_tables);
4346 -       return 0;
4347 -}
4348 -
4349 -/*
4350 - * we do not have to muck with descriptors here, that is
4351 - * done in switch_mm() as needed.
4352 - */
4353 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
4354 -{
4355 -       struct mm_struct * old_mm;
4356 -       int retval = 0;
4357 -
4358 -       memset(&mm->context, 0, sizeof(mm->context));
4359 -       mutex_init(&mm->context.lock);
4360 -       old_mm = current->mm;
4361 -       if (old_mm)
4362 -               mm->context.vdso = old_mm->context.vdso;
4363 -       if (old_mm && old_mm->context.size > 0) {
4364 -               mutex_lock(&old_mm->context.lock);
4365 -               retval = copy_ldt(&mm->context, &old_mm->context);
4366 -               mutex_unlock(&old_mm->context.lock);
4367 -       }
4368 -       return retval;
4369 -}
4370 -
4371 -/*
4372 - *
4373 - * Don't touch the LDT register - we're already in the next thread.
4374 - */
4375 -void destroy_context(struct mm_struct *mm)
4376 -{
4377 -       if (mm->context.size) {
4378 -               if (mm == current->active_mm)
4379 -                       clear_LDT();
4380 -               make_pages_writable(
4381 -                       mm->context.ldt,
4382 -                       (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4383 -                       XENFEAT_writable_descriptor_tables);
4384 -               if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
4385 -                       vfree(mm->context.ldt);
4386 -               else
4387 -                       kfree(mm->context.ldt);
4388 -               mm->context.size = 0;
4389 -       }
4390 -}
4391 -
4392 -static int read_ldt(void __user * ptr, unsigned long bytecount)
4393 -{
4394 -       int err;
4395 -       unsigned long size;
4396 -       struct mm_struct * mm = current->mm;
4397 -
4398 -       if (!mm->context.size)
4399 -               return 0;
4400 -       if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
4401 -               bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
4402 -
4403 -       mutex_lock(&mm->context.lock);
4404 -       size = mm->context.size*LDT_ENTRY_SIZE;
4405 -       if (size > bytecount)
4406 -               size = bytecount;
4407 -
4408 -       err = 0;
4409 -       if (copy_to_user(ptr, mm->context.ldt, size))
4410 -               err = -EFAULT;
4411 -       mutex_unlock(&mm->context.lock);
4412 -       if (err < 0)
4413 -               goto error_return;
4414 -       if (size != bytecount) {
4415 -               /* zero-fill the rest */
4416 -               if (clear_user(ptr+size, bytecount-size) != 0) {
4417 -                       err = -EFAULT;
4418 -                       goto error_return;
4419 -               }
4420 -       }
4421 -       return bytecount;
4422 -error_return:
4423 -       return err;
4424 -}
4425 -
4426 -static int read_default_ldt(void __user * ptr, unsigned long bytecount)
4427 -{
4428 -       /* Arbitrary number */
4429 -       /* x86-64 default LDT is all zeros */
4430 -       if (bytecount > 128)
4431 -               bytecount = 128;
4432 -       if (clear_user(ptr, bytecount))
4433 -               return -EFAULT;
4434 -       return bytecount;
4435 -}
4436 -
4437 -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
4438 -{
4439 -       struct task_struct *me = current;
4440 -       struct mm_struct * mm = me->mm;
4441 -       __u32 entry_1, entry_2, *lp;
4442 -       unsigned long mach_lp;
4443 -       int error;
4444 -       struct user_desc ldt_info;
4445 -
4446 -       error = -EINVAL;
4447 -
4448 -       if (bytecount != sizeof(ldt_info))
4449 -               goto out;
4450 -       error = -EFAULT;
4451 -       if (copy_from_user(&ldt_info, ptr, bytecount))
4452 -               goto out;
4453 -
4454 -       error = -EINVAL;
4455 -       if (ldt_info.entry_number >= LDT_ENTRIES)
4456 -               goto out;
4457 -       if (ldt_info.contents == 3) {
4458 -               if (oldmode)
4459 -                       goto out;
4460 -               if (ldt_info.seg_not_present == 0)
4461 -                       goto out;
4462 -       }
4463 -
4464 -       mutex_lock(&mm->context.lock);
4465 -       if (ldt_info.entry_number >= (unsigned)mm->context.size) {
4466 -               error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
4467 -               if (error < 0)
4468 -                       goto out_unlock;
4469 -       }
4470 -
4471 -       lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
4472 -       mach_lp = arbitrary_virt_to_machine(lp);
4473 -
4474 -       /* Allow LDTs to be cleared by the user. */
4475 -       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4476 -               if (oldmode || LDT_empty(&ldt_info)) {
4477 -                       entry_1 = 0;
4478 -                       entry_2 = 0;
4479 -                       goto install;
4480 -               }
4481 -       }
4482 -
4483 -       entry_1 = LDT_entry_a(&ldt_info);
4484 -       entry_2 = LDT_entry_b(&ldt_info);
4485 -       if (oldmode)
4486 -               entry_2 &= ~(1 << 20);
4487 -
4488 -       /* Install the new entry ...  */
4489 -install:
4490 -       error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
4491 -
4492 -out_unlock:
4493 -       mutex_unlock(&mm->context.lock);
4494 -out:
4495 -       return error;
4496 -}
4497 -
4498 -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
4499 -{
4500 -       int ret = -ENOSYS;
4501 -
4502 -       switch (func) {
4503 -       case 0:
4504 -               ret = read_ldt(ptr, bytecount);
4505 -               break;
4506 -       case 1:
4507 -               ret = write_ldt(ptr, bytecount, 1);
4508 -               break;
4509 -       case 2:
4510 -               ret = read_default_ldt(ptr, bytecount);
4511 -               break;
4512 -       case 0x11:
4513 -               ret = write_ldt(ptr, bytecount, 0);
4514 -               break;
4515 -       }
4516 -       return ret;
4517 -}
4518 --- /dev/null
4519 +++ b/arch/x86/kernel/ldt-xen.c
4520 @@ -0,0 +1,272 @@
4521 +/*
4522 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
4523 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4524 + * Copyright (C) 2002 Andi Kleen
4525 + *
4526 + * This handles calls from both 32bit and 64bit mode.
4527 + */
4528 +
4529 +#include <linux/errno.h>
4530 +#include <linux/sched.h>
4531 +#include <linux/string.h>
4532 +#include <linux/mm.h>
4533 +#include <linux/smp.h>
4534 +#include <linux/vmalloc.h>
4535 +
4536 +#include <asm/uaccess.h>
4537 +#include <asm/system.h>
4538 +#include <asm/ldt.h>
4539 +#include <asm/desc.h>
4540 +#include <asm/mmu_context.h>
4541 +
4542 +#ifdef CONFIG_SMP /* flush_ldt() is only used by the SMP path of alloc_ldt() */
4543 +static void flush_ldt(void *null)
4544 +{
4545 +       if (current->active_mm)
4546 +               load_LDT(&current->active_mm->context);
4547 +}
4548 +#endif /* CONFIG_SMP */
4549 +
4550 +static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
4551 +{
4552 +       void *oldldt, *newldt;
4553 +       int oldsize;
4554 +       /* Grow pc->ldt to at least mincount entries, rounded up to whole pages. */
4555 +       if (mincount <= pc->size)
4556 +               return 0;
4557 +       oldsize = pc->size;
4558 +       mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
4559 +                       (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
4560 +       if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
4561 +               newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
4562 +       else
4563 +               newldt = (void *)__get_free_page(GFP_KERNEL);
4564 +
4565 +       if (!newldt)
4566 +               return -ENOMEM;
4567 +
4568 +       if (oldsize)
4569 +               memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
4570 +       oldldt = pc->ldt;
4571 +       memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
4572 +              (mincount - oldsize) * LDT_ENTRY_SIZE);
4573 +       /* Store the new table pointer before the new size, barriers between. */
4574 +#ifdef CONFIG_X86_64
4575 +       /* CHECKME: Do we really need this ? */
4576 +       wmb();
4577 +#endif
4578 +       pc->ldt = newldt;
4579 +       wmb();
4580 +       pc->size = mincount;
4581 +       wmb();
4582 +       /* reload: write-protect the table, load it, kick other CPUs if needed. */
4583 +       if (reload) {
4584 +#ifdef CONFIG_SMP
4585 +               cpumask_t mask;
4586 +
4587 +               preempt_disable();
4588 +#endif
4589 +               make_pages_readonly(newldt,
4590 +                                   (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE,
4591 +                                   XENFEAT_writable_descriptor_tables);
4592 +               load_LDT(pc);
4593 +#ifdef CONFIG_SMP
4594 +               mask = cpumask_of_cpu(smp_processor_id());
4595 +               if (!cpus_equal(current->mm->cpu_vm_mask, mask))
4596 +                       smp_call_function(flush_ldt, NULL, 1, 1);
4597 +               preempt_enable();
4598 +#endif
4599 +       }
4600 +       if (oldsize) {
4601 +               make_pages_writable(oldldt,
4602 +                                   (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
4603 +                                   XENFEAT_writable_descriptor_tables);
4604 +               if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
4605 +                       vfree(oldldt);
4606 +               else
4607 +                       put_page(virt_to_page(oldldt));
4608 +       }
4609 +       return 0;
4610 +}
4611 +
4612 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
4613 +{
4614 +       int err = alloc_ldt(new, old->size, 0);
4615 +       /* reload=0: the copy is filled and write-protected here, not loaded. */
4616 +       if (err < 0)
4617 +               return err;
4618 +       memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
4619 +       make_pages_readonly(new->ldt,
4620 +                           (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4621 +                           XENFEAT_writable_descriptor_tables);
4622 +       return 0;
4623 +}
4624 +
4625 +/*
4626 + * Set up mm->context for a new mm, inheriting current->mm's vdso and LDT.
4627 + * We do not have to muck with descriptors here; switch_mm() does it as needed.
4628 + */
4629 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
4630 +{
4631 +       struct mm_struct *old_mm;
4632 +       int retval = 0;
4633 +
4634 +       memset(&mm->context, 0, sizeof(mm->context));
4635 +       mutex_init(&mm->context.lock);
4636 +       old_mm = current->mm;
4637 +       if (old_mm)
4638 +               mm->context.vdso = old_mm->context.vdso;
4639 +       if (old_mm && old_mm->context.size > 0) {
4640 +               mutex_lock(&old_mm->context.lock);
4641 +               retval = copy_ldt(&mm->context, &old_mm->context);
4642 +               mutex_unlock(&old_mm->context.lock);
4643 +       }
4644 +       return retval;
4645 +}
4646 +
4647 +/*
4648 + * No need to lock the MM as we are the last user.  Makes the LDT pages
4649 + * writable again and frees them (vfree or put_page, matching alloc_ldt()).
4650 + * 64bit: Don't touch the LDT register - we're already in the next thread.
4651 + */
4652 +void destroy_context(struct mm_struct *mm)
4653 +{
4654 +       if (mm->context.size) {
4655 +               /* CHECKME: Can this ever happen ? */
4656 +               if (mm == current->active_mm)
4657 +                       clear_LDT();
4658 +               make_pages_writable(mm->context.ldt,
4659 +                                   (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4660 +                                   XENFEAT_writable_descriptor_tables);
4661 +               if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
4662 +                       vfree(mm->context.ldt);
4663 +               else
4664 +                       put_page(virt_to_page(mm->context.ldt));
4665 +               mm->context.size = 0;
4666 +       }
4667 +}
4668 +
4669 +static int read_ldt(void __user *ptr, unsigned long bytecount)
4670 +{
4671 +       int err;
4672 +       unsigned long size;
4673 +       struct mm_struct *mm = current->mm;
4674 +       /* Copy the LDT to ptr, zero-filled up to bytecount; 0 if there is none. */
4675 +       if (!mm->context.size)
4676 +               return 0;
4677 +       if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
4678 +               bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
4679 +
4680 +       mutex_lock(&mm->context.lock);
4681 +       size = mm->context.size * LDT_ENTRY_SIZE;
4682 +       if (size > bytecount)
4683 +               size = bytecount;
4684 +
4685 +       err = 0;
4686 +       if (copy_to_user(ptr, mm->context.ldt, size))
4687 +               err = -EFAULT;
4688 +       mutex_unlock(&mm->context.lock);
4689 +       if (err < 0)
4690 +               goto error_return;
4691 +       if (size != bytecount) {
4692 +               /* zero-fill the rest */
4693 +               if (clear_user(ptr + size, bytecount - size) != 0) {
4694 +                       err = -EFAULT;
4695 +                       goto error_return;
4696 +               }
4697 +       }
4698 +       return bytecount;
4699 +error_return:
4700 +       return err;
4701 +}
4702 +
4703 +static int read_default_ldt(void __user *ptr, unsigned long bytecount)
4704 +{
4705 +       /* Zero-fills ptr. CHECKME: Can we use _one_ random number ? */
4706 +#ifdef CONFIG_X86_32
4707 +       unsigned long size = 5 * sizeof(struct desc_struct);
4708 +#else
4709 +       unsigned long size = 128;
4710 +#endif
4711 +       if (bytecount > size)
4712 +               bytecount = size;
4713 +       if (clear_user(ptr, bytecount))
4714 +               return -EFAULT;
4715 +       return bytecount;
4716 +}
4717 +
4718 +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
4719 +{
4720 +       struct mm_struct *mm = current->mm;
4721 +       struct desc_struct ldt;
4722 +       int error;
4723 +       struct user_desc ldt_info;
4724 +       /* Validate the user_desc at ptr and install it as one LDT entry. */
4725 +       error = -EINVAL;
4726 +       if (bytecount != sizeof(ldt_info))
4727 +               goto out;
4728 +       error = -EFAULT;
4729 +       if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
4730 +               goto out;
4731 +
4732 +       error = -EINVAL;
4733 +       if (ldt_info.entry_number >= LDT_ENTRIES)
4734 +               goto out;
4735 +       if (ldt_info.contents == 3) {
4736 +               if (oldmode)
4737 +                       goto out;
4738 +               if (ldt_info.seg_not_present == 0)
4739 +                       goto out;
4740 +       }
4741 +       /* Under the context lock: grow the table if the slot lies past its end. */
4742 +       mutex_lock(&mm->context.lock);
4743 +       if (ldt_info.entry_number >= mm->context.size) {
4744 +               error = alloc_ldt(&current->mm->context,
4745 +                                 ldt_info.entry_number + 1, 1);
4746 +               if (error < 0)
4747 +                       goto out_unlock;
4748 +       }
4749 +
4750 +       /* Allow LDTs to be cleared by the user. */
4751 +       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4752 +               if (oldmode || LDT_empty(&ldt_info)) {
4753 +                       memset(&ldt, 0, sizeof(ldt));
4754 +                       goto install;
4755 +               }
4756 +       }
4757 +
4758 +       fill_ldt(&ldt, &ldt_info);
4759 +       if (oldmode)
4760 +               ldt.avl = 0;
4761 +
4762 +       /* Install the new entry ...  */
4763 +install:
4764 +       error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
4765 +
4766 +out_unlock:
4767 +       mutex_unlock(&mm->context.lock);
4768 +out:
4769 +       return error;
4770 +}
4771 +
4772 +asmlinkage int sys_modify_ldt(int func, void __user *ptr,
4773 +                             unsigned long bytecount)
4774 +{
4775 +       int ret = -ENOSYS;
4776 +       /* 0: read, 1: write (oldmode), 2: read default, 0x11: write, else -ENOSYS */
4777 +       switch (func) {
4778 +       case 0:
4779 +               ret = read_ldt(ptr, bytecount);
4780 +               break;
4781 +       case 1:
4782 +               ret = write_ldt(ptr, bytecount, 1);
4783 +               break;
4784 +       case 2:
4785 +               ret = read_default_ldt(ptr, bytecount);
4786 +               break;
4787 +       case 0x11:
4788 +               ret = write_ldt(ptr, bytecount, 0);
4789 +               break;
4790 +       }
4791 +       return ret;
4792 +}
4793 --- a/arch/x86/kernel/machine_kexec_64.c
4794 +++ b/arch/x86/kernel/machine_kexec_64.c
4795 @@ -300,7 +300,9 @@ void machine_kexec(struct kimage *image)
4796
4797  void arch_crash_save_vmcoreinfo(void)
4798  {
4799 +#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */
4800         VMCOREINFO_SYMBOL(phys_base);
4801 +#endif
4802         VMCOREINFO_SYMBOL(init_level4_pgt);
4803
4804  #ifdef CONFIG_NUMA
4805 --- a/arch/x86/kernel/Makefile
4806 +++ b/arch/x86/kernel/Makefile
4807 @@ -120,11 +120,10 @@ ifeq ($(CONFIG_X86_64),y)
4808
4809          obj-$(CONFIG_PCI_MMCONFIG)     += mmconf-fam10h_64.o
4810
4811 +       obj-$(CONFIG_XEN)               += nmi_64.o
4812         time_64-$(CONFIG_XEN)           += time_32.o
4813         pci-dma_64-$(CONFIG_XEN)        += pci-dma_32.o
4814  endif
4815
4816  disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
4817         smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
4818 -disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += mpparse_64.o
4819 -%/head_64.o %/head_64.s: asflags-$(CONFIG_XEN) :=
4820 --- a/arch/x86/kernel/microcode-xen.c
4821 +++ b/arch/x86/kernel/microcode-xen.c
4822 @@ -167,7 +167,7 @@ static int request_microcode(void)
4823         }
4824
4825         op.cmd = XENPF_microcode_update;
4826 -       set_xen_guest_handle(op.u.microcode.data, (void *)firmware->data);
4827 +       set_xen_guest_handle(op.u.microcode.data, firmware->data);
4828         op.u.microcode.length = firmware->size;
4829         error = HYPERVISOR_platform_op(&op);
4830
4831 --- a/arch/x86/kernel/mpparse_32-xen.c
4832 +++ b/arch/x86/kernel/mpparse_32-xen.c
4833 @@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0;
4834  /* Processor that is doing the boot up */
4835  unsigned int boot_cpu_physical_apicid = -1U;
4836  /* Internal processor count */
4837 -unsigned int __cpuinitdata num_processors;
4838 +unsigned int num_processors;
4839
4840  /* Bitmask of physically existing CPUs */
4841  physid_mask_t phys_cpu_present_map;
4842 @@ -265,7 +265,7 @@ static void __init MP_ioapic_info (struc
4843         if (!(m->mpc_flags & MPC_APIC_USABLE))
4844                 return;
4845
4846 -       printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
4847 +       printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
4848                 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
4849         if (nr_ioapics >= MAX_IO_APICS) {
4850                 printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
4851 @@ -412,9 +412,9 @@ static int __init smp_read_mpc(struct mp
4852
4853         mps_oem_check(mpc, oem, str);
4854
4855 -       printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
4856 +       printk("APIC at: 0x%X\n", mpc->mpc_lapic);
4857
4858 -       /*
4859 +       /*
4860          * Save the local APIC address (it might be non-default) -- but only
4861          * if we're not using ACPI.
4862          */
4863 @@ -728,7 +728,7 @@ static int __init smp_scan_config (unsig
4864         unsigned long *bp = isa_bus_to_virt(base);
4865         struct intel_mp_floating *mpf;
4866
4867 -       Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
4868 +       printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
4869         if (sizeof(*mpf) != 16)
4870                 printk("Error: MPF size\n");
4871
4872 @@ -742,9 +742,10 @@ static int __init smp_scan_config (unsig
4873
4874                         smp_found_config = 1;
4875  #ifndef CONFIG_XEN
4876 -                       printk(KERN_INFO "found SMP MP-table at %08lx\n",
4877 -                                               virt_to_phys(mpf));
4878 -                       reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
4879 +                       printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
4880 +                               mpf, virt_to_phys(mpf));
4881 +                       reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
4882 +                                       BOOTMEM_DEFAULT);
4883                         if (mpf->mpf_physptr) {
4884                                 /*
4885                                  * We cannot access to MPC table to compute
4886 @@ -759,11 +760,12 @@ static int __init smp_scan_config (unsig
4887                                 unsigned long end = max_low_pfn * PAGE_SIZE;
4888                                 if (mpf->mpf_physptr + size > end)
4889                                         size = end - mpf->mpf_physptr;
4890 -                               reserve_bootmem(mpf->mpf_physptr, size);
4891 +                               reserve_bootmem(mpf->mpf_physptr, size,
4892 +                                               BOOTMEM_DEFAULT);
4893                         }
4894  #else
4895 -                       printk(KERN_INFO "found SMP MP-table at %08lx\n",
4896 -                               ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
4897 +                       printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
4898 +                               mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
4899  #endif
4900
4901                         mpf_found = mpf;
4902 @@ -940,14 +942,14 @@ void __init mp_register_ioapic(u8 id, u3
4903          */
4904         mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
4905         mp_ioapic_routing[idx].gsi_base = gsi_base;
4906 -       mp_ioapic_routing[idx].gsi_end = gsi_base +
4907 +       mp_ioapic_routing[idx].gsi_end = gsi_base +
4908                 io_apic_get_redir_entries(idx);
4909
4910 -       printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
4911 -               "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
4912 -               mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
4913 -               mp_ioapic_routing[idx].gsi_base,
4914 -               mp_ioapic_routing[idx].gsi_end);
4915 +       printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4916 +              "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
4917 +              mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
4918 +              mp_ioapic_routing[idx].gsi_base,
4919 +              mp_ioapic_routing[idx].gsi_end);
4920  }
4921
4922  void __init
4923 @@ -1063,15 +1065,16 @@ void __init mp_config_acpi_legacy_irqs (
4924  }
4925
4926  #define MAX_GSI_NUM    4096
4927 +#define IRQ_COMPRESSION_START  64
4928
4929  int mp_register_gsi(u32 gsi, int triggering, int polarity)
4930  {
4931         int ioapic = -1;
4932         int ioapic_pin = 0;
4933         int idx, bit = 0;
4934 -       static int pci_irq = 16;
4935 +       static int pci_irq = IRQ_COMPRESSION_START;
4936         /*
4937 -        * Mapping between Global System Interrups, which
4938 +        * Mapping between Global System Interrupts, which
4939          * represent all possible interrupts, and IRQs
4940          * assigned to actual devices.
4941          */
4942 @@ -1108,12 +1111,16 @@ int mp_register_gsi(u32 gsi, int trigger
4943         if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
4944                 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
4945                         mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
4946 -               return gsi_to_irq[gsi];
4947 +               return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
4948         }
4949
4950         mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
4951
4952 -       if (triggering == ACPI_LEVEL_SENSITIVE) {
4953 +       /*
4954 +        * For GSI >= 64, use IRQ compression
4955 +        */
4956 +       if ((gsi >= IRQ_COMPRESSION_START)
4957 +               && (triggering == ACPI_LEVEL_SENSITIVE)) {
4958                 /*
4959                  * For PCI devices assign IRQs in order, avoiding gaps
4960                  * due to unused I/O APIC pins.
4961 --- a/arch/x86/kernel/mpparse_64-xen.c
4962 +++ b/arch/x86/kernel/mpparse_64-xen.c
4963 @@ -60,14 +60,20 @@ unsigned int boot_cpu_id = -1U;
4964  EXPORT_SYMBOL(boot_cpu_id);
4965
4966  /* Internal processor count */
4967 -unsigned int num_processors __cpuinitdata = 0;
4968 +unsigned int num_processors;
4969
4970  unsigned disabled_cpus __cpuinitdata;
4971
4972  /* Bitmask of physically existing CPUs */
4973  physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
4974
4975 -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
4976 +#ifndef CONFIG_XEN
4977 +u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
4978 +                               = { [0 ... NR_CPUS-1] = BAD_APICID };
4979 +void *x86_bios_cpu_apicid_early_ptr;
4980 +#endif
4981 +DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
4982 +EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
4983
4984
4985  /*
4986 @@ -119,24 +125,22 @@ static void __cpuinit MP_processor_info(
4987         physid_set(m->mpc_apicid, phys_cpu_present_map);
4988         if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
4989                 /*
4990 -                * bios_cpu_apicid is required to have processors listed
4991 +                * x86_bios_cpu_apicid is required to have processors listed
4992                  * in same order as logical cpu numbers. Hence the first
4993                  * entry is BSP, and so on.
4994                  */
4995                 cpu = 0;
4996         }
4997 -       bios_cpu_apicid[cpu] = m->mpc_apicid;
4998 -       /*
4999 -        * We get called early in the the start_kernel initialization
5000 -        * process when the per_cpu data area is not yet setup, so we
5001 -        * use a static array that is removed after the per_cpu data
5002 -        * area is created.
5003 -        */
5004 -       if (x86_cpu_to_apicid_ptr) {
5005 -               u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
5006 -               x86_cpu_to_apicid[cpu] = m->mpc_apicid;
5007 +       /* are we being called early in kernel startup? */
5008 +       if (x86_cpu_to_apicid_early_ptr) {
5009 +               u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
5010 +               u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
5011 +
5012 +               cpu_to_apicid[cpu] = m->mpc_apicid;
5013 +               bios_cpu_apicid[cpu] = m->mpc_apicid;
5014         } else {
5015                 per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
5016 +               per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
5017         }
5018
5019         cpu_set(cpu, cpu_possible_map);
5020 --- a/arch/x86/kernel/pci-dma-xen.c
5021 +++ b/arch/x86/kernel/pci-dma-xen.c
5022 @@ -434,3 +434,23 @@ dma_sync_single_for_device(struct device
5023                 swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
5024  }
5025  EXPORT_SYMBOL(dma_sync_single_for_device);
5026 +
5027 +void
5028 +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
5029 +                   enum dma_data_direction direction)
5030 +{
5031 +       if (swiotlb)    /* the sync itself is delegated to swiotlb when in use */
5032 +               swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
5033 +       flush_write_buffers();  /* runs whether or not swiotlb is in use */
5034 +}
5035 +EXPORT_SYMBOL(dma_sync_sg_for_cpu);
5036 +
5037 +void
5038 +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
5039 +                   enum dma_data_direction direction)
5040 +{
5041 +       if (swiotlb)    /* the sync itself is delegated to swiotlb when in use */
5042 +               swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
5043 +       flush_write_buffers();  /* runs whether or not swiotlb is in use */
5044 +}
5045 +EXPORT_SYMBOL(dma_sync_sg_for_device);
5046 --- a/arch/x86/kernel/process_32-xen.c
5047 +++ b/arch/x86/kernel/process_32-xen.c
5048 @@ -23,7 +23,6 @@
5049  #include <linux/slab.h>
5050  #include <linux/vmalloc.h>
5051  #include <linux/user.h>
5052 -#include <linux/a.out.h>
5053  #include <linux/interrupt.h>
5054  #include <linux/utsname.h>
5055  #include <linux/delay.h>
5056 @@ -59,8 +58,10 @@
5057
5058  #include <asm/tlbflush.h>
5059  #include <asm/cpu.h>
5060 +#include <asm/kdebug.h>
5061
5062  asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
5063 +asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
5064
5065  static int hlt_counter;
5066
5067 @@ -78,7 +79,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
5068   */
5069  unsigned long thread_saved_pc(struct task_struct *tsk)
5070  {
5071 -       return ((unsigned long *)tsk->thread.esp)[3];
5072 +       return ((unsigned long *)tsk->thread.sp)[3];
5073  }
5074
5075  /*
5076 @@ -86,7 +87,6 @@ unsigned long thread_saved_pc(struct tas
5077   */
5078  void (*pm_idle)(void);
5079  EXPORT_SYMBOL(pm_idle);
5080 -static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
5081
5082  void disable_hlt(void)
5083  {
5084 @@ -107,7 +107,7 @@ EXPORT_SYMBOL(enable_hlt);
5085   * to poll the ->work.need_resched flag instead of waiting for the
5086   * cross-CPU IPI to arrive. Use this option with caution.
5087   */
5088 -static void poll_idle (void)
5089 +static void poll_idle(void)
5090  {
5091         cpu_relax();
5092  }
5093 @@ -122,10 +122,19 @@ static void xen_idle(void)
5094         smp_mb();
5095
5096         local_irq_disable();
5097 -       if (!need_resched())
5098 +       if (!need_resched()) {
5099 +               ktime_t t0, t1;
5100 +               u64 t0n, t1n;
5101 +
5102 +               t0 = ktime_get();
5103 +               t0n = ktime_to_ns(t0);
5104                 safe_halt();    /* enables interrupts racelessly */
5105 -       else
5106 -               local_irq_enable();
5107 +               local_irq_disable();
5108 +               t1 = ktime_get();
5109 +               t1n = ktime_to_ns(t1);
5110 +               sched_clock_idle_wakeup_event(t1n - t0n);
5111 +       }
5112 +       local_irq_enable();
5113         current_thread_info()->status |= TS_POLLING;
5114  }
5115  #ifdef CONFIG_APM_MODULE
5116 @@ -168,13 +177,13 @@ void cpu_idle(void)
5117                 while (!need_resched()) {
5118                         void (*idle)(void);
5119
5120 -                       if (__get_cpu_var(cpu_idle_state))
5121 -                               __get_cpu_var(cpu_idle_state) = 0;
5122 -
5123                         check_pgt_cache();
5124                         rmb();
5125                         idle = xen_idle; /* no alternatives */
5126
5127 +                       if (rcu_pending(cpu))
5128 +                               rcu_check_callbacks(cpu, 0);
5129 +
5130                         if (cpu_is_offline(cpu))
5131                                 play_dead();
5132
5133 @@ -192,40 +201,19 @@ static void do_nothing(void *unused)
5134  {
5135  }
5136
5137 +/*
5138 + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
5139 + * pm_idle and update to new pm_idle value. Required while changing pm_idle
5140 + * handler on SMP systems.
5141 + *
5142 + * Caller must have changed pm_idle to the new value before the call. Old
5143 + * pm_idle value will not be used by any CPU after the return of this function.
5144 + */
5145  void cpu_idle_wait(void)
5146  {
5147 -       unsigned int cpu, this_cpu = get_cpu();
5148 -       cpumask_t map, tmp = current->cpus_allowed;
5149 -
5150 -       set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5151 -       put_cpu();
5152 -
5153 -       cpus_clear(map);
5154 -       for_each_online_cpu(cpu) {
5155 -               per_cpu(cpu_idle_state, cpu) = 1;
5156 -               cpu_set(cpu, map);
5157 -       }
5158 -
5159 -       __get_cpu_var(cpu_idle_state) = 0;
5160 -
5161 -       wmb();
5162 -       do {
5163 -               ssleep(1);
5164 -               for_each_online_cpu(cpu) {
5165 -                       if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
5166 -                               cpu_clear(cpu, map);
5167 -               }
5168 -               cpus_and(map, map, cpu_online_map);
5169 -               /*
5170 -                * We waited 1 sec, if a CPU still did not call idle
5171 -                * it may be because it is in idle and not waking up
5172 -                * because it has nothing to do.
5173 -                * Give all the remaining CPUS a kick.
5174 -                */
5175 -               smp_call_function_mask(map, do_nothing, 0, 0);
5176 -       } while (!cpus_empty(map));
5177 -
5178 -       set_cpus_allowed(current, tmp);
5179 +       smp_mb();
5180 +       /* kick all the CPUs so that they exit out of pm_idle */
5181 +       smp_call_function(do_nothing, NULL, 0, 1);
5182  }
5183  EXPORT_SYMBOL_GPL(cpu_idle_wait);
5184
5185 @@ -251,15 +239,15 @@ void __show_registers(struct pt_regs *re
5186  {
5187         unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
5188         unsigned long d0, d1, d2, d3, d6, d7;
5189 -       unsigned long esp;
5190 +       unsigned long sp;
5191         unsigned short ss, gs;
5192
5193         if (user_mode_vm(regs)) {
5194 -               esp = regs->esp;
5195 -               ss = regs->xss & 0xffff;
5196 +               sp = regs->sp;
5197 +               ss = regs->ss & 0xffff;
5198                 savesegment(gs, gs);
5199         } else {
5200 -               esp = (unsigned long) (&regs->esp);
5201 +               sp = (unsigned long) (&regs->sp);
5202                 savesegment(ss, ss);
5203                 savesegment(gs, gs);
5204         }
5205 @@ -272,17 +260,17 @@ void __show_registers(struct pt_regs *re
5206                         init_utsname()->version);
5207
5208         printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
5209 -                       0xffff & regs->xcs, regs->eip, regs->eflags,
5210 +                       0xffff & regs->cs, regs->ip, regs->flags,
5211                         smp_processor_id());
5212 -       print_symbol("EIP is at %s\n", regs->eip);
5213 +       print_symbol("EIP is at %s\n", regs->ip);
5214
5215         printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
5216 -               regs->eax, regs->ebx, regs->ecx, regs->edx);
5217 +               regs->ax, regs->bx, regs->cx, regs->dx);
5218         printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
5219 -               regs->esi, regs->edi, regs->ebp, esp);
5220 +               regs->si, regs->di, regs->bp, sp);
5221         printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
5222 -              regs->xds & 0xffff, regs->xes & 0xffff,
5223 -              regs->xfs & 0xffff, gs, ss);
5224 +              regs->ds & 0xffff, regs->es & 0xffff,
5225 +              regs->fs & 0xffff, gs, ss);
5226
5227         if (!all)
5228                 return;
5229 @@ -310,12 +298,12 @@ void __show_registers(struct pt_regs *re
5230  void show_regs(struct pt_regs *regs)
5231  {
5232         __show_registers(regs, 1);
5233 -       show_trace(NULL, regs, &regs->esp);
5234 +       show_trace(NULL, regs, &regs->sp, regs->bp);
5235  }
5236
5237  /*
5238 - * This gets run with %ebx containing the
5239 - * function to call, and %edx containing
5240 + * This gets run with %bx containing the
5241 + * function to call, and %dx containing
5242   * the "args".
5243   */
5244  extern void kernel_thread_helper(void);
5245 @@ -329,16 +317,16 @@ int kernel_thread(int (*fn)(void *), voi
5246
5247         memset(&regs, 0, sizeof(regs));
5248
5249 -       regs.ebx = (unsigned long) fn;
5250 -       regs.edx = (unsigned long) arg;
5251 +       regs.bx = (unsigned long) fn;
5252 +       regs.dx = (unsigned long) arg;
5253
5254 -       regs.xds = __USER_DS;
5255 -       regs.xes = __USER_DS;
5256 -       regs.xfs = __KERNEL_PERCPU;
5257 -       regs.orig_eax = -1;
5258 -       regs.eip = (unsigned long) kernel_thread_helper;
5259 -       regs.xcs = __KERNEL_CS | get_kernel_rpl();
5260 -       regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
5261 +       regs.ds = __USER_DS;
5262 +       regs.es = __USER_DS;
5263 +       regs.fs = __KERNEL_PERCPU;
5264 +       regs.orig_ax = -1;
5265 +       regs.ip = (unsigned long) kernel_thread_helper;
5266 +       regs.cs = __KERNEL_CS | get_kernel_rpl();
5267 +       regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
5268
5269         /* Ok, create the new process.. */
5270         return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
5271 @@ -368,7 +356,12 @@ void flush_thread(void)
5272  {
5273         struct task_struct *tsk = current;
5274
5275 -       memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
5276 +       tsk->thread.debugreg0 = 0;
5277 +       tsk->thread.debugreg1 = 0;
5278 +       tsk->thread.debugreg2 = 0;
5279 +       tsk->thread.debugreg3 = 0;
5280 +       tsk->thread.debugreg6 = 0;
5281 +       tsk->thread.debugreg7 = 0;
5282         memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5283         clear_tsk_thread_flag(tsk, TIF_DEBUG);
5284         /*
5285 @@ -393,7 +386,7 @@ void prepare_to_copy(struct task_struct
5286         unlazy_fpu(tsk);
5287  }
5288
5289 -int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
5290 +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
5291         unsigned long unused,
5292         struct task_struct * p, struct pt_regs * regs)
5293  {
5294 @@ -403,17 +396,19 @@ int copy_thread(int nr, unsigned long cl
5295
5296         childregs = task_pt_regs(p);
5297         *childregs = *regs;
5298 -       childregs->eax = 0;
5299 -       childregs->esp = esp;
5300 +       childregs->ax = 0;
5301 +       childregs->sp = sp;
5302
5303 -       p->thread.esp = (unsigned long) childregs;
5304 -       p->thread.esp0 = (unsigned long) (childregs+1);
5305 +       p->thread.sp = (unsigned long) childregs;
5306 +       p->thread.sp0 = (unsigned long) (childregs+1);
5307
5308 -       p->thread.eip = (unsigned long) ret_from_fork;
5309 +       p->thread.ip = (unsigned long) ret_from_fork;
5310
5311 -       savesegment(gs,p->thread.gs);
5312 +       savesegment(gs, p->thread.gs);
5313
5314         tsk = current;
5315 +       if (test_tsk_thread_flag(tsk, TIF_CSTAR))
5316 +               p->thread.ip = (unsigned long) cstar_ret_from_fork;
5317         if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
5318                 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
5319                                                 IO_BITMAP_BYTES, GFP_KERNEL);
5320 @@ -424,34 +419,17 @@ int copy_thread(int nr, unsigned long cl
5321                 set_tsk_thread_flag(p, TIF_IO_BITMAP);
5322         }
5323
5324 +       err = 0;
5325 +
5326         /*
5327          * Set a new TLS for the child thread?
5328          */
5329 -       if (clone_flags & CLONE_SETTLS) {
5330 -               struct desc_struct *desc;
5331 -               struct user_desc info;
5332 -               int idx;
5333 -
5334 -               err = -EFAULT;
5335 -               if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
5336 -                       goto out;
5337 -               err = -EINVAL;
5338 -               if (LDT_empty(&info))
5339 -                       goto out;
5340 -
5341 -               idx = info.entry_number;
5342 -               if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5343 -                       goto out;
5344 -
5345 -               desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
5346 -               desc->a = LDT_entry_a(&info);
5347 -               desc->b = LDT_entry_b(&info);
5348 -       }
5349 +       if (clone_flags & CLONE_SETTLS)
5350 +               err = do_set_thread_area(p, -1,
5351 +                       (struct user_desc __user *)childregs->si, 0);
5352
5353         p->thread.iopl = current->thread.iopl;
5354
5355 -       err = 0;
5356 - out:
5357         if (err && p->thread.io_bitmap_ptr) {
5358                 kfree(p->thread.io_bitmap_ptr);
5359                 p->thread.io_bitmap_max = 0;
5360 @@ -459,67 +437,8 @@ int copy_thread(int nr, unsigned long cl
5361         return err;
5362  }
5363
5364 -/*
5365 - * fill in the user structure for a core dump..
5366 - */
5367 -void dump_thread(struct pt_regs * regs, struct user * dump)
5368 -{
5369 -       int i;
5370 -
5371 -/* changed the size calculations - should hopefully work better. lbt */
5372 -       dump->magic = CMAGIC;
5373 -       dump->start_code = 0;
5374 -       dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
5375 -       dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
5376 -       dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
5377 -       dump->u_dsize -= dump->u_tsize;
5378 -       dump->u_ssize = 0;
5379 -       for (i = 0; i < 8; i++)
5380 -               dump->u_debugreg[i] = current->thread.debugreg[i];
5381 -
5382 -       if (dump->start_stack < TASK_SIZE)
5383 -               dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
5384 -
5385 -       dump->regs.ebx = regs->ebx;
5386 -       dump->regs.ecx = regs->ecx;
5387 -       dump->regs.edx = regs->edx;
5388 -       dump->regs.esi = regs->esi;
5389 -       dump->regs.edi = regs->edi;
5390 -       dump->regs.ebp = regs->ebp;
5391 -       dump->regs.eax = regs->eax;
5392 -       dump->regs.ds = regs->xds;
5393 -       dump->regs.es = regs->xes;
5394 -       dump->regs.fs = regs->xfs;
5395 -       savesegment(gs,dump->regs.gs);
5396 -       dump->regs.orig_eax = regs->orig_eax;
5397 -       dump->regs.eip = regs->eip;
5398 -       dump->regs.cs = regs->xcs;
5399 -       dump->regs.eflags = regs->eflags;
5400 -       dump->regs.esp = regs->esp;
5401 -       dump->regs.ss = regs->xss;
5402 -
5403 -       dump->u_fpvalid = dump_fpu (regs, &dump->i387);
5404 -}
5405 -EXPORT_SYMBOL(dump_thread);
5406 -
5407 -/*
5408 - * Capture the user space registers if the task is not running (in user space)
5409 - */
5410 -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
5411 -{
5412 -       struct pt_regs ptregs = *task_pt_regs(tsk);
5413 -       ptregs.xcs &= 0xffff;
5414 -       ptregs.xds &= 0xffff;
5415 -       ptregs.xes &= 0xffff;
5416 -       ptregs.xss &= 0xffff;
5417 -
5418 -       elf_core_copy_regs(regs, &ptregs);
5419 -
5420 -       return 1;
5421 -}
5422 -
5423  #ifdef CONFIG_SECCOMP
5424 -void hard_disable_TSC(void)
5425 +static void hard_disable_TSC(void)
5426  {
5427         write_cr4(read_cr4() | X86_CR4_TSD);
5428  }
5429 @@ -534,7 +453,7 @@ void disable_TSC(void)
5430                 hard_disable_TSC();
5431         preempt_enable();
5432  }
5433 -void hard_enable_TSC(void)
5434 +static void hard_enable_TSC(void)
5435  {
5436         write_cr4(read_cr4() & ~X86_CR4_TSD);
5437  }
5438 @@ -543,18 +462,32 @@ void hard_enable_TSC(void)
5439  static noinline void
5440  __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
5441  {
5442 -       struct thread_struct *next;
5443 +       struct thread_struct *prev, *next;
5444 +       unsigned long debugctl;
5445
5446 +       prev = &prev_p->thread;
5447         next = &next_p->thread;
5448
5449 +       debugctl = prev->debugctlmsr;
5450 +       if (next->ds_area_msr != prev->ds_area_msr) {
5451 +               /* we clear debugctl to make sure DS
5452 +                * is not in use when we change it */
5453 +               debugctl = 0;
5454 +               wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
5455 +               wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
5456 +       }
5457 +
5458 +       if (next->debugctlmsr != debugctl)
5459 +               wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
5460 +
5461         if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
5462 -               set_debugreg(next->debugreg[0], 0);
5463 -               set_debugreg(next->debugreg[1], 1);
5464 -               set_debugreg(next->debugreg[2], 2);
5465 -               set_debugreg(next->debugreg[3], 3);
5466 +               set_debugreg(next->debugreg0, 0);
5467 +               set_debugreg(next->debugreg1, 1);
5468 +               set_debugreg(next->debugreg2, 2);
5469 +               set_debugreg(next->debugreg3, 3);
5470                 /* no 4 and 5 */
5471 -               set_debugreg(next->debugreg[6], 6);
5472 -               set_debugreg(next->debugreg[7], 7);
5473 +               set_debugreg(next->debugreg6, 6);
5474 +               set_debugreg(next->debugreg7, 7);
5475         }
5476
5477  #ifdef CONFIG_SECCOMP
5478 @@ -567,6 +500,14 @@ __switch_to_xtra(struct task_struct *pre
5479                         hard_enable_TSC();
5480         }
5481  #endif
5482 +
5483 +#ifdef X86_BTS
5484 +       if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
5485 +               ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
5486 +
5487 +       if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
5488 +               ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
5489 +#endif
5490  }
5491
5492  /*
5493 @@ -592,11 +533,11 @@ __switch_to_xtra(struct task_struct *pre
5494   * More important, however, is the fact that this allows us much
5495   * more flexibility.
5496   *
5497 - * The return value (in %eax) will be the "prev" task after
5498 + * The return value (in %ax) will be the "prev" task after
5499   * the task-switch, and shows up in ret_from_fork in entry.S,
5500   * for example.
5501   */
5502 -struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
5503 +struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
5504  {
5505         struct thread_struct *prev = &prev_p->thread,
5506                                  *next = &next_p->thread;
5507 @@ -632,12 +573,12 @@ struct task_struct fastcall * __switch_t
5508  #endif
5509
5510         /*
5511 -        * Reload esp0.
5512 -        * This is load_esp0(tss, next) with a multicall.
5513 +        * Reload sp0.
5514 +        * This is load_sp0(tss, next) with a multicall.
5515          */
5516         mcl->op      = __HYPERVISOR_stack_switch;
5517         mcl->args[0] = __KERNEL_DS;
5518 -       mcl->args[1] = next->esp0;
5519 +       mcl->args[1] = next->sp0;
5520         mcl++;
5521
5522         /*
5523 @@ -734,7 +675,7 @@ struct task_struct fastcall * __switch_t
5524
5525  asmlinkage int sys_fork(struct pt_regs regs)
5526  {
5527 -       return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
5528 +       return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
5529  }
5530
5531  asmlinkage int sys_clone(struct pt_regs regs)
5532 @@ -743,12 +684,12 @@ asmlinkage int sys_clone(struct pt_regs
5533         unsigned long newsp;
5534         int __user *parent_tidptr, *child_tidptr;
5535
5536 -       clone_flags = regs.ebx;
5537 -       newsp = regs.ecx;
5538 -       parent_tidptr = (int __user *)regs.edx;
5539 -       child_tidptr = (int __user *)regs.edi;
5540 +       clone_flags = regs.bx;
5541 +       newsp = regs.cx;
5542 +       parent_tidptr = (int __user *)regs.dx;
5543 +       child_tidptr = (int __user *)regs.di;
5544         if (!newsp)
5545 -               newsp = regs.esp;
5546 +               newsp = regs.sp;
5547         return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
5548  }
5549
5550 @@ -764,7 +705,7 @@ asmlinkage int sys_clone(struct pt_regs
5551   */
5552  asmlinkage int sys_vfork(struct pt_regs regs)
5553  {
5554 -       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
5555 +       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
5556  }
5557
5558  /*
5559 @@ -775,18 +716,15 @@ asmlinkage int sys_execve(struct pt_regs
5560         int error;
5561         char * filename;
5562
5563 -       filename = getname((char __user *) regs.ebx);
5564 +       filename = getname((char __user *) regs.bx);
5565         error = PTR_ERR(filename);
5566         if (IS_ERR(filename))
5567                 goto out;
5568         error = do_execve(filename,
5569 -                       (char __user * __user *) regs.ecx,
5570 -                       (char __user * __user *) regs.edx,
5571 +                       (char __user * __user *) regs.cx,
5572 +                       (char __user * __user *) regs.dx,
5573                         &regs);
5574         if (error == 0) {
5575 -               task_lock(current);
5576 -               current->ptrace &= ~PT_DTRACE;
5577 -               task_unlock(current);
5578                 /* Make sure we don't return using sysenter.. */
5579                 set_thread_flag(TIF_IRET);
5580         }
5581 @@ -800,145 +738,37 @@ out:
5582
5583  unsigned long get_wchan(struct task_struct *p)
5584  {
5585 -       unsigned long ebp, esp, eip;
5586 +       unsigned long bp, sp, ip;
5587         unsigned long stack_page;
5588         int count = 0;
5589         if (!p || p == current || p->state == TASK_RUNNING)
5590                 return 0;
5591         stack_page = (unsigned long)task_stack_page(p);
5592 -       esp = p->thread.esp;
5593 -       if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
5594 +       sp = p->thread.sp;
5595 +       if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
5596                 return 0;
5597 -       /* include/asm-i386/system.h:switch_to() pushes ebp last. */
5598 -       ebp = *(unsigned long *) esp;
5599 +       /* include/asm-i386/system.h:switch_to() pushes bp last. */
5600 +       bp = *(unsigned long *) sp;
5601         do {
5602 -               if (ebp < stack_page || ebp > top_ebp+stack_page)
5603 +               if (bp < stack_page || bp > top_ebp+stack_page)
5604                         return 0;
5605 -               eip = *(unsigned long *) (ebp+4);
5606 -               if (!in_sched_functions(eip))
5607 -                       return eip;
5608 -               ebp = *(unsigned long *) ebp;
5609 +               ip = *(unsigned long *) (bp+4);
5610 +               if (!in_sched_functions(ip))
5611 +                       return ip;
5612 +               bp = *(unsigned long *) bp;
5613         } while (count++ < 16);
5614         return 0;
5615  }
5616
5617 -/*
5618 - * sys_alloc_thread_area: get a yet unused TLS descriptor index.
5619 - */
5620 -static int get_free_idx(void)
5621 -{
5622 -       struct thread_struct *t = &current->thread;
5623 -       int idx;
5624 -
5625 -       for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
5626 -               if (desc_empty(t->tls_array + idx))
5627 -                       return idx + GDT_ENTRY_TLS_MIN;
5628 -       return -ESRCH;
5629 -}
5630 -
5631 -/*
5632 - * Set a given TLS descriptor:
5633 - */
5634 -asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
5635 -{
5636 -       struct thread_struct *t = &current->thread;
5637 -       struct user_desc info;
5638 -       struct desc_struct *desc;
5639 -       int cpu, idx;
5640 -
5641 -       if (copy_from_user(&info, u_info, sizeof(info)))
5642 -               return -EFAULT;
5643 -       idx = info.entry_number;
5644 -
5645 -       /*
5646 -        * index -1 means the kernel should try to find and
5647 -        * allocate an empty descriptor:
5648 -        */
5649 -       if (idx == -1) {
5650 -               idx = get_free_idx();
5651 -               if (idx < 0)
5652 -                       return idx;
5653 -               if (put_user(idx, &u_info->entry_number))
5654 -                       return -EFAULT;
5655 -       }
5656 -
5657 -       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5658 -               return -EINVAL;
5659 -
5660 -       desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
5661 -
5662 -       /*
5663 -        * We must not get preempted while modifying the TLS.
5664 -        */
5665 -       cpu = get_cpu();
5666 -
5667 -       if (LDT_empty(&info)) {
5668 -               desc->a = 0;
5669 -               desc->b = 0;
5670 -       } else {
5671 -               desc->a = LDT_entry_a(&info);
5672 -               desc->b = LDT_entry_b(&info);
5673 -       }
5674 -       load_TLS(t, cpu);
5675 -
5676 -       put_cpu();
5677 -
5678 -       return 0;
5679 -}
5680 -
5681 -/*
5682 - * Get the current Thread-Local Storage area:
5683 - */
5684 -
5685 -#define GET_BASE(desc) ( \
5686 -       (((desc)->a >> 16) & 0x0000ffff) | \
5687 -       (((desc)->b << 16) & 0x00ff0000) | \
5688 -       ( (desc)->b        & 0xff000000)   )
5689 -
5690 -#define GET_LIMIT(desc) ( \
5691 -       ((desc)->a & 0x0ffff) | \
5692 -        ((desc)->b & 0xf0000) )
5693 -
5694 -#define GET_32BIT(desc)                (((desc)->b >> 22) & 1)
5695 -#define GET_CONTENTS(desc)     (((desc)->b >> 10) & 3)
5696 -#define GET_WRITABLE(desc)     (((desc)->b >>  9) & 1)
5697 -#define GET_LIMIT_PAGES(desc)  (((desc)->b >> 23) & 1)
5698 -#define GET_PRESENT(desc)      (((desc)->b >> 15) & 1)
5699 -#define GET_USEABLE(desc)      (((desc)->b >> 20) & 1)
5700 -
5701 -asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
5702 -{
5703 -       struct user_desc info;
5704 -       struct desc_struct *desc;
5705 -       int idx;
5706 -
5707 -       if (get_user(idx, &u_info->entry_number))
5708 -               return -EFAULT;
5709 -       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5710 -               return -EINVAL;
5711 -
5712 -       memset(&info, 0, sizeof(info));
5713 -
5714 -       desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
5715 -
5716 -       info.entry_number = idx;
5717 -       info.base_addr = GET_BASE(desc);
5718 -       info.limit = GET_LIMIT(desc);
5719 -       info.seg_32bit = GET_32BIT(desc);
5720 -       info.contents = GET_CONTENTS(desc);
5721 -       info.read_exec_only = !GET_WRITABLE(desc);
5722 -       info.limit_in_pages = GET_LIMIT_PAGES(desc);
5723 -       info.seg_not_present = !GET_PRESENT(desc);
5724 -       info.useable = GET_USEABLE(desc);
5725 -
5726 -       if (copy_to_user(u_info, &info, sizeof(info)))
5727 -               return -EFAULT;
5728 -       return 0;
5729 -}
5730 -
5731  unsigned long arch_align_stack(unsigned long sp)
5732  {
5733         if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
5734                 sp -= get_random_int() % 8192;
5735         return sp & ~0xf;
5736  }
5737 +
5738 +unsigned long arch_randomize_brk(struct mm_struct *mm)
5739 +{
5740 +       unsigned long range_end = mm->brk + 0x02000000;
5741 +       return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
5742 +}
5743 --- a/arch/x86/kernel/process_64-xen.c
5744 +++ b/arch/x86/kernel/process_64-xen.c
5745 @@ -3,7 +3,7 @@
5746   *
5747   *  Pentium III FXSR, SSE support
5748   *     Gareth Hughes <gareth@valinux.com>, May 2000
5749 - *
5750 + *
5751   *  X86-64 port
5752   *     Andi Kleen.
5753   *
5754 @@ -22,19 +22,18 @@
5755  #include <linux/cpu.h>
5756  #include <linux/errno.h>
5757  #include <linux/sched.h>
5758 +#include <linux/fs.h>
5759  #include <linux/kernel.h>
5760  #include <linux/mm.h>
5761 -#include <linux/fs.h>
5762  #include <linux/elfcore.h>
5763  #include <linux/smp.h>
5764  #include <linux/slab.h>
5765  #include <linux/user.h>
5766 -#include <linux/module.h>
5767 -#include <linux/a.out.h>
5768  #include <linux/interrupt.h>
5769 +#include <linux/utsname.h>
5770  #include <linux/delay.h>
5771 +#include <linux/module.h>
5772  #include <linux/ptrace.h>
5773 -#include <linux/utsname.h>
5774  #include <linux/random.h>
5775  #include <linux/notifier.h>
5776  #include <linux/kprobes.h>
5777 @@ -73,7 +72,6 @@ EXPORT_SYMBOL(boot_option_idle_override)
5778   */
5779  void (*pm_idle)(void);
5780  EXPORT_SYMBOL(pm_idle);
5781 -static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
5782
5783  static ATOMIC_NOTIFIER_HEAD(idle_notifier);
5784
5785 @@ -81,13 +79,6 @@ void idle_notifier_register(struct notif
5786  {
5787         atomic_notifier_chain_register(&idle_notifier, n);
5788  }
5789 -EXPORT_SYMBOL_GPL(idle_notifier_register);
5790 -
5791 -void idle_notifier_unregister(struct notifier_block *n)
5792 -{
5793 -       atomic_notifier_chain_unregister(&idle_notifier, n);
5794 -}
5795 -EXPORT_SYMBOL(idle_notifier_unregister);
5796
5797  void enter_idle(void)
5798  {
5799 @@ -116,7 +107,7 @@ void exit_idle(void)
5800   * to poll the ->need_resched flag instead of waiting for the
5801   * cross-CPU IPI to arrive. Use this option with caution.
5802   */
5803 -static void poll_idle (void)
5804 +static void poll_idle(void)
5805  {
5806         local_irq_enable();
5807         cpu_relax();
5808 @@ -131,10 +122,19 @@ static void xen_idle(void)
5809          */
5810         smp_mb();
5811         local_irq_disable();
5812 -       if (!need_resched())
5813 -               safe_halt();
5814 -       else
5815 -               local_irq_enable();
5816 +       if (!need_resched()) {
5817 +               ktime_t t0, t1;
5818 +               u64 t0n, t1n;
5819 +
5820 +               t0 = ktime_get();
5821 +               t0n = ktime_to_ns(t0);
5822 +               safe_halt();    /* enables interrupts racelessly */
5823 +               local_irq_disable();
5824 +               t1 = ktime_get();
5825 +               t1n = ktime_to_ns(t1);
5826 +               sched_clock_idle_wakeup_event(t1n - t0n);
5827 +       }
5828 +       local_irq_enable();
5829         current_thread_info()->status |= TS_POLLING;
5830  }
5831
5832 @@ -161,19 +161,15 @@ static inline void play_dead(void)
5833   * low exit latency (ie sit in a loop waiting for
5834   * somebody to say that they'd like to reschedule)
5835   */
5836 -void cpu_idle (void)
5837 +void cpu_idle(void)
5838  {
5839         current_thread_info()->status |= TS_POLLING;
5840         /* endless idle loop with no priority at all */
5841         while (1) {
5842 +               tick_nohz_stop_sched_tick();
5843                 while (!need_resched()) {
5844                         void (*idle)(void);
5845
5846 -                       if (__get_cpu_var(cpu_idle_state))
5847 -                               __get_cpu_var(cpu_idle_state) = 0;
5848 -
5849 -                       tick_nohz_stop_sched_tick();
5850 -
5851                         rmb();
5852                         idle = xen_idle; /* no alternatives */
5853                         if (cpu_is_offline(smp_processor_id()))
5854 @@ -203,49 +199,27 @@ static void do_nothing(void *unused)
5855  {
5856  }
5857
5858 +/*
5859 + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
5860 + * pm_idle and update to new pm_idle value. Required while changing pm_idle
5861 + * handler on SMP systems.
5862 + *
5863 + * Caller must have changed pm_idle to the new value before the call. Old
5864 + * pm_idle value will not be used by any CPU after the return of this function.
5865 + */
5866  void cpu_idle_wait(void)
5867  {
5868 -       unsigned int cpu, this_cpu = get_cpu();
5869 -       cpumask_t map, tmp = current->cpus_allowed;
5870 -
5871 -       set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5872 -       put_cpu();
5873 -
5874 -       cpus_clear(map);
5875 -       for_each_online_cpu(cpu) {
5876 -               per_cpu(cpu_idle_state, cpu) = 1;
5877 -               cpu_set(cpu, map);
5878 -       }
5879 -
5880 -       __get_cpu_var(cpu_idle_state) = 0;
5881 -
5882 -       wmb();
5883 -       do {
5884 -               ssleep(1);
5885 -               for_each_online_cpu(cpu) {
5886 -                       if (cpu_isset(cpu, map) &&
5887 -                                       !per_cpu(cpu_idle_state, cpu))
5888 -                               cpu_clear(cpu, map);
5889 -               }
5890 -               cpus_and(map, map, cpu_online_map);
5891 -               /*
5892 -                * We waited 1 sec, if a CPU still did not call idle
5893 -                * it may be because it is in idle and not waking up
5894 -                * because it has nothing to do.
5895 -                * Give all the remaining CPUS a kick.
5896 -                */
5897 -               smp_call_function_mask(map, do_nothing, 0, 0);
5898 -       } while (!cpus_empty(map));
5899 -
5900 -       set_cpus_allowed(current, tmp);
5901 +       smp_mb();
5902 +       /* kick all the CPUs so that they exit out of pm_idle */
5903 +       smp_call_function(do_nothing, NULL, 0, 1);
5904  }
5905  EXPORT_SYMBOL_GPL(cpu_idle_wait);
5906
5907 -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
5908 +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
5909  {
5910  }
5911
5912 -static int __init idle_setup (char *str)
5913 +static int __init idle_setup(char *str)
5914  {
5915         if (!strcmp(str, "poll")) {
5916                 printk("using polling idle threads.\n");
5917 @@ -260,13 +234,13 @@ static int __init idle_setup (char *str)
5918  }
5919  early_param("idle", idle_setup);
5920
5921 -/* Prints also some state that isn't saved in the pt_regs */
5922 +/* Prints also some state that isn't saved in the pt_regs */
5923  void __show_regs(struct pt_regs * regs)
5924  {
5925         unsigned long fs, gs, shadowgs;
5926         unsigned long d0, d1, d2, d3, d6, d7;
5927 -       unsigned int fsindex,gsindex;
5928 -       unsigned int ds,cs,es;
5929 +       unsigned int fsindex, gsindex;
5930 +       unsigned int ds, cs, es;
5931
5932         printk("\n");
5933         print_modules();
5934 @@ -275,16 +249,16 @@ void __show_regs(struct pt_regs * regs)
5935                 init_utsname()->release,
5936                 (int)strcspn(init_utsname()->version, " "),
5937                 init_utsname()->version);
5938 -       printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
5939 -       printk_address(regs->rip);
5940 -       printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
5941 -               regs->eflags);
5942 +       printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
5943 +       printk_address(regs->ip, 1);
5944 +       printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->sp,
5945 +               regs->flags);
5946         printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
5947 -              regs->rax, regs->rbx, regs->rcx);
5948 +              regs->ax, regs->bx, regs->cx);
5949         printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
5950 -              regs->rdx, regs->rsi, regs->rdi);
5951 +              regs->dx, regs->si, regs->di);
5952         printk("RBP: %016lx R08: %016lx R09: %016lx\n",
5953 -              regs->rbp, regs->r8, regs->r9);
5954 +              regs->bp, regs->r8, regs->r9);
5955         printk("R10: %016lx R11: %016lx R12: %016lx\n",
5956                regs->r10, regs->r11, regs->r12);
5957         printk("R13: %016lx R14: %016lx R15: %016lx\n",
5958 @@ -318,7 +292,7 @@ void show_regs(struct pt_regs *regs)
5959  {
5960         printk("CPU %d:", smp_processor_id());
5961         __show_regs(regs);
5962 -       show_trace(NULL, regs, (void *)(regs + 1));
5963 +       show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
5964  }
5965
5966  /*
5967 @@ -329,7 +303,7 @@ void exit_thread(void)
5968         struct task_struct *me = current;
5969         struct thread_struct *t = &me->thread;
5970
5971 -       if (me->thread.io_bitmap_ptr) {
5972 +       if (me->thread.io_bitmap_ptr) {
5973  #ifndef CONFIG_X86_NO_TSS
5974                 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
5975  #endif
5976 @@ -382,7 +356,7 @@ void flush_thread(void)
5977         tsk->thread.debugreg3 = 0;
5978         tsk->thread.debugreg6 = 0;
5979         tsk->thread.debugreg7 = 0;
5980 -       memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5981 +       memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5982         /*
5983          * Forget coprocessor state..
5984          */
5985 @@ -405,26 +379,21 @@ void release_thread(struct task_struct *
5986
5987  static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
5988  {
5989 -       struct user_desc ud = {
5990 +       struct user_desc ud = {
5991                 .base_addr = addr,
5992                 .limit = 0xfffff,
5993                 .seg_32bit = 1,
5994                 .limit_in_pages = 1,
5995                 .useable = 1,
5996         };
5997 -       struct n_desc_struct *desc = (void *)t->thread.tls_array;
5998 +       struct desc_struct *desc = t->thread.tls_array;
5999         desc += tls;
6000 -       desc->a = LDT_entry_a(&ud);
6001 -       desc->b = LDT_entry_b(&ud);
6002 +       fill_ldt(desc, &ud);
6003  }
6004
6005  static inline u32 read_32bit_tls(struct task_struct *t, int tls)
6006  {
6007 -       struct desc_struct *desc = (void *)t->thread.tls_array;
6008 -       desc += tls;
6009 -       return desc->base0 |
6010 -               (((u32)desc->base1) << 16) |
6011 -               (((u32)desc->base2) << 24);
6012 +       return get_desc_base(&t->thread.tls_array[tls]);
6013  }
6014
6015  /*
6016 @@ -436,7 +405,7 @@ void prepare_to_copy(struct task_struct
6017         unlazy_fpu(tsk);
6018  }
6019
6020 -int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
6021 +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
6022                 unsigned long unused,
6023         struct task_struct * p, struct pt_regs * regs)
6024  {
6025 @@ -448,14 +417,14 @@ int copy_thread(int nr, unsigned long cl
6026                         (THREAD_SIZE + task_stack_page(p))) - 1;
6027         *childregs = *regs;
6028
6029 -       childregs->rax = 0;
6030 -       childregs->rsp = rsp;
6031 -       if (rsp == ~0UL)
6032 -               childregs->rsp = (unsigned long)childregs;
6033 -
6034 -       p->thread.rsp = (unsigned long) childregs;
6035 -       p->thread.rsp0 = (unsigned long) (childregs+1);
6036 -       p->thread.userrsp = me->thread.userrsp;
6037 +       childregs->ax = 0;
6038 +       childregs->sp = sp;
6039 +       if (sp == ~0UL)
6040 +               childregs->sp = (unsigned long)childregs;
6041 +
6042 +       p->thread.sp = (unsigned long) childregs;
6043 +       p->thread.sp0 = (unsigned long) (childregs+1);
6044 +       p->thread.usersp = me->thread.usersp;
6045
6046         set_tsk_thread_flag(p, TIF_FORK);
6047
6048 @@ -476,7 +445,7 @@ int copy_thread(int nr, unsigned long cl
6049                 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
6050                                 IO_BITMAP_BYTES);
6051                 set_tsk_thread_flag(p, TIF_IO_BITMAP);
6052 -       }
6053 +       }
6054
6055         /*
6056          * Set a new TLS for the child thread?
6057 @@ -484,7 +453,8 @@ int copy_thread(int nr, unsigned long cl
6058         if (clone_flags & CLONE_SETTLS) {
6059  #ifdef CONFIG_IA32_EMULATION
6060                 if (test_thread_flag(TIF_IA32))
6061 -                       err = ia32_child_tls(p, childregs);
6062 +                       err = do_set_thread_area(p, -1,
6063 +                               (struct user_desc __user *)childregs->si, 0);
6064                 else
6065  #endif
6066                         err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
6067 @@ -502,26 +472,32 @@ out:
6068         return err;
6069  }
6070
6071 -static inline void __save_init_fpu( struct task_struct *tsk )
6072 -{
6073 -       asm volatile( "rex64 ; fxsave %0 ; fnclex"
6074 -                     : "=m" (tsk->thread.i387.fxsave));
6075 -       tsk->thread_info->status &= ~TS_USEDFPU;
6076 -}
6077 -
6078  /*
6079   * This special macro can be used to load a debugging register
6080   */
6081 -#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
6082 +#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
6083
6084  static inline void __switch_to_xtra(struct task_struct *prev_p,
6085 -                                   struct task_struct *next_p)
6086 +                                   struct task_struct *next_p)
6087  {
6088         struct thread_struct *prev, *next;
6089 +       unsigned long debugctl;
6090
6091         prev = &prev_p->thread,
6092         next = &next_p->thread;
6093
6094 +       debugctl = prev->debugctlmsr;
6095 +       if (next->ds_area_msr != prev->ds_area_msr) {
6096 +               /* we clear debugctl to make sure DS
6097 +                * is not in use when we change it */
6098 +               debugctl = 0;
6099 +               wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
6100 +               wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
6101 +       }
6102 +
6103 +       if (next->debugctlmsr != debugctl)
6104 +               wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
6105 +
6106         if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
6107                 loaddebug(next, 0);
6108                 loaddebug(next, 1);
6109 @@ -531,12 +507,20 @@ static inline void __switch_to_xtra(stru
6110                 loaddebug(next, 6);
6111                 loaddebug(next, 7);
6112         }
6113 +
6114 +#ifdef X86_BTS
6115 +       if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
6116 +               ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
6117 +
6118 +       if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
6119 +               ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
6120 +#endif
6121  }
6122
6123  /*
6124   *     switch_to(x,y) should switch tasks from x to y.
6125   *
6126 - * This could still be optimized:
6127 + * This could still be optimized:
6128   * - fold all the options into a flag word and test it with a single test.
6129   * - could test fs/gs bitsliced
6130   *
6131 @@ -547,7 +531,7 @@ __switch_to(struct task_struct *prev_p,
6132  {
6133         struct thread_struct *prev = &prev_p->thread,
6134                                  *next = &next_p->thread;
6135 -       int cpu = smp_processor_id();
6136 +       int cpu = smp_processor_id();
6137  #ifndef CONFIG_X86_NO_TSS
6138         struct tss_struct *tss = &per_cpu(init_tss, cpu);
6139  #endif
6140 @@ -581,11 +565,12 @@ __switch_to(struct task_struct *prev_p,
6141                 prev_p->fpu_counter = 0;
6142
6143         /*
6144 -        * Reload esp0, LDT and the page table pointer:
6145 +        * Reload sp0.
6146 +        * This is load_sp0(tss, next) with a multicall.
6147          */
6148         mcl->op      = __HYPERVISOR_stack_switch;
6149         mcl->args[0] = __KERNEL_DS;
6150 -       mcl->args[1] = next->rsp0;
6151 +       mcl->args[1] = next->sp0;
6152         mcl++;
6153
6154         /*
6155 @@ -593,11 +578,12 @@ __switch_to(struct task_struct *prev_p,
6156          * This is load_TLS(next, cpu) with multicalls.
6157          */
6158  #define C(i) do {                                                      \
6159 -       if (unlikely(next->tls_array[i] != prev->tls_array[i])) {       \
6160 +       if (unlikely(next->tls_array[i].a != prev->tls_array[i].a ||    \
6161 +                    next->tls_array[i].b != prev->tls_array[i].b)) {   \
6162                 mcl->op      = __HYPERVISOR_update_descriptor;          \
6163                 mcl->args[0] = virt_to_machine(                         \
6164 -                       &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]);          \
6165 -               mcl->args[1] = next->tls_array[i];                      \
6166 +                       &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
6167 +               mcl->args[1] = *(u64 *)&next->tls_array[i];             \
6168                 mcl++;                                                  \
6169         }                                                               \
6170  } while (0)
6171 @@ -605,7 +591,7 @@ __switch_to(struct task_struct *prev_p,
6172  #undef C
6173
6174         if (unlikely(prev->iopl != next->iopl)) {
6175 -               iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
6176 +               iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
6177  #if CONFIG_XEN_COMPAT > 0x030002
6178                 mcl->op      = __HYPERVISOR_physdev_op;
6179                 mcl->args[0] = PHYSDEVOP_set_iopl;
6180 @@ -669,8 +655,8 @@ __switch_to(struct task_struct *prev_p,
6181         /*
6182          * Switch the PDA context.
6183          */
6184 -       prev->userrsp = read_pda(oldrsp);
6185 -       write_pda(oldrsp, next->userrsp);
6186 +       prev->usersp = read_pda(oldrsp);
6187 +       write_pda(oldrsp, next->usersp);
6188         write_pda(pcurrent, next_p);
6189         write_pda(kernelstack,
6190         (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
6191 @@ -687,7 +673,8 @@ __switch_to(struct task_struct *prev_p,
6192         /*
6193          * Now maybe reload the debug registers
6194          */
6195 -       if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
6196 +       if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
6197 +                    task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
6198                 __switch_to_xtra(prev_p, next_p);
6199
6200         /* If the task has used fpu the last 5 timeslices, just do a full
6201 @@ -702,23 +689,18 @@ __switch_to(struct task_struct *prev_p,
6202  /*
6203   * sys_execve() executes a new program.
6204   */
6205 -asmlinkage
6206 +asmlinkage
6207  long sys_execve(char __user *name, char __user * __user *argv,
6208 -               char __user * __user *envp, struct pt_regs regs)
6209 +               char __user * __user *envp, struct pt_regs *regs)
6210  {
6211         long error;
6212         char * filename;
6213
6214         filename = getname(name);
6215         error = PTR_ERR(filename);
6216 -       if (IS_ERR(filename))
6217 +       if (IS_ERR(filename))
6218                 return error;
6219 -       error = do_execve(filename, argv, envp, &regs);
6220 -       if (error == 0) {
6221 -               task_lock(current);
6222 -               current->ptrace &= ~PT_DTRACE;
6223 -               task_unlock(current);
6224 -       }
6225 +       error = do_execve(filename, argv, envp, regs);
6226         putname(filename);
6227         return error;
6228  }
6229 @@ -728,18 +710,18 @@ void set_personality_64bit(void)
6230         /* inherit personality from parent */
6231
6232         /* Make sure to be in 64bit mode */
6233 -       clear_thread_flag(TIF_IA32);
6234 +       clear_thread_flag(TIF_IA32);
6235
6236         /* TBD: overwrites user setup. Should have two bits.
6237            But 64bit processes have always behaved this way,
6238            so it's not too bad. The main problem is just that
6239 -          32bit childs are affected again. */
6240 +          32bit childs are affected again. */
6241         current->personality &= ~READ_IMPLIES_EXEC;
6242  }
6243
6244  asmlinkage long sys_fork(struct pt_regs *regs)
6245  {
6246 -       return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
6247 +       return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
6248  }
6249
6250  asmlinkage long
6251 @@ -747,7 +729,7 @@ sys_clone(unsigned long clone_flags, uns
6252           void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
6253  {
6254         if (!newsp)
6255 -               newsp = regs->rsp;
6256 +               newsp = regs->sp;
6257         return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
6258  }
6259
6260 @@ -763,29 +745,29 @@ sys_clone(unsigned long clone_flags, uns
6261   */
6262  asmlinkage long sys_vfork(struct pt_regs *regs)
6263  {
6264 -       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
6265 +       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
6266                     NULL, NULL);
6267  }
6268
6269  unsigned long get_wchan(struct task_struct *p)
6270  {
6271         unsigned long stack;
6272 -       u64 fp,rip;
6273 +       u64 fp,ip;
6274         int count = 0;
6275
6276         if (!p || p == current || p->state==TASK_RUNNING)
6277                 return 0;
6278         stack = (unsigned long)task_stack_page(p);
6279 -       if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
6280 +       if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
6281                 return 0;
6282 -       fp = *(u64 *)(p->thread.rsp);
6283 +       fp = *(u64 *)(p->thread.sp);
6284         do {
6285                 if (fp < (unsigned long)stack ||
6286                     fp > (unsigned long)stack+THREAD_SIZE)
6287                         return 0;
6288 -               rip = *(u64 *)(fp+8);
6289 -               if (!in_sched_functions(rip))
6290 -                       return rip;
6291 +               ip = *(u64 *)(fp+8);
6292 +               if (!in_sched_functions(ip))
6293 +                       return ip;
6294                 fp = *(u64 *)fp;
6295         } while (count++ < 16);
6296         return 0;
6297 @@ -827,19 +809,19 @@ long do_arch_prctl(struct task_struct *t
6298                 /* Not strictly needed for fs, but do it for symmetry
6299                    with gs */
6300                 if (addr >= TASK_SIZE_OF(task))
6301 -                       return -EPERM;
6302 +                       return -EPERM;
6303                 cpu = get_cpu();
6304 -               /* handle small bases via the GDT because that's faster to
6305 +               /* handle small bases via the GDT because that's faster to
6306                    switch. */
6307 -               if (addr <= 0xffffffff) {
6308 +               if (addr <= 0xffffffff) {
6309                         set_32bit_tls(task, FS_TLS, addr);
6310 -                       if (doit) {
6311 -                               load_TLS(&task->thread, cpu);
6312 +                       if (doit) {
6313 +                               load_TLS(&task->thread, cpu);
6314                                 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
6315                         }
6316                         task->thread.fsindex = FS_TLS_SEL;
6317                         task->thread.fs = 0;
6318 -               } else {
6319 +               } else {
6320                         task->thread.fsindex = 0;
6321                         task->thread.fs = addr;
6322                         if (doit) {
6323 @@ -852,24 +834,24 @@ long do_arch_prctl(struct task_struct *t
6324                 }
6325                 put_cpu();
6326                 break;
6327 -       case ARCH_GET_FS: {
6328 -               unsigned long base;
6329 +       case ARCH_GET_FS: {
6330 +               unsigned long base;
6331                 if (task->thread.fsindex == FS_TLS_SEL)
6332                         base = read_32bit_tls(task, FS_TLS);
6333                 else if (doit)
6334                         rdmsrl(MSR_FS_BASE, base);
6335                 else
6336                         base = task->thread.fs;
6337 -               ret = put_user(base, (unsigned long __user *)addr);
6338 -               break;
6339 +               ret = put_user(base, (unsigned long __user *)addr);
6340 +               break;
6341         }
6342 -       case ARCH_GET_GS: {
6343 +       case ARCH_GET_GS: {
6344                 unsigned long base;
6345                 unsigned gsindex;
6346                 if (task->thread.gsindex == GS_TLS_SEL)
6347                         base = read_32bit_tls(task, GS_TLS);
6348                 else if (doit) {
6349 -                       asm("movl %%gs,%0" : "=r" (gsindex));
6350 +                       asm("movl %%gs,%0" : "=r" (gsindex));
6351                         if (gsindex)
6352                                 rdmsrl(MSR_KERNEL_GS_BASE, base);
6353                         else
6354 @@ -877,40 +859,21 @@ long do_arch_prctl(struct task_struct *t
6355                 }
6356                 else
6357                         base = task->thread.gs;
6358 -               ret = put_user(base, (unsigned long __user *)addr);
6359 +               ret = put_user(base, (unsigned long __user *)addr);
6360                 break;
6361         }
6362
6363         default:
6364                 ret = -EINVAL;
6365                 break;
6366 -       }
6367 +       }
6368
6369 -       return ret;
6370 -}
6371 +       return ret;
6372 +}
6373
6374  long sys_arch_prctl(int code, unsigned long addr)
6375  {
6376         return do_arch_prctl(current, code, addr);
6377 -}
6378 -
6379 -/*
6380 - * Capture the user space registers if the task is not running (in user space)
6381 - */
6382 -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
6383 -{
6384 -       struct pt_regs *pp, ptregs;
6385 -
6386 -       pp = task_pt_regs(tsk);
6387 -
6388 -       ptregs = *pp;
6389 -       ptregs.cs &= 0xffff;
6390 -       ptregs.ss &= 0xffff;
6391 -
6392 -       elf_core_copy_regs(regs, &ptregs);
6393 -
6394 -        boot_option_idle_override = 1;
6395 -       return 1;
6396  }
6397
6398  unsigned long arch_align_stack(unsigned long sp)
6399 @@ -919,3 +882,9 @@ unsigned long arch_align_stack(unsigned
6400                 sp -= get_random_int() % 8192;
6401         return sp & ~0xf;
6402  }
6403 +
6404 +unsigned long arch_randomize_brk(struct mm_struct *mm)
6405 +{
6406 +       unsigned long range_end = mm->brk + 0x02000000;
6407 +       return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
6408 +}
6409 --- a/arch/x86/kernel/quirks-xen.c
6410 +++ b/arch/x86/kernel/quirks-xen.c
6411 @@ -9,7 +9,7 @@
6412  static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
6413  {
6414         u8 config, rev;
6415 -       u32 word;
6416 +       u16 word;
6417
6418         /* BIOS may enable hardware IRQ balancing for
6419          * E7520/E7320/E7525(revision ID 0x9 and below)
6420 @@ -24,14 +24,17 @@ static void __devinit quirk_intel_irqbal
6421         pci_read_config_byte(dev, 0xf4, &config);
6422         pci_write_config_byte(dev, 0xf4, config|0x2);
6423
6424 -       /* read xTPR register */
6425 -       raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
6426 +       /*
6427 +        * read xTPR register.  We may not have a pci_dev for device 8
6428 +        * because it might be hidden until the above write.
6429 +        */
6430 +       pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word);
6431
6432         if (!(word & (1 << 13))) {
6433                 struct xen_platform_op op;
6434
6435 -               printk(KERN_INFO "Intel E7520/7320/7525 detected. "
6436 -                       "Disabling irq balancing and affinity\n");
6437 +               dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
6438 +                       "disabling irq balancing and affinity\n");
6439                 op.cmd = XENPF_platform_quirk;
6440                 op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
6441                 WARN_ON(HYPERVISOR_platform_op(&op));
6442 @@ -102,14 +105,16 @@ static void ich_force_enable_hpet(struct
6443         pci_read_config_dword(dev, 0xF0, &rcba);
6444         rcba &= 0xFFFFC000;
6445         if (rcba == 0) {
6446 -               printk(KERN_DEBUG "RCBA disabled. Cannot force enable HPET\n");
6447 +               dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; "
6448 +                       "cannot force enable HPET\n");
6449                 return;
6450         }
6451
6452         /* use bits 31:14, 16 kB aligned */
6453         rcba_base = ioremap_nocache(rcba, 0x4000);
6454         if (rcba_base == NULL) {
6455 -               printk(KERN_DEBUG "ioremap failed. Cannot force enable HPET\n");
6456 +               dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
6457 +                       "cannot force enable HPET\n");
6458                 return;
6459         }
6460
6461 @@ -120,8 +125,8 @@ static void ich_force_enable_hpet(struct
6462                 /* HPET is enabled in HPTC. Just not reported by BIOS */
6463                 val = val & 0x3;
6464                 force_hpet_address = 0xFED00000 | (val << 12);
6465 -               printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6466 -                              force_hpet_address);
6467 +               dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6468 +                       "0x%lx\n", force_hpet_address);
6469                 iounmap(rcba_base);
6470                 return;
6471         }
6472 @@ -140,11 +145,12 @@ static void ich_force_enable_hpet(struct
6473         if (err) {
6474                 force_hpet_address = 0;
6475                 iounmap(rcba_base);
6476 -               printk(KERN_DEBUG "Failed to force enable HPET\n");
6477 +               dev_printk(KERN_DEBUG, &dev->dev,
6478 +                       "Failed to force enable HPET\n");
6479         } else {
6480                 force_hpet_resume_type = ICH_FORCE_HPET_RESUME;
6481 -               printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6482 -                              force_hpet_address);
6483 +               dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6484 +                       "0x%lx\n", force_hpet_address);
6485         }
6486  }
6487
6488 @@ -160,6 +166,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
6489                          ich_force_enable_hpet);
6490  DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
6491                          ich_force_enable_hpet);
6492 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
6493 +                        ich_force_enable_hpet);
6494
6495
6496  static struct pci_dev *cached_dev;
6497 @@ -204,8 +212,8 @@ static void old_ich_force_enable_hpet(st
6498         if (val & 0x4) {
6499                 val &= 0x3;
6500                 force_hpet_address = 0xFED00000 | (val << 12);
6501 -               printk(KERN_DEBUG "HPET at base address 0x%lx\n",
6502 -                              force_hpet_address);
6503 +               dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
6504 +                       force_hpet_address);
6505                 return;
6506         }
6507
6508 @@ -225,14 +233,14 @@ static void old_ich_force_enable_hpet(st
6509                 /* HPET is enabled in HPTC. Just not reported by BIOS */
6510                 val &= 0x3;
6511                 force_hpet_address = 0xFED00000 | (val << 12);
6512 -               printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6513 -                              force_hpet_address);
6514 +               dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6515 +                       "0x%lx\n", force_hpet_address);
6516                 cached_dev = dev;
6517                 force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME;
6518                 return;
6519         }
6520
6521 -       printk(KERN_DEBUG "Failed to force enable HPET\n");
6522 +       dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
6523  }
6524
6525  /*
6526 @@ -290,8 +298,8 @@ static void vt8237_force_enable_hpet(str
6527          */
6528         if (val & 0x80) {
6529                 force_hpet_address = (val & ~0x3ff);
6530 -               printk(KERN_DEBUG "HPET at base address 0x%lx\n",
6531 -                              force_hpet_address);
6532 +               dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
6533 +                       force_hpet_address);
6534                 return;
6535         }
6536
6537 @@ -305,14 +313,14 @@ static void vt8237_force_enable_hpet(str
6538         pci_read_config_dword(dev, 0x68, &val);
6539         if (val & 0x80) {
6540                 force_hpet_address = (val & ~0x3ff);
6541 -               printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6542 -                              force_hpet_address);
6543 +               dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6544 +                       "0x%lx\n", force_hpet_address);
6545                 cached_dev = dev;
6546                 force_hpet_resume_type = VT8237_FORCE_HPET_RESUME;
6547                 return;
6548         }
6549
6550 -       printk(KERN_DEBUG "Failed to force enable HPET\n");
6551 +       dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
6552  }
6553
6554  DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
6555 @@ -340,7 +348,7 @@ static void nvidia_force_enable_hpet(str
6556         pci_read_config_dword(dev, 0x44, &val);
6557         force_hpet_address = val & 0xfffffffe;
6558         force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME;
6559 -       printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6560 +       dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
6561                 force_hpet_address);
6562         cached_dev = dev;
6563         return;
6564 @@ -353,6 +361,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N
6565                         nvidia_force_enable_hpet);
6566
6567  /* LPC bridges */
6568 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260,
6569 +                       nvidia_force_enable_hpet);
6570  DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360,
6571                         nvidia_force_enable_hpet);
6572  DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361,
6573 @@ -373,19 +383,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N
6574  void force_hpet_resume(void)
6575  {
6576         switch (force_hpet_resume_type) {
6577 -           case ICH_FORCE_HPET_RESUME:
6578 -               return ich_force_hpet_resume();
6579 -
6580 -           case OLD_ICH_FORCE_HPET_RESUME:
6581 -               return old_ich_force_hpet_resume();
6582 -
6583 -           case VT8237_FORCE_HPET_RESUME:
6584 -               return vt8237_force_hpet_resume();
6585 -
6586 -           case NVIDIA_FORCE_HPET_RESUME:
6587 -               return nvidia_force_hpet_resume();
6588 -
6589 -           default:
6590 +       case ICH_FORCE_HPET_RESUME:
6591 +               ich_force_hpet_resume();
6592 +               return;
6593 +       case OLD_ICH_FORCE_HPET_RESUME:
6594 +               old_ich_force_hpet_resume();
6595 +               return;
6596 +       case VT8237_FORCE_HPET_RESUME:
6597 +               vt8237_force_hpet_resume();
6598 +               return;
6599 +       case NVIDIA_FORCE_HPET_RESUME:
6600 +               nvidia_force_hpet_resume();
6601 +               return;
6602 +       default:
6603                 break;
6604         }
6605  }
6606 --- a/arch/x86/kernel/rtc.c
6607 +++ b/arch/x86/kernel/rtc.c
6608 @@ -181,6 +181,10 @@ unsigned long read_persistent_clock(void
6609  {
6610         unsigned long retval, flags;
6611
6612 +#ifdef CONFIG_XEN
6613 +       if (!is_initial_xendomain())
6614 +               return xen_read_persistent_clock();
6615 +#endif
6616         spin_lock_irqsave(&rtc_lock, flags);
6617         retval = get_wallclock();
6618         spin_unlock_irqrestore(&rtc_lock, flags);
6619 @@ -190,6 +194,10 @@ unsigned long read_persistent_clock(void
6620
6621  int update_persistent_clock(struct timespec now)
6622  {
6623 +#ifdef CONFIG_XEN
6624 +       if (xen_update_persistent_clock() < 0 || xen_independent_wallclock())
6625 +               return 0;
6626 +#endif
6627         return set_rtc_mmss(now.tv_sec);
6628  }
6629
6630 --- a/arch/x86/kernel/setup_32-xen.c
6631 +++ b/arch/x86/kernel/setup_32-xen.c
6632 @@ -47,9 +47,12 @@
6633  #include <linux/crash_dump.h>
6634  #include <linux/dmi.h>
6635  #include <linux/pfn.h>
6636 +#include <linux/pci.h>
6637 +#include <linux/init_ohci1394_dma.h>
6638
6639  #include <video/edid.h>
6640
6641 +#include <asm/mtrr.h>
6642  #include <asm/apic.h>
6643  #include <asm/e820.h>
6644  #include <asm/mpspec.h>
6645 @@ -79,14 +82,83 @@ static struct notifier_block xen_panic_b
6646         xen_panic_event, NULL, 0 /* try to go last */
6647  };
6648
6649 -int disable_pse __cpuinitdata = 0;
6650 -
6651  /*
6652   * Machine setup..
6653   */
6654 -extern struct resource code_resource;
6655 -extern struct resource data_resource;
6656 -extern struct resource bss_resource;
6657 +static struct resource data_resource = {
6658 +       .name   = "Kernel data",
6659 +       .start  = 0,
6660 +       .end    = 0,
6661 +       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
6662 +};
6663 +
6664 +static struct resource code_resource = {
6665 +       .name   = "Kernel code",
6666 +       .start  = 0,
6667 +       .end    = 0,
6668 +       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
6669 +};
6670 +
6671 +static struct resource bss_resource = {
6672 +       .name   = "Kernel bss",
6673 +       .start  = 0,
6674 +       .end    = 0,
6675 +       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
6676 +};
6677 +
6678 +static struct resource video_ram_resource = {
6679 +       .name   = "Video RAM area",
6680 +       .start  = 0xa0000,
6681 +       .end    = 0xbffff,
6682 +       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
6683 +};
6684 +
6685 +static struct resource standard_io_resources[] = { {
6686 +       .name   = "dma1",
6687 +       .start  = 0x0000,
6688 +       .end    = 0x001f,
6689 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
6690 +}, {
6691 +       .name   = "pic1",
6692 +       .start  = 0x0020,
6693 +       .end    = 0x0021,
6694 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
6695 +}, {
6696 +       .name   = "timer0",
6697 +       .start  = 0x0040,
6698 +       .end    = 0x0043,
6699 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
6700 +}, {
6701 +       .name   = "timer1",
6702 +       .start  = 0x0050,
6703 +       .end    = 0x0053,
6704 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
6705 +}, {
6706 +       .name   = "keyboard",
6707 +       .start  = 0x0060,
6708 +       .end    = 0x006f,
6709 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
6710 +}, {
6711 +       .name   = "dma page reg",
6712 +       .start  = 0x0080,
6713 +       .end    = 0x008f,
6714 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
6715 +}, {
6716 +       .name   = "pic2",
6717 +       .start  = 0x00a0,
6718 +       .end    = 0x00a1,
6719 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
6720 +}, {
6721 +       .name   = "dma2",
6722 +       .start  = 0x00c0,
6723 +       .end    = 0x00df,
6724 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
6725 +}, {
6726 +       .name   = "fpu",
6727 +       .start  = 0x00f0,
6728 +       .end    = 0x00ff,
6729 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
6730 +} };
6731
6732  /* cpu data as detected by the assembly code in head.S */
6733  struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
6734 @@ -94,13 +166,16 @@ struct cpuinfo_x86 new_cpu_data __cpuini
6735  struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
6736  EXPORT_SYMBOL(boot_cpu_data);
6737
6738 +#ifndef CONFIG_X86_PAE
6739  unsigned long mmu_cr4_features;
6740 +#else
6741 +unsigned long mmu_cr4_features = X86_CR4_PAE;
6742 +#endif
6743
6744  /* for MCA, but anyone else can use it if they want */
6745  unsigned int machine_id;
6746  unsigned int machine_submodel_id;
6747  unsigned int BIOS_revision;
6748 -unsigned int mca_pentium_flag;
6749
6750  /* Boot loader ID as an integer, for the benefit of proc_dointvec */
6751  int bootloader_type;
6752 @@ -131,13 +206,17 @@ extern int root_mountflags;
6753
6754  unsigned long saved_videomode;
6755
6756 -#define RAMDISK_IMAGE_START_MASK       0x07FF
6757 +#define RAMDISK_IMAGE_START_MASK       0x07FF
6758  #define RAMDISK_PROMPT_FLAG            0x8000
6759 -#define RAMDISK_LOAD_FLAG              0x4000
6760 +#define RAMDISK_LOAD_FLAG              0x4000
6761
6762  static char __initdata command_line[COMMAND_LINE_SIZE];
6763
6764 +#ifndef CONFIG_DEBUG_BOOT_PARAMS
6765  struct boot_params __initdata boot_params;
6766 +#else
6767 +struct boot_params boot_params;
6768 +#endif
6769
6770  /*
6771   * Point at the empty zero page to start with. We map the real shared_info
6772 @@ -198,8 +277,7 @@ static int __init parse_mem(char *arg)
6773                 return -EINVAL;
6774
6775         if (strcmp(arg, "nopentium") == 0) {
6776 -               clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
6777 -               disable_pse = 1;
6778 +               setup_clear_cpu_cap(X86_FEATURE_PSE);
6779         } else {
6780                 /* If the user specifies memory size, we
6781                  * limit the BIOS-provided memory map to
6782 @@ -208,7 +286,7 @@ static int __init parse_mem(char *arg)
6783                  * trim the existing memory map.
6784                  */
6785                 unsigned long long mem_size;
6786 -
6787 +
6788                 mem_size = memparse(arg, &arg);
6789                 limit_regions(mem_size);
6790                 user_defined_memmap = 1;
6791 @@ -350,7 +428,7 @@ static void __init reserve_ebda_region(v
6792         unsigned int addr;
6793         addr = get_bios_ebda();
6794         if (addr)
6795 -               reserve_bootmem(addr, PAGE_SIZE);
6796 +               reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT);
6797  }
6798  #endif
6799
6800 @@ -365,8 +443,6 @@ static unsigned long __init setup_memory
6801         min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
6802                 xen_start_info->nr_pt_frames;
6803
6804 -       find_max_pfn();
6805 -
6806         max_low_pfn = find_max_low_pfn();
6807
6808  #ifdef CONFIG_HIGHMEM
6809 @@ -447,7 +523,8 @@ static void __init reserve_crashkernel(v
6810                                         (unsigned long)(total_mem >> 20));
6811                         crashk_res.start = crash_base;
6812                         crashk_res.end   = crash_base + crash_size - 1;
6813 -                       reserve_bootmem(crash_base, crash_size);
6814 +                       reserve_bootmem(crash_base, crash_size,
6815 +                                       BOOTMEM_DEFAULT);
6816                 } else
6817                         printk(KERN_INFO "crashkernel reservation failed - "
6818                                         "you have to specify a base address\n");
6819 @@ -461,6 +538,99 @@ static inline void __init reserve_crashk
6820  {}
6821  #endif
6822
6823 +#ifdef CONFIG_BLK_DEV_INITRD
6824 +
6825 +static bool do_relocate_initrd = false;
6826 +
6827 +static void __init reserve_initrd(void)
6828 +{
6829 +       unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
6830 +       unsigned long ramdisk_size  = xen_start_info->mod_len;
6831 +       unsigned long ramdisk_end   = ramdisk_image + ramdisk_size;
6832 +       unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6833 +       unsigned long ramdisk_here;
6834 +
6835 +       initrd_start = 0;
6836 +
6837 +       if (!xen_start_info->mod_start || !ramdisk_size)
6838 +               return;         /* No initrd provided by bootloader */
6839 +
6840 +       if (ramdisk_end < ramdisk_image) {
6841 +               printk(KERN_ERR "initrd wraps around end of memory, "
6842 +                      "disabling initrd\n");
6843 +               return;
6844 +       }
6845 +       if (ramdisk_size >= end_of_lowmem/2) {
6846 +               printk(KERN_ERR "initrd too large to handle, "
6847 +                      "disabling initrd\n");
6848 +               return;
6849 +       }
6850 +       if (ramdisk_end <= end_of_lowmem) {
6851 +               /* All in lowmem, easy case */
6852 +               reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
6853 +               initrd_start = ramdisk_image + PAGE_OFFSET;
6854 +               initrd_end = initrd_start+ramdisk_size;
6855 +               return;
6856 +       }
6857 +
6858 +       /* We need to move the initrd down into lowmem */
6859 +       ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
6860 +
6861 +       /* Note: this includes all the lowmem currently occupied by
6862 +          the initrd; we rely on that fact to keep the data intact. */
6863 +       reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
6864 +       initrd_start = ramdisk_here + PAGE_OFFSET;
6865 +       initrd_end   = initrd_start + ramdisk_size;
6866 +
6867 +       do_relocate_initrd = true;
6868 +}
6869 +
6870 +#define MAX_MAP_CHUNK  (NR_FIX_BTMAPS << PAGE_SHIFT)
6871 +
6872 +static void __init relocate_initrd(void)
6873 +{
6874 +       unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
6875 +       unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
6876 +       unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6877 +       unsigned long ramdisk_here;
6878 +       unsigned long slop, clen, mapaddr;
6879 +       char *p, *q;
6880 +
6881 +       if (!do_relocate_initrd)
6882 +               return;
6883 +
6884 +       ramdisk_here = initrd_start - PAGE_OFFSET;
6885 +
6886 +       q = (char *)initrd_start;
6887 +
6888 +       /* Copy any lowmem portion of the initrd */
6889 +       if (ramdisk_image < end_of_lowmem) {
6890 +               clen = end_of_lowmem - ramdisk_image;
6891 +               p = (char *)__va(ramdisk_image);
6892 +               memcpy(q, p, clen);
6893 +               q += clen;
6894 +               ramdisk_image += clen;
6895 +               ramdisk_size  -= clen;
6896 +       }
6897 +
6898 +       /* Copy the highmem portion of the initrd */
6899 +       while (ramdisk_size) {
6900 +               slop = ramdisk_image & ~PAGE_MASK;
6901 +               clen = ramdisk_size;
6902 +               if (clen > MAX_MAP_CHUNK-slop)
6903 +                       clen = MAX_MAP_CHUNK-slop;
6904 +               mapaddr = ramdisk_image & PAGE_MASK;
6905 +               p = early_ioremap(mapaddr, clen+slop);
6906 +               memcpy(q, p+slop, clen);
6907 +               early_iounmap(p, clen+slop);
6908 +               q += clen;
6909 +               ramdisk_image += clen;
6910 +               ramdisk_size  -= clen;
6911 +       }
6912 +}
6913 +
6914 +#endif /* CONFIG_BLK_DEV_INITRD */
6915 +
6916  void __init setup_bootmem_allocator(void)
6917  {
6918         unsigned long bootmap_size;
6919 @@ -478,14 +648,15 @@ void __init setup_bootmem_allocator(void
6920          * bootmem allocator with an invalid RAM area.
6921          */
6922         reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
6923 -                        bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
6924 +                        bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
6925 +                        BOOTMEM_DEFAULT);
6926
6927  #ifndef CONFIG_XEN
6928         /*
6929          * reserve physical page 0 - it's a special BIOS page on many boxes,
6930          * enabling clean reboots, SMP operation, laptop functions.
6931          */
6932 -       reserve_bootmem(0, PAGE_SIZE);
6933 +       reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
6934
6935         /* reserve EBDA region, it's a 4K region */
6936         reserve_ebda_region();
6937 @@ -495,7 +666,7 @@ void __init setup_bootmem_allocator(void
6938         unless you have no PS/2 mouse plugged in. */
6939         if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
6940             boot_cpu_data.x86 == 6)
6941 -            reserve_bootmem(0xa0000 - 4096, 4096);
6942 +            reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT);
6943
6944  #ifdef CONFIG_SMP
6945         /*
6946 @@ -503,7 +674,7 @@ void __init setup_bootmem_allocator(void
6947          * FIXME: Don't need the extra page at 4K, but need to fix
6948          * trampoline before removing it. (see the GDT stuff)
6949          */
6950 -       reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
6951 +       reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
6952  #endif
6953  #ifdef CONFIG_ACPI_SLEEP
6954         /*
6955 @@ -511,29 +682,12 @@ void __init setup_bootmem_allocator(void
6956          */
6957         acpi_reserve_bootmem();
6958  #endif
6959 -       numa_kva_reserve();
6960  #endif /* !CONFIG_XEN */
6961
6962  #ifdef CONFIG_BLK_DEV_INITRD
6963 -       if (xen_start_info->mod_start) {
6964 -               unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
6965 -               unsigned long ramdisk_size  = xen_start_info->mod_len;
6966 -               unsigned long ramdisk_end   = ramdisk_image + ramdisk_size;
6967 -               unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6968 -
6969 -               if (ramdisk_end <= end_of_lowmem) {
6970 -                       /*reserve_bootmem(ramdisk_image, ramdisk_size);*/
6971 -                       initrd_start = ramdisk_image + PAGE_OFFSET;
6972 -                       initrd_end = initrd_start+ramdisk_size;
6973 -                       initrd_below_start_ok = 1;
6974 -               } else {
6975 -                       printk(KERN_ERR "initrd extends beyond end of memory "
6976 -                              "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
6977 -                              ramdisk_end, end_of_lowmem);
6978 -                       initrd_start = 0;
6979 -               }
6980 -       }
6981 +       reserve_initrd();
6982  #endif
6983 +       numa_kva_reserve();
6984         reserve_crashkernel();
6985  }
6986
6987 @@ -600,20 +754,14 @@ void __init setup_arch(char **cmdline_p)
6988         memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
6989         pre_setup_arch_hook();
6990         early_cpu_init();
6991 +       early_ioremap_init();
6992  #ifdef CONFIG_SMP
6993         prefill_possible_map();
6994  #endif
6995
6996 -       /*
6997 -        * FIXME: This isn't an official loader_type right
6998 -        * now but does currently work with elilo.
6999 -        * If we were configured as an EFI kernel, check to make
7000 -        * sure that we were loaded correctly from elilo and that
7001 -        * the system table is valid.  If not, then initialize normally.
7002 -        */
7003  #ifdef CONFIG_EFI
7004 -       if ((boot_params.hdr.type_of_loader == 0x50) &&
7005 -           boot_params.efi_info.efi_systab)
7006 +       if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
7007 +                    "EL32", 4))
7008                 efi_enabled = 1;
7009  #endif
7010
7011 @@ -653,12 +801,9 @@ void __init setup_arch(char **cmdline_p)
7012  #endif
7013
7014         ARCH_SETUP
7015 -       if (efi_enabled)
7016 -               efi_init();
7017 -       else {
7018 -               printk(KERN_INFO "BIOS-provided physical RAM map:\n");
7019 -               print_memory_map(memory_setup());
7020 -       }
7021 +
7022 +       printk(KERN_INFO "BIOS-provided physical RAM map:\n");
7023 +       print_memory_map(memory_setup());
7024
7025         copy_edd();
7026
7027 @@ -691,6 +836,17 @@ void __init setup_arch(char **cmdline_p)
7028         strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
7029         *cmdline_p = command_line;
7030
7031 +       if (efi_enabled)
7032 +               efi_init();
7033 +
7034 +       /* update e820 for memory not covered by WB MTRRs */
7035 +       find_max_pfn();
7036 +       mtrr_bp_init();
7037 +#ifndef CONFIG_XEN
7038 +       if (mtrr_trim_uncached_memory(max_pfn))
7039 +               find_max_pfn();
7040 +#endif
7041 +
7042         max_low_pfn = setup_memory();
7043
7044  #ifdef CONFIG_VMI
7045 @@ -715,6 +871,16 @@ void __init setup_arch(char **cmdline_p)
7046         smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
7047  #endif
7048         paging_init();
7049 +
7050 +       /*
7051 +        * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
7052 +        */
7053 +
7054 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
7055 +       if (init_ohci1394_dma_early)
7056 +               init_ohci1394_dma_on_all_controllers();
7057 +#endif
7058 +
7059         remapped_pgdat_init();
7060         sparse_init();
7061         zone_sizes_init();
7062 @@ -800,16 +966,20 @@ void __init setup_arch(char **cmdline_p)
7063          * NOTE: at this point the bootmem allocator is fully available.
7064          */
7065
7066 +#ifdef CONFIG_BLK_DEV_INITRD
7067 +       relocate_initrd();
7068 +#endif
7069 +
7070         paravirt_post_allocator_init();
7071
7072         if (is_initial_xendomain())
7073                 dmi_scan_machine();
7074
7075 +       io_delay_init();
7076 +
7077  #ifdef CONFIG_X86_GENERICARCH
7078         generic_apic_probe();
7079 -#endif
7080 -       if (efi_enabled)
7081 -               efi_map_memmap();
7082 +#endif
7083
7084         set_iopl.iopl = 1;
7085         WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
7086 @@ -827,7 +997,7 @@ void __init setup_arch(char **cmdline_p)
7087         acpi_boot_table_init();
7088  #endif
7089
7090 -#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
7091 +#ifndef CONFIG_XEN
7092         early_quirks();
7093  #endif
7094
7095 @@ -873,3 +1043,30 @@ xen_panic_event(struct notifier_block *t
7096         /* we're never actually going to get here... */
7097         return NOTIFY_DONE;
7098  }
7099 +
7100 +/*
7101 + * Request address space for all standard resources
7102 + *
7103 + * This is called just before pcibios_init(), which is also a
7104 + * subsys_initcall, but is linked in later (in arch/x86/pci/common.c).
7105 + */
7106 +static int __init request_standard_resources(void)
7107 +{
7108 +       int i;
7109 +
7110 +       /* Nothing to do if not running in dom0. */
7111 +       if (!is_initial_xendomain())
7112 +               return 0;
7113 +
7114 +       printk(KERN_INFO "Setting up standard PCI resources\n");
7115 +       init_iomem_resources(&code_resource, &data_resource, &bss_resource);
7116 +
7117 +       request_resource(&iomem_resource, &video_ram_resource);
7118 +
7119 +       /* request I/O space for devices used on all i[345]86 PCs */
7120 +       for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
7121 +               request_resource(&ioport_resource, &standard_io_resources[i]);
7122 +       return 0;
7123 +}
7124 +
7125 +subsys_initcall(request_standard_resources);
7126 --- a/arch/x86/kernel/setup_64-xen.c
7127 +++ b/arch/x86/kernel/setup_64-xen.c
7128 @@ -15,7 +15,6 @@
7129  #include <linux/ptrace.h>
7130  #include <linux/slab.h>
7131  #include <linux/user.h>
7132 -#include <linux/a.out.h>
7133  #include <linux/screen_info.h>
7134  #include <linux/ioport.h>
7135  #include <linux/delay.h>
7136 @@ -30,6 +29,7 @@
7137  #include <linux/crash_dump.h>
7138  #include <linux/root_dev.h>
7139  #include <linux/pci.h>
7140 +#include <linux/efi.h>
7141  #include <linux/acpi.h>
7142  #include <linux/kallsyms.h>
7143  #include <linux/edd.h>
7144 @@ -39,10 +39,13 @@
7145  #include <linux/dmi.h>
7146  #include <linux/dma-mapping.h>
7147  #include <linux/ctype.h>
7148 +#include <linux/uaccess.h>
7149 +#include <linux/init_ohci1394_dma.h>
7150
7151  #include <asm/mtrr.h>
7152  #include <asm/uaccess.h>
7153  #include <asm/system.h>
7154 +#include <asm/vsyscall.h>
7155  #include <asm/io.h>
7156  #include <asm/smp.h>
7157  #include <asm/msr.h>
7158 @@ -50,6 +53,7 @@
7159  #include <video/edid.h>
7160  #include <asm/e820.h>
7161  #include <asm/dma.h>
7162 +#include <asm/gart.h>
7163  #include <asm/mpspec.h>
7164  #include <asm/mmu_context.h>
7165  #include <asm/proto.h>
7166 @@ -59,6 +63,9 @@
7167  #include <asm/sections.h>
7168  #include <asm/dmi.h>
7169  #include <asm/cacheflush.h>
7170 +#include <asm/mce.h>
7171 +#include <asm/ds.h>
7172 +#include <asm/topology.h>
7173  #ifdef CONFIG_XEN
7174  #include <linux/percpu.h>
7175  #include <xen/interface/physdev.h>
7176 @@ -108,6 +115,8 @@ EXPORT_SYMBOL(xen_start_info);
7177  struct cpuinfo_x86 boot_cpu_data __read_mostly;
7178  EXPORT_SYMBOL(boot_cpu_data);
7179
7180 +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
7181 +
7182  unsigned long mmu_cr4_features;
7183
7184  /* Boot loader ID as an integer, for the benefit of proc_dointvec */
7185 @@ -117,7 +126,7 @@ unsigned long saved_video_mode;
7186
7187  int force_mwait __cpuinitdata;
7188
7189 -/*
7190 +/*
7191   * Early DMI memory
7192   */
7193  int dmi_alloc_index;
7194 @@ -163,25 +172,27 @@ struct resource standard_io_resources[]
7195
7196  #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
7197
7198 -struct resource data_resource = {
7199 +static struct resource data_resource = {
7200         .name = "Kernel data",
7201         .start = 0,
7202         .end = 0,
7203         .flags = IORESOURCE_RAM,
7204  };
7205 -struct resource code_resource = {
7206 +static struct resource code_resource = {
7207         .name = "Kernel code",
7208         .start = 0,
7209         .end = 0,
7210         .flags = IORESOURCE_RAM,
7211  };
7212 -struct resource bss_resource = {
7213 +static struct resource bss_resource = {
7214         .name = "Kernel bss",
7215         .start = 0,
7216         .end = 0,
7217         .flags = IORESOURCE_RAM,
7218  };
7219
7220 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
7221 +
7222  #ifdef CONFIG_PROC_VMCORE
7223  /* elfcorehdr= specifies the location of elf core header
7224   * stored by the crashed kernel. This option will be passed
7225 @@ -205,9 +216,10 @@ contig_initmem_init(unsigned long start_
7226         unsigned long bootmap_size, bootmap;
7227
7228         bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
7229 -       bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
7230 +       bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
7231 +                                PAGE_SIZE);
7232         if (bootmap == -1L)
7233 -               panic("Cannot find bootmem map of size %ld\n",bootmap_size);
7234 +               panic("Cannot find bootmem map of size %ld\n", bootmap_size);
7235         bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
7236         e820_register_active_regions(0, start_pfn, end_pfn);
7237  #ifdef CONFIG_XEN
7238 @@ -215,8 +227,8 @@ contig_initmem_init(unsigned long start_
7239  #else
7240         free_bootmem_with_active_regions(0, end_pfn);
7241  #endif
7242 -       reserve_bootmem(bootmap, bootmap_size);
7243 -}
7244 +       reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
7245 +}
7246  #endif
7247
7248  #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
7249 @@ -249,27 +261,35 @@ static inline void copy_edd(void)
7250  #ifndef CONFIG_XEN
7251  static void __init reserve_crashkernel(void)
7252  {
7253 -       unsigned long long free_mem;
7254 +       unsigned long long total_mem;
7255         unsigned long long crash_size, crash_base;
7256         int ret;
7257
7258 -       free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
7259 +       total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
7260
7261 -       ret = parse_crashkernel(boot_command_line, free_mem,
7262 +       ret = parse_crashkernel(boot_command_line, total_mem,
7263                         &crash_size, &crash_base);
7264         if (ret == 0 && crash_size) {
7265 -               if (crash_base > 0) {
7266 -                       printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
7267 -                                       "for crashkernel (System RAM: %ldMB)\n",
7268 -                                       (unsigned long)(crash_size >> 20),
7269 -                                       (unsigned long)(crash_base >> 20),
7270 -                                       (unsigned long)(free_mem >> 20));
7271 -                       crashk_res.start = crash_base;
7272 -                       crashk_res.end   = crash_base + crash_size - 1;
7273 -                       reserve_bootmem(crash_base, crash_size);
7274 -               } else
7275 +               if (crash_base <= 0) {
7276                         printk(KERN_INFO "crashkernel reservation failed - "
7277                                         "you have to specify a base address\n");
7278 +                       return;
7279 +               }
7280 +
7281 +               if (reserve_bootmem(crash_base, crash_size,
7282 +                                       BOOTMEM_EXCLUSIVE) < 0) {
7283 +                       printk(KERN_INFO "crashkernel reservation failed - "
7284 +                                       "memory is in use\n");
7285 +                       return;
7286 +               }
7287 +
7288 +               printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
7289 +                               "for crashkernel (System RAM: %ldMB)\n",
7290 +                               (unsigned long)(crash_size >> 20),
7291 +                               (unsigned long)(crash_base >> 20),
7292 +                               (unsigned long)(total_mem >> 20));
7293 +               crashk_res.start = crash_base;
7294 +               crashk_res.end   = crash_base + crash_size - 1;
7295         }
7296  }
7297  #else
7298 @@ -280,37 +300,21 @@ static inline void __init reserve_crashk
7299  {}
7300  #endif
7301
7302 -#ifndef CONFIG_XEN
7303 -#define EBDA_ADDR_POINTER 0x40E
7304 -
7305 -unsigned __initdata ebda_addr;
7306 -unsigned __initdata ebda_size;
7307 -
7308 -static void discover_ebda(void)
7309 +/* Overridden in paravirt.c if CONFIG_PARAVIRT */
7310 +void __attribute__((weak)) __init memory_setup(void)
7311  {
7312 -       /*
7313 -        * there is a real-mode segmented pointer pointing to the
7314 -        * 4K EBDA area at 0x40E
7315 -        */
7316 -       ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
7317 -       ebda_addr <<= 4;
7318 -
7319 -       ebda_size = *(unsigned short *)__va(ebda_addr);
7320 -
7321 -       /* Round EBDA up to pages */
7322 -       if (ebda_size == 0)
7323 -               ebda_size = 1;
7324 -       ebda_size <<= 10;
7325 -       ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
7326 -       if (ebda_size > 64*1024)
7327 -               ebda_size = 64*1024;
7328 +       machine_specific_memory_setup();
7329  }
7330 -#else
7331 -#define discover_ebda() ((void)0)
7332 -#endif
7333
7334 +/*
7335 + * setup_arch - architecture-specific boot-time initializations
7336 + *
7337 + * Note: On x86_64, fixmaps are ready for use even before this is called.
7338 + */
7339  void __init setup_arch(char **cmdline_p)
7340  {
7341 +       unsigned i;
7342 +
7343  #ifdef CONFIG_XEN
7344         extern struct e820map machine_e820;
7345
7346 @@ -319,6 +323,11 @@ void __init setup_arch(char **cmdline_p)
7347         /* Register a call for panic conditions. */
7348         atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
7349
7350 +       WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
7351 +                                    VMASST_TYPE_writable_pagetables));
7352 +
7353 +       early_ioremap_init();
7354 +
7355         ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
7356         screen_info = boot_params.screen_info;
7357
7358 @@ -335,11 +344,6 @@ void __init setup_arch(char **cmdline_p)
7359                 screen_info.orig_video_isVGA = 0;
7360
7361         copy_edid();
7362 -
7363 -       WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
7364 -                                    VMASST_TYPE_writable_pagetables));
7365 -
7366 -       ARCH_SETUP
7367  #else
7368         printk(KERN_INFO "Command line: %s\n", boot_command_line);
7369
7370 @@ -355,7 +359,15 @@ void __init setup_arch(char **cmdline_p)
7371         rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
7372         rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
7373  #endif
7374 -       setup_memory_region();
7375 +#ifdef CONFIG_EFI
7376 +       if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
7377 +                    "EL64", 4))
7378 +               efi_enabled = 1;
7379 +#endif
7380 +
7381 +       ARCH_SETUP
7382 +
7383 +       memory_setup();
7384         copy_edd();
7385
7386         if (!boot_params.hdr.root_flags)
7387 @@ -379,28 +391,51 @@ void __init setup_arch(char **cmdline_p)
7388
7389         parse_early_param();
7390
7391 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
7392 +       if (init_ohci1394_dma_early)
7393 +               init_ohci1394_dma_on_all_controllers();
7394 +#endif
7395 +
7396         finish_e820_parsing();
7397
7398 +       early_gart_iommu_check();
7399 +
7400         e820_register_active_regions(0, 0, -1UL);
7401         /*
7402          * partially used pages are not usable - thus
7403          * we are rounding upwards:
7404          */
7405         end_pfn = e820_end_of_ram();
7406 +       /* update e820 for memory not covered by WB MTRRs */
7407 +       mtrr_bp_init();
7408 +#ifndef CONFIG_XEN
7409 +       if (mtrr_trim_uncached_memory(end_pfn)) {
7410 +               e820_register_active_regions(0, 0, -1UL);
7411 +               end_pfn = e820_end_of_ram();
7412 +       }
7413 +#endif
7414 +
7415         num_physpages = end_pfn;
7416 +       max_mapnr = end_pfn;
7417
7418         check_efer();
7419
7420 -       discover_ebda();
7421 -
7422         init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
7423 +       if (efi_enabled)
7424 +               efi_init();
7425
7426         if (is_initial_xendomain())
7427                 dmi_scan_machine();
7428
7429 +       io_delay_init();
7430 +
7431  #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
7432 -       /* setup to use the static apicid table during kernel startup */
7433 -       x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init;
7434 +       /* setup to use the early static init tables during kernel startup */
7435 +       x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
7436 +       x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
7437 +#ifdef CONFIG_NUMA
7438 +       x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
7439 +#endif
7440  #endif
7441
7442         /* How many end-of-memory variables you have, grandma! */
7443 @@ -419,54 +454,25 @@ void __init setup_arch(char **cmdline_p)
7444  #endif
7445
7446  #ifdef CONFIG_NUMA
7447 -       numa_initmem_init(0, end_pfn);
7448 +       numa_initmem_init(0, end_pfn);
7449  #else
7450         contig_initmem_init(0, end_pfn);
7451  #endif
7452
7453 -#ifdef CONFIG_XEN
7454 -       /*
7455 -        * Reserve kernel, physmap, start info, initial page tables, and
7456 -        * direct mapping.
7457 -        */
7458 -       reserve_bootmem_generic(__pa_symbol(&_text),
7459 -                               (table_end << PAGE_SHIFT) - __pa_symbol(&_text));
7460 -#else
7461 -       /* Reserve direct mapping */
7462 -       reserve_bootmem_generic(table_start << PAGE_SHIFT,
7463 -                               (table_end - table_start) << PAGE_SHIFT);
7464 -
7465 -       /* reserve kernel */
7466 -       reserve_bootmem_generic(__pa_symbol(&_text),
7467 -                               __pa_symbol(&_end) - __pa_symbol(&_text));
7468 +       early_res_to_bootmem();
7469
7470 +#ifndef CONFIG_XEN
7471 +#ifdef CONFIG_ACPI_SLEEP
7472         /*
7473 -        * reserve physical page 0 - it's a special BIOS page on many boxes,
7474 -        * enabling clean reboots, SMP operation, laptop functions.
7475 +        * Reserve low memory region for sleep support.
7476          */
7477 -       reserve_bootmem_generic(0, PAGE_SIZE);
7478 -
7479 -       /* reserve ebda region */
7480 -       if (ebda_addr)
7481 -               reserve_bootmem_generic(ebda_addr, ebda_size);
7482 -#ifdef CONFIG_NUMA
7483 -       /* reserve nodemap region */
7484 -       if (nodemap_addr)
7485 -               reserve_bootmem_generic(nodemap_addr, nodemap_size);
7486 +       acpi_reserve_bootmem();
7487  #endif
7488
7489 -#ifdef CONFIG_SMP
7490 -       /* Reserve SMP trampoline */
7491 -       reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
7492 -#endif
7493 +       if (efi_enabled)
7494 +               efi_reserve_bootmem();
7495  #endif
7496
7497 -#ifdef CONFIG_ACPI_SLEEP
7498 -       /*
7499 -        * Reserve low memory region for sleep support.
7500 -        */
7501 -       acpi_reserve_bootmem();
7502 -#endif
7503  #ifdef CONFIG_BLK_DEV_INITRD
7504  #ifdef CONFIG_XEN
7505         if (xen_start_info->mod_start) {
7506 @@ -490,6 +496,8 @@ void __init setup_arch(char **cmdline_p)
7507                         initrd_below_start_ok = 1;
7508  #endif
7509                 } else {
7510 +                       /* Assumes everything on node 0 */
7511 +                       free_bootmem(ramdisk_image, ramdisk_size);
7512                         printk(KERN_ERR "initrd extends beyond end of memory "
7513                                "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
7514                                ramdisk_end, end_of_mem);
7515 @@ -499,10 +507,11 @@ void __init setup_arch(char **cmdline_p)
7516  #endif
7517         reserve_crashkernel();
7518         paging_init();
7519 +       map_vsyscall();
7520  #ifdef CONFIG_X86_LOCAL_APIC
7521         /*
7522 -        * Find and reserve possible boot-time SMP configuration:
7523 -        */
7524 +       * Find and reserve possible boot-time SMP configuration:
7525 +       */
7526         find_smp_config();
7527  #endif
7528  #ifdef CONFIG_XEN
7529 @@ -590,16 +599,10 @@ void __init setup_arch(char **cmdline_p)
7530  #endif
7531  #endif
7532
7533 -#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
7534 +#ifndef CONFIG_XEN
7535         early_quirks();
7536  #endif
7537
7538 -       /*
7539 -        * set this early, so we dont allocate cpu0
7540 -        * if MADT list doesnt list BSP first
7541 -        * mpparse.c/MP_processor_info() allocates logical cpu numbers.
7542 -        */
7543 -       cpu_set(0, cpu_present_map);
7544  #ifdef CONFIG_ACPI
7545         /*
7546          * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
7547 @@ -623,6 +626,7 @@ void __init setup_arch(char **cmdline_p)
7548                 get_smp_config();
7549  #ifndef CONFIG_XEN
7550         init_apic_mappings();
7551 +       ioapic_init_mappings();
7552  #endif
7553  #endif
7554  #if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
7555 @@ -634,18 +638,17 @@ void __init setup_arch(char **cmdline_p)
7556          */
7557  #ifdef CONFIG_XEN
7558         if (is_initial_xendomain())
7559 -               e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
7560 +               e820_reserve_resources(machine_e820.map, machine_e820.nr_map,
7561 +                                      &code_resource, &data_resource, &bss_resource);
7562  #else
7563 -       e820_reserve_resources(e820.map, e820.nr_map);
7564 +       e820_reserve_resources(e820.map, e820.nr_map,
7565 +                              &code_resource, &data_resource, &bss_resource);
7566         e820_mark_nosave_regions();
7567  #endif
7568
7569 -       {
7570 -       unsigned i;
7571         /* request I/O space for devices used on all i[345]86 PCs */
7572         for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
7573                 request_resource(&ioport_resource, &standard_io_resources[i]);
7574 -       }
7575
7576  #ifdef CONFIG_XEN
7577         if (is_initial_xendomain())
7578 @@ -679,7 +682,8 @@ void __init setup_arch(char **cmdline_p)
7579
7580  #ifdef CONFIG_VT
7581  #if defined(CONFIG_VGA_CONSOLE)
7582 -       conswitchp = &vga_con;
7583 +       if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
7584 +               conswitchp = &vga_con;
7585  #elif defined(CONFIG_DUMMY_CONSOLE)
7586         conswitchp = &dummy_con;
7587  #endif
7588 @@ -723,9 +727,10 @@ static void __cpuinit display_cacheinfo(
7589
7590         if (n >= 0x80000005) {
7591                 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
7592 -               printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
7593 -                       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
7594 -               c->x86_cache_size=(ecx>>24)+(edx>>24);
7595 +               printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
7596 +                      "D cache %dK (%d bytes/line)\n",
7597 +                      edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
7598 +               c->x86_cache_size = (ecx>>24) + (edx>>24);
7599                 /* On K8 L1 TLB is inclusive, so don't count it */
7600                 c->x86_tlbsize = 0;
7601         }
7602 @@ -739,27 +744,25 @@ static void __cpuinit display_cacheinfo(
7603                 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
7604                 c->x86_cache_size, ecx & 0xFF);
7605         }
7606 -
7607 -       if (n >= 0x80000007)
7608 -               cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
7609         if (n >= 0x80000008) {
7610 -               cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
7611 +               cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
7612                 c->x86_virt_bits = (eax >> 8) & 0xff;
7613                 c->x86_phys_bits = eax & 0xff;
7614         }
7615  }
7616
7617  #ifdef CONFIG_NUMA
7618 -static int nearby_node(int apicid)
7619 +static int __cpuinit nearby_node(int apicid)
7620  {
7621 -       int i;
7622 +       int i, node;
7623 +
7624         for (i = apicid - 1; i >= 0; i--) {
7625 -               int node = apicid_to_node[i];
7626 +               node = apicid_to_node[i];
7627                 if (node != NUMA_NO_NODE && node_online(node))
7628                         return node;
7629         }
7630         for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
7631 -               int node = apicid_to_node[i];
7632 +               node = apicid_to_node[i];
7633                 if (node != NUMA_NO_NODE && node_online(node))
7634                         return node;
7635         }
7636 @@ -771,7 +774,7 @@ static int nearby_node(int apicid)
7637   * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
7638   * Assumes number of cores is a power of two.
7639   */
7640 -static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
7641 +static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
7642  {
7643  #ifdef CONFIG_SMP
7644         unsigned bits;
7645 @@ -780,7 +783,54 @@ static void __init amd_detect_cmp(struct
7646         int node = 0;
7647         unsigned apicid = hard_smp_processor_id();
7648  #endif
7649 -       unsigned ecx = cpuid_ecx(0x80000008);
7650 +       bits = c->x86_coreid_bits;
7651 +
7652 +       /* Low order bits define the core id (index of core in socket) */
7653 +       c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
7654 +       /* Convert the APIC ID into the socket ID */
7655 +       c->phys_proc_id = phys_pkg_id(bits);
7656 +
7657 +#ifdef CONFIG_NUMA
7658 +       node = c->phys_proc_id;
7659 +       if (apicid_to_node[apicid] != NUMA_NO_NODE)
7660 +               node = apicid_to_node[apicid];
7661 +       if (!node_online(node)) {
7662 +               /* Two possibilities here:
7663 +                  - The CPU is missing memory and no node was created.
7664 +                  In that case try picking one from a nearby CPU
7665 +                  - The APIC IDs differ from the HyperTransport node IDs
7666 +                  which the K8 northbridge parsing fills in.
7667 +                  Assume they are all increased by a constant offset,
7668 +                  but in the same order as the HT nodeids.
7669 +                  If that doesn't result in a usable node fall back to the
7670 +                  path for the previous case.  */
7671 +
7672 +               int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
7673 +
7674 +               if (ht_nodeid >= 0 &&
7675 +                   apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
7676 +                       node = apicid_to_node[ht_nodeid];
7677 +               /* Pick a nearby node */
7678 +               if (!node_online(node))
7679 +                       node = nearby_node(apicid);
7680 +       }
7681 +       numa_set_node(cpu, node);
7682 +
7683 +       printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
7684 +#endif
7685 +#endif
7686 +}
7687 +
7688 +static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
7689 +{
7690 +#ifdef CONFIG_SMP
7691 +       unsigned bits, ecx;
7692 +
7693 +       /* Multi core CPU? */
7694 +       if (c->extended_cpuid_level < 0x80000008)
7695 +               return;
7696 +
7697 +       ecx = cpuid_ecx(0x80000008);
7698
7699         c->x86_max_cores = (ecx & 0xff) + 1;
7700
7701 @@ -793,37 +843,8 @@ static void __init amd_detect_cmp(struct
7702                         bits++;
7703         }
7704
7705 -       /* Low order bits define the core id (index of core in socket) */
7706 -       c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
7707 -       /* Convert the APIC ID into the socket ID */
7708 -       c->phys_proc_id = phys_pkg_id(bits);
7709 -
7710 -#ifdef CONFIG_NUMA
7711 -       node = c->phys_proc_id;
7712 -       if (apicid_to_node[apicid] != NUMA_NO_NODE)
7713 -               node = apicid_to_node[apicid];
7714 -       if (!node_online(node)) {
7715 -               /* Two possibilities here:
7716 -                  - The CPU is missing memory and no node was created.
7717 -                  In that case try picking one from a nearby CPU
7718 -                  - The APIC IDs differ from the HyperTransport node IDs
7719 -                  which the K8 northbridge parsing fills in.
7720 -                  Assume they are all increased by a constant offset,
7721 -                  but in the same order as the HT nodeids.
7722 -                  If that doesn't result in a usable node fall back to the
7723 -                  path for the previous case.  */
7724 -               int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
7725 -               if (ht_nodeid >= 0 &&
7726 -                   apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
7727 -                       node = apicid_to_node[ht_nodeid];
7728 -               /* Pick a nearby node */
7729 -               if (!node_online(node))
7730 -                       node = nearby_node(apicid);
7731 -       }
7732 -       numa_set_node(cpu, node);
7733 +       c->x86_coreid_bits = bits;
7734
7735 -       printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
7736 -#endif
7737  #endif
7738  }
7739
7740 @@ -840,8 +861,8 @@ static void __init amd_detect_cmp(struct
7741  /* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
7742  static __cpuinit int amd_apic_timer_broken(void)
7743  {
7744 -       u32 lo, hi;
7745 -       u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
7746 +       u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
7747 +
7748         switch (eax & CPUID_XFAM) {
7749         case CPUID_XFAM_K8:
7750                 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
7751 @@ -860,6 +881,15 @@ static __cpuinit int amd_apic_timer_brok
7752  }
7753  #endif
7754
7755 +static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
7756 +{
7757 +       early_init_amd_mc(c);
7758 +
7759 +       /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
7760 +       if (c->x86_power & (1<<8))
7761 +               set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
7762 +}
7763 +
7764  static void __cpuinit init_amd(struct cpuinfo_x86 *c)
7765  {
7766         unsigned level;
7767 @@ -870,7 +900,7 @@ static void __cpuinit init_amd(struct cp
7768         /*
7769          * Disable TLB flush filter by setting HWCR.FFDIS on K8
7770          * bit 6 of msr C001_0015
7771 -        *
7772 +        *
7773          * Errata 63 for SH-B3 steppings
7774          * Errata 122 for all steppings (F+ have it disabled by default)
7775          */
7776 @@ -883,35 +913,32 @@ static void __cpuinit init_amd(struct cp
7777
7778         /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
7779            3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
7780 -       clear_bit(0*32+31, &c->x86_capability);
7781 -
7782 +       clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
7783 +
7784         /* On C+ stepping K8 rep microcode works well for copy/memset */
7785         level = cpuid_eax(1);
7786 -       if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
7787 -               set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7788 +       if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
7789 +                            level >= 0x0f58))
7790 +               set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7791         if (c->x86 == 0x10 || c->x86 == 0x11)
7792 -               set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7793 +               set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7794
7795         /* Enable workaround for FXSAVE leak */
7796         if (c->x86 >= 6)
7797 -               set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
7798 +               set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
7799
7800         level = get_model_name(c);
7801         if (!level) {
7802 -               switch (c->x86) {
7803 +               switch (c->x86) {
7804                 case 15:
7805                         /* Should distinguish Models here, but this is only
7806                            a fallback anyways. */
7807                         strcpy(c->x86_model_id, "Hammer");
7808 -                       break;
7809 -               }
7810 -       }
7811 +                       break;
7812 +               }
7813 +       }
7814         display_cacheinfo(c);
7815
7816 -       /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
7817 -       if (c->x86_power & (1<<8))
7818 -               set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
7819 -
7820         /* Multi core CPU? */
7821         if (c->extended_cpuid_level >= 0x80000008)
7822                 amd_detect_cmp(c);
7823 @@ -923,14 +950,10 @@ static void __cpuinit init_amd(struct cp
7824                 num_cache_leaves = 3;
7825
7826         if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
7827 -               set_bit(X86_FEATURE_K8, &c->x86_capability);
7828 -
7829 -       /* RDTSC can be speculated around */
7830 -       clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7831 +               set_cpu_cap(c, X86_FEATURE_K8);
7832
7833 -       /* Family 10 doesn't support C states in MWAIT so don't use it */
7834 -       if (c->x86 == 0x10 && !force_mwait)
7835 -               clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
7836 +       /* MFENCE stops RDTSC speculation */
7837 +       set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
7838
7839  #ifndef CONFIG_XEN
7840         if (amd_apic_timer_broken())
7841 @@ -938,28 +961,29 @@ static void __cpuinit init_amd(struct cp
7842  #endif
7843  }
7844
7845 -static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
7846 +void __cpuinit detect_ht(struct cpuinfo_x86 *c)
7847  {
7848  #ifdef CONFIG_SMP
7849 -       u32     eax, ebx, ecx, edx;
7850 -       int     index_msb, core_bits;
7851 +       u32 eax, ebx, ecx, edx;
7852 +       int index_msb, core_bits;
7853
7854         cpuid(1, &eax, &ebx, &ecx, &edx);
7855
7856
7857         if (!cpu_has(c, X86_FEATURE_HT))
7858                 return;
7859 -       if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
7860 +       if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
7861                 goto out;
7862
7863         smp_num_siblings = (ebx & 0xff0000) >> 16;
7864
7865         if (smp_num_siblings == 1) {
7866                 printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
7867 -       } else if (smp_num_siblings > 1 ) {
7868 +       } else if (smp_num_siblings > 1) {
7869
7870                 if (smp_num_siblings > NR_CPUS) {
7871 -                       printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
7872 +                       printk(KERN_WARNING "CPU: Unsupported number of "
7873 +                              "siblings %d", smp_num_siblings);
7874                         smp_num_siblings = 1;
7875                         return;
7876                 }
7877 @@ -969,7 +993,7 @@ static void __cpuinit detect_ht(struct c
7878
7879                 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
7880
7881 -               index_msb = get_count_order(smp_num_siblings) ;
7882 +               index_msb = get_count_order(smp_num_siblings);
7883
7884                 core_bits = get_count_order(c->x86_max_cores);
7885
7886 @@ -978,8 +1002,10 @@ static void __cpuinit detect_ht(struct c
7887         }
7888  out:
7889         if ((c->x86_max_cores * smp_num_siblings) > 1) {
7890 -               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
7891 -               printk(KERN_INFO  "CPU: Processor Core ID: %d\n", c->cpu_core_id);
7892 +               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
7893 +                      c->phys_proc_id);
7894 +               printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
7895 +                      c->cpu_core_id);
7896         }
7897
7898  #endif
7899 @@ -1003,7 +1029,7 @@ static int __cpuinit intel_num_cpu_cores
7900                 return 1;
7901  }
7902
7903 -static void srat_detect_node(void)
7904 +static void __cpuinit srat_detect_node(void)
7905  {
7906  #ifdef CONFIG_NUMA
7907         unsigned node;
7908 @@ -1013,7 +1039,7 @@ static void srat_detect_node(void)
7909         /* Don't do the funky fallback heuristics the AMD version employs
7910            for now. */
7911         node = apicid_to_node[apicid];
7912 -       if (node == NUMA_NO_NODE)
7913 +       if (node == NUMA_NO_NODE || !node_online(node))
7914                 node = first_node(node_online_map);
7915         numa_set_node(cpu, node);
7916
7917 @@ -1021,28 +1047,39 @@ static void srat_detect_node(void)
7918  #endif
7919  }
7920
7921 +static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
7922 +{
7923 +       if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
7924 +           (c->x86 == 0x6 && c->x86_model >= 0x0e))
7925 +               set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
7926 +}
7927 +
7928  static void __cpuinit init_intel(struct cpuinfo_x86 *c)
7929  {
7930         /* Cache sizes */
7931         unsigned n;
7932
7933         init_intel_cacheinfo(c);
7934 -       if (c->cpuid_level > 9 ) {
7935 +       if (c->cpuid_level > 9) {
7936                 unsigned eax = cpuid_eax(10);
7937                 /* Check for version and the number of counters */
7938                 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
7939 -                       set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
7940 +                       set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
7941         }
7942
7943         if (cpu_has_ds) {
7944                 unsigned int l1, l2;
7945                 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
7946                 if (!(l1 & (1<<11)))
7947 -                       set_bit(X86_FEATURE_BTS, c->x86_capability);
7948 +                       set_cpu_cap(c, X86_FEATURE_BTS);
7949                 if (!(l1 & (1<<12)))
7950 -                       set_bit(X86_FEATURE_PEBS, c->x86_capability);
7951 +                       set_cpu_cap(c, X86_FEATURE_PEBS);
7952         }
7953
7954 +
7955 +       if (cpu_has_bts)
7956 +               ds_init_intel(c);
7957 +
7958         n = c->extended_cpuid_level;
7959         if (n >= 0x80000008) {
7960                 unsigned eax = cpuid_eax(0x80000008);
7961 @@ -1059,14 +1096,11 @@ static void __cpuinit init_intel(struct
7962                 c->x86_cache_alignment = c->x86_clflush_size * 2;
7963         if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
7964             (c->x86 == 0x6 && c->x86_model >= 0x0e))
7965 -               set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
7966 +               set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
7967         if (c->x86 == 6)
7968 -               set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7969 -       if (c->x86 == 15)
7970 -               set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7971 -       else
7972 -               clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7973 -       c->x86_max_cores = intel_num_cpu_cores(c);
7974 +               set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7975 +       set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
7976 +       c->x86_max_cores = intel_num_cpu_cores(c);
7977
7978         srat_detect_node();
7979  }
7980 @@ -1083,18 +1117,12 @@ static void __cpuinit get_cpu_vendor(str
7981                 c->x86_vendor = X86_VENDOR_UNKNOWN;
7982  }
7983
7984 -struct cpu_model_info {
7985 -       int vendor;
7986 -       int family;
7987 -       char *model_names[16];
7988 -};
7989 -
7990  /* Do some early cpuid on the boot CPU to get some parameter that are
7991     needed before check_bugs. Everything advanced is in identify_cpu
7992     below. */
7993 -void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
7994 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
7995  {
7996 -       u32 tfms;
7997 +       u32 tfms, xlvl;
7998
7999         c->loops_per_jiffy = loops_per_jiffy;
8000         c->x86_cache_size = -1;
8001 @@ -1105,6 +1133,7 @@ void __cpuinit early_identify_cpu(struct
8002         c->x86_clflush_size = 64;
8003         c->x86_cache_alignment = c->x86_clflush_size;
8004         c->x86_max_cores = 1;
8005 +       c->x86_coreid_bits = 0;
8006         c->extended_cpuid_level = 0;
8007         memset(&c->x86_capability, 0, sizeof c->x86_capability);
8008
8009 @@ -1113,7 +1142,7 @@ void __cpuinit early_identify_cpu(struct
8010               (unsigned int *)&c->x86_vendor_id[0],
8011               (unsigned int *)&c->x86_vendor_id[8],
8012               (unsigned int *)&c->x86_vendor_id[4]);
8013 -
8014 +
8015         get_cpu_vendor(c);
8016
8017         /* Initialize the standard set of capabilities */
8018 @@ -1131,7 +1160,7 @@ void __cpuinit early_identify_cpu(struct
8019                         c->x86 += (tfms >> 20) & 0xff;
8020                 if (c->x86 >= 0x6)
8021                         c->x86_model += ((tfms >> 16) & 0xF) << 4;
8022 -               if (c->x86_capability[0] & (1<<19))
8023 +               if (c->x86_capability[0] & (1<<19))
8024                         c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
8025         } else {
8026                 /* Have CPUID level 0 only - unheard of */
8027 @@ -1141,18 +1170,6 @@ void __cpuinit early_identify_cpu(struct
8028  #ifdef CONFIG_SMP
8029         c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
8030  #endif
8031 -}
8032 -
8033 -/*
8034 - * This does the hard work of actually picking apart the CPU stuff...
8035 - */
8036 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
8037 -{
8038 -       int i;
8039 -       u32 xlvl;
8040 -
8041 -       early_identify_cpu(c);
8042 -
8043         /* AMD-defined flags: level 0x80000001 */
8044         xlvl = cpuid_eax(0x80000000);
8045         c->extended_cpuid_level = xlvl;
8046 @@ -1173,6 +1190,30 @@ void __cpuinit identify_cpu(struct cpuin
8047                         c->x86_capability[2] = cpuid_edx(0x80860001);
8048         }
8049
8050 +       c->extended_cpuid_level = cpuid_eax(0x80000000);
8051 +       if (c->extended_cpuid_level >= 0x80000007)
8052 +               c->x86_power = cpuid_edx(0x80000007);
8053 +
8054 +       switch (c->x86_vendor) {
8055 +       case X86_VENDOR_AMD:
8056 +               early_init_amd(c);
8057 +               break;
8058 +       case X86_VENDOR_INTEL:
8059 +               early_init_intel(c);
8060 +               break;
8061 +       }
8062 +
8063 +}
8064 +
8065 +/*
8066 + * This does the hard work of actually picking apart the CPU stuff...
8067 + */
8068 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
8069 +{
8070 +       int i;
8071 +
8072 +       early_identify_cpu(c);
8073 +
8074         init_scattered_cpuid_features(c);
8075
8076         c->apicid = phys_pkg_id(0);
8077 @@ -1202,8 +1243,7 @@ void __cpuinit identify_cpu(struct cpuin
8078                 break;
8079         }
8080
8081 -       select_idle_routine(c);
8082 -       detect_ht(c);
8083 +       detect_ht(c);
8084
8085         /*
8086          * On SMP, boot_cpu_data holds the common feature set between
8087 @@ -1213,31 +1253,55 @@ void __cpuinit identify_cpu(struct cpuin
8088          */
8089         if (c != &boot_cpu_data) {
8090                 /* AND the already accumulated flags with these */
8091 -               for (i = 0 ; i < NCAPINTS ; i++)
8092 +               for (i = 0; i < NCAPINTS; i++)
8093                         boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
8094         }
8095
8096 +       /* Clear all flags overridden by options */
8097 +       for (i = 0; i < NCAPINTS; i++)
8098 +               c->x86_capability[i] &= ~cleared_cpu_caps[i];
8099 +
8100  #ifdef CONFIG_X86_MCE
8101         mcheck_init(c);
8102  #endif
8103 +       select_idle_routine(c);
8104 +
8105         if (c != &boot_cpu_data)
8106                 mtrr_ap_init();
8107  #ifdef CONFIG_NUMA
8108         numa_add_cpu(smp_processor_id());
8109  #endif
8110 +
8111  }
8112 -
8113 +
8114 +static __init int setup_noclflush(char *arg)
8115 +{
8116 +       setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
8117 +       return 1;
8118 +}
8119 +__setup("noclflush", setup_noclflush);
8120
8121  void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
8122  {
8123         if (c->x86_model_id[0])
8124 -               printk("%s", c->x86_model_id);
8125 +               printk(KERN_CONT "%s", c->x86_model_id);
8126 +
8127 +       if (c->x86_mask || c->cpuid_level >= 0)
8128 +               printk(KERN_CONT " stepping %02x\n", c->x86_mask);
8129 +       else
8130 +               printk(KERN_CONT "\n");
8131 +}
8132
8133 -       if (c->x86_mask || c->cpuid_level >= 0)
8134 -               printk(" stepping %02x\n", c->x86_mask);
8135 +static __init int setup_disablecpuid(char *arg)
8136 +{
8137 +       int bit;        /* capability bit number given after "clearcpuid=" */
8138 +       if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS*32)
8139 +               setup_clear_cpu_cap(bit);
8140         else
8141 -               printk("\n");
8142 +               return 0;
8143 +       return 1;
8144  }
8145 +__setup("clearcpuid=", setup_disablecpuid);
8146
8147  /*
8148   *     Get CPU information for use by the procfs.
8149 @@ -1246,116 +1310,41 @@ void __cpuinit print_cpu_info(struct cpu
8150  static int show_cpuinfo(struct seq_file *m, void *v)
8151  {
8152         struct cpuinfo_x86 *c = v;
8153 -       int cpu = 0;
8154 -
8155 -       /*
8156 -        * These flag bits must match the definitions in <asm/cpufeature.h>.
8157 -        * NULL means this bit is undefined or reserved; either way it doesn't
8158 -        * have meaning as far as Linux is concerned.  Note that it's important
8159 -        * to realize there is a difference between this table and CPUID -- if
8160 -        * applications want to get the raw CPUID data, they should access
8161 -        * /dev/cpu/<cpu_nr>/cpuid instead.
8162 -        */
8163 -       static const char *const x86_cap_flags[] = {
8164 -               /* Intel-defined */
8165 -               "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
8166 -               "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
8167 -               "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
8168 -               "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
8169 -
8170 -               /* AMD-defined */
8171 -               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8172 -               NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
8173 -               NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
8174 -               NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
8175 -               "3dnowext", "3dnow",
8176 -
8177 -               /* Transmeta-defined */
8178 -               "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
8179 -               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8180 -               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8181 -               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8182 -
8183 -               /* Other (Linux-defined) */
8184 -               "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
8185 -               NULL, NULL, NULL, NULL,
8186 -               "constant_tsc", "up", NULL, "arch_perfmon",
8187 -               "pebs", "bts", NULL, "sync_rdtsc",
8188 -               "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8189 -               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8190 -
8191 -               /* Intel-defined (#2) */
8192 -               "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
8193 -               "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
8194 -               NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
8195 -               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8196 -
8197 -               /* VIA/Cyrix/Centaur-defined */
8198 -               NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
8199 -               "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
8200 -               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8201 -               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8202 -
8203 -               /* AMD-defined (#2) */
8204 -               "lahf_lm", "cmp_legacy", "svm", "extapic",
8205 -               "cr8_legacy", "abm", "sse4a", "misalignsse",
8206 -               "3dnowprefetch", "osvw", "ibs", "sse5",
8207 -               "skinit", "wdt", NULL, NULL,
8208 -               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8209 -               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8210 -
8211 -               /* Auxiliary (Linux-defined) */
8212 -               "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8213 -               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8214 -               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8215 -               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8216 -       };
8217 -       static const char *const x86_power_flags[] = {
8218 -               "ts",   /* temperature sensor */
8219 -               "fid",  /* frequency id control */
8220 -               "vid",  /* voltage id control */
8221 -               "ttp",  /* thermal trip */
8222 -               "tm",
8223 -               "stc",
8224 -               "100mhzsteps",
8225 -               "hwpstate",
8226 -               "",     /* tsc invariant mapped to constant_tsc */
8227 -               /* nothing */
8228 -       };
8229 -
8230 +       int cpu = 0, i;
8231
8232  #ifdef CONFIG_SMP
8233         cpu = c->cpu_index;
8234  #endif
8235
8236 -       seq_printf(m,"processor\t: %u\n"
8237 -                    "vendor_id\t: %s\n"
8238 -                    "cpu family\t: %d\n"
8239 -                    "model\t\t: %d\n"
8240 -                    "model name\t: %s\n",
8241 -                    (unsigned)cpu,
8242 -                    c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
8243 -                    c->x86,
8244 -                    (int)c->x86_model,
8245 -                    c->x86_model_id[0] ? c->x86_model_id : "unknown");
8246 -
8247 +       seq_printf(m, "processor\t: %u\n"
8248 +                  "vendor_id\t: %s\n"
8249 +                  "cpu family\t: %d\n"
8250 +                  "model\t\t: %d\n"
8251 +                  "model name\t: %s\n",
8252 +                  (unsigned)cpu,
8253 +                  c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
8254 +                  c->x86,
8255 +                  (int)c->x86_model,
8256 +                  c->x86_model_id[0] ? c->x86_model_id : "unknown");
8257 +
8258         if (c->x86_mask || c->cpuid_level >= 0)
8259                 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
8260         else
8261                 seq_printf(m, "stepping\t: unknown\n");
8262 -
8263 -       if (cpu_has(c,X86_FEATURE_TSC)) {
8264 +
8265 +       if (cpu_has(c, X86_FEATURE_TSC)) {
8266                 unsigned int freq = cpufreq_quick_get((unsigned)cpu);
8267 +
8268                 if (!freq)
8269                         freq = cpu_khz;
8270                 seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
8271 -                            freq / 1000, (freq % 1000));
8272 +                          freq / 1000, (freq % 1000));
8273         }
8274
8275         /* Cache size */
8276 -       if (c->x86_cache_size >= 0)
8277 +       if (c->x86_cache_size >= 0)
8278                 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
8279 -
8280 +
8281  #ifdef CONFIG_SMP
8282         if (smp_num_siblings * c->x86_max_cores > 1) {
8283                 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
8284 @@ -1364,48 +1353,43 @@ static int show_cpuinfo(struct seq_file
8285                 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
8286                 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
8287         }
8288 -#endif
8289 +#endif
8290
8291         seq_printf(m,
8292 -               "fpu\t\t: yes\n"
8293 -               "fpu_exception\t: yes\n"
8294 -               "cpuid level\t: %d\n"
8295 -               "wp\t\t: yes\n"
8296 -               "flags\t\t:",
8297 +                  "fpu\t\t: yes\n"
8298 +                  "fpu_exception\t: yes\n"
8299 +                  "cpuid level\t: %d\n"
8300 +                  "wp\t\t: yes\n"
8301 +                  "flags\t\t:",
8302                    c->cpuid_level);
8303
8304 -       {
8305 -               int i;
8306 -               for ( i = 0 ; i < 32*NCAPINTS ; i++ )
8307 -                       if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
8308 -                               seq_printf(m, " %s", x86_cap_flags[i]);
8309 -       }
8310 -
8311 +       for (i = 0; i < 32*NCAPINTS; i++)
8312 +               if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
8313 +                       seq_printf(m, " %s", x86_cap_flags[i]);
8314 +
8315         seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
8316                    c->loops_per_jiffy/(500000/HZ),
8317                    (c->loops_per_jiffy/(5000/HZ)) % 100);
8318
8319 -       if (c->x86_tlbsize > 0)
8320 +       if (c->x86_tlbsize > 0)
8321                 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
8322         seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
8323         seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
8324
8325 -       seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
8326 +       seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
8327                    c->x86_phys_bits, c->x86_virt_bits);
8328
8329         seq_printf(m, "power management:");
8330 -       {
8331 -               unsigned i;
8332 -               for (i = 0; i < 32; i++)
8333 -                       if (c->x86_power & (1 << i)) {
8334 -                               if (i < ARRAY_SIZE(x86_power_flags) &&
8335 -                                       x86_power_flags[i])
8336 -                                       seq_printf(m, "%s%s",
8337 -                                               x86_power_flags[i][0]?" ":"",
8338 -                                               x86_power_flags[i]);
8339 -                               else
8340 -                                       seq_printf(m, " [%d]", i);
8341 -                       }
8342 +       for (i = 0; i < 32; i++) {
8343 +               if (c->x86_power & (1 << i)) {
8344 +                       if (i < ARRAY_SIZE(x86_power_flags) &&
8345 +                           x86_power_flags[i])
8346 +                               seq_printf(m, "%s%s",
8347 +                                          x86_power_flags[i][0]?" ":"",
8348 +                                          x86_power_flags[i]);
8349 +                       else
8350 +                               seq_printf(m, " [%d]", i);
8351 +               }
8352         }
8353
8354         seq_printf(m, "\n\n");
8355 @@ -1432,8 +1416,8 @@ static void c_stop(struct seq_file *m, v
8356  {
8357  }
8358
8359 -struct seq_operations cpuinfo_op = {
8360 -       .start =c_start,
8361 +const struct seq_operations cpuinfo_op = {
8362 +       .start = c_start,
8363         .next = c_next,
8364         .stop = c_stop,
8365         .show = show_cpuinfo,
8366 --- a/arch/x86/kernel/setup64-xen.c
8367 +++ b/arch/x86/kernel/setup64-xen.c
8368 @@ -31,7 +31,11 @@
8369  #include <asm/hypervisor.h>
8370  #endif
8371
8372 +#ifndef CONFIG_DEBUG_BOOT_PARAMS
8373  struct boot_params __initdata boot_params;
8374 +#else
8375 +struct boot_params boot_params;
8376 +#endif
8377
8378  cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
8379
8380 @@ -47,6 +51,7 @@ char boot_cpu_stack[IRQSTACKSIZE] __attr
8381
8382  unsigned long __supported_pte_mask __read_mostly = ~0UL;
8383  EXPORT_SYMBOL(__supported_pte_mask);
8384 +
8385  static int do_not_nx __cpuinitdata = 0;
8386
8387  /* noexec=on|off
8388 @@ -90,6 +95,45 @@ static int __init nonx32_setup(char *str
8389  __setup("noexec32=", nonx32_setup);
8390
8391  /*
8392 + * Copy data used in early init routines from the initial arrays to the
8393 + * per cpu data areas.  These arrays then become expendable and the
8394 + * *_early_ptr's are zeroed indicating that the static arrays are gone.
8395 + */
8396 +static void __init setup_per_cpu_maps(void)
8397 +{
8398 +#ifndef CONFIG_XEN
8399 +       int cpu;
8400 +
8401 +       for_each_possible_cpu(cpu) {
8402 +#ifdef CONFIG_SMP
8403 +               if (per_cpu_offset(cpu)) {
8404 +#endif
8405 +                       per_cpu(x86_cpu_to_apicid, cpu) =
8406 +                                               x86_cpu_to_apicid_init[cpu];
8407 +                       per_cpu(x86_bios_cpu_apicid, cpu) =
8408 +                                               x86_bios_cpu_apicid_init[cpu];
8409 +#ifdef CONFIG_NUMA
8410 +                       per_cpu(x86_cpu_to_node_map, cpu) =
8411 +                                               x86_cpu_to_node_map_init[cpu];
8412 +#endif
8413 +#ifdef CONFIG_SMP
8414 +               }
8415 +               else
8416 +                       printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
8417 +                                                                       cpu);
8418 +#endif
8419 +       }
8420 +
8421 +       /* indicate the early static arrays will soon be gone */
8422 +       x86_cpu_to_apicid_early_ptr = NULL;
8423 +       x86_bios_cpu_apicid_early_ptr = NULL;
8424 +#ifdef CONFIG_NUMA
8425 +       x86_cpu_to_node_map_early_ptr = NULL;
8426 +#endif
8427 +#endif
8428 +}
8429 +
8430 +/*
8431   * Great future plan:
8432   * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
8433   * Always point %gs to its beginning
8434 @@ -109,19 +153,24 @@ void __init setup_per_cpu_areas(void)
8435         printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
8436         for_each_cpu_mask (i, cpu_possible_map) {
8437                 char *ptr;
8438 +#ifndef CONFIG_NEED_MULTIPLE_NODES
8439 +               ptr = alloc_bootmem_pages(size);
8440 +#else
8441 +               int node = early_cpu_to_node(i);
8442
8443 -               if (!NODE_DATA(cpu_to_node(i))) {
8444 -                       printk("cpu with no node %d, num_online_nodes %d\n",
8445 -                              i, num_online_nodes());
8446 +               if (!node_online(node) || !NODE_DATA(node))
8447                         ptr = alloc_bootmem_pages(size);
8448 -               } else {
8449 -                       ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
8450 -               }
8451 +               else
8452 +                       ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
8453 +#endif
8454                 if (!ptr)
8455                         panic("Cannot allocate cpu data for CPU %d\n", i);
8456                 cpu_pda(i)->data_offset = ptr - __per_cpu_start;
8457                 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
8458         }
8459 +
8460 +       /* setup percpu data maps early */
8461 +       setup_per_cpu_maps();
8462  }
8463
8464  #ifdef CONFIG_XEN
8465 @@ -224,7 +273,8 @@ void syscall_init(void)
8466         wrmsrl(MSR_CSTAR, ignore_sysret);
8467
8468         /* Flags to clear on syscall */
8469 -       wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
8470 +       wrmsrl(MSR_SYSCALL_MASK,
8471 +              X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
8472  #endif
8473  #ifdef CONFIG_IA32_EMULATION
8474         syscall32_cpu_init ();
8475 @@ -303,7 +353,7 @@ void __cpuinit cpu_init (void)
8476          */
8477  #ifndef CONFIG_XEN
8478         if (cpu)
8479 -               memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
8480 +               memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
8481  #endif
8482
8483         cpu_gdt_descr[cpu].size = GDT_SIZE;
8484 @@ -334,10 +384,10 @@ void __cpuinit cpu_init (void)
8485                                       v, cpu);
8486                 }
8487                 estacks += PAGE_SIZE << order[v];
8488 -               orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
8489 +               orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
8490         }
8491
8492 -       t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
8493 +       t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
8494         /*
8495          * <= is required because the CPU will access up to
8496          * 8 bits beyond the end of the IO permission bitmap.
8497 --- a/arch/x86/kernel/smp_32-xen.c
8498 +++ b/arch/x86/kernel/smp_32-xen.c
8499 @@ -168,7 +168,7 @@ void __send_IPI_shortcut(unsigned int sh
8500         }
8501  }
8502
8503 -void fastcall send_IPI_self(int vector)
8504 +void send_IPI_self(int vector)
8505  {
8506         __send_IPI_shortcut(APIC_DEST_SELF, vector);
8507  }
8508 @@ -224,13 +224,14 @@ static DEFINE_SPINLOCK(tlbstate_lock);
8509   * We need to reload %cr3 since the page tables may be going
8510   * away from under us..
8511   */
8512 -void leave_mm(unsigned long cpu)
8513 +void leave_mm(int cpu)
8514  {
8515         if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
8516                 BUG();
8517         cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
8518         load_cr3(swapper_pg_dir);
8519  }
8520 +EXPORT_SYMBOL_GPL(leave_mm);
8521
8522  /*
8523   *
8524 --- a/arch/x86/kernel/smp_64-xen.c
8525 +++ b/arch/x86/kernel/smp_64-xen.c
8526 @@ -33,7 +33,7 @@
8527
8528  #ifndef CONFIG_XEN
8529  /*
8530 - *     Smarter SMP flushing macros.
8531 + *     Smarter SMP flushing macros.
8532   *             c/o Linus Torvalds.
8533   *
8534   *     These mean you can really definitely utterly forget about
8535 @@ -41,15 +41,15 @@
8536   *
8537   *     Optimizations Manfred Spraul <manfred@colorfullife.com>
8538   *
8539 - *     More scalable flush, from Andi Kleen
8540 + *     More scalable flush, from Andi Kleen
8541   *
8542 - *     To avoid global state use 8 different call vectors.
8543 - *     Each CPU uses a specific vector to trigger flushes on other
8544 - *     CPUs. Depending on the received vector the target CPUs look into
8545 + *     To avoid global state use 8 different call vectors.
8546 + *     Each CPU uses a specific vector to trigger flushes on other
8547 + *     CPUs. Depending on the received vector the target CPUs look into
8548   *     the right per cpu variable for the flush data.
8549   *
8550 - *     With more than 8 CPUs they are hashed to the 8 available
8551 - *     vectors. The limited global vector space forces us to this right now.
8552 + *     With more than 8 CPUs they are hashed to the 8 available
8553 + *     vectors. The limited global vector space forces us to this right now.
8554   *     In future when interrupts are split into per CPU domains this could be
8555   *     fixed, at the cost of triggering multiple IPIs in some cases.
8556   */
8557 @@ -59,7 +59,6 @@ union smp_flush_state {
8558                 cpumask_t flush_cpumask;
8559                 struct mm_struct *flush_mm;
8560                 unsigned long flush_va;
8561 -#define FLUSH_ALL      -1ULL
8562                 spinlock_t tlbstate_lock;
8563         };
8564         char pad[SMP_CACHE_BYTES];
8565 @@ -71,16 +70,17 @@ union smp_flush_state {
8566  static DEFINE_PER_CPU(union smp_flush_state, flush_state);
8567
8568  /*
8569 - * We cannot call mmdrop() because we are in interrupt context,
8570 + * We cannot call mmdrop() because we are in interrupt context,
8571   * instead update mm->cpu_vm_mask.
8572   */
8573 -static inline void leave_mm(unsigned long cpu)
8574 +void leave_mm(int cpu)
8575  {
8576         if (read_pda(mmu_state) == TLBSTATE_OK)
8577                 BUG();
8578         cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
8579         load_cr3(swapper_pg_dir);
8580  }
8581 +EXPORT_SYMBOL_GPL(leave_mm);
8582
8583  /*
8584   *
8585 @@ -89,25 +89,25 @@ static inline void leave_mm(unsigned lon
8586   * 1) switch_mm() either 1a) or 1b)
8587   * 1a) thread switch to a different mm
8588   * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
8589 - *     Stop ipi delivery for the old mm. This is not synchronized with
8590 - *     the other cpus, but smp_invalidate_interrupt ignore flush ipis
8591 - *     for the wrong mm, and in the worst case we perform a superfluous
8592 - *     tlb flush.
8593 + *     Stop ipi delivery for the old mm. This is not synchronized with
8594 + *     the other cpus, but smp_invalidate_interrupt ignore flush ipis
8595 + *     for the wrong mm, and in the worst case we perform a superfluous
8596 + *     tlb flush.
8597   * 1a2) set cpu mmu_state to TLBSTATE_OK
8598 - *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
8599 + *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
8600   *     was in lazy tlb mode.
8601   * 1a3) update cpu active_mm
8602 - *     Now cpu0 accepts tlb flushes for the new mm.
8603 + *     Now cpu0 accepts tlb flushes for the new mm.
8604   * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
8605 - *     Now the other cpus will send tlb flush ipis.
8606 + *     Now the other cpus will send tlb flush ipis.
8607   * 1a4) change cr3.
8608   * 1b) thread switch without mm change
8609   *     cpu active_mm is correct, cpu0 already handles
8610   *     flush ipis.
8611   * 1b1) set cpu mmu_state to TLBSTATE_OK
8612   * 1b2) test_and_set the cpu bit in cpu_vm_mask.
8613 - *     Atomically set the bit [other cpus will start sending flush ipis],
8614 - *     and test the bit.
8615 + *     Atomically set the bit [other cpus will start sending flush ipis],
8616 + *     and test the bit.
8617   * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
8618   * 2) switch %%esp, ie current
8619   *
8620 @@ -141,12 +141,12 @@ asmlinkage void smp_invalidate_interrupt
8621          * orig_rax contains the negated interrupt vector.
8622          * Use that to determine where the sender put the data.
8623          */
8624 -       sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
8625 +       sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
8626         f = &per_cpu(flush_state, sender);
8627
8628         if (!cpu_isset(cpu, f->flush_cpumask))
8629                 goto out;
8630 -               /*
8631 +               /*
8632                  * This was a BUG() but until someone can quote me the
8633                  * line from the intel manual that guarantees an IPI to
8634                  * multiple CPUs is retried _only_ on the erroring CPUs
8635 @@ -154,10 +154,10 @@ asmlinkage void smp_invalidate_interrupt
8636                  *
8637                  * BUG();
8638                  */
8639 -
8640 +
8641         if (f->flush_mm == read_pda(active_mm)) {
8642                 if (read_pda(mmu_state) == TLBSTATE_OK) {
8643 -                       if (f->flush_va == FLUSH_ALL)
8644 +                       if (f->flush_va == TLB_FLUSH_ALL)
8645                                 local_flush_tlb();
8646                         else
8647                                 __flush_tlb_one(f->flush_va);
8648 @@ -170,19 +170,22 @@ out:
8649         add_pda(irq_tlb_count, 1);
8650  }
8651
8652 -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
8653 -                                               unsigned long va)
8654 +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
8655 +                            unsigned long va)
8656  {
8657         int sender;
8658         union smp_flush_state *f;
8659 +       cpumask_t cpumask = *cpumaskp;
8660
8661         /* Caller has disabled preemption */
8662         sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
8663         f = &per_cpu(flush_state, sender);
8664
8665 -       /* Could avoid this lock when
8666 -          num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
8667 -          probably not worth checking this for a cache-hot lock. */
8668 +       /*
8669 +        * Could avoid this lock when
8670 +        * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
8671 +        * probably not worth checking this for a cache-hot lock.
8672 +        */
8673         spin_lock(&f->tlbstate_lock);
8674
8675         f->flush_mm = mm;
8676 @@ -206,14 +209,14 @@ static void flush_tlb_others(cpumask_t c
8677  int __cpuinit init_smp_flush(void)
8678  {
8679         int i;
8680 +
8681         for_each_cpu_mask(i, cpu_possible_map) {
8682                 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
8683         }
8684         return 0;
8685  }
8686 -
8687  core_initcall(init_smp_flush);
8688 -
8689 +
8690  void flush_tlb_current_task(void)
8691  {
8692         struct mm_struct *mm = current->mm;
8693 @@ -225,10 +228,9 @@ void flush_tlb_current_task(void)
8694
8695         local_flush_tlb();
8696         if (!cpus_empty(cpu_mask))
8697 -               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
8698 +               flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
8699         preempt_enable();
8700  }
8701 -EXPORT_SYMBOL(flush_tlb_current_task);
8702
8703  void flush_tlb_mm (struct mm_struct * mm)
8704  {
8705 @@ -245,11 +247,10 @@ void flush_tlb_mm (struct mm_struct * mm
8706                         leave_mm(smp_processor_id());
8707         }
8708         if (!cpus_empty(cpu_mask))
8709 -               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
8710 +               flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
8711
8712         preempt_enable();
8713  }
8714 -EXPORT_SYMBOL(flush_tlb_mm);
8715
8716  void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
8717  {
8718 @@ -263,8 +264,8 @@ void flush_tlb_page(struct vm_area_struc
8719         if (current->active_mm == mm) {
8720                 if(current->mm)
8721                         __flush_tlb_one(va);
8722 -                else
8723 -                       leave_mm(smp_processor_id());
8724 +               else
8725 +                       leave_mm(smp_processor_id());
8726         }
8727
8728         if (!cpus_empty(cpu_mask))
8729 @@ -272,7 +273,6 @@ void flush_tlb_page(struct vm_area_struc
8730
8731         preempt_enable();
8732  }
8733 -EXPORT_SYMBOL(flush_tlb_page);
8734
8735  static void do_flush_tlb_all(void* info)
8736  {
8737 @@ -330,11 +330,9 @@ void unlock_ipi_call_lock(void)
8738   * this function sends a 'generic call function' IPI to all other CPU
8739   * of the system defined in the mask.
8740   */
8741 -
8742 -static int
8743 -__smp_call_function_mask(cpumask_t mask,
8744 -                        void (*func)(void *), void *info,
8745 -                        int wait)
8746 +static int __smp_call_function_mask(cpumask_t mask,
8747 +                                   void (*func)(void *), void *info,
8748 +                                   int wait)
8749  {
8750         struct call_data_struct data;
8751         cpumask_t allbutself;
8752 @@ -422,11 +420,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
8753   */
8754
8755  int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
8756 -       int nonatomic, int wait)
8757 +                             int nonatomic, int wait)
8758  {
8759         /* prevent preemption and reschedule on another processor */
8760 -       int ret;
8761 -       int me = get_cpu();
8762 +       int ret, me = get_cpu();
8763
8764         /* Can deadlock when called with interrupts disabled */
8765         WARN_ON(irqs_disabled());
8766 @@ -476,9 +473,9 @@ static void stop_this_cpu(void *dummy)
8767          */
8768         cpu_clear(smp_processor_id(), cpu_online_map);
8769         disable_all_local_evtchn();
8770 -       for (;;)
8771 +       for (;;)
8772                 halt();
8773 -}
8774 +}
8775
8776  void smp_send_stop(void)
8777  {
8778 --- a/arch/x86/kernel/time_32-xen.c
8779 +++ b/arch/x86/kernel/time_32-xen.c
8780 @@ -28,21 +28,9 @@
8781   *     serialize accesses to xtime/lost_ticks).
8782   */
8783
8784 -#include <linux/errno.h>
8785 -#include <linux/sched.h>
8786 -#include <linux/kernel.h>
8787 -#include <linux/param.h>
8788 -#include <linux/string.h>
8789 -#include <linux/mm.h>
8790 +#include <linux/init.h>
8791  #include <linux/interrupt.h>
8792  #include <linux/time.h>
8793 -#include <linux/delay.h>
8794 -#include <linux/init.h>
8795 -#include <linux/smp.h>
8796 -#include <linux/module.h>
8797 -#include <linux/sysdev.h>
8798 -#include <linux/bcd.h>
8799 -#include <linux/efi.h>
8800  #include <linux/mca.h>
8801  #include <linux/sysctl.h>
8802  #include <linux/percpu.h>
8803 @@ -50,26 +38,10 @@
8804  #include <linux/posix-timers.h>
8805  #include <linux/cpufreq.h>
8806  #include <linux/clocksource.h>
8807 +#include <linux/sysdev.h>
8808
8809 -#include <asm/io.h>
8810 -#include <asm/smp.h>
8811 -#include <asm/irq.h>
8812 -#include <asm/msr.h>
8813  #include <asm/delay.h>
8814 -#include <asm/mpspec.h>
8815 -#include <asm/uaccess.h>
8816 -#include <asm/processor.h>
8817 -#include <asm/timer.h>
8818  #include <asm/time.h>
8819 -#include <asm/sections.h>
8820 -
8821 -#include "mach_time.h"
8822 -
8823 -#include <linux/timex.h>
8824 -
8825 -#include <asm/hpet.h>
8826 -
8827 -#include <asm/arch_hooks.h>
8828
8829  #include <xen/evtchn.h>
8830  #include <xen/sysctl.h>
8831 @@ -89,9 +61,6 @@ volatile unsigned long __jiffies __secti
8832  unsigned int cpu_khz;  /* Detected as we calibrate the TSC */
8833  EXPORT_SYMBOL(cpu_khz);
8834
8835 -DEFINE_SPINLOCK(rtc_lock);
8836 -EXPORT_SYMBOL(rtc_lock);
8837 -
8838  /* These are peridically updated in shared_info, and then copied here. */
8839  struct shadow_time_info {
8840         u64 tsc_timestamp;     /* TSC at last update of time vals.  */
8841 @@ -154,6 +123,11 @@ static int __init __independent_wallcloc
8842  }
8843  __setup("independent_wallclock", __independent_wallclock);
8844
8845 +int xen_independent_wallclock(void)
8846 +{
8847 +       return independent_wallclock;
8848 +}
8849 +
8850  /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
8851  static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
8852  static int __init __permitted_clock_jitter(char *str)
8853 @@ -223,7 +197,6 @@ static inline u64 get64(volatile u64 *pt
8854         return cmpxchg64(ptr, 0, 0);
8855  #else
8856         return *ptr;
8857 -#define cmpxchg64 cmpxchg
8858  #endif
8859  }
8860
8861 @@ -233,7 +206,6 @@ static inline u64 get64_local(volatile u
8862         return cmpxchg64_local(ptr, 0, 0);
8863  #else
8864         return *ptr;
8865 -#define cmpxchg64_local cmpxchg_local
8866  #endif
8867  }
8868
8869 @@ -341,35 +313,6 @@ static inline int time_values_up_to_date
8870         return (dst->version == src->version);
8871  }
8872
8873 -/*
8874 - * This is a special lock that is owned by the CPU and holds the index
8875 - * register we are working with.  It is required for NMI access to the
8876 - * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
8877 - */
8878 -volatile unsigned long cmos_lock = 0;
8879 -EXPORT_SYMBOL(cmos_lock);
8880 -
8881 -/* Routines for accessing the CMOS RAM/RTC. */
8882 -unsigned char rtc_cmos_read(unsigned char addr)
8883 -{
8884 -       unsigned char val;
8885 -       lock_cmos_prefix(addr);
8886 -       outb_p(addr, RTC_PORT(0));
8887 -       val = inb_p(RTC_PORT(1));
8888 -       lock_cmos_suffix(addr);
8889 -       return val;
8890 -}
8891 -EXPORT_SYMBOL(rtc_cmos_read);
8892 -
8893 -void rtc_cmos_write(unsigned char val, unsigned char addr)
8894 -{
8895 -       lock_cmos_prefix(addr);
8896 -       outb_p(addr, RTC_PORT(0));
8897 -       outb_p(val, RTC_PORT(1));
8898 -       lock_cmos_suffix(addr);
8899 -}
8900 -EXPORT_SYMBOL(rtc_cmos_write);
8901 -
8902  static void sync_xen_wallclock(unsigned long dummy);
8903  static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
8904  static void sync_xen_wallclock(unsigned long dummy)
8905 @@ -378,7 +321,8 @@ static void sync_xen_wallclock(unsigned
8906         s64 nsec;
8907         struct xen_platform_op op;
8908
8909 -       if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
8910 +       BUG_ON(!is_initial_xendomain());
8911 +       if (!ntp_synced() || independent_wallclock)
8912                 return;
8913
8914         write_seqlock_irq(&xtime_lock);
8915 @@ -401,23 +345,6 @@ static void sync_xen_wallclock(unsigned
8916         mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
8917  }
8918
8919 -static int set_rtc_mmss(unsigned long nowtime)
8920 -{
8921 -       int retval;
8922 -       unsigned long flags;
8923 -
8924 -       if (independent_wallclock || !is_initial_xendomain())
8925 -               return 0;
8926 -
8927 -       /* gets recalled with irq locally disabled */
8928 -       /* XXX - does irqsave resolve this? -johnstul */
8929 -       spin_lock_irqsave(&rtc_lock, flags);
8930 -       retval = set_wallclock(nowtime);
8931 -       spin_unlock_irqrestore(&rtc_lock, flags);
8932 -
8933 -       return retval;
8934 -}
8935 -
8936  static unsigned long long local_clock(void)
8937  {
8938         unsigned int cpu = get_cpu();
8939 @@ -500,28 +427,24 @@ unsigned long profile_pc(struct pt_regs
8940
8941  #if defined(CONFIG_SMP) || defined(__x86_64__)
8942  # ifdef __i386__
8943 -       if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs)
8944 +       if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs)
8945  # else
8946         if (!user_mode(regs)
8947  # endif
8948             && in_lock_functions(pc)) {
8949  # ifdef CONFIG_FRAME_POINTER
8950 -#  ifdef __i386__
8951 -               return ((unsigned long *)regs->ebp)[1];
8952 -#  else
8953 -               return ((unsigned long *)regs->rbp)[1];
8954 -#  endif
8955 +               return ((unsigned long *)regs->bp)[1];
8956  # else
8957  #  ifdef __i386__
8958 -               unsigned long *sp = (unsigned long *)&regs->esp;
8959 +               unsigned long *sp = (unsigned long *)&regs->sp;
8960  #  else
8961 -               unsigned long *sp = (unsigned long *)regs->rsp;
8962 +               unsigned long *sp = (unsigned long *)regs->sp;
8963  #  endif
8964
8965                 /* Return address is either directly at stack pointer
8966 -                  or above a saved eflags. Eflags has bits 22-31 zero,
8967 +                  or above a saved flags. Eflags has bits 22-31 zero,
8968                    kernel addresses don't. */
8969 -               if (sp[0] >> 22)
8970 +               if (sp[0] >> 22)
8971                         return sp[0];
8972                 if (sp[1] >> 22)
8973                         return sp[1];
8974 @@ -750,25 +673,32 @@ static void init_missing_ticks_accountin
8975                 runstate->time[RUNSTATE_offline];
8976  }
8977
8978 -/* not static: needed by APM */
8979 -unsigned long read_persistent_clock(void)
8980 +unsigned long xen_read_persistent_clock(void)
8981  {
8982 -       unsigned long retval;
8983 -       unsigned long flags;
8984 -
8985 -       spin_lock_irqsave(&rtc_lock, flags);
8986 +       const shared_info_t *s = HYPERVISOR_shared_info;
8987 +       u32 version, sec, nsec;
8988 +       u64 delta;
8989
8990 -       retval = get_wallclock();
8991 +       do {
8992 +               version = s->wc_version;
8993 +               rmb();
8994 +               sec     = s->wc_sec;
8995 +               nsec    = s->wc_nsec;
8996 +               rmb();
8997 +       } while ((s->wc_version & 1) | (version ^ s->wc_version));
8998
8999 -       spin_unlock_irqrestore(&rtc_lock, flags);
9000 +       delta = local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
9001 +       do_div(delta, NSEC_PER_SEC);
9002
9003 -       return retval;
9004 +       return delta;
9005  }
9006
9007 -int update_persistent_clock(struct timespec now)
9008 +int xen_update_persistent_clock(void)
9009  {
9010 +       if (!is_initial_xendomain())
9011 +               return -1;
9012         mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
9013 -       return set_rtc_mmss(now.tv_sec);
9014 +       return 0;
9015  }
9016
9017  extern void (*late_time_init)(void);
9018 --- a/arch/x86/kernel/traps_32-xen.c
9019 +++ b/arch/x86/kernel/traps_32-xen.c
9020 @@ -79,7 +79,8 @@ char ignore_fpu_irq = 0;
9021   * F0 0F bug workaround.. We have a special link segment
9022   * for this.
9023   */
9024 -struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
9025 +gate_desc idt_table[256]
9026 +       __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
9027  #endif
9028
9029  asmlinkage void divide_error(void);
9030 @@ -109,6 +110,34 @@ asmlinkage void machine_check(void);
9031  int kstack_depth_to_print = 24;
9032  static unsigned int code_bytes = 64;
9033
9034 +void printk_address(unsigned long address, int reliable)
9035 +{
9036 +#ifdef CONFIG_KALLSYMS
9037 +       unsigned long offset = 0, symsize;
9038 +       const char *symname;
9039 +       char *modname;
9040 +       char *delim = ":";
9041 +       char namebuf[128];
9042 +       char reliab[4] = "";
9043 +
9044 +       symname = kallsyms_lookup(address, &symsize, &offset,
9045 +                                       &modname, namebuf);
9046 +       if (!symname) {
9047 +               printk(" [<%08lx>]\n", address);
9048 +               return;
9049 +       }
9050 +       if (!reliable)
9051 +               strcpy(reliab, "? ");
9052 +
9053 +       if (!modname)
9054 +               modname = delim = "";
9055 +       printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
9056 +               address, reliab, delim, modname, delim, symname, offset, symsize);
9057 +#else
9058 +       printk(" [<%08lx>]\n", address);
9059 +#endif
9060 +}
9061 +
9062  static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
9063  {
9064         return  p > (void *)tinfo &&
9065 @@ -122,48 +151,35 @@ struct stack_frame {
9066  };
9067
9068  static inline unsigned long print_context_stack(struct thread_info *tinfo,
9069 -                               unsigned long *stack, unsigned long ebp,
9070 +                               unsigned long *stack, unsigned long bp,
9071                                 const struct stacktrace_ops *ops, void *data)
9072  {
9073 -#ifdef CONFIG_FRAME_POINTER
9074 -       struct stack_frame *frame = (struct stack_frame *)ebp;
9075 -       while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
9076 -               struct stack_frame *next;
9077 -               unsigned long addr;
9078 +       struct stack_frame *frame = (struct stack_frame *)bp;
9079
9080 -               addr = frame->return_address;
9081 -               ops->address(data, addr);
9082 -               /*
9083 -                * break out of recursive entries (such as
9084 -                * end_of_stack_stop_unwind_function). Also,
9085 -                * we can never allow a frame pointer to
9086 -                * move downwards!
9087 -                */
9088 -               next = frame->next_frame;
9089 -               if (next <= frame)
9090 -                       break;
9091 -               frame = next;
9092 -       }
9093 -#else
9094         while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
9095                 unsigned long addr;
9096
9097 -               addr = *stack++;
9098 -               if (__kernel_text_address(addr))
9099 -                       ops->address(data, addr);
9100 +               addr = *stack;
9101 +               if (__kernel_text_address(addr)) {
9102 +                       if ((unsigned long) stack == bp + 4) {
9103 +                               ops->address(data, addr, 1);
9104 +                               frame = frame->next_frame;
9105 +                               bp = (unsigned long) frame;
9106 +                       } else {
9107 +                               ops->address(data, addr, bp == 0);
9108 +                       }
9109 +               }
9110 +               stack++;
9111         }
9112 -#endif
9113 -       return ebp;
9114 +       return bp;
9115  }
9116
9117  #define MSG(msg) ops->warning(data, msg)
9118
9119  void dump_trace(struct task_struct *task, struct pt_regs *regs,
9120 -               unsigned long *stack,
9121 +               unsigned long *stack, unsigned long bp,
9122                 const struct stacktrace_ops *ops, void *data)
9123  {
9124 -       unsigned long ebp = 0;
9125 -
9126         if (!task)
9127                 task = current;
9128
9129 @@ -171,17 +187,17 @@ void dump_trace(struct task_struct *task
9130                 unsigned long dummy;
9131                 stack = &dummy;
9132                 if (task != current)
9133 -                       stack = (unsigned long *)task->thread.esp;
9134 +                       stack = (unsigned long *)task->thread.sp;
9135         }
9136
9137  #ifdef CONFIG_FRAME_POINTER
9138 -       if (!ebp) {
9139 +       if (!bp) {
9140                 if (task == current) {
9141 -                       /* Grab ebp right from our regs */
9142 -                       asm ("movl %%ebp, %0" : "=r" (ebp) : );
9143 +                       /* Grab bp right from our regs */
9144 +                       asm ("movl %%ebp, %0" : "=r" (bp) : );
9145                 } else {
9146 -                       /* ebp is the last reg pushed by switch_to */
9147 -                       ebp = *(unsigned long *) task->thread.esp;
9148 +                       /* bp is the last reg pushed by switch_to */
9149 +                       bp = *(unsigned long *) task->thread.sp;
9150                 }
9151         }
9152  #endif
9153 @@ -190,7 +206,7 @@ void dump_trace(struct task_struct *task
9154                 struct thread_info *context;
9155                 context = (struct thread_info *)
9156                         ((unsigned long)stack & (~(THREAD_SIZE - 1)));
9157 -               ebp = print_context_stack(context, stack, ebp, ops, data);
9158 +               bp = print_context_stack(context, stack, bp, ops, data);
9159                 /* Should be after the line below, but somewhere
9160                    in early boot context comes out corrupted and we
9161                    can't reference it -AK */
9162 @@ -225,9 +241,11 @@ static int print_trace_stack(void *data,
9163  /*
9164   * Print one address/symbol entries per line.
9165   */
9166 -static void print_trace_address(void *data, unsigned long addr)
9167 +static void print_trace_address(void *data, unsigned long addr, int reliable)
9168  {
9169         printk("%s [<%08lx>] ", (char *)data, addr);
9170 +       if (!reliable)
9171 +               printk("? ");
9172         print_symbol("%s\n", addr);
9173         touch_nmi_watchdog();
9174  }
9175 @@ -241,32 +259,32 @@ static const struct stacktrace_ops print
9176
9177  static void
9178  show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
9179 -                  unsigned long * stack, char *log_lvl)
9180 +               unsigned long *stack, unsigned long bp, char *log_lvl)
9181  {
9182 -       dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
9183 +       dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
9184         printk("%s =======================\n", log_lvl);
9185  }
9186
9187  void show_trace(struct task_struct *task, struct pt_regs *regs,
9188 -               unsigned long * stack)
9189 +               unsigned long *stack, unsigned long bp)
9190  {
9191 -       show_trace_log_lvl(task, regs, stack, "");
9192 +       show_trace_log_lvl(task, regs, stack, bp, "");
9193  }
9194
9195  static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
9196 -                              unsigned long *esp, char *log_lvl)
9197 +                      unsigned long *sp, unsigned long bp, char *log_lvl)
9198  {
9199         unsigned long *stack;
9200         int i;
9201
9202 -       if (esp == NULL) {
9203 +       if (sp == NULL) {
9204                 if (task)
9205 -                       esp = (unsigned long*)task->thread.esp;
9206 +                       sp = (unsigned long*)task->thread.sp;
9207                 else
9208 -                       esp = (unsigned long *)&esp;
9209 +                       sp = (unsigned long *)&sp;
9210         }
9211
9212 -       stack = esp;
9213 +       stack = sp;
9214         for(i = 0; i < kstack_depth_to_print; i++) {
9215                 if (kstack_end(stack))
9216                         break;
9217 @@ -275,13 +293,13 @@ static void show_stack_log_lvl(struct ta
9218                 printk("%08lx ", *stack++);
9219         }
9220         printk("\n%sCall Trace:\n", log_lvl);
9221 -       show_trace_log_lvl(task, regs, esp, log_lvl);
9222 +       show_trace_log_lvl(task, regs, sp, bp, log_lvl);
9223  }
9224
9225 -void show_stack(struct task_struct *task, unsigned long *esp)
9226 +void show_stack(struct task_struct *task, unsigned long *sp)
9227  {
9228         printk("       ");
9229 -       show_stack_log_lvl(task, NULL, esp, "");
9230 +       show_stack_log_lvl(task, NULL, sp, 0, "");
9231  }
9232
9233  /*
9234 @@ -290,13 +308,19 @@ void show_stack(struct task_struct *task
9235  void dump_stack(void)
9236  {
9237         unsigned long stack;
9238 +       unsigned long bp = 0;
9239 +
9240 +#ifdef CONFIG_FRAME_POINTER
9241 +       if (!bp)
9242 +               asm("movl %%ebp, %0" : "=r" (bp):);
9243 +#endif
9244
9245         printk("Pid: %d, comm: %.20s %s %s %.*s\n",
9246                 current->pid, current->comm, print_tainted(),
9247                 init_utsname()->release,
9248                 (int)strcspn(init_utsname()->version, " "),
9249                 init_utsname()->version);
9250 -       show_trace(current, NULL, &stack);
9251 +       show_trace(current, NULL, &stack, bp);
9252  }
9253
9254  EXPORT_SYMBOL(dump_stack);
9255 @@ -315,30 +339,30 @@ void show_registers(struct pt_regs *regs
9256          * time of the fault..
9257          */
9258         if (!user_mode_vm(regs)) {
9259 -               u8 *eip;
9260 +               u8 *ip;
9261                 unsigned int code_prologue = code_bytes * 43 / 64;
9262                 unsigned int code_len = code_bytes;
9263                 unsigned char c;
9264
9265                 printk("\n" KERN_EMERG "Stack: ");
9266 -               show_stack_log_lvl(NULL, regs, &regs->esp, KERN_EMERG);
9267 +               show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
9268
9269                 printk(KERN_EMERG "Code: ");
9270
9271 -               eip = (u8 *)regs->eip - code_prologue;
9272 -               if (eip < (u8 *)PAGE_OFFSET ||
9273 -                       probe_kernel_address(eip, c)) {
9274 +               ip = (u8 *)regs->ip - code_prologue;
9275 +               if (ip < (u8 *)PAGE_OFFSET ||
9276 +                       probe_kernel_address(ip, c)) {
9277                         /* try starting at EIP */
9278 -                       eip = (u8 *)regs->eip;
9279 +                       ip = (u8 *)regs->ip;
9280                         code_len = code_len - code_prologue + 1;
9281                 }
9282 -               for (i = 0; i < code_len; i++, eip++) {
9283 -                       if (eip < (u8 *)PAGE_OFFSET ||
9284 -                               probe_kernel_address(eip, c)) {
9285 +               for (i = 0; i < code_len; i++, ip++) {
9286 +                       if (ip < (u8 *)PAGE_OFFSET ||
9287 +                               probe_kernel_address(ip, c)) {
9288                                 printk(" Bad EIP value.");
9289                                 break;
9290                         }
9291 -                       if (eip == (u8 *)regs->eip)
9292 +                       if (ip == (u8 *)regs->ip)
9293                                 printk("<%02x> ", c);
9294                         else
9295                                 printk("%02x ", c);
9296 @@ -347,18 +371,57 @@ void show_registers(struct pt_regs *regs
9297         printk("\n");
9298  }
9299
9300 -int is_valid_bugaddr(unsigned long eip)
9301 +int is_valid_bugaddr(unsigned long ip)
9302  {
9303         unsigned short ud2;
9304
9305 -       if (eip < PAGE_OFFSET)
9306 +       if (ip < PAGE_OFFSET)
9307                 return 0;
9308 -       if (probe_kernel_address((unsigned short *)eip, ud2))
9309 +       if (probe_kernel_address((unsigned short *)ip, ud2))
9310                 return 0;
9311
9312         return ud2 == 0x0b0f;
9313  }
9314
9315 +static int die_counter;
9316 +
9317 +int __kprobes __die(const char * str, struct pt_regs * regs, long err)
9318 +{
9319 +       unsigned long sp;
9320 +       unsigned short ss;
9321 +
9322 +       printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
9323 +#ifdef CONFIG_PREEMPT
9324 +       printk("PREEMPT ");
9325 +#endif
9326 +#ifdef CONFIG_SMP
9327 +       printk("SMP ");
9328 +#endif
9329 +#ifdef CONFIG_DEBUG_PAGEALLOC
9330 +       printk("DEBUG_PAGEALLOC");
9331 +#endif
9332 +       printk("\n");
9333 +
9334 +       if (notify_die(DIE_OOPS, str, regs, err,
9335 +                               current->thread.trap_no, SIGSEGV) !=
9336 +                       NOTIFY_STOP) {
9337 +               show_registers(regs);
9338 +               /* Executive summary in case the oops scrolled away */
9339 +               sp = (unsigned long) (&regs->sp);
9340 +               savesegment(ss, ss);
9341 +               if (user_mode(regs)) {
9342 +                       sp = regs->sp;
9343 +                       ss = regs->ss & 0xffff;
9344 +               }
9345 +               printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
9346 +               print_symbol("%s", regs->ip);
9347 +               printk(" SS:ESP %04x:%08lx\n", ss, sp);
9348 +               return 0;
9349 +       } else {
9350 +               return 1;
9351 +       }
9352 +}
9353 +
9354  /*
9355   * This is gone through when something in the kernel has done something bad and
9356   * is about to be terminated.
9357 @@ -374,7 +437,6 @@ void die(const char * str, struct pt_reg
9358                 .lock_owner =           -1,
9359                 .lock_owner_depth =     0
9360         };
9361 -       static int die_counter;
9362         unsigned long flags;
9363
9364         oops_enter();
9365 @@ -390,43 +452,13 @@ void die(const char * str, struct pt_reg
9366                 raw_local_irq_save(flags);
9367
9368         if (++die.lock_owner_depth < 3) {
9369 -               unsigned long esp;
9370 -               unsigned short ss;
9371 -
9372 -               report_bug(regs->eip, regs);
9373 -
9374 -               printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff,
9375 -                      ++die_counter);
9376 -#ifdef CONFIG_PREEMPT
9377 -               printk("PREEMPT ");
9378 -#endif
9379 -#ifdef CONFIG_SMP
9380 -               printk("SMP ");
9381 -#endif
9382 -#ifdef CONFIG_DEBUG_PAGEALLOC
9383 -               printk("DEBUG_PAGEALLOC");
9384 -#endif
9385 -               printk("\n");
9386 +               report_bug(regs->ip, regs);
9387
9388 -               if (notify_die(DIE_OOPS, str, regs, err,
9389 -                                       current->thread.trap_no, SIGSEGV) !=
9390 -                               NOTIFY_STOP) {
9391 -                       show_registers(regs);
9392 -                       /* Executive summary in case the oops scrolled away */
9393 -                       esp = (unsigned long) (&regs->esp);
9394 -                       savesegment(ss, ss);
9395 -                       if (user_mode(regs)) {
9396 -                               esp = regs->esp;
9397 -                               ss = regs->xss & 0xffff;
9398 -                       }
9399 -                       printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
9400 -                       print_symbol("%s", regs->eip);
9401 -                       printk(" SS:ESP %04x:%08lx\n", ss, esp);
9402 -               }
9403 -               else
9404 +               if (__die(str, regs, err))
9405                         regs = NULL;
9406 -       } else
9407 +       } else {
9408                 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
9409 +       }
9410
9411         bust_spinlocks(0);
9412         die.lock_owner = -1;
9413 @@ -462,7 +494,7 @@ static void __kprobes do_trap(int trapnr
9414  {
9415         struct task_struct *tsk = current;
9416
9417 -       if (regs->eflags & VM_MASK) {
9418 +       if (regs->flags & VM_MASK) {
9419                 if (vm86)
9420                         goto vm86_trap;
9421                 goto trap_signal;
9422 @@ -508,7 +540,7 @@ static void __kprobes do_trap(int trapnr
9423  }
9424
9425  #define DO_ERROR(trapnr, signr, str, name) \
9426 -fastcall void do_##name(struct pt_regs * regs, long error_code) \
9427 +void do_##name(struct pt_regs * regs, long error_code) \
9428  { \
9429         if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
9430                                                 == NOTIFY_STOP) \
9431 @@ -517,7 +549,7 @@ fastcall void do_##name(struct pt_regs *
9432  }
9433
9434  #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
9435 -fastcall void do_##name(struct pt_regs * regs, long error_code) \
9436 +void do_##name(struct pt_regs * regs, long error_code) \
9437  { \
9438         siginfo_t info; \
9439         if (irq) \
9440 @@ -533,7 +565,7 @@ fastcall void do_##name(struct pt_regs *
9441  }
9442
9443  #define DO_VM86_ERROR(trapnr, signr, str, name) \
9444 -fastcall void do_##name(struct pt_regs * regs, long error_code) \
9445 +void do_##name(struct pt_regs * regs, long error_code) \
9446  { \
9447         if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
9448                                                 == NOTIFY_STOP) \
9449 @@ -542,7 +574,7 @@ fastcall void do_##name(struct pt_regs *
9450  }
9451
9452  #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
9453 -fastcall void do_##name(struct pt_regs * regs, long error_code) \
9454 +void do_##name(struct pt_regs * regs, long error_code) \
9455  { \
9456         siginfo_t info; \
9457         info.si_signo = signr; \
9458 @@ -556,13 +588,13 @@ fastcall void do_##name(struct pt_regs *
9459         do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
9460  }
9461
9462 -DO_VM86_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->eip)
9463 +DO_VM86_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->ip)
9464  #ifndef CONFIG_KPROBES
9465  DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
9466  #endif
9467  DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
9468  DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
9469 -DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0)
9470 +DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
9471  DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
9472  DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
9473  DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
9474 @@ -570,10 +602,10 @@ DO_ERROR(12, SIGBUS,  "stack segment", s
9475  DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
9476  DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
9477
9478 -fastcall void __kprobes do_general_protection(struct pt_regs * regs,
9479 +void __kprobes do_general_protection(struct pt_regs * regs,
9480                                               long error_code)
9481  {
9482 -       if (regs->eflags & VM_MASK)
9483 +       if (regs->flags & VM_MASK)
9484                 goto gp_in_vm86;
9485
9486         if (!user_mode(regs))
9487 @@ -582,11 +614,14 @@ fastcall void __kprobes do_general_prote
9488         current->thread.error_code = error_code;
9489         current->thread.trap_no = 13;
9490         if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
9491 -           printk_ratelimit())
9492 +           printk_ratelimit()) {
9493                 printk(KERN_INFO
9494 -                   "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
9495 +                   "%s[%d] general protection ip:%lx sp:%lx error:%lx",
9496                     current->comm, task_pid_nr(current),
9497 -                   regs->eip, regs->esp, error_code);
9498 +                   regs->ip, regs->sp, error_code);
9499 +               print_vma_addr(" in ", regs->ip);
9500 +               printk("\n");
9501 +       }
9502
9503         force_sig(SIGSEGV, current);
9504         return;
9505 @@ -675,8 +710,8 @@ void __kprobes die_nmi(struct pt_regs *r
9506         */
9507         bust_spinlocks(1);
9508         printk(KERN_EMERG "%s", msg);
9509 -       printk(" on CPU%d, eip %08lx, registers:\n",
9510 -               smp_processor_id(), regs->eip);
9511 +       printk(" on CPU%d, ip %08lx, registers:\n",
9512 +               smp_processor_id(), regs->ip);
9513         show_registers(regs);
9514         console_silent();
9515         spin_unlock(&nmi_print_lock);
9516 @@ -733,7 +768,7 @@ static __kprobes void default_do_nmi(str
9517
9518  static int ignore_nmis;
9519
9520 -fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
9521 +__kprobes void do_nmi(struct pt_regs * regs, long error_code)
9522  {
9523         int cpu;
9524
9525 @@ -762,7 +797,7 @@ void restart_nmi(void)
9526  }
9527
9528  #ifdef CONFIG_KPROBES
9529 -fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
9530 +void __kprobes do_int3(struct pt_regs *regs, long error_code)
9531  {
9532         trace_hardirqs_fixup();
9533
9534 @@ -798,7 +833,7 @@ fastcall void __kprobes do_int3(struct p
9535   * find every occurrence of the TF bit that could be saved away even
9536   * by user code)
9537   */
9538 -fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
9539 +void __kprobes do_debug(struct pt_regs * regs, long error_code)
9540  {
9541         unsigned int condition;
9542         struct task_struct *tsk = current;
9543 @@ -807,24 +842,30 @@ fastcall void __kprobes do_debug(struct
9544
9545         get_debugreg(condition, 6);
9546
9547 +       /*
9548 +        * The processor cleared BTF, so don't mark that we need it set.
9549 +        */
9550 +       clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
9551 +       tsk->thread.debugctlmsr = 0;
9552 +
9553         if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
9554                                         SIGTRAP) == NOTIFY_STOP)
9555                 return;
9556         /* It's safe to allow irq's after DR6 has been saved */
9557 -       if (regs->eflags & X86_EFLAGS_IF)
9558 +       if (regs->flags & X86_EFLAGS_IF)
9559                 local_irq_enable();
9560
9561         /* Mask out spurious debug traps due to lazy DR7 setting */
9562         if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
9563 -               if (!tsk->thread.debugreg[7])
9564 +               if (!tsk->thread.debugreg7)
9565                         goto clear_dr7;
9566         }
9567
9568 -       if (regs->eflags & VM_MASK)
9569 +       if (regs->flags & VM_MASK)
9570                 goto debug_vm86;
9571
9572         /* Save debug status register where ptrace can see it */
9573 -       tsk->thread.debugreg[6] = condition;
9574 +       tsk->thread.debugreg6 = condition;
9575
9576         /*
9577          * Single-stepping through TF: make sure we ignore any events in
9578 @@ -856,7 +897,7 @@ debug_vm86:
9579
9580  clear_TF_reenable:
9581         set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
9582 -       regs->eflags &= ~TF_MASK;
9583 +       regs->flags &= ~TF_MASK;
9584         return;
9585  }
9586
9587 @@ -865,7 +906,7 @@ clear_TF_reenable:
9588   * the correct behaviour even in the presence of the asynchronous
9589   * IRQ13 behaviour
9590   */
9591 -void math_error(void __user *eip)
9592 +void math_error(void __user *ip)
9593  {
9594         struct task_struct * task;
9595         siginfo_t info;
9596 @@ -881,7 +922,7 @@ void math_error(void __user *eip)
9597         info.si_signo = SIGFPE;
9598         info.si_errno = 0;
9599         info.si_code = __SI_FAULT;
9600 -       info.si_addr = eip;
9601 +       info.si_addr = ip;
9602         /*
9603          * (~cwd & swd) will mask out exceptions that are not set to unmasked
9604          * status.  0x3f is the exception bits in these regs, 0x200 is the
9605 @@ -924,13 +965,13 @@ void math_error(void __user *eip)
9606         force_sig_info(SIGFPE, &info, task);
9607  }
9608
9609 -fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
9610 +void do_coprocessor_error(struct pt_regs * regs, long error_code)
9611  {
9612         ignore_fpu_irq = 1;
9613 -       math_error((void __user *)regs->eip);
9614 +       math_error((void __user *)regs->ip);
9615  }
9616
9617 -static void simd_math_error(void __user *eip)
9618 +static void simd_math_error(void __user *ip)
9619  {
9620         struct task_struct * task;
9621         siginfo_t info;
9622 @@ -946,7 +987,7 @@ static void simd_math_error(void __user
9623         info.si_signo = SIGFPE;
9624         info.si_errno = 0;
9625         info.si_code = __SI_FAULT;
9626 -       info.si_addr = eip;
9627 +       info.si_addr = ip;
9628         /*
9629          * The SIMD FPU exceptions are handled a little differently, as there
9630          * is only a single status/control register.  Thus, to determine which
9631 @@ -978,19 +1019,19 @@ static void simd_math_error(void __user
9632         force_sig_info(SIGFPE, &info, task);
9633  }
9634
9635 -fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
9636 +void do_simd_coprocessor_error(struct pt_regs * regs,
9637                                           long error_code)
9638  {
9639         if (cpu_has_xmm) {
9640                 /* Handle SIMD FPU exceptions on PIII+ processors. */
9641                 ignore_fpu_irq = 1;
9642 -               simd_math_error((void __user *)regs->eip);
9643 +               simd_math_error((void __user *)regs->ip);
9644         } else {
9645                 /*
9646                  * Handle strange cache flush from user space exception
9647                  * in all other cases.  This is undocumented behaviour.
9648                  */
9649 -               if (regs->eflags & VM_MASK) {
9650 +               if (regs->flags & VM_MASK) {
9651                         handle_vm86_fault((struct kernel_vm86_regs *)regs,
9652                                           error_code);
9653                         return;
9654 @@ -1003,7 +1044,7 @@ fastcall void do_simd_coprocessor_error(
9655  }
9656
9657  #ifndef CONFIG_XEN
9658 -fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
9659 +void do_spurious_interrupt_bug(struct pt_regs * regs,
9660                                           long error_code)
9661  {
9662  #if 0
9663 @@ -1012,7 +1053,7 @@ fastcall void do_spurious_interrupt_bug(
9664  #endif
9665  }
9666
9667 -fastcall unsigned long patch_espfix_desc(unsigned long uesp,
9668 +unsigned long patch_espfix_desc(unsigned long uesp,
9669                                           unsigned long kesp)
9670  {
9671         struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
9672 @@ -1072,7 +1113,7 @@ asmlinkage void math_emulate(long arg)
9673   * NB. All these are "trap gates" (i.e. events_mask isn't set) except
9674   * for those that specify <dpl>|4 in the second field.
9675   */
9676 -static trap_info_t __cpuinitdata trap_table[] = {
9677 +static const trap_info_t __cpuinitconst trap_table[] = {
9678         {  0, 0, __KERNEL_CS, (unsigned long)divide_error               },
9679         {  1, 0|4, __KERNEL_CS, (unsigned long)debug                    },
9680         {  3, 3|4, __KERNEL_CS, (unsigned long)int3                     },
9681 @@ -1105,17 +1146,12 @@ void __init trap_init(void)
9682         if (ret)
9683                 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
9684
9685 +       /*
9686 +        * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
9687 +        * Generate a build-time error if the alignment is wrong.
9688 +        */
9689 +       BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
9690         if (cpu_has_fxsr) {
9691 -               /*
9692 -                * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
9693 -                * Generates a compile-time "error: zero width for bit-field" if
9694 -                * the alignment is wrong.
9695 -                */
9696 -               struct fxsrAlignAssert {
9697 -                       int _:!(offsetof(struct task_struct,
9698 -                                       thread.i387.fxsave) & 15);
9699 -               };
9700 -
9701                 printk(KERN_INFO "Enabling fast FPU save and restore... ");
9702                 set_in_cr4(X86_CR4_OSFXSR);
9703                 printk("done.\n");
9704 --- a/arch/x86/kernel/traps_64-xen.c
9705 +++ b/arch/x86/kernel/traps_64-xen.c
9706 @@ -74,38 +74,41 @@ asmlinkage void alignment_check(void);
9707  asmlinkage void machine_check(void);
9708  asmlinkage void spurious_interrupt_bug(void);
9709
9710 +static unsigned int code_bytes = 64;
9711 +
9712  static inline void conditional_sti(struct pt_regs *regs)
9713  {
9714 -       if (regs->eflags & X86_EFLAGS_IF)
9715 +       if (regs->flags & X86_EFLAGS_IF)
9716                 local_irq_enable();
9717  }
9718
9719  static inline void preempt_conditional_sti(struct pt_regs *regs)
9720  {
9721 -       preempt_disable();
9722 -       if (regs->eflags & X86_EFLAGS_IF)
9723 +       inc_preempt_count();
9724 +       if (regs->flags & X86_EFLAGS_IF)
9725                 local_irq_enable();
9726  }
9727
9728  static inline void preempt_conditional_cli(struct pt_regs *regs)
9729  {
9730 -       if (regs->eflags & X86_EFLAGS_IF)
9731 +       if (regs->flags & X86_EFLAGS_IF)
9732                 local_irq_disable();
9733         /* Make sure to not schedule here because we could be running
9734            on an exception stack. */
9735 -       preempt_enable_no_resched();
9736 +       dec_preempt_count();
9737  }
9738
9739  int kstack_depth_to_print = 12;
9740
9741 -#ifdef CONFIG_KALLSYMS
9742 -void printk_address(unsigned long address)
9743 +void printk_address(unsigned long address, int reliable)
9744  {
9745 +#ifdef CONFIG_KALLSYMS
9746         unsigned long offset = 0, symsize;
9747         const char *symname;
9748         char *modname;
9749         char *delim = ":";
9750 -       char namebuf[128];
9751 +       char namebuf[KSYM_NAME_LEN];
9752 +       char reliab[4] = "";
9753
9754         symname = kallsyms_lookup(address, &symsize, &offset,
9755                                         &modname, namebuf);
9756 @@ -113,17 +116,17 @@ void printk_address(unsigned long addres
9757                 printk(" [<%016lx>]\n", address);
9758                 return;
9759         }
9760 +       if (!reliable)
9761 +               strcpy(reliab, "? ");
9762 +
9763         if (!modname)
9764 -               modname = delim = "";
9765 -       printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
9766 -               address, delim, modname, delim, symname, offset, symsize);
9767 -}
9768 +               modname = delim = "";
9769 +       printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
9770 +               address, reliab, delim, modname, delim, symname, offset, symsize);
9771  #else
9772 -void printk_address(unsigned long address)
9773 -{
9774         printk(" [<%016lx>]\n", address);
9775 -}
9776  #endif
9777 +}
9778
9779  static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
9780                                         unsigned *usedp, char **idp)
9781 @@ -210,14 +213,53 @@ static unsigned long *in_exception_stack
9782   * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
9783   */
9784
9785 -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
9786 +static inline int valid_stack_ptr(struct thread_info *tinfo,
9787 +                       void *p, unsigned int size, void *end)
9788  {
9789 -       void *t = (void *)tinfo;
9790 -        return p > t && p < t + THREAD_SIZE - 3;
9791 +       void *t = tinfo;
9792 +       if (end) {
9793 +               if (p < end && p >= (end-THREAD_SIZE))
9794 +                       return 1;
9795 +               else
9796 +                       return 0;
9797 +       }
9798 +       return p > t && p < t + THREAD_SIZE - size;
9799 +}
9800 +
9801 +/* The form of the top of the frame on the stack */
9802 +struct stack_frame {
9803 +       struct stack_frame *next_frame;
9804 +       unsigned long return_address;
9805 +};
9806 +
9807 +
9808 +static inline unsigned long print_context_stack(struct thread_info *tinfo,
9809 +                               unsigned long *stack, unsigned long bp,
9810 +                               const struct stacktrace_ops *ops, void *data,
9811 +                               unsigned long *end)
9812 +{
9813 +       struct stack_frame *frame = (struct stack_frame *)bp;
9814 +
9815 +       while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
9816 +               unsigned long addr;
9817 +
9818 +               addr = *stack;
9819 +               if (__kernel_text_address(addr)) {
9820 +                       if ((unsigned long) stack == bp + 8) {
9821 +                               ops->address(data, addr, 1);
9822 +                               frame = frame->next_frame;
9823 +                               bp = (unsigned long) frame;
9824 +                       } else {
9825 +                               ops->address(data, addr, bp == 0);
9826 +                       }
9827 +               }
9828 +               stack++;
9829 +       }
9830 +       return bp;
9831  }
9832
9833  void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
9834 -               unsigned long *stack,
9835 +               unsigned long *stack, unsigned long bp,
9836                 const struct stacktrace_ops *ops, void *data)
9837  {
9838         const unsigned cpu = get_cpu();
9839 @@ -227,36 +269,28 @@ void dump_trace(struct task_struct *tsk,
9840
9841         if (!tsk)
9842                 tsk = current;
9843 +       tinfo = task_thread_info(tsk);
9844
9845         if (!stack) {
9846                 unsigned long dummy;
9847                 stack = &dummy;
9848                 if (tsk && tsk != current)
9849 -                       stack = (unsigned long *)tsk->thread.rsp;
9850 +                       stack = (unsigned long *)tsk->thread.sp;
9851         }
9852
9853 -       /*
9854 -        * Print function call entries within a stack. 'cond' is the
9855 -        * "end of stackframe" condition, that the 'stack++'
9856 -        * iteration will eventually trigger.
9857 -        */
9858 -#define HANDLE_STACK(cond) \
9859 -       do while (cond) { \
9860 -               unsigned long addr = *stack++; \
9861 -               /* Use unlocked access here because except for NMIs     \
9862 -                  we should be already protected against module unloads */ \
9863 -               if (__kernel_text_address(addr)) { \
9864 -                       /* \
9865 -                        * If the address is either in the text segment of the \
9866 -                        * kernel, or in the region which contains vmalloc'ed \
9867 -                        * memory, it *may* be the address of a calling \
9868 -                        * routine; if so, print it so that someone tracing \
9869 -                        * down the cause of the crash will be able to figure \
9870 -                        * out the call path that was taken. \
9871 -                        */ \
9872 -                       ops->address(data, addr);   \
9873 -               } \
9874 -       } while (0)
9875 +#ifdef CONFIG_FRAME_POINTER
9876 +       if (!bp) {
9877 +               if (tsk == current) {
9878 +                       /* Grab bp right from our regs */
9879 +                       asm("movq %%rbp, %0" : "=r" (bp):);
9880 +               } else {
9881 +                       /* bp is the last reg pushed by switch_to */
9882 +                       bp = *(unsigned long *) tsk->thread.sp;
9883 +               }
9884 +       }
9885 +#endif
9886 +
9887 +
9888
9889         /*
9890          * Print function call entries in all stacks, starting at the
9891 @@ -272,7 +306,9 @@ void dump_trace(struct task_struct *tsk,
9892                 if (estack_end) {
9893                         if (ops->stack(data, id) < 0)
9894                                 break;
9895 -                       HANDLE_STACK (stack < estack_end);
9896 +
9897 +                       bp = print_context_stack(tinfo, stack, bp, ops,
9898 +                                                       data, estack_end);
9899                         ops->stack(data, "<EOE>");
9900                         /*
9901                          * We link to the next stack via the
9902 @@ -290,7 +326,8 @@ void dump_trace(struct task_struct *tsk,
9903                         if (stack >= irqstack && stack < irqstack_end) {
9904                                 if (ops->stack(data, "IRQ") < 0)
9905                                         break;
9906 -                               HANDLE_STACK (stack < irqstack_end);
9907 +                               bp = print_context_stack(tinfo, stack, bp,
9908 +                                               ops, data, irqstack_end);
9909                                 /*
9910                                  * We link to the next stack (which would be
9911                                  * the process stack normally) the last
9912 @@ -308,9 +345,7 @@ void dump_trace(struct task_struct *tsk,
9913         /*
9914          * This handles the process stack:
9915          */
9916 -       tinfo = task_thread_info(tsk);
9917 -       HANDLE_STACK (valid_stack_ptr(tinfo, stack));
9918 -#undef HANDLE_STACK
9919 +       bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
9920         put_cpu();
9921  }
9922  EXPORT_SYMBOL(dump_trace);
9923 @@ -333,10 +368,10 @@ static int print_trace_stack(void *data,
9924         return 0;
9925  }
9926
9927 -static void print_trace_address(void *data, unsigned long addr)
9928 +static void print_trace_address(void *data, unsigned long addr, int reliable)
9929  {
9930         touch_nmi_watchdog();
9931 -       printk_address(addr);
9932 +       printk_address(addr, reliable);
9933  }
9934
9935  static const struct stacktrace_ops print_trace_ops = {
9936 @@ -347,15 +382,17 @@ static const struct stacktrace_ops print
9937  };
9938
9939  void
9940 -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
9941 +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
9942 +               unsigned long bp)
9943  {
9944         printk("\nCall Trace:\n");
9945 -       dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
9946 +       dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
9947         printk("\n");
9948  }
9949
9950  static void
9951 -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
9952 +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
9953 +                                                       unsigned long bp)
9954  {
9955         unsigned long *stack;
9956         int i;
9957 @@ -366,14 +403,14 @@ _show_stack(struct task_struct *tsk, str
9958         // debugging aid: "show_stack(NULL, NULL);" prints the
9959         // back trace for this cpu.
9960
9961 -       if (rsp == NULL) {
9962 +       if (sp == NULL) {
9963                 if (tsk)
9964 -                       rsp = (unsigned long *)tsk->thread.rsp;
9965 +                       sp = (unsigned long *)tsk->thread.sp;
9966                 else
9967 -                       rsp = (unsigned long *)&rsp;
9968 +                       sp = (unsigned long *)&sp;
9969         }
9970
9971 -       stack = rsp;
9972 +       stack = sp;
9973         for(i=0; i < kstack_depth_to_print; i++) {
9974                 if (stack >= irqstack && stack <= irqstack_end) {
9975                         if (stack == irqstack_end) {
9976 @@ -389,12 +426,12 @@ _show_stack(struct task_struct *tsk, str
9977                 printk(" %016lx", *stack++);
9978                 touch_nmi_watchdog();
9979         }
9980 -       show_trace(tsk, regs, rsp);
9981 +       show_trace(tsk, regs, sp, bp);
9982  }
9983
9984 -void show_stack(struct task_struct *tsk, unsigned long * rsp)
9985 +void show_stack(struct task_struct *tsk, unsigned long * sp)
9986  {
9987 -       _show_stack(tsk, NULL, rsp);
9988 +       _show_stack(tsk, NULL, sp, 0);
9989  }
9990
9991  /*
9992 @@ -403,13 +440,19 @@ void show_stack(struct task_struct *tsk,
9993  void dump_stack(void)
9994  {
9995         unsigned long dummy;
9996 +       unsigned long bp = 0;
9997 +
9998 +#ifdef CONFIG_FRAME_POINTER
9999 +       if (!bp)
10000 +               asm("movq %%rbp, %0" : "=r" (bp):);
10001 +#endif
10002
10003         printk("Pid: %d, comm: %.20s %s %s %.*s\n",
10004                 current->pid, current->comm, print_tainted(),
10005                 init_utsname()->release,
10006                 (int)strcspn(init_utsname()->version, " "),
10007                 init_utsname()->version);
10008 -       show_trace(NULL, NULL, &dummy);
10009 +       show_trace(NULL, NULL, &dummy, bp);
10010  }
10011
10012  EXPORT_SYMBOL(dump_stack);
10013 @@ -417,12 +460,15 @@ EXPORT_SYMBOL(dump_stack);
10014  void show_registers(struct pt_regs *regs)
10015  {
10016         int i;
10017 -       int in_kernel = !user_mode(regs);
10018 -       unsigned long rsp;
10019 +       unsigned long sp;
10020         const int cpu = smp_processor_id();
10021         struct task_struct *cur = cpu_pda(cpu)->pcurrent;
10022 +       u8 *ip;
10023 +       unsigned int code_prologue = code_bytes * 43 / 64;
10024 +       unsigned int code_len = code_bytes;
10025
10026 -       rsp = regs->rsp;
10027 +       sp = regs->sp;
10028 +       ip = (u8 *) regs->ip - code_prologue;
10029         printk("CPU %d ", cpu);
10030         __show_regs(regs);
10031         printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
10032 @@ -432,45 +478,43 @@ void show_registers(struct pt_regs *regs
10033          * When in-kernel, we also print out the stack and code at the
10034          * time of the fault..
10035          */
10036 -       if (in_kernel) {
10037 +       if (!user_mode(regs)) {
10038 +               unsigned char c;
10039                 printk("Stack: ");
10040 -               _show_stack(NULL, regs, (unsigned long*)rsp);
10041 +               _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
10042 +               printk("\n");
10043
10044 -               printk("\nCode: ");
10045 -               if (regs->rip < PAGE_OFFSET)
10046 -                       goto bad;
10047 -
10048 -               for (i=0; i<20; i++) {
10049 -                       unsigned char c;
10050 -                       if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
10051 -bad:
10052 +               printk(KERN_EMERG "Code: ");
10053 +               if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
10054 +                       /* try starting at RIP */
10055 +                       ip = (u8 *) regs->ip;
10056 +                       code_len = code_len - code_prologue + 1;
10057 +               }
10058 +               for (i = 0; i < code_len; i++, ip++) {
10059 +                       if (ip < (u8 *)PAGE_OFFSET ||
10060 +                                       probe_kernel_address(ip, c)) {
10061                                 printk(" Bad RIP value.");
10062                                 break;
10063                         }
10064 -                       printk("%02x ", c);
10065 +                       if (ip == (u8 *)regs->ip)
10066 +                               printk("<%02x> ", c);
10067 +                       else
10068 +                               printk("%02x ", c);
10069                 }
10070         }
10071         printk("\n");
10072  }
10073
10074 -int is_valid_bugaddr(unsigned long rip)
10075 +int is_valid_bugaddr(unsigned long ip)
10076  {
10077         unsigned short ud2;
10078
10079 -       if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2)))
10080 +       if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
10081                 return 0;
10082
10083         return ud2 == 0x0b0f;
10084  }
10085
10086 -#ifdef CONFIG_BUG
10087 -void out_of_line_bug(void)
10088 -{
10089 -       BUG();
10090 -}
10091 -EXPORT_SYMBOL(out_of_line_bug);
10092 -#endif
10093 -
10094  static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
10095  static int die_owner = -1;
10096  static unsigned int die_nest_count;
10097 @@ -498,7 +542,7 @@ unsigned __kprobes long oops_begin(void)
10098         return flags;
10099  }
10100
10101 -void __kprobes oops_end(unsigned long flags)
10102 +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
10103  {
10104         die_owner = -1;
10105         bust_spinlocks(0);
10106 @@ -507,12 +551,17 @@ void __kprobes oops_end(unsigned long fl
10107                 /* Nest count reaches zero, release the lock. */
10108                 __raw_spin_unlock(&die_lock);
10109         raw_local_irq_restore(flags);
10110 +       if (!regs) {
10111 +               oops_exit();
10112 +               return;
10113 +       }
10114         if (panic_on_oops)
10115                 panic("Fatal exception");
10116         oops_exit();
10117 +       do_exit(signr);
10118  }
10119
10120 -void __kprobes __die(const char * str, struct pt_regs * regs, long err)
10121 +int __kprobes __die(const char * str, struct pt_regs * regs, long err)
10122  {
10123         static int die_counter;
10124         printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
10125 @@ -526,15 +575,17 @@ void __kprobes __die(const char * str, s
10126         printk("DEBUG_PAGEALLOC");
10127  #endif
10128         printk("\n");
10129 -       notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
10130 +       if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
10131 +               return 1;
10132         show_registers(regs);
10133         add_taint(TAINT_DIE);
10134         /* Executive summary in case the oops scrolled away */
10135         printk(KERN_ALERT "RIP ");
10136 -       printk_address(regs->rip);
10137 -       printk(" RSP <%016lx>\n", regs->rsp);
10138 +       printk_address(regs->ip, 1);
10139 +       printk(" RSP <%016lx>\n", regs->sp);
10140         if (kexec_should_crash(current))
10141                 crash_kexec(regs);
10142 +       return 0;
10143  }
10144
10145  void die(const char * str, struct pt_regs * regs, long err)
10146 @@ -542,11 +593,11 @@ void die(const char * str, struct pt_reg
10147         unsigned long flags = oops_begin();
10148
10149         if (!user_mode(regs))
10150 -               report_bug(regs->rip, regs);
10151 +               report_bug(regs->ip, regs);
10152
10153 -       __die(str, regs, err);
10154 -       oops_end(flags);
10155 -       do_exit(SIGSEGV);
10156 +       if (__die(str, regs, err))
10157 +               regs = NULL;
10158 +       oops_end(flags, regs, SIGSEGV);
10159  }
10160
10161  #if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL)
10162 @@ -564,10 +615,10 @@ void __kprobes die_nmi(char *str, struct
10163                 crash_kexec(regs);
10164         if (do_panic || panic_on_oops)
10165                 panic("Non maskable interrupt");
10166 -       oops_end(flags);
10167 +       oops_end(flags, NULL, SIGBUS);
10168         nmi_exit();
10169         local_irq_enable();
10170 -       do_exit(SIGSEGV);
10171 +       do_exit(SIGBUS);
10172  }
10173  #endif
10174
10175 @@ -592,11 +643,14 @@ static void __kprobes do_trap(int trapnr
10176                 tsk->thread.trap_no = trapnr;
10177
10178                 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
10179 -                   printk_ratelimit())
10180 +                   printk_ratelimit()) {
10181                         printk(KERN_INFO
10182 -                              "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
10183 +                              "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
10184                                tsk->comm, tsk->pid, str,
10185 -                              regs->rip, regs->rsp, error_code);
10186 +                              regs->ip, regs->sp, error_code);
10187 +                       print_vma_addr(" in ", regs->ip);
10188 +                       printk("\n");
10189 +               }
10190
10191                 if (info)
10192                         force_sig_info(signr, info, tsk);
10193 @@ -606,19 +660,12 @@ static void __kprobes do_trap(int trapnr
10194         }
10195
10196
10197 -       /* kernel trap */
10198 -       {
10199 -               const struct exception_table_entry *fixup;
10200 -               fixup = search_exception_tables(regs->rip);
10201 -               if (fixup)
10202 -                       regs->rip = fixup->fixup;
10203 -               else {
10204 -                       tsk->thread.error_code = error_code;
10205 -                       tsk->thread.trap_no = trapnr;
10206 -                       die(str, regs, error_code);
10207 -               }
10208 -               return;
10209 +       if (!fixup_exception(regs)) {
10210 +               tsk->thread.error_code = error_code;
10211 +               tsk->thread.trap_no = trapnr;
10212 +               die(str, regs, error_code);
10213         }
10214 +       return;
10215  }
10216
10217  #define DO_ERROR(trapnr, signr, str, name) \
10218 @@ -647,10 +694,10 @@ asmlinkage void do_##name(struct pt_regs
10219         do_trap(trapnr, signr, str, regs, error_code, &info); \
10220  }
10221
10222 -DO_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->rip)
10223 +DO_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->ip)
10224  DO_ERROR( 4, SIGSEGV, "overflow", overflow)
10225  DO_ERROR( 5, SIGSEGV, "bounds", bounds)
10226 -DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
10227 +DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
10228  DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
10229  DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
10230  DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
10231 @@ -698,32 +745,28 @@ asmlinkage void __kprobes do_general_pro
10232                 tsk->thread.trap_no = 13;
10233
10234                 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
10235 -                   printk_ratelimit())
10236 +                   printk_ratelimit()) {
10237                         printk(KERN_INFO
10238 -                      "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
10239 +                      "%s[%d] general protection ip:%lx sp:%lx error:%lx",
10240                                tsk->comm, tsk->pid,
10241 -                              regs->rip, regs->rsp, error_code);
10242 +                              regs->ip, regs->sp, error_code);
10243 +                       print_vma_addr(" in ", regs->ip);
10244 +                       printk("\n");
10245 +               }
10246
10247                 force_sig(SIGSEGV, tsk);
10248                 return;
10249         }
10250
10251 -       /* kernel gp */
10252 -       {
10253 -               const struct exception_table_entry *fixup;
10254 -               fixup = search_exception_tables(regs->rip);
10255 -               if (fixup) {
10256 -                       regs->rip = fixup->fixup;
10257 -                       return;
10258 -               }
10259 +       if (fixup_exception(regs))
10260 +               return;
10261
10262 -               tsk->thread.error_code = error_code;
10263 -               tsk->thread.trap_no = 13;
10264 -               if (notify_die(DIE_GPF, "general protection fault", regs,
10265 -                                       error_code, 13, SIGSEGV) == NOTIFY_STOP)
10266 -                       return;
10267 -               die("general protection fault", regs, error_code);
10268 -       }
10269 +       tsk->thread.error_code = error_code;
10270 +       tsk->thread.trap_no = 13;
10271 +       if (notify_die(DIE_GPF, "general protection fault", regs,
10272 +                               error_code, 13, SIGSEGV) == NOTIFY_STOP)
10273 +               return;
10274 +       die("general protection fault", regs, error_code);
10275  }
10276
10277  static __kprobes void
10278 @@ -833,15 +876,15 @@ asmlinkage __kprobes struct pt_regs *syn
10279  {
10280         struct pt_regs *regs = eregs;
10281         /* Did already sync */
10282 -       if (eregs == (struct pt_regs *)eregs->rsp)
10283 +       if (eregs == (struct pt_regs *)eregs->sp)
10284                 ;
10285         /* Exception from user space */
10286         else if (user_mode(eregs))
10287                 regs = task_pt_regs(current);
10288         /* Exception from kernel and interrupts are enabled. Move to
10289            kernel process stack. */
10290 -       else if (eregs->eflags & X86_EFLAGS_IF)
10291 -               regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
10292 +       else if (eregs->flags & X86_EFLAGS_IF)
10293 +               regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
10294         if (eregs != regs)
10295                 *regs = *eregs;
10296         return regs;
10297 @@ -859,6 +902,12 @@ asmlinkage void __kprobes do_debug(struc
10298
10299         get_debugreg(condition, 6);
10300
10301 +       /*
10302 +        * The processor cleared BTF, so don't mark that we need it set.
10303 +        */
10304 +       clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
10305 +       tsk->thread.debugctlmsr = 0;
10306 +
10307         if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
10308                                                 SIGTRAP) == NOTIFY_STOP)
10309                 return;
10310 @@ -874,27 +923,14 @@ asmlinkage void __kprobes do_debug(struc
10311
10312         tsk->thread.debugreg6 = condition;
10313
10314 -       /* Mask out spurious TF errors due to lazy TF clearing */
10315 +
10316 +       /*
10317 +        * Single-stepping through TF: make sure we ignore any events in
10318 +        * kernel space (but re-enable TF when returning to user mode).
10319 +        */
10320         if (condition & DR_STEP) {
10321 -               /*
10322 -                * The TF error should be masked out only if the current
10323 -                * process is not traced and if the TRAP flag has been set
10324 -                * previously by a tracing process (condition detected by
10325 -                * the PT_DTRACE flag); remember that the i386 TRAP flag
10326 -                * can be modified by the process itself in user mode,
10327 -                * allowing programs to debug themselves without the ptrace()
10328 -                * interface.
10329 -                */
10330                  if (!user_mode(regs))
10331                         goto clear_TF_reenable;
10332 -               /*
10333 -                * Was the TF flag set by a debugger? If so, clear it now,
10334 -                * so that register information is correct.
10335 -                */
10336 -               if (tsk->ptrace & PT_DTRACE) {
10337 -                       regs->eflags &= ~TF_MASK;
10338 -                       tsk->ptrace &= ~PT_DTRACE;
10339 -               }
10340         }
10341
10342         /* Ok, finally something we can handle */
10343 @@ -903,7 +939,7 @@ asmlinkage void __kprobes do_debug(struc
10344         info.si_signo = SIGTRAP;
10345         info.si_errno = 0;
10346         info.si_code = TRAP_BRKPT;
10347 -       info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
10348 +       info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
10349         force_sig_info(SIGTRAP, &info, tsk);
10350
10351  clear_dr7:
10352 @@ -913,18 +949,15 @@ clear_dr7:
10353
10354  clear_TF_reenable:
10355         set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
10356 -       regs->eflags &= ~TF_MASK;
10357 +       regs->flags &= ~X86_EFLAGS_TF;
10358         preempt_conditional_cli(regs);
10359  }
10360
10361  static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
10362  {
10363 -       const struct exception_table_entry *fixup;
10364 -       fixup = search_exception_tables(regs->rip);
10365 -       if (fixup) {
10366 -               regs->rip = fixup->fixup;
10367 +       if (fixup_exception(regs))
10368                 return 1;
10369 -       }
10370 +
10371         notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
10372         /* Illegal floating point operation in the kernel */
10373         current->thread.trap_no = trapnr;
10374 @@ -939,7 +972,7 @@ static int kernel_math_error(struct pt_r
10375   */
10376  asmlinkage void do_coprocessor_error(struct pt_regs *regs)
10377  {
10378 -       void __user *rip = (void __user *)(regs->rip);
10379 +       void __user *ip = (void __user *)(regs->ip);
10380         struct task_struct * task;
10381         siginfo_t info;
10382         unsigned short cwd, swd;
10383 @@ -959,7 +992,7 @@ asmlinkage void do_coprocessor_error(str
10384         info.si_signo = SIGFPE;
10385         info.si_errno = 0;
10386         info.si_code = __SI_FAULT;
10387 -       info.si_addr = rip;
10388 +       info.si_addr = ip;
10389         /*
10390          * (~cwd & swd) will mask out exceptions that are not set to unmasked
10391          * status.  0x3f is the exception bits in these regs, 0x200 is the
10392 @@ -1008,7 +1041,7 @@ asmlinkage void bad_intr(void)
10393
10394  asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
10395  {
10396 -       void __user *rip = (void __user *)(regs->rip);
10397 +       void __user *ip = (void __user *)(regs->ip);
10398         struct task_struct * task;
10399         siginfo_t info;
10400         unsigned short mxcsr;
10401 @@ -1028,7 +1061,7 @@ asmlinkage void do_simd_coprocessor_erro
10402         info.si_signo = SIGFPE;
10403         info.si_errno = 0;
10404         info.si_code = __SI_FAULT;
10405 -       info.si_addr = rip;
10406 +       info.si_addr = ip;
10407         /*
10408          * The SIMD FPU exceptions are handled a little differently, as there
10409          * is only a single status/control register.  Thus, to determine which
10410 @@ -1092,13 +1125,14 @@ asmlinkage void math_state_restore(void)
10411         task_thread_info(me)->status |= TS_USEDFPU;
10412         me->fpu_counter++;
10413  }
10414 +EXPORT_SYMBOL_GPL(math_state_restore);
10415
10416
10417  /*
10418   * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
10419   * specify <dpl>|4 in the second field.
10420   */
10421 -static trap_info_t __cpuinitdata trap_table[] = {
10422 +static const trap_info_t __cpuinitconst trap_table[] = {
10423          {  0, 0|4, __KERNEL_CS, (unsigned long)divide_error               },
10424          {  1, 0|4, __KERNEL_CS, (unsigned long)debug                      },
10425          {  3, 3|4, __KERNEL_CS, (unsigned long)int3                       },
10426 @@ -1169,3 +1203,14 @@ static int __init kstack_setup(char *s)
10427         return 0;
10428  }
10429  early_param("kstack", kstack_setup);
10430 +
10431 +
10432 +static int __init code_bytes_setup(char *s)
10433 +{
10434 +       code_bytes = simple_strtoul(s, NULL, 0);
10435 +       if (code_bytes > 8192)
10436 +               code_bytes = 8192;
10437 +
10438 +       return 1;
10439 +}
10440 +__setup("code_bytes=", code_bytes_setup);
10441 --- a/arch/x86/kernel/vsyscall_64-xen.c
10442 +++ b/arch/x86/kernel/vsyscall_64-xen.c
10443 @@ -43,12 +43,7 @@
10444  #include <asm/vgtod.h>
10445
10446  #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
10447 -#define __syscall_clobber "r11","rcx","memory"
10448 -#define __pa_vsymbol(x)                        \
10449 -       ({unsigned long v;              \
10450 -       extern char __vsyscall_0;       \
10451 -         asm("" : "=r" (v) : "0" (x)); \
10452 -         ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); })
10453 +#define __syscall_clobber "r11","cx","memory"
10454
10455  /*
10456   * vsyscall_gtod_data contains data that is :
10457 @@ -102,7 +97,7 @@ static __always_inline void do_get_tz(st
10458  static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
10459  {
10460         int ret;
10461 -       asm volatile("vsysc2: syscall"
10462 +       asm volatile("syscall"
10463                 : "=a" (ret)
10464                 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
10465                 : __syscall_clobber );
10466 @@ -112,7 +107,7 @@ static __always_inline int gettimeofday(
10467  static __always_inline long time_syscall(long *t)
10468  {
10469         long secs;
10470 -       asm volatile("vsysc1: syscall"
10471 +       asm volatile("syscall"
10472                 : "=a" (secs)
10473                 : "0" (__NR_time),"D" (t) : __syscall_clobber);
10474         return secs;
10475 @@ -190,7 +185,7 @@ time_t __vsyscall(1) vtime(time_t *t)
10476  long __vsyscall(2)
10477  vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
10478  {
10479 -       unsigned int dummy, p;
10480 +       unsigned int p;
10481         unsigned long j = 0;
10482
10483         /* Fast cache - only recompute value once per jiffies and avoid
10484 @@ -205,7 +200,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s
10485                 p = tcache->blob[1];
10486         } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
10487                 /* Load per CPU data from RDTSCP */
10488 -               rdtscp(dummy, dummy, p);
10489 +               native_read_tscp(&p);
10490         } else {
10491                 /* Load per CPU data from GDT */
10492                 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
10493 @@ -228,42 +223,11 @@ long __vsyscall(3) venosys_1(void)
10494
10495  #ifdef CONFIG_SYSCTL
10496
10497 -#define SYSCALL 0x050f
10498 -#define NOP2    0x9090
10499 -
10500 -/*
10501 - * NOP out syscall in vsyscall page when not needed.
10502 - */
10503 -static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
10504 -                        void __user *buffer, size_t *lenp, loff_t *ppos)
10505 +static int
10506 +vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
10507 +                      void __user *buffer, size_t *lenp, loff_t *ppos)
10508  {
10509 -       extern u16 vsysc1, vsysc2;
10510 -       u16 __iomem *map1;
10511 -       u16 __iomem *map2;
10512 -       int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
10513 -       if (!write)
10514 -               return ret;
10515 -       /* gcc has some trouble with __va(__pa()), so just do it this
10516 -          way. */
10517 -       map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
10518 -       if (!map1)
10519 -               return -ENOMEM;
10520 -       map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
10521 -       if (!map2) {
10522 -               ret = -ENOMEM;
10523 -               goto out;
10524 -       }
10525 -       if (!vsyscall_gtod_data.sysctl_enabled) {
10526 -               writew(SYSCALL, map1);
10527 -               writew(SYSCALL, map2);
10528 -       } else {
10529 -               writew(NOP2, map1);
10530 -               writew(NOP2, map2);
10531 -       }
10532 -       iounmap(map2);
10533 -out:
10534 -       iounmap(map1);
10535 -       return ret;
10536 +       return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
10537  }
10538
10539  static ctl_table kernel_table2[] = {
10540 @@ -279,7 +243,6 @@ static ctl_table kernel_root_table2[] =
10541           .child = kernel_table2 },
10542         {}
10543  };
10544 -
10545  #endif
10546
10547  /* Assume __initcall executes before all user space. Hopefully kmod
10548 @@ -301,7 +264,7 @@ static void __cpuinit vsyscall_set_cpu(i
10549         d |= cpu;
10550         d |= (node & 0xf) << 12;
10551         d |= (node >> 4) << 48;
10552 -       if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt(cpu)
10553 +       if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
10554                                                          + GDT_ENTRY_PER_CPU),
10555                                          d))
10556                 BUG();
10557 @@ -322,7 +285,7 @@ cpu_vsyscall_notifier(struct notifier_bl
10558         return NOTIFY_DONE;
10559  }
10560
10561 -static void __init map_vsyscall(void)
10562 +void __init map_vsyscall(void)
10563  {
10564         extern char __vsyscall_0;
10565         unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
10566 @@ -338,7 +301,6 @@ static int __init vsyscall_init(void)
10567         BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
10568         BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
10569         BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
10570 -       map_vsyscall();
10571  #ifdef CONFIG_XEN
10572         vsyscall_gtod_data.sysctl_enabled = 0; /* disable vgettimeofay() */
10573         if (boot_cpu_has(X86_FEATURE_RDTSCP))
10574 --- a/arch/x86/kernel/xen_entry_64.S
10575 +++ /dev/null
10576 @@ -1,36 +0,0 @@
10577 -/*
10578 - * Copied from arch/xen/i386/kernel/entry.S
10579 - */
10580 -/* Offsets into shared_info_t. */
10581 -#define evtchn_upcall_pending          /* 0 */
10582 -#define evtchn_upcall_mask             1
10583 -
10584 -#define sizeof_vcpu_shift              6
10585 -
10586 -#ifdef CONFIG_SMP
10587 -//#define preempt_disable(reg) incl threadinfo_preempt_count(reg)
10588 -//#define preempt_enable(reg)  decl threadinfo_preempt_count(reg)
10589 -#define preempt_disable(reg)
10590 -#define preempt_enable(reg)
10591 -#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp)                   ; \
10592 -                               movq %gs:pda_cpunumber,reg              ; \
10593 -                               shl  $32, reg                           ; \
10594 -                               shr  $32-sizeof_vcpu_shift,reg          ; \
10595 -                               addq HYPERVISOR_shared_info,reg
10596 -#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp)                    ; \
10597 -#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
10598 -#else
10599 -#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
10600 -#define XEN_PUT_VCPU_INFO(reg)
10601 -#define XEN_PUT_VCPU_INFO_fixup
10602 -#endif
10603 -
10604 -#define XEN_LOCKED_BLOCK_EVENTS(reg)   movb $1,evtchn_upcall_mask(reg)
10605 -#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
10606 -#define XEN_BLOCK_EVENTS(reg)  XEN_GET_VCPU_INFO(reg)                  ; \
10607 -                               XEN_LOCKED_BLOCK_EVENTS(reg)            ; \
10608 -                               XEN_PUT_VCPU_INFO(reg)
10609 -#define XEN_UNBLOCK_EVENTS(reg)        XEN_GET_VCPU_INFO(reg)                  ; \
10610 -                               XEN_LOCKED_UNBLOCK_EVENTS(reg)          ; \
10611 -                               XEN_PUT_VCPU_INFO(reg)
10612 -#define XEN_TEST_PENDING(reg)  testb $0xFF,evtchn_upcall_pending(reg)
10613 --- a/arch/x86/mach-xen/setup.c
10614 +++ b/arch/x86/mach-xen/setup.c
10615 @@ -161,15 +161,12 @@ void __init machine_specific_arch_setup(
10616
10617         /* Do an early initialization of the fixmap area */
10618         {
10619 -               extern pte_t swapper_pg_pmd[PTRS_PER_PTE];
10620 +               extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
10621                 unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
10622 -               pgd_t *pgd = (pgd_t *)xen_start_info->pt_base;
10623 -               pud_t *pud = pud_offset(pgd + pgd_index(addr), addr);
10624 +               pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
10625                 pmd_t *pmd = pmd_offset(pud, addr);
10626
10627 -               swapper_pg_dir = pgd;
10628 -               init_mm.pgd    = pgd;
10629 -               make_lowmem_page_readonly(swapper_pg_pmd, XENFEAT_writable_page_tables);
10630 -               set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_pmd) | _PAGE_TABLE));
10631 +               make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
10632 +               set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
10633         }
10634  }
10635 --- a/arch/x86/mm/fault_32-xen.c
10636 +++ /dev/null
10637 @@ -1,757 +0,0 @@
10638 -/*
10639 - *  linux/arch/i386/mm/fault.c
10640 - *
10641 - *  Copyright (C) 1995  Linus Torvalds
10642 - */
10643 -
10644 -#include <linux/signal.h>
10645 -#include <linux/sched.h>
10646 -#include <linux/kernel.h>
10647 -#include <linux/errno.h>
10648 -#include <linux/string.h>
10649 -#include <linux/types.h>
10650 -#include <linux/ptrace.h>
10651 -#include <linux/mman.h>
10652 -#include <linux/mm.h>
10653 -#include <linux/smp.h>
10654 -#include <linux/interrupt.h>
10655 -#include <linux/init.h>
10656 -#include <linux/tty.h>
10657 -#include <linux/vt_kern.h>             /* For unblank_screen() */
10658 -#include <linux/highmem.h>
10659 -#include <linux/bootmem.h>             /* for max_low_pfn */
10660 -#include <linux/vmalloc.h>
10661 -#include <linux/module.h>
10662 -#include <linux/kprobes.h>
10663 -#include <linux/uaccess.h>
10664 -#include <linux/kdebug.h>
10665 -#include <linux/kprobes.h>
10666 -
10667 -#include <asm/system.h>
10668 -#include <asm/desc.h>
10669 -#include <asm/segment.h>
10670 -
10671 -extern void die(const char *,struct pt_regs *,long);
10672 -
10673 -#ifdef CONFIG_KPROBES
10674 -static inline int notify_page_fault(struct pt_regs *regs)
10675 -{
10676 -       int ret = 0;
10677 -
10678 -       /* kprobe_running() needs smp_processor_id() */
10679 -       if (!user_mode_vm(regs)) {
10680 -               preempt_disable();
10681 -               if (kprobe_running() && kprobe_fault_handler(regs, 14))
10682 -                       ret = 1;
10683 -               preempt_enable();
10684 -       }
10685 -
10686 -       return ret;
10687 -}
10688 -#else
10689 -static inline int notify_page_fault(struct pt_regs *regs)
10690 -{
10691 -       return 0;
10692 -}
10693 -#endif
10694 -
10695 -/*
10696 - * Return EIP plus the CS segment base.  The segment limit is also
10697 - * adjusted, clamped to the kernel/user address space (whichever is
10698 - * appropriate), and returned in *eip_limit.
10699 - *
10700 - * The segment is checked, because it might have been changed by another
10701 - * task between the original faulting instruction and here.
10702 - *
10703 - * If CS is no longer a valid code segment, or if EIP is beyond the
10704 - * limit, or if it is a kernel address when CS is not a kernel segment,
10705 - * then the returned value will be greater than *eip_limit.
10706 - *
10707 - * This is slow, but is very rarely executed.
10708 - */
10709 -static inline unsigned long get_segment_eip(struct pt_regs *regs,
10710 -                                           unsigned long *eip_limit)
10711 -{
10712 -       unsigned long eip = regs->eip;
10713 -       unsigned seg = regs->xcs & 0xffff;
10714 -       u32 seg_ar, seg_limit, base, *desc;
10715 -
10716 -       /* Unlikely, but must come before segment checks. */
10717 -       if (unlikely(regs->eflags & VM_MASK)) {
10718 -               base = seg << 4;
10719 -               *eip_limit = base + 0xffff;
10720 -               return base + (eip & 0xffff);
10721 -       }
10722 -
10723 -       /* The standard kernel/user address space limit. */
10724 -       *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
10725 -
10726 -       /* By far the most common cases. */
10727 -       if (likely(SEGMENT_IS_FLAT_CODE(seg)))
10728 -               return eip;
10729 -
10730 -       /* Check the segment exists, is within the current LDT/GDT size,
10731 -          that kernel/user (ring 0..3) has the appropriate privilege,
10732 -          that it's a code segment, and get the limit. */
10733 -       __asm__ ("larl %3,%0; lsll %3,%1"
10734 -                : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
10735 -       if ((~seg_ar & 0x9800) || eip > seg_limit) {
10736 -               *eip_limit = 0;
10737 -               return 1;        /* So that returned eip > *eip_limit. */
10738 -       }
10739 -
10740 -       /* Get the GDT/LDT descriptor base.
10741 -          When you look for races in this code remember that
10742 -          LDT and other horrors are only used in user space. */
10743 -       if (seg & (1<<2)) {
10744 -               /* Must lock the LDT while reading it. */
10745 -               mutex_lock(&current->mm->context.lock);
10746 -               desc = current->mm->context.ldt;
10747 -               desc = (void *)desc + (seg & ~7);
10748 -       } else {
10749 -               /* Must disable preemption while reading the GDT. */
10750 -               desc = (u32 *)get_cpu_gdt_table(get_cpu());
10751 -               desc = (void *)desc + (seg & ~7);
10752 -       }
10753 -
10754 -       /* Decode the code segment base from the descriptor */
10755 -       base = get_desc_base((unsigned long *)desc);
10756 -
10757 -       if (seg & (1<<2)) {
10758 -               mutex_unlock(&current->mm->context.lock);
10759 -       } else
10760 -               put_cpu();
10761 -
10762 -       /* Adjust EIP and segment limit, and clamp at the kernel limit.
10763 -          It's legitimate for segments to wrap at 0xffffffff. */
10764 -       seg_limit += base;
10765 -       if (seg_limit < *eip_limit && seg_limit >= base)
10766 -               *eip_limit = seg_limit;
10767 -       return eip + base;
10768 -}
10769 -
10770 -/*
10771 - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
10772 - * Check that here and ignore it.
10773 - */
10774 -static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
10775 -{
10776 -       unsigned long limit;
10777 -       unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
10778 -       int scan_more = 1;
10779 -       int prefetch = 0;
10780 -       int i;
10781 -
10782 -       for (i = 0; scan_more && i < 15; i++) {
10783 -               unsigned char opcode;
10784 -               unsigned char instr_hi;
10785 -               unsigned char instr_lo;
10786 -
10787 -               if (instr > (unsigned char *)limit)
10788 -                       break;
10789 -               if (probe_kernel_address(instr, opcode))
10790 -                       break;
10791 -
10792 -               instr_hi = opcode & 0xf0;
10793 -               instr_lo = opcode & 0x0f;
10794 -               instr++;
10795 -
10796 -               switch (instr_hi) {
10797 -               case 0x20:
10798 -               case 0x30:
10799 -                       /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
10800 -                       scan_more = ((instr_lo & 7) == 0x6);
10801 -                       break;
10802 -
10803 -               case 0x60:
10804 -                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
10805 -                       scan_more = (instr_lo & 0xC) == 0x4;
10806 -                       break;
10807 -               case 0xF0:
10808 -                       /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
10809 -                       scan_more = !instr_lo || (instr_lo>>1) == 1;
10810 -                       break;
10811 -               case 0x00:
10812 -                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
10813 -                       scan_more = 0;
10814 -                       if (instr > (unsigned char *)limit)
10815 -                               break;
10816 -                       if (probe_kernel_address(instr, opcode))
10817 -                               break;
10818 -                       prefetch = (instr_lo == 0xF) &&
10819 -                               (opcode == 0x0D || opcode == 0x18);
10820 -                       break;
10821 -               default:
10822 -                       scan_more = 0;
10823 -                       break;
10824 -               }
10825 -       }
10826 -       return prefetch;
10827 -}
10828 -
10829 -static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
10830 -                             unsigned long error_code)
10831 -{
10832 -       if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
10833 -                    boot_cpu_data.x86 >= 6)) {
10834 -               /* Catch an obscure case of prefetch inside an NX page. */
10835 -               if (nx_enabled && (error_code & 16))
10836 -                       return 0;
10837 -               return __is_prefetch(regs, addr);
10838 -       }
10839 -       return 0;
10840 -}
10841 -
10842 -static noinline void force_sig_info_fault(int si_signo, int si_code,
10843 -       unsigned long address, struct task_struct *tsk)
10844 -{
10845 -       siginfo_t info;
10846 -
10847 -       info.si_signo = si_signo;
10848 -       info.si_errno = 0;
10849 -       info.si_code = si_code;
10850 -       info.si_addr = (void __user *)address;
10851 -       force_sig_info(si_signo, &info, tsk);
10852 -}
10853 -
10854 -fastcall void do_invalid_op(struct pt_regs *, unsigned long);
10855 -
10856 -#ifdef CONFIG_X86_PAE
10857 -static void dump_fault_path(unsigned long address)
10858 -{
10859 -       unsigned long *p, page;
10860 -       unsigned long mfn;
10861 -
10862 -       page = read_cr3();
10863 -       p  = (unsigned long *)__va(page);
10864 -       p += (address >> 30) * 2;
10865 -       printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
10866 -       if (p[0] & _PAGE_PRESENT) {
10867 -               mfn  = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
10868 -               page = mfn_to_pfn(mfn) << PAGE_SHIFT;
10869 -               p  = (unsigned long *)__va(page);
10870 -               address &= 0x3fffffff;
10871 -               p += (address >> 21) * 2;
10872 -               printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n",
10873 -                      page, p[1], p[0]);
10874 -               mfn  = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
10875 -#ifdef CONFIG_HIGHPTE
10876 -               if (mfn_to_pfn(mfn) >= highstart_pfn)
10877 -                       return;
10878 -#endif
10879 -               if ((p[0] & _PAGE_PRESENT) && !(p[0] & _PAGE_PSE)) {
10880 -                       page = mfn_to_pfn(mfn) << PAGE_SHIFT;
10881 -                       p  = (unsigned long *) __va(page);
10882 -                       address &= 0x001fffff;
10883 -                       p += (address >> 12) * 2;
10884 -                       printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
10885 -                              page, p[1], p[0]);
10886 -               }
10887 -       }
10888 -}
10889 -#else
10890 -static void dump_fault_path(unsigned long address)
10891 -{
10892 -       unsigned long page;
10893 -
10894 -       page = read_cr3();
10895 -       page = ((unsigned long *) __va(page))[address >> PGDIR_SHIFT];
10896 -       printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
10897 -              machine_to_phys(page));
10898 -       /*
10899 -        * We must not directly access the pte in the highpte
10900 -        * case if the page table is located in highmem.
10901 -        * And lets rather not kmap-atomic the pte, just in case
10902 -        * it's allocated already.
10903 -        */
10904 -       if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn
10905 -           && (page & _PAGE_PRESENT)
10906 -           && !(page & _PAGE_PSE)) {
10907 -               page = machine_to_phys(page & PAGE_MASK);
10908 -               page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT)
10909 -                                                     & (PTRS_PER_PTE - 1)];
10910 -               printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
10911 -                      machine_to_phys(page));
10912 -       }
10913 -}
10914 -#endif
10915 -
10916 -static int spurious_fault(struct pt_regs *regs,
10917 -                         unsigned long address,
10918 -                         unsigned long error_code)
10919 -{
10920 -       pgd_t *pgd;
10921 -       pud_t *pud;
10922 -       pmd_t *pmd;
10923 -       pte_t *pte;
10924 -
10925 -       /* Reserved-bit violation or user access to kernel space? */
10926 -       if (error_code & 0x0c)
10927 -               return 0;
10928 -
10929 -       pgd = init_mm.pgd + pgd_index(address);
10930 -       if (!pgd_present(*pgd))
10931 -               return 0;
10932 -
10933 -       pud = pud_offset(pgd, address);
10934 -       if (!pud_present(*pud))
10935 -               return 0;
10936 -
10937 -       pmd = pmd_offset(pud, address);
10938 -       if (!pmd_present(*pmd))
10939 -               return 0;
10940 -
10941 -       pte = pte_offset_kernel(pmd, address);
10942 -       if (!pte_present(*pte))
10943 -               return 0;
10944 -       if ((error_code & 0x02) && !pte_write(*pte))
10945 -               return 0;
10946 -#ifdef CONFIG_X86_PAE
10947 -       if ((error_code & 0x10) && (__pte_val(*pte) & _PAGE_NX))
10948 -               return 0;
10949 -#endif
10950 -
10951 -       return 1;
10952 -}
10953 -
10954 -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
10955 -{
10956 -       unsigned index = pgd_index(address);
10957 -       pgd_t *pgd_k;
10958 -       pud_t *pud, *pud_k;
10959 -       pmd_t *pmd, *pmd_k;
10960 -
10961 -       pgd += index;
10962 -       pgd_k = init_mm.pgd + index;
10963 -
10964 -       if (!pgd_present(*pgd_k))
10965 -               return NULL;
10966 -
10967 -       /*
10968 -        * set_pgd(pgd, *pgd_k); here would be useless on PAE
10969 -        * and redundant with the set_pmd() on non-PAE. As would
10970 -        * set_pud.
10971 -        */
10972 -
10973 -       pud = pud_offset(pgd, address);
10974 -       pud_k = pud_offset(pgd_k, address);
10975 -       if (!pud_present(*pud_k))
10976 -               return NULL;
10977 -
10978 -       pmd = pmd_offset(pud, address);
10979 -       pmd_k = pmd_offset(pud_k, address);
10980 -       if (!pmd_present(*pmd_k))
10981 -               return NULL;
10982 -       if (!pmd_present(*pmd)) {
10983 -               bool lazy = x86_read_percpu(xen_lazy_mmu);
10984 -
10985 -               x86_write_percpu(xen_lazy_mmu, false);
10986 -#if CONFIG_XEN_COMPAT > 0x030002
10987 -               set_pmd(pmd, *pmd_k);
10988 -#else
10989 -               /*
10990 -                * When running on older Xen we must launder *pmd_k through
10991 -                * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
10992 -                */
10993 -               set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
10994 -#endif
10995 -               x86_write_percpu(xen_lazy_mmu, lazy);
10996 -       } else
10997 -               BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
10998 -       return pmd_k;
10999 -}
11000 -
11001 -/*
11002 - * Handle a fault on the vmalloc or module mapping area
11003 - *
11004 - * This assumes no large pages in there.
11005 - */
11006 -static inline int vmalloc_fault(unsigned long address)
11007 -{
11008 -       unsigned long pgd_paddr;
11009 -       pmd_t *pmd_k;
11010 -       pte_t *pte_k;
11011 -       /*
11012 -        * Synchronize this task's top level page-table
11013 -        * with the 'reference' page table.
11014 -        *
11015 -        * Do _not_ use "current" here. We might be inside
11016 -        * an interrupt in the middle of a task switch..
11017 -        */
11018 -       pgd_paddr = read_cr3();
11019 -       pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
11020 -       if (!pmd_k)
11021 -               return -1;
11022 -       pte_k = pte_offset_kernel(pmd_k, address);
11023 -       if (!pte_present(*pte_k))
11024 -               return -1;
11025 -       return 0;
11026 -}
11027 -
11028 -int show_unhandled_signals = 1;
11029 -
11030 -/*
11031 - * This routine handles page faults.  It determines the address,
11032 - * and the problem, and then passes it off to one of the appropriate
11033 - * routines.
11034 - *
11035 - * error_code:
11036 - *     bit 0 == 0 means no page found, 1 means protection fault
11037 - *     bit 1 == 0 means read, 1 means write
11038 - *     bit 2 == 0 means kernel, 1 means user-mode
11039 - *     bit 3 == 1 means use of reserved bit detected
11040 - *     bit 4 == 1 means fault was an instruction fetch
11041 - */
11042 -fastcall void __kprobes do_page_fault(struct pt_regs *regs,
11043 -                                     unsigned long error_code)
11044 -{
11045 -       struct task_struct *tsk;
11046 -       struct mm_struct *mm;
11047 -       struct vm_area_struct * vma;
11048 -       unsigned long address;
11049 -       int write, si_code;
11050 -       int fault;
11051 -
11052 -       /*
11053 -        * We can fault from pretty much anywhere, with unknown IRQ state.
11054 -        */
11055 -       trace_hardirqs_fixup();
11056 -
11057 -       /* get the address */
11058 -        address = read_cr2();
11059 -
11060 -       /* Set the "privileged fault" bit to something sane. */
11061 -       error_code &= ~4;
11062 -       error_code |= (regs->xcs & 2) << 1;
11063 -       if (regs->eflags & X86_EFLAGS_VM)
11064 -               error_code |= 4;
11065 -
11066 -       tsk = current;
11067 -
11068 -       si_code = SEGV_MAPERR;
11069 -
11070 -       /*
11071 -        * We fault-in kernel-space virtual memory on-demand. The
11072 -        * 'reference' page table is init_mm.pgd.
11073 -        *
11074 -        * NOTE! We MUST NOT take any locks for this case. We may
11075 -        * be in an interrupt or a critical region, and should
11076 -        * only copy the information from the master page table,
11077 -        * nothing more.
11078 -        *
11079 -        * This verifies that the fault happens in kernel space
11080 -        * (error_code & 4) == 0, and that the fault was not a
11081 -        * protection error (error_code & 9) == 0.
11082 -        */
11083 -       if (unlikely(address >= TASK_SIZE)) {
11084 -#ifdef CONFIG_XEN
11085 -               /* Faults in hypervisor area can never be patched up. */
11086 -               if (address >= hypervisor_virt_start)
11087 -                       goto bad_area_nosemaphore;
11088 -#endif
11089 -               if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
11090 -                       return;
11091 -               /* Can take a spurious fault if mapping changes R/O -> R/W. */
11092 -               if (spurious_fault(regs, address, error_code))
11093 -                       return;
11094 -               if (notify_page_fault(regs))
11095 -                       return;
11096 -               /*
11097 -                * Don't take the mm semaphore here. If we fixup a prefetch
11098 -                * fault we could otherwise deadlock.
11099 -                */
11100 -               goto bad_area_nosemaphore;
11101 -       }
11102 -
11103 -       if (notify_page_fault(regs))
11104 -               return;
11105 -
11106 -       /* It's safe to allow irq's after cr2 has been saved and the vmalloc
11107 -          fault has been handled. */
11108 -       if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
11109 -               local_irq_enable();
11110 -
11111 -       mm = tsk->mm;
11112 -
11113 -       /*
11114 -        * If we're in an interrupt, have no user context or are running in an
11115 -        * atomic region then we must not take the fault..
11116 -        */
11117 -       if (in_atomic() || !mm)
11118 -               goto bad_area_nosemaphore;
11119 -
11120 -       /* When running in the kernel we expect faults to occur only to
11121 -        * addresses in user space.  All other faults represent errors in the
11122 -        * kernel and should generate an OOPS.  Unfortunately, in the case of an
11123 -        * erroneous fault occurring in a code path which already holds mmap_sem
11124 -        * we will deadlock attempting to validate the fault against the
11125 -        * address space.  Luckily the kernel only validly references user
11126 -        * space from well defined areas of code, which are listed in the
11127 -        * exceptions table.
11128 -        *
11129 -        * As the vast majority of faults will be valid we will only perform
11130 -        * the source reference check when there is a possibility of a deadlock.
11131 -        * Attempt to lock the address space, if we cannot we then validate the
11132 -        * source.  If this is invalid we can skip the address space check,
11133 -        * thus avoiding the deadlock.
11134 -        */
11135 -       if (!down_read_trylock(&mm->mmap_sem)) {
11136 -               if ((error_code & 4) == 0 &&
11137 -                   !search_exception_tables(regs->eip))
11138 -                       goto bad_area_nosemaphore;
11139 -               down_read(&mm->mmap_sem);
11140 -       }
11141 -
11142 -       vma = find_vma(mm, address);
11143 -       if (!vma)
11144 -               goto bad_area;
11145 -       if (vma->vm_start <= address)
11146 -               goto good_area;
11147 -       if (!(vma->vm_flags & VM_GROWSDOWN))
11148 -               goto bad_area;
11149 -       if (error_code & 4) {
11150 -               /*
11151 -                * Accessing the stack below %esp is always a bug.
11152 -                * The large cushion allows instructions like enter
11153 -                * and pusha to work.  ("enter $65535,$31" pushes
11154 -                * 32 pointers and then decrements %esp by 65535.)
11155 -                */
11156 -               if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
11157 -                       goto bad_area;
11158 -       }
11159 -       if (expand_stack(vma, address))
11160 -               goto bad_area;
11161 -/*
11162 - * Ok, we have a good vm_area for this memory access, so
11163 - * we can handle it..
11164 - */
11165 -good_area:
11166 -       si_code = SEGV_ACCERR;
11167 -       write = 0;
11168 -       switch (error_code & 3) {
11169 -               default:        /* 3: write, present */
11170 -                               /* fall through */
11171 -               case 2:         /* write, not present */
11172 -                       if (!(vma->vm_flags & VM_WRITE))
11173 -                               goto bad_area;
11174 -                       write++;
11175 -                       break;
11176 -               case 1:         /* read, present */
11177 -                       goto bad_area;
11178 -               case 0:         /* read, not present */
11179 -                       if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
11180 -                               goto bad_area;
11181 -       }
11182 -
11183 - survive:
11184 -       /*
11185 -        * If for any reason at all we couldn't handle the fault,
11186 -        * make sure we exit gracefully rather than endlessly redo
11187 -        * the fault.
11188 -        */
11189 -       fault = handle_mm_fault(mm, vma, address, write);
11190 -       if (unlikely(fault & VM_FAULT_ERROR)) {
11191 -               if (fault & VM_FAULT_OOM)
11192 -                       goto out_of_memory;
11193 -               else if (fault & VM_FAULT_SIGBUS)
11194 -                       goto do_sigbus;
11195 -               BUG();
11196 -       }
11197 -       if (fault & VM_FAULT_MAJOR)
11198 -               tsk->maj_flt++;
11199 -       else
11200 -               tsk->min_flt++;
11201 -
11202 -       /*
11203 -        * Did it hit the DOS screen memory VA from vm86 mode?
11204 -        */
11205 -       if (regs->eflags & VM_MASK) {
11206 -               unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
11207 -               if (bit < 32)
11208 -                       tsk->thread.screen_bitmap |= 1 << bit;
11209 -       }
11210 -       up_read(&mm->mmap_sem);
11211 -       return;
11212 -
11213 -/*
11214 - * Something tried to access memory that isn't in our memory map..
11215 - * Fix it, but check if it's kernel or user first..
11216 - */
11217 -bad_area:
11218 -       up_read(&mm->mmap_sem);
11219 -
11220 -bad_area_nosemaphore:
11221 -       /* User mode accesses just cause a SIGSEGV */
11222 -       if (error_code & 4) {
11223 -               /*
11224 -                * It's possible to have interrupts off here.
11225 -                */
11226 -               local_irq_enable();
11227 -
11228 -               /*
11229 -                * Valid to do another page fault here because this one came
11230 -                * from user space.
11231 -                */
11232 -               if (is_prefetch(regs, address, error_code))
11233 -                       return;
11234 -
11235 -               if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
11236 -                   printk_ratelimit()) {
11237 -                       printk("%s%s[%d]: segfault at %08lx eip %08lx "
11238 -                           "esp %08lx error %lx\n",
11239 -                           task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
11240 -                           tsk->comm, task_pid_nr(tsk), address, regs->eip,
11241 -                           regs->esp, error_code);
11242 -               }
11243 -               tsk->thread.cr2 = address;
11244 -               /* Kernel addresses are always protection faults */
11245 -               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
11246 -               tsk->thread.trap_no = 14;
11247 -               force_sig_info_fault(SIGSEGV, si_code, address, tsk);
11248 -               return;
11249 -       }
11250 -
11251 -#ifdef CONFIG_X86_F00F_BUG
11252 -       /*
11253 -        * Pentium F0 0F C7 C8 bug workaround.
11254 -        */
11255 -       if (boot_cpu_data.f00f_bug) {
11256 -               unsigned long nr;
11257 -
11258 -               nr = (address - idt_descr.address) >> 3;
11259 -
11260 -               if (nr == 6) {
11261 -                       do_invalid_op(regs, 0);
11262 -                       return;
11263 -               }
11264 -       }
11265 -#endif
11266 -
11267 -no_context:
11268 -       /* Are we prepared to handle this kernel fault?  */
11269 -       if (fixup_exception(regs))
11270 -               return;
11271 -
11272 -       /*
11273 -        * Valid to do another page fault here, because if this fault
11274 -        * had been triggered by is_prefetch fixup_exception would have
11275 -        * handled it.
11276 -        */
11277 -       if (is_prefetch(regs, address, error_code))
11278 -               return;
11279 -
11280 -/*
11281 - * Oops. The kernel tried to access some bad page. We'll have to
11282 - * terminate things with extreme prejudice.
11283 - */
11284 -
11285 -       bust_spinlocks(1);
11286 -
11287 -       if (oops_may_print()) {
11288 -#ifdef CONFIG_X86_PAE
11289 -               if (error_code & 16) {
11290 -                       pte_t *pte = lookup_address(address);
11291 -
11292 -                       if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
11293 -                               printk(KERN_CRIT "kernel tried to execute "
11294 -                                       "NX-protected page - exploit attempt? "
11295 -                                       "(uid: %d)\n", current->uid);
11296 -               }
11297 -#endif
11298 -               if (address < PAGE_SIZE)
11299 -                       printk(KERN_ALERT "BUG: unable to handle kernel NULL "
11300 -                                       "pointer dereference");
11301 -               else
11302 -                       printk(KERN_ALERT "BUG: unable to handle kernel paging"
11303 -                                       " request");
11304 -               printk(" at virtual address %08lx\n",address);
11305 -               printk(KERN_ALERT "printing eip: %08lx\n", regs->eip);
11306 -               dump_fault_path(address);
11307 -       }
11308 -       tsk->thread.cr2 = address;
11309 -       tsk->thread.trap_no = 14;
11310 -       tsk->thread.error_code = error_code;
11311 -       die("Oops", regs, error_code);
11312 -       bust_spinlocks(0);
11313 -       do_exit(SIGKILL);
11314 -
11315 -/*
11316 - * We ran out of memory, or some other thing happened to us that made
11317 - * us unable to handle the page fault gracefully.
11318 - */
11319 -out_of_memory:
11320 -       up_read(&mm->mmap_sem);
11321 -       if (is_global_init(tsk)) {
11322 -               yield();
11323 -               down_read(&mm->mmap_sem);
11324 -               goto survive;
11325 -       }
11326 -       printk("VM: killing process %s\n", tsk->comm);
11327 -       if (error_code & 4)
11328 -               do_group_exit(SIGKILL);
11329 -       goto no_context;
11330 -
11331 -do_sigbus:
11332 -       up_read(&mm->mmap_sem);
11333 -
11334 -       /* Kernel mode? Handle exceptions or die */
11335 -       if (!(error_code & 4))
11336 -               goto no_context;
11337 -
11338 -       /* User space => ok to do another page fault */
11339 -       if (is_prefetch(regs, address, error_code))
11340 -               return;
11341 -
11342 -       tsk->thread.cr2 = address;
11343 -       tsk->thread.error_code = error_code;
11344 -       tsk->thread.trap_no = 14;
11345 -       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
11346 -}
11347 -
11348 -void vmalloc_sync_all(void)
11349 -{
11350 -       /*
11351 -        * Note that races in the updates of insync and start aren't
11352 -        * problematic: insync can only get set bits added, and updates to
11353 -        * start are only improving performance (without affecting correctness
11354 -        * if undone).
11355 -        * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
11356 -        *      This change works just fine with 2-level paging too.
11357 -        */
11358 -#define sync_index(a) ((a) >> PMD_SHIFT)
11359 -       static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
11360 -       static unsigned long start = TASK_SIZE;
11361 -       unsigned long address;
11362 -
11363 -       if (SHARED_KERNEL_PMD)
11364 -               return;
11365 -
11366 -       BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
11367 -       for (address = start;
11368 -            address >= TASK_SIZE && address < hypervisor_virt_start;
11369 -            address += 1UL << PMD_SHIFT) {
11370 -               if (!test_bit(sync_index(address), insync)) {
11371 -                       unsigned long flags;
11372 -                       struct page *page;
11373 -
11374 -                       spin_lock_irqsave(&pgd_lock, flags);
11375 -                       /* XEN: failure path assumes non-empty pgd_list. */
11376 -                       if (unlikely(!pgd_list)) {
11377 -                               spin_unlock_irqrestore(&pgd_lock, flags);
11378 -                               return;
11379 -                       }
11380 -                       for (page = pgd_list; page; page =
11381 -                                       (struct page *)page->index)
11382 -                               if (!vmalloc_sync_one(page_address(page),
11383 -                                                               address)) {
11384 -                                       BUG_ON(page != pgd_list);
11385 -                                       break;
11386 -                               }
11387 -                       spin_unlock_irqrestore(&pgd_lock, flags);
11388 -                       if (!page)
11389 -                               set_bit(sync_index(address), insync);
11390 -               }
11391 -               if (address == start && test_bit(sync_index(address), insync))
11392 -                       start = address + (1UL << PMD_SHIFT);
11393 -       }
11394 -}
11395 --- a/arch/x86/mm/fault_64-xen.c
11396 +++ /dev/null
11397 @@ -1,686 +0,0 @@
11398 -/*
11399 - *  linux/arch/x86-64/mm/fault.c
11400 - *
11401 - *  Copyright (C) 1995  Linus Torvalds
11402 - *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
11403 - */
11404 -
11405 -#include <linux/signal.h>
11406 -#include <linux/sched.h>
11407 -#include <linux/kernel.h>
11408 -#include <linux/errno.h>
11409 -#include <linux/string.h>
11410 -#include <linux/types.h>
11411 -#include <linux/ptrace.h>
11412 -#include <linux/mman.h>
11413 -#include <linux/mm.h>
11414 -#include <linux/smp.h>
11415 -#include <linux/interrupt.h>
11416 -#include <linux/init.h>
11417 -#include <linux/tty.h>
11418 -#include <linux/vt_kern.h>             /* For unblank_screen() */
11419 -#include <linux/compiler.h>
11420 -#include <linux/vmalloc.h>
11421 -#include <linux/module.h>
11422 -#include <linux/kprobes.h>
11423 -#include <linux/uaccess.h>
11424 -#include <linux/kdebug.h>
11425 -#include <linux/kprobes.h>
11426 -
11427 -#include <asm/system.h>
11428 -#include <asm/pgalloc.h>
11429 -#include <asm/smp.h>
11430 -#include <asm/tlbflush.h>
11431 -#include <asm/proto.h>
11432 -#include <asm-generic/sections.h>
11433 -
11434 -/* Page fault error code bits */
11435 -#define PF_PROT        (1<<0)          /* or no page found */
11436 -#define PF_WRITE       (1<<1)
11437 -#define PF_USER        (1<<2)
11438 -#define PF_RSVD        (1<<3)
11439 -#define PF_INSTR       (1<<4)
11440 -
11441 -#ifdef CONFIG_KPROBES
11442 -static inline int notify_page_fault(struct pt_regs *regs)
11443 -{
11444 -       int ret = 0;
11445 -
11446 -       /* kprobe_running() needs smp_processor_id() */
11447 -       if (!user_mode(regs)) {
11448 -               preempt_disable();
11449 -               if (kprobe_running() && kprobe_fault_handler(regs, 14))
11450 -                       ret = 1;
11451 -               preempt_enable();
11452 -       }
11453 -
11454 -       return ret;
11455 -}
11456 -#else
11457 -static inline int notify_page_fault(struct pt_regs *regs)
11458 -{
11459 -       return 0;
11460 -}
11461 -#endif
11462 -
11463 -/* Sometimes the CPU reports invalid exceptions on prefetch.
11464 -   Check that here and ignore.
11465 -   Opcode checker based on code by Richard Brunner */
11466 -static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
11467 -                               unsigned long error_code)
11468 -{
11469 -       unsigned char *instr;
11470 -       int scan_more = 1;
11471 -       int prefetch = 0;
11472 -       unsigned char *max_instr;
11473 -
11474 -       /* If it was a exec fault ignore */
11475 -       if (error_code & PF_INSTR)
11476 -               return 0;
11477 -
11478 -       instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
11479 -       max_instr = instr + 15;
11480 -
11481 -       if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
11482 -               return 0;
11483 -
11484 -       while (scan_more && instr < max_instr) {
11485 -               unsigned char opcode;
11486 -               unsigned char instr_hi;
11487 -               unsigned char instr_lo;
11488 -
11489 -               if (probe_kernel_address(instr, opcode))
11490 -                       break;
11491 -
11492 -               instr_hi = opcode & 0xf0;
11493 -               instr_lo = opcode & 0x0f;
11494 -               instr++;
11495 -
11496 -               switch (instr_hi) {
11497 -               case 0x20:
11498 -               case 0x30:
11499 -                       /* Values 0x26,0x2E,0x36,0x3E are valid x86
11500 -                          prefixes.  In long mode, the CPU will signal
11501 -                          invalid opcode if some of these prefixes are
11502 -                          present so we will never get here anyway */
11503 -                       scan_more = ((instr_lo & 7) == 0x6);
11504 -                       break;
11505 -
11506 -               case 0x40:
11507 -                       /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
11508 -                          Need to figure out under what instruction mode the
11509 -                          instruction was issued ... */
11510 -                       /* Could check the LDT for lm, but for now it's good
11511 -                          enough to assume that long mode only uses well known
11512 -                          segments or kernel. */
11513 -                       scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
11514 -                       break;
11515 -
11516 -               case 0x60:
11517 -                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
11518 -                       scan_more = (instr_lo & 0xC) == 0x4;
11519 -                       break;
11520 -               case 0xF0:
11521 -                       /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
11522 -                       scan_more = !instr_lo || (instr_lo>>1) == 1;
11523 -                       break;
11524 -               case 0x00:
11525 -                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
11526 -                       scan_more = 0;
11527 -                       if (probe_kernel_address(instr, opcode))
11528 -                               break;
11529 -                       prefetch = (instr_lo == 0xF) &&
11530 -                               (opcode == 0x0D || opcode == 0x18);
11531 -                       break;
11532 -               default:
11533 -                       scan_more = 0;
11534 -                       break;
11535 -               }
11536 -       }
11537 -       return prefetch;
11538 -}
11539 -
11540 -static int bad_address(void *p)
11541 -{
11542 -       unsigned long dummy;
11543 -       return probe_kernel_address((unsigned long *)p, dummy);
11544 -}
11545 -
11546 -void dump_pagetable(unsigned long address)
11547 -{
11548 -       pgd_t *pgd;
11549 -       pud_t *pud;
11550 -       pmd_t *pmd;
11551 -       pte_t *pte;
11552 -
11553 -       pgd = (pgd_t *)read_cr3();
11554 -
11555 -       pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
11556 -       pgd += pgd_index(address);
11557 -       if (bad_address(pgd)) goto bad;
11558 -       printk("PGD %lx ", pgd_val(*pgd));
11559 -       if (!pgd_present(*pgd)) goto ret;
11560 -
11561 -       pud = pud_offset(pgd, address);
11562 -       if (bad_address(pud)) goto bad;
11563 -       printk("PUD %lx ", pud_val(*pud));
11564 -       if (!pud_present(*pud)) goto ret;
11565 -
11566 -       pmd = pmd_offset(pud, address);
11567 -       if (bad_address(pmd)) goto bad;
11568 -       printk("PMD %lx ", pmd_val(*pmd));
11569 -       if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
11570 -
11571 -       pte = pte_offset_kernel(pmd, address);
11572 -       if (bad_address(pte)) goto bad;
11573 -       printk("PTE %lx", pte_val(*pte));
11574 -ret:
11575 -       printk("\n");
11576 -       return;
11577 -bad:
11578 -       printk("BAD\n");
11579 -}
11580 -
11581 -static const char errata93_warning[] =
11582 -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
11583 -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
11584 -KERN_ERR "******* Please consider a BIOS update.\n"
11585 -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
11586 -
11587 -/* Workaround for K8 erratum #93 & buggy BIOS.
11588 -   BIOS SMM functions are required to use a specific workaround
11589 -   to avoid corruption of the 64bit RIP register on C stepping K8.
11590 -   A lot of BIOS that didn't get tested properly miss this.
11591 -   The OS sees this as a page fault with the upper 32bits of RIP cleared.
11592 -   Try to work around it here.
11593 -   Note we only handle faults in kernel here. */
11594 -
11595 -static int is_errata93(struct pt_regs *regs, unsigned long address)
11596 -{
11597 -       static int warned;
11598 -       if (address != regs->rip)
11599 -               return 0;
11600 -       if ((address >> 32) != 0)
11601 -               return 0;
11602 -       address |= 0xffffffffUL << 32;
11603 -       if ((address >= (u64)_stext && address <= (u64)_etext) ||
11604 -           (address >= MODULES_VADDR && address <= MODULES_END)) {
11605 -               if (!warned) {
11606 -                       printk(errata93_warning);
11607 -                       warned = 1;
11608 -               }
11609 -               regs->rip = address;
11610 -               return 1;
11611 -       }
11612 -       return 0;
11613 -}
11614 -
11615 -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
11616 -                                unsigned long error_code)
11617 -{
11618 -       unsigned long flags = oops_begin();
11619 -       struct task_struct *tsk;
11620 -
11621 -       printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
11622 -              current->comm, address);
11623 -       dump_pagetable(address);
11624 -       tsk = current;
11625 -       tsk->thread.cr2 = address;
11626 -       tsk->thread.trap_no = 14;
11627 -       tsk->thread.error_code = error_code;
11628 -       __die("Bad pagetable", regs, error_code);
11629 -       oops_end(flags);
11630 -       do_exit(SIGKILL);
11631 -}
11632 -
11633 -/*
11634 - * Handle a fault on the vmalloc area
11635 - *
11636 - * This assumes no large pages in there.
11637 - */
11638 -static int vmalloc_fault(unsigned long address)
11639 -{
11640 -       pgd_t *pgd, *pgd_ref;
11641 -       pud_t *pud, *pud_ref;
11642 -       pmd_t *pmd, *pmd_ref;
11643 -       pte_t *pte, *pte_ref;
11644 -
11645 -       /* Copy kernel mappings over when needed. This can also
11646 -          happen within a race in page table update. In the later
11647 -          case just flush. */
11648 -
11649 -       /* On Xen the line below does not always work. Needs investigating! */
11650 -       /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
11651 -       pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
11652 -       pgd += pgd_index(address);
11653 -       pgd_ref = pgd_offset_k(address);
11654 -       if (pgd_none(*pgd_ref))
11655 -               return -1;
11656 -       if (pgd_none(*pgd))
11657 -               set_pgd(pgd, *pgd_ref);
11658 -       else
11659 -               BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
11660 -
11661 -       /* Below here mismatches are bugs because these lower tables
11662 -          are shared */
11663 -
11664 -       pud = pud_offset(pgd, address);
11665 -       pud_ref = pud_offset(pgd_ref, address);
11666 -       if (pud_none(*pud_ref))
11667 -               return -1;
11668 -       if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
11669 -               BUG();
11670 -       pmd = pmd_offset(pud, address);
11671 -       pmd_ref = pmd_offset(pud_ref, address);
11672 -       if (pmd_none(*pmd_ref))
11673 -               return -1;
11674 -       if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
11675 -               BUG();
11676 -       pte_ref = pte_offset_kernel(pmd_ref, address);
11677 -       if (!pte_present(*pte_ref))
11678 -               return -1;
11679 -       pte = pte_offset_kernel(pmd, address);
11680 -       /* Don't use pte_page here, because the mappings can point
11681 -          outside mem_map, and the NUMA hash lookup cannot handle
11682 -          that. */
11683 -       if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
11684 -               BUG();
11685 -       return 0;
11686 -}
11687 -
11688 -int show_unhandled_signals = 1;
11689 -
11690 -
11691 -#define MEM_VERBOSE 1
11692 -
11693 -#ifdef MEM_VERBOSE
11694 -#define MEM_LOG(_f, _a...)                     \
11695 -       printk("fault.c:[%d]-> " _f "\n",       \
11696 -       __LINE__ , ## _a )
11697 -#else
11698 -#define MEM_LOG(_f, _a...) ((void)0)
11699 -#endif
11700 -
11701 -static int spurious_fault(struct pt_regs *regs,
11702 -                         unsigned long address,
11703 -                         unsigned long error_code)
11704 -{
11705 -       pgd_t *pgd;
11706 -       pud_t *pud;
11707 -       pmd_t *pmd;
11708 -       pte_t *pte;
11709 -
11710 -#ifdef CONFIG_XEN
11711 -       /* Faults in hypervisor area are never spurious. */
11712 -       if ((address >= HYPERVISOR_VIRT_START) &&
11713 -           (address < HYPERVISOR_VIRT_END))
11714 -               return 0;
11715 -#endif
11716 -
11717 -       /* Reserved-bit violation or user access to kernel space? */
11718 -       if (error_code & (PF_RSVD|PF_USER))
11719 -               return 0;
11720 -
11721 -       pgd = init_mm.pgd + pgd_index(address);
11722 -       if (!pgd_present(*pgd))
11723 -               return 0;
11724 -
11725 -       pud = pud_offset(pgd, address);
11726 -       if (!pud_present(*pud))
11727 -               return 0;
11728 -
11729 -       pmd = pmd_offset(pud, address);
11730 -       if (!pmd_present(*pmd))
11731 -               return 0;
11732 -
11733 -       pte = pte_offset_kernel(pmd, address);
11734 -       if (!pte_present(*pte))
11735 -               return 0;
11736 -       if ((error_code & PF_WRITE) && !pte_write(*pte))
11737 -               return 0;
11738 -       if ((error_code & PF_INSTR) && (__pte_val(*pte) & _PAGE_NX))
11739 -               return 0;
11740 -
11741 -       return 1;
11742 -}
11743 -
11744 -/*
11745 - * This routine handles page faults.  It determines the address,
11746 - * and the problem, and then passes it off to one of the appropriate
11747 - * routines.
11748 - */
11749 -asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
11750 -                                       unsigned long error_code)
11751 -{
11752 -       struct task_struct *tsk;
11753 -       struct mm_struct *mm;
11754 -       struct vm_area_struct * vma;
11755 -       unsigned long address;
11756 -       const struct exception_table_entry *fixup;
11757 -       int write, fault;
11758 -       unsigned long flags;
11759 -       siginfo_t info;
11760 -
11761 -       if (!user_mode(regs))
11762 -               error_code &= ~PF_USER; /* means kernel */
11763 -
11764 -       /*
11765 -        * We can fault from pretty much anywhere, with unknown IRQ state.
11766 -        */
11767 -       trace_hardirqs_fixup();
11768 -
11769 -       tsk = current;
11770 -       mm = tsk->mm;
11771 -       prefetchw(&mm->mmap_sem);
11772 -
11773 -       /* get the address */
11774 -       address = read_cr2();
11775 -
11776 -       info.si_code = SEGV_MAPERR;
11777 -
11778 -
11779 -       /*
11780 -        * We fault-in kernel-space virtual memory on-demand. The
11781 -        * 'reference' page table is init_mm.pgd.
11782 -        *
11783 -        * NOTE! We MUST NOT take any locks for this case. We may
11784 -        * be in an interrupt or a critical region, and should
11785 -        * only copy the information from the master page table,
11786 -        * nothing more.
11787 -        *
11788 -        * This verifies that the fault happens in kernel space
11789 -        * (error_code & 4) == 0, and that the fault was not a
11790 -        * protection error (error_code & 9) == 0.
11791 -        */
11792 -       if (unlikely(address >= TASK_SIZE64)) {
11793 -               /*
11794 -                * Don't check for the module range here: its PML4
11795 -                * is always initialized because it's shared with the main
11796 -                * kernel text. Only vmalloc may need PML4 syncups.
11797 -                */
11798 -               if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
11799 -                     ((address >= VMALLOC_START && address < VMALLOC_END))) {
11800 -                       if (vmalloc_fault(address) >= 0)
11801 -                               return;
11802 -               }
11803 -               /* Can take a spurious fault if mapping changes R/O -> R/W. */
11804 -               if (spurious_fault(regs, address, error_code))
11805 -                       return;
11806 -               if (notify_page_fault(regs))
11807 -                       return;
11808 -               /*
11809 -                * Don't take the mm semaphore here. If we fixup a prefetch
11810 -                * fault we could otherwise deadlock.
11811 -                */
11812 -               goto bad_area_nosemaphore;
11813 -       }
11814 -
11815 -       if (notify_page_fault(regs))
11816 -               return;
11817 -
11818 -       if (likely(regs->eflags & X86_EFLAGS_IF))
11819 -               local_irq_enable();
11820 -
11821 -       if (unlikely(error_code & PF_RSVD))
11822 -               pgtable_bad(address, regs, error_code);
11823 -
11824 -       /*
11825 -        * If we're in an interrupt or have no user
11826 -        * context, we must not take the fault..
11827 -        */
11828 -       if (unlikely(in_atomic() || !mm))
11829 -               goto bad_area_nosemaphore;
11830 -
11831 -       /*
11832 -        * User-mode registers count as a user access even for any
11833 -        * potential system fault or CPU buglet.
11834 -        */
11835 -       if (user_mode_vm(regs))
11836 -               error_code |= PF_USER;
11837 -
11838 - again:
11839 -       /* When running in the kernel we expect faults to occur only to
11840 -        * addresses in user space.  All other faults represent errors in the
11841 -        * kernel and should generate an OOPS.  Unfortunately, in the case of an
11842 -        * erroneous fault occurring in a code path which already holds mmap_sem
11843 -        * we will deadlock attempting to validate the fault against the
11844 -        * address space.  Luckily the kernel only validly references user
11845 -        * space from well defined areas of code, which are listed in the
11846 -        * exceptions table.
11847 -        *
11848 -        * As the vast majority of faults will be valid we will only perform
11849 -        * the source reference check when there is a possibility of a deadlock.
11850 -        * Attempt to lock the address space, if we cannot we then validate the
11851 -        * source.  If this is invalid we can skip the address space check,
11852 -        * thus avoiding the deadlock.
11853 -        */
11854 -       if (!down_read_trylock(&mm->mmap_sem)) {
11855 -               if ((error_code & PF_USER) == 0 &&
11856 -                   !search_exception_tables(regs->rip))
11857 -                       goto bad_area_nosemaphore;
11858 -               down_read(&mm->mmap_sem);
11859 -       }
11860 -
11861 -       vma = find_vma(mm, address);
11862 -       if (!vma)
11863 -               goto bad_area;
11864 -       if (likely(vma->vm_start <= address))
11865 -               goto good_area;
11866 -       if (!(vma->vm_flags & VM_GROWSDOWN))
11867 -               goto bad_area;
11868 -       if (error_code & 4) {
11869 -               /* Allow userspace just enough access below the stack pointer
11870 -                * to let the 'enter' instruction work.
11871 -                */
11872 -               if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
11873 -                       goto bad_area;
11874 -       }
11875 -       if (expand_stack(vma, address))
11876 -               goto bad_area;
11877 -/*
11878 - * Ok, we have a good vm_area for this memory access, so
11879 - * we can handle it..
11880 - */
11881 -good_area:
11882 -       info.si_code = SEGV_ACCERR;
11883 -       write = 0;
11884 -       switch (error_code & (PF_PROT|PF_WRITE)) {
11885 -               default:        /* 3: write, present */
11886 -                       /* fall through */
11887 -               case PF_WRITE:          /* write, not present */
11888 -                       if (!(vma->vm_flags & VM_WRITE))
11889 -                               goto bad_area;
11890 -                       write++;
11891 -                       break;
11892 -               case PF_PROT:           /* read, present */
11893 -                       goto bad_area;
11894 -               case 0:                 /* read, not present */
11895 -                       if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
11896 -                               goto bad_area;
11897 -       }
11898 -
11899 -       /*
11900 -        * If for any reason at all we couldn't handle the fault,
11901 -        * make sure we exit gracefully rather than endlessly redo
11902 -        * the fault.
11903 -        */
11904 -       fault = handle_mm_fault(mm, vma, address, write);
11905 -       if (unlikely(fault & VM_FAULT_ERROR)) {
11906 -               if (fault & VM_FAULT_OOM)
11907 -                       goto out_of_memory;
11908 -               else if (fault & VM_FAULT_SIGBUS)
11909 -                       goto do_sigbus;
11910 -               BUG();
11911 -       }
11912 -       if (fault & VM_FAULT_MAJOR)
11913 -               tsk->maj_flt++;
11914 -       else
11915 -               tsk->min_flt++;
11916 -       up_read(&mm->mmap_sem);
11917 -       return;
11918 -
11919 -/*
11920 - * Something tried to access memory that isn't in our memory map..
11921 - * Fix it, but check if it's kernel or user first..
11922 - */
11923 -bad_area:
11924 -       up_read(&mm->mmap_sem);
11925 -
11926 -bad_area_nosemaphore:
11927 -       /* User mode accesses just cause a SIGSEGV */
11928 -       if (error_code & PF_USER) {
11929 -
11930 -               /*
11931 -                * It's possible to have interrupts off here.
11932 -                */
11933 -               local_irq_enable();
11934 -
11935 -               if (is_prefetch(regs, address, error_code))
11936 -                       return;
11937 -
11938 -               /* Work around K8 erratum #100 K8 in compat mode
11939 -                  occasionally jumps to illegal addresses >4GB.  We
11940 -                  catch this here in the page fault handler because
11941 -                  these addresses are not reachable. Just detect this
11942 -                  case and return.  Any code segment in LDT is
11943 -                  compatibility mode. */
11944 -               if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
11945 -                   (address >> 32))
11946 -                       return;
11947 -
11948 -               if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
11949 -                   printk_ratelimit()) {
11950 -                       printk(
11951 -                      "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n",
11952 -                                       tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
11953 -                                       tsk->comm, tsk->pid, address, regs->rip,
11954 -                                       regs->rsp, error_code);
11955 -               }
11956 -
11957 -               tsk->thread.cr2 = address;
11958 -               /* Kernel addresses are always protection faults */
11959 -               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
11960 -               tsk->thread.trap_no = 14;
11961 -               info.si_signo = SIGSEGV;
11962 -               info.si_errno = 0;
11963 -               /* info.si_code has been set above */
11964 -               info.si_addr = (void __user *)address;
11965 -               force_sig_info(SIGSEGV, &info, tsk);
11966 -               return;
11967 -       }
11968 -
11969 -no_context:
11970 -
11971 -       /* Are we prepared to handle this kernel fault?  */
11972 -       fixup = search_exception_tables(regs->rip);
11973 -       if (fixup) {
11974 -               regs->rip = fixup->fixup;
11975 -               return;
11976 -       }
11977 -
11978 -       /*
11979 -        * Hall of shame of CPU/BIOS bugs.
11980 -        */
11981 -
11982 -       if (is_prefetch(regs, address, error_code))
11983 -               return;
11984 -
11985 -       if (is_errata93(regs, address))
11986 -               return;
11987 -
11988 -/*
11989 - * Oops. The kernel tried to access some bad page. We'll have to
11990 - * terminate things with extreme prejudice.
11991 - */
11992 -
11993 -       flags = oops_begin();
11994 -
11995 -       if (address < PAGE_SIZE)
11996 -               printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
11997 -       else
11998 -               printk(KERN_ALERT "Unable to handle kernel paging request");
11999 -       printk(" at %016lx RIP: \n" KERN_ALERT,address);
12000 -       printk_address(regs->rip);
12001 -       dump_pagetable(address);
12002 -       tsk->thread.cr2 = address;
12003 -       tsk->thread.trap_no = 14;
12004 -       tsk->thread.error_code = error_code;
12005 -       __die("Oops", regs, error_code);
12006 -       /* Executive summary in case the body of the oops scrolled away */
12007 -       printk(KERN_EMERG "CR2: %016lx\n", address);
12008 -       oops_end(flags);
12009 -       do_exit(SIGKILL);
12010 -
12011 -/*
12012 - * We ran out of memory, or some other thing happened to us that made
12013 - * us unable to handle the page fault gracefully.
12014 - */
12015 -out_of_memory:
12016 -       up_read(&mm->mmap_sem);
12017 -       if (is_global_init(current)) {
12018 -               yield();
12019 -               goto again;
12020 -       }
12021 -       printk("VM: killing process %s\n", tsk->comm);
12022 -       if (error_code & 4)
12023 -               do_group_exit(SIGKILL);
12024 -       goto no_context;
12025 -
12026 -do_sigbus:
12027 -       up_read(&mm->mmap_sem);
12028 -
12029 -       /* Kernel mode? Handle exceptions or die */
12030 -       if (!(error_code & PF_USER))
12031 -               goto no_context;
12032 -
12033 -       tsk->thread.cr2 = address;
12034 -       tsk->thread.error_code = error_code;
12035 -       tsk->thread.trap_no = 14;
12036 -       info.si_signo = SIGBUS;
12037 -       info.si_errno = 0;
12038 -       info.si_code = BUS_ADRERR;
12039 -       info.si_addr = (void __user *)address;
12040 -       force_sig_info(SIGBUS, &info, tsk);
12041 -       return;
12042 -}
12043 -
12044 -DEFINE_SPINLOCK(pgd_lock);
12045 -LIST_HEAD(pgd_list);
12046 -
12047 -void vmalloc_sync_all(void)
12048 -{
12049 -       /* Note that races in the updates of insync and start aren't
12050 -          problematic:
12051 -          insync can only get set bits added, and updates to start are only
12052 -          improving performance (without affecting correctness if undone). */
12053 -       static DECLARE_BITMAP(insync, PTRS_PER_PGD);
12054 -       static unsigned long start = VMALLOC_START & PGDIR_MASK;
12055 -       unsigned long address;
12056 -
12057 -       for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
12058 -               if (!test_bit(pgd_index(address), insync)) {
12059 -                       const pgd_t *pgd_ref = pgd_offset_k(address);
12060 -                       struct page *page;
12061 -
12062 -                       if (pgd_none(*pgd_ref))
12063 -                               continue;
12064 -                       spin_lock(&pgd_lock);
12065 -                       list_for_each_entry(page, &pgd_list, lru) {
12066 -                               pgd_t *pgd;
12067 -                               pgd = (pgd_t *)page_address(page) + pgd_index(address);
12068 -                               if (pgd_none(*pgd))
12069 -                                       set_pgd(pgd, *pgd_ref);
12070 -                               else
12071 -                                       BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
12072 -                       }
12073 -                       spin_unlock(&pgd_lock);
12074 -                       set_bit(pgd_index(address), insync);
12075 -               }
12076 -               if (address == start)
12077 -                       start = address + PGDIR_SIZE;
12078 -       }
12079 -       /* Check that there is no need to do the same for the modules area. */
12080 -       BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
12081 -       BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
12082 -                               (__START_KERNEL & PGDIR_MASK)));
12083 -}
12084 --- /dev/null
12085 +++ b/arch/x86/mm/fault-xen.c
12086 @@ -0,0 +1,1026 @@
12087 +/*
12088 + *  Copyright (C) 1995  Linus Torvalds
12089 + *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
12090 + */
12091 +
12092 +#include <linux/signal.h>
12093 +#include <linux/sched.h>
12094 +#include <linux/kernel.h>
12095 +#include <linux/errno.h>
12096 +#include <linux/string.h>
12097 +#include <linux/types.h>
12098 +#include <linux/ptrace.h>
12099 +#include <linux/mman.h>
12100 +#include <linux/mm.h>
12101 +#include <linux/smp.h>
12102 +#include <linux/interrupt.h>
12103 +#include <linux/init.h>
12104 +#include <linux/tty.h>
12105 +#include <linux/vt_kern.h>             /* For unblank_screen() */
12106 +#include <linux/compiler.h>
12107 +#include <linux/highmem.h>
12108 +#include <linux/bootmem.h>             /* for max_low_pfn */
12109 +#include <linux/vmalloc.h>
12110 +#include <linux/module.h>
12111 +#include <linux/kprobes.h>
12112 +#include <linux/uaccess.h>
12113 +#include <linux/kdebug.h>
12114 +
12115 +#include <asm/system.h>
12116 +#include <asm/desc.h>
12117 +#include <asm/segment.h>
12118 +#include <asm/pgalloc.h>
12119 +#include <asm/smp.h>
12120 +#include <asm/tlbflush.h>
12121 +#include <asm/proto.h>
12122 +#include <asm-generic/sections.h>
12123 +
12124 +/*
12125 + * Page fault error code bits
12126 + *     bit 0 == 0 means no page found, 1 means protection fault
12127 + *     bit 1 == 0 means read, 1 means write
12128 + *     bit 2 == 0 means kernel, 1 means user-mode
12129 + *     bit 3 == 1 means use of reserved bit detected
12130 + *     bit 4 == 1 means fault was an instruction fetch
12131 + */
12132 +#define PF_PROT                (1<<0)
12133 +#define PF_WRITE       (1<<1)
12134 +#define PF_USER                (1<<2)
12135 +#define PF_RSVD                (1<<3)
12136 +#define PF_INSTR       (1<<4)
12137 +/* Pass a non-user-mode fault to kprobes; 1 if the handler returned nonzero. */
12138 +static inline int notify_page_fault(struct pt_regs *regs)
12139 +{
12140 +#ifdef CONFIG_KPROBES
12141 +       int ret = 0;
12142 +
12143 +       /* kprobe_running() needs smp_processor_id(), so keep preemption off */
12144 +#ifdef CONFIG_X86_32
12145 +       if (!user_mode_vm(regs)) {
12146 +#else
12147 +       if (!user_mode(regs)) {
12148 +#endif
12149 +               preempt_disable();
12150 +               if (kprobe_running() && kprobe_fault_handler(regs, 14))
12151 +                       ret = 1;
12152 +               preempt_enable();
12153 +       }
12154 +
12155 +       return ret;
12156 +#else  /* !CONFIG_KPROBES: always report "not handled" */
12157 +       return 0;
12158 +#endif /* CONFIG_KPROBES */
12159 +}
12160 +
12161 +/*
12162 + * X86_32: sometimes AMD Athlon/Opteron CPUs report invalid exceptions on
12163 + * prefetch.  X86_64: sometimes the CPU reports invalid exceptions on
12164 + * prefetch.  Check that here and ignore it.
12165 + *
12166 + * Scans at most 15 bytes from the address given by convert_ip_to_linear(),
12167 + * skipping the prefix bytes accepted in the switch below.  Returns nonzero
12168 + * if the opcode reached is 0x0F 0x0D or 0x0F 0x18, and 0 otherwise (always
12169 + * 0 for instruction-fetch faults).  'addr' is not used by the body.
12170 + * Opcode checker based on code by Richard Brunner
12171 + */
12172 +static int is_prefetch(struct pt_regs *regs, unsigned long addr,
12173 +                      unsigned long error_code)
12174 +{
12175 +       unsigned char *instr;
12176 +       int scan_more = 1;
12177 +       int prefetch = 0;
12178 +       unsigned char *max_instr;
12179 +
12180 +       /*
12181 +        * If it was an exec (instruction fetch) fault on an NX page, then
12182 +        * do not ignore the fault:
12183 +        */
12184 +       if (error_code & PF_INSTR)
12185 +               return 0;
12186 +
12187 +       instr = (unsigned char *)convert_ip_to_linear(current, regs);
12188 +       max_instr = instr + 15; /* scan at most 15 bytes */
12189 +
12190 +       if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
12191 +               return 0;
12192 +
12193 +       while (scan_more && instr < max_instr) {
12194 +               unsigned char opcode;
12195 +               unsigned char instr_hi;
12196 +               unsigned char instr_lo;
12197 +
12198 +               if (probe_kernel_address(instr, opcode))
12199 +                       break;
12200 +
12201 +               instr_hi = opcode & 0xf0;       /* selects the case below */
12202 +               instr_lo = opcode & 0x0f;
12203 +               instr++;                /* now points at the next byte */
12204 +
12205 +               switch (instr_hi) {
12206 +               case 0x20:
12207 +               case 0x30:
12208 +                       /*
12209 +                        * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
12210 +                        * In X86_64 long mode, the CPU will signal invalid
12211 +                        * opcode if some of these prefixes are present so
12212 +                        * X86_64 will never get here anyway.
12213 +                        */
12214 +                       scan_more = ((instr_lo & 7) == 0x6);
12215 +                       break;
12216 +#ifdef CONFIG_X86_64
12217 +               case 0x40:
12218 +                       /*
12219 +                        * In AMD64 long mode 0x40..0x4F are valid REX prefixes.
12220 +                        * Need to figure out under what instruction mode the
12221 +                        * instruction was issued. Could check the LDT for lm,
12222 +                        * but for now it's good enough to assume that long
12223 +                        * mode only uses well known segments or kernel.
12224 +                        */
12225 +                       scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
12226 +                       break;
12227 +#endif /* CONFIG_X86_64 */
12228 +               case 0x60:
12229 +                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
12230 +                       scan_more = (instr_lo & 0xC) == 0x4;
12231 +                       break;
12232 +               case 0xF0:
12233 +                       /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
12234 +                       scan_more = !instr_lo || (instr_lo>>1) == 1;
12235 +                       break;
12236 +               case 0x00:
12237 +                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
12238 +                       scan_more = 0;
12239 +
12240 +                       if (probe_kernel_address(instr, opcode))
12241 +                               break;
12242 +                       prefetch = (instr_lo == 0xF) &&
12243 +                               (opcode == 0x0D || opcode == 0x18);
12244 +                       break;
12245 +               default:
12246 +                       scan_more = 0;
12247 +                       break;
12248 +               }
12249 +       }
12250 +       return prefetch;
12251 +}
12252 +
12253 +static void force_sig_info_fault(int si_signo, int si_code,
12254 +       unsigned long address, struct task_struct *tsk)
12255 +{
12256 +       siginfo_t info;
12257 +
12258 +       info.si_signo = si_signo;
12259 +       info.si_errno = 0;
12260 +       info.si_code = si_code;
12261 +       info.si_addr = (void __user *)address;
12262 +       force_sig_info(si_signo, &info, tsk);
12263 +}
12264 +
12265 +#ifdef CONFIG_X86_64
12266 +static int bad_address(void *p)
12267 +{
12268 +       unsigned long dummy;
12269 +       return probe_kernel_address((unsigned long *)p, dummy);
12270 +}
12271 +#endif
12272 +
12273 +static void dump_pagetable(unsigned long address)
12274 +{
12275 +#ifdef CONFIG_X86_32
12276 +       __typeof__(pte_val(__pte(0))) page;
12277 +
12278 +       page = read_cr3();
12279 +       page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
12280 +#ifdef CONFIG_X86_PAE
12281 +       printk("*pdpt = %016Lx ", page);
12282 +       if ((page & _PAGE_PRESENT)
12283 +           && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn) {
12284 +               page = mfn_to_pfn(page >> PAGE_SHIFT);
12285 +               page <<= PAGE_SHIFT;
12286 +               page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
12287 +                                                        & (PTRS_PER_PMD - 1)];
12288 +               printk(KERN_CONT "*pde = %016Lx ", page);
12289 +               page &= ~_PAGE_NX;
12290 +       }
12291 +#else
12292 +       printk("*pde = %08lx ", page);
12293 +#endif
12294 +
12295 +       /*
12296 +        * We must not directly access the pte in the highpte
12297 +        * case if the page table is located in highmem.
12298 +        * And let's rather not kmap-atomic the pte, just in case
12299 +        * it's allocated already.
12300 +        */
12301 +       if ((page & _PAGE_PRESENT)
12302 +           && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn
12303 +           && !(page & _PAGE_PSE)) {
12304 +               page = mfn_to_pfn(page >> PAGE_SHIFT);
12305 +               page <<= PAGE_SHIFT;
12306 +               page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
12307 +                                                        & (PTRS_PER_PTE - 1)];
12308 +               printk(KERN_CONT "*pte = %0*Lx ", sizeof(page)*2, (u64)page);
12309 +       }
12310 +
12311 +       printk(KERN_CONT "\n");
12312 +#else /* CONFIG_X86_64 */
12313 +       pgd_t *pgd;
12314 +       pud_t *pud;
12315 +       pmd_t *pmd;
12316 +       pte_t *pte;
12317 +
12318 +       pgd = (pgd_t *)read_cr3();
12319 +
12320 +       pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
12321 +       pgd += pgd_index(address);
12322 +       if (bad_address(pgd)) goto bad;
12323 +       printk("PGD %lx ", pgd_val(*pgd));
12324 +       if (!pgd_present(*pgd)) goto ret;
12325 +
12326 +       pud = pud_offset(pgd, address);
12327 +       if (bad_address(pud)) goto bad;
12328 +       printk(KERN_CONT "PUD %lx ", pud_val(*pud));
12329 +       if (!pud_present(*pud) || pud_large(*pud))
12330 +               goto ret;
12331 +
12332 +       pmd = pmd_offset(pud, address);
12333 +       if (bad_address(pmd)) goto bad;
12334 +       printk(KERN_CONT "PMD %lx ", pmd_val(*pmd));
12335 +       if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
12336 +
12337 +       pte = pte_offset_kernel(pmd, address);
12338 +       if (bad_address(pte)) goto bad;
12339 +       printk(KERN_CONT "PTE %lx", pte_val(*pte));
12340 +ret:
12341 +       printk(KERN_CONT "\n");
12342 +       return;
12343 +bad:
12344 +       printk("BAD\n");
12345 +#endif
12346 +}
12347 +
12348 +#ifdef CONFIG_X86_32
12349 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
12350 +{
12351 +       unsigned index = pgd_index(address);
12352 +       pgd_t *pgd_k;
12353 +       pud_t *pud, *pud_k;
12354 +       pmd_t *pmd, *pmd_k;
12355 +
12356 +       pgd += index;
12357 +       pgd_k = init_mm.pgd + index;
12358 +
12359 +       if (!pgd_present(*pgd_k))
12360 +               return NULL;
12361 +
12362 +       /*
12363 +        * set_pgd(pgd, *pgd_k); here would be useless on PAE
12364 +        * and redundant with the set_pmd() on non-PAE. As would
12365 +        * set_pud.
12366 +        */
12367 +
12368 +       pud = pud_offset(pgd, address);
12369 +       pud_k = pud_offset(pgd_k, address);
12370 +       if (!pud_present(*pud_k))
12371 +               return NULL;
12372 +
12373 +       pmd = pmd_offset(pud, address);
12374 +       pmd_k = pmd_offset(pud_k, address);
12375 +       if (!pmd_present(*pmd_k))
12376 +               return NULL;
12377 +       if (!pmd_present(*pmd)) {
12378 +               bool lazy = x86_read_percpu(xen_lazy_mmu);
12379 +
12380 +               x86_write_percpu(xen_lazy_mmu, false);
12381 +#if CONFIG_XEN_COMPAT > 0x030002
12382 +               set_pmd(pmd, *pmd_k);
12383 +#else
12384 +               /*
12385 +                * When running on older Xen we must launder *pmd_k through
12386 +                * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
12387 +                */
12388 +               set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
12389 +#endif
12390 +               x86_write_percpu(xen_lazy_mmu, lazy);
12391 +       } else
12392 +               BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
12393 +       return pmd_k;
12394 +}
12395 +#endif
12396 +
12397 +#ifdef CONFIG_X86_64
12398 +static const char errata93_warning[] =
12399 +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
12400 +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
12401 +KERN_ERR "******* Please consider a BIOS update.\n"
12402 +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
12403 +#endif
12404 +
12405 +/* Workaround for K8 erratum #93 & buggy BIOS.
12406 +   BIOS SMM functions are required to use a specific workaround
12407 +   to avoid corruption of the 64bit RIP register on C stepping K8.
12408 +   Many BIOSes that were not tested properly miss this.
12409 +   The OS sees this as a page fault with the upper 32bits of RIP cleared.
12410 +   Try to work around it here.
12411 +   Note we only handle faults in kernel here.
12412 +   Does nothing for X86_32
12413 + */
12414 +static int is_errata93(struct pt_regs *regs, unsigned long address)
12415 +{
12416 +#ifdef CONFIG_X86_64
12417 +       static int warned;
12418 +       if (address != regs->ip)
12419 +               return 0;
12420 +       if ((address >> 32) != 0)
12421 +               return 0;
12422 +       address |= 0xffffffffUL << 32;
12423 +       if ((address >= (u64)_stext && address <= (u64)_etext) ||
12424 +           (address >= MODULES_VADDR && address <= MODULES_END)) {
12425 +               if (!warned) {
12426 +                       printk(errata93_warning);
12427 +                       warned = 1;
12428 +               }
12429 +               regs->ip = address;
12430 +               return 1;
12431 +       }
12432 +#endif
12433 +       return 0;
12434 +}
12435 +
12436 +/*
12437 + * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal
12438 + * addresses >4GB.  We catch this in the page fault handler because these
12439 + * addresses are not reachable. Just detect this case and return.  Any code
12440 + * segment in LDT is compatibility mode.
12441 + */
12442 +static int is_errata100(struct pt_regs *regs, unsigned long address)
12443 +{
12444 +#ifdef CONFIG_X86_64
12445 +       if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
12446 +           (address >> 32))
12447 +               return 1;
12448 +#endif
12449 +       return 0;
12450 +}
12451 +
12452 +void do_invalid_op(struct pt_regs *, unsigned long);
12453 +
12454 +static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
12455 +{
12456 +#ifdef CONFIG_X86_F00F_BUG
12457 +       unsigned long nr;
12458 +       /*
12459 +        * Pentium F0 0F C7 C8 bug workaround.
12460 +        */
12461 +       if (boot_cpu_data.f00f_bug) {
12462 +               nr = (address - idt_descr.address) >> 3;
12463 +
12464 +               if (nr == 6) {
12465 +                       do_invalid_op(regs, 0);
12466 +                       return 1;
12467 +               }
12468 +       }
12469 +#endif
12470 +       return 0;
12471 +}
12472 +
12473 +static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
12474 +                           unsigned long address)
12475 +{
12476 +#ifdef CONFIG_X86_32
12477 +       if (!oops_may_print())
12478 +               return;
12479 +#endif
12480 +
12481 +#ifdef CONFIG_X86_PAE
12482 +       if (error_code & PF_INSTR) {
12483 +               unsigned int level;
12484 +               pte_t *pte = lookup_address(address, &level);
12485 +
12486 +               if (pte && pte_present(*pte) && !pte_exec(*pte))
12487 +                       printk(KERN_CRIT "kernel tried to execute "
12488 +                               "NX-protected page - exploit attempt? "
12489 +                               "(uid: %d)\n", current->uid);
12490 +       }
12491 +#endif
12492 +
12493 +       printk(KERN_ALERT "BUG: unable to handle kernel ");
12494 +       if (address < PAGE_SIZE)
12495 +               printk(KERN_CONT "NULL pointer dereference");
12496 +       else
12497 +               printk(KERN_CONT "paging request");
12498 +#ifdef CONFIG_X86_32
12499 +       printk(KERN_CONT " at %08lx\n", address);
12500 +#else
12501 +       printk(KERN_CONT " at %016lx\n", address);
12502 +#endif
12503 +       printk(KERN_ALERT "IP:");
12504 +       printk_address(regs->ip, 1);
12505 +       dump_pagetable(address);
12506 +}
12507 +
12508 +#ifdef CONFIG_X86_64
12509 +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
12510 +                                unsigned long error_code)
12511 +{
12512 +       unsigned long flags = oops_begin();
12513 +       struct task_struct *tsk;
12514 +
12515 +       printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
12516 +              current->comm, address);
12517 +       dump_pagetable(address);
12518 +       tsk = current;
12519 +       tsk->thread.cr2 = address;
12520 +       tsk->thread.trap_no = 14;
12521 +       tsk->thread.error_code = error_code;
12522 +       if (__die("Bad pagetable", regs, error_code))
12523 +               regs = NULL;
12524 +       oops_end(flags, regs, SIGKILL);
12525 +}
12526 +#endif
12527 +
12528 +static int spurious_fault_check(unsigned long error_code, pte_t *pte)
12529 +{
12530 +       if ((error_code & PF_WRITE) && !pte_write(*pte))
12531 +               return 0;
12532 +       if ((error_code & PF_INSTR) && !pte_exec(*pte))
12533 +               return 0;
12534 +
12535 +       return 1;
12536 +}
12537 +
12538 +/*
12539 + * Handle a spurious fault caused by a stale TLB entry.  This allows
12540 + * us to lazily refresh the TLB when increasing the permissions of a
12541 + * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
12542 + * expensive since that implies doing a full cross-processor TLB
12543 + * flush, even if no stale TLB entries exist on other processors.
12544 + * There are no security implications to leaving a stale TLB when
12545 + * increasing the permissions on a page.
12546 + */
12547 +static int spurious_fault(unsigned long address,
12548 +                         unsigned long error_code)
12549 +{
12550 +       pgd_t *pgd;
12551 +       pud_t *pud;
12552 +       pmd_t *pmd;
12553 +       pte_t *pte;
12554 +
12555 +       /* Reserved-bit violation or user access to kernel space? */
12556 +       if (error_code & (PF_USER | PF_RSVD))
12557 +               return 0;
12558 +
12559 +       pgd = init_mm.pgd + pgd_index(address);
12560 +       if (!pgd_present(*pgd))
12561 +               return 0;
12562 +
12563 +       pud = pud_offset(pgd, address);
12564 +       if (!pud_present(*pud))
12565 +               return 0;
12566 +
12567 +       if (pud_large(*pud))
12568 +               return spurious_fault_check(error_code, (pte_t *) pud);
12569 +
12570 +       pmd = pmd_offset(pud, address);
12571 +       if (!pmd_present(*pmd))
12572 +               return 0;
12573 +
12574 +       if (pmd_large(*pmd))
12575 +               return spurious_fault_check(error_code, (pte_t *) pmd);
12576 +
12577 +       pte = pte_offset_kernel(pmd, address);
12578 +       if (!pte_present(*pte))
12579 +               return 0;
12580 +
12581 +       return spurious_fault_check(error_code, pte);
12582 +}
12583 +
12584 +/*
12585 + * X86_32
12586 + * Handle a fault on the vmalloc or module mapping area
12587 + *
12588 + * X86_64
12589 + * Handle a fault on the vmalloc area
12590 + *
12591 + * This assumes no large pages in there.
12592 + */
12593 +static int vmalloc_fault(unsigned long address)
12594 +{
12595 +#ifdef CONFIG_X86_32
12596 +       unsigned long pgd_paddr;
12597 +       pmd_t *pmd_k;
12598 +       pte_t *pte_k;
12599 +       /*
12600 +        * Synchronize this task's top level page-table
12601 +        * with the 'reference' page table.
12602 +        *
12603 +        * Do _not_ use "current" here. We might be inside
12604 +        * an interrupt in the middle of a task switch..
12605 +        */
12606 +       pgd_paddr = read_cr3();
12607 +       pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
12608 +       if (!pmd_k)
12609 +               return -1;
12610 +       pte_k = pte_offset_kernel(pmd_k, address);
12611 +       if (!pte_present(*pte_k))
12612 +               return -1;
12613 +       return 0;
12614 +#else
12615 +       pgd_t *pgd, *pgd_ref;
12616 +       pud_t *pud, *pud_ref;
12617 +       pmd_t *pmd, *pmd_ref;
12618 +       pte_t *pte, *pte_ref;
12619 +
12620 +       /* Make sure we are in vmalloc area */
12621 +       if (!(address >= VMALLOC_START && address < VMALLOC_END))
12622 +               return -1;
12623 +
12624 +       /* Copy kernel mappings over when needed. This can also
12625 +          happen within a race in page table update. In the latter
12626 +          case just flush. */
12627 +
12628 +       /* On Xen the line below does not always work. Needs investigating! */
12629 +       /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
12630 +       pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
12631 +       pgd += pgd_index(address);
12632 +       pgd_ref = pgd_offset_k(address);
12633 +       if (pgd_none(*pgd_ref))
12634 +               return -1;
12635 +       if (pgd_none(*pgd))
12636 +               set_pgd(pgd, *pgd_ref);
12637 +       else
12638 +               BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
12639 +
12640 +       /* Below here mismatches are bugs because these lower tables
12641 +          are shared */
12642 +
12643 +       pud = pud_offset(pgd, address);
12644 +       pud_ref = pud_offset(pgd_ref, address);
12645 +       if (pud_none(*pud_ref))
12646 +               return -1;
12647 +       if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
12648 +               BUG();
12649 +       pmd = pmd_offset(pud, address);
12650 +       pmd_ref = pmd_offset(pud_ref, address);
12651 +       if (pmd_none(*pmd_ref))
12652 +               return -1;
12653 +       if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
12654 +               BUG();
12655 +       pte_ref = pte_offset_kernel(pmd_ref, address);
12656 +       if (!pte_present(*pte_ref))
12657 +               return -1;
12658 +       pte = pte_offset_kernel(pmd, address);
12659 +       /* Don't use pte_page here, because the mappings can point
12660 +          outside mem_map, and the NUMA hash lookup cannot handle
12661 +          that. */
12662 +       if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
12663 +               BUG();
12664 +       return 0;
12665 +#endif
12666 +}
12667 +
12668 +int show_unhandled_signals = 1;
12669 +
12670 +/*
12671 + * This routine handles page faults.  It determines the address,
12672 + * and the problem, and then passes it off to one of the appropriate
12673 + * routines.
12674 + */
12675 +#ifdef CONFIG_X86_64
12676 +asmlinkage
12677 +#endif
12678 +void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
12679 +{
12680 +       struct task_struct *tsk;
12681 +       struct mm_struct *mm;
12682 +       struct vm_area_struct *vma;
12683 +       unsigned long address;
12684 +       int write, si_code;
12685 +       int fault;
12686 +#ifdef CONFIG_X86_64
12687 +       unsigned long flags;
12688 +#endif
12689 +
12690 +       /*
12691 +        * We can fault from pretty much anywhere, with unknown IRQ state.
12692 +        */
12693 +       trace_hardirqs_fixup();
12694 +
12695 +       /* Set the "privileged fault" bit to something sane. */
12696 +       if (user_mode_vm(regs))
12697 +               error_code |= PF_USER;
12698 +       else
12699 +               error_code &= ~PF_USER;
12700 +
12701 +       tsk = current;
12702 +       mm = tsk->mm;
12703 +       prefetchw(&mm->mmap_sem);
12704 +
12705 +       /* get the address */
12706 +       address = read_cr2();
12707 +
12708 +       si_code = SEGV_MAPERR;
12709 +
12710 +       if (notify_page_fault(regs))
12711 +               return;
12712 +
12713 +       /*
12714 +        * We fault-in kernel-space virtual memory on-demand. The
12715 +        * 'reference' page table is init_mm.pgd.
12716 +        *
12717 +        * NOTE! We MUST NOT take any locks for this case. We may
12718 +        * be in an interrupt or a critical region, and should
12719 +        * only copy the information from the master page table,
12720 +        * nothing more.
12721 +        *
12722 +        * This verifies that the fault happens in kernel space
12723 +        * (error_code & 4) == 0, and that the fault was not a
12724 +        * protection error (error_code & 9) == 0.
12725 +        */
12726 +#ifdef CONFIG_X86_32
12727 +       if (unlikely(address >= TASK_SIZE)) {
12728 +#else
12729 +       if (unlikely(address >= TASK_SIZE64)) {
12730 +#endif
12731 +               /* Faults in hypervisor area can never be patched up. */
12732 +#if defined(CONFIG_X86_XEN)
12733 +               if (address >= hypervisor_virt_start)
12734 +                       goto bad_area_nosemaphore;
12735 +#elif defined(CONFIG_X86_64_XEN)
12736 +               /* Faults in hypervisor area are never spurious. */
12737 +               if (address >= HYPERVISOR_VIRT_START
12738 +                   && address < HYPERVISOR_VIRT_END)
12739 +                       goto bad_area_nosemaphore;
12740 +#endif
12741 +               if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
12742 +                   vmalloc_fault(address) >= 0)
12743 +                       return;
12744 +
12745 +               /* Can handle a stale RO->RW TLB */
12746 +               if (spurious_fault(address, error_code))
12747 +                       return;
12748 +
12749 +               /*
12750 +                * Don't take the mm semaphore here. If we fixup a prefetch
12751 +                * fault we could otherwise deadlock.
12752 +                */
12753 +               goto bad_area_nosemaphore;
12754 +       }
12755 +
12756 +
12757 +#ifdef CONFIG_X86_32
12758 +       /* It's safe to allow irqs after cr2 has been saved and the vmalloc
12759 +          fault has been handled. */
12760 +       if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
12761 +               local_irq_enable();
12762 +
12763 +       /*
12764 +        * If we're in an interrupt, have no user context or are running in an
12765 +        * atomic region then we must not take the fault.
12766 +        */
12767 +       if (in_atomic() || !mm)
12768 +               goto bad_area_nosemaphore;
12769 +#else /* CONFIG_X86_64 */
12770 +       if (likely(regs->flags & X86_EFLAGS_IF))
12771 +               local_irq_enable();
12772 +
12773 +       if (unlikely(error_code & PF_RSVD))
12774 +               pgtable_bad(address, regs, error_code);
12775 +
12776 +       /*
12777 +        * If we're in an interrupt, have no user context or are running in an
12778 +        * atomic region then we must not take the fault.
12779 +        */
12780 +       if (unlikely(in_atomic() || !mm))
12781 +               goto bad_area_nosemaphore;
12782 +
12783 +       /*
12784 +        * User-mode registers count as a user access even for any
12785 +        * potential system fault or CPU buglet.
12786 +        */
12787 +       if (user_mode_vm(regs))
12788 +               error_code |= PF_USER;
12789 +again:
12790 +#endif
12791 +       /* When running in the kernel we expect faults to occur only to
12792 +        * addresses in user space.  All other faults represent errors in the
12793 +        * kernel and should generate an OOPS.  Unfortunately, in the case of an
12794 +        * erroneous fault occurring in a code path which already holds mmap_sem
12795 +        * we will deadlock attempting to validate the fault against the
12796 +        * address space.  Luckily the kernel only validly references user
12797 +        * space from well defined areas of code, which are listed in the
12798 +        * exceptions table.
12799 +        *
12800 +        * As the vast majority of faults will be valid we will only perform
12801 +        * the source reference check when there is a possibility of a deadlock.
12802 +        * Attempt to lock the address space, if we cannot we then validate the
12803 +        * source.  If this is invalid we can skip the address space check,
12804 +        * thus avoiding the deadlock.
12805 +        */
12806 +       if (!down_read_trylock(&mm->mmap_sem)) {
12807 +               if ((error_code & PF_USER) == 0 &&
12808 +                   !search_exception_tables(regs->ip))
12809 +                       goto bad_area_nosemaphore;
12810 +               down_read(&mm->mmap_sem);
12811 +       }
12812 +
12813 +       vma = find_vma(mm, address);
12814 +       if (!vma)
12815 +               goto bad_area;
12816 +       if (vma->vm_start <= address)
12817 +               goto good_area;
12818 +       if (!(vma->vm_flags & VM_GROWSDOWN))
12819 +               goto bad_area;
12820 +       if (error_code & PF_USER) {
12821 +               /*
12822 +                * Accessing the stack below %sp is always a bug.
12823 +                * The large cushion allows instructions like enter
12824 +                * and pusha to work.  ("enter $65535,$31" pushes
12825 +                * 32 pointers and then decrements %sp by 65535.)
12826 +                */
12827 +               if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
12828 +                       goto bad_area;
12829 +       }
12830 +       if (expand_stack(vma, address))
12831 +               goto bad_area;
12832 +/*
12833 + * Ok, we have a good vm_area for this memory access, so
12834 + * we can handle it..
12835 + */
12836 +good_area:
12837 +       si_code = SEGV_ACCERR;
12838 +       write = 0;
12839 +       switch (error_code & (PF_PROT|PF_WRITE)) {
12840 +       default:        /* 3: write, present */
12841 +               /* fall through */
12842 +       case PF_WRITE:          /* write, not present */
12843 +               if (!(vma->vm_flags & VM_WRITE))
12844 +                       goto bad_area;
12845 +               write++;
12846 +               break;
12847 +       case PF_PROT:           /* read, present */
12848 +               goto bad_area;
12849 +       case 0:                 /* read, not present */
12850 +               if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
12851 +                       goto bad_area;
12852 +       }
12853 +
12854 +#ifdef CONFIG_X86_32
12855 +survive:
12856 +#endif
12857 +       /*
12858 +        * If for any reason at all we couldn't handle the fault,
12859 +        * make sure we exit gracefully rather than endlessly redo
12860 +        * the fault.
12861 +        */
12862 +       fault = handle_mm_fault(mm, vma, address, write);
12863 +       if (unlikely(fault & VM_FAULT_ERROR)) {
12864 +               if (fault & VM_FAULT_OOM)
12865 +                       goto out_of_memory;
12866 +               else if (fault & VM_FAULT_SIGBUS)
12867 +                       goto do_sigbus;
12868 +               BUG();
12869 +       }
12870 +       if (fault & VM_FAULT_MAJOR)
12871 +               tsk->maj_flt++;
12872 +       else
12873 +               tsk->min_flt++;
12874 +
12875 +#ifdef CONFIG_X86_32
12876 +       /*
12877 +        * Did it hit the DOS screen memory VA from vm86 mode?
12878 +        */
12879 +       if (v8086_mode(regs)) {
12880 +               unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
12881 +               if (bit < 32)
12882 +                       tsk->thread.screen_bitmap |= 1 << bit;
12883 +       }
12884 +#endif
12885 +       up_read(&mm->mmap_sem);
12886 +       return;
12887 +
12888 +/*
12889 + * Something tried to access memory that isn't in our memory map..
12890 + * Fix it, but check if it's kernel or user first..
12891 + */
12892 +bad_area:
12893 +       up_read(&mm->mmap_sem);
12894 +
12895 +bad_area_nosemaphore:
12896 +       /* User mode accesses just cause a SIGSEGV */
12897 +       if (error_code & PF_USER) {
12898 +               /*
12899 +                * It's possible to have interrupts off here.
12900 +                */
12901 +               local_irq_enable();
12902 +
12903 +               /*
12904 +                * Valid to do another page fault here because this one came
12905 +                * from user space.
12906 +                */
12907 +               if (is_prefetch(regs, address, error_code))
12908 +                       return;
12909 +
12910 +               if (is_errata100(regs, address))
12911 +                       return;
12912 +
12913 +               if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
12914 +                   printk_ratelimit()) {
12915 +                       printk(
12916 +#ifdef CONFIG_X86_32
12917 +                       "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
12918 +#else
12919 +                       "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
12920 +#endif
12921 +                       task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
12922 +                       tsk->comm, task_pid_nr(tsk), address, regs->ip,
12923 +                       regs->sp, error_code);
12924 +                       print_vma_addr(" in ", regs->ip);
12925 +                       printk("\n");
12926 +               }
12927 +
12928 +               tsk->thread.cr2 = address;
12929 +               /* Kernel addresses are always protection faults */
12930 +               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
12931 +               tsk->thread.trap_no = 14;
12932 +               force_sig_info_fault(SIGSEGV, si_code, address, tsk);
12933 +               return;
12934 +       }
12935 +
12936 +       if (is_f00f_bug(regs, address))
12937 +               return;
12938 +
12939 +no_context:
12940 +       /* Are we prepared to handle this kernel fault?  */
12941 +       if (fixup_exception(regs))
12942 +               return;
12943 +
12944 +       /*
12945 +        * X86_32
12946 +        * Valid to do another page fault here, because if this fault
12947 +        * had been triggered by is_prefetch fixup_exception would have
12948 +        * handled it.
12949 +        *
12950 +        * X86_64
12951 +        * Hall of shame of CPU/BIOS bugs.
12952 +        */
12953 +       if (is_prefetch(regs, address, error_code))
12954 +               return;
12955 +
12956 +       if (is_errata93(regs, address))
12957 +               return;
12958 +
12959 +/*
12960 + * Oops. The kernel tried to access some bad page. We'll have to
12961 + * terminate things with extreme prejudice.
12962 + */
12963 +#ifdef CONFIG_X86_32
12964 +       bust_spinlocks(1);
12965 +#else
12966 +       flags = oops_begin();
12967 +#endif
12968 +
12969 +       show_fault_oops(regs, error_code, address);
12970 +
12971 +       tsk->thread.cr2 = address;
12972 +       tsk->thread.trap_no = 14;
12973 +       tsk->thread.error_code = error_code;
12974 +
12975 +#ifdef CONFIG_X86_32
12976 +       die("Oops", regs, error_code);
12977 +       bust_spinlocks(0);
12978 +       do_exit(SIGKILL);
12979 +#else
12980 +       if (__die("Oops", regs, error_code))
12981 +               regs = NULL;
12982 +       /* Executive summary in case the body of the oops scrolled away */
12983 +       printk(KERN_EMERG "CR2: %016lx\n", address);
12984 +       oops_end(flags, regs, SIGKILL);
12985 +#endif
12986 +
12987 +/*
12988 + * We ran out of memory, or some other thing happened to us that made
12989 + * us unable to handle the page fault gracefully.
12990 + */
12991 +out_of_memory:
12992 +       up_read(&mm->mmap_sem);
12993 +       if (is_global_init(tsk)) {
12994 +               yield();
12995 +#ifdef CONFIG_X86_32
12996 +               down_read(&mm->mmap_sem);
12997 +               goto survive;
12998 +#else
12999 +               goto again;
13000 +#endif
13001 +       }
13002 +
13003 +       printk("VM: killing process %s\n", tsk->comm);
13004 +       if (error_code & PF_USER)
13005 +               do_group_exit(SIGKILL);
13006 +       goto no_context;
13007 +
13008 +do_sigbus:
13009 +       up_read(&mm->mmap_sem);
13010 +
13011 +       /* Kernel mode? Handle exceptions or die */
13012 +       if (!(error_code & PF_USER))
13013 +               goto no_context;
13014 +#ifdef CONFIG_X86_32
13015 +       /* User space => ok to do another page fault */
13016 +       if (is_prefetch(regs, address, error_code))
13017 +               return;
13018 +#endif
13019 +       tsk->thread.cr2 = address;
13020 +       tsk->thread.error_code = error_code;
13021 +       tsk->thread.trap_no = 14;
13022 +       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
13023 +}
13024 +
13025 +DEFINE_SPINLOCK(pgd_lock);
13026 +LIST_HEAD(pgd_list);
13027 +
13028 +/* Sync the kernel part of every page directory on pgd_list. */
13029 +void vmalloc_sync_all(void)
13030 +{
13031 +#ifdef CONFIG_X86_32
13032 +       /*
13033 +        * Races in the updates of insync and start are harmless: insync only
13034 +        * ever gains bits, and start is merely a performance hint (losing an
13035 +        * update does not affect correctness).
13036 +        * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
13037 +        *      This change works just fine with 2-level paging too.
13038 +        */
13039 +#define sync_index(a) ((a) >> PMD_SHIFT)
13040 +       static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
13041 +       static unsigned long start = TASK_SIZE;
13042 +       unsigned long address;
13043 +
13044 +       if (SHARED_KERNEL_PMD)
13045 +               return;
13046 +
13047 +       BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
13048 +       for (address = start;
13049 +            address < hypervisor_virt_start;
13050 +            address += PMD_SIZE) {
13051 +               if (!test_bit(sync_index(address), insync)) {
13052 +                       unsigned long flags;
13053 +                       struct page *page;
13054 +
13055 +                       spin_lock_irqsave(&pgd_lock, flags);
13056 +                       /* XEN: failure path assumes non-empty pgd_list. */
13057 +                       if (unlikely(list_empty(&pgd_list))) {
13058 +                               spin_unlock_irqrestore(&pgd_lock, flags);
13059 +                               return;
13060 +                       }
13061 +                       list_for_each_entry(page, &pgd_list, lru) {
13062 +                               if (!vmalloc_sync_one(page_address(page),
13063 +                                                     address))
13064 +                                       break;
13065 +                       }
13066 +                       spin_unlock_irqrestore(&pgd_lock, flags);
13067 +                       /* A full walk ends at the list head, not at NULL. */
13068 +                       if (&page->lru == &pgd_list)
13069 +                               set_bit(sync_index(address), insync);
13070 +               }
13071 +               if (address == start && test_bit(sync_index(address), insync))
13072 +                       start = address + PMD_SIZE;
13073 +       }
13074 +#else /* CONFIG_X86_64 */
13075 +       /*
13076 +        * Races in the updates of insync and start are harmless: insync only
13077 +        * ever gains bits, and start is merely a performance hint (losing an
13078 +        * update does not affect correctness).
13079 +        */
13080 +       static DECLARE_BITMAP(insync, PTRS_PER_PGD);
13081 +       static unsigned long start = VMALLOC_START & PGDIR_MASK;
13082 +       unsigned long address;
13083 +
13084 +       for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
13085 +               if (!test_bit(pgd_index(address), insync)) {
13086 +                       const pgd_t *pgd_ref = pgd_offset_k(address);
13087 +                       unsigned long flags;
13088 +                       struct page *page;
13089 +
13090 +                       if (pgd_none(*pgd_ref))
13091 +                               continue;
13092 +                       spin_lock_irqsave(&pgd_lock, flags);
13093 +                       list_for_each_entry(page, &pgd_list, lru) {
13094 +                               pgd_t *pgd;
13095 +                               pgd = (pgd_t *)page_address(page) + pgd_index(address);
13096 +                               if (pgd_none(*pgd))
13097 +                                       set_pgd(pgd, *pgd_ref);
13098 +                               else
13099 +                                       BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
13100 +                       }
13101 +                       spin_unlock_irqrestore(&pgd_lock, flags);
13102 +                       set_bit(pgd_index(address), insync);
13103 +               }
13104 +               if (address == start)
13105 +                       start = address + PGDIR_SIZE;
13106 +       }
13107 +       /* Check that there is no need to do the same for the modules area. */
13108 +       BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
13109 +       BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
13110 +                               (__START_KERNEL & PGDIR_MASK)));
13111 +#endif
13112 +}
13113 --- a/arch/x86/mm/highmem_32-xen.c
13114 +++ b/arch/x86/mm/highmem_32-xen.c
13115 @@ -18,6 +18,49 @@ void kunmap(struct page *page)
13116         kunmap_high(page);
13117  }
13118
13119 +/*
13120 + * Debug aid: WARN (10 times at most) on kmap slot/context mismatches.
13121 + */
13122 +static void debug_kmap_atomic_prot(enum km_type type)
13123 +{
13124 +#ifdef CONFIG_DEBUG_HIGHMEM
13125 +       static unsigned warn_count = 10;
13126 +
13127 +       if (unlikely(warn_count == 0))
13128 +               return;
13129 +
13130 +       if (in_irq()) {
13131 +               if (type != KM_IRQ0 && type != KM_IRQ1 &&
13132 +                   type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
13133 +                   type != KM_BOUNCE_READ) {
13134 +                       WARN_ON(1);
13135 +                       warn_count--;
13136 +               }
13137 +       } else if (in_interrupt() && !irqs_disabled()) {  /* softirq */
13138 +               if (type != KM_IRQ0 && type != KM_IRQ1 &&
13139 +                   type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
13140 +                   type != KM_SKB_SUNRPC_DATA && type != KM_SKB_DATA_SOFTIRQ &&
13141 +                   type != KM_BOUNCE_READ) {
13142 +                       WARN_ON(1);
13143 +                       warn_count--;
13144 +               }
13145 +       }
13146 +
13147 +       if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
13148 +                       type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
13149 +               if (warn_count && !irqs_disabled()) {  /* may be 0 by now */
13150 +                       WARN_ON(1);
13151 +                       warn_count--;
13152 +               }
13153 +       } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
13154 +               if (warn_count && irq_count() == 0 && !irqs_disabled()) {
13155 +                       WARN_ON(1);
13156 +                       warn_count--;
13157 +               }
13158 +       }
13159 +#endif
13160 +}
13161 +
13162  /*
13163   * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
13164   * no global lock is needed and because the kmap code must perform a global TLB
13165 @@ -37,6 +80,8 @@ void *kmap_atomic_prot(struct page *page
13166         if (!PageHighMem(page))
13167                 return page_address(page);
13168
13169 +       debug_kmap_atomic_prot(type);
13170 +
13171         idx = type + KM_TYPE_NR*smp_processor_id();
13172         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
13173         BUG_ON(!pte_none(*(kmap_pte-idx)));
13174 --- a/arch/x86/mm/hypervisor.c
13175 +++ b/arch/x86/mm/hypervisor.c
13176 @@ -831,15 +831,11 @@ int xen_limit_pages_to_max_mfn(
13177  }
13178  EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn);
13179
13180 -#ifdef __i386__
13181 -int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
13182 +int write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
13183  {
13184 -       __u32 *lp = (__u32 *)((char *)ldt + entry * 8);
13185 -       maddr_t mach_lp = arbitrary_virt_to_machine(lp);
13186 -       return HYPERVISOR_update_descriptor(
13187 -               mach_lp, (u64)entry_a | ((u64)entry_b<<32));
13188 +       maddr_t mach_lp = arbitrary_virt_to_machine(ldt + entry);
13189 +       return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
13190  }
13191 -#endif
13192
13193  #define MAX_BATCHED_FULL_PTES 32
13194
13195 --- a/arch/x86/mm/init_32-xen.c
13196 +++ b/arch/x86/mm/init_32-xen.c
13197 @@ -27,13 +27,13 @@
13198  #include <linux/bootmem.h>
13199  #include <linux/slab.h>
13200  #include <linux/proc_fs.h>
13201 -#include <linux/efi.h>
13202  #include <linux/memory_hotplug.h>
13203  #include <linux/initrd.h>
13204  #include <linux/cpumask.h>
13205  #include <linux/dma-mapping.h>
13206  #include <linux/scatterlist.h>
13207
13208 +#include <asm/asm.h>
13209  #include <asm/processor.h>
13210  #include <asm/system.h>
13211  #include <asm/uaccess.h>
13212 @@ -42,18 +42,22 @@
13213  #include <asm/fixmap.h>
13214  #include <asm/e820.h>
13215  #include <asm/apic.h>
13216 +#include <asm/bugs.h>
13217  #include <asm/tlb.h>
13218  #include <asm/tlbflush.h>
13219 +#include <asm/pgalloc.h>
13220  #include <asm/sections.h>
13221  #include <asm/hypervisor.h>
13222  #include <asm/swiotlb.h>
13223 +#include <asm/setup.h>
13224 +#include <asm/cacheflush.h>
13225
13226  unsigned int __VMALLOC_RESERVE = 128 << 20;
13227
13228  DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
13229  unsigned long highstart_pfn, highend_pfn;
13230
13231 -static int noinline do_test_wp_bit(void);
13232 +static noinline int do_test_wp_bit(void);
13233
13234  /*
13235   * Creates a middle page table and puts a pointer to it in the
13236 @@ -64,17 +68,16 @@ static pmd_t * __init one_md_table_init(
13237  {
13238         pud_t *pud;
13239         pmd_t *pmd_table;
13240 -
13241 +
13242  #ifdef CONFIG_X86_PAE
13243         if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
13244                 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
13245
13246 -               paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
13247 +               paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
13248                 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
13249                 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
13250                 pud = pud_offset(pgd, 0);
13251 -               if (pmd_table != pmd_offset(pud, 0))
13252 -                       BUG();
13253 +               BUG_ON(pmd_table != pmd_offset(pud, 0));
13254         }
13255  #endif
13256         pud = pud_offset(pgd, 0);
13257 @@ -85,7 +88,7 @@ static pmd_t * __init one_md_table_init(
13258
13259  /*
13260   * Create a page table and place a pointer to it in a middle page
13261 - * directory entry.
13262 + * directory entry:
13263   */
13264  static pte_t * __init one_page_table_init(pmd_t *pmd)
13265  {
13266 @@ -99,9 +102,10 @@ static pte_t * __init one_page_table_ini
13267  #ifdef CONFIG_DEBUG_PAGEALLOC
13268                 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
13269  #endif
13270 -               if (!page_table)
13271 +               if (!page_table) {
13272                         page_table =
13273                                 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
13274 +               }
13275
13276                 paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
13277                 make_lowmem_page_readonly(page_table,
13278 @@ -114,22 +118,21 @@ static pte_t * __init one_page_table_ini
13279  }
13280
13281  /*
13282 - * This function initializes a certain range of kernel virtual memory
13283 + * This function initializes a certain range of kernel virtual memory
13284   * with new bootmem page tables, everywhere page tables are missing in
13285   * the given range.
13286 - */
13287 -
13288 -/*
13289 - * NOTE: The pagetables are allocated contiguous on the physical space
13290 - * so we can cache the place of the first one and move around without
13291 + *
13292 + * NOTE: The pagetables are allocated contiguous on the physical space
13293 + * so we can cache the place of the first one and move around without
13294   * checking the pgd every time.
13295   */
13296 -static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
13297 +static void __init
13298 +page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
13299  {
13300 -       pgd_t *pgd;
13301 -       pmd_t *pmd;
13302         int pgd_idx, pmd_idx;
13303         unsigned long vaddr;
13304 +       pgd_t *pgd;
13305 +       pmd_t *pmd;
13306
13307         vaddr = start;
13308         pgd_idx = pgd_index(vaddr);
13309 @@ -139,7 +142,8 @@ static void __init page_table_range_init
13310         for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
13311                 pmd = one_md_table_init(pgd);
13312                 pmd = pmd + pmd_index(vaddr);
13313 -               for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
13314 +               for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
13315 +                                                       pmd++, pmd_idx++) {
13316                         if (vaddr < hypervisor_virt_start)
13317                                 one_page_table_init(pmd);
13318
13319 @@ -157,17 +161,17 @@ static inline int is_kernel_text(unsigne
13320  }
13321
13322  /*
13323 - * This maps the physical memory to kernel virtual address space, a total
13324 - * of max_low_pfn pages, by creating page tables starting from address
13325 - * PAGE_OFFSET.
13326 + * This maps the physical memory to kernel virtual address space, a total
13327 + * of max_low_pfn pages, by creating page tables starting from address
13328 + * PAGE_OFFSET:
13329   */
13330  static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
13331  {
13332 +       int pgd_idx, pmd_idx, pte_ofs;
13333         unsigned long pfn;
13334         pgd_t *pgd;
13335         pmd_t *pmd;
13336         pte_t *pte;
13337 -       int pgd_idx, pmd_idx, pte_ofs;
13338
13339         unsigned long max_ram_pfn = xen_start_info->nr_pages;
13340         if (max_ram_pfn > max_low_pfn)
13341 @@ -195,36 +199,49 @@ static void __init kernel_physical_mappi
13342                 if (pfn >= max_low_pfn)
13343                         continue;
13344                 pmd += pmd_idx;
13345 -               for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
13346 -                       unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
13347 -                       if (address >= hypervisor_virt_start)
13348 +               for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
13349 +                    pmd++, pmd_idx++) {
13350 +                       unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
13351 +
13352 +                       if (addr >= hypervisor_virt_start)
13353                                 continue;
13354
13355 -                       /* Map with big pages if possible, otherwise create normal page tables. */
13356 +                       /*
13357 +                        * Map with big pages if possible, otherwise
13358 +                        * create normal page tables:
13359 +                        */
13360                         if (cpu_has_pse) {
13361 -                               unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
13362 -                               if (is_kernel_text(address) || is_kernel_text(address2))
13363 -                                       set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
13364 -                               else
13365 -                                       set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
13366 +                               unsigned int addr2;
13367 +                               pgprot_t prot = PAGE_KERNEL_LARGE;
13368 +
13369 +                               addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
13370 +                                       PAGE_OFFSET + PAGE_SIZE-1;
13371 +
13372 +                               if (is_kernel_text(addr) ||
13373 +                                   is_kernel_text(addr2))
13374 +                                       prot = PAGE_KERNEL_LARGE_EXEC;
13375 +
13376 +                               set_pmd(pmd, pfn_pmd(pfn, prot));
13377
13378                                 pfn += PTRS_PER_PTE;
13379 -                       } else {
13380 -                               pte = one_page_table_init(pmd);
13381 +                               continue;
13382 +                       }
13383 +                       pte = one_page_table_init(pmd);
13384 +
13385 +                       for (pte += pte_ofs;
13386 +                            pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
13387 +                            pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
13388 +                               pgprot_t prot = PAGE_KERNEL;
13389 +
13390 +                               /* XEN: Only map initial RAM allocation. */
13391 +                               if ((pfn >= max_ram_pfn) || pte_present(*pte))
13392 +                                       continue;
13393 +                               if (is_kernel_text(addr))
13394 +                                       prot = PAGE_KERNEL_EXEC;
13395
13396 -                               for (pte += pte_ofs;
13397 -                                    pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
13398 -                                    pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
13399 -                                       /* XEN: Only map initial RAM allocation. */
13400 -                                       if ((pfn >= max_ram_pfn) || pte_present(*pte))
13401 -                                               continue;
13402 -                                       if (is_kernel_text(address))
13403 -                                               set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
13404 -                                       else
13405 -                                               set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
13406 -                               }
13407 -                               pte_ofs = 0;
13408 +                               set_pte(pte, pfn_pte(pfn, prot));
13409                         }
13410 +                       pte_ofs = 0;
13411                 }
13412                 pmd_idx = 0;
13413         }
13414 @@ -245,57 +262,23 @@ static inline int page_kills_ppro(unsign
13415
13416  #endif
13417
13418 -int page_is_ram(unsigned long pagenr)
13419 -{
13420 -       int i;
13421 -       unsigned long addr, end;
13422 -
13423 -       if (efi_enabled) {
13424 -               efi_memory_desc_t *md;
13425 -               void *p;
13426 -
13427 -               for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
13428 -                       md = p;
13429 -                       if (!is_available_memory(md))
13430 -                               continue;
13431 -                       addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
13432 -                       end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
13433 -
13434 -                       if ((pagenr >= addr) && (pagenr < end))
13435 -                               return 1;
13436 -               }
13437 -               return 0;
13438 -       }
13439 -
13440 -       for (i = 0; i < e820.nr_map; i++) {
13441 -
13442 -               if (e820.map[i].type != E820_RAM)       /* not usable memory */
13443 -                       continue;
13444 -               /*
13445 -                *      !!!FIXME!!! Some BIOSen report areas as RAM that
13446 -                *      are not. Notably the 640->1Mb area. We need a sanity
13447 -                *      check here.
13448 -                */
13449 -               addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
13450 -               end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
13451 -               if  ((pagenr >= addr) && (pagenr < end))
13452 -                       return 1;
13453 -       }
13454 -       return 0;
13455 -}
13456 -
13457  #ifdef CONFIG_HIGHMEM
13458  pte_t *kmap_pte;
13459  pgprot_t kmap_prot;
13460
13461 -#define kmap_get_fixmap_pte(vaddr)                                     \
13462 -       pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
13463 +static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
13464 +{
13465 +       return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
13466 +                       vaddr), vaddr), vaddr);
13467 +}
13468
13469  static void __init kmap_init(void)
13470  {
13471         unsigned long kmap_vstart;
13472
13473 -       /* cache the first kmap pte */
13474 +       /*
13475 +        * Cache the first kmap pte:
13476 +        */
13477         kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
13478         kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
13479
13480 @@ -304,11 +287,11 @@ static void __init kmap_init(void)
13481
13482  static void __init permanent_kmaps_init(pgd_t *pgd_base)
13483  {
13484 +       unsigned long vaddr;
13485         pgd_t *pgd;
13486         pud_t *pud;
13487         pmd_t *pmd;
13488         pte_t *pte;
13489 -       unsigned long vaddr;
13490
13491         vaddr = PKMAP_BASE;
13492         page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
13493 @@ -317,7 +300,7 @@ static void __init permanent_kmaps_init(
13494         pud = pud_offset(pgd, vaddr);
13495         pmd = pmd_offset(pud, vaddr);
13496         pte = pte_offset_kernel(pmd, vaddr);
13497 -       pkmap_page_table = pte;
13498 +       pkmap_page_table = pte;
13499  }
13500
13501  static void __meminit free_new_highpage(struct page *page, int pfn)
13502 @@ -337,7 +320,8 @@ void __init add_one_highpage_init(struct
13503                 SetPageReserved(page);
13504  }
13505
13506 -static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
13507 +static int __meminit
13508 +add_one_highpage_hotplug(struct page *page, unsigned long pfn)
13509  {
13510         free_new_highpage(page, pfn);
13511         totalram_pages++;
13512 @@ -345,6 +329,7 @@ static int __meminit add_one_highpage_ho
13513         max_mapnr = max(pfn, max_mapnr);
13514  #endif
13515         num_physpages++;
13516 +
13517         return 0;
13518  }
13519
13520 @@ -352,7 +337,7 @@ static int __meminit add_one_highpage_ho
13521   * Not currently handling the NUMA case.
13522   * Assuming single node and all memory that
13523   * has been added dynamically that would be
13524 - * onlined here is in HIGHMEM
13525 + * onlined here is in HIGHMEM.
13526   */
13527  void __meminit online_page(struct page *page)
13528  {
13529 @@ -360,13 +345,11 @@ void __meminit online_page(struct page *
13530         add_one_highpage_hotplug(page, page_to_pfn(page));
13531  }
13532
13533 -
13534 -#ifdef CONFIG_NUMA
13535 -extern void set_highmem_pages_init(int);
13536 -#else
13537 +#ifndef CONFIG_NUMA
13538  static void __init set_highmem_pages_init(int bad_ppro)
13539  {
13540         int pfn;
13541 +
13542         for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
13543                 /*
13544                  * Holes under sparsemem might not have no mem_map[]:
13545 @@ -376,23 +359,18 @@ static void __init set_highmem_pages_ini
13546         }
13547         totalram_pages += totalhigh_pages;
13548  }
13549 -#endif /* CONFIG_FLATMEM */
13550 +#endif /* !CONFIG_NUMA */
13551
13552  #else
13553 -#define kmap_init() do { } while (0)
13554 -#define permanent_kmaps_init(pgd_base) do { } while (0)
13555 -#define set_highmem_pages_init(bad_ppro) do { } while (0)
13556 +# define kmap_init()                           do { } while (0)
13557 +# define permanent_kmaps_init(pgd_base)                do { } while (0)
13558 +# define set_highmem_pages_init(bad_ppro)      do { } while (0)
13559  #endif /* CONFIG_HIGHMEM */
13560
13561 -unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
13562 +pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
13563  EXPORT_SYMBOL(__PAGE_KERNEL);
13564 -unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
13565
13566 -#ifdef CONFIG_NUMA
13567 -extern void __init remap_numa_kva(void);
13568 -#else
13569 -#define remap_numa_kva() do {} while (0)
13570 -#endif
13571 +pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
13572
13573  pgd_t *swapper_pg_dir;
13574
13575 @@ -410,9 +388,8 @@ static void __init xen_pagetable_setup_d
13576   * the boot process.
13577   *
13578   * If we're booting on native hardware, this will be a pagetable
13579 - * constructed in arch/i386/kernel/head.S, and not running in PAE mode
13580 - * (even if we'll end up running in PAE).  The root of the pagetable
13581 - * will be swapper_pg_dir.
13582 + * constructed in arch/x86/kernel/head_32.S.  The root of the
13583 + * pagetable will be swapper_pg_dir.
13584   *
13585   * If we're booting paravirtualized under a hypervisor, then there are
13586   * more options: we may already be running PAE, and the pagetable may
13587 @@ -424,10 +401,10 @@ static void __init xen_pagetable_setup_d
13588   * be partially populated, and so it avoids stomping on any existing
13589   * mappings.
13590   */
13591 -static void __init pagetable_init (void)
13592 +static void __init pagetable_init(void)
13593  {
13594 -       unsigned long vaddr, end;
13595         pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
13596 +       unsigned long vaddr, end;
13597
13598         xen_pagetable_setup_start(pgd_base);
13599
13600 @@ -449,34 +426,36 @@ static void __init pagetable_init (void)
13601          * Fixed mappings, only the page table structure has to be
13602          * created - mappings will be set by set_fixmap():
13603          */
13604 +       early_ioremap_clear();
13605         vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
13606         end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
13607         page_table_range_init(vaddr, end, pgd_base);
13608 +       early_ioremap_reset();
13609
13610         permanent_kmaps_init(pgd_base);
13611
13612         xen_pagetable_setup_done(pgd_base);
13613  }
13614
13615 -#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI)
13616 +#if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
13617  /*
13618 - * Swap suspend & friends need this for resume because things like the intel-agp
13619 + * ACPI suspend needs this for resume, because things like the intel-agp
13620   * driver might have split up a kernel 4MB mapping.
13621   */
13622 -char __nosavedata swsusp_pg_dir[PAGE_SIZE]
13623 -       __attribute__ ((aligned (PAGE_SIZE)));
13624 +char swsusp_pg_dir[PAGE_SIZE]
13625 +       __attribute__ ((aligned(PAGE_SIZE)));
13626
13627  static inline void save_pg_dir(void)
13628  {
13629         memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
13630  }
13631 -#else
13632 +#else /* !CONFIG_ACPI_SLEEP */
13633  static inline void save_pg_dir(void)
13634  {
13635  }
13636 -#endif
13637 +#endif /* !CONFIG_ACPI_SLEEP */
13638
13639 -void zap_low_mappings (void)
13640 +void zap_low_mappings(void)
13641  {
13642         int i;
13643
13644 @@ -488,22 +467,24 @@ void zap_low_mappings (void)
13645          * Note that "pgd_clear()" doesn't do it for
13646          * us, because pgd_clear() is a no-op on i386.
13647          */
13648 -       for (i = 0; i < USER_PTRS_PER_PGD; i++)
13649 +       for (i = 0; i < USER_PTRS_PER_PGD; i++) {
13650  #if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
13651                 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
13652  #else
13653                 set_pgd(swapper_pg_dir+i, __pgd(0));
13654  #endif
13655 +       }
13656         flush_tlb_all();
13657  }
13658
13659 -int nx_enabled = 0;
13660 +int nx_enabled;
13661 +
13662 +pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
13663 +EXPORT_SYMBOL_GPL(__supported_pte_mask);
13664
13665  #ifdef CONFIG_X86_PAE
13666
13667 -static int disable_nx __initdata = 0;
13668 -u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
13669 -EXPORT_SYMBOL_GPL(__supported_pte_mask);
13670 +static int disable_nx __initdata;
13671
13672  /*
13673   * noexec = on|off
13674 @@ -520,11 +501,14 @@ static int __init noexec_setup(char *str
13675                         __supported_pte_mask |= _PAGE_NX;
13676                         disable_nx = 0;
13677                 }
13678 -       } else if (!strcmp(str,"off")) {
13679 -               disable_nx = 1;
13680 -               __supported_pte_mask &= ~_PAGE_NX;
13681 -       } else
13682 -               return -EINVAL;
13683 +       } else {
13684 +               if (!strcmp(str, "off")) {
13685 +                       disable_nx = 1;
13686 +                       __supported_pte_mask &= ~_PAGE_NX;
13687 +               } else {
13688 +                       return -EINVAL;
13689 +               }
13690 +       }
13691
13692         return 0;
13693  }
13694 @@ -536,6 +520,7 @@ static void __init set_nx(void)
13695
13696         if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
13697                 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
13698 +
13699                 if ((v[3] & (1 << 20)) && !disable_nx) {
13700                         rdmsr(MSR_EFER, l, h);
13701                         l |= EFER_NX;
13702 @@ -545,35 +530,6 @@ static void __init set_nx(void)
13703                 }
13704         }
13705  }
13706 -
13707 -/*
13708 - * Enables/disables executability of a given kernel page and
13709 - * returns the previous setting.
13710 - */
13711 -int __init set_kernel_exec(unsigned long vaddr, int enable)
13712 -{
13713 -       pte_t *pte;
13714 -       int ret = 1;
13715 -
13716 -       if (!nx_enabled)
13717 -               goto out;
13718 -
13719 -       pte = lookup_address(vaddr);
13720 -       BUG_ON(!pte);
13721 -
13722 -       if (!pte_exec_kernel(*pte))
13723 -               ret = 0;
13724 -
13725 -       if (enable)
13726 -               pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
13727 -       else
13728 -               pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
13729 -       pte_update_defer(&init_mm, vaddr, pte);
13730 -       __flush_tlb_all();
13731 -out:
13732 -       return ret;
13733 -}
13734 -
13735  #endif
13736
13737  /*
13738 @@ -590,21 +546,10 @@ void __init paging_init(void)
13739  #ifdef CONFIG_X86_PAE
13740         set_nx();
13741         if (nx_enabled)
13742 -               printk("NX (Execute Disable) protection: active\n");
13743 +               printk(KERN_INFO "NX (Execute Disable) protection: active\n");
13744  #endif
13745 -
13746         pagetable_init();
13747
13748 -#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
13749 -       /*
13750 -        * We will bail out later - printk doesn't work right now so
13751 -        * the user would just see a hanging kernel.
13752 -        * when running as xen domain we are already in PAE mode at
13753 -        * this point.
13754 -        */
13755 -       if (cpu_has_pae)
13756 -               set_in_cr4(X86_CR4_PAE);
13757 -#endif
13758         __flush_tlb_all();
13759
13760         kmap_init();
13761 @@ -631,10 +576,10 @@ void __init paging_init(void)
13762   * used to involve black magic jumps to work around some nasty CPU bugs,
13763   * but fortunately the switch to using exceptions got rid of all that.
13764   */
13765 -
13766  static void __init test_wp_bit(void)
13767  {
13768 -       printk("Checking if this processor honours the WP bit even in supervisor mode... ");
13769 +       printk(KERN_INFO
13770 +  "Checking if this processor honours the WP bit even in supervisor mode...");
13771
13772         /* Any page-aligned address will do, the test is non-destructive */
13773         __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
13774 @@ -642,23 +587,22 @@ static void __init test_wp_bit(void)
13775         clear_fixmap(FIX_WP_TEST);
13776
13777         if (!boot_cpu_data.wp_works_ok) {
13778 -               printk("No.\n");
13779 +               printk(KERN_CONT "No.\n");
13780  #ifdef CONFIG_X86_WP_WORKS_OK
13781 -               panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
13782 +               panic(
13783 +  "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
13784  #endif
13785         } else {
13786 -               printk("Ok.\n");
13787 +               printk(KERN_CONT "Ok.\n");
13788         }
13789  }
13790
13791 -static struct kcore_list kcore_mem, kcore_vmalloc;
13792 +static struct kcore_list kcore_mem, kcore_vmalloc;
13793
13794  void __init mem_init(void)
13795  {
13796 -       extern int ppro_with_ram_bug(void);
13797         int codesize, reservedpages, datasize, initsize;
13798 -       int tmp;
13799 -       int bad_ppro;
13800 +       int tmp, bad_ppro;
13801         unsigned long pfn;
13802
13803  #if defined(CONFIG_SWIOTLB)
13804 @@ -668,19 +612,19 @@ void __init mem_init(void)
13805  #ifdef CONFIG_FLATMEM
13806         BUG_ON(!mem_map);
13807  #endif
13808 -
13809         bad_ppro = ppro_with_ram_bug();
13810
13811  #ifdef CONFIG_HIGHMEM
13812         /* check that fixmap and pkmap do not overlap */
13813 -       if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
13814 -               printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
13815 +       if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
13816 +               printk(KERN_ERR
13817 +                       "fixmap and kmap areas overlap - this will crash\n");
13818                 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
13819 -                               PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
13820 +                               PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
13821 +                               FIXADDR_START);
13822                 BUG();
13823         }
13824  #endif
13825 -
13826         /* this will put all low memory onto the freelists */
13827         totalram_pages += free_all_bootmem();
13828         /* XEN: init and count low-mem pages outside initial allocation. */
13829 @@ -693,7 +637,7 @@ void __init mem_init(void)
13830         reservedpages = 0;
13831         for (tmp = 0; tmp < max_low_pfn; tmp++)
13832                 /*
13833 -                * Only count reserved RAM pages
13834 +                * Only count reserved RAM pages:
13835                  */
13836                 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
13837                         reservedpages++;
13838 @@ -704,11 +648,12 @@ void __init mem_init(void)
13839         datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
13840         initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
13841
13842 -       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
13843 -       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
13844 +       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
13845 +       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
13846                    VMALLOC_END-VMALLOC_START);
13847
13848 -       printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
13849 +       printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
13850 +                       "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
13851                 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
13852                 num_physpages << (PAGE_SHIFT-10),
13853                 codesize >> 10,
13854 @@ -719,54 +664,53 @@ void __init mem_init(void)
13855                );
13856
13857  #if 1 /* double-sanity-check paranoia */
13858 -       printk("virtual kernel memory layout:\n"
13859 -              "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
13860 +       printk(KERN_INFO "virtual kernel memory layout:\n"
13861 +               "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
13862  #ifdef CONFIG_HIGHMEM
13863 -              "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
13864 +               "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
13865  #endif
13866 -              "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
13867 -              "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
13868 -              "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
13869 -              "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
13870 -              "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
13871 -              FIXADDR_START, FIXADDR_TOP,
13872 -              (FIXADDR_TOP - FIXADDR_START) >> 10,
13873 +               "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
13874 +               "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
13875 +               "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
13876 +               "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
13877 +               "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
13878 +               FIXADDR_START, FIXADDR_TOP,
13879 +               (FIXADDR_TOP - FIXADDR_START) >> 10,
13880
13881  #ifdef CONFIG_HIGHMEM
13882 -              PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
13883 -              (LAST_PKMAP*PAGE_SIZE) >> 10,
13884 +               PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
13885 +               (LAST_PKMAP*PAGE_SIZE) >> 10,
13886  #endif
13887
13888 -              VMALLOC_START, VMALLOC_END,
13889 -              (VMALLOC_END - VMALLOC_START) >> 20,
13890 +               VMALLOC_START, VMALLOC_END,
13891 +               (VMALLOC_END - VMALLOC_START) >> 20,
13892
13893 -              (unsigned long)__va(0), (unsigned long)high_memory,
13894 -              ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
13895 +               (unsigned long)__va(0), (unsigned long)high_memory,
13896 +               ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
13897
13898 -              (unsigned long)&__init_begin, (unsigned long)&__init_end,
13899 -              ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
13900 +               (unsigned long)&__init_begin, (unsigned long)&__init_end,
13901 +               ((unsigned long)&__init_end -
13902 +                (unsigned long)&__init_begin) >> 10,
13903
13904 -              (unsigned long)&_etext, (unsigned long)&_edata,
13905 -              ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
13906 +               (unsigned long)&_etext, (unsigned long)&_edata,
13907 +               ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
13908
13909 -              (unsigned long)&_text, (unsigned long)&_etext,
13910 -              ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
13911 +               (unsigned long)&_text, (unsigned long)&_etext,
13912 +               ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
13913
13914  #ifdef CONFIG_HIGHMEM
13915 -       BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
13916 -       BUG_ON(VMALLOC_END                     > PKMAP_BASE);
13917 +       BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE        > FIXADDR_START);
13918 +       BUG_ON(VMALLOC_END                              > PKMAP_BASE);
13919  #endif
13920 -       BUG_ON(VMALLOC_START                   > VMALLOC_END);
13921 -       BUG_ON((unsigned long)high_memory      > VMALLOC_START);
13922 +       BUG_ON(VMALLOC_START                            > VMALLOC_END);
13923 +       BUG_ON((unsigned long)high_memory               > VMALLOC_START);
13924  #endif /* double-sanity-check paranoia */
13925
13926 -#ifdef CONFIG_X86_PAE
13927 -       if (!cpu_has_pae)
13928 -               panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
13929 -#endif
13930         if (boot_cpu_data.wp_works_ok < 0)
13931                 test_wp_bit();
13932
13933 +       cpa_init();
13934 +
13935         /*
13936          * Subtle. SMP is doing it's boot stuff late (because it has to
13937          * fork idle threads) - but it also needs low mappings for the
13938 @@ -790,49 +734,35 @@ int arch_add_memory(int nid, u64 start,
13939
13940         return __add_pages(zone, start_pfn, nr_pages);
13941  }
13942 -
13943  #endif
13944
13945 -struct kmem_cache *pmd_cache;
13946 -
13947 -void __init pgtable_cache_init(void)
13948 -{
13949 -       if (PTRS_PER_PMD > 1)
13950 -               pmd_cache = kmem_cache_create("pmd",
13951 -                                             PTRS_PER_PMD*sizeof(pmd_t),
13952 -                                             PTRS_PER_PMD*sizeof(pmd_t),
13953 -                                             SLAB_PANIC,
13954 -                                             pmd_ctor);
13955 -}
13956 -
13957  /*
13958   * This function cannot be __init, since exceptions don't work in that
13959   * section.  Put this after the callers, so that it cannot be inlined.
13960   */
13961 -static int noinline do_test_wp_bit(void)
13962 +static noinline int do_test_wp_bit(void)
13963  {
13964         char tmp_reg;
13965         int flag;
13966
13967         __asm__ __volatile__(
13968 -               "       movb %0,%1      \n"
13969 -               "1:     movb %1,%0      \n"
13970 -               "       xorl %2,%2      \n"
13971 +               "       movb %0, %1     \n"
13972 +               "1:     movb %1, %0     \n"
13973 +               "       xorl %2, %2     \n"
13974                 "2:                     \n"
13975 -               ".section __ex_table,\"a\"\n"
13976 -               "       .align 4        \n"
13977 -               "       .long 1b,2b     \n"
13978 -               ".previous              \n"
13979 +               _ASM_EXTABLE(1b,2b)
13980                 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
13981                  "=q" (tmp_reg),
13982                  "=r" (flag)
13983                 :"2" (1)
13984                 :"memory");
13985 -
13986 +
13987         return flag;
13988  }
13989
13990  #ifdef CONFIG_DEBUG_RODATA
13991 +const int rodata_test_data = 0xC3;
13992 +EXPORT_SYMBOL_GPL(rodata_test_data);
13993
13994  void mark_rodata_ro(void)
13995  {
13996 @@ -845,32 +775,58 @@ void mark_rodata_ro(void)
13997         if (num_possible_cpus() <= 1)
13998  #endif
13999         {
14000 -               change_page_attr(virt_to_page(start),
14001 -                                size >> PAGE_SHIFT, PAGE_KERNEL_RX);
14002 -               printk("Write protecting the kernel text: %luk\n", size >> 10);
14003 +               set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
14004 +               printk(KERN_INFO "Write protecting the kernel text: %luk\n",
14005 +                       size >> 10);
14006 +
14007 +#ifdef CONFIG_CPA_DEBUG
14008 +               printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
14009 +                       start, start+size);
14010 +               set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
14011 +
14012 +               printk(KERN_INFO "Testing CPA: write protecting again\n");
14013 +               set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
14014 +#endif
14015         }
14016  #endif
14017         start += size;
14018         size = (unsigned long)__end_rodata - start;
14019 -       change_page_attr(virt_to_page(start),
14020 -                        size >> PAGE_SHIFT, PAGE_KERNEL_RO);
14021 -       printk("Write protecting the kernel read-only data: %luk\n",
14022 -              size >> 10);
14023 +       set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
14024 +       printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
14025 +               size >> 10);
14026 +       rodata_test();
14027 +
14028 +#ifdef CONFIG_CPA_DEBUG
14029 +       printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
14030 +       set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
14031
14032 -       /*
14033 -        * change_page_attr() requires a global_flush_tlb() call after it.
14034 -        * We do this after the printk so that if something went wrong in the
14035 -        * change, the printk gets out at least to give a better debug hint
14036 -        * of who is the culprit.
14037 -        */
14038 -       global_flush_tlb();
14039 +       printk(KERN_INFO "Testing CPA: write protecting again\n");
14040 +       set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
14041 +#endif
14042  }
14043  #endif
14044
14045  void free_init_pages(char *what, unsigned long begin, unsigned long end)
14046  {
14047 +#ifdef CONFIG_DEBUG_PAGEALLOC
14048 +       /*
14049 +        * If debugging page accesses then do not free this memory but
14050 +        * mark it not present - any buggy init-section access will
14051 +        * create a kernel page fault:
14052 +        */
14053 +       printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
14054 +               begin, PAGE_ALIGN(end));
14055 +       set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
14056 +#else
14057         unsigned long addr;
14058
14059 +       /*
14060 +        * We just marked the kernel text read-only above. Now that
14061 +        * we are going to free part of it, we need to make that
14062 +        * part writable first.
14063 +        */
14064 +       set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
14065 +
14066         for (addr = begin; addr < end; addr += PAGE_SIZE) {
14067                 ClearPageReserved(virt_to_page(addr));
14068                 init_page_count(virt_to_page(addr));
14069 @@ -879,6 +835,7 @@ void free_init_pages(char *what, unsigne
14070                 totalram_pages++;
14071         }
14072         printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
14073 +#endif
14074  }
14075
14076  void free_initmem(void)
14077 @@ -894,4 +851,3 @@ void free_initrd_mem(unsigned long start
14078         free_init_pages("initrd memory", start, end);
14079  }
14080  #endif
14081 -
14082 --- a/arch/x86/mm/init_64-xen.c
14083 +++ b/arch/x86/mm/init_64-xen.c
14084 @@ -46,14 +46,13 @@
14085  #include <asm/proto.h>
14086  #include <asm/smp.h>
14087  #include <asm/sections.h>
14088 +#include <asm/kdebug.h>
14089 +#include <asm/numa.h>
14090 +#include <asm/cacheflush.h>
14091
14092  #include <xen/features.h>
14093
14094 -#ifndef Dprintk
14095 -#define Dprintk(x...)
14096 -#endif
14097 -
14098 -const struct dma_mapping_ops* dma_ops;
14099 +const struct dma_mapping_ops *dma_ops;
14100  EXPORT_SYMBOL(dma_ops);
14101
14102  #if CONFIG_XEN_COMPAT <= 0x030002
14103 @@ -80,7 +79,21 @@ extern pte_t level1_fixmap_pgt[PTRS_PER_
14104         (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +   \
14105         __START_KERNEL_map)))
14106
14107 -static void __meminit early_make_page_readonly(void *va, unsigned int feature)
14108 +pmd_t *__init early_get_pmd(unsigned long va)
14109 +{
14110 +       unsigned long addr;
14111 +       unsigned long *page = (unsigned long *)init_level4_pgt;
14112 +
14113 +       addr = page[pgd_index(va)];
14114 +       addr_to_page(addr, page);
14115 +
14116 +       addr = page[pud_index(va)];
14117 +       addr_to_page(addr, page);
14118 +
14119 +       return (pmd_t *)&page[pmd_index(va)];
14120 +}
14121 +
14122 +void __meminit early_make_page_readonly(void *va, unsigned int feature)
14123  {
14124         unsigned long addr, _va = (unsigned long)va;
14125         pte_t pte, *ptep;
14126 @@ -107,76 +120,6 @@ static void __meminit early_make_page_re
14127                 BUG();
14128  }
14129
14130 -static void __make_page_readonly(void *va)
14131 -{
14132 -       pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
14133 -       unsigned long addr = (unsigned long) va;
14134 -
14135 -       pgd = pgd_offset_k(addr);
14136 -       pud = pud_offset(pgd, addr);
14137 -       pmd = pmd_offset(pud, addr);
14138 -       ptep = pte_offset_kernel(pmd, addr);
14139 -
14140 -       pte.pte = ptep->pte & ~_PAGE_RW;
14141 -       if (HYPERVISOR_update_va_mapping(addr, pte, 0))
14142 -               xen_l1_entry_update(ptep, pte); /* fallback */
14143 -
14144 -       if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
14145 -               __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT));
14146 -}
14147 -
14148 -static void __make_page_writable(void *va)
14149 -{
14150 -       pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
14151 -       unsigned long addr = (unsigned long) va;
14152 -
14153 -       pgd = pgd_offset_k(addr);
14154 -       pud = pud_offset(pgd, addr);
14155 -       pmd = pmd_offset(pud, addr);
14156 -       ptep = pte_offset_kernel(pmd, addr);
14157 -
14158 -       pte.pte = ptep->pte | _PAGE_RW;
14159 -       if (HYPERVISOR_update_va_mapping(addr, pte, 0))
14160 -               xen_l1_entry_update(ptep, pte); /* fallback */
14161 -
14162 -       if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
14163 -               __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT));
14164 -}
14165 -
14166 -void make_page_readonly(void *va, unsigned int feature)
14167 -{
14168 -       if (!xen_feature(feature))
14169 -               __make_page_readonly(va);
14170 -}
14171 -
14172 -void make_page_writable(void *va, unsigned int feature)
14173 -{
14174 -       if (!xen_feature(feature))
14175 -               __make_page_writable(va);
14176 -}
14177 -
14178 -void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
14179 -{
14180 -       if (xen_feature(feature))
14181 -               return;
14182 -
14183 -       while (nr-- != 0) {
14184 -               __make_page_readonly(va);
14185 -               va = (void*)((unsigned long)va + PAGE_SIZE);
14186 -       }
14187 -}
14188 -
14189 -void make_pages_writable(void *va, unsigned nr, unsigned int feature)
14190 -{
14191 -       if (xen_feature(feature))
14192 -               return;
14193 -
14194 -       while (nr-- != 0) {
14195 -               __make_page_writable(va);
14196 -               va = (void*)((unsigned long)va + PAGE_SIZE);
14197 -       }
14198 -}
14199 -
14200  /*
14201   * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
14202   * physical space so we can cache the place of the first one and move
14203 @@ -187,22 +130,26 @@ void show_mem(void)
14204  {
14205         long i, total = 0, reserved = 0;
14206         long shared = 0, cached = 0;
14207 -       pg_data_t *pgdat;
14208         struct page *page;
14209 +       pg_data_t *pgdat;
14210
14211         printk(KERN_INFO "Mem-info:\n");
14212         show_free_areas();
14213 -       printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
14214 +       printk(KERN_INFO "Free swap:       %6ldkB\n",
14215 +               nr_swap_pages << (PAGE_SHIFT-10));
14216
14217         for_each_online_pgdat(pgdat) {
14218 -               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
14219 -                       /* this loop can take a while with 256 GB and 4k pages
14220 -                          so update the NMI watchdog */
14221 -                       if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
14222 +               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
14223 +                       /*
14224 +                        * This loop can take a while with 256 GB and
14225 +                        * 4k pages so defer the NMI watchdog:
14226 +                        */
14227 +                       if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
14228                                 touch_nmi_watchdog();
14229 -                       }
14230 +
14231                         if (!pfn_valid(pgdat->node_start_pfn + i))
14232                                 continue;
14233 +
14234                         page = pfn_to_page(pgdat->node_start_pfn + i);
14235                         total++;
14236                         if (PageReserved(page))
14237 @@ -211,58 +158,67 @@ void show_mem(void)
14238                                 cached++;
14239                         else if (page_count(page))
14240                                 shared += page_count(page) - 1;
14241 -               }
14242 +               }
14243         }
14244 -       printk(KERN_INFO "%lu pages of RAM\n", total);
14245 -       printk(KERN_INFO "%lu reserved pages\n",reserved);
14246 -       printk(KERN_INFO "%lu pages shared\n",shared);
14247 -       printk(KERN_INFO "%lu pages swap cached\n",cached);
14248 +       printk(KERN_INFO "%lu pages of RAM\n",          total);
14249 +       printk(KERN_INFO "%lu reserved pages\n",        reserved);
14250 +       printk(KERN_INFO "%lu pages shared\n",          shared);
14251 +       printk(KERN_INFO "%lu pages swap cached\n",     cached);
14252  }
14253
14254 +static unsigned long __meminitdata table_start;
14255 +static unsigned long __meminitdata table_end;
14256
14257  static __init void *spp_getpage(void)
14258 -{
14259 +{
14260         void *ptr;
14261 +
14262         if (after_bootmem)
14263 -               ptr = (void *) get_zeroed_page(GFP_ATOMIC);
14264 +               ptr = (void *) get_zeroed_page(GFP_ATOMIC);
14265         else if (start_pfn < table_end) {
14266                 ptr = __va(start_pfn << PAGE_SHIFT);
14267                 start_pfn++;
14268                 memset(ptr, 0, PAGE_SIZE);
14269         } else
14270                 ptr = alloc_bootmem_pages(PAGE_SIZE);
14271 -       if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
14272 -               panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
14273
14274 -       Dprintk("spp_getpage %p\n", ptr);
14275 +       if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
14276 +               panic("set_pte_phys: cannot allocate page data %s\n",
14277 +                       after_bootmem ? "after bootmem" : "");
14278 +       }
14279 +
14280 +       pr_debug("spp_getpage %p\n", ptr);
14281 +
14282         return ptr;
14283 -}
14284 +}
14285
14286  #define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
14287  #define pud_offset_u(address) (level3_user_pgt + pud_index(address))
14288
14289 -static __init void set_pte_phys(unsigned long vaddr,
14290 -                        unsigned long phys, pgprot_t prot, int user_mode)
14291 +static __init void
14292 +set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
14293  {
14294         pgd_t *pgd;
14295         pud_t *pud;
14296         pmd_t *pmd;
14297         pte_t *pte, new_pte;
14298
14299 -       Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
14300 +       pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
14301
14302         pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
14303         if (pgd_none(*pgd)) {
14304 -               printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
14305 +               printk(KERN_ERR
14306 +                       "PGD FIXMAP MISSING, it should be setup in head.S!\n");
14307                 return;
14308         }
14309         pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
14310         if (pud_none(*pud)) {
14311 -               pmd = (pmd_t *) spp_getpage();
14312 +               pmd = (pmd_t *) spp_getpage();
14313                 make_page_readonly(pmd, XENFEAT_writable_page_tables);
14314                 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
14315                 if (pmd != pmd_offset(pud, 0)) {
14316 -                       printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
14317 +                       printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
14318 +                               pmd, pmd_offset(pud, 0));
14319                         return;
14320                 }
14321         }
14322 @@ -272,7 +228,7 @@ static __init void set_pte_phys(unsigned
14323                 make_page_readonly(pte, XENFEAT_writable_page_tables);
14324                 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
14325                 if (pte != pte_offset_kernel(pmd, 0)) {
14326 -                       printk("PAGETABLE BUG #02!\n");
14327 +                       printk(KERN_ERR "PAGETABLE BUG #02!\n");
14328                         return;
14329                 }
14330         }
14331 @@ -294,30 +250,30 @@ static __init void set_pte_phys(unsigned
14332         __flush_tlb_one(vaddr);
14333  }
14334
14335 -static __init void set_pte_phys_ma(unsigned long vaddr,
14336 -                                  unsigned long phys, pgprot_t prot)
14337 +static __init void
14338 +set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
14339  {
14340         pgd_t *pgd;
14341         pud_t *pud;
14342         pmd_t *pmd;
14343         pte_t *pte, new_pte;
14344
14345 -       Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
14346 +       pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
14347
14348         pgd = pgd_offset_k(vaddr);
14349         if (pgd_none(*pgd)) {
14350 -               printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
14351 +               printk(KERN_ERR
14352 +                       "PGD FIXMAP MISSING, it should be setup in head.S!\n");
14353                 return;
14354         }
14355         pud = pud_offset(pgd, vaddr);
14356         if (pud_none(*pud)) {
14357 -
14358 -               pmd = (pmd_t *) spp_getpage();
14359 +               pmd = (pmd_t *) spp_getpage();
14360                 make_page_readonly(pmd, XENFEAT_writable_page_tables);
14361                 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
14362                 if (pmd != pmd_offset(pud, 0)) {
14363 -                       printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
14364 -                       return;
14365 +                       printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
14366 +                               pmd, pmd_offset(pud, 0));
14367                 }
14368         }
14369         pmd = pmd_offset(pud, vaddr);
14370 @@ -326,7 +282,7 @@ static __init void set_pte_phys_ma(unsig
14371                 make_page_readonly(pte, XENFEAT_writable_page_tables);
14372                 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
14373                 if (pte != pte_offset_kernel(pmd, 0)) {
14374 -                       printk("PAGETABLE BUG #02!\n");
14375 +                       printk(KERN_ERR "PAGETABLE BUG #02!\n");
14376                         return;
14377                 }
14378         }
14379 @@ -350,14 +306,44 @@ static __init void set_pte_phys_ma(unsig
14380         __flush_tlb_one(vaddr);
14381  }
14382
14383 +#ifndef CONFIG_XEN
14384 +/*
14385 + * The head.S code sets up the kernel high mapping:
14386 + *
14387 + *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
14388 + *
14389 + * phys_addr holds the negative offset to the kernel, which is added
14390 + * to the compile time generated pmds. This results in invalid pmds up
14391 + * to the point where we hit the physaddr 0 mapping.
14392 + *
14393 + * We limit the mappings to the region from _text to _end.  _end is
14394 + * rounded up to the 2MB boundary. This catches the invalid pmds as
14395 + * well, as they are located before _text:
14396 + */
14397 +void __init cleanup_highmap(void)
14398 +{
14399 +       unsigned long vaddr = __START_KERNEL_map;
14400 +       unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
14401 +       pmd_t *pmd = level2_kernel_pgt;
14402 +       pmd_t *last_pmd = pmd + PTRS_PER_PMD;
14403 +
14404 +       for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
14405 +               if (!pmd_present(*pmd))
14406 +                       continue;
14407 +               if (vaddr < (unsigned long) _text || vaddr > end)
14408 +                       set_pmd(pmd, __pmd(0));
14409 +       }
14410 +}
14411 +#endif
14412 +
14413  /* NOTE: this is meant to be run only at boot */
14414 -void __init
14415 -__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
14416 +void __init
14417 +__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
14418  {
14419         unsigned long address = __fix_to_virt(idx);
14420
14421         if (idx >= __end_of_fixed_addresses) {
14422 -               printk("Invalid __set_fixmap\n");
14423 +               printk(KERN_ERR "Invalid __set_fixmap\n");
14424                 return;
14425         }
14426         switch (idx) {
14427 @@ -375,16 +361,14 @@ __set_fixmap (enum fixed_addresses idx,
14428         }
14429  }
14430
14431 -unsigned long __meminitdata table_start, table_end;
14432 -
14433  static __meminit void *alloc_static_page(unsigned long *phys)
14434  {
14435         unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
14436
14437         if (after_bootmem) {
14438                 void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
14439 -
14440                 *phys = __pa(adr);
14441 +
14442                 return adr;
14443         }
14444
14445 @@ -396,7 +380,7 @@ static __meminit void *alloc_static_page
14446
14447  #define PTE_SIZE PAGE_SIZE
14448
14449 -static inline int make_readonly(unsigned long paddr)
14450 +static inline int __meminit make_readonly(unsigned long paddr)
14451  {
14452         extern char __vsyscall_0;
14453         int readonly = 0;
14454 @@ -430,33 +414,38 @@ static inline int make_readonly(unsigned
14455  /* Must run before zap_low_mappings */
14456  __meminit void *early_ioremap(unsigned long addr, unsigned long size)
14457  {
14458 -       unsigned long vaddr;
14459         pmd_t *pmd, *last_pmd;
14460 +       unsigned long vaddr;
14461         int i, pmds;
14462
14463         pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
14464         vaddr = __START_KERNEL_map;
14465         pmd = level2_kernel_pgt;
14466         last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
14467 +
14468         for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
14469                 for (i = 0; i < pmds; i++) {
14470                         if (pmd_present(pmd[i]))
14471 -                               goto next;
14472 +                               goto continue_outer_loop;
14473                 }
14474                 vaddr += addr & ~PMD_MASK;
14475                 addr &= PMD_MASK;
14476 +
14477                 for (i = 0; i < pmds; i++, addr += PMD_SIZE)
14478 -                       set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
14479 -               __flush_tlb();
14480 +                       set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
14481 +               __flush_tlb_all();
14482 +
14483                 return (void *)vaddr;
14484 -       next:
14485 +continue_outer_loop:
14486                 ;
14487         }
14488         printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
14489         return NULL;
14490  }
14491
14492 -/* To avoid virtual aliases later */
14493 +/*
14494 + * To avoid virtual aliases later:
14495 + */
14496  __meminit void early_iounmap(void *addr, unsigned long size)
14497  {
14498         unsigned long vaddr;
14499 @@ -466,9 +455,11 @@ __meminit void early_iounmap(void *addr,
14500         vaddr = (unsigned long)addr;
14501         pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
14502         pmd = level2_kernel_pgt + pmd_index(vaddr);
14503 +
14504         for (i = 0; i < pmds; i++)
14505                 pmd_clear(pmd + i);
14506 -       __flush_tlb();
14507 +
14508 +       __flush_tlb_all();
14509  }
14510  #endif
14511
14512 @@ -517,18 +508,19 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
14513  static void __meminit
14514  phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
14515  {
14516 -       pmd_t *pmd = pmd_offset(pud,0);
14517 +       pmd_t *pmd = pmd_offset(pud, 0);
14518         spin_lock(&init_mm.page_table_lock);
14519         phys_pmd_init(pmd, address, end);
14520         spin_unlock(&init_mm.page_table_lock);
14521         __flush_tlb_all();
14522  }
14523
14524 -static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
14525 -{
14526 +static void __meminit
14527 +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
14528 +{
14529         int i = pud_index(addr);
14530
14531 -       for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
14532 +       for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
14533                 unsigned long pmd_phys;
14534                 pud_t *pud = pud_page + pud_index(addr);
14535                 pmd_t *pmd;
14536 @@ -550,8 +542,8 @@ static void __meminit phys_pud_init(pud_
14537
14538                 early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
14539         }
14540 -       __flush_tlb();
14541 -}
14542 +       __flush_tlb_all();
14543 +}
14544
14545  void __init xen_init_pt(void)
14546  {
14547 @@ -632,6 +624,7 @@ void __init xen_init_pt(void)
14548  static void __init extend_init_mapping(unsigned long tables_space)
14549  {
14550         unsigned long va = __START_KERNEL_map;
14551 +       unsigned long start = start_pfn;
14552         unsigned long phys, addr, *pte_page;
14553         pmd_t *pmd;
14554         pte_t *pte, new_pte;
14555 @@ -682,6 +675,10 @@ static void __init extend_init_mapping(u
14556                         BUG();
14557                 va += PAGE_SIZE;
14558         }
14559 +
14560 +       if (start_pfn > start)
14561 +               reserve_early(start << PAGE_SHIFT,
14562 +                             start_pfn << PAGE_SHIFT, "INITMAP");
14563  }
14564
14565  static void __init find_early_table_space(unsigned long end)
14566 @@ -706,7 +703,7 @@ static void __init find_early_table_spac
14567                 (table_start << PAGE_SHIFT) + tables);
14568  }
14569
14570 -static void xen_finish_init_mapping(void)
14571 +static void __init xen_finish_init_mapping(void)
14572  {
14573         unsigned long i, start, end;
14574
14575 @@ -742,8 +739,10 @@ static void xen_finish_init_mapping(void
14576          * Prefetch pte's for the bt_ioremap() area. It gets used before the
14577          * boot-time allocator is online, so allocate-on-demand would fail.
14578          */
14579 +       early_ioremap_clear();
14580         for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
14581                 __set_fixmap(i, 0, __pgprot(0));
14582 +       early_ioremap_reset();
14583
14584         /* Switch to the real shared_info page, and clear the dummy page. */
14585         set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
14586 @@ -764,20 +763,23 @@ static void xen_finish_init_mapping(void
14587         table_end = start_pfn;
14588  }
14589
14590 -/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
14591 -   This runs before bootmem is initialized and gets pages directly from the
14592 -   physical memory. To access them they are temporarily mapped. */
14593 +/*
14594 + * Setup the direct mapping of the physical memory at PAGE_OFFSET.
14595 + * This runs before bootmem is initialized and gets pages directly from
14596 + * the physical memory. To access them they are temporarily mapped.
14597 + */
14598  void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
14599 -{
14600 +{
14601         unsigned long next;
14602
14603 -       Dprintk("init_memory_mapping\n");
14604 +       pr_debug("init_memory_mapping\n");
14605
14606 -       /*
14607 +       /*
14608          * Find space for the kernel direct mapping tables.
14609 -        * Later we should allocate these tables in the local node of the memory
14610 -        * mapped.  Unfortunately this is done currently before the nodes are
14611 -        * discovered.
14612 +        *
14613 +        * Later we should allocate these tables in the local node of the
14614 +        * memory mapped. Unfortunately this is done currently before the
14615 +        * nodes are discovered.
14616          */
14617         if (!after_bootmem)
14618                 find_early_table_space(end);
14619 @@ -786,8 +788,8 @@ void __init_refok init_memory_mapping(un
14620         end = (unsigned long)__va(end);
14621
14622         for (; start < end; start = next) {
14623 -               unsigned long pud_phys;
14624                 pgd_t *pgd = pgd_offset_k(start);
14625 +               unsigned long pud_phys;
14626                 pud_t *pud;
14627
14628                 if (after_bootmem)
14629 @@ -795,8 +797,8 @@ void __init_refok init_memory_mapping(un
14630                 else
14631                         pud = alloc_static_page(&pud_phys);
14632                 next = start + PGDIR_SIZE;
14633 -               if (next > end)
14634 -                       next = end;
14635 +               if (next > end)
14636 +                       next = end;
14637                 phys_pud_init(pud, __pa(start), __pa(next));
14638                 if (!after_bootmem) {
14639                         early_make_page_readonly(pud, XENFEAT_writable_page_tables);
14640 @@ -810,12 +812,17 @@ void __init_refok init_memory_mapping(un
14641         }
14642
14643         __flush_tlb_all();
14644 +
14645 +       if (!after_bootmem)
14646 +               reserve_early(table_start << PAGE_SHIFT,
14647 +                             table_end << PAGE_SHIFT, "PGTABLE");
14648  }
14649
14650  #ifndef CONFIG_NUMA
14651  void __init paging_init(void)
14652  {
14653         unsigned long max_zone_pfns[MAX_NR_ZONES];
14654 +
14655         memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
14656         max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
14657         max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
14658 @@ -829,40 +836,6 @@ void __init paging_init(void)
14659  }
14660  #endif
14661
14662 -/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
14663 -   from the CPU leading to inconsistent cache lines. address and size
14664 -   must be aligned to 2MB boundaries.
14665 -   Does nothing when the mapping doesn't exist. */
14666 -void __init clear_kernel_mapping(unsigned long address, unsigned long size)
14667 -{
14668 -       unsigned long end = address + size;
14669 -
14670 -       BUG_ON(address & ~LARGE_PAGE_MASK);
14671 -       BUG_ON(size & ~LARGE_PAGE_MASK);
14672 -
14673 -       for (; address < end; address += LARGE_PAGE_SIZE) {
14674 -               pgd_t *pgd = pgd_offset_k(address);
14675 -               pud_t *pud;
14676 -               pmd_t *pmd;
14677 -               if (pgd_none(*pgd))
14678 -                       continue;
14679 -               pud = pud_offset(pgd, address);
14680 -               if (pud_none(*pud))
14681 -                       continue;
14682 -               pmd = pmd_offset(pud, address);
14683 -               if (!pmd || pmd_none(*pmd))
14684 -                       continue;
14685 -               if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) {
14686 -                       /* Could handle this, but it should not happen currently. */
14687 -                       printk(KERN_ERR
14688 -              "clear_kernel_mapping: mapping has been split. will leak memory\n");
14689 -                       pmd_ERROR(*pmd);
14690 -               }
14691 -               set_pmd(pmd, __pmd(0));
14692 -       }
14693 -       __flush_tlb_all();
14694 -}
14695 -
14696  /*
14697   * Memory hotplug specific functions
14698   */
14699 @@ -888,16 +861,12 @@ int arch_add_memory(int nid, u64 start,
14700         unsigned long nr_pages = size >> PAGE_SHIFT;
14701         int ret;
14702
14703 -       init_memory_mapping(start, (start + size -1));
14704 +       init_memory_mapping(start, start + size-1);
14705
14706         ret = __add_pages(zone, start_pfn, nr_pages);
14707 -       if (ret)
14708 -               goto error;
14709 +       WARN_ON(ret);   /* complain only if __add_pages() failed */
14710
14711         return ret;
14712 -error:
14713 -       printk("%s: Problem encountered in __add_pages!\n", __func__);
14714 -       return ret;
14715  }
14716  EXPORT_SYMBOL_GPL(arch_add_memory);
14717
14718 @@ -911,36 +880,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to
14719
14720  #endif /* CONFIG_MEMORY_HOTPLUG */
14721
14722 -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
14723 -/*
14724 - * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
14725 - * just online the pages.
14726 - */
14727 -int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
14728 -{
14729 -       int err = -EIO;
14730 -       unsigned long pfn;
14731 -       unsigned long total = 0, mem = 0;
14732 -       for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
14733 -               if (pfn_valid(pfn)) {
14734 -                       online_page(pfn_to_page(pfn));
14735 -                       err = 0;
14736 -                       mem++;
14737 -               }
14738 -               total++;
14739 -       }
14740 -       if (!err) {
14741 -               z->spanned_pages += total;
14742 -               z->present_pages += mem;
14743 -               z->zone_pgdat->node_spanned_pages += total;
14744 -               z->zone_pgdat->node_present_pages += mem;
14745 -       }
14746 -       return err;
14747 -}
14748 -#endif
14749 -
14750 -static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
14751 -                        kcore_vsyscall;
14752 +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
14753 +                        kcore_modules, kcore_vsyscall;
14754
14755  void __init mem_init(void)
14756  {
14757 @@ -949,8 +890,7 @@ void __init mem_init(void)
14758
14759         pci_iommu_alloc();
14760
14761 -       /* clear the zero-page */
14762 -       memset(empty_zero_page, 0, PAGE_SIZE);
14763 +       /* clear_bss() has already cleared the empty_zero_page */
14764
14765         reservedpages = 0;
14766
14767 @@ -968,7 +908,6 @@ void __init mem_init(void)
14768         }
14769         reservedpages = end_pfn - totalram_pages -
14770                                         absent_pages_in_range(0, end_pfn);
14771 -
14772         after_bootmem = 1;
14773
14774         codesize =  (unsigned long) &_etext - (unsigned long) &_text;
14775 @@ -976,46 +915,64 @@ void __init mem_init(void)
14776         initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
14777
14778         /* Register memory areas for /proc/kcore */
14779 -       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
14780 -       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
14781 +       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
14782 +       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
14783                    VMALLOC_END-VMALLOC_START);
14784         kclist_add(&kcore_kernel, &_stext, _end - _stext);
14785         kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
14786 -       kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
14787 +       kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
14788                                  VSYSCALL_END - VSYSCALL_START);
14789
14790 -       printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
14791 +       printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
14792 +                               "%ldk reserved, %ldk data, %ldk init)\n",
14793                 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
14794                 end_pfn << (PAGE_SHIFT-10),
14795                 codesize >> 10,
14796                 reservedpages << (PAGE_SHIFT-10),
14797                 datasize >> 10,
14798                 initsize >> 10);
14799 +
14800 +       cpa_init();
14801  }
14802
14803  void free_init_pages(char *what, unsigned long begin, unsigned long end)
14804  {
14805 -       unsigned long addr;
14806 +       unsigned long addr = begin;
14807
14808 -       if (begin >= end)
14809 +       if (addr >= end)
14810                 return;
14811
14812 +       /*
14813 +        * If debugging page accesses then do not free this memory but
14814 +        * mark them not present - any buggy init-section access will
14815 +        * create a kernel page fault:
14816 +        */
14817 +#ifdef CONFIG_DEBUG_PAGEALLOC
14818 +       printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
14819 +               begin, PAGE_ALIGN(end));
14820 +       set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
14821 +#else
14822         printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
14823 -       for (addr = begin; addr < end; addr += PAGE_SIZE) {
14824 +
14825 +       for (; addr < end; addr += PAGE_SIZE) {
14826                 ClearPageReserved(virt_to_page(addr));
14827                 init_page_count(virt_to_page(addr));
14828                 memset((void *)(addr & ~(PAGE_SIZE-1)),
14829                        POISON_FREE_INITMEM, PAGE_SIZE);
14830                 if (addr >= __START_KERNEL_map) {
14831                         /* make_readonly() reports all kernel addresses. */
14832 -                       __make_page_writable(__va(__pa(addr)));
14833 -                       change_page_attr_addr(addr, 1, __pgprot(0));
14834 +                       if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)),
14835 +                                                        pfn_pte(__pa(addr) >> PAGE_SHIFT,
14836 +                                                                PAGE_KERNEL),
14837 +                                                        0))
14838 +                               BUG();
14839 +                       if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
14840 +                               BUG();
14841                 }
14842                 free_page(addr);
14843                 totalram_pages++;
14844         }
14845 -       if (addr > __START_KERNEL_map)
14846 -               global_flush_tlb();
14847 +#endif
14848  }
14849
14850  void free_initmem(void)
14851 @@ -1026,6 +983,8 @@ void free_initmem(void)
14852  }
14853
14854  #ifdef CONFIG_DEBUG_RODATA
14855 +const int rodata_test_data = 0xC3;
14856 +EXPORT_SYMBOL_GPL(rodata_test_data);
14857
14858  void mark_rodata_ro(void)
14859  {
14860 @@ -1047,18 +1006,27 @@ void mark_rodata_ro(void)
14861         if (end <= start)
14862                 return;
14863
14864 -       change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
14865
14866         printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
14867                (end - start) >> 10);
14868 +       set_memory_ro(start, (end - start) >> PAGE_SHIFT);
14869
14870         /*
14871 -        * change_page_attr_addr() requires a global_flush_tlb() call after it.
14872 -        * We do this after the printk so that if something went wrong in the
14873 -        * change, the printk gets out at least to give a better debug hint
14874 -        * of who is the culprit.
14875 +        * The rodata section (but not the kernel text!) should also be
14876 +        * not-executable.
14877          */
14878 -       global_flush_tlb();
14879 +       start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
14880 +       set_memory_nx(start, (end - start) >> PAGE_SHIFT);
14881 +
14882 +       rodata_test();
14883 +
14884 +#ifdef CONFIG_CPA_DEBUG
14885 +       printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
14886 +       set_memory_rw(start, (end-start) >> PAGE_SHIFT);
14887 +
14888 +       printk(KERN_INFO "Testing CPA: again\n");
14889 +       set_memory_ro(start, (end-start) >> PAGE_SHIFT);
14890 +#endif
14891  }
14892  #endif
14893
14894 @@ -1069,17 +1037,21 @@ void free_initrd_mem(unsigned long start
14895  }
14896  #endif
14897
14898 -void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
14899 -{
14900 +void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
14901 +{
14902  #ifdef CONFIG_NUMA
14903         int nid = phys_to_nid(phys);
14904  #endif
14905         unsigned long pfn = phys >> PAGE_SHIFT;
14906 +
14907         if (pfn >= end_pfn) {
14908 -               /* This can happen with kdump kernels when accessing firmware
14909 -                  tables. */
14910 +               /*
14911 +                * This can happen with kdump kernels when accessing
14912 +                * firmware tables:
14913 +                */
14914                 if (pfn < end_pfn_map)
14915                         return;
14916 +
14917                 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
14918                                 phys, len);
14919                 return;
14920 @@ -1087,9 +1059,9 @@ void __init reserve_bootmem_generic(unsi
14921
14922         /* Should check here against the e820 map to avoid double free */
14923  #ifdef CONFIG_NUMA
14924 -       reserve_bootmem_node(NODE_DATA(nid), phys, len);
14925 -#else
14926 -       reserve_bootmem(phys, len);
14927 +       reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
14928 +#else
14929 +       reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
14930  #endif
14931  #ifndef CONFIG_XEN
14932         if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
14933 @@ -1101,46 +1073,49 @@ void __init reserve_bootmem_generic(unsi
14934  #endif
14935  }
14936
14937 -int kern_addr_valid(unsigned long addr)
14938 -{
14939 +int kern_addr_valid(unsigned long addr)
14940 +{
14941         unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
14942 -       pgd_t *pgd;
14943 -       pud_t *pud;
14944 -       pmd_t *pmd;
14945 -       pte_t *pte;
14946 +       pgd_t *pgd;
14947 +       pud_t *pud;
14948 +       pmd_t *pmd;
14949 +       pte_t *pte;
14950
14951         if (above != 0 && above != -1UL)
14952 -               return 0;
14953 -
14954 +               return 0;
14955 +
14956         pgd = pgd_offset_k(addr);
14957         if (pgd_none(*pgd))
14958                 return 0;
14959
14960         pud = pud_offset(pgd, addr);
14961         if (pud_none(*pud))
14962 -               return 0;
14963 +               return 0;
14964
14965         pmd = pmd_offset(pud, addr);
14966         if (pmd_none(*pmd))
14967                 return 0;
14968 +
14969         if (pmd_large(*pmd))
14970                 return pfn_valid(pmd_pfn(*pmd));
14971
14972         pte = pte_offset_kernel(pmd, addr);
14973         if (pte_none(*pte))
14974                 return 0;
14975 +
14976         return pfn_valid(pte_pfn(*pte));
14977  }
14978
14979 -/* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
14980 -   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
14981 -   not need special handling anymore. */
14982 -
14983 +/*
14984 + * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
14985 + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
14986 + * not need special handling anymore:
14987 + */
14988  static struct vm_area_struct gate_vma = {
14989 -       .vm_start = VSYSCALL_START,
14990 -       .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
14991 -       .vm_page_prot = PAGE_READONLY_EXEC,
14992 -       .vm_flags = VM_READ | VM_EXEC
14993 +       .vm_start       = VSYSCALL_START,
14994 +       .vm_end         = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
14995 +       .vm_page_prot   = PAGE_READONLY_EXEC,
14996 +       .vm_flags       = VM_READ | VM_EXEC
14997  };
14998
14999  struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
15000 @@ -1155,14 +1130,17 @@ struct vm_area_struct *get_gate_vma(stru
15001  int in_gate_area(struct task_struct *task, unsigned long addr)
15002  {
15003         struct vm_area_struct *vma = get_gate_vma(task);
15004 +
15005         if (!vma)
15006                 return 0;
15007 +
15008         return (addr >= vma->vm_start) && (addr < vma->vm_end);
15009  }
15010
15011 -/* Use this when you have no reliable task/vma, typically from interrupt
15012 - * context.  It is less reliable than using the task's vma and may give
15013 - * false positives.
15014 +/*
15015 + * Use this when you have no reliable task/vma, typically from interrupt
15016 + * context. It is less reliable than using the task's vma and may give
15017 + * false positives:
15018   */
15019  int in_gate_area_no_task(unsigned long addr)
15020  {
15021 @@ -1182,8 +1160,8 @@ const char *arch_vma_name(struct vm_area
15022  /*
15023   * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
15024   */
15025 -int __meminit vmemmap_populate(struct page *start_page,
15026 -                                               unsigned long size, int node)
15027 +int __meminit
15028 +vmemmap_populate(struct page *start_page, unsigned long size, int node)
15029  {
15030         unsigned long addr = (unsigned long)start_page;
15031         unsigned long end = (unsigned long)(start_page + size);
15032 @@ -1198,6 +1176,7 @@ int __meminit vmemmap_populate(struct pa
15033                 pgd = vmemmap_pgd_populate(addr, node);
15034                 if (!pgd)
15035                         return -ENOMEM;
15036 +
15037                 pud = vmemmap_pud_populate(pgd, addr, node);
15038                 if (!pud)
15039                         return -ENOMEM;
15040 @@ -1205,20 +1184,22 @@ int __meminit vmemmap_populate(struct pa
15041                 pmd = pmd_offset(pud, addr);
15042                 if (pmd_none(*pmd)) {
15043                         pte_t entry;
15044 -                       void *p = vmemmap_alloc_block(PMD_SIZE, node);
15045 +                       void *p;
15046 +
15047 +                       p = vmemmap_alloc_block(PMD_SIZE, node);
15048                         if (!p)
15049                                 return -ENOMEM;
15050
15051 -                       entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
15052 -                       mk_pte_huge(entry);
15053 -                       set_pmd(pmd, __pmd(pte_val(entry)));
15054 +                       entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
15055 +                                                       PAGE_KERNEL_LARGE);
15056 +                       set_pmd(pmd, __pmd_ma(__pte_val(entry)));
15057
15058                         printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
15059                                 addr, addr + PMD_SIZE - 1, p, node);
15060 -               } else
15061 +               } else {
15062                         vmemmap_verify((pte_t *)pmd, node, addr, next);
15063 +               }
15064         }
15065 -
15066         return 0;
15067  }
15068  #endif
15069 --- a/arch/x86/mm/ioremap_32-xen.c
15070 +++ /dev/null
15071 @@ -1,445 +0,0 @@
15072 -/*
15073 - * arch/i386/mm/ioremap.c
15074 - *
15075 - * Re-map IO memory to kernel address space so that we can access it.
15076 - * This is needed for high PCI addresses that aren't mapped in the
15077 - * 640k-1MB IO memory area on PC's
15078 - *
15079 - * (C) Copyright 1995 1996 Linus Torvalds
15080 - */
15081 -
15082 -#include <linux/vmalloc.h>
15083 -#include <linux/init.h>
15084 -#include <linux/slab.h>
15085 -#include <linux/module.h>
15086 -#include <linux/io.h>
15087 -#include <linux/sched.h>
15088 -#include <asm/fixmap.h>
15089 -#include <asm/cacheflush.h>
15090 -#include <asm/tlbflush.h>
15091 -#include <asm/pgtable.h>
15092 -#include <asm/pgalloc.h>
15093 -
15094 -#define ISA_START_ADDRESS      0x0
15095 -#define ISA_END_ADDRESS                0x100000
15096 -
15097 -static int direct_remap_area_pte_fn(pte_t *pte,
15098 -                                   struct page *pmd_page,
15099 -                                   unsigned long address,
15100 -                                   void *data)
15101 -{
15102 -       mmu_update_t **v = (mmu_update_t **)data;
15103 -
15104 -       BUG_ON(!pte_none(*pte));
15105 -
15106 -       (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15107 -                    PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15108 -       (*v)++;
15109 -
15110 -       return 0;
15111 -}
15112 -
15113 -static int __direct_remap_pfn_range(struct mm_struct *mm,
15114 -                                   unsigned long address,
15115 -                                   unsigned long mfn,
15116 -                                   unsigned long size,
15117 -                                   pgprot_t prot,
15118 -                                   domid_t  domid)
15119 -{
15120 -       int rc;
15121 -       unsigned long i, start_address;
15122 -       mmu_update_t *u, *v, *w;
15123 -
15124 -       u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
15125 -       if (u == NULL)
15126 -               return -ENOMEM;
15127 -
15128 -       start_address = address;
15129 -
15130 -       flush_cache_all();
15131 -
15132 -       for (i = 0; i < size; i += PAGE_SIZE) {
15133 -               if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
15134 -                       /* Flush a full batch after filling in the PTE ptrs. */
15135 -                       rc = apply_to_page_range(mm, start_address,
15136 -                                                address - start_address,
15137 -                                                direct_remap_area_pte_fn, &w);
15138 -                       if (rc)
15139 -                               goto out;
15140 -                       rc = -EFAULT;
15141 -                       if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
15142 -                               goto out;
15143 -                       v = w = u;
15144 -                       start_address = address;
15145 -               }
15146 -
15147 -               /*
15148 -                * Fill in the machine address: PTE ptr is done later by
15149 -                * apply_to_page_range().
15150 -                */
15151 -               v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
15152 -
15153 -               mfn++;
15154 -               address += PAGE_SIZE;
15155 -               v++;
15156 -       }
15157 -
15158 -       if (v != u) {
15159 -               /* Final batch. */
15160 -               rc = apply_to_page_range(mm, start_address,
15161 -                                        address - start_address,
15162 -                                        direct_remap_area_pte_fn, &w);
15163 -               if (rc)
15164 -                       goto out;
15165 -               rc = -EFAULT;
15166 -               if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
15167 -                       goto out;
15168 -       }
15169 -
15170 -       rc = 0;
15171 -
15172 - out:
15173 -       flush_tlb_all();
15174 -
15175 -       free_page((unsigned long)u);
15176 -
15177 -       return rc;
15178 -}
15179 -
15180 -int direct_remap_pfn_range(struct vm_area_struct *vma,
15181 -                          unsigned long address,
15182 -                          unsigned long mfn,
15183 -                          unsigned long size,
15184 -                          pgprot_t prot,
15185 -                          domid_t  domid)
15186 -{
15187 -       if (xen_feature(XENFEAT_auto_translated_physmap))
15188 -               return remap_pfn_range(vma, address, mfn, size, prot);
15189 -
15190 -       if (domid == DOMID_SELF)
15191 -               return -EINVAL;
15192 -
15193 -       vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
15194 -
15195 -       vma->vm_mm->context.has_foreign_mappings = 1;
15196 -
15197 -       return __direct_remap_pfn_range(
15198 -               vma->vm_mm, address, mfn, size, prot, domid);
15199 -}
15200 -EXPORT_SYMBOL(direct_remap_pfn_range);
15201 -
15202 -int direct_kernel_remap_pfn_range(unsigned long address,
15203 -                                 unsigned long mfn,
15204 -                                 unsigned long size,
15205 -                                 pgprot_t prot,
15206 -                                 domid_t  domid)
15207 -{
15208 -       return __direct_remap_pfn_range(
15209 -               &init_mm, address, mfn, size, prot, domid);
15210 -}
15211 -EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
15212 -
15213 -static int lookup_pte_fn(
15214 -       pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15215 -{
15216 -       uint64_t *ptep = (uint64_t *)data;
15217 -       if (ptep)
15218 -               *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15219 -                        PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15220 -       return 0;
15221 -}
15222 -
15223 -int create_lookup_pte_addr(struct mm_struct *mm,
15224 -                          unsigned long address,
15225 -                          uint64_t *ptep)
15226 -{
15227 -       return apply_to_page_range(mm, address, PAGE_SIZE,
15228 -                                  lookup_pte_fn, ptep);
15229 -}
15230 -
15231 -EXPORT_SYMBOL(create_lookup_pte_addr);
15232 -
15233 -static int noop_fn(
15234 -       pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15235 -{
15236 -       return 0;
15237 -}
15238 -
15239 -int touch_pte_range(struct mm_struct *mm,
15240 -                   unsigned long address,
15241 -                   unsigned long size)
15242 -{
15243 -       return apply_to_page_range(mm, address, size, noop_fn, NULL);
15244 -}
15245 -
15246 -EXPORT_SYMBOL(touch_pte_range);
15247 -
15248 -/*
15249 - * Does @address reside within a non-highmem page that is local to this virtual
15250 - * machine (i.e., not an I/O page, nor a memory page belonging to another VM).
15251 - * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
15252 - * why this works.
15253 - */
15254 -static inline int is_local_lowmem(unsigned long address)
15255 -{
15256 -       extern unsigned long max_low_pfn;
15257 -       return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
15258 -}
15259 -
15260 -/*
15261 - * Generic mapping function (not visible outside):
15262 - */
15263 -
15264 -/*
15265 - * Remap an arbitrary physical address space into the kernel virtual
15266 - * address space. Needed when the kernel wants to access high addresses
15267 - * directly.
15268 - *
15269 - * NOTE! We need to allow non-page-aligned mappings too: we will obviously
15270 - * have to convert them into an offset in a page-aligned mapping, but the
15271 - * caller shouldn't need to know that small detail.
15272 - */
15273 -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
15274 -{
15275 -       void __iomem * addr;
15276 -       struct vm_struct * area;
15277 -       unsigned long offset, last_addr;
15278 -       pgprot_t prot;
15279 -       domid_t domid = DOMID_IO;
15280 -
15281 -       /* Don't allow wraparound or zero size */
15282 -       last_addr = phys_addr + size - 1;
15283 -       if (!size || last_addr < phys_addr)
15284 -               return NULL;
15285 -
15286 -       /*
15287 -        * Don't remap the low PCI/ISA area, it's always mapped..
15288 -        */
15289 -       if (is_initial_xendomain() &&
15290 -           phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
15291 -               return (void __iomem *) isa_bus_to_virt(phys_addr);
15292 -
15293 -       /*
15294 -        * Don't allow anybody to remap normal RAM that we're using..
15295 -        */
15296 -       if (is_local_lowmem(phys_addr)) {
15297 -               char *t_addr, *t_end;
15298 -               struct page *page;
15299 -
15300 -               t_addr = bus_to_virt(phys_addr);
15301 -               t_end = t_addr + (size - 1);
15302 -
15303 -               for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
15304 -                       if(!PageReserved(page))
15305 -                               return NULL;
15306 -
15307 -               domid = DOMID_SELF;
15308 -       }
15309 -
15310 -       prot = __pgprot(_KERNPG_TABLE | flags);
15311 -
15312 -       /*
15313 -        * Mappings have to be page-aligned
15314 -        */
15315 -       offset = phys_addr & ~PAGE_MASK;
15316 -       phys_addr &= PAGE_MASK;
15317 -       size = PAGE_ALIGN(last_addr+1) - phys_addr;
15318 -
15319 -       /*
15320 -        * Ok, go for it..
15321 -        */
15322 -       area = get_vm_area(size, VM_IOREMAP | (flags << 20));
15323 -       if (!area)
15324 -               return NULL;
15325 -       area->phys_addr = phys_addr;
15326 -       addr = (void __iomem *) area->addr;
15327 -       if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
15328 -                                    phys_addr>>PAGE_SHIFT,
15329 -                                    size, prot, domid)) {
15330 -               vunmap((void __force *) addr);
15331 -               return NULL;
15332 -       }
15333 -       return (void __iomem *) (offset + (char __iomem *)addr);
15334 -}
15335 -EXPORT_SYMBOL(__ioremap);
15336 -
15337 -/**
15338 - * ioremap_nocache     -   map bus memory into CPU space
15339 - * @offset:    bus address of the memory
15340 - * @size:      size of the resource to map
15341 - *
15342 - * ioremap_nocache performs a platform specific sequence of operations to
15343 - * make bus memory CPU accessible via the readb/readw/readl/writeb/
15344 - * writew/writel functions and the other mmio helpers. The returned
15345 - * address is not guaranteed to be usable directly as a virtual
15346 - * address.
15347 - *
15348 - * This version of ioremap ensures that the memory is marked uncachable
15349 - * on the CPU as well as honouring existing caching rules from things like
15350 - * the PCI bus. Note that there are other caches and buffers on many
15351 - * busses. In particular driver authors should read up on PCI writes
15352 - *
15353 - * It's useful if some control registers are in such an area and
15354 - * write combining or read caching is not desirable:
15355 - *
15356 - * Must be freed with iounmap.
15357 - */
15358 -
15359 -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
15360 -{
15361 -       unsigned long last_addr;
15362 -       void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
15363 -       if (!p)
15364 -               return p;
15365 -
15366 -       /* Guaranteed to be > phys_addr, as per __ioremap() */
15367 -       last_addr = phys_addr + size - 1;
15368 -
15369 -       if (is_local_lowmem(last_addr)) {
15370 -               struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
15371 -               unsigned long npages;
15372 -
15373 -               phys_addr &= PAGE_MASK;
15374 -
15375 -               /* This might overflow and become zero.. */
15376 -               last_addr = PAGE_ALIGN(last_addr);
15377 -
15378 -               /* .. but that's ok, because modulo-2**n arithmetic will make
15379 -               * the page-aligned "last - first" come out right.
15380 -               */
15381 -               npages = (last_addr - phys_addr) >> PAGE_SHIFT;
15382 -
15383 -               if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
15384 -                       iounmap(p);
15385 -                       p = NULL;
15386 -               }
15387 -               global_flush_tlb();
15388 -       }
15389 -
15390 -       return p;
15391 -}
15392 -EXPORT_SYMBOL(ioremap_nocache);
15393 -
15394 -/**
15395 - * iounmap - Free a IO remapping
15396 - * @addr: virtual address from ioremap_*
15397 - *
15398 - * Caller must ensure there is only one unmapping for the same pointer.
15399 - */
15400 -void iounmap(volatile void __iomem *addr)
15401 -{
15402 -       struct vm_struct *p, *o;
15403 -
15404 -       if ((void __force *)addr <= high_memory)
15405 -               return;
15406 -
15407 -       /*
15408 -        * __ioremap special-cases the PCI/ISA range by not instantiating a
15409 -        * vm_area and by simply returning an address into the kernel mapping
15410 -        * of ISA space.   So handle that here.
15411 -        */
15412 -       if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
15413 -               return;
15414 -
15415 -       addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
15416 -
15417 -       /* Use the vm area unlocked, assuming the caller
15418 -          ensures there isn't another iounmap for the same address
15419 -          in parallel. Reuse of the virtual address is prevented by
15420 -          leaving it in the global lists until we're done with it.
15421 -          cpa takes care of the direct mappings. */
15422 -       read_lock(&vmlist_lock);
15423 -       for (p = vmlist; p; p = p->next) {
15424 -               if (p->addr == addr)
15425 -                       break;
15426 -       }
15427 -       read_unlock(&vmlist_lock);
15428 -
15429 -       if (!p) {
15430 -               printk("iounmap: bad address %p\n", addr);
15431 -               dump_stack();
15432 -               return;
15433 -       }
15434 -
15435 -       /* Reset the direct mapping. Can block */
15436 -       if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
15437 -               change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
15438 -                                get_vm_area_size(p) >> PAGE_SHIFT,
15439 -                                PAGE_KERNEL);
15440 -               global_flush_tlb();
15441 -       }
15442 -
15443 -       /* Finally remove it */
15444 -       o = remove_vm_area((void *)addr);
15445 -       BUG_ON(p != o || o == NULL);
15446 -       kfree(p);
15447 -}
15448 -EXPORT_SYMBOL(iounmap);
15449 -
15450 -void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
15451 -{
15452 -       unsigned long offset, last_addr;
15453 -       unsigned int nrpages;
15454 -       enum fixed_addresses idx;
15455 -
15456 -       /* Don't allow wraparound or zero size */
15457 -       last_addr = phys_addr + size - 1;
15458 -       if (!size || last_addr < phys_addr)
15459 -               return NULL;
15460 -
15461 -       /*
15462 -        * Don't remap the low PCI/ISA area, it's always mapped..
15463 -        */
15464 -       if (is_initial_xendomain() &&
15465 -           phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
15466 -               return isa_bus_to_virt(phys_addr);
15467 -
15468 -       /*
15469 -        * Mappings have to be page-aligned
15470 -        */
15471 -       offset = phys_addr & ~PAGE_MASK;
15472 -       phys_addr &= PAGE_MASK;
15473 -       size = PAGE_ALIGN(last_addr) - phys_addr;
15474 -
15475 -       /*
15476 -        * Mappings have to fit in the FIX_BTMAP area.
15477 -        */
15478 -       nrpages = size >> PAGE_SHIFT;
15479 -       if (nrpages > NR_FIX_BTMAPS)
15480 -               return NULL;
15481 -
15482 -       /*
15483 -        * Ok, go for it..
15484 -        */
15485 -       idx = FIX_BTMAP_BEGIN;
15486 -       while (nrpages > 0) {
15487 -               set_fixmap(idx, phys_addr);
15488 -               phys_addr += PAGE_SIZE;
15489 -               --idx;
15490 -               --nrpages;
15491 -       }
15492 -       return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
15493 -}
15494 -
15495 -void __init bt_iounmap(void *addr, unsigned long size)
15496 -{
15497 -       unsigned long virt_addr;
15498 -       unsigned long offset;
15499 -       unsigned int nrpages;
15500 -       enum fixed_addresses idx;
15501 -
15502 -       virt_addr = (unsigned long)addr;
15503 -       if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
15504 -               return;
15505 -       if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
15506 -               return;
15507 -       offset = virt_addr & ~PAGE_MASK;
15508 -       nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
15509 -
15510 -       idx = FIX_BTMAP_BEGIN;
15511 -       while (nrpages > 0) {
15512 -               clear_fixmap(idx);
15513 -               --idx;
15514 -               --nrpages;
15515 -       }
15516 -}
15517 --- /dev/null
15518 +++ b/arch/x86/mm/ioremap-xen.c
15519 @@ -0,0 +1,685 @@
15520 +/*
15521 + * Re-map IO memory to kernel address space so that we can access it.
15522 + * This is needed for high PCI addresses that aren't mapped in the
15523 + * 640k-1MB IO memory area on PC's
15524 + *
15525 + * (C) Copyright 1995 1996 Linus Torvalds
15526 + */
15527 +
15528 +#include <linux/bootmem.h>
15529 +#include <linux/init.h>
15530 +#include <linux/io.h>
15531 +#include <linux/module.h>
15532 +#include <linux/pfn.h>
15533 +#include <linux/slab.h>
15534 +#include <linux/vmalloc.h>
15535 +
15536 +#include <asm/cacheflush.h>
15537 +#include <asm/e820.h>
15538 +#include <asm/fixmap.h>
15539 +#include <asm/pgtable.h>
15540 +#include <asm/tlbflush.h>
15541 +#include <asm/pgalloc.h>
15542 +
15543 +enum ioremap_mode {
15544 +       IOR_MODE_UNCACHED,
15545 +       IOR_MODE_CACHED,
15546 +};
15547 +
15548 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
15549 +
15550 +unsigned long __phys_addr(unsigned long x)
15551 +{
15552 +       if (x >= __START_KERNEL_map)
15553 +               return x - __START_KERNEL_map + phys_base;
15554 +       return x - PAGE_OFFSET;
15555 +}
15556 +EXPORT_SYMBOL(__phys_addr);
15557 +
15558 +#endif
15559 +
15560 +static int direct_remap_area_pte_fn(pte_t *pte,
15561 +                                   struct page *pmd_page,
15562 +                                   unsigned long address,
15563 +                                   void *data)
15564 +{
15565 +       mmu_update_t **v = (mmu_update_t **)data;
15566 +
15567 +       BUG_ON(!pte_none(*pte));
15568 +
15569 +       (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15570 +                    PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15571 +       (*v)++;
15572 +
15573 +       return 0;
15574 +}
15575 +
15576 +static int __direct_remap_pfn_range(struct mm_struct *mm,
15577 +                                   unsigned long address,
15578 +                                   unsigned long mfn,
15579 +                                   unsigned long size,
15580 +                                   pgprot_t prot,
15581 +                                   domid_t  domid)
15582 +{
15583 +       int rc;
15584 +       unsigned long i, start_address;
15585 +       mmu_update_t *u, *v, *w;
15586 +
15587 +       u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
15588 +       if (u == NULL)
15589 +               return -ENOMEM;
15590 +
15591 +       start_address = address;
15592 +
15593 +       flush_cache_all();
15594 +
15595 +       for (i = 0; i < size; i += PAGE_SIZE) {
15596 +               if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
15597 +                       /* Flush a full batch after filling in the PTE ptrs. */
15598 +                       rc = apply_to_page_range(mm, start_address,
15599 +                                                address - start_address,
15600 +                                                direct_remap_area_pte_fn, &w);
15601 +                       if (rc)
15602 +                               goto out;
15603 +                       rc = -EFAULT;
15604 +                       if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
15605 +                               goto out;
15606 +                       v = w = u;
15607 +                       start_address = address;
15608 +               }
15609 +
15610 +               /*
15611 +                * Fill in the machine address: PTE ptr is done later by
15612 +                * apply_to_page_range().
15613 +                */
15614 +               v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
15615 +
15616 +               mfn++;
15617 +               address += PAGE_SIZE;
15618 +               v++;
15619 +       }
15620 +
15621 +       if (v != u) {
15622 +               /* Final batch. */
15623 +               rc = apply_to_page_range(mm, start_address,
15624 +                                        address - start_address,
15625 +                                        direct_remap_area_pte_fn, &w);
15626 +               if (rc)
15627 +                       goto out;
15628 +               rc = -EFAULT;
15629 +               if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
15630 +                       goto out;
15631 +       }
15632 +
15633 +       rc = 0;
15634 +
15635 + out:
15636 +       flush_tlb_all();
15637 +
15638 +       free_page((unsigned long)u);
15639 +
15640 +       return rc;
15641 +}
15642 +
15643 +int direct_remap_pfn_range(struct vm_area_struct *vma,
15644 +                          unsigned long address,
15645 +                          unsigned long mfn,
15646 +                          unsigned long size,
15647 +                          pgprot_t prot,
15648 +                          domid_t  domid)
15649 +{
15650 +       if (xen_feature(XENFEAT_auto_translated_physmap))
15651 +               return remap_pfn_range(vma, address, mfn, size, prot);
15652 +
15653 +       if (domid == DOMID_SELF)
15654 +               return -EINVAL;
15655 +
15656 +       vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
15657 +
15658 +       vma->vm_mm->context.has_foreign_mappings = 1;
15659 +
15660 +       return __direct_remap_pfn_range(
15661 +               vma->vm_mm, address, mfn, size, prot, domid);
15662 +}
15663 +EXPORT_SYMBOL(direct_remap_pfn_range);
15664 +
15665 +int direct_kernel_remap_pfn_range(unsigned long address,
15666 +                                 unsigned long mfn,
15667 +                                 unsigned long size,
15668 +                                 pgprot_t prot,
15669 +                                 domid_t  domid)
15670 +{
15671 +       return __direct_remap_pfn_range(
15672 +               &init_mm, address, mfn, size, prot, domid);
15673 +}
15674 +EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
15675 +
15676 +static int lookup_pte_fn(
15677 +       pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15678 +{
15679 +       uint64_t *ptep = (uint64_t *)data;
15680 +       if (ptep)
15681 +               *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15682 +                        PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15683 +       return 0;
15684 +}
15685 +
15686 +int create_lookup_pte_addr(struct mm_struct *mm,
15687 +                          unsigned long address,
15688 +                          uint64_t *ptep)
15689 +{
15690 +       return apply_to_page_range(mm, address, PAGE_SIZE,
15691 +                                  lookup_pte_fn, ptep);
15692 +}
15693 +
15694 +EXPORT_SYMBOL(create_lookup_pte_addr);
15695 +
15696 +static int noop_fn(
15697 +       pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15698 +{
15699 +       return 0;
15700 +}
15701 +
15702 +int touch_pte_range(struct mm_struct *mm,
15703 +                   unsigned long address,
15704 +                   unsigned long size)
15705 +{
15706 +       return apply_to_page_range(mm, address, size, noop_fn, NULL);
15707 +}
15708 +
15709 +EXPORT_SYMBOL(touch_pte_range);
15710 +
15711 +#ifdef CONFIG_X86_32
15712 +int page_is_ram(unsigned long pagenr)
15713 +{
15714 +       unsigned long addr, end;
15715 +       int i;
15716 +
15717 +#ifndef CONFIG_XEN
15718 +       /*
15719 +        * A special case is the first 4Kb of memory;
15720 +        * This is a BIOS owned area, not kernel ram, but generally
15721 +        * not listed as such in the E820 table.
15722 +        */
15723 +       if (pagenr == 0)
15724 +               return 0;
15725 +
15726 +       /*
15727 +        * Second special case: Some BIOSen report the PC BIOS
15728 +        * area (640->1Mb) as ram even though it is not.
15729 +        */
15730 +       if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
15731 +                   pagenr < (BIOS_END >> PAGE_SHIFT))
15732 +               return 0;
15733 +#endif
15734 +
15735 +       for (i = 0; i < e820.nr_map; i++) {
15736 +               /*
15737 +                * Not usable memory:
15738 +                */
15739 +               if (e820.map[i].type != E820_RAM)
15740 +                       continue;
15741 +               addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
15742 +               end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
15743 +
15744 +
15745 +               if ((pagenr >= addr) && (pagenr < end))
15746 +                       return 1;
15747 +       }
15748 +       return 0;
15749 +}
15750 +#endif
15751 +
15752 +/*
15753 + * Fix up the linear direct mapping of the kernel to avoid cache attribute
15754 + * conflicts.
15755 + */
15756 +static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
15757 +                              enum ioremap_mode mode)
15758 +{
15759 +       unsigned long nrpages = size >> PAGE_SHIFT;
15760 +       int err;
15761 +
15762 +       switch (mode) {
15763 +       case IOR_MODE_UNCACHED:
15764 +       default:
15765 +               err = set_memory_uc(vaddr, nrpages);
15766 +               break;
15767 +       case IOR_MODE_CACHED:
15768 +               err = set_memory_wb(vaddr, nrpages);
15769 +               break;
15770 +       }
15771 +
15772 +       return err;
15773 +}
15774 +
15775 +/*
15776 + * Remap an arbitrary physical address space into the kernel virtual
15777 + * address space. Needed when the kernel wants to access high addresses
15778 + * directly.
15779 + *
15780 + * NOTE! We need to allow non-page-aligned mappings too: we will obviously
15781 + * have to convert them into an offset in a page-aligned mapping, but the
15782 + * caller shouldn't need to know that small detail.
15783 + */
15784 +static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
15785 +                              enum ioremap_mode mode)
15786 +{
15787 +       unsigned long mfn, offset, last_addr, vaddr;
15788 +       struct vm_struct *area;
15789 +       pgprot_t prot;
15790 +       domid_t domid = DOMID_IO; /* becomes DOMID_SELF if any frame is local */
15791 +
15792 +       /* Don't allow wraparound or zero size; last_addr is inclusive */
15793 +       last_addr = phys_addr + size - 1;
15794 +       if (!size || last_addr < phys_addr)
15795 +               return NULL;
15796 +
15797 +       /*
15798 +        * Don't remap the low PCI/ISA area, it's always mapped..
15799 +        */
15800 +       if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
15801 +               return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
15802 +
15803 +       /*
15804 +        * Don't allow anybody to remap normal RAM that we're using..
15805 +        */
15806 +       for (mfn = PFN_DOWN(phys_addr); mfn <= PFN_DOWN(last_addr); mfn++) {
15807 +               unsigned long pfn = mfn_to_local_pfn(mfn);
15808 +
15809 +               if (pfn >= max_pfn)
15810 +                       continue;
15811 +
15812 +               domid = DOMID_SELF;
15813 +
15814 +               if (pfn >= max_pfn_mapped) /* bogus */
15815 +                       continue;
15816 +
15817 +               if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
15818 +                       return NULL;
15819 +       }
15820 +
15821 +       switch (mode) {
15822 +       case IOR_MODE_UNCACHED:
15823 +       default:
15824 +               /*
15825 +                * FIXME: we will use UC MINUS for now, as video fb drivers
15826 +                * depend on it. Upcoming ioremap_wc() will fix this behavior.
15827 +                */
15828 +               prot = PAGE_KERNEL_UC_MINUS;
15829 +               break;
15830 +       case IOR_MODE_CACHED:
15831 +               prot = PAGE_KERNEL;
15832 +               break;
15833 +       }
15834 +
15835 +       /*
15836 +        * Mappings have to be page-aligned
15837 +        */
15838 +       offset = phys_addr & ~PAGE_MASK;
15839 +       phys_addr &= PAGE_MASK;
15840 +       size = PAGE_ALIGN(last_addr+1) - phys_addr;
15841 +
15842 +       /*
15843 +        * Ok, go for it.. (the mode is stashed in the area flags for iounmap)
15844 +        */
15845 +       area = get_vm_area(size, VM_IOREMAP | (mode << 20));
15846 +       if (!area)
15847 +               return NULL;
15848 +       area->phys_addr = phys_addr;
15849 +       vaddr = (unsigned long) area->addr;
15850 +       if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
15851 +                                    size, prot, domid)) {
15852 +               free_vm_area(area);
15853 +               return NULL;
15854 +       }
15855 +
15856 +       if (ioremap_change_attr(vaddr, size, mode) < 0) {
15857 +               iounmap((void __iomem *) vaddr);
15858 +               return NULL;
15859 +       }
15860 +
15861 +       return (void __iomem *) (vaddr + offset);
15862 +}
15863 +
15864 +/**
15865 + * ioremap_nocache     -   map bus memory into CPU space
15866 + * @phys_addr: bus address of the memory
15867 + * @size:      size of the resource to map
15868 + *
15869 + * ioremap_nocache performs a platform specific sequence of operations to
15870 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
15871 + * writew/writel functions and the other mmio helpers. The returned
15872 + * address is not guaranteed to be usable directly as a virtual
15873 + * address.
15874 + *
15875 + * This version of ioremap ensures that the memory is marked uncachable
15876 + * on the CPU as well as honouring existing caching rules from things like
15877 + * the PCI bus. Note that there are other caches and buffers on many
15878 + * busses. In particular driver authors should read up on PCI writes
15879 + *
15880 + * It's useful if some control registers are in such an area and
15881 + * write combining or read caching is not desirable:
15882 + *
15883 + * Must be freed with iounmap.
15884 + */
15885 +void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
15886 +{
15887 +       return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
15888 +}
15889 +EXPORT_SYMBOL(ioremap_nocache);
15890 +
15891 +void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
15892 +{
15893 +       return __ioremap(phys_addr, size, IOR_MODE_CACHED);
15894 +}
15895 +EXPORT_SYMBOL(ioremap_cache);
15896 +
15897 +/**
15898 + * iounmap - Free an IO remapping
15899 + * @addr: virtual address from ioremap_*
15900 + *
15901 + * Caller must ensure there is only one unmapping for the same pointer.
15902 + */
15903 +void iounmap(volatile void __iomem *addr)
15904 +{
15905 +       struct vm_struct *p, *o;
15906 +
15907 +       if ((void __force *)addr <= high_memory)
15908 +               return;
15909 +
15910 +       /*
15911 +        * __ioremap special-cases the PCI/ISA range by not instantiating a
15912 +        * vm_area and by simply returning an address into the kernel mapping
15913 +        * of ISA space.   So handle that here.
15914 +        */
15915 +       if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
15916 +               return;
15917 +
15918 +       addr = (volatile void __iomem *)
15919 +               (PAGE_MASK & (unsigned long __force)addr);
15920 +
15921 +       /* Use the vm area unlocked, assuming the caller
15922 +          ensures there isn't another iounmap for the same address
15923 +          in parallel. Reuse of the virtual address is prevented by
15924 +          leaving it in the global lists until we're done with it.
15925 +          cpa takes care of the direct mappings. */
15926 +       read_lock(&vmlist_lock);
15927 +       for (p = vmlist; p; p = p->next) {
15928 +               if (p->addr == addr)
15929 +                       break;
15930 +       }
15931 +       read_unlock(&vmlist_lock);
15932 +
15933 +       if (!p) {
15934 +               printk(KERN_ERR "iounmap: bad address %p\n", addr);
15935 +               dump_stack();
15936 +               return;
15937 +       }
15938 +
15939 +       if ((p->flags >> 20) != IOR_MODE_CACHED) {
15940 +               unsigned long n = get_vm_area_size(p) >> PAGE_SHIFT;
15941 +               unsigned long mfn = PFN_DOWN(p->phys_addr); /* bytes -> frame */
15942 +               unsigned long va = (unsigned long)addr;
15943 +
15944 +               for (; n > 0; n--, mfn++, va += PAGE_SIZE)
15945 +                       if (mfn_to_local_pfn(mfn) < max_pfn)
15946 +                               set_memory_wb(va, 1);
15947 +       }
15948 +
15949 +       /* Finally remove it */
15950 +       o = remove_vm_area((void *)addr);
15951 +       BUG_ON(p != o || o == NULL);
15952 +       kfree(p);
15953 +}
15954 +EXPORT_SYMBOL(iounmap);
15955 +
15956 +int __initdata early_ioremap_debug;
15957 +
15958 +static int __init early_ioremap_debug_setup(char *str)
15959 +{
15960 +       early_ioremap_debug = 1;
15961 +
15962 +       return 0;
15963 +}
15964 +early_param("early_ioremap_debug", early_ioremap_debug_setup);
15965 +
15966 +static __initdata int after_paging_init;
15967 +static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
15968 +                               __attribute__((aligned(PAGE_SIZE)));
15969 +
15970 +#ifdef CONFIG_X86_32
15971 +static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
15972 +{
15973 +       /* Don't assume we're using swapper_pg_dir at this point */
15974 +       pgd_t *base = __va(read_cr3());
15975 +       pgd_t *pgd = &base[pgd_index(addr)];
15976 +       pud_t *pud = pud_offset(pgd, addr);
15977 +       pmd_t *pmd = pmd_offset(pud, addr);
15978 +
15979 +       return pmd;
15980 +}
15981 +#else
15982 +#define early_ioremap_pmd early_get_pmd
15983 +#define make_lowmem_page_readonly early_make_page_readonly
15984 +#define make_lowmem_page_writable make_page_writable
15985 +#endif
15986 +
15987 +static inline pte_t * __init early_ioremap_pte(unsigned long addr)
15988 +{
15989 +       return &bm_pte[pte_index(addr)];
15990 +}
15991 +
15992 +void __init early_ioremap_init(void)
15993 +{
15994 +       pmd_t *pmd;
15995 +
15996 +       if (early_ioremap_debug)
15997 +               printk(KERN_INFO "early_ioremap_init()\n");
15998 +
15999 +       pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
16000 +       memset(bm_pte, 0, sizeof(bm_pte));
16001 +       make_lowmem_page_readonly(bm_pte, XENFEAT_writable_page_tables);
16002 +       pmd_populate_kernel(&init_mm, pmd, bm_pte);
16003 +
16004 +       /*
16005 +        * The boot-ioremap range spans multiple pmds, for which
16006 +        * we are not prepared:
16007 +        */
16008 +       if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
16009 +               WARN_ON(1);
16010 +               printk(KERN_WARNING "pmd %p != %p\n",
16011 +                      pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
16012 +               printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
16013 +                       fix_to_virt(FIX_BTMAP_BEGIN));
16014 +               printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END):   %08lx\n",
16015 +                       fix_to_virt(FIX_BTMAP_END));
16016 +
16017 +               printk(KERN_WARNING "FIX_BTMAP_END:       %d\n", FIX_BTMAP_END);
16018 +               printk(KERN_WARNING "FIX_BTMAP_BEGIN:     %d\n",
16019 +                      FIX_BTMAP_BEGIN);
16020 +       }
16021 +}
16022 +
16023 +void __init early_ioremap_clear(void)
16024 +{
16025 +       pmd_t *pmd;
16026 +
16027 +       if (early_ioremap_debug)
16028 +               printk(KERN_INFO "early_ioremap_clear()\n");
16029 +
16030 +       pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
16031 +       pmd_clear(pmd);
16032 +       make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables);
16033 +       /* paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); */
16034 +       __flush_tlb_all();
16035 +}
16036 +
16037 +void __init early_ioremap_reset(void)
16038 +{
16039 +       enum fixed_addresses idx;
16040 +       unsigned long addr, phys;
16041 +       pte_t *pte;
16042 +
16043 +       after_paging_init = 1;
16044 +       for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
16045 +               addr = fix_to_virt(idx);
16046 +               pte = early_ioremap_pte(addr);
16047 +               if (pte_present(*pte)) {
16048 +                       phys = __pte_val(*pte) & PAGE_MASK;
16049 +                       set_fixmap(idx, phys);
16050 +               }
16051 +       }
16052 +}
16053 +
16054 +static void __init __early_set_fixmap(enum fixed_addresses idx,
16055 +                                  unsigned long phys, pgprot_t flags)
16056 +{
16057 +       unsigned long addr = __fix_to_virt(idx);
16058 +       pte_t *pte;
16059 +
16060 +       if (idx >= __end_of_fixed_addresses) {
16061 +               BUG();
16062 +               return;
16063 +       }
16064 +       pte = early_ioremap_pte(addr);
16065 +       if (pgprot_val(flags))
16066 +               set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
16067 +       else
16068 +               pte_clear(NULL, addr, pte);
16069 +       __flush_tlb_one(addr);
16070 +}
16071 +
16072 +static inline void __init early_set_fixmap(enum fixed_addresses idx,
16073 +                                       unsigned long phys)
16074 +{
16075 +       if (after_paging_init)
16076 +               set_fixmap(idx, phys);
16077 +       else
16078 +               __early_set_fixmap(idx, phys, PAGE_KERNEL);
16079 +}
16080 +
16081 +static inline void __init early_clear_fixmap(enum fixed_addresses idx)
16082 +{
16083 +       if (after_paging_init)
16084 +               clear_fixmap(idx);
16085 +       else
16086 +               __early_set_fixmap(idx, 0, __pgprot(0));
16087 +}
16088 +
16089 +
16090 +int __initdata early_ioremap_nested;
16091 +
16092 +static int __init check_early_ioremap_leak(void)
16093 +{
16094 +       if (!early_ioremap_nested)
16095 +               return 0;
16096 +
16097 +       printk(KERN_WARNING
16098 +              "Debug warning: early ioremap leak of %d areas detected.\n",
16099 +              early_ioremap_nested);
16100 +       printk(KERN_WARNING
16101 +              "please boot with early_ioremap_debug and report the dmesg.\n");
16102 +       WARN_ON(1);
16103 +
16104 +       return 1;
16105 +}
16106 +late_initcall(check_early_ioremap_leak);
16107 +
16108 +void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
16109 +{
16110 +       unsigned long offset, last_addr;
16111 +       unsigned int nrpages, nesting;
16112 +       enum fixed_addresses idx0, idx;
16113 +
16114 +       WARN_ON(system_state != SYSTEM_BOOTING);
16115 +
16116 +       nesting = early_ioremap_nested;
16117 +       if (early_ioremap_debug) {
16118 +               printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
16119 +                      phys_addr, size, nesting);
16120 +               dump_stack();
16121 +       }
16122 +
16123 +       /* Don't allow wraparound or zero size; last_addr is inclusive */
16124 +       last_addr = phys_addr + size - 1;
16125 +       if (!size || last_addr < phys_addr) {
16126 +               WARN_ON(1);
16127 +               return NULL;
16128 +       }
16129 +
16130 +       if (nesting >= FIX_BTMAPS_NESTING) {
16131 +               WARN_ON(1);
16132 +               return NULL;
16133 +       }
16134 +
16135 +       /*
16136 +        * Mappings have to be page-aligned (round up past the last byte)
16137 +        */
16138 +       offset = phys_addr & ~PAGE_MASK;
16139 +       phys_addr &= PAGE_MASK;
16140 +       size = PAGE_ALIGN(last_addr + 1) - phys_addr;
16141 +
16142 +       /*
16143 +        * Mappings have to fit in the FIX_BTMAP area.
16144 +        */
16145 +       nrpages = size >> PAGE_SHIFT;
16146 +       if (nrpages > NR_FIX_BTMAPS) {
16147 +               WARN_ON(1);
16148 +               return NULL;
16149 +       }
16150 +       early_ioremap_nested++; /* claim the slot only once nothing can fail */
16151 +       /*
16152 +        * Ok, go for it..
16153 +        */
16154 +       idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
16155 +       idx = idx0;
16156 +       while (nrpages > 0) {
16157 +               early_set_fixmap(idx, phys_addr);
16158 +               phys_addr += PAGE_SIZE;
16159 +               --idx;
16160 +               --nrpages;
16161 +       }
16162 +       if (early_ioremap_debug)
16163 +               printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
16164 +
16165 +       return (void *) (offset + fix_to_virt(idx0));
16166 +}
16167 +
16168 +void __init early_iounmap(void *addr, unsigned long size)
16169 +{
16170 +       unsigned long virt_addr;
16171 +       unsigned long offset;
16172 +       unsigned int nrpages;
16173 +       enum fixed_addresses idx;
16174 +       int nesting; /* signed, so an unbalanced unmap trips the check below */
16175 +
16176 +       nesting = --early_ioremap_nested;
16177 +       WARN_ON(nesting < 0);
16178 +
16179 +       if (early_ioremap_debug) {
16180 +               printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
16181 +                      size, nesting);
16182 +               dump_stack();
16183 +       }
16184 +
16185 +       virt_addr = (unsigned long)addr;
16186 +       if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
16187 +               WARN_ON(1);
16188 +               return;
16189 +       }
16190 +       offset = virt_addr & ~PAGE_MASK;
16191 +       nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; /* pages spanned */
16192 +
16193 +       idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
16194 +       while (nrpages > 0) {
16195 +               early_clear_fixmap(idx);
16196 +               --idx;
16197 +               --nrpages;
16198 +       }
16199 +}
16200 +
16201 +void __this_fixmap_does_not_exist(void)
16202 +{
16203 +       WARN_ON(1);
16204 +}
16205 --- a/arch/x86/mm/pageattr_64-xen.c
16206 +++ /dev/null
16207 @@ -1,542 +0,0 @@
16208 -/*
16209 - * Copyright 2002 Andi Kleen, SuSE Labs.
16210 - * Thanks to Ben LaHaise for precious feedback.
16211 - */
16212 -
16213 -#include <linux/mm.h>
16214 -#include <linux/sched.h>
16215 -#include <linux/highmem.h>
16216 -#include <linux/module.h>
16217 -#include <linux/slab.h>
16218 -#include <asm/uaccess.h>
16219 -#include <asm/processor.h>
16220 -#include <asm/tlbflush.h>
16221 -#include <asm/io.h>
16222 -
16223 -#ifdef CONFIG_XEN
16224 -#include <asm/pgalloc.h>
16225 -#include <asm/mmu_context.h>
16226 -
16227 -static void _pin_lock(struct mm_struct *mm, int lock) {
16228 -       if (lock)
16229 -               spin_lock(&mm->page_table_lock);
16230 -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
16231 -       /* While mm->page_table_lock protects us against insertions and
16232 -        * removals of higher level page table pages, it doesn't protect
16233 -        * against updates of pte-s. Such updates, however, require the
16234 -        * pte pages to be in consistent state (unpinned+writable or
16235 -        * pinned+readonly). The pinning and attribute changes, however
16236 -        * cannot be done atomically, which is why such updates must be
16237 -        * prevented from happening concurrently.
16238 -        * Note that no pte lock can ever elsewhere be acquired nesting
16239 -        * with an already acquired one in the same mm, or with the mm's
16240 -        * page_table_lock already acquired, as that would break in the
16241 -        * non-split case (where all these are actually resolving to the
16242 -        * one page_table_lock). Thus acquiring all of them here is not
16243 -        * going to result in dead locks, and the order of acquires
16244 -        * doesn't matter.
16245 -        */
16246 -       {
16247 -               pgd_t *pgd = mm->pgd;
16248 -               unsigned g;
16249 -
16250 -               for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
16251 -                       pud_t *pud;
16252 -                       unsigned u;
16253 -
16254 -                       if (pgd_none(*pgd))
16255 -                               continue;
16256 -                       pud = pud_offset(pgd, 0);
16257 -                       for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16258 -                               pmd_t *pmd;
16259 -                               unsigned m;
16260 -
16261 -                               if (pud_none(*pud))
16262 -                                       continue;
16263 -                               pmd = pmd_offset(pud, 0);
16264 -                               for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16265 -                                       spinlock_t *ptl;
16266 -
16267 -                                       if (pmd_none(*pmd))
16268 -                                               continue;
16269 -                                       ptl = pte_lockptr(0, pmd);
16270 -                                       if (lock)
16271 -                                               spin_lock(ptl);
16272 -                                       else
16273 -                                               spin_unlock(ptl);
16274 -                               }
16275 -                       }
16276 -               }
16277 -       }
16278 -#endif
16279 -       if (!lock)
16280 -               spin_unlock(&mm->page_table_lock);
16281 -}
16282 -#define pin_lock(mm) _pin_lock(mm, 1)
16283 -#define pin_unlock(mm) _pin_lock(mm, 0)
16284 -
16285 -#define PIN_BATCH 8
16286 -static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
16287 -
16288 -static inline unsigned int pgd_walk_set_prot(void *pt, pgprot_t flags,
16289 -                                            unsigned int cpu, unsigned int seq)
16290 -{
16291 -       struct page *page = virt_to_page(pt);
16292 -       unsigned long pfn = page_to_pfn(page);
16293 -
16294 -       MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
16295 -               (unsigned long)__va(pfn << PAGE_SHIFT),
16296 -               pfn_pte(pfn, flags), 0);
16297 -       if (unlikely(++seq == PIN_BATCH)) {
16298 -               if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
16299 -                                                       PIN_BATCH, NULL)))
16300 -                       BUG();
16301 -               seq = 0;
16302 -       }
16303 -
16304 -       return seq;
16305 -}
16306 -
16307 -static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
16308 -{
16309 -       pgd_t       *pgd = pgd_base;
16310 -       pud_t       *pud;
16311 -       pmd_t       *pmd;
16312 -       pte_t       *pte;
16313 -       int          g,u,m;
16314 -       unsigned int cpu, seq;
16315 -       multicall_entry_t *mcl;
16316 -
16317 -       cpu = get_cpu();
16318 -
16319 -       /*
16320 -        * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
16321 -        * be the 'current' task's pagetables (e.g., current may be 32-bit,
16322 -        * but the pagetables may be for a 64-bit task).
16323 -        * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
16324 -        * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
16325 -        */
16326 -       for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
16327 -               if (pgd_none(*pgd))
16328 -                       continue;
16329 -               pud = pud_offset(pgd, 0);
16330 -               if (PTRS_PER_PUD > 1) /* not folded */
16331 -                       seq = pgd_walk_set_prot(pud,flags,cpu,seq);
16332 -               for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16333 -                       if (pud_none(*pud))
16334 -                               continue;
16335 -                       pmd = pmd_offset(pud, 0);
16336 -                       if (PTRS_PER_PMD > 1) /* not folded */
16337 -                               seq = pgd_walk_set_prot(pmd,flags,cpu,seq);
16338 -                       for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16339 -                               if (pmd_none(*pmd))
16340 -                                       continue;
16341 -                               pte = pte_offset_kernel(pmd,0);
16342 -                               seq = pgd_walk_set_prot(pte,flags,cpu,seq);
16343 -                       }
16344 -               }
16345 -       }
16346 -
16347 -       mcl = per_cpu(pb_mcl, cpu);
16348 -       if (unlikely(seq > PIN_BATCH - 2)) {
16349 -               if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
16350 -                       BUG();
16351 -               seq = 0;
16352 -       }
16353 -       MULTI_update_va_mapping(mcl + seq,
16354 -              (unsigned long)__user_pgd(pgd_base),
16355 -              pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
16356 -              0);
16357 -       MULTI_update_va_mapping(mcl + seq + 1,
16358 -              (unsigned long)pgd_base,
16359 -              pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16360 -              UVMF_TLB_FLUSH);
16361 -       if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
16362 -               BUG();
16363 -
16364 -       put_cpu();
16365 -}
16366 -
16367 -static void __pgd_pin(pgd_t *pgd)
16368 -{
16369 -       pgd_walk(pgd, PAGE_KERNEL_RO);
16370 -       xen_pgd_pin(__pa(pgd)); /* kernel */
16371 -       xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
16372 -       SetPagePinned(virt_to_page(pgd));
16373 -}
16374 -
16375 -static void __pgd_unpin(pgd_t *pgd)
16376 -{
16377 -       xen_pgd_unpin(__pa(pgd));
16378 -       xen_pgd_unpin(__pa(__user_pgd(pgd)));
16379 -       pgd_walk(pgd, PAGE_KERNEL);
16380 -       ClearPagePinned(virt_to_page(pgd));
16381 -}
16382 -
16383 -void pgd_test_and_unpin(pgd_t *pgd)
16384 -{
16385 -       if (PagePinned(virt_to_page(pgd)))
16386 -               __pgd_unpin(pgd);
16387 -}
16388 -
16389 -void mm_pin(struct mm_struct *mm)
16390 -{
16391 -       if (xen_feature(XENFEAT_writable_page_tables))
16392 -               return;
16393 -
16394 -       pin_lock(mm);
16395 -       __pgd_pin(mm->pgd);
16396 -       pin_unlock(mm);
16397 -}
16398 -
16399 -void mm_unpin(struct mm_struct *mm)
16400 -{
16401 -       if (xen_feature(XENFEAT_writable_page_tables))
16402 -               return;
16403 -
16404 -       pin_lock(mm);
16405 -       __pgd_unpin(mm->pgd);
16406 -       pin_unlock(mm);
16407 -}
16408 -
16409 -void mm_pin_all(void)
16410 -{
16411 -       struct page *page;
16412 -       unsigned long flags;
16413 -
16414 -       if (xen_feature(XENFEAT_writable_page_tables))
16415 -               return;
16416 -
16417 -       /*
16418 -        * Allow uninterrupted access to the pgd_list. Also protects
16419 -        * __pgd_pin() by disabling preemption.
16420 -        * All other CPUs must be at a safe point (e.g., in stop_machine
16421 -        * or offlined entirely).
16422 -        */
16423 -       spin_lock_irqsave(&pgd_lock, flags);
16424 -       list_for_each_entry(page, &pgd_list, lru) {
16425 -               if (!PagePinned(page))
16426 -                       __pgd_pin((pgd_t *)page_address(page));
16427 -       }
16428 -       spin_unlock_irqrestore(&pgd_lock, flags);
16429 -}
16430 -
16431 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
16432 -{
16433 -       if (!PagePinned(virt_to_page(mm->pgd)))
16434 -               mm_pin(mm);
16435 -}
16436 -
16437 -void arch_exit_mmap(struct mm_struct *mm)
16438 -{
16439 -       struct task_struct *tsk = current;
16440 -
16441 -       task_lock(tsk);
16442 -
16443 -       /*
16444 -        * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
16445 -        * *much* faster this way, as no tlb flushes means bigger wrpt batches.
16446 -        */
16447 -       if (tsk->active_mm == mm) {
16448 -               tsk->active_mm = &init_mm;
16449 -               atomic_inc(&init_mm.mm_count);
16450 -
16451 -               switch_mm(mm, &init_mm, tsk);
16452 -
16453 -               atomic_dec(&mm->mm_count);
16454 -               BUG_ON(atomic_read(&mm->mm_count) == 0);
16455 -       }
16456 -
16457 -       task_unlock(tsk);
16458 -
16459 -       if (PagePinned(virt_to_page(mm->pgd))
16460 -           && (atomic_read(&mm->mm_count) == 1)
16461 -           && !mm->context.has_foreign_mappings)
16462 -               mm_unpin(mm);
16463 -}
16464 -
16465 -static void _pte_free(struct page *page, unsigned int order)
16466 -{
16467 -       BUG_ON(order);
16468 -       pte_free(page);
16469 -}
16470 -
16471 -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
16472 -{
16473 -       struct page *pte;
16474 -
16475 -       pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
16476 -       if (pte) {
16477 -               SetPageForeign(pte, _pte_free);
16478 -               init_page_count(pte);
16479 -       }
16480 -       return pte;
16481 -}
16482 -
16483 -void pte_free(struct page *pte)
16484 -{
16485 -       unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
16486 -
16487 -       if (!pte_write(*virt_to_ptep(va)))
16488 -               if (HYPERVISOR_update_va_mapping(
16489 -                       va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0))
16490 -                       BUG();
16491 -
16492 -       ClearPageForeign(pte);
16493 -       init_page_count(pte);
16494 -
16495 -       __free_page(pte);
16496 -}
16497 -#endif /* CONFIG_XEN */
16498 -
16499 -pte_t *lookup_address(unsigned long address)
16500 -{
16501 -       pgd_t *pgd = pgd_offset_k(address);
16502 -       pud_t *pud;
16503 -       pmd_t *pmd;
16504 -       pte_t *pte;
16505 -       if (pgd_none(*pgd))
16506 -               return NULL;
16507 -       pud = pud_offset(pgd, address);
16508 -       if (!pud_present(*pud))
16509 -               return NULL;
16510 -       pmd = pmd_offset(pud, address);
16511 -       if (!pmd_present(*pmd))
16512 -               return NULL;
16513 -       if (pmd_large(*pmd))
16514 -               return (pte_t *)pmd;
16515 -       pte = pte_offset_kernel(pmd, address);
16516 -       if (pte && !pte_present(*pte))
16517 -               pte = NULL;
16518 -       return pte;
16519 -}
16520 -
16521 -static struct page *split_large_page(unsigned long address, pgprot_t prot,
16522 -                                    pgprot_t ref_prot)
16523 -{
16524 -       int i;
16525 -       unsigned long addr;
16526 -       struct page *base = alloc_pages(GFP_KERNEL, 0);
16527 -       pte_t *pbase;
16528 -       if (!base)
16529 -               return NULL;
16530 -       /*
16531 -        * page_private is used to track the number of entries in
16532 -        * the page table page have non standard attributes.
16533 -        */
16534 -       SetPagePrivate(base);
16535 -       page_private(base) = 0;
16536 -
16537 -       address = __pa(address);
16538 -       addr = address & LARGE_PAGE_MASK;
16539 -       pbase = (pte_t *)page_address(base);
16540 -       for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
16541 -               pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
16542 -                                  addr == address ? prot : ref_prot);
16543 -       }
16544 -       return base;
16545 -}
16546 -
16547 -void clflush_cache_range(void *adr, int size)
16548 -{
16549 -       int i;
16550 -       for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size)
16551 -               clflush(adr+i);
16552 -}
16553 -
16554 -static void flush_kernel_map(void *arg)
16555 -{
16556 -       struct list_head *l = (struct list_head *)arg;
16557 -       struct page *pg;
16558 -
16559 -       /* When clflush is available always use it because it is
16560 -          much cheaper than WBINVD. */
16561 -       /* clflush is still broken. Disable for now. */
16562 -       if (1 || !cpu_has_clflush)
16563 -               asm volatile("wbinvd" ::: "memory");
16564 -       else list_for_each_entry(pg, l, lru) {
16565 -               void *adr = page_address(pg);
16566 -               clflush_cache_range(adr, PAGE_SIZE);
16567 -       }
16568 -       __flush_tlb_all();
16569 -}
16570 -
16571 -static inline void flush_map(struct list_head *l)
16572 -{
16573 -       on_each_cpu(flush_kernel_map, l, 1, 1);
16574 -}
16575 -
16576 -static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
16577 -
16578 -static inline void save_page(struct page *fpage)
16579 -{
16580 -       if (!test_and_set_bit(PG_arch_1, &fpage->flags))
16581 -               list_add(&fpage->lru, &deferred_pages);
16582 -}
16583 -
16584 -/*
16585 - * No more special protections in this 2/4MB area - revert to a
16586 - * large page again.
16587 - */
16588 -static void revert_page(unsigned long address, pgprot_t ref_prot)
16589 -{
16590 -       pgd_t *pgd;
16591 -       pud_t *pud;
16592 -       pmd_t *pmd;
16593 -       pte_t large_pte;
16594 -       unsigned long pfn;
16595 -
16596 -       pgd = pgd_offset_k(address);
16597 -       BUG_ON(pgd_none(*pgd));
16598 -       pud = pud_offset(pgd,address);
16599 -       BUG_ON(pud_none(*pud));
16600 -       pmd = pmd_offset(pud, address);
16601 -       BUG_ON(__pmd_val(*pmd) & _PAGE_PSE);
16602 -       pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
16603 -       large_pte = pfn_pte(pfn, ref_prot);
16604 -       large_pte = pte_mkhuge(large_pte);
16605 -       set_pte((pte_t *)pmd, large_pte);
16606 -}
16607 -
16608 -static int
16609 -__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
16610 -                                  pgprot_t ref_prot)
16611 -{
16612 -       pte_t *kpte;
16613 -       struct page *kpte_page;
16614 -       pgprot_t ref_prot2;
16615 -
16616 -       kpte = lookup_address(address);
16617 -       if (!kpte) return 0;
16618 -       kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
16619 -       BUG_ON(PageLRU(kpte_page));
16620 -       BUG_ON(PageCompound(kpte_page));
16621 -       if (pgprot_val(prot) != pgprot_val(ref_prot)) {
16622 -               if (!pte_huge(*kpte)) {
16623 -                       set_pte(kpte, pfn_pte(pfn, prot));
16624 -               } else {
16625 -                       /*
16626 -                        * split_large_page will take the reference for this
16627 -                        * change_page_attr on the split page.
16628 -                        */
16629 -                       struct page *split;
16630 -                       ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
16631 -                       split = split_large_page(address, prot, ref_prot2);
16632 -                       if (!split)
16633 -                               return -ENOMEM;
16634 -                       pgprot_val(ref_prot2) &= ~_PAGE_NX;
16635 -                       set_pte(kpte, mk_pte(split, ref_prot2));
16636 -                       kpte_page = split;
16637 -               }
16638 -               page_private(kpte_page)++;
16639 -       } else if (!pte_huge(*kpte)) {
16640 -               set_pte(kpte, pfn_pte(pfn, ref_prot));
16641 -               BUG_ON(page_private(kpte_page) == 0);
16642 -               page_private(kpte_page)--;
16643 -       } else
16644 -               BUG();
16645 -
16646 -       /* on x86-64 the direct mapping set at boot is not using 4k pages */
16647 -       /*
16648 -        * ..., but the XEN guest kernels (currently) do:
16649 -        * If the pte was reserved, it means it was created at boot
16650 -        * time (not via split_large_page) and in turn we must not
16651 -        * replace it with a large page.
16652 -        */
16653 -#ifndef CONFIG_XEN
16654 -       BUG_ON(PageReserved(kpte_page));
16655 -#else
16656 -       if (PageReserved(kpte_page))
16657 -               return 0;
16658 -#endif
16659 -
16660 -       save_page(kpte_page);
16661 -       if (page_private(kpte_page) == 0)
16662 -               revert_page(address, ref_prot);
16663 -       return 0;
16664 -}
16665 -
16666 -/*
16667 - * Change the page attributes of an page in the linear mapping.
16668 - *
16669 - * This should be used when a page is mapped with a different caching policy
16670 - * than write-back somewhere - some CPUs do not like it when mappings with
16671 - * different caching policies exist. This changes the page attributes of the
16672 - * in kernel linear mapping too.
16673 - *
16674 - * The caller needs to ensure that there are no conflicting mappings elsewhere.
16675 - * This function only deals with the kernel linear map.
16676 - *
16677 - * Caller must call global_flush_tlb() after this.
16678 - */
16679 -int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
16680 -{
16681 -       int err = 0, kernel_map = 0;
16682 -       int i;
16683 -
16684 -       if (address >= __START_KERNEL_map
16685 -           && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
16686 -               address = (unsigned long)__va(__pa(address));
16687 -               kernel_map = 1;
16688 -       }
16689 -
16690 -       down_write(&init_mm.mmap_sem);
16691 -       for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
16692 -               unsigned long pfn = __pa(address) >> PAGE_SHIFT;
16693 -
16694 -               if (!kernel_map || pte_present(pfn_pte(0, prot))) {
16695 -                       err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
16696 -                       if (err)
16697 -                               break;
16698 -               }
16699 -               /* Handle kernel mapping too which aliases part of the
16700 -                * lowmem */
16701 -               if (__pa(address) < KERNEL_TEXT_SIZE) {
16702 -                       unsigned long addr2;
16703 -                       pgprot_t prot2;
16704 -                       addr2 = __START_KERNEL_map + __pa(address);
16705 -                       /* Make sure the kernel mappings stay executable */
16706 -                       prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
16707 -                       err = __change_page_attr(addr2, pfn, prot2,
16708 -                                                PAGE_KERNEL_EXEC);
16709 -               }
16710 -       }
16711 -       up_write(&init_mm.mmap_sem);
16712 -       return err;
16713 -}
16714 -
16715 -/* Don't call this for MMIO areas that may not have a mem_map entry */
16716 -int change_page_attr(struct page *page, int numpages, pgprot_t prot)
16717 -{
16718 -       unsigned long addr = (unsigned long)page_address(page);
16719 -       return change_page_attr_addr(addr, numpages, prot);
16720 -}
16721 -
16722 -void global_flush_tlb(void)
16723 -{
16724 -       struct page *pg, *next;
16725 -       struct list_head l;
16726 -
16727 -       /*
16728 -        * Write-protect the semaphore, to exclude two contexts
16729 -        * doing a list_replace_init() call in parallel and to
16730 -        * exclude new additions to the deferred_pages list:
16731 -        */
16732 -       down_write(&init_mm.mmap_sem);
16733 -       list_replace_init(&deferred_pages, &l);
16734 -       up_write(&init_mm.mmap_sem);
16735 -
16736 -       flush_map(&l);
16737 -
16738 -       list_for_each_entry_safe(pg, next, &l, lru) {
16739 -               list_del(&pg->lru);
16740 -               clear_bit(PG_arch_1, &pg->flags);
16741 -               if (page_private(pg) != 0)
16742 -                       continue;
16743 -               ClearPagePrivate(pg);
16744 -               __free_page(pg);
16745 -       }
16746 -}
16747 -
16748 -EXPORT_SYMBOL(change_page_attr);
16749 -EXPORT_SYMBOL(global_flush_tlb);
16750 --- /dev/null
16751 +++ b/arch/x86/mm/pageattr-xen.c
16752 @@ -0,0 +1,1412 @@
16753 +/*
16754 + * Copyright 2002 Andi Kleen, SuSE Labs.
16755 + * Thanks to Ben LaHaise for precious feedback.
16756 + */
16757 +#include <linux/highmem.h>
16758 +#include <linux/bootmem.h>
16759 +#include <linux/module.h>
16760 +#include <linux/sched.h>
16761 +#include <linux/slab.h>
16762 +#include <linux/mm.h>
16763 +#include <linux/interrupt.h>
16764 +
16765 +#include <asm/e820.h>
16766 +#include <asm/processor.h>
16767 +#include <asm/tlbflush.h>
16768 +#include <asm/sections.h>
16769 +#include <asm/uaccess.h>
16770 +#include <asm/pgalloc.h>
16771 +#include <asm/proto.h>
16772 +#include <asm/mmu_context.h>
16773 +
16774 +#ifndef CONFIG_X86_64
16775 +#define TASK_SIZE64 TASK_SIZE /* the page-table walkers below are written in terms of TASK_SIZE64 */
16776 +#endif
16777 +
16778 +static void _pin_lock(struct mm_struct *mm, int lock) { /* lock != 0: acquire, lock == 0: release */
16779 +       if (lock)
16780 +               spin_lock(&mm->page_table_lock);
16781 +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
16782 +       /* While mm->page_table_lock protects us against insertions and
16783 +        * removals of higher level page table pages, it doesn't protect
16784 +        * against updates of pte-s. Such updates, however, require the
16785 +        * pte pages to be in consistent state (unpinned+writable or
16786 +        * pinned+readonly). The pinning and attribute changes, however
16787 +        * cannot be done atomically, which is why such updates must be
16788 +        * prevented from happening concurrently.
16789 +        * Note that no pte lock can ever elsewhere be acquired nesting
16790 +        * with an already acquired one in the same mm, or with the mm's
16791 +        * page_table_lock already acquired, as that would break in the
16792 +        * non-split case (where all these are actually resolving to the
16793 +        * one page_table_lock). Thus acquiring all of them here is not
16794 +        * going to result in dead locks, and the order of acquires
16795 +        * doesn't matter.
16796 +        */
16797 +       {
16798 +               pgd_t *pgd = mm->pgd;
16799 +               unsigned g;
16800 +
16801 +               for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { /* every pgd slot up to TASK_SIZE64 */
16802 +                       pud_t *pud;
16803 +                       unsigned u;
16804 +
16805 +                       if (pgd_none(*pgd))
16806 +                               continue;
16807 +                       pud = pud_offset(pgd, 0);
16808 +                       for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16809 +                               pmd_t *pmd;
16810 +                               unsigned m;
16811 +
16812 +                               if (pud_none(*pud))
16813 +                                       continue;
16814 +                               pmd = pmd_offset(pud, 0);
16815 +                               for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16816 +                                       spinlock_t *ptl;
16817 +
16818 +                                       if (pmd_none(*pmd))
16819 +                                               continue;
16820 +                                       ptl = pte_lockptr(0, pmd); /* lock of the pte page under this pmd entry */
16821 +                                       if (lock)
16822 +                                               spin_lock(ptl);
16823 +                                       else
16824 +                                               spin_unlock(ptl);
16825 +                               }
16826 +                       }
16827 +               }
16828 +       }
16829 +#endif
16830 +       if (!lock) /* page_table_lock is dropped last, mirroring its acquisition first above */
16831 +               spin_unlock(&mm->page_table_lock);
16832 +}
16833 +#define pin_lock(mm) _pin_lock(mm, 1) /* acquire */
16834 +#define pin_unlock(mm) _pin_lock(mm, 0) /* release */
16835 +
16836 +#define PIN_BATCH sizeof(void *) /* number of multicall entries per batch */
16837 +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl); /* per-CPU batch buffer */
16838 +
16839 +static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
16840 +                                            unsigned int cpu, unsigned int seq)
16841 +{      /* Queue a protection change for one page-table page; returns the new count of queued entries. */
16842 +       unsigned long pfn = page_to_pfn(page);
16843 +
16844 +       if (PageHighMem(page)) { /* highmem page: nothing is queued, only the Pinned flag is updated */
16845 +               if (pgprot_val(flags) & _PAGE_RW) /* writable flags: clear the flag */
16846 +                       ClearPagePinned(page);
16847 +               else
16848 +                       SetPagePinned(page);
16849 +       } else {
16850 +               MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, /* slot seq of this CPU's batch */
16851 +                                       (unsigned long)__va(pfn << PAGE_SHIFT),
16852 +                                       pfn_pte(pfn, flags), 0);
16853 +               if (unlikely(++seq == PIN_BATCH)) { /* batch full: issue it and start over */
16854 +                       if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
16855 +                                                               PIN_BATCH, NULL)))
16856 +                               BUG();
16857 +                       seq = 0;
16858 +               }
16859 +       }
16860 +
16861 +       return seq; /* entries queued but not yet issued */
16862 +}
16863 +
16864 +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
16865 +{      /* Apply flags to the kernel mapping of every page-table page reachable from pgd_base, then to the pgd itself. */
16866 +       pgd_t       *pgd = pgd_base;
16867 +       pud_t       *pud;
16868 +       pmd_t       *pmd;
16869 +       int          g,u,m;
16870 +       unsigned int cpu, seq;
16871 +       multicall_entry_t *mcl;
16872 +
16873 +       if (xen_feature(XENFEAT_auto_translated_physmap)) /* nothing is remapped when this feature is set */
16874 +               return;
16875 +
16876 +       cpu = get_cpu(); /* paired with put_cpu() at the end; the per-CPU batch is used in between */
16877 +
16878 +       /*
16879 +        * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
16880 +        * may not be the 'current' task's pagetables (e.g., current may be
16881 +        * 32-bit, but the pagetables may be for a 64-bit task).
16882 +        * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
16883 +        * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
16884 +        */
16885 +       for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
16886 +               if (pgd_none(*pgd))
16887 +                       continue;
16888 +               pud = pud_offset(pgd, 0);
16889 +               if (PTRS_PER_PUD > 1) /* not folded */
16890 +                       seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
16891 +               for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16892 +                       if (pud_none(*pud))
16893 +                               continue;
16894 +                       pmd = pmd_offset(pud, 0);
16895 +                       if (PTRS_PER_PMD > 1) /* not folded */
16896 +                               seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
16897 +                       for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16898 +                               if (pmd_none(*pmd))
16899 +                                       continue;
16900 +                               seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq); /* the pte page itself */
16901 +                       }
16902 +               }
16903 +       }
16904 +
16905 +       mcl = per_cpu(pb_mcl, cpu);
16906 +#ifdef CONFIG_X86_64
16907 +       if (unlikely(seq > PIN_BATCH - 2)) { /* two more entries are needed below; flush if they do not fit */
16908 +               if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
16909 +                       BUG();
16910 +               seq = 0;
16911 +       }
16912 +       MULTI_update_va_mapping(mcl + seq, /* user pgd, no flush requested */
16913 +              (unsigned long)__user_pgd(pgd_base),
16914 +              pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
16915 +              0);
16916 +       MULTI_update_va_mapping(mcl + seq + 1, /* kernel pgd, with TLB flush */
16917 +              (unsigned long)pgd_base,
16918 +              pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16919 +              UVMF_TLB_FLUSH);
16920 +       if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
16921 +               BUG();
16922 +#else
16923 +       if (likely(seq != 0)) { /* entries pending: append the pgd update and issue the batch */
16924 +               MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
16925 +                       (unsigned long)pgd_base,
16926 +                       pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16927 +                       UVMF_TLB_FLUSH);
16928 +               if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
16929 +                                                       seq + 1, NULL)))
16930 +                       BUG();
16931 +       } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base, /* nothing pending: one direct hypercall */
16932 +                       pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16933 +                       UVMF_TLB_FLUSH))
16934 +               BUG();
16935 +#endif
16936 +
16937 +       put_cpu();
16938 +}
16939 +
16940 +static void __pgd_pin(pgd_t *pgd)
16941 +{
16942 +       pgd_walk(pgd, PAGE_KERNEL_RO); /* page-table pages go read-only before the pin request */
16943 +       kmap_flush_unused(); /* NOTE(review): presumably drops stale kmaps that could still map the tables writably - confirm */
16944 +       xen_pgd_pin(__pa(pgd)); /* kernel */
16945 +#ifdef CONFIG_X86_64
16946 +       xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
16947 +#endif
16948 +       SetPagePinned(virt_to_page(pgd)); /* record the pinned state on the pgd's page */
16949 +}
16950 +
16951 +static void __pgd_unpin(pgd_t *pgd)
16952 +{
16953 +       xen_pgd_unpin(__pa(pgd)); /* kernel pgd */
16954 +#ifdef CONFIG_X86_64
16955 +       xen_pgd_unpin(__pa(__user_pgd(pgd))); /* user pgd */
16956 +#endif
16957 +       pgd_walk(pgd, PAGE_KERNEL); /* page-table pages revert to PAGE_KERNEL only after the unpin */
16958 +       ClearPagePinned(virt_to_page(pgd)); /* drop the pinned state from the pgd's page */
16959 +}
16960 +
16961 +void pgd_test_and_unpin(pgd_t *pgd)
16962 +{
16963 +       if (PagePinned(virt_to_page(pgd))) /* no-op unless the pgd's page is flagged pinned */
16964 +               __pgd_unpin(pgd);
16965 +}
16966 +
16967 +void mm_pin(struct mm_struct *mm)
16968 +{
16969 +       if (xen_feature(XENFEAT_writable_page_tables)) /* pinning is skipped when this feature is set */
16970 +               return;
16971 +
16972 +       pin_lock(mm); /* held across the whole pin operation */
16973 +       __pgd_pin(mm->pgd);
16974 +       pin_unlock(mm);
16975 +}
16976 +
16977 +void mm_unpin(struct mm_struct *mm)
16978 +{
16979 +       if (xen_feature(XENFEAT_writable_page_tables)) /* unpinning is skipped when this feature is set */
16980 +               return;
16981 +
16982 +       pin_lock(mm); /* held across the whole unpin operation */
16983 +       __pgd_unpin(mm->pgd);
16984 +       pin_unlock(mm);
16985 +}
16986 +
16987 +void mm_pin_all(void)
16988 +{
16989 +       struct page *page;
16990 +       unsigned long flags;
16991 +
16992 +       if (xen_feature(XENFEAT_writable_page_tables)) /* pinning is skipped when this feature is set */
16993 +               return;
16994 +
16995 +       /*
16996 +        * Allow uninterrupted access to the pgd_list. Also protects
16997 +        * __pgd_pin() by disabling preemption.
16998 +        * All other CPUs must be at a safe point (e.g., in stop_machine
16999 +        * or offlined entirely).
17000 +        */
17001 +       spin_lock_irqsave(&pgd_lock, flags);
17002 +       list_for_each_entry(page, &pgd_list, lru) {
17003 +               if (!PagePinned(page)) /* pin only pgds that are not pinned yet */
17004 +                       __pgd_pin((pgd_t *)page_address(page));
17005 +       }
17006 +       spin_unlock_irqrestore(&pgd_lock, flags);
17007 +}
17008 +
17009 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) /* oldmm is not used here */
17010 +{
17011 +       if (!PagePinned(virt_to_page(mm->pgd))) /* pin mm's pgd unless it is already pinned */
17012 +               mm_pin(mm);
17013 +}
17014 +
17015 +void arch_exit_mmap(struct mm_struct *mm)
17016 +{
17017 +       struct task_struct *tsk = current;
17018 +
17019 +       task_lock(tsk);
17020 +
17021 +       /*
17022 +        * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
17023 +        * *much* faster this way, as no tlb flushes means bigger wrpt batches.
17024 +        */
17025 +       if (tsk->active_mm == mm) {
17026 +               tsk->active_mm = &init_mm;
17027 +               atomic_inc(&init_mm.mm_count);
17028 +
17029 +               switch_mm(mm, &init_mm, tsk);
17030 +
17031 +               atomic_dec(&mm->mm_count);
17032 +               BUG_ON(atomic_read(&mm->mm_count) == 0);
17033 +       }
17034 +
17035 +       task_unlock(tsk);
17036 +
17037 +       if (PagePinned(virt_to_page(mm->pgd))
17038 +           && atomic_read(&mm->mm_count) == 1
17039 +           && !mm->context.has_foreign_mappings)
17040 +               mm_unpin(mm);
17041 +}
17042 +
17043 +static void _pte_free(struct page *page, unsigned int order)
17044 +{
17045 +       BUG_ON(order);
17046 +       __pte_free(page);
17047 +}
17048 +
17049 +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
17050 +{
17051 +       struct page *pte;
17052 +
17053 +#ifdef CONFIG_HIGHPTE
17054 +       pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
17055 +#else
17056 +       pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
17057 +#endif
17058 +       if (pte) {
17059 +               pgtable_page_ctor(pte);
17060 +               SetPageForeign(pte, _pte_free);
17061 +               init_page_count(pte);
17062 +       }
17063 +       return pte;
17064 +}
17065 +
17066 +void __pte_free(pgtable_t pte)
17067 +{
17068 +       if (!PageHighMem(pte)) {
17069 +               unsigned long va = (unsigned long)page_address(pte);
17070 +               unsigned int level;
17071 +               pte_t *ptep = lookup_address(va, &level);
17072 +
17073 +               BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
17074 +               if (!pte_write(*ptep)
17075 +                   && HYPERVISOR_update_va_mapping(va,
17076 +                                                   mk_pte(pte, PAGE_KERNEL),
17077 +                                                   0))
17078 +                       BUG();
17079 +       } else
17080 +#ifdef CONFIG_HIGHPTE
17081 +               ClearPagePinned(pte);
17082 +#else
17083 +               BUG();
17084 +#endif
17085 +
17086 +       ClearPageForeign(pte);
17087 +       init_page_count(pte);
17088 +       pgtable_page_dtor(pte);
17089 +       __free_page(pte);
17090 +}
17091 +
17092 +#if PAGETABLE_LEVELS >= 3
17093 +static void _pmd_free(struct page *page, unsigned int order)
17094 +{
17095 +       BUG_ON(order);
17096 +       __pmd_free(page);
17097 +}
17098 +
17099 +pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
17100 +{
17101 +       struct page *pmd;
17102 +
17103 +       pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
17104 +       if (!pmd)
17105 +               return NULL;
17106 +       SetPageForeign(pmd, _pmd_free);
17107 +       init_page_count(pmd);
17108 +       return page_address(pmd);
17109 +}
17110 +
17111 +void __pmd_free(pgtable_t pmd)
17112 +{
17113 +       unsigned long va = (unsigned long)page_address(pmd);
17114 +       unsigned int level;
17115 +       pte_t *ptep = lookup_address(va, &level);
17116 +
17117 +       BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
17118 +       if (!pte_write(*ptep)
17119 +           && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
17120 +               BUG();
17121 +
17122 +       ClearPageForeign(pmd);
17123 +       init_page_count(pmd);
17124 +       __free_page(pmd);
17125 +}
17126 +#endif
17127 +
17128 +/* blktap and gntdev need this, as otherwise they would implicitly (and
17129 + * needlessly, as they never use it) reference init_mm. */
17130 +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
17131 +                                 unsigned long addr, pte_t *ptep, int full)
17132 +{
17133 +       return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
17134 +}
17135 +EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
17136 +
17137 +/*
17138 + * The current flushing context - we pass it instead of 5 arguments:
17139 + */
17140 +struct cpa_data {
17141 +       unsigned long   vaddr;
17142 +       pgprot_t        mask_set;
17143 +       pgprot_t        mask_clr;
17144 +       int             numpages;
17145 +       int             flushtlb;
17146 +       unsigned long   pfn;
17147 +};
17148 +
17149 +#ifdef CONFIG_X86_64
17150 +
17151 +static inline unsigned long highmap_start_pfn(void)
17152 +{
17153 +       return __pa(_text) >> PAGE_SHIFT;
17154 +}
17155 +
17156 +static inline unsigned long highmap_end_pfn(void)
17157 +{
17158 +       return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
17159 +}
17160 +
17161 +#endif
17162 +
17163 +#ifdef CONFIG_DEBUG_PAGEALLOC
17164 +# define debug_pagealloc 1
17165 +#else
17166 +# define debug_pagealloc 0
17167 +#endif
17168 +
17169 +static inline int
17170 +within(unsigned long addr, unsigned long start, unsigned long end)
17171 +{
17172 +       return addr >= start && addr < end;
17173 +}
17174 +
17175 +/*
17176 + * Flushing functions
17177 + */
17178 +
17179 +/**
17180 + * clflush_cache_range - flush a cache range with clflush
17181 + * @vaddr:     virtual start address
17182 + * @size:      number of bytes to flush
17183 + *
17184 + * clflush is an unordered instruction which needs fencing with mfence
17185 + * to avoid ordering issues.
17186 + */
17187 +void clflush_cache_range(void *vaddr, unsigned int size)
17188 +{
17189 +       void *vend = vaddr + size - 1;
17190 +
17191 +       mb();
17192 +
17193 +       for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
17194 +               clflush(vaddr);
17195 +       /*
17196 +        * Flush any possible final partial cacheline:
17197 +        */
17198 +       clflush(vend);
17199 +
17200 +       mb();
17201 +}
17202 +
17203 +static void __cpa_flush_all(void *arg)
17204 +{
17205 +       unsigned long cache = (unsigned long)arg;
17206 +
17207 +       /*
17208 +        * Flush all to work around Errata in early athlons regarding
17209 +        * large page flushing.
17210 +        */
17211 +       __flush_tlb_all();
17212 +
17213 +       if (cache && boot_cpu_data.x86 >= 4) /* CPU family, not model */
17214 +               wbinvd();
17215 +}
17216 +
17217 +static void cpa_flush_all(unsigned long cache)
17218 +{
17219 +       BUG_ON(irqs_disabled());
17220 +
17221 +       on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
17222 +}
17223 +
17224 +static void __cpa_flush_range(void *arg)
17225 +{
17226 +       /*
17227 +        * We could optimize that further and do individual per page
17228 +        * tlb invalidates for a low number of pages. Caveat: we must
17229 +        * flush the high aliases on 64bit as well.
17230 +        */
17231 +       __flush_tlb_all();
17232 +}
17233 +
17234 +static void cpa_flush_range(unsigned long start, int numpages, int cache)
17235 +{
17236 +       unsigned int i, level;
17237 +       unsigned long addr;
17238 +
17239 +       BUG_ON(irqs_disabled());
17240 +       WARN_ON(PAGE_ALIGN(start) != start);
17241 +
17242 +       on_each_cpu(__cpa_flush_range, NULL, 1, 1);
17243 +
17244 +       if (!cache)
17245 +               return;
17246 +
17247 +       /*
17248 +        * We only need to flush on one CPU,
17249 +        * clflush is a MESI-coherent instruction that
17250 +        * will cause all other CPUs to flush the same
17251 +        * cachelines:
17252 +        */
17253 +       for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
17254 +               pte_t *pte = lookup_address(addr, &level);
17255 +
17256 +               /*
17257 +                * Only flush present addresses:
17258 +                */
17259 +               if (pte && (__pte_val(*pte) & _PAGE_PRESENT))
17260 +                       clflush_cache_range((void *) addr, PAGE_SIZE);
17261 +       }
17262 +}
17263 +
17264 +/*
17265 + * Certain areas of memory on x86 require very specific protection flags,
17266 + * for example the BIOS area or kernel text. Callers don't always get this
17267 + * right (again, ioremap() on BIOS memory is not uncommon) so this function
17268 + * checks and fixes these known static required protection bits.
17269 + */
17270 +static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
17271 +                                  unsigned long pfn)
17272 +{
17273 +       pgprot_t forbidden = __pgprot(0);
17274 +
17275 +#ifndef CONFIG_XEN
17276 +       /*
17277 +        * The BIOS area between 640k and 1Mb needs to be executable for
17278 +        * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
17279 +        */
17280 +       if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
17281 +               pgprot_val(forbidden) |= _PAGE_NX;
17282 +#endif
17283 +
17284 +       /*
17285 +        * The kernel text needs to be executable for obvious reasons
17286 +        * Does not cover __inittext since that is gone later on. On
17287 +        * 64bit we do not enforce !NX on the low mapping
17288 +        */
17289 +       if (within(address, (unsigned long)_text, (unsigned long)_etext))
17290 +               pgprot_val(forbidden) |= _PAGE_NX;
17291 +
17292 +       /*
17293 +        * The .rodata section needs to be read-only. Using the pfn
17294 +        * catches all aliases.
17295 +        */
17296 +       if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
17297 +                  __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
17298 +               pgprot_val(forbidden) |= _PAGE_RW;
17299 +
17300 +       prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
17301 +
17302 +       return prot;
17303 +}
17304 +
17305 +/*
17306 + * Lookup the page table entry for a virtual address. Return a pointer
17307 + * to the entry and the level of the mapping.
17308 + *
17309 + * Note: We return pud and pmd either when the entry is marked large
17310 + * or when the present bit is not set. Otherwise we would return a
17311 + * pointer to a nonexisting mapping.
17312 + */
17313 +pte_t *lookup_address(unsigned long address, unsigned int *level)
17314 +{
17315 +       pgd_t *pgd = pgd_offset_k(address);
17316 +       pud_t *pud;
17317 +       pmd_t *pmd;
17318 +
17319 +       *level = PG_LEVEL_NONE;
17320 +
17321 +       if (pgd_none(*pgd))
17322 +               return NULL;
17323 +
17324 +       pud = pud_offset(pgd, address);
17325 +       if (pud_none(*pud))
17326 +               return NULL;
17327 +
17328 +       *level = PG_LEVEL_1G;
17329 +       if (pud_large(*pud) || !pud_present(*pud))
17330 +               return (pte_t *)pud;
17331 +
17332 +       pmd = pmd_offset(pud, address);
17333 +       if (pmd_none(*pmd))
17334 +               return NULL;
17335 +
17336 +       *level = PG_LEVEL_2M;
17337 +       if (pmd_large(*pmd) || !pmd_present(*pmd))
17338 +               return (pte_t *)pmd;
17339 +
17340 +       *level = PG_LEVEL_4K;
17341 +
17342 +       return pte_offset_kernel(pmd, address);
17343 +}
17344 +
17345 +/*
17346 + * Set the new pmd in all the pgds we know about:
17347 + */
17348 +static void __set_pmd_pte(pte_t *kpte, unsigned long address,
17349 +                         unsigned int level, pte_t pte)
17350 +{
17351 +       /* change init_mm */
17352 +       switch(level) {
17353 +       case PG_LEVEL_2M:
17354 +               xen_l2_entry_update((pmd_t *)kpte, __pmd_ma(__pte_val(pte)));
17355 +               break;
17356 +#ifdef CONFIG_X86_64
17357 +       case PG_LEVEL_1G:
17358 +               xen_l3_entry_update((pud_t *)kpte, __pud_ma(__pte_val(pte)));
17359 +               break;
17360 +#endif
17361 +       default:
17362 +               BUG();
17363 +       }
17364 +#ifdef CONFIG_X86_32
17365 +       if (!SHARED_KERNEL_PMD) {
17366 +               struct page *page;
17367 +
17368 +               list_for_each_entry(page, &pgd_list, lru) {
17369 +                       pgd_t *pgd;
17370 +                       pud_t *pud;
17371 +                       pmd_t *pmd;
17372 +
17373 +                       pgd = (pgd_t *)page_address(page) + pgd_index(address);
17374 +                       pud = pud_offset(pgd, address);
17375 +                       pmd = pmd_offset(pud, address);
17376 +                       xen_l2_entry_update(pmd, __pmd_ma(__pte_val(pte)));
17377 +               }
17378 +       }
17379 +#endif
17380 +}
17381 +
17382 +static int
17383 +try_preserve_large_page(pte_t *kpte, unsigned long address,
17384 +                       struct cpa_data *cpa)
17385 +{
17386 +       unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
17387 +       pte_t new_pte, old_pte, *tmp;
17388 +       pgprot_t old_prot, new_prot;
17389 +       int i, do_split = 1;
17390 +       unsigned int level;
17391 +
17392 +       spin_lock_irqsave(&pgd_lock, flags);
17393 +       /*
17394 +        * Check for races, another CPU might have split this page
17395 +        * up already:
17396 +        */
17397 +       tmp = lookup_address(address, &level);
17398 +       if (tmp != kpte)
17399 +               goto out_unlock;
17400 +
17401 +       switch (level) {
17402 +       case PG_LEVEL_2M:
17403 +               psize = PMD_PAGE_SIZE;
17404 +               pmask = PMD_PAGE_MASK;
17405 +               break;
17406 +#ifdef CONFIG_X86_64
17407 +       case PG_LEVEL_1G:
17408 +               psize = PUD_PAGE_SIZE;
17409 +               pmask = PUD_PAGE_MASK;
17410 +               break;
17411 +#endif
17412 +       default:
17413 +               do_split = -EINVAL;
17414 +               goto out_unlock;
17415 +       }
17416 +
17417 +       /*
17418 +        * Calculate the number of pages, which fit into this large
17419 +        * page starting at address:
17420 +        */
17421 +       nextpage_addr = (address + psize) & pmask;
17422 +       numpages = (nextpage_addr - address) >> PAGE_SHIFT;
17423 +       if (numpages < cpa->numpages)
17424 +               cpa->numpages = numpages;
17425 +
17426 +       /*
17427 +        * We are safe now. Check whether the new pgprot is the same:
17428 +        */
17429 +       old_pte = *kpte;
17430 +       old_prot = new_prot = pte_pgprot(old_pte);
17431 +
17432 +       pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
17433 +       pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
17434 +
17435 +       /*
17436 +        * old_pte points to the large page base address. So we need
17437 +        * to add the offset of the virtual address:
17438 +        */
17439 +       pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
17440 +       cpa->pfn = pfn;
17441 +
17442 +       new_prot = static_protections(new_prot, address, pfn);
17443 +
17444 +       /*
17445 +        * We need to check the full range, whether
17446 +        * static_protections() requires a different pgprot for one of
17447 +        * the pages in the range we try to preserve:
17448 +        */
17449 +       if (pfn < max_mapnr) {
17450 +               addr = address + PAGE_SIZE;
17451 +               for (i = 1; i < cpa->numpages && ++pfn < max_mapnr;
17452 +                    i++, addr += PAGE_SIZE) {
17453 +                       pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
17454 +
17455 +                       if (pgprot_val(chk_prot) != pgprot_val(new_prot))
17456 +                               goto out_unlock;
17457 +               }
17458 +       }
17459 +
17460 +       /*
17461 +        * If there are no changes, return. cpa->numpages has been updated
17462 +        * above:
17463 +        */
17464 +       if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
17465 +               do_split = 0;
17466 +               goto out_unlock;
17467 +       }
17468 +
17469 +       /*
17470 +        * We need to change the attributes. Check, whether we can
17471 +        * change the large page in one go. We request a split, when
17472 +        * the address is not aligned or the number of pages is
17473 +        * smaller than the number of pages in the large page. Note
17474 +        * that we limited the number of possible pages already to
17475 +        * the number of pages in the large page.
17476 +        */
17477 +       if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
17478 +               /*
17479 +                * The address is aligned and the number of pages
17480 +                * covers the full page.
17481 +                */
17482 +               new_pte = pfn_pte_ma(__pte_mfn(old_pte), canon_pgprot(new_prot));
17483 +               __set_pmd_pte(kpte, address, level, new_pte);
17484 +               cpa->flushtlb = 1;
17485 +               do_split = 0;
17486 +       }
17487 +
17488 +out_unlock:
17489 +       spin_unlock_irqrestore(&pgd_lock, flags);
17490 +
17491 +       return do_split;
17492 +}
17493 +
17494 +static LIST_HEAD(page_pool);
17495 +static unsigned long pool_size, pool_pages, pool_low;
17496 +static unsigned long pool_used, pool_failed;
17497 +
17498 +static void cpa_fill_pool(struct page **ret)
17499 +{
17500 +       gfp_t gfp = GFP_KERNEL;
17501 +       unsigned long flags;
17502 +       struct page *p;
17503 +
17504 +       /*
17505 +        * Avoid recursion (on debug-pagealloc) and also signal
17506 +        * our priority to get to these pagetables:
17507 +        */
17508 +       if (current->flags & PF_MEMALLOC)
17509 +               return;
17510 +       current->flags |= PF_MEMALLOC;
17511 +
17512 +       /*
17513 +        * Allocate atomically from atomic contexts:
17514 +        */
17515 +       if (in_atomic() || irqs_disabled() || debug_pagealloc)
17516 +               gfp =  GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
17517 +
17518 +       while (pool_pages < pool_size || (ret && !*ret)) {
17519 +               p = alloc_pages(gfp, 0);
17520 +               if (!p) {
17521 +                       pool_failed++;
17522 +                       break;
17523 +               }
17524 +               /*
17525 +                * If the call site needs a page right now, provide it:
17526 +                */
17527 +               if (ret && !*ret) {
17528 +                       *ret = p;
17529 +                       continue;
17530 +               }
17531 +               spin_lock_irqsave(&pgd_lock, flags);
17532 +               list_add(&p->lru, &page_pool);
17533 +               pool_pages++;
17534 +               spin_unlock_irqrestore(&pgd_lock, flags);
17535 +       }
17536 +
17537 +       current->flags &= ~PF_MEMALLOC;
17538 +}
17539 +
17540 +#define SHIFT_MB               (20 - PAGE_SHIFT)
17541 +#define ROUND_MB_GB            ((1 << 10) - 1)
17542 +#define SHIFT_MB_GB            10
17543 +#define POOL_PAGES_PER_GB      16
17544 +
17545 +void __init cpa_init(void)
17546 +{
17547 +       struct sysinfo si;
17548 +       unsigned long gb;
17549 +
17550 +       si_meminfo(&si);
17551 +       /*
17552 +        * Calculate the number of pool pages:
17553 +        *
17554 +        * Convert totalram (nr of pages) to MiB and round to the next
17555 +        * GiB. Shift MiB to GiB and multiply the result by
17556 +        * POOL_PAGES_PER_GB:
17557 +        */
17558 +       if (debug_pagealloc) {
17559 +               gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
17560 +               pool_size = POOL_PAGES_PER_GB * gb;
17561 +       } else {
17562 +               pool_size = 1;
17563 +       }
17564 +       pool_low = pool_size;
17565 +
17566 +       cpa_fill_pool(NULL);
17567 +       printk(KERN_DEBUG
17568 +              "CPA: page pool initialized %lu of %lu pages preallocated\n",
17569 +              pool_pages, pool_size);
17570 +}
17571 +
17572 +static int split_large_page(pte_t *kpte, unsigned long address)
17573 +{
17574 +       unsigned long flags, mfn, mfninc = 1;
17575 +       unsigned int i, level;
17576 +       pte_t *pbase, *tmp;
17577 +       pgprot_t ref_prot;
17578 +       struct page *base;
17579 +
17580 +       /*
17581 +        * Get a page from the pool. The pool list is protected by the
17582 +        * pgd_lock, which we have to take anyway for the split
17583 +        * operation:
17584 +        */
17585 +       spin_lock_irqsave(&pgd_lock, flags);
17586 +       if (list_empty(&page_pool)) {
17587 +               spin_unlock_irqrestore(&pgd_lock, flags);
17588 +               base = NULL;
17589 +               cpa_fill_pool(&base);
17590 +               if (!base)
17591 +                       return -ENOMEM;
17592 +               spin_lock_irqsave(&pgd_lock, flags);
17593 +       } else {
17594 +               base = list_first_entry(&page_pool, struct page, lru);
17595 +               list_del(&base->lru);
17596 +               pool_pages--;
17597 +
17598 +               if (pool_pages < pool_low)
17599 +                       pool_low = pool_pages;
17600 +       }
17601 +
17602 +       /*
17603 +        * Check for races, another CPU might have split this page
17604 +        * up for us already:
17605 +        */
17606 +       tmp = lookup_address(address, &level);
17607 +       if (tmp != kpte)
17608 +               goto out_unlock;
17609 +
17610 +       pbase = (pte_t *)page_address(base);
17611 +#ifdef CONFIG_X86_32
17612 +       paravirt_alloc_pt(&init_mm, page_to_pfn(base));
17613 +#endif
17614 +       ref_prot = pte_pgprot(pte_clrhuge(*kpte));
17615 +
17616 +#ifdef CONFIG_X86_64
17617 +       if (level == PG_LEVEL_1G) {
17618 +               mfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
17619 +               pgprot_val(ref_prot) |= _PAGE_PSE;
17620 +       }
17621 +#endif
17622 +
17623 +       /*
17624 +        * Get the target mfn from the original entry:
17625 +        */
17626 +       mfn = __pte_mfn(*kpte);
17627 +       for (i = 0; i < PTRS_PER_PTE; i++, mfn += mfninc)
17628 +               set_pte(&pbase[i], pfn_pte_ma(mfn, ref_prot));
17629 +
17630 +       /*
17631 +        * Install the new, split up pagetable. Important details here:
17632 +        *
17633 +        * On Intel the NX bit of all levels must be cleared to make a
17634 +        * page executable. See section 4.13.2 of Intel 64 and IA-32
17635 +        * Architectures Software Developer's Manual).
17636 +        *
17637 +        * Mark the entry present. The current mapping might be
17638 +        * set to not present, which we preserved above.
17639 +        */
17640 +       if (HYPERVISOR_update_va_mapping((unsigned long)pbase,
17641 +                                        mk_pte(base, PAGE_KERNEL_RO), 0))
17642 +               BUG();
17643 +       ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
17644 +       pgprot_val(ref_prot) |= _PAGE_PRESENT;
17645 +       __set_pmd_pte(kpte, address, level, mk_pte(base, ref_prot));
17646 +       base = NULL;
17647 +
17648 +out_unlock:
17649 +       /*
17650 +        * If we dropped out via the lookup_address check under
17651 +        * pgd_lock then stick the page back into the pool:
17652 +        */
17653 +       if (base) {
17654 +               list_add(&base->lru, &page_pool);
17655 +               pool_pages++;
17656 +       } else
17657 +               pool_used++;
17658 +       spin_unlock_irqrestore(&pgd_lock, flags);
17659 +
17660 +       return 0;
17661 +}
17662 +
17663 +static int __change_page_attr(struct cpa_data *cpa, int primary)
17664 +{
17665 +       unsigned long address = cpa->vaddr;
17666 +       int do_split, err;
17667 +       unsigned int level;
17668 +       pte_t *kpte, old_pte;
17669 +
17670 +repeat:
17671 +       kpte = lookup_address(address, &level);
17672 +       if (!kpte)
17673 +               return primary ? -EINVAL : 0;
17674 +
17675 +       old_pte = *kpte;
17676 +       if (!__pte_val(old_pte)) {
17677 +               if (!primary)
17678 +                       return 0;
17679 +               printk(KERN_WARNING "CPA: called for zero pte. "
17680 +                      "vaddr = %lx cpa->vaddr = %lx\n", address,
17681 +                      cpa->vaddr);
17682 +               WARN_ON(1);
17683 +               return -EINVAL;
17684 +       }
17685 +
17686 +       if (level == PG_LEVEL_4K) {
17687 +               pte_t new_pte;
17688 +               pgprot_t new_prot = pte_pgprot(old_pte);
17689 +               unsigned long mfn = __pte_mfn(old_pte);
17690 +
17691 +               pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
17692 +               pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
17693 +
17694 +               new_prot = static_protections(new_prot, address,
17695 +                                             mfn_to_local_pfn(mfn));
17696 +
17697 +               /*
17698 +                * We need to keep the mfn from the existing PTE,
17699 +                * after all we're only going to change its attributes
17700 +                * not the memory it points to
17701 +                */
17702 +               new_pte = pfn_pte_ma(mfn, canon_pgprot(new_prot));
17703 +               cpa->pfn = mfn_to_local_pfn(mfn);
17704 +               /*
17705 +                * Do we really change anything ?
17706 +                */
17707 +               if (__pte_val(old_pte) != __pte_val(new_pte)) {
17708 +                       set_pte_atomic(kpte, new_pte);
17709 +                       cpa->flushtlb = 1;
17710 +               }
17711 +               cpa->numpages = 1;
17712 +               return 0;
17713 +       }
17714 +
17715 +       /*
17716 +        * Check, whether we can keep the large page intact
17717 +        * and just change the pte:
17718 +        */
17719 +       do_split = try_preserve_large_page(kpte, address, cpa);
17720 +       /*
17721 +        * When the range fits into the existing large page,
17722 +        * return. cpa->numpages and cpa->flushtlb have been updated in
17723 +        * try_preserve_large_page:
17724 +        */
17725 +       if (do_split <= 0)
17726 +               return do_split;
17727 +
17728 +       /*
17729 +        * We have to split the large page:
17730 +        */
17731 +       err = split_large_page(kpte, address);
17732 +       if (!err) {
17733 +               cpa->flushtlb = 1;
17734 +               goto repeat;
17735 +       }
17736 +
17737 +       return err;
17738 +}
17739 +
17740 +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
17741 +
17742 +static int cpa_process_alias(struct cpa_data *cpa)
17743 +{
17744 +       struct cpa_data alias_cpa;
17745 +       int ret = 0;
17746 +
17747 +       if (cpa->pfn > max_pfn_mapped)
17748 +               return 0;
17749 +
17750 +       /*
17751 +        * No need to redo, when the primary call touched the direct
17752 +        * mapping already:
17753 +        */
17754 +       if (!within(cpa->vaddr, PAGE_OFFSET,
17755 +                   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
17756 +
17757 +               alias_cpa = *cpa;
17758 +               alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
17759 +
17760 +               ret = __change_page_attr_set_clr(&alias_cpa, 0);
17761 +       }
17762 +
17763 +#ifdef CONFIG_X86_64
17764 +       if (ret)
17765 +               return ret;
17766 +       /*
17767 +        * No need to redo, when the primary call touched the high
17768 +        * mapping already:
17769 +        */
17770 +       if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
17771 +               return 0;
17772 +
17773 +       /*
17774 +        * If the physical address is inside the kernel map, we need
17775 +        * to touch the high mapped kernel as well:
17776 +        */
17777 +       if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
17778 +               return 0;
17779 +
17780 +       alias_cpa = *cpa;
17781 +       alias_cpa.vaddr =
17782 +               (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map;
17783 +
17784 +       /*
17785 +        * The high mapping range is imprecise, so ignore the return value.
17786 +        */
17787 +       __change_page_attr_set_clr(&alias_cpa, 0);
17788 +#endif
17789 +       return ret;
17790 +}
17791 +
17792 +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
17793 +{
17794 +       int ret, numpages = cpa->numpages;
17795 +
17796 +       while (numpages) {
17797 +               /*
17798 +                * Store the remaining nr of pages for the large page
17799 +                * preservation check.
17800 +                */
17801 +               cpa->numpages = numpages;
17802 +
17803 +               ret = __change_page_attr(cpa, checkalias);
17804 +               if (ret)
17805 +                       return ret;
17806 +
17807 +               if (checkalias) {
17808 +                       ret = cpa_process_alias(cpa);
17809 +                       if (ret)
17810 +                               return ret;
17811 +               }
17812 +
17813 +               /*
17814 +                * Adjust the number of pages with the result of the
17815 +                * CPA operation. Either a large page has been
17816 +                * preserved or a single page update happened.
17817 +                */
17818 +               BUG_ON(cpa->numpages > numpages);
17819 +               numpages -= cpa->numpages;
17820 +               cpa->vaddr += cpa->numpages * PAGE_SIZE;
17821 +       }
17822 +       return 0;
17823 +}
17824 +
17825 +static inline int cache_attr(pgprot_t attr)
17826 +{
17827 +       return pgprot_val(attr) &
17828 +               (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
17829 +}
17830 +
17831 +static int change_page_attr_set_clr(unsigned long addr, int numpages,
17832 +                                   pgprot_t mask_set, pgprot_t mask_clr)
17833 +{
17834 +       struct cpa_data cpa;
17835 +       int ret, cache, checkalias;
17836 +
17837 +       /*
17838 +        * Check, if we are requested to change a not supported
17839 +        * feature:
17840 +        */
17841 +       mask_set = canon_pgprot(mask_set);
17842 +       mask_clr = canon_pgprot(mask_clr);
17843 +       if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
17844 +               return 0;
17845 +
17846 +       /* Ensure we are PAGE_SIZE aligned */
17847 +       if (addr & ~PAGE_MASK) {
17848 +               addr &= PAGE_MASK;
17849 +               /*
17850 +                * People should not be passing in unaligned addresses:
17851 +                */
17852 +               WARN_ON_ONCE(1);
17853 +       }
17854 +
17855 +       cpa.vaddr = addr;
17856 +       cpa.numpages = numpages;
17857 +       cpa.mask_set = mask_set;
17858 +       cpa.mask_clr = mask_clr;
17859 +       cpa.flushtlb = 0;
17860 +
17861 +       /* No alias checking for _NX bit modifications */
17862 +       checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
17863 +
17864 +       ret = __change_page_attr_set_clr(&cpa, checkalias);
17865 +
17866 +       /*
17867 +        * Check whether we really changed something:
17868 +        */
17869 +       if (!cpa.flushtlb)
17870 +               goto out;
17871 +
17872 +       /*
17873 +        * No need to flush, when we did not set any of the caching
17874 +        * attributes:
17875 +        */
17876 +       cache = cache_attr(mask_set);
17877 +
17878 +       /*
17879 +        * On success we use clflush, when the CPU supports it to
17880 +        * avoid the wbinvd. If the CPU does not support it and in the
17881 +        * error case we fall back to cpa_flush_all (which uses
17882 +        * wbinvd):
17883 +        */
17884 +       if (!ret && cpu_has_clflush)
17885 +               cpa_flush_range(addr, numpages, cache);
17886 +       else
17887 +               cpa_flush_all(cache);
17888 +
17889 +out:
17890 +       cpa_fill_pool(NULL);
17891 +
17892 +       return ret;
17893 +}
17894 +
17895 +static inline int change_page_attr_set(unsigned long addr, int numpages,
17896 +                                      pgprot_t mask)
17897 +{
17898 +       return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
17899 +}
17900 +
17901 +static inline int change_page_attr_clear(unsigned long addr, int numpages,
17902 +                                        pgprot_t mask)
17903 +{
17904 +       return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
17905 +}
17906 +
17907 +int set_memory_uc(unsigned long addr, int numpages)
17908 +{
17909 +       return change_page_attr_set(addr, numpages,
17910 +                                   __pgprot(_PAGE_PCD));
17911 +}
17912 +EXPORT_SYMBOL(set_memory_uc);
17913 +
17914 +int set_memory_wb(unsigned long addr, int numpages)
17915 +{
17916 +       return change_page_attr_clear(addr, numpages,
17917 +                                     __pgprot(_PAGE_PCD | _PAGE_PWT));
17918 +}
17919 +EXPORT_SYMBOL(set_memory_wb);
17920 +
17921 +int set_memory_x(unsigned long addr, int numpages)
17922 +{
17923 +       return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
17924 +}
17925 +EXPORT_SYMBOL(set_memory_x);
17926 +
17927 +int set_memory_nx(unsigned long addr, int numpages)
17928 +{
17929 +       return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
17930 +}
17931 +EXPORT_SYMBOL(set_memory_nx);
17932 +
17933 +int set_memory_ro(unsigned long addr, int numpages)
17934 +{
17935 +       return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
17936 +}
17937 +
17938 +int set_memory_rw(unsigned long addr, int numpages)
17939 +{
17940 +       return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
17941 +}
17942 +
17943 +int set_memory_np(unsigned long addr, int numpages)
17944 +{
17945 +       return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
17946 +}
17947 +
17948 +int set_pages_uc(struct page *page, int numpages)
17949 +{
17950 +       unsigned long addr = (unsigned long)page_address(page);
17951 +
17952 +       return set_memory_uc(addr, numpages);
17953 +}
17954 +EXPORT_SYMBOL(set_pages_uc);
17955 +
17956 +int set_pages_wb(struct page *page, int numpages)
17957 +{
17958 +       unsigned long addr = (unsigned long)page_address(page);
17959 +
17960 +       return set_memory_wb(addr, numpages);
17961 +}
17962 +EXPORT_SYMBOL(set_pages_wb);
17963 +
17964 +int set_pages_x(struct page *page, int numpages)
17965 +{
17966 +       unsigned long addr = (unsigned long)page_address(page);
17967 +
17968 +       return set_memory_x(addr, numpages);
17969 +}
17970 +EXPORT_SYMBOL(set_pages_x);
17971 +
17972 +int set_pages_nx(struct page *page, int numpages)
17973 +{
17974 +       unsigned long addr = (unsigned long)page_address(page);
17975 +
17976 +       return set_memory_nx(addr, numpages);
17977 +}
17978 +EXPORT_SYMBOL(set_pages_nx);
17979 +
17980 +int set_pages_ro(struct page *page, int numpages)
17981 +{
17982 +       unsigned long addr = (unsigned long)page_address(page);
17983 +
17984 +       return set_memory_ro(addr, numpages);
17985 +}
17986 +
17987 +int set_pages_rw(struct page *page, int numpages)
17988 +{
17989 +       unsigned long addr = (unsigned long)page_address(page);
17990 +
17991 +       return set_memory_rw(addr, numpages);
17992 +}
17993 +
17994 +#ifdef CONFIG_DEBUG_PAGEALLOC
17995 +
17996 +static int __set_pages_p(struct page *page, int numpages)
17997 +{
17998 +       struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
17999 +                               .numpages = numpages,
18000 +                               .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
18001 +                               .mask_clr = __pgprot(0)};
18002 +
18003 +       return __change_page_attr_set_clr(&cpa, 1);
18004 +}
18005 +
18006 +static int __set_pages_np(struct page *page, int numpages)
18007 +{
18008 +       struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
18009 +                               .numpages = numpages,
18010 +                               .mask_set = __pgprot(0),
18011 +                               .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
18012 +
18013 +       return __change_page_attr_set_clr(&cpa, 1);
18014 +}
18015 +
18016 +void kernel_map_pages(struct page *page, int numpages, int enable)
18017 +{
18018 +       if (PageHighMem(page))
18019 +               return;
18020 +       if (!enable) {
18021 +               debug_check_no_locks_freed(page_address(page),
18022 +                                          numpages * PAGE_SIZE);
18023 +       }
18024 +
18025 +       /*
18026 +        * If page allocator is not up yet then do not call c_p_a():
18027 +        */
18028 +       if (!debug_pagealloc_enabled)
18029 +               return;
18030 +
18031 +       /*
18032 +        * The return value is ignored as the calls cannot fail.
18033 +        * Large pages are kept enabled at boot time, and are
18034 +        * split up quickly with DEBUG_PAGEALLOC. If a splitup
18035 +        * fails here (due to temporary memory shortage) no damage
18036 +        * is done because we just keep the largepage intact up
18037 +        * to the next attempt when it will likely be split up:
18038 +        */
18039 +       if (enable)
18040 +               __set_pages_p(page, numpages);
18041 +       else
18042 +               __set_pages_np(page, numpages);
18043 +
18044 +       /*
18045 +        * We should perform an IPI and flush all tlbs,
18046 +        * but that can deadlock, so flush only the current CPU:
18047 +        */
18048 +       __flush_tlb_all();
18049 +
18050 +       /*
18051 +        * Try to refill the page pool here. We can do this only after
18052 +        * the tlb flush.
18053 +        */
18054 +       cpa_fill_pool(NULL);
18055 +}
18056 +
18057 +#ifdef CONFIG_HIBERNATION
18058 +
18059 +bool kernel_page_present(struct page *page)
18060 +{
18061 +       unsigned int level;
18062 +       pte_t *pte;
18063 +
18064 +       if (PageHighMem(page))
18065 +               return false;
18066 +
18067 +       pte = lookup_address((unsigned long)page_address(page), &level);
18068 +       return (__pte_val(*pte) & _PAGE_PRESENT);
18069 +}
18070 +
18071 +#endif /* CONFIG_HIBERNATION */
18072 +
18073 +#endif /* CONFIG_DEBUG_PAGEALLOC */
18074 +
18075 +static inline int in_secondary_range(unsigned long va)
18076 +{
18077 +#ifdef CONFIG_X86_64
18078 +       return va >= VMALLOC_START && va < VMALLOC_END;
18079 +#else
18080 +       return va >= (unsigned long)high_memory;
18081 +#endif
18082 +}
18083 +
18084 +static void __make_page_readonly(unsigned long va)
18085 +{
18086 +       pte_t *pte;
18087 +       unsigned int level;
18088 +
18089 +       pte = lookup_address(va, &level);
18090 +       BUG_ON(!pte || level != PG_LEVEL_4K);
18091 +       if (HYPERVISOR_update_va_mapping(va, pte_wrprotect(*pte), 0))
18092 +               BUG();
18093 +       if (in_secondary_range(va)) {
18094 +               unsigned long pfn = pte_pfn(*pte);
18095 +
18096 +#ifdef CONFIG_HIGHMEM
18097 +               if (pfn >= highstart_pfn)
18098 +                       kmap_flush_unused(); /* flush stale writable kmaps */
18099 +               else
18100 +#endif
18101 +                       __make_page_readonly((unsigned long)__va(pfn << PAGE_SHIFT));
18102 +       }
18103 +}
18104 +
18105 +static void __make_page_writable(unsigned long va)
18106 +{
18107 +       pte_t *pte;
18108 +       unsigned int level;
18109 +
18110 +       pte = lookup_address(va, &level);
18111 +       BUG_ON(!pte || level != PG_LEVEL_4K);
18112 +       if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
18113 +               BUG();
18114 +       if (in_secondary_range(va)) {
18115 +               unsigned long pfn = pte_pfn(*pte);
18116 +
18117 +#ifdef CONFIG_HIGHMEM
18118 +               if (pfn < highstart_pfn)
18119 +#endif
18120 +                       __make_page_writable((unsigned long)__va(pfn << PAGE_SHIFT));
18121 +       }
18122 +}
18123 +
18124 +void make_page_readonly(void *va, unsigned int feature)
18125 +{
18126 +       if (!xen_feature(feature))
18127 +               __make_page_readonly((unsigned long)va);
18128 +}
18129 +
18130 +void make_page_writable(void *va, unsigned int feature)
18131 +{
18132 +       if (!xen_feature(feature))
18133 +               __make_page_writable((unsigned long)va);
18134 +}
18135 +
18136 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
18137 +{
18138 +       unsigned long addr;
18139 +
18140 +       if (xen_feature(feature))
18141 +               return;
18142 +
18143 +       for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
18144 +               __make_page_readonly(addr);
18145 +}
18146 +
18147 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
18148 +{
18149 +       unsigned long addr;
18150 +
18151 +       if (xen_feature(feature))
18152 +               return;
18153 +
18154 +       for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
18155 +               __make_page_writable(addr);
18156 +}
18157 +
18158 +/*
18159 + * The testcases use internal knowledge of the implementation that shouldn't
18160 + * be exposed to the rest of the kernel. Include these directly here.
18161 + */
18162 +#ifdef CONFIG_CPA_DEBUG
18163 +#include "pageattr-test.c"
18164 +#endif
18165 --- a/arch/x86/mm/pgtable_32-xen.c
18166 +++ b/arch/x86/mm/pgtable_32-xen.c
18167 @@ -29,8 +29,6 @@
18168  #include <xen/features.h>
18169  #include <asm/hypervisor.h>
18170
18171 -static void pgd_test_and_unpin(pgd_t *pgd);
18172 -
18173  void show_mem(void)
18174  {
18175         int total = 0, reserved = 0;
18176 @@ -167,53 +165,6 @@ pte_t *pte_alloc_one_kernel(struct mm_st
18177         return pte;
18178  }
18179
18180 -static void _pte_free(struct page *page, unsigned int order)
18181 -{
18182 -       BUG_ON(order);
18183 -       pte_free(page);
18184 -}
18185 -
18186 -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
18187 -{
18188 -       struct page *pte;
18189 -
18190 -#ifdef CONFIG_HIGHPTE
18191 -       pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
18192 -#else
18193 -       pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
18194 -#endif
18195 -       if (pte) {
18196 -               SetPageForeign(pte, _pte_free);
18197 -               init_page_count(pte);
18198 -       }
18199 -       return pte;
18200 -}
18201 -
18202 -void pte_free(struct page *pte)
18203 -{
18204 -       unsigned long pfn = page_to_pfn(pte);
18205 -
18206 -       if (!PageHighMem(pte)) {
18207 -               unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT);
18208 -
18209 -               if (!pte_write(*virt_to_ptep(va)))
18210 -                       if (HYPERVISOR_update_va_mapping(
18211 -                               va, pfn_pte(pfn, PAGE_KERNEL), 0))
18212 -                               BUG();
18213 -       } else
18214 -               ClearPagePinned(pte);
18215 -
18216 -       ClearPageForeign(pte);
18217 -       init_page_count(pte);
18218 -
18219 -       __free_page(pte);
18220 -}
18221 -
18222 -void pmd_ctor(struct kmem_cache *cache, void *pmd)
18223 -{
18224 -       memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18225 -}
18226 -
18227  /*
18228   * List of all pgd's needed for non-PAE so it can invalidate entries
18229   * in both cached and uncached pgd's; not needed for PAE since the
18230 @@ -224,224 +175,191 @@ void pmd_ctor(struct kmem_cache *cache,
18231   * vmalloc faults work because attached pagetables are never freed.
18232   * -- wli
18233   */
18234 -DEFINE_SPINLOCK(pgd_lock);
18235 -struct page *pgd_list;
18236 -
18237  static inline void pgd_list_add(pgd_t *pgd)
18238  {
18239         struct page *page = virt_to_page(pgd);
18240 -       page->index = (unsigned long)pgd_list;
18241 -       if (pgd_list)
18242 -               set_page_private(pgd_list, (unsigned long)&page->index);
18243 -       pgd_list = page;
18244 -       set_page_private(page, (unsigned long)&pgd_list);
18245 +
18246 +       list_add(&page->lru, &pgd_list);
18247  }
18248
18249  static inline void pgd_list_del(pgd_t *pgd)
18250  {
18251 -       struct page *next, **pprev, *page = virt_to_page(pgd);
18252 -       next = (struct page *)page->index;
18253 -       pprev = (struct page **)page_private(page);
18254 -       *pprev = next;
18255 -       if (next)
18256 -               set_page_private(next, (unsigned long)pprev);
18257 -}
18258 +       struct page *page = virt_to_page(pgd);
18259
18260 +       list_del(&page->lru);
18261 +}
18262
18263 +#define UNSHARED_PTRS_PER_PGD                          \
18264 +       (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
18265
18266 -#if (PTRS_PER_PMD == 1)
18267 -/* Non-PAE pgd constructor */
18268 -static void pgd_ctor(void *pgd)
18269 +static void pgd_ctor(void *p)
18270  {
18271 +       pgd_t *pgd = p;
18272         unsigned long flags;
18273
18274 -       /* !PAE, no pagetable sharing */
18275 +       pgd_test_and_unpin(pgd);
18276 +
18277 +       /* Clear usermode parts of PGD */
18278         memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
18279
18280         spin_lock_irqsave(&pgd_lock, flags);
18281
18282 -       /* must happen under lock */
18283 -       clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
18284 -                       swapper_pg_dir + USER_PTRS_PER_PGD,
18285 -                       KERNEL_PGD_PTRS);
18286 -
18287 -       paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
18288 -                               __pa(swapper_pg_dir) >> PAGE_SHIFT,
18289 -                               USER_PTRS_PER_PGD,
18290 -                               KERNEL_PGD_PTRS);
18291 -       pgd_list_add(pgd);
18292 -       spin_unlock_irqrestore(&pgd_lock, flags);
18293 -}
18294 -#else  /* PTRS_PER_PMD > 1 */
18295 -/* PAE pgd constructor */
18296 -static void pgd_ctor(void *pgd)
18297 -{
18298 -       /* PAE, kernel PMD may be shared */
18299 -
18300 -       if (SHARED_KERNEL_PMD) {
18301 -               clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
18302 +       /* If the pgd points to a shared pagetable level (either the
18303 +          ptes in non-PAE, or shared PMD in PAE), then just copy the
18304 +          references from swapper_pg_dir. */
18305 +       if (PAGETABLE_LEVELS == 2 ||
18306 +           (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
18307 +               clone_pgd_range(pgd + USER_PTRS_PER_PGD,
18308                                 swapper_pg_dir + USER_PTRS_PER_PGD,
18309                                 KERNEL_PGD_PTRS);
18310 -       } else {
18311 -               memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
18312 +               paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
18313 +                                       __pa(swapper_pg_dir) >> PAGE_SHIFT,
18314 +                                       USER_PTRS_PER_PGD,
18315 +                                       KERNEL_PGD_PTRS);
18316         }
18317 +
18318 +       /* list required to sync kernel mapping updates */
18319 +       if (PAGETABLE_LEVELS == 2)
18320 +               pgd_list_add(pgd);
18321 +
18322 +       spin_unlock_irqrestore(&pgd_lock, flags);
18323  }
18324 -#endif /* PTRS_PER_PMD */
18325
18326  static void pgd_dtor(void *pgd)
18327  {
18328         unsigned long flags; /* can be called from interrupt context */
18329
18330 -       if (SHARED_KERNEL_PMD)
18331 -               return;
18332 -
18333 -       paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
18334 -       spin_lock_irqsave(&pgd_lock, flags);
18335 -       pgd_list_del(pgd);
18336 -       spin_unlock_irqrestore(&pgd_lock, flags);
18337 +       if (!SHARED_KERNEL_PMD) {
18338 +               spin_lock_irqsave(&pgd_lock, flags);
18339 +               pgd_list_del(pgd);
18340 +               spin_unlock_irqrestore(&pgd_lock, flags);
18341 +       }
18342
18343         pgd_test_and_unpin(pgd);
18344  }
18345
18346 -#define UNSHARED_PTRS_PER_PGD                          \
18347 -       (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
18348 -
18349 -/* If we allocate a pmd for part of the kernel address space, then
18350 -   make sure its initialized with the appropriate kernel mappings.
18351 -   Otherwise use a cached zeroed pmd.  */
18352 -static pmd_t *pmd_cache_alloc(int idx)
18353 +#ifdef CONFIG_X86_PAE
18354 +/*
18355 + * Mop up any pmd pages which may still be attached to the pgd.
18356 + * Normally they will be freed by munmap/exit_mmap, but any pmd we
18357 + * preallocate which never got a corresponding vma will need to be
18358 + * freed manually.
18359 + */
18360 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
18361  {
18362 -       pmd_t *pmd;
18363 +       int i;
18364
18365 -       if (idx >= USER_PTRS_PER_PGD) {
18366 -               pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
18367 +       for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
18368 +               pgd_t pgd = pgdp[i];
18369
18370 -#ifndef CONFIG_XEN
18371 -               if (pmd)
18372 -                       memcpy(pmd,
18373 -                              (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
18374 -                              sizeof(pmd_t) * PTRS_PER_PMD);
18375 -#endif
18376 -       } else
18377 -               pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
18378 +               if (__pgd_val(pgd) != 0) {
18379 +                       pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
18380
18381 -       return pmd;
18382 -}
18383 +                       pgdp[i] = xen_make_pgd(0);
18384
18385 -static void pmd_cache_free(pmd_t *pmd, int idx)
18386 -{
18387 -       if (idx >= USER_PTRS_PER_PGD) {
18388 -               make_lowmem_page_writable(pmd, XENFEAT_writable_page_tables);
18389 -               memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18390 -               free_page((unsigned long)pmd);
18391 -       } else
18392 -               kmem_cache_free(pmd_cache, pmd);
18393 +                       paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
18394 +                       pmd_free(mm, pmd);
18395 +               }
18396 +       }
18397  }
18398
18399 -pgd_t *pgd_alloc(struct mm_struct *mm)
18400 +/*
18401 + * In PAE mode, we need to do a cr3 reload (=tlb flush) when
18402 + * updating the top-level pagetable entries to guarantee the
18403 + * processor notices the update.  Since this is expensive, and
18404 + * all 4 top-level entries are used almost immediately in a
18405 + * new process's life, we just pre-populate them here.
18406 + *
18407 + * Also, if we're in a paravirt environment where the kernel pmd is
18408 + * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
18409 + * and initialize the kernel pmds here.
18410 + */
18411 +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
18412  {
18413 +       pud_t *pud;
18414 +       pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
18415 +       unsigned long addr, flags;
18416         int i;
18417 -       pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
18418 -       pmd_t **pmds = NULL;
18419 -       unsigned long flags;
18420 -
18421 -       pgd_test_and_unpin(pgd);
18422 -
18423 -       if (PTRS_PER_PMD == 1 || !pgd)
18424 -               return pgd;
18425 -
18426 -#ifdef CONFIG_XEN
18427 -       if (!SHARED_KERNEL_PMD) {
18428 -               /*
18429 -                * We can race save/restore (if we sleep during a GFP_KERNEL memory
18430 -                * allocation). We therefore store virtual addresses of pmds as they
18431 -                * do not change across save/restore, and poke the machine addresses
18432 -                * into the pgdir under the pgd_lock.
18433 -                */
18434 -               pmds = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
18435 -               if (!pmds) {
18436 -                       quicklist_free(0, pgd_dtor, pgd);
18437 -                       return NULL;
18438 -               }
18439 -       }
18440 -#endif
18441
18442 -       /* Allocate pmds, remember virtual addresses. */
18443 -       for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
18444 -               pmd_t *pmd = pmd_cache_alloc(i);
18445 -
18446 -               if (!pmd)
18447 +       /*
18448 +        * We can race save/restore (if we sleep during a GFP_KERNEL memory
18449 +        * allocation). We therefore store virtual addresses of pmds as they
18450 +        * do not change across save/restore, and poke the machine addresses
18451 +        * into the pgdir under the pgd_lock.
18452 +        */
18453 +       for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
18454 +               pmds[i] = pmd_alloc_one(mm, addr);
18455 +               if (!pmds[i])
18456                         goto out_oom;
18457 -
18458 -               paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
18459 -               if (pmds)
18460 -                       pmds[i] = pmd;
18461 -               else
18462 -                       set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
18463         }
18464
18465 -#ifdef CONFIG_XEN
18466 -       if (SHARED_KERNEL_PMD)
18467 -               return pgd;
18468 -
18469         spin_lock_irqsave(&pgd_lock, flags);
18470
18471         /* Protect against save/restore: move below 4GB under pgd_lock. */
18472 -       if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
18473 -               int rc = xen_create_contiguous_region(
18474 -                       (unsigned long)pgd, 0, 32);
18475 -               if (rc) {
18476 -                       spin_unlock_irqrestore(&pgd_lock, flags);
18477 -                       goto out_oom;
18478 -               }
18479 +       if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
18480 +           && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
18481 +               spin_unlock_irqrestore(&pgd_lock, flags);
18482 +out_oom:
18483 +               while (i--)
18484 +                       pmd_free(mm, pmds[i]);
18485 +               return 0;
18486         }
18487
18488         /* Copy kernel pmd contents and write-protect the new pmds. */
18489 -       for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
18490 -               memcpy(pmds[i],
18491 -                      (void *)pgd_page_vaddr(swapper_pg_dir[i]),
18492 -                      sizeof(pmd_t) * PTRS_PER_PMD);
18493 -               make_lowmem_page_readonly(
18494 -                       pmds[i], XENFEAT_writable_page_tables);
18495 -       }
18496 +       pud = pud_offset(pgd, 0);
18497 +       for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
18498 +            i++, pud++, addr += PUD_SIZE) {
18499 +               if (i >= USER_PTRS_PER_PGD) {
18500 +                       memcpy(pmds[i],
18501 +                              (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
18502 +                              sizeof(pmd_t) * PTRS_PER_PMD);
18503 +                       make_lowmem_page_readonly(
18504 +                               pmds[i], XENFEAT_writable_page_tables);
18505 +               }
18506
18507 -       /* It is safe to poke machine addresses of pmds under the pmd_lock. */
18508 -       for (i = 0; i < PTRS_PER_PGD; i++)
18509 -               set_pgd(&pgd[i], __pgd(1 + __pa(pmds[i])));
18510 +               /* It is safe to poke machine addresses of pmds under the pgd_lock. */
18511 +               pud_populate(mm, pud, pmds[i]);
18512 +       }
18513
18514 -       /* Ensure this pgd gets picked up and pinned on save/restore. */
18515 +       /* List required to sync kernel mapping updates and
18516 +        * to pin/unpin on save/restore. */
18517         pgd_list_add(pgd);
18518
18519         spin_unlock_irqrestore(&pgd_lock, flags);
18520
18521 -       kfree(pmds);
18522 -#endif
18523 +       return 1;
18524 +}
18525 +#else  /* !CONFIG_X86_PAE */
18526 +/* No need to prepopulate any pagetable entries in non-PAE modes. */
18527 +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
18528 +{
18529 +       return 1;
18530 +}
18531
18532 -       return pgd;
18533 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
18534 +{
18535 +}
18536 +#endif /* CONFIG_X86_PAE */
18537
18538 -out_oom:
18539 -       if (!pmds) {
18540 -               for (i--; i >= 0; i--) {
18541 -                       pgd_t pgdent = pgd[i];
18542 -                       void* pmd = (void *)__va(pgd_val(pgdent)-1);
18543 -                       paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18544 -                       pmd_cache_free(pmd, i);
18545 -               }
18546 -       } else {
18547 -               for (i--; i >= 0; i--) {
18548 -                       paravirt_release_pd(__pa(pmds[i]) >> PAGE_SHIFT);
18549 -                       pmd_cache_free(pmds[i], i);
18550 -               }
18551 -               kfree(pmds);
18552 +pgd_t *pgd_alloc(struct mm_struct *mm)
18553 +{
18554 +       pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
18555 +
18556 +       /* so that alloc_pd can use it */
18557 +       mm->pgd = pgd;
18558 +       if (pgd)
18559 +               pgd_ctor(pgd);
18560 +
18561 +       if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
18562 +               free_page((unsigned long)pgd);
18563 +               pgd = NULL;
18564         }
18565 -       quicklist_free(0, pgd_dtor, pgd);
18566 -       return NULL;
18567 +
18568 +       return pgd;
18569  }
18570
18571 -void pgd_free(pgd_t *pgd)
18572 +void pgd_free(struct mm_struct *mm, pgd_t *pgd)
18573  {
18574 -       int i;
18575 -
18576         /*
18577          * After this the pgd should not be pinned for the duration of this
18578          * function's execution. We should never sleep and thus never race:
18579 @@ -450,39 +368,43 @@ void pgd_free(pgd_t *pgd)
18580          *  2. The machine addresses in PGD entries will not become invalid
18581          *     due to a concurrent save/restore.
18582          */
18583 -       pgd_test_and_unpin(pgd);
18584 +       pgd_dtor(pgd);
18585
18586 -       /* in the PAE case user pgd entries are overwritten before usage */
18587 -       if (PTRS_PER_PMD > 1) {
18588 -               for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
18589 -                       pgd_t pgdent = pgd[i];
18590 -                       void* pmd = (void *)__va(pgd_val(pgdent)-1);
18591 -                       paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18592 -                       pmd_cache_free(pmd, i);
18593 -               }
18594 +       if (PTRS_PER_PMD > 1 && !xen_feature(XENFEAT_pae_pgdir_above_4gb))
18595 +               xen_destroy_contiguous_region((unsigned long)pgd, 0);
18596
18597 -               if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
18598 -                       xen_destroy_contiguous_region((unsigned long)pgd, 0);
18599 -       }
18600 +       pgd_mop_up_pmds(mm, pgd);
18601 +       free_page((unsigned long)pgd);
18602 +}
18603
18604 -       /* in the non-PAE case, free_pgtables() clears user pgd entries */
18605 -       quicklist_free(0, pgd_dtor, pgd);
18606 +void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
18607 +{
18608 +       pgtable_page_dtor(pte);
18609 +       paravirt_release_pt(page_to_pfn(pte));
18610 +       tlb_remove_page(tlb, pte);
18611  }
18612
18613 -void check_pgt_cache(void)
18614 +#ifdef CONFIG_X86_PAE
18615 +
18616 +void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
18617  {
18618 -       quicklist_trim(0, pgd_dtor, 25, 16);
18619 +       paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18620 +       tlb_remove_page(tlb, virt_to_page(pmd));
18621  }
18622
18623 +#endif
18624 +
18625  void make_lowmem_page_readonly(void *va, unsigned int feature)
18626  {
18627         pte_t *pte;
18628 +       unsigned int level;
18629         int rc;
18630
18631         if (xen_feature(feature))
18632                 return;
18633
18634 -       pte = virt_to_ptep(va);
18635 +       pte = lookup_address((unsigned long)va, &level);
18636 +       BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
18637         rc = HYPERVISOR_update_va_mapping(
18638                 (unsigned long)va, pte_wrprotect(*pte), 0);
18639         BUG_ON(rc);
18640 @@ -491,313 +413,15 @@ void make_lowmem_page_readonly(void *va,
18641  void make_lowmem_page_writable(void *va, unsigned int feature)
18642  {
18643         pte_t *pte;
18644 +       unsigned int level;
18645         int rc;
18646
18647         if (xen_feature(feature))
18648                 return;
18649
18650 -       pte = virt_to_ptep(va);
18651 +       pte = lookup_address((unsigned long)va, &level);
18652 +       BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
18653         rc = HYPERVISOR_update_va_mapping(
18654                 (unsigned long)va, pte_mkwrite(*pte), 0);
18655         BUG_ON(rc);
18656  }
18657 -
18658 -void make_page_readonly(void *va, unsigned int feature)
18659 -{
18660 -       pte_t *pte;
18661 -       int rc;
18662 -
18663 -       if (xen_feature(feature))
18664 -               return;
18665 -
18666 -       pte = virt_to_ptep(va);
18667 -       rc = HYPERVISOR_update_va_mapping(
18668 -               (unsigned long)va, pte_wrprotect(*pte), 0);
18669 -       if (rc) /* fallback? */
18670 -               xen_l1_entry_update(pte, pte_wrprotect(*pte));
18671 -       if ((unsigned long)va >= (unsigned long)high_memory) {
18672 -               unsigned long pfn = pte_pfn(*pte);
18673 -#ifdef CONFIG_HIGHMEM
18674 -               if (pfn >= highstart_pfn)
18675 -                       kmap_flush_unused(); /* flush stale writable kmaps */
18676 -               else
18677 -#endif
18678 -                       make_lowmem_page_readonly(
18679 -                               phys_to_virt(pfn << PAGE_SHIFT), feature);
18680 -       }
18681 -}
18682 -
18683 -void make_page_writable(void *va, unsigned int feature)
18684 -{
18685 -       pte_t *pte;
18686 -       int rc;
18687 -
18688 -       if (xen_feature(feature))
18689 -               return;
18690 -
18691 -       pte = virt_to_ptep(va);
18692 -       rc = HYPERVISOR_update_va_mapping(
18693 -               (unsigned long)va, pte_mkwrite(*pte), 0);
18694 -       if (rc) /* fallback? */
18695 -               xen_l1_entry_update(pte, pte_mkwrite(*pte));
18696 -       if ((unsigned long)va >= (unsigned long)high_memory) {
18697 -               unsigned long pfn = pte_pfn(*pte);
18698 -#ifdef CONFIG_HIGHMEM
18699 -               if (pfn < highstart_pfn)
18700 -#endif
18701 -                       make_lowmem_page_writable(
18702 -                               phys_to_virt(pfn << PAGE_SHIFT), feature);
18703 -       }
18704 -}
18705 -
18706 -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
18707 -{
18708 -       if (xen_feature(feature))
18709 -               return;
18710 -
18711 -       while (nr-- != 0) {
18712 -               make_page_readonly(va, feature);
18713 -               va = (void *)((unsigned long)va + PAGE_SIZE);
18714 -       }
18715 -}
18716 -
18717 -void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
18718 -{
18719 -       if (xen_feature(feature))
18720 -               return;
18721 -
18722 -       while (nr-- != 0) {
18723 -               make_page_writable(va, feature);
18724 -               va = (void *)((unsigned long)va + PAGE_SIZE);
18725 -       }
18726 -}
18727 -
18728 -static void _pin_lock(struct mm_struct *mm, int lock) {
18729 -       if (lock)
18730 -               spin_lock(&mm->page_table_lock);
18731 -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
18732 -       /* While mm->page_table_lock protects us against insertions and
18733 -        * removals of higher level page table pages, it doesn't protect
18734 -        * against updates of pte-s. Such updates, however, require the
18735 -        * pte pages to be in consistent state (unpinned+writable or
18736 -        * pinned+readonly). The pinning and attribute changes, however
18737 -        * cannot be done atomically, which is why such updates must be
18738 -        * prevented from happening concurrently.
18739 -        * Note that no pte lock can ever elsewhere be acquired nesting
18740 -        * with an already acquired one in the same mm, or with the mm's
18741 -        * page_table_lock already acquired, as that would break in the
18742 -        * non-split case (where all these are actually resolving to the
18743 -        * one page_table_lock). Thus acquiring all of them here is not
18744 -        * going to result in dead locks, and the order of acquires
18745 -        * doesn't matter.
18746 -        */
18747 -       {
18748 -               pgd_t *pgd = mm->pgd;
18749 -               unsigned g;
18750 -
18751 -               for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
18752 -                       pud_t *pud;
18753 -                       unsigned u;
18754 -
18755 -                       if (pgd_none(*pgd))
18756 -                               continue;
18757 -                       pud = pud_offset(pgd, 0);
18758 -                       for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
18759 -                               pmd_t *pmd;
18760 -                               unsigned m;
18761 -
18762 -                               if (pud_none(*pud))
18763 -                                       continue;
18764 -                               pmd = pmd_offset(pud, 0);
18765 -                               for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
18766 -                                       spinlock_t *ptl;
18767 -
18768 -                                       if (pmd_none(*pmd))
18769 -                                               continue;
18770 -                                       ptl = pte_lockptr(0, pmd);
18771 -                                       if (lock)
18772 -                                               spin_lock(ptl);
18773 -                                       else
18774 -                                               spin_unlock(ptl);
18775 -                               }
18776 -                       }
18777 -               }
18778 -       }
18779 -#endif
18780 -       if (!lock)
18781 -               spin_unlock(&mm->page_table_lock);
18782 -}
18783 -#define pin_lock(mm) _pin_lock(mm, 1)
18784 -#define pin_unlock(mm) _pin_lock(mm, 0)
18785 -
18786 -#define PIN_BATCH 4
18787 -static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
18788 -
18789 -static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
18790 -                                             unsigned int cpu, unsigned seq)
18791 -{
18792 -       unsigned long pfn = page_to_pfn(page);
18793 -
18794 -       if (PageHighMem(page)) {
18795 -               if (pgprot_val(flags) & _PAGE_RW)
18796 -                       ClearPagePinned(page);
18797 -               else
18798 -                       SetPagePinned(page);
18799 -       } else {
18800 -               MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
18801 -                               (unsigned long)__va(pfn << PAGE_SHIFT),
18802 -                               pfn_pte(pfn, flags), 0);
18803 -               if (unlikely(++seq == PIN_BATCH)) {
18804 -                       if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
18805 -                                                               PIN_BATCH, NULL)))
18806 -                               BUG();
18807 -                       seq = 0;
18808 -               }
18809 -       }
18810 -
18811 -       return seq;
18812 -}
18813 -
18814 -static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
18815 -{
18816 -       pgd_t *pgd = pgd_base;
18817 -       pud_t *pud;
18818 -       pmd_t *pmd;
18819 -       int    g, u, m;
18820 -       unsigned int cpu, seq;
18821 -
18822 -       if (xen_feature(XENFEAT_auto_translated_physmap))
18823 -               return;
18824 -
18825 -       cpu = get_cpu();
18826 -
18827 -       for (g = 0, seq = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
18828 -               if (pgd_none(*pgd))
18829 -                       continue;
18830 -               pud = pud_offset(pgd, 0);
18831 -               if (PTRS_PER_PUD > 1) /* not folded */
18832 -                       seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
18833 -               for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
18834 -                       if (pud_none(*pud))
18835 -                               continue;
18836 -                       pmd = pmd_offset(pud, 0);
18837 -                       if (PTRS_PER_PMD > 1) /* not folded */
18838 -                               seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
18839 -                       for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
18840 -                               if (pmd_none(*pmd))
18841 -                                       continue;
18842 -                               seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
18843 -                       }
18844 -               }
18845 -       }
18846 -
18847 -       if (likely(seq != 0)) {
18848 -               MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
18849 -                       (unsigned long)pgd_base,
18850 -                       pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
18851 -                       UVMF_TLB_FLUSH);
18852 -               if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
18853 -                                                       seq + 1, NULL)))
18854 -                       BUG();
18855 -       } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
18856 -                       pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
18857 -                       UVMF_TLB_FLUSH))
18858 -               BUG();
18859 -
18860 -       put_cpu();
18861 -}
18862 -
18863 -static void __pgd_pin(pgd_t *pgd)
18864 -{
18865 -       pgd_walk(pgd, PAGE_KERNEL_RO);
18866 -       kmap_flush_unused();
18867 -       xen_pgd_pin(__pa(pgd));
18868 -       SetPagePinned(virt_to_page(pgd));
18869 -}
18870 -
18871 -static void __pgd_unpin(pgd_t *pgd)
18872 -{
18873 -       xen_pgd_unpin(__pa(pgd));
18874 -       pgd_walk(pgd, PAGE_KERNEL);
18875 -       ClearPagePinned(virt_to_page(pgd));
18876 -}
18877 -
18878 -static void pgd_test_and_unpin(pgd_t *pgd)
18879 -{
18880 -       if (PagePinned(virt_to_page(pgd)))
18881 -               __pgd_unpin(pgd);
18882 -}
18883 -
18884 -void mm_pin(struct mm_struct *mm)
18885 -{
18886 -       if (xen_feature(XENFEAT_writable_page_tables))
18887 -               return;
18888 -       pin_lock(mm);
18889 -       __pgd_pin(mm->pgd);
18890 -       pin_unlock(mm);
18891 -}
18892 -
18893 -void mm_unpin(struct mm_struct *mm)
18894 -{
18895 -       if (xen_feature(XENFEAT_writable_page_tables))
18896 -               return;
18897 -       pin_lock(mm);
18898 -       __pgd_unpin(mm->pgd);
18899 -       pin_unlock(mm);
18900 -}
18901 -
18902 -void mm_pin_all(void)
18903 -{
18904 -       struct page *page;
18905 -       unsigned long flags;
18906 -
18907 -       if (xen_feature(XENFEAT_writable_page_tables))
18908 -               return;
18909 -
18910 -       /*
18911 -        * Allow uninterrupted access to the pgd_list. Also protects
18912 -        * __pgd_pin() by disabling preemption.
18913 -        * All other CPUs must be at a safe point (e.g., in stop_machine
18914 -        * or offlined entirely).
18915 -        */
18916 -       spin_lock_irqsave(&pgd_lock, flags);
18917 -       for (page = pgd_list; page; page = (struct page *)page->index) {
18918 -               if (!PagePinned(page))
18919 -                       __pgd_pin((pgd_t *)page_address(page));
18920 -       }
18921 -       spin_unlock_irqrestore(&pgd_lock, flags);
18922 -}
18923 -
18924 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
18925 -{
18926 -       if (!PagePinned(virt_to_page(mm->pgd)))
18927 -               mm_pin(mm);
18928 -}
18929 -
18930 -void arch_exit_mmap(struct mm_struct *mm)
18931 -{
18932 -       struct task_struct *tsk = current;
18933 -
18934 -       task_lock(tsk);
18935 -
18936 -       /*
18937 -        * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
18938 -        * *much* faster this way, as no tlb flushes means bigger wrpt batches.
18939 -        */
18940 -       if (tsk->active_mm == mm) {
18941 -               tsk->active_mm = &init_mm;
18942 -               atomic_inc(&init_mm.mm_count);
18943 -
18944 -               switch_mm(mm, &init_mm, tsk);
18945 -
18946 -               atomic_dec(&mm->mm_count);
18947 -               BUG_ON(atomic_read(&mm->mm_count) == 0);
18948 -       }
18949 -
18950 -       task_unlock(tsk);
18951 -
18952 -       if (PagePinned(virt_to_page(mm->pgd)) &&
18953 -           (atomic_read(&mm->mm_count) == 1) &&
18954 -           !mm->context.has_foreign_mappings)
18955 -               mm_unpin(mm);
18956 -}
18957 --- a/arch/x86/pci/irq-xen.c
18958 +++ b/arch/x86/pci/irq-xen.c
18959 @@ -204,6 +204,7 @@ static int pirq_ali_get(struct pci_dev *
18960  {
18961         static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
18962
18963 +       WARN_ON_ONCE(pirq >= 16);
18964         return irqmap[read_config_nybble(router, 0x48, pirq-1)];
18965  }
18966
18967 @@ -211,7 +212,8 @@ static int pirq_ali_set(struct pci_dev *
18968  {
18969         static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
18970         unsigned int val = irqmap[irq];
18971 -
18972 +
18973 +       WARN_ON_ONCE(pirq >= 16);
18974         if (val) {
18975                 write_config_nybble(router, 0x48, pirq-1, val);
18976                 return 1;
18977 @@ -261,12 +263,16 @@ static int pirq_via_set(struct pci_dev *
18978  static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18979  {
18980         static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18981 +
18982 +       WARN_ON_ONCE(pirq > 5); /* pirqmap[pirq-1] is in bounds up to pirq == 5 */
18983         return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
18984  }
18985
18986  static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18987  {
18988         static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18989 +
18990 +       WARN_ON_ONCE(pirq > 5); /* pirqmap[pirq-1] is in bounds up to pirq == 5 */
18991         write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
18992         return 1;
18993  }
18994 @@ -279,12 +285,16 @@ static int pirq_via586_set(struct pci_de
18995  static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18996  {
18997         static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
18998 +
18999 +       WARN_ON_ONCE(pirq > 4); /* pirqmap[pirq-1] is in bounds up to pirq == 4 */
19000         return read_config_nybble(router,0x43, pirqmap[pirq-1]);
19001  }
19002
19003  static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19004  {
19005         static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
19006 +
19007 +       WARN_ON_ONCE(pirq > 4); /* pirqmap[pirq-1] is in bounds up to pirq == 4 */
19008         write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
19009         return 1;
19010  }
19011 @@ -423,6 +433,7 @@ static int pirq_sis_set(struct pci_dev *
19012
19013  static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19014  {
19015 +       WARN_ON_ONCE(pirq >= 9);
19016         if (pirq > 8) {
19017                 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
19018                 return 0;
19019 @@ -432,6 +443,7 @@ static int pirq_vlsi_get(struct pci_dev
19020
19021  static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19022  {
19023 +       WARN_ON_ONCE(pirq >= 9);
19024         if (pirq > 8) {
19025                 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
19026                 return 0;
19027 @@ -453,14 +465,14 @@ static int pirq_vlsi_set(struct pci_dev
19028   */
19029  static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19030  {
19031 -       outb_p(pirq, 0xc00);
19032 +       outb(pirq, 0xc00);
19033         return inb(0xc01) & 0xf;
19034  }
19035
19036  static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19037  {
19038 -       outb_p(pirq, 0xc00);
19039 -       outb_p(irq, 0xc01);
19040 +       outb(pirq, 0xc00);
19041 +       outb(irq, 0xc01);
19042         return 1;
19043  }
19044
19045 @@ -575,6 +587,10 @@ static __init int intel_router_probe(str
19046                 case PCI_DEVICE_ID_INTEL_ICH9_4:
19047                 case PCI_DEVICE_ID_INTEL_ICH9_5:
19048                 case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
19049 +               case PCI_DEVICE_ID_INTEL_ICH10_0:
19050 +               case PCI_DEVICE_ID_INTEL_ICH10_1:
19051 +               case PCI_DEVICE_ID_INTEL_ICH10_2:
19052 +               case PCI_DEVICE_ID_INTEL_ICH10_3:
19053                         r->name = "PIIX/ICH";
19054                         r->get = pirq_piix_get;
19055                         r->set = pirq_piix_set;
19056 --- a/arch/x86/vdso/Makefile
19057 +++ b/arch/x86/vdso/Makefile
19058 @@ -66,6 +66,7 @@ vdso32.so-$(VDSO32-y)         += int80
19059  vdso32.so-$(CONFIG_COMPAT)     += syscall
19060  vdso32.so-$(VDSO32-y)          += sysenter
19061  xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
19062 +xen-vdso32-$(CONFIG_X86_32)    += syscall
19063  vdso32.so-$(CONFIG_XEN)                += $(xen-vdso32-y)
19064
19065  vdso32-images                  = $(vdso32.so-y:%=vdso32-%.so)
19066 --- a/arch/x86/vdso/vdso32.S
19067 +++ b/arch/x86/vdso/vdso32.S
19068 @@ -19,4 +19,16 @@ vdso32_sysenter_start:
19069         .incbin "arch/x86/vdso/vdso32-sysenter.so"
19070  vdso32_sysenter_end:
19071
19072 +#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
19073 +       .globl vdso32_int80_start, vdso32_int80_end
19074 +vdso32_int80_start:
19075 +       .incbin "arch/x86/vdso/vdso32-int80.so"
19076 +vdso32_int80_end:
19077 +#elif defined(CONFIG_X86_XEN)
19078 +       .globl vdso32_syscall_start, vdso32_syscall_end
19079 +vdso32_syscall_start:
19080 +       .incbin "arch/x86/vdso/vdso32-syscall.so"
19081 +vdso32_syscall_end:
19082 +#endif
19083 +
19084  __FINIT
19085 --- a/arch/x86/vdso/vdso32-setup.c
19086 +++ b/arch/x86/vdso/vdso32-setup.c
19087 @@ -26,10 +26,6 @@
19088  #include <asm/vdso.h>
19089  #include <asm/proto.h>
19090
19091 -#ifdef CONFIG_XEN
19092 -#include <xen/interface/callback.h>
19093 -#endif
19094 -
19095  enum {
19096         VDSO_DISABLED = 0,
19097         VDSO_ENABLED = 1,
19098 @@ -229,7 +225,6 @@ static inline void map_compat_vdso(int m
19099
19100  void enable_sep_cpu(void)
19101  {
19102 -#ifndef CONFIG_XEN
19103         int cpu = get_cpu();
19104         struct tss_struct *tss = &per_cpu(init_tss, cpu);
19105
19106 @@ -244,35 +239,6 @@ void enable_sep_cpu(void)
19107         wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
19108         wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
19109         put_cpu();
19110 -#else
19111 -       extern asmlinkage void ia32pv_sysenter_target(void);
19112 -       static struct callback_register sysenter = {
19113 -               .type = CALLBACKTYPE_sysenter,
19114 -               .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
19115 -       };
19116 -
19117 -       if (!boot_cpu_has(X86_FEATURE_SEP))
19118 -               return;
19119 -
19120 -       get_cpu();
19121 -
19122 -       if (xen_feature(XENFEAT_supervisor_mode_kernel))
19123 -               sysenter.address.eip = (unsigned long)ia32_sysenter_target;
19124 -
19125 -       switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
19126 -       case 0:
19127 -               break;
19128 -#if CONFIG_XEN_COMPAT < 0x030200
19129 -       case -ENOSYS:
19130 -               sysenter.type = CALLBACKTYPE_sysenter_deprecated;
19131 -               if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
19132 -                       break;
19133 -#endif
19134 -       default:
19135 -               clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
19136 -               break;
19137 -       }
19138 -#endif
19139  }
19140
19141  static struct vm_area_struct gate_vma;
19142 --- /dev/null
19143 +++ b/arch/x86/vdso/vdso32-setup-xen.c
19144 @@ -0,0 +1,506 @@
19145 +/*
19146 + * (C) Copyright 2002 Linus Torvalds
19147 + * Portions based on the vdso-randomization code from exec-shield:
19148 + * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
19149 + *
19150 + * This file contains the needed initializations to support sysenter.
19151 + */
19152 +
19153 +#include <linux/init.h>
19154 +#include <linux/smp.h>
19155 +#include <linux/thread_info.h>
19156 +#include <linux/sched.h>
19157 +#include <linux/gfp.h>
19158 +#include <linux/string.h>
19159 +#include <linux/elf.h>
19160 +#include <linux/mm.h>
19161 +#include <linux/err.h>
19162 +#include <linux/module.h>
19163 +
19164 +#include <asm/cpufeature.h>
19165 +#include <asm/msr.h>
19166 +#include <asm/pgtable.h>
19167 +#include <asm/unistd.h>
19168 +#include <asm/elf.h>
19169 +#include <asm/tlbflush.h>
19170 +#include <asm/vdso.h>
19171 +#include <asm/proto.h>
19172 +
19173 +#include <xen/interface/callback.h>
19174 +
19175 +enum {
19176 +       VDSO_DISABLED = 0,
19177 +       VDSO_ENABLED = 1,
19178 +       VDSO_COMPAT = 2,
19179 +};
19180 +
19181 +#ifdef CONFIG_COMPAT_VDSO
19182 +#define VDSO_DEFAULT   VDSO_COMPAT
19183 +#else
19184 +#define VDSO_DEFAULT   VDSO_ENABLED
19185 +#endif
19186 +
19187 +#ifdef CONFIG_X86_64
19188 +#define vdso_enabled                   sysctl_vsyscall32
19189 +#define arch_setup_additional_pages    syscall32_setup_pages
19190 +#endif
19191 +
19192 +/*
19193 + * This is the difference between the prelinked addresses in the vDSO images
19194 + * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO
19195 + * in the user address space.
19196 + */
19197 +#define VDSO_ADDR_ADJUST       (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK)
19198 +
19199 +/*
19200 + * Should the kernel map a VDSO page into processes and pass its
19201 + * address down to glibc upon exec()?
19202 + */
19203 +unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
19204 +
19205 +static int __init vdso_setup(char *s)
19206 +{
19207 +       vdso_enabled = simple_strtoul(s, NULL, 0);
19208 +
19209 +       return 1;
19210 +}
19211 +
19212 +/*
19213 + * For consistency, the argument vdso32=[012] affects the 32-bit vDSO
19214 + * behavior on both 64-bit and 32-bit kernels.
19215 + * On 32-bit kernels, vdso=[012] means the same thing.
19216 + */
19217 +__setup("vdso32=", vdso_setup);
19218 +
19219 +#ifdef CONFIG_X86_32
19220 +__setup_param("vdso=", vdso32_setup, vdso_setup, 0);
19221 +
19222 +EXPORT_SYMBOL_GPL(vdso_enabled);
19223 +#endif
19224 +
19225 +static __init void reloc_symtab(Elf32_Ehdr *ehdr,
19226 +                               unsigned offset, unsigned size)
19227 +{
19228 +       Elf32_Sym *sym = (void *)ehdr + offset;
19229 +       unsigned nsym = size / sizeof(*sym);
19230 +       unsigned i;
19231 +
19232 +       for(i = 0; i < nsym; i++, sym++) {
19233 +               if (sym->st_shndx == SHN_UNDEF ||
19234 +                   sym->st_shndx == SHN_ABS)
19235 +                       continue;  /* skip */
19236 +
19237 +               if (sym->st_shndx > SHN_LORESERVE) {
19238 +                       printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
19239 +                              sym->st_shndx);
19240 +                       continue;
19241 +               }
19242 +
19243 +               switch(ELF_ST_TYPE(sym->st_info)) {
19244 +               case STT_OBJECT:
19245 +               case STT_FUNC:
19246 +               case STT_SECTION:
19247 +               case STT_FILE:
19248 +                       sym->st_value += VDSO_ADDR_ADJUST;
19249 +               }
19250 +       }
19251 +}
19252 +
19253 +static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
19254 +{
19255 +       Elf32_Dyn *dyn = (void *)ehdr + offset;
19256 +
19257 +       for(; dyn->d_tag != DT_NULL; dyn++)
19258 +               switch(dyn->d_tag) {
19259 +               case DT_PLTGOT:
19260 +               case DT_HASH:
19261 +               case DT_STRTAB:
19262 +               case DT_SYMTAB:
19263 +               case DT_RELA:
19264 +               case DT_INIT:
19265 +               case DT_FINI:
19266 +               case DT_REL:
19267 +               case DT_DEBUG:
19268 +               case DT_JMPREL:
19269 +               case DT_VERSYM:
19270 +               case DT_VERDEF:
19271 +               case DT_VERNEED:
19272 +               case DT_ADDRRNGLO ... DT_ADDRRNGHI:
19273 +                       /* definitely pointers needing relocation */
19274 +                       dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
19275 +                       break;
19276 +
19277 +               case DT_ENCODING ... OLD_DT_LOOS-1:
19278 +               case DT_LOOS ... DT_HIOS-1:
19279 +                       /* Tags above DT_ENCODING are pointers if
19280 +                          they're even */
19281 +                       if (dyn->d_tag >= DT_ENCODING &&
19282 +                           (dyn->d_tag & 1) == 0)
19283 +                               dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
19284 +                       break;
19285 +
19286 +               case DT_VERDEFNUM:
19287 +               case DT_VERNEEDNUM:
19288 +               case DT_FLAGS_1:
19289 +               case DT_RELACOUNT:
19290 +               case DT_RELCOUNT:
19291 +               case DT_VALRNGLO ... DT_VALRNGHI:
19292 +                       /* definitely not pointers */
19293 +                       break;
19294 +
19295 +               case OLD_DT_LOOS ... DT_LOOS-1:
19296 +               case DT_HIOS ... DT_VALRNGLO-1:
19297 +               default:
19298 +                       if (dyn->d_tag > DT_ENCODING)
19299 +                               printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
19300 +                                      dyn->d_tag);
19301 +                       break;
19302 +               }
19303 +}
19304 +
19305 +static __init void relocate_vdso(Elf32_Ehdr *ehdr)
19306 +{
19307 +       Elf32_Phdr *phdr;
19308 +       Elf32_Shdr *shdr;
19309 +       int i;
19310 +
19311 +       BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
19312 +              !elf_check_arch_ia32(ehdr) ||
19313 +              ehdr->e_type != ET_DYN);
19314 +
19315 +       ehdr->e_entry += VDSO_ADDR_ADJUST;
19316 +
19317 +       /* rebase phdrs */
19318 +       phdr = (void *)ehdr + ehdr->e_phoff;
19319 +       for (i = 0; i < ehdr->e_phnum; i++) {
19320 +               phdr[i].p_vaddr += VDSO_ADDR_ADJUST;
19321 +
19322 +               /* relocate dynamic stuff */
19323 +               if (phdr[i].p_type == PT_DYNAMIC)
19324 +                       reloc_dyn(ehdr, phdr[i].p_offset);
19325 +       }
19326 +
19327 +       /* rebase sections */
19328 +       shdr = (void *)ehdr + ehdr->e_shoff;
19329 +       for(i = 0; i < ehdr->e_shnum; i++) {
19330 +               if (!(shdr[i].sh_flags & SHF_ALLOC))
19331 +                       continue;
19332 +
19333 +               shdr[i].sh_addr += VDSO_ADDR_ADJUST;
19334 +
19335 +               if (shdr[i].sh_type == SHT_SYMTAB ||
19336 +                   shdr[i].sh_type == SHT_DYNSYM)
19337 +                       reloc_symtab(ehdr, shdr[i].sh_offset,
19338 +                                    shdr[i].sh_size);
19339 +       }
19340 +}
19341 +
19342 +/*
19343 + * These symbols are defined by vdso32.S to mark the bounds
19344 + * of the ELF DSO images included therein.
19345 + */
19346 +extern const char vdso32_default_start, vdso32_default_end;
19347 +extern const char vdso32_sysenter_start, vdso32_sysenter_end;
19348 +static struct page *vdso32_pages[1];
19349 +
19350 +#ifdef CONFIG_X86_64
19351 +
19352 +#if CONFIG_XEN_COMPAT < 0x030200
19353 +static int use_int80 = 1;
19354 +#endif
19355 +static int use_sysenter __read_mostly = -1;
19356 +
19357 +#define        vdso32_sysenter()       (use_sysenter > 0)
19358 +
19359 +/* May not be __init: called during resume */
19360 +void syscall32_cpu_init(void)
19361 +{
19362 +       static const struct callback_register cstar = {
19363 +               .type = CALLBACKTYPE_syscall32,
19364 +               .address = (unsigned long)ia32_cstar_target
19365 +       };
19366 +       static const struct callback_register sysenter = {
19367 +               .type = CALLBACKTYPE_sysenter,
19368 +               .address = (unsigned long)ia32_sysenter_target
19369 +       };
19370 +
19371 +       if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
19372 +           (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
19373 +#if CONFIG_XEN_COMPAT < 0x030200
19374 +               return;
19375 +       use_int80 = 0;
19376 +#else
19377 +               BUG();
19378 +#endif
19379 +
19380 +       if (use_sysenter < 0)
19381 +               use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
19382 +}
19383 +
19384 +#define compat_uses_vma                1
19385 +
19386 +static inline void map_compat_vdso(int map)
19387 +{
19388 +}
19389 +
19390 +#else  /* CONFIG_X86_32 */
19391 +
19392 +#define vdso32_sysenter()      (boot_cpu_has(X86_FEATURE_SEP))
19393 +
19394 +extern asmlinkage void ia32pv_cstar_target(void);
19395 +static /*const*/ struct callback_register __cpuinitdata cstar = {
19396 +       .type = CALLBACKTYPE_syscall32,
19397 +       .address = { __KERNEL_CS, (unsigned long)ia32pv_cstar_target },
19398 +};
19399 +
19400 +void __cpuinit enable_sep_cpu(void)
19401 +{
19402 +       extern asmlinkage void ia32pv_sysenter_target(void);
19403 +       static struct callback_register __cpuinitdata sysenter = {
19404 +               .type = CALLBACKTYPE_sysenter,
19405 +               .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
19406 +       };
19407 +
19408 +       if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
19409 +               if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
19410 +                       BUG();
19411 +               return;
19412 +       }
19413 +
19414 +       if (!boot_cpu_has(X86_FEATURE_SEP))
19415 +               return;
19416 +
19417 +       if (xen_feature(XENFEAT_supervisor_mode_kernel))
19418 +               sysenter.address.eip = (unsigned long)ia32_sysenter_target;
19419 +
19420 +       switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
19421 +       case 0:
19422 +               break;
19423 +#if CONFIG_XEN_COMPAT < 0x030200
19424 +       case -ENOSYS:
19425 +               sysenter.type = CALLBACKTYPE_sysenter_deprecated;
19426 +               if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
19427 +                       break;
19428 +#endif
19429 +       default:
19430 +               setup_clear_cpu_cap(X86_FEATURE_SEP);
19431 +               break;
19432 +       }
19433 +}
19434 +
19435 +static struct vm_area_struct gate_vma;
19436 +
19437 +static int __init gate_vma_init(void)
19438 +{
19439 +       gate_vma.vm_mm = NULL;
19440 +       gate_vma.vm_start = FIXADDR_USER_START;
19441 +       gate_vma.vm_end = FIXADDR_USER_END;
19442 +       gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
19443 +       gate_vma.vm_page_prot = __P101;
19444 +       /*
19445 +        * Make sure the vDSO gets into every core dump.
19446 +        * Dumping its contents makes post-mortem fully interpretable later
19447 +        * without matching up the same kernel and hardware config to see
19448 +        * what PC values meant.
19449 +        */
19450 +       gate_vma.vm_flags |= VM_ALWAYSDUMP;
19451 +       return 0;
19452 +}
19453 +
19454 +#define compat_uses_vma                0
19455 +
19456 +static void map_compat_vdso(int map)
19457 +{
19458 +       static int vdso_mapped;
19459 +
19460 +       if (map == vdso_mapped)
19461 +               return;
19462 +
19463 +       vdso_mapped = map;
19464 +
19465 +       __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT,
19466 +                    map ? PAGE_READONLY_EXEC : PAGE_NONE);
19467 +
19468 +       /* flush stray tlbs */
19469 +       flush_tlb_all();
19470 +}
19471 +
19472 +#endif /* CONFIG_X86_64 */
19473 +
19474 +int __init sysenter_setup(void)
19475 +{
19476 +       void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
19477 +       const void *vsyscall;
19478 +       size_t vsyscall_len;
19479 +
19480 +       vdso32_pages[0] = virt_to_page(syscall_page);
19481 +
19482 +#ifdef CONFIG_X86_32
19483 +       gate_vma_init();
19484 +
19485 +       printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
19486 +#endif
19487 +
19488 +#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
19489 +       if (use_int80) {
19490 +               extern const char vdso32_int80_start, vdso32_int80_end;
19491 +
19492 +               vsyscall = &vdso32_int80_start;
19493 +               vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
19494 +       } else
19495 +#elif defined(CONFIG_X86_32)
19496 +       if (boot_cpu_has(X86_FEATURE_SYSCALL)
19497 +           && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
19498 +               || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
19499 +               setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
19500 +       barrier(); /* until clear_bit()'s constraints are correct ... */
19501 +       if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
19502 +               extern const char vdso32_syscall_start, vdso32_syscall_end;
19503 +
19504 +               vsyscall = &vdso32_syscall_start;
19505 +               vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
19506 +       } else
19507 +#endif
19508 +       if (!vdso32_sysenter()) {
19509 +               vsyscall = &vdso32_default_start;
19510 +               vsyscall_len = &vdso32_default_end - &vdso32_default_start;
19511 +       } else {
19512 +               vsyscall = &vdso32_sysenter_start;
19513 +               vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
19514 +       }
19515 +
19516 +       memcpy(syscall_page, vsyscall, vsyscall_len);
19517 +       relocate_vdso(syscall_page);
19518 +
19519 +       return 0;
19520 +}
19521 +
19522 +/* Set up a VMA at program startup for the vsyscall page */
19523 +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
19524 +{
19525 +       struct mm_struct *mm = current->mm;
19526 +       unsigned long addr;
19527 +       int ret = 0;
19528 +       bool compat;
19529 +
19530 +       down_write(&mm->mmap_sem);
19531 +
19532 +       /* Test compat mode once here, in case someone
19533 +          changes it via sysctl */
19534 +       compat = (vdso_enabled == VDSO_COMPAT);
19535 +
19536 +       map_compat_vdso(compat);
19537 +
19538 +       if (compat)
19539 +               addr = VDSO_HIGH_BASE;
19540 +       else {
19541 +               addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
19542 +               if (IS_ERR_VALUE(addr)) {
19543 +                       ret = addr;
19544 +                       goto up_fail;
19545 +               }
19546 +       }
19547 +
19548 +       if (compat_uses_vma || !compat) {
19549 +               /*
19550 +                * MAYWRITE to allow gdb to COW and set breakpoints
19551 +                *
19552 +                * Make sure the vDSO gets into every core dump.
19553 +                * Dumping its contents makes post-mortem fully
19554 +                * interpretable later without matching up the same
19555 +                * kernel and hardware config to see what PC values
19556 +                * meant.
19557 +                */
19558 +               ret = install_special_mapping(mm, addr, PAGE_SIZE,
19559 +                                             VM_READ|VM_EXEC|
19560 +                                             VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
19561 +                                             VM_ALWAYSDUMP,
19562 +                                             vdso32_pages);
19563 +
19564 +               if (ret)
19565 +                       goto up_fail;
19566 +       }
19567 +
19568 +       current->mm->context.vdso = (void *)addr;
19569 +       current_thread_info()->sysenter_return =
19570 +               VDSO32_SYMBOL(addr, SYSENTER_RETURN);
19571 +
19572 +  up_fail:
19573 +       up_write(&mm->mmap_sem);
19574 +
19575 +       return ret;
19576 +}
19577 +
19578 +#ifdef CONFIG_X86_64
19579 +
19580 +/*
19581 + * This must be done early in case we have an initrd containing 32-bit
19582 + * binaries (e.g., hotplug). This could be pushed upstream.
19583 + */
19584 +core_initcall(sysenter_setup);
19585 +
19586 +#ifdef CONFIG_SYSCTL
19587 +/* Register vsyscall32 into the ABI table */
19588 +#include <linux/sysctl.h>
19589 +
19590 +static ctl_table abi_table2[] = {
19591 +       {
19592 +               .procname       = "vsyscall32",
19593 +               .data           = &sysctl_vsyscall32,
19594 +               .maxlen         = sizeof(int),
19595 +               .mode           = 0644,
19596 +               .proc_handler   = proc_dointvec
19597 +       },
19598 +       {}
19599 +};
19600 +
19601 +static ctl_table abi_root_table2[] = {
19602 +       {
19603 +               .ctl_name = CTL_ABI,
19604 +               .procname = "abi",
19605 +               .mode = 0555,
19606 +               .child = abi_table2
19607 +       },
19608 +       {}
19609 +};
19610 +
19611 +static __init int ia32_binfmt_init(void)
19612 +{
19613 +       register_sysctl_table(abi_root_table2);
19614 +       return 0;
19615 +}
19616 +__initcall(ia32_binfmt_init);
19617 +#endif
19618 +
19619 +#else  /* CONFIG_X86_32 */
19620 +
19621 +const char *arch_vma_name(struct vm_area_struct *vma)
19622 +{
19623 +       if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
19624 +               return "[vdso]";
19625 +       return NULL;
19626 +}
19627 +
19628 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
19629 +{
19630 +       struct mm_struct *mm = tsk->mm;
19631 +
19632 +       /* Check to see if this task was created in compat vdso mode */
19633 +       if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
19634 +               return &gate_vma;
19635 +       return NULL;
19636 +}
19637 +
19638 +int in_gate_area(struct task_struct *task, unsigned long addr)
19639 +{
19640 +       const struct vm_area_struct *vma = get_gate_vma(task);
19641 +
19642 +       return vma && addr >= vma->vm_start && addr < vma->vm_end;
19643 +}
19644 +
19645 +int in_gate_area_no_task(unsigned long addr)
19646 +{
19647 +       return 0;
19648 +}
19649 +
19650 +#endif /* CONFIG_X86_64 */
19651 --- a/arch/x86/vdso/vdso32/syscall.S
19652 +++ b/arch/x86/vdso/vdso32/syscall.S
19653 @@ -19,8 +19,10 @@ __kernel_vsyscall:
19654  .Lpush_ebp:
19655         movl    %ecx, %ebp
19656         syscall
19657 +#ifndef CONFIG_XEN
19658         movl    $__USER32_DS, %ecx
19659         movl    %ecx, %ss
19660 +#endif
19661         movl    %ebp, %ecx
19662         popl    %ebp
19663  .Lpop_ebp:
19664 --- a/drivers/pci/msi-xen.c
19665 +++ b/drivers/pci/msi-xen.c
19666 @@ -43,6 +43,53 @@ struct msi_pirq_entry {
19667         int entry_nr;
19668  };
19669
19670 +/* Arch hooks */
19671 +
19672 +int __attribute__ ((weak))
19673 +arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
19674 +{
19675 +       return 0;
19676 +}
19677 +
19678 +#ifndef CONFIG_XEN
19679 +int __attribute__ ((weak))
19680 +arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
19681 +{
19682 +       return 0;
19683 +}
19684 +
19685 +int __attribute__ ((weak))
19686 +arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
19687 +{
19688 +       struct msi_desc *entry;
19689 +       int ret;
19690 +
19691 +       list_for_each_entry(entry, &dev->msi_list, list) {
19692 +               ret = arch_setup_msi_irq(dev, entry);
19693 +               if (ret)
19694 +                       return ret;
19695 +       }
19696 +
19697 +       return 0;
19698 +}
19699 +
19700 +void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
19701 +{
19702 +       return;
19703 +}
19704 +
19705 +void __attribute__ ((weak))
19706 +arch_teardown_msi_irqs(struct pci_dev *dev)
19707 +{
19708 +       struct msi_desc *entry;
19709 +
19710 +       list_for_each_entry(entry, &dev->msi_list, list) {
19711 +               if (entry->irq != 0)
19712 +                       arch_teardown_msi_irq(entry->irq);
19713 +       }
19714 +}
19715 +#endif
19716 +
19717  static void msi_set_enable(struct pci_dev *dev, int enable)
19718  {
19719         int pos;
19720 @@ -270,7 +317,6 @@ static void pci_intx_for_msi(struct pci_
19721                 pci_intx(dev, enable);
19722  }
19723
19724 -#ifdef CONFIG_PM
19725  static void __pci_restore_msi_state(struct pci_dev *dev)
19726  {
19727         int pirq;
19728 @@ -328,7 +374,7 @@ void pci_restore_msi_state(struct pci_de
19729         __pci_restore_msi_state(dev);
19730         __pci_restore_msix_state(dev);
19731  }
19732 -#endif /* CONFIG_PM */
19733 +EXPORT_SYMBOL_GPL(pci_restore_msi_state);
19734
19735  /**
19736   * msi_capability_init - configure device's MSI capability structure
19737 @@ -760,51 +806,3 @@ void pci_msi_init_pci_dev(struct pci_dev
19738         INIT_LIST_HEAD(&dev->msi_list);
19739  #endif
19740  }
19741 -
19742 -
19743 -/* Arch hooks */
19744 -
19745 -int __attribute__ ((weak))
19746 -arch_msi_check_device(struct pci_dev* dev, int nvec, int type)
19747 -{
19748 -       return 0;
19749 -}
19750 -
19751 -#ifndef CONFIG_XEN
19752 -int __attribute__ ((weak))
19753 -arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
19754 -{
19755 -       return 0;
19756 -}
19757 -
19758 -int __attribute__ ((weak))
19759 -arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
19760 -{
19761 -       struct msi_desc *entry;
19762 -       int ret;
19763 -
19764 -       list_for_each_entry(entry, &dev->msi_list, list) {
19765 -               ret = arch_setup_msi_irq(dev, entry);
19766 -               if (ret)
19767 -                       return ret;
19768 -       }
19769 -
19770 -       return 0;
19771 -}
19772 -
19773 -void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
19774 -{
19775 -       return;
19776 -}
19777 -
19778 -void __attribute__ ((weak))
19779 -arch_teardown_msi_irqs(struct pci_dev *dev)
19780 -{
19781 -       struct msi_desc *entry;
19782 -
19783 -       list_for_each_entry(entry, &dev->msi_list, list) {
19784 -               if (entry->irq != 0)
19785 -                       arch_teardown_msi_irq(entry->irq);
19786 -       }
19787 -}
19788 -#endif
19789 --- a/drivers/pci/pci.c
19790 +++ b/drivers/pci/pci.c
19791 @@ -353,7 +353,12 @@ pci_find_parent_resource(const struct pc
19792   * Restore the BAR values for a given device, so as to make it
19793   * accessible by its driver.
19794   */
19795 +#ifndef CONFIG_XEN
19796  static void
19797 +#else
19798 +EXPORT_SYMBOL_GPL(pci_restore_bars);
19799 +void
19800 +#endif
19801  pci_restore_bars(struct pci_dev *dev)
19802  {
19803         int i, numres;
19804 --- a/drivers/xen/balloon/sysfs.c
19805 +++ b/drivers/xen/balloon/sysfs.c
19806 @@ -108,7 +108,7 @@ static struct attribute_group balloon_in
19807  };
19808
19809  static struct sysdev_class balloon_sysdev_class = {
19810 -       set_kset_name(BALLOON_CLASS_NAME),
19811 +       .name = BALLOON_CLASS_NAME,
19812  };
19813
19814  static struct sys_device balloon_sysdev;
19815 --- a/drivers/xen/blkback/blkback.c
19816 +++ b/drivers/xen/blkback/blkback.c
19817 @@ -148,7 +148,7 @@ static void unplug_queue(blkif_t *blkif)
19818                 return;
19819         if (blkif->plug->unplug_fn)
19820                 blkif->plug->unplug_fn(blkif->plug);
19821 -       blk_put_queue(blkif->plug);
19822 +       kobject_put(&blkif->plug->kobj);
19823         blkif->plug = NULL;
19824  }
19825
19826 @@ -159,7 +159,8 @@ static void plug_queue(blkif_t *blkif, s
19827         if (q == blkif->plug)
19828                 return;
19829         unplug_queue(blkif);
19830 -       blk_get_queue(q);
19831 +       WARN_ON(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags));
19832 +       kobject_get(&q->kobj);
19833         blkif->plug = q;
19834  }
19835
19836 --- a/drivers/xen/blkfront/blkfront.c
19837 +++ b/drivers/xen/blkfront/blkfront.c
19838 @@ -716,7 +716,6 @@ static irqreturn_t blkif_int(int irq, vo
19839         RING_IDX i, rp;
19840         unsigned long flags;
19841         struct blkfront_info *info = (struct blkfront_info *)dev_id;
19842 -       int uptodate;
19843
19844         spin_lock_irqsave(&blkif_io_lock, flags);
19845
19846 @@ -741,13 +740,13 @@ static irqreturn_t blkif_int(int irq, vo
19847
19848                 ADD_ID_TO_FREELIST(info, id);
19849
19850 -               uptodate = (bret->status == BLKIF_RSP_OKAY);
19851 +               ret = bret->status == BLKIF_RSP_OKAY ? 0 : -EIO;
19852                 switch (bret->operation) {
19853                 case BLKIF_OP_WRITE_BARRIER:
19854                         if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
19855                                 printk("blkfront: %s: write barrier op failed\n",
19856                                        info->gd->disk_name);
19857 -                               uptodate = -EOPNOTSUPP;
19858 +                               ret = -EOPNOTSUPP;
19859                                 info->feature_barrier = 0;
19860                                 xlvbd_barrier(info);
19861                         }
19862 @@ -758,10 +757,8 @@ static irqreturn_t blkif_int(int irq, vo
19863                                 DPRINTK("Bad return from blkdev data "
19864                                         "request: %x\n", bret->status);
19865
19866 -                       ret = end_that_request_first(req, uptodate,
19867 -                               req->hard_nr_sectors);
19868 +                       ret = __blk_end_request(req, ret, blk_rq_bytes(req));
19869                         BUG_ON(ret);
19870 -                       end_that_request_last(req, uptodate);
19871                         break;
19872                 default:
19873                         BUG();
19874 --- a/drivers/xen/blktap/blktap.c
19875 +++ b/drivers/xen/blktap/blktap.c
19876 @@ -327,8 +327,8 @@ static pte_t blktap_clear_pte(struct vm_
19877          * if vm_file is NULL (meaning mmap failed and we have nothing to do)
19878          */
19879         if (uvaddr < uvstart || vma->vm_file == NULL)
19880 -               return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
19881 -                                              ptep, is_fullmm);
19882 +               return xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
19883 +                                                  is_fullmm);
19884
19885         info = vma->vm_file->private_data;
19886         map = vma->vm_private_data;
19887 @@ -375,8 +375,8 @@ static pte_t blktap_clear_pte(struct vm_
19888                 BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap));
19889
19890                 /* USING SHADOW PAGE TABLES. */
19891 -               copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
19892 -                                              is_fullmm);
19893 +               copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
19894 +                                                  is_fullmm);
19895         }
19896
19897         if (count) {
19898 --- a/drivers/xen/core/evtchn.c
19899 +++ b/drivers/xen/core/evtchn.c
19900 @@ -193,7 +193,7 @@ static inline unsigned int cpu_from_evtc
19901
19902  /* Upcall to generic IRQ layer. */
19903  #ifdef CONFIG_X86
19904 -extern fastcall unsigned int do_IRQ(struct pt_regs *regs);
19905 +extern unsigned int do_IRQ(struct pt_regs *regs);
19906  void __init xen_init_IRQ(void);
19907  void __init init_IRQ(void)
19908  {
19909 @@ -202,13 +202,11 @@ void __init init_IRQ(void)
19910  }
19911  #if defined (__i386__)
19912  static inline void exit_idle(void) {}
19913 -#define IRQ_REG orig_eax
19914  #elif defined (__x86_64__)
19915  #include <asm/idle.h>
19916 -#define IRQ_REG orig_rax
19917  #endif
19918  #define do_IRQ(irq, regs) do {         \
19919 -       (regs)->IRQ_REG = ~(irq);       \
19920 +       (regs)->orig_ax = ~(irq);       \
19921         do_IRQ((regs));                 \
19922  } while (0)
19923  #endif
19924 @@ -669,13 +667,12 @@ static void set_affinity_irq(unsigned in
19925  int resend_irq_on_evtchn(unsigned int irq)
19926  {
19927         int masked, evtchn = evtchn_from_irq(irq);
19928 -       shared_info_t *s = HYPERVISOR_shared_info;
19929
19930         if (!VALID_EVTCHN(evtchn))
19931                 return 1;
19932
19933         masked = test_and_set_evtchn_mask(evtchn);
19934 -       synch_set_bit(evtchn, s->evtchn_pending);
19935 +       set_evtchn(evtchn);
19936         if (!masked)
19937                 unmask_evtchn(evtchn);
19938
19939 @@ -968,6 +965,43 @@ void disable_all_local_evtchn(void)
19940                         synch_set_bit(i, &s->evtchn_mask[0]);
19941  }
19942
19943 +/* Clear an irq's pending state, in preparation for polling on it. */
19944 +void xen_clear_irq_pending(int irq)
19945 +{
19946 +       int evtchn = evtchn_from_irq(irq);
19947 +
19948 +       if (VALID_EVTCHN(evtchn))
19949 +               clear_evtchn(evtchn);
19950 +}
19951 +
19952 +/* Set an irq's pending state, to avoid blocking on it. */
19953 +void xen_set_irq_pending(int irq)
19954 +{
19955 +       int evtchn = evtchn_from_irq(irq);
19956 +
19957 +       if (VALID_EVTCHN(evtchn))
19958 +               set_evtchn(evtchn);
19959 +}
19960 +
19961 +/* Test an irq's pending state. */
19962 +int xen_test_irq_pending(int irq)
19963 +{
19964 +       int evtchn = evtchn_from_irq(irq);
19965 +
19966 +       return VALID_EVTCHN(evtchn) && test_evtchn(evtchn);
19967 +}
19968 +
19969 +/* Poll waiting for an irq to become pending.  In the usual case, the
19970 +   irq will be disabled so it won't deliver an interrupt. */
19971 +void xen_poll_irq(int irq)
19972 +{
19973 +       evtchn_port_t evtchn = evtchn_from_irq(irq);
19974 +
19975 +       if (VALID_EVTCHN(evtchn)
19976 +           && HYPERVISOR_poll_no_timeout(&evtchn, 1))
19977 +               BUG();
19978 +}
19979 +
19980  static void restore_cpu_virqs(unsigned int cpu)
19981  {
19982         struct evtchn_bind_virq bind_virq;
19983 --- a/drivers/xen/core/hypervisor_sysfs.c
19984 +++ b/drivers/xen/core/hypervisor_sysfs.c
19985 @@ -50,7 +50,7 @@ static int __init hypervisor_subsys_init
19986         if (!is_running_on_xen())
19987                 return -ENODEV;
19988
19989 -       hypervisor_subsys.kobj.ktype = &hyp_sysfs_kobj_type;
19990 +       hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
19991         return 0;
19992  }
19993
19994 --- a/drivers/xen/core/Makefile
19995 +++ b/drivers/xen/core/Makefile
19996 @@ -10,5 +10,6 @@ obj-$(CONFIG_SYS_HYPERVISOR)  += hypervis
19997  obj-$(CONFIG_HOTPLUG_CPU)      += cpu_hotplug.o
19998  obj-$(CONFIG_XEN_SYSFS)                += xen_sysfs.o
19999  obj-$(CONFIG_XEN_SMPBOOT)      += smpboot.o
20000 +obj-$(CONFIG_X86_SMP)          += spinlock.o
20001  obj-$(CONFIG_KEXEC)            += machine_kexec.o
20002  obj-$(CONFIG_XEN_XENCOMM)      += xencomm.o
20003 --- a/drivers/xen/core/smpboot.c
20004 +++ b/drivers/xen/core/smpboot.c
20005 @@ -139,6 +139,10 @@ static int __cpuinit xen_smp_intr_init(u
20006                 goto fail;
20007         per_cpu(callfunc_irq, cpu) = rc;
20008
20009 +       rc = xen_spinlock_init(cpu);
20010 +       if (rc < 0)
20011 +               goto fail;
20012 +
20013         if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0))
20014                 goto fail;
20015
20016 @@ -149,6 +153,7 @@ static int __cpuinit xen_smp_intr_init(u
20017                 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
20018         if (per_cpu(callfunc_irq, cpu) >= 0)
20019                 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
20020 +       xen_spinlock_cleanup(cpu);
20021         return rc;
20022  }
20023
20024 @@ -160,6 +165,7 @@ static void xen_smp_intr_exit(unsigned i
20025
20026         unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
20027         unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
20028 +       xen_spinlock_cleanup(cpu);
20029  }
20030  #endif
20031
20032 @@ -212,36 +218,25 @@ static void __cpuinit cpu_initialize_con
20033         smp_trap_init(ctxt.trap_ctxt);
20034
20035         ctxt.ldt_ents = 0;
20036 -       ctxt.gdt_ents = GDT_SIZE / 8;
20037 -
20038 -#ifdef __i386__
20039         ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu));
20040 +       ctxt.gdt_ents = GDT_SIZE / 8;
20041
20042         ctxt.user_regs.cs = __KERNEL_CS;
20043 -       ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
20044 +       ctxt.user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
20045
20046         ctxt.kernel_ss = __KERNEL_DS;
20047 -       ctxt.kernel_sp = idle->thread.esp0;
20048 +       ctxt.kernel_sp = idle->thread.sp0;
20049
20050 -       ctxt.event_callback_cs     = __KERNEL_CS;
20051         ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
20052 -       ctxt.failsafe_callback_cs  = __KERNEL_CS;
20053         ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
20054 +#ifdef __i386__
20055 +       ctxt.event_callback_cs     = __KERNEL_CS;
20056 +       ctxt.failsafe_callback_cs  = __KERNEL_CS;
20057
20058         ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
20059
20060         ctxt.user_regs.fs = __KERNEL_PERCPU;
20061  #else /* __x86_64__ */
20062 -       ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[cpu].address);
20063 -
20064 -       ctxt.user_regs.cs = __KERNEL_CS;
20065 -       ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
20066 -
20067 -       ctxt.kernel_ss = __KERNEL_DS;
20068 -       ctxt.kernel_sp = idle->thread.rsp0;
20069 -
20070 -       ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
20071 -       ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
20072         ctxt.syscall_callback_eip  = (unsigned long)system_call;
20073
20074         ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
20075 --- /dev/null
20076 +++ b/drivers/xen/core/spinlock.c
20077 @@ -0,0 +1,161 @@
20078 +/*
20079 + *     Xen spinlock functions
20080 + *
20081 + *     See arch/x86/xen/smp.c for copyright and credits for derived
20082 + *     portions of this file.
20083 + */
20084 +
20085 +#include <linux/init.h>
20086 +#include <linux/irq.h>
20087 +#include <linux/kernel.h>
20088 +#include <linux/kernel_stat.h>
20089 +#include <linux/module.h>
20090 +#include <xen/evtchn.h>
20091 +
20092 +extern irqreturn_t smp_reschedule_interrupt(int, void *);
20093 +
20094 +static DEFINE_PER_CPU(int, spinlock_irq) = -1;
20095 +static char spinlock_name[NR_CPUS][15];
20096 +
20097 +struct spinning {
20098 +       raw_spinlock_t *lock;
20099 +       unsigned int ticket;
20100 +       struct spinning *prev;
20101 +};
20102 +static DEFINE_PER_CPU(struct spinning *, spinning);
20103 +/*
20104 + * Protect removal of objects: Addition can be done lockless, and even
20105 + * removal itself doesn't need protection - what needs to be prevented is
20106 + * removed objects going out of scope (as they're allocated on the stack).
20107 + */
20108 +static DEFINE_PER_CPU(raw_rwlock_t, spinning_rm_lock) = __RAW_RW_LOCK_UNLOCKED;
20109 +
20110 +int __cpuinit xen_spinlock_init(unsigned int cpu)
20111 +{
20112 +       int rc;
20113 +
20114 +       sprintf(spinlock_name[cpu], "spinlock%u", cpu);
20115 +       rc = bind_ipi_to_irqhandler(SPIN_UNLOCK_VECTOR,
20116 +                                   cpu,
20117 +                                   smp_reschedule_interrupt,
20118 +                                   IRQF_DISABLED|IRQF_NOBALANCING,
20119 +                                   spinlock_name[cpu],
20120 +                                   NULL);
20121 +       if (rc < 0)
20122 +               return rc;
20123 +
20124 +       disable_irq(rc); /* make sure it's never delivered */
20125 +       per_cpu(spinlock_irq, cpu) = rc;
20126 +
20127 +       return 0;
20128 +}
20129 +
20130 +void __cpuinit xen_spinlock_cleanup(unsigned int cpu)
20131 +{
20132 +       if (per_cpu(spinlock_irq, cpu) >= 0)
20133 +               unbind_from_irqhandler(per_cpu(spinlock_irq, cpu), NULL);
20134 +       per_cpu(spinlock_irq, cpu) = -1;
20135 +}
20136 +
20137 +int xen_spin_wait(raw_spinlock_t *lock, unsigned int token)
20138 +{
20139 +       int rc = 0, irq = __get_cpu_var(spinlock_irq);
20140 +       raw_rwlock_t *rm_lock;
20141 +       unsigned long flags;
20142 +       struct spinning spinning;
20143 +
20144 +       /* If kicker interrupt not initialized yet, just spin. */
20145 +       if (unlikely(irq < 0))
20146 +               return 0;
20147 +
20148 +       token >>= TICKET_SHIFT;
20149 +
20150 +       /* announce we're spinning */
20151 +       spinning.ticket = token;
20152 +       spinning.lock = lock;
20153 +       spinning.prev = __get_cpu_var(spinning);
20154 +       smp_wmb();
20155 +       __get_cpu_var(spinning) = &spinning;
20156 +
20157 +       /* clear pending */
20158 +       xen_clear_irq_pending(irq);
20159 +
20160 +       do {
20161 +               /* Check again to make sure it didn't become free while
20162 +                * we weren't looking. */
20163 +               if ((lock->slock & ((1U << TICKET_SHIFT) - 1)) == token) {
20164 +                       /* If we interrupted another spinlock while it was
20165 +                        * blocking, make sure it doesn't block (again)
20166 +                        * without rechecking the lock. */
20167 +                       if (spinning.prev)
20168 +                               xen_set_irq_pending(irq);
20169 +                       rc = 1;
20170 +                       break;
20171 +               }
20172 +
20173 +               /* block until irq becomes pending */
20174 +               xen_poll_irq(irq);
20175 +       } while (!xen_test_irq_pending(irq));
20176 +
20177 +       /* Leave the irq pending so that any interrupted blocker will
20178 +        * re-check. */
20179 +       kstat_this_cpu.irqs[irq] += !rc;
20180 +
20181 +       /* announce we're done */
20182 +       __get_cpu_var(spinning) = spinning.prev;
20183 +       rm_lock = &__get_cpu_var(spinning_rm_lock);
20184 +       raw_local_irq_save(flags);
20185 +       __raw_write_lock(rm_lock);
20186 +       __raw_write_unlock(rm_lock);
20187 +       raw_local_irq_restore(flags);
20188 +
20189 +       return rc;
20190 +}
20191 +EXPORT_SYMBOL(xen_spin_wait);
20192 +
20193 +unsigned int xen_spin_adjust(raw_spinlock_t *lock, unsigned int token)
20194 +{
20195 +       return token;//todo
20196 +}
20197 +EXPORT_SYMBOL(xen_spin_adjust);
20198 +
20199 +int xen_spin_wait_flags(raw_spinlock_t *lock, unsigned int *token,
20200 +                         unsigned int flags)
20201 +{
20202 +       return xen_spin_wait(lock, *token);//todo
20203 +}
20204 +EXPORT_SYMBOL(xen_spin_wait_flags);
20205 +
20206 +void xen_spin_kick(raw_spinlock_t *lock, unsigned int token)
20207 +{
20208 +       unsigned int cpu;
20209 +
20210 +       token &= (1U << TICKET_SHIFT) - 1;
20211 +       for_each_online_cpu(cpu) {
20212 +               raw_rwlock_t *rm_lock;
20213 +               unsigned long flags;
20214 +               struct spinning *spinning;
20215 +
20216 +               if (cpu == raw_smp_processor_id())
20217 +                       continue;
20218 +
20219 +               rm_lock = &per_cpu(spinning_rm_lock, cpu);
20220 +               raw_local_irq_save(flags);
20221 +               __raw_read_lock(rm_lock);
20222 +
20223 +               spinning = per_cpu(spinning, cpu);
20224 +               smp_rmb();
20225 +               if (spinning
20226 +                   && (spinning->lock != lock || spinning->ticket != token))
20227 +                       spinning = NULL;
20228 +
20229 +               __raw_read_unlock(rm_lock);
20230 +               raw_local_irq_restore(flags);
20231 +
20232 +               if (unlikely(spinning)) {
20233 +                       notify_remote_via_irq(per_cpu(spinlock_irq, cpu));
20234 +                       return;
20235 +               }
20236 +       }
20237 +}
20238 +EXPORT_SYMBOL(xen_spin_kick);
20239 --- a/drivers/xen/core/xen_sysfs.c
20240 +++ b/drivers/xen/core/xen_sysfs.c
20241 @@ -29,12 +29,12 @@ HYPERVISOR_ATTR_RO(type);
20242
20243  static int __init xen_sysfs_type_init(void)
20244  {
20245 -       return sysfs_create_file(&hypervisor_subsys.kobj, &type_attr.attr);
20246 +       return sysfs_create_file(hypervisor_kobj, &type_attr.attr);
20247  }
20248
20249  static void xen_sysfs_type_destroy(void)
20250  {
20251 -       sysfs_remove_file(&hypervisor_subsys.kobj, &type_attr.attr);
20252 +       sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
20253  }
20254
20255  /* xen version attributes */
20256 @@ -90,13 +90,12 @@ static struct attribute_group version_gr
20257
20258  static int __init xen_sysfs_version_init(void)
20259  {
20260 -       return sysfs_create_group(&hypervisor_subsys.kobj,
20261 -                                 &version_group);
20262 +       return sysfs_create_group(hypervisor_kobj, &version_group);
20263  }
20264
20265  static void xen_sysfs_version_destroy(void)
20266  {
20267 -       sysfs_remove_group(&hypervisor_subsys.kobj, &version_group);
20268 +       sysfs_remove_group(hypervisor_kobj, &version_group);
20269  }
20270
20271  /* UUID */
20272 @@ -126,12 +125,12 @@ HYPERVISOR_ATTR_RO(uuid);
20273
20274  static int __init xen_sysfs_uuid_init(void)
20275  {
20276 -       return sysfs_create_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
20277 +       return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);
20278  }
20279
20280  static void xen_sysfs_uuid_destroy(void)
20281  {
20282 -       sysfs_remove_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
20283 +       sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
20284  }
20285
20286  /* xen compilation attributes */
20287 @@ -204,14 +203,12 @@ static struct attribute_group xen_compil
20288
20289  int __init static xen_compilation_init(void)
20290  {
20291 -       return sysfs_create_group(&hypervisor_subsys.kobj,
20292 -                                 &xen_compilation_group);
20293 +       return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
20294  }
20295
20296  static void xen_compilation_destroy(void)
20297  {
20298 -       sysfs_remove_group(&hypervisor_subsys.kobj,
20299 -                          &xen_compilation_group);
20300 +       sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
20301  }
20302
20303  /* xen properties info */
20304 @@ -325,14 +322,12 @@ static struct attribute_group xen_proper
20305
20306  static int __init xen_properties_init(void)
20307  {
20308 -       return sysfs_create_group(&hypervisor_subsys.kobj,
20309 -                                 &xen_properties_group);
20310 +       return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
20311  }
20312
20313  static void xen_properties_destroy(void)
20314  {
20315 -       sysfs_remove_group(&hypervisor_subsys.kobj,
20316 -                          &xen_properties_group);
20317 +       sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
20318  }
20319
20320  #ifdef CONFIG_KEXEC
20321 @@ -350,13 +345,12 @@ HYPERVISOR_ATTR_RO(vmcoreinfo);
20322
20323  static int __init xen_sysfs_vmcoreinfo_init(void)
20324  {
20325 -       return sysfs_create_file(&hypervisor_subsys.kobj,
20326 -                                &vmcoreinfo_attr.attr);
20327 +       return sysfs_create_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
20328  }
20329
20330  static void xen_sysfs_vmcoreinfo_destroy(void)
20331  {
20332 -       sysfs_remove_file(&hypervisor_subsys.kobj, &vmcoreinfo_attr.attr);
20333 +       sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
20334  }
20335
20336  #endif
20337 --- a/drivers/xen/gntdev/gntdev.c
20338 +++ b/drivers/xen/gntdev/gntdev.c
20339 @@ -782,7 +782,7 @@ static pte_t gntdev_clear_pte(struct vm_
20340                                        op.status);
20341                 } else {
20342                         /* USING SHADOW PAGE TABLES. */
20343 -                       copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
20344 +                       copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
20345                 }
20346
20347                 /* Finally, we unmap the grant from kernel space. */
20348 @@ -810,7 +810,7 @@ static pte_t gntdev_clear_pte(struct vm_
20349                                     >> PAGE_SHIFT, INVALID_P2M_ENTRY);
20350
20351         } else {
20352 -               copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
20353 +               copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
20354         }
20355
20356         return copy;
20357 --- a/drivers/xen/scsifront/scsifront.c
20358 +++ b/drivers/xen/scsifront/scsifront.c
20359 @@ -260,19 +260,19 @@ static int map_data_for_request(struct v
20360                 return -ENOMEM;
20361         }
20362
20363 -       if (sc->use_sg) {
20364 +       if (scsi_bufflen(sc)) {
20365                 /* quoted scsi_lib.c/scsi_req_map_sg . */
20366 -               struct scatterlist *sg, *sgl = (struct scatterlist *)sc->request_buffer;
20367 -               unsigned int data_len = sc->request_bufflen;
20368 +               struct scatterlist *sg, *sgl = scsi_sglist(sc);
20369 +               unsigned int data_len = scsi_bufflen(sc);
20370
20371 -               nr_pages = (sc->request_bufflen + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
20372 +               nr_pages = (data_len + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
20373                 if (nr_pages > VSCSIIF_SG_TABLESIZE) {
20374                         printk(KERN_ERR "scsifront: Unable to map request_buffer for command!\n");
20375                         ref_cnt = (-E2BIG);
20376                         goto big_to_sg;
20377                 }
20378
20379 -               for_each_sg (sgl, sg, sc->use_sg, i) {
20380 +               for_each_sg (sgl, sg, scsi_sg_count(sc), i) {
20381                         page = sg_page(sg);
20382                         off = sg->offset;
20383                         len = sg->length;
20384 @@ -306,45 +306,6 @@ static int map_data_for_request(struct v
20385                                 ref_cnt++;
20386                         }
20387                 }
20388 -       } else if (sc->request_bufflen) {
20389 -               unsigned long end   = ((unsigned long)sc->request_buffer
20390 -                                       + sc->request_bufflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
20391 -               unsigned long start = (unsigned long)sc->request_buffer >> PAGE_SHIFT;
20392 -
20393 -               page = virt_to_page(sc->request_buffer);
20394 -               nr_pages = end - start;
20395 -               len = sc->request_bufflen;
20396 -
20397 -               if (nr_pages > VSCSIIF_SG_TABLESIZE) {
20398 -                       ref_cnt = (-E2BIG);
20399 -                       goto big_to_sg;
20400 -               }
20401 -
20402 -               buffer_pfn = page_to_phys(page) >> PAGE_SHIFT;
20403 -
20404 -               off = offset_in_page((unsigned long)sc->request_buffer);
20405 -               for (i = 0; i < nr_pages; i++) {
20406 -                       bytes = PAGE_SIZE - off;
20407 -
20408 -                       if (bytes > len)
20409 -                               bytes = len;
20410 -
20411 -                       ref = gnttab_claim_grant_reference(&gref_head);
20412 -                       BUG_ON(ref == -ENOSPC);
20413 -
20414 -                       gnttab_grant_foreign_access_ref(ref, info->dev->otherend_id,
20415 -                               buffer_pfn, write);
20416 -
20417 -                       info->shadow[id].gref[i] = ref;
20418 -                       ring_req->seg[i].gref     = ref;
20419 -                       ring_req->seg[i].offset   = (uint16_t)off;
20420 -                       ring_req->seg[i].length   = (uint16_t)bytes;
20421 -
20422 -                       buffer_pfn++;
20423 -                       len -= bytes;
20424 -                       off = 0;
20425 -                       ref_cnt++;
20426 -               }
20427         }
20428
20429  big_to_sg:
20430 --- a/drivers/xen/xenoprof/xenoprofile.c
20431 +++ b/drivers/xen/xenoprof/xenoprofile.c
20432 @@ -79,7 +79,7 @@ static int xenoprof_resume(struct sys_de
20433
20434
20435  static struct sysdev_class oprofile_sysclass = {
20436 -       set_kset_name("oprofile"),
20437 +       .name           = "oprofile",
20438         .resume         = xenoprof_resume,
20439         .suspend        = xenoprof_suspend
20440  };
20441 --- a/include/asm-x86/mach-xen/asm/agp.h
20442 +++ b/include/asm-x86/mach-xen/asm/agp.h
20443 @@ -13,18 +13,13 @@
20444   * page. This avoids data corruption on some CPUs.
20445   */
20446
20447 -/*
20448 - * Caller's responsibility to call global_flush_tlb() for performance
20449 - * reasons
20450 - */
20451  #define map_page_into_agp(page) ( \
20452         xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \
20453 -       ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE))
20454 +       ?: set_pages_uc(page, 1))
20455  #define unmap_page_from_agp(page) ( \
20456         xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \
20457         /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
20458 -       change_page_attr(page, 1, PAGE_KERNEL))
20459 -#define flush_agp_mappings() global_flush_tlb()
20460 +       set_pages_wb(page, 1))
20461
20462  /*
20463   * Could use CLFLUSH here if the cpu supports it. But then it would
20464 --- a/include/asm-x86/mach-xen/asm/desc_32.h
20465 +++ /dev/null
20466 @@ -1,262 +0,0 @@
20467 -#ifndef __ARCH_DESC_H
20468 -#define __ARCH_DESC_H
20469 -
20470 -#include <asm/ldt.h>
20471 -#include <asm/segment.h>
20472 -
20473 -#ifndef __ASSEMBLY__
20474 -
20475 -#include <linux/preempt.h>
20476 -#include <linux/smp.h>
20477 -
20478 -#include <asm/mmu.h>
20479 -
20480 -struct Xgt_desc_struct {
20481 -       unsigned short size;
20482 -       unsigned long address __attribute__((packed));
20483 -       unsigned short pad;
20484 -} __attribute__ ((packed));
20485 -
20486 -struct gdt_page
20487 -{
20488 -       struct desc_struct gdt[GDT_ENTRIES];
20489 -} __attribute__((aligned(PAGE_SIZE)));
20490 -DECLARE_PER_CPU(struct gdt_page, gdt_page);
20491 -
20492 -static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
20493 -{
20494 -       return per_cpu(gdt_page, cpu).gdt;
20495 -}
20496 -
20497 -extern struct Xgt_desc_struct idt_descr;
20498 -extern struct desc_struct idt_table[];
20499 -extern void set_intr_gate(unsigned int irq, void * addr);
20500 -
20501 -static inline void pack_descriptor(__u32 *a, __u32 *b,
20502 -       unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
20503 -{
20504 -       *a = ((base & 0xffff) << 16) | (limit & 0xffff);
20505 -       *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
20506 -               (limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20);
20507 -}
20508 -
20509 -static inline void pack_gate(__u32 *a, __u32 *b,
20510 -       unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
20511 -{
20512 -       *a = (seg << 16) | (base & 0xffff);
20513 -       *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
20514 -}
20515 -
20516 -#define DESCTYPE_LDT   0x82    /* present, system, DPL-0, LDT */
20517 -#define DESCTYPE_TSS   0x89    /* present, system, DPL-0, 32-bit TSS */
20518 -#define DESCTYPE_TASK  0x85    /* present, system, DPL-0, task gate */
20519 -#define DESCTYPE_INT   0x8e    /* present, system, DPL-0, interrupt gate */
20520 -#define DESCTYPE_TRAP  0x8f    /* present, system, DPL-0, trap gate */
20521 -#define DESCTYPE_DPL3  0x60    /* DPL-3 */
20522 -#define DESCTYPE_S     0x10    /* !system */
20523 -
20524 -#ifndef CONFIG_XEN
20525 -#define load_TR_desc() native_load_tr_desc()
20526 -#define load_gdt(dtr) native_load_gdt(dtr)
20527 -#define load_idt(dtr) native_load_idt(dtr)
20528 -#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
20529 -#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
20530 -
20531 -#define store_gdt(dtr) native_store_gdt(dtr)
20532 -#define store_idt(dtr) native_store_idt(dtr)
20533 -#define store_tr(tr) (tr = native_store_tr())
20534 -#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
20535 -
20536 -#define load_TLS(t, cpu) native_load_tls(t, cpu)
20537 -#define set_ldt native_set_ldt
20538 -
20539 -#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20540 -#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20541 -#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20542 -
20543 -static inline void write_dt_entry(struct desc_struct *dt,
20544 -                                 int entry, u32 entry_low, u32 entry_high)
20545 -{
20546 -       dt[entry].a = entry_low;
20547 -       dt[entry].b = entry_high;
20548 -}
20549 -
20550 -static inline void native_set_ldt(const void *addr, unsigned int entries)
20551 -{
20552 -       if (likely(entries == 0))
20553 -               __asm__ __volatile__("lldt %w0"::"q" (0));
20554 -       else {
20555 -               unsigned cpu = smp_processor_id();
20556 -               __u32 a, b;
20557 -
20558 -               pack_descriptor(&a, &b, (unsigned long)addr,
20559 -                               entries * sizeof(struct desc_struct) - 1,
20560 -                               DESCTYPE_LDT, 0);
20561 -               write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
20562 -               __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
20563 -       }
20564 -}
20565 -
20566 -
20567 -static inline void native_load_tr_desc(void)
20568 -{
20569 -       asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
20570 -}
20571 -
20572 -static inline void native_load_gdt(const struct Xgt_desc_struct *dtr)
20573 -{
20574 -       asm volatile("lgdt %0"::"m" (*dtr));
20575 -}
20576 -
20577 -static inline void native_load_idt(const struct Xgt_desc_struct *dtr)
20578 -{
20579 -       asm volatile("lidt %0"::"m" (*dtr));
20580 -}
20581 -
20582 -static inline void native_store_gdt(struct Xgt_desc_struct *dtr)
20583 -{
20584 -       asm ("sgdt %0":"=m" (*dtr));
20585 -}
20586 -
20587 -static inline void native_store_idt(struct Xgt_desc_struct *dtr)
20588 -{
20589 -       asm ("sidt %0":"=m" (*dtr));
20590 -}
20591 -
20592 -static inline unsigned long native_store_tr(void)
20593 -{
20594 -       unsigned long tr;
20595 -       asm ("str %0":"=r" (tr));
20596 -       return tr;
20597 -}
20598 -
20599 -static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
20600 -{
20601 -       unsigned int i;
20602 -       struct desc_struct *gdt = get_cpu_gdt_table(cpu);
20603 -
20604 -       for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20605 -               gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
20606 -}
20607 -#else
20608 -#define load_TLS(t, cpu) xen_load_tls(t, cpu)
20609 -#define set_ldt xen_set_ldt
20610 -
20611 -extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
20612 -extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
20613 -
20614 -static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
20615 -{
20616 -       unsigned int i;
20617 -       struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
20618 -
20619 -       for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20620 -               if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
20621 -                                                *(u64 *)&t->tls_array[i]))
20622 -                       BUG();
20623 -}
20624 -#endif
20625 -
20626 -#ifndef CONFIG_X86_NO_IDT
20627 -static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
20628 -{
20629 -       __u32 a, b;
20630 -       pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
20631 -       write_idt_entry(idt_table, gate, a, b);
20632 -}
20633 -#endif
20634 -
20635 -#ifndef CONFIG_X86_NO_TSS
20636 -static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
20637 -{
20638 -       __u32 a, b;
20639 -       pack_descriptor(&a, &b, (unsigned long)addr,
20640 -                       offsetof(struct tss_struct, __cacheline_filler) - 1,
20641 -                       DESCTYPE_TSS, 0);
20642 -       write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
20643 -}
20644 -#endif
20645 -
20646 -
20647 -#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
20648 -
20649 -#define LDT_entry_a(info) \
20650 -       ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
20651 -
20652 -#define LDT_entry_b(info) \
20653 -       (((info)->base_addr & 0xff000000) | \
20654 -       (((info)->base_addr & 0x00ff0000) >> 16) | \
20655 -       ((info)->limit & 0xf0000) | \
20656 -       (((info)->read_exec_only ^ 1) << 9) | \
20657 -       ((info)->contents << 10) | \
20658 -       (((info)->seg_not_present ^ 1) << 15) | \
20659 -       ((info)->seg_32bit << 22) | \
20660 -       ((info)->limit_in_pages << 23) | \
20661 -       ((info)->useable << 20) | \
20662 -       0x7000)
20663 -
20664 -#define LDT_empty(info) (\
20665 -       (info)->base_addr       == 0    && \
20666 -       (info)->limit           == 0    && \
20667 -       (info)->contents        == 0    && \
20668 -       (info)->read_exec_only  == 1    && \
20669 -       (info)->seg_32bit       == 0    && \
20670 -       (info)->limit_in_pages  == 0    && \
20671 -       (info)->seg_not_present == 1    && \
20672 -       (info)->useable         == 0    )
20673 -
20674 -static inline void clear_LDT(void)
20675 -{
20676 -       set_ldt(NULL, 0);
20677 -}
20678 -
20679 -/*
20680 - * load one particular LDT into the current CPU
20681 - */
20682 -static inline void load_LDT_nolock(mm_context_t *pc)
20683 -{
20684 -       set_ldt(pc->ldt, pc->size);
20685 -}
20686 -
20687 -static inline void load_LDT(mm_context_t *pc)
20688 -{
20689 -       preempt_disable();
20690 -       load_LDT_nolock(pc);
20691 -       preempt_enable();
20692 -}
20693 -
20694 -static inline unsigned long get_desc_base(unsigned long *desc)
20695 -{
20696 -       unsigned long base;
20697 -       base = ((desc[0] >> 16)  & 0x0000ffff) |
20698 -               ((desc[1] << 16) & 0x00ff0000) |
20699 -               (desc[1] & 0xff000000);
20700 -       return base;
20701 -}
20702 -
20703 -#else /* __ASSEMBLY__ */
20704 -
20705 -/*
20706 - * GET_DESC_BASE reads the descriptor base of the specified segment.
20707 - *
20708 - * Args:
20709 - *    idx - descriptor index
20710 - *    gdt - GDT pointer
20711 - *    base - 32bit register to which the base will be written
20712 - *    lo_w - lo word of the "base" register
20713 - *    lo_b - lo byte of the "base" register
20714 - *    hi_b - hi byte of the low word of the "base" register
20715 - *
20716 - * Example:
20717 - *    GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
20718 - *    Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
20719 - */
20720 -#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
20721 -       movb idx*8+4(gdt), lo_b; \
20722 -       movb idx*8+7(gdt), hi_b; \
20723 -       shll $16, base; \
20724 -       movw idx*8+2(gdt), lo_w;
20725 -
20726 -#endif /* !__ASSEMBLY__ */
20727 -
20728 -#endif
20729 --- a/include/asm-x86/mach-xen/asm/desc_64.h
20730 +++ /dev/null
20731 @@ -1,228 +0,0 @@
20732 -/* Written 2000 by Andi Kleen */
20733 -#ifndef __ARCH_DESC_H
20734 -#define __ARCH_DESC_H
20735 -
20736 -#include <linux/threads.h>
20737 -#include <asm/ldt.h>
20738 -
20739 -#ifndef __ASSEMBLY__
20740 -
20741 -#include <linux/string.h>
20742 -#include <linux/smp.h>
20743 -#include <asm/desc_defs.h>
20744 -
20745 -#include <asm/segment.h>
20746 -#include <asm/mmu.h>
20747 -
20748 -extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
20749 -
20750 -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
20751 -
20752 -#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
20753 -#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
20754 -
20755 -static inline void clear_LDT(void)
20756 -{
20757 -       int cpu = get_cpu();
20758 -
20759 -       /*
20760 -        * NB. We load the default_ldt for lcall7/27 handling on demand, as
20761 -        * it slows down context switching. Noone uses it anyway.
20762 -        */
20763 -       cpu = cpu;              /* XXX avoid compiler warning */
20764 -       xen_set_ldt(NULL, 0);
20765 -       put_cpu();
20766 -}
20767 -
20768 -#ifndef CONFIG_X86_NO_TSS
20769 -static inline unsigned long __store_tr(void)
20770 -{
20771 -       unsigned long tr;
20772 -
20773 -       asm volatile ("str %w0":"=r" (tr));
20774 -       return tr;
20775 -}
20776 -
20777 -#define store_tr(tr) (tr) = __store_tr()
20778 -#endif
20779 -
20780 -/*
20781 - * This is the ldt that every process will get unless we need
20782 - * something other than this.
20783 - */
20784 -extern struct desc_struct default_ldt[];
20785 -#ifndef CONFIG_X86_NO_IDT
20786 -extern struct gate_struct idt_table[];
20787 -#endif
20788 -extern struct desc_ptr cpu_gdt_descr[];
20789 -
20790 -/* the cpu gdt accessor */
20791 -#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
20792 -
20793 -#ifndef CONFIG_XEN
20794 -static inline void load_gdt(const struct desc_ptr *ptr)
20795 -{
20796 -       asm volatile("lgdt %w0"::"m" (*ptr));
20797 -}
20798 -
20799 -static inline void store_gdt(struct desc_ptr *ptr)
20800 -{
20801 -       asm("sgdt %w0":"=m" (*ptr));
20802 -}
20803 -#endif
20804 -
20805 -static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)
20806 -{
20807 -       struct gate_struct s;
20808 -       s.offset_low = PTR_LOW(func);
20809 -       s.segment = __KERNEL_CS;
20810 -       s.ist = ist;
20811 -       s.p = 1;
20812 -       s.dpl = dpl;
20813 -       s.zero0 = 0;
20814 -       s.zero1 = 0;
20815 -       s.type = type;
20816 -       s.offset_middle = PTR_MIDDLE(func);
20817 -       s.offset_high = PTR_HIGH(func);
20818 -       /* does not need to be atomic because it is only done once at setup time */
20819 -       memcpy(adr, &s, 16);
20820 -}
20821 -
20822 -#ifndef CONFIG_X86_NO_IDT
20823 -static inline void set_intr_gate(int nr, void *func)
20824 -{
20825 -       BUG_ON((unsigned)nr > 0xFF);
20826 -       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
20827 -}
20828 -
20829 -static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
20830 -{
20831 -       BUG_ON((unsigned)nr > 0xFF);
20832 -       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
20833 -}
20834 -
20835 -static inline void set_system_gate(int nr, void *func)
20836 -{
20837 -       BUG_ON((unsigned)nr > 0xFF);
20838 -       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
20839 -}
20840 -
20841 -static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
20842 -{
20843 -       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
20844 -}
20845 -
20846 -static inline void load_idt(const struct desc_ptr *ptr)
20847 -{
20848 -       asm volatile("lidt %w0"::"m" (*ptr));
20849 -}
20850 -
20851 -static inline void store_idt(struct desc_ptr *dtr)
20852 -{
20853 -       asm("sidt %w0":"=m" (*dtr));
20854 -}
20855 -#endif
20856 -
20857 -static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type,
20858 -                                        unsigned size)
20859 -{
20860 -       struct ldttss_desc d;
20861 -       memset(&d,0,sizeof(d));
20862 -       d.limit0 = size & 0xFFFF;
20863 -       d.base0 = PTR_LOW(tss);
20864 -       d.base1 = PTR_MIDDLE(tss) & 0xFF;
20865 -       d.type = type;
20866 -       d.p = 1;
20867 -       d.limit1 = (size >> 16) & 0xF;
20868 -       d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF;
20869 -       d.base3 = PTR_HIGH(tss);
20870 -       memcpy(ptr, &d, 16);
20871 -}
20872 -
20873 -#ifndef CONFIG_X86_NO_TSS
20874 -static inline void set_tss_desc(unsigned cpu, void *addr)
20875 -{
20876 -       /*
20877 -        * sizeof(unsigned long) coming from an extra "long" at the end
20878 -        * of the iobitmap. See tss_struct definition in processor.h
20879 -        *
20880 -        * -1? seg base+limit should be pointing to the address of the
20881 -        * last valid byte
20882 -        */
20883 -       set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS],
20884 -               (unsigned long)addr, DESC_TSS,
20885 -               IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
20886 -}
20887 -#endif
20888 -
20889 -static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
20890 -{
20891 -       set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
20892 -                             DESC_LDT, size * 8 - 1);
20893 -}
20894 -
20895 -#define LDT_entry_a(info) \
20896 -       ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
20897 -/* Don't allow setting of the lm bit. It is useless anyways because
20898 -   64bit system calls require __USER_CS. */
20899 -#define LDT_entry_b(info) \
20900 -       (((info)->base_addr & 0xff000000) | \
20901 -       (((info)->base_addr & 0x00ff0000) >> 16) | \
20902 -       ((info)->limit & 0xf0000) | \
20903 -       (((info)->read_exec_only ^ 1) << 9) | \
20904 -       ((info)->contents << 10) | \
20905 -       (((info)->seg_not_present ^ 1) << 15) | \
20906 -       ((info)->seg_32bit << 22) | \
20907 -       ((info)->limit_in_pages << 23) | \
20908 -       ((info)->useable << 20) | \
20909 -       /* ((info)->lm << 21) | */ \
20910 -       0x7000)
20911 -
20912 -#define LDT_empty(info) (\
20913 -       (info)->base_addr       == 0    && \
20914 -       (info)->limit           == 0    && \
20915 -       (info)->contents        == 0    && \
20916 -       (info)->read_exec_only  == 1    && \
20917 -       (info)->seg_32bit       == 0    && \
20918 -       (info)->limit_in_pages  == 0    && \
20919 -       (info)->seg_not_present == 1    && \
20920 -       (info)->useable         == 0    && \
20921 -       (info)->lm              == 0)
20922 -
20923 -static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
20924 -{
20925 -       unsigned int i;
20926 -       u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
20927 -
20928 -       for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20929 -               if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
20930 -                                                t->tls_array[i]))
20931 -                       BUG();
20932 -}
20933 -
20934 -/*
20935 - * load one particular LDT into the current CPU
20936 - */
20937 -static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
20938 -{
20939 -       void *segments = pc->ldt;
20940 -       int count = pc->size;
20941 -
20942 -       if (likely(!count))
20943 -               segments = NULL;
20944 -
20945 -       xen_set_ldt(segments, count);
20946 -}
20947 -
20948 -static inline void load_LDT(mm_context_t *pc)
20949 -{
20950 -       int cpu = get_cpu();
20951 -       load_LDT_nolock(pc, cpu);
20952 -       put_cpu();
20953 -}
20954 -
20955 -extern struct desc_ptr idt_descr;
20956 -
20957 -#endif /* !__ASSEMBLY__ */
20958 -
20959 -#endif
20960 --- a/include/asm-x86/mach-xen/asm/desc.h
20961 +++ b/include/asm-x86/mach-xen/asm/desc.h
20962 @@ -1,5 +1,404 @@
20963 +#ifndef _ASM_DESC_H_
20964 +#define _ASM_DESC_H_
20965 +
20966 +#ifndef __ASSEMBLY__
20967 +#include <asm/desc_defs.h>
20968 +#include <asm/ldt.h>
20969 +#include <asm/mmu.h>
20970 +#include <linux/smp.h>
20971 +
20972 +static inline void fill_ldt(struct desc_struct *desc,
20973 +                           const struct user_desc *info)
20974 +{
20975 +       desc->limit0 = info->limit & 0x0ffff;
20976 +       desc->base0 = info->base_addr & 0x0000ffff;
20977 +       /* Remaining fields: upper base/limit bits plus access and flag bits. */
20978 +       desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
20979 +       desc->type = (info->read_exec_only ^ 1) << 1;
20980 +       desc->type |= info->contents << 2;
20981 +       desc->s = 1;            /* constant, not taken from info */
20982 +       desc->dpl = 0x3;        /* constant, not taken from info */
20983 +       desc->p = info->seg_not_present ^ 1;    /* inverse of seg_not_present */
20984 +       desc->limit = (info->limit & 0xf0000) >> 16;
20985 +       desc->avl = info->useable;
20986 +       desc->d = info->seg_32bit;
20987 +       desc->g = info->limit_in_pages;
20988 +       desc->base2 = (info->base_addr & 0xff000000) >> 24;
20989 +}
20990 +
20991 +#ifndef CONFIG_X86_NO_IDT
20992 +extern struct desc_ptr idt_descr;
20993 +extern gate_desc idt_table[];
20994 +#endif
20995 +
20996 +#ifdef CONFIG_X86_64
20997 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
20998 +extern struct desc_ptr cpu_gdt_descr[];
20999 +/* the cpu gdt accessor */
21000 +#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
21001 +
21002 +static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
21003 +                            unsigned dpl, unsigned ist, unsigned seg)
21004 +{
21005 +       gate->offset_low = PTR_LOW(func);
21006 +       gate->segment = __KERNEL_CS;    /* fixed selector; 'seg' is not consulted */
21007 +       gate->ist = ist;
21008 +       gate->p = 1;
21009 +       gate->dpl = dpl;
21010 +       gate->zero0 = 0;
21011 +       gate->zero1 = 0;
21012 +       gate->type = type;
21013 +       gate->offset_middle = PTR_MIDDLE(func);
21014 +       gate->offset_high = PTR_HIGH(func);
21015 +}
21016 +
21017 +#else
21018 +struct gdt_page {
21019 +       struct desc_struct gdt[GDT_ENTRIES];
21020 +} __attribute__((aligned(PAGE_SIZE)));
21021 +DECLARE_PER_CPU(struct gdt_page, gdt_page);
21022 +
21023 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
21024 +{
21025 +       return per_cpu(gdt_page, cpu).gdt;
21026 +}
21027 +
21028 +static inline void pack_gate(gate_desc *gate, unsigned char type,
21029 +       unsigned long base, unsigned dpl, unsigned flags, unsigned short seg)
21030 +/* Packs seg/base/type/dpl into gate->a and gate->b; 'flags' is unused. */
21031 +{
21032 +       gate->a = (seg << 16) | (base & 0xffff);
21033 +       gate->b = (base & 0xffff0000) |
21034 +                 (((0x80 | type | (dpl << 5)) & 0xff) << 8);
21035 +}
21036 +
21037 +#endif
21038 +
21039 +static inline int desc_empty(const void *ptr)
21040 +{
21041 +       const u32 *desc = ptr;
21042 +       return !(desc[0] | desc[1]);
21043 +}
21044 +
21045 +#ifndef CONFIG_XEN
21046 +#define load_TR_desc() native_load_tr_desc()
21047 +#define load_gdt(dtr) native_load_gdt(dtr)
21048 +#define load_idt(dtr) native_load_idt(dtr)
21049 +#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
21050 +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
21051 +
21052 +#define store_gdt(dtr) native_store_gdt(dtr)
21053 +#define store_idt(dtr) native_store_idt(dtr)
21054 +#define store_tr(tr) (tr = native_store_tr())
21055 +#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
21056 +
21057 +#define load_TLS(t, cpu) native_load_tls(t, cpu)
21058 +#define set_ldt native_set_ldt
21059 +
21060 +#define write_ldt_entry(dt, entry, desc) \
21061 +                               native_write_ldt_entry(dt, entry, desc)
21062 +#define write_gdt_entry(dt, entry, desc, type) \
21063 +                               native_write_gdt_entry(dt, entry, desc, type)
21064 +#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
21065 +
21066 +static inline void native_write_idt_entry(gate_desc *idt, int entry,
21067 +                                         const gate_desc *gate)
21068 +{
21069 +       memcpy(&idt[entry], gate, sizeof(*gate));
21070 +}
21071 +
21072 +static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
21073 +                                         const void *desc)
21074 +{
21075 +       memcpy(&ldt[entry], desc, 8);
21076 +}
21077 +
21078 +static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
21079 +                                         const void *desc, int type)
21080 +{
21081 +       unsigned int size;
21082 +       switch (type) {
21083 +       case DESC_TSS:
21084 +               size = sizeof(tss_desc);
21085 +               break;
21086 +       case DESC_LDT:
21087 +               size = sizeof(ldt_desc);
21088 +               break;
21089 +       default:
21090 +               size = sizeof(struct desc_struct);
21091 +               break;
21092 +       }
21093 +       memcpy(&gdt[entry], desc, size);
21094 +}
21095 +#endif
21096 +
21097 +static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
21098 +                                  unsigned long limit, unsigned char type,
21099 +                                  unsigned char flags)
21100 +{
21101 +       desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
21102 +       desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
21103 +                 (limit & 0x000f0000) | ((type & 0xff) << 8) |
21104 +                 ((flags & 0xf) << 20);
21105 +       desc->p = 1;    /* set unconditionally, whatever 'type' carried */
21106 +}
21107 +
21108 +
21109 +#ifndef CONFIG_XEN
21110 +static inline void set_tssldt_descriptor(void *d, unsigned long addr,
21111 +                                        unsigned type, unsigned size)
21112 +{
21113 +#ifdef CONFIG_X86_64
21114 +       struct ldttss_desc64 *desc = d;
21115 +       memset(desc, 0, sizeof(*desc));
21116 +       desc->limit0 = size & 0xFFFF;
21117 +       desc->base0 = PTR_LOW(addr);
21118 +       desc->base1 = PTR_MIDDLE(addr) & 0xFF;
21119 +       desc->type = type;
21120 +       desc->p = 1;
21121 +       desc->limit1 = (size >> 16) & 0xF;
21122 +       desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
21123 +       desc->base3 = PTR_HIGH(addr);
21124 +#else
21125 +
21126 +       pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
21127 +#endif
21128 +}
21129 +
21130 +static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
21131 +{
21132 +       struct desc_struct *d = get_cpu_gdt_table(cpu);
21133 +       tss_desc tss;
21134 +
21135 +       /*
21136 +        * sizeof(unsigned long) coming from an extra "long" at the end
21137 +        * of the iobitmap. See tss_struct definition in processor.h
21138 +        *
21139 +        * -1? seg base+limit should be pointing to the address of the
21140 +        * last valid byte
21141 +        */
21142 +       set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
21143 +               IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
21144 +       write_gdt_entry(d, entry, &tss, DESC_TSS);
21145 +}
21146 +
21147 +#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
21148 +
21149 +static inline void native_set_ldt(const void *addr, unsigned int entries)
21150 +{
21151 +       if (likely(entries == 0))
21152 +               __asm__ __volatile__("lldt %w0"::"q" (0));
21153 +       else {
21154 +               ldt_desc ldt;
21155 +
21156 +               /* Limit is in 8-byte LDT slots, not sizeof(ldt_desc). */
21157 +               set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
21158 +                                     entries * LDT_ENTRY_SIZE - 1);
21159 +               write_gdt_entry(get_cpu_gdt_table(smp_processor_id()),
21160 +                               GDT_ENTRY_LDT, &ldt, DESC_LDT);
21161 +               __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
21162 +       }
21163 +}
21164 +
21165 +static inline void native_load_tr_desc(void)
21166 +{
21167 +       asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
21168 +}
21169 +
21170 +static inline void native_load_gdt(const struct desc_ptr *dtr)
21171 +{
21172 +       asm volatile("lgdt %0"::"m" (*dtr));
21173 +}
21174 +
21175 +static inline void native_load_idt(const struct desc_ptr *dtr)
21176 +{
21177 +       asm volatile("lidt %0"::"m" (*dtr));
21178 +}
21179 +
21180 +static inline void native_store_gdt(struct desc_ptr *dtr)
21181 +{
21182 +       asm volatile("sgdt %0":"=m" (*dtr));
21183 +}
21184 +
21185 +static inline void native_store_idt(struct desc_ptr *dtr)
21186 +{
21187 +       asm volatile("sidt %0":"=m" (*dtr));
21188 +}
21189 +
21190 +static inline unsigned long native_store_tr(void)
21191 +{
21192 +       unsigned long tr;
21193 +       asm volatile("str %0":"=r" (tr));
21194 +       return tr;
21195 +}
21196 +
21197 +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
21198 +{
21199 +       unsigned int i;
21200 +       struct desc_struct *gdt = get_cpu_gdt_table(cpu);
21201 +
21202 +       for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
21203 +               gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
21204 +}
21205 +#else
21206 +#define load_TLS(t, cpu) xen_load_tls(t, cpu)
21207 +#define set_ldt xen_set_ldt
21208 +
21209 +extern int write_ldt_entry(struct desc_struct *ldt, int entry,
21210 +                          const void *desc);
21211 +extern int write_gdt_entry(struct desc_struct *gdt, int entry,
21212 +                          const void *desc, int type);
21213 +
21214 +static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
21215 +{
21216 +       unsigned int i;
21217 +       struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
21218 +
21219 +       for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
21220 +               if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
21221 +                                                *(u64 *)&t->tls_array[i]))
21222 +                       BUG();
21223 +}
21224 +#endif
21225 +
21226 +#define _LDT_empty(info) (\
21227 +       (info)->base_addr       == 0    && \
21228 +       (info)->limit           == 0    && \
21229 +       (info)->contents        == 0    && \
21230 +       (info)->read_exec_only  == 1    && \
21231 +       (info)->seg_32bit       == 0    && \
21232 +       (info)->limit_in_pages  == 0    && \
21233 +       (info)->seg_not_present == 1    && \
21234 +       (info)->useable         == 0)
21235 +
21236 +#ifdef CONFIG_X86_64
21237 +#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
21238 +#else
21239 +#define LDT_empty(info) (_LDT_empty(info))
21240 +#endif
21241 +
21242 +static inline void clear_LDT(void)
21243 +{
21244 +       set_ldt(NULL, 0);
21245 +}
21246 +
21247 +/*
21248 + * load one particular LDT into the current CPU
21249 + */
21250 +static inline void load_LDT_nolock(mm_context_t *pc)
21251 +{
21252 +       set_ldt(pc->ldt, pc->size);
21253 +}
21254 +
21255 +static inline void load_LDT(mm_context_t *pc)
21256 +{
21257 +       preempt_disable();
21258 +       load_LDT_nolock(pc);
21259 +       preempt_enable();
21260 +}
21261 +
21262 +static inline unsigned long get_desc_base(const struct desc_struct *desc)
21263 +{      /* u32 casts: a promoted-int shift must not sign-extend the result */
21264 +       return desc->base0 | ((u32)desc->base1 << 16) | ((u32)desc->base2 << 24);
21265 +}
21266 +
21267 +static inline unsigned long get_desc_limit(const struct desc_struct *desc)
21268 +{
21269 +       return desc->limit0 | (desc->limit << 16);
21270 +}
21271 +
21272 +#ifndef CONFIG_X86_NO_IDT
21273 +static inline void _set_gate(int gate, unsigned type, void *addr,
21274 +                             unsigned dpl, unsigned ist, unsigned seg)
21275 +{
21276 +       gate_desc s;
21277 +       pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
21278 +       /*
21279 +        * does not need to be atomic because it is only done once at
21280 +        * setup time
21281 +        */
21282 +       write_idt_entry(idt_table, gate, &s);
21283 +}
21284 +
21285 +/*
21286 + * This needs to use 'idt_table' rather than 'idt', and
21287 + * thus use the _nonmapped_ version of the IDT, as the
21288 + * Pentium F0 0F bugfix can have resulted in the mapped
21289 + * IDT being write-protected.
21290 + */
21291 +static inline void set_intr_gate(unsigned int n, void *addr)
21292 +{
21293 +       BUG_ON((unsigned)n > 0xFF);
21294 +       _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
21295 +}
21296 +
21297 +/*
21298 + * This routine sets up an interrupt gate at descriptor privilege level 3.
21299 + */
21300 +static inline void set_system_intr_gate(unsigned int n, void *addr)
21301 +{
21302 +       BUG_ON((unsigned)n > 0xFF);
21303 +       _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
21304 +}
21305 +
21306 +static inline void set_trap_gate(unsigned int n, void *addr)
21307 +{
21308 +       BUG_ON((unsigned)n > 0xFF);
21309 +       _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
21310 +}
21311 +
21312 +static inline void set_system_gate(unsigned int n, void *addr)
21313 +{
21314 +       BUG_ON((unsigned)n > 0xFF);
21315  #ifdef CONFIG_X86_32
21316 -# include "desc_32.h"
21317 +       _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
21318 +#else
21319 +       _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
21320 +#endif
21321 +}
21322 +
21323 +static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
21324 +{
21325 +       BUG_ON((unsigned)n > 0xFF);
21326 +       _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
21327 +}
21328 +
21329 +static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
21330 +{
21331 +       BUG_ON((unsigned)n > 0xFF);
21332 +       _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
21333 +}
21334 +
21335 +static inline void set_system_gate_ist(int n, void *addr, unsigned ist)
21336 +{
21337 +       BUG_ON((unsigned)n > 0xFF);
21338 +       _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
21339 +}
21340 +#endif
21341 +
21342  #else
21343 -# include "desc_64.h"
21344 +/*
21345 + * GET_DESC_BASE reads the descriptor base of the specified segment.
21346 + *
21347 + * Args:
21348 + *    idx - descriptor index
21349 + *    gdt - GDT pointer
21350 + *    base - 32bit register to which the base will be written
21351 + *    lo_w - lo word of the "base" register
21352 + *    lo_b - lo byte of the "base" register
21353 + *    hi_b - hi byte of the low word of the "base" register
21354 + *
21355 + * Example:
21356 + *    GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
21357 + *    Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
21358 + */
21359 +#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
21360 +       movb idx*8+4(gdt), lo_b; \
21361 +       movb idx*8+7(gdt), hi_b; \
21362 +       shll $16, base; \
21363 +       movw idx*8+2(gdt), lo_w;
21364 +
21365 +
21366 +#endif /* __ASSEMBLY__ */
21367 +
21368  #endif
21369 --- a/include/asm-x86/mach-xen/asm/dma-mapping_32.h
21370 +++ b/include/asm-x86/mach-xen/asm/dma-mapping_32.h
21371 @@ -84,23 +84,13 @@ dma_sync_single_range_for_device(struct
21372         dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
21373  }
21374
21375 -static inline void
21376 +extern void
21377  dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
21378 -                   enum dma_data_direction direction)
21379 -{
21380 -       if (swiotlb)
21381 -               swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
21382 -       flush_write_buffers();
21383 -}
21384 +                   enum dma_data_direction direction);
21385
21386 -static inline void
21387 +extern void
21388  dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
21389 -                   enum dma_data_direction direction)
21390 -{
21391 -       if (swiotlb)
21392 -               swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
21393 -       flush_write_buffers();
21394 -}
21395 +                   enum dma_data_direction direction);
21396
21397  extern int
21398  dma_mapping_error(dma_addr_t dma_addr);
21399 --- a/include/asm-x86/mach-xen/asm/fixmap_32.h
21400 +++ b/include/asm-x86/mach-xen/asm/fixmap_32.h
21401 @@ -64,7 +64,7 @@ enum fixed_addresses {
21402  #endif
21403  #ifdef CONFIG_X86_VISWS_APIC
21404         FIX_CO_CPU,     /* Cobalt timer */
21405 -       FIX_CO_APIC,    /* Cobalt APIC Redirection Table */
21406 +       FIX_CO_APIC,    /* Cobalt APIC Redirection Table */
21407         FIX_LI_PCIA,    /* Lithium PCI Bridge A */
21408         FIX_LI_PCIB,    /* Lithium PCI Bridge B */
21409  #endif
21410 @@ -73,7 +73,7 @@ enum fixed_addresses {
21411  #endif
21412  #ifdef CONFIG_X86_CYCLONE_TIMER
21413         FIX_CYCLONE_TIMER, /*cyclone timer register*/
21414 -#endif
21415 +#endif
21416  #ifdef CONFIG_HIGHMEM
21417         FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
21418         FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
21419 @@ -93,11 +93,23 @@ enum fixed_addresses {
21420         FIX_ISAMAP_END,
21421         FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
21422         __end_of_permanent_fixed_addresses,
21423 -       /* temporary boot-time mappings, used before ioremap() is functional */
21424 -#define NR_FIX_BTMAPS  16
21425 -       FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
21426 -       FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
21427 +       /*
21428 +        * 256 temporary boot-time mappings, used by early_ioremap(),
21429 +        * before ioremap() is functional.
21430 +        *
21431 +        * We round it up to the next 512-page boundary so that we
21432 +        * can have a single pgd entry and a single pte table:
21433 +        */
21434 +#define NR_FIX_BTMAPS          64
21435 +#define FIX_BTMAPS_NESTING     4
21436 +       FIX_BTMAP_END =
21437 +               __end_of_permanent_fixed_addresses + 512 -
21438 +                       (__end_of_permanent_fixed_addresses & 511),
21439 +       FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
21440         FIX_WP_TEST,
21441 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
21442 +       FIX_OHCI1394_BASE,
21443 +#endif
21444         __end_of_fixed_addresses
21445  };
21446
21447 --- a/include/asm-x86/mach-xen/asm/fixmap_64.h
21448 +++ b/include/asm-x86/mach-xen/asm/fixmap_64.h
21449 @@ -15,6 +15,7 @@
21450  #include <asm/apicdef.h>
21451  #include <asm/page.h>
21452  #include <asm/vsyscall.h>
21453 +#include <asm/efi.h>
21454  #include <asm/acpi.h>
21455
21456  /*
21457 @@ -46,6 +47,10 @@ enum fixed_addresses {
21458         FIX_IO_APIC_BASE_0,
21459         FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
21460  #endif
21461 +#ifdef CONFIG_EFI
21462 +       FIX_EFI_IO_MAP_LAST_PAGE,
21463 +       FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1,
21464 +#endif
21465  #ifdef CONFIG_ACPI
21466         FIX_ACPI_BEGIN,
21467         FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
21468 @@ -55,10 +60,22 @@ enum fixed_addresses {
21469         FIX_ISAMAP_END,
21470         FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
21471         __end_of_permanent_fixed_addresses,
21472 -       /* temporary boot-time mappings, used before ioremap() is functional */
21473 -#define NR_FIX_BTMAPS  16
21474 -       FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
21475 -       FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
21476 +       /*
21477 +        * 256 temporary boot-time mappings, used by early_ioremap(),
21478 +        * before ioremap() is functional.
21479 +        *
21480 +        * We round it up to the next 512-page boundary so that we
21481 +        * can have a single pgd entry and a single pte table:
21482 +        */
21483 +#define NR_FIX_BTMAPS          64
21484 +#define FIX_BTMAPS_NESTING     4
21485 +       FIX_BTMAP_END =
21486 +               __end_of_permanent_fixed_addresses + 512 -
21487 +                       (__end_of_permanent_fixed_addresses & 511),
21488 +       FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
21489 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
21490 +       FIX_OHCI1394_BASE,
21491 +#endif
21492         __end_of_fixed_addresses
21493  };
21494
21495 --- a/include/asm-x86/mach-xen/asm/highmem.h
21496 +++ b/include/asm-x86/mach-xen/asm/highmem.h
21497 @@ -37,11 +37,6 @@ extern pte_t *pkmap_page_table;
21498   * easily, subsequent pte tables have to be allocated in one physical
21499   * chunk of RAM.
21500   */
21501 -#ifdef CONFIG_X86_PAE
21502 -#define LAST_PKMAP 512
21503 -#else
21504 -#define LAST_PKMAP 1024
21505 -#endif
21506  /*
21507   * Ordering is:
21508   *
21509 @@ -57,13 +52,12 @@ extern pte_t *pkmap_page_table;
21510   * VMALLOC_START
21511   * high_memory
21512   */
21513 -#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
21514  #define LAST_PKMAP_MASK (LAST_PKMAP-1)
21515  #define PKMAP_NR(virt)  ((virt-PKMAP_BASE) >> PAGE_SHIFT)
21516  #define PKMAP_ADDR(nr)  (PKMAP_BASE + ((nr) << PAGE_SHIFT))
21517
21518 -extern void * FASTCALL(kmap_high(struct page *page));
21519 -extern void FASTCALL(kunmap_high(struct page *page));
21520 +extern void *kmap_high(struct page *page);
21521 +extern void kunmap_high(struct page *page);
21522
21523  void *kmap(struct page *page);
21524  void kunmap(struct page *page);
21525 --- a/include/asm-x86/mach-xen/asm/hypervisor.h
21526 +++ b/include/asm-x86/mach-xen/asm/hypervisor.h
21527 @@ -264,6 +264,25 @@ HYPERVISOR_poll(
21528         return rc;
21529  }
21530
21531 +static inline int __must_check
21532 +HYPERVISOR_poll_no_timeout(
21533 +       evtchn_port_t *ports, unsigned int nr_ports)
21534 +{
21535 +       int rc;
21536 +       struct sched_poll sched_poll = {
21537 +               .nr_ports = nr_ports
21538 +       };
21539 +       set_xen_guest_handle(sched_poll.ports, ports);
21540 +
21541 +       rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
21542 +#if CONFIG_XEN_COMPAT <= 0x030002
21543 +       if (rc == -ENOSYS)
21544 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
21545 +#endif
21546 +
21547 +       return rc;
21548 +}
21549 +
21550  #ifdef CONFIG_XEN
21551
21552  static inline void
21553 --- a/include/asm-x86/mach-xen/asm/io_32.h
21554 +++ b/include/asm-x86/mach-xen/asm/io_32.h
21555 @@ -113,8 +113,6 @@ static inline void * phys_to_virt(unsign
21556          ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
21557           bvec_to_pseudophys((vec2))))
21558
21559 -extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
21560 -
21561  /**
21562   * ioremap     -   map bus memory into CPU space
21563   * @offset:    bus address of the memory
21564 @@ -124,32 +122,39 @@ extern void __iomem * __ioremap(unsigned
21565   * make bus memory CPU accessible via the readb/readw/readl/writeb/
21566   * writew/writel functions and the other mmio helpers. The returned
21567   * address is not guaranteed to be usable directly as a virtual
21568 - * address.
21569 + * address.
21570   *
21571   * If the area you are trying to map is a PCI BAR you should have a
21572   * look at pci_iomap().
21573   */
21574 +extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
21575 +extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
21576
21577 -static inline void __iomem * ioremap(unsigned long offset, unsigned long size)
21578 +/*
21579 + * The default ioremap() behavior is non-cached:
21580 + */
21581 +static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
21582  {
21583 -       return __ioremap(offset, size, 0);
21584 +       return ioremap_nocache(offset, size);
21585  }
21586
21587 -extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size);
21588  extern void iounmap(volatile void __iomem *addr);
21589
21590  /*
21591 - * bt_ioremap() and bt_iounmap() are for temporary early boot-time
21592 + * early_ioremap() and early_iounmap() are for temporary early boot-time
21593   * mappings, before the real ioremap() is functional.
21594   * A boot-time mapping is currently limited to at most 16 pages.
21595   */
21596 -extern void *bt_ioremap(unsigned long offset, unsigned long size);
21597 -extern void bt_iounmap(void *addr, unsigned long size);
21598 +extern void early_ioremap_init(void);
21599 +extern void early_ioremap_clear(void);
21600 +extern void early_ioremap_reset(void);
21601 +extern void *early_ioremap(unsigned long offset, unsigned long size);
21602 +extern void early_iounmap(void *addr, unsigned long size);
21603  extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
21604
21605  /* Use early IO mappings for DMI because it's initialized early */
21606 -#define dmi_ioremap bt_ioremap
21607 -#define dmi_iounmap bt_iounmap
21608 +#define dmi_ioremap early_ioremap
21609 +#define dmi_iounmap early_iounmap
21610  #define dmi_alloc alloc_bootmem
21611
21612  /*
21613 @@ -263,43 +268,21 @@ static inline void flush_write_buffers(v
21614
21615  #endif /* __KERNEL__ */
21616
21617 -static inline void xen_io_delay(void)
21618 -{
21619 -       asm volatile("outb %%al,$0x80" : : : "memory");
21620 -}
21621 +extern void xen_io_delay(void);
21622 +#define native_io_delay xen_io_delay
21623 +
21624 +extern int io_delay_type;
21625 +extern void io_delay_init(void);
21626
21627  static inline void slow_down_io(void) {
21628 -       xen_io_delay();
21629 +       native_io_delay();
21630  #ifdef REALLY_SLOW_IO
21631 -       xen_io_delay();
21632 -       xen_io_delay();
21633 -       xen_io_delay();
21634 +       native_io_delay();
21635 +       native_io_delay();
21636 +       native_io_delay();
21637  #endif
21638  }
21639
21640 -#ifdef CONFIG_X86_NUMAQ
21641 -extern void *xquad_portio;    /* Where the IO area was mapped */
21642 -#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
21643 -#define __BUILDIO(bwl,bw,type) \
21644 -static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \
21645 -       if (xquad_portio) \
21646 -               write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \
21647 -       else \
21648 -               out##bwl##_local(value, port); \
21649 -} \
21650 -static inline void out##bwl(unsigned type value, int port) { \
21651 -       out##bwl##_quad(value, port, 0); \
21652 -} \
21653 -static inline unsigned type in##bwl##_quad(int port, int quad) { \
21654 -       if (xquad_portio) \
21655 -               return read##bwl(XQUAD_PORT_ADDR(port, quad)); \
21656 -       else \
21657 -               return in##bwl##_local(port); \
21658 -} \
21659 -static inline unsigned type in##bwl(int port) { \
21660 -       return in##bwl##_quad(port, 0); \
21661 -}
21662 -#else
21663  #define __BUILDIO(bwl,bw,type) \
21664  static inline void out##bwl(unsigned type value, int port) { \
21665         out##bwl##_local(value, port); \
21666 @@ -307,8 +290,6 @@ static inline void out##bwl(unsigned typ
21667  static inline unsigned type in##bwl(int port) { \
21668         return in##bwl##_local(port); \
21669  }
21670 -#endif
21671 -
21672
21673  #define BUILDIO(bwl,bw,type) \
21674  static inline void out##bwl##_local(unsigned type value, int port) { \
21675 --- a/include/asm-x86/mach-xen/asm/io_64.h
21676 +++ b/include/asm-x86/mach-xen/asm/io_64.h
21677 @@ -36,13 +36,21 @@
21678    *  - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
21679    */
21680
21681 -#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
21682 +extern void xen_io_delay(void);
21683 +#define native_io_delay xen_io_delay
21684
21685 +extern int io_delay_type;
21686 +extern void io_delay_init(void);
21687 +
21688 +static inline void slow_down_io(void)
21689 +{
21690 +       native_io_delay();
21691  #ifdef REALLY_SLOW_IO
21692 -#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
21693 -#else
21694 -#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
21695 +       native_io_delay();
21696 +       native_io_delay();
21697 +       native_io_delay();
21698  #endif
21699 +}
21700
21701  /*
21702   * Talk about misusing macros..
21703 @@ -53,9 +61,15 @@ static inline void out##s(unsigned x val
21704  #define __OUT2(s,s1,s2) \
21705  __asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
21706
21707 +#ifndef REALLY_SLOW_IO
21708 +#define REALLY_SLOW_IO
21709 +#define UNSET_REALLY_SLOW_IO
21710 +#endif
21711 +
21712  #define __OUT(s,s1,x) \
21713  __OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
21714 -__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \
21715 +__OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
21716 +               slow_down_io(); }
21717
21718  #define __IN1(s) \
21719  static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
21720 @@ -64,8 +78,13 @@ static inline RETURN_TYPE in##s(unsigned
21721  __asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
21722
21723  #define __IN(s,s1,i...) \
21724 -__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
21725 -__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
21726 +__IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); return _v; } \
21727 +__IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i);          \
21728 +                               slow_down_io(); return _v; }
21729 +
21730 +#ifdef UNSET_REALLY_SLOW_IO
21731 +#undef REALLY_SLOW_IO
21732 +#endif
21733
21734  #define __INS(s) \
21735  static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
21736 @@ -143,25 +162,30 @@ static inline void * phys_to_virt(unsign
21737
21738  #include <asm-generic/iomap.h>
21739
21740 -extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags);
21741 -
21742 -static inline void __iomem * ioremap (unsigned long offset, unsigned long size)
21743 -{
21744 -       return __ioremap(offset, size, 0);
21745 -}
21746 -
21747 -extern void *bt_ioremap(unsigned long addr, unsigned long size);
21748 -extern void bt_iounmap(void *addr, unsigned long size);
21749 -#define early_ioremap bt_ioremap
21750 -#define early_iounmap bt_iounmap
21751 +extern void early_ioremap_init(void);
21752 +extern void early_ioremap_clear(void);
21753 +extern void early_ioremap_reset(void);
21754 +extern void *early_ioremap(unsigned long addr, unsigned long size);
21755 +extern void early_iounmap(void *addr, unsigned long size);
21756
21757  /*
21758   * This one maps high address device memory and turns off caching for that area.
21759   * it's useful if some control registers are in such an area and write combining
21760   * or read caching is not desirable:
21761   */
21762 -extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
21763 +extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
21764 +extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
21765 +
21766 +/*
21767 + * The default ioremap() behavior is non-cached:
21768 + */
21769 +static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
21770 +{
21771 +       return ioremap_nocache(offset, size);
21772 +}
21773 +
21774  extern void iounmap(volatile void __iomem *addr);
21775 +
21776  extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
21777
21778  /*
21779 --- a/include/asm-x86/mach-xen/asm/irqflags_32.h
21780 +++ /dev/null
21781 @@ -1,212 +0,0 @@
21782 -/*
21783 - * include/asm-i386/irqflags.h
21784 - *
21785 - * IRQ flags handling
21786 - *
21787 - * This file gets included from lowlevel asm headers too, to provide
21788 - * wrapped versions of the local_irq_*() APIs, based on the
21789 - * raw_local_irq_*() functions from the lowlevel headers.
21790 - */
21791 -#ifndef _ASM_IRQFLAGS_H
21792 -#define _ASM_IRQFLAGS_H
21793 -
21794 -#ifndef __ASSEMBLY__
21795 -#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
21796 -
21797 -#define xen_restore_fl(f)                                      \
21798 -do {                                                           \
21799 -       vcpu_info_t *_vcpu;                                     \
21800 -       barrier();                                              \
21801 -       _vcpu = current_vcpu_info();                            \
21802 -       if ((_vcpu->evtchn_upcall_mask = (f)) == 0) {           \
21803 -               barrier(); /* unmask then check (avoid races) */\
21804 -               if (unlikely(_vcpu->evtchn_upcall_pending))     \
21805 -                       force_evtchn_callback();                \
21806 -       }                                                       \
21807 -} while (0)
21808 -
21809 -#define xen_irq_disable()                                      \
21810 -do {                                                           \
21811 -       current_vcpu_info()->evtchn_upcall_mask = 1;            \
21812 -       barrier();                                              \
21813 -} while (0)
21814 -
21815 -#define xen_irq_enable()                                       \
21816 -do {                                                           \
21817 -       vcpu_info_t *_vcpu;                                     \
21818 -       barrier();                                              \
21819 -       _vcpu = current_vcpu_info();                            \
21820 -       _vcpu->evtchn_upcall_mask = 0;                          \
21821 -       barrier(); /* unmask then check (avoid races) */        \
21822 -       if (unlikely(_vcpu->evtchn_upcall_pending))             \
21823 -               force_evtchn_callback();                        \
21824 -} while (0)
21825 -
21826 -void xen_safe_halt(void);
21827 -
21828 -void xen_halt(void);
21829 -
21830 -/*
21831 - * The use of 'barrier' in the following reflects their use as local-lock
21832 - * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
21833 - * critical operations are executed. All critical operations must complete
21834 - * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
21835 - * includes these barriers, for example.
21836 - */
21837 -
21838 -#define __raw_local_save_flags() xen_save_fl()
21839 -
21840 -#define raw_local_irq_restore(flags) xen_restore_fl(flags)
21841 -
21842 -#define raw_local_irq_disable()        xen_irq_disable()
21843 -
21844 -#define raw_local_irq_enable() xen_irq_enable()
21845 -
21846 -/*
21847 - * Used in the idle loop; sti takes one instruction cycle
21848 - * to complete:
21849 - */
21850 -static inline void raw_safe_halt(void)
21851 -{
21852 -       xen_safe_halt();
21853 -}
21854 -
21855 -/*
21856 - * Used when interrupts are already enabled or to
21857 - * shutdown the processor:
21858 - */
21859 -static inline void halt(void)
21860 -{
21861 -       xen_halt();
21862 -}
21863 -
21864 -/*
21865 - * For spinlocks, etc:
21866 - */
21867 -#define __raw_local_irq_save()                                         \
21868 -({                                                                     \
21869 -       unsigned long flags = __raw_local_save_flags();                 \
21870 -                                                                       \
21871 -       raw_local_irq_disable();                                        \
21872 -                                                                       \
21873 -       flags;                                                          \
21874 -})
21875 -
21876 -#else
21877 -/* Offsets into shared_info_t. */
21878 -#define evtchn_upcall_pending          /* 0 */
21879 -#define evtchn_upcall_mask             1
21880 -
21881 -#define sizeof_vcpu_shift              6
21882 -
21883 -#ifdef CONFIG_SMP
21884 -#define GET_VCPU_INFO          movl TI_cpu(%ebp),%esi                  ; \
21885 -                               shl  $sizeof_vcpu_shift,%esi            ; \
21886 -                               addl HYPERVISOR_shared_info,%esi
21887 -#else
21888 -#define GET_VCPU_INFO          movl HYPERVISOR_shared_info,%esi
21889 -#endif
21890 -
21891 -#define __DISABLE_INTERRUPTS   movb $1,evtchn_upcall_mask(%esi)
21892 -#define __ENABLE_INTERRUPTS    movb $0,evtchn_upcall_mask(%esi)
21893 -#define __TEST_PENDING         testb $0xFF,evtchn_upcall_pending(%esi)
21894 -#define DISABLE_INTERRUPTS(clb)        GET_VCPU_INFO                           ; \
21895 -                               __DISABLE_INTERRUPTS
21896 -#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO                           ; \
21897 -                               __ENABLE_INTERRUPTS
21898 -#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS                  ; \
21899 -sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/            ; \
21900 -       __TEST_PENDING                                                  ; \
21901 -       jnz  14f        /* process more events if necessary... */       ; \
21902 -       movl PT_ESI(%esp), %esi                                         ; \
21903 -       sysexit                                                         ; \
21904 -14:    __DISABLE_INTERRUPTS                                            ; \
21905 -       TRACE_IRQS_OFF                                                  ; \
21906 -sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/              ; \
21907 -       push %esp                                                       ; \
21908 -       call evtchn_do_upcall                                           ; \
21909 -       add  $4,%esp                                                    ; \
21910 -       jmp  ret_from_intr
21911 -#define INTERRUPT_RETURN       iret
21912 -#endif /* __ASSEMBLY__ */
21913 -
21914 -#ifndef __ASSEMBLY__
21915 -#define raw_local_save_flags(flags) \
21916 -               do { (flags) = __raw_local_save_flags(); } while (0)
21917 -
21918 -#define raw_local_irq_save(flags) \
21919 -               do { (flags) = __raw_local_irq_save(); } while (0)
21920 -
21921 -static inline int raw_irqs_disabled_flags(unsigned long flags)
21922 -{
21923 -       return (flags != 0);
21924 -}
21925 -
21926 -#define raw_irqs_disabled()                                            \
21927 -({                                                                     \
21928 -       unsigned long flags = __raw_local_save_flags();                 \
21929 -                                                                       \
21930 -       raw_irqs_disabled_flags(flags);                                 \
21931 -})
21932 -
21933 -/*
21934 - * makes the traced hardirq state match with the machine state
21935 - *
21936 - * should be a rarely used function, only in places where its
21937 - * otherwise impossible to know the irq state, like in traps.
21938 - */
21939 -static inline void trace_hardirqs_fixup_flags(unsigned long flags)
21940 -{
21941 -       if (raw_irqs_disabled_flags(flags))
21942 -               trace_hardirqs_off();
21943 -       else
21944 -               trace_hardirqs_on();
21945 -}
21946 -
21947 -#define trace_hardirqs_fixup() \
21948 -       trace_hardirqs_fixup_flags(__raw_local_save_flags())
21949 -#endif /* __ASSEMBLY__ */
21950 -
21951 -/*
21952 - * Do the CPU's IRQ-state tracing from assembly code. We call a
21953 - * C function, so save all the C-clobbered registers:
21954 - */
21955 -#ifdef CONFIG_TRACE_IRQFLAGS
21956 -
21957 -# define TRACE_IRQS_ON                         \
21958 -       pushl %eax;                             \
21959 -       pushl %ecx;                             \
21960 -       pushl %edx;                             \
21961 -       call trace_hardirqs_on;                 \
21962 -       popl %edx;                              \
21963 -       popl %ecx;                              \
21964 -       popl %eax;
21965 -
21966 -# define TRACE_IRQS_OFF                                \
21967 -       pushl %eax;                             \
21968 -       pushl %ecx;                             \
21969 -       pushl %edx;                             \
21970 -       call trace_hardirqs_off;                \
21971 -       popl %edx;                              \
21972 -       popl %ecx;                              \
21973 -       popl %eax;
21974 -
21975 -#else
21976 -# define TRACE_IRQS_ON
21977 -# define TRACE_IRQS_OFF
21978 -#endif
21979 -
21980 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
21981 -# define LOCKDEP_SYS_EXIT                      \
21982 -       pushl %eax;                             \
21983 -       pushl %ecx;                             \
21984 -       pushl %edx;                             \
21985 -       call lockdep_sys_exit;                  \
21986 -       popl %edx;                              \
21987 -       popl %ecx;                              \
21988 -       popl %eax;
21989 -#else
21990 -# define LOCKDEP_SYS_EXIT
21991 -#endif
21992 -
21993 -#endif
21994 --- a/include/asm-x86/mach-xen/asm/irqflags_64.h
21995 +++ /dev/null
21996 @@ -1,178 +0,0 @@
21997 -/*
21998 - * include/asm-x86_64/irqflags.h
21999 - *
22000 - * IRQ flags handling
22001 - *
22002 - * This file gets included from lowlevel asm headers too, to provide
22003 - * wrapped versions of the local_irq_*() APIs, based on the
22004 - * raw_local_irq_*() functions from the lowlevel headers.
22005 - */
22006 -#ifndef _ASM_IRQFLAGS_H
22007 -#define _ASM_IRQFLAGS_H
22008 -#include <asm/processor-flags.h>
22009 -
22010 -#ifndef __ASSEMBLY__
22011 -/*
22012 - * Interrupt control:
22013 - */
22014 -
22015 -/*
22016 - * The use of 'barrier' in the following reflects their use as local-lock
22017 - * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
22018 - * critical operations are executed. All critical operations must complete
22019 - * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
22020 - * includes these barriers, for example.
22021 - */
22022 -
22023 -#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
22024 -
22025 -#define raw_local_save_flags(flags) \
22026 -               do { (flags) = __raw_local_save_flags(); } while (0)
22027 -
22028 -#define raw_local_irq_restore(x)                                       \
22029 -do {                                                                   \
22030 -       vcpu_info_t *_vcpu;                                             \
22031 -       barrier();                                                      \
22032 -       _vcpu = current_vcpu_info();            \
22033 -       if ((_vcpu->evtchn_upcall_mask = (x)) == 0) {                   \
22034 -               barrier(); /* unmask then check (avoid races) */        \
22035 -               if ( unlikely(_vcpu->evtchn_upcall_pending) )           \
22036 -                       force_evtchn_callback();                        \
22037 -       }                                                               \
22038 -} while (0)
22039 -
22040 -#ifdef CONFIG_X86_VSMP
22041 -
22042 -/*
22043 - * Interrupt control for the VSMP architecture:
22044 - */
22045 -
22046 -static inline void raw_local_irq_disable(void)
22047 -{
22048 -       unsigned long flags = __raw_local_save_flags();
22049 -
22050 -       raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
22051 -}
22052 -
22053 -static inline void raw_local_irq_enable(void)
22054 -{
22055 -       unsigned long flags = __raw_local_save_flags();
22056 -
22057 -       raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
22058 -}
22059 -
22060 -static inline int raw_irqs_disabled_flags(unsigned long flags)
22061 -{
22062 -       return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
22063 -}
22064 -
22065 -#else /* CONFIG_X86_VSMP */
22066 -
22067 -#define raw_local_irq_disable()                                                \
22068 -do {                                                                   \
22069 -       current_vcpu_info()->evtchn_upcall_mask = 1;                                    \
22070 -       barrier();                                                      \
22071 -} while (0)
22072 -
22073 -#define raw_local_irq_enable()                                         \
22074 -do {                                                                   \
22075 -       vcpu_info_t *_vcpu;                                             \
22076 -       barrier();                                                      \
22077 -       _vcpu = current_vcpu_info();            \
22078 -       _vcpu->evtchn_upcall_mask = 0;                                  \
22079 -       barrier(); /* unmask then check (avoid races) */                \
22080 -       if ( unlikely(_vcpu->evtchn_upcall_pending) )                   \
22081 -               force_evtchn_callback();                                \
22082 -} while (0)
22083 -
22084 -static inline int raw_irqs_disabled_flags(unsigned long flags)
22085 -{
22086 -       return (flags != 0);
22087 -}
22088 -
22089 -#endif
22090 -
22091 -/*
22092 - * For spinlocks, etc.:
22093 - */
22094 -
22095 -#define __raw_local_irq_save()                                         \
22096 -({                                                                     \
22097 -       unsigned long flags = __raw_local_save_flags();                 \
22098 -                                                                       \
22099 -       raw_local_irq_disable();                                        \
22100 -                                                                       \
22101 -       flags;                                                          \
22102 -})
22103 -
22104 -#define raw_local_irq_save(flags) \
22105 -               do { (flags) = __raw_local_irq_save(); } while (0)
22106 -
22107 -#define raw_irqs_disabled()                                            \
22108 -({                                                                     \
22109 -       unsigned long flags = __raw_local_save_flags();                 \
22110 -                                                                       \
22111 -       raw_irqs_disabled_flags(flags);                                 \
22112 -})
22113 -
22114 -/*
22115 - * makes the traced hardirq state match with the machine state
22116 - *
22117 - * should be a rarely used function, only in places where its
22118 - * otherwise impossible to know the irq state, like in traps.
22119 - */
22120 -static inline void trace_hardirqs_fixup_flags(unsigned long flags)
22121 -{
22122 -       if (raw_irqs_disabled_flags(flags))
22123 -               trace_hardirqs_off();
22124 -       else
22125 -               trace_hardirqs_on();
22126 -}
22127 -
22128 -#define trace_hardirqs_fixup() \
22129 -       trace_hardirqs_fixup_flags(__raw_local_save_flags())
22130 -/*
22131 - * Used in the idle loop; sti takes one instruction cycle
22132 - * to complete:
22133 - */
22134 -void xen_safe_halt(void);
22135 -static inline void raw_safe_halt(void)
22136 -{
22137 -       xen_safe_halt();
22138 -}
22139 -
22140 -/*
22141 - * Used when interrupts are already enabled or to
22142 - * shutdown the processor:
22143 - */
22144 -void xen_halt(void);
22145 -static inline void halt(void)
22146 -{
22147 -       xen_halt();
22148 -}
22149 -
22150 -#else /* __ASSEMBLY__: */
22151 -# ifdef CONFIG_TRACE_IRQFLAGS
22152 -#  define TRACE_IRQS_ON                call trace_hardirqs_on_thunk
22153 -#  define TRACE_IRQS_OFF       call trace_hardirqs_off_thunk
22154 -# else
22155 -#  define TRACE_IRQS_ON
22156 -#  define TRACE_IRQS_OFF
22157 -# endif
22158 -# ifdef CONFIG_DEBUG_LOCK_ALLOC
22159 -#  define LOCKDEP_SYS_EXIT     call lockdep_sys_exit_thunk
22160 -#  define LOCKDEP_SYS_EXIT_IRQ \
22161 -       TRACE_IRQS_ON; \
22162 -       sti; \
22163 -       SAVE_REST; \
22164 -       LOCKDEP_SYS_EXIT; \
22165 -       RESTORE_REST; \
22166 -       cli; \
22167 -       TRACE_IRQS_OFF;
22168 -# else
22169 -#  define LOCKDEP_SYS_EXIT
22170 -#  define LOCKDEP_SYS_EXIT_IRQ
22171 -# endif
22172 -#endif
22173 -
22174 -#endif
22175 --- a/include/asm-x86/mach-xen/asm/irqflags.h
22176 +++ b/include/asm-x86/mach-xen/asm/irqflags.h
22177 @@ -1,5 +1,247 @@
22178 -#ifdef CONFIG_X86_32
22179 -# include "irqflags_32.h"
22180 +#ifndef _X86_IRQFLAGS_H_
22181 +#define _X86_IRQFLAGS_H_
22182 +
22183 +#include <asm/processor-flags.h>
22184 +
22185 +#ifndef __ASSEMBLY__
22186 +/*
22187 + * The use of 'barrier' in the following reflects their use as local-lock
22188 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
22189 + * critical operations are executed. All critical operations must complete
22190 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
22191 + * includes these barriers, for example.
22192 + */
22193 +
22194 +#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
22195 +
22196 +#define xen_restore_fl(f)                                      \
22197 +do {                                                           \
22198 +       vcpu_info_t *_vcpu;                                     \
22199 +       barrier();                                              \
22200 +       _vcpu = current_vcpu_info();                            \
22201 +       if ((_vcpu->evtchn_upcall_mask = (f)) == 0) {           \
22202 +               barrier(); /* unmask then check (avoid races) */\
22203 +               if (unlikely(_vcpu->evtchn_upcall_pending))     \
22204 +                       force_evtchn_callback();                \
22205 +       }                                                       \
22206 +} while (0)
22207 +
22208 +#define xen_irq_disable()                                      \
22209 +do {                                                           \
22210 +       current_vcpu_info()->evtchn_upcall_mask = 1;            \
22211 +       barrier();                                              \
22212 +} while (0)
22213 +
22214 +#define xen_irq_enable()                                       \
22215 +do {                                                           \
22216 +       vcpu_info_t *_vcpu;                                     \
22217 +       barrier();                                              \
22218 +       _vcpu = current_vcpu_info();                            \
22219 +       _vcpu->evtchn_upcall_mask = 0;                          \
22220 +       barrier(); /* unmask then check (avoid races) */        \
22221 +       if (unlikely(_vcpu->evtchn_upcall_pending))             \
22222 +               force_evtchn_callback();                        \
22223 +} while (0)
22224 +
22225 +void xen_safe_halt(void);
22226 +
22227 +void xen_halt(void);
22228 +
22229 +#define __raw_local_save_flags() xen_save_fl()
22230 +
22231 +#define raw_local_irq_restore(flags) xen_restore_fl(flags)
22232 +
22233 +#define raw_local_irq_disable()        xen_irq_disable()
22234 +
22235 +#define raw_local_irq_enable() xen_irq_enable()
22236 +
22237 +/*
22238 + * Used in the idle loop; sti takes one instruction cycle
22239 + * to complete:
22240 + */
22241 +static inline void raw_safe_halt(void)
22242 +{
22243 +       xen_safe_halt();
22244 +}
22245 +
22246 +/*
22247 + * Used when interrupts are already enabled or to
22248 + * shutdown the processor:
22249 + */
22250 +static inline void halt(void)
22251 +{
22252 +       xen_halt();
22253 +}
22254 +
22255 +/*
22256 + * For spinlocks, etc:
22257 + */
22258 +#define __raw_local_irq_save()                                         \
22259 +({                                                                     \
22260 +       unsigned long flags = __raw_local_save_flags();                 \
22261 +                                                                       \
22262 +       raw_local_irq_disable();                                        \
22263 +                                                                       \
22264 +       flags;                                                          \
22265 +})
22266  #else
22267 -# include "irqflags_64.h"
22268 +
22269 +/* Offsets into vcpu_info_t (the per-VCPU array at the start of shared_info_t). */
22270 +#define evtchn_upcall_pending          /* 0 */
22271 +#define evtchn_upcall_mask             1
22272 +
22273 +#define sizeof_vcpu_shift              6
22274 +
22275 +#ifdef CONFIG_X86_64
22276 +# define __REG_si %rsi
22277 +# define __CPU_num %gs:pda_cpunumber
22278 +#else
22279 +# define __REG_si %esi
22280 +# define __CPU_num TI_cpu(%ebp)
22281 +#endif
22282 +
22283 +#ifdef CONFIG_SMP
22284 +#define GET_VCPU_INFO          movl __CPU_num,%esi                     ; \
22285 +                               shl $sizeof_vcpu_shift,%esi             ; \
22286 +                               add HYPERVISOR_shared_info,__REG_si
22287 +#else
22288 +#define GET_VCPU_INFO          mov HYPERVISOR_shared_info,__REG_si
22289 +#endif
22290 +
22291 +#define __DISABLE_INTERRUPTS   movb $1,evtchn_upcall_mask(__REG_si)
22292 +#define __ENABLE_INTERRUPTS    movb $0,evtchn_upcall_mask(__REG_si)
22293 +#define __TEST_PENDING         testb $0xFF,evtchn_upcall_pending(__REG_si)
22294 +#define DISABLE_INTERRUPTS(clb)        GET_VCPU_INFO                           ; \
22295 +                               __DISABLE_INTERRUPTS
22296 +#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO                           ; \
22297 +                               __ENABLE_INTERRUPTS
22298 +
22299 +#ifndef CONFIG_X86_64
22300 +#define INTERRUPT_RETURN               iret
22301 +#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS              ; \
22302 +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/            ; \
22303 +       __TEST_PENDING                                                  ; \
22304 +       jnz  14f        /* process more events if necessary... */       ; \
22305 +       movl PT_ESI(%esp), %esi                                         ; \
22306 +       sysexit                                                         ; \
22307 +14:    __DISABLE_INTERRUPTS                                            ; \
22308 +       TRACE_IRQS_OFF                                                  ; \
22309 +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/              ; \
22310 +       push %esp                                                       ; \
22311 +       call evtchn_do_upcall                                           ; \
22312 +       add  $4,%esp                                                    ; \
22313 +       jmp  ret_from_intr
22314 +#endif
22315 +
22316 +
22317 +#endif /* __ASSEMBLY__ */
22318 +
22319 +#ifndef __ASSEMBLY__
22320 +#define raw_local_save_flags(flags) \
22321 +               do { (flags) = __raw_local_save_flags(); } while (0)
22322 +
22323 +#define raw_local_irq_save(flags) \
22324 +               do { (flags) = __raw_local_irq_save(); } while (0)
22325 +
22326 +static inline int raw_irqs_disabled_flags(unsigned long flags)
22327 +{
22328 +       return (flags != 0);
22329 +}
22330 +
22331 +#define raw_irqs_disabled()                                            \
22332 +({                                                                     \
22333 +       unsigned long flags = __raw_local_save_flags();                 \
22334 +                                                                       \
22335 +       raw_irqs_disabled_flags(flags);                                 \
22336 +})
22337 +
22338 +/*
22339 + * makes the traced hardirq state match with the machine state
22340 + *
22341 + * should be a rarely used function, only in places where it's
22342 + * otherwise impossible to know the irq state, like in traps.
22343 + */
22344 +static inline void trace_hardirqs_fixup_flags(unsigned long flags)
22345 +{
22346 +       if (raw_irqs_disabled_flags(flags))
22347 +               trace_hardirqs_off();
22348 +       else
22349 +               trace_hardirqs_on();
22350 +}
22351 +
22352 +#define trace_hardirqs_fixup() \
22353 +       trace_hardirqs_fixup_flags(__raw_local_save_flags())
22354 +
22355 +#else
22356 +
22357 +#ifdef CONFIG_X86_64
22358 +/*
22359 + * Currently paravirt can't handle swapgs nicely when we
22360 + * don't have a stack we can rely on (such as a user space
22361 + * stack).  So we either find a way around these or just fault
22362 + * and emulate if a guest tries to call swapgs directly.
22363 + *
22364 + * Either way, this is a good way to document that we don't
22365 + * have a reliable stack. x86_64 only.
22366 + */
22367 +#define SWAPGS_UNSAFE_STACK    swapgs
22368 +#define ARCH_TRACE_IRQS_ON             call trace_hardirqs_on_thunk
22369 +#define ARCH_TRACE_IRQS_OFF            call trace_hardirqs_off_thunk
22370 +#define ARCH_LOCKDEP_SYS_EXIT          call lockdep_sys_exit_thunk
22371 +#define ARCH_LOCKDEP_SYS_EXIT_IRQ      \
22372 +       TRACE_IRQS_ON; \
22373 +       ENABLE_INTERRUPTS(CLBR_NONE); \
22374 +       SAVE_REST; \
22375 +       LOCKDEP_SYS_EXIT; \
22376 +       RESTORE_REST; \
22377 +       __DISABLE_INTERRUPTS; \
22378 +       TRACE_IRQS_OFF;
22379 +
22380 +#else
22381 +#define ARCH_TRACE_IRQS_ON                     \
22382 +       pushl %eax;                             \
22383 +       pushl %ecx;                             \
22384 +       pushl %edx;                             \
22385 +       call trace_hardirqs_on;                 \
22386 +       popl %edx;                              \
22387 +       popl %ecx;                              \
22388 +       popl %eax;
22389 +
22390 +#define ARCH_TRACE_IRQS_OFF                    \
22391 +       pushl %eax;                             \
22392 +       pushl %ecx;                             \
22393 +       pushl %edx;                             \
22394 +       call trace_hardirqs_off;                \
22395 +       popl %edx;                              \
22396 +       popl %ecx;                              \
22397 +       popl %eax;
22398 +
22399 +#define ARCH_LOCKDEP_SYS_EXIT                  \
22400 +       pushl %eax;                             \
22401 +       pushl %ecx;                             \
22402 +       pushl %edx;                             \
22403 +       call lockdep_sys_exit;                  \
22404 +       popl %edx;                              \
22405 +       popl %ecx;                              \
22406 +       popl %eax;
22407 +
22408 +#define ARCH_LOCKDEP_SYS_EXIT_IRQ
22409 +#endif
22410 +
22411 +#ifdef CONFIG_TRACE_IRQFLAGS
22412 +#  define TRACE_IRQS_ON                ARCH_TRACE_IRQS_ON
22413 +#  define TRACE_IRQS_OFF       ARCH_TRACE_IRQS_OFF
22414 +#else
22415 +#  define TRACE_IRQS_ON
22416 +#  define TRACE_IRQS_OFF
22417 +#endif
22418 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
22419 +#  define LOCKDEP_SYS_EXIT     ARCH_LOCKDEP_SYS_EXIT
22420 +#  define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
22421 +# else
22422 +#  define LOCKDEP_SYS_EXIT
22423 +#  define LOCKDEP_SYS_EXIT_IRQ
22424 +# endif
22425 +
22426 +#endif /* __ASSEMBLY__ */
22427  #endif
22428 --- a/include/asm-x86/mach-xen/asm/maddr_32.h
22429 +++ b/include/asm-x86/mach-xen/asm/maddr_32.h
22430 @@ -1,6 +1,7 @@
22431  #ifndef _I386_MADDR_H
22432  #define _I386_MADDR_H
22433
22434 +#include <asm/bug.h>
22435  #include <xen/features.h>
22436  #include <xen/interface/xen.h>
22437
22438 @@ -151,25 +152,9 @@ static inline paddr_t pte_machine_to_phy
22439         phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
22440         return phys;
22441  }
22442 -#endif
22443 -
22444 -#ifdef CONFIG_X86_PAE
22445 -#define __pte_ma(x)    ((pte_t) { (x), (maddr_t)(x) >> 32 } )
22446 -extern unsigned long long __supported_pte_mask;
22447 -static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
22448 -{
22449 -       pte_t pte;
22450 -
22451 -       pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
22452 -                                       (pgprot_val(pgprot) >> 32);
22453 -       pte.pte_high &= (__supported_pte_mask >> 32);
22454 -       pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
22455 -                                                       __supported_pte_mask;
22456 -       return pte;
22457 -}
22458  #else
22459 -#define __pte_ma(x)    ((pte_t) { (x) } )
22460 -#define pfn_pte_ma(pfn, prot)  __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
22461 +#define pte_phys_to_machine phys_to_machine
22462 +#define pte_machine_to_phys machine_to_phys
22463  #endif
22464
22465  #else /* !CONFIG_XEN */
22466 --- a/include/asm-x86/mach-xen/asm/maddr_64.h
22467 +++ b/include/asm-x86/mach-xen/asm/maddr_64.h
22468 @@ -1,6 +1,7 @@
22469  #ifndef _X86_64_MADDR_H
22470  #define _X86_64_MADDR_H
22471
22472 +#include <asm/bug.h>
22473  #include <xen/features.h>
22474  #include <xen/interface/xen.h>
22475
22476 @@ -16,6 +17,7 @@ typedef unsigned long maddr_t;
22477  #ifdef CONFIG_XEN
22478
22479  extern unsigned long *phys_to_machine_mapping;
22480 +extern unsigned long  max_mapnr;
22481
22482  #undef machine_to_phys_mapping
22483  extern unsigned long *machine_to_phys_mapping;
22484 @@ -25,7 +27,7 @@ static inline unsigned long pfn_to_mfn(u
22485  {
22486         if (xen_feature(XENFEAT_auto_translated_physmap))
22487                 return pfn;
22488 -       BUG_ON(end_pfn && pfn >= end_pfn);
22489 +       BUG_ON(max_mapnr && pfn >= max_mapnr);
22490         return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
22491  }
22492
22493 @@ -33,7 +35,7 @@ static inline int phys_to_machine_mappin
22494  {
22495         if (xen_feature(XENFEAT_auto_translated_physmap))
22496                 return 1;
22497 -       BUG_ON(end_pfn && pfn >= end_pfn);
22498 +       BUG_ON(max_mapnr && pfn >= max_mapnr);
22499         return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
22500  }
22501
22502 @@ -45,7 +47,7 @@ static inline unsigned long mfn_to_pfn(u
22503                 return mfn;
22504
22505         if (unlikely((mfn >> machine_to_phys_order) != 0))
22506 -               return end_pfn;
22507 +               return max_mapnr;
22508
22509         /* The array access can fail (e.g., device space beyond end of RAM). */
22510         asm (
22511 @@ -60,7 +62,7 @@ static inline unsigned long mfn_to_pfn(u
22512                 "       .quad 1b,3b\n"
22513                 ".previous"
22514                 : "=r" (pfn)
22515 -               : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) );
22516 +               : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
22517
22518         return pfn;
22519  }
22520 @@ -88,16 +90,16 @@ static inline unsigned long mfn_to_pfn(u
22521  static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
22522  {
22523         unsigned long pfn = mfn_to_pfn(mfn);
22524 -       if ((pfn < end_pfn)
22525 +       if ((pfn < max_mapnr)
22526             && !xen_feature(XENFEAT_auto_translated_physmap)
22527             && (phys_to_machine_mapping[pfn] != mfn))
22528 -               return end_pfn; /* force !pfn_valid() */
22529 +               return max_mapnr; /* force !pfn_valid() */
22530         return pfn;
22531  }
22532
22533  static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
22534  {
22535 -       BUG_ON(end_pfn && pfn >= end_pfn);
22536 +       BUG_ON(max_mapnr && pfn >= max_mapnr);
22537         if (xen_feature(XENFEAT_auto_translated_physmap)) {
22538                 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
22539                 return;
22540 @@ -135,9 +137,6 @@ static inline paddr_t pte_machine_to_phy
22541         return phys;
22542  }
22543
22544 -#define __pte_ma(x)     ((pte_t) { (x) } )
22545 -#define pfn_pte_ma(pfn, prot)  __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
22546 -
22547  #else /* !CONFIG_XEN */
22548
22549  #define pfn_to_mfn(pfn) (pfn)
22550 --- a/include/asm-x86/mach-xen/asm/mmu_context_32.h
22551 +++ b/include/asm-x86/mach-xen/asm/mmu_context_32.h
22552 @@ -51,8 +51,6 @@ static inline void __prepare_arch_switch
22553                 : : "r" (0) );
22554  }
22555
22556 -void leave_mm(unsigned long cpu);
22557 -
22558  static inline void switch_mm(struct mm_struct *prev,
22559                              struct mm_struct *next,
22560                              struct task_struct *tsk)
22561 --- a/include/asm-x86/mach-xen/asm/mmu_context_64.h
22562 +++ b/include/asm-x86/mach-xen/asm/mmu_context_64.h
22563 @@ -62,12 +62,6 @@ extern void mm_pin(struct mm_struct *mm)
22564  extern void mm_unpin(struct mm_struct *mm);
22565  void mm_pin_all(void);
22566
22567 -static inline void load_cr3(pgd_t *pgd)
22568 -{
22569 -       asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
22570 -                    "memory");
22571 -}
22572 -
22573  static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
22574                              struct task_struct *tsk)
22575  {
22576 @@ -97,7 +91,7 @@ static inline void switch_mm(struct mm_s
22577                 op++;
22578
22579                 if (unlikely(next->context.ldt != prev->context.ldt)) {
22580 -                       /* load_LDT_nolock(&next->context, cpu) */
22581 +                       /* load_LDT_nolock(&next->context) */
22582                         op->cmd = MMUEXT_SET_LDT;
22583                         op->arg1.linear_addr = (unsigned long)next->context.ldt;
22584                         op->arg2.nr_ents     = next->context.size;
22585 @@ -110,7 +104,7 @@ static inline void switch_mm(struct mm_s
22586         else {
22587                 write_pda(mmu_state, TLBSTATE_OK);
22588                 if (read_pda(active_mm) != next)
22589 -                       out_of_line_bug();
22590 +                       BUG();
22591                 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
22592                         /* We were in lazy tlb mode and leave_mm disabled
22593                          * tlb flush IPI delivery. We must reload CR3
22594 @@ -118,7 +112,7 @@ static inline void switch_mm(struct mm_s
22595                          */
22596                          load_cr3(next->pgd);
22597                          xen_new_user_pt(__pa(__user_pgd(next->pgd)));
22598 -                       load_LDT_nolock(&next->context, cpu);
22599 +                       load_LDT_nolock(&next->context);
22600                 }
22601         }
22602  #endif
22603 --- a/include/asm-x86/mach-xen/asm/page_64.h
22604 +++ b/include/asm-x86/mach-xen/asm/page_64.h
22605 @@ -1,37 +1,9 @@
22606  #ifndef _X86_64_PAGE_H
22607  #define _X86_64_PAGE_H
22608
22609 -/* #include <linux/string.h> */
22610 -#ifndef __ASSEMBLY__
22611 -#include <linux/kernel.h>
22612 -#include <linux/types.h>
22613 -#include <asm/bug.h>
22614 -#endif
22615 -#include <linux/const.h>
22616 -#include <xen/interface/xen.h>
22617 -
22618 -/*
22619 - * Need to repeat this here in order to not include pgtable.h (which in turn
22620 - * depends on definitions made here), but to be able to use the symbolic
22621 - * below. The preprocessor will warn if the two definitions aren't identical.
22622 - */
22623 -#define _PAGE_PRESENT  0x001
22624 -#define _PAGE_IO       0x200
22625 -
22626 -/* PAGE_SHIFT determines the page size */
22627 -#define PAGE_SHIFT     12
22628 -#define PAGE_SIZE      (_AC(1,UL) << PAGE_SHIFT)
22629 -#define PAGE_MASK      (~(PAGE_SIZE-1))
22630 -
22631 -/* See Documentation/x86_64/mm.txt for a description of the memory map. */
22632 -#define __PHYSICAL_MASK_SHIFT  46
22633 -#define __PHYSICAL_MASK                ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
22634 -#define __VIRTUAL_MASK_SHIFT   48
22635 -#define __VIRTUAL_MASK         ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
22636 -
22637 -#define PHYSICAL_PAGE_MASK     (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
22638 +#define PAGETABLE_LEVELS       4
22639
22640 -#define THREAD_ORDER 1
22641 +#define THREAD_ORDER   1
22642  #define THREAD_SIZE  (PAGE_SIZE << THREAD_ORDER)
22643  #define CURRENT_MASK (~(THREAD_SIZE-1))
22644
22645 @@ -51,106 +23,10 @@
22646  #define MCE_STACK 5
22647  #define N_EXCEPTION_STACKS 5  /* hw limit: 7 */
22648
22649 -#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
22650 -#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT)
22651 -
22652 -#define HPAGE_SHIFT PMD_SHIFT
22653 -#define HPAGE_SIZE     (_AC(1,UL) << HPAGE_SHIFT)
22654 -#define HPAGE_MASK     (~(HPAGE_SIZE - 1))
22655 -#define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
22656 -
22657 -#ifdef __KERNEL__
22658 -#ifndef __ASSEMBLY__
22659 -
22660 -extern unsigned long end_pfn;
22661 -
22662 -#include <asm/maddr.h>
22663 -
22664 -void clear_page(void *);
22665 -void copy_page(void *, void *);
22666 -
22667 -#define clear_user_page(page, vaddr, pg)       clear_page(page)
22668 -#define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)
22669 -
22670 -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
22671 -       alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
22672 -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
22673 -
22674 -/*
22675 - * These are used to make use of C type-checking..
22676 - */
22677 -typedef struct { unsigned long pte; } pte_t;
22678 -typedef struct { unsigned long pmd; } pmd_t;
22679 -typedef struct { unsigned long pud; } pud_t;
22680 -typedef struct { unsigned long pgd; } pgd_t;
22681 -#define PTE_MASK       PHYSICAL_PAGE_MASK
22682 -
22683 -typedef struct { unsigned long pgprot; } pgprot_t;
22684 -
22685 -#define __pte_val(x) ((x).pte)
22686 -#define pte_val(x) ((__pte_val(x) & (_PAGE_PRESENT|_PAGE_IO))  \
22687 -                   == _PAGE_PRESENT ?                          \
22688 -                   pte_machine_to_phys(__pte_val(x)) :         \
22689 -                   __pte_val(x))
22690 -
22691 -#define __pmd_val(x) ((x).pmd)
22692 -static inline unsigned long pmd_val(pmd_t x)
22693 -{
22694 -       unsigned long ret = __pmd_val(x);
22695 -#if CONFIG_XEN_COMPAT <= 0x030002
22696 -       if (ret) ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
22697 -#else
22698 -       if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22699 -#endif
22700 -       return ret;
22701 -}
22702 -
22703 -#define __pud_val(x) ((x).pud)
22704 -static inline unsigned long pud_val(pud_t x)
22705 -{
22706 -       unsigned long ret = __pud_val(x);
22707 -       if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22708 -       return ret;
22709 -}
22710 -
22711 -#define __pgd_val(x) ((x).pgd)
22712 -static inline unsigned long pgd_val(pgd_t x)
22713 -{
22714 -       unsigned long ret = __pgd_val(x);
22715 -       if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22716 -       return ret;
22717 -}
22718 -
22719 -#define pgprot_val(x)  ((x).pgprot)
22720 -
22721 -static inline pte_t __pte(unsigned long x)
22722 -{
22723 -       if ((x & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
22724 -               x = pte_phys_to_machine(x);
22725 -       return ((pte_t) { (x) });
22726 -}
22727 -
22728 -static inline pmd_t __pmd(unsigned long x)
22729 -{
22730 -       if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22731 -       return ((pmd_t) { (x) });
22732 -}
22733 -
22734 -static inline pud_t __pud(unsigned long x)
22735 -{
22736 -       if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22737 -       return ((pud_t) { (x) });
22738 -}
22739 -
22740 -static inline pgd_t __pgd(unsigned long x)
22741 -{
22742 -       if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22743 -       return ((pgd_t) { (x) });
22744 -}
22745 -
22746 -#define __pgprot(x)    ((pgprot_t) { (x) } )
22747 +#define PUD_PAGE_SIZE          (_AC(1, UL) << PUD_SHIFT)
22748 +#define PUD_PAGE_MASK          (~(PUD_PAGE_SIZE-1))
22749
22750 -#endif /* !__ASSEMBLY__ */
22751 +#define __PAGE_OFFSET           _AC(0xffff880000000000, UL)
22752
22753  #define __PHYSICAL_START       CONFIG_PHYSICAL_START
22754  #define __KERNEL_ALIGN         0x200000
22755 @@ -166,52 +42,58 @@ static inline pgd_t __pgd(unsigned long
22756
22757  #define __START_KERNEL         (__START_KERNEL_map + __PHYSICAL_START)
22758  #define __START_KERNEL_map     _AC(0xffffffff80000000, UL)
22759 -#define __PAGE_OFFSET           _AC(0xffff880000000000, UL)
22760
22761  #if CONFIG_XEN_COMPAT <= 0x030002
22762  #undef LOAD_OFFSET
22763  #define LOAD_OFFSET            0
22764  #endif
22765
22766 -/* to align the pointer to the (next) page boundary */
22767 -#define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)
22768 -
22769 -#define KERNEL_TEXT_SIZE  (40*1024*1024)
22770 -#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL)
22771 +/* See Documentation/x86_64/mm.txt for a description of the memory map. */
22772 +#define __PHYSICAL_MASK_SHIFT  46
22773 +#define __VIRTUAL_MASK_SHIFT   48
22774
22775 -#define PAGE_OFFSET            __PAGE_OFFSET
22776 +/*
22777 + * Kernel image size is limited to 128 MB (see level2_kernel_pgt in
22778 + * arch/x86/kernel/head_64.S), and it is mapped here:
22779 + */
22780 +#define KERNEL_IMAGE_SIZE      (128*1024*1024)
22781 +#define KERNEL_IMAGE_START     _AC(0xffffffff80000000, UL)
22782
22783  #ifndef __ASSEMBLY__
22784 +void clear_page(void *page);
22785 +void copy_page(void *to, void *from);
22786 +
22787 +extern unsigned long end_pfn;
22788 +extern unsigned long end_pfn_map;
22789 +
22790  static inline unsigned long __phys_addr(unsigned long x)
22791  {
22792 -       return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : PAGE_OFFSET);
22793 +       return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : __PAGE_OFFSET);
22794  }
22795 -#endif
22796
22797 -#define __pa(x)                __phys_addr((unsigned long)(x))
22798 -#define __pa_symbol(x) __phys_addr((unsigned long)(x))
22799 +#define __phys_reloc_hide(x)   (x)
22800
22801 -#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
22802 -#define __boot_va(x)           __va(x)
22803 -#define __boot_pa(x)           __pa(x)
22804 -#ifdef CONFIG_FLATMEM
22805 -#define pfn_valid(pfn)         ((pfn) < end_pfn)
22806 -#endif
22807 +/*
22808 + * These are used to make use of C type-checking.
22809 + */
22810 +typedef unsigned long  pteval_t;
22811 +typedef unsigned long  pmdval_t;
22812 +typedef unsigned long  pudval_t;
22813 +typedef unsigned long  pgdval_t;
22814 +typedef unsigned long  pgprotval_t;
22815 +typedef unsigned long  phys_addr_t;
22816
22817 -#define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
22818 -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
22819 -#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
22820 -
22821 -#define VM_DATA_DEFAULT_FLAGS \
22822 -       (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
22823 -        VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
22824 +typedef struct page *pgtable_t;
22825 +
22826 +typedef union { pteval_t pte; unsigned int pte_low; } pte_t;
22827
22828 -#define __HAVE_ARCH_GATE_AREA 1
22829  #define vmemmap ((struct page *)VMEMMAP_START)
22830
22831 -#include <asm-generic/memory_model.h>
22832 -#include <asm-generic/page.h>
22833 +#endif /* !__ASSEMBLY__ */
22834 +
22835 +#ifdef CONFIG_FLATMEM
22836 +#define pfn_valid(pfn)          ((pfn) < max_mapnr)
22837 +#endif
22838
22839 -#endif /* __KERNEL__ */
22840
22841  #endif /* _X86_64_PAGE_H */
22842 --- a/include/asm-x86/mach-xen/asm/page.h
22843 +++ b/include/asm-x86/mach-xen/asm/page.h
22844 @@ -1,13 +1,231 @@
22845 +#ifndef _ASM_X86_PAGE_H
22846 +#define _ASM_X86_PAGE_H
22847 +
22848 +#include <linux/const.h>
22849 +
22850 +/* PAGE_SHIFT determines the page size */
22851 +#define PAGE_SHIFT     12
22852 +#define PAGE_SIZE      (_AC(1,UL) << PAGE_SHIFT)
22853 +#define PAGE_MASK      (~(PAGE_SIZE-1))
22854 +
22855  #ifdef __KERNEL__
22856 -# ifdef CONFIG_X86_32
22857 -#  include "page_32.h"
22858 -# else
22859 -#  include "page_64.h"
22860 -# endif
22861 +
22862 +/*
22863 + * Need to repeat this here in order to not include pgtable.h (which in turn
22864 + * depends on definitions made here), but to be able to use the symbolic names
22865 + * below. The preprocessor will warn if the two definitions aren't identical.
22866 + */
22867 +#define _PAGE_BIT_PRESENT      0
22868 +#define _PAGE_PRESENT          (_AC(1, L)<<_PAGE_BIT_PRESENT)
22869 +#define _PAGE_BIT_IO           9
22870 +#define _PAGE_IO               (_AC(1, L)<<_PAGE_BIT_IO)
22871 +
22872 +#define PHYSICAL_PAGE_MASK     (~(_AT(phys_addr_t, PAGE_SIZE) - 1) & __PHYSICAL_MASK)
22873 +#define PTE_MASK               _AT(pteval_t, PHYSICAL_PAGE_MASK)
22874 +
22875 +#define PMD_PAGE_SIZE          (_AC(1, UL) << PMD_SHIFT)
22876 +#define PMD_PAGE_MASK          (~(PMD_PAGE_SIZE-1))
22877 +
22878 +#define HPAGE_SHIFT            PMD_SHIFT
22879 +#define HPAGE_SIZE             (_AC(1,UL) << HPAGE_SHIFT)
22880 +#define HPAGE_MASK             (~(HPAGE_SIZE - 1))
22881 +#define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
22882 +
22883 +/* to align the pointer to the (next) page boundary */
22884 +#define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)
22885 +
22886 +#define __PHYSICAL_MASK                _AT(phys_addr_t, (_AC(1,ULL) << __PHYSICAL_MASK_SHIFT) - 1)
22887 +#define __VIRTUAL_MASK         ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
22888 +
22889 +#ifndef __ASSEMBLY__
22890 +#include <linux/types.h>
22891 +#endif
22892 +
22893 +#ifdef CONFIG_X86_64
22894 +#include <asm/page_64.h>
22895 +#define max_pfn_mapped         end_pfn_map
22896 +#else
22897 +#include <asm/page_32.h>
22898 +#define max_pfn_mapped         max_low_pfn
22899 +#endif /* CONFIG_X86_64 */
22900 +
22901 +#define PAGE_OFFSET            ((unsigned long)__PAGE_OFFSET)
22902 +
22903 +#define VM_DATA_DEFAULT_FLAGS \
22904 +       (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
22905 +        VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
22906 +
22907 +
22908 +#ifndef __ASSEMBLY__
22909 +
22910 +extern int page_is_ram(unsigned long pagenr);
22911 +
22912 +struct page;
22913 +
22914 +static inline void clear_user_page(void *page, unsigned long vaddr,
22915 +                               struct page *pg)
22916 +{
22917 +       clear_page(page);
22918 +}
22919 +
22920 +static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
22921 +                               struct page *topage)
22922 +{
22923 +       copy_page(to, from);
22924 +}
22925 +
22926 +#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
22927 +       alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
22928 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
22929 +
22930 +typedef struct { pgprotval_t pgprot; } pgprot_t;
22931 +
22932 +#define pgprot_val(x)  ((x).pgprot)
22933 +#define __pgprot(x)    ((pgprot_t) { (x) } )
22934 +
22935 +#include <asm/maddr.h>
22936 +
22937 +typedef struct { pgdval_t pgd; } pgd_t;
22938 +
22939 +#define __pgd_ma(x) ((pgd_t) { (x) } )
22940 +static inline pgd_t xen_make_pgd(pgdval_t val)
22941 +{
22942 +       if (val & _PAGE_PRESENT)
22943 +               val = pte_phys_to_machine(val);
22944 +       return (pgd_t) { val };
22945 +}
22946 +
22947 +#define __pgd_val(x) ((x).pgd)
22948 +static inline pgdval_t xen_pgd_val(pgd_t pgd)
22949 +{
22950 +       pgdval_t ret = __pgd_val(pgd);
22951 +#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002
22952 +       if (ret)
22953 +               ret = machine_to_phys(ret) | _PAGE_PRESENT;
22954 +#else
22955 +       if (ret & _PAGE_PRESENT)
22956 +               ret = pte_machine_to_phys(ret);
22957 +#endif
22958 +       return ret;
22959 +}
22960 +
22961 +#if PAGETABLE_LEVELS >= 3
22962 +#if PAGETABLE_LEVELS == 4
22963 +typedef struct { pudval_t pud; } pud_t;
22964 +
22965 +#define __pud_ma(x) ((pud_t) { (x) } )
22966 +static inline pud_t xen_make_pud(pudval_t val)
22967 +{
22968 +       if (val & _PAGE_PRESENT)
22969 +               val = pte_phys_to_machine(val);
22970 +       return (pud_t) { val };
22971 +}
22972 +
22973 +#define __pud_val(x) ((x).pud)
22974 +static inline pudval_t xen_pud_val(pud_t pud)
22975 +{
22976 +       pudval_t ret = __pud_val(pud);
22977 +       if (ret & _PAGE_PRESENT)
22978 +               ret = pte_machine_to_phys(ret);
22979 +       return ret;
22980 +}
22981 +#else  /* PAGETABLE_LEVELS == 3 */
22982 +#include <asm-generic/pgtable-nopud.h>
22983 +
22984 +#define __pud_val(x) __pgd_val((x).pgd)
22985 +static inline pudval_t xen_pud_val(pud_t pud)
22986 +{
22987 +       return xen_pgd_val(pud.pgd);
22988 +}
22989 +#endif /* PAGETABLE_LEVELS == 4 */
22990 +
22991 +typedef struct { pmdval_t pmd; } pmd_t;
22992 +
22993 +#define __pmd_ma(x)    ((pmd_t) { (x) } )
22994 +static inline pmd_t xen_make_pmd(pmdval_t val)
22995 +{
22996 +       if (val & _PAGE_PRESENT)
22997 +               val = pte_phys_to_machine(val);
22998 +       return (pmd_t) { val };
22999 +}
23000 +
23001 +#define __pmd_val(x) ((x).pmd)
23002 +static inline pmdval_t xen_pmd_val(pmd_t pmd)
23003 +{
23004 +       pmdval_t ret = __pmd_val(pmd);
23005 +#if CONFIG_XEN_COMPAT <= 0x030002
23006 +       if (ret)
23007 +               ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
23008  #else
23009 -# ifdef __i386__
23010 -#  include "page_32.h"
23011 -# else
23012 -#  include "page_64.h"
23013 -# endif
23014 +       if (ret & _PAGE_PRESENT)
23015 +               ret = pte_machine_to_phys(ret);
23016 +#endif
23017 +       return ret;
23018 +}
23019 +#else  /* PAGETABLE_LEVELS == 2 */
23020 +#include <asm-generic/pgtable-nopmd.h>
23021 +
23022 +#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } )
23023 +#define __pmd_val(x) __pgd_val((x).pud.pgd)
23024 +static inline pmdval_t xen_pmd_val(pmd_t pmd)
23025 +{
23026 +       return xen_pgd_val(pmd.pud.pgd);
23027 +}
23028 +#endif /* PAGETABLE_LEVELS >= 3 */
23029 +
23030 +#define __pte_ma(x) ((pte_t) { .pte = (x) } )
23031 +static inline pte_t xen_make_pte(pteval_t val)
23032 +{
23033 +       if ((val & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
23034 +               val = pte_phys_to_machine(val);
23035 +       return (pte_t) { .pte = val };
23036 +}
23037 +
23038 +#define __pte_val(x) ((x).pte)
23039 +static inline pteval_t xen_pte_val(pte_t pte)
23040 +{
23041 +       pteval_t ret = __pte_val(pte);
23042 +       if ((pte.pte_low & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
23043 +               ret = pte_machine_to_phys(ret);
23044 +       return ret;
23045 +}
23046 +
23047 +#define pgd_val(x)     xen_pgd_val(x)
23048 +#define __pgd(x)       xen_make_pgd(x)
23049 +
23050 +#ifndef __PAGETABLE_PUD_FOLDED
23051 +#define pud_val(x)     xen_pud_val(x)
23052 +#define __pud(x)       xen_make_pud(x)
23053 +#endif
23054 +
23055 +#ifndef __PAGETABLE_PMD_FOLDED
23056 +#define pmd_val(x)     xen_pmd_val(x)
23057 +#define __pmd(x)       xen_make_pmd(x)
23058  #endif
23059 +
23060 +#define pte_val(x)     xen_pte_val(x)
23061 +#define __pte(x)       xen_make_pte(x)
23062 +
23063 +#define __pa(x)                __phys_addr((unsigned long)(x))
23064 +/* __pa_symbol should be used for C visible symbols.
23065 +   This seems to be the official gcc blessed way to do such arithmetic. */
23066 +#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
23067 +
23068 +#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
23069 +
23070 +#define __boot_va(x)           __va(x)
23071 +#define __boot_pa(x)           __pa(x)
23072 +
23073 +#define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
23074 +#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
23075 +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
23076 +
23077 +#endif /* __ASSEMBLY__ */
23078 +
23079 +#include <asm-generic/memory_model.h>
23080 +#include <asm-generic/page.h>
23081 +
23082 +#define __HAVE_ARCH_GATE_AREA 1
23083 +
23084 +#endif /* __KERNEL__ */
23085 +#endif /* _ASM_X86_PAGE_H */
23086 --- a/include/asm-x86/mach-xen/asm/pci_64.h
23087 +++ b/include/asm-x86/mach-xen/asm/pci_64.h
23088 @@ -26,7 +26,6 @@ extern int (*pci_config_write)(int seg,
23089
23090
23091  extern void pci_iommu_alloc(void);
23092 -extern int iommu_setup(char *opt);
23093
23094  /* The PCI address space does equal the physical memory
23095   * address space.  The networking and block device layers use
23096 --- a/include/asm-x86/mach-xen/asm/pci.h
23097 +++ b/include/asm-x86/mach-xen/asm/pci.h
23098 @@ -71,6 +71,7 @@ extern int pci_mmap_page_range(struct pc
23099
23100
23101  #ifdef CONFIG_PCI
23102 +extern void early_quirks(void);
23103  static inline void pci_dma_burst_advice(struct pci_dev *pdev,
23104                                         enum pci_dma_burst_strategy *strat,
23105                                         unsigned long *strategy_parameter)
23106 @@ -78,9 +79,10 @@ static inline void pci_dma_burst_advice(
23107         *strat = PCI_DMA_BURST_INFINITY;
23108         *strategy_parameter = ~0UL;
23109  }
23110 +#else
23111 +static inline void early_quirks(void) { }
23112  #endif
23113
23114 -
23115  #endif  /* __KERNEL__ */
23116
23117  #ifdef CONFIG_X86_32
23118 @@ -95,6 +97,19 @@ static inline void pci_dma_burst_advice(
23119  /* generic pci stuff */
23120  #include <asm-generic/pci.h>
23121
23122 +#ifdef CONFIG_NUMA
23123 +/* Returns the node based on pci bus */
23124 +static inline int __pcibus_to_node(struct pci_bus *bus)
23125 +{
23126 +       struct pci_sysdata *sd = bus->sysdata;
23127 +
23128 +       return sd->node;
23129 +}
23130
23131 +static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
23132 +{
23133 +       return node_to_cpumask(__pcibus_to_node(bus));
23134 +}
23135 +#endif
23136
23137  #endif
23138 --- a/include/asm-x86/mach-xen/asm/pgalloc_32.h
23139 +++ b/include/asm-x86/mach-xen/asm/pgalloc_32.h
23140 @@ -3,69 +3,109 @@
23141
23142  #include <linux/threads.h>
23143  #include <linux/mm.h>          /* for struct page */
23144 +#include <linux/pagemap.h>
23145 +#include <asm/tlb.h>
23146 +#include <asm-generic/tlb.h>
23147  #include <asm/io.h>            /* for phys_to_virt and page_to_pseudophys */
23148
23149  #define paravirt_alloc_pt(mm, pfn) do { } while (0)
23150 -#define paravirt_alloc_pd(pfn) do { } while (0)
23151 -#define paravirt_alloc_pd(pfn) do { } while (0)
23152 +#define paravirt_alloc_pd(mm, pfn) do { } while (0)
23153  #define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
23154  #define paravirt_release_pt(pfn) do { } while (0)
23155  #define paravirt_release_pd(pfn) do { } while (0)
23156
23157 -#define pmd_populate_kernel(mm, pmd, pte)                      \
23158 -do {                                                           \
23159 -       paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);         \
23160 -       set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)));           \
23161 -} while (0)
23162 -
23163 -#define pmd_populate(mm, pmd, pte)                                     \
23164 -do {                                                                   \
23165 -       unsigned long pfn = page_to_pfn(pte);                           \
23166 -       paravirt_alloc_pt(mm, pfn);                                     \
23167 -       if (PagePinned(virt_to_page((mm)->pgd))) {                      \
23168 -               if (!PageHighMem(pte))                                  \
23169 -                       BUG_ON(HYPERVISOR_update_va_mapping(            \
23170 -                         (unsigned long)__va(pfn << PAGE_SHIFT),       \
23171 -                         pfn_pte(pfn, PAGE_KERNEL_RO), 0));            \
23172 -               else if (!test_and_set_bit(PG_pinned, &pte->flags))     \
23173 -                       kmap_flush_unused();                            \
23174 -               set_pmd(pmd,                                            \
23175 -                       __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT))); \
23176 -       } else                                                  \
23177 -               *(pmd) = __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT)); \
23178 -} while (0)
23179 +static inline void pmd_populate_kernel(struct mm_struct *mm,
23180 +                                      pmd_t *pmd, pte_t *pte)
23181 +{
23182 +       paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
23183 +       set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
23184 +}
23185 +
23186 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
23187 +{
23188 +       unsigned long pfn = page_to_pfn(pte);
23189 +
23190 +       paravirt_alloc_pt(mm, pfn);
23191 +       if (PagePinned(virt_to_page(mm->pgd))) {
23192 +               if (!PageHighMem(pte))
23193 +                       BUG_ON(HYPERVISOR_update_va_mapping(
23194 +                         (unsigned long)__va(pfn << PAGE_SHIFT),
23195 +                         pfn_pte(pfn, PAGE_KERNEL_RO), 0));
23196 +               else if (!test_and_set_bit(PG_pinned, &pte->flags))
23197 +                       kmap_flush_unused();
23198 +               set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
23199 +       } else
23200 +               *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
23201 +}
23202 +#define pmd_pgtable(pmd) pmd_page(pmd)
23203
23204  /*
23205   * Allocate and free page tables.
23206   */
23207 +extern void pgd_test_and_unpin(pgd_t *);
23208  extern pgd_t *pgd_alloc(struct mm_struct *);
23209 -extern void pgd_free(pgd_t *pgd);
23210 +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
23211
23212  extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
23213 -extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
23214 +extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
23215
23216 -static inline void pte_free_kernel(pte_t *pte)
23217 +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
23218  {
23219         make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
23220         free_page((unsigned long)pte);
23221  }
23222
23223 -extern void pte_free(struct page *pte);
23224 +extern void __pte_free(pgtable_t);
23225 +static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
23226 +{
23227 +       __pte_free(pte);
23228 +}
23229 +
23230
23231 -#define __pte_free_tlb(tlb,pte)                                        \
23232 -do {                                                                   \
23233 -       paravirt_release_pt(page_to_pfn(pte));                          \
23234 -       tlb_remove_page((tlb),(pte));                                   \
23235 -} while (0)
23236 +extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
23237
23238  #ifdef CONFIG_X86_PAE
23239  /*
23240   * In the PAE case we free the pmds as part of the pgd.
23241   */
23242 -#define pmd_alloc_one(mm, addr)                ({ BUG(); ((pmd_t *)2); })
23243 -#define pmd_free(x)                    do { } while (0)
23244 -#define __pmd_free_tlb(tlb,x)          do { } while (0)
23245 -#define pud_populate(mm, pmd, pte)     BUG()
23246 -#endif
23247 +extern pmd_t *pmd_alloc_one(struct mm_struct *, unsigned long);
23248 +
23249 +extern void __pmd_free(pgtable_t);
23250 +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
23251 +{
23252 +       BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
23253 +       __pmd_free(virt_to_page(pmd));
23254 +}
23255 +
23256 +extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
23257 +
23258 +static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
23259 +{
23260 +       struct page *page = virt_to_page(pmd);
23261 +       unsigned long pfn = page_to_pfn(page);
23262 +
23263 +       paravirt_alloc_pd(mm, pfn);
23264 +
23265 +       /* Note: almost everything apart from _PAGE_PRESENT is
23266 +          reserved at the pmd (PDPT) level. */
23267 +       if (PagePinned(virt_to_page(mm->pgd))) {
23268 +               BUG_ON(PageHighMem(page));
23269 +               BUG_ON(HYPERVISOR_update_va_mapping(
23270 +                         (unsigned long)__va(pfn << PAGE_SHIFT),
23271 +                         pfn_pte(pfn, PAGE_KERNEL_RO), 0));
23272 +               set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
23273 +       } else
23274 +               *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
23275 +
23276 +       /*
23277 +        * According to Intel App note "TLBs, Paging-Structure Caches,
23278 +        * and Their Invalidation", April 2007, document 317080-001,
23279 +        * section 8.1: in PAE mode we explicitly have to flush the
23280 +        * TLB via cr3 if the top-level pgd is changed...
23281 +        */
23282 +       if (mm == current->active_mm)
23283 +               xen_tlb_flush();
23284 +}
23285 +#endif /* CONFIG_X86_PAE */
23286
23287  #endif /* _I386_PGALLOC_H */
23288 --- a/include/asm-x86/mach-xen/asm/pgalloc_64.h
23289 +++ b/include/asm-x86/mach-xen/asm/pgalloc_64.h
23290 @@ -6,30 +6,13 @@
23291  #include <linux/mm.h>
23292  #include <asm/io.h>            /* for phys_to_virt and page_to_pseudophys */
23293
23294 -#include <xen/features.h>
23295 -void make_page_readonly(void *va, unsigned int feature);
23296 -void make_page_writable(void *va, unsigned int feature);
23297 -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
23298 -void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
23299 +pmd_t *early_get_pmd(unsigned long va);
23300 +void early_make_page_readonly(void *va, unsigned int feature);
23301
23302  #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
23303
23304 -static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
23305 -{
23306 -       set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
23307 -}
23308 -
23309 -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
23310 -{
23311 -       if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
23312 -               BUG_ON(HYPERVISOR_update_va_mapping(
23313 -                              (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
23314 -                              pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
23315 -               set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
23316 -       } else {
23317 -               *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
23318 -       }
23319 -}
23320 +#define pmd_populate_kernel(mm, pmd, pte) \
23321 +               set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
23322
23323  static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
23324  {
23325 @@ -63,53 +46,58 @@ static inline void pgd_populate(struct m
23326         }
23327  }
23328
23329 -extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr);
23330 -extern void pte_free(struct page *pte);
23331 +#define pmd_pgtable(pmd) pmd_page(pmd)
23332
23333 -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
23334 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
23335  {
23336 -       struct page *pg;
23337 -
23338 -       pg = pte_alloc_one(mm, addr);
23339 -       return pg ? page_address(pg) : NULL;
23340 +       if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
23341 +               BUG_ON(HYPERVISOR_update_va_mapping(
23342 +                              (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
23343 +                              pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
23344 +               set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
23345 +       } else {
23346 +               *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
23347 +       }
23348  }
23349
23350 -static inline void pmd_free(pmd_t *pmd)
23351 +extern void __pmd_free(pgtable_t);
23352 +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
23353  {
23354         BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
23355 -       pte_free(virt_to_page(pmd));
23356 +       __pmd_free(virt_to_page(pmd));
23357  }
23358
23359 +extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
23360 +
23361  static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
23362  {
23363 -       struct page *pg;
23364 -
23365 -       pg = pte_alloc_one(mm, addr);
23366 -       return pg ? page_address(pg) : NULL;
23367 +       return (pud_t *)pmd_alloc_one(mm, addr);
23368  }
23369
23370 -static inline void pud_free(pud_t *pud)
23371 +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
23372  {
23373         BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
23374 -       pte_free(virt_to_page(pud));
23375 +       __pmd_free(virt_to_page(pud));
23376  }
23377
23378  static inline void pgd_list_add(pgd_t *pgd)
23379  {
23380         struct page *page = virt_to_page(pgd);
23381 +       unsigned long flags;
23382
23383 -       spin_lock(&pgd_lock);
23384 +       spin_lock_irqsave(&pgd_lock, flags);
23385         list_add(&page->lru, &pgd_list);
23386 -       spin_unlock(&pgd_lock);
23387 +       spin_unlock_irqrestore(&pgd_lock, flags);
23388  }
23389
23390  static inline void pgd_list_del(pgd_t *pgd)
23391  {
23392         struct page *page = virt_to_page(pgd);
23393 +       unsigned long flags;
23394
23395 -       spin_lock(&pgd_lock);
23396 +       spin_lock_irqsave(&pgd_lock, flags);
23397         list_del(&page->lru);
23398 -       spin_unlock(&pgd_lock);
23399 +       spin_unlock_irqrestore(&pgd_lock, flags);
23400  }
23401
23402  extern void pgd_test_and_unpin(pgd_t *);
23403 @@ -145,7 +133,7 @@ static inline pgd_t *pgd_alloc(struct mm
23404         return pgd;
23405  }
23406
23407 -static inline void pgd_free(pgd_t *pgd)
23408 +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
23409  {
23410         pgd_test_and_unpin(pgd);
23411         pgd_list_del(pgd);
23412 @@ -161,17 +149,30 @@ static inline pte_t *pte_alloc_one_kerne
23413         return pte;
23414  }
23415
23416 +extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
23417 +
23418  /* Should really implement gc for free page table pages. This could be
23419     done with a reference count in struct page. */
23420
23421 -static inline void pte_free_kernel(pte_t *pte)
23422 +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
23423  {
23424         BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
23425         make_page_writable(pte, XENFEAT_writable_page_tables);
23426         free_page((unsigned long)pte);
23427  }
23428
23429 -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
23430 +extern void __pte_free(pgtable_t);
23431 +static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
23432 +{
23433 +       __pte_free(pte);
23434 +}
23435 +
23436 +#define __pte_free_tlb(tlb,pte)                                \
23437 +do {                                                   \
23438 +       pgtable_page_dtor((pte));                               \
23439 +       tlb_remove_page((tlb), (pte));                  \
23440 +} while (0)
23441 +
23442  #define __pmd_free_tlb(tlb,x)   tlb_remove_page((tlb),virt_to_page(x))
23443  #define __pud_free_tlb(tlb,x)   tlb_remove_page((tlb),virt_to_page(x))
23444
23445 --- a/include/asm-x86/mach-xen/asm/pgtable_32.h
23446 +++ b/include/asm-x86/mach-xen/asm/pgtable_32.h
23447 @@ -1,8 +1,6 @@
23448  #ifndef _I386_PGTABLE_H
23449  #define _I386_PGTABLE_H
23450
23451 -#include <asm/hypervisor.h>
23452 -
23453  /*
23454   * The Linux memory management assumes a three-level page table setup. On
23455   * the i386, we use that, but "fold" the mid level into the top-level page
23456 @@ -25,20 +23,10 @@
23457
23458  struct vm_area_struct;
23459
23460 -/*
23461 - * ZERO_PAGE is a global shared page that is always zero: used
23462 - * for zero-mapped memory areas etc..
23463 - */
23464 -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
23465 -extern unsigned long empty_zero_page[1024];
23466  extern pgd_t *swapper_pg_dir;
23467 -extern struct kmem_cache *pmd_cache;
23468 -extern spinlock_t pgd_lock;
23469 -extern struct page *pgd_list;
23470 -void check_pgt_cache(void);
23471
23472 -void pmd_ctor(struct kmem_cache *, void *);
23473 -void pgtable_cache_init(void);
23474 +static inline void pgtable_cache_init(void) { }
23475 +static inline void check_pgt_cache(void) { }
23476  void paging_init(void);
23477
23478
23479 @@ -58,16 +46,9 @@ void paging_init(void);
23480  #define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
23481  #define PGDIR_MASK     (~(PGDIR_SIZE-1))
23482
23483 -#define USER_PTRS_PER_PGD      (TASK_SIZE/PGDIR_SIZE)
23484 -#define FIRST_USER_ADDRESS     0
23485 -
23486  #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
23487  #define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
23488
23489 -#define TWOLEVEL_PGDIR_SHIFT   22
23490 -#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
23491 -#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
23492 -
23493  /* Just any arbitrary offset to the start of the vmalloc VM area: the
23494   * current 8MB value just means that there will be a 8MB "hole" after the
23495   * physical memory until the kernel virtual memory starts.  That means that
23496 @@ -78,121 +59,19 @@ void paging_init(void);
23497  #define VMALLOC_OFFSET (8*1024*1024)
23498  #define VMALLOC_START  (((unsigned long) high_memory + \
23499                         2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
23500 -#ifdef CONFIG_HIGHMEM
23501 -# define VMALLOC_END   (PKMAP_BASE-2*PAGE_SIZE)
23502 -#else
23503 -# define VMALLOC_END   (FIXADDR_START-2*PAGE_SIZE)
23504 -#endif
23505 -
23506 -/*
23507 - * _PAGE_PSE set in the page directory entry just means that
23508 - * the page directory entry points directly to a 4MB-aligned block of
23509 - * memory.
23510 - */
23511 -#define _PAGE_BIT_PRESENT      0
23512 -#define _PAGE_BIT_RW           1
23513 -#define _PAGE_BIT_USER         2
23514 -#define _PAGE_BIT_PWT          3
23515 -#define _PAGE_BIT_PCD          4
23516 -#define _PAGE_BIT_ACCESSED     5
23517 -#define _PAGE_BIT_DIRTY                6
23518 -#define _PAGE_BIT_PSE          7       /* 4 MB (or 2MB) page, Pentium+, if present.. */
23519 -#define _PAGE_BIT_GLOBAL       8       /* Global TLB entry PPro+ */
23520 -/*#define _PAGE_BIT_UNUSED1    9*/     /* available for programmer */
23521 -#define _PAGE_BIT_UNUSED2      10
23522 -#define _PAGE_BIT_UNUSED3      11
23523 -#define _PAGE_BIT_NX           63
23524 -
23525 -#define _PAGE_PRESENT  0x001
23526 -#define _PAGE_RW       0x002
23527 -#define _PAGE_USER     0x004
23528 -#define _PAGE_PWT      0x008
23529 -#define _PAGE_PCD      0x010
23530 -#define _PAGE_ACCESSED 0x020
23531 -#define _PAGE_DIRTY    0x040
23532 -#define _PAGE_PSE      0x080   /* 4 MB (or 2MB) page, Pentium+, if present.. */
23533 -#define _PAGE_GLOBAL   0x100   /* Global TLB entry PPro+ */
23534 -/*#define _PAGE_UNUSED1        0x200*/ /* available for programmer */
23535 -#define _PAGE_UNUSED2  0x400
23536 -#define _PAGE_UNUSED3  0x800
23537 -
23538 -/* If _PAGE_PRESENT is clear, we use these: */
23539 -#define _PAGE_FILE     0x040   /* nonlinear file mapping, saved PTE; unset:swap */
23540 -#define _PAGE_PROTNONE 0x080   /* if the user mapped it with PROT_NONE;
23541 -                                  pte_present gives true */
23542  #ifdef CONFIG_X86_PAE
23543 -#define _PAGE_NX       (1ULL<<_PAGE_BIT_NX)
23544 +#define LAST_PKMAP 512
23545  #else
23546 -#define _PAGE_NX       0
23547 +#define LAST_PKMAP 1024
23548  #endif
23549
23550 -/* Mapped page is I/O or foreign and has no associated page struct. */
23551 -#define _PAGE_IO       0x200
23552 +#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
23553
23554 -#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
23555 -#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
23556 -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
23557 -
23558 -#define PAGE_NONE \
23559 -       __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
23560 -#define PAGE_SHARED \
23561 -       __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
23562 -
23563 -#define PAGE_SHARED_EXEC \
23564 -       __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
23565 -#define PAGE_COPY_NOEXEC \
23566 -       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23567 -#define PAGE_COPY_EXEC \
23568 -       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23569 -#define PAGE_COPY \
23570 -       PAGE_COPY_NOEXEC
23571 -#define PAGE_READONLY \
23572 -       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23573 -#define PAGE_READONLY_EXEC \
23574 -       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23575 -
23576 -#define _PAGE_KERNEL \
23577 -       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
23578 -#define _PAGE_KERNEL_EXEC \
23579 -       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
23580 -
23581 -extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
23582 -#define __PAGE_KERNEL_RO               (__PAGE_KERNEL & ~_PAGE_RW)
23583 -#define __PAGE_KERNEL_RX               (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
23584 -#define __PAGE_KERNEL_NOCACHE          (__PAGE_KERNEL | _PAGE_PCD)
23585 -#define __PAGE_KERNEL_LARGE            (__PAGE_KERNEL | _PAGE_PSE)
23586 -#define __PAGE_KERNEL_LARGE_EXEC       (__PAGE_KERNEL_EXEC | _PAGE_PSE)
23587 -
23588 -#define PAGE_KERNEL            __pgprot(__PAGE_KERNEL)
23589 -#define PAGE_KERNEL_RO         __pgprot(__PAGE_KERNEL_RO)
23590 -#define PAGE_KERNEL_EXEC       __pgprot(__PAGE_KERNEL_EXEC)
23591 -#define PAGE_KERNEL_RX         __pgprot(__PAGE_KERNEL_RX)
23592 -#define PAGE_KERNEL_NOCACHE    __pgprot(__PAGE_KERNEL_NOCACHE)
23593 -#define PAGE_KERNEL_LARGE      __pgprot(__PAGE_KERNEL_LARGE)
23594 -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
23595 -
23596 -/*
23597 - * The i386 can't do page protection for execute, and considers that
23598 - * the same are read. Also, write permissions imply read permissions.
23599 - * This is the closest we can get..
23600 - */
23601 -#define __P000 PAGE_NONE
23602 -#define __P001 PAGE_READONLY
23603 -#define __P010 PAGE_COPY
23604 -#define __P011 PAGE_COPY
23605 -#define __P100 PAGE_READONLY_EXEC
23606 -#define __P101 PAGE_READONLY_EXEC
23607 -#define __P110 PAGE_COPY_EXEC
23608 -#define __P111 PAGE_COPY_EXEC
23609 -
23610 -#define __S000 PAGE_NONE
23611 -#define __S001 PAGE_READONLY
23612 -#define __S010 PAGE_SHARED
23613 -#define __S011 PAGE_SHARED
23614 -#define __S100 PAGE_READONLY_EXEC
23615 -#define __S101 PAGE_READONLY_EXEC
23616 -#define __S110 PAGE_SHARED_EXEC
23617 -#define __S111 PAGE_SHARED_EXEC
23618 +#ifdef CONFIG_HIGHMEM
23619 +# define VMALLOC_END   (PKMAP_BASE-2*PAGE_SIZE)
23620 +#else
23621 +# define VMALLOC_END   (FIXADDR_START-2*PAGE_SIZE)
23622 +#endif
23623
23624  /*
23625   * Define this if things work differently on an i386 and an i486:
23626 @@ -221,28 +100,6 @@ extern unsigned long pg0[];
23627
23628  #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
23629
23630 -/*
23631 - * The following only work if pte_present() is true.
23632 - * Undefined behaviour if not..
23633 - */
23634 -static inline int pte_dirty(pte_t pte)         { return (pte).pte_low & _PAGE_DIRTY; }
23635 -static inline int pte_young(pte_t pte)         { return (pte).pte_low & _PAGE_ACCESSED; }
23636 -static inline int pte_write(pte_t pte)         { return (pte).pte_low & _PAGE_RW; }
23637 -static inline int pte_huge(pte_t pte)          { return (pte).pte_low & _PAGE_PSE; }
23638 -
23639 -/*
23640 - * The following only works if pte_present() is not true.
23641 - */
23642 -static inline int pte_file(pte_t pte)          { return (pte).pte_low & _PAGE_FILE; }
23643 -
23644 -static inline pte_t pte_mkclean(pte_t pte)     { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
23645 -static inline pte_t pte_mkold(pte_t pte)       { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
23646 -static inline pte_t pte_wrprotect(pte_t pte)   { (pte).pte_low &= ~_PAGE_RW; return pte; }
23647 -static inline pte_t pte_mkdirty(pte_t pte)     { (pte).pte_low |= _PAGE_DIRTY; return pte; }
23648 -static inline pte_t pte_mkyoung(pte_t pte)     { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
23649 -static inline pte_t pte_mkwrite(pte_t pte)     { (pte).pte_low |= _PAGE_RW; return pte; }
23650 -static inline pte_t pte_mkhuge(pte_t pte)      { (pte).pte_low |= _PAGE_PSE; return pte; }
23651 -
23652  #ifdef CONFIG_X86_PAE
23653  # include <asm/pgtable-3level.h>
23654  #else
23655 @@ -250,111 +107,6 @@ static inline pte_t pte_mkhuge(pte_t pte
23656  #endif
23657
23658  /*
23659 - * Rules for using pte_update - it must be called after any PTE update which
23660 - * has not been done using the set_pte / clear_pte interfaces.  It is used by
23661 - * shadow mode hypervisors to resynchronize the shadow page tables.  Kernel PTE
23662 - * updates should either be sets, clears, or set_pte_atomic for P->P
23663 - * transitions, which means this hook should only be called for user PTEs.
23664 - * This hook implies a P->P protection or access change has taken place, which
23665 - * requires a subsequent TLB flush.  The notification can optionally be delayed
23666 - * until the TLB flush event by using the pte_update_defer form of the
23667 - * interface, but care must be taken to assure that the flush happens while
23668 - * still holding the same page table lock so that the shadow and primary pages
23669 - * do not become out of sync on SMP.
23670 - */
23671 -#define pte_update(mm, addr, ptep)             do { } while (0)
23672 -#define pte_update_defer(mm, addr, ptep)       do { } while (0)
23673 -
23674 -/* local pte updates need not use xchg for locking */
23675 -static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
23676 -{
23677 -       xen_set_pte(ptep, __pte(0));
23678 -       return res;
23679 -}
23680 -
23681 -/*
23682 - * We only update the dirty/accessed state if we set
23683 - * the dirty bit by hand in the kernel, since the hardware
23684 - * will do the accessed bit for us, and we don't want to
23685 - * race with other CPU's that might be updating the dirty
23686 - * bit at the same time.
23687 - */
23688 -#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
23689 -#define ptep_set_access_flags(vma, address, ptep, entry, dirty)                \
23690 -({                                                                     \
23691 -       int __changed = !pte_same(*(ptep), entry);                      \
23692 -       if (__changed && (dirty)) {                                     \
23693 -               if ( likely((vma)->vm_mm == current->mm) ) {            \
23694 -                       BUG_ON(HYPERVISOR_update_va_mapping(address,    \
23695 -                               entry,                                  \
23696 -                               (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
23697 -                                       UVMF_INVLPG|UVMF_MULTI));       \
23698 -               } else {                                                \
23699 -                       xen_l1_entry_update(ptep, entry);               \
23700 -                       flush_tlb_page(vma, address);                   \
23701 -               }                                                       \
23702 -       }                                                               \
23703 -       __changed;                                                      \
23704 -})
23705 -
23706 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
23707 -#define ptep_test_and_clear_young(vma, addr, ptep) ({                  \
23708 -       int __ret = 0;                                                  \
23709 -       if (pte_young(*(ptep)))                                         \
23710 -               __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,          \
23711 -                                               &(ptep)->pte_low);      \
23712 -       if (__ret)                                                      \
23713 -               pte_update((vma)->vm_mm, addr, ptep);                   \
23714 -       __ret;                                                          \
23715 -})
23716 -
23717 -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
23718 -#define ptep_clear_flush_young(vma, address, ptep)                     \
23719 -({                                                                     \
23720 -       pte_t __pte = *(ptep);                                          \
23721 -       int __young = pte_young(__pte);                                 \
23722 -       __pte = pte_mkold(__pte);                                       \
23723 -       if (PagePinned(virt_to_page((vma)->vm_mm->pgd)))                \
23724 -               (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
23725 -       else if (__young)                                               \
23726 -               (ptep)->pte_low = __pte.pte_low;                        \
23727 -       __young;                                                        \
23728 -})
23729 -
23730 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
23731 -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23732 -{
23733 -       pte_t pte = *ptep;
23734 -       if (!pte_none(pte)
23735 -           && (mm != &init_mm
23736 -               || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
23737 -               pte = xen_ptep_get_and_clear(ptep, pte);
23738 -               pte_update(mm, addr, ptep);
23739 -       }
23740 -       return pte;
23741 -}
23742 -
23743 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
23744 -#define ptep_get_and_clear_full(mm, addr, ptep, full)                  \
23745 -       ((full) ? ({                                                    \
23746 -               pte_t __res = *(ptep);                                  \
23747 -               if (PagePinned(virt_to_page((mm)->pgd)))                \
23748 -                       xen_l1_entry_update(ptep, __pte(0));            \
23749 -               else                                                    \
23750 -                       *(ptep) = __pte(0);                             \
23751 -               __res;                                                  \
23752 -        }) :                                                           \
23753 -        ptep_get_and_clear(mm, addr, ptep))
23754 -
23755 -#define __HAVE_ARCH_PTEP_SET_WRPROTECT
23756 -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23757 -{
23758 -       pte_t pte = *ptep;
23759 -       if (pte_write(pte))
23760 -               set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
23761 -}
23762 -
23763 -/*
23764   * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
23765   *
23766   *  dst - pointer to pgd range anwhere on a pgd page
23767 @@ -383,26 +135,6 @@ static inline void clone_pgd_range(pgd_t
23768
23769  #define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))
23770
23771 -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
23772 -{
23773 -       /*
23774 -        * Since this might change the present bit (which controls whether
23775 -        * a pte_t object has undergone p2m translation), we must use
23776 -        * pte_val() on the input pte and __pte() for the return value.
23777 -        */
23778 -       paddr_t pteval = pte_val(pte);
23779 -
23780 -       pteval &= _PAGE_CHG_MASK;
23781 -       pteval |= pgprot_val(newprot);
23782 -#ifdef CONFIG_X86_PAE
23783 -       pteval &= __supported_pte_mask;
23784 -#endif
23785 -       return __pte(pteval);
23786 -}
23787 -
23788 -#define pmd_large(pmd) \
23789 -((__pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
23790 -
23791  /*
23792   * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
23793   *
23794 @@ -424,6 +156,8 @@ static inline pte_t pte_modify(pte_t pte
23795   */
23796  #define pgd_offset_k(address) pgd_offset(&init_mm, address)
23797
23798 +static inline int pud_large(pud_t pud) { return 0; }
23799 +
23800  /*
23801   * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
23802   *
23803 @@ -449,26 +183,6 @@ static inline pte_t pte_modify(pte_t pte
23804  #define pmd_page_vaddr(pmd) \
23805                 ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
23806
23807 -/*
23808 - * Helper function that returns the kernel pagetable entry controlling
23809 - * the virtual address 'address'. NULL means no pagetable entry present.
23810 - * NOTE: the return type is pte_t but if the pmd is PSE then we return it
23811 - * as a pte too.
23812 - */
23813 -extern pte_t *lookup_address(unsigned long address);
23814 -
23815 -/*
23816 - * Make a given kernel text page executable/non-executable.
23817 - * Returns the previous executability setting of that page (which
23818 - * is used to restore the previous state). Used by the SMP bootup code.
23819 - * NOTE: this is an __init function for security reasons.
23820 - */
23821 -#ifdef CONFIG_X86_PAE
23822 - extern int set_kernel_exec(unsigned long vaddr, int enable);
23823 -#else
23824 - static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
23825 -#endif
23826 -
23827  #if defined(CONFIG_HIGHPTE)
23828  #define pte_offset_map(dir, address) \
23829         ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
23830 @@ -496,59 +210,22 @@ extern pte_t *lookup_address(unsigned lo
23831   */
23832  #define update_mmu_cache(vma,address,pte) do { } while (0)
23833
23834 -#include <xen/features.h>
23835  void make_lowmem_page_readonly(void *va, unsigned int feature);
23836  void make_lowmem_page_writable(void *va, unsigned int feature);
23837 -void make_page_readonly(void *va, unsigned int feature);
23838 -void make_page_writable(void *va, unsigned int feature);
23839 -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
23840 -void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
23841 -
23842 -#define virt_to_ptep(va)                                               \
23843 -({                                                                     \
23844 -       pte_t *__ptep = lookup_address((unsigned long)(va));            \
23845 -       BUG_ON(!__ptep || !pte_present(*__ptep));                       \
23846 -       __ptep;                                                         \
23847 -})
23848 -
23849 -#define arbitrary_virt_to_machine(va)                                  \
23850 -       (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT)            \
23851 -        | ((unsigned long)(va) & (PAGE_SIZE - 1)))
23852
23853  #endif /* !__ASSEMBLY__ */
23854
23855 +/*
23856 + * kern_addr_valid() is (1) for FLATMEM and (0) for
23857 + * SPARSEMEM and DISCONTIGMEM
23858 + */
23859  #ifdef CONFIG_FLATMEM
23860  #define kern_addr_valid(addr)  (1)
23861 -#endif /* CONFIG_FLATMEM */
23862 -
23863 -int direct_remap_pfn_range(struct vm_area_struct *vma,
23864 -                           unsigned long address,
23865 -                           unsigned long mfn,
23866 -                           unsigned long size,
23867 -                           pgprot_t prot,
23868 -                           domid_t  domid);
23869 -int direct_kernel_remap_pfn_range(unsigned long address,
23870 -                                 unsigned long mfn,
23871 -                                 unsigned long size,
23872 -                                 pgprot_t prot,
23873 -                                 domid_t  domid);
23874 -int create_lookup_pte_addr(struct mm_struct *mm,
23875 -                           unsigned long address,
23876 -                           uint64_t *ptep);
23877 -int touch_pte_range(struct mm_struct *mm,
23878 -                    unsigned long address,
23879 -                    unsigned long size);
23880 -
23881 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
23882 -               unsigned long addr, unsigned long end, pgprot_t newprot,
23883 -               int dirty_accountable);
23884 -
23885 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
23886 -       xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
23887 +#else
23888 +#define kern_addr_valid(kaddr) (0)
23889 +#endif
23890
23891  #define io_remap_pfn_range(vma,from,pfn,size,prot) \
23892  direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
23893
23894 -#include <asm-generic/pgtable.h>
23895 -
23896  #endif /* _I386_PGTABLE_H */
23897 --- a/include/asm-x86/mach-xen/asm/pgtable-3level.h
23898 +++ b/include/asm-x86/mach-xen/asm/pgtable-3level.h
23899 @@ -18,16 +18,18 @@
23900         printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
23901                &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
23902
23903 -#define pud_none(pud)                          0
23904 -#define pud_bad(pud)                           0
23905 -#define pud_present(pud)                       1
23906
23907 -/*
23908 - * All present pages with !NX bit are kernel-executable:
23909 - */
23910 -static inline int pte_exec_kernel(pte_t pte)
23911 +static inline int pud_none(pud_t pud)
23912 +{
23913 +       return __pud_val(pud) == 0;
23914 +}
23915 +static inline int pud_bad(pud_t pud)
23916  {
23917 -       return !(__pte_val(pte) & _PAGE_NX);
23918 +       return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
23919 +}
23920 +static inline int pud_present(pud_t pud)
23921 +{
23922 +       return __pud_val(pud) & _PAGE_PRESENT;
23923  }
23924
23925  /* Rules for using set_pte: the pte being assigned *must* be
23926 @@ -44,14 +46,6 @@ static inline void xen_set_pte(pte_t *pt
23927         ptep->pte_low = pte.pte_low;
23928  }
23929
23930 -static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
23931 -                                 pte_t *ptep , pte_t pte)
23932 -{
23933 -       if ((mm != current->mm && mm != &init_mm) ||
23934 -           HYPERVISOR_update_va_mapping(addr, pte, 0))
23935 -               xen_set_pte(ptep, pte);
23936 -}
23937 -
23938  static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
23939  {
23940         set_64bit((unsigned long long *)(ptep),__pte_val(pte));
23941 @@ -70,14 +64,11 @@ static inline void xen_set_pud(pud_t *pu
23942   * entry, so clear the bottom half first and enforce ordering with a compiler
23943   * barrier.
23944   */
23945 -static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23946 +static inline void __xen_pte_clear(pte_t *ptep)
23947  {
23948 -       if ((mm != current->mm && mm != &init_mm)
23949 -           || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
23950 -               ptep->pte_low = 0;
23951 -               smp_wmb();
23952 -               ptep->pte_high = 0;
23953 -       }
23954 +       ptep->pte_low = 0;
23955 +       smp_wmb();
23956 +       ptep->pte_high = 0;
23957  }
23958
23959  static inline void xen_pmd_clear(pmd_t *pmd)
23960 @@ -85,21 +76,25 @@ static inline void xen_pmd_clear(pmd_t *
23961         xen_l2_entry_update(pmd, __pmd(0));
23962  }
23963
23964 -#define set_pte(ptep, pte)                     xen_set_pte(ptep, pte)
23965 -#define set_pte_at(mm, addr, ptep, pte)                xen_set_pte_at(mm, addr, ptep, pte)
23966 -#define set_pte_atomic(ptep, pte)              xen_set_pte_atomic(ptep, pte)
23967 -#define set_pmd(pmdp, pmd)                     xen_set_pmd(pmdp, pmd)
23968 -#define set_pud(pudp, pud)                     xen_set_pud(pudp, pud)
23969 -#define pte_clear(mm, addr, ptep)              xen_pte_clear(mm, addr, ptep)
23970 -#define pmd_clear(pmd)                         xen_pmd_clear(pmd)
23971 +static inline void pud_clear(pud_t *pudp)
23972 +{
23973 +       pgdval_t pgd;
23974 +
23975 +       set_pud(pudp, __pud(0));
23976
23977 -/*
23978 - * Pentium-II erratum A13: in PAE mode we explicitly have to flush
23979 - * the TLB via cr3 if the top-level pgd is changed...
23980 - * We do not let the generic code free and clear pgd entries due to
23981 - * this erratum.
23982 - */
23983 -static inline void pud_clear (pud_t * pud) { }
23984 +       /*
23985 +        * According to Intel App note "TLBs, Paging-Structure Caches,
23986 +        * and Their Invalidation", April 2007, document 317080-001,
23987 +        * section 8.1: in PAE mode we explicitly have to flush the
23988 +        * TLB via cr3 if the top-level pgd is changed...
23989 +        *
23990 +        * Make sure the pud entry we're updating is within the
23991 +        * current pgd to avoid unnecessary TLB flushes.
23992 +        */
23993 +       pgd = read_cr3();
23994 +       if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
23995 +               xen_tlb_flush();
23996 +}
23997
23998  #define pud_page(pud) \
23999  ((struct page *) __va(pud_val(pud) & PAGE_MASK))
24000 @@ -128,24 +123,6 @@ static inline pte_t xen_ptep_get_and_cle
24001  #define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
24002  #endif
24003
24004 -#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
24005 -#define ptep_clear_flush(vma, addr, ptep)                      \
24006 -({                                                             \
24007 -       pte_t *__ptep = (ptep);                                 \
24008 -       pte_t __res = *__ptep;                                  \
24009 -       if (!pte_none(__res) &&                                 \
24010 -           ((vma)->vm_mm != current->mm ||                     \
24011 -            HYPERVISOR_update_va_mapping(addr, __pte(0),       \
24012 -                       (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24013 -                               UVMF_INVLPG|UVMF_MULTI))) {     \
24014 -               __ptep->pte_low = 0;                            \
24015 -               smp_wmb();                                      \
24016 -               __ptep->pte_high = 0;                           \
24017 -               flush_tlb_page(vma, addr);                      \
24018 -       }                                                       \
24019 -       __res;                                                  \
24020 -})
24021 -
24022  #define __HAVE_ARCH_PTE_SAME
24023  static inline int pte_same(pte_t a, pte_t b)
24024  {
24025 @@ -168,26 +145,12 @@ static inline int pte_none(pte_t pte)
24026                        mfn_to_local_pfn(__pte_mfn(_pte)) :      \
24027                        __pte_mfn(_pte))
24028
24029 -extern unsigned long long __supported_pte_mask;
24030 -
24031 -static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
24032 -{
24033 -       return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
24034 -                     pgprot_val(pgprot)) & __supported_pte_mask);
24035 -}
24036 -
24037 -static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
24038 -{
24039 -       return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
24040 -                     pgprot_val(pgprot)) & __supported_pte_mask);
24041 -}
24042 -
24043  /*
24044   * Bits 0, 6 and 7 are taken in the low part of the pte,
24045   * put the 32 bits of offset into the high part.
24046   */
24047  #define pte_to_pgoff(pte) ((pte).pte_high)
24048 -#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
24049 +#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
24050  #define PTE_FILE_MAX_BITS       32
24051
24052  /* Encode and de-code a swap entry */
24053 @@ -195,8 +158,6 @@ static inline pmd_t pfn_pmd(unsigned lon
24054  #define __swp_offset(x)                        ((x).val >> 5)
24055  #define __swp_entry(type, offset)      ((swp_entry_t){(type) | (offset) << 5})
24056  #define __pte_to_swp_entry(pte)                ((swp_entry_t){ (pte).pte_high })
24057 -#define __swp_entry_to_pte(x)          ((pte_t){ 0, (x).val })
24058 -
24059 -#define __pmd_free_tlb(tlb, x)         do { } while (0)
24060 +#define __swp_entry_to_pte(x)          ((pte_t){ { .pte_high = (x).val } })
24061
24062  #endif /* _I386_PGTABLE_3LEVEL_H */
24063 --- a/include/asm-x86/mach-xen/asm/pgtable_64.h
24064 +++ b/include/asm-x86/mach-xen/asm/pgtable_64.h
24065 @@ -13,47 +13,26 @@
24066  #include <linux/threads.h>
24067  #include <linux/sched.h>
24068  #include <asm/pda.h>
24069 -#ifdef CONFIG_XEN
24070 -#include <asm/hypervisor.h>
24071
24072 +#ifdef CONFIG_XEN
24073  extern pud_t level3_user_pgt[512];
24074
24075  extern void xen_init_pt(void);
24076 -
24077 -extern pte_t *lookup_address(unsigned long address);
24078 -
24079 -#define virt_to_ptep(va)                                               \
24080 -({                                                                     \
24081 -       pte_t *__ptep = lookup_address((unsigned long)(va));            \
24082 -       BUG_ON(!__ptep || !pte_present(*__ptep));                       \
24083 -       __ptep;                                                         \
24084 -})
24085 -
24086 -#define arbitrary_virt_to_machine(va)                                  \
24087 -       (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT)            \
24088 -        | ((unsigned long)(va) & (PAGE_SIZE - 1)))
24089  #endif
24090
24091  extern pud_t level3_kernel_pgt[512];
24092  extern pud_t level3_ident_pgt[512];
24093  extern pmd_t level2_kernel_pgt[512];
24094  extern pgd_t init_level4_pgt[];
24095 -extern unsigned long __supported_pte_mask;
24096
24097  #define swapper_pg_dir init_level4_pgt
24098
24099  extern void paging_init(void);
24100 -extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
24101 -
24102 -/*
24103 - * ZERO_PAGE is a global shared page that is always zero: used
24104 - * for zero-mapped memory areas etc..
24105 - */
24106 -extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
24107 -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
24108
24109  #endif /* !__ASSEMBLY__ */
24110
24111 +#define SHARED_KERNEL_PMD      1
24112 +
24113  /*
24114   * PGDIR_SHIFT determines what a top-level page table entry can map
24115   */
24116 @@ -96,31 +75,63 @@ extern unsigned long empty_zero_page[PAG
24117  #define pgd_none(x)    (!__pgd_val(x))
24118  #define pud_none(x)    (!__pud_val(x))
24119
24120 -static inline void set_pte(pte_t *dst, pte_t val)
24121 +struct mm_struct;
24122 +
24123 +#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
24124 +
24125 +static inline void xen_set_pte(pte_t *ptep, pte_t pte)
24126 +{
24127 +       *ptep = pte;
24128 +}
24129 +
24130 +static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
24131 +{
24132 +       xen_set_pte(ptep, pte);
24133 +}
24134 +
24135 +#ifdef CONFIG_SMP
24136 +static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret)
24137 +{
24138 +       return __pte_ma(xchg(&xp->pte, 0));
24139 +}
24140 +#else
24141 +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
24142 +#endif
24143 +
24144 +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
24145  {
24146 -       *dst = val;
24147 +       xen_l2_entry_update(pmdp, pmd);
24148  }
24149
24150 -#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
24151 -#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
24152 -#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
24153 +static inline void xen_pmd_clear(pmd_t *pmd)
24154 +{
24155 +       xen_set_pmd(pmd, xen_make_pmd(0));
24156 +}
24157 +
24158 +static inline void xen_set_pud(pud_t *pudp, pud_t pud)
24159 +{
24160 +       xen_l3_entry_update(pudp, pud);
24161 +}
24162
24163 -static inline void pud_clear (pud_t * pud)
24164 +static inline void xen_pud_clear(pud_t *pud)
24165  {
24166 -       set_pud(pud, __pud(0));
24167 +       xen_set_pud(pud, xen_make_pud(0));
24168  }
24169
24170  #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
24171
24172 -static inline void pgd_clear (pgd_t * pgd)
24173 +static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
24174  {
24175 -        set_pgd(pgd, __pgd(0));
24176 -        set_pgd(__user_pgd(pgd), __pgd(0));
24177 +       xen_l4_entry_update(pgdp, pgd);
24178  }
24179
24180 -#define pte_same(a, b)         ((a).pte == (b).pte)
24181 +static inline void xen_pgd_clear(pgd_t * pgd)
24182 +{
24183 +       xen_set_pgd(pgd, xen_make_pgd(0));
24184 +       xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0));
24185 +}
24186
24187 -#define pte_pgprot(a)  (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
24188 +#define pte_same(a, b)         ((a).pte == (b).pte)
24189
24190  #endif /* !__ASSEMBLY__ */
24191
24192 @@ -131,8 +142,6 @@ static inline void pgd_clear (pgd_t * pg
24193  #define PGDIR_SIZE     (_AC(1,UL) << PGDIR_SHIFT)
24194  #define PGDIR_MASK     (~(PGDIR_SIZE-1))
24195
24196 -#define USER_PTRS_PER_PGD      ((TASK_SIZE-1)/PGDIR_SIZE+1)
24197 -#define FIRST_USER_ADDRESS     0
24198
24199  #define MAXMEM          _AC(0x3fffffffffff, UL)
24200  #define VMALLOC_START    _AC(0xffffc20000000000, UL)
24201 @@ -142,105 +151,6 @@ static inline void pgd_clear (pgd_t * pg
24202  #define MODULES_END      _AC(0xfffffffffff00000, UL)
24203  #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
24204
24205 -#define _PAGE_BIT_PRESENT      0
24206 -#define _PAGE_BIT_RW           1
24207 -#define _PAGE_BIT_USER         2
24208 -#define _PAGE_BIT_PWT          3
24209 -#define _PAGE_BIT_PCD          4
24210 -#define _PAGE_BIT_ACCESSED     5
24211 -#define _PAGE_BIT_DIRTY                6
24212 -#define _PAGE_BIT_PSE          7       /* 4 MB (or 2MB) page */
24213 -#define _PAGE_BIT_GLOBAL       8       /* Global TLB entry PPro+ */
24214 -#define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
24215 -
24216 -#define _PAGE_PRESENT  0x001
24217 -#define _PAGE_RW       0x002
24218 -#define _PAGE_USER     0x004
24219 -#define _PAGE_PWT      0x008
24220 -#define _PAGE_PCD      0x010
24221 -#define _PAGE_ACCESSED 0x020
24222 -#define _PAGE_DIRTY    0x040
24223 -#define _PAGE_PSE      0x080   /* 2MB page */
24224 -#define _PAGE_FILE     0x040   /* nonlinear file mapping, saved PTE; unset:swap */
24225 -#define _PAGE_GLOBAL   0x100   /* Global TLB entry */
24226 -
24227 -#define _PAGE_PROTNONE 0x080   /* If not present */
24228 -#define _PAGE_NX        (_AC(1,UL)<<_PAGE_BIT_NX)
24229 -
24230 -/* Mapped page is I/O or foreign and has no associated page struct. */
24231 -#define _PAGE_IO       0x200
24232 -
24233 -#ifndef __ASSEMBLY__
24234 -#if CONFIG_XEN_COMPAT <= 0x030002
24235 -extern unsigned int __kernel_page_user;
24236 -#else
24237 -#define __kernel_page_user 0
24238 -#endif
24239 -#endif
24240 -
24241 -#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
24242 -#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
24243 -
24244 -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
24245 -
24246 -#define PAGE_NONE      __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
24247 -#define PAGE_SHARED    __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24248 -#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
24249 -#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24250 -#define PAGE_COPY PAGE_COPY_NOEXEC
24251 -#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24252 -#define PAGE_READONLY  __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24253 -#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24254 -#define __PAGE_KERNEL \
24255 -       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24256 -#define __PAGE_KERNEL_EXEC \
24257 -       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
24258 -#define __PAGE_KERNEL_NOCACHE \
24259 -       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24260 -#define __PAGE_KERNEL_RO \
24261 -       (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24262 -#define __PAGE_KERNEL_VSYSCALL \
24263 -       (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24264 -#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
24265 -       (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
24266 -#define __PAGE_KERNEL_LARGE \
24267 -       (__PAGE_KERNEL | _PAGE_PSE)
24268 -#define __PAGE_KERNEL_LARGE_EXEC \
24269 -       (__PAGE_KERNEL_EXEC | _PAGE_PSE)
24270 -
24271 -/*
24272 - * We don't support GLOBAL page in xenolinux64
24273 - */
24274 -#define MAKE_GLOBAL(x) __pgprot((x))
24275 -
24276 -#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
24277 -#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
24278 -#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
24279 -#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
24280 -#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
24281 -#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
24282 -#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
24283 -#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
24284 -
24285 -/*         xwr */
24286 -#define __P000 PAGE_NONE
24287 -#define __P001 PAGE_READONLY
24288 -#define __P010 PAGE_COPY
24289 -#define __P011 PAGE_COPY
24290 -#define __P100 PAGE_READONLY_EXEC
24291 -#define __P101 PAGE_READONLY_EXEC
24292 -#define __P110 PAGE_COPY_EXEC
24293 -#define __P111 PAGE_COPY_EXEC
24294 -
24295 -#define __S000 PAGE_NONE
24296 -#define __S001 PAGE_READONLY
24297 -#define __S010 PAGE_SHARED
24298 -#define __S011 PAGE_SHARED
24299 -#define __S100 PAGE_READONLY_EXEC
24300 -#define __S101 PAGE_READONLY_EXEC
24301 -#define __S110 PAGE_SHARED_EXEC
24302 -#define __S111 PAGE_SHARED_EXEC
24303 -
24304  #ifndef __ASSEMBLY__
24305
24306  static inline unsigned long pgd_bad(pgd_t pgd)
24307 @@ -258,119 +168,26 @@ static inline unsigned long pmd_bad(pmd_
24308         return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
24309  }
24310
24311 -#define set_pte_at(_mm,addr,ptep,pteval) do {                          \
24312 -       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
24313 -           HYPERVISOR_update_va_mapping((addr), (pteval), 0))          \
24314 -               set_pte((ptep), (pteval));                              \
24315 -} while (0)
24316 -
24317  #define pte_none(x)    (!(x).pte)
24318  #define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
24319 -#define pte_clear(mm,addr,xp)  do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
24320
24321 -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
24322 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))        /* FIXME: is this right? */
24323
24324  #define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
24325  #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
24326         __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
24327 -#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? end_pfn :       \
24328 +#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr :     \
24329                        (_pte).pte & _PAGE_PRESENT ?             \
24330                        mfn_to_local_pfn(__pte_mfn(_pte)) :      \
24331                        __pte_mfn(_pte))
24332
24333  #define pte_page(x)    pfn_to_page(pte_pfn(x))
24334
24335 -static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
24336 -{
24337 -       unsigned long pte = page_nr << PAGE_SHIFT;
24338 -       pte |= pgprot_val(pgprot);
24339 -       pte &= __supported_pte_mask;
24340 -       return __pte(pte);
24341 -}
24342 -
24343 -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24344 -{
24345 -       pte_t pte = *ptep;
24346 -       if (!pte_none(pte)) {
24347 -               if ((mm != &init_mm) ||
24348 -                   HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
24349 -                       pte = __pte_ma(xchg(&ptep->pte, 0));
24350 -       }
24351 -       return pte;
24352 -}
24353 -
24354 -static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
24355 -{
24356 -       if (full) {
24357 -               pte_t pte = *ptep;
24358 -               if (PagePinned(virt_to_page(mm->pgd)))
24359 -                       xen_l1_entry_update(ptep, __pte(0));
24360 -               else
24361 -                       *ptep = __pte(0);
24362 -               return pte;
24363 -       }
24364 -       return ptep_get_and_clear(mm, addr, ptep);
24365 -}
24366 -
24367 -#define ptep_clear_flush(vma, addr, ptep)                      \
24368 -({                                                             \
24369 -       pte_t *__ptep = (ptep);                                 \
24370 -       pte_t __res = *__ptep;                                  \
24371 -       if (!pte_none(__res) &&                                 \
24372 -           ((vma)->vm_mm != current->mm ||                     \
24373 -            HYPERVISOR_update_va_mapping(addr, __pte(0),       \
24374 -                       (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24375 -                               UVMF_INVLPG|UVMF_MULTI))) {     \
24376 -               __ptep->pte = 0;                                \
24377 -               flush_tlb_page(vma, addr);                      \
24378 -       }                                                       \
24379 -       __res;                                                  \
24380 -})
24381 -
24382 -/*
24383 - * The following only work if pte_present() is true.
24384 - * Undefined behaviour if not..
24385 - */
24386 -#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
24387 -static inline int pte_dirty(pte_t pte)         { return __pte_val(pte) & _PAGE_DIRTY; }
24388 -static inline int pte_young(pte_t pte)         { return __pte_val(pte) & _PAGE_ACCESSED; }
24389 -static inline int pte_write(pte_t pte)         { return __pte_val(pte) & _PAGE_RW; }
24390 -static inline int pte_file(pte_t pte)          { return __pte_val(pte) & _PAGE_FILE; }
24391 -static inline int pte_huge(pte_t pte)          { return __pte_val(pte) & _PAGE_PSE; }
24392 -
24393 -static inline pte_t pte_mkclean(pte_t pte)     { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
24394 -static inline pte_t pte_mkold(pte_t pte)       { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
24395 -static inline pte_t pte_wrprotect(pte_t pte)   { __pte_val(pte) &= ~_PAGE_RW; return pte; }
24396 -static inline pte_t pte_mkexec(pte_t pte)      { __pte_val(pte) &= ~_PAGE_NX; return pte; }
24397 -static inline pte_t pte_mkdirty(pte_t pte)     { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
24398 -static inline pte_t pte_mkyoung(pte_t pte)     { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
24399 -static inline pte_t pte_mkwrite(pte_t pte)     { __pte_val(pte) |= _PAGE_RW; return pte; }
24400 -static inline pte_t pte_mkhuge(pte_t pte)      { __pte_val(pte) |= _PAGE_PSE; return pte; }
24401 -static inline pte_t pte_clrhuge(pte_t pte)     { __pte_val(pte) &= ~_PAGE_PSE; return pte; }
24402 -
24403 -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
24404 -{
24405 -       if (!pte_young(*ptep))
24406 -               return 0;
24407 -       return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
24408 -}
24409 -
24410 -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24411 -{
24412 -       pte_t pte = *ptep;
24413 -       if (pte_write(pte))
24414 -               set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
24415 -}
24416 -
24417  /*
24418   * Macro to mark a page protection value as "uncacheable".
24419   */
24420  #define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
24421
24422 -static inline int pmd_large(pmd_t pte) {
24423 -       return (__pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE;
24424 -}
24425 -
24426
24427  /*
24428   * Conversion functions: convert a page and protection to a page entry,
24429 @@ -386,6 +203,7 @@ static inline int pmd_large(pmd_t pte) {
24430  #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
24431  #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
24432  #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
24433 +static inline int pgd_large(pgd_t pgd) { return 0; }
24434  #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
24435
24436  /* PUD - Level3 access */
24437 @@ -396,6 +214,12 @@ static inline int pmd_large(pmd_t pte) {
24438  #define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
24439  #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
24440
24441 +static inline int pud_large(pud_t pte)
24442 +{
24443 +       return (__pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
24444 +               (_PAGE_PSE|_PAGE_PRESENT);
24445 +}
24446 +
24447  /* PMD  - Level 2 access */
24448  #define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
24449  #define pmd_page(pmd)          (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
24450 @@ -411,36 +235,18 @@ static inline int pmd_large(pmd_t pte) {
24451  #else
24452  #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
24453  #endif
24454 -#define pmd_clear(xp)  do { set_pmd(xp, __pmd(0)); } while (0)
24455  #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
24456  #define pmd_pfn(x)  ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
24457
24458  #define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
24459 -#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
24460 +#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE })
24461  #define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
24462
24463  /* PTE - Level 1 access. */
24464
24465  /* page, protection -> pte */
24466  #define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))
24467 -#define mk_pte_huge(entry) (__pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
24468
24469 -/* Change flags of a PTE */
24470 -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
24471 -{
24472 -       /*
24473 -        * Since this might change the present bit (which controls whether
24474 -        * a pte_t object has undergone p2m translation), we must use
24475 -        * pte_val() on the input pte and __pte() for the return value.
24476 -        */
24477 -       unsigned long pteval = pte_val(pte);
24478 -
24479 -       pteval &= _PAGE_CHG_MASK;
24480 -       pteval |= pgprot_val(newprot);
24481 -       pteval &= __supported_pte_mask;
24482 -       return __pte(pteval);
24483 -}
24484 -
24485  #define pte_index(address) \
24486                 (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
24487  #define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
24488 @@ -454,101 +260,21 @@ static inline pte_t pte_modify(pte_t pte
24489
24490  #define update_mmu_cache(vma,address,pte) do { } while (0)
24491
24492 -/*
24493 - * Rules for using ptep_establish: the pte MUST be a user pte, and
24494 - * must be a present->present transition.
24495 - */
24496 -#define __HAVE_ARCH_PTEP_ESTABLISH
24497 -#define ptep_establish(vma, address, ptep, pteval)                     \
24498 -       do {                                                            \
24499 -               if ( likely((vma)->vm_mm == current->mm) ) {            \
24500 -                       BUG_ON(HYPERVISOR_update_va_mapping(address,    \
24501 -                               pteval,                                 \
24502 -                               (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24503 -                                       UVMF_INVLPG|UVMF_MULTI));       \
24504 -               } else {                                                \
24505 -                       xen_l1_entry_update(ptep, pteval);              \
24506 -                       flush_tlb_page(vma, address);                   \
24507 -               }                                                       \
24508 -       } while (0)
24509 -
24510 -/* We only update the dirty/accessed state if we set
24511 - * the dirty bit by hand in the kernel, since the hardware
24512 - * will do the accessed bit for us, and we don't want to
24513 - * race with other CPU's that might be updating the dirty
24514 - * bit at the same time. */
24515 -#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
24516 -#define ptep_set_access_flags(vma, address, ptep, entry, dirty)                \
24517 -({                                                                     \
24518 -       int __changed = !pte_same(*(ptep), entry);                      \
24519 -       if (__changed && (dirty))                                       \
24520 -               ptep_establish(vma, address, ptep, entry);              \
24521 -       __changed;                                                      \
24522 -})
24523 -
24524 -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
24525 -#define ptep_clear_flush_young(vma, address, ptep)                     \
24526 -({                                                                     \
24527 -       pte_t __pte = *(ptep);                                          \
24528 -       int __young = pte_young(__pte);                                 \
24529 -       __pte = pte_mkold(__pte);                                       \
24530 -       if (PagePinned(virt_to_page((vma)->vm_mm->pgd)))                \
24531 -               (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
24532 -       else if (__young)                                               \
24533 -               set_pte(ptep, __pte);                                   \
24534 -       __young;                                                        \
24535 -})
24536 -
24537  /* Encode and de-code a swap entry */
24538  #define __swp_type(x)                  (((x).val >> 1) & 0x3f)
24539  #define __swp_offset(x)                        ((x).val >> 8)
24540  #define __swp_entry(type, offset)      ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
24541  #define __pte_to_swp_entry(pte)                ((swp_entry_t) { __pte_val(pte) })
24542 -#define __swp_entry_to_pte(x)          ((pte_t) { (x).val })
24543 -
24544 -extern spinlock_t pgd_lock;
24545 -extern struct list_head pgd_list;
24546 +#define __swp_entry_to_pte(x)          ((pte_t) { .pte = (x).val })
24547
24548  extern int kern_addr_valid(unsigned long addr);
24549 -
24550 -#define DOMID_LOCAL (0xFFFFU)
24551 -
24552 -struct vm_area_struct;
24553 -
24554 -int direct_remap_pfn_range(struct vm_area_struct *vma,
24555 -                            unsigned long address,
24556 -                            unsigned long mfn,
24557 -                            unsigned long size,
24558 -                            pgprot_t prot,
24559 -                            domid_t  domid);
24560 -
24561 -int direct_kernel_remap_pfn_range(unsigned long address,
24562 -                                 unsigned long mfn,
24563 -                                 unsigned long size,
24564 -                                 pgprot_t prot,
24565 -                                 domid_t  domid);
24566 -
24567 -int create_lookup_pte_addr(struct mm_struct *mm,
24568 -                           unsigned long address,
24569 -                           uint64_t *ptep);
24570 -
24571 -int touch_pte_range(struct mm_struct *mm,
24572 -                    unsigned long address,
24573 -                    unsigned long size);
24574 -
24575 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
24576 -               unsigned long addr, unsigned long end, pgprot_t newprot,
24577 -               int dirty_accountable);
24578 -
24579 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
24580 -       xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
24581 -
24582 -pte_t *lookup_address(unsigned long addr);
24583 +extern void cleanup_highmap(void);
24584
24585  #define io_remap_pfn_range(vma, vaddr, pfn, size, prot)                \
24586                 direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
24587
24588  #define HAVE_ARCH_UNMAPPED_AREA
24589 +#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
24590
24591  #define pgtable_cache_init()   do { } while (0)
24592  #define check_pgt_cache()      do { } while (0)
24593 @@ -561,13 +287,7 @@ pte_t *lookup_address(unsigned long addr
24594  #define        kc_offset_to_vaddr(o) \
24595     (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
24596
24597 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
24598 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
24599 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
24600 -#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
24601 -#define __HAVE_ARCH_PTEP_SET_WRPROTECT
24602  #define __HAVE_ARCH_PTE_SAME
24603 -#include <asm-generic/pgtable.h>
24604  #endif /* !__ASSEMBLY__ */
24605
24606  #endif /* _X86_64_PGTABLE_H */
24607 --- a/include/asm-x86/mach-xen/asm/pgtable.h
24608 +++ b/include/asm-x86/mach-xen/asm/pgtable.h
24609 @@ -1,5 +1,454 @@
24610 +#ifndef _ASM_X86_PGTABLE_H
24611 +#define _ASM_X86_PGTABLE_H
24612 +
24613 +#define USER_PTRS_PER_PGD      ((TASK_SIZE-1)/PGDIR_SIZE+1)
24614 +#define FIRST_USER_ADDRESS     0
24615 +
24616 +#define _PAGE_BIT_PRESENT      0
24617 +#define _PAGE_BIT_RW           1
24618 +#define _PAGE_BIT_USER         2
24619 +#define _PAGE_BIT_PWT          3
24620 +#define _PAGE_BIT_PCD          4
24621 +#define _PAGE_BIT_ACCESSED     5
24622 +#define _PAGE_BIT_DIRTY                6
24623 +#define _PAGE_BIT_FILE         6
24624 +#define _PAGE_BIT_PSE          7       /* 4 MB (or 2MB) page */
24625 +#define _PAGE_BIT_PAT          7       /* on 4KB pages */
24626 +#define _PAGE_BIT_GLOBAL       8       /* Global TLB entry PPro+ */
24627 +#define _PAGE_BIT_IO           9       /* Mapped page is I/O or foreign and
24628 +                                        * has no associated page struct. */
24629 +#define _PAGE_BIT_UNUSED2      10      /* available for programmer */
24630 +#define _PAGE_BIT_UNUSED3      11
24631 +#define _PAGE_BIT_PAT_LARGE    12      /* On 2MB or 1GB pages */
24632 +#define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
24633 +
24634 +/*
24635 + * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
24636 + * sign-extended value on 32-bit with all 1's in the upper word,
24637 + * which preserves the upper pte values on 64-bit ptes:
24638 + */
24639 +#define _PAGE_PRESENT  (_AC(1, L)<<_PAGE_BIT_PRESENT)
24640 +#define _PAGE_RW       (_AC(1, L)<<_PAGE_BIT_RW)
24641 +#define _PAGE_USER     (_AC(1, L)<<_PAGE_BIT_USER)
24642 +#define _PAGE_PWT      (_AC(1, L)<<_PAGE_BIT_PWT)
24643 +#define _PAGE_PCD      (_AC(1, L)<<_PAGE_BIT_PCD)
24644 +#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
24645 +#define _PAGE_DIRTY    (_AC(1, L)<<_PAGE_BIT_DIRTY)
24646 +#define _PAGE_PSE      (_AC(1, L)<<_PAGE_BIT_PSE)      /* 2MB page */
24647 +#define _PAGE_GLOBAL   (_AC(1, L)<<_PAGE_BIT_GLOBAL)   /* Global TLB entry */
24648 +#define _PAGE_IO       (_AC(1, L)<<_PAGE_BIT_IO)
24649 +#define _PAGE_UNUSED2  (_AC(1, L)<<_PAGE_BIT_UNUSED2)
24650 +#define _PAGE_UNUSED3  (_AC(1, L)<<_PAGE_BIT_UNUSED3)
24651 +#define _PAGE_PAT      (_AC(1, L)<<_PAGE_BIT_PAT)
24652 +#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
24653 +
24654 +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
24655 +#define _PAGE_NX       (_AC(1, ULL) << _PAGE_BIT_NX)
24656 +#else
24657 +#define _PAGE_NX       0
24658 +#endif
24659 +
24660 +/* If _PAGE_PRESENT is clear, we use these: */
24661 +#define _PAGE_FILE     _PAGE_DIRTY     /* nonlinear file mapping, saved PTE; unset:swap */
24662 +#define _PAGE_PROTNONE _PAGE_PSE       /* if the user mapped it with PROT_NONE;
24663 +                                          pte_present gives true */
24664 +
24665 +#ifndef __ASSEMBLY__
24666 +#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
24667 +extern unsigned int __kernel_page_user;
24668 +#else
24669 +#define __kernel_page_user 0
24670 +#endif
24671 +#endif
24672 +
24673 +#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
24674 +#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
24675 +
24676 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
24677 +
24678 +#define PAGE_NONE      __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
24679 +#define PAGE_SHARED    __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24680 +
24681 +#define PAGE_SHARED_EXEC       __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
24682 +#define PAGE_COPY_NOEXEC       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24683 +#define PAGE_COPY_EXEC         __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24684 +#define PAGE_COPY              PAGE_COPY_NOEXEC
24685 +#define PAGE_READONLY          __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24686 +#define PAGE_READONLY_EXEC     __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24687 +
24688 +#ifdef CONFIG_X86_32
24689 +#define _PAGE_KERNEL_EXEC \
24690 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
24691 +#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
24692 +
24693 +#ifndef __ASSEMBLY__
24694 +extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
24695 +#endif /* __ASSEMBLY__ */
24696 +#else
24697 +#define __PAGE_KERNEL_EXEC                                             \
24698 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
24699 +#define __PAGE_KERNEL          (__PAGE_KERNEL_EXEC | _PAGE_NX)
24700 +#endif
24701 +
24702 +#define __PAGE_KERNEL_RO               (__PAGE_KERNEL & ~_PAGE_RW)
24703 +#define __PAGE_KERNEL_RX               (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
24704 +#define __PAGE_KERNEL_EXEC_NOCACHE     (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
24705 +#define __PAGE_KERNEL_NOCACHE          (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
24706 +#define __PAGE_KERNEL_UC_MINUS         (__PAGE_KERNEL | _PAGE_PCD)
24707 +#define __PAGE_KERNEL_VSYSCALL         (__PAGE_KERNEL_RX | _PAGE_USER)
24708 +#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
24709 +#define __PAGE_KERNEL_LARGE            (__PAGE_KERNEL | _PAGE_PSE)
24710 +#define __PAGE_KERNEL_LARGE_EXEC       (__PAGE_KERNEL_EXEC | _PAGE_PSE)
24711 +
24712 +/*
24713 + * We don't support GLOBAL pages in xenolinux64
24714 + */
24715 +#define MAKE_GLOBAL(x)                 __pgprot((x))
24716 +
24717 +#define PAGE_KERNEL                    MAKE_GLOBAL(__PAGE_KERNEL)
24718 +#define PAGE_KERNEL_RO                 MAKE_GLOBAL(__PAGE_KERNEL_RO)
24719 +#define PAGE_KERNEL_EXEC               MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
24720 +#define PAGE_KERNEL_RX                 MAKE_GLOBAL(__PAGE_KERNEL_RX)
24721 +#define PAGE_KERNEL_NOCACHE            MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
24722 +#define PAGE_KERNEL_UC_MINUS           MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
24723 +#define PAGE_KERNEL_EXEC_NOCACHE       MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
24724 +#define PAGE_KERNEL_LARGE              MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
24725 +#define PAGE_KERNEL_LARGE_EXEC         MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
24726 +#define PAGE_KERNEL_VSYSCALL           MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
24727 +#define PAGE_KERNEL_VSYSCALL_NOCACHE   MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
24728 +
24729 +/*         xwr */
24730 +#define __P000 PAGE_NONE
24731 +#define __P001 PAGE_READONLY
24732 +#define __P010 PAGE_COPY
24733 +#define __P011 PAGE_COPY
24734 +#define __P100 PAGE_READONLY_EXEC
24735 +#define __P101 PAGE_READONLY_EXEC
24736 +#define __P110 PAGE_COPY_EXEC
24737 +#define __P111 PAGE_COPY_EXEC
24738 +
24739 +#define __S000 PAGE_NONE
24740 +#define __S001 PAGE_READONLY
24741 +#define __S010 PAGE_SHARED
24742 +#define __S011 PAGE_SHARED
24743 +#define __S100 PAGE_READONLY_EXEC
24744 +#define __S101 PAGE_READONLY_EXEC
24745 +#define __S110 PAGE_SHARED_EXEC
24746 +#define __S111 PAGE_SHARED_EXEC
24747 +
24748 +#ifndef __ASSEMBLY__
24749 +
24750 +/*
24751 + * ZERO_PAGE is a global shared page that is always zero: used
24752 + * for zero-mapped memory areas etc..
24753 + */
24754 +extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
24755 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
24756 +
24757 +extern spinlock_t pgd_lock;
24758 +extern struct list_head pgd_list;
24759 +
24760 +/*
24761 + * The following only work if pte_present() is true.
24762 + * Undefined behaviour if not..
24763 + */
24764 +static inline int pte_dirty(pte_t pte)         { return __pte_val(pte) & _PAGE_DIRTY; }
24765 +static inline int pte_young(pte_t pte)         { return __pte_val(pte) & _PAGE_ACCESSED; }
24766 +static inline int pte_write(pte_t pte)         { return __pte_val(pte) & _PAGE_RW; }
24767 +static inline int pte_file(pte_t pte)          { return __pte_val(pte) & _PAGE_FILE; }
24768 +static inline int pte_huge(pte_t pte)          { return __pte_val(pte) & _PAGE_PSE; }
24769 +static inline int pte_global(pte_t pte)        { return 0; }
24770 +static inline int pte_exec(pte_t pte)          { return !(__pte_val(pte) & _PAGE_NX); }
24771 +
24772 +static inline int pmd_large(pmd_t pte) {
24773 +       return (__pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
24774 +               (_PAGE_PSE|_PAGE_PRESENT);
24775 +}
24776 +
24777 +static inline pte_t pte_mkclean(pte_t pte)     { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); }
24778 +static inline pte_t pte_mkold(pte_t pte)       { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); }
24779 +static inline pte_t pte_wrprotect(pte_t pte)   { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); }
24780 +static inline pte_t pte_mkexec(pte_t pte)      { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); }
24781 +static inline pte_t pte_mkdirty(pte_t pte)     { return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); }
24782 +static inline pte_t pte_mkyoung(pte_t pte)     { return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); }
24783 +static inline pte_t pte_mkwrite(pte_t pte)     { return __pte_ma(__pte_val(pte) | _PAGE_RW); }
24784 +static inline pte_t pte_mkhuge(pte_t pte)      { return __pte_ma(__pte_val(pte) | _PAGE_PSE); }
24785 +static inline pte_t pte_clrhuge(pte_t pte)     { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); }
24786 +static inline pte_t pte_mkglobal(pte_t pte)    { return pte; }
24787 +static inline pte_t pte_clrglobal(pte_t pte)   { return pte; }
24788 +
24789 +extern pteval_t __supported_pte_mask;
24790 +
24791 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
24792 +{
24793 +       return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
24794 +                     pgprot_val(pgprot)) & __supported_pte_mask);
24795 +}
24796 +
24797 +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
24798 +{
24799 +       return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) |
24800 +                        pgprot_val(pgprot)) & __supported_pte_mask);
24801 +}
24802 +
24803 +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
24804 +{
24805 +       return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
24806 +                     pgprot_val(pgprot)) & __supported_pte_mask);
24807 +}
24808 +
24809 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
24810 +{
24811 +       pteval_t val = pte_val(pte);
24812 +
24813 +       val &= _PAGE_CHG_MASK;
24814 +       val |= pgprot_val(newprot) & __supported_pte_mask;
24815 +
24816 +       return __pte(val);
24817 +}
24818 +
24819 +#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX))
24820 +
24821 +#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
24822 +
24823 +#define set_pte(ptep, pte)             xen_set_pte(ptep, pte)
24824 +#define set_pte_at(mm, addr, ptep, pte)        xen_set_pte_at(mm, addr, ptep, pte)
24825 +
24826 +#define set_pte_atomic(ptep, pte)                                      \
24827 +       xen_set_pte_atomic(ptep, pte)
24828 +
24829 +#define set_pmd(pmdp, pmd)             xen_set_pmd(pmdp, pmd)
24830 +
24831 +#ifndef __PAGETABLE_PUD_FOLDED
24832 +#define set_pgd(pgdp, pgd)             xen_set_pgd(pgdp, pgd)
24833 +#define pgd_clear(pgd)                 xen_pgd_clear(pgd)
24834 +#endif
24835 +
24836 +#ifndef set_pud
24837 +# define set_pud(pudp, pud)            xen_set_pud(pudp, pud)
24838 +#endif
24839 +
24840 +#ifndef __PAGETABLE_PMD_FOLDED
24841 +#define pud_clear(pud)                 xen_pud_clear(pud)
24842 +#endif
24843 +
24844 +#define pte_clear(mm, addr, ptep)      xen_pte_clear(mm, addr, ptep)
24845 +#define pmd_clear(pmd)                 xen_pmd_clear(pmd)
24846 +
24847 +#define pte_update(mm, addr, ptep)              do { } while (0)
24848 +#define pte_update_defer(mm, addr, ptep)        do { } while (0)
24849 +
24850 +#endif /* __ASSEMBLY__ */
24851 +
24852  #ifdef CONFIG_X86_32
24853  # include "pgtable_32.h"
24854  #else
24855  # include "pgtable_64.h"
24856  #endif
24857 +
24858 +#ifndef __ASSEMBLY__
24859 +
24860 +enum {
24861 +       PG_LEVEL_NONE,
24862 +       PG_LEVEL_4K,
24863 +       PG_LEVEL_2M,
24864 +       PG_LEVEL_1G,
24865 +};
24866 +
24867 +/*
24868 + * Helper function that returns the kernel pagetable entry controlling
24869 + * the virtual address 'address'. NULL means no pagetable entry present.
24870 + * NOTE: the return type is pte_t but if the pmd is PSE then we return it
24871 + * as a pte too.
24872 + */
24873 +extern pte_t *lookup_address(unsigned long address, unsigned int *level);
24874 +
24875 +/* local pte updates need not use xchg for locking */
24876 +static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
24877 +{
24878 +       xen_set_pte(ptep, __pte(0));
24879 +       return res;
24880 +}
24881 +
24882 +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
24883 +                                 pte_t *ptep , pte_t pte)
24884 +{
24885 +       if ((mm != current->mm && mm != &init_mm) ||
24886 +           HYPERVISOR_update_va_mapping(addr, pte, 0))
24887 +               xen_set_pte(ptep, pte);
24888 +}
24889 +
24890 +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr,
24891 +                                pte_t *ptep)
24892 +{
24893 +       if ((mm != current->mm && mm != &init_mm)
24894 +           || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
24895 +               __xen_pte_clear(ptep);
24896 +}
24897 +
24898 +#ifndef CONFIG_PARAVIRT
24899 +/*
24900 + * Rules for using pte_update - it must be called after any PTE update which
24901 + * has not been done using the set_pte / clear_pte interfaces.  It is used by
24902 + * shadow mode hypervisors to resynchronize the shadow page tables.  Kernel PTE
24903 + * updates should either be sets, clears, or set_pte_atomic for P->P
24904 + * transitions, which means this hook should only be called for user PTEs.
24905 + * This hook implies a P->P protection or access change has taken place, which
24906 + * requires a subsequent TLB flush.  The notification can optionally be delayed
24907 + * until the TLB flush event by using the pte_update_defer form of the
24908 + * interface, but care must be taken to ensure that the flush happens while
24909 + * still holding the same page table lock so that the shadow and primary pages
24910 + * do not become out of sync on SMP.
24911 + */
24912 +#define pte_update(mm, addr, ptep)             do { } while (0)
24913 +#define pte_update_defer(mm, addr, ptep)       do { } while (0)
24914 +#endif
24915 +
24916 +/*
24917 + * We only update the dirty/accessed state if we set
24918 + * the dirty bit by hand in the kernel, since the hardware
24919 + * will do the accessed bit for us, and we don't want to
24920 + * race with other CPUs that might be updating the dirty
24921 + * bit at the same time.
24922 + */
24923 +#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
24924 +#define ptep_set_access_flags(vma, address, ptep, entry, dirty)                \
24925 +({                                                                     \
24926 +       int __changed = !pte_same(*(ptep), entry);                      \
24927 +       if (__changed && (dirty)) {                                     \
24928 +               if ( likely((vma)->vm_mm == current->mm) ) {            \
24929 +                       BUG_ON(HYPERVISOR_update_va_mapping(address,    \
24930 +                               entry,                                  \
24931 +                               (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24932 +                                       UVMF_INVLPG|UVMF_MULTI));       \
24933 +               } else {                                                \
24934 +                       xen_l1_entry_update(ptep, entry);               \
24935 +                       flush_tlb_page(vma, address);                   \
24936 +               }                                                       \
24937 +       }                                                               \
24938 +       __changed;                                                      \
24939 +})
24940 +
24941 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
24942 +#define ptep_test_and_clear_young(vma, addr, ptep) ({                  \
24943 +       int __ret = 0;                                                  \
24944 +       if (pte_young(*(ptep)))                                         \
24945 +               __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,          \
24946 +                                          &(ptep)->pte);               \
24947 +       if (__ret)                                                      \
24948 +               pte_update((vma)->vm_mm, addr, ptep);                   \
24949 +       __ret;                                                          \
24950 +})
24951 +
24952 +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
24953 +#define ptep_clear_flush_young(vma, address, ptep)                     \
24954 +({                                                                     \
24955 +       pte_t __pte = *(ptep);                                          \
24956 +       int __young = pte_young(__pte);                                 \
24957 +       __pte = pte_mkold(__pte);                                       \
24958 +       if (PagePinned(virt_to_page((vma)->vm_mm->pgd)))                \
24959 +               (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
24960 +       else if (__young)                                               \
24961 +               (ptep)->pte_low = __pte.pte_low;                        \
24962 +       __young;                                                        \
24963 +})
24964 +
24965 +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
24966 +#define ptep_clear_flush(vma, addr, ptep)                      \
24967 +({                                                             \
24968 +       pte_t *__ptep = (ptep);                                 \
24969 +       pte_t __res = *__ptep;                                  \
24970 +       if (!pte_none(__res) &&                                 \
24971 +           ((vma)->vm_mm != current->mm ||                     \
24972 +            HYPERVISOR_update_va_mapping(addr, __pte(0),       \
24973 +                       (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24974 +                               UVMF_INVLPG|UVMF_MULTI))) {     \
24975 +               __xen_pte_clear(__ptep);                        \
24976 +               flush_tlb_page(vma, addr);                      \
24977 +       }                                                       \
24978 +       __res;                                                  \
24979 +})
24980 +
24981 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
24982 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24983 +{
24984 +       pte_t pte = *ptep;
24985 +       if (!pte_none(pte)
24986 +           && (mm != &init_mm
24987 +               || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
24988 +               pte = xen_ptep_get_and_clear(ptep, pte);
24989 +               pte_update(mm, addr, ptep);
24990 +       }
24991 +       return pte;
24992 +}
24993 +
24994 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
24995 +#define ptep_get_and_clear_full(mm, addr, ptep, full)          \
24996 +       ((full) ? ({                                            \
24997 +               pte_t *__ptep = (ptep);                         \
24998 +               pte_t __res = *__ptep;                          \
24999 +               if (!PagePinned(virt_to_page((mm)->pgd)))       \
25000 +                       __xen_pte_clear(__ptep);                \
25001 +               else if (!pte_none(__res))                      \
25002 +                       xen_l1_entry_update(__ptep, __pte(0));  \
25003 +               __res;                                          \
25004 +        }) :                                                   \
25005 +        ptep_get_and_clear(mm, addr, ptep))
25006 +
25007 +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
25008 +
25009 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
25010 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
25011 +{
25012 +       pte_t pte = *ptep;
25013 +       if (pte_write(pte))
25014 +               set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
25015 +}
25016 +
25017 +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
25018 +       xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
25019 +
25020 +#define arbitrary_virt_to_machine(va)                                  \
25021 +({                                                                     \
25022 +       unsigned int __lvl;                                             \
25023 +       pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl);    \
25024 +       BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\
25025 +       (((maddr_t)pte_mfn(*__ptep) << PAGE_SHIFT)                      \
25026 +        | ((unsigned long)(va) & (PAGE_SIZE - 1)));                    \
25027 +})
25028 +
25029 +#include <asm-generic/pgtable.h>
25030 +
25031 +#include <xen/features.h>
25032 +void make_page_readonly(void *va, unsigned int feature);
25033 +void make_page_writable(void *va, unsigned int feature);
25034 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
25035 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
25036 +
25037 +struct vm_area_struct;
25038 +
25039 +int direct_remap_pfn_range(struct vm_area_struct *vma,
25040 +                           unsigned long address,
25041 +                           unsigned long mfn,
25042 +                           unsigned long size,
25043 +                           pgprot_t prot,
25044 +                           domid_t  domid);
25045 +int direct_kernel_remap_pfn_range(unsigned long address,
25046 +                                 unsigned long mfn,
25047 +                                 unsigned long size,
25048 +                                 pgprot_t prot,
25049 +                                 domid_t  domid);
25050 +int create_lookup_pte_addr(struct mm_struct *mm,
25051 +                           unsigned long address,
25052 +                           uint64_t *ptep);
25053 +int touch_pte_range(struct mm_struct *mm,
25054 +                    unsigned long address,
25055 +                    unsigned long size);
25056 +
25057 +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
25058 +               unsigned long addr, unsigned long end, pgprot_t newprot,
25059 +               int dirty_accountable);
25060 +
25061 +#endif /* __ASSEMBLY__ */
25062 +
25063 +#endif /* _ASM_X86_PGTABLE_H */
25064 --- a/include/asm-x86/mach-xen/asm/processor_32.h
25065 +++ /dev/null
25066 @@ -1,751 +0,0 @@
25067 -/*
25068 - * include/asm-i386/processor.h
25069 - *
25070 - * Copyright (C) 1994 Linus Torvalds
25071 - */
25072 -
25073 -#ifndef __ASM_I386_PROCESSOR_H
25074 -#define __ASM_I386_PROCESSOR_H
25075 -
25076 -#include <asm/vm86.h>
25077 -#include <asm/math_emu.h>
25078 -#include <asm/segment.h>
25079 -#include <asm/page.h>
25080 -#include <asm/types.h>
25081 -#include <asm/sigcontext.h>
25082 -#include <asm/cpufeature.h>
25083 -#include <asm/msr.h>
25084 -#include <asm/system.h>
25085 -#include <linux/cache.h>
25086 -#include <linux/threads.h>
25087 -#include <asm/percpu.h>
25088 -#include <linux/cpumask.h>
25089 -#include <linux/init.h>
25090 -#include <asm/processor-flags.h>
25091 -#include <xen/interface/physdev.h>
25092 -
25093 -/* flag for disabling the tsc */
25094 -#define tsc_disable 0
25095 -
25096 -struct desc_struct {
25097 -       unsigned long a,b;
25098 -};
25099 -
25100 -#define desc_empty(desc) \
25101 -               (!((desc)->a | (desc)->b))
25102 -
25103 -#define desc_equal(desc1, desc2) \
25104 -               (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
25105 -/*
25106 - * Default implementation of macro that returns current
25107 - * instruction pointer ("program counter").
25108 - */
25109 -#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
25110 -
25111 -/*
25112 - *  CPU type and hardware bug flags. Kept separately for each CPU.
25113 - *  Members of this structure are referenced in head.S, so think twice
25114 - *  before touching them. [mj]
25115 - */
25116 -
25117 -struct cpuinfo_x86 {
25118 -       __u8    x86;            /* CPU family */
25119 -       __u8    x86_vendor;     /* CPU vendor */
25120 -       __u8    x86_model;
25121 -       __u8    x86_mask;
25122 -       char    wp_works_ok;    /* It doesn't on 386's */
25123 -       char    hlt_works_ok;   /* Problems on some 486Dx4's and old 386's */
25124 -       char    hard_math;
25125 -       char    rfu;
25126 -               int     cpuid_level;    /* Maximum supported CPUID level, -1=no CPUID */
25127 -       unsigned long   x86_capability[NCAPINTS];
25128 -       char    x86_vendor_id[16];
25129 -       char    x86_model_id[64];
25130 -       int     x86_cache_size;  /* in KB - valid for CPUS which support this
25131 -                                   call  */
25132 -       int     x86_cache_alignment;    /* In bytes */
25133 -       char    fdiv_bug;
25134 -       char    f00f_bug;
25135 -       char    coma_bug;
25136 -       char    pad0;
25137 -       int     x86_power;
25138 -       unsigned long loops_per_jiffy;
25139 -#ifdef CONFIG_SMP
25140 -       cpumask_t llc_shared_map;       /* cpus sharing the last level cache */
25141 -#endif
25142 -       unsigned char x86_max_cores;    /* cpuid returned max cores value */
25143 -       unsigned char apicid;
25144 -       unsigned short x86_clflush_size;
25145 -#ifdef CONFIG_SMP
25146 -       unsigned char booted_cores;     /* number of cores as seen by OS */
25147 -       __u8 phys_proc_id;              /* Physical processor id. */
25148 -       __u8 cpu_core_id;               /* Core id */
25149 -       __u8 cpu_index;                 /* index into per_cpu list */
25150 -#endif
25151 -} __attribute__((__aligned__(SMP_CACHE_BYTES)));
25152 -
25153 -#define X86_VENDOR_INTEL 0
25154 -#define X86_VENDOR_CYRIX 1
25155 -#define X86_VENDOR_AMD 2
25156 -#define X86_VENDOR_UMC 3
25157 -#define X86_VENDOR_NEXGEN 4
25158 -#define X86_VENDOR_CENTAUR 5
25159 -#define X86_VENDOR_TRANSMETA 7
25160 -#define X86_VENDOR_NSC 8
25161 -#define X86_VENDOR_NUM 9
25162 -#define X86_VENDOR_UNKNOWN 0xff
25163 -
25164 -/*
25165 - * capabilities of CPUs
25166 - */
25167 -
25168 -extern struct cpuinfo_x86 boot_cpu_data;
25169 -extern struct cpuinfo_x86 new_cpu_data;
25170 -#ifndef CONFIG_X86_NO_TSS
25171 -extern struct tss_struct doublefault_tss;
25172 -DECLARE_PER_CPU(struct tss_struct, init_tss);
25173 -#endif
25174 -
25175 -#ifdef CONFIG_SMP
25176 -DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
25177 -#define cpu_data(cpu)          per_cpu(cpu_info, cpu)
25178 -#define current_cpu_data       cpu_data(smp_processor_id())
25179 -#else
25180 -#define cpu_data(cpu)          boot_cpu_data
25181 -#define current_cpu_data       boot_cpu_data
25182 -#endif
25183 -
25184 -/*
25185 - * the following now lives in the per cpu area:
25186 - * extern      int cpu_llc_id[NR_CPUS];
25187 - */
25188 -DECLARE_PER_CPU(u8, cpu_llc_id);
25189 -extern char ignore_fpu_irq;
25190 -
25191 -void __init cpu_detect(struct cpuinfo_x86 *c);
25192 -
25193 -extern void identify_boot_cpu(void);
25194 -extern void identify_secondary_cpu(struct cpuinfo_x86 *);
25195 -extern void print_cpu_info(struct cpuinfo_x86 *);
25196 -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
25197 -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
25198 -extern unsigned short num_cache_leaves;
25199 -
25200 -#ifdef CONFIG_X86_HT
25201 -extern void detect_ht(struct cpuinfo_x86 *c);
25202 -#else
25203 -static inline void detect_ht(struct cpuinfo_x86 *c) {}
25204 -#endif
25205 -
25206 -static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
25207 -                            unsigned int *ecx, unsigned int *edx)
25208 -{
25209 -       /* ecx is often an input as well as an output. */
25210 -       __asm__(XEN_CPUID
25211 -               : "=a" (*eax),
25212 -                 "=b" (*ebx),
25213 -                 "=c" (*ecx),
25214 -                 "=d" (*edx)
25215 -               : "0" (*eax), "2" (*ecx));
25216 -}
25217 -
25218 -#define load_cr3(pgdir) write_cr3(__pa(pgdir))
25219 -
25220 -/*
25221 - * Save the cr4 feature set we're using (ie
25222 - * Pentium 4MB enable and PPro Global page
25223 - * enable), so that any CPU's that boot up
25224 - * after us can get the correct flags.
25225 - */
25226 -extern unsigned long mmu_cr4_features;
25227 -
25228 -static inline void set_in_cr4 (unsigned long mask)
25229 -{
25230 -       unsigned cr4;
25231 -       mmu_cr4_features |= mask;
25232 -       cr4 = read_cr4();
25233 -       cr4 |= mask;
25234 -       write_cr4(cr4);
25235 -}
25236 -
25237 -static inline void clear_in_cr4 (unsigned long mask)
25238 -{
25239 -       unsigned cr4;
25240 -       mmu_cr4_features &= ~mask;
25241 -       cr4 = read_cr4();
25242 -       cr4 &= ~mask;
25243 -       write_cr4(cr4);
25244 -}
25245 -
25246 -/* Stop speculative execution */
25247 -static inline void sync_core(void)
25248 -{
25249 -       int tmp;
25250 -       asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
25251 -}
25252 -
25253 -static inline void __monitor(const void *eax, unsigned long ecx,
25254 -               unsigned long edx)
25255 -{
25256 -       /* "monitor %eax,%ecx,%edx;" */
25257 -       asm volatile(
25258 -               ".byte 0x0f,0x01,0xc8;"
25259 -               : :"a" (eax), "c" (ecx), "d"(edx));
25260 -}
25261 -
25262 -static inline void __mwait(unsigned long eax, unsigned long ecx)
25263 -{
25264 -       /* "mwait %eax,%ecx;" */
25265 -       asm volatile(
25266 -               ".byte 0x0f,0x01,0xc9;"
25267 -               : :"a" (eax), "c" (ecx));
25268 -}
25269 -
25270 -extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
25271 -
25272 -/* from system description table in BIOS.  Mostly for MCA use, but
25273 -others may find it useful. */
25274 -extern unsigned int machine_id;
25275 -extern unsigned int machine_submodel_id;
25276 -extern unsigned int BIOS_revision;
25277 -extern unsigned int mca_pentium_flag;
25278 -
25279 -/* Boot loader type from the setup header */
25280 -extern int bootloader_type;
25281 -
25282 -/*
25283 - * User space process size: 3GB (default).
25284 - */
25285 -#define TASK_SIZE      (PAGE_OFFSET)
25286 -
25287 -/* This decides where the kernel will search for a free chunk of vm
25288 - * space during mmap's.
25289 - */
25290 -#define TASK_UNMAPPED_BASE     (PAGE_ALIGN(TASK_SIZE / 3))
25291 -
25292 -#define HAVE_ARCH_PICK_MMAP_LAYOUT
25293 -
25294 -extern void hard_disable_TSC(void);
25295 -extern void disable_TSC(void);
25296 -extern void hard_enable_TSC(void);
25297 -
25298 -/*
25299 - * Size of io_bitmap.
25300 - */
25301 -#define IO_BITMAP_BITS  65536
25302 -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
25303 -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
25304 -#ifndef CONFIG_X86_NO_TSS
25305 -#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
25306 -#endif
25307 -#define INVALID_IO_BITMAP_OFFSET 0x8000
25308 -#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
25309 -
25310 -struct i387_fsave_struct {
25311 -       long    cwd;
25312 -       long    swd;
25313 -       long    twd;
25314 -       long    fip;
25315 -       long    fcs;
25316 -       long    foo;
25317 -       long    fos;
25318 -       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
25319 -       long    status;         /* software status information */
25320 -};
25321 -
25322 -struct i387_fxsave_struct {
25323 -       unsigned short  cwd;
25324 -       unsigned short  swd;
25325 -       unsigned short  twd;
25326 -       unsigned short  fop;
25327 -       long    fip;
25328 -       long    fcs;
25329 -       long    foo;
25330 -       long    fos;
25331 -       long    mxcsr;
25332 -       long    mxcsr_mask;
25333 -       long    st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
25334 -       long    xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
25335 -       long    padding[56];
25336 -} __attribute__ ((aligned (16)));
25337 -
25338 -struct i387_soft_struct {
25339 -       long    cwd;
25340 -       long    swd;
25341 -       long    twd;
25342 -       long    fip;
25343 -       long    fcs;
25344 -       long    foo;
25345 -       long    fos;
25346 -       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
25347 -       unsigned char   ftop, changed, lookahead, no_update, rm, alimit;
25348 -       struct info     *info;
25349 -       unsigned long   entry_eip;
25350 -};
25351 -
25352 -union i387_union {
25353 -       struct i387_fsave_struct        fsave;
25354 -       struct i387_fxsave_struct       fxsave;
25355 -       struct i387_soft_struct soft;
25356 -};
25357 -
25358 -typedef struct {
25359 -       unsigned long seg;
25360 -} mm_segment_t;
25361 -
25362 -struct thread_struct;
25363 -
25364 -#ifndef CONFIG_X86_NO_TSS
25365 -/* This is the TSS defined by the hardware. */
25366 -struct i386_hw_tss {
25367 -       unsigned short  back_link,__blh;
25368 -       unsigned long   esp0;
25369 -       unsigned short  ss0,__ss0h;
25370 -       unsigned long   esp1;
25371 -       unsigned short  ss1,__ss1h;     /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
25372 -       unsigned long   esp2;
25373 -       unsigned short  ss2,__ss2h;
25374 -       unsigned long   __cr3;
25375 -       unsigned long   eip;
25376 -       unsigned long   eflags;
25377 -       unsigned long   eax,ecx,edx,ebx;
25378 -       unsigned long   esp;
25379 -       unsigned long   ebp;
25380 -       unsigned long   esi;
25381 -       unsigned long   edi;
25382 -       unsigned short  es, __esh;
25383 -       unsigned short  cs, __csh;
25384 -       unsigned short  ss, __ssh;
25385 -       unsigned short  ds, __dsh;
25386 -       unsigned short  fs, __fsh;
25387 -       unsigned short  gs, __gsh;
25388 -       unsigned short  ldt, __ldth;
25389 -       unsigned short  trace, io_bitmap_base;
25390 -} __attribute__((packed));
25391 -
25392 -struct tss_struct {
25393 -       struct i386_hw_tss x86_tss;
25394 -
25395 -       /*
25396 -        * The extra 1 is there because the CPU will access an
25397 -        * additional byte beyond the end of the IO permission
25398 -        * bitmap. The extra byte must be all 1 bits, and must
25399 -        * be within the limit.
25400 -        */
25401 -       unsigned long   io_bitmap[IO_BITMAP_LONGS + 1];
25402 -       /*
25403 -        * Cache the current maximum and the last task that used the bitmap:
25404 -        */
25405 -       unsigned long io_bitmap_max;
25406 -       struct thread_struct *io_bitmap_owner;
25407 -       /*
25408 -        * pads the TSS to be cacheline-aligned (size is 0x100)
25409 -        */
25410 -       unsigned long __cacheline_filler[35];
25411 -       /*
25412 -        * .. and then another 0x100 bytes for emergency kernel stack
25413 -        */
25414 -       unsigned long stack[64];
25415 -} __attribute__((packed));
25416 -#endif
25417 -
25418 -#define ARCH_MIN_TASKALIGN     16
25419 -
25420 -struct thread_struct {
25421 -/* cached TLS descriptors. */
25422 -       struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
25423 -       unsigned long   esp0;
25424 -       unsigned long   sysenter_cs;
25425 -       unsigned long   eip;
25426 -       unsigned long   esp;
25427 -       unsigned long   fs;
25428 -       unsigned long   gs;
25429 -/* Hardware debugging registers */
25430 -       unsigned long   debugreg[8];  /* %%db0-7 debug registers */
25431 -/* fault info */
25432 -       unsigned long   cr2, trap_no, error_code;
25433 -/* floating point info */
25434 -       union i387_union        i387;
25435 -/* virtual 86 mode info */
25436 -       struct vm86_struct __user * vm86_info;
25437 -       unsigned long           screen_bitmap;
25438 -       unsigned long           v86flags, v86mask, saved_esp0;
25439 -       unsigned int            saved_fs, saved_gs;
25440 -/* IO permissions */
25441 -       unsigned long   *io_bitmap_ptr;
25442 -       unsigned long   iopl;
25443 -/* max allowed port in the bitmap, in bytes: */
25444 -       unsigned long   io_bitmap_max;
25445 -};
25446 -
25447 -#define INIT_THREAD  {                                                 \
25448 -       .esp0 = sizeof(init_stack) + (long)&init_stack,                 \
25449 -       .vm86_info = NULL,                                              \
25450 -       .sysenter_cs = __KERNEL_CS,                                     \
25451 -       .io_bitmap_ptr = NULL,                                          \
25452 -       .fs = __KERNEL_PERCPU,                                          \
25453 -}
25454 -
25455 -/*
25456 - * Note that the .io_bitmap member must be extra-big. This is because
25457 - * the CPU will access an additional byte beyond the end of the IO
25458 - * permission bitmap. The extra byte must be all 1 bits, and must
25459 - * be within the limit.
25460 - */
25461 -#define INIT_TSS  {                                                    \
25462 -       .x86_tss = {                                                    \
25463 -               .esp0           = sizeof(init_stack) + (long)&init_stack, \
25464 -               .ss0            = __KERNEL_DS,                          \
25465 -               .ss1            = __KERNEL_CS,                          \
25466 -               .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,             \
25467 -        },                                                             \
25468 -       .io_bitmap      = { [ 0 ... IO_BITMAP_LONGS] = ~0 },            \
25469 -}
25470 -
25471 -#define start_thread(regs, new_eip, new_esp) do {              \
25472 -       __asm__("movl %0,%%gs": :"r" (0));                      \
25473 -       regs->xfs = 0;                                          \
25474 -       set_fs(USER_DS);                                        \
25475 -       regs->xds = __USER_DS;                                  \
25476 -       regs->xes = __USER_DS;                                  \
25477 -       regs->xss = __USER_DS;                                  \
25478 -       regs->xcs = __USER_CS;                                  \
25479 -       regs->eip = new_eip;                                    \
25480 -       regs->esp = new_esp;                                    \
25481 -} while (0)
25482 -
25483 -/* Forward declaration, a strange C thing */
25484 -struct task_struct;
25485 -struct mm_struct;
25486 -
25487 -/* Free all resources held by a thread. */
25488 -extern void release_thread(struct task_struct *);
25489 -
25490 -/* Prepare to copy thread state - unlazy all lazy status */
25491 -extern void prepare_to_copy(struct task_struct *tsk);
25492 -
25493 -/*
25494 - * create a kernel thread without removing it from tasklists
25495 - */
25496 -extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
25497 -
25498 -extern unsigned long thread_saved_pc(struct task_struct *tsk);
25499 -void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack);
25500 -
25501 -unsigned long get_wchan(struct task_struct *p);
25502 -
25503 -#define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
25504 -#define KSTK_TOP(info)                                                 \
25505 -({                                                                     \
25506 -       unsigned long *__ptr = (unsigned long *)(info);                 \
25507 -       (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
25508 -})
25509 -
25510 -/*
25511 - * The below -8 is to reserve 8 bytes on top of the ring0 stack.
25512 - * This is necessary to guarantee that the entire "struct pt_regs"
25513 - * is accessable even if the CPU haven't stored the SS/ESP registers
25514 - * on the stack (interrupt gate does not save these registers
25515 - * when switching to the same priv ring).
25516 - * Therefore beware: accessing the xss/esp fields of the
25517 - * "struct pt_regs" is possible, but they may contain the
25518 - * completely wrong values.
25519 - */
25520 -#define task_pt_regs(task)                                             \
25521 -({                                                                     \
25522 -       struct pt_regs *__regs__;                                       \
25523 -       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
25524 -       __regs__ - 1;                                                   \
25525 -})
25526 -
25527 -#define KSTK_EIP(task) (task_pt_regs(task)->eip)
25528 -#define KSTK_ESP(task) (task_pt_regs(task)->esp)
25529 -
25530 -
25531 -struct microcode_header {
25532 -       unsigned int hdrver;
25533 -       unsigned int rev;
25534 -       unsigned int date;
25535 -       unsigned int sig;
25536 -       unsigned int cksum;
25537 -       unsigned int ldrver;
25538 -       unsigned int pf;
25539 -       unsigned int datasize;
25540 -       unsigned int totalsize;
25541 -       unsigned int reserved[3];
25542 -};
25543 -
25544 -struct microcode {
25545 -       struct microcode_header hdr;
25546 -       unsigned int bits[0];
25547 -};
25548 -
25549 -typedef struct microcode microcode_t;
25550 -typedef struct microcode_header microcode_header_t;
25551 -
25552 -/* microcode format is extended from prescott processors */
25553 -struct extended_signature {
25554 -       unsigned int sig;
25555 -       unsigned int pf;
25556 -       unsigned int cksum;
25557 -};
25558 -
25559 -struct extended_sigtable {
25560 -       unsigned int count;
25561 -       unsigned int cksum;
25562 -       unsigned int reserved[3];
25563 -       struct extended_signature sigs[0];
25564 -};
25565 -
25566 -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
25567 -static inline void rep_nop(void)
25568 -{
25569 -       __asm__ __volatile__("rep;nop": : :"memory");
25570 -}
25571 -
25572 -#define cpu_relax()    rep_nop()
25573 -
25574 -#ifndef CONFIG_X86_NO_TSS
25575 -static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread)
25576 -{
25577 -       tss->x86_tss.esp0 = thread->esp0;
25578 -       /* This can only happen when SEP is enabled, no need to test "SEP"arately */
25579 -       if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
25580 -               tss->x86_tss.ss1 = thread->sysenter_cs;
25581 -               wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
25582 -       }
25583 -}
25584 -#else
25585 -#define xen_load_esp0(tss, thread) do { \
25586 -       if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
25587 -               BUG(); \
25588 -} while (0)
25589 -#endif
25590 -
25591 -
25592 -static inline unsigned long xen_get_debugreg(int regno)
25593 -{
25594 -       return HYPERVISOR_get_debugreg(regno);
25595 -}
25596 -
25597 -static inline void xen_set_debugreg(int regno, unsigned long value)
25598 -{
25599 -       WARN_ON(HYPERVISOR_set_debugreg(regno, value));
25600 -}
25601 -
25602 -/*
25603 - * Set IOPL bits in EFLAGS from given mask
25604 - */
25605 -static inline void xen_set_iopl_mask(unsigned mask)
25606 -{
25607 -       struct physdev_set_iopl set_iopl;
25608 -
25609 -       /* Force the change at ring 0. */
25610 -       set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
25611 -       WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
25612 -}
25613 -
25614 -
25615 -#define paravirt_enabled() 0
25616 -#define __cpuid xen_cpuid
25617 -
25618 -#define load_esp0 xen_load_esp0
25619 -
25620 -/*
25621 - * These special macros can be used to get or set a debugging register
25622 - */
25623 -#define get_debugreg(var, register)                            \
25624 -       (var) = xen_get_debugreg(register)
25625 -#define set_debugreg(value, register)                          \
25626 -       xen_set_debugreg(register, value)
25627 -
25628 -#define set_iopl_mask xen_set_iopl_mask
25629 -
25630 -/*
25631 - * Generic CPUID function
25632 - * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
25633 - * resulting in stale register contents being returned.
25634 - */
25635 -static inline void cpuid(unsigned int op,
25636 -                        unsigned int *eax, unsigned int *ebx,
25637 -                        unsigned int *ecx, unsigned int *edx)
25638 -{
25639 -       *eax = op;
25640 -       *ecx = 0;
25641 -       __cpuid(eax, ebx, ecx, edx);
25642 -}
25643 -
25644 -/* Some CPUID calls want 'count' to be placed in ecx */
25645 -static inline void cpuid_count(unsigned int op, int count,
25646 -                              unsigned int *eax, unsigned int *ebx,
25647 -                              unsigned int *ecx, unsigned int *edx)
25648 -{
25649 -       *eax = op;
25650 -       *ecx = count;
25651 -       __cpuid(eax, ebx, ecx, edx);
25652 -}
25653 -
25654 -/*
25655 - * CPUID functions returning a single datum
25656 - */
25657 -static inline unsigned int cpuid_eax(unsigned int op)
25658 -{
25659 -       unsigned int eax, ebx, ecx, edx;
25660 -
25661 -       cpuid(op, &eax, &ebx, &ecx, &edx);
25662 -       return eax;
25663 -}
25664 -static inline unsigned int cpuid_ebx(unsigned int op)
25665 -{
25666 -       unsigned int eax, ebx, ecx, edx;
25667 -
25668 -       cpuid(op, &eax, &ebx, &ecx, &edx);
25669 -       return ebx;
25670 -}
25671 -static inline unsigned int cpuid_ecx(unsigned int op)
25672 -{
25673 -       unsigned int eax, ebx, ecx, edx;
25674 -
25675 -       cpuid(op, &eax, &ebx, &ecx, &edx);
25676 -       return ecx;
25677 -}
25678 -static inline unsigned int cpuid_edx(unsigned int op)
25679 -{
25680 -       unsigned int eax, ebx, ecx, edx;
25681 -
25682 -       cpuid(op, &eax, &ebx, &ecx, &edx);
25683 -       return edx;
25684 -}
25685 -
25686 -/* generic versions from gas */
25687 -#define GENERIC_NOP1   ".byte 0x90\n"
25688 -#define GENERIC_NOP2           ".byte 0x89,0xf6\n"
25689 -#define GENERIC_NOP3        ".byte 0x8d,0x76,0x00\n"
25690 -#define GENERIC_NOP4        ".byte 0x8d,0x74,0x26,0x00\n"
25691 -#define GENERIC_NOP5        GENERIC_NOP1 GENERIC_NOP4
25692 -#define GENERIC_NOP6   ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
25693 -#define GENERIC_NOP7   ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
25694 -#define GENERIC_NOP8   GENERIC_NOP1 GENERIC_NOP7
25695 -
25696 -/* Opteron nops */
25697 -#define K8_NOP1 GENERIC_NOP1
25698 -#define K8_NOP2        ".byte 0x66,0x90\n"
25699 -#define K8_NOP3        ".byte 0x66,0x66,0x90\n"
25700 -#define K8_NOP4        ".byte 0x66,0x66,0x66,0x90\n"
25701 -#define K8_NOP5        K8_NOP3 K8_NOP2
25702 -#define K8_NOP6        K8_NOP3 K8_NOP3
25703 -#define K8_NOP7        K8_NOP4 K8_NOP3
25704 -#define K8_NOP8        K8_NOP4 K8_NOP4
25705 -
25706 -/* K7 nops */
25707 -/* uses eax dependencies (arbitary choice) */
25708 -#define K7_NOP1  GENERIC_NOP1
25709 -#define K7_NOP2        ".byte 0x8b,0xc0\n"
25710 -#define K7_NOP3        ".byte 0x8d,0x04,0x20\n"
25711 -#define K7_NOP4        ".byte 0x8d,0x44,0x20,0x00\n"
25712 -#define K7_NOP5        K7_NOP4 ASM_NOP1
25713 -#define K7_NOP6        ".byte 0x8d,0x80,0,0,0,0\n"
25714 -#define K7_NOP7        ".byte 0x8D,0x04,0x05,0,0,0,0\n"
25715 -#define K7_NOP8        K7_NOP7 ASM_NOP1
25716 -
25717 -/* P6 nops */
25718 -/* uses eax dependencies (Intel-recommended choice) */
25719 -#define P6_NOP1        GENERIC_NOP1
25720 -#define P6_NOP2        ".byte 0x66,0x90\n"
25721 -#define P6_NOP3        ".byte 0x0f,0x1f,0x00\n"
25722 -#define P6_NOP4        ".byte 0x0f,0x1f,0x40,0\n"
25723 -#define P6_NOP5        ".byte 0x0f,0x1f,0x44,0x00,0\n"
25724 -#define P6_NOP6        ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
25725 -#define P6_NOP7        ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
25726 -#define P6_NOP8        ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
25727 -
25728 -#ifdef CONFIG_MK8
25729 -#define ASM_NOP1 K8_NOP1
25730 -#define ASM_NOP2 K8_NOP2
25731 -#define ASM_NOP3 K8_NOP3
25732 -#define ASM_NOP4 K8_NOP4
25733 -#define ASM_NOP5 K8_NOP5
25734 -#define ASM_NOP6 K8_NOP6
25735 -#define ASM_NOP7 K8_NOP7
25736 -#define ASM_NOP8 K8_NOP8
25737 -#elif defined(CONFIG_MK7)
25738 -#define ASM_NOP1 K7_NOP1
25739 -#define ASM_NOP2 K7_NOP2
25740 -#define ASM_NOP3 K7_NOP3
25741 -#define ASM_NOP4 K7_NOP4
25742 -#define ASM_NOP5 K7_NOP5
25743 -#define ASM_NOP6 K7_NOP6
25744 -#define ASM_NOP7 K7_NOP7
25745 -#define ASM_NOP8 K7_NOP8
25746 -#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \
25747 -      defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \
25748 -      defined(CONFIG_MCORE2) || defined(CONFIG_PENTIUM4)
25749 -#define ASM_NOP1 P6_NOP1
25750 -#define ASM_NOP2 P6_NOP2
25751 -#define ASM_NOP3 P6_NOP3
25752 -#define ASM_NOP4 P6_NOP4
25753 -#define ASM_NOP5 P6_NOP5
25754 -#define ASM_NOP6 P6_NOP6
25755 -#define ASM_NOP7 P6_NOP7
25756 -#define ASM_NOP8 P6_NOP8
25757 -#else
25758 -#define ASM_NOP1 GENERIC_NOP1
25759 -#define ASM_NOP2 GENERIC_NOP2
25760 -#define ASM_NOP3 GENERIC_NOP3
25761 -#define ASM_NOP4 GENERIC_NOP4
25762 -#define ASM_NOP5 GENERIC_NOP5
25763 -#define ASM_NOP6 GENERIC_NOP6
25764 -#define ASM_NOP7 GENERIC_NOP7
25765 -#define ASM_NOP8 GENERIC_NOP8
25766 -#endif
25767 -
25768 -#define ASM_NOP_MAX 8
25769 -
25770 -/* Prefetch instructions for Pentium III and AMD Athlon */
25771 -/* It's not worth to care about 3dnow! prefetches for the K6
25772 -   because they are microcoded there and very slow.
25773 -   However we don't do prefetches for pre XP Athlons currently
25774 -   That should be fixed. */
25775 -#define ARCH_HAS_PREFETCH
25776 -static inline void prefetch(const void *x)
25777 -{
25778 -       alternative_input(ASM_NOP4,
25779 -                         "prefetchnta (%1)",
25780 -                         X86_FEATURE_XMM,
25781 -                         "r" (x));
25782 -}
25783 -
25784 -#define ARCH_HAS_PREFETCH
25785 -#define ARCH_HAS_PREFETCHW
25786 -#define ARCH_HAS_SPINLOCK_PREFETCH
25787 -
25788 -/* 3dnow! prefetch to get an exclusive cache line. Useful for
25789 -   spinlocks to avoid one state transition in the cache coherency protocol. */
25790 -static inline void prefetchw(const void *x)
25791 -{
25792 -       alternative_input(ASM_NOP4,
25793 -                         "prefetchw (%1)",
25794 -                         X86_FEATURE_3DNOW,
25795 -                         "r" (x));
25796 -}
25797 -#define spin_lock_prefetch(x)  prefetchw(x)
25798 -
25799 -extern void select_idle_routine(const struct cpuinfo_x86 *c);
25800 -
25801 -#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
25802 -
25803 -extern unsigned long boot_option_idle_override;
25804 -extern void enable_sep_cpu(void);
25805 -extern int sysenter_setup(void);
25806 -
25807 -/* Defined in head.S */
25808 -extern struct Xgt_desc_struct early_gdt_descr;
25809 -
25810 -extern void cpu_set_gdt(int);
25811 -extern void switch_to_new_gdt(void);
25812 -extern void cpu_init(void);
25813 -extern void init_gdt(int cpu);
25814 -
25815 -extern int force_mwait;
25816 -
25817 -#endif /* __ASM_I386_PROCESSOR_H */
25818 --- a/include/asm-x86/mach-xen/asm/processor_64.h
25819 +++ /dev/null
25820 @@ -1,461 +0,0 @@
25821 -/*
25822 - * include/asm-x86_64/processor.h
25823 - *
25824 - * Copyright (C) 1994 Linus Torvalds
25825 - */
25826 -
25827 -#ifndef __ASM_X86_64_PROCESSOR_H
25828 -#define __ASM_X86_64_PROCESSOR_H
25829 -
25830 -#include <asm/segment.h>
25831 -#include <asm/page.h>
25832 -#include <asm/types.h>
25833 -#include <asm/sigcontext.h>
25834 -#include <asm/cpufeature.h>
25835 -#include <linux/threads.h>
25836 -#include <asm/msr.h>
25837 -#include <asm/current.h>
25838 -#include <asm/system.h>
25839 -#include <asm/mmsegment.h>
25840 -#include <asm/percpu.h>
25841 -#include <linux/personality.h>
25842 -#include <linux/cpumask.h>
25843 -#include <asm/processor-flags.h>
25844 -
25845 -#define TF_MASK                0x00000100
25846 -#define IF_MASK                0x00000200
25847 -#define IOPL_MASK      0x00003000
25848 -#define NT_MASK                0x00004000
25849 -#define VM_MASK                0x00020000
25850 -#define AC_MASK                0x00040000
25851 -#define VIF_MASK       0x00080000      /* virtual interrupt flag */
25852 -#define VIP_MASK       0x00100000      /* virtual interrupt pending */
25853 -#define ID_MASK                0x00200000
25854 -
25855 -#define desc_empty(desc) \
25856 -               (!((desc)->a | (desc)->b))
25857 -
25858 -#define desc_equal(desc1, desc2) \
25859 -               (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
25860 -
25861 -/*
25862 - * Default implementation of macro that returns current
25863 - * instruction pointer ("program counter").
25864 - */
25865 -#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
25866 -
25867 -/*
25868 - *  CPU type and hardware bug flags. Kept separately for each CPU.
25869 - */
25870 -
25871 -struct cpuinfo_x86 {
25872 -       __u8    x86;            /* CPU family */
25873 -       __u8    x86_vendor;     /* CPU vendor */
25874 -       __u8    x86_model;
25875 -       __u8    x86_mask;
25876 -       int     cpuid_level;    /* Maximum supported CPUID level, -1=no CPUID */
25877 -       __u32   x86_capability[NCAPINTS];
25878 -       char    x86_vendor_id[16];
25879 -       char    x86_model_id[64];
25880 -       int     x86_cache_size;  /* in KB */
25881 -       int     x86_clflush_size;
25882 -       int     x86_cache_alignment;
25883 -       int     x86_tlbsize;    /* number of 4K pages in DTLB/ITLB combined(in pages)*/
25884 -        __u8    x86_virt_bits, x86_phys_bits;
25885 -       __u8    x86_max_cores;  /* cpuid returned max cores value */
25886 -        __u32   x86_power;
25887 -       __u32   extended_cpuid_level;   /* Max extended CPUID function supported */
25888 -       unsigned long loops_per_jiffy;
25889 -#ifdef CONFIG_SMP
25890 -       cpumask_t llc_shared_map;       /* cpus sharing the last level cache */
25891 -#endif
25892 -       __u8    apicid;
25893 -#ifdef CONFIG_SMP
25894 -       __u8    booted_cores;   /* number of cores as seen by OS */
25895 -       __u8    phys_proc_id;   /* Physical Processor id. */
25896 -       __u8    cpu_core_id;    /* Core id. */
25897 -       __u8    cpu_index;      /* index into per_cpu list */
25898 -#endif
25899 -} ____cacheline_aligned;
25900 -
25901 -#define X86_VENDOR_INTEL 0
25902 -#define X86_VENDOR_CYRIX 1
25903 -#define X86_VENDOR_AMD 2
25904 -#define X86_VENDOR_UMC 3
25905 -#define X86_VENDOR_NEXGEN 4
25906 -#define X86_VENDOR_CENTAUR 5
25907 -#define X86_VENDOR_TRANSMETA 7
25908 -#define X86_VENDOR_NUM 8
25909 -#define X86_VENDOR_UNKNOWN 0xff
25910 -
25911 -#ifdef CONFIG_SMP
25912 -DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
25913 -#define cpu_data(cpu)          per_cpu(cpu_info, cpu)
25914 -#define current_cpu_data       cpu_data(smp_processor_id())
25915 -#else
25916 -#define cpu_data(cpu)          boot_cpu_data
25917 -#define current_cpu_data       boot_cpu_data
25918 -#endif
25919 -
25920 -extern char ignore_irq13;
25921 -
25922 -extern void identify_cpu(struct cpuinfo_x86 *);
25923 -extern void print_cpu_info(struct cpuinfo_x86 *);
25924 -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
25925 -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
25926 -extern unsigned short num_cache_leaves;
25927 -
25928 -/*
25929 - * Save the cr4 feature set we're using (ie
25930 - * Pentium 4MB enable and PPro Global page
25931 - * enable), so that any CPU's that boot up
25932 - * after us can get the correct flags.
25933 - */
25934 -extern unsigned long mmu_cr4_features;
25935 -
25936 -static inline void set_in_cr4 (unsigned long mask)
25937 -{
25938 -       mmu_cr4_features |= mask;
25939 -       __asm__("movq %%cr4,%%rax\n\t"
25940 -               "orq %0,%%rax\n\t"
25941 -               "movq %%rax,%%cr4\n"
25942 -               : : "irg" (mask)
25943 -               :"ax");
25944 -}
25945 -
25946 -static inline void clear_in_cr4 (unsigned long mask)
25947 -{
25948 -       mmu_cr4_features &= ~mask;
25949 -       __asm__("movq %%cr4,%%rax\n\t"
25950 -               "andq %0,%%rax\n\t"
25951 -               "movq %%rax,%%cr4\n"
25952 -               : : "irg" (~mask)
25953 -               :"ax");
25954 -}
25955 -
25956 -
25957 -/*
25958 - * User space process size. 47bits minus one guard page.
25959 - */
25960 -#define TASK_SIZE64    (0x800000000000UL - 4096)
25961 -
25962 -/* This decides where the kernel will search for a free chunk of vm
25963 - * space during mmap's.
25964 - */
25965 -#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
25966 -
25967 -#define TASK_SIZE              (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
25968 -#define TASK_SIZE_OF(child)    ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
25969 -
25970 -#define TASK_UNMAPPED_BASE     PAGE_ALIGN(TASK_SIZE/3)
25971 -
25972 -/*
25973 - * Size of io_bitmap.
25974 - */
25975 -#define IO_BITMAP_BITS  65536
25976 -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
25977 -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
25978 -#ifndef CONFIG_X86_NO_TSS
25979 -#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
25980 -#endif
25981 -#define INVALID_IO_BITMAP_OFFSET 0x8000
25982 -
25983 -struct i387_fxsave_struct {
25984 -       u16     cwd;
25985 -       u16     swd;
25986 -       u16     twd;
25987 -       u16     fop;
25988 -       u64     rip;
25989 -       u64     rdp;
25990 -       u32     mxcsr;
25991 -       u32     mxcsr_mask;
25992 -       u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
25993 -       u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
25994 -       u32     padding[24];
25995 -} __attribute__ ((aligned (16)));
25996 -
25997 -union i387_union {
25998 -       struct i387_fxsave_struct       fxsave;
25999 -};
26000 -
26001 -#ifndef CONFIG_X86_NO_TSS
26002 -struct tss_struct {
26003 -       u32 reserved1;
26004 -       u64 rsp0;
26005 -       u64 rsp1;
26006 -       u64 rsp2;
26007 -       u64 reserved2;
26008 -       u64 ist[7];
26009 -       u32 reserved3;
26010 -       u32 reserved4;
26011 -       u16 reserved5;
26012 -       u16 io_bitmap_base;
26013 -       /*
26014 -        * The extra 1 is there because the CPU will access an
26015 -        * additional byte beyond the end of the IO permission
26016 -        * bitmap. The extra byte must be all 1 bits, and must
26017 -        * be within the limit. Thus we have:
26018 -        *
26019 -        * 128 bytes, the bitmap itself, for ports 0..0x3ff
26020 -        * 8 bytes, for an extra "long" of ~0UL
26021 -        */
26022 -       unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
26023 -} __attribute__((packed)) ____cacheline_aligned;
26024 -
26025 -DECLARE_PER_CPU(struct tss_struct,init_tss);
26026 -#endif
26027 -
26028 -
26029 -extern struct cpuinfo_x86 boot_cpu_data;
26030 -#ifndef CONFIG_X86_NO_TSS
26031 -/* Save the original ist values for checking stack pointers during debugging */
26032 -struct orig_ist {
26033 -       unsigned long ist[7];
26034 -};
26035 -DECLARE_PER_CPU(struct orig_ist, orig_ist);
26036 -#endif
26037 -
26038 -#ifdef CONFIG_X86_VSMP
26039 -#define ARCH_MIN_TASKALIGN     (1 << INTERNODE_CACHE_SHIFT)
26040 -#define ARCH_MIN_MMSTRUCT_ALIGN        (1 << INTERNODE_CACHE_SHIFT)
26041 -#else
26042 -#define ARCH_MIN_TASKALIGN     16
26043 -#define ARCH_MIN_MMSTRUCT_ALIGN        0
26044 -#endif
26045 -
26046 -struct thread_struct {
26047 -       unsigned long   rsp0;
26048 -       unsigned long   rsp;
26049 -       unsigned long   userrsp;        /* Copy from PDA */
26050 -       unsigned long   fs;
26051 -       unsigned long   gs;
26052 -       unsigned short  es, ds, fsindex, gsindex;
26053 -/* Hardware debugging registers */
26054 -       unsigned long   debugreg0;
26055 -       unsigned long   debugreg1;
26056 -       unsigned long   debugreg2;
26057 -       unsigned long   debugreg3;
26058 -       unsigned long   debugreg6;
26059 -       unsigned long   debugreg7;
26060 -/* fault info */
26061 -       unsigned long   cr2, trap_no, error_code;
26062 -/* floating point info */
26063 -       union i387_union        i387  __attribute__((aligned(16)));
26064 -/* IO permissions. the bitmap could be moved into the GDT, that would make
26065 -   switch faster for a limited number of ioperm using tasks. -AK */
26066 -       int             ioperm;
26067 -       unsigned long   *io_bitmap_ptr;
26068 -       unsigned io_bitmap_max;
26069 -/* cached TLS descriptors. */
26070 -       u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
26071 -       unsigned int    iopl;
26072 -} __attribute__((aligned(16)));
26073 -
26074 -#define INIT_THREAD  { \
26075 -       .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
26076 -}
26077 -
26078 -#ifndef CONFIG_X86_NO_TSS
26079 -#define INIT_TSS  { \
26080 -       .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
26081 -}
26082 -#endif
26083 -
26084 -#define INIT_MMAP \
26085 -{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
26086 -
26087 -#define start_thread(regs,new_rip,new_rsp) do { \
26088 -       asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0));      \
26089 -       load_gs_index(0);                                                       \
26090 -       (regs)->rip = (new_rip);                                                 \
26091 -       (regs)->rsp = (new_rsp);                                                 \
26092 -       write_pda(oldrsp, (new_rsp));                                            \
26093 -       (regs)->cs = __USER_CS;                                                  \
26094 -       (regs)->ss = __USER_DS;                                                  \
26095 -       (regs)->eflags = 0x200;                                                  \
26096 -       set_fs(USER_DS);                                                         \
26097 -} while(0)
26098 -
26099 -#define get_debugreg(var, register)                            \
26100 -       var = HYPERVISOR_get_debugreg(register)
26101 -#define set_debugreg(value, register) do {                     \
26102 -       if (HYPERVISOR_set_debugreg(register, value))           \
26103 -               BUG();                                          \
26104 -} while (0)
26105 -
26106 -struct task_struct;
26107 -struct mm_struct;
26108 -
26109 -/* Free all resources held by a thread. */
26110 -extern void release_thread(struct task_struct *);
26111 -
26112 -/* Prepare to copy thread state - unlazy all lazy status */
26113 -extern void prepare_to_copy(struct task_struct *tsk);
26114 -
26115 -/*
26116 - * create a kernel thread without removing it from tasklists
26117 - */
26118 -extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
26119 -
26120 -/*
26121 - * Return saved PC of a blocked thread.
26122 - * What is this good for? it will be always the scheduler or ret_from_fork.
26123 - */
26124 -#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
26125 -
26126 -extern unsigned long get_wchan(struct task_struct *p);
26127 -#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
26128 -#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
26129 -#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
26130 -
26131 -
26132 -struct microcode_header {
26133 -       unsigned int hdrver;
26134 -       unsigned int rev;
26135 -       unsigned int date;
26136 -       unsigned int sig;
26137 -       unsigned int cksum;
26138 -       unsigned int ldrver;
26139 -       unsigned int pf;
26140 -       unsigned int datasize;
26141 -       unsigned int totalsize;
26142 -       unsigned int reserved[3];
26143 -};
26144 -
26145 -struct microcode {
26146 -       struct microcode_header hdr;
26147 -       unsigned int bits[0];
26148 -};
26149 -
26150 -typedef struct microcode microcode_t;
26151 -typedef struct microcode_header microcode_header_t;
26152 -
26153 -/* microcode format is extended from prescott processors */
26154 -struct extended_signature {
26155 -       unsigned int sig;
26156 -       unsigned int pf;
26157 -       unsigned int cksum;
26158 -};
26159 -
26160 -struct extended_sigtable {
26161 -       unsigned int count;
26162 -       unsigned int cksum;
26163 -       unsigned int reserved[3];
26164 -       struct extended_signature sigs[0];
26165 -};
26166 -
26167 -
26168 -#if defined(CONFIG_MPSC) || defined(CONFIG_MCORE2)
26169 -#define ASM_NOP1 P6_NOP1
26170 -#define ASM_NOP2 P6_NOP2
26171 -#define ASM_NOP3 P6_NOP3
26172 -#define ASM_NOP4 P6_NOP4
26173 -#define ASM_NOP5 P6_NOP5
26174 -#define ASM_NOP6 P6_NOP6
26175 -#define ASM_NOP7 P6_NOP7
26176 -#define ASM_NOP8 P6_NOP8
26177 -#else
26178 -#define ASM_NOP1 K8_NOP1
26179 -#define ASM_NOP2 K8_NOP2
26180 -#define ASM_NOP3 K8_NOP3
26181 -#define ASM_NOP4 K8_NOP4
26182 -#define ASM_NOP5 K8_NOP5
26183 -#define ASM_NOP6 K8_NOP6
26184 -#define ASM_NOP7 K8_NOP7
26185 -#define ASM_NOP8 K8_NOP8
26186 -#endif
26187 -
26188 -/* Opteron nops */
26189 -#define K8_NOP1 ".byte 0x90\n"
26190 -#define K8_NOP2        ".byte 0x66,0x90\n"
26191 -#define K8_NOP3        ".byte 0x66,0x66,0x90\n"
26192 -#define K8_NOP4        ".byte 0x66,0x66,0x66,0x90\n"
26193 -#define K8_NOP5        K8_NOP3 K8_NOP2
26194 -#define K8_NOP6        K8_NOP3 K8_NOP3
26195 -#define K8_NOP7        K8_NOP4 K8_NOP3
26196 -#define K8_NOP8        K8_NOP4 K8_NOP4
26197 -
26198 -/* P6 nops */
26199 -/* uses eax dependencies (Intel-recommended choice) */
26200 -#define P6_NOP1        ".byte 0x90\n"
26201 -#define P6_NOP2        ".byte 0x66,0x90\n"
26202 -#define P6_NOP3        ".byte 0x0f,0x1f,0x00\n"
26203 -#define P6_NOP4        ".byte 0x0f,0x1f,0x40,0\n"
26204 -#define P6_NOP5        ".byte 0x0f,0x1f,0x44,0x00,0\n"
26205 -#define P6_NOP6        ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
26206 -#define P6_NOP7        ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
26207 -#define P6_NOP8        ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
26208 -
26209 -#define ASM_NOP_MAX 8
26210 -
26211 -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
26212 -static inline void rep_nop(void)
26213 -{
26214 -       __asm__ __volatile__("rep;nop": : :"memory");
26215 -}
26216 -
26217 -/* Stop speculative execution */
26218 -static inline void sync_core(void)
26219 -{
26220 -       int tmp;
26221 -       asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
26222 -}
26223 -
26224 -#define ARCH_HAS_PREFETCHW 1
26225 -static inline void prefetchw(void *x)
26226 -{
26227 -       alternative_input("prefetcht0 (%1)",
26228 -                         "prefetchw (%1)",
26229 -                         X86_FEATURE_3DNOW,
26230 -                         "r" (x));
26231 -}
26232 -
26233 -#define ARCH_HAS_SPINLOCK_PREFETCH 1
26234 -
26235 -#define spin_lock_prefetch(x)  prefetchw(x)
26236 -
26237 -#define cpu_relax()   rep_nop()
26238 -
26239 -static inline void __monitor(const void *eax, unsigned long ecx,
26240 -               unsigned long edx)
26241 -{
26242 -       /* "monitor %eax,%ecx,%edx;" */
26243 -       asm volatile(
26244 -               ".byte 0x0f,0x01,0xc8;"
26245 -               : :"a" (eax), "c" (ecx), "d"(edx));
26246 -}
26247 -
26248 -static inline void __mwait(unsigned long eax, unsigned long ecx)
26249 -{
26250 -       /* "mwait %eax,%ecx;" */
26251 -       asm volatile(
26252 -               ".byte 0x0f,0x01,0xc9;"
26253 -               : :"a" (eax), "c" (ecx));
26254 -}
26255 -
26256 -static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
26257 -{
26258 -       /* "mwait %eax,%ecx;" */
26259 -       asm volatile(
26260 -               "sti; .byte 0x0f,0x01,0xc9;"
26261 -               : :"a" (eax), "c" (ecx));
26262 -}
26263 -
26264 -extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
26265 -
26266 -#define stack_current() \
26267 -({                                                             \
26268 -       struct thread_info *ti;                                 \
26269 -       asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));  \
26270 -       ti->task;                                       \
26271 -})
26272 -
26273 -#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
26274 -
26275 -extern unsigned long boot_option_idle_override;
26276 -/* Boot loader type from the setup header */
26277 -extern int bootloader_type;
26278 -
26279 -#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
26280 -
26281 -#endif /* __ASM_X86_64_PROCESSOR_H */
26282 --- a/include/asm-x86/mach-xen/asm/processor.h
26283 +++ b/include/asm-x86/mach-xen/asm/processor.h
26284 @@ -1,5 +1,793 @@
26285 +#ifndef __ASM_X86_PROCESSOR_H
26286 +#define __ASM_X86_PROCESSOR_H
26287 +
26288 +#include <asm/processor-flags.h>
26289 +
26290 +/* migration helpers, for KVM - will be removed in 2.6.25: */
26291 +#include <asm/vm86.h>
26292 +#define Xgt_desc_struct        desc_ptr
26293 +
26294 +/* Forward declaration, a strange C thing */
26295 +struct task_struct;
26296 +struct mm_struct;
26297 +
26298 +#include <asm/vm86.h>
26299 +#include <asm/math_emu.h>
26300 +#include <asm/segment.h>
26301 +#include <asm/types.h>
26302 +#include <asm/sigcontext.h>
26303 +#include <asm/current.h>
26304 +#include <asm/cpufeature.h>
26305 +#include <asm/system.h>
26306 +#include <asm/page.h>
26307 +#include <asm/percpu.h>
26308 +#include <asm/msr.h>
26309 +#include <asm/desc_defs.h>
26310 +#include <asm/nops.h>
26311 +#include <linux/personality.h>
26312 +#include <linux/cpumask.h>
26313 +#include <linux/cache.h>
26314 +#include <linux/threads.h>
26315 +#include <linux/init.h>
26316 +#include <xen/interface/physdev.h>
26317 +
26318 +/*
26319 + * Default implementation of macro that returns current
26320 + * instruction pointer ("program counter").
26321 + */
26322 +static inline void *current_text_addr(void)
26323 +{
26324 +       void *pc;
26325 +       asm volatile("mov $1f,%0\n1:":"=r" (pc));
26326 +       return pc;
26327 +}
26328 +
26329 +#ifdef CONFIG_X86_VSMP
26330 +#define ARCH_MIN_TASKALIGN     (1 << INTERNODE_CACHE_SHIFT)
26331 +#define ARCH_MIN_MMSTRUCT_ALIGN        (1 << INTERNODE_CACHE_SHIFT)
26332 +#else
26333 +#define ARCH_MIN_TASKALIGN     16
26334 +#define ARCH_MIN_MMSTRUCT_ALIGN        0
26335 +#endif
26336 +
26337 +/*
26338 + *  CPU type and hardware bug flags. Kept separately for each CPU.
26339 + *  Members of this structure are referenced in head.S, so think twice
26340 + *  before touching them. [mj]
26341 + */
26342 +
26343 +struct cpuinfo_x86 {
26344 +       __u8    x86;            /* CPU family */
26345 +       __u8    x86_vendor;     /* CPU vendor */
26346 +       __u8    x86_model;
26347 +       __u8    x86_mask;
26348 +#ifdef CONFIG_X86_32
26349 +       char    wp_works_ok;    /* It doesn't on 386's */
26350 +       char    hlt_works_ok;   /* Problems on some 486Dx4's and old 386's */
26351 +       char    hard_math;
26352 +       char    rfu;
26353 +       char    fdiv_bug;
26354 +       char    f00f_bug;
26355 +       char    coma_bug;
26356 +       char    pad0;
26357 +#else
26358 +       /* number of 4K pages in DTLB/ITLB combined(in pages)*/
26359 +       int     x86_tlbsize;
26360 +       __u8    x86_virt_bits, x86_phys_bits;
26361 +       /* cpuid returned core id bits */
26362 +       __u8    x86_coreid_bits;
26363 +       /* Max extended CPUID function supported */
26364 +       __u32   extended_cpuid_level;
26365 +#endif
26366 +       int     cpuid_level;    /* Maximum supported CPUID level, -1=no CPUID */
26367 +       __u32   x86_capability[NCAPINTS];
26368 +       char    x86_vendor_id[16];
26369 +       char    x86_model_id[64];
26370 +       int     x86_cache_size;  /* in KB - valid for CPUS which support this
26371 +                                   call  */
26372 +       int     x86_cache_alignment;    /* In bytes */
26373 +       int     x86_power;
26374 +       unsigned long loops_per_jiffy;
26375 +#ifdef CONFIG_SMP
26376 +       cpumask_t llc_shared_map;       /* cpus sharing the last level cache */
26377 +#endif
26378 +       u16 x86_max_cores;              /* cpuid returned max cores value */
26379 +       u16 apicid;
26380 +       u16 x86_clflush_size;
26381 +#ifdef CONFIG_SMP
26382 +       u16 booted_cores;               /* number of cores as seen by OS */
26383 +       u16 phys_proc_id;               /* Physical processor id. */
26384 +       u16 cpu_core_id;                /* Core id */
26385 +       u16 cpu_index;                  /* index into per_cpu list */
26386 +#endif
26387 +} __attribute__((__aligned__(SMP_CACHE_BYTES)));
26388 +
26389 +#define X86_VENDOR_INTEL 0
26390 +#define X86_VENDOR_CYRIX 1
26391 +#define X86_VENDOR_AMD 2
26392 +#define X86_VENDOR_UMC 3
26393 +#define X86_VENDOR_NEXGEN 4
26394 +#define X86_VENDOR_CENTAUR 5
26395 +#define X86_VENDOR_TRANSMETA 7
26396 +#define X86_VENDOR_NSC 8
26397 +#define X86_VENDOR_NUM 9
26398 +#define X86_VENDOR_UNKNOWN 0xff
26399 +
26400 +/*
26401 + * capabilities of CPUs
26402 + */
26403 +extern struct cpuinfo_x86 boot_cpu_data;
26404 +extern struct cpuinfo_x86 new_cpu_data;
26405 +extern __u32 cleared_cpu_caps[NCAPINTS];
26406 +
26407 +#ifdef CONFIG_SMP
26408 +DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
26409 +#define cpu_data(cpu)          per_cpu(cpu_info, cpu)
26410 +#define current_cpu_data       cpu_data(smp_processor_id())
26411 +#else
26412 +#define cpu_data(cpu)          boot_cpu_data
26413 +#define current_cpu_data       boot_cpu_data
26414 +#endif
26415 +
26416 +void cpu_detect(struct cpuinfo_x86 *c);
26417 +
26418 +extern void identify_cpu(struct cpuinfo_x86 *);
26419 +extern void identify_boot_cpu(void);
26420 +extern void identify_secondary_cpu(struct cpuinfo_x86 *);
26421 +extern void print_cpu_info(struct cpuinfo_x86 *);
26422 +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
26423 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
26424 +extern unsigned short num_cache_leaves;
26425 +
26426 +#if defined(CONFIG_X86_HT) || defined(CONFIG_X86_64)
26427 +extern void detect_ht(struct cpuinfo_x86 *c);
26428 +#else
26429 +static inline void detect_ht(struct cpuinfo_x86 *c) {}
26430 +#endif
26431 +
26432 +static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
26433 +                            unsigned int *ecx, unsigned int *edx)
26434 +{
26435 +       /* ecx is often an input as well as an output. */
26436 +       __asm__(XEN_CPUID
26437 +               : "=a" (*eax),
26438 +                 "=b" (*ebx),
26439 +                 "=c" (*ecx),
26440 +                 "=d" (*edx)
26441 +               : "0" (*eax), "2" (*ecx));
26442 +}
26443 +
26444 +static inline void load_cr3(pgd_t *pgdir)
26445 +{
26446 +       write_cr3(__pa(pgdir));
26447 +}
26448 +
26449 +#ifndef CONFIG_X86_NO_TSS
26450 +#ifdef CONFIG_X86_32
26451 +/* This is the TSS defined by the hardware. */
26452 +struct x86_hw_tss {
26453 +       unsigned short  back_link, __blh;
26454 +       unsigned long   sp0;
26455 +       unsigned short  ss0, __ss0h;
26456 +       unsigned long   sp1;
26457 +       unsigned short  ss1, __ss1h;    /* ss1 caches MSR_IA32_SYSENTER_CS */
26458 +       unsigned long   sp2;
26459 +       unsigned short  ss2, __ss2h;
26460 +       unsigned long   __cr3;
26461 +       unsigned long   ip;
26462 +       unsigned long   flags;
26463 +       unsigned long   ax, cx, dx, bx;
26464 +       unsigned long   sp, bp, si, di;
26465 +       unsigned short  es, __esh;
26466 +       unsigned short  cs, __csh;
26467 +       unsigned short  ss, __ssh;
26468 +       unsigned short  ds, __dsh;
26469 +       unsigned short  fs, __fsh;
26470 +       unsigned short  gs, __gsh;
26471 +       unsigned short  ldt, __ldth;
26472 +       unsigned short  trace, io_bitmap_base;
26473 +} __attribute__((packed));
26474 +extern struct tss_struct doublefault_tss;
26475 +#else
26476 +struct x86_hw_tss {
26477 +       u32 reserved1;
26478 +       u64 sp0;
26479 +       u64 sp1;
26480 +       u64 sp2;
26481 +       u64 reserved2;
26482 +       u64 ist[7];
26483 +       u32 reserved3;
26484 +       u32 reserved4;
26485 +       u16 reserved5;
26486 +       u16 io_bitmap_base;
26487 +} __attribute__((packed)) ____cacheline_aligned;
26488 +#endif
26489 +#endif /* CONFIG_X86_NO_TSS */
26490 +
26491 +/*
26492 + * Size of io_bitmap.
26493 + */
26494 +#define IO_BITMAP_BITS  65536
26495 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
26496 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
26497 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
26498 +#define INVALID_IO_BITMAP_OFFSET 0x8000
26499 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
26500 +
26501 +#ifndef CONFIG_X86_NO_TSS
26502 +struct tss_struct {
26503 +       struct x86_hw_tss x86_tss;
26504 +
26505 +       /*
26506 +        * The extra 1 is there because the CPU will access an
26507 +        * additional byte beyond the end of the IO permission
26508 +        * bitmap. The extra byte must be all 1 bits, and must
26509 +        * be within the limit.
26510 +        */
26511 +       unsigned long   io_bitmap[IO_BITMAP_LONGS + 1];
26512 +       /*
26513 +        * Cache the current maximum and the last task that used the bitmap:
26514 +        */
26515 +       unsigned long io_bitmap_max;
26516 +       struct thread_struct *io_bitmap_owner;
26517 +       /*
26518 +        * pads the TSS to be cacheline-aligned (size is 0x100)
26519 +        */
26520 +       unsigned long __cacheline_filler[35];
26521 +       /*
26522 +        * .. and then another 0x100 bytes for emergency kernel stack
26523 +        */
26524 +       unsigned long stack[64];
26525 +} __attribute__((packed));
26526 +
26527 +DECLARE_PER_CPU(struct tss_struct, init_tss);
26528 +
26529 +/* Save the original ist values for checking stack pointers during debugging */
26530 +struct orig_ist {
26531 +       unsigned long ist[7];
26532 +};
26533 +#endif /* CONFIG_X86_NO_TSS */
26534 +
26535 +#define        MXCSR_DEFAULT           0x1f80
26536 +
26537 +struct i387_fsave_struct {
26538 +       u32     cwd;
26539 +       u32     swd;
26540 +       u32     twd;
26541 +       u32     fip;
26542 +       u32     fcs;
26543 +       u32     foo;
26544 +       u32     fos;
26545 +       u32     st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
26546 +       u32     status;         /* software status information */
26547 +};
26548 +
26549 +struct i387_fxsave_struct {
26550 +       u16     cwd;
26551 +       u16     swd;
26552 +       u16     twd;
26553 +       u16     fop;
26554 +       union {
26555 +               struct {
26556 +                       u64     rip;
26557 +                       u64     rdp;
26558 +               };
26559 +               struct {
26560 +                       u32     fip;
26561 +                       u32     fcs;
26562 +                       u32     foo;
26563 +                       u32     fos;
26564 +               };
26565 +       };
26566 +       u32     mxcsr;
26567 +       u32     mxcsr_mask;
26568 +       u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
26569 +       u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
26570 +       u32     padding[24];
26571 +} __attribute__((aligned(16)));
26572 +
26573 +struct i387_soft_struct {
26574 +       u32     cwd;
26575 +       u32     swd;
26576 +       u32     twd;
26577 +       u32     fip;
26578 +       u32     fcs;
26579 +       u32     foo;
26580 +       u32     fos;
26581 +       u32     st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
26582 +       u8      ftop, changed, lookahead, no_update, rm, alimit;
26583 +       struct info     *info;
26584 +       u32     entry_eip;
26585 +};
26586 +
26587 +union i387_union {
26588 +       struct i387_fsave_struct        fsave;
26589 +       struct i387_fxsave_struct       fxsave;
26590 +       struct i387_soft_struct         soft;
26591 +};
26592 +
26593 +#ifdef CONFIG_X86_32
26594 +DECLARE_PER_CPU(u8, cpu_llc_id);
26595 +#elif !defined(CONFIG_X86_NO_TSS)
26596 +DECLARE_PER_CPU(struct orig_ist, orig_ist);
26597 +#endif
26598 +
26599 +extern void print_cpu_info(struct cpuinfo_x86 *);
26600 +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
26601 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
26602 +extern unsigned short num_cache_leaves;
26603 +
26604 +struct thread_struct {
26605 +/* cached TLS descriptors. */
26606 +       struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
26607 +       unsigned long   sp0;
26608 +       unsigned long   sp;
26609 +#ifdef CONFIG_X86_32
26610 +       unsigned long   sysenter_cs;
26611 +#else
26612 +       unsigned long   usersp; /* Copy from PDA */
26613 +       unsigned short  es, ds, fsindex, gsindex;
26614 +#endif
26615 +       unsigned long   ip;
26616 +       unsigned long   fs;
26617 +       unsigned long   gs;
26618 +/* Hardware debugging registers */
26619 +       unsigned long   debugreg0;
26620 +       unsigned long   debugreg1;
26621 +       unsigned long   debugreg2;
26622 +       unsigned long   debugreg3;
26623 +       unsigned long   debugreg6;
26624 +       unsigned long   debugreg7;
26625 +/* fault info */
26626 +       unsigned long   cr2, trap_no, error_code;
26627 +/* floating point info */
26628 +       union i387_union        i387 __attribute__((aligned(16)));
26629 +#ifdef CONFIG_X86_32
26630 +/* virtual 86 mode info */
26631 +       struct vm86_struct __user *vm86_info;
26632 +       unsigned long           screen_bitmap;
26633 +       unsigned long           v86flags, v86mask, saved_sp0;
26634 +       unsigned int            saved_fs, saved_gs;
26635 +#endif
26636 +/* IO permissions */
26637 +       unsigned long   *io_bitmap_ptr;
26638 +       unsigned long   iopl;
26639 +/* max allowed port in the bitmap, in bytes: */
26640 +       unsigned io_bitmap_max;
26641 +/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set.  */
26642 +       unsigned long   debugctlmsr;
26643 +/* Debug Store - if not 0 points to a DS Save Area configuration;
26644 + *               goes into MSR_IA32_DS_AREA */
26645 +       unsigned long   ds_area_msr;
26646 +};
26647 +
26648 +static inline unsigned long xen_get_debugreg(int regno)
26649 +{
26650 +       return HYPERVISOR_get_debugreg(regno);
26651 +}
26652 +
26653 +static inline void xen_set_debugreg(int regno, unsigned long value)
26654 +{
26655 +       WARN_ON(HYPERVISOR_set_debugreg(regno, value));
26656 +}
26657 +
26658 +/*
26659 + * Set IOPL bits in EFLAGS from given mask
26660 + */
26661 +static inline void xen_set_iopl_mask(unsigned mask)
26662 +{
26663 +       struct physdev_set_iopl set_iopl;
26664 +
26665 +       /* Force the change at ring 0. */
26666 +       set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
26667 +       WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
26668 +}
26669 +
26670 +#ifndef CONFIG_X86_NO_TSS
26671 +static inline void native_load_sp0(struct tss_struct *tss,
26672 +                                  struct thread_struct *thread)
26673 +{
26674 +       tss->x86_tss.sp0 = thread->sp0;
26675 +#ifdef CONFIG_X86_32
26676 +       /* Only happens when SEP is enabled, no need to test "SEP"arately */
26677 +       if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
26678 +               tss->x86_tss.ss1 = thread->sysenter_cs;
26679 +               wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
26680 +       }
26681 +#endif
26682 +}
26683 +#else
26684 +#define xen_load_sp0(tss, thread) do { \
26685 +       if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \
26686 +               BUG(); \
26687 +} while (0)
26688 +#endif
26689 +
26690 +#define __cpuid xen_cpuid
26691 +#define paravirt_enabled() 0
26692 +
26693 +/*
26694 + * These special macros can be used to get or set a debugging register
26695 + */
26696 +#define get_debugreg(var, register)                            \
26697 +       (var) = xen_get_debugreg(register)
26698 +#define set_debugreg(value, register)                          \
26699 +       xen_set_debugreg(register, value)
26700 +
26701 +#define load_sp0 xen_load_sp0
26702 +
26703 +#define set_iopl_mask xen_set_iopl_mask
26704 +
26705 +/*
26706 + * Save the cr4 feature set we're using (i.e.
26707 + * Pentium 4MB enable and PPro Global page
26708 + * enable), so that any CPUs that boot up
26709 + * after us can get the correct flags.
26710 + */
26711 +extern unsigned long mmu_cr4_features;
26712 +
26713 +static inline void set_in_cr4(unsigned long mask)
26714 +{
26715 +       unsigned long cr4;      /* same width as mask: no truncation */
26716 +       mmu_cr4_features |= mask;       /* recorded for CPUs that boot later */
26717 +       cr4 = read_cr4();
26718 +       cr4 |= mask;
26719 +       write_cr4(cr4);
26720 +}
26721 +
26722 +static inline void clear_in_cr4(unsigned long mask)
26723 +{
26724 +       unsigned long cr4;      /* same width as mask: no truncation */
26725 +       mmu_cr4_features &= ~mask;      /* recorded for CPUs that boot later */
26726 +       cr4 = read_cr4();
26727 +       cr4 &= ~mask;
26728 +       write_cr4(cr4);
26729 +}
26730 +
26731 +struct microcode_header {
26732 +       unsigned int hdrver;
26733 +       unsigned int rev;
26734 +       unsigned int date;
26735 +       unsigned int sig;
26736 +       unsigned int cksum;
26737 +       unsigned int ldrver;
26738 +       unsigned int pf;
26739 +       unsigned int datasize;
26740 +       unsigned int totalsize;
26741 +       unsigned int reserved[3];
26742 +};
26743 +
26744 +struct microcode {
26745 +       struct microcode_header hdr;
26746 +       unsigned int bits[0];
26747 +};
26748 +
26749 +typedef struct microcode microcode_t;
26750 +typedef struct microcode_header microcode_header_t;
26751 +
26752 +/* microcode format is extended from prescott processors */
26753 +struct extended_signature {
26754 +       unsigned int sig;
26755 +       unsigned int pf;
26756 +       unsigned int cksum;
26757 +};
26758 +
26759 +struct extended_sigtable {
26760 +       unsigned int count;
26761 +       unsigned int cksum;
26762 +       unsigned int reserved[3];
26763 +       struct extended_signature sigs[0];
26764 +};
26765 +
26766 +typedef struct {
26767 +       unsigned long seg;
26768 +} mm_segment_t;
26769 +
26770 +
26771 +/*
26772 + * create a kernel thread without removing it from tasklists
26773 + */
26774 +extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
26775 +
26776 +/* Free all resources held by a thread. */
26777 +extern void release_thread(struct task_struct *);
26778 +
26779 +/* Prepare to copy thread state - unlazy all lazy status */
26780 +extern void prepare_to_copy(struct task_struct *tsk);
26781 +
26782 +unsigned long get_wchan(struct task_struct *p);
26783 +
26784 +/*
26785 + * Generic CPUID function
26786 + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
26787 + * resulting in stale register contents being returned.
26788 + */
26789 +static inline void cpuid(unsigned int op,
26790 +                        unsigned int *eax, unsigned int *ebx,
26791 +                        unsigned int *ecx, unsigned int *edx)
26792 +{
26793 +       *eax = op;
26794 +       *ecx = 0;
26795 +       __cpuid(eax, ebx, ecx, edx);
26796 +}
26797 +
26798 +/* Some CPUID calls want 'count' to be placed in ecx */
26799 +static inline void cpuid_count(unsigned int op, int count,
26800 +                              unsigned int *eax, unsigned int *ebx,
26801 +                              unsigned int *ecx, unsigned int *edx)
26802 +{
26803 +       *eax = op;
26804 +       *ecx = count;
26805 +       __cpuid(eax, ebx, ecx, edx);
26806 +}
26807 +
26808 +/*
26809 + * CPUID functions returning a single datum
26810 + */
26811 +static inline unsigned int cpuid_eax(unsigned int op)
26812 +{
26813 +       unsigned int eax, ebx, ecx, edx;
26814 +
26815 +       cpuid(op, &eax, &ebx, &ecx, &edx);
26816 +       return eax;
26817 +}
26818 +static inline unsigned int cpuid_ebx(unsigned int op)
26819 +{
26820 +       unsigned int eax, ebx, ecx, edx;
26821 +
26822 +       cpuid(op, &eax, &ebx, &ecx, &edx);
26823 +       return ebx;
26824 +}
26825 +static inline unsigned int cpuid_ecx(unsigned int op)
26826 +{
26827 +       unsigned int eax, ebx, ecx, edx;
26828 +
26829 +       cpuid(op, &eax, &ebx, &ecx, &edx);
26830 +       return ecx;
26831 +}
26832 +static inline unsigned int cpuid_edx(unsigned int op)
26833 +{
26834 +       unsigned int eax, ebx, ecx, edx;
26835 +
26836 +       cpuid(op, &eax, &ebx, &ecx, &edx);
26837 +       return edx;
26838 +}
26839 +
26840 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
26841 +static inline void rep_nop(void)
26842 +{
26843 +       __asm__ __volatile__("rep;nop": : :"memory");
26844 +}
26845 +
26846 +/* Stop speculative execution */
26847 +static inline void sync_core(void)
26848 +{
26849 +       int tmp;
26850 +       asm volatile("cpuid" : "=a" (tmp) : "0" (1)
26851 +                                         : "ebx", "ecx", "edx", "memory");
26852 +}
26853 +
26854 +#define cpu_relax()   rep_nop()
26855 +
26856 +static inline void __monitor(const void *eax, unsigned long ecx,
26857 +               unsigned long edx)
26858 +{
26859 +       /* "monitor %eax,%ecx,%edx;" */
26860 +       asm volatile(
26861 +               ".byte 0x0f,0x01,0xc8;"
26862 +               : :"a" (eax), "c" (ecx), "d"(edx));
26863 +}
26864 +
26865 +static inline void __mwait(unsigned long eax, unsigned long ecx)
26866 +{
26867 +       /* "mwait %eax,%ecx;" */
26868 +       asm volatile(
26869 +               ".byte 0x0f,0x01,0xc9;"
26870 +               : :"a" (eax), "c" (ecx));
26871 +}
26872 +
26873 +static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
26874 +{
26875 +       /* "mwait %eax,%ecx;" */
26876 +       asm volatile(
26877 +               "sti; .byte 0x0f,0x01,0xc9;"
26878 +               : :"a" (eax), "c" (ecx));
26879 +}
26880 +
26881 +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
26882 +
26883 +extern int force_mwait;
26884 +
26885 +extern void select_idle_routine(const struct cpuinfo_x86 *c);
26886 +
26887 +extern unsigned long boot_option_idle_override;
26888 +
26889 +extern void enable_sep_cpu(void);
26890 +extern int sysenter_setup(void);
26891 +
26892 +/* Defined in head.S */
26893 +extern struct desc_ptr early_gdt_descr;
26894 +
26895 +extern void cpu_set_gdt(int);
26896 +extern void switch_to_new_gdt(void);
26897 +extern void cpu_init(void);
26898 +extern void init_gdt(int cpu);
26899 +
26900 +/* from system description table in BIOS.  Mostly for MCA use, but
26901 + * others may find it useful. */
26902 +extern unsigned int machine_id;
26903 +extern unsigned int machine_submodel_id;
26904 +extern unsigned int BIOS_revision;
26905 +
26906 +/* Boot loader type from the setup header */
26907 +extern int bootloader_type;
26908 +
26909 +extern char ignore_fpu_irq;
26910 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
26911 +
26912 +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
26913 +#define ARCH_HAS_PREFETCHW
26914 +#define ARCH_HAS_SPINLOCK_PREFETCH
26915 +
26916 +#ifdef CONFIG_X86_32
26917 +#define BASE_PREFETCH  ASM_NOP4
26918 +#define ARCH_HAS_PREFETCH
26919 +#else
26920 +#define BASE_PREFETCH  "prefetcht0 (%1)"
26921 +#endif
26922 +
26923 +/* Prefetch instructions for Pentium III and AMD Athlon */
26924 +/* It's not worth caring about 3dnow! prefetches for the K6
26925 +   because they are microcoded there and very slow.
26926 +   However, we don't currently do prefetches for pre-XP Athlons.
26927 +   That should be fixed. */
26928 +static inline void prefetch(const void *x)
26929 +{
26930 +       alternative_input(BASE_PREFETCH,
26931 +                         "prefetchnta (%1)",
26932 +                         X86_FEATURE_XMM,
26933 +                         "r" (x));
26934 +}
26935 +
26936 +/* 3dnow! prefetch to get an exclusive cache line. Useful for
26937 +   spinlocks to avoid one state transition in the cache coherency protocol. */
26938 +static inline void prefetchw(const void *x)
26939 +{
26940 +       alternative_input(BASE_PREFETCH,
26941 +                         "prefetchw (%1)",
26942 +                         X86_FEATURE_3DNOW,
26943 +                         "r" (x));
26944 +}
26945 +
26946 +#define spin_lock_prefetch(x)  prefetchw(x)
26947  #ifdef CONFIG_X86_32
26948 -# include "processor_32.h"
26949 +/*
26950 + * User space process size: 3GB (default).
26951 + */
26952 +#define TASK_SIZE      (PAGE_OFFSET)
26953 +#define STACK_TOP      TASK_SIZE
26954 +#define STACK_TOP_MAX  STACK_TOP
26955 +
26956 +#define INIT_THREAD  {                                                 \
26957 +       .sp0 = sizeof(init_stack) + (long)&init_stack,                  \
26958 +       .vm86_info = NULL,                                              \
26959 +       .sysenter_cs = __KERNEL_CS,                                     \
26960 +       .io_bitmap_ptr = NULL,                                          \
26961 +       .fs = __KERNEL_PERCPU,                                          \
26962 +}
26963 +
26964 +/*
26965 + * Note that the .io_bitmap member must be extra-big. This is because
26966 + * the CPU will access an additional byte beyond the end of the IO
26967 + * permission bitmap. The extra byte must be all 1 bits, and must
26968 + * be within the limit.
26969 + */
26970 +#define INIT_TSS  {                                                    \
26971 +       .x86_tss = {                                                    \
26972 +               .sp0            = sizeof(init_stack) + (long)&init_stack, \
26973 +               .ss0            = __KERNEL_DS,                          \
26974 +               .ss1            = __KERNEL_CS,                          \
26975 +               .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,             \
26976 +        },                                                             \
26977 +       .io_bitmap      = { [0 ... IO_BITMAP_LONGS] = ~0 },             \
26978 +}
26979 +
26980 +#define start_thread(regs, new_eip, new_esp) do {              \
26981 +       __asm__("movl %0,%%gs": :"r" (0));                      \
26982 +       regs->fs = 0;                                           \
26983 +       set_fs(USER_DS);                                        \
26984 +       regs->ds = __USER_DS;                                   \
26985 +       regs->es = __USER_DS;                                   \
26986 +       regs->ss = __USER_DS;                                   \
26987 +       regs->cs = __USER_CS;                                   \
26988 +       regs->ip = new_eip;                                     \
26989 +       regs->sp = new_esp;                                     \
26990 +} while (0)
26991 +
26992 +
26993 +extern unsigned long thread_saved_pc(struct task_struct *tsk);
26994 +
26995 +#define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
26996 +#define KSTK_TOP(info)                                                 \
26997 +({                                                                     \
26998 +       unsigned long *__ptr = (unsigned long *)(info);                 \
26999 +       (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
27000 +})
27001 +
27002 +/*
27003 + * The below -8 is to reserve 8 bytes on top of the ring0 stack.
27004 + * This is necessary to guarantee that the entire "struct pt_regs"
27005 + * is accessible even if the CPU hasn't stored the SS/ESP registers
27006 + * on the stack (interrupt gate does not save these registers
27007 + * when switching to the same priv ring).
27008 + * Therefore beware: accessing the ss/esp fields of the
27009 + * "struct pt_regs" is possible, but they may contain the
27010 + * completely wrong values.
27011 + */
27012 +#define task_pt_regs(task)                                             \
27013 +({                                                                     \
27014 +       struct pt_regs *__regs__;                                       \
27015 +       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
27016 +       __regs__ - 1;                                                   \
27017 +})
27018 +
27019 +#define KSTK_ESP(task) (task_pt_regs(task)->sp)
27020 +
27021  #else
27022 -# include "processor_64.h"
27023 +/*
27024 + * User space process size. 47bits minus one guard page.
27025 + */
27026 +#define TASK_SIZE64    (0x800000000000UL - 4096)
27027 +
27028 +/* This decides where the kernel will search for a free chunk of vm
27029 + * space during mmap's.
27030 + */
27031 +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
27032 +                          0xc0000000 : 0xFFFFe000)
27033 +
27034 +#define TASK_SIZE              (test_thread_flag(TIF_IA32) ? \
27035 +                                IA32_PAGE_OFFSET : TASK_SIZE64)
27036 +#define TASK_SIZE_OF(child)    ((test_tsk_thread_flag(child, TIF_IA32)) ? \
27037 +                                 IA32_PAGE_OFFSET : TASK_SIZE64)
27038 +
27039 +#define STACK_TOP              TASK_SIZE
27040 +#define STACK_TOP_MAX          TASK_SIZE64
27041 +
27042 +#define INIT_THREAD  { \
27043 +       .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
27044 +}
27045 +
27046 +#define INIT_TSS  { \
27047 +       .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
27048 +}
27049 +
27050 +#define start_thread(regs, new_rip, new_rsp) do {                           \
27051 +       asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0));  \
27052 +       load_gs_index(0);                                                    \
27053 +       (regs)->ip = (new_rip);                                              \
27054 +       (regs)->sp = (new_rsp);                                              \
27055 +       write_pda(oldrsp, (new_rsp));                                        \
27056 +       (regs)->cs = __USER_CS;                                              \
27057 +       (regs)->ss = __USER_DS;                                              \
27058 +       (regs)->flags = 0x200;                                               \
27059 +       set_fs(USER_DS);                                                     \
27060 +} while (0)
27061 +
27062 +/*
27063 + * Return saved PC of a blocked thread.
27064 + * What is this good for? It will always be the scheduler or ret_from_fork.
27065 + */
27066 +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
27067 +
27068 +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
27069 +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
27070 +#endif /* CONFIG_X86_64 */
27071 +
27072 +/* This decides where the kernel will search for a free chunk of vm
27073 + * space during mmap's.
27074 + */
27075 +#define TASK_UNMAPPED_BASE     (PAGE_ALIGN(TASK_SIZE / 3))
27076 +
27077 +#define KSTK_EIP(task) (task_pt_regs(task)->ip)
27078 +
27079  #endif
27080 --- a/include/asm-x86/mach-xen/asm/segment_32.h
27081 +++ /dev/null
27082 @@ -1,150 +0,0 @@
27083 -#ifndef _ASM_SEGMENT_H
27084 -#define _ASM_SEGMENT_H
27085 -
27086 -/*
27087 - * The layout of the per-CPU GDT under Linux:
27088 - *
27089 - *   0 - null
27090 - *   1 - reserved
27091 - *   2 - reserved
27092 - *   3 - reserved
27093 - *
27094 - *   4 - unused                        <==== new cacheline
27095 - *   5 - unused
27096 - *
27097 - *  ------- start of TLS (Thread-Local Storage) segments:
27098 - *
27099 - *   6 - TLS segment #1                        [ glibc's TLS segment ]
27100 - *   7 - TLS segment #2                        [ Wine's %fs Win32 segment ]
27101 - *   8 - TLS segment #3
27102 - *   9 - reserved
27103 - *  10 - reserved
27104 - *  11 - reserved
27105 - *
27106 - *  ------- start of kernel segments:
27107 - *
27108 - *  12 - kernel code segment           <==== new cacheline
27109 - *  13 - kernel data segment
27110 - *  14 - default user CS
27111 - *  15 - default user DS
27112 - *  16 - TSS
27113 - *  17 - LDT
27114 - *  18 - PNPBIOS support (16->32 gate)
27115 - *  19 - PNPBIOS support
27116 - *  20 - PNPBIOS support
27117 - *  21 - PNPBIOS support
27118 - *  22 - PNPBIOS support
27119 - *  23 - APM BIOS support
27120 - *  24 - APM BIOS support
27121 - *  25 - APM BIOS support
27122 - *
27123 - *  26 - ESPFIX small SS
27124 - *  27 - per-cpu                       [ offset to per-cpu data area ]
27125 - *  28 - unused
27126 - *  29 - unused
27127 - *  30 - unused
27128 - *  31 - TSS for double fault handler
27129 - */
27130 -#define GDT_ENTRY_TLS_ENTRIES  3
27131 -#define GDT_ENTRY_TLS_MIN      6
27132 -#define GDT_ENTRY_TLS_MAX      (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
27133 -
27134 -#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
27135 -
27136 -#define GDT_ENTRY_DEFAULT_USER_CS      14
27137 -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
27138 -
27139 -#define GDT_ENTRY_DEFAULT_USER_DS      15
27140 -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
27141 -
27142 -#define GDT_ENTRY_KERNEL_BASE  12
27143 -
27144 -#define GDT_ENTRY_KERNEL_CS            (GDT_ENTRY_KERNEL_BASE + 0)
27145 -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
27146 -
27147 -#define GDT_ENTRY_KERNEL_DS            (GDT_ENTRY_KERNEL_BASE + 1)
27148 -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
27149 -
27150 -#define GDT_ENTRY_TSS                  (GDT_ENTRY_KERNEL_BASE + 4)
27151 -#define GDT_ENTRY_LDT                  (GDT_ENTRY_KERNEL_BASE + 5)
27152 -
27153 -#define GDT_ENTRY_PNPBIOS_BASE         (GDT_ENTRY_KERNEL_BASE + 6)
27154 -#define GDT_ENTRY_APMBIOS_BASE         (GDT_ENTRY_KERNEL_BASE + 11)
27155 -
27156 -#define GDT_ENTRY_ESPFIX_SS            (GDT_ENTRY_KERNEL_BASE + 14)
27157 -#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
27158 -
27159 -#define GDT_ENTRY_PERCPU                       (GDT_ENTRY_KERNEL_BASE + 15)
27160 -#ifdef CONFIG_SMP
27161 -#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
27162 -#else
27163 -#define __KERNEL_PERCPU 0
27164 -#endif
27165 -
27166 -#define GDT_ENTRY_DOUBLEFAULT_TSS      31
27167 -
27168 -/*
27169 - * The GDT has 32 entries
27170 - */
27171 -#define GDT_ENTRIES 32
27172 -#define GDT_SIZE (GDT_ENTRIES * 8)
27173 -
27174 -/* Simple and small GDT entries for booting only */
27175 -
27176 -#define GDT_ENTRY_BOOT_CS              2
27177 -#define __BOOT_CS      (GDT_ENTRY_BOOT_CS * 8)
27178 -
27179 -#define GDT_ENTRY_BOOT_DS              (GDT_ENTRY_BOOT_CS + 1)
27180 -#define __BOOT_DS      (GDT_ENTRY_BOOT_DS * 8)
27181 -
27182 -/* The PnP BIOS entries in the GDT */
27183 -#define GDT_ENTRY_PNPBIOS_CS32         (GDT_ENTRY_PNPBIOS_BASE + 0)
27184 -#define GDT_ENTRY_PNPBIOS_CS16         (GDT_ENTRY_PNPBIOS_BASE + 1)
27185 -#define GDT_ENTRY_PNPBIOS_DS           (GDT_ENTRY_PNPBIOS_BASE + 2)
27186 -#define GDT_ENTRY_PNPBIOS_TS1          (GDT_ENTRY_PNPBIOS_BASE + 3)
27187 -#define GDT_ENTRY_PNPBIOS_TS2          (GDT_ENTRY_PNPBIOS_BASE + 4)
27188 -
27189 -/* The PnP BIOS selectors */
27190 -#define PNP_CS32   (GDT_ENTRY_PNPBIOS_CS32 * 8)        /* segment for calling fn */
27191 -#define PNP_CS16   (GDT_ENTRY_PNPBIOS_CS16 * 8)        /* code segment for BIOS */
27192 -#define PNP_DS     (GDT_ENTRY_PNPBIOS_DS * 8)  /* data segment for BIOS */
27193 -#define PNP_TS1    (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
27194 -#define PNP_TS2    (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
27195 -
27196 -/*
27197 - * The interrupt descriptor table has room for 256 idt's,
27198 - * the global descriptor table is dependent on the number
27199 - * of tasks we can have..
27200 - */
27201 -#define IDT_ENTRIES 256
27202 -
27203 -/* Bottom two bits of selector give the ring privilege level */
27204 -#define SEGMENT_RPL_MASK       0x3
27205 -/* Bit 2 is table indicator (LDT/GDT) */
27206 -#define SEGMENT_TI_MASK                0x4
27207 -
27208 -/* User mode is privilege level 3 */
27209 -#define USER_RPL               0x3
27210 -/* LDT segment has TI set, GDT has it cleared */
27211 -#define SEGMENT_LDT            0x4
27212 -#define SEGMENT_GDT            0x0
27213 -
27214 -#define get_kernel_rpl()  (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
27215 -
27216 -/*
27217 - * Matching rules for certain types of segments.
27218 - */
27219 -
27220 -/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
27221 -#define SEGMENT_IS_KERNEL_CODE(x) (((x) & ~3) == GDT_ENTRY_KERNEL_CS * 8 \
27222 -                                   || ((x) & ~3) == (FLAT_KERNEL_CS & ~3))
27223 -
27224 -/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
27225 -#define SEGMENT_IS_FLAT_CODE(x)  (((x) & ~0x13) == GDT_ENTRY_KERNEL_CS * 8 \
27226 -                                  || ((x) & ~3) == (FLAT_KERNEL_CS & ~3) \
27227 -                                  || ((x) & ~3) == (FLAT_USER_CS & ~3))
27228 -
27229 -/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
27230 -#define SEGMENT_IS_PNP_CODE(x)   (((x) & ~0x0b) == GDT_ENTRY_PNPBIOS_BASE * 8)
27231 -
27232 -#endif
27233 --- a/include/asm-x86/mach-xen/asm/segment.h
27234 +++ b/include/asm-x86/mach-xen/asm/segment.h
27235 @@ -1,5 +1,204 @@
27236 +#ifndef _ASM_X86_SEGMENT_H_
27237 +#define _ASM_X86_SEGMENT_H_
27238 +
27239 +/* Simple and small GDT entries for booting only */
27240 +
27241 +#define GDT_ENTRY_BOOT_CS      2
27242 +#define __BOOT_CS              (GDT_ENTRY_BOOT_CS * 8)
27243 +
27244 +#define GDT_ENTRY_BOOT_DS      (GDT_ENTRY_BOOT_CS + 1)
27245 +#define __BOOT_DS              (GDT_ENTRY_BOOT_DS * 8)
27246 +
27247 +#define GDT_ENTRY_BOOT_TSS     (GDT_ENTRY_BOOT_CS + 2)
27248 +#define __BOOT_TSS             (GDT_ENTRY_BOOT_TSS * 8)
27249 +
27250  #ifdef CONFIG_X86_32
27251 -# include "segment_32.h"
27252 +/*
27253 + * The layout of the per-CPU GDT under Linux:
27254 + *
27255 + *   0 - null
27256 + *   1 - reserved
27257 + *   2 - reserved
27258 + *   3 - reserved
27259 + *
27260 + *   4 - unused                        <==== new cacheline
27261 + *   5 - unused
27262 + *
27263 + *  ------- start of TLS (Thread-Local Storage) segments:
27264 + *
27265 + *   6 - TLS segment #1                        [ glibc's TLS segment ]
27266 + *   7 - TLS segment #2                        [ Wine's %fs Win32 segment ]
27267 + *   8 - TLS segment #3
27268 + *   9 - reserved
27269 + *  10 - reserved
27270 + *  11 - reserved
27271 + *
27272 + *  ------- start of kernel segments:
27273 + *
27274 + *  12 - kernel code segment           <==== new cacheline
27275 + *  13 - kernel data segment
27276 + *  14 - default user CS
27277 + *  15 - default user DS
27278 + *  16 - TSS
27279 + *  17 - LDT
27280 + *  18 - PNPBIOS support (16->32 gate)
27281 + *  19 - PNPBIOS support
27282 + *  20 - PNPBIOS support
27283 + *  21 - PNPBIOS support
27284 + *  22 - PNPBIOS support
27285 + *  23 - APM BIOS support
27286 + *  24 - APM BIOS support
27287 + *  25 - APM BIOS support
27288 + *
27289 + *  26 - ESPFIX small SS
27290 + *  27 - per-cpu                       [ offset to per-cpu data area ]
27291 + *  28 - unused
27292 + *  29 - unused
27293 + *  30 - unused
27294 + *  31 - TSS for double fault handler
27295 + */
27296 +#define GDT_ENTRY_TLS_MIN      6
27297 +#define GDT_ENTRY_TLS_MAX      (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
27298 +
27299 +#define GDT_ENTRY_DEFAULT_USER_CS      14
27300 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
27301 +
27302 +#define GDT_ENTRY_DEFAULT_USER_DS      15
27303 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
27304 +
27305 +#define GDT_ENTRY_KERNEL_BASE  12
27306 +
27307 +#define GDT_ENTRY_KERNEL_CS            (GDT_ENTRY_KERNEL_BASE + 0)
27308 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
27309 +
27310 +#define GDT_ENTRY_KERNEL_DS            (GDT_ENTRY_KERNEL_BASE + 1)
27311 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
27312 +
27313 +#define GDT_ENTRY_TSS                  (GDT_ENTRY_KERNEL_BASE + 4)
27314 +#define GDT_ENTRY_LDT                  (GDT_ENTRY_KERNEL_BASE + 5)
27315 +
27316 +#define GDT_ENTRY_PNPBIOS_BASE         (GDT_ENTRY_KERNEL_BASE + 6)
27317 +#define GDT_ENTRY_APMBIOS_BASE         (GDT_ENTRY_KERNEL_BASE + 11)
27318 +
27319 +#define GDT_ENTRY_ESPFIX_SS            (GDT_ENTRY_KERNEL_BASE + 14)
27320 +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
27321 +
27322 +#define GDT_ENTRY_PERCPU                       (GDT_ENTRY_KERNEL_BASE + 15)
27323 +#ifdef CONFIG_SMP
27324 +#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
27325  #else
27326 -# include "../../segment_64.h"
27327 +#define __KERNEL_PERCPU 0
27328 +#endif
27329 +
27330 +#define GDT_ENTRY_DOUBLEFAULT_TSS      31
27331 +
27332 +/*
27333 + * The GDT has 32 entries
27334 + */
27335 +#define GDT_ENTRIES 32
27336 +
27337 +/* The PnP BIOS entries in the GDT */
27338 +#define GDT_ENTRY_PNPBIOS_CS32         (GDT_ENTRY_PNPBIOS_BASE + 0)
27339 +#define GDT_ENTRY_PNPBIOS_CS16         (GDT_ENTRY_PNPBIOS_BASE + 1)
27340 +#define GDT_ENTRY_PNPBIOS_DS           (GDT_ENTRY_PNPBIOS_BASE + 2)
27341 +#define GDT_ENTRY_PNPBIOS_TS1          (GDT_ENTRY_PNPBIOS_BASE + 3)
27342 +#define GDT_ENTRY_PNPBIOS_TS2          (GDT_ENTRY_PNPBIOS_BASE + 4)
27343 +
27344 +/* The PnP BIOS selectors */
27345 +#define PNP_CS32   (GDT_ENTRY_PNPBIOS_CS32 * 8)        /* segment for calling fn */
27346 +#define PNP_CS16   (GDT_ENTRY_PNPBIOS_CS16 * 8)        /* code segment for BIOS */
27347 +#define PNP_DS     (GDT_ENTRY_PNPBIOS_DS * 8)  /* data segment for BIOS */
27348 +#define PNP_TS1    (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
27349 +#define PNP_TS2    (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
27350 +
27351 +/* Bottom two bits of selector give the ring privilege level */
27352 +#define SEGMENT_RPL_MASK       0x3
27353 +/* Bit 2 is table indicator (LDT/GDT) */
27354 +#define SEGMENT_TI_MASK                0x4
27355 +
27356 +/* User mode is privilege level 3 */
27357 +#define USER_RPL               0x3
27358 +/* LDT segment has TI set, GDT has it cleared */
27359 +#define SEGMENT_LDT            0x4
27360 +#define SEGMENT_GDT            0x0
27361 +
27362 +/*
27363 + * Matching rules for certain types of segments.
27364 + */
27365 +
27366 +/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
27367 +#define SEGMENT_IS_KERNEL_CODE(x) (((x) & ~3) == GDT_ENTRY_KERNEL_CS * 8 \
27368 +                                   || ((x) & ~3) == (FLAT_KERNEL_CS & ~3))
27369 +
27370 +/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
27371 +#define SEGMENT_IS_FLAT_CODE(x)  (((x) & ~0x13) == GDT_ENTRY_KERNEL_CS * 8 \
27372 +                                  || ((x) & ~3) == (FLAT_KERNEL_CS & ~3) \
27373 +                                  || ((x) & ~3) == (FLAT_USER_CS & ~3))
27374 +
27375 +/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
27376 +#define SEGMENT_IS_PNP_CODE(x)   (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
27377 +
27378 +#define get_kernel_rpl()  (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
27379 +
27380 +#else
27381 +#include <asm/cache.h>
27382 +
27383 +#define __KERNEL_CS    0x10
27384 +#define __KERNEL_DS    0x18
27385 +
27386 +#define __KERNEL32_CS   0x08
27387 +
27388 +/*
27389 + * we cannot use the same code segment descriptor for user and kernel
27390 + * -- not even in the long flat mode, because of different DPL /kkeil
27391 + * The segment offset needs to contain a RPL. Grr. -AK
27392 + * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
27393 + */
27394 +
27395 +#define __USER32_CS   0x23   /* 4*8+3 */
27396 +#define __USER_DS     0x2b   /* 5*8+3 */
27397 +#define __USER_CS     0x33   /* 6*8+3 */
27398 +#define __USER32_DS    __USER_DS
27399 +
27400 +#define GDT_ENTRY_TSS 8        /* needs two entries */
27401 +#define GDT_ENTRY_LDT 10 /* needs two entries */
27402 +#define GDT_ENTRY_TLS_MIN 12
27403 +#define GDT_ENTRY_TLS_MAX 14
27404 +
27405 +#define GDT_ENTRY_PER_CPU 15   /* Abused to load per CPU data from limit */
27406 +#define __PER_CPU_SEG  (GDT_ENTRY_PER_CPU * 8 + 3)
27407 +
27408 +/* TLS indexes for 64bit - hardcoded in arch_prctl */
27409 +#define FS_TLS 0
27410 +#define GS_TLS 1
27411 +
27412 +#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
27413 +#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
27414 +
27415 +#define GDT_ENTRIES 16
27416 +
27417 +#endif
27418 +
27419 +/* User mode is privilege level 3 */
27420 +#define USER_RPL               0x3
27421 +/* LDT segment has TI set, GDT has it cleared */
27422 +#define SEGMENT_LDT            0x4
27423 +#define SEGMENT_GDT            0x0
27424 +
27425 +/* Bottom two bits of selector give the ring privilege level */
27426 +#define SEGMENT_RPL_MASK       0x3
27427 +/* Bit 2 is table indicator (LDT/GDT) */
27428 +#define SEGMENT_TI_MASK                0x4
27429 +
27430 +#define IDT_ENTRIES 256
27431 +#define GDT_SIZE (GDT_ENTRIES * 8)
27432 +#define GDT_ENTRY_TLS_ENTRIES 3
27433 +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
27434 +
27435 +#ifdef __KERNEL__
27436 +#ifndef __ASSEMBLY__
27437 +extern const char early_idt_handlers[IDT_ENTRIES][10];
27438 +#endif
27439 +#endif
27440 +
27441  #endif
27442 --- a/include/asm-x86/mach-xen/asm/smp_32.h
27443 +++ b/include/asm-x86/mach-xen/asm/smp_32.h
27444 @@ -1,56 +1,51 @@
27445  #ifndef __ASM_SMP_H
27446  #define __ASM_SMP_H
27447
27448 +#ifndef __ASSEMBLY__
27449 +#include <linux/cpumask.h>
27450 +#include <linux/init.h>
27451 +
27452  /*
27453   * We need the APIC definitions automatically as part of 'smp.h'
27454   */
27455 -#ifndef __ASSEMBLY__
27456 -#include <linux/kernel.h>
27457 -#include <linux/threads.h>
27458 -#include <linux/cpumask.h>
27459 +#ifdef CONFIG_X86_LOCAL_APIC
27460 +# include <asm/mpspec.h>
27461 +# include <asm/apic.h>
27462 +# ifdef CONFIG_X86_IO_APIC
27463 +#  include <asm/io_apic.h>
27464 +# endif
27465  #endif
27466
27467 -#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__)
27468 -#include <linux/bitops.h>
27469 -#include <asm/mpspec.h>
27470 -#include <asm/apic.h>
27471 -#ifdef CONFIG_X86_IO_APIC
27472 -#include <asm/io_apic.h>
27473 -#endif
27474 -#endif
27475 +#define cpu_callout_map cpu_possible_map
27476 +#define cpu_callin_map cpu_possible_map
27477
27478 -#define BAD_APICID 0xFFu
27479 -#ifdef CONFIG_SMP
27480 -#ifndef __ASSEMBLY__
27481 +extern int smp_num_siblings;
27482 +extern unsigned int num_processors;
27483
27484 -/*
27485 - * Private routines/data
27486 - */
27487 -
27488  extern void smp_alloc_memory(void);
27489 -extern int pic_mode;
27490 -extern int smp_num_siblings;
27491 -DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27492 -DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27493 +extern void lock_ipi_call_lock(void);
27494 +extern void unlock_ipi_call_lock(void);
27495
27496  extern void (*mtrr_hook) (void);
27497  extern void zap_low_mappings (void);
27498 -extern void lock_ipi_call_lock(void);
27499 -extern void unlock_ipi_call_lock(void);
27500
27501 -#define MAX_APICID 256
27502 -extern u8 __initdata x86_cpu_to_apicid_init[];
27503 -extern void *x86_cpu_to_apicid_ptr;
27504 +DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27505 +DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27506 +DECLARE_PER_CPU(u8, cpu_llc_id);
27507  DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
27508
27509 -#define cpu_physical_id(cpu)   per_cpu(x86_cpu_to_apicid, cpu)
27510 -
27511  #ifdef CONFIG_HOTPLUG_CPU
27512  extern void cpu_exit_clear(void);
27513  extern void cpu_uninit(void);
27514  #endif
27515
27516 +#ifdef CONFIG_SMP
27517 +
27518  #ifndef CONFIG_XEN
27519 +
27520 +/* Globals due to paravirt */
27521 +extern void set_cpu_sibling_map(int cpu);
27522 +
27523  struct smp_ops
27524  {
27525         void (*smp_prepare_boot_cpu)(void);
27526 @@ -104,11 +99,11 @@ void native_smp_prepare_cpus(unsigned in
27527  int native_cpu_up(unsigned int cpunum);
27528  void native_smp_cpus_done(unsigned int max_cpus);
27529
27530 -#define startup_ipi_hook(phys_apicid, start_eip, start_esp)            \
27531 -do { } while (0)
27532 -
27533 -#else
27534 +#ifndef CONFIG_PARAVIRT
27535 +#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
27536 +#endif
27537
27538 +#else /* CONFIG_XEN */
27539
27540  void xen_smp_send_stop(void);
27541  void xen_smp_send_reschedule(int cpu);
27542 @@ -120,7 +115,12 @@ int xen_smp_call_function_mask(cpumask_t
27543  #define smp_send_reschedule    xen_smp_send_reschedule
27544  #define smp_call_function_mask xen_smp_call_function_mask
27545
27546 -#endif
27547 +extern void prefill_possible_map(void);
27548 +
27549 +#endif /* CONFIG_XEN */
27550 +
27551 +extern int __cpu_disable(void);
27552 +extern void __cpu_die(unsigned int cpu);
27553
27554  /*
27555   * This function is needed by all SMP systems. It must _always_ be valid
27556 @@ -130,64 +130,49 @@ int xen_smp_call_function_mask(cpumask_t
27557  DECLARE_PER_CPU(int, cpu_number);
27558  #define raw_smp_processor_id() (x86_read_percpu(cpu_number))
27559
27560 -extern cpumask_t cpu_possible_map;
27561 -#define cpu_callin_map cpu_possible_map
27562 +#define cpu_physical_id(cpu)   per_cpu(x86_cpu_to_apicid, cpu)
27563 +
27564 +#define safe_smp_processor_id() smp_processor_id()
27565
27566  /* We don't mark CPUs online until __cpu_up(), so we need another measure */
27567  static inline int num_booting_cpus(void)
27568  {
27569 -       return cpus_weight(cpu_possible_map);
27570 +       return cpus_weight(cpu_callout_map);
27571  }
27572
27573 -#define safe_smp_processor_id() smp_processor_id()
27574 -extern int __cpu_disable(void);
27575 -extern void __cpu_die(unsigned int cpu);
27576 -extern void prefill_possible_map(void);
27577 -extern unsigned int num_processors;
27578 -
27579 -#endif /* !__ASSEMBLY__ */
27580 -
27581  #else /* CONFIG_SMP */
27582
27583  #define safe_smp_processor_id()                0
27584  #define cpu_physical_id(cpu)           boot_cpu_physical_apicid
27585
27586 -#define NO_PROC_ID             0xFF            /* No processor magic marker */
27587 -
27588 -#endif /* CONFIG_SMP */
27589 -
27590 -#ifndef __ASSEMBLY__
27591 +#endif /* !CONFIG_SMP */
27592
27593  #ifdef CONFIG_X86_LOCAL_APIC
27594
27595 -#ifdef APIC_DEFINITION
27596 +static __inline int logical_smp_processor_id(void)
27597 +{
27598 +       /* we don't want to mark this access volatile - bad code generation */
27599 +       return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
27600 +}
27601 +
27602 +# ifdef APIC_DEFINITION
27603  extern int hard_smp_processor_id(void);
27604 -#else
27605 -#include <mach_apicdef.h>
27606 +# else
27607 +#  include <mach_apicdef.h>
27608  static inline int hard_smp_processor_id(void)
27609  {
27610         /* we don't want to mark this access volatile - bad code generation */
27611 -       return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
27612 +       return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
27613  }
27614 -#endif /* APIC_DEFINITION */
27615 +# endif /* APIC_DEFINITION */
27616
27617  #else /* CONFIG_X86_LOCAL_APIC */
27618
27619 -#ifndef CONFIG_SMP
27620 -#define hard_smp_processor_id()                0
27621 -#endif
27622 +# ifndef CONFIG_SMP
27623 +#  define hard_smp_processor_id()      0
27624 +# endif
27625
27626  #endif /* CONFIG_X86_LOCAL_APIC */
27627
27628 -extern u8 apicid_2_node[];
27629 -
27630 -#ifdef CONFIG_X86_LOCAL_APIC
27631 -static __inline int logical_smp_processor_id(void)
27632 -{
27633 -       /* we don't want to mark this access volatile - bad code generation */
27634 -       return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
27635 -}
27636 -#endif
27637 -#endif
27638 -
27639 +#endif /* !__ASSEMBLY__ */
27640  #endif
27641 --- a/include/asm-x86/mach-xen/asm/smp_64.h
27642 +++ b/include/asm-x86/mach-xen/asm/smp_64.h
27643 @@ -1,139 +1,103 @@
27644  #ifndef __ASM_SMP_H
27645  #define __ASM_SMP_H
27646
27647 -/*
27648 - * We need the APIC definitions automatically as part of 'smp.h'
27649 - */
27650 -#include <linux/threads.h>
27651  #include <linux/cpumask.h>
27652 -#include <linux/bitops.h>
27653  #include <linux/init.h>
27654 -extern int disable_apic;
27655
27656  #ifdef CONFIG_X86_LOCAL_APIC
27657 -#include <asm/mpspec.h>
27658 +/*
27659 + * We need the APIC definitions automatically as part of 'smp.h'
27660 + */
27661  #include <asm/apic.h>
27662  #ifdef CONFIG_X86_IO_APIC
27663  #include <asm/io_apic.h>
27664  #endif
27665 -#include <asm/thread_info.h>
27666 +#include <asm/mpspec.h>
27667  #endif
27668 -
27669 -#ifdef CONFIG_SMP
27670 -
27671  #include <asm/pda.h>
27672 +#include <asm/thread_info.h>
27673
27674 -struct pt_regs;
27675 -
27676 -extern cpumask_t cpu_present_mask;
27677 -extern cpumask_t cpu_possible_map;
27678 -extern cpumask_t cpu_online_map;
27679  extern cpumask_t cpu_initialized;
27680
27681 -/*
27682 - * Private routines/data
27683 - */
27684 -
27685 +extern int smp_num_siblings;
27686 +extern unsigned int num_processors;
27687 +
27688  extern void smp_alloc_memory(void);
27689 -extern volatile unsigned long smp_invalidate_needed;
27690  extern void lock_ipi_call_lock(void);
27691  extern void unlock_ipi_call_lock(void);
27692 -extern int smp_num_siblings;
27693 -extern void smp_send_reschedule(int cpu);
27694 +
27695  extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
27696                                   void *info, int wait);
27697
27698 -/*
27699 - * cpu_sibling_map and cpu_core_map now live
27700 - * in the per cpu area
27701 - *
27702 - * extern cpumask_t cpu_sibling_map[NR_CPUS];
27703 - * extern cpumask_t cpu_core_map[NR_CPUS];
27704 - */
27705  DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27706  DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27707 -DECLARE_PER_CPU(u8, cpu_llc_id);
27708 -
27709 -#define SMP_TRAMPOLINE_BASE 0x6000
27710 +DECLARE_PER_CPU(u16, cpu_llc_id);
27711 +DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
27712 +DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
27713
27714 -/*
27715 - * On x86 all CPUs are mapped 1:1 to the APIC space.
27716 - * This simplifies scheduling and IPI sending and
27717 - * compresses data structures.
27718 - */
27719 -
27720 -static inline int num_booting_cpus(void)
27721 +#ifdef CONFIG_X86_LOCAL_APIC
27722 +static inline int cpu_present_to_apicid(int mps_cpu)
27723  {
27724 -       return cpus_weight(cpu_possible_map);
27725 +       if (cpu_present(mps_cpu))
27726 +               return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
27727 +       else
27728 +               return BAD_APICID;
27729  }
27730 +#endif
27731
27732 -#define raw_smp_processor_id() read_pda(cpunumber)
27733 +#ifdef CONFIG_SMP
27734 +
27735 +#define SMP_TRAMPOLINE_BASE 0x6000
27736
27737  extern int __cpu_disable(void);
27738  extern void __cpu_die(unsigned int cpu);
27739  extern void prefill_possible_map(void);
27740 -extern unsigned num_processors;
27741  extern unsigned __cpuinitdata disabled_cpus;
27742
27743 -#define NO_PROC_ID             0xFF            /* No processor magic marker */
27744 -
27745 -#endif /* CONFIG_SMP */
27746 +#define raw_smp_processor_id() read_pda(cpunumber)
27747 +#define cpu_physical_id(cpu)   per_cpu(x86_cpu_to_apicid, cpu)
27748
27749 -#define safe_smp_processor_id()                smp_processor_id()
27750 -
27751 -#ifdef CONFIG_X86_LOCAL_APIC
27752 -static inline int hard_smp_processor_id(void)
27753 -{
27754 -       /* we don't want to mark this access volatile - bad code generation */
27755 -       return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
27756 -}
27757 -#endif
27758 +#define stack_smp_processor_id()                                       \
27759 +       ({                                                              \
27760 +       struct thread_info *ti;                                         \
27761 +       __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));      \
27762 +       ti->cpu;                                                        \
27763 +})
27764
27765  /*
27766 - * Some lowlevel functions might want to know about
27767 - * the real APIC ID <-> CPU # mapping.
27768 + * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies
27769 + * scheduling and IPI sending and compresses data structures.
27770   */
27771 -extern u8 __initdata x86_cpu_to_apicid_init[];
27772 -extern void *x86_cpu_to_apicid_ptr;
27773 -DECLARE_PER_CPU(u8, x86_cpu_to_apicid);        /* physical ID */
27774 -extern u8 bios_cpu_apicid[];
27775 -
27776 -#ifdef CONFIG_X86_LOCAL_APIC
27777 -static inline int cpu_present_to_apicid(int mps_cpu)
27778 +static inline int num_booting_cpus(void)
27779  {
27780 -       if (mps_cpu < NR_CPUS)
27781 -               return (int)bios_cpu_apicid[mps_cpu];
27782 -       else
27783 -               return BAD_APICID;
27784 +       return cpus_weight(cpu_possible_map);
27785  }
27786 -#endif
27787
27788 -#ifndef CONFIG_SMP
27789 +extern void smp_send_reschedule(int cpu);
27790 +
27791 +#else /* CONFIG_SMP */
27792 +
27793 +extern unsigned int boot_cpu_id;
27794 +#define cpu_physical_id(cpu)   boot_cpu_id
27795  #define stack_smp_processor_id() 0
27796 -#define cpu_logical_map(x) (x)
27797 -#else
27798 -#include <asm/thread_info.h>
27799 -#define stack_smp_processor_id() \
27800 -({                                                             \
27801 -       struct thread_info *ti;                                 \
27802 -       __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));      \
27803 -       ti->cpu;                                                \
27804 -})
27805 -#endif
27806 +
27807 +#endif /* !CONFIG_SMP */
27808 +
27809 +#define safe_smp_processor_id()                smp_processor_id()
27810
27811  #ifdef CONFIG_X86_LOCAL_APIC
27812  static __inline int logical_smp_processor_id(void)
27813  {
27814         /* we don't want to mark this access volatile - bad code generation */
27815 -       return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
27816 +       return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
27817 +}
27818 +
27819 +static inline int hard_smp_processor_id(void)
27820 +{
27821 +       /* we don't want to mark this access volatile - bad code generation */
27822 +       return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
27823  }
27824  #endif
27825
27826 -#ifdef CONFIG_SMP
27827 -#define cpu_physical_id(cpu)           per_cpu(x86_cpu_to_apicid, cpu)
27828 -#else
27829 -extern unsigned int boot_cpu_id;
27830 -#define cpu_physical_id(cpu)           boot_cpu_id
27831 -#endif /* !CONFIG_SMP */
27832  #endif
27833
27834 --- /dev/null
27835 +++ b/include/asm-x86/mach-xen/asm/spinlock.h
27836 @@ -0,0 +1,333 @@
27837 +#ifndef _X86_SPINLOCK_H_
27838 +#define _X86_SPINLOCK_H_
27839 +
27840 +#include <asm/atomic.h>
27841 +#include <asm/rwlock.h>
27842 +#include <asm/page.h>
27843 +#include <asm/processor.h>
27844 +#include <linux/compiler.h>
27845 +
27846 +/*
27847 + * Your basic SMP spinlocks, allowing only a single CPU anywhere
27848 + *
27849 + * Simple spin lock operations.  There are two variants, one clears IRQ's
27850 + * on the local processor, one does not.
27851 + *
27852 + * These are fair FIFO ticket locks. Tickets are 8 bits wide when
27853 + * NR_CPUS < 256 and 16 bits wide otherwise (see TICKET_SHIFT below).
27854 + *
27855 + * (the type definitions are in asm/spinlock_types.h)
27856 + */
27857 +
27858 +#ifdef CONFIG_X86_32
27859 +# define LOCK_PTR_REG "a"
27860 +# define REG_PTR_MODE "k"
27861 +#else
27862 +# define LOCK_PTR_REG "D"
27863 +# define REG_PTR_MODE "q"
27864 +#endif
27865 +
27866 +#if defined(CONFIG_X86_32) && \
27867 +       (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
27868 +/*
27869 + * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
27870 + * (PPro errata 66, 92)
27871 + */
27872 +# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
27873 +#else
27874 +# define UNLOCK_LOCK_PREFIX
27875 +#endif
27876 +
27877 +int xen_spinlock_init(unsigned int cpu);
27878 +void xen_spinlock_cleanup(unsigned int cpu);
27879 +extern int xen_spin_wait(raw_spinlock_t *, unsigned int token);
27880 +extern int xen_spin_wait_flags(raw_spinlock_t *, unsigned int *token,
27881 +                              unsigned int flags);
27882 +extern unsigned int xen_spin_adjust(raw_spinlock_t *, unsigned int token);
27883 +extern void xen_spin_kick(raw_spinlock_t *, unsigned int token);
27884 +
27885 +/*
27886 + * Ticket locks are conceptually two parts, one indicating the current head of
27887 + * the queue, and the other indicating the current tail. The lock is acquired
27888 + * by atomically noting the tail and incrementing it by one (thus adding
27889 + * ourself to the queue and noting our position), then waiting until the head
27890 + * becomes equal to the initial value of the tail.
27891 + *
27892 + * We use an xadd covering *both* parts of the lock, to increment the tail and
27893 + * also load the position of the head, which takes care of memory ordering
27894 + * issues and should be optimal for the uncontended case. Note the tail must be
27895 + * in the high part, because a wide xadd increment of the low part would carry
27896 + * up and contaminate the high part.
27897 + *
27898 + * With fewer than 2^8 possible CPUs, we can use x86's partial registers to
27899 + * save some instructions and make the code more elegant. There really isn't
27900 + * much between them in performance though, especially as locks are out of line.
27901 + */
27902 +#if (NR_CPUS < 256)
27903 +#define TICKET_SHIFT 8
27904 +#define __raw_spin_lock_preamble \
27905 +       asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
27906 +           "cmpb %h0, %b0\n\t" \
27907 +           "sete %1" \
27908 +           : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
27909 +           : "0" (0x0100) \
27910 +           : "memory", "cc")
27911 +#define __raw_spin_lock_body \
27912 +       asm("1:\t" \
27913 +           "cmpb %h0, %b0\n\t" \
27914 +           "je 2f\n\t" \
27915 +           "decl %1\n\t" \
27916 +           "jz 2f\n\t" \
27917 +           "rep ; nop\n\t" \
27918 +           "movb %2, %b0\n\t" \
27919 +           /* don't need lfence here, because loads are in-order */ \
27920 +           "jmp 1b\n" \
27921 +           "2:" \
27922 +           : "+Q" (token), "+g" (count) \
27923 +           : "m" (lock->slock) \
27924 +           : "memory", "cc")
27925 +
27926 +
27927 +static inline int __raw_spin_trylock(raw_spinlock_t *lock)
27928 +{
27929 +       int tmp, new;
27930 +
27931 +       asm("movzwl %2, %0\n\t"
27932 +           "cmpb %h0, %b0\n\t"
27933 +           "leal 0x100(%" REG_PTR_MODE "0), %1\n\t"
27934 +           "jne 1f\n\t"
27935 +           LOCK_PREFIX "cmpxchgw %w1, %2\n\t"
27936 +           "1:\t"
27937 +           "sete %b1\n\t"
27938 +           "movzbl %b1, %0\n\t"
27939 +           : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
27940 +           :
27941 +           : "memory", "cc");
27942 +
27943 +       return tmp;
27944 +}
27945 +
27946 +static inline void __raw_spin_unlock(raw_spinlock_t *lock)
27947 +{
27948 +       unsigned int token;
27949 +       unsigned char kick;
27950 +
27951 +       asm(UNLOCK_LOCK_PREFIX "incb %2\n\t"
27952 +           "movzwl %2, %0\n\t"
27953 +           "cmpb %h0, %b0\n\t"
27954 +           "setne %1"
27955 +           : "=&Q" (token), "=qm" (kick), "+m" (lock->slock)
27956 +           :
27957 +           : "memory", "cc");
27958 +       if (kick)
27959 +               xen_spin_kick(lock, token);
27960 +}
27961 +#else
27962 +#define TICKET_SHIFT 16
27963 +#define __raw_spin_lock_preamble \
27964 +       do { /* take a ticket; 'free' is set if head == tail beforehand */ \
27965 +               unsigned int tmp; /* receives the old tail half of token */ \
27966 +               asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
27967 +                   "shldl $16, %0, %3\n\t" \
27968 +                   "cmpw %w3, %w0\n\t" \
27969 +                   "sete %1" \
27970 +                   : "=&r" (token), "=qm" (free), "+m" (lock->slock), \
27971 +                     "=&g" (tmp) \
27972 +                   : "0" (0x00010000) \
27973 +                   : "memory", "cc"); \
27974 +       } while (0)
27975 +#define __raw_spin_lock_body \
27976 +       do { \
27977 +               unsigned int tmp; \
27978 +               asm("shldl $16, %0, %2\n" \
27979 +                   "1:\t" \
27980 +                   "cmpw %w2, %w0\n\t" \
27981 +                   "je 2f\n\t" \
27982 +                   "decl %1\n\t" \
27983 +                   "jz 2f\n\t" \
27984 +                   "rep ; nop\n\t" \
27985 +                   "movw %3, %w0\n\t" \
27986 +                   /* don't need lfence here, because loads are in-order */ \
27987 +                   "jmp 1b\n" \
27988 +                   "2:" \
27989 +                   : "+r" (token), "+g" (count), "=&g" (tmp) \
27990 +                   : "m" (lock->slock) \
27991 +                   : "memory", "cc"); \
27992 +       } while (0)
27993 +
27994 +static inline int __raw_spin_trylock(raw_spinlock_t *lock)
27995 +{
27996 +       int tmp;
27997 +       int new;
27998 +
27999 +       asm("movl %2, %0\n\t"
28000 +           "movl %0, %1\n\t"
28001 +           "roll $16, %0\n\t"
28002 +           "cmpl %0, %1\n\t"
28003 +           "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t"
28004 +           "jne 1f\n\t"
28005 +           LOCK_PREFIX "cmpxchgl %1, %2\n"
28006 +           "1:\t"
28007 +           "sete %b1\n\t"
28008 +           "movzbl %b1, %0\n\t"
28009 +           : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
28010 +           :
28011 +           : "memory", "cc");
28012 +
28013 +       return tmp;
28014 +}
28015 +
28016 +static inline void __raw_spin_unlock(raw_spinlock_t *lock)
28017 +{
28018 +       unsigned int token, tmp;
28019 +       bool kick;
28020 +
28021 +       asm(UNLOCK_LOCK_PREFIX "incw %2\n\t"
28022 +           "movl %2, %0\n\t"
28023 +           "shldl $16, %0, %3\n\t"
28024 +           "cmpw %w3, %w0\n\t"
28025 +           "setne %1"
28026 +           : "=&r" (token), "=qm" (kick), "+m" (lock->slock), "=&r" (tmp)
28027 +           :
28028 +           : "memory", "cc");
28029 +       if (kick)
28030 +               xen_spin_kick(lock, token);
28031 +}
28032 +#endif
28033 +
28034 +static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
28035 +{
28036 +       int tmp = *(volatile signed int *)(&(lock)->slock);
28037 +       /* Locked iff head (low part) and tail (high part) tickets differ. */
28038 +       return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
28039 +}
28040 +
28041 +static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
28042 +{
28043 +       int tmp = *(volatile signed int *)(&(lock)->slock);
28044 +       /* Contended iff (tail - head) mod 2^TICKET_SHIFT exceeds one. */
28045 +       return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
28046 +}
28047 +
28048 +static inline void __raw_spin_lock(raw_spinlock_t *lock)
28049 +{
28050 +       unsigned int token, count;
28051 +       bool free;
28052 +       /* Take a ticket; after each 1 << 10 failed spins, call xen_spin_wait(). */
28053 +       __raw_spin_lock_preamble;
28054 +       if (unlikely(!free))
28055 +               token = xen_spin_adjust(lock, token);
28056 +       do {
28057 +               count = 1 << 10;
28058 +               __raw_spin_lock_body;
28059 +       } while (unlikely(!count) && !xen_spin_wait(lock, token));
28060 +}
28061 +
28062 +static inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
28063 +                                        unsigned long flags)
28064 +{
28065 +       unsigned int token, count;
28066 +       bool free;
28067 +
28068 +       __raw_spin_lock_preamble;
28069 +       if (unlikely(!free))
28070 +               token = xen_spin_adjust(lock, token);
28071 +       do {
28072 +               count = 1 << 10;
28073 +               __raw_spin_lock_body;
28074 +       } while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags));
28075 +}
28076 +
28077 +static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
28078 +{
28079 +       while (__raw_spin_is_locked(lock))
28080 +               cpu_relax();
28081 +}
28082 +
28083 +/*
28084 + * Read-write spinlocks, allowing multiple readers
28085 + * but only one writer.
28086 + *
28087 + * NOTE! it is quite common to have readers in interrupts
28088 + * but no interrupt writers. For those circumstances we
28089 + * can "mix" irq-safe locks - any writer needs to get a
28090 + * irq-safe write-lock, but readers can get non-irqsafe
28091 + * read-locks.
28092 + *
28093 + * On x86, we implement read-write locks as a 32-bit counter
28094 + * with the high bit (sign) being the "contended" bit.
28095 + */
28096 +
28097 +/**
28098 + * read_can_lock - would read_trylock() succeed?
28099 + * @lock: the rwlock in question.
28100 + */
28101 +static inline int __raw_read_can_lock(raw_rwlock_t *lock)
28102 +{
28103 +       return (int)(lock)->lock > 0;
28104 +}
28105 +
28106 +/**
28107 + * write_can_lock - would write_trylock() succeed?
28108 + * @lock: the rwlock in question.
28109 + */
28110 +static inline int __raw_write_can_lock(raw_rwlock_t *lock)
28111 +{
28112 +       return (lock)->lock == RW_LOCK_BIAS;
28113 +}
28114 +
28115 +static inline void __raw_read_lock(raw_rwlock_t *rw)
28116 +{
28117 +       asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
28118 +                    "jns 1f\n"
28119 +                    "call __read_lock_failed\n\t"
28120 +                    "1:\n"
28121 +                    ::LOCK_PTR_REG (rw) : "memory");
28122 +}
28123 +
28124 +static inline void __raw_write_lock(raw_rwlock_t *rw)
28125 +{
28126 +       asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
28127 +                    "jz 1f\n"
28128 +                    "call __write_lock_failed\n\t"
28129 +                    "1:\n"
28130 +                    ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
28131 +}
28132 +
28133 +static inline int __raw_read_trylock(raw_rwlock_t *lock)
28134 +{
28135 +       atomic_t *count = (atomic_t *)lock;
28136 +       /* Optimistically take a reader slot; undo it if the count is negative. */
28137 +       atomic_dec(count);
28138 +       if (atomic_read(count) >= 0)
28139 +               return 1;
28140 +       atomic_inc(count);
28141 +       return 0;
28142 +}
28143 +
28144 +static inline int __raw_write_trylock(raw_rwlock_t *lock)
28145 +{
28146 +       atomic_t *count = (atomic_t *)lock;
28147 +       /* Subtract the whole bias; on failure add it back and report 0. */
28148 +       if (atomic_sub_and_test(RW_LOCK_BIAS, count))
28149 +               return 1;
28150 +       atomic_add(RW_LOCK_BIAS, count);
28151 +       return 0;
28152 +}
28153 +
28154 +static inline void __raw_read_unlock(raw_rwlock_t *rw)
28155 +{
28156 +       asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
28157 +}
28158 +
28159 +static inline void __raw_write_unlock(raw_rwlock_t *rw)
28160 +{
28161 +       asm volatile(LOCK_PREFIX "addl %1, %0"
28162 +                    : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
28163 +}
28164 +
28165 +#define _raw_spin_relax(lock)  cpu_relax()
28166 +#define _raw_read_relax(lock)  cpu_relax()
28167 +#define _raw_write_relax(lock) cpu_relax()
28168 +
28169 +#endif
28170 --- a/include/asm-x86/mach-xen/asm/system_32.h
28171 +++ /dev/null
28172 @@ -1,312 +0,0 @@
28173 -#ifndef __ASM_SYSTEM_H
28174 -#define __ASM_SYSTEM_H
28175 -
28176 -#include <linux/kernel.h>
28177 -#include <asm/segment.h>
28178 -#include <asm/cpufeature.h>
28179 -#include <asm/cmpxchg.h>
28180 -#include <asm/synch_bitops.h>
28181 -#include <asm/hypervisor.h>
28182 -
28183 -#ifdef __KERNEL__
28184 -#define AT_VECTOR_SIZE_ARCH 2 /* entries in ARCH_DLINFO */
28185 -
28186 -struct task_struct;    /* one of the stranger aspects of C forward declarations.. */
28187 -extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
28188 -
28189 -/*
28190 - * Saving eflags is important. It switches not only IOPL between tasks,
28191 - * it also protects other tasks from NT leaking through sysenter etc.
28192 - */
28193 -#define switch_to(prev,next,last) do {                                 \
28194 -       unsigned long esi,edi;                                          \
28195 -       asm volatile("pushfl\n\t"               /* Save flags */        \
28196 -                    "pushl %%ebp\n\t"                                  \
28197 -                    "movl %%esp,%0\n\t"        /* save ESP */          \
28198 -                    "movl %5,%%esp\n\t"        /* restore ESP */       \
28199 -                    "movl $1f,%1\n\t"          /* save EIP */          \
28200 -                    "pushl %6\n\t"             /* restore EIP */       \
28201 -                    "jmp __switch_to\n"                                \
28202 -                    "1:\t"                                             \
28203 -                    "popl %%ebp\n\t"                                   \
28204 -                    "popfl"                                            \
28205 -                    :"=m" (prev->thread.esp),"=m" (prev->thread.eip),  \
28206 -                     "=a" (last),"=S" (esi),"=D" (edi)                 \
28207 -                    :"m" (next->thread.esp),"m" (next->thread.eip),    \
28208 -                     "2" (prev), "d" (next));                          \
28209 -} while (0)
28210 -
28211 -#define _set_base(addr,base) do { unsigned long __pr; \
28212 -__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28213 -       "rorl $16,%%edx\n\t" \
28214 -       "movb %%dl,%2\n\t" \
28215 -       "movb %%dh,%3" \
28216 -       :"=&d" (__pr) \
28217 -       :"m" (*((addr)+2)), \
28218 -        "m" (*((addr)+4)), \
28219 -        "m" (*((addr)+7)), \
28220 -         "0" (base) \
28221 -        ); } while(0)
28222 -
28223 -#define _set_limit(addr,limit) do { unsigned long __lr; \
28224 -__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28225 -       "rorl $16,%%edx\n\t" \
28226 -       "movb %2,%%dh\n\t" \
28227 -       "andb $0xf0,%%dh\n\t" \
28228 -       "orb %%dh,%%dl\n\t" \
28229 -       "movb %%dl,%2" \
28230 -       :"=&d" (__lr) \
28231 -       :"m" (*(addr)), \
28232 -        "m" (*((addr)+6)), \
28233 -        "0" (limit) \
28234 -        ); } while(0)
28235 -
28236 -#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
28237 -#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
28238 -
28239 -/*
28240 - * Load a segment. Fall back on loading the zero
28241 - * segment if something goes wrong..
28242 - */
28243 -#define loadsegment(seg,value)                 \
28244 -       asm volatile("\n"                       \
28245 -               "1:\t"                          \
28246 -               "mov %0,%%" #seg "\n"           \
28247 -               "2:\n"                          \
28248 -               ".section .fixup,\"ax\"\n"      \
28249 -               "3:\t"                          \
28250 -               "pushl $0\n\t"                  \
28251 -               "popl %%" #seg "\n\t"           \
28252 -               "jmp 2b\n"                      \
28253 -               ".previous\n"                   \
28254 -               ".section __ex_table,\"a\"\n\t" \
28255 -               ".align 4\n\t"                  \
28256 -               ".long 1b,3b\n"                 \
28257 -               ".previous"                     \
28258 -               : :"rm" (value))
28259 -
28260 -/*
28261 - * Save a segment register away
28262 - */
28263 -#define savesegment(seg, value) \
28264 -       asm volatile("mov %%" #seg ",%0":"=rm" (value))
28265 -
28266 -static inline void xen_clts(void)
28267 -{
28268 -       HYPERVISOR_fpu_taskswitch(0);
28269 -}
28270 -
28271 -static inline unsigned long xen_read_cr0(void)
28272 -{
28273 -       unsigned long val;
28274 -       asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
28275 -       return val;
28276 -}
28277 -
28278 -static inline void xen_write_cr0(unsigned long val)
28279 -{
28280 -       asm volatile("movl %0,%%cr0": :"r" (val));
28281 -}
28282 -
28283 -#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
28284 -
28285 -static inline void xen_write_cr2(unsigned long val)
28286 -{
28287 -       asm volatile("movl %0,%%cr2": :"r" (val));
28288 -}
28289 -
28290 -static inline unsigned long xen_read_cr3(void)
28291 -{
28292 -       unsigned long val;
28293 -       asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
28294 -       return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
28295 -}
28296 -
28297 -static inline void xen_write_cr3(unsigned long val)
28298 -{
28299 -       val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
28300 -       asm volatile("movl %0,%%cr3": :"r" (val));
28301 -}
28302 -
28303 -static inline unsigned long xen_read_cr4(void)
28304 -{
28305 -       unsigned long val;
28306 -       asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
28307 -       return val;
28308 -}
28309 -
28310 -static inline unsigned long xen_read_cr4_safe(void)
28311 -{
28312 -       unsigned long val;
28313 -       /* This could fault if %cr4 does not exist */
28314 -       asm volatile("1: movl %%cr4, %0         \n"
28315 -               "2:                             \n"
28316 -               ".section __ex_table,\"a\"      \n"
28317 -               ".long 1b,2b                    \n"
28318 -               ".previous                      \n"
28319 -               : "=r" (val): "0" (0));
28320 -       return val;
28321 -}
28322 -
28323 -static inline void xen_write_cr4(unsigned long val)
28324 -{
28325 -       asm volatile("movl %0,%%cr4": :"r" (val));
28326 -}
28327 -
28328 -static inline void xen_wbinvd(void)
28329 -{
28330 -       asm volatile("wbinvd": : :"memory");
28331 -}
28332 -
28333 -static inline void clflush(volatile void *__p)
28334 -{
28335 -       asm volatile("clflush %0" : "+m" (*(char __force *)__p));
28336 -}
28337 -
28338 -#define read_cr0()     (xen_read_cr0())
28339 -#define write_cr0(x)   (xen_write_cr0(x))
28340 -#define read_cr2()     (xen_read_cr2())
28341 -#define write_cr2(x)   (xen_write_cr2(x))
28342 -#define read_cr3()     (xen_read_cr3())
28343 -#define write_cr3(x)   (xen_write_cr3(x))
28344 -#define read_cr4()     (xen_read_cr4())
28345 -#define read_cr4_safe()        (xen_read_cr4_safe())
28346 -#define write_cr4(x)   (xen_write_cr4(x))
28347 -#define wbinvd()       (xen_wbinvd())
28348 -
28349 -/* Clear the 'TS' bit */
28350 -#define clts()         (xen_clts())
28351 -
28352 -/* Set the 'TS' bit */
28353 -#define stts() (HYPERVISOR_fpu_taskswitch(1))
28354 -
28355 -#endif /* __KERNEL__ */
28356 -
28357 -static inline unsigned long get_limit(unsigned long segment)
28358 -{
28359 -       unsigned long __limit;
28360 -       __asm__("lsll %1,%0"
28361 -               :"=r" (__limit):"r" (segment));
28362 -       return __limit+1;
28363 -}
28364 -
28365 -#define nop() __asm__ __volatile__ ("nop")
28366 -
28367 -/*
28368 - * Force strict CPU ordering.
28369 - * And yes, this is required on UP too when we're talking
28370 - * to devices.
28371 - *
28372 - * For now, "wmb()" doesn't actually do anything, as all
28373 - * Intel CPU's follow what Intel calls a *Processor Order*,
28374 - * in which all writes are seen in the program order even
28375 - * outside the CPU.
28376 - *
28377 - * I expect future Intel CPU's to have a weaker ordering,
28378 - * but I'd also expect them to finally get their act together
28379 - * and add some real memory barriers if so.
28380 - *
28381 - * Some non intel clones support out of order store. wmb() ceases to be a
28382 - * nop for these.
28383 - */
28384 -
28385 -
28386 -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
28387 -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
28388 -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
28389 -
28390 -/**
28391 - * read_barrier_depends - Flush all pending reads that subsequents reads
28392 - * depend on.
28393 - *
28394 - * No data-dependent reads from memory-like regions are ever reordered
28395 - * over this barrier.  All reads preceding this primitive are guaranteed
28396 - * to access memory (but not necessarily other CPUs' caches) before any
28397 - * reads following this primitive that depend on the data return by
28398 - * any of the preceding reads.  This primitive is much lighter weight than
28399 - * rmb() on most CPUs, and is never heavier weight than is
28400 - * rmb().
28401 - *
28402 - * These ordering constraints are respected by both the local CPU
28403 - * and the compiler.
28404 - *
28405 - * Ordering is not guaranteed by anything other than these primitives,
28406 - * not even by data dependencies.  See the documentation for
28407 - * memory_barrier() for examples and URLs to more information.
28408 - *
28409 - * For example, the following code would force ordering (the initial
28410 - * value of "a" is zero, "b" is one, and "p" is "&a"):
28411 - *
28412 - * <programlisting>
28413 - *     CPU 0                           CPU 1
28414 - *
28415 - *     b = 2;
28416 - *     memory_barrier();
28417 - *     p = &b;                         q = p;
28418 - *                                     read_barrier_depends();
28419 - *                                     d = *q;
28420 - * </programlisting>
28421 - *
28422 - * because the read of "*q" depends on the read of "p" and these
28423 - * two reads are separated by a read_barrier_depends().  However,
28424 - * the following code, with the same initial values for "a" and "b":
28425 - *
28426 - * <programlisting>
28427 - *     CPU 0                           CPU 1
28428 - *
28429 - *     a = 2;
28430 - *     memory_barrier();
28431 - *     b = 3;                          y = b;
28432 - *                                     read_barrier_depends();
28433 - *                                     x = a;
28434 - * </programlisting>
28435 - *
28436 - * does not enforce ordering, since there is no data dependency between
28437 - * the read of "a" and the read of "b".  Therefore, on some CPUs, such
28438 - * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
28439 - * in cases like this where there are no data dependencies.
28440 - **/
28441 -
28442 -#define read_barrier_depends() do { } while(0)
28443 -
28444 -#ifdef CONFIG_SMP
28445 -#define smp_mb()       mb()
28446 -#ifdef CONFIG_X86_PPRO_FENCE
28447 -# define smp_rmb()     rmb()
28448 -#else
28449 -# define smp_rmb()     barrier()
28450 -#endif
28451 -#ifdef CONFIG_X86_OOSTORE
28452 -# define smp_wmb()     wmb()
28453 -#else
28454 -# define smp_wmb()     barrier()
28455 -#endif
28456 -#define smp_read_barrier_depends()     read_barrier_depends()
28457 -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
28458 -#else
28459 -#define smp_mb()       barrier()
28460 -#define smp_rmb()      barrier()
28461 -#define smp_wmb()      barrier()
28462 -#define smp_read_barrier_depends()     do { } while(0)
28463 -#define set_mb(var, value) do { var = value; barrier(); } while (0)
28464 -#endif
28465 -
28466 -#include <linux/irqflags.h>
28467 -
28468 -/*
28469 - * disable hlt during certain critical i/o operations
28470 - */
28471 -#define HAVE_DISABLE_HLT
28472 -void disable_hlt(void);
28473 -void enable_hlt(void);
28474 -
28475 -extern int es7000_plat;
28476 -void cpu_idle_wait(void);
28477 -
28478 -extern unsigned long arch_align_stack(unsigned long sp);
28479 -extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28480 -
28481 -void default_idle(void);
28482 -void __show_registers(struct pt_regs *, int all);
28483 -
28484 -#endif
28485 --- a/include/asm-x86/mach-xen/asm/system_64.h
28486 +++ b/include/asm-x86/mach-xen/asm/system_64.h
28487 @@ -1,122 +1,9 @@
28488  #ifndef __ASM_SYSTEM_H
28489  #define __ASM_SYSTEM_H
28490
28491 -#include <linux/kernel.h>
28492  #include <asm/segment.h>
28493  #include <asm/cmpxchg.h>
28494
28495 -#include <asm/synch_bitops.h>
28496 -#include <asm/hypervisor.h>
28497 -#include <xen/interface/arch-x86_64.h>
28498 -
28499 -#ifdef __KERNEL__
28500 -
28501 -/* entries in ARCH_DLINFO: */
28502 -#ifdef CONFIG_IA32_EMULATION
28503 -# define AT_VECTOR_SIZE_ARCH 2
28504 -#else
28505 -# define AT_VECTOR_SIZE_ARCH 1
28506 -#endif
28507 -
28508 -#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
28509 -#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
28510 -
28511 -/* frame pointer must be last for get_wchan */
28512 -#define SAVE_CONTEXT    "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
28513 -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\n\t"
28514 -
28515 -#define __EXTRA_CLOBBER  \
28516 -       ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
28517 -
28518 -/* Save restore flags to clear handle leaking NT */
28519 -#define switch_to(prev,next,last) \
28520 -       asm volatile(SAVE_CONTEXT                                                   \
28521 -                    "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
28522 -                    "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
28523 -                    "call __switch_to\n\t"                                       \
28524 -                    ".globl thread_return\n"                                   \
28525 -                    "thread_return:\n\t"                                           \
28526 -                    "movq %%gs:%P[pda_pcurrent],%%rsi\n\t"                       \
28527 -                    "movq %P[thread_info](%%rsi),%%r8\n\t"                       \
28528 -                    LOCK_PREFIX "btr  %[tif_fork],%P[ti_flags](%%r8)\n\t"        \
28529 -                    "movq %%rax,%%rdi\n\t"                                       \
28530 -                    "jc   ret_from_fork\n\t"                                     \
28531 -                    RESTORE_CONTEXT                                                \
28532 -                    : "=a" (last)                                                \
28533 -                    : [next] "S" (next), [prev] "D" (prev),                      \
28534 -                      [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
28535 -                      [ti_flags] "i" (offsetof(struct thread_info, flags)),\
28536 -                      [tif_fork] "i" (TIF_FORK),                         \
28537 -                      [thread_info] "i" (offsetof(struct task_struct, stack)), \
28538 -                      [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent))   \
28539 -                    : "memory", "cc" __EXTRA_CLOBBER)
28540 -
28541 -extern void load_gs_index(unsigned);
28542 -
28543 -/*
28544 - * Load a segment. Fall back on loading the zero
28545 - * segment if something goes wrong..
28546 - */
28547 -#define loadsegment(seg,value) \
28548 -       asm volatile("\n"                       \
28549 -               "1:\t"                          \
28550 -               "movl %k0,%%" #seg "\n"         \
28551 -               "2:\n"                          \
28552 -               ".section .fixup,\"ax\"\n"      \
28553 -               "3:\t"                          \
28554 -               "movl %1,%%" #seg "\n\t"        \
28555 -               "jmp 2b\n"                      \
28556 -               ".previous\n"                   \
28557 -               ".section __ex_table,\"a\"\n\t" \
28558 -               ".align 8\n\t"                  \
28559 -               ".quad 1b,3b\n"                 \
28560 -               ".previous"                     \
28561 -               : :"r" (value), "r" (0))
28562 -
28563 -/*
28564 - * Clear and set 'TS' bit respectively
28565 - */
28566 -#define clts() (HYPERVISOR_fpu_taskswitch(0))
28567 -
28568 -static inline unsigned long read_cr0(void)
28569 -{
28570 -       unsigned long cr0;
28571 -       asm volatile("movq %%cr0,%0" : "=r" (cr0));
28572 -       return cr0;
28573 -}
28574 -
28575 -static inline void write_cr0(unsigned long val)
28576 -{
28577 -       asm volatile("movq %0,%%cr0" :: "r" (val));
28578 -}
28579 -
28580 -#define read_cr2() current_vcpu_info()->arch.cr2
28581 -
28582 -#define write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val)))
28583 -
28584 -#define read_cr3() ({ \
28585 -       unsigned long __dummy; \
28586 -       asm volatile("movq %%cr3,%0" : "=r" (__dummy)); \
28587 -       machine_to_phys(__dummy); \
28588 -})
28589 -
28590 -static inline void write_cr3(unsigned long val)
28591 -{
28592 -       val = phys_to_machine(val);
28593 -       asm volatile("movq %0,%%cr3" :: "r" (val) : "memory");
28594 -}
28595 -
28596 -static inline unsigned long read_cr4(void)
28597 -{
28598 -       unsigned long cr4;
28599 -       asm volatile("movq %%cr4,%0" : "=r" (cr4));
28600 -       return cr4;
28601 -}
28602 -
28603 -static inline void write_cr4(unsigned long val)
28604 -{
28605 -       asm volatile("movq %0,%%cr4" :: "r" (val) : "memory");
28606 -}
28607
28608  static inline unsigned long read_cr8(void)
28609  {
28610 @@ -128,52 +15,6 @@ static inline void write_cr8(unsigned lo
28611         BUG_ON(val);
28612  }
28613
28614 -#define stts() (HYPERVISOR_fpu_taskswitch(1))
28615 -
28616 -#define wbinvd() \
28617 -       __asm__ __volatile__ ("wbinvd": : :"memory")
28618 -
28619 -#endif /* __KERNEL__ */
28620 -
28621 -static inline void clflush(volatile void *__p)
28622 -{
28623 -       asm volatile("clflush %0" : "+m" (*(char __force *)__p));
28624 -}
28625 -
28626 -#define nop() __asm__ __volatile__ ("nop")
28627 -
28628 -#ifdef CONFIG_SMP
28629 -#define smp_mb()       mb()
28630 -#define smp_rmb()      barrier()
28631 -#define smp_wmb()      barrier()
28632 -#define smp_read_barrier_depends()     do {} while(0)
28633 -#else
28634 -#define smp_mb()       barrier()
28635 -#define smp_rmb()      barrier()
28636 -#define smp_wmb()      barrier()
28637 -#define smp_read_barrier_depends()     do {} while(0)
28638 -#endif
28639 -
28640 -
28641 -/*
28642 - * Force strict CPU ordering.
28643 - * And yes, this is required on UP too when we're talking
28644 - * to devices.
28645 - */
28646 -#define mb()   asm volatile("mfence":::"memory")
28647 -#define rmb()  asm volatile("lfence":::"memory")
28648 -#define wmb()  asm volatile("sfence" ::: "memory")
28649 -
28650 -#define read_barrier_depends() do {} while(0)
28651 -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
28652 -
28653 -#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
28654 -
28655  #include <linux/irqflags.h>
28656
28657 -void cpu_idle_wait(void);
28658 -
28659 -extern unsigned long arch_align_stack(unsigned long sp);
28660 -extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28661 -
28662  #endif
28663 --- a/include/asm-x86/mach-xen/asm/system.h
28664 +++ b/include/asm-x86/mach-xen/asm/system.h
28665 @@ -1,5 +1,393 @@
28666 +#ifndef _ASM_X86_SYSTEM_H_
28667 +#define _ASM_X86_SYSTEM_H_
28668 +
28669 +#include <asm/asm.h>
28670 +#include <asm/segment.h>
28671 +#include <asm/cpufeature.h>
28672 +#include <asm/cmpxchg.h>
28673 +#include <asm/nops.h>
28674 +#include <asm/hypervisor.h>
28675 +
28676 +#include <linux/kernel.h>
28677 +#include <linux/irqflags.h>
28678 +
28679 +/* entries in ARCH_DLINFO: */
28680 +#ifdef CONFIG_IA32_EMULATION
28681 +# define AT_VECTOR_SIZE_ARCH 2
28682 +#else
28683 +# define AT_VECTOR_SIZE_ARCH 1
28684 +#endif
28685 +
28686 +#ifdef CONFIG_X86_32
28687 +
28688 +struct task_struct; /* one of the stranger aspects of C forward declarations */
28689 +struct task_struct *__switch_to(struct task_struct *prev,
28690 +                               struct task_struct *next);
28691 +
28692 +/*
28693 + * Saving eflags is important. It switches not only IOPL between tasks,
28694 + * it also protects other tasks from NT leaking through sysenter etc.
28695 + */
28696 +#define switch_to(prev, next, last) do {                               \
28697 +       unsigned long esi, edi;                                         \
28698 +       asm volatile("pushfl\n\t"               /* Save flags */        \
28699 +                    "pushl %%ebp\n\t"                                  \
28700 +                    "movl %%esp,%0\n\t"        /* save ESP */          \
28701 +                    "movl %5,%%esp\n\t"        /* restore ESP */       \
28702 +                    "movl $1f,%1\n\t"          /* save EIP */          \
28703 +                    "pushl %6\n\t"             /* restore EIP */       \
28704 +                    "jmp __switch_to\n"                                \
28705 +                    "1:\t"                                             \
28706 +                    "popl %%ebp\n\t"                                   \
28707 +                    "popfl"                                            \
28708 +                    :"=m" (prev->thread.sp), "=m" (prev->thread.ip),   \
28709 +                     "=a" (last), "=S" (esi), "=D" (edi)               \
28710 +                    :"m" (next->thread.sp), "m" (next->thread.ip),     \
28711 +                     "2" (prev), "d" (next));                          \
28712 +} while (0)
28713 +
28714 +/*
28715 + * disable hlt during certain critical i/o operations
28716 + */
28717 +#define HAVE_DISABLE_HLT
28718 +#else
28719 +#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
28720 +#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
28721 +
28722 +/* frame pointer must be last for get_wchan */
28723 +#define SAVE_CONTEXT    "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
28724 +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
28725 +
28726 +#define __EXTRA_CLOBBER  \
28727 +       , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
28728 +         "r12", "r13", "r14", "r15"
28729 +
28730 +/* Save restore flags to clear handle leaking NT */
28731 +#define switch_to(prev, next, last) \
28732 +       asm volatile(SAVE_CONTEXT                                                   \
28733 +            "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
28734 +            "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
28735 +            "call __switch_to\n\t"                                       \
28736 +            ".globl thread_return\n"                                     \
28737 +            "thread_return:\n\t"                                         \
28738 +            "movq %%gs:%P[pda_pcurrent],%%rsi\n\t"                       \
28739 +            "movq %P[thread_info](%%rsi),%%r8\n\t"                       \
28740 +            LOCK_PREFIX "btr  %[tif_fork],%P[ti_flags](%%r8)\n\t"        \
28741 +            "movq %%rax,%%rdi\n\t"                                       \
28742 +            "jc   ret_from_fork\n\t"                                     \
28743 +            RESTORE_CONTEXT                                              \
28744 +            : "=a" (last)                                                \
28745 +            : [next] "S" (next), [prev] "D" (prev),                      \
28746 +              [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
28747 +              [ti_flags] "i" (offsetof(struct thread_info, flags)),      \
28748 +              [tif_fork] "i" (TIF_FORK),                                 \
28749 +              [thread_info] "i" (offsetof(struct task_struct, stack)),   \
28750 +              [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent))  \
28751 +            : "memory", "cc" __EXTRA_CLOBBER)
28752 +#endif
28753 +
28754 +#ifdef __KERNEL__
28755 +#define _set_base(addr, base) do { unsigned long __pr; \
28756 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28757 +       "rorl $16,%%edx\n\t" \
28758 +       "movb %%dl,%2\n\t" \
28759 +       "movb %%dh,%3" \
28760 +       :"=&d" (__pr) \
28761 +       :"m" (*((addr)+2)), \
28762 +        "m" (*((addr)+4)), \
28763 +        "m" (*((addr)+7)), \
28764 +        "0" (base) \
28765 +       ); } while (0)
28766 +
28767 +#define _set_limit(addr, limit) do { unsigned long __lr; \
28768 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28769 +       "rorl $16,%%edx\n\t" \
28770 +       "movb %2,%%dh\n\t" \
28771 +       "andb $0xf0,%%dh\n\t" \
28772 +       "orb %%dh,%%dl\n\t" \
28773 +       "movb %%dl,%2" \
28774 +       :"=&d" (__lr) \
28775 +       :"m" (*(addr)), \
28776 +        "m" (*((addr)+6)), \
28777 +        "0" (limit) \
28778 +       ); } while (0)
28779 +
28780 +#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
28781 +#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
28782 +
28783 +extern void load_gs_index(unsigned);
28784 +
28785 +/*
28786 + * Load a segment. Fall back on loading the zero
28787 + * segment if something goes wrong..
28788 + */
28789 +#define loadsegment(seg, value)                        \
28790 +       asm volatile("\n"                       \
28791 +               "1:\t"                          \
28792 +               "movl %k0,%%" #seg "\n"         \
28793 +               "2:\n"                          \
28794 +               ".section .fixup,\"ax\"\n"      \
28795 +               "3:\t"                          \
28796 +               "movl %k1, %%" #seg "\n\t"      \
28797 +               "jmp 2b\n"                      \
28798 +               ".previous\n"                   \
28799 +               _ASM_EXTABLE(1b,3b)             \
28800 +               : :"r" (value), "r" (0))
28801 +
28802 +
28803 +/*
28804 + * Save a segment register away
28805 + */
28806 +#define savesegment(seg, value) \
28807 +       asm volatile("mov %%" #seg ",%0":"=rm" (value))
28808 +
28809 +static inline unsigned long get_limit(unsigned long segment)
28810 +{
28811 +       unsigned long __limit;
28812 +       __asm__("lsll %1,%0"
28813 +               :"=r" (__limit):"r" (segment));
28814 +       return __limit+1;
28815 +}
28816 +
28817 +static inline void xen_clts(void)
28818 +{
28819 +       HYPERVISOR_fpu_taskswitch(0);
28820 +}
28821 +
28822 +static inline void xen_stts(void)
28823 +{
28824 +       HYPERVISOR_fpu_taskswitch(1);
28825 +}
28826 +
28827 +/*
28828 + * Volatile isn't enough to prevent the compiler from reordering the
28829 + * read/write functions for the control registers and messing everything up.
28830 + * A memory clobber would solve the problem, but would prevent reordering of
28831 + * all loads and stores around it, which can hurt performance. Solution is to
28832 + * use a variable and mimic reads and writes to it to enforce serialization
28833 + */
28834 +static unsigned long __force_order;
28835 +
28836 +static inline unsigned long xen_read_cr0(void)
28837 +{
28838 +       unsigned long val;
28839 +       asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order));
28840 +       return val;
28841 +}
28842 +
28843 +static inline void xen_write_cr0(unsigned long val)
28844 +{
28845 +       asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order));
28846 +}
28847 +
28848 +#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
28849 +#define xen_write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val)))
28850 +
28851 +static inline unsigned long xen_read_cr3(void)
28852 +{
28853 +       unsigned long val;
28854 +       asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order));
28855 +#ifdef CONFIG_X86_32
28856 +       return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
28857 +#else
28858 +       return machine_to_phys(val);
28859 +#endif
28860 +}
28861 +
28862 +static inline void xen_write_cr3(unsigned long val)
28863 +{
28864 +#ifdef CONFIG_X86_32
28865 +       val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
28866 +#else
28867 +       val = phys_to_machine(val);
28868 +#endif
28869 +       asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order));
28870 +}
28871 +
28872 +static inline unsigned long xen_read_cr4(void)
28873 +{
28874 +       unsigned long val;
28875 +       asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order));
28876 +       return val;
28877 +}
28878 +
28879 +#define xen_read_cr4_safe() xen_read_cr4()
28880 +
28881 +static inline void xen_write_cr4(unsigned long val)
28882 +{
28883 +       asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order));
28884 +}
28885 +
28886 +#ifdef CONFIG_X86_64
28887 +static inline unsigned long xen_read_cr8(void)
28888 +{
28889 +       return 0;
28890 +}
28891 +
28892 +static inline void xen_write_cr8(unsigned long val)
28893 +{
28894 +       BUG_ON(val);
28895 +}
28896 +#endif
28897 +
28898 +static inline void xen_wbinvd(void)
28899 +{
28900 +       asm volatile("wbinvd": : :"memory");
28901 +}
28902 +#define read_cr0()     (xen_read_cr0())
28903 +#define write_cr0(x)   (xen_write_cr0(x))
28904 +#define read_cr2()     (xen_read_cr2())
28905 +#define write_cr2(x)   (xen_write_cr2(x))
28906 +#define read_cr3()     (xen_read_cr3())
28907 +#define write_cr3(x)   (xen_write_cr3(x))
28908 +#define read_cr4()     (xen_read_cr4())
28909 +#define read_cr4_safe()        (xen_read_cr4_safe())
28910 +#define write_cr4(x)   (xen_write_cr4(x))
28911 +#define wbinvd()       (xen_wbinvd())
28912 +#ifdef CONFIG_X86_64
28913 +#define read_cr8()     (xen_read_cr8())
28914 +#define write_cr8(x)   (xen_write_cr8(x))
28915 +#endif
28916 +
28917 +/* Clear the 'TS' bit */
28918 +#define clts()         (xen_clts())
28919 +#define stts()         (xen_stts())
28920 +
28921 +#endif /* __KERNEL__ */
28922 +
28923 +static inline void clflush(volatile void *__p)
28924 +{
28925 +       asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
28926 +}
28927 +
28928 +#define nop() __asm__ __volatile__ ("nop")
28929 +
28930 +void disable_hlt(void);
28931 +void enable_hlt(void);
28932 +
28933 +extern int es7000_plat;
28934 +void cpu_idle_wait(void);
28935 +
28936 +extern unsigned long arch_align_stack(unsigned long sp);
28937 +extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28938 +
28939 +void default_idle(void);
28940 +
28941 +/*
28942 + * Force strict CPU ordering.
28943 + * And yes, this is required on UP too when we're talking
28944 + * to devices.
28945 + */
28946  #ifdef CONFIG_X86_32
28947 -# include "system_32.h"
28948 +/*
28949 + * For now, "wmb()" doesn't actually do anything, as all
28950 + * Intel CPU's follow what Intel calls a *Processor Order*,
28951 + * in which all writes are seen in the program order even
28952 + * outside the CPU.
28953 + *
28954 + * I expect future Intel CPU's to have a weaker ordering,
28955 + * but I'd also expect them to finally get their act together
28956 + * and add some real memory barriers if so.
28957 + *
28958 + * Some non-Intel clones support out-of-order stores. wmb() ceases to be a
28959 + * nop for these.
28960 + */
28961 +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
28962 +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
28963 +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
28964 +#else
28965 +#define mb()   asm volatile("mfence":::"memory")
28966 +#define rmb()  asm volatile("lfence":::"memory")
28967 +#define wmb()  asm volatile("sfence" ::: "memory")
28968 +#endif
28969 +
28970 +/**
28971 + * read_barrier_depends - Flush all pending reads that subsequent reads
28972 + * depend on.
28973 + *
28974 + * No data-dependent reads from memory-like regions are ever reordered
28975 + * over this barrier.  All reads preceding this primitive are guaranteed
28976 + * to access memory (but not necessarily other CPUs' caches) before any
28977 + * reads following this primitive that depend on the data returned by
28978 + * any of the preceding reads.  This primitive is much lighter weight than
28979 + * rmb() on most CPUs, and is never heavier weight than is
28980 + * rmb().
28981 + *
28982 + * These ordering constraints are respected by both the local CPU
28983 + * and the compiler.
28984 + *
28985 + * Ordering is not guaranteed by anything other than these primitives,
28986 + * not even by data dependencies.  See the documentation for
28987 + * memory_barrier() for examples and URLs to more information.
28988 + *
28989 + * For example, the following code would force ordering (the initial
28990 + * value of "a" is zero, "b" is one, and "p" is "&a"):
28991 + *
28992 + * <programlisting>
28993 + *     CPU 0                           CPU 1
28994 + *
28995 + *     b = 2;
28996 + *     memory_barrier();
28997 + *     p = &b;                         q = p;
28998 + *                                     read_barrier_depends();
28999 + *                                     d = *q;
29000 + * </programlisting>
29001 + *
29002 + * because the read of "*q" depends on the read of "p" and these
29003 + * two reads are separated by a read_barrier_depends().  However,
29004 + * the following code, with the same initial values for "a" and "b":
29005 + *
29006 + * <programlisting>
29007 + *     CPU 0                           CPU 1
29008 + *
29009 + *     a = 2;
29010 + *     memory_barrier();
29011 + *     b = 3;                          y = b;
29012 + *                                     read_barrier_depends();
29013 + *                                     x = a;
29014 + * </programlisting>
29015 + *
29016 + * does not enforce ordering, since there is no data dependency between
29017 + * the read of "a" and the read of "b".  Therefore, on some CPUs, such
29018 + * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
29019 + * in cases like this where there are no data dependencies.
29020 + **/
29021 +
29022 +#define read_barrier_depends() do { } while (0)
29023 +
29024 +#ifdef CONFIG_SMP
29025 +#define smp_mb()       mb()
29026 +#ifdef CONFIG_X86_PPRO_FENCE
29027 +# define smp_rmb()     rmb()
29028  #else
29029 -# include "system_64.h"
29030 +# define smp_rmb()     barrier()
29031 +#endif
29032 +#ifdef CONFIG_X86_OOSTORE
29033 +# define smp_wmb()     wmb()
29034 +#else
29035 +# define smp_wmb()     barrier()
29036 +#endif
29037 +#define smp_read_barrier_depends()     read_barrier_depends()
29038 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
29039 +#else
29040 +#define smp_mb()       barrier()
29041 +#define smp_rmb()      barrier()
29042 +#define smp_wmb()      barrier()
29043 +#define smp_read_barrier_depends()     do { } while (0)
29044 +#define set_mb(var, value) do { var = value; barrier(); } while (0)
29045 +#endif
29046 +
29047 +/*
29048 + * Stop RDTSC speculation. This is needed when you need to use RDTSC
29049 + * (or get_cycles or vread that possibly accesses the TSC) in a defined
29050 + * code region.
29051 + *
29052 + * (Could use an alternative three way for this if there was one.)
29053 + */
29054 +static inline void rdtsc_barrier(void)
29055 +{
29056 +       alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
29057 +       alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
29058 +}
29059 +
29060  #endif
29061 --- a/include/asm-x86/mach-xen/asm/tlbflush_32.h
29062 +++ /dev/null
29063 @@ -1,99 +0,0 @@
29064 -#ifndef _I386_TLBFLUSH_H
29065 -#define _I386_TLBFLUSH_H
29066 -
29067 -#include <linux/mm.h>
29068 -#include <asm/processor.h>
29069 -
29070 -#define __flush_tlb() xen_tlb_flush()
29071 -#define __flush_tlb_global() xen_tlb_flush()
29072 -#define __flush_tlb_all() xen_tlb_flush()
29073 -
29074 -#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
29075 -
29076 -#define __flush_tlb_single(addr) xen_invlpg(addr)
29077 -
29078 -#define __flush_tlb_one(addr) __flush_tlb_single(addr)
29079 -
29080 -/*
29081 - * TLB flushing:
29082 - *
29083 - *  - flush_tlb() flushes the current mm struct TLBs
29084 - *  - flush_tlb_all() flushes all processes TLBs
29085 - *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
29086 - *  - flush_tlb_page(vma, vmaddr) flushes one page
29087 - *  - flush_tlb_range(vma, start, end) flushes a range of pages
29088 - *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
29089 - *
29090 - * ..but the i386 has somewhat limited tlb flushing capabilities,
29091 - * and page-granular flushes are available only on i486 and up.
29092 - */
29093 -
29094 -#define TLB_FLUSH_ALL  0xffffffff
29095 -
29096 -
29097 -#ifndef CONFIG_SMP
29098 -
29099 -#include <linux/sched.h>
29100 -
29101 -#define flush_tlb() __flush_tlb()
29102 -#define flush_tlb_all() __flush_tlb_all()
29103 -#define local_flush_tlb() __flush_tlb()
29104 -
29105 -static inline void flush_tlb_mm(struct mm_struct *mm)
29106 -{
29107 -       if (mm == current->active_mm)
29108 -               __flush_tlb();
29109 -}
29110 -
29111 -static inline void flush_tlb_page(struct vm_area_struct *vma,
29112 -       unsigned long addr)
29113 -{
29114 -       if (vma->vm_mm == current->active_mm)
29115 -               __flush_tlb_one(addr);
29116 -}
29117 -
29118 -static inline void flush_tlb_range(struct vm_area_struct *vma,
29119 -       unsigned long start, unsigned long end)
29120 -{
29121 -       if (vma->vm_mm == current->active_mm)
29122 -               __flush_tlb();
29123 -}
29124 -
29125 -#else  /* SMP */
29126 -
29127 -#include <asm/smp.h>
29128 -
29129 -#define local_flush_tlb() \
29130 -       __flush_tlb()
29131 -
29132 -#define flush_tlb_all xen_tlb_flush_all
29133 -#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
29134 -#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
29135 -#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
29136 -
29137 -#define flush_tlb()    flush_tlb_current_task()
29138 -
29139 -static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
29140 -{
29141 -       flush_tlb_mm(vma->vm_mm);
29142 -}
29143 -
29144 -#define TLBSTATE_OK    1
29145 -#define TLBSTATE_LAZY  2
29146 -
29147 -struct tlb_state
29148 -{
29149 -       struct mm_struct *active_mm;
29150 -       int state;
29151 -       char __cacheline_padding[L1_CACHE_BYTES-8];
29152 -};
29153 -DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
29154 -#endif /* SMP */
29155 -
29156 -static inline void flush_tlb_kernel_range(unsigned long start,
29157 -                                       unsigned long end)
29158 -{
29159 -       flush_tlb_all();
29160 -}
29161 -
29162 -#endif /* _I386_TLBFLUSH_H */
29163 --- a/include/asm-x86/mach-xen/asm/tlbflush_64.h
29164 +++ /dev/null
29165 @@ -1,97 +0,0 @@
29166 -#ifndef _X8664_TLBFLUSH_H
29167 -#define _X8664_TLBFLUSH_H
29168 -
29169 -#include <linux/mm.h>
29170 -#include <linux/sched.h>
29171 -#include <asm/processor.h>
29172 -#include <asm/system.h>
29173 -
29174 -#define __flush_tlb()  xen_tlb_flush()
29175 -
29176 -/*
29177 - * Global pages have to be flushed a bit differently. Not a real
29178 - * performance problem because this does not happen often.
29179 - */
29180 -#define __flush_tlb_global()   xen_tlb_flush()
29181 -
29182 -#define __flush_tlb_all() __flush_tlb_global()
29183 -
29184 -#define __flush_tlb_one(addr)  xen_invlpg((unsigned long)addr)
29185 -
29186 -
29187 -/*
29188 - * TLB flushing:
29189 - *
29190 - *  - flush_tlb() flushes the current mm struct TLBs
29191 - *  - flush_tlb_all() flushes all processes TLBs
29192 - *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
29193 - *  - flush_tlb_page(vma, vmaddr) flushes one page
29194 - *  - flush_tlb_range(vma, start, end) flushes a range of pages
29195 - *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
29196 - *
29197 - * x86-64 can only flush individual pages or full VMs. For a range flush
29198 - * we always do the full VM. Might be worth trying if for a small
29199 - * range a few INVLPGs in a row are a win.
29200 - */
29201 -
29202 -#ifndef CONFIG_SMP
29203 -
29204 -#define flush_tlb() __flush_tlb()
29205 -#define flush_tlb_all() __flush_tlb_all()
29206 -#define local_flush_tlb() __flush_tlb()
29207 -
29208 -static inline void flush_tlb_mm(struct mm_struct *mm)
29209 -{
29210 -       if (mm == current->active_mm)
29211 -               __flush_tlb();
29212 -}
29213 -
29214 -static inline void flush_tlb_page(struct vm_area_struct *vma,
29215 -       unsigned long addr)
29216 -{
29217 -       if (vma->vm_mm == current->active_mm)
29218 -               __flush_tlb_one(addr);
29219 -}
29220 -
29221 -static inline void flush_tlb_range(struct vm_area_struct *vma,
29222 -       unsigned long start, unsigned long end)
29223 -{
29224 -       if (vma->vm_mm == current->active_mm)
29225 -               __flush_tlb();
29226 -}
29227 -
29228 -#else
29229 -
29230 -#include <asm/smp.h>
29231 -
29232 -#define local_flush_tlb() \
29233 -       __flush_tlb()
29234 -
29235 -#define flush_tlb_all xen_tlb_flush_all
29236 -#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
29237 -#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
29238 -#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
29239 -
29240 -#define flush_tlb()    flush_tlb_current_task()
29241 -
29242 -static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
29243 -{
29244 -       flush_tlb_mm(vma->vm_mm);
29245 -}
29246 -
29247 -#define TLBSTATE_OK    1
29248 -#define TLBSTATE_LAZY  2
29249 -
29250 -/* Roughly an IPI every 20MB with 4k pages for freeing page table
29251 -   ranges. Cost is about 42k of memory for each CPU. */
29252 -#define ARCH_FREE_PTE_NR 5350
29253 -
29254 -#endif
29255 -
29256 -static inline void flush_tlb_kernel_range(unsigned long start,
29257 -                                       unsigned long end)
29258 -{
29259 -       flush_tlb_all();
29260 -}
29261 -
29262 -#endif /* _X8664_TLBFLUSH_H */
29263 --- a/include/asm-x86/mach-xen/asm/tlbflush.h
29264 +++ b/include/asm-x86/mach-xen/asm/tlbflush.h
29265 @@ -1,5 +1,106 @@
29266 +#ifndef _ASM_X86_TLBFLUSH_H
29267 +#define _ASM_X86_TLBFLUSH_H
29268 +
29269 +#include <linux/mm.h>
29270 +#include <linux/sched.h>
29271 +
29272 +#include <asm/processor.h>
29273 +#include <asm/system.h>
29274 +
29275 +#define __flush_tlb() xen_tlb_flush()
29276 +#define __flush_tlb_global() xen_tlb_flush()
29277 +#define __flush_tlb_single(addr) xen_invlpg(addr)
29278 +#define __flush_tlb_all() xen_tlb_flush()
29279 +#define __flush_tlb_one(addr) xen_invlpg(addr)
29280 +
29281  #ifdef CONFIG_X86_32
29282 -# include "tlbflush_32.h"
29283 +# define TLB_FLUSH_ALL 0xffffffff
29284  #else
29285 -# include "tlbflush_64.h"
29286 +# define TLB_FLUSH_ALL -1ULL
29287  #endif
29288 +
29289 +/*
29290 + * TLB flushing:
29291 + *
29292 + *  - flush_tlb() flushes the current mm struct TLBs
29293 + *  - flush_tlb_all() flushes all processes' TLBs
29294 + *  - flush_tlb_mm(mm) flushes the specified mm context's TLBs
29295 + *  - flush_tlb_page(vma, vmaddr) flushes one page
29296 + *  - flush_tlb_range(vma, start, end) flushes a range of pages
29297 + *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
29298 + *
29299 + * ..but the i386 has somewhat limited tlb flushing capabilities,
29300 + * and page-granular flushes are available only on i486 and up.
29301 + *
29302 + * x86-64 can only flush individual pages or full VMs. For a range flush
29303 + * we always do the full VM. Might be worth trying if for a small
29304 + * range a few INVLPGs in a row are a win.
29305 + */
29306 +
29307 +#ifndef CONFIG_SMP
29308 +
29309 +#define flush_tlb() __flush_tlb()
29310 +#define flush_tlb_all() __flush_tlb_all()
29311 +#define local_flush_tlb() __flush_tlb()
29312 +
29313 +static inline void flush_tlb_mm(struct mm_struct *mm)   /* !CONFIG_SMP variant */
29314 +{
29315 +       if (mm == current->active_mm)   /* any other mm: nothing is flushed */
29316 +               __flush_tlb();          /* defined above as xen_tlb_flush() */
29317 +}
29318 +
29319 +static inline void flush_tlb_page(struct vm_area_struct *vma,   /* !CONFIG_SMP variant */
29320 +                                 unsigned long addr)
29321 +{
29322 +       if (vma->vm_mm == current->active_mm)   /* only for current's active mm */
29323 +               __flush_tlb_one(addr);          /* defined above as xen_invlpg(addr) */
29324 +}
29325 +
29326 +static inline void flush_tlb_range(struct vm_area_struct *vma,  /* !CONFIG_SMP variant */
29327 +                                  unsigned long start, unsigned long end)
29328 +{
29329 +       if (vma->vm_mm == current->active_mm)   /* start and end are not used */
29330 +               __flush_tlb();                  /* defined above as xen_tlb_flush() */
29331 +}
29332 +
29333 +#else  /* SMP */
29334 +
29335 +#include <asm/smp.h>
29336 +
29337 +#define local_flush_tlb() __flush_tlb()
29338 +
29339 +#define flush_tlb_all xen_tlb_flush_all
29340 +#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
29341 +#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
29342 +#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
29343 +
29344 +#define flush_tlb()    flush_tlb_current_task()
29345 +
29346 +static inline void flush_tlb_range(struct vm_area_struct *vma,  /* CONFIG_SMP variant */
29347 +                                  unsigned long start, unsigned long end)
29348 +{
29349 +       flush_tlb_mm(vma->vm_mm);       /* start/end unused; macro above: xen_tlb_flush_mask(&mm->cpu_vm_mask) */
29350 +}
29351 +
29352 +#define TLBSTATE_OK    1
29353 +#define TLBSTATE_LAZY  2
29354 +
29355 +#ifdef CONFIG_X86_32
29356 +struct tlb_state        /* only declared for CONFIG_SMP && CONFIG_X86_32 */
29357 +{
29358 +       struct mm_struct *active_mm;
29359 +       int state;      /* presumably TLBSTATE_OK or TLBSTATE_LAZY -- TODO confirm */
29360 +       char __cacheline_padding[L1_CACHE_BYTES-8];     /* NOTE(review): 8 looks like the 32-bit size of the two fields above -- confirm */
29361 +};
29362 +DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);       /* per-CPU variable cpu_tlbstate */
29363 +#endif
29364 +
29365 +#endif /* SMP */
29366 +
29367 +static inline void flush_tlb_kernel_range(unsigned long start,
29368 +                                         unsigned long end)
29369 +{
29370 +       flush_tlb_all();        /* start and end are not used; every call goes through flush_tlb_all() */
29371 +}
29372 +
29373 +#endif /* _ASM_X86_TLBFLUSH_H */
29374 --- a/include/asm-x86/mach-xen/irq_vectors.h
29375 +++ b/include/asm-x86/mach-xen/irq_vectors.h
29376 @@ -82,7 +82,8 @@
29377
29378  #define RESCHEDULE_VECTOR      0
29379  #define CALL_FUNCTION_VECTOR   1
29380 -#define NR_IPIS                        2
29381 +#define SPIN_UNLOCK_VECTOR     2
29382 +#define NR_IPIS                        3
29383
29384  /*
29385   * The maximum number of vectors supported by i386 processors
29386 --- a/include/asm-x86/mmu.h
29387 +++ b/include/asm-x86/mmu.h
29388 @@ -23,7 +23,7 @@ typedef struct {
29389         void *vdso;
29390  } mm_context_t;
29391
29392 -#ifdef CONFIG_SMP
29393 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
29394  void leave_mm(int cpu);
29395  #else
29396  static inline void leave_mm(int cpu)
29397 --- a/include/asm-x86/ptrace.h
29398 +++ b/include/asm-x86/ptrace.h
29399 @@ -249,7 +249,9 @@ extern void user_enable_single_step(stru
29400  extern void user_disable_single_step(struct task_struct *);
29401
29402  extern void user_enable_block_step(struct task_struct *);
29403 -#ifdef CONFIG_X86_DEBUGCTLMSR
29404 +#if defined(CONFIG_XEN)
29405 +#define arch_has_block_step()  (0)
29406 +#elif defined(CONFIG_X86_DEBUGCTLMSR)
29407  #define arch_has_block_step()  (1)
29408  #else
29409  #define arch_has_block_step()  (boot_cpu_data.x86 >= 6)
29410 --- a/include/asm-x86/thread_info.h
29411 +++ b/include/asm-x86/thread_info.h
29412 @@ -94,6 +94,9 @@ struct thread_info {
29413  #define TIF_DEBUGCTLMSR                25      /* uses thread_struct.debugctlmsr */
29414  #define TIF_DS_AREA_MSR                26      /* uses thread_struct.ds_area_msr */
29415  #define TIF_BTS_TRACE_TS       27      /* record scheduling event timestamps */
29416 +#ifdef CONFIG_X86_XEN
29417 +#define TIF_CSTAR              31      /* cstar-based syscall (special handling) */
29418 +#endif
29419
29420  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
29421  #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
29422 @@ -118,6 +121,7 @@ struct thread_info {
29423  #define _TIF_DEBUGCTLMSR       (1 << TIF_DEBUGCTLMSR)
29424  #define _TIF_DS_AREA_MSR       (1 << TIF_DS_AREA_MSR)
29425  #define _TIF_BTS_TRACE_TS      (1 << TIF_BTS_TRACE_TS)
29426 +#define _TIF_CSTAR             (1 << TIF_CSTAR)
29427
29428  /* work to do in syscall_trace_enter() */
29429  #define _TIF_WORK_SYSCALL_ENTRY        \
29430 @@ -147,12 +151,12 @@ struct thread_info {
29431         (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
29432          _TIF_NOTSC|_TIF_PERFMON_CTXSW)
29433
29434 -#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
29435 -#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
29436  #else
29437 -#define _TIF_WORK_CTXSW_NEXT (_TIF_NOTSC | _TIF_DEBUG)
29438 -#define _TIF_WORK_CTXSW_PREV (_TIF_NOTSC)
29439 +#define _TIF_WORK_CTXSW (_TIF_NOTSC \
29440 +     /*todo | _TIF_DEBUGCTLMSR | _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS*/)
29441  #endif
29442 +#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
29443 +#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
29444
29445  #define PREEMPT_ACTIVE         0x10000000
29446
29447 --- a/include/asm-x86/time.h
29448 +++ b/include/asm-x86/time.h
29449 @@ -58,4 +58,10 @@ static inline int native_set_wallclock(u
29450
29451  extern unsigned long __init calibrate_cpu(void);
29452
29453 +#ifdef CONFIG_XEN
29454 +extern int xen_independent_wallclock(void);
29455 +extern unsigned long xen_read_persistent_clock(void);
29456 +extern int xen_update_persistent_clock(void);
29457 +#endif
29458 +
29459  #endif
29460 --- a/include/linux/page-flags.h
29461 +++ b/include/linux/page-flags.h
29462 @@ -101,8 +101,8 @@ enum pageflags {
29463         PG_foreign,             /* Page is owned by foreign allocator. */
29464         PG_pinned,              /* Cannot alias with PG_owner_priv_1 since
29465                                  * bad_page() checks include this bit.
29466 -                                * Also cannot use PG_arch_1 since that now
29467 -                                * has a different purpose on x86. */
29468 +                                * Should not use PG_arch_1 as that may have
29469 +                                * a different purpose elsewhere. */
29470  #endif
29471         __NR_PAGEFLAGS,
29472
29473 --- a/include/linux/pci.h
29474 +++ b/include/linux/pci.h
29475 @@ -644,6 +644,9 @@ int pcie_set_readrq(struct pci_dev *dev,
29476  void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno);
29477  int __must_check pci_assign_resource(struct pci_dev *dev, int i);
29478  int pci_select_bars(struct pci_dev *dev, unsigned long flags);
29479 +#ifdef CONFIG_XEN
29480 +void pci_restore_bars(struct pci_dev *);
29481 +#endif
29482
29483  /* ROM control related routines */
29484  void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size);
29485 --- a/include/xen/evtchn.h
29486 +++ b/include/xen/evtchn.h
29487 @@ -130,12 +130,37 @@ static inline void clear_evtchn(int port
29488         synch_clear_bit(port, s->evtchn_pending);
29489  }
29490
29491 +/* Set event channel 'port' pending in HYPERVISOR_shared_info. */
29492 +static inline void set_evtchn(int port)
29493 +{
29494 +       synch_set_bit(port, HYPERVISOR_shared_info->evtchn_pending);
29495 +}
29496 +
29497 +/* Report whether event channel 'port' is pending in HYPERVISOR_shared_info. */
29498 +static inline int test_evtchn(int port)
29499 +{
29500 +       return synch_test_bit(port, HYPERVISOR_shared_info->evtchn_pending);
29501 +}
29502 +
29503  static inline void notify_remote_via_evtchn(int port)
29504  {
29505         struct evtchn_send send = { .port = port };
29506         VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send));
29507  }
29508
29509 +/* Clear an irq's pending state, in preparation for polling on it. */
29510 +void xen_clear_irq_pending(int irq);
29511 +
29512 +/* Set an irq's pending state, to avoid blocking on it. */
29513 +void xen_set_irq_pending(int irq);
29514 +
29515 +/* Test an irq's pending state. */
29516 +int xen_test_irq_pending(int irq);
29517 +
29518 +/* Poll waiting for an irq to become pending.  In the usual case, the
29519 +   irq will be disabled so it won't deliver an interrupt. */
29520 +void xen_poll_irq(int irq);
29521 +
29522  /*
29523   * Use these to access the event channel underlying the IRQ handle returned
29524   * by bind_*_to_irqhandler().
29525 --- a/kernel/sysctl_check.c
29526 +++ b/kernel/sysctl_check.c
29527 @@ -899,7 +899,7 @@ static const struct trans_ctl_table tran
29528  };
29529
29530  #ifdef CONFIG_XEN
29531 -static struct trans_ctl_table trans_xen_table[] = {
29532 +static const struct trans_ctl_table trans_xen_table[] = {
29533         { CTL_XEN_INDEPENDENT_WALLCLOCK,        "independent_wallclock" },
29534         { CTL_XEN_PERMITTED_CLOCK_JITTER,       "permitted_clock_jitter" },
29535         {}
29536 --- a/lib/swiotlb-xen.c
29537 +++ b/lib/swiotlb-xen.c
29538 @@ -30,7 +30,6 @@
29539  #include <asm/gnttab_dma.h>
29540
29541  int swiotlb;
29542 -EXPORT_SYMBOL(swiotlb);
29543
29544  #define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
29545
29546 @@ -289,6 +288,15 @@ __sync_single(struct phys_addr buffer, c
29547         }
29548  }
29549
29550 +/* Nonzero when nslots slots starting at index overrun a max_slots window. */
29551 +static inline unsigned int is_span_boundary(unsigned int index,
29552 +               unsigned int nslots, unsigned long offset_slots,
29553 +               unsigned long max_slots)
29554 +{
29555 +       unsigned long first = (offset_slots + index) & (max_slots - 1);
29556 +       return first + nslots > max_slots;
29557 +}
29558 +
29559  /*
29560   * Allocates bounce buffer and returns its kernel virtual address.
29561   */
29562 @@ -300,6 +308,15 @@ map_single(struct device *hwdev, struct
29563         unsigned int nslots, stride, index, wrap;
29564         struct phys_addr slot_buf;
29565         int i;
29566 +       unsigned long mask;
29567 +       unsigned long offset_slots;
29568 +       unsigned long max_slots;
29569 +
29570 +       mask = dma_get_seg_boundary(hwdev);
29571 +       offset_slots = -IO_TLB_SEGSIZE;
29572 +       max_slots = mask + 1
29573 +                   ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
29574 +                   : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
29575
29576         /*
29577          * For mappings greater than a page, we limit the stride (and
29578 @@ -319,12 +336,21 @@ map_single(struct device *hwdev, struct
29579          */
29580         spin_lock_irqsave(&io_tlb_lock, flags);
29581         {
29582 -               wrap = index = ALIGN(io_tlb_index, stride);
29583 -
29584 +               index = ALIGN(io_tlb_index, stride);
29585                 if (index >= iotlb_nslabs)
29586 -                       wrap = index = 0;
29587 +                       index = 0;
29588 +               wrap = index;
29589
29590                 do {
29591 +                       while (is_span_boundary(index, nslots, offset_slots,
29592 +                                               max_slots)) {
29593 +                               index += stride;
29594 +                               if (index >= iotlb_nslabs)
29595 +                                       index = 0;
29596 +                               if (index == wrap)
29597 +                                       goto not_found;
29598 +                       }
29599 +
29600                         /*
29601                          * If we find a slot that indicates we have 'nslots'
29602                          * number of contiguous buffers, we allocate the
29603 @@ -359,6 +385,7 @@ map_single(struct device *hwdev, struct
29604                                 index = 0;
29605                 } while (index != wrap);
29606
29607 +  not_found:
29608                 spin_unlock_irqrestore(&io_tlb_lock, flags);
29609                 return NULL;
29610         }