src/patches/60008_xen3-auto-xen-arch.patch1
1 Subject: xen3 xen-arch
2 From: http://xenbits.xensource.com/linux-2.6.18-xen.hg (tip 728:832aac894efd)
3 Patch-mainline: obsolete
4 Acked-by: jbeulich@novell.com
5
6 List of files that have Xen derivatives (possibly created while merging
7 newer kernel versions), for xen-port-patches.py to pick up (i.e. this
8 list must be retained here until the XenSource tree has these files in
9 the right places):
10 +++ linux/arch/x86/kernel/acpi/sleep-xen.c
11 +++ linux/arch/x86/kernel/cpu/common_64-xen.c
12 +++ linux/arch/x86/kernel/e820-xen.c
13 +++ linux/arch/x86/kernel/head-xen.c
14 +++ linux/arch/x86/kernel/head32-xen.c
15 +++ linux/arch/x86/kernel/ioport-xen.c
16 +++ linux/arch/x86/kernel/ipi-xen.c
17 +++ linux/arch/x86/kernel/ldt-xen.c
18 +++ linux/arch/x86/kernel/mpparse-xen.c
19 +++ linux/arch/x86/kernel/pci-nommu-xen.c
20 +++ linux/arch/x86/kernel/process-xen.c
21 +++ linux/arch/x86/kernel/setup-xen.c
22 +++ linux/arch/x86/kernel/setup_percpu-xen.c
23 +++ linux/arch/x86/kernel/smp-xen.c
24 +++ linux/arch/x86/mm/fault-xen.c
25 +++ linux/arch/x86/mm/ioremap-xen.c
26 +++ linux/arch/x86/mm/pageattr-xen.c
27 +++ linux/arch/x86/mm/pat-xen.c
28 +++ linux/arch/x86/mm/pgtable-xen.c
29 +++ linux/arch/x86/vdso/vdso32-setup-xen.c
30 +++ linux/drivers/char/mem-xen.c
31 +++ linux/include/asm-x86/mach-xen/asm/desc.h
32 +++ linux/include/asm-x86/mach-xen/asm/dma-mapping.h
33 +++ linux/include/asm-x86/mach-xen/asm/fixmap.h
34 +++ linux/include/asm-x86/mach-xen/asm/io.h
35 +++ linux/include/asm-x86/mach-xen/asm/irq_vectors.h
36 +++ linux/include/asm-x86/mach-xen/asm/irqflags.h
37 +++ linux/include/asm-x86/mach-xen/asm/mmu_context.h
38 +++ linux/include/asm-x86/mach-xen/asm/page.h
39 +++ linux/include/asm-x86/mach-xen/asm/pci.h
40 +++ linux/include/asm-x86/mach-xen/asm/pgalloc.h
41 +++ linux/include/asm-x86/mach-xen/asm/pgtable.h
42 +++ linux/include/asm-x86/mach-xen/asm/processor.h
43 +++ linux/include/asm-x86/mach-xen/asm/segment.h
44 +++ linux/include/asm-x86/mach-xen/asm/smp.h
45 +++ linux/include/asm-x86/mach-xen/asm/spinlock.h
46 +++ linux/include/asm-x86/mach-xen/asm/swiotlb.h
47 +++ linux/include/asm-x86/mach-xen/asm/system.h
48 +++ linux/include/asm-x86/mach-xen/asm/tlbflush.h
49 +++ linux/include/asm-x86/mach-xen/asm/xor.h
50
51 List of files folded into their native counterparts (and hence removed
52 from this patch so that xen-port-patches.py does not needlessly pick them
53 up; for reference, each entry is prefixed with the version in which the removal occurred):
54 2.6.18/include/asm-x86/mach-xen/asm/pgtable-2level.h
55 2.6.18/include/asm-x86/mach-xen/asm/pgtable-2level-defs.h
56 2.6.19/include/asm-x86/mach-xen/asm/ptrace.h
57 2.6.23/arch/x86/kernel/vsyscall-note_32-xen.S
58 2.6.23/include/asm-x86/mach-xen/asm/ptrace_64.h
59 2.6.24/arch/x86/kernel/early_printk_32-xen.c
60 2.6.24/include/asm-x86/mach-xen/asm/arch_hooks_64.h
61 2.6.24/include/asm-x86/mach-xen/asm/bootsetup_64.h
62 2.6.24/include/asm-x86/mach-xen/asm/mmu_32.h
63 2.6.24/include/asm-x86/mach-xen/asm/mmu_64.h
64 2.6.24/include/asm-x86/mach-xen/asm/nmi_64.h
65 2.6.24/include/asm-x86/mach-xen/asm/setup.h
66 2.6.24/include/asm-x86/mach-xen/asm/time_64.h (added in 2.6.20)
67 2.6.25/arch/x86/ia32/syscall32-xen.c
68 2.6.25/arch/x86/ia32/syscall32_syscall-xen.S
69 2.6.25/arch/x86/ia32/vsyscall-int80.S
70 2.6.25/arch/x86/kernel/acpi/boot-xen.c
71 2.6.25/include/asm-x86/mach-xen/asm/msr.h
72 2.6.25/include/asm-x86/mach-xen/asm/page_32.h
73 2.6.25/include/asm-x86/mach-xen/asm/spinlock_32.h
74 2.6.25/include/asm-x86/mach-xen/asm/timer.h (added in 2.6.24)
75 2.6.25/include/asm-x86/mach-xen/asm/timer_64.h
76 2.6.26/arch/x86/kernel/pci-dma_32-xen.c
77 2.6.26/arch/x86/kernel/pci-swiotlb_64-xen.c
78 2.6.26/include/asm-x86/mach-xen/asm/dma-mapping_32.h
79 2.6.26/include/asm-x86/mach-xen/asm/dma-mapping_64.h
80 2.6.26/include/asm-x86/mach-xen/asm/nmi.h (added in 2.6.24)
81 2.6.26/include/asm-x86/mach-xen/asm/scatterlist.h (added in 2.6.24)
82 2.6.26/include/asm-x86/mach-xen/asm/scatterlist_32.h
83 2.6.26/include/xen/xencomm.h
84 2.6.27/arch/x86/kernel/e820_32-xen.c
85 2.6.27/include/asm-x86/mach-xen/asm/e820.h (added in 2.6.24)
86 2.6.27/include/asm-x86/mach-xen/asm/e820_64.h
87 2.6.27/include/asm-x86/mach-xen/asm/hw_irq.h (added in 2.6.24)
88 2.6.27/include/asm-x86/mach-xen/asm/hw_irq_32.h
89 2.6.27/include/asm-x86/mach-xen/asm/hw_irq_64.h
90 2.6.27/include/asm-x86/mach-xen/asm/irq.h (added in 2.6.24)
91 2.6.27/include/asm-x86/mach-xen/asm/irq_64.h
92
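
Note: xen-port-patches.py itself is not shipped in this patch; the lists above
only tell it which files to consider. As a rough illustration (a minimal
sketch under that assumption, not the real script), a helper along these
lines could collect the "+++ linux/..." entries from a header like this one
and map each Xen variant back to a guessed native counterpart:

    #!/usr/bin/env python
    # Illustrative sketch only -- NOT the actual xen-port-patches.py.
    # It extracts the file paths listed on "+++ linux/..." lines in the
    # free-form header of a patch such as this one, stopping at the first
    # real diff section, and guesses the native counterpart of each
    # "-xen" variant by stripping the suffix.
    import re
    import sys

    def collect_xen_variants(patch_path):
        """Return the paths named on '+++ linux/...' header lines."""
        paths = []
        with open(patch_path) as fh:
            for line in fh:
                if line.startswith('Index:'):   # first real diff reached
                    break
                m = re.match(r'\+\+\+ (linux/\S+)', line)
                if m:
                    paths.append(m.group(1))
        return paths

    if __name__ == '__main__':
        for p in collect_xen_variants(sys.argv[1]):
            native = re.sub(r'-xen(\.[chS])$', r'\1', p)
            print('%s -> %s' % (p, native))

Run against this file it would print, for the first entry,
"linux/arch/x86/kernel/acpi/sleep-xen.c -> linux/arch/x86/kernel/acpi/sleep.c";
entries without a "-xen" suffix (the mach-xen headers) are left unchanged.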
93 Index: head-2008-11-25/arch/x86/kernel/acpi/processor_extcntl_xen.c
94 ===================================================================
95 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
96 +++ head-2008-11-25/arch/x86/kernel/acpi/processor_extcntl_xen.c 2008-10-01 15:43:24.000000000 +0200
97 @@ -0,0 +1,209 @@
98 +/*
99 + * processor_extcntl_xen.c - interface to notify Xen
100 + *
101 + * Copyright (C) 2008, Intel corporation
102 + *
103 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
104 + *
105 + * This program is free software; you can redistribute it and/or modify
106 + * it under the terms of the GNU General Public License as published by
107 + * the Free Software Foundation; either version 2 of the License, or (at
108 + * your option) any later version.
109 + *
110 + * This program is distributed in the hope that it will be useful, but
111 + * WITHOUT ANY WARRANTY; without even the implied warranty of
112 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
113 + * General Public License for more details.
114 + *
115 + * You should have received a copy of the GNU General Public License along
116 + * with this program; if not, write to the Free Software Foundation, Inc.,
117 + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
118 + *
119 + */
120 +
121 +#include <linux/kernel.h>
122 +#include <linux/init.h>
123 +#include <linux/types.h>
124 +#include <linux/acpi.h>
125 +#include <linux/pm.h>
126 +#include <linux/cpu.h>
127 +
128 +#include <linux/cpufreq.h>
129 +#include <acpi/processor.h>
130 +#include <asm/hypercall.h>
131 +
132 +static int xen_cx_notifier(struct acpi_processor *pr, int action)
133 +{
134 + int ret, count = 0, i;
135 + xen_platform_op_t op = {
136 + .cmd = XENPF_set_processor_pminfo,
137 + .interface_version = XENPF_INTERFACE_VERSION,
138 + .u.set_pminfo.id = pr->acpi_id,
139 + .u.set_pminfo.type = XEN_PM_CX,
140 + };
141 + struct xen_processor_cx *data, *buf;
142 + struct acpi_processor_cx *cx;
143 +
144 + if (action == PROCESSOR_PM_CHANGE)
145 + return -EINVAL;
146 +
147 + /* Convert to Xen defined structure and hypercall */
148 + buf = kzalloc(pr->power.count * sizeof(struct xen_processor_cx),
149 + GFP_KERNEL);
150 + if (!buf)
151 + return -ENOMEM;
152 +
153 + data = buf;
154 + for (i = 1; i <= pr->power.count; i++) {
155 + cx = &pr->power.states[i];
156 + /* Skip invalid cstate entry */
157 + if (!cx->valid)
158 + continue;
159 +
160 + data->type = cx->type;
161 + data->latency = cx->latency;
162 + data->power = cx->power;
163 + data->reg.space_id = cx->reg.space_id;
164 + data->reg.bit_width = cx->reg.bit_width;
165 + data->reg.bit_offset = cx->reg.bit_offset;
166 + data->reg.access_size = cx->reg.reserved;
167 + data->reg.address = cx->reg.address;
168 +
169 + /* Get dependency relationships */
170 + if (cx->csd_count) {
171 + printk("Wow! _CSD is found. Not support for now!\n");
172 + kfree(buf);
173 + return -EINVAL;
174 + } else {
175 + data->dpcnt = 0;
176 + set_xen_guest_handle(data->dp, NULL);
177 + }
178 +
179 + data++;
180 + count++;
181 + }
182 +
183 + if (!count) {
184 + printk("No available Cx info for cpu %d\n", pr->acpi_id);
185 + kfree(buf);
186 + return -EINVAL;
187 + }
188 +
189 + op.u.set_pminfo.power.count = count;
190 + op.u.set_pminfo.power.flags.bm_control = pr->flags.bm_control;
191 + op.u.set_pminfo.power.flags.bm_check = pr->flags.bm_check;
192 + op.u.set_pminfo.power.flags.has_cst = pr->flags.has_cst;
193 + op.u.set_pminfo.power.flags.power_setup_done = pr->flags.power_setup_done;
194 +
195 + set_xen_guest_handle(op.u.set_pminfo.power.states, buf);
196 + ret = HYPERVISOR_platform_op(&op);
197 + kfree(buf);
198 + return ret;
199 +}
200 +
201 +static int xen_px_notifier(struct acpi_processor *pr, int action)
202 +{
203 + int ret = -EINVAL;
204 + xen_platform_op_t op = {
205 + .cmd = XENPF_set_processor_pminfo,
206 + .interface_version = XENPF_INTERFACE_VERSION,
207 + .u.set_pminfo.id = pr->acpi_id,
208 + .u.set_pminfo.type = XEN_PM_PX,
209 + };
210 + struct xen_processor_performance *perf;
211 + struct xen_processor_px *states = NULL;
212 + struct acpi_processor_performance *px;
213 + struct acpi_psd_package *pdomain;
214 +
215 + if (!pr)
216 + return -EINVAL;
217 +
218 + perf = &op.u.set_pminfo.perf;
219 + px = pr->performance;
220 +
221 + switch(action) {
222 + case PROCESSOR_PM_CHANGE:
223 + /* ppc dynamic handle */
224 + perf->flags = XEN_PX_PPC;
225 + perf->platform_limit = pr->performance_platform_limit;
226 +
227 + ret = HYPERVISOR_platform_op(&op);
228 + break;
229 +
230 + case PROCESSOR_PM_INIT:
231 + /* px normal init */
232 + perf->flags = XEN_PX_PPC |
233 + XEN_PX_PCT |
234 + XEN_PX_PSS |
235 + XEN_PX_PSD;
236 +
237 + /* ppc */
238 + perf->platform_limit = pr->performance_platform_limit;
239 +
240 + /* pct */
241 + xen_convert_pct_reg(&perf->control_register, &px->control_register);
242 + xen_convert_pct_reg(&perf->status_register, &px->status_register);
243 +
244 + /* pss */
245 + perf->state_count = px->state_count;
246 + states = kzalloc(px->state_count*sizeof(xen_processor_px_t),GFP_KERNEL);
247 + if (!states)
248 + return -ENOMEM;
249 + xen_convert_pss_states(states, px->states, px->state_count);
250 + set_xen_guest_handle(perf->states, states);
251 +
252 + /* psd */
253 + pdomain = &px->domain_info;
254 + xen_convert_psd_pack(&perf->domain_info, pdomain);
255 + if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL)
256 + perf->shared_type = CPUFREQ_SHARED_TYPE_ALL;
257 + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY)
258 + perf->shared_type = CPUFREQ_SHARED_TYPE_ANY;
259 + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL)
260 + perf->shared_type = CPUFREQ_SHARED_TYPE_HW;
261 + else {
262 + ret = -ENODEV;
263 + kfree(states);
264 + break;
265 + }
266 +
267 + ret = HYPERVISOR_platform_op(&op);
268 + kfree(states);
269 + break;
270 +
271 + default:
272 + break;
273 + }
274 +
275 + return ret;
276 +}
277 +
278 +static int xen_tx_notifier(struct acpi_processor *pr, int action)
279 +{
280 + return -EINVAL;
281 +}
282 +static int xen_hotplug_notifier(struct acpi_processor *pr, int event)
283 +{
284 + return -EINVAL;
285 +}
286 +
287 +static struct processor_extcntl_ops xen_extcntl_ops = {
288 + .hotplug = xen_hotplug_notifier,
289 +};
290 +
291 +void arch_acpi_processor_init_extcntl(const struct processor_extcntl_ops **ops)
292 +{
293 + unsigned int pmbits = (xen_start_info->flags & SIF_PM_MASK) >> 8;
294 +
295 + if (!pmbits)
296 + return;
297 + if (pmbits & XEN_PROCESSOR_PM_CX)
298 + xen_extcntl_ops.pm_ops[PM_TYPE_IDLE] = xen_cx_notifier;
299 + if (pmbits & XEN_PROCESSOR_PM_PX)
300 + xen_extcntl_ops.pm_ops[PM_TYPE_PERF] = xen_px_notifier;
301 + if (pmbits & XEN_PROCESSOR_PM_TX)
302 + xen_extcntl_ops.pm_ops[PM_TYPE_THR] = xen_tx_notifier;
303 +
304 + *ops = &xen_extcntl_ops;
305 +}
306 +EXPORT_SYMBOL(arch_acpi_processor_init_extcntl);
307 Index: head-2008-11-25/arch/x86/kernel/acpi/sleep_32-xen.c
308 ===================================================================
309 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
310 +++ head-2008-11-25/arch/x86/kernel/acpi/sleep_32-xen.c 2008-04-15 09:29:41.000000000 +0200
311 @@ -0,0 +1,113 @@
312 +/*
313 + * sleep.c - x86-specific ACPI sleep support.
314 + *
315 + * Copyright (C) 2001-2003 Patrick Mochel
316 + * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
317 + */
318 +
319 +#include <linux/acpi.h>
320 +#include <linux/bootmem.h>
321 +#include <linux/dmi.h>
322 +#include <linux/cpumask.h>
323 +
324 +#include <asm/smp.h>
325 +
326 +#ifndef CONFIG_ACPI_PV_SLEEP
327 +/* address in low memory of the wakeup routine. */
328 +unsigned long acpi_wakeup_address = 0;
329 +unsigned long acpi_video_flags;
330 +extern char wakeup_start, wakeup_end;
331 +
332 +extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
333 +#endif
334 +
335 +/**
336 + * acpi_save_state_mem - save kernel state
337 + *
338 + * Create an identity mapped page table and copy the wakeup routine to
339 + * low memory.
340 + */
341 +int acpi_save_state_mem(void)
342 +{
343 +#ifndef CONFIG_ACPI_PV_SLEEP
344 + if (!acpi_wakeup_address)
345 + return 1;
346 + memcpy((void *)acpi_wakeup_address, &wakeup_start,
347 + &wakeup_end - &wakeup_start);
348 + acpi_copy_wakeup_routine(acpi_wakeup_address);
349 +#endif
350 + return 0;
351 +}
352 +
353 +/*
354 + * acpi_restore_state - undo effects of acpi_save_state_mem
355 + */
356 +void acpi_restore_state_mem(void)
357 +{
358 +}
359 +
360 +/**
361 + * acpi_reserve_bootmem - do _very_ early ACPI initialisation
362 + *
363 + * We allocate a page from the first 1MB of memory for the wakeup
364 + * routine for when we come back from a sleep state. The
365 + * runtime allocator allows specification of <16MB pages, but not
366 + * <1MB pages.
367 + */
368 +void __init acpi_reserve_bootmem(void)
369 +{
370 +#ifndef CONFIG_ACPI_PV_SLEEP
371 + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
372 + printk(KERN_ERR
373 + "ACPI: Wakeup code way too big, S3 disabled.\n");
374 + return;
375 + }
376 +
377 + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
378 + if (!acpi_wakeup_address)
379 + printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
380 +#endif
381 +}
382 +
383 +#ifndef CONFIG_ACPI_PV_SLEEP
384 +static int __init acpi_sleep_setup(char *str)
385 +{
386 + while ((str != NULL) && (*str != '\0')) {
387 + if (strncmp(str, "s3_bios", 7) == 0)
388 + acpi_video_flags = 1;
389 + if (strncmp(str, "s3_mode", 7) == 0)
390 + acpi_video_flags |= 2;
391 + str = strchr(str, ',');
392 + if (str != NULL)
393 + str += strspn(str, ", \t");
394 + }
395 + return 1;
396 +}
397 +
398 +__setup("acpi_sleep=", acpi_sleep_setup);
399 +
400 +static __init int reset_videomode_after_s3(struct dmi_system_id *d)
401 +{
402 + acpi_video_flags |= 2;
403 + return 0;
404 +}
405 +
406 +static __initdata struct dmi_system_id acpisleep_dmi_table[] = {
407 + { /* Reset video mode after returning from ACPI S3 sleep */
408 + .callback = reset_videomode_after_s3,
409 + .ident = "Toshiba Satellite 4030cdt",
410 + .matches = {
411 + DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
412 + },
413 + },
414 + {}
415 +};
416 +
417 +static int __init acpisleep_dmi_init(void)
418 +{
419 + dmi_check_system(acpisleep_dmi_table);
420 + return 0;
421 +}
422 +
423 +core_initcall(acpisleep_dmi_init);
424 +#endif /* CONFIG_ACPI_PV_SLEEP */
425 Index: head-2008-11-25/arch/x86/kernel/apic_32-xen.c
426 ===================================================================
427 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
428 +++ head-2008-11-25/arch/x86/kernel/apic_32-xen.c 2007-06-12 13:12:48.000000000 +0200
429 @@ -0,0 +1,155 @@
430 +/*
431 + * Local APIC handling, local APIC timers
432 + *
433 + * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
434 + *
435 + * Fixes
436 + * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
437 + * thanks to Eric Gilmore
438 + * and Rolf G. Tews
439 + * for testing these extensively.
440 + * Maciej W. Rozycki : Various updates and fixes.
441 + * Mikael Pettersson : Power Management for UP-APIC.
442 + * Pavel Machek and
443 + * Mikael Pettersson : PM converted to driver model.
444 + */
445 +
446 +#include <linux/init.h>
447 +
448 +#include <linux/mm.h>
449 +#include <linux/delay.h>
450 +#include <linux/bootmem.h>
451 +#include <linux/smp_lock.h>
452 +#include <linux/interrupt.h>
453 +#include <linux/mc146818rtc.h>
454 +#include <linux/kernel_stat.h>
455 +#include <linux/sysdev.h>
456 +#include <linux/cpu.h>
457 +#include <linux/module.h>
458 +
459 +#include <asm/atomic.h>
460 +#include <asm/smp.h>
461 +#include <asm/mtrr.h>
462 +#include <asm/mpspec.h>
463 +#include <asm/desc.h>
464 +#include <asm/arch_hooks.h>
465 +#include <asm/hpet.h>
466 +#include <asm/i8253.h>
467 +#include <asm/nmi.h>
468 +
469 +#include <mach_apic.h>
470 +#include <mach_apicdef.h>
471 +#include <mach_ipi.h>
472 +
473 +#include "io_ports.h"
474 +
475 +#ifndef CONFIG_XEN
476 +/*
477 + * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
478 + * IPIs in place of local APIC timers
479 + */
480 +static cpumask_t timer_bcast_ipi;
481 +#endif
482 +
483 +/*
484 + * Knob to control our willingness to enable the local APIC.
485 + */
486 +int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
487 +
488 +/*
489 + * Debug level
490 + */
491 +int apic_verbosity;
492 +
493 +#ifndef CONFIG_XEN
494 +static int modern_apic(void)
495 +{
496 + unsigned int lvr, version;
497 + /* AMD systems use old APIC versions, so check the CPU */
498 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
499 + boot_cpu_data.x86 >= 0xf)
500 + return 1;
501 + lvr = apic_read(APIC_LVR);
502 + version = GET_APIC_VERSION(lvr);
503 + return version >= 0x14;
504 +}
505 +#endif /* !CONFIG_XEN */
506 +
507 +/*
508 + * 'what should we do if we get a hw irq event on an illegal vector'.
509 + * each architecture has to answer this themselves.
510 + */
511 +void ack_bad_irq(unsigned int irq)
512 +{
513 + printk("unexpected IRQ trap at vector %02x\n", irq);
514 + /*
515 + * Currently unexpected vectors happen only on SMP and APIC.
516 + * We _must_ ack these because every local APIC has only N
517 + * irq slots per priority level, and a 'hanging, unacked' IRQ
518 + * holds up an irq slot - in excessive cases (when multiple
519 + * unexpected vectors occur) that might lock up the APIC
520 + * completely.
521 + * But only ack when the APIC is enabled -AK
522 + */
523 + if (cpu_has_apic)
524 + ack_APIC_irq();
525 +}
526 +
527 +int get_physical_broadcast(void)
528 +{
529 + return 0xff;
530 +}
531 +
532 +#ifndef CONFIG_XEN
533 +#ifndef CONFIG_SMP
534 +static void up_apic_timer_interrupt_call(struct pt_regs *regs)
535 +{
536 + int cpu = smp_processor_id();
537 +
538 + /*
539 + * the NMI deadlock-detector uses this.
540 + */
541 + per_cpu(irq_stat, cpu).apic_timer_irqs++;
542 +
543 + smp_local_timer_interrupt(regs);
544 +}
545 +#endif
546 +
547 +void smp_send_timer_broadcast_ipi(struct pt_regs *regs)
548 +{
549 + cpumask_t mask;
550 +
551 + cpus_and(mask, cpu_online_map, timer_bcast_ipi);
552 + if (!cpus_empty(mask)) {
553 +#ifdef CONFIG_SMP
554 + send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
555 +#else
556 + /*
557 + * We can directly call the apic timer interrupt handler
558 + * in UP case. Minus all irq related functions
559 + */
560 + up_apic_timer_interrupt_call(regs);
561 +#endif
562 + }
563 +}
564 +#endif
565 +
566 +int setup_profiling_timer(unsigned int multiplier)
567 +{
568 + return -EINVAL;
569 +}
570 +
571 +/*
572 + * This initializes the IO-APIC and APIC hardware if this is
573 + * a UP kernel.
574 + */
575 +int __init APIC_init_uniprocessor (void)
576 +{
577 +#ifdef CONFIG_X86_IO_APIC
578 + if (smp_found_config)
579 + if (!skip_ioapic_setup && nr_ioapics)
580 + setup_IO_APIC();
581 +#endif
582 +
583 + return 0;
584 +}
585 Index: head-2008-11-25/arch/x86/kernel/cpu/common-xen.c
586 ===================================================================
587 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
588 +++ head-2008-11-25/arch/x86/kernel/cpu/common-xen.c 2007-12-10 08:47:31.000000000 +0100
589 @@ -0,0 +1,743 @@
590 +#include <linux/init.h>
591 +#include <linux/string.h>
592 +#include <linux/delay.h>
593 +#include <linux/smp.h>
594 +#include <linux/module.h>
595 +#include <linux/percpu.h>
596 +#include <linux/bootmem.h>
597 +#include <asm/semaphore.h>
598 +#include <asm/processor.h>
599 +#include <asm/i387.h>
600 +#include <asm/msr.h>
601 +#include <asm/io.h>
602 +#include <asm/mmu_context.h>
603 +#include <asm/mtrr.h>
604 +#include <asm/mce.h>
605 +#ifdef CONFIG_X86_LOCAL_APIC
606 +#include <asm/mpspec.h>
607 +#include <asm/apic.h>
608 +#include <mach_apic.h>
609 +#else
610 +#ifdef CONFIG_XEN
611 +#define phys_pkg_id(a,b) a
612 +#endif
613 +#endif
614 +#include <asm/hypervisor.h>
615 +
616 +#include "cpu.h"
617 +
618 +DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
619 +EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
620 +
621 +#ifndef CONFIG_XEN
622 +DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
623 +EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
624 +#endif
625 +
626 +static int cachesize_override __cpuinitdata = -1;
627 +static int disable_x86_fxsr __cpuinitdata;
628 +static int disable_x86_serial_nr __cpuinitdata = 1;
629 +static int disable_x86_sep __cpuinitdata;
630 +
631 +struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
632 +
633 +extern int disable_pse;
634 +
635 +static void default_init(struct cpuinfo_x86 * c)
636 +{
637 + /* Not much we can do here... */
638 + /* Check if at least it has cpuid */
639 + if (c->cpuid_level == -1) {
640 + /* No cpuid. It must be an ancient CPU */
641 + if (c->x86 == 4)
642 + strcpy(c->x86_model_id, "486");
643 + else if (c->x86 == 3)
644 + strcpy(c->x86_model_id, "386");
645 + }
646 +}
647 +
648 +static struct cpu_dev default_cpu = {
649 + .c_init = default_init,
650 + .c_vendor = "Unknown",
651 +};
652 +static struct cpu_dev * this_cpu = &default_cpu;
653 +
654 +static int __init cachesize_setup(char *str)
655 +{
656 + get_option (&str, &cachesize_override);
657 + return 1;
658 +}
659 +__setup("cachesize=", cachesize_setup);
660 +
661 +int __cpuinit get_model_name(struct cpuinfo_x86 *c)
662 +{
663 + unsigned int *v;
664 + char *p, *q;
665 +
666 + if (cpuid_eax(0x80000000) < 0x80000004)
667 + return 0;
668 +
669 + v = (unsigned int *) c->x86_model_id;
670 + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
671 + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
672 + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
673 + c->x86_model_id[48] = 0;
674 +
675 + /* Intel chips right-justify this string for some dumb reason;
676 + undo that brain damage */
677 + p = q = &c->x86_model_id[0];
678 + while ( *p == ' ' )
679 + p++;
680 + if ( p != q ) {
681 + while ( *p )
682 + *q++ = *p++;
683 + while ( q <= &c->x86_model_id[48] )
684 + *q++ = '\0'; /* Zero-pad the rest */
685 + }
686 +
687 + return 1;
688 +}
689 +
690 +
691 +void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
692 +{
693 + unsigned int n, dummy, ecx, edx, l2size;
694 +
695 + n = cpuid_eax(0x80000000);
696 +
697 + if (n >= 0x80000005) {
698 + cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
699 + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
700 + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
701 + c->x86_cache_size=(ecx>>24)+(edx>>24);
702 + }
703 +
704 + if (n < 0x80000006) /* Some chips just has a large L1. */
705 + return;
706 +
707 + ecx = cpuid_ecx(0x80000006);
708 + l2size = ecx >> 16;
709 +
710 + /* do processor-specific cache resizing */
711 + if (this_cpu->c_size_cache)
712 + l2size = this_cpu->c_size_cache(c,l2size);
713 +
714 + /* Allow user to override all this if necessary. */
715 + if (cachesize_override != -1)
716 + l2size = cachesize_override;
717 +
718 + if ( l2size == 0 )
719 + return; /* Again, no L2 cache is possible */
720 +
721 + c->x86_cache_size = l2size;
722 +
723 + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
724 + l2size, ecx & 0xFF);
725 +}
726 +
727 +/* Naming convention should be: <Name> [(<Codename>)] */
728 +/* This table only is used unless init_<vendor>() below doesn't set it; */
729 +/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
730 +
731 +/* Look up CPU names by table lookup. */
732 +static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
733 +{
734 + struct cpu_model_info *info;
735 +
736 + if ( c->x86_model >= 16 )
737 + return NULL; /* Range check */
738 +
739 + if (!this_cpu)
740 + return NULL;
741 +
742 + info = this_cpu->c_models;
743 +
744 + while (info && info->family) {
745 + if (info->family == c->x86)
746 + return info->model_names[c->x86_model];
747 + info++;
748 + }
749 + return NULL; /* Not found */
750 +}
751 +
752 +
753 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
754 +{
755 + char *v = c->x86_vendor_id;
756 + int i;
757 + static int printed;
758 +
759 + for (i = 0; i < X86_VENDOR_NUM; i++) {
760 + if (cpu_devs[i]) {
761 + if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
762 + (cpu_devs[i]->c_ident[1] &&
763 + !strcmp(v,cpu_devs[i]->c_ident[1]))) {
764 + c->x86_vendor = i;
765 + if (!early)
766 + this_cpu = cpu_devs[i];
767 + return;
768 + }
769 + }
770 + }
771 + if (!printed) {
772 + printed++;
773 + printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
774 + printk(KERN_ERR "CPU: Your system may be unstable.\n");
775 + }
776 + c->x86_vendor = X86_VENDOR_UNKNOWN;
777 + this_cpu = &default_cpu;
778 +}
779 +
780 +
781 +static int __init x86_fxsr_setup(char * s)
782 +{
783 + disable_x86_fxsr = 1;
784 + return 1;
785 +}
786 +__setup("nofxsr", x86_fxsr_setup);
787 +
788 +
789 +static int __init x86_sep_setup(char * s)
790 +{
791 + disable_x86_sep = 1;
792 + return 1;
793 +}
794 +__setup("nosep", x86_sep_setup);
795 +
796 +
797 +/* Standard macro to see if a specific flag is changeable */
798 +static inline int flag_is_changeable_p(u32 flag)
799 +{
800 + u32 f1, f2;
801 +
802 + asm("pushfl\n\t"
803 + "pushfl\n\t"
804 + "popl %0\n\t"
805 + "movl %0,%1\n\t"
806 + "xorl %2,%0\n\t"
807 + "pushl %0\n\t"
808 + "popfl\n\t"
809 + "pushfl\n\t"
810 + "popl %0\n\t"
811 + "popfl\n\t"
812 + : "=&r" (f1), "=&r" (f2)
813 + : "ir" (flag));
814 +
815 + return ((f1^f2) & flag) != 0;
816 +}
817 +
818 +
819 +/* Probe for the CPUID instruction */
820 +static int __cpuinit have_cpuid_p(void)
821 +{
822 + return flag_is_changeable_p(X86_EFLAGS_ID);
823 +}
824 +
825 +/* Do minimum CPU detection early.
826 + Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
827 + The others are not touched to avoid unwanted side effects.
828 +
829 + WARNING: this function is only called on the BP. Don't add code here
830 + that is supposed to run on all CPUs. */
831 +static void __init early_cpu_detect(void)
832 +{
833 + struct cpuinfo_x86 *c = &boot_cpu_data;
834 +
835 + c->x86_cache_alignment = 32;
836 +
837 + if (!have_cpuid_p())
838 + return;
839 +
840 + /* Get vendor name */
841 + cpuid(0x00000000, &c->cpuid_level,
842 + (int *)&c->x86_vendor_id[0],
843 + (int *)&c->x86_vendor_id[8],
844 + (int *)&c->x86_vendor_id[4]);
845 +
846 + get_cpu_vendor(c, 1);
847 +
848 + c->x86 = 4;
849 + if (c->cpuid_level >= 0x00000001) {
850 + u32 junk, tfms, cap0, misc;
851 + cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
852 + c->x86 = (tfms >> 8) & 15;
853 + c->x86_model = (tfms >> 4) & 15;
854 + if (c->x86 == 0xf)
855 + c->x86 += (tfms >> 20) & 0xff;
856 + if (c->x86 >= 0x6)
857 + c->x86_model += ((tfms >> 16) & 0xF) << 4;
858 + c->x86_mask = tfms & 15;
859 + if (cap0 & (1<<19))
860 + c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
861 + }
862 +}
863 +
864 +void __cpuinit generic_identify(struct cpuinfo_x86 * c)
865 +{
866 + u32 tfms, xlvl;
867 + int ebx;
868 +
869 + if (have_cpuid_p()) {
870 + /* Get vendor name */
871 + cpuid(0x00000000, &c->cpuid_level,
872 + (int *)&c->x86_vendor_id[0],
873 + (int *)&c->x86_vendor_id[8],
874 + (int *)&c->x86_vendor_id[4]);
875 +
876 + get_cpu_vendor(c, 0);
877 + /* Initialize the standard set of capabilities */
878 + /* Note that the vendor-specific code below might override */
879 +
880 + /* Intel-defined flags: level 0x00000001 */
881 + if ( c->cpuid_level >= 0x00000001 ) {
882 + u32 capability, excap;
883 + cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
884 + c->x86_capability[0] = capability;
885 + c->x86_capability[4] = excap;
886 + c->x86 = (tfms >> 8) & 15;
887 + c->x86_model = (tfms >> 4) & 15;
888 + if (c->x86 == 0xf)
889 + c->x86 += (tfms >> 20) & 0xff;
890 + if (c->x86 >= 0x6)
891 + c->x86_model += ((tfms >> 16) & 0xF) << 4;
892 + c->x86_mask = tfms & 15;
893 +#ifdef CONFIG_X86_HT
894 + c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
895 +#else
896 + c->apicid = (ebx >> 24) & 0xFF;
897 +#endif
898 + } else {
899 + /* Have CPUID level 0 only - unheard of */
900 + c->x86 = 4;
901 + }
902 +
903 + /* AMD-defined flags: level 0x80000001 */
904 + xlvl = cpuid_eax(0x80000000);
905 + if ( (xlvl & 0xffff0000) == 0x80000000 ) {
906 + if ( xlvl >= 0x80000001 ) {
907 + c->x86_capability[1] = cpuid_edx(0x80000001);
908 + c->x86_capability[6] = cpuid_ecx(0x80000001);
909 + }
910 + if ( xlvl >= 0x80000004 )
911 + get_model_name(c); /* Default name */
912 + }
913 + }
914 +
915 + early_intel_workaround(c);
916 +
917 +#ifdef CONFIG_X86_HT
918 + c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
919 +#endif
920 +}
921 +
922 +static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
923 +{
924 + if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
925 + /* Disable processor serial number */
926 + unsigned long lo,hi;
927 + rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
928 + lo |= 0x200000;
929 + wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
930 + printk(KERN_NOTICE "CPU serial number disabled.\n");
931 + clear_bit(X86_FEATURE_PN, c->x86_capability);
932 +
933 + /* Disabling the serial number may affect the cpuid level */
934 + c->cpuid_level = cpuid_eax(0);
935 + }
936 +}
937 +
938 +static int __init x86_serial_nr_setup(char *s)
939 +{
940 + disable_x86_serial_nr = 0;
941 + return 1;
942 +}
943 +__setup("serialnumber", x86_serial_nr_setup);
944 +
945 +
946 +
947 +/*
948 + * This does the hard work of actually picking apart the CPU stuff...
949 + */
950 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
951 +{
952 + int i;
953 +
954 + c->loops_per_jiffy = loops_per_jiffy;
955 + c->x86_cache_size = -1;
956 + c->x86_vendor = X86_VENDOR_UNKNOWN;
957 + c->cpuid_level = -1; /* CPUID not detected */
958 + c->x86_model = c->x86_mask = 0; /* So far unknown... */
959 + c->x86_vendor_id[0] = '\0'; /* Unset */
960 + c->x86_model_id[0] = '\0'; /* Unset */
961 + c->x86_max_cores = 1;
962 + memset(&c->x86_capability, 0, sizeof c->x86_capability);
963 +
964 + if (!have_cpuid_p()) {
965 + /* First of all, decide if this is a 486 or higher */
966 + /* It's a 486 if we can modify the AC flag */
967 + if ( flag_is_changeable_p(X86_EFLAGS_AC) )
968 + c->x86 = 4;
969 + else
970 + c->x86 = 3;
971 + }
972 +
973 + generic_identify(c);
974 +
975 + printk(KERN_DEBUG "CPU: After generic identify, caps:");
976 + for (i = 0; i < NCAPINTS; i++)
977 + printk(" %08lx", c->x86_capability[i]);
978 + printk("\n");
979 +
980 + if (this_cpu->c_identify) {
981 + this_cpu->c_identify(c);
982 +
983 + printk(KERN_DEBUG "CPU: After vendor identify, caps:");
984 + for (i = 0; i < NCAPINTS; i++)
985 + printk(" %08lx", c->x86_capability[i]);
986 + printk("\n");
987 + }
988 +
989 + /*
990 + * Vendor-specific initialization. In this section we
991 + * canonicalize the feature flags, meaning if there are
992 + * features a certain CPU supports which CPUID doesn't
993 + * tell us, CPUID claiming incorrect flags, or other bugs,
994 + * we handle them here.
995 + *
996 + * At the end of this section, c->x86_capability better
997 + * indicate the features this CPU genuinely supports!
998 + */
999 + if (this_cpu->c_init)
1000 + this_cpu->c_init(c);
1001 +
1002 + /* Disable the PN if appropriate */
1003 + squash_the_stupid_serial_number(c);
1004 +
1005 + /*
1006 + * The vendor-specific functions might have changed features. Now
1007 + * we do "generic changes."
1008 + */
1009 +
1010 + /* TSC disabled? */
1011 + if ( tsc_disable )
1012 + clear_bit(X86_FEATURE_TSC, c->x86_capability);
1013 +
1014 + /* FXSR disabled? */
1015 + if (disable_x86_fxsr) {
1016 + clear_bit(X86_FEATURE_FXSR, c->x86_capability);
1017 + clear_bit(X86_FEATURE_XMM, c->x86_capability);
1018 + }
1019 +
1020 + /* SEP disabled? */
1021 + if (disable_x86_sep)
1022 + clear_bit(X86_FEATURE_SEP, c->x86_capability);
1023 +
1024 + if (disable_pse)
1025 + clear_bit(X86_FEATURE_PSE, c->x86_capability);
1026 +
1027 + /* If the model name is still unset, do table lookup. */
1028 + if ( !c->x86_model_id[0] ) {
1029 + char *p;
1030 + p = table_lookup_model(c);
1031 + if ( p )
1032 + strcpy(c->x86_model_id, p);
1033 + else
1034 + /* Last resort... */
1035 + sprintf(c->x86_model_id, "%02x/%02x",
1036 + c->x86, c->x86_model);
1037 + }
1038 +
1039 + /* Now the feature flags better reflect actual CPU features! */
1040 +
1041 + printk(KERN_DEBUG "CPU: After all inits, caps:");
1042 + for (i = 0; i < NCAPINTS; i++)
1043 + printk(" %08lx", c->x86_capability[i]);
1044 + printk("\n");
1045 +
1046 + /*
1047 + * On SMP, boot_cpu_data holds the common feature set between
1048 + * all CPUs; so make sure that we indicate which features are
1049 + * common between the CPUs. The first time this routine gets
1050 + * executed, c == &boot_cpu_data.
1051 + */
1052 + if ( c != &boot_cpu_data ) {
1053 + /* AND the already accumulated flags with these */
1054 + for ( i = 0 ; i < NCAPINTS ; i++ )
1055 + boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
1056 + }
1057 +
1058 + /* Init Machine Check Exception if available. */
1059 + mcheck_init(c);
1060 +
1061 + if (c == &boot_cpu_data)
1062 + sysenter_setup();
1063 + enable_sep_cpu();
1064 +
1065 + if (c == &boot_cpu_data)
1066 + mtrr_bp_init();
1067 + else
1068 + mtrr_ap_init();
1069 +}
1070 +
1071 +#ifdef CONFIG_X86_HT
1072 +void __cpuinit detect_ht(struct cpuinfo_x86 *c)
1073 +{
1074 + u32 eax, ebx, ecx, edx;
1075 + int index_msb, core_bits;
1076 +
1077 + cpuid(1, &eax, &ebx, &ecx, &edx);
1078 +
1079 + if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
1080 + return;
1081 +
1082 + smp_num_siblings = (ebx & 0xff0000) >> 16;
1083 +
1084 + if (smp_num_siblings == 1) {
1085 + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
1086 + } else if (smp_num_siblings > 1 ) {
1087 +
1088 + if (smp_num_siblings > NR_CPUS) {
1089 + printk(KERN_WARNING "CPU: Unsupported number of the "
1090 + "siblings %d", smp_num_siblings);
1091 + smp_num_siblings = 1;
1092 + return;
1093 + }
1094 +
1095 + index_msb = get_count_order(smp_num_siblings);
1096 + c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
1097 +
1098 + printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
1099 + c->phys_proc_id);
1100 +
1101 + smp_num_siblings = smp_num_siblings / c->x86_max_cores;
1102 +
1103 + index_msb = get_count_order(smp_num_siblings) ;
1104 +
1105 + core_bits = get_count_order(c->x86_max_cores);
1106 +
1107 + c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
1108 + ((1 << core_bits) - 1);
1109 +
1110 + if (c->x86_max_cores > 1)
1111 + printk(KERN_INFO "CPU: Processor Core ID: %d\n",
1112 + c->cpu_core_id);
1113 + }
1114 +}
1115 +#endif
1116 +
1117 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1118 +{
1119 + char *vendor = NULL;
1120 +
1121 + if (c->x86_vendor < X86_VENDOR_NUM)
1122 + vendor = this_cpu->c_vendor;
1123 + else if (c->cpuid_level >= 0)
1124 + vendor = c->x86_vendor_id;
1125 +
1126 + if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
1127 + printk("%s ", vendor);
1128 +
1129 + if (!c->x86_model_id[0])
1130 + printk("%d86", c->x86);
1131 + else
1132 + printk("%s", c->x86_model_id);
1133 +
1134 + if (c->x86_mask || c->cpuid_level >= 0)
1135 + printk(" stepping %02x\n", c->x86_mask);
1136 + else
1137 + printk("\n");
1138 +}
1139 +
1140 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
1141 +
1142 +/* This is hacky. :)
1143 + * We're emulating future behavior.
1144 + * In the future, the cpu-specific init functions will be called implicitly
1145 + * via the magic of initcalls.
1146 + * They will insert themselves into the cpu_devs structure.
1147 + * Then, when cpu_init() is called, we can just iterate over that array.
1148 + */
1149 +
1150 +extern int intel_cpu_init(void);
1151 +extern int cyrix_init_cpu(void);
1152 +extern int nsc_init_cpu(void);
1153 +extern int amd_init_cpu(void);
1154 +extern int centaur_init_cpu(void);
1155 +extern int transmeta_init_cpu(void);
1156 +extern int rise_init_cpu(void);
1157 +extern int nexgen_init_cpu(void);
1158 +extern int umc_init_cpu(void);
1159 +
1160 +void __init early_cpu_init(void)
1161 +{
1162 + intel_cpu_init();
1163 + cyrix_init_cpu();
1164 + nsc_init_cpu();
1165 + amd_init_cpu();
1166 + centaur_init_cpu();
1167 + transmeta_init_cpu();
1168 + rise_init_cpu();
1169 + nexgen_init_cpu();
1170 + umc_init_cpu();
1171 + early_cpu_detect();
1172 +
1173 +#ifdef CONFIG_DEBUG_PAGEALLOC
1174 + /* pse is not compatible with on-the-fly unmapping,
1175 + * disable it even if the cpus claim to support it.
1176 + */
1177 + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
1178 + disable_pse = 1;
1179 +#endif
1180 +}
1181 +
1182 +static void __cpuinit cpu_gdt_init(const struct Xgt_desc_struct *gdt_descr)
1183 +{
1184 + unsigned long frames[16];
1185 + unsigned long va;
1186 + int f;
1187 +
1188 + for (va = gdt_descr->address, f = 0;
1189 + va < gdt_descr->address + gdt_descr->size;
1190 + va += PAGE_SIZE, f++) {
1191 + frames[f] = virt_to_mfn(va);
1192 + make_lowmem_page_readonly(
1193 + (void *)va, XENFEAT_writable_descriptor_tables);
1194 + }
1195 + if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) / 8))
1196 + BUG();
1197 +}
1198 +
1199 +/*
1200 + * cpu_init() initializes state that is per-CPU. Some data is already
1201 + * initialized (naturally) in the bootstrap process, such as the GDT
1202 + * and IDT. We reload them nevertheless, this function acts as a
1203 + * 'CPU state barrier', nothing should get across.
1204 + */
1205 +void __cpuinit cpu_init(void)
1206 +{
1207 + int cpu = smp_processor_id();
1208 +#ifndef CONFIG_X86_NO_TSS
1209 + struct tss_struct * t = &per_cpu(init_tss, cpu);
1210 +#endif
1211 + struct thread_struct *thread = &current->thread;
1212 + struct desc_struct *gdt;
1213 + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
1214 +
1215 + if (cpu_test_and_set(cpu, cpu_initialized)) {
1216 + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
1217 + for (;;) local_irq_enable();
1218 + }
1219 + printk(KERN_INFO "Initializing CPU#%d\n", cpu);
1220 +
1221 + if (cpu_has_vme || cpu_has_de)
1222 + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1223 + if (tsc_disable && cpu_has_tsc) {
1224 + printk(KERN_NOTICE "Disabling TSC...\n");
1225 + /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
1226 + clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
1227 + set_in_cr4(X86_CR4_TSD);
1228 + }
1229 +
1230 +#ifndef CONFIG_XEN
1231 + /* The CPU hotplug case */
1232 + if (cpu_gdt_descr->address) {
1233 + gdt = (struct desc_struct *)cpu_gdt_descr->address;
1234 + memset(gdt, 0, PAGE_SIZE);
1235 + goto old_gdt;
1236 + }
1237 + /*
1238 + * This is a horrible hack to allocate the GDT. The problem
1239 + * is that cpu_init() is called really early for the boot CPU
1240 + * (and hence needs bootmem) but much later for the secondary
1241 + * CPUs, when bootmem will have gone away
1242 + */
1243 + if (NODE_DATA(0)->bdata->node_bootmem_map) {
1244 + gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
1245 + /* alloc_bootmem_pages panics on failure, so no check */
1246 + memset(gdt, 0, PAGE_SIZE);
1247 + } else {
1248 + gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
1249 + if (unlikely(!gdt)) {
1250 + printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
1251 + for (;;)
1252 + local_irq_enable();
1253 + }
1254 + }
1255 +old_gdt:
1256 + /*
1257 + * Initialize the per-CPU GDT with the boot GDT,
1258 + * and set up the GDT descriptor:
1259 + */
1260 + memcpy(gdt, cpu_gdt_table, GDT_SIZE);
1261 +
1262 + /* Set up GDT entry for 16bit stack */
1263 + *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
1264 + ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
1265 + ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
1266 + (CPU_16BIT_STACK_SIZE - 1);
1267 +
1268 + cpu_gdt_descr->size = GDT_SIZE - 1;
1269 + cpu_gdt_descr->address = (unsigned long)gdt;
1270 +#else
1271 + if (cpu == 0 && cpu_gdt_descr->address == 0) {
1272 + gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
1273 + /* alloc_bootmem_pages panics on failure, so no check */
1274 + memset(gdt, 0, PAGE_SIZE);
1275 +
1276 + memcpy(gdt, cpu_gdt_table, GDT_SIZE);
1277 +
1278 + cpu_gdt_descr->size = GDT_SIZE;
1279 + cpu_gdt_descr->address = (unsigned long)gdt;
1280 + }
1281 +#endif
1282 +
1283 + cpu_gdt_init(cpu_gdt_descr);
1284 +
1285 + /*
1286 + * Set up and load the per-CPU TSS and LDT
1287 + */
1288 + atomic_inc(&init_mm.mm_count);
1289 + current->active_mm = &init_mm;
1290 + if (current->mm)
1291 + BUG();
1292 + enter_lazy_tlb(&init_mm, current);
1293 +
1294 + load_esp0(t, thread);
1295 +
1296 + load_LDT(&init_mm.context);
1297 +
1298 +#ifdef CONFIG_DOUBLEFAULT
1299 + /* Set up doublefault TSS pointer in the GDT */
1300 + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
1301 +#endif
1302 +
1303 + /* Clear %fs and %gs. */
1304 + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
1305 +
1306 + /* Clear all 6 debug registers: */
1307 + set_debugreg(0, 0);
1308 + set_debugreg(0, 1);
1309 + set_debugreg(0, 2);
1310 + set_debugreg(0, 3);
1311 + set_debugreg(0, 6);
1312 + set_debugreg(0, 7);
1313 +
1314 + /*
1315 + * Force FPU initialization:
1316 + */
1317 + current_thread_info()->status = 0;
1318 + clear_used_math();
1319 + mxcsr_feature_mask_init();
1320 +}
1321 +
1322 +#ifdef CONFIG_HOTPLUG_CPU
1323 +void __cpuinit cpu_uninit(void)
1324 +{
1325 + int cpu = raw_smp_processor_id();
1326 + cpu_clear(cpu, cpu_initialized);
1327 +
1328 + /* lazy TLB state */
1329 + per_cpu(cpu_tlbstate, cpu).state = 0;
1330 + per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
1331 +}
1332 +#endif
1333 Index: head-2008-11-25/arch/x86/kernel/cpu/mtrr/main-xen.c
1334 ===================================================================
1335 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
1336 +++ head-2008-11-25/arch/x86/kernel/cpu/mtrr/main-xen.c 2008-01-28 12:24:18.000000000 +0100
1337 @@ -0,0 +1,198 @@
1338 +#include <linux/init.h>
1339 +#include <linux/proc_fs.h>
1340 +#include <linux/ctype.h>
1341 +#include <linux/module.h>
1342 +#include <linux/seq_file.h>
1343 +#include <asm/uaccess.h>
1344 +#include <linux/mutex.h>
1345 +
1346 +#include <asm/mtrr.h>
1347 +#include "mtrr.h"
1348 +
1349 +static DEFINE_MUTEX(mtrr_mutex);
1350 +
1351 +void generic_get_mtrr(unsigned int reg, unsigned long *base,
1352 + unsigned int *size, mtrr_type * type)
1353 +{
1354 + struct xen_platform_op op;
1355 +
1356 + op.cmd = XENPF_read_memtype;
1357 + op.u.read_memtype.reg = reg;
1358 + if (unlikely(HYPERVISOR_platform_op(&op)))
1359 + memset(&op.u.read_memtype, 0, sizeof(op.u.read_memtype));
1360 +
1361 + *size = op.u.read_memtype.nr_mfns;
1362 + *base = op.u.read_memtype.mfn;
1363 + *type = op.u.read_memtype.type;
1364 +}
1365 +
1366 +struct mtrr_ops generic_mtrr_ops = {
1367 + .use_intel_if = 1,
1368 + .get = generic_get_mtrr,
1369 +};
1370 +
1371 +struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
1372 +unsigned int num_var_ranges;
1373 +unsigned int *usage_table;
1374 +
1375 +static void __init set_num_var_ranges(void)
1376 +{
1377 + struct xen_platform_op op;
1378 +
1379 + for (num_var_ranges = 0; ; num_var_ranges++) {
1380 + op.cmd = XENPF_read_memtype;
1381 + op.u.read_memtype.reg = num_var_ranges;
1382 + if (HYPERVISOR_platform_op(&op) != 0)
1383 + break;
1384 + }
1385 +}
1386 +
1387 +static void __init init_table(void)
1388 +{
1389 + int i, max;
1390 +
1391 + max = num_var_ranges;
1392 + if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
1393 + == NULL) {
1394 + printk(KERN_ERR "mtrr: could not allocate\n");
1395 + return;
1396 + }
1397 + for (i = 0; i < max; i++)
1398 + usage_table[i] = 0;
1399 +}
1400 +
1401 +int mtrr_add_page(unsigned long base, unsigned long size,
1402 + unsigned int type, char increment)
1403 +{
1404 + int error;
1405 + struct xen_platform_op op;
1406 +
1407 + mutex_lock(&mtrr_mutex);
1408 +
1409 + op.cmd = XENPF_add_memtype;
1410 + op.u.add_memtype.mfn = base;
1411 + op.u.add_memtype.nr_mfns = size;
1412 + op.u.add_memtype.type = type;
1413 + error = HYPERVISOR_platform_op(&op);
1414 + if (error) {
1415 + mutex_unlock(&mtrr_mutex);
1416 + BUG_ON(error > 0);
1417 + return error;
1418 + }
1419 +
1420 + if (increment)
1421 + ++usage_table[op.u.add_memtype.reg];
1422 +
1423 + mutex_unlock(&mtrr_mutex);
1424 +
1425 + return op.u.add_memtype.reg;
1426 +}
1427 +
1428 +static int mtrr_check(unsigned long base, unsigned long size)
1429 +{
1430 + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
1431 + printk(KERN_WARNING
1432 + "mtrr: size and base must be multiples of 4 kiB\n");
1433 + printk(KERN_DEBUG
1434 + "mtrr: size: 0x%lx base: 0x%lx\n", size, base);
1435 + dump_stack();
1436 + return -1;
1437 + }
1438 + return 0;
1439 +}
1440 +
1441 +int
1442 +mtrr_add(unsigned long base, unsigned long size, unsigned int type,
1443 + char increment)
1444 +{
1445 + if (mtrr_check(base, size))
1446 + return -EINVAL;
1447 + return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
1448 + increment);
1449 +}
1450 +
1451 +int mtrr_del_page(int reg, unsigned long base, unsigned long size)
1452 +{
1453 + unsigned i;
1454 + mtrr_type ltype;
1455 + unsigned long lbase;
1456 + unsigned int lsize;
1457 + int error = -EINVAL;
1458 + struct xen_platform_op op;
1459 +
1460 + mutex_lock(&mtrr_mutex);
1461 +
1462 + if (reg < 0) {
1463 + /* Search for existing MTRR */
1464 + for (i = 0; i < num_var_ranges; ++i) {
1465 + mtrr_if->get(i, &lbase, &lsize, &ltype);
1466 + if (lbase == base && lsize == size) {
1467 + reg = i;
1468 + break;
1469 + }
1470 + }
1471 + if (reg < 0) {
1472 + printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
1473 + size);
1474 + goto out;
1475 + }
1476 + }
1477 + if (usage_table[reg] < 1) {
1478 + printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
1479 + goto out;
1480 + }
1481 + if (--usage_table[reg] < 1) {
1482 + op.cmd = XENPF_del_memtype;
1483 + op.u.del_memtype.handle = 0;
1484 + op.u.del_memtype.reg = reg;
1485 + error = HYPERVISOR_platform_op(&op);
1486 + if (error) {
1487 + BUG_ON(error > 0);
1488 + goto out;
1489 + }
1490 + }
1491 + error = reg;
1492 + out:
1493 + mutex_unlock(&mtrr_mutex);
1494 + return error;
1495 +}
1496 +
1497 +int
1498 +mtrr_del(int reg, unsigned long base, unsigned long size)
1499 +{
1500 + if (mtrr_check(base, size))
1501 + return -EINVAL;
1502 + return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
1503 +}
1504 +
1505 +EXPORT_SYMBOL(mtrr_add);
1506 +EXPORT_SYMBOL(mtrr_del);
1507 +
1508 +void __init mtrr_bp_init(void)
1509 +{
1510 +}
1511 +
1512 +void mtrr_ap_init(void)
1513 +{
1514 +}
1515 +
1516 +static int __init mtrr_init(void)
1517 +{
1518 + struct cpuinfo_x86 *c = &boot_cpu_data;
1519 +
1520 + if (!is_initial_xendomain())
1521 + return -ENODEV;
1522 +
1523 + if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
1524 + (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
1525 + (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
1526 + (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
1527 + return -ENODEV;
1528 +
1529 + set_num_var_ranges();
1530 + init_table();
1531 +
1532 + return 0;
1533 +}
1534 +
1535 +subsys_initcall(mtrr_init);
1536 Index: head-2008-11-25/arch/x86/kernel/entry_32-xen.S
1537 ===================================================================
1538 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
1539 +++ head-2008-11-25/arch/x86/kernel/entry_32-xen.S 2007-12-10 08:47:31.000000000 +0100
1540 @@ -0,0 +1,1238 @@
1541 +/*
1542 + * linux/arch/i386/entry.S
1543 + *
1544 + * Copyright (C) 1991, 1992 Linus Torvalds
1545 + */
1546 +
1547 +/*
1548 + * entry.S contains the system-call and fault low-level handling routines.
1549 + * This also contains the timer-interrupt handler, as well as all interrupts
1550 + * and faults that can result in a task-switch.
1551 + *
1552 + * NOTE: This code handles signal-recognition, which happens every time
1553 + * after a timer-interrupt and after each system call.
1554 + *
1555 + * I changed all the .align's to 4 (16 byte alignment), as that's faster
1556 + * on a 486.
1557 + *
1558 + * Stack layout in 'ret_from_system_call':
1559 + * ptrace needs to have all regs on the stack.
1560 + * if the order here is changed, it needs to be
1561 + * updated in fork.c:copy_process, signal.c:do_signal,
1562 + * ptrace.c and ptrace.h
1563 + *
1564 + * 0(%esp) - %ebx
1565 + * 4(%esp) - %ecx
1566 + * 8(%esp) - %edx
1567 + * C(%esp) - %esi
1568 + * 10(%esp) - %edi
1569 + * 14(%esp) - %ebp
1570 + * 18(%esp) - %eax
1571 + * 1C(%esp) - %ds
1572 + * 20(%esp) - %es
1573 + * 24(%esp) - orig_eax
1574 + * 28(%esp) - %eip
1575 + * 2C(%esp) - %cs
1576 + * 30(%esp) - %eflags
1577 + * 34(%esp) - %oldesp
1578 + * 38(%esp) - %oldss
1579 + *
1580 + * "current" is in register %ebx during any slow entries.
1581 + */
1582 +
1583 +#include <linux/linkage.h>
1584 +#include <asm/thread_info.h>
1585 +#include <asm/irqflags.h>
1586 +#include <asm/errno.h>
1587 +#include <asm/segment.h>
1588 +#include <asm/smp.h>
1589 +#include <asm/page.h>
1590 +#include <asm/desc.h>
1591 +#include <asm/dwarf2.h>
1592 +#include "irq_vectors.h"
1593 +#include <xen/interface/xen.h>
1594 +
1595 +#define nr_syscalls ((syscall_table_size)/4)
1596 +
1597 +EBX = 0x00
1598 +ECX = 0x04
1599 +EDX = 0x08
1600 +ESI = 0x0C
1601 +EDI = 0x10
1602 +EBP = 0x14
1603 +EAX = 0x18
1604 +DS = 0x1C
1605 +ES = 0x20
1606 +ORIG_EAX = 0x24
1607 +EIP = 0x28
1608 +CS = 0x2C
1609 +EFLAGS = 0x30
1610 +OLDESP = 0x34
1611 +OLDSS = 0x38
1612 +
1613 +CF_MASK = 0x00000001
1614 +TF_MASK = 0x00000100
1615 +IF_MASK = 0x00000200
1616 +DF_MASK = 0x00000400
1617 +NT_MASK = 0x00004000
1618 +VM_MASK = 0x00020000
1619 +/* Pseudo-eflags. */
1620 +NMI_MASK = 0x80000000
1621 +
1622 +#ifndef CONFIG_XEN
1623 +#define DISABLE_INTERRUPTS cli
1624 +#define ENABLE_INTERRUPTS sti
1625 +#else
1626 +/* Offsets into shared_info_t. */
1627 +#define evtchn_upcall_pending /* 0 */
1628 +#define evtchn_upcall_mask 1
1629 +
1630 +#define sizeof_vcpu_shift 6
1631 +
1632 +#ifdef CONFIG_SMP
1633 +#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
1634 + shl $sizeof_vcpu_shift,%esi ; \
1635 + addl HYPERVISOR_shared_info,%esi
1636 +#else
1637 +#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
1638 +#endif
1639 +
1640 +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
1641 +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
1642 +#define DISABLE_INTERRUPTS GET_VCPU_INFO ; \
1643 + __DISABLE_INTERRUPTS
1644 +#define ENABLE_INTERRUPTS GET_VCPU_INFO ; \
1645 + __ENABLE_INTERRUPTS
1646 +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
1647 +#endif
1648 +
1649 +#ifdef CONFIG_PREEMPT
1650 +#define preempt_stop cli; TRACE_IRQS_OFF
1651 +#else
1652 +#define preempt_stop
1653 +#define resume_kernel restore_nocheck
1654 +#endif
1655 +
1656 +.macro TRACE_IRQS_IRET
1657 +#ifdef CONFIG_TRACE_IRQFLAGS
1658 + testl $IF_MASK,EFLAGS(%esp) # interrupts off?
1659 + jz 1f
1660 + TRACE_IRQS_ON
1661 +1:
1662 +#endif
1663 +.endm
1664 +
1665 +#ifdef CONFIG_VM86
1666 +#define resume_userspace_sig check_userspace
1667 +#else
1668 +#define resume_userspace_sig resume_userspace
1669 +#endif
1670 +
1671 +#define SAVE_ALL \
1672 + cld; \
1673 + pushl %es; \
1674 + CFI_ADJUST_CFA_OFFSET 4;\
1675 + /*CFI_REL_OFFSET es, 0;*/\
1676 + pushl %ds; \
1677 + CFI_ADJUST_CFA_OFFSET 4;\
1678 + /*CFI_REL_OFFSET ds, 0;*/\
1679 + pushl %eax; \
1680 + CFI_ADJUST_CFA_OFFSET 4;\
1681 + CFI_REL_OFFSET eax, 0;\
1682 + pushl %ebp; \
1683 + CFI_ADJUST_CFA_OFFSET 4;\
1684 + CFI_REL_OFFSET ebp, 0;\
1685 + pushl %edi; \
1686 + CFI_ADJUST_CFA_OFFSET 4;\
1687 + CFI_REL_OFFSET edi, 0;\
1688 + pushl %esi; \
1689 + CFI_ADJUST_CFA_OFFSET 4;\
1690 + CFI_REL_OFFSET esi, 0;\
1691 + pushl %edx; \
1692 + CFI_ADJUST_CFA_OFFSET 4;\
1693 + CFI_REL_OFFSET edx, 0;\
1694 + pushl %ecx; \
1695 + CFI_ADJUST_CFA_OFFSET 4;\
1696 + CFI_REL_OFFSET ecx, 0;\
1697 + pushl %ebx; \
1698 + CFI_ADJUST_CFA_OFFSET 4;\
1699 + CFI_REL_OFFSET ebx, 0;\
1700 + movl $(__USER_DS), %edx; \
1701 + movl %edx, %ds; \
1702 + movl %edx, %es;
1703 +
1704 +#define RESTORE_INT_REGS \
1705 + popl %ebx; \
1706 + CFI_ADJUST_CFA_OFFSET -4;\
1707 + CFI_RESTORE ebx;\
1708 + popl %ecx; \
1709 + CFI_ADJUST_CFA_OFFSET -4;\
1710 + CFI_RESTORE ecx;\
1711 + popl %edx; \
1712 + CFI_ADJUST_CFA_OFFSET -4;\
1713 + CFI_RESTORE edx;\
1714 + popl %esi; \
1715 + CFI_ADJUST_CFA_OFFSET -4;\
1716 + CFI_RESTORE esi;\
1717 + popl %edi; \
1718 + CFI_ADJUST_CFA_OFFSET -4;\
1719 + CFI_RESTORE edi;\
1720 + popl %ebp; \
1721 + CFI_ADJUST_CFA_OFFSET -4;\
1722 + CFI_RESTORE ebp;\
1723 + popl %eax; \
1724 + CFI_ADJUST_CFA_OFFSET -4;\
1725 + CFI_RESTORE eax
1726 +
1727 +#define RESTORE_REGS \
1728 + RESTORE_INT_REGS; \
1729 +1: popl %ds; \
1730 + CFI_ADJUST_CFA_OFFSET -4;\
1731 + /*CFI_RESTORE ds;*/\
1732 +2: popl %es; \
1733 + CFI_ADJUST_CFA_OFFSET -4;\
1734 + /*CFI_RESTORE es;*/\
1735 +.section .fixup,"ax"; \
1736 +3: movl $0,(%esp); \
1737 + jmp 1b; \
1738 +4: movl $0,(%esp); \
1739 + jmp 2b; \
1740 +.previous; \
1741 +.section __ex_table,"a";\
1742 + .align 4; \
1743 + .long 1b,3b; \
1744 + .long 2b,4b; \
1745 +.previous
1746 +
1747 +#define RING0_INT_FRAME \
1748 + CFI_STARTPROC simple;\
1749 + CFI_DEF_CFA esp, 3*4;\
1750 + /*CFI_OFFSET cs, -2*4;*/\
1751 + CFI_OFFSET eip, -3*4
1752 +
1753 +#define RING0_EC_FRAME \
1754 + CFI_STARTPROC simple;\
1755 + CFI_DEF_CFA esp, 4*4;\
1756 + /*CFI_OFFSET cs, -2*4;*/\
1757 + CFI_OFFSET eip, -3*4
1758 +
1759 +#define RING0_PTREGS_FRAME \
1760 + CFI_STARTPROC simple;\
1761 + CFI_DEF_CFA esp, OLDESP-EBX;\
1762 + /*CFI_OFFSET cs, CS-OLDESP;*/\
1763 + CFI_OFFSET eip, EIP-OLDESP;\
1764 + /*CFI_OFFSET es, ES-OLDESP;*/\
1765 + /*CFI_OFFSET ds, DS-OLDESP;*/\
1766 + CFI_OFFSET eax, EAX-OLDESP;\
1767 + CFI_OFFSET ebp, EBP-OLDESP;\
1768 + CFI_OFFSET edi, EDI-OLDESP;\
1769 + CFI_OFFSET esi, ESI-OLDESP;\
1770 + CFI_OFFSET edx, EDX-OLDESP;\
1771 + CFI_OFFSET ecx, ECX-OLDESP;\
1772 + CFI_OFFSET ebx, EBX-OLDESP
1773 +
1774 +ENTRY(ret_from_fork)
1775 + CFI_STARTPROC
1776 + pushl %eax
1777 + CFI_ADJUST_CFA_OFFSET 4
1778 + call schedule_tail
1779 + GET_THREAD_INFO(%ebp)
1780 + popl %eax
1781 + CFI_ADJUST_CFA_OFFSET -4
1782 + pushl $0x0202 # Reset kernel eflags
1783 + CFI_ADJUST_CFA_OFFSET 4
1784 + popfl
1785 + CFI_ADJUST_CFA_OFFSET -4
1786 + jmp syscall_exit
1787 + CFI_ENDPROC
1788 +
1789 +/*
1790 + * Return to user mode is not as complex as all this looks,
1791 + * but we want the default path for a system call return to
1792 + * go as quickly as possible which is why some of this is
1793 + * less clear than it otherwise should be.
1794 + */
1795 +
1796 + # userspace resumption stub bypassing syscall exit tracing
1797 + ALIGN
1798 + RING0_PTREGS_FRAME
1799 +ret_from_exception:
1800 + preempt_stop
1801 +ret_from_intr:
1802 + GET_THREAD_INFO(%ebp)
1803 +check_userspace:
1804 + movl EFLAGS(%esp), %eax # mix EFLAGS and CS
1805 + movb CS(%esp), %al
1806 + testl $(VM_MASK | 2), %eax
1807 + jz resume_kernel
1808 +ENTRY(resume_userspace)
1809 + DISABLE_INTERRUPTS # make sure we don't miss an interrupt
1810 + # setting need_resched or sigpending
1811 + # between sampling and the iret
1812 + movl TI_flags(%ebp), %ecx
1813 + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
1814 + # int/exception return?
1815 + jne work_pending
1816 + jmp restore_all
1817 +
1818 +#ifdef CONFIG_PREEMPT
1819 +ENTRY(resume_kernel)
1820 + cli
1821 + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
1822 + jnz restore_nocheck
1823 +need_resched:
1824 + movl TI_flags(%ebp), %ecx # need_resched set ?
1825 + testb $_TIF_NEED_RESCHED, %cl
1826 + jz restore_all
1827 + testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ?
1828 + jz restore_all
1829 + call preempt_schedule_irq
1830 + jmp need_resched
1831 +#endif
1832 + CFI_ENDPROC
1833 +
1834 +/* SYSENTER_RETURN points to after the "sysenter" instruction in
1835 + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
1836 +
1837 + # sysenter call handler stub
1838 +ENTRY(sysenter_entry)
1839 + CFI_STARTPROC simple
1840 + CFI_DEF_CFA esp, 0
1841 + CFI_REGISTER esp, ebp
1842 + movl SYSENTER_stack_esp0(%esp),%esp
1843 +sysenter_past_esp:
1844 + /*
1845 + * No need to follow this irqs on/off section: the syscall
1846 + * disabled irqs and here we enable it straight after entry:
1847 + */
1848 + sti
1849 + pushl $(__USER_DS)
1850 + CFI_ADJUST_CFA_OFFSET 4
1851 + /*CFI_REL_OFFSET ss, 0*/
1852 + pushl %ebp
1853 + CFI_ADJUST_CFA_OFFSET 4
1854 + CFI_REL_OFFSET esp, 0
1855 + pushfl
1856 + CFI_ADJUST_CFA_OFFSET 4
1857 + pushl $(__USER_CS)
1858 + CFI_ADJUST_CFA_OFFSET 4
1859 + /*CFI_REL_OFFSET cs, 0*/
1860 + /*
1861 + * Push current_thread_info()->sysenter_return to the stack.
1862 + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
1863 + * pushed above; +8 corresponds to copy_thread's esp0 setting.
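	 * (Worked out, assuming the usual i386 stack layout: copy_thread()
	 * leaves esp0 = thread_info + THREAD_SIZE - 8 and the four pushes
	 * above leave %esp = esp0 - 4*4, so
	 *   %esp + (TI_sysenter_return - THREAD_SIZE + 8 + 4*4)
	 * is exactly &current_thread_info()->sysenter_return.)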
1864 + */
1865 + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
1866 + CFI_ADJUST_CFA_OFFSET 4
1867 + CFI_REL_OFFSET eip, 0
1868 +
1869 +/*
1870 + * Load the potential sixth argument from user stack.
1871 + * Careful about security.
1872 + */
1873 + cmpl $__PAGE_OFFSET-3,%ebp
1874 + jae syscall_fault
1875 +1: movl (%ebp),%ebp
1876 +.section __ex_table,"a"
1877 + .align 4
1878 + .long 1b,syscall_fault
1879 +.previous
1880 +
1881 + pushl %eax
1882 + CFI_ADJUST_CFA_OFFSET 4
1883 + SAVE_ALL
1884 + GET_THREAD_INFO(%ebp)
1885 +
1886 + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
1887 + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
1888 + jnz syscall_trace_entry
1889 + cmpl $(nr_syscalls), %eax
1890 + jae syscall_badsys
1891 + call *sys_call_table(,%eax,4)
1892 + movl %eax,EAX(%esp)
1893 + DISABLE_INTERRUPTS
1894 + TRACE_IRQS_OFF
1895 + movl TI_flags(%ebp), %ecx
1896 + testw $_TIF_ALLWORK_MASK, %cx
1897 + jne syscall_exit_work
1898 +/* if something modifies registers it must also disable sysexit */
1899 + movl EIP(%esp), %edx
1900 + movl OLDESP(%esp), %ecx
1901 + xorl %ebp,%ebp
1902 +#ifdef CONFIG_XEN
1903 + TRACE_IRQS_ON
1904 + __ENABLE_INTERRUPTS
1905 +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/
1906 + __TEST_PENDING
1907 + jnz 14f # process more events if necessary...
1908 + movl ESI(%esp), %esi
1909 + sysexit
1910 +14: __DISABLE_INTERRUPTS
1911 + TRACE_IRQS_OFF
1912 +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/
1913 + push %esp
1914 + call evtchn_do_upcall
1915 + add $4,%esp
1916 + jmp ret_from_intr
1917 +#else
1918 + TRACE_IRQS_ON
1919 + sti
1920 + sysexit
1921 +#endif /* !CONFIG_XEN */
1922 + CFI_ENDPROC
1923 +
1924 + # pv sysenter call handler stub
1925 +ENTRY(sysenter_entry_pv)
1926 + RING0_INT_FRAME
1927 + movl $__USER_DS,16(%esp)
1928 + movl %ebp,12(%esp)
1929 + movl $__USER_CS,4(%esp)
1930 + addl $4,%esp
1931 + /* +5*4 is SS:ESP,EFLAGS,CS:EIP. +8 is esp0 setting. */
1932 + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
1933 +/*
1934 + * Load the potential sixth argument from user stack.
1935 + * Careful about security.
1936 + */
1937 + cmpl $__PAGE_OFFSET-3,%ebp
1938 + jae syscall_fault
1939 +1: movl (%ebp),%ebp
1940 +.section __ex_table,"a"
1941 + .align 4
1942 + .long 1b,syscall_fault
1943 +.previous
1944 + /* fall through */
1945 + CFI_ENDPROC
1946 +ENDPROC(sysenter_entry_pv)
1947 +
1948 + # system call handler stub
1949 +ENTRY(system_call)
1950 + RING0_INT_FRAME # can't unwind into user space anyway
1951 + pushl %eax # save orig_eax
1952 + CFI_ADJUST_CFA_OFFSET 4
1953 + SAVE_ALL
1954 + GET_THREAD_INFO(%ebp)
1955 + testl $TF_MASK,EFLAGS(%esp)
1956 + jz no_singlestep
1957 + orl $_TIF_SINGLESTEP,TI_flags(%ebp)
1958 +no_singlestep:
1959 + # system call tracing in operation / emulation
1960 + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
1961 + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
1962 + jnz syscall_trace_entry
1963 + cmpl $(nr_syscalls), %eax
1964 + jae syscall_badsys
1965 +syscall_call:
1966 + call *sys_call_table(,%eax,4)
1967 + movl %eax,EAX(%esp) # store the return value
1968 +syscall_exit:
1969 + DISABLE_INTERRUPTS # make sure we don't miss an interrupt
1970 + # setting need_resched or sigpending
1971 + # between sampling and the iret
1972 + TRACE_IRQS_OFF
1973 + movl TI_flags(%ebp), %ecx
1974 + testw $_TIF_ALLWORK_MASK, %cx # current->work
1975 + jne syscall_exit_work
1976 +
1977 +restore_all:
1978 +#ifndef CONFIG_XEN
1979 + movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
1980 + # Warning: OLDSS(%esp) contains the wrong/random values if we
1981 + # are returning to the kernel.
1982 + # See comments in process.c:copy_thread() for details.
1983 + movb OLDSS(%esp), %ah
1984 + movb CS(%esp), %al
1985 + andl $(VM_MASK | (4 << 8) | 3), %eax
1986 + cmpl $((4 << 8) | 3), %eax
1987 + CFI_REMEMBER_STATE
1988 + je ldt_ss # returning to user-space with LDT SS
1989 +restore_nocheck:
1990 +#else
1991 +restore_nocheck:
1992 + movl EFLAGS(%esp), %eax
1993 + testl $(VM_MASK|NMI_MASK), %eax
1994 + CFI_REMEMBER_STATE
1995 + jnz hypervisor_iret
1996 + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
1997 + GET_VCPU_INFO
1998 + andb evtchn_upcall_mask(%esi),%al
1999 + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
2000 + CFI_REMEMBER_STATE
2001 + jnz restore_all_enable_events # != 0 => enable event delivery
2002 +#endif
2003 + TRACE_IRQS_IRET
2004 +restore_nocheck_notrace:
2005 + RESTORE_REGS
2006 + addl $4, %esp
2007 + CFI_ADJUST_CFA_OFFSET -4
2008 +1: iret
2009 +.section .fixup,"ax"
2010 +iret_exc:
2011 +#ifndef CONFIG_XEN
2012 + TRACE_IRQS_ON
2013 + sti
2014 +#endif
2015 + pushl $0 # no error code
2016 + pushl $do_iret_error
2017 + jmp error_code
2018 +.previous
2019 +.section __ex_table,"a"
2020 + .align 4
2021 + .long 1b,iret_exc
2022 +.previous
2023 +
2024 + CFI_RESTORE_STATE
2025 +#ifndef CONFIG_XEN
2026 +ldt_ss:
2027 + larl OLDSS(%esp), %eax
2028 + jnz restore_nocheck
2029 + testl $0x00400000, %eax # returning to 32bit stack?
2030 +	jnz restore_nocheck	# all right, normal return
2031 + /* If returning to userspace with 16bit stack,
2032 + * try to fix the higher word of ESP, as the CPU
2033 + * won't restore it.
2034 + * This is an "official" bug of all the x86-compatible
2035 + * CPUs, which we can try to work around to make
2036 + * dosemu and wine happy. */
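	/* (Illustration: the bug is that an iret to a 16-bit SS reloads only
	 * the low 16 bits of ESP and leaves the high word untouched, so the
	 * code below stages the final iret on the small per-CPU 16-bit stack
	 * reached via lss; see __ESPFIX_SS and CPU_16BIT_STACK_SIZE below.) */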
2037 + subl $8, %esp # reserve space for switch16 pointer
2038 + CFI_ADJUST_CFA_OFFSET 8
2039 + cli
2040 + TRACE_IRQS_OFF
2041 + movl %esp, %eax
2042 + /* Set up the 16bit stack frame with switch32 pointer on top,
2043 + * and a switch16 pointer on top of the current frame. */
2044 + call setup_x86_bogus_stack
2045 + CFI_ADJUST_CFA_OFFSET -8 # frame has moved
2046 + TRACE_IRQS_IRET
2047 + RESTORE_REGS
2048 + lss 20+4(%esp), %esp # switch to 16bit stack
2049 +1: iret
2050 +.section __ex_table,"a"
2051 + .align 4
2052 + .long 1b,iret_exc
2053 +.previous
2054 +#else
2055 + ALIGN
2056 +restore_all_enable_events:
2057 + TRACE_IRQS_ON
2058 + __ENABLE_INTERRUPTS
2059 +scrit: /**** START OF CRITICAL REGION ****/
2060 + __TEST_PENDING
2061 + jnz 14f # process more events if necessary...
2062 + RESTORE_REGS
2063 + addl $4, %esp
2064 + CFI_ADJUST_CFA_OFFSET -4
2065 +1: iret
2066 +.section __ex_table,"a"
2067 + .align 4
2068 + .long 1b,iret_exc
2069 +.previous
2070 +14: __DISABLE_INTERRUPTS
2071 + TRACE_IRQS_OFF
2072 + jmp 11f
2073 +ecrit: /**** END OF CRITICAL REGION ****/
2074 +
2075 + CFI_RESTORE_STATE
2076 +hypervisor_iret:
2077 + andl $~NMI_MASK, EFLAGS(%esp)
2078 + RESTORE_REGS
2079 + addl $4, %esp
2080 + CFI_ADJUST_CFA_OFFSET -4
2081 + jmp hypercall_page + (__HYPERVISOR_iret * 32)
2082 +#endif
2083 + CFI_ENDPROC
2084 +
2085 + # perform work that needs to be done immediately before resumption
2086 + ALIGN
2087 + RING0_PTREGS_FRAME # can't unwind into user space anyway
2088 +work_pending:
2089 + testb $_TIF_NEED_RESCHED, %cl
2090 + jz work_notifysig
2091 +work_resched:
2092 + call schedule
2093 + DISABLE_INTERRUPTS # make sure we don't miss an interrupt
2094 + # setting need_resched or sigpending
2095 + # between sampling and the iret
2096 + TRACE_IRQS_OFF
2097 + movl TI_flags(%ebp), %ecx
2098 + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
2099 + # than syscall tracing?
2100 + jz restore_all
2101 + testb $_TIF_NEED_RESCHED, %cl
2102 + jnz work_resched
2103 +
2104 +work_notifysig: # deal with pending signals and
2105 + # notify-resume requests
2106 + testl $VM_MASK, EFLAGS(%esp)
2107 + movl %esp, %eax
2108 + jne work_notifysig_v86 # returning to kernel-space or
2109 + # vm86-space
2110 + xorl %edx, %edx
2111 + call do_notify_resume
2112 + jmp resume_userspace_sig
2113 +
2114 + ALIGN
2115 +work_notifysig_v86:
2116 +#ifdef CONFIG_VM86
2117 + pushl %ecx # save ti_flags for do_notify_resume
2118 + CFI_ADJUST_CFA_OFFSET 4
2119 + call save_v86_state # %eax contains pt_regs pointer
2120 + popl %ecx
2121 + CFI_ADJUST_CFA_OFFSET -4
2122 + movl %eax, %esp
2123 + xorl %edx, %edx
2124 + call do_notify_resume
2125 + jmp resume_userspace_sig
2126 +#endif
2127 +
2128 + # perform syscall exit tracing
2129 + ALIGN
2130 +syscall_trace_entry:
2131 + movl $-ENOSYS,EAX(%esp)
2132 + movl %esp, %eax
2133 + xorl %edx,%edx
2134 + call do_syscall_trace
2135 + cmpl $0, %eax
2136 + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
2137 + # so must skip actual syscall
2138 + movl ORIG_EAX(%esp), %eax
2139 + cmpl $(nr_syscalls), %eax
2140 + jnae syscall_call
2141 + jmp syscall_exit
2142 +
2143 + # perform syscall exit tracing
2144 + ALIGN
2145 +syscall_exit_work:
2146 + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
2147 + jz work_pending
2148 + TRACE_IRQS_ON
2149 + ENABLE_INTERRUPTS # could let do_syscall_trace() call
2150 + # schedule() instead
2151 + movl %esp, %eax
2152 + movl $1, %edx
2153 + call do_syscall_trace
2154 + jmp resume_userspace
2155 + CFI_ENDPROC
2156 +
2157 + RING0_INT_FRAME # can't unwind into user space anyway
2158 +syscall_fault:
2159 + pushl %eax # save orig_eax
2160 + CFI_ADJUST_CFA_OFFSET 4
2161 + SAVE_ALL
2162 + GET_THREAD_INFO(%ebp)
2163 + movl $-EFAULT,EAX(%esp)
2164 + jmp resume_userspace
2165 +
2166 +syscall_badsys:
2167 + movl $-ENOSYS,EAX(%esp)
2168 + jmp resume_userspace
2169 + CFI_ENDPROC
2170 +
2171 +#ifndef CONFIG_XEN
2172 +#define FIXUP_ESPFIX_STACK \
2173 + movl %esp, %eax; \
2174 + /* switch to 32bit stack using the pointer on top of 16bit stack */ \
2175 + lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
2176 + /* copy data from 16bit stack to 32bit stack */ \
2177 + call fixup_x86_bogus_stack; \
2178 + /* put ESP to the proper location */ \
2179 + movl %eax, %esp;
2180 +#define UNWIND_ESPFIX_STACK \
2181 + pushl %eax; \
2182 + CFI_ADJUST_CFA_OFFSET 4; \
2183 + movl %ss, %eax; \
2184 + /* see if on 16bit stack */ \
2185 + cmpw $__ESPFIX_SS, %ax; \
2186 + je 28f; \
2187 +27: popl %eax; \
2188 + CFI_ADJUST_CFA_OFFSET -4; \
2189 +.section .fixup,"ax"; \
2190 +28: movl $__KERNEL_DS, %eax; \
2191 + movl %eax, %ds; \
2192 + movl %eax, %es; \
2193 + /* switch to 32bit stack */ \
2194 + FIXUP_ESPFIX_STACK; \
2195 + jmp 27b; \
2196 +.previous
2197 +
2198 +/*
2199 + * Build the entry stubs and pointer table with
2200 + * some assembler magic.
2201 + */
2202 +.data
2203 +ENTRY(interrupt)
2204 +.text
2205 +
2206 +vector=0
2207 +ENTRY(irq_entries_start)
2208 + RING0_INT_FRAME
2209 +.rept NR_IRQS
2210 + ALIGN
2211 + .if vector
2212 + CFI_ADJUST_CFA_OFFSET -4
2213 + .endif
2214 +1: pushl $~(vector)
2215 + CFI_ADJUST_CFA_OFFSET 4
2216 + jmp common_interrupt
2217 +.data
2218 + .long 1b
2219 +.text
2220 +vector=vector+1
2221 +.endr
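
/*
 * For illustration, the first pass of the .rept above expands to roughly:
 *
 *	1:	pushl $~(0)
 *		jmp common_interrupt
 *	.data
 *		.long 1b
 *	.text
 *
 * i.e. each stub pushes the complemented vector number and jumps to
 * common_interrupt, while the .data/.long lines collect the stub addresses
 * into the interrupt[] table declared at ENTRY(interrupt) above.
 */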
2222 +
2223 +/*
2224 + * the CPU automatically disables interrupts when executing an IRQ vector,
2225 + * so IRQ-flags tracing has to follow that:
2226 + */
2227 + ALIGN
2228 +common_interrupt:
2229 + SAVE_ALL
2230 + TRACE_IRQS_OFF
2231 + movl %esp,%eax
2232 + call do_IRQ
2233 + jmp ret_from_intr
2234 + CFI_ENDPROC
2235 +
2236 +#define BUILD_INTERRUPT(name, nr) \
2237 +ENTRY(name) \
2238 + RING0_INT_FRAME; \
2239 + pushl $~(nr); \
2240 + CFI_ADJUST_CFA_OFFSET 4; \
2241 + SAVE_ALL; \
2242 + TRACE_IRQS_OFF \
2243 + movl %esp,%eax; \
2244 + call smp_/**/name; \
2245 + jmp ret_from_intr; \
2246 + CFI_ENDPROC
2247 +
2248 +/* The include is where all of the SMP etc. interrupts come from */
2249 +#include "entry_arch.h"
2250 +#else
2251 +#define UNWIND_ESPFIX_STACK
2252 +#endif
2253 +
2254 +ENTRY(divide_error)
2255 + RING0_INT_FRAME
2256 + pushl $0 # no error code
2257 + CFI_ADJUST_CFA_OFFSET 4
2258 + pushl $do_divide_error
2259 + CFI_ADJUST_CFA_OFFSET 4
2260 + ALIGN
2261 +error_code:
2262 + pushl %ds
2263 + CFI_ADJUST_CFA_OFFSET 4
2264 + /*CFI_REL_OFFSET ds, 0*/
2265 + pushl %eax
2266 + CFI_ADJUST_CFA_OFFSET 4
2267 + CFI_REL_OFFSET eax, 0
2268 + xorl %eax, %eax
2269 + pushl %ebp
2270 + CFI_ADJUST_CFA_OFFSET 4
2271 + CFI_REL_OFFSET ebp, 0
2272 + pushl %edi
2273 + CFI_ADJUST_CFA_OFFSET 4
2274 + CFI_REL_OFFSET edi, 0
2275 + pushl %esi
2276 + CFI_ADJUST_CFA_OFFSET 4
2277 + CFI_REL_OFFSET esi, 0
2278 + pushl %edx
2279 + CFI_ADJUST_CFA_OFFSET 4
2280 + CFI_REL_OFFSET edx, 0
2281 + decl %eax # eax = -1
2282 + pushl %ecx
2283 + CFI_ADJUST_CFA_OFFSET 4
2284 + CFI_REL_OFFSET ecx, 0
2285 + pushl %ebx
2286 + CFI_ADJUST_CFA_OFFSET 4
2287 + CFI_REL_OFFSET ebx, 0
2288 + cld
2289 + pushl %es
2290 + CFI_ADJUST_CFA_OFFSET 4
2291 + /*CFI_REL_OFFSET es, 0*/
2292 + UNWIND_ESPFIX_STACK
2293 + popl %ecx
2294 + CFI_ADJUST_CFA_OFFSET -4
2295 + /*CFI_REGISTER es, ecx*/
2296 + movl ES(%esp), %edi # get the function address
2297 + movl ORIG_EAX(%esp), %edx # get the error code
2298 + movl %eax, ORIG_EAX(%esp)
2299 + movl %ecx, ES(%esp)
2300 + /*CFI_REL_OFFSET es, ES*/
2301 + movl $(__USER_DS), %ecx
2302 + movl %ecx, %ds
2303 + movl %ecx, %es
2304 + movl %esp,%eax # pt_regs pointer
2305 + call *%edi
2306 + jmp ret_from_exception
2307 + CFI_ENDPROC
2308 +
2309 +#ifdef CONFIG_XEN
2310 +# A note on the "critical region" in our callback handler.
2311 +# We want to avoid stacking callback handlers due to events occurring
2312 +# during handling of the last event. To do this, we keep events disabled
2313 +# until we've done all processing. HOWEVER, we must enable events before
2314 +# popping the stack frame (can't be done atomically) and so it would still
2315 +# be possible to get enough handler activations to overflow the stack.
2316 +# Although unlikely, bugs of that kind are hard to track down, so we'd
2317 +# like to avoid the possibility.
2318 +# So, on entry to the handler we detect whether we interrupted an
2319 +# existing activation in its critical region -- if so, we pop the current
2320 +# activation and restart the handler using the previous one.
2321 +#
2322 +# The sysexit critical region is slightly different. sysexit
2323 +# atomically removes the entire stack frame. If we interrupt in the
2324 +# critical region we know that the entire frame is present and correct
2325 +# so we can simply throw away the new one.
2326 +ENTRY(hypervisor_callback)
2327 + RING0_INT_FRAME
2328 + pushl %eax
2329 + CFI_ADJUST_CFA_OFFSET 4
2330 + SAVE_ALL
2331 + movl EIP(%esp),%eax
2332 + cmpl $scrit,%eax
2333 + jb 11f
2334 + cmpl $ecrit,%eax
2335 + jb critical_region_fixup
2336 + cmpl $sysexit_scrit,%eax
2337 + jb 11f
2338 + cmpl $sysexit_ecrit,%eax
2339 + ja 11f
2340 + addl $OLDESP,%esp # Remove eflags...ebx from stack frame.
2341 +11: push %esp
2342 + CFI_ADJUST_CFA_OFFSET 4
2343 + call evtchn_do_upcall
2344 + add $4,%esp
2345 + CFI_ADJUST_CFA_OFFSET -4
2346 + jmp ret_from_intr
2347 + CFI_ENDPROC
2348 +
2349 +# [How we do the fixup]. We want to merge the current stack frame with the
2350 +# just-interrupted frame. How we do this depends on where in the critical
2351 +# region the interrupted handler was executing, and so how many saved
2352 +# registers are in each frame. We do this quickly using the lookup table
2353 +# 'critical_fixup_table'. For each byte offset in the critical region, it
2354 +# provides the number of bytes which have already been popped from the
2355 +# interrupted stack frame.
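# Roughly, in C (illustrative sketch only; 'frame' is a byte pointer to the
# new SAVE_ALL frame and 'n' is the table value):
#	memmove(frame + OLDESP - n, frame, n);	/* refill the popped slots  */
#	esp = frame + OLDESP - n;		/* restart at 11: with the  */
#						/* merged interrupted frame */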
2356 +critical_region_fixup:
2357 +	movzbl critical_fixup_table-scrit(%eax),%ecx # %ecx contains num bytes popped
2358 + cmpb $0xff,%cl # 0xff => vcpu_info critical region
2359 + jne 15f
2360 + xorl %ecx,%ecx
2361 +15: leal (%esp,%ecx),%esi # %esi points at end of src region
2362 + leal OLDESP(%esp),%edi # %edi points at end of dst region
2363 +	shrl $2,%ecx			# convert bytes to words
2364 + je 17f # skip loop if nothing to copy
2365 +16: subl $4,%esi # pre-decrementing copy loop
2366 + subl $4,%edi
2367 + movl (%esi),%eax
2368 + movl %eax,(%edi)
2369 + loop 16b
2370 +17: movl %edi,%esp # final %edi is top of merged stack
2371 + jmp 11b
2372 +
2373 +.section .rodata,"a"
2374 +critical_fixup_table:
2375 + .byte 0xff,0xff,0xff # testb $0xff,(%esi) = __TEST_PENDING
2376 + .byte 0xff,0xff # jnz 14f
2377 + .byte 0x00 # pop %ebx
2378 + .byte 0x04 # pop %ecx
2379 + .byte 0x08 # pop %edx
2380 + .byte 0x0c # pop %esi
2381 + .byte 0x10 # pop %edi
2382 + .byte 0x14 # pop %ebp
2383 + .byte 0x18 # pop %eax
2384 + .byte 0x1c # pop %ds
2385 + .byte 0x20 # pop %es
2386 + .byte 0x24,0x24,0x24 # add $4,%esp
2387 + .byte 0x28 # iret
2388 + .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi)
2389 + .byte 0x00,0x00 # jmp 11b
2390 +.previous
2391 +
2392 +# Hypervisor uses this for application faults while it executes.
2393 +# We get here for two reasons:
2394 +# 1. Fault while reloading DS, ES, FS or GS
2395 +# 2. Fault while executing IRET
2396 +# Category 1 we fix up by reattempting the load, and zeroing the segment
2397 +# register if the load fails.
2398 +# Category 2 we fix up by jumping to do_iret_error. We cannot use the
2399 +# normal Linux return path in this case because if we use the IRET hypercall
2400 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
2401 +# We distinguish between categories by maintaining a status value in EAX.
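# (Illustration: EAX is set to 1 before the four segment reloads below; if a
#  reload faults, its fixup zeroes EAX and the offending saved selector and
#  retries, so EAX == 0 afterwards means Category 1, while EAX still being 1
#  means all four reloads succeeded and the original fault must have been the
#  IRET itself, i.e. Category 2.)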
2402 +ENTRY(failsafe_callback)
2403 + pushl %eax
2404 + movl $1,%eax
2405 +1: mov 4(%esp),%ds
2406 +2: mov 8(%esp),%es
2407 +3: mov 12(%esp),%fs
2408 +4: mov 16(%esp),%gs
2409 + testl %eax,%eax
2410 + popl %eax
2411 + jz 5f
2412 + addl $16,%esp # EAX != 0 => Category 2 (Bad IRET)
2413 + jmp iret_exc
2414 +5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment)
2415 + RING0_INT_FRAME
2416 + pushl $0
2417 + SAVE_ALL
2418 + jmp ret_from_exception
2419 +.section .fixup,"ax"; \
2420 +6: xorl %eax,%eax; \
2421 + movl %eax,4(%esp); \
2422 + jmp 1b; \
2423 +7: xorl %eax,%eax; \
2424 + movl %eax,8(%esp); \
2425 + jmp 2b; \
2426 +8: xorl %eax,%eax; \
2427 + movl %eax,12(%esp); \
2428 + jmp 3b; \
2429 +9: xorl %eax,%eax; \
2430 + movl %eax,16(%esp); \
2431 + jmp 4b; \
2432 +.previous; \
2433 +.section __ex_table,"a"; \
2434 + .align 4; \
2435 + .long 1b,6b; \
2436 + .long 2b,7b; \
2437 + .long 3b,8b; \
2438 + .long 4b,9b; \
2439 +.previous
2440 +#endif
2441 + CFI_ENDPROC
2442 +
2443 +ENTRY(coprocessor_error)
2444 + RING0_INT_FRAME
2445 + pushl $0
2446 + CFI_ADJUST_CFA_OFFSET 4
2447 + pushl $do_coprocessor_error
2448 + CFI_ADJUST_CFA_OFFSET 4
2449 + jmp error_code
2450 + CFI_ENDPROC
2451 +
2452 +ENTRY(simd_coprocessor_error)
2453 + RING0_INT_FRAME
2454 + pushl $0
2455 + CFI_ADJUST_CFA_OFFSET 4
2456 + pushl $do_simd_coprocessor_error
2457 + CFI_ADJUST_CFA_OFFSET 4
2458 + jmp error_code
2459 + CFI_ENDPROC
2460 +
2461 +ENTRY(device_not_available)
2462 + RING0_INT_FRAME
2463 + pushl $-1 # mark this as an int
2464 + CFI_ADJUST_CFA_OFFSET 4
2465 + SAVE_ALL
2466 +#ifndef CONFIG_XEN
2467 + movl %cr0, %eax
2468 + testl $0x4, %eax # EM (math emulation bit)
2469 + je device_available_emulate
2470 + pushl $0 # temporary storage for ORIG_EIP
2471 + CFI_ADJUST_CFA_OFFSET 4
2472 + call math_emulate
2473 + addl $4, %esp
2474 + CFI_ADJUST_CFA_OFFSET -4
2475 + jmp ret_from_exception
2476 +device_available_emulate:
2477 +#endif
2478 + preempt_stop
2479 + call math_state_restore
2480 + jmp ret_from_exception
2481 + CFI_ENDPROC
2482 +
2483 +#ifndef CONFIG_XEN
2484 +/*
2485 + * Debug traps and NMI can happen at the one SYSENTER instruction
2486 + * that sets up the real kernel stack. Check here, since we can't
2487 + * allow the wrong stack to be used.
2488 + *
2489 + * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
2490 + * already pushed 3 words if it hits on the sysenter instruction:
2491 + * eflags, cs and eip.
2492 + *
2493 + * We just load the right stack, and push the three (known) values
2494 + * by hand onto the new stack - while updating the return eip past
2495 + * the instruction that would have done it for sysenter.
2496 + */
2497 +#define FIX_STACK(offset, ok, label) \
2498 + cmpw $__KERNEL_CS,4(%esp); \
2499 + jne ok; \
2500 +label: \
2501 + movl SYSENTER_stack_esp0+offset(%esp),%esp; \
2502 + pushfl; \
2503 + pushl $__KERNEL_CS; \
2504 + pushl $sysenter_past_esp
2505 +#endif /* CONFIG_XEN */
2506 +
2507 +KPROBE_ENTRY(debug)
2508 + RING0_INT_FRAME
2509 +#ifndef CONFIG_XEN
2510 + cmpl $sysenter_entry,(%esp)
2511 + jne debug_stack_correct
2512 + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
2513 +debug_stack_correct:
2514 +#endif /* !CONFIG_XEN */
2515 + pushl $-1 # mark this as an int
2516 + CFI_ADJUST_CFA_OFFSET 4
2517 + SAVE_ALL
2518 + xorl %edx,%edx # error code 0
2519 + movl %esp,%eax # pt_regs pointer
2520 + call do_debug
2521 + jmp ret_from_exception
2522 + CFI_ENDPROC
2523 + .previous .text
2524 +#ifndef CONFIG_XEN
2525 +/*
2526 + * NMI is doubly nasty. It can happen _while_ we're handling
2527 + * a debug fault, and the debug fault hasn't yet been able to
2528 + * clear up the stack. So we first check whether we got an
2529 + * NMI on the sysenter entry path, but after that we need to
2530 + * check whether we got an NMI on the debug path where the debug
2531 + * fault happened on the sysenter path.
2532 + */
2533 +ENTRY(nmi)
2534 + RING0_INT_FRAME
2535 + pushl %eax
2536 + CFI_ADJUST_CFA_OFFSET 4
2537 + movl %ss, %eax
2538 + cmpw $__ESPFIX_SS, %ax
2539 + popl %eax
2540 + CFI_ADJUST_CFA_OFFSET -4
2541 + je nmi_16bit_stack
2542 + cmpl $sysenter_entry,(%esp)
2543 + je nmi_stack_fixup
2544 + pushl %eax
2545 + CFI_ADJUST_CFA_OFFSET 4
2546 + movl %esp,%eax
2547 + /* Do not access memory above the end of our stack page,
2548 + * it might not exist.
2549 + */
2550 + andl $(THREAD_SIZE-1),%eax
2551 + cmpl $(THREAD_SIZE-20),%eax
2552 + popl %eax
2553 + CFI_ADJUST_CFA_OFFSET -4
2554 + jae nmi_stack_correct
2555 + cmpl $sysenter_entry,12(%esp)
2556 + je nmi_debug_stack_check
2557 +nmi_stack_correct:
2558 + pushl %eax
2559 + CFI_ADJUST_CFA_OFFSET 4
2560 + SAVE_ALL
2561 + xorl %edx,%edx # zero error code
2562 + movl %esp,%eax # pt_regs pointer
2563 + call do_nmi
2564 + jmp restore_nocheck_notrace
2565 + CFI_ENDPROC
2566 +
2567 +nmi_stack_fixup:
2568 + FIX_STACK(12,nmi_stack_correct, 1)
2569 + jmp nmi_stack_correct
2570 +nmi_debug_stack_check:
2571 + cmpw $__KERNEL_CS,16(%esp)
2572 + jne nmi_stack_correct
2573 + cmpl $debug,(%esp)
2574 + jb nmi_stack_correct
2575 + cmpl $debug_esp_fix_insn,(%esp)
2576 + ja nmi_stack_correct
2577 + FIX_STACK(24,nmi_stack_correct, 1)
2578 + jmp nmi_stack_correct
2579 +
2580 +nmi_16bit_stack:
2581 + RING0_INT_FRAME
2582 + /* create the pointer to lss back */
2583 + pushl %ss
2584 + CFI_ADJUST_CFA_OFFSET 4
2585 + pushl %esp
2586 + CFI_ADJUST_CFA_OFFSET 4
2587 + movzwl %sp, %esp
2588 + addw $4, (%esp)
2589 + /* copy the iret frame of 12 bytes */
2590 + .rept 3
2591 + pushl 16(%esp)
2592 + CFI_ADJUST_CFA_OFFSET 4
2593 + .endr
2594 + pushl %eax
2595 + CFI_ADJUST_CFA_OFFSET 4
2596 + SAVE_ALL
2597 + FIXUP_ESPFIX_STACK # %eax == %esp
2598 + CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved
2599 + xorl %edx,%edx # zero error code
2600 + call do_nmi
2601 + RESTORE_REGS
2602 + lss 12+4(%esp), %esp # back to 16bit stack
2603 +1: iret
2604 + CFI_ENDPROC
2605 +.section __ex_table,"a"
2606 + .align 4
2607 + .long 1b,iret_exc
2608 +.previous
2609 +#else
2610 +ENTRY(nmi)
2611 + RING0_INT_FRAME
2612 + pushl %eax
2613 + CFI_ADJUST_CFA_OFFSET 4
2614 + SAVE_ALL
2615 + xorl %edx,%edx # zero error code
2616 + movl %esp,%eax # pt_regs pointer
2617 + call do_nmi
2618 + orl $NMI_MASK, EFLAGS(%esp)
2619 + jmp restore_all
2620 + CFI_ENDPROC
2621 +#endif
2622 +
2623 +KPROBE_ENTRY(int3)
2624 + RING0_INT_FRAME
2625 + pushl $-1 # mark this as an int
2626 + CFI_ADJUST_CFA_OFFSET 4
2627 + SAVE_ALL
2628 + xorl %edx,%edx # zero error code
2629 + movl %esp,%eax # pt_regs pointer
2630 + call do_int3
2631 + jmp ret_from_exception
2632 + CFI_ENDPROC
2633 + .previous .text
2634 +
2635 +ENTRY(overflow)
2636 + RING0_INT_FRAME
2637 + pushl $0
2638 + CFI_ADJUST_CFA_OFFSET 4
2639 + pushl $do_overflow
2640 + CFI_ADJUST_CFA_OFFSET 4
2641 + jmp error_code
2642 + CFI_ENDPROC
2643 +
2644 +ENTRY(bounds)
2645 + RING0_INT_FRAME
2646 + pushl $0
2647 + CFI_ADJUST_CFA_OFFSET 4
2648 + pushl $do_bounds
2649 + CFI_ADJUST_CFA_OFFSET 4
2650 + jmp error_code
2651 + CFI_ENDPROC
2652 +
2653 +ENTRY(invalid_op)
2654 + RING0_INT_FRAME
2655 + pushl $0
2656 + CFI_ADJUST_CFA_OFFSET 4
2657 + pushl $do_invalid_op
2658 + CFI_ADJUST_CFA_OFFSET 4
2659 + jmp error_code
2660 + CFI_ENDPROC
2661 +
2662 +ENTRY(coprocessor_segment_overrun)
2663 + RING0_INT_FRAME
2664 + pushl $0
2665 + CFI_ADJUST_CFA_OFFSET 4
2666 + pushl $do_coprocessor_segment_overrun
2667 + CFI_ADJUST_CFA_OFFSET 4
2668 + jmp error_code
2669 + CFI_ENDPROC
2670 +
2671 +ENTRY(invalid_TSS)
2672 + RING0_EC_FRAME
2673 + pushl $do_invalid_TSS
2674 + CFI_ADJUST_CFA_OFFSET 4
2675 + jmp error_code
2676 + CFI_ENDPROC
2677 +
2678 +ENTRY(segment_not_present)
2679 + RING0_EC_FRAME
2680 + pushl $do_segment_not_present
2681 + CFI_ADJUST_CFA_OFFSET 4
2682 + jmp error_code
2683 + CFI_ENDPROC
2684 +
2685 +ENTRY(stack_segment)
2686 + RING0_EC_FRAME
2687 + pushl $do_stack_segment
2688 + CFI_ADJUST_CFA_OFFSET 4
2689 + jmp error_code
2690 + CFI_ENDPROC
2691 +
2692 +KPROBE_ENTRY(general_protection)
2693 + RING0_EC_FRAME
2694 + pushl $do_general_protection
2695 + CFI_ADJUST_CFA_OFFSET 4
2696 + jmp error_code
2697 + CFI_ENDPROC
2698 + .previous .text
2699 +
2700 +ENTRY(alignment_check)
2701 + RING0_EC_FRAME
2702 + pushl $do_alignment_check
2703 + CFI_ADJUST_CFA_OFFSET 4
2704 + jmp error_code
2705 + CFI_ENDPROC
2706 +
2707 +KPROBE_ENTRY(page_fault)
2708 + RING0_EC_FRAME
2709 + pushl $do_page_fault
2710 + CFI_ADJUST_CFA_OFFSET 4
2711 + jmp error_code
2712 + CFI_ENDPROC
2713 + .previous .text
2714 +
2715 +#ifdef CONFIG_X86_MCE
2716 +ENTRY(machine_check)
2717 + RING0_INT_FRAME
2718 + pushl $0
2719 + CFI_ADJUST_CFA_OFFSET 4
2720 + pushl machine_check_vector
2721 + CFI_ADJUST_CFA_OFFSET 4
2722 + jmp error_code
2723 + CFI_ENDPROC
2724 +#endif
2725 +
2726 +#ifndef CONFIG_XEN
2727 +ENTRY(spurious_interrupt_bug)
2728 + RING0_INT_FRAME
2729 + pushl $0
2730 + CFI_ADJUST_CFA_OFFSET 4
2731 + pushl $do_spurious_interrupt_bug
2732 + CFI_ADJUST_CFA_OFFSET 4
2733 + jmp error_code
2734 + CFI_ENDPROC
2735 +#endif /* !CONFIG_XEN */
2736 +
2737 +#ifdef CONFIG_STACK_UNWIND
2738 +ENTRY(arch_unwind_init_running)
2739 + CFI_STARTPROC
2740 + movl 4(%esp), %edx
2741 + movl (%esp), %ecx
2742 + leal 4(%esp), %eax
2743 + movl %ebx, EBX(%edx)
2744 + xorl %ebx, %ebx
2745 + movl %ebx, ECX(%edx)
2746 + movl %ebx, EDX(%edx)
2747 + movl %esi, ESI(%edx)
2748 + movl %edi, EDI(%edx)
2749 + movl %ebp, EBP(%edx)
2750 + movl %ebx, EAX(%edx)
2751 + movl $__USER_DS, DS(%edx)
2752 + movl $__USER_DS, ES(%edx)
2753 + movl %ebx, ORIG_EAX(%edx)
2754 + movl %ecx, EIP(%edx)
2755 + movl 12(%esp), %ecx
2756 + movl $__KERNEL_CS, CS(%edx)
2757 + movl %ebx, EFLAGS(%edx)
2758 + movl %eax, OLDESP(%edx)
2759 + movl 8(%esp), %eax
2760 + movl %ecx, 8(%esp)
2761 + movl EBX(%edx), %ebx
2762 + movl $__KERNEL_DS, OLDSS(%edx)
2763 + jmpl *%eax
2764 + CFI_ENDPROC
2765 +ENDPROC(arch_unwind_init_running)
2766 +#endif
2767 +
2768 +ENTRY(fixup_4gb_segment)
2769 + RING0_EC_FRAME
2770 + pushl $do_fixup_4gb_segment
2771 + CFI_ADJUST_CFA_OFFSET 4
2772 + jmp error_code
2773 + CFI_ENDPROC
2774 +
2775 +.section .rodata,"a"
2776 +#include "syscall_table.S"
2777 +
2778 +syscall_table_size=(.-sys_call_table)
2779 Index: head-2008-11-25/arch/x86/kernel/fixup.c
2780 ===================================================================
2781 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
2782 +++ head-2008-11-25/arch/x86/kernel/fixup.c 2008-01-28 12:24:18.000000000 +0100
2783 @@ -0,0 +1,88 @@
2784 +/******************************************************************************
2785 + * fixup.c
2786 + *
2787 + * Binary-rewriting of certain IA32 instructions, on notification by Xen.
2788 + * Used to avoid repeated slow emulation of common instructions used by the
2789 + * user-space TLS (Thread-Local Storage) libraries.
2790 + *
2791 + * **** NOTE ****
2792 + * Issues with the binary rewriting have caused it to be removed. Instead
2793 + * we rely on Xen's emulator to boot the kernel, and then print a banner
2794 + * message recommending that the user disable /lib/tls.
2795 + *
2796 + * Copyright (c) 2004, K A Fraser
2797 + *
2798 + * This program is free software; you can redistribute it and/or modify
2799 + * it under the terms of the GNU General Public License as published by
2800 + * the Free Software Foundation; either version 2 of the License, or
2801 + * (at your option) any later version.
2802 + *
2803 + * This program is distributed in the hope that it will be useful,
2804 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
2805 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2806 + * GNU General Public License for more details.
2807 + *
2808 + * You should have received a copy of the GNU General Public License
2809 + * along with this program; if not, write to the Free Software
2810 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2811 + */
2812 +
2813 +#include <linux/init.h>
2814 +#include <linux/sched.h>
2815 +#include <linux/slab.h>
2816 +#include <linux/kernel.h>
2817 +#include <linux/delay.h>
2818 +#include <linux/version.h>
2819 +
2820 +#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
2821 +
2822 +fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
2823 +{
2824 + static unsigned long printed = 0;
2825 + char info[100];
2826 + int i;
2827 +
2828 + /* Ignore statically-linked init. */
2829 + if (current->tgid == 1)
2830 + return;
2831 +
2832 + VOID(HYPERVISOR_vm_assist(VMASST_CMD_disable,
2833 + VMASST_TYPE_4gb_segments_notify));
2834 +
2835 + if (test_and_set_bit(0, &printed))
2836 + return;
2837 +
2838 + sprintf(info, "%s (pid=%d)", current->comm, current->tgid);
2839 +
2840 + DP("");
2841 + DP("***************************************************************");
2842 + DP("***************************************************************");
2843 + DP("** WARNING: Currently emulating unsupported memory accesses **");
2844 + DP("** in /lib/tls glibc libraries. The emulation is **");
2845 + DP("** slow. To ensure full performance you should **");
2846 + DP("** install a 'xen-friendly' (nosegneg) version of **");
2847 + DP("** the library, or disable tls support by executing **");
2848 + DP("** the following as root: **");
2849 + DP("** mv /lib/tls /lib/tls.disabled **");
2850 + DP("** Offending process: %-38.38s **", info);
2851 + DP("***************************************************************");
2852 + DP("***************************************************************");
2853 + DP("");
2854 +
2855 + for (i = 5; i > 0; i--) {
2856 + touch_softlockup_watchdog();
2857 + printk("Pausing... %d", i);
2858 + mdelay(1000);
2859 + printk("\b\b\b\b\b\b\b\b\b\b\b\b");
2860 + }
2861 +
2862 + printk("Continuing...\n\n");
2863 +}
2864 +
2865 +static int __init fixup_init(void)
2866 +{
2867 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
2868 + VMASST_TYPE_4gb_segments_notify));
2869 + return 0;
2870 +}
2871 +__initcall(fixup_init);
2872 Index: head-2008-11-25/arch/x86/kernel/head_32-xen.S
2873 ===================================================================
2874 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
2875 +++ head-2008-11-25/arch/x86/kernel/head_32-xen.S 2007-06-12 13:12:48.000000000 +0200
2876 @@ -0,0 +1,207 @@
2877 +
2878 +
2879 +.text
2880 +#include <linux/elfnote.h>
2881 +#include <linux/threads.h>
2882 +#include <linux/linkage.h>
2883 +#include <asm/segment.h>
2884 +#include <asm/page.h>
2885 +#include <asm/cache.h>
2886 +#include <asm/thread_info.h>
2887 +#include <asm/asm-offsets.h>
2888 +#include <asm/dwarf2.h>
2889 +#include <xen/interface/xen.h>
2890 +#include <xen/interface/elfnote.h>
2891 +
2892 +/*
2893 + * References to members of the new_cpu_data structure.
2894 + */
2895 +
2896 +#define X86 new_cpu_data+CPUINFO_x86
2897 +#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
2898 +#define X86_MODEL new_cpu_data+CPUINFO_x86_model
2899 +#define X86_MASK new_cpu_data+CPUINFO_x86_mask
2900 +#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
2901 +#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
2902 +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
2903 +#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
2904 +
2905 +#define VIRT_ENTRY_OFFSET 0x0
2906 +.org VIRT_ENTRY_OFFSET
2907 +ENTRY(startup_32)
2908 + movl %esi,xen_start_info
2909 + cld
2910 +
2911 + /* Set up the stack pointer */
2912 + movl $(init_thread_union+THREAD_SIZE),%esp
2913 +
2914 + /* get vendor info */
2915 + xorl %eax,%eax # call CPUID with 0 -> return vendor ID
2916 + XEN_CPUID
2917 + movl %eax,X86_CPUID # save CPUID level
2918 + movl %ebx,X86_VENDOR_ID # lo 4 chars
2919 + movl %edx,X86_VENDOR_ID+4 # next 4 chars
2920 + movl %ecx,X86_VENDOR_ID+8 # last 4 chars
2921 +
2922 + movl $1,%eax # Use the CPUID instruction to get CPU type
2923 + XEN_CPUID
2924 + movb %al,%cl # save reg for future use
2925 + andb $0x0f,%ah # mask processor family
2926 + movb %ah,X86
2927 + andb $0xf0,%al # mask model
2928 + shrb $4,%al
2929 + movb %al,X86_MODEL
2930 +	andb $0x0f,%cl		# mask off the stepping (mask revision)
2931 + movb %cl,X86_MASK
2932 + movl %edx,X86_CAPABILITY
2933 +
2934 + movb $1,X86_HARD_MATH
2935 +
2936 + xorl %eax,%eax # Clear FS/GS and LDT
2937 + movl %eax,%fs
2938 + movl %eax,%gs
2939 + cld # gcc2 wants the direction flag cleared at all times
2940 +
2941 + pushl %eax # fake return address
2942 + jmp start_kernel
2943 +
2944 +#define HYPERCALL_PAGE_OFFSET 0x1000
2945 +.org HYPERCALL_PAGE_OFFSET
2946 +ENTRY(hypercall_page)
2947 + CFI_STARTPROC
2948 +.skip 0x1000
2949 + CFI_ENDPROC
2950 +
2951 +/*
2952 + * Real beginning of normal "text" segment
2953 + */
2954 +ENTRY(stext)
2955 +ENTRY(_stext)
2956 +
2957 +/*
2958 + * BSS section
2959 + */
2960 +.section ".bss.page_aligned","w"
2961 +ENTRY(empty_zero_page)
2962 + .fill 4096,1,0
2963 +
2964 +/*
2965 + * This starts the data section.
2966 + */
2967 +.data
2968 +
2969 +/*
2970 + * The Global Descriptor Table contains 32 quadwords, per-CPU.
2971 + */
2972 + .align L1_CACHE_BYTES
2973 +ENTRY(cpu_gdt_table)
2974 + .quad 0x0000000000000000 /* NULL descriptor */
2975 + .quad 0x0000000000000000 /* 0x0b reserved */
2976 + .quad 0x0000000000000000 /* 0x13 reserved */
2977 + .quad 0x0000000000000000 /* 0x1b reserved */
2978 + .quad 0x0000000000000000 /* 0x20 unused */
2979 + .quad 0x0000000000000000 /* 0x28 unused */
2980 + .quad 0x0000000000000000 /* 0x33 TLS entry 1 */
2981 + .quad 0x0000000000000000 /* 0x3b TLS entry 2 */
2982 + .quad 0x0000000000000000 /* 0x43 TLS entry 3 */
2983 + .quad 0x0000000000000000 /* 0x4b reserved */
2984 + .quad 0x0000000000000000 /* 0x53 reserved */
2985 + .quad 0x0000000000000000 /* 0x5b reserved */
2986 +
2987 + .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
2988 + .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
2989 + .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
2990 + .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
2991 +
2992 + .quad 0x0000000000000000 /* 0x80 TSS descriptor */
2993 + .quad 0x0000000000000000 /* 0x88 LDT descriptor */
2994 +
2995 + /*
2996 + * Segments used for calling PnP BIOS have byte granularity.
2997 + * The code and data segments have fixed 64k limits,
2998 + * the transfer segment sizes are set at run time.
2999 + */
3000 + .quad 0x0000000000000000 /* 0x90 32-bit code */
3001 + .quad 0x0000000000000000 /* 0x98 16-bit code */
3002 + .quad 0x0000000000000000 /* 0xa0 16-bit data */
3003 + .quad 0x0000000000000000 /* 0xa8 16-bit data */
3004 + .quad 0x0000000000000000 /* 0xb0 16-bit data */
3005 +
3006 + /*
3007 + * The APM segments have byte granularity and their bases
3008 + * are set at run time. All have 64k limits.
3009 + */
3010 + .quad 0x0000000000000000 /* 0xb8 APM CS code */
3011 + .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */
3012 + .quad 0x0000000000000000 /* 0xc8 APM DS data */
3013 +
3014 + .quad 0x0000000000000000 /* 0xd0 - ESPFIX 16-bit SS */
3015 + .quad 0x0000000000000000 /* 0xd8 - unused */
3016 + .quad 0x0000000000000000 /* 0xe0 - unused */
3017 + .quad 0x0000000000000000 /* 0xe8 - unused */
3018 + .quad 0x0000000000000000 /* 0xf0 - unused */
3019 + .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
3020 +
3021 +#if CONFIG_XEN_COMPAT <= 0x030002
3022 +/*
3023 + * __xen_guest information
3024 + */
3025 +.macro utoa value
3026 + .if (\value) < 0 || (\value) >= 0x10
3027 + utoa (((\value)>>4)&0x0fffffff)
3028 + .endif
3029 + .if ((\value) & 0xf) < 10
3030 + .byte '0' + ((\value) & 0xf)
3031 + .else
3032 + .byte 'A' + ((\value) & 0xf) - 10
3033 + .endif
3034 +.endm
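# For illustration: "utoa 0xC0000000" recurses down to the top nibble and then
# emits 'C' followed by seven '0's, i.e. the ASCII string "C0000000" used in
# the __xen_guest section below.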
3035 +
3036 +.section __xen_guest
3037 + .ascii "GUEST_OS=linux,GUEST_VER=2.6"
3038 + .ascii ",XEN_VER=xen-3.0"
3039 + .ascii ",VIRT_BASE=0x"
3040 + utoa __PAGE_OFFSET
3041 + .ascii ",ELF_PADDR_OFFSET=0x"
3042 + utoa __PAGE_OFFSET
3043 + .ascii ",VIRT_ENTRY=0x"
3044 + utoa (__PAGE_OFFSET + __PHYSICAL_START + VIRT_ENTRY_OFFSET)
3045 + .ascii ",HYPERCALL_PAGE=0x"
3046 + utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
3047 + .ascii ",FEATURES=writable_page_tables"
3048 + .ascii "|writable_descriptor_tables"
3049 + .ascii "|auto_translated_physmap"
3050 + .ascii "|pae_pgdir_above_4gb"
3051 + .ascii "|supervisor_mode_kernel"
3052 +#ifdef CONFIG_X86_PAE
3053 + .ascii ",PAE=yes[extended-cr3]"
3054 +#else
3055 + .ascii ",PAE=no"
3056 +#endif
3057 + .ascii ",LOADER=generic"
3058 + .byte 0
3059 +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */
3060 +
3061 +
3062 + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux")
3063 + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6")
3064 + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0")
3065 + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, __PAGE_OFFSET)
3066 +#if CONFIG_XEN_COMPAT <= 0x030002
3067 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, __PAGE_OFFSET)
3068 +#else
3069 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, 0)
3070 +#endif
3071 + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, startup_32)
3072 + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page)
3073 + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START)
3074 + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
3075 +#ifdef CONFIG_X86_PAE
3076 + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes")
3077 + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad, _PAGE_PRESENT,_PAGE_PRESENT)
3078 +#else
3079 + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no")
3080 + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, _PAGE_PRESENT,_PAGE_PRESENT)
3081 +#endif
3082 + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic")
3083 + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1)
3084 Index: head-2008-11-25/arch/x86/kernel/init_task-xen.c
3085 ===================================================================
3086 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
3087 +++ head-2008-11-25/arch/x86/kernel/init_task-xen.c 2007-06-12 13:12:48.000000000 +0200
3088 @@ -0,0 +1,51 @@
3089 +#include <linux/mm.h>
3090 +#include <linux/module.h>
3091 +#include <linux/sched.h>
3092 +#include <linux/init.h>
3093 +#include <linux/init_task.h>
3094 +#include <linux/fs.h>
3095 +#include <linux/mqueue.h>
3096 +
3097 +#include <asm/uaccess.h>
3098 +#include <asm/pgtable.h>
3099 +#include <asm/desc.h>
3100 +
3101 +static struct fs_struct init_fs = INIT_FS;
3102 +static struct files_struct init_files = INIT_FILES;
3103 +static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
3104 +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
3105 +
3106 +#define swapper_pg_dir ((pgd_t *)NULL)
3107 +struct mm_struct init_mm = INIT_MM(init_mm);
3108 +#undef swapper_pg_dir
3109 +
3110 +EXPORT_SYMBOL(init_mm);
3111 +
3112 +/*
3113 + * Initial thread structure.
3114 + *
3115 + * We need to make sure that this is THREAD_SIZE aligned due to the
3116 + * way process stacks are handled. This is done by having a special
3117 + * "init_task" linker map entry..
3118 + */
3119 +union thread_union init_thread_union
3120 + __attribute__((__section__(".data.init_task"))) =
3121 + { INIT_THREAD_INFO(init_task) };
3122 +
3123 +/*
3124 + * Initial task structure.
3125 + *
3126 + * All other task structs will be allocated on slabs in fork.c
3127 + */
3128 +struct task_struct init_task = INIT_TASK(init_task);
3129 +
3130 +EXPORT_SYMBOL(init_task);
3131 +
3132 +#ifndef CONFIG_X86_NO_TSS
3133 +/*
3134 + * per-CPU TSS segments. Threads are completely 'soft' on Linux,
3135 + * no more per-task TSS's.
3136 + */
3137 +DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
3138 +#endif
3139 +
3140 Index: head-2008-11-25/arch/x86/kernel/io_apic_32-xen.c
3141 ===================================================================
3142 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
3143 +++ head-2008-11-25/arch/x86/kernel/io_apic_32-xen.c 2008-11-25 12:22:34.000000000 +0100
3144 @@ -0,0 +1,2776 @@
3145 +/*
3146 + * Intel IO-APIC support for multi-Pentium hosts.
3147 + *
3148 + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
3149 + *
3150 + * Many thanks to Stig Venaas for trying out countless experimental
3151 + * patches and reporting/debugging problems patiently!
3152 + *
3153 + * (c) 1999, Multiple IO-APIC support, developed by
3154 + * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
3155 + * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
3156 + * further tested and cleaned up by Zach Brown <zab@redhat.com>
3157 + * and Ingo Molnar <mingo@redhat.com>
3158 + *
3159 + * Fixes
3160 + * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
3161 + * thanks to Eric Gilmore
3162 + * and Rolf G. Tews
3163 + * for testing these extensively
3164 + * Paul Diefenbaugh : Added full ACPI support
3165 + */
3166 +
3167 +#include <linux/mm.h>
3168 +#include <linux/interrupt.h>
3169 +#include <linux/init.h>
3170 +#include <linux/delay.h>
3171 +#include <linux/sched.h>
3172 +#include <linux/smp_lock.h>
3173 +#include <linux/mc146818rtc.h>
3174 +#include <linux/compiler.h>
3175 +#include <linux/acpi.h>
3176 +#include <linux/module.h>
3177 +#include <linux/sysdev.h>
3178 +
3179 +#include <asm/io.h>
3180 +#include <asm/smp.h>
3181 +#include <asm/desc.h>
3182 +#include <asm/timer.h>
3183 +#include <asm/i8259.h>
3184 +#include <asm/nmi.h>
3185 +
3186 +#include <mach_apic.h>
3187 +
3188 +#include "io_ports.h"
3189 +
3190 +#ifdef CONFIG_XEN
3191 +
3192 +#include <xen/interface/xen.h>
3193 +#include <xen/interface/physdev.h>
3194 +#include <xen/evtchn.h>
3195 +
3196 +/* Fake i8259 */
3197 +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
3198 +#define disable_8259A_irq(_irq) ((void)0)
3199 +#define i8259A_irq_pending(_irq) (0)
3200 +
3201 +unsigned long io_apic_irqs;
3202 +
3203 +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
3204 +{
3205 + struct physdev_apic apic_op;
3206 + int ret;
3207 +
3208 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
3209 + apic_op.reg = reg;
3210 + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
3211 + if (ret)
3212 + return ret;
3213 + return apic_op.value;
3214 +}
3215 +
3216 +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
3217 +{
3218 + struct physdev_apic apic_op;
3219 +
3220 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
3221 + apic_op.reg = reg;
3222 + apic_op.value = value;
3223 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
3224 +}
3225 +
3226 +#define io_apic_read(a,r) xen_io_apic_read(a,r)
3227 +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
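
/*
 * With the two defines above, every io_apic_read()/io_apic_write() in the
 * rest of this file goes through the PHYSDEVOP_apic_read/PHYSDEVOP_apic_write
 * hypercalls rather than touching the IO-APIC registers directly, since those
 * are owned by the hypervisor.
 */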
3228 +
3229 +#endif /* CONFIG_XEN */
3230 +
3231 +int (*ioapic_renumber_irq)(int ioapic, int irq);
3232 +atomic_t irq_mis_count;
3233 +
3234 +/* Where, if anywhere, the i8259 is connected in external int mode */
3235 +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
3236 +
3237 +static DEFINE_SPINLOCK(ioapic_lock);
3238 +static DEFINE_SPINLOCK(vector_lock);
3239 +
3240 +int timer_over_8254 __initdata = 1;
3241 +
3242 +/*
3243 + * Is the SiS APIC rmw bug present ?
3244 + * -1 = don't know, 0 = no, 1 = yes
3245 + */
3246 +int sis_apic_bug = -1;
3247 +
3248 +/*
3249 + * # of IRQ routing registers
3250 + */
3251 +int nr_ioapic_registers[MAX_IO_APICS];
3252 +
3253 +int disable_timer_pin_1 __initdata;
3254 +
3255 +/*
3256 + * Rough estimate of how many shared IRQs there are; this can
3257 + * be changed at any time.
3258 + */
3259 +#define MAX_PLUS_SHARED_IRQS NR_IRQS
3260 +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
3261 +
3262 +/*
3263 + * This is performance-critical, we want to do it O(1)
3264 + *
3265 + * the indexing order of this array favors 1:1 mappings
3266 + * between pins and IRQs.
3267 + */
3268 +
3269 +static struct irq_pin_list {
3270 + int apic, pin, next;
3271 +} irq_2_pin[PIN_MAP_SIZE];
3272 +
3273 +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
3274 +#ifdef CONFIG_PCI_MSI
3275 +#define vector_to_irq(vector) \
3276 + (platform_legacy_irq(vector) ? vector : vector_irq[vector])
3277 +#else
3278 +#define vector_to_irq(vector) (vector)
3279 +#endif
3280 +
3281 +/*
3282 + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
3283 + * shared ISA-space IRQs, so we have to support them. We are super
3284 + * fast in the common case, and fast for shared ISA-space IRQs.
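 *
 * Layout illustration (made-up numbers): an IRQ wired to two pins, say
 * (apic 0, pin 3) and (apic 1, pin 11), ends up as the chain
 *	irq_2_pin[irq] = { .apic = 0, .pin = 3,  .next = N }
 *	irq_2_pin[N]   = { .apic = 1, .pin = 11, .next = 0 }
 * where N is an overflow slot (>= NR_IRQS) handed out by add_pin_to_irq()
 * below and .next == 0 terminates the chain.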
3285 + */
3286 +static void add_pin_to_irq(unsigned int irq, int apic, int pin)
3287 +{
3288 + static int first_free_entry = NR_IRQS;
3289 + struct irq_pin_list *entry = irq_2_pin + irq;
3290 +
3291 + while (entry->next)
3292 + entry = irq_2_pin + entry->next;
3293 +
3294 + if (entry->pin != -1) {
3295 + entry->next = first_free_entry;
3296 + entry = irq_2_pin + entry->next;
3297 + if (++first_free_entry >= PIN_MAP_SIZE)
3298 + panic("io_apic.c: whoops");
3299 + }
3300 + entry->apic = apic;
3301 + entry->pin = pin;
3302 +}
3303 +
3304 +#ifdef CONFIG_XEN
3305 +#define clear_IO_APIC() ((void)0)
3306 +#else
3307 +/*
3308 + * Reroute an IRQ to a different pin.
3309 + */
3310 +static void __init replace_pin_at_irq(unsigned int irq,
3311 + int oldapic, int oldpin,
3312 + int newapic, int newpin)
3313 +{
3314 + struct irq_pin_list *entry = irq_2_pin + irq;
3315 +
3316 + while (1) {
3317 + if (entry->apic == oldapic && entry->pin == oldpin) {
3318 + entry->apic = newapic;
3319 + entry->pin = newpin;
3320 + }
3321 + if (!entry->next)
3322 + break;
3323 + entry = irq_2_pin + entry->next;
3324 + }
3325 +}
3326 +
3327 +static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
3328 +{
3329 + struct irq_pin_list *entry = irq_2_pin + irq;
3330 + unsigned int pin, reg;
3331 +
3332 + for (;;) {
3333 + pin = entry->pin;
3334 + if (pin == -1)
3335 + break;
3336 + reg = io_apic_read(entry->apic, 0x10 + pin*2);
3337 + reg &= ~disable;
3338 + reg |= enable;
3339 + io_apic_modify(entry->apic, 0x10 + pin*2, reg);
3340 + if (!entry->next)
3341 + break;
3342 + entry = irq_2_pin + entry->next;
3343 + }
3344 +}
3345 +
3346 +/* mask = 1 */
3347 +static void __mask_IO_APIC_irq (unsigned int irq)
3348 +{
3349 + __modify_IO_APIC_irq(irq, 0x00010000, 0);
3350 +}
3351 +
3352 +/* mask = 0 */
3353 +static void __unmask_IO_APIC_irq (unsigned int irq)
3354 +{
3355 + __modify_IO_APIC_irq(irq, 0, 0x00010000);
3356 +}
3357 +
3358 +/* mask = 1, trigger = 0 */
3359 +static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
3360 +{
3361 + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
3362 +}
3363 +
3364 +/* mask = 0, trigger = 1 */
3365 +static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
3366 +{
3367 + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
3368 +}
3369 +
3370 +static void mask_IO_APIC_irq (unsigned int irq)
3371 +{
3372 + unsigned long flags;
3373 +
3374 + spin_lock_irqsave(&ioapic_lock, flags);
3375 + __mask_IO_APIC_irq(irq);
3376 + spin_unlock_irqrestore(&ioapic_lock, flags);
3377 +}
3378 +
3379 +static void unmask_IO_APIC_irq (unsigned int irq)
3380 +{
3381 + unsigned long flags;
3382 +
3383 + spin_lock_irqsave(&ioapic_lock, flags);
3384 + __unmask_IO_APIC_irq(irq);
3385 + spin_unlock_irqrestore(&ioapic_lock, flags);
3386 +}
3387 +
3388 +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
3389 +{
3390 + struct IO_APIC_route_entry entry;
3391 + unsigned long flags;
3392 +
3393 + /* Check delivery_mode to be sure we're not clearing an SMI pin */
3394 + spin_lock_irqsave(&ioapic_lock, flags);
3395 + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
3396 + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
3397 + spin_unlock_irqrestore(&ioapic_lock, flags);
3398 + if (entry.delivery_mode == dest_SMI)
3399 + return;
3400 +
3401 + /*
3402 + * Disable it in the IO-APIC irq-routing table:
3403 + */
3404 + memset(&entry, 0, sizeof(entry));
3405 + entry.mask = 1;
3406 + spin_lock_irqsave(&ioapic_lock, flags);
3407 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
3408 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
3409 + spin_unlock_irqrestore(&ioapic_lock, flags);
3410 +}
3411 +
3412 +static void clear_IO_APIC (void)
3413 +{
3414 + int apic, pin;
3415 +
3416 + for (apic = 0; apic < nr_ioapics; apic++)
3417 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
3418 + clear_IO_APIC_pin(apic, pin);
3419 +}
3420 +
3421 +#ifdef CONFIG_SMP
3422 +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
3423 +{
3424 + unsigned long flags;
3425 + int pin;
3426 + struct irq_pin_list *entry = irq_2_pin + irq;
3427 + unsigned int apicid_value;
3428 + cpumask_t tmp;
3429 +
3430 + cpus_and(tmp, cpumask, cpu_online_map);
3431 + if (cpus_empty(tmp))
3432 + tmp = TARGET_CPUS;
3433 +
3434 + cpus_and(cpumask, tmp, CPU_MASK_ALL);
3435 +
3436 + apicid_value = cpu_mask_to_apicid(cpumask);
3437 + /* Prepare to do the io_apic_write */
3438 + apicid_value = apicid_value << 24;
3439 + spin_lock_irqsave(&ioapic_lock, flags);
3440 + for (;;) {
3441 + pin = entry->pin;
3442 + if (pin == -1)
3443 + break;
3444 + io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
3445 + if (!entry->next)
3446 + break;
3447 + entry = irq_2_pin + entry->next;
3448 + }
3449 + set_irq_info(irq, cpumask);
3450 + spin_unlock_irqrestore(&ioapic_lock, flags);
3451 +}
3452 +
3453 +#if defined(CONFIG_IRQBALANCE)
3454 +# include <asm/processor.h> /* kernel_thread() */
3455 +# include <linux/kernel_stat.h> /* kstat */
3456 +# include <linux/slab.h> /* kmalloc() */
3457 +# include <linux/timer.h> /* time_after() */
3458 +
3459 +#ifdef CONFIG_BALANCED_IRQ_DEBUG
3460 +# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
3461 +# define Dprintk(x...) do { TDprintk(x); } while (0)
3462 +# else
3463 +# define TDprintk(x...)
3464 +# define Dprintk(x...)
3465 +# endif
3466 +
3467 +#define IRQBALANCE_CHECK_ARCH -999
3468 +#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
3469 +#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
3470 +#define BALANCED_IRQ_MORE_DELTA (HZ/10)
3471 +#define BALANCED_IRQ_LESS_DELTA (HZ)
3472 +
3473 +static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
3474 +static int physical_balance __read_mostly;
3475 +static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
3476 +
3477 +static struct irq_cpu_info {
3478 + unsigned long * last_irq;
3479 + unsigned long * irq_delta;
3480 + unsigned long irq;
3481 +} irq_cpu_data[NR_CPUS];
3482 +
3483 +#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
3484 +#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
3485 +#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
3486 +
3487 +#define IDLE_ENOUGH(cpu,now) \
3488 + (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
3489 +
3490 +#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
3491 +
3492 +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
3493 +
3494 +static cpumask_t balance_irq_affinity[NR_IRQS] = {
3495 + [0 ... NR_IRQS-1] = CPU_MASK_ALL
3496 +};
3497 +
3498 +void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
3499 +{
3500 + balance_irq_affinity[irq] = mask;
3501 +}
3502 +
3503 +static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
3504 + unsigned long now, int direction)
3505 +{
3506 + int search_idle = 1;
3507 + int cpu = curr_cpu;
3508 +
3509 + goto inside;
3510 +
3511 + do {
3512 + if (unlikely(cpu == curr_cpu))
3513 + search_idle = 0;
3514 +inside:
3515 + if (direction == 1) {
3516 + cpu++;
3517 + if (cpu >= NR_CPUS)
3518 + cpu = 0;
3519 + } else {
3520 + cpu--;
3521 + if (cpu == -1)
3522 + cpu = NR_CPUS-1;
3523 + }
3524 + } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
3525 + (search_idle && !IDLE_ENOUGH(cpu,now)));
3526 +
3527 + return cpu;
3528 +}
3529 +
3530 +static inline void balance_irq(int cpu, int irq)
3531 +{
3532 + unsigned long now = jiffies;
3533 + cpumask_t allowed_mask;
3534 + unsigned int new_cpu;
3535 +
3536 + if (irqbalance_disabled)
3537 + return;
3538 +
3539 + cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
3540 + new_cpu = move(cpu, allowed_mask, now, 1);
3541 + if (cpu != new_cpu) {
3542 + set_pending_irq(irq, cpumask_of_cpu(new_cpu));
3543 + }
3544 +}
3545 +
3546 +static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
3547 +{
3548 + int i, j;
3549 + Dprintk("Rotating IRQs among CPUs.\n");
3550 + for_each_online_cpu(i) {
3551 + for (j = 0; j < NR_IRQS; j++) {
3552 + if (!irq_desc[j].action)
3553 + continue;
3554 + /* Is it a significant load ? */
3555 + if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
3556 + useful_load_threshold)
3557 + continue;
3558 + balance_irq(i, j);
3559 + }
3560 + }
3561 + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
3562 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
3563 + return;
3564 +}
3565 +
3566 +static void do_irq_balance(void)
3567 +{
3568 + int i, j;
3569 + unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
3570 + unsigned long move_this_load = 0;
3571 + int max_loaded = 0, min_loaded = 0;
3572 + int load;
3573 + unsigned long useful_load_threshold = balanced_irq_interval + 10;
3574 + int selected_irq;
3575 + int tmp_loaded, first_attempt = 1;
3576 + unsigned long tmp_cpu_irq;
3577 + unsigned long imbalance = 0;
3578 + cpumask_t allowed_mask, target_cpu_mask, tmp;
3579 +
3580 + for_each_possible_cpu(i) {
3581 + int package_index;
3582 + CPU_IRQ(i) = 0;
3583 + if (!cpu_online(i))
3584 + continue;
3585 + package_index = CPU_TO_PACKAGEINDEX(i);
3586 + for (j = 0; j < NR_IRQS; j++) {
3587 + unsigned long value_now, delta;
3588 + /* Is this an active IRQ? */
3589 + if (!irq_desc[j].action)
3590 + continue;
3591 + if ( package_index == i )
3592 + IRQ_DELTA(package_index,j) = 0;
3593 + /* Determine the total count per processor per IRQ */
3594 + value_now = (unsigned long) kstat_cpu(i).irqs[j];
3595 +
3596 + /* Determine the activity per processor per IRQ */
3597 + delta = value_now - LAST_CPU_IRQ(i,j);
3598 +
3599 + /* Update last_cpu_irq[][] for the next time */
3600 + LAST_CPU_IRQ(i,j) = value_now;
3601 +
3602 + /* Ignore IRQs whose rate is less than the clock */
3603 + if (delta < useful_load_threshold)
3604 + continue;
3605 + /* update the load for the processor or package total */
3606 + IRQ_DELTA(package_index,j) += delta;
3607 +
3608 + /* Keep track of the higher numbered sibling as well */
3609 + if (i != package_index)
3610 + CPU_IRQ(i) += delta;
3611 + /*
3612 + * We have sibling A and sibling B in the package
3613 + *
3614 + * cpu_irq[A] = load for cpu A + load for cpu B
3615 + * cpu_irq[B] = load for cpu B
3616 + */
3617 + CPU_IRQ(package_index) += delta;
3618 + }
3619 + }
3620 + /* Find the least loaded processor package */
3621 + for_each_online_cpu(i) {
3622 + if (i != CPU_TO_PACKAGEINDEX(i))
3623 + continue;
3624 + if (min_cpu_irq > CPU_IRQ(i)) {
3625 + min_cpu_irq = CPU_IRQ(i);
3626 + min_loaded = i;
3627 + }
3628 + }
3629 + max_cpu_irq = ULONG_MAX;
3630 +
3631 +tryanothercpu:
3632 + /* Look for heaviest loaded processor.
3633 + * We may come back to get the next heaviest loaded processor.
3634 + * Skip processors with trivial loads.
3635 + */
3636 + tmp_cpu_irq = 0;
3637 + tmp_loaded = -1;
3638 + for_each_online_cpu(i) {
3639 + if (i != CPU_TO_PACKAGEINDEX(i))
3640 + continue;
3641 + if (max_cpu_irq <= CPU_IRQ(i))
3642 + continue;
3643 + if (tmp_cpu_irq < CPU_IRQ(i)) {
3644 + tmp_cpu_irq = CPU_IRQ(i);
3645 + tmp_loaded = i;
3646 + }
3647 + }
3648 +
3649 + if (tmp_loaded == -1) {
3650 +		/* In the case of a small number of heavy interrupt sources,
3651 +		 * some of the cpus end up loaded too much. We use Ingo's original
3652 +		 * approach to rotate them around.
3653 + */
3654 + if (!first_attempt && imbalance >= useful_load_threshold) {
3655 + rotate_irqs_among_cpus(useful_load_threshold);
3656 + return;
3657 + }
3658 + goto not_worth_the_effort;
3659 + }
3660 +
3661 + first_attempt = 0; /* heaviest search */
3662 + max_cpu_irq = tmp_cpu_irq; /* load */
3663 + max_loaded = tmp_loaded; /* processor */
3664 + imbalance = (max_cpu_irq - min_cpu_irq) / 2;
3665 +
3666 + Dprintk("max_loaded cpu = %d\n", max_loaded);
3667 + Dprintk("min_loaded cpu = %d\n", min_loaded);
3668 + Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq);
3669 + Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq);
3670 + Dprintk("load imbalance = %lu\n", imbalance);
3671 +
3672 +	/* If the imbalance is less than approx 10% of the max load, we
3673 +	 * observe diminishing returns, so quit.
3674 + */
3675 + if (imbalance < (max_cpu_irq >> 3)) {
3676 + Dprintk("Imbalance too trivial\n");
3677 + goto not_worth_the_effort;
3678 + }
3679 +
3680 +tryanotherirq:
3681 + /* if we select an IRQ to move that can't go where we want, then
3682 + * see if there is another one to try.
3683 + */
3684 + move_this_load = 0;
3685 + selected_irq = -1;
3686 + for (j = 0; j < NR_IRQS; j++) {
3687 + /* Is this an active IRQ? */
3688 + if (!irq_desc[j].action)
3689 + continue;
3690 + if (imbalance <= IRQ_DELTA(max_loaded,j))
3691 + continue;
3692 + /* Try to find the IRQ that is closest to the imbalance
3693 + * without going over.
3694 + */
3695 + if (move_this_load < IRQ_DELTA(max_loaded,j)) {
3696 + move_this_load = IRQ_DELTA(max_loaded,j);
3697 + selected_irq = j;
3698 + }
3699 + }
3700 + if (selected_irq == -1) {
3701 + goto tryanothercpu;
3702 + }
3703 +
3704 + imbalance = move_this_load;
3705 +
3706 +	/* For the physical_balance case, we accumulated both load
3707 +	 * values in one of the siblings' cpu_irq[],
3708 + * to use the same code for physical and logical processors
3709 + * as much as possible.
3710 + *
3711 + * NOTE: the cpu_irq[] array holds the sum of the load for
3712 + * sibling A and sibling B in the slot for the lowest numbered
3713 + * sibling (A), _AND_ the load for sibling B in the slot for
3714 + * the higher numbered sibling.
3715 + *
3716 + * We seek the least loaded sibling by making the comparison
3717 + * (A+B)/2 vs B
3718 + */
3719 + load = CPU_IRQ(min_loaded) >> 1;
3720 + for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
3721 + if (load > CPU_IRQ(j)) {
3722 + /* This won't change cpu_sibling_map[min_loaded] */
3723 + load = CPU_IRQ(j);
3724 + min_loaded = j;
3725 + }
3726 + }
3727 +
3728 + cpus_and(allowed_mask,
3729 + cpu_online_map,
3730 + balance_irq_affinity[selected_irq]);
3731 + target_cpu_mask = cpumask_of_cpu(min_loaded);
3732 + cpus_and(tmp, target_cpu_mask, allowed_mask);
3733 +
3734 + if (!cpus_empty(tmp)) {
3735 +
3736 + Dprintk("irq = %d moved to cpu = %d\n",
3737 + selected_irq, min_loaded);
3738 + /* mark for change destination */
3739 + set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
3740 +
3741 + /* Since we made a change, come back sooner to
3742 + * check for more variation.
3743 + */
3744 + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
3745 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
3746 + return;
3747 + }
3748 + goto tryanotherirq;
3749 +
3750 +not_worth_the_effort:
3751 + /*
3752 + * if we did not find an IRQ to move, then adjust the time interval
3753 + * upward
3754 + */
3755 + balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
3756 + balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
3757 + Dprintk("IRQ worth rotating not found\n");
3758 + return;
3759 +}
3760 +
3761 +static int balanced_irq(void *unused)
3762 +{
3763 + int i;
3764 + unsigned long prev_balance_time = jiffies;
3765 + long time_remaining = balanced_irq_interval;
3766 +
3767 + daemonize("kirqd");
3768 +
3769 + /* push everything to CPU 0 to give us a starting point. */
3770 + for (i = 0 ; i < NR_IRQS ; i++) {
3771 + irq_desc[i].pending_mask = cpumask_of_cpu(0);
3772 + set_pending_irq(i, cpumask_of_cpu(0));
3773 + }
3774 +
3775 + for ( ; ; ) {
3776 + time_remaining = schedule_timeout_interruptible(time_remaining);
3777 + try_to_freeze();
3778 + if (time_after(jiffies,
3779 + prev_balance_time+balanced_irq_interval)) {
3780 + preempt_disable();
3781 + do_irq_balance();
3782 + prev_balance_time = jiffies;
3783 + time_remaining = balanced_irq_interval;
3784 + preempt_enable();
3785 + }
3786 + }
3787 + return 0;
3788 +}
3789 +
3790 +static int __init balanced_irq_init(void)
3791 +{
3792 + int i;
3793 + struct cpuinfo_x86 *c;
3794 + cpumask_t tmp;
3795 +
3796 + cpus_shift_right(tmp, cpu_online_map, 2);
3797 + c = &boot_cpu_data;
3798 + /* When not overwritten by the command line ask subarchitecture. */
3799 + if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
3800 + irqbalance_disabled = NO_BALANCE_IRQ;
3801 + if (irqbalance_disabled)
3802 + return 0;
3803 +
3804 + /* disable irqbalance completely if there is only one processor online */
3805 + if (num_online_cpus() < 2) {
3806 + irqbalance_disabled = 1;
3807 + return 0;
3808 + }
3809 + /*
3810 + * Enable physical balance only if more than 1 physical processor
3811 + * is present
3812 + */
3813 + if (smp_num_siblings > 1 && !cpus_empty(tmp))
3814 + physical_balance = 1;
3815 +
3816 + for_each_online_cpu(i) {
3817 + irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
3818 + irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
3819 + if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
3820 + printk(KERN_ERR "balanced_irq_init: out of memory");
3821 + goto failed;
3822 + }
3823 + memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
3824 + memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
3825 + }
3826 +
3827 + printk(KERN_INFO "Starting balanced_irq\n");
3828 + if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0)
3829 + return 0;
3830 + else
3831 + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
3832 +failed:
3833 + for_each_possible_cpu(i) {
3834 + kfree(irq_cpu_data[i].irq_delta);
3835 + irq_cpu_data[i].irq_delta = NULL;
3836 + kfree(irq_cpu_data[i].last_irq);
3837 + irq_cpu_data[i].last_irq = NULL;
3838 + }
3839 + return 0;
3840 +}
3841 +
3842 +int __init irqbalance_disable(char *str)
3843 +{
3844 + irqbalance_disabled = 1;
3845 + return 1;
3846 +}
3847 +
3848 +__setup("noirqbalance", irqbalance_disable);
3849 +
3850 +late_initcall(balanced_irq_init);
3851 +#endif /* CONFIG_IRQBALANCE */
3852 +#endif /* CONFIG_SMP */
3853 +#endif
3854 +
3855 +#ifndef CONFIG_SMP
3856 +void fastcall send_IPI_self(int vector)
3857 +{
3858 +#ifndef CONFIG_XEN
3859 + unsigned int cfg;
3860 +
3861 + /*
3862 + * Wait for idle.
3863 + */
3864 + apic_wait_icr_idle();
3865 + cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
3866 + /*
3867 + * Send the IPI. The write to APIC_ICR fires this off.
3868 + */
3869 + apic_write_around(APIC_ICR, cfg);
3870 +#endif
3871 +}
3872 +#endif /* !CONFIG_SMP */
3873 +
3874 +
3875 +/*
3876 + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
3877 + * specific CPU-side IRQs.
3878 + */
3879 +
3880 +#define MAX_PIRQS 8
3881 +static int pirq_entries [MAX_PIRQS];
3882 +static int pirqs_enabled;
3883 +int skip_ioapic_setup;
3884 +
3885 +static int __init ioapic_setup(char *str)
3886 +{
3887 + skip_ioapic_setup = 1;
3888 + return 1;
3889 +}
3890 +
3891 +__setup("noapic", ioapic_setup);
3892 +
3893 +static int __init ioapic_pirq_setup(char *str)
3894 +{
3895 + int i, max;
3896 + int ints[MAX_PIRQS+1];
3897 +
3898 + get_options(str, ARRAY_SIZE(ints), ints);
3899 +
3900 + for (i = 0; i < MAX_PIRQS; i++)
3901 + pirq_entries[i] = -1;
3902 +
3903 + pirqs_enabled = 1;
3904 + apic_printk(APIC_VERBOSE, KERN_INFO
3905 + "PIRQ redirection, working around broken MP-BIOS.\n");
3906 + max = MAX_PIRQS;
3907 + if (ints[0] < MAX_PIRQS)
3908 + max = ints[0];
3909 +
3910 + for (i = 0; i < max; i++) {
3911 + apic_printk(APIC_VERBOSE, KERN_DEBUG
3912 + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
3913 + /*
3914 + * PIRQs are mapped upside down, usually.
3915 + */
3916 + pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
3917 + }
3918 + return 1;
3919 +}
3920 +
3921 +__setup("pirq=", ioapic_pirq_setup);
3922 +
3923 +/*
3924 + * Find the IRQ entry number of a certain pin.
3925 + */
3926 +static int find_irq_entry(int apic, int pin, int type)
3927 +{
3928 + int i;
3929 +
3930 + for (i = 0; i < mp_irq_entries; i++)
3931 + if (mp_irqs[i].mpc_irqtype == type &&
3932 + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
3933 + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
3934 + mp_irqs[i].mpc_dstirq == pin)
3935 + return i;
3936 +
3937 + return -1;
3938 +}
3939 +
3940 +/*
3941 + * Find the pin to which IRQ[irq] (ISA) is connected
3942 + */
3943 +static int __init find_isa_irq_pin(int irq, int type)
3944 +{
3945 + int i;
3946 +
3947 + for (i = 0; i < mp_irq_entries; i++) {
3948 + int lbus = mp_irqs[i].mpc_srcbus;
3949 +
3950 + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
3951 + mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
3952 + mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
3953 + mp_bus_id_to_type[lbus] == MP_BUS_NEC98
3954 + ) &&
3955 + (mp_irqs[i].mpc_irqtype == type) &&
3956 + (mp_irqs[i].mpc_srcbusirq == irq))
3957 +
3958 + return mp_irqs[i].mpc_dstirq;
3959 + }
3960 + return -1;
3961 +}
3962 +
3963 +static int __init find_isa_irq_apic(int irq, int type)
3964 +{
3965 + int i;
3966 +
3967 + for (i = 0; i < mp_irq_entries; i++) {
3968 + int lbus = mp_irqs[i].mpc_srcbus;
3969 +
3970 + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
3971 + mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
3972 + mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
3973 + mp_bus_id_to_type[lbus] == MP_BUS_NEC98
3974 + ) &&
3975 + (mp_irqs[i].mpc_irqtype == type) &&
3976 + (mp_irqs[i].mpc_srcbusirq == irq))
3977 + break;
3978 + }
3979 + if (i < mp_irq_entries) {
3980 + int apic;
3981 + for(apic = 0; apic < nr_ioapics; apic++) {
3982 + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
3983 + return apic;
3984 + }
3985 + }
3986 +
3987 + return -1;
3988 +}
3989 +
3990 +/*
3991 + * Find a specific PCI IRQ entry.
3992 + * Not an __init, possibly needed by modules
3993 + */
3994 +static int pin_2_irq(int idx, int apic, int pin);
3995 +
3996 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
3997 +{
3998 + int apic, i, best_guess = -1;
3999 +
4000 + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
4001 + "slot:%d, pin:%d.\n", bus, slot, pin);
4002 + if (mp_bus_id_to_pci_bus[bus] == -1) {
4003 + printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
4004 + return -1;
4005 + }
4006 + for (i = 0; i < mp_irq_entries; i++) {
4007 + int lbus = mp_irqs[i].mpc_srcbus;
4008 +
4009 + for (apic = 0; apic < nr_ioapics; apic++)
4010 + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
4011 + mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
4012 + break;
4013 +
4014 + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
4015 + !mp_irqs[i].mpc_irqtype &&
4016 + (bus == lbus) &&
4017 + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
4018 + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
4019 +
4020 + if (!(apic || IO_APIC_IRQ(irq)))
4021 + continue;
4022 +
4023 + if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
4024 + return irq;
4025 + /*
4026 + * Use the first all-but-pin matching entry as a
4027 + * best-guess fuzzy result for broken mptables.
4028 + */
4029 + if (best_guess < 0)
4030 + best_guess = irq;
4031 + }
4032 + }
4033 + return best_guess;
4034 +}
4035 +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
4036 +
4037 +/*
4038 + * This function currently is only a helper for the i386 smp boot process, where
4039 + * we need to reprogram the ioredtbls to cater for the cpus which have come online,
4040 + * so the mask in all cases should simply be TARGET_CPUS.
4041 + */
4042 +#ifdef CONFIG_SMP
4043 +#ifndef CONFIG_XEN
4044 +void __init setup_ioapic_dest(void)
4045 +{
4046 + int pin, ioapic, irq, irq_entry;
4047 +
4048 + if (skip_ioapic_setup == 1)
4049 + return;
4050 +
4051 + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
4052 + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4053 + irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4054 + if (irq_entry == -1)
4055 + continue;
4056 + irq = pin_2_irq(irq_entry, ioapic, pin);
4057 + set_ioapic_affinity_irq(irq, TARGET_CPUS);
4058 + }
4059 +
4060 + }
4061 +}
4062 +#endif /* !CONFIG_XEN */
4063 +#endif
4064 +
4065 +/*
4066 + * EISA Edge/Level control register, ELCR
4067 + */
4068 +static int EISA_ELCR(unsigned int irq)
4069 +{
4070 + if (irq < 16) {
4071 + unsigned int port = 0x4d0 + (irq >> 3);
4072 + return (inb(port) >> (irq & 7)) & 1;
4073 + }
4074 + apic_printk(APIC_VERBOSE, KERN_INFO
4075 + "Broken MPtable reports ISA irq %d\n", irq);
4076 + return 0;
4077 +}
4078 +
4079 +/* EISA interrupts are always polarity zero and can be edge or level
4080 + * trigger depending on the ELCR value. If an interrupt is listed as
4081 + * EISA conforming in the MP table, that means its trigger type must
4082 + * be read in from the ELCR */
4083 +
4084 +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
4085 +#define default_EISA_polarity(idx) (0)
4086 +
4087 +/* ISA interrupts are always polarity zero edge triggered,
4088 + * when listed as conforming in the MP table. */
4089 +
4090 +#define default_ISA_trigger(idx) (0)
4091 +#define default_ISA_polarity(idx) (0)
4092 +
4093 +/* PCI interrupts are always polarity one level triggered,
4094 + * when listed as conforming in the MP table. */
4095 +
4096 +#define default_PCI_trigger(idx) (1)
4097 +#define default_PCI_polarity(idx) (1)
4098 +
4099 +/* MCA interrupts are always polarity zero level triggered,
4100 + * when listed as conforming in the MP table. */
4101 +
4102 +#define default_MCA_trigger(idx) (1)
4103 +#define default_MCA_polarity(idx) (0)
4104 +
4105 +/* NEC98 interrupts are always polarity zero edge triggered,
4106 + * when listed as conforming in the MP table. */
4107 +
4108 +#define default_NEC98_trigger(idx) (0)
4109 +#define default_NEC98_polarity(idx) (0)
4110 +
4111 +static int __init MPBIOS_polarity(int idx)
4112 +{
4113 + int bus = mp_irqs[idx].mpc_srcbus;
4114 + int polarity;
4115 +
4116 + /*
4117 + * Determine IRQ line polarity (high active or low active):
4118 + */
4119 + switch (mp_irqs[idx].mpc_irqflag & 3)
4120 + {
4121 + case 0: /* conforms, ie. bus-type dependent polarity */
4122 + {
4123 + switch (mp_bus_id_to_type[bus])
4124 + {
4125 + case MP_BUS_ISA: /* ISA pin */
4126 + {
4127 + polarity = default_ISA_polarity(idx);
4128 + break;
4129 + }
4130 + case MP_BUS_EISA: /* EISA pin */
4131 + {
4132 + polarity = default_EISA_polarity(idx);
4133 + break;
4134 + }
4135 + case MP_BUS_PCI: /* PCI pin */
4136 + {
4137 + polarity = default_PCI_polarity(idx);
4138 + break;
4139 + }
4140 + case MP_BUS_MCA: /* MCA pin */
4141 + {
4142 + polarity = default_MCA_polarity(idx);
4143 + break;
4144 + }
4145 + case MP_BUS_NEC98: /* NEC 98 pin */
4146 + {
4147 + polarity = default_NEC98_polarity(idx);
4148 + break;
4149 + }
4150 + default:
4151 + {
4152 + printk(KERN_WARNING "broken BIOS!!\n");
4153 + polarity = 1;
4154 + break;
4155 + }
4156 + }
4157 + break;
4158 + }
4159 + case 1: /* high active */
4160 + {
4161 + polarity = 0;
4162 + break;
4163 + }
4164 + case 2: /* reserved */
4165 + {
4166 + printk(KERN_WARNING "broken BIOS!!\n");
4167 + polarity = 1;
4168 + break;
4169 + }
4170 + case 3: /* low active */
4171 + {
4172 + polarity = 1;
4173 + break;
4174 + }
4175 + default: /* invalid */
4176 + {
4177 + printk(KERN_WARNING "broken BIOS!!\n");
4178 + polarity = 1;
4179 + break;
4180 + }
4181 + }
4182 + return polarity;
4183 +}
4184 +
4185 +static int MPBIOS_trigger(int idx)
4186 +{
4187 + int bus = mp_irqs[idx].mpc_srcbus;
4188 + int trigger;
4189 +
4190 + /*
4191 + * Determine IRQ trigger mode (edge or level sensitive):
4192 + */
4193 + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
4194 + {
4195 + case 0: /* conforms, ie. bus-type dependent */
4196 + {
4197 + switch (mp_bus_id_to_type[bus])
4198 + {
4199 + case MP_BUS_ISA: /* ISA pin */
4200 + {
4201 + trigger = default_ISA_trigger(idx);
4202 + break;
4203 + }
4204 + case MP_BUS_EISA: /* EISA pin */
4205 + {
4206 + trigger = default_EISA_trigger(idx);
4207 + break;
4208 + }
4209 + case MP_BUS_PCI: /* PCI pin */
4210 + {
4211 + trigger = default_PCI_trigger(idx);
4212 + break;
4213 + }
4214 + case MP_BUS_MCA: /* MCA pin */
4215 + {
4216 + trigger = default_MCA_trigger(idx);
4217 + break;
4218 + }
4219 + case MP_BUS_NEC98: /* NEC 98 pin */
4220 + {
4221 + trigger = default_NEC98_trigger(idx);
4222 + break;
4223 + }
4224 + default:
4225 + {
4226 + printk(KERN_WARNING "broken BIOS!!\n");
4227 + trigger = 1;
4228 + break;
4229 + }
4230 + }
4231 + break;
4232 + }
4233 + case 1: /* edge */
4234 + {
4235 + trigger = 0;
4236 + break;
4237 + }
4238 + case 2: /* reserved */
4239 + {
4240 + printk(KERN_WARNING "broken BIOS!!\n");
4241 + trigger = 1;
4242 + break;
4243 + }
4244 + case 3: /* level */
4245 + {
4246 + trigger = 1;
4247 + break;
4248 + }
4249 + default: /* invalid */
4250 + {
4251 + printk(KERN_WARNING "broken BIOS!!\n");
4252 + trigger = 0;
4253 + break;
4254 + }
4255 + }
4256 + return trigger;
4257 +}
4258 +
4259 +static inline int irq_polarity(int idx)
4260 +{
4261 + return MPBIOS_polarity(idx);
4262 +}
4263 +
4264 +static inline int irq_trigger(int idx)
4265 +{
4266 + return MPBIOS_trigger(idx);
4267 +}
4268 +
4269 +static int pin_2_irq(int idx, int apic, int pin)
4270 +{
4271 + int irq, i;
4272 + int bus = mp_irqs[idx].mpc_srcbus;
4273 +
4274 + /*
4275 + * Debugging check, we are in big trouble if this message pops up!
4276 + */
4277 + if (mp_irqs[idx].mpc_dstirq != pin)
4278 + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
4279 +
4280 + switch (mp_bus_id_to_type[bus])
4281 + {
4282 + case MP_BUS_ISA: /* ISA pin */
4283 + case MP_BUS_EISA:
4284 + case MP_BUS_MCA:
4285 + case MP_BUS_NEC98:
4286 + {
4287 + irq = mp_irqs[idx].mpc_srcbusirq;
4288 + break;
4289 + }
4290 + case MP_BUS_PCI: /* PCI pin */
4291 + {
4292 + /*
4293 + * PCI IRQs are mapped in order
4294 + */
4295 + i = irq = 0;
4296 + while (i < apic)
4297 + irq += nr_ioapic_registers[i++];
4298 + irq += pin;
4299 +
4300 + /*
4301 + * For MPS mode, so far only needed by ES7000 platform
4302 + */
4303 + if (ioapic_renumber_irq)
4304 + irq = ioapic_renumber_irq(apic, irq);
4305 +
4306 + break;
4307 + }
4308 + default:
4309 + {
4310 + printk(KERN_ERR "unknown bus type %d.\n",bus);
4311 + irq = 0;
4312 + break;
4313 + }
4314 + }
4315 +
4316 + /*
4317 + * PCI IRQ command line redirection. Yes, limits are hardcoded.
4318 + */
4319 + if ((pin >= 16) && (pin <= 23)) {
4320 + if (pirq_entries[pin-16] != -1) {
4321 + if (!pirq_entries[pin-16]) {
4322 + apic_printk(APIC_VERBOSE, KERN_DEBUG
4323 + "disabling PIRQ%d\n", pin-16);
4324 + } else {
4325 + irq = pirq_entries[pin-16];
4326 + apic_printk(APIC_VERBOSE, KERN_DEBUG
4327 + "using PIRQ%d -> IRQ %d\n",
4328 + pin-16, irq);
4329 + }
4330 + }
4331 + }
4332 + return irq;
4333 +}
4334 +
4335 +static inline int IO_APIC_irq_trigger(int irq)
4336 +{
4337 + int apic, idx, pin;
4338 +
4339 + for (apic = 0; apic < nr_ioapics; apic++) {
4340 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
4341 + idx = find_irq_entry(apic,pin,mp_INT);
4342 + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
4343 + return irq_trigger(idx);
4344 + }
4345 + }
4346 + /*
4347 + * nonexistent IRQs are edge default
4348 + */
4349 + return 0;
4350 +}
4351 +
4352 +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
4353 +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
4354 +
4355 +int assign_irq_vector(int irq)
4356 +{
4357 + unsigned long flags;
4358 + int vector;
4359 + struct physdev_irq irq_op;
4360 +
4361 + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
4362 +
4363 + if (irq < PIRQ_BASE || irq - PIRQ_BASE > NR_PIRQS)
4364 + return -EINVAL;
4365 +
4366 + spin_lock_irqsave(&vector_lock, flags);
4367 +
4368 + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
4369 + spin_unlock_irqrestore(&vector_lock, flags);
4370 + return IO_APIC_VECTOR(irq);
4371 + }
4372 +
4373 + irq_op.irq = irq;
4374 + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
4375 + spin_unlock_irqrestore(&vector_lock, flags);
4376 + return -ENOSPC;
4377 + }
4378 +
4379 + vector = irq_op.vector;
4380 + vector_irq[vector] = irq;
4381 + if (irq != AUTO_ASSIGN)
4382 + IO_APIC_VECTOR(irq) = vector;
4383 +
4384 + spin_unlock_irqrestore(&vector_lock, flags);
4385 +
4386 + return vector;
4387 +}
4388 +
4389 +#ifndef CONFIG_XEN
4390 +static struct hw_interrupt_type ioapic_level_type;
4391 +static struct hw_interrupt_type ioapic_edge_type;
4392 +
4393 +#define IOAPIC_AUTO -1
4394 +#define IOAPIC_EDGE 0
4395 +#define IOAPIC_LEVEL 1
4396 +
4397 +static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
4398 +{
4399 + unsigned idx;
4400 +
4401 + idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
4402 +
4403 + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
4404 + trigger == IOAPIC_LEVEL)
4405 + irq_desc[idx].chip = &ioapic_level_type;
4406 + else
4407 + irq_desc[idx].chip = &ioapic_edge_type;
4408 + set_intr_gate(vector, interrupt[idx]);
4409 +}
4410 +#else
4411 +#define ioapic_register_intr(irq, vector, trigger) evtchn_register_pirq(irq)
4412 +#endif
4413 +
4414 +static void __init setup_IO_APIC_irqs(void)
4415 +{
4416 + struct IO_APIC_route_entry entry;
4417 + int apic, pin, idx, irq, first_notcon = 1, vector;
4418 + unsigned long flags;
4419 +
4420 + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
4421 +
4422 + for (apic = 0; apic < nr_ioapics; apic++) {
4423 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
4424 +
4425 + /*
4426 + * add it to the IO-APIC irq-routing table:
4427 + */
4428 + memset(&entry,0,sizeof(entry));
4429 +
4430 + entry.delivery_mode = INT_DELIVERY_MODE;
4431 + entry.dest_mode = INT_DEST_MODE;
4432 + entry.mask = 0; /* enable IRQ */
4433 + entry.dest.logical.logical_dest =
4434 + cpu_mask_to_apicid(TARGET_CPUS);
4435 +
4436 + idx = find_irq_entry(apic,pin,mp_INT);
4437 + if (idx == -1) {
4438 + if (first_notcon) {
4439 + apic_printk(APIC_VERBOSE, KERN_DEBUG
4440 + " IO-APIC (apicid-pin) %d-%d",
4441 + mp_ioapics[apic].mpc_apicid,
4442 + pin);
4443 + first_notcon = 0;
4444 + } else
4445 + apic_printk(APIC_VERBOSE, ", %d-%d",
4446 + mp_ioapics[apic].mpc_apicid, pin);
4447 + continue;
4448 + }
4449 +
4450 + entry.trigger = irq_trigger(idx);
4451 + entry.polarity = irq_polarity(idx);
4452 +
4453 + if (irq_trigger(idx)) {
4454 + entry.trigger = 1;
4455 + entry.mask = 1;
4456 + }
4457 +
4458 + irq = pin_2_irq(idx, apic, pin);
4459 + /*
4460 + * skip adding the timer int on secondary nodes, which causes
4461 + * a small but painful rift in the time-space continuum
4462 + */
4463 + if (multi_timer_check(apic, irq))
4464 + continue;
4465 + else
4466 + add_pin_to_irq(irq, apic, pin);
4467 +
4468 + if (/*!apic &&*/ !IO_APIC_IRQ(irq))
4469 + continue;
4470 +
4471 + if (IO_APIC_IRQ(irq)) {
4472 + vector = assign_irq_vector(irq);
4473 + entry.vector = vector;
4474 + ioapic_register_intr(irq, vector, IOAPIC_AUTO);
4475 +
4476 + if (!apic && (irq < 16))
4477 + disable_8259A_irq(irq);
4478 + }
4479 + spin_lock_irqsave(&ioapic_lock, flags);
4480 + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
4481 + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
4482 + set_native_irq_info(irq, TARGET_CPUS);
4483 + spin_unlock_irqrestore(&ioapic_lock, flags);
4484 + }
4485 + }
4486 +
4487 + if (!first_notcon)
4488 + apic_printk(APIC_VERBOSE, " not connected.\n");
4489 +}
4490 +
4491 +/*
4492 + * Set up the 8259A-master output pin:
4493 + */
4494 +#ifndef CONFIG_XEN
4495 +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
4496 +{
4497 + struct IO_APIC_route_entry entry;
4498 + unsigned long flags;
4499 +
4500 + memset(&entry,0,sizeof(entry));
4501 +
4502 + disable_8259A_irq(0);
4503 +
4504 + /* mask LVT0 */
4505 + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
4506 +
4507 + /*
4508 + * We use logical delivery to get the timer IRQ
4509 + * to the first CPU.
4510 + */
4511 + entry.dest_mode = INT_DEST_MODE;
4512 + entry.mask = 0; /* unmask IRQ now */
4513 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
4514 + entry.delivery_mode = INT_DELIVERY_MODE;
4515 + entry.polarity = 0;
4516 + entry.trigger = 0;
4517 + entry.vector = vector;
4518 +
4519 + /*
4520 + * The timer IRQ doesn't have to know that behind the
4521 + * scene we have a 8259A-master in AEOI mode ...
4522 + */
4523 + irq_desc[0].chip = &ioapic_edge_type;
4524 +
4525 + /*
4526 + * Add it to the IO-APIC irq-routing table:
4527 + */
4528 + spin_lock_irqsave(&ioapic_lock, flags);
4529 + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
4530 + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
4531 + spin_unlock_irqrestore(&ioapic_lock, flags);
4532 +
4533 + enable_8259A_irq(0);
4534 +}
4535 +
4536 +static inline void UNEXPECTED_IO_APIC(void)
4537 +{
4538 +}
4539 +
4540 +void __init print_IO_APIC(void)
4541 +{
4542 + int apic, i;
4543 + union IO_APIC_reg_00 reg_00;
4544 + union IO_APIC_reg_01 reg_01;
4545 + union IO_APIC_reg_02 reg_02;
4546 + union IO_APIC_reg_03 reg_03;
4547 + unsigned long flags;
4548 +
4549 + if (apic_verbosity == APIC_QUIET)
4550 + return;
4551 +
4552 + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
4553 + for (i = 0; i < nr_ioapics; i++)
4554 + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
4555 + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
4556 +
4557 + /*
4558 + * We are a bit conservative about what we expect. We have to
4559 + * know about every hardware change ASAP.
4560 + */
4561 + printk(KERN_INFO "testing the IO APIC.......................\n");
4562 +
4563 + for (apic = 0; apic < nr_ioapics; apic++) {
4564 +
4565 + spin_lock_irqsave(&ioapic_lock, flags);
4566 + reg_00.raw = io_apic_read(apic, 0);
4567 + reg_01.raw = io_apic_read(apic, 1);
4568 + if (reg_01.bits.version >= 0x10)
4569 + reg_02.raw = io_apic_read(apic, 2);
4570 + if (reg_01.bits.version >= 0x20)
4571 + reg_03.raw = io_apic_read(apic, 3);
4572 + spin_unlock_irqrestore(&ioapic_lock, flags);
4573 +
4574 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
4575 + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
4576 + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
4577 + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
4578 + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
4579 + if (reg_00.bits.ID >= get_physical_broadcast())
4580 + UNEXPECTED_IO_APIC();
4581 + if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
4582 + UNEXPECTED_IO_APIC();
4583 +
4584 + printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
4585 + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
4586 + if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
4587 + (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
4588 + (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
4589 + (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
4590 + (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
4591 + (reg_01.bits.entries != 0x2E) &&
4592 + (reg_01.bits.entries != 0x3F)
4593 + )
4594 + UNEXPECTED_IO_APIC();
4595 +
4596 + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
4597 + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
4598 + if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
4599 + (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
4600 + (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
4601 + (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
4602 + (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
4603 + )
4604 + UNEXPECTED_IO_APIC();
4605 + if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
4606 + UNEXPECTED_IO_APIC();
4607 +
4608 + /*
4609 + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
4610 + * but the value of reg_02 is read as the previous read register
4611 + * value, so ignore it if reg_02 == reg_01.
4612 + */
4613 + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
4614 + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
4615 + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
4616 + if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
4617 + UNEXPECTED_IO_APIC();
4618 + }
4619 +
4620 + /*
4621 + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
4622 + * or reg_03, but the value of reg_0[23] is read as the previous read
4623 + * register value, so ignore it if reg_03 == reg_0[12].
4624 + */
4625 + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
4626 + reg_03.raw != reg_01.raw) {
4627 + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
4628 + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
4629 + if (reg_03.bits.__reserved_1)
4630 + UNEXPECTED_IO_APIC();
4631 + }
4632 +
4633 + printk(KERN_DEBUG ".... IRQ redirection table:\n");
4634 +
4635 + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
4636 + " Stat Dest Deli Vect: \n");
4637 +
4638 + for (i = 0; i <= reg_01.bits.entries; i++) {
4639 + struct IO_APIC_route_entry entry;
4640 +
4641 + spin_lock_irqsave(&ioapic_lock, flags);
4642 + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
4643 + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
4644 + spin_unlock_irqrestore(&ioapic_lock, flags);
4645 +
4646 + printk(KERN_DEBUG " %02x %03X %02X ",
4647 + i,
4648 + entry.dest.logical.logical_dest,
4649 + entry.dest.physical.physical_dest
4650 + );
4651 +
4652 + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
4653 + entry.mask,
4654 + entry.trigger,
4655 + entry.irr,
4656 + entry.polarity,
4657 + entry.delivery_status,
4658 + entry.dest_mode,
4659 + entry.delivery_mode,
4660 + entry.vector
4661 + );
4662 + }
4663 + }
4664 + if (use_pci_vector())
4665 + printk(KERN_INFO "Using vector-based indexing\n");
4666 + printk(KERN_DEBUG "IRQ to pin mappings:\n");
4667 + for (i = 0; i < NR_IRQS; i++) {
4668 + struct irq_pin_list *entry = irq_2_pin + i;
4669 + if (entry->pin < 0)
4670 + continue;
4671 + if (use_pci_vector() && !platform_legacy_irq(i))
4672 + printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
4673 + else
4674 + printk(KERN_DEBUG "IRQ%d ", i);
4675 + for (;;) {
4676 + printk("-> %d:%d", entry->apic, entry->pin);
4677 + if (!entry->next)
4678 + break;
4679 + entry = irq_2_pin + entry->next;
4680 + }
4681 + printk("\n");
4682 + }
4683 +
4684 + printk(KERN_INFO ".................................... done.\n");
4685 +
4686 + return;
4687 +}
4688 +
4689 +static void print_APIC_bitfield (int base)
4690 +{
4691 + unsigned int v;
4692 + int i, j;
4693 +
4694 + if (apic_verbosity == APIC_QUIET)
4695 + return;
4696 +
4697 + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
4698 + for (i = 0; i < 8; i++) {
4699 + v = apic_read(base + i*0x10);
4700 + for (j = 0; j < 32; j++) {
4701 + if (v & (1<<j))
4702 + printk("1");
4703 + else
4704 + printk("0");
4705 + }
4706 + printk("\n");
4707 + }
4708 +}
4709 +
4710 +void /*__init*/ print_local_APIC(void * dummy)
4711 +{
4712 + unsigned int v, ver, maxlvt;
4713 +
4714 + if (apic_verbosity == APIC_QUIET)
4715 + return;
4716 +
4717 + printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
4718 + smp_processor_id(), hard_smp_processor_id());
4719 + v = apic_read(APIC_ID);
4720 + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
4721 + v = apic_read(APIC_LVR);
4722 + printk(KERN_INFO "... APIC VERSION: %08x\n", v);
4723 + ver = GET_APIC_VERSION(v);
4724 + maxlvt = get_maxlvt();
4725 +
4726 + v = apic_read(APIC_TASKPRI);
4727 + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
4728 +
4729 + if (APIC_INTEGRATED(ver)) { /* !82489DX */
4730 + v = apic_read(APIC_ARBPRI);
4731 + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
4732 + v & APIC_ARBPRI_MASK);
4733 + v = apic_read(APIC_PROCPRI);
4734 + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
4735 + }
4736 +
4737 + v = apic_read(APIC_EOI);
4738 + printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
4739 + v = apic_read(APIC_RRR);
4740 + printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
4741 + v = apic_read(APIC_LDR);
4742 + printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
4743 + v = apic_read(APIC_DFR);
4744 + printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
4745 + v = apic_read(APIC_SPIV);
4746 + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
4747 +
4748 + printk(KERN_DEBUG "... APIC ISR field:\n");
4749 + print_APIC_bitfield(APIC_ISR);
4750 + printk(KERN_DEBUG "... APIC TMR field:\n");
4751 + print_APIC_bitfield(APIC_TMR);
4752 + printk(KERN_DEBUG "... APIC IRR field:\n");
4753 + print_APIC_bitfield(APIC_IRR);
4754 +
4755 + if (APIC_INTEGRATED(ver)) { /* !82489DX */
4756 + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
4757 + apic_write(APIC_ESR, 0);
4758 + v = apic_read(APIC_ESR);
4759 + printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
4760 + }
4761 +
4762 + v = apic_read(APIC_ICR);
4763 + printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
4764 + v = apic_read(APIC_ICR2);
4765 + printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
4766 +
4767 + v = apic_read(APIC_LVTT);
4768 + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
4769 +
4770 + if (maxlvt > 3) { /* PC is LVT#4. */
4771 + v = apic_read(APIC_LVTPC);
4772 + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
4773 + }
4774 + v = apic_read(APIC_LVT0);
4775 + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
4776 + v = apic_read(APIC_LVT1);
4777 + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
4778 +
4779 + if (maxlvt > 2) { /* ERR is LVT#3. */
4780 + v = apic_read(APIC_LVTERR);
4781 + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
4782 + }
4783 +
4784 + v = apic_read(APIC_TMICT);
4785 + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
4786 + v = apic_read(APIC_TMCCT);
4787 + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
4788 + v = apic_read(APIC_TDCR);
4789 + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
4790 + printk("\n");
4791 +}
4792 +
4793 +void print_all_local_APICs (void)
4794 +{
4795 + on_each_cpu(print_local_APIC, NULL, 1, 1);
4796 +}
4797 +
4798 +void /*__init*/ print_PIC(void)
4799 +{
4800 + unsigned int v;
4801 + unsigned long flags;
4802 +
4803 + if (apic_verbosity == APIC_QUIET)
4804 + return;
4805 +
4806 + printk(KERN_DEBUG "\nprinting PIC contents\n");
4807 +
4808 + spin_lock_irqsave(&i8259A_lock, flags);
4809 +
4810 + v = inb(0xa1) << 8 | inb(0x21);
4811 + printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
4812 +
4813 + v = inb(0xa0) << 8 | inb(0x20);
4814 + printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
4815 +
4816 + outb(0x0b,0xa0);
4817 + outb(0x0b,0x20);
4818 + v = inb(0xa0) << 8 | inb(0x20);
4819 + outb(0x0a,0xa0);
4820 + outb(0x0a,0x20);
4821 +
4822 + spin_unlock_irqrestore(&i8259A_lock, flags);
4823 +
4824 + printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
4825 +
4826 + v = inb(0x4d1) << 8 | inb(0x4d0);
4827 + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
4828 +}
4829 +#endif /* !CONFIG_XEN */
4830 +
4831 +static void __init enable_IO_APIC(void)
4832 +{
4833 + union IO_APIC_reg_01 reg_01;
4834 + int i8259_apic, i8259_pin;
4835 + int i, apic;
4836 + unsigned long flags;
4837 +
4838 + for (i = 0; i < PIN_MAP_SIZE; i++) {
4839 + irq_2_pin[i].pin = -1;
4840 + irq_2_pin[i].next = 0;
4841 + }
4842 + if (!pirqs_enabled)
4843 + for (i = 0; i < MAX_PIRQS; i++)
4844 + pirq_entries[i] = -1;
4845 +
4846 + /*
4847 + * The number of IO-APIC IRQ registers (== #pins):
4848 + */
4849 + for (apic = 0; apic < nr_ioapics; apic++) {
4850 + spin_lock_irqsave(&ioapic_lock, flags);
4851 + reg_01.raw = io_apic_read(apic, 1);
4852 + spin_unlock_irqrestore(&ioapic_lock, flags);
4853 + nr_ioapic_registers[apic] = reg_01.bits.entries+1;
4854 + }
4855 + for(apic = 0; apic < nr_ioapics; apic++) {
4856 + int pin;
4857 + /* See if any of the pins is in ExtINT mode */
4858 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
4859 + struct IO_APIC_route_entry entry;
4860 + spin_lock_irqsave(&ioapic_lock, flags);
4861 + *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
4862 + *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
4863 + spin_unlock_irqrestore(&ioapic_lock, flags);
4864 +
4865 +
4866 + /* If the interrupt line is enabled and in ExtInt mode
4867 + * I have found the pin where the i8259 is connected.
4868 + */
4869 + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
4870 + ioapic_i8259.apic = apic;
4871 + ioapic_i8259.pin = pin;
4872 + goto found_i8259;
4873 + }
4874 + }
4875 + }
4876 + found_i8259:
4877 +	/* Look to see if the MP table has reported the ExtINT */
4878 +	/* If we could not find the appropriate pin by looking at the ioapic,
4879 +	 * the i8259 probably is not connected to the ioapic, but give the
4880 +	 * mptable a chance anyway.
4881 + */
4882 + i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
4883 + i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
4884 + /* Trust the MP table if nothing is setup in the hardware */
4885 + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
4886 + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
4887 + ioapic_i8259.pin = i8259_pin;
4888 + ioapic_i8259.apic = i8259_apic;
4889 + }
4890 + /* Complain if the MP table and the hardware disagree */
4891 + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
4892 + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
4893 + {
4894 + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
4895 + }
4896 +
4897 + /*
4898 + * Do not trust the IO-APIC being empty at bootup
4899 + */
4900 + clear_IO_APIC();
4901 +}
4902 +
4903 +/*
4904 + * Not an __init, needed by the reboot code
4905 + */
4906 +void disable_IO_APIC(void)
4907 +{
4908 + /*
4909 + * Clear the IO-APIC before rebooting:
4910 + */
4911 + clear_IO_APIC();
4912 +
4913 +#ifndef CONFIG_XEN
4914 + /*
4915 + * If the i8259 is routed through an IOAPIC
4916 + * Put that IOAPIC in virtual wire mode
4917 + * so legacy interrupts can be delivered.
4918 + */
4919 + if (ioapic_i8259.pin != -1) {
4920 + struct IO_APIC_route_entry entry;
4921 + unsigned long flags;
4922 +
4923 + memset(&entry, 0, sizeof(entry));
4924 + entry.mask = 0; /* Enabled */
4925 + entry.trigger = 0; /* Edge */
4926 + entry.irr = 0;
4927 + entry.polarity = 0; /* High */
4928 + entry.delivery_status = 0;
4929 + entry.dest_mode = 0; /* Physical */
4930 + entry.delivery_mode = dest_ExtINT; /* ExtInt */
4931 + entry.vector = 0;
4932 + entry.dest.physical.physical_dest =
4933 + GET_APIC_ID(apic_read(APIC_ID));
4934 +
4935 + /*
4936 + * Add it to the IO-APIC irq-routing table:
4937 + */
4938 + spin_lock_irqsave(&ioapic_lock, flags);
4939 + io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
4940 + *(((int *)&entry)+1));
4941 + io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
4942 + *(((int *)&entry)+0));
4943 + spin_unlock_irqrestore(&ioapic_lock, flags);
4944 + }
4945 + disconnect_bsp_APIC(ioapic_i8259.pin != -1);
4946 +#endif
4947 +}
4948 +
4949 +/*
4950 + * function to set the IO-APIC physical IDs based on the
4951 + * values stored in the MPC table.
4952 + *
4953 + * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
4954 + */
4955 +
4956 +#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
4957 +static void __init setup_ioapic_ids_from_mpc(void)
4958 +{
4959 + union IO_APIC_reg_00 reg_00;
4960 + physid_mask_t phys_id_present_map;
4961 + int apic;
4962 + int i;
4963 + unsigned char old_id;
4964 + unsigned long flags;
4965 +
4966 + /*
4967 + * Don't check I/O APIC IDs for xAPIC systems. They have
4968 + * no meaning without the serial APIC bus.
4969 + */
4970 + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
4971 + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
4972 + return;
4973 + /*
4974 + * This is broken; anything with a real cpu count has to
4975 + * circumvent this idiocy regardless.
4976 + */
4977 + phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
4978 +
4979 + /*
4980 + * Set the IOAPIC ID to the value stored in the MPC table.
4981 + */
4982 + for (apic = 0; apic < nr_ioapics; apic++) {
4983 +
4984 + /* Read the register 0 value */
4985 + spin_lock_irqsave(&ioapic_lock, flags);
4986 + reg_00.raw = io_apic_read(apic, 0);
4987 + spin_unlock_irqrestore(&ioapic_lock, flags);
4988 +
4989 + old_id = mp_ioapics[apic].mpc_apicid;
4990 +
4991 + if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
4992 + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
4993 + apic, mp_ioapics[apic].mpc_apicid);
4994 + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
4995 + reg_00.bits.ID);
4996 + mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
4997 + }
4998 +
4999 + /*
5000 + * Sanity check, is the ID really free? Every APIC in a
5001 + * system must have a unique ID or we get lots of nice
5002 + * 'stuck on smp_invalidate_needed IPI wait' messages.
5003 + */
5004 + if (check_apicid_used(phys_id_present_map,
5005 + mp_ioapics[apic].mpc_apicid)) {
5006 + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
5007 + apic, mp_ioapics[apic].mpc_apicid);
5008 + for (i = 0; i < get_physical_broadcast(); i++)
5009 + if (!physid_isset(i, phys_id_present_map))
5010 + break;
5011 + if (i >= get_physical_broadcast())
5012 + panic("Max APIC ID exceeded!\n");
5013 + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
5014 + i);
5015 + physid_set(i, phys_id_present_map);
5016 + mp_ioapics[apic].mpc_apicid = i;
5017 + } else {
5018 + physid_mask_t tmp;
5019 + tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
5020 + apic_printk(APIC_VERBOSE, "Setting %d in the "
5021 + "phys_id_present_map\n",
5022 + mp_ioapics[apic].mpc_apicid);
5023 + physids_or(phys_id_present_map, phys_id_present_map, tmp);
5024 + }
5025 +
5026 +
5027 + /*
5028 + * We need to adjust the IRQ routing table
5029 + * if the ID changed.
5030 + */
5031 + if (old_id != mp_ioapics[apic].mpc_apicid)
5032 + for (i = 0; i < mp_irq_entries; i++)
5033 + if (mp_irqs[i].mpc_dstapic == old_id)
5034 + mp_irqs[i].mpc_dstapic
5035 + = mp_ioapics[apic].mpc_apicid;
5036 +
5037 + /*
5038 + * Read the right value from the MPC table and
5039 + * write it into the ID register.
5040 + */
5041 + apic_printk(APIC_VERBOSE, KERN_INFO
5042 + "...changing IO-APIC physical APIC ID to %d ...",
5043 + mp_ioapics[apic].mpc_apicid);
5044 +
5045 + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
5046 + spin_lock_irqsave(&ioapic_lock, flags);
5047 + io_apic_write(apic, 0, reg_00.raw);
5048 + spin_unlock_irqrestore(&ioapic_lock, flags);
5049 +
5050 + /*
5051 + * Sanity check
5052 + */
5053 + spin_lock_irqsave(&ioapic_lock, flags);
5054 + reg_00.raw = io_apic_read(apic, 0);
5055 + spin_unlock_irqrestore(&ioapic_lock, flags);
5056 + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
5057 + printk("could not set ID!\n");
5058 + else
5059 + apic_printk(APIC_VERBOSE, " ok.\n");
5060 + }
5061 +}
5062 +#else
5063 +static void __init setup_ioapic_ids_from_mpc(void) { }
5064 +#endif
5065 +
5066 +#ifndef CONFIG_XEN
5067 +/*
5068 + * There is a nasty bug in some older SMP boards, their mptable lies
5069 + * about the timer IRQ. We do the following to work around the situation:
5070 + *
5071 + * - timer IRQ defaults to IO-APIC IRQ
5072 + * - if this function detects that timer IRQs are defunct, then we fall
5073 + * back to ISA timer IRQs
5074 + */
5075 +static int __init timer_irq_works(void)
5076 +{
5077 + unsigned long t1 = jiffies;
5078 +
5079 + local_irq_enable();
5080 + /* Let ten ticks pass... */
5081 + mdelay((10 * 1000) / HZ);
5082 +
5083 + /*
5084 + * Expect a few ticks at least, to be sure some possible
5085 +	 * glue logic does not lock up after the first one or two
5086 + * ticks in a non-ExtINT mode. Also the local APIC
5087 + * might have cached one ExtINT interrupt. Finally, at
5088 + * least one tick may be lost due to delays.
5089 + */
5090 + if (jiffies - t1 > 4)
5091 + return 1;
5092 +
5093 + return 0;
5094 +}
5095 +
5096 +/*
5097 + * In the SMP+IOAPIC case it might happen that there are an unspecified
5098 + * number of pending IRQ events unhandled. These cases are very rare,
5099 + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
5100 + * better to do it this way as thus we do not have to be aware of
5101 + * 'pending' interrupts in the IRQ path, except at this point.
5102 + */
5103 +/*
5104 + * Edge triggered needs to resend any interrupt
5105 + * that was delayed but this is now handled in the device
5106 + * independent code.
5107 + */
5108 +
5109 +/*
5110 + * Starting up an edge-triggered IO-APIC interrupt is
5111 + * nasty - we need to make sure that we get the edge.
5112 + * If it is already asserted for some reason, we need to
5113 + * return 1 to indicate that it was pending.
5114 + *
5115 + * This is not complete - we should be able to fake
5116 + * an edge even if it isn't on the 8259A...
5117 + */
5118 +static unsigned int startup_edge_ioapic_irq(unsigned int irq)
5119 +{
5120 + int was_pending = 0;
5121 + unsigned long flags;
5122 +
5123 + spin_lock_irqsave(&ioapic_lock, flags);
5124 + if (irq < 16) {
5125 + disable_8259A_irq(irq);
5126 + if (i8259A_irq_pending(irq))
5127 + was_pending = 1;
5128 + }
5129 + __unmask_IO_APIC_irq(irq);
5130 + spin_unlock_irqrestore(&ioapic_lock, flags);
5131 +
5132 + return was_pending;
5133 +}
5134 +
5135 +/*
5136 + * Once we have recorded IRQ_PENDING already, we can mask the
5137 + * interrupt for real. This prevents IRQ storms from unhandled
5138 + * devices.
5139 + */
5140 +static void ack_edge_ioapic_irq(unsigned int irq)
5141 +{
5142 + move_irq(irq);
5143 + if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
5144 + == (IRQ_PENDING | IRQ_DISABLED))
5145 + mask_IO_APIC_irq(irq);
5146 + ack_APIC_irq();
5147 +}
5148 +
5149 +/*
5150 + * Level triggered interrupts can just be masked,
5151 + * and shutting down and starting up the interrupt
5152 + * is the same as enabling and disabling them -- except
5153 + * that startup needs to return a "was pending" value.
5154 + *
5155 + * Level triggered interrupts are special because we
5156 + * do not touch any IO-APIC register while handling
5157 + * them. We ack the APIC in the end-IRQ handler, not
5158 + * in the start-IRQ-handler. Protection against reentrance
5159 + * from the same interrupt is still provided, both by the
5160 + * generic IRQ layer and by the fact that an unacked local
5161 + * APIC does not accept IRQs.
5162 + */
5163 +static unsigned int startup_level_ioapic_irq (unsigned int irq)
5164 +{
5165 + unmask_IO_APIC_irq(irq);
5166 +
5167 + return 0; /* don't check for pending */
5168 +}
5169 +
5170 +static void end_level_ioapic_irq (unsigned int irq)
5171 +{
5172 + unsigned long v;
5173 + int i;
5174 +
5175 + move_irq(irq);
5176 +/*
5177 + * It appears there is an erratum which affects at least version 0x11
5178 + * of I/O APIC (that's the 82093AA and cores integrated into various
5179 + * chipsets). Under certain conditions a level-triggered interrupt is
5180 + * erroneously delivered as edge-triggered one but the respective IRR
5181 + * bit gets set nevertheless. As a result the I/O unit expects an EOI
5182 + * message but it will never arrive and further interrupts are blocked
5183 + * from the source. The exact reason is so far unknown, but the
5184 + * phenomenon was observed when two consecutive interrupt requests
5185 + * from a given source get delivered to the same CPU and the source is
5186 + * temporarily disabled in between.
5187 + *
5188 + * A workaround is to simulate an EOI message manually. We achieve it
5189 + * by setting the trigger mode to edge and then to level when the edge
5190 + * trigger mode gets detected in the TMR of a local APIC for a
5191 + * level-triggered interrupt. We mask the source for the time of the
5192 + * operation to prevent an edge-triggered interrupt escaping meanwhile.
5193 + * The idea is from Manfred Spraul. --macro
5194 + */
5195 + i = IO_APIC_VECTOR(irq);
5196 +
5197 + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
5198 +
5199 + ack_APIC_irq();
5200 +
5201 + if (!(v & (1 << (i & 0x1f)))) {
5202 + atomic_inc(&irq_mis_count);
5203 + spin_lock(&ioapic_lock);
5204 + __mask_and_edge_IO_APIC_irq(irq);
5205 + __unmask_and_level_IO_APIC_irq(irq);
5206 + spin_unlock(&ioapic_lock);
5207 + }
5208 +}
5209 +
5210 +#ifdef CONFIG_PCI_MSI
5211 +static unsigned int startup_edge_ioapic_vector(unsigned int vector)
5212 +{
5213 + int irq = vector_to_irq(vector);
5214 +
5215 + return startup_edge_ioapic_irq(irq);
5216 +}
5217 +
5218 +static void ack_edge_ioapic_vector(unsigned int vector)
5219 +{
5220 + int irq = vector_to_irq(vector);
5221 +
5222 + move_native_irq(vector);
5223 + ack_edge_ioapic_irq(irq);
5224 +}
5225 +
5226 +static unsigned int startup_level_ioapic_vector (unsigned int vector)
5227 +{
5228 + int irq = vector_to_irq(vector);
5229 +
5230 + return startup_level_ioapic_irq (irq);
5231 +}
5232 +
5233 +static void end_level_ioapic_vector (unsigned int vector)
5234 +{
5235 + int irq = vector_to_irq(vector);
5236 +
5237 + move_native_irq(vector);
5238 + end_level_ioapic_irq(irq);
5239 +}
5240 +
5241 +static void mask_IO_APIC_vector (unsigned int vector)
5242 +{
5243 + int irq = vector_to_irq(vector);
5244 +
5245 + mask_IO_APIC_irq(irq);
5246 +}
5247 +
5248 +static void unmask_IO_APIC_vector (unsigned int vector)
5249 +{
5250 + int irq = vector_to_irq(vector);
5251 +
5252 + unmask_IO_APIC_irq(irq);
5253 +}
5254 +
5255 +#ifdef CONFIG_SMP
5256 +static void set_ioapic_affinity_vector (unsigned int vector,
5257 + cpumask_t cpu_mask)
5258 +{
5259 + int irq = vector_to_irq(vector);
5260 +
5261 + set_native_irq_info(vector, cpu_mask);
5262 + set_ioapic_affinity_irq(irq, cpu_mask);
5263 +}
5264 +#endif
5265 +#endif
5266 +
5267 +static int ioapic_retrigger(unsigned int irq)
5268 +{
5269 + send_IPI_self(IO_APIC_VECTOR(irq));
5270 +
5271 + return 1;
5272 +}
5273 +
5274 +/*
5275 + * Level and edge triggered IO-APIC interrupts need different handling,
5276 + * so we use two separate IRQ descriptors. Edge triggered IRQs can be
5277 + * handled with the level-triggered descriptor, but that one has slightly
5278 + * more overhead. Level-triggered interrupts cannot be handled with the
5279 + * edge-triggered handler, without risking IRQ storms and other ugly
5280 + * races.
5281 + */
5282 +static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
5283 + .typename = "IO-APIC-edge",
5284 + .startup = startup_edge_ioapic,
5285 + .shutdown = shutdown_edge_ioapic,
5286 + .enable = enable_edge_ioapic,
5287 + .disable = disable_edge_ioapic,
5288 + .ack = ack_edge_ioapic,
5289 + .end = end_edge_ioapic,
5290 +#ifdef CONFIG_SMP
5291 + .set_affinity = set_ioapic_affinity,
5292 +#endif
5293 + .retrigger = ioapic_retrigger,
5294 +};
5295 +
5296 +static struct hw_interrupt_type ioapic_level_type __read_mostly = {
5297 + .typename = "IO-APIC-level",
5298 + .startup = startup_level_ioapic,
5299 + .shutdown = shutdown_level_ioapic,
5300 + .enable = enable_level_ioapic,
5301 + .disable = disable_level_ioapic,
5302 + .ack = mask_and_ack_level_ioapic,
5303 + .end = end_level_ioapic,
5304 +#ifdef CONFIG_SMP
5305 + .set_affinity = set_ioapic_affinity,
5306 +#endif
5307 + .retrigger = ioapic_retrigger,
5308 +};
5309 +#endif /* !CONFIG_XEN */
5310 +
5311 +static inline void init_IO_APIC_traps(void)
5312 +{
5313 + int irq;
5314 +
5315 + /*
5316 + * NOTE! The local APIC isn't very good at handling
5317 + * multiple interrupts at the same interrupt level.
5318 + * As the interrupt level is determined by taking the
5319 + * vector number and shifting that right by 4, we
5320 + * want to spread these out a bit so that they don't
5321 + * all fall in the same interrupt level.
5322 + *
5323 + * Also, we've got to be careful not to trash gate
5324 + * 0x80, because int 0x80 is hm, kind of importantish. ;)
5325 + */
5326 + for (irq = 0; irq < NR_IRQS ; irq++) {
5327 + int tmp = irq;
5328 + if (use_pci_vector()) {
5329 + if (!platform_legacy_irq(tmp))
5330 + if ((tmp = vector_to_irq(tmp)) == -1)
5331 + continue;
5332 + }
5333 + if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
5334 + /*
5335 + * Hmm.. We don't have an entry for this,
5336 + * so default to an old-fashioned 8259
5337 + * interrupt if we can..
5338 + */
5339 + if (irq < 16)
5340 + make_8259A_irq(irq);
5341 +#ifndef CONFIG_XEN
5342 + else
5343 + /* Strange. Oh, well.. */
5344 + irq_desc[irq].chip = &no_irq_type;
5345 +#endif
5346 + }
5347 + }
5348 +}
5349 +
5350 +#ifndef CONFIG_XEN
5351 +static void enable_lapic_irq (unsigned int irq)
5352 +{
5353 + unsigned long v;
5354 +
5355 + v = apic_read(APIC_LVT0);
5356 + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
5357 +}
5358 +
5359 +static void disable_lapic_irq (unsigned int irq)
5360 +{
5361 + unsigned long v;
5362 +
5363 + v = apic_read(APIC_LVT0);
5364 + apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
5365 +}
5366 +
5367 +static void ack_lapic_irq (unsigned int irq)
5368 +{
5369 + ack_APIC_irq();
5370 +}
5371 +
5372 +static void end_lapic_irq (unsigned int i) { /* nothing */ }
5373 +
5374 +static struct hw_interrupt_type lapic_irq_type __read_mostly = {
5375 + .typename = "local-APIC-edge",
5376 + .startup = NULL, /* startup_irq() not used for IRQ0 */
5377 + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
5378 + .enable = enable_lapic_irq,
5379 + .disable = disable_lapic_irq,
5380 + .ack = ack_lapic_irq,
5381 + .end = end_lapic_irq
5382 +};
5383 +
5384 +static void setup_nmi (void)
5385 +{
5386 + /*
5387 + * Dirty trick to enable the NMI watchdog ...
5388 + * We put the 8259A master into AEOI mode and
5389 + * unmask on all local APICs LVT0 as NMI.
5390 + *
5391 + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
5392 + * is from Maciej W. Rozycki - so we do not have to EOI from
5393 + * the NMI handler or the timer interrupt.
5394 + */
5395 + apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
5396 +
5397 + on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
5398 +
5399 + apic_printk(APIC_VERBOSE, " done.\n");
5400 +}
5401 +
5402 +/*
5403 + * This looks a bit hackish but it's about the only way of sending
5404 + * a few INTA cycles to 8259As and any associated glue logic. ICR does
5405 + * not support the ExtINT mode, unfortunately. We need to send these
5406 + * cycles as some i82489DX-based boards have glue logic that keeps the
5407 + * 8259A interrupt line asserted until INTA. --macro
5408 + */
5409 +static inline void unlock_ExtINT_logic(void)
5410 +{
5411 + int apic, pin, i;
5412 + struct IO_APIC_route_entry entry0, entry1;
5413 + unsigned char save_control, save_freq_select;
5414 + unsigned long flags;
5415 +
5416 + pin = find_isa_irq_pin(8, mp_INT);
5417 + apic = find_isa_irq_apic(8, mp_INT);
5418 + if (pin == -1)
5419 + return;
5420 +
5421 + spin_lock_irqsave(&ioapic_lock, flags);
5422 + *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
5423 + *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
5424 + spin_unlock_irqrestore(&ioapic_lock, flags);
5425 + clear_IO_APIC_pin(apic, pin);
5426 +
5427 + memset(&entry1, 0, sizeof(entry1));
5428 +
5429 + entry1.dest_mode = 0; /* physical delivery */
5430 + entry1.mask = 0; /* unmask IRQ now */
5431 + entry1.dest.physical.physical_dest = hard_smp_processor_id();
5432 + entry1.delivery_mode = dest_ExtINT;
5433 + entry1.polarity = entry0.polarity;
5434 + entry1.trigger = 0;
5435 + entry1.vector = 0;
5436 +
5437 + spin_lock_irqsave(&ioapic_lock, flags);
5438 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
5439 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
5440 + spin_unlock_irqrestore(&ioapic_lock, flags);
5441 +
5442 + save_control = CMOS_READ(RTC_CONTROL);
5443 + save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
5444 + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
5445 + RTC_FREQ_SELECT);
5446 + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
5447 +
5448 + i = 100;
5449 + while (i-- > 0) {
5450 + mdelay(10);
5451 + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
5452 + i -= 10;
5453 + }
5454 +
5455 + CMOS_WRITE(save_control, RTC_CONTROL);
5456 + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
5457 + clear_IO_APIC_pin(apic, pin);
5458 +
5459 + spin_lock_irqsave(&ioapic_lock, flags);
5460 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
5461 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
5462 + spin_unlock_irqrestore(&ioapic_lock, flags);
5463 +}
5464 +
5465 +int timer_uses_ioapic_pin_0;
5466 +
5467 +/*
5468 + * This code may look a bit paranoid, but it's supposed to cooperate with
5469 + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
5470 + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
5471 + * fanatically on his truly buggy board.
5472 + */
5473 +static inline void check_timer(void)
5474 +{
5475 + int apic1, pin1, apic2, pin2;
5476 + int vector;
5477 +
5478 + /*
5479 + * get/set the timer IRQ vector:
5480 + */
5481 + disable_8259A_irq(0);
5482 + vector = assign_irq_vector(0);
5483 + set_intr_gate(vector, interrupt[0]);
5484 +
5485 + /*
5486 + * Subtle, code in do_timer_interrupt() expects an AEOI
5487 + * mode for the 8259A whenever interrupts are routed
5488 + * through I/O APICs. Also IRQ0 has to be enabled in
5489 + * the 8259A which implies the virtual wire has to be
5490 + * disabled in the local APIC.
5491 + */
5492 + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
5493 + init_8259A(1);
5494 + timer_ack = 1;
5495 + if (timer_over_8254 > 0)
5496 + enable_8259A_irq(0);
5497 +
5498 + pin1 = find_isa_irq_pin(0, mp_INT);
5499 + apic1 = find_isa_irq_apic(0, mp_INT);
5500 + pin2 = ioapic_i8259.pin;
5501 + apic2 = ioapic_i8259.apic;
5502 +
5503 + if (pin1 == 0)
5504 + timer_uses_ioapic_pin_0 = 1;
5505 +
5506 + printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
5507 + vector, apic1, pin1, apic2, pin2);
5508 +
5509 + if (pin1 != -1) {
5510 + /*
5511 + * Ok, does IRQ0 through the IOAPIC work?
5512 + */
5513 + unmask_IO_APIC_irq(0);
5514 + if (timer_irq_works()) {
5515 + if (nmi_watchdog == NMI_IO_APIC) {
5516 + disable_8259A_irq(0);
5517 + setup_nmi();
5518 + enable_8259A_irq(0);
5519 + }
5520 + if (disable_timer_pin_1 > 0)
5521 + clear_IO_APIC_pin(0, pin1);
5522 + return;
5523 + }
5524 + clear_IO_APIC_pin(apic1, pin1);
5525 + printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
5526 + "IO-APIC\n");
5527 + }
5528 +
5529 + printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
5530 + if (pin2 != -1) {
5531 + printk("\n..... (found pin %d) ...", pin2);
5532 + /*
5533 + * legacy devices should be connected to IO APIC #0
5534 + */
5535 + setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
5536 + if (timer_irq_works()) {
5537 + printk("works.\n");
5538 + if (pin1 != -1)
5539 + replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
5540 + else
5541 + add_pin_to_irq(0, apic2, pin2);
5542 + if (nmi_watchdog == NMI_IO_APIC) {
5543 + setup_nmi();
5544 + }
5545 + return;
5546 + }
5547 + /*
5548 + * Cleanup, just in case ...
5549 + */
5550 + clear_IO_APIC_pin(apic2, pin2);
5551 + }
5552 + printk(" failed.\n");
5553 +
5554 + if (nmi_watchdog == NMI_IO_APIC) {
5555 + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
5556 + nmi_watchdog = 0;
5557 + }
5558 +
5559 + printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
5560 +
5561 + disable_8259A_irq(0);
5562 + irq_desc[0].chip = &lapic_irq_type;
5563 + apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
5564 + enable_8259A_irq(0);
5565 +
5566 + if (timer_irq_works()) {
5567 + printk(" works.\n");
5568 + return;
5569 + }
5570 + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
5571 + printk(" failed.\n");
5572 +
5573 + printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
5574 +
5575 + timer_ack = 0;
5576 + init_8259A(0);
5577 + make_8259A_irq(0);
5578 + apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
5579 +
5580 + unlock_ExtINT_logic();
5581 +
5582 + if (timer_irq_works()) {
5583 + printk(" works.\n");
5584 + return;
5585 + }
5586 + printk(" failed :(.\n");
5587 + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
5588 + "report. Then try booting with the 'noapic' option");
5589 +}
5590 +#else
5591 +int timer_uses_ioapic_pin_0 = 0;
5592 +#define check_timer() ((void)0)
5593 +#endif
5594 +
5595 +/*
5596 + *
5597 + * IRQ's that are handled by the PIC in the MPS IOAPIC case.
5598 + * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
5599 + * Linux doesn't really care, as it's not actually used
5600 + * for any interrupt handling anyway.
5601 + */
5602 +#define PIC_IRQS (1 << PIC_CASCADE_IR)
5603 +
5604 +void __init setup_IO_APIC(void)
5605 +{
5606 + enable_IO_APIC();
5607 +
5608 + if (acpi_ioapic)
5609 + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
5610 + else
5611 + io_apic_irqs = ~PIC_IRQS;
5612 +
5613 + printk("ENABLING IO-APIC IRQs\n");
5614 +
5615 + /*
5616 + * Set up IO-APIC IRQ routing.
5617 + */
5618 + if (!acpi_ioapic)
5619 + setup_ioapic_ids_from_mpc();
5620 +#ifndef CONFIG_XEN
5621 + sync_Arb_IDs();
5622 +#endif
5623 + setup_IO_APIC_irqs();
5624 + init_IO_APIC_traps();
5625 + check_timer();
5626 + if (!acpi_ioapic)
5627 + print_IO_APIC();
5628 +}
5629 +
5630 +static int __init setup_disable_8254_timer(char *s)
5631 +{
5632 + timer_over_8254 = -1;
5633 + return 1;
5634 +}
5635 +static int __init setup_enable_8254_timer(char *s)
5636 +{
5637 + timer_over_8254 = 2;
5638 + return 1;
5639 +}
5640 +
5641 +__setup("disable_8254_timer", setup_disable_8254_timer);
5642 +__setup("enable_8254_timer", setup_enable_8254_timer);
5643 +
5644 +/*
5645 + * Called after all the initialization is done. If we didn't find any
5646 + * APIC bugs then we can allow the modify fast path
5647 + */
5648 +
5649 +static int __init io_apic_bug_finalize(void)
5650 +{
5651 + if(sis_apic_bug == -1)
5652 + sis_apic_bug = 0;
5653 + if (is_initial_xendomain()) {
5654 + struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
5655 + op.u.platform_quirk.quirk_id = sis_apic_bug ?
5656 + QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL;
5657 + VOID(HYPERVISOR_platform_op(&op));
5658 + }
5659 + return 0;
5660 +}
5661 +
5662 +late_initcall(io_apic_bug_finalize);
5663 +
5664 +struct sysfs_ioapic_data {
5665 + struct sys_device dev;
5666 + struct IO_APIC_route_entry entry[0];
5667 +};
5668 +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
5669 +
5670 +static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
5671 +{
5672 + struct IO_APIC_route_entry *entry;
5673 + struct sysfs_ioapic_data *data;
5674 + unsigned long flags;
5675 + int i;
5676 +
5677 + data = container_of(dev, struct sysfs_ioapic_data, dev);
5678 + entry = data->entry;
5679 + spin_lock_irqsave(&ioapic_lock, flags);
5680 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
5681 + *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
5682 + *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
5683 + }
5684 + spin_unlock_irqrestore(&ioapic_lock, flags);
5685 +
5686 + return 0;
5687 +}
5688 +
5689 +static int ioapic_resume(struct sys_device *dev)
5690 +{
5691 + struct IO_APIC_route_entry *entry;
5692 + struct sysfs_ioapic_data *data;
5693 + unsigned long flags;
5694 + union IO_APIC_reg_00 reg_00;
5695 + int i;
5696 +
5697 + data = container_of(dev, struct sysfs_ioapic_data, dev);
5698 + entry = data->entry;
5699 +
5700 + spin_lock_irqsave(&ioapic_lock, flags);
5701 + reg_00.raw = io_apic_read(dev->id, 0);
5702 + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
5703 + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
5704 + io_apic_write(dev->id, 0, reg_00.raw);
5705 + }
5706 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
5707 + io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
5708 + io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
5709 + }
5710 + spin_unlock_irqrestore(&ioapic_lock, flags);
5711 +
5712 + return 0;
5713 +}
5714 +
5715 +static struct sysdev_class ioapic_sysdev_class = {
5716 + set_kset_name("ioapic"),
5717 +#ifndef CONFIG_XEN
5718 + .suspend = ioapic_suspend,
5719 + .resume = ioapic_resume,
5720 +#endif
5721 +};
5722 +
5723 +static int __init ioapic_init_sysfs(void)
5724 +{
5725 + struct sys_device * dev;
5726 + int i, size, error = 0;
5727 +
5728 + error = sysdev_class_register(&ioapic_sysdev_class);
5729 + if (error)
5730 + return error;
5731 +
5732 + for (i = 0; i < nr_ioapics; i++ ) {
5733 + size = sizeof(struct sys_device) + nr_ioapic_registers[i]
5734 + * sizeof(struct IO_APIC_route_entry);
5735 + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
5736 + if (!mp_ioapic_data[i]) {
5737 + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
5738 + continue;
5739 + }
5740 + memset(mp_ioapic_data[i], 0, size);
5741 + dev = &mp_ioapic_data[i]->dev;
5742 + dev->id = i;
5743 + dev->cls = &ioapic_sysdev_class;
5744 + error = sysdev_register(dev);
5745 + if (error) {
5746 + kfree(mp_ioapic_data[i]);
5747 + mp_ioapic_data[i] = NULL;
5748 + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
5749 + continue;
5750 + }
5751 + }
5752 +
5753 + return 0;
5754 +}
5755 +
5756 +device_initcall(ioapic_init_sysfs);
5757 +
5758 +/* --------------------------------------------------------------------------
5759 + ACPI-based IOAPIC Configuration
5760 + -------------------------------------------------------------------------- */
5761 +
5762 +#ifdef CONFIG_ACPI
5763 +
5764 +int __init io_apic_get_unique_id (int ioapic, int apic_id)
5765 +{
5766 +#ifndef CONFIG_XEN
5767 + union IO_APIC_reg_00 reg_00;
5768 + static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
5769 + physid_mask_t tmp;
5770 + unsigned long flags;
5771 + int i = 0;
5772 +
5773 + /*
5774 + * The P4 platform supports up to 256 APIC IDs on two separate APIC
5775 + * buses (one for LAPICs, one for IOAPICs), where its predecessors only
5776 + * support up to 16 on one shared APIC bus.
5777 + *
5778 + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
5779 + * advantage of new APIC bus architecture.
5780 + */
5781 +
5782 + if (physids_empty(apic_id_map))
5783 + apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
5784 +
5785 + spin_lock_irqsave(&ioapic_lock, flags);
5786 + reg_00.raw = io_apic_read(ioapic, 0);
5787 + spin_unlock_irqrestore(&ioapic_lock, flags);
5788 +
5789 + if (apic_id >= get_physical_broadcast()) {
5790 + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
5791 + "%d\n", ioapic, apic_id, reg_00.bits.ID);
5792 + apic_id = reg_00.bits.ID;
5793 + }
5794 +
5795 + /*
5796 + * Every APIC in a system must have a unique ID or we get lots of nice
5797 + * 'stuck on smp_invalidate_needed IPI wait' messages.
5798 + */
5799 + if (check_apicid_used(apic_id_map, apic_id)) {
5800 +
5801 + for (i = 0; i < get_physical_broadcast(); i++) {
5802 + if (!check_apicid_used(apic_id_map, i))
5803 + break;
5804 + }
5805 +
5806 + if (i == get_physical_broadcast())
5807 + panic("Max apic_id exceeded!\n");
5808 +
5809 + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
5810 + "trying %d\n", ioapic, apic_id, i);
5811 +
5812 + apic_id = i;
5813 + }
5814 +
5815 + tmp = apicid_to_cpu_present(apic_id);
5816 + physids_or(apic_id_map, apic_id_map, tmp);
5817 +
5818 + if (reg_00.bits.ID != apic_id) {
5819 + reg_00.bits.ID = apic_id;
5820 +
5821 + spin_lock_irqsave(&ioapic_lock, flags);
5822 + io_apic_write(ioapic, 0, reg_00.raw);
5823 + reg_00.raw = io_apic_read(ioapic, 0);
5824 + spin_unlock_irqrestore(&ioapic_lock, flags);
5825 +
5826 + /* Sanity check */
5827 + if (reg_00.bits.ID != apic_id) {
5828 + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
5829 + return -1;
5830 + }
5831 + }
5832 +
5833 + apic_printk(APIC_VERBOSE, KERN_INFO
5834 + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
5835 +#endif /* !CONFIG_XEN */
5836 +
5837 + return apic_id;
5838 +}
5839 +
5840 +
5841 +int __init io_apic_get_version (int ioapic)
5842 +{
5843 + union IO_APIC_reg_01 reg_01;
5844 + unsigned long flags;
5845 +
5846 + spin_lock_irqsave(&ioapic_lock, flags);
5847 + reg_01.raw = io_apic_read(ioapic, 1);
5848 + spin_unlock_irqrestore(&ioapic_lock, flags);
5849 +
5850 + return reg_01.bits.version;
5851 +}
5852 +
5853 +
5854 +int __init io_apic_get_redir_entries (int ioapic)
5855 +{
5856 + union IO_APIC_reg_01 reg_01;
5857 + unsigned long flags;
5858 +
5859 + spin_lock_irqsave(&ioapic_lock, flags);
5860 + reg_01.raw = io_apic_read(ioapic, 1);
5861 + spin_unlock_irqrestore(&ioapic_lock, flags);
5862 +
5863 + return reg_01.bits.entries;
5864 +}
5865 +
5866 +
5867 +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
5868 +{
5869 + struct IO_APIC_route_entry entry;
5870 + unsigned long flags;
5871 +
5872 + if (!IO_APIC_IRQ(irq)) {
5873 + printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
5874 + ioapic);
5875 + return -EINVAL;
5876 + }
5877 +
5878 + /*
5879 + * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
5880 + * Note that we mask (disable) IRQs now -- these get enabled when the
5881 + * corresponding device driver registers for this IRQ.
5882 + */
5883 +
5884 + memset(&entry,0,sizeof(entry));
5885 +
5886 + entry.delivery_mode = INT_DELIVERY_MODE;
5887 + entry.dest_mode = INT_DEST_MODE;
5888 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
5889 + entry.trigger = edge_level;
5890 + entry.polarity = active_high_low;
5891 + entry.mask = 1;
5892 +
5893 + /*
5894 + * IRQs < 16 are already in the irq_2_pin[] map
5895 + */
5896 + if (irq >= 16)
5897 + add_pin_to_irq(irq, ioapic, pin);
5898 +
5899 + entry.vector = assign_irq_vector(irq);
5900 +
5901 + apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
5902 + "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
5903 + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
5904 + edge_level, active_high_low);
5905 +
5906 + ioapic_register_intr(irq, entry.vector, edge_level);
5907 +
5908 + if (!ioapic && (irq < 16))
5909 + disable_8259A_irq(irq);
5910 +
5911 + spin_lock_irqsave(&ioapic_lock, flags);
5912 + io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
5913 + io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
5914 + set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
5915 + spin_unlock_irqrestore(&ioapic_lock, flags);
5916 +
5917 + return 0;
5918 +}
5919 +
5920 +#endif /* CONFIG_ACPI */
5921 Index: head-2008-11-25/arch/x86/kernel/ioport_32-xen.c
5922 ===================================================================
5923 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
5924 +++ head-2008-11-25/arch/x86/kernel/ioport_32-xen.c 2008-01-28 12:24:19.000000000 +0100
5925 @@ -0,0 +1,123 @@
5926 +/*
5927 + * linux/arch/i386/kernel/ioport.c
5928 + *
5929 + * This contains the io-permission bitmap code - written by obz, with changes
5930 + * by Linus.
5931 + */
5932 +
5933 +#include <linux/sched.h>
5934 +#include <linux/kernel.h>
5935 +#include <linux/capability.h>
5936 +#include <linux/errno.h>
5937 +#include <linux/types.h>
5938 +#include <linux/ioport.h>
5939 +#include <linux/smp.h>
5940 +#include <linux/smp_lock.h>
5941 +#include <linux/stddef.h>
5942 +#include <linux/slab.h>
5943 +#include <linux/thread_info.h>
5944 +#include <xen/interface/physdev.h>
5945 +
5946 +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
5947 +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
5948 +{
5949 + unsigned long mask;
5950 + unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
5951 + unsigned int low_index = base & (BITS_PER_LONG-1);
5952 + int length = low_index + extent;
5953 +
5954 + if (low_index != 0) {
5955 + mask = (~0UL << low_index);
5956 + if (length < BITS_PER_LONG)
5957 + mask &= ~(~0UL << length);
5958 + if (new_value)
5959 + *bitmap_base++ |= mask;
5960 + else
5961 + *bitmap_base++ &= ~mask;
5962 + length -= BITS_PER_LONG;
5963 + }
5964 +
5965 + mask = (new_value ? ~0UL : 0UL);
5966 + while (length >= BITS_PER_LONG) {
5967 + *bitmap_base++ = mask;
5968 + length -= BITS_PER_LONG;
5969 + }
5970 +
5971 + if (length > 0) {
5972 + mask = ~(~0UL << length);
5973 + if (new_value)
5974 + *bitmap_base++ |= mask;
5975 + else
5976 + *bitmap_base++ &= ~mask;
5977 + }
5978 +}
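
set_bitmap() above updates an arbitrary run of bits that may start and end mid-word: a partial leading word, any number of full words, then a partial trailing word. A stand-alone user-space rendition of the same masking logic, shown only to make the boundary handling easy to experiment with (the names here are illustrative, not kernel API):

#include <stdio.h>
#include <limits.h>

enum { BITS_PER_LONG = sizeof(unsigned long) * CHAR_BIT };

/* Same structure as the kernel helper: leading partial word, full words,
 * trailing partial word. */
static void set_bits(unsigned long *bitmap, unsigned int base,
                     unsigned int extent, int new_value)
{
        unsigned long mask;
        unsigned long *p = bitmap + (base / BITS_PER_LONG);
        unsigned int low = base & (BITS_PER_LONG - 1);
        int length = low + extent;

        if (low != 0) {
                mask = ~0UL << low;
                if (length < BITS_PER_LONG)
                        mask &= ~(~0UL << length);
                if (new_value)
                        *p++ |= mask;
                else
                        *p++ &= ~mask;
                length -= BITS_PER_LONG;
        }
        mask = new_value ? ~0UL : 0UL;
        while (length >= BITS_PER_LONG) {
                *p++ = mask;
                length -= BITS_PER_LONG;
        }
        if (length > 0) {
                mask = ~(~0UL << length);
                if (new_value)
                        *p |= mask;
                else
                        *p &= ~mask;
        }
}

int main(void)
{
        unsigned long map[4] = { 0 };

        set_bits(map, 60, 10, 1);       /* a run that straddles a word boundary */
        printf("%#lx %#lx %#lx\n", map[0], map[1], map[2]);
        return 0;
}
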
5979 +
5980 +
5981 +/*
5982 + * this changes the io permissions bitmap in the current task.
5983 + */
5984 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
5985 +{
5986 + struct thread_struct * t = &current->thread;
5987 + unsigned long *bitmap;
5988 + struct physdev_set_iobitmap set_iobitmap;
5989 +
5990 + if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
5991 + return -EINVAL;
5992 + if (turn_on && !capable(CAP_SYS_RAWIO))
5993 + return -EPERM;
5994 +
5995 + /*
5996 + * If it's the first ioperm() call in this thread's lifetime, set the
5997 + * IO bitmap up. ioperm() is much less timing critical than clone(),
5998 + * this is why we delay this operation until now:
5999 + */
6000 + if (!t->io_bitmap_ptr) {
6001 + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
6002 + if (!bitmap)
6003 + return -ENOMEM;
6004 +
6005 + memset(bitmap, 0xff, IO_BITMAP_BYTES);
6006 + t->io_bitmap_ptr = bitmap;
6007 + set_thread_flag(TIF_IO_BITMAP);
6008 +
6009 + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
6010 + set_iobitmap.nr_ports = IO_BITMAP_BITS;
6011 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
6012 + &set_iobitmap));
6013 + }
6014 +
6015 + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
6016 +
6017 + return 0;
6018 +}
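
On the user-space side this path is reached through ioperm(2): the bitmap is allocated lazily on the first call, handed to the hypervisor once, and individual port ranges are then cleared (access allowed) or set (access trapped) by set_bitmap(). A minimal caller sketch, assuming glibc's <sys/io.h> wrapper and a process holding CAP_SYS_RAWIO; port 0x378 is only an illustrative choice:

/* Request access to three legacy parallel-port registers and poke one. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/io.h>

int main(void)
{
        if (ioperm(0x378, 3, 1)) {      /* ports 0x378..0x37a, turn_on = 1 */
                perror("ioperm");
                return EXIT_FAILURE;
        }
        outb(0x00, 0x378);              /* now permitted by the IO bitmap */
        ioperm(0x378, 3, 0);            /* drop access again */
        return 0;
}
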
6019 +
6020 +/*
6021 + * sys_iopl has to be used when you want to access the IO ports
6022 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
6023 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
6024 + *
6025 + * Here we just change the eflags value on the stack: we allow
6026 + * only the super-user to do it. This depends on the stack-layout
6027 + * on system-call entry - see also fork() and the signal handling
6028 + * code.
6029 + */
6030 +
6031 +asmlinkage long sys_iopl(unsigned long unused)
6032 +{
6033 + volatile struct pt_regs * regs = (struct pt_regs *) &unused;
6034 + unsigned int level = regs->ebx;
6035 + struct thread_struct *t = &current->thread;
6036 + unsigned int old = (t->iopl >> 12) & 3;
6037 +
6038 + if (level > 3)
6039 + return -EINVAL;
6040 + /* Trying to gain more privileges? */
6041 + if (level > old) {
6042 + if (!capable(CAP_SYS_RAWIO))
6043 + return -EPERM;
6044 + }
6045 + t->iopl = level << 12;
6046 + set_iopl_mask(t->iopl);
6047 + return 0;
6048 +}
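
Note that in this Xen port the requested level is recorded in the thread struct and applied through set_iopl_mask() rather than by patching eflags on the kernel stack. From user space the interface is the usual iopl(2); a hedged sketch, assuming glibc's <sys/io.h> wrapper and root privileges:

/* Raise the I/O privilege level so in/out work on any port (use with care:
 * level 3 also allows user space to disable interrupts on native kernels). */
#include <stdio.h>
#include <sys/io.h>

int main(void)
{
        if (iopl(3)) {                  /* level 3 = unrestricted port access */
                perror("iopl");
                return 1;
        }
        /* ... port I/O anywhere in 0x0000-0xffff is now permitted ... */
        iopl(0);
        return 0;
}
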
6049 Index: head-2008-11-25/arch/x86/kernel/irq_32-xen.c
6050 ===================================================================
6051 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
6052 +++ head-2008-11-25/arch/x86/kernel/irq_32-xen.c 2008-10-29 09:55:56.000000000 +0100
6053 @@ -0,0 +1,324 @@
6054 +/*
6055 + * linux/arch/i386/kernel/irq.c
6056 + *
6057 + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
6058 + *
6059 + * This file contains the lowest level x86-specific interrupt
6060 + * entry, irq-stacks and irq statistics code. All the remaining
6061 + * irq logic is done by the generic kernel/irq/ code and
6062 + * by the x86-specific irq controller code. (e.g. i8259.c and
6063 + * io_apic.c.)
6064 + */
6065 +
6066 +#include <asm/uaccess.h>
6067 +#include <linux/module.h>
6068 +#include <linux/seq_file.h>
6069 +#include <linux/interrupt.h>
6070 +#include <linux/kernel_stat.h>
6071 +#include <linux/notifier.h>
6072 +#include <linux/cpu.h>
6073 +#include <linux/delay.h>
6074 +
6075 +DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
6076 +EXPORT_PER_CPU_SYMBOL(irq_stat);
6077 +
6078 +#ifndef CONFIG_X86_LOCAL_APIC
6079 +/*
6080 + * 'what should we do if we get a hw irq event on an illegal vector'.
6081 + * Each architecture has to answer this itself.
6082 + */
6083 +void ack_bad_irq(unsigned int irq)
6084 +{
6085 + printk("unexpected IRQ trap at vector %02x\n", irq);
6086 +}
6087 +#endif
6088 +
6089 +#ifdef CONFIG_4KSTACKS
6090 +/*
6091 + * per-CPU IRQ handling contexts (thread information and stack)
6092 + */
6093 +union irq_ctx {
6094 + struct thread_info tinfo;
6095 + u32 stack[THREAD_SIZE/sizeof(u32)];
6096 +};
6097 +
6098 +static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
6099 +static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
6100 +#endif
6101 +
6102 +/*
6103 + * do_IRQ handles all normal device IRQs (the special
6104 + * SMP cross-CPU interrupts have their own specific
6105 + * handlers).
6106 + */
6107 +fastcall unsigned int do_IRQ(struct pt_regs *regs)
6108 +{
6109 + /* high bit used in ret_from_ code */
6110 + int irq = ~regs->orig_eax;
6111 +#ifdef CONFIG_4KSTACKS
6112 + union irq_ctx *curctx, *irqctx;
6113 + u32 *isp;
6114 +#endif
6115 +
6116 + if (unlikely((unsigned)irq >= NR_IRQS)) {
6117 + printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
6118 + __FUNCTION__, irq);
6119 + BUG();
6120 + }
6121 +
6122 + /*irq_enter();*/
6123 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
6124 + /* Debugging check for stack overflow: is there less than 1KB free? */
6125 + {
6126 + long esp;
6127 +
6128 + __asm__ __volatile__("andl %%esp,%0" :
6129 + "=r" (esp) : "0" (THREAD_SIZE - 1));
6130 + if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
6131 + printk("do_IRQ: stack overflow: %ld\n",
6132 + esp - sizeof(struct thread_info));
6133 + dump_stack();
6134 + }
6135 + }
6136 +#endif
6137 +
6138 +#ifdef CONFIG_4KSTACKS
6139 +
6140 + curctx = (union irq_ctx *) current_thread_info();
6141 + irqctx = hardirq_ctx[smp_processor_id()];
6142 +
6143 + /*
6144 + * this is where we switch to the IRQ stack. However, if we are
6145 + * already using the IRQ stack (because we interrupted a hardirq
6146 + * handler) we can't do that and just have to keep using the
6147 + * current stack (which is the irq stack already after all)
6148 + */
6149 + if (curctx != irqctx) {
6150 + int arg1, arg2, ebx;
6151 +
6152 + /* build the stack frame on the IRQ stack */
6153 + isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
6154 + irqctx->tinfo.task = curctx->tinfo.task;
6155 + irqctx->tinfo.previous_esp = current_stack_pointer;
6156 +
6157 + /*
6158 + * Copy the softirq bits in preempt_count so that the
6159 + * softirq checks work in the hardirq context.
6160 + */
6161 + irqctx->tinfo.preempt_count =
6162 + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
6163 + (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
6164 +
6165 + asm volatile(
6166 + " xchgl %%ebx,%%esp \n"
6167 + " call __do_IRQ \n"
6168 + " movl %%ebx,%%esp \n"
6169 + : "=a" (arg1), "=d" (arg2), "=b" (ebx)
6170 + : "0" (irq), "1" (regs), "2" (isp)
6171 + : "memory", "cc", "ecx"
6172 + );
6173 + } else
6174 +#endif
6175 + __do_IRQ(irq, regs);
6176 +
6177 + /*irq_exit();*/
6178 +
6179 + return 1;
6180 +}
6181 +
6182 +#ifdef CONFIG_4KSTACKS
6183 +
6184 +/*
6185 + * These should really be __section__(".bss.page_aligned") as well, but
6186 + * gcc 3.0 and earlier don't handle that correctly.
6187 + */
6188 +static char softirq_stack[NR_CPUS * THREAD_SIZE]
6189 + __attribute__((__aligned__(THREAD_SIZE)));
6190 +
6191 +static char hardirq_stack[NR_CPUS * THREAD_SIZE]
6192 + __attribute__((__aligned__(THREAD_SIZE)));
6193 +
6194 +/*
6195 + * allocate per-cpu stacks for hardirq and for softirq processing
6196 + */
6197 +void irq_ctx_init(int cpu)
6198 +{
6199 + union irq_ctx *irqctx;
6200 +
6201 + if (hardirq_ctx[cpu])
6202 + return;
6203 +
6204 + irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
6205 + irqctx->tinfo.task = NULL;
6206 + irqctx->tinfo.exec_domain = NULL;
6207 + irqctx->tinfo.cpu = cpu;
6208 + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
6209 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
6210 +
6211 + hardirq_ctx[cpu] = irqctx;
6212 +
6213 + irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
6214 + irqctx->tinfo.task = NULL;
6215 + irqctx->tinfo.exec_domain = NULL;
6216 + irqctx->tinfo.cpu = cpu;
6217 + irqctx->tinfo.preempt_count = 0;
6218 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
6219 +
6220 + softirq_ctx[cpu] = irqctx;
6221 +
6222 + printk("CPU %u irqstacks, hard=%p soft=%p\n",
6223 + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
6224 +}
6225 +
6226 +void irq_ctx_exit(int cpu)
6227 +{
6228 + hardirq_ctx[cpu] = NULL;
6229 +}
6230 +
6231 +extern asmlinkage void __do_softirq(void);
6232 +
6233 +asmlinkage void do_softirq(void)
6234 +{
6235 + unsigned long flags;
6236 + struct thread_info *curctx;
6237 + union irq_ctx *irqctx;
6238 + u32 *isp;
6239 +
6240 + if (in_interrupt())
6241 + return;
6242 +
6243 + local_irq_save(flags);
6244 +
6245 + if (local_softirq_pending()) {
6246 + curctx = current_thread_info();
6247 + irqctx = softirq_ctx[smp_processor_id()];
6248 + irqctx->tinfo.task = curctx->task;
6249 + irqctx->tinfo.previous_esp = current_stack_pointer;
6250 +
6251 + /* build the stack frame on the softirq stack */
6252 + isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
6253 +
6254 + asm volatile(
6255 + " xchgl %%ebx,%%esp \n"
6256 + " call __do_softirq \n"
6257 + " movl %%ebx,%%esp \n"
6258 + : "=b"(isp)
6259 + : "0"(isp)
6260 + : "memory", "cc", "edx", "ecx", "eax"
6261 + );
6262 + /*
6263 + * Shouldn't happen, we returned above if in_interrupt():
6264 + */
6265 + WARN_ON_ONCE(softirq_count());
6266 + }
6267 +
6268 + local_irq_restore(flags);
6269 +}
6270 +
6271 +EXPORT_SYMBOL(do_softirq);
6272 +#endif
6273 +
6274 +/*
6275 + * Interrupt statistics:
6276 + */
6277 +
6278 +atomic_t irq_err_count;
6279 +
6280 +/*
6281 + * /proc/interrupts printing:
6282 + */
6283 +
6284 +int show_interrupts(struct seq_file *p, void *v)
6285 +{
6286 + int i = *(loff_t *) v, j;
6287 + struct irqaction * action;
6288 + unsigned long flags;
6289 +
6290 + if (i == 0) {
6291 + seq_printf(p, " ");
6292 + for_each_online_cpu(j)
6293 + seq_printf(p, "CPU%-8d",j);
6294 + seq_putc(p, '\n');
6295 + }
6296 +
6297 + if (i < NR_IRQS) {
6298 + spin_lock_irqsave(&irq_desc[i].lock, flags);
6299 + action = irq_desc[i].action;
6300 + if (!action)
6301 + goto skip;
6302 + seq_printf(p, "%3d: ",i);
6303 +#ifndef CONFIG_SMP
6304 + seq_printf(p, "%10u ", kstat_irqs(i));
6305 +#else
6306 + for_each_online_cpu(j)
6307 + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
6308 +#endif
6309 + seq_printf(p, " %14s", irq_desc[i].chip->typename);
6310 + seq_printf(p, " %s", action->name);
6311 +
6312 + for (action=action->next; action; action = action->next)
6313 + seq_printf(p, ", %s", action->name);
6314 +
6315 + seq_putc(p, '\n');
6316 +skip:
6317 + spin_unlock_irqrestore(&irq_desc[i].lock, flags);
6318 + } else if (i == NR_IRQS) {
6319 + seq_printf(p, "NMI: ");
6320 + for_each_online_cpu(j)
6321 + seq_printf(p, "%10u ", nmi_count(j));
6322 + seq_putc(p, '\n');
6323 +#ifdef CONFIG_X86_LOCAL_APIC
6324 + seq_printf(p, "LOC: ");
6325 + for_each_online_cpu(j)
6326 + seq_printf(p, "%10u ",
6327 + per_cpu(irq_stat,j).apic_timer_irqs);
6328 + seq_putc(p, '\n');
6329 +#endif
6330 + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
6331 +#if defined(CONFIG_X86_IO_APIC)
6332 + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
6333 +#endif
6334 + }
6335 + return 0;
6336 +}
6337 +
6338 +#ifdef CONFIG_HOTPLUG_CPU
6339 +
6340 +void fixup_irqs(cpumask_t map)
6341 +{
6342 + unsigned int irq;
6343 + static int warned;
6344 +
6345 + for (irq = 0; irq < NR_IRQS; irq++) {
6346 + cpumask_t mask;
6347 + if (irq == 2)
6348 + continue;
6349 +
6350 + cpus_and(mask, irq_desc[irq].affinity, map);
6351 + if (any_online_cpu(mask) == NR_CPUS) {
6352 + /*printk("Breaking affinity for irq %i\n", irq);*/
6353 + mask = map;
6354 + }
6355 + if (irq_desc[irq].chip->set_affinity)
6356 + irq_desc[irq].chip->set_affinity(irq, mask);
6357 + else if (irq_desc[irq].action && !(warned++))
6358 + printk("Cannot set affinity for irq %i\n", irq);
6359 + }
6360 +
6361 +#if 0
6362 + barrier();
6363 + /* Ingo Molnar says: "after the IO-APIC masks have been redirected
6364 + [note the nop - the interrupt-enable boundary on x86 is two
6365 + instructions from sti] - to flush out pending hardirqs and
6366 + IPIs. After this point nothing is supposed to reach this CPU." */
6367 + __asm__ __volatile__("sti; nop; cli");
6368 + barrier();
6369 +#else
6370 + /* That doesn't seem sufficient. Give it 1ms. */
6371 + local_irq_enable();
6372 + mdelay(1);
6373 + local_irq_disable();
6374 +#endif
6375 +}
6376 +#endif
6377 +
6378 Index: head-2008-11-25/arch/x86/kernel/ldt_32-xen.c
6379 ===================================================================
6380 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
6381 +++ head-2008-11-25/arch/x86/kernel/ldt_32-xen.c 2007-06-12 13:12:48.000000000 +0200
6382 @@ -0,0 +1,270 @@
6383 +/*
6384 + * linux/kernel/ldt.c
6385 + *
6386 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
6387 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
6388 + */
6389 +
6390 +#include <linux/errno.h>
6391 +#include <linux/sched.h>
6392 +#include <linux/string.h>
6393 +#include <linux/mm.h>
6394 +#include <linux/smp.h>
6395 +#include <linux/smp_lock.h>
6396 +#include <linux/vmalloc.h>
6397 +#include <linux/slab.h>
6398 +
6399 +#include <asm/uaccess.h>
6400 +#include <asm/system.h>
6401 +#include <asm/ldt.h>
6402 +#include <asm/desc.h>
6403 +#include <asm/mmu_context.h>
6404 +
6405 +#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
6406 +static void flush_ldt(void *null)
6407 +{
6408 + if (current->active_mm)
6409 + load_LDT(&current->active_mm->context);
6410 +}
6411 +#endif
6412 +
6413 +static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
6414 +{
6415 + void *oldldt;
6416 + void *newldt;
6417 + int oldsize;
6418 +
6419 + if (mincount <= pc->size)
6420 + return 0;
6421 + oldsize = pc->size;
6422 + mincount = (mincount+511)&(~511);
6423 + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
6424 + newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
6425 + else
6426 + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
6427 +
6428 + if (!newldt)
6429 + return -ENOMEM;
6430 +
6431 + if (oldsize)
6432 + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
6433 + oldldt = pc->ldt;
6434 + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
6435 + pc->ldt = newldt;
6436 + wmb();
6437 + pc->size = mincount;
6438 + wmb();
6439 +
6440 + if (reload) {
6441 +#ifdef CONFIG_SMP
6442 + cpumask_t mask;
6443 + preempt_disable();
6444 +#endif
6445 + make_pages_readonly(
6446 + pc->ldt,
6447 + (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
6448 + XENFEAT_writable_descriptor_tables);
6449 + load_LDT(pc);
6450 +#ifdef CONFIG_SMP
6451 + mask = cpumask_of_cpu(smp_processor_id());
6452 + if (!cpus_equal(current->mm->cpu_vm_mask, mask))
6453 + smp_call_function(flush_ldt, NULL, 1, 1);
6454 + preempt_enable();
6455 +#endif
6456 + }
6457 + if (oldsize) {
6458 + make_pages_writable(
6459 + oldldt,
6460 + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
6461 + XENFEAT_writable_descriptor_tables);
6462 + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
6463 + vfree(oldldt);
6464 + else
6465 + kfree(oldldt);
6466 + }
6467 + return 0;
6468 +}
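
alloc_ldt() grows the table in chunks of 512 entries via the (mincount+511)&(~511) rounding; at 8 bytes per descriptor that is exactly one 4 KiB page per chunk, which keeps the make_pages_readonly()/make_pages_writable() calls page-aligned. A tiny illustration of the arithmetic only (LDT_ENTRY_SIZE taken as 8, as on x86):

#include <stdio.h>

#define LDT_ENTRY_SIZE 8        /* assumed: 8-byte x86 descriptors */

int main(void)
{
        int wanted[] = { 1, 512, 513, 2000 };

        for (int i = 0; i < 4; i++) {
                int rounded = (wanted[i] + 511) & ~511;
                printf("want %4d -> alloc %4d entries (%5d bytes)\n",
                       wanted[i], rounded, rounded * LDT_ENTRY_SIZE);
        }
        return 0;
}
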
6469 +
6470 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
6471 +{
6472 + int err = alloc_ldt(new, old->size, 0);
6473 + if (err < 0)
6474 + return err;
6475 + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
6476 + make_pages_readonly(
6477 + new->ldt,
6478 + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
6479 + XENFEAT_writable_descriptor_tables);
6480 + return 0;
6481 +}
6482 +
6483 +/*
6484 + * we do not have to muck with descriptors here, that is
6485 + * done in switch_mm() as needed.
6486 + */
6487 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
6488 +{
6489 + struct mm_struct * old_mm;
6490 + int retval = 0;
6491 +
6492 + init_MUTEX(&mm->context.sem);
6493 + mm->context.size = 0;
6494 + mm->context.has_foreign_mappings = 0;
6495 + old_mm = current->mm;
6496 + if (old_mm && old_mm->context.size > 0) {
6497 + down(&old_mm->context.sem);
6498 + retval = copy_ldt(&mm->context, &old_mm->context);
6499 + up(&old_mm->context.sem);
6500 + }
6501 + return retval;
6502 +}
6503 +
6504 +/*
6505 + * No need to lock the MM as we are the last user
6506 + */
6507 +void destroy_context(struct mm_struct *mm)
6508 +{
6509 + if (mm->context.size) {
6510 + if (mm == current->active_mm)
6511 + clear_LDT();
6512 + make_pages_writable(
6513 + mm->context.ldt,
6514 + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
6515 + XENFEAT_writable_descriptor_tables);
6516 + if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
6517 + vfree(mm->context.ldt);
6518 + else
6519 + kfree(mm->context.ldt);
6520 + mm->context.size = 0;
6521 + }
6522 +}
6523 +
6524 +static int read_ldt(void __user * ptr, unsigned long bytecount)
6525 +{
6526 + int err;
6527 + unsigned long size;
6528 + struct mm_struct * mm = current->mm;
6529 +
6530 + if (!mm->context.size)
6531 + return 0;
6532 + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
6533 + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
6534 +
6535 + down(&mm->context.sem);
6536 + size = mm->context.size*LDT_ENTRY_SIZE;
6537 + if (size > bytecount)
6538 + size = bytecount;
6539 +
6540 + err = 0;
6541 + if (copy_to_user(ptr, mm->context.ldt, size))
6542 + err = -EFAULT;
6543 + up(&mm->context.sem);
6544 + if (err < 0)
6545 + goto error_return;
6546 + if (size != bytecount) {
6547 + /* zero-fill the rest */
6548 + if (clear_user(ptr+size, bytecount-size) != 0) {
6549 + err = -EFAULT;
6550 + goto error_return;
6551 + }
6552 + }
6553 + return bytecount;
6554 +error_return:
6555 + return err;
6556 +}
6557 +
6558 +static int read_default_ldt(void __user * ptr, unsigned long bytecount)
6559 +{
6560 + int err;
6561 + unsigned long size;
6562 + void *address;
6563 +
6564 + err = 0;
6565 + address = &default_ldt[0];
6566 + size = 5*sizeof(struct desc_struct);
6567 + if (size > bytecount)
6568 + size = bytecount;
6569 +
6570 + err = size;
6571 + if (copy_to_user(ptr, address, size))
6572 + err = -EFAULT;
6573 +
6574 + return err;
6575 +}
6576 +
6577 +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
6578 +{
6579 + struct mm_struct * mm = current->mm;
6580 + __u32 entry_1, entry_2;
6581 + int error;
6582 + struct user_desc ldt_info;
6583 +
6584 + error = -EINVAL;
6585 + if (bytecount != sizeof(ldt_info))
6586 + goto out;
6587 + error = -EFAULT;
6588 + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
6589 + goto out;
6590 +
6591 + error = -EINVAL;
6592 + if (ldt_info.entry_number >= LDT_ENTRIES)
6593 + goto out;
6594 + if (ldt_info.contents == 3) {
6595 + if (oldmode)
6596 + goto out;
6597 + if (ldt_info.seg_not_present == 0)
6598 + goto out;
6599 + }
6600 +
6601 + down(&mm->context.sem);
6602 + if (ldt_info.entry_number >= mm->context.size) {
6603 + error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
6604 + if (error < 0)
6605 + goto out_unlock;
6606 + }
6607 +
6608 + /* Allow LDTs to be cleared by the user. */
6609 + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
6610 + if (oldmode || LDT_empty(&ldt_info)) {
6611 + entry_1 = 0;
6612 + entry_2 = 0;
6613 + goto install;
6614 + }
6615 + }
6616 +
6617 + entry_1 = LDT_entry_a(&ldt_info);
6618 + entry_2 = LDT_entry_b(&ldt_info);
6619 + if (oldmode)
6620 + entry_2 &= ~(1 << 20);
6621 +
6622 + /* Install the new entry ... */
6623 +install:
6624 + error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
6625 + entry_1, entry_2);
6626 +
6627 +out_unlock:
6628 + up(&mm->context.sem);
6629 +out:
6630 + return error;
6631 +}
6632 +
6633 +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
6634 +{
6635 + int ret = -ENOSYS;
6636 +
6637 + switch (func) {
6638 + case 0:
6639 + ret = read_ldt(ptr, bytecount);
6640 + break;
6641 + case 1:
6642 + ret = write_ldt(ptr, bytecount, 1);
6643 + break;
6644 + case 2:
6645 + ret = read_default_ldt(ptr, bytecount);
6646 + break;
6647 + case 0x11:
6648 + ret = write_ldt(ptr, bytecount, 0);
6649 + break;
6650 + }
6651 + return ret;
6652 +}
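
sys_modify_ldt() above dispatches on func: 0 reads the current LDT, 1 and 0x11 install an entry (0x11 being the newer variant that keeps the 'useable' bit), and 2 returns the default LDT. A hedged user-space sketch of installing one 32-bit data segment, assuming <asm/ldt.h> provides struct user_desc as on mainline x86; the base address is purely illustrative:

#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <asm/ldt.h>

int main(void)
{
        struct user_desc d;

        memset(&d, 0, sizeof(d));
        d.entry_number   = 0;
        d.base_addr      = 0x10000000;  /* illustrative base */
        d.limit          = 0xfffff;
        d.seg_32bit      = 1;
        d.limit_in_pages = 1;
        d.useable        = 1;

        /* func 0x11: write one LDT entry, honouring the 'useable' bit. */
        if (syscall(SYS_modify_ldt, 0x11, &d, sizeof(d)) != 0) {
                perror("modify_ldt");
                return 1;
        }
        /* Selector for this entry would be (0 << 3) | 0x7 (LDT, RPL 3). */
        printf("LDT entry %u installed\n", d.entry_number);
        return 0;
}
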
6653 Index: head-2008-11-25/arch/x86/kernel/microcode-xen.c
6654 ===================================================================
6655 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
6656 +++ head-2008-11-25/arch/x86/kernel/microcode-xen.c 2007-06-12 13:12:48.000000000 +0200
6657 @@ -0,0 +1,144 @@
6658 +/*
6659 + * Intel CPU Microcode Update Driver for Linux
6660 + *
6661 + * Copyright (C) 2000-2004 Tigran Aivazian
6662 + *
6663 + * This driver allows upgrading microcode on Intel processors
6664 + * belonging to the IA-32 family - PentiumPro, Pentium II,
6665 + * Pentium III, Xeon, Pentium 4, etc.
6666 + *
6667 + * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
6668 + * Order Number 245472 or free download from:
6669 + *
6670 + * http://developer.intel.com/design/pentium4/manuals/245472.htm
6671 + *
6672 + * For more information, go to http://www.urbanmyth.org/microcode
6673 + *
6674 + * This program is free software; you can redistribute it and/or
6675 + * modify it under the terms of the GNU General Public License
6676 + * as published by the Free Software Foundation; either version
6677 + * 2 of the License, or (at your option) any later version.
6678 + */
6679 +
6680 +//#define DEBUG /* pr_debug */
6681 +#include <linux/capability.h>
6682 +#include <linux/kernel.h>
6683 +#include <linux/init.h>
6684 +#include <linux/sched.h>
6685 +#include <linux/cpumask.h>
6686 +#include <linux/module.h>
6687 +#include <linux/slab.h>
6688 +#include <linux/vmalloc.h>
6689 +#include <linux/miscdevice.h>
6690 +#include <linux/spinlock.h>
6691 +#include <linux/mm.h>
6692 +#include <linux/mutex.h>
6693 +#include <linux/syscalls.h>
6694 +
6695 +#include <asm/msr.h>
6696 +#include <asm/uaccess.h>
6697 +#include <asm/processor.h>
6698 +
6699 +MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
6700 +MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
6701 +MODULE_LICENSE("GPL");
6702 +
6703 +static int verbose;
6704 +module_param(verbose, int, 0644);
6705 +
6706 +#define MICROCODE_VERSION "1.14a-xen"
6707 +
6708 +#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
6709 +#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
6710 +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
6711 +
6712 +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
6713 +static DEFINE_MUTEX(microcode_mutex);
6714 +
6715 +static int microcode_open (struct inode *unused1, struct file *unused2)
6716 +{
6717 + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
6718 +}
6719 +
6720 +
6721 +static int do_microcode_update (const void __user *ubuf, size_t len)
6722 +{
6723 + int err;
6724 + void *kbuf;
6725 +
6726 + kbuf = vmalloc(len);
6727 + if (!kbuf)
6728 + return -ENOMEM;
6729 +
6730 + if (copy_from_user(kbuf, ubuf, len) == 0) {
6731 + struct xen_platform_op op;
6732 +
6733 + op.cmd = XENPF_microcode_update;
6734 + set_xen_guest_handle(op.u.microcode.data, kbuf);
6735 + op.u.microcode.length = len;
6736 + err = HYPERVISOR_platform_op(&op);
6737 + } else
6738 + err = -EFAULT;
6739 +
6740 + vfree(kbuf);
6741 +
6742 + return err;
6743 +}
6744 +
6745 +static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
6746 +{
6747 + ssize_t ret;
6748 +
6749 + if (len < MC_HEADER_SIZE) {
6750 + printk(KERN_ERR "microcode: not enough data\n");
6751 + return -EINVAL;
6752 + }
6753 +
6754 + mutex_lock(&microcode_mutex);
6755 +
6756 + ret = do_microcode_update(buf, len);
6757 + if (!ret)
6758 + ret = (ssize_t)len;
6759 +
6760 + mutex_unlock(&microcode_mutex);
6761 +
6762 + return ret;
6763 +}
6764 +
6765 +static struct file_operations microcode_fops = {
6766 + .owner = THIS_MODULE,
6767 + .write = microcode_write,
6768 + .open = microcode_open,
6769 +};
6770 +
6771 +static struct miscdevice microcode_dev = {
6772 + .minor = MICROCODE_MINOR,
6773 + .name = "microcode",
6774 + .fops = &microcode_fops,
6775 +};
6776 +
6777 +static int __init microcode_init (void)
6778 +{
6779 + int error;
6780 +
6781 + error = misc_register(&microcode_dev);
6782 + if (error) {
6783 + printk(KERN_ERR
6784 + "microcode: can't misc_register on minor=%d\n",
6785 + MICROCODE_MINOR);
6786 + return error;
6787 + }
6788 +
6789 + printk(KERN_INFO
6790 + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
6791 + return 0;
6792 +}
6793 +
6794 +static void __exit microcode_exit (void)
6795 +{
6796 + misc_deregister(&microcode_dev);
6797 +}
6798 +
6799 +module_init(microcode_init)
6800 +module_exit(microcode_exit)
6801 +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
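
The driver above simply forwards whatever user space writes (after a minimum-size check against MC_HEADER_SIZE) to the hypervisor via XENPF_microcode_update, so updating microcode from dom0 userland amounts to one write() on the misc device. A hedged sketch, assuming the conventional /dev/cpu/microcode node named in the source and that "microcode.dat" is a hypothetical, suitably formatted update image (its exact format is whatever the hypervisor accepts and is outside this example's scope):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        FILE *f = fopen("microcode.dat", "rb");   /* hypothetical image */
        if (!f) { perror("microcode.dat"); return 1; }

        fseek(f, 0, SEEK_END);
        long len = ftell(f);
        rewind(f);

        char *buf = malloc(len);
        if (!buf || fread(buf, 1, len, f) != (size_t)len) {
                perror("read");
                return 1;
        }
        fclose(f);

        int fd = open("/dev/cpu/microcode", O_WRONLY);
        if (fd < 0) { perror("/dev/cpu/microcode"); return 1; }
        if (write(fd, buf, len) != len)             /* one write, whole image */
                perror("write");
        close(fd);
        free(buf);
        return 0;
}
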
6802 Index: head-2008-11-25/arch/x86/kernel/mpparse_32-xen.c
6803 ===================================================================
6804 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
6805 +++ head-2008-11-25/arch/x86/kernel/mpparse_32-xen.c 2007-06-12 13:12:48.000000000 +0200
6806 @@ -0,0 +1,1185 @@
6807 +/*
6808 + * Intel Multiprocessor Specification 1.1 and 1.4
6809 + * compliant MP-table parsing routines.
6810 + *
6811 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
6812 + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6813 + *
6814 + * Fixes
6815 + * Erich Boleyn : MP v1.4 and additional changes.
6816 + * Alan Cox : Added EBDA scanning
6817 + * Ingo Molnar : various cleanups and rewrites
6818 + * Maciej W. Rozycki: Bits for default MP configurations
6819 + * Paul Diefenbaugh: Added full ACPI support
6820 + */
6821 +
6822 +#include <linux/mm.h>
6823 +#include <linux/init.h>
6824 +#include <linux/acpi.h>
6825 +#include <linux/delay.h>
6826 +#include <linux/bootmem.h>
6827 +#include <linux/smp_lock.h>
6828 +#include <linux/kernel_stat.h>
6829 +#include <linux/mc146818rtc.h>
6830 +#include <linux/bitops.h>
6831 +
6832 +#include <asm/smp.h>
6833 +#include <asm/acpi.h>
6834 +#include <asm/mtrr.h>
6835 +#include <asm/mpspec.h>
6836 +#include <asm/io_apic.h>
6837 +
6838 +#include <mach_apic.h>
6839 +#include <mach_mpparse.h>
6840 +#include <bios_ebda.h>
6841 +
6842 +/* Have we found an MP table */
6843 +int smp_found_config;
6844 +unsigned int __initdata maxcpus = NR_CPUS;
6845 +
6846 +/*
6847 + * Various Linux-internal data structures created from the
6848 + * MP-table.
6849 + */
6850 +int apic_version [MAX_APICS];
6851 +int mp_bus_id_to_type [MAX_MP_BUSSES];
6852 +int mp_bus_id_to_node [MAX_MP_BUSSES];
6853 +int mp_bus_id_to_local [MAX_MP_BUSSES];
6854 +int quad_local_to_mp_bus_id [NR_CPUS/4][4];
6855 +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
6856 +static int mp_current_pci_id;
6857 +
6858 +/* I/O APIC entries */
6859 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
6860 +
6861 +/* # of MP IRQ source entries */
6862 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6863 +
6864 +/* MP IRQ source entries */
6865 +int mp_irq_entries;
6866 +
6867 +int nr_ioapics;
6868 +
6869 +int pic_mode;
6870 +unsigned long mp_lapic_addr;
6871 +
6872 +unsigned int def_to_bigsmp = 0;
6873 +
6874 +/* Processor that is doing the boot up */
6875 +unsigned int boot_cpu_physical_apicid = -1U;
6876 +/* Internal processor count */
6877 +static unsigned int __devinitdata num_processors;
6878 +
6879 +/* Bitmask of physically existing CPUs */
6880 +physid_mask_t phys_cpu_present_map;
6881 +
6882 +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
6883 +
6884 +/*
6885 + * Intel MP BIOS table parsing routines:
6886 + */
6887 +
6888 +
6889 +/*
6890 + * Checksum an MP configuration block.
6891 + */
6892 +
6893 +static int __init mpf_checksum(unsigned char *mp, int len)
6894 +{
6895 + int sum = 0;
6896 +
6897 + while (len--)
6898 + sum += *mp++;
6899 +
6900 + return sum & 0xFF;
6901 +}
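
mpf_checksum() returns the byte sum modulo 256, so a well-formed MP structure (whose checksum byte the BIOS chooses to cancel the rest) yields 0 and anything non-zero is rejected. A small, self-contained illustration with a hypothetical four-byte record; the payload bytes are made up for the example:

#include <stdio.h>

static int byte_sum(const unsigned char *p, int len)
{
        int sum = 0;

        while (len--)
                sum += *p++;
        return sum & 0xFF;
}

int main(void)
{
        /* three payload bytes plus one checksum slot (hypothetical record) */
        unsigned char table[4] = { 0x5f, 0x4d, 0x50, 0x00 };

        table[3] = (unsigned char)(0x100 - byte_sum(table, 3));  /* fix up */
        printf("sum over all bytes = %#x (0 means valid)\n",
               byte_sum(table, sizeof(table)));
        return 0;
}
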
6902 +
6903 +/*
6904 + * Have to match translation table entries to main table entries by counter
6905 + * hence the mpc_record variable .... can't see a less disgusting way of
6906 + * doing this ....
6907 + */
6908 +
6909 +static int mpc_record;
6910 +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;
6911 +
6912 +#ifndef CONFIG_XEN
6913 +static void __devinit MP_processor_info (struct mpc_config_processor *m)
6914 +{
6915 + int ver, apicid;
6916 + physid_mask_t phys_cpu;
6917 +
6918 + if (!(m->mpc_cpuflag & CPU_ENABLED))
6919 + return;
6920 +
6921 + apicid = mpc_apic_id(m, translation_table[mpc_record]);
6922 +
6923 + if (m->mpc_featureflag&(1<<0))
6924 + Dprintk(" Floating point unit present.\n");
6925 + if (m->mpc_featureflag&(1<<7))
6926 + Dprintk(" Machine Exception supported.\n");
6927 + if (m->mpc_featureflag&(1<<8))
6928 + Dprintk(" 64 bit compare & exchange supported.\n");
6929 + if (m->mpc_featureflag&(1<<9))
6930 + Dprintk(" Internal APIC present.\n");
6931 + if (m->mpc_featureflag&(1<<11))
6932 + Dprintk(" SEP present.\n");
6933 + if (m->mpc_featureflag&(1<<12))
6934 + Dprintk(" MTRR present.\n");
6935 + if (m->mpc_featureflag&(1<<13))
6936 + Dprintk(" PGE present.\n");
6937 + if (m->mpc_featureflag&(1<<14))
6938 + Dprintk(" MCA present.\n");
6939 + if (m->mpc_featureflag&(1<<15))
6940 + Dprintk(" CMOV present.\n");
6941 + if (m->mpc_featureflag&(1<<16))
6942 + Dprintk(" PAT present.\n");
6943 + if (m->mpc_featureflag&(1<<17))
6944 + Dprintk(" PSE present.\n");
6945 + if (m->mpc_featureflag&(1<<18))
6946 + Dprintk(" PSN present.\n");
6947 + if (m->mpc_featureflag&(1<<19))
6948 + Dprintk(" Cache Line Flush Instruction present.\n");
6949 + /* 20 Reserved */
6950 + if (m->mpc_featureflag&(1<<21))
6951 + Dprintk(" Debug Trace and EMON Store present.\n");
6952 + if (m->mpc_featureflag&(1<<22))
6953 + Dprintk(" ACPI Thermal Throttle Registers present.\n");
6954 + if (m->mpc_featureflag&(1<<23))
6955 + Dprintk(" MMX present.\n");
6956 + if (m->mpc_featureflag&(1<<24))
6957 + Dprintk(" FXSR present.\n");
6958 + if (m->mpc_featureflag&(1<<25))
6959 + Dprintk(" XMM present.\n");
6960 + if (m->mpc_featureflag&(1<<26))
6961 + Dprintk(" Willamette New Instructions present.\n");
6962 + if (m->mpc_featureflag&(1<<27))
6963 + Dprintk(" Self Snoop present.\n");
6964 + if (m->mpc_featureflag&(1<<28))
6965 + Dprintk(" HT present.\n");
6966 + if (m->mpc_featureflag&(1<<29))
6967 + Dprintk(" Thermal Monitor present.\n");
6968 + /* 30, 31 Reserved */
6969 +
6970 +
6971 + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
6972 + Dprintk(" Bootup CPU\n");
6973 + boot_cpu_physical_apicid = m->mpc_apicid;
6974 + }
6975 +
6976 + ver = m->mpc_apicver;
6977 +
6978 + /*
6979 + * Validate version
6980 + */
6981 + if (ver == 0x0) {
6982 + printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
6983 + "fixing up to 0x10. (tell your hw vendor)\n",
6984 + m->mpc_apicid);
6985 + ver = 0x10;
6986 + }
6987 + apic_version[m->mpc_apicid] = ver;
6988 +
6989 + phys_cpu = apicid_to_cpu_present(apicid);
6990 + physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
6991 +
6992 + if (num_processors >= NR_CPUS) {
6993 + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
6994 + " Processor ignored.\n", NR_CPUS);
6995 + return;
6996 + }
6997 +
6998 + if (num_processors >= maxcpus) {
6999 + printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
7000 + " Processor ignored.\n", maxcpus);
7001 + return;
7002 + }
7003 +
7004 + cpu_set(num_processors, cpu_possible_map);
7005 + num_processors++;
7006 +
7007 + /*
7008 + * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
7009 + * but we need to work out other dependencies like SMP_SUSPEND etc
7010 + * before this can be done without some confusion.
7011 + * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
7012 + * - Ashok Raj <ashok.raj@intel.com>
7013 + */
7014 + if (num_processors > 8) {
7015 + switch (boot_cpu_data.x86_vendor) {
7016 + case X86_VENDOR_INTEL:
7017 + if (!APIC_XAPIC(ver)) {
7018 + def_to_bigsmp = 0;
7019 + break;
7020 + }
7021 + /* If P4 and above fall through */
7022 + case X86_VENDOR_AMD:
7023 + def_to_bigsmp = 1;
7024 + }
7025 + }
7026 + bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
7027 +}
7028 +#else
7029 +void __init MP_processor_info (struct mpc_config_processor *m)
7030 +{
7031 + num_processors++;
7032 +}
7033 +#endif /* CONFIG_XEN */
7034 +
7035 +static void __init MP_bus_info (struct mpc_config_bus *m)
7036 +{
7037 + char str[7];
7038 +
7039 + memcpy(str, m->mpc_bustype, 6);
7040 + str[6] = 0;
7041 +
7042 + mpc_oem_bus_info(m, str, translation_table[mpc_record]);
7043 +
7044 + if (m->mpc_busid >= MAX_MP_BUSSES) {
7045 + printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
7046 + " is too large, max. supported is %d\n",
7047 + m->mpc_busid, str, MAX_MP_BUSSES - 1);
7048 + return;
7049 + }
7050 +
7051 + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
7052 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
7053 + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
7054 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
7055 + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
7056 + mpc_oem_pci_bus(m, translation_table[mpc_record]);
7057 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
7058 + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
7059 + mp_current_pci_id++;
7060 + } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
7061 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
7062 + } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
7063 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
7064 + } else {
7065 + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
7066 + }
7067 +}
7068 +
7069 +static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
7070 +{
7071 + if (!(m->mpc_flags & MPC_APIC_USABLE))
7072 + return;
7073 +
7074 + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
7075 + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
7076 + if (nr_ioapics >= MAX_IO_APICS) {
7077 + printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
7078 + MAX_IO_APICS, nr_ioapics);
7079 + panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
7080 + }
7081 + if (!m->mpc_apicaddr) {
7082 + printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
7083 + " found in MP table, skipping!\n");
7084 + return;
7085 + }
7086 + mp_ioapics[nr_ioapics] = *m;
7087 + nr_ioapics++;
7088 +}
7089 +
7090 +static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
7091 +{
7092 + mp_irqs [mp_irq_entries] = *m;
7093 + Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
7094 + " IRQ %02x, APIC ID %x, APIC INT %02x\n",
7095 + m->mpc_irqtype, m->mpc_irqflag & 3,
7096 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
7097 + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
7098 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
7099 + panic("Max # of irq sources exceeded!!\n");
7100 +}
7101 +
7102 +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
7103 +{
7104 + Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
7105 + " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
7106 + m->mpc_irqtype, m->mpc_irqflag & 3,
7107 + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
7108 + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
7109 + /*
7110 + * Well it seems all SMP boards in existence
7111 + * use ExtINT/LVT1 == LINT0 and
7112 + * NMI/LVT2 == LINT1 - the following check
7113 + * will show us if this assumption is false.
7114 + * Until then we do not have to add baggage.
7115 + */
7116 + if ((m->mpc_irqtype == mp_ExtINT) &&
7117 + (m->mpc_destapiclint != 0))
7118 + BUG();
7119 + if ((m->mpc_irqtype == mp_NMI) &&
7120 + (m->mpc_destapiclint != 1))
7121 + BUG();
7122 +}
7123 +
7124 +#ifdef CONFIG_X86_NUMAQ
7125 +static void __init MP_translation_info (struct mpc_config_translation *m)
7126 +{
7127 + printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
7128 +
7129 + if (mpc_record >= MAX_MPC_ENTRY)
7130 + printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
7131 + else
7132 + translation_table[mpc_record] = m; /* stash this for later */
7133 + if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
7134 + node_set_online(m->trans_quad);
7135 +}
7136 +
7137 +/*
7138 + * Read/parse the MPC oem tables
7139 + */
7140 +
7141 +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
7142 + unsigned short oemsize)
7143 +{
7144 + int count = sizeof (*oemtable); /* the header size */
7145 + unsigned char *oemptr = ((unsigned char *)oemtable)+count;
7146 +
7147 + mpc_record = 0;
7148 + printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
7149 + if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
7150 + {
7151 + printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
7152 + oemtable->oem_signature[0],
7153 + oemtable->oem_signature[1],
7154 + oemtable->oem_signature[2],
7155 + oemtable->oem_signature[3]);
7156 + return;
7157 + }
7158 + if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
7159 + {
7160 + printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
7161 + return;
7162 + }
7163 + while (count < oemtable->oem_length) {
7164 + switch (*oemptr) {
7165 + case MP_TRANSLATION:
7166 + {
7167 + struct mpc_config_translation *m=
7168 + (struct mpc_config_translation *)oemptr;
7169 + MP_translation_info(m);
7170 + oemptr += sizeof(*m);
7171 + count += sizeof(*m);
7172 + ++mpc_record;
7173 + break;
7174 + }
7175 + default:
7176 + {
7177 + printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
7178 + return;
7179 + }
7180 + }
7181 + }
7182 +}
7183 +
7184 +static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
7185 + char *productid)
7186 +{
7187 + if (strncmp(oem, "IBM NUMA", 8))
7188 + printk("Warning! May not be a NUMA-Q system!\n");
7189 + if (mpc->mpc_oemptr)
7190 + smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
7191 + mpc->mpc_oemsize);
7192 +}
7193 +#endif /* CONFIG_X86_NUMAQ */
7194 +
7195 +/*
7196 + * Read/parse the MPC
7197 + */
7198 +
7199 +static int __init smp_read_mpc(struct mp_config_table *mpc)
7200 +{
7201 + char str[16];
7202 + char oem[10];
7203 + int count=sizeof(*mpc);
7204 + unsigned char *mpt=((unsigned char *)mpc)+count;
7205 +
7206 + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
7207 + printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
7208 + *(u32 *)mpc->mpc_signature);
7209 + return 0;
7210 + }
7211 + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
7212 + printk(KERN_ERR "SMP mptable: checksum error!\n");
7213 + return 0;
7214 + }
7215 + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
7216 + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
7217 + mpc->mpc_spec);
7218 + return 0;
7219 + }
7220 + if (!mpc->mpc_lapic) {
7221 + printk(KERN_ERR "SMP mptable: null local APIC address!\n");
7222 + return 0;
7223 + }
7224 + memcpy(oem,mpc->mpc_oem,8);
7225 + oem[8]=0;
7226 + printk(KERN_INFO "OEM ID: %s ",oem);
7227 +
7228 + memcpy(str,mpc->mpc_productid,12);
7229 + str[12]=0;
7230 + printk("Product ID: %s ",str);
7231 +
7232 + mps_oem_check(mpc, oem, str);
7233 +
7234 + printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
7235 +
7236 + /*
7237 + * Save the local APIC address (it might be non-default) -- but only
7238 + * if we're not using ACPI.
7239 + */
7240 + if (!acpi_lapic)
7241 + mp_lapic_addr = mpc->mpc_lapic;
7242 +
7243 + /*
7244 + * Now process the configuration blocks.
7245 + */
7246 + mpc_record = 0;
7247 + while (count < mpc->mpc_length) {
7248 + switch(*mpt) {
7249 + case MP_PROCESSOR:
7250 + {
7251 + struct mpc_config_processor *m=
7252 + (struct mpc_config_processor *)mpt;
7253 + /* ACPI may have already provided this data */
7254 + if (!acpi_lapic)
7255 + MP_processor_info(m);
7256 + mpt += sizeof(*m);
7257 + count += sizeof(*m);
7258 + break;
7259 + }
7260 + case MP_BUS:
7261 + {
7262 + struct mpc_config_bus *m=
7263 + (struct mpc_config_bus *)mpt;
7264 + MP_bus_info(m);
7265 + mpt += sizeof(*m);
7266 + count += sizeof(*m);
7267 + break;
7268 + }
7269 + case MP_IOAPIC:
7270 + {
7271 + struct mpc_config_ioapic *m=
7272 + (struct mpc_config_ioapic *)mpt;
7273 + MP_ioapic_info(m);
7274 + mpt+=sizeof(*m);
7275 + count+=sizeof(*m);
7276 + break;
7277 + }
7278 + case MP_INTSRC:
7279 + {
7280 + struct mpc_config_intsrc *m=
7281 + (struct mpc_config_intsrc *)mpt;
7282 +
7283 + MP_intsrc_info(m);
7284 + mpt+=sizeof(*m);
7285 + count+=sizeof(*m);
7286 + break;
7287 + }
7288 + case MP_LINTSRC:
7289 + {
7290 + struct mpc_config_lintsrc *m=
7291 + (struct mpc_config_lintsrc *)mpt;
7292 + MP_lintsrc_info(m);
7293 + mpt+=sizeof(*m);
7294 + count+=sizeof(*m);
7295 + break;
7296 + }
7297 + default:
7298 + {
7299 + count = mpc->mpc_length;
7300 + break;
7301 + }
7302 + }
7303 + ++mpc_record;
7304 + }
7305 + clustered_apic_check();
7306 + if (!num_processors)
7307 + printk(KERN_ERR "SMP mptable: no processors registered!\n");
7308 + return num_processors;
7309 +}
7310 +
7311 +static int __init ELCR_trigger(unsigned int irq)
7312 +{
7313 + unsigned int port;
7314 +
7315 + port = 0x4d0 + (irq >> 3);
7316 + return (inb(port) >> (irq & 7)) & 1;
7317 +}
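Aside: ELCR_trigger() above indexes the chipset's two Edge/Level Control Registers at I/O ports 0x4d0 and 0x4d1, one bit per ISA IRQ. A minimal worked example of the addressing (illustrative only, not from the patch):

	/* IRQ 3  -> port 0x4d0 (0x4d0 + (3 >> 3)),  bit 3 (3 & 7)  */
	/* IRQ 10 -> port 0x4d1 (0x4d0 + (10 >> 3)), bit 2 (10 & 7) */
	int irq10_level = ELCR_trigger(10);	/* non-zero if IRQ 10 is level-triggered */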
7318 +
7319 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
7320 +{
7321 + struct mpc_config_intsrc intsrc;
7322 + int i;
7323 + int ELCR_fallback = 0;
7324 +
7325 + intsrc.mpc_type = MP_INTSRC;
7326 + intsrc.mpc_irqflag = 0; /* conforming */
7327 + intsrc.mpc_srcbus = 0;
7328 + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
7329 +
7330 + intsrc.mpc_irqtype = mp_INT;
7331 +
7332 + /*
7333 + * If true, we have an ISA/PCI system with no IRQ entries
7334 + * in the MP table. To prevent the PCI interrupts from being set up
7335 + * incorrectly, we try to use the ELCR. The sanity check to see if
7336 + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
7337 + * never be level sensitive, so we simply see if the ELCR agrees.
7338 + * If it does, we assume it's valid.
7339 + */
7340 + if (mpc_default_type == 5) {
7341 + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
7342 +
7343 + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
7344 + printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
7345 + else {
7346 + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
7347 + ELCR_fallback = 1;
7348 + }
7349 + }
7350 +
7351 + for (i = 0; i < 16; i++) {
7352 + switch (mpc_default_type) {
7353 + case 2:
7354 + if (i == 0 || i == 13)
7355 + continue; /* IRQ0 & IRQ13 not connected */
7356 + /* fall through */
7357 + default:
7358 + if (i == 2)
7359 + continue; /* IRQ2 is never connected */
7360 + }
7361 +
7362 + if (ELCR_fallback) {
7363 + /*
7364 + * If the ELCR indicates a level-sensitive interrupt, we
7365 + * copy that information over to the MP table in the
7366 + * irqflag field (level sensitive, active high polarity).
7367 + */
7368 + if (ELCR_trigger(i))
7369 + intsrc.mpc_irqflag = 13;
7370 + else
7371 + intsrc.mpc_irqflag = 0;
7372 + }
7373 +
7374 + intsrc.mpc_srcbusirq = i;
7375 + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
7376 + MP_intsrc_info(&intsrc);
7377 + }
7378 +
7379 + intsrc.mpc_irqtype = mp_ExtINT;
7380 + intsrc.mpc_srcbusirq = 0;
7381 + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
7382 + MP_intsrc_info(&intsrc);
7383 +}
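Aside: the constant 13 assigned to mpc_irqflag above is simply the MP-spec encoding (trigger << 2) | polarity, the same encoding mp_override_legacy_irq() builds later in this file. A sketch with a hypothetical helper macro (illustrative only, not from the patch):

#define MPS_IRQFLAG(trigger, polarity)	(((trigger) << 2) | (polarity))
/* MPS_IRQFLAG(3, 1) == 13 : level-triggered, active-high (the ELCR case above) */
/* MPS_IRQFLAG(0, 0) == 0  : conforms to the bus default */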
7384 +
7385 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
7386 +{
7387 + struct mpc_config_processor processor;
7388 + struct mpc_config_bus bus;
7389 + struct mpc_config_ioapic ioapic;
7390 + struct mpc_config_lintsrc lintsrc;
7391 + int linttypes[2] = { mp_ExtINT, mp_NMI };
7392 + int i;
7393 +
7394 + /*
7395 + * local APIC has default address
7396 + */
7397 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
7398 +
7399 + /*
7400 + * 2 CPUs, numbered 0 & 1.
7401 + */
7402 + processor.mpc_type = MP_PROCESSOR;
7403 + /* Either an integrated APIC or a discrete 82489DX. */
7404 + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
7405 + processor.mpc_cpuflag = CPU_ENABLED;
7406 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
7407 + (boot_cpu_data.x86_model << 4) |
7408 + boot_cpu_data.x86_mask;
7409 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
7410 + processor.mpc_reserved[0] = 0;
7411 + processor.mpc_reserved[1] = 0;
7412 + for (i = 0; i < 2; i++) {
7413 + processor.mpc_apicid = i;
7414 + MP_processor_info(&processor);
7415 + }
7416 +
7417 + bus.mpc_type = MP_BUS;
7418 + bus.mpc_busid = 0;
7419 + switch (mpc_default_type) {
7420 + default:
7421 + printk("???\n");
7422 + printk(KERN_ERR "Unknown standard configuration %d\n",
7423 + mpc_default_type);
7424 + /* fall through */
7425 + case 1:
7426 + case 5:
7427 + memcpy(bus.mpc_bustype, "ISA ", 6);
7428 + break;
7429 + case 2:
7430 + case 6:
7431 + case 3:
7432 + memcpy(bus.mpc_bustype, "EISA ", 6);
7433 + break;
7434 + case 4:
7435 + case 7:
7436 + memcpy(bus.mpc_bustype, "MCA ", 6);
7437 + }
7438 + MP_bus_info(&bus);
7439 + if (mpc_default_type > 4) {
7440 + bus.mpc_busid = 1;
7441 + memcpy(bus.mpc_bustype, "PCI ", 6);
7442 + MP_bus_info(&bus);
7443 + }
7444 +
7445 + ioapic.mpc_type = MP_IOAPIC;
7446 + ioapic.mpc_apicid = 2;
7447 + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
7448 + ioapic.mpc_flags = MPC_APIC_USABLE;
7449 + ioapic.mpc_apicaddr = 0xFEC00000;
7450 + MP_ioapic_info(&ioapic);
7451 +
7452 + /*
7453 + * We set up most of the low 16 IO-APIC pins according to MPS rules.
7454 + */
7455 + construct_default_ioirq_mptable(mpc_default_type);
7456 +
7457 + lintsrc.mpc_type = MP_LINTSRC;
7458 + lintsrc.mpc_irqflag = 0; /* conforming */
7459 + lintsrc.mpc_srcbusid = 0;
7460 + lintsrc.mpc_srcbusirq = 0;
7461 + lintsrc.mpc_destapic = MP_APIC_ALL;
7462 + for (i = 0; i < 2; i++) {
7463 + lintsrc.mpc_irqtype = linttypes[i];
7464 + lintsrc.mpc_destapiclint = i;
7465 + MP_lintsrc_info(&lintsrc);
7466 + }
7467 +}
7468 +
7469 +static struct intel_mp_floating *mpf_found;
7470 +
7471 +/*
7472 + * Scan the memory blocks for an SMP configuration block.
7473 + */
7474 +void __init get_smp_config (void)
7475 +{
7476 + struct intel_mp_floating *mpf = mpf_found;
7477 +
7478 + /*
7479 + * ACPI supports both logical (e.g. Hyper-Threading) and physical
7480 +	 * processors, whereas MPS only supports physical.
7481 + */
7482 + if (acpi_lapic && acpi_ioapic) {
7483 + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
7484 + return;
7485 + }
7486 + else if (acpi_lapic)
7487 + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
7488 +
7489 + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
7490 + if (mpf->mpf_feature2 & (1<<7)) {
7491 + printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
7492 + pic_mode = 1;
7493 + } else {
7494 + printk(KERN_INFO " Virtual Wire compatibility mode.\n");
7495 + pic_mode = 0;
7496 + }
7497 +
7498 + /*
7499 + * Now see if we need to read further.
7500 + */
7501 + if (mpf->mpf_feature1 != 0) {
7502 +
7503 + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
7504 + construct_default_ISA_mptable(mpf->mpf_feature1);
7505 +
7506 + } else if (mpf->mpf_physptr) {
7507 +
7508 + /*
7509 + * Read the physical hardware table. Anything here will
7510 + * override the defaults.
7511 + */
7512 + if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
7513 + smp_found_config = 0;
7514 + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
7515 + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
7516 + return;
7517 + }
7518 + /*
7519 + * If there are no explicit MP IRQ entries, then we are
7520 + * broken. We set up most of the low 16 IO-APIC pins to
7521 + * ISA defaults and hope it will work.
7522 + */
7523 + if (!mp_irq_entries) {
7524 + struct mpc_config_bus bus;
7525 +
7526 + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
7527 +
7528 + bus.mpc_type = MP_BUS;
7529 + bus.mpc_busid = 0;
7530 + memcpy(bus.mpc_bustype, "ISA ", 6);
7531 + MP_bus_info(&bus);
7532 +
7533 + construct_default_ioirq_mptable(0);
7534 + }
7535 +
7536 + } else
7537 + BUG();
7538 +
7539 + printk(KERN_INFO "Processors: %d\n", num_processors);
7540 + /*
7541 + * Only use the first configuration found.
7542 + */
7543 +}
7544 +
7545 +static int __init smp_scan_config (unsigned long base, unsigned long length)
7546 +{
7547 + unsigned long *bp = isa_bus_to_virt(base);
7548 + struct intel_mp_floating *mpf;
7549 +
7550 + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
7551 + if (sizeof(*mpf) != 16)
7552 + printk("Error: MPF size\n");
7553 +
7554 + while (length > 0) {
7555 + mpf = (struct intel_mp_floating *)bp;
7556 + if ((*bp == SMP_MAGIC_IDENT) &&
7557 + (mpf->mpf_length == 1) &&
7558 + !mpf_checksum((unsigned char *)bp, 16) &&
7559 + ((mpf->mpf_specification == 1)
7560 + || (mpf->mpf_specification == 4)) ) {
7561 +
7562 + smp_found_config = 1;
7563 +#ifndef CONFIG_XEN
7564 + printk(KERN_INFO "found SMP MP-table at %08lx\n",
7565 + virt_to_phys(mpf));
7566 + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
7567 + if (mpf->mpf_physptr) {
7568 + /*
7569 +				 * We cannot access the MPC table to compute
7570 +				 * its size yet, as only a few megabytes from
7571 +				 * the bottom of memory are mapped now.
7572 +				 * The PC-9800's MPC table sits at the very end
7573 +				 * of physical memory, so simply reserving
7574 +				 * PAGE_SIZE from mpf->mpf_physptr would BUG()
7575 +				 * in reserve_bootmem.
7576 + */
7577 + unsigned long size = PAGE_SIZE;
7578 + unsigned long end = max_low_pfn * PAGE_SIZE;
7579 + if (mpf->mpf_physptr + size > end)
7580 + size = end - mpf->mpf_physptr;
7581 + reserve_bootmem(mpf->mpf_physptr, size);
7582 + }
7583 +#else
7584 + printk(KERN_INFO "found SMP MP-table at %08lx\n",
7585 + ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
7586 +#endif
7587 +
7588 + mpf_found = mpf;
7589 + return 1;
7590 + }
7591 + bp += 4;
7592 + length -= 16;
7593 + }
7594 + return 0;
7595 +}
7596 +
7597 +void __init find_smp_config (void)
7598 +{
7599 +#ifndef CONFIG_XEN
7600 + unsigned int address;
7601 +#endif
7602 +
7603 + /*
7604 + * FIXME: Linux assumes you have 640K of base ram..
7605 + * this continues the error...
7606 + *
7607 + * 1) Scan the bottom 1K for a signature
7608 + * 2) Scan the top 1K of base RAM
7609 + * 3) Scan the 64K of bios
7610 + */
7611 + if (smp_scan_config(0x0,0x400) ||
7612 + smp_scan_config(639*0x400,0x400) ||
7613 + smp_scan_config(0xF0000,0x10000))
7614 + return;
7615 + /*
7616 + * If it is an SMP machine we should know now, unless the
7617 + * configuration is in an EISA/MCA bus machine with an
7618 + * extended bios data area.
7619 + *
7620 + * there is a real-mode segmented pointer pointing to the
7621 + * 4K EBDA area at 0x40E, calculate and scan it here.
7622 + *
7623 + * NOTE! There are Linux loaders that will corrupt the EBDA
7624 + * area, and as such this kind of SMP config may be less
7625 + * trustworthy, simply because the SMP table may have been
7626 + * stomped on during early boot. These loaders are buggy and
7627 + * should be fixed.
7628 + *
7629 + * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
7630 + */
7631 +
7632 +#ifndef CONFIG_XEN
7633 + address = get_bios_ebda();
7634 + if (address)
7635 + smp_scan_config(address, 0x400);
7636 +#endif
7637 +}
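Aside: in the non-Xen branch above, get_bios_ebda() resolves the real-mode segment value stored at physical address 0x40E (the pointer mentioned in the comment) into a linear address. Roughly, and only as an illustrative sketch, it amounts to:

	unsigned int ebda_segment = *(unsigned short *)phys_to_virt(0x40E);
	unsigned int ebda_address = ebda_segment << 4;	/* real-mode segment -> linear */
	if (ebda_address)
		smp_scan_config(ebda_address, 0x400);	/* MP 1.4: scan only the first 1K */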
7638 +
7639 +int es7000_plat;
7640 +
7641 +/* --------------------------------------------------------------------------
7642 + ACPI-based MP Configuration
7643 + -------------------------------------------------------------------------- */
7644 +
7645 +#ifdef CONFIG_ACPI
7646 +
7647 +void __init mp_register_lapic_address (
7648 + u64 address)
7649 +{
7650 +#ifndef CONFIG_XEN
7651 + mp_lapic_addr = (unsigned long) address;
7652 +
7653 + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
7654 +
7655 + if (boot_cpu_physical_apicid == -1U)
7656 + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
7657 +
7658 + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
7659 +#endif
7660 +}
7661 +
7662 +
7663 +void __devinit mp_register_lapic (
7664 + u8 id,
7665 + u8 enabled)
7666 +{
7667 + struct mpc_config_processor processor;
7668 + int boot_cpu = 0;
7669 +
7670 + if (MAX_APICS - id <= 0) {
7671 + printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
7672 + id, MAX_APICS);
7673 + return;
7674 + }
7675 +
7676 + if (id == boot_cpu_physical_apicid)
7677 + boot_cpu = 1;
7678 +
7679 +#ifndef CONFIG_XEN
7680 + processor.mpc_type = MP_PROCESSOR;
7681 + processor.mpc_apicid = id;
7682 + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
7683 + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
7684 + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
7685 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
7686 + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
7687 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
7688 + processor.mpc_reserved[0] = 0;
7689 + processor.mpc_reserved[1] = 0;
7690 +#endif
7691 +
7692 + MP_processor_info(&processor);
7693 +}
7694 +
7695 +#ifdef CONFIG_X86_IO_APIC
7696 +
7697 +#define MP_ISA_BUS 0
7698 +#define MP_MAX_IOAPIC_PIN 127
7699 +
7700 +static struct mp_ioapic_routing {
7701 + int apic_id;
7702 + int gsi_base;
7703 + int gsi_end;
7704 + u32 pin_programmed[4];
7705 +} mp_ioapic_routing[MAX_IO_APICS];
7706 +
7707 +
7708 +static int mp_find_ioapic (
7709 + int gsi)
7710 +{
7711 + int i = 0;
7712 +
7713 + /* Find the IOAPIC that manages this GSI. */
7714 + for (i = 0; i < nr_ioapics; i++) {
7715 + if ((gsi >= mp_ioapic_routing[i].gsi_base)
7716 + && (gsi <= mp_ioapic_routing[i].gsi_end))
7717 + return i;
7718 + }
7719 +
7720 + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
7721 +
7722 + return -1;
7723 +}
7724 +
7725 +
7726 +void __init mp_register_ioapic (
7727 + u8 id,
7728 + u32 address,
7729 + u32 gsi_base)
7730 +{
7731 + int idx = 0;
7732 + int tmpid;
7733 +
7734 + if (nr_ioapics >= MAX_IO_APICS) {
7735 + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
7736 + "(found %d)\n", MAX_IO_APICS, nr_ioapics);
7737 + panic("Recompile kernel with bigger MAX_IO_APICS!\n");
7738 + }
7739 + if (!address) {
7740 + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
7741 + " found in MADT table, skipping!\n");
7742 + return;
7743 + }
7744 +
7745 + idx = nr_ioapics++;
7746 +
7747 + mp_ioapics[idx].mpc_type = MP_IOAPIC;
7748 + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
7749 + mp_ioapics[idx].mpc_apicaddr = address;
7750 +
7751 +#ifndef CONFIG_XEN
7752 + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
7753 +#endif
7754 + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
7755 + && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
7756 + tmpid = io_apic_get_unique_id(idx, id);
7757 + else
7758 + tmpid = id;
7759 + if (tmpid == -1) {
7760 + nr_ioapics--;
7761 + return;
7762 + }
7763 + mp_ioapics[idx].mpc_apicid = tmpid;
7764 + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
7765 +
7766 + /*
7767 + * Build basic GSI lookup table to facilitate gsi->io_apic lookups
7768 + * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
7769 + */
7770 + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
7771 + mp_ioapic_routing[idx].gsi_base = gsi_base;
7772 + mp_ioapic_routing[idx].gsi_end = gsi_base +
7773 + io_apic_get_redir_entries(idx);
7774 +
7775 + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
7776 + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
7777 + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
7778 + mp_ioapic_routing[idx].gsi_base,
7779 + mp_ioapic_routing[idx].gsi_end);
7780 +
7781 + return;
7782 +}
7783 +
7784 +
7785 +void __init mp_override_legacy_irq (
7786 + u8 bus_irq,
7787 + u8 polarity,
7788 + u8 trigger,
7789 + u32 gsi)
7790 +{
7791 + struct mpc_config_intsrc intsrc;
7792 + int ioapic = -1;
7793 + int pin = -1;
7794 +
7795 + /*
7796 + * Convert 'gsi' to 'ioapic.pin'.
7797 + */
7798 + ioapic = mp_find_ioapic(gsi);
7799 + if (ioapic < 0)
7800 + return;
7801 + pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
7802 +
7803 + /*
7804 + * TBD: This check is for faulty timer entries, where the override
7805 + * erroneously sets the trigger to level, resulting in a HUGE
7806 + * increase of timer interrupts!
7807 + */
7808 + if ((bus_irq == 0) && (trigger == 3))
7809 + trigger = 1;
7810 +
7811 + intsrc.mpc_type = MP_INTSRC;
7812 + intsrc.mpc_irqtype = mp_INT;
7813 + intsrc.mpc_irqflag = (trigger << 2) | polarity;
7814 + intsrc.mpc_srcbus = MP_ISA_BUS;
7815 + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
7816 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
7817 + intsrc.mpc_dstirq = pin; /* INTIN# */
7818 +
7819 + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
7820 + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
7821 + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
7822 + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
7823 +
7824 + mp_irqs[mp_irq_entries] = intsrc;
7825 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
7826 + panic("Max # of irq sources exceeded!\n");
7827 +
7828 + return;
7829 +}
7830 +
7831 +void __init mp_config_acpi_legacy_irqs (void)
7832 +{
7833 + struct mpc_config_intsrc intsrc;
7834 + int i = 0;
7835 + int ioapic = -1;
7836 +
7837 + /*
7838 + * Fabricate the legacy ISA bus (bus #31).
7839 + */
7840 + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
7841 + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
7842 +
7843 + /*
7844 + * Older generations of ES7000 have no legacy identity mappings
7845 + */
7846 + if (es7000_plat == 1)
7847 + return;
7848 +
7849 + /*
7850 + * Locate the IOAPIC that manages the ISA IRQs (0-15).
7851 + */
7852 + ioapic = mp_find_ioapic(0);
7853 + if (ioapic < 0)
7854 + return;
7855 +
7856 + intsrc.mpc_type = MP_INTSRC;
7857 + intsrc.mpc_irqflag = 0; /* Conforming */
7858 + intsrc.mpc_srcbus = MP_ISA_BUS;
7859 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
7860 +
7861 + /*
7862 +	 * Use the default configuration for IRQs 0-15, unless
7863 +	 * overridden by (MADT) interrupt source override entries.
7864 + */
7865 + for (i = 0; i < 16; i++) {
7866 + int idx;
7867 +
7868 + for (idx = 0; idx < mp_irq_entries; idx++) {
7869 + struct mpc_config_intsrc *irq = mp_irqs + idx;
7870 +
7871 + /* Do we already have a mapping for this ISA IRQ? */
7872 + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
7873 + break;
7874 +
7875 + /* Do we already have a mapping for this IOAPIC pin */
7876 + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
7877 + (irq->mpc_dstirq == i))
7878 + break;
7879 + }
7880 +
7881 + if (idx != mp_irq_entries) {
7882 + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
7883 + continue; /* IRQ already used */
7884 + }
7885 +
7886 + intsrc.mpc_irqtype = mp_INT;
7887 + intsrc.mpc_srcbusirq = i; /* Identity mapped */
7888 + intsrc.mpc_dstirq = i;
7889 +
7890 + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
7891 + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
7892 + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
7893 + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
7894 + intsrc.mpc_dstirq);
7895 +
7896 + mp_irqs[mp_irq_entries] = intsrc;
7897 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
7898 + panic("Max # of irq sources exceeded!\n");
7899 + }
7900 +}
7901 +
7902 +#define MAX_GSI_NUM 4096
7903 +
7904 +int mp_register_gsi (u32 gsi, int triggering, int polarity)
7905 +{
7906 + int ioapic = -1;
7907 + int ioapic_pin = 0;
7908 + int idx, bit = 0;
7909 + static int pci_irq = 16;
7910 + /*
7911 +	 * Mapping between Global System Interrupts, which
7912 + * represent all possible interrupts, and IRQs
7913 + * assigned to actual devices.
7914 + */
7915 + static int gsi_to_irq[MAX_GSI_NUM];
7916 +
7917 + /* Don't set up the ACPI SCI because it's already set up */
7918 + if (acpi_fadt.sci_int == gsi)
7919 + return gsi;
7920 +
7921 + ioapic = mp_find_ioapic(gsi);
7922 + if (ioapic < 0) {
7923 + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
7924 + return gsi;
7925 + }
7926 +
7927 + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
7928 +
7929 + if (ioapic_renumber_irq)
7930 + gsi = ioapic_renumber_irq(ioapic, gsi);
7931 +
7932 + /*
7933 + * Avoid pin reprogramming. PRTs typically include entries
7934 + * with redundant pin->gsi mappings (but unique PCI devices);
7935 + * we only program the IOAPIC on the first.
7936 + */
7937 + bit = ioapic_pin % 32;
7938 + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
7939 + if (idx > 3) {
7940 + printk(KERN_ERR "Invalid reference to IOAPIC pin "
7941 + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
7942 + ioapic_pin);
7943 + return gsi;
7944 + }
7945 + if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
7946 + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
7947 + mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
7948 + return gsi_to_irq[gsi];
7949 + }
7950 +
7951 + mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
7952 +
7953 + if (triggering == ACPI_LEVEL_SENSITIVE) {
7954 + /*
7955 + * For PCI devices assign IRQs in order, avoiding gaps
7956 + * due to unused I/O APIC pins.
7957 + */
7958 + int irq = gsi;
7959 + if (gsi < MAX_GSI_NUM) {
7960 + /*
7961 + * Retain the VIA chipset work-around (gsi > 15), but
7962 +			 * avoid a problem where the 8254 timer (IRQ0) is set up
7963 + * via an override (so it's not on pin 0 of the ioapic),
7964 + * and at the same time, the pin 0 interrupt is a PCI
7965 + * type. The gsi > 15 test could cause these two pins
7966 + * to be shared as IRQ0, and they are not shareable.
7967 + * So test for this condition, and if necessary, avoid
7968 + * the pin collision.
7969 + */
7970 + if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
7971 + gsi = pci_irq++;
7972 + /*
7973 + * Don't assign IRQ used by ACPI SCI
7974 + */
7975 + if (gsi == acpi_fadt.sci_int)
7976 + gsi = pci_irq++;
7977 + gsi_to_irq[irq] = gsi;
7978 + } else {
7979 + printk(KERN_ERR "GSI %u is too high\n", gsi);
7980 + return gsi;
7981 + }
7982 + }
7983 +
7984 + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
7985 + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
7986 + polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
7987 + return gsi;
7988 +}
7989 +
7990 +#endif /* CONFIG_X86_IO_APIC */
7991 +#endif /* CONFIG_ACPI */
7992 Index: head-2008-11-25/arch/x86/kernel/pci-dma-xen.c
7993 ===================================================================
7994 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
7995 +++ head-2008-11-25/arch/x86/kernel/pci-dma-xen.c 2008-10-29 09:55:56.000000000 +0100
7996 @@ -0,0 +1,409 @@
7997 +/*
7998 + * Dynamic DMA mapping support.
7999 + *
8000 + * On i386 there is no hardware dynamic DMA address translation,
8001 + * so consistent alloc/free are merely page allocation/freeing.
8002 + * The rest of the dynamic DMA mapping interface is implemented
8003 + * in asm/pci.h.
8004 + */
8005 +
8006 +#include <linux/types.h>
8007 +#include <linux/mm.h>
8008 +#include <linux/string.h>
8009 +#include <linux/pci.h>
8010 +#include <linux/module.h>
8011 +#include <linux/version.h>
8012 +#include <asm/io.h>
8013 +#include <xen/balloon.h>
8014 +#include <xen/gnttab.h>
8015 +#include <asm/swiotlb.h>
8016 +#include <asm/tlbflush.h>
8017 +#include <asm-i386/mach-xen/asm/swiotlb.h>
8018 +#include <asm-i386/mach-xen/asm/gnttab_dma.h>
8019 +#include <asm/bug.h>
8020 +
8021 +#ifdef __x86_64__
8022 +#include <asm/proto.h>
8023 +
8024 +int iommu_merge __read_mostly = 0;
8025 +EXPORT_SYMBOL(iommu_merge);
8026 +
8027 +dma_addr_t bad_dma_address __read_mostly;
8028 +EXPORT_SYMBOL(bad_dma_address);
8029 +
8030 +/* This tells the BIO block layer to assume merging. Default to off
8031 + because we cannot guarantee merging later. */
8032 +int iommu_bio_merge __read_mostly = 0;
8033 +EXPORT_SYMBOL(iommu_bio_merge);
8034 +
8035 +int force_iommu __read_mostly= 0;
8036 +
8037 +__init int iommu_setup(char *p)
8038 +{
8039 + return 1;
8040 +}
8041 +
8042 +void __init pci_iommu_alloc(void)
8043 +{
8044 +#ifdef CONFIG_SWIOTLB
8045 + pci_swiotlb_init();
8046 +#endif
8047 +}
8048 +
8049 +static int __init pci_iommu_init(void)
8050 +{
8051 + no_iommu_init();
8052 + return 0;
8053 +}
8054 +
8055 +/* Must execute after PCI subsystem */
8056 +fs_initcall(pci_iommu_init);
8057 +#endif
8058 +
8059 +struct dma_coherent_mem {
8060 + void *virt_base;
8061 + u32 device_base;
8062 + int size;
8063 + int flags;
8064 + unsigned long *bitmap;
8065 +};
8066 +
8067 +#define IOMMU_BUG_ON(test) \
8068 +do { \
8069 + if (unlikely(test)) { \
8070 + printk(KERN_ALERT "Fatal DMA error! " \
8071 + "Please use 'swiotlb=force'\n"); \
8072 + BUG(); \
8073 + } \
8074 +} while (0)
8075 +
8076 +static int check_pages_physically_contiguous(unsigned long pfn,
8077 + unsigned int offset,
8078 + size_t length)
8079 +{
8080 + unsigned long next_mfn;
8081 + int i;
8082 + int nr_pages;
8083 +
8084 + next_mfn = pfn_to_mfn(pfn);
8085 + nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
8086 +
8087 + for (i = 1; i < nr_pages; i++) {
8088 + if (pfn_to_mfn(++pfn) != ++next_mfn)
8089 + return 0;
8090 + }
8091 + return 1;
8092 +}
8093 +
8094 +int range_straddles_page_boundary(paddr_t p, size_t size)
8095 +{
8096 + unsigned long pfn = p >> PAGE_SHIFT;
8097 + unsigned int offset = p & ~PAGE_MASK;
8098 +
8099 + return ((offset + size > PAGE_SIZE) &&
8100 + !check_pages_physically_contiguous(pfn, offset, size));
8101 +}
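Aside: under Xen the pseudo-physical frames a domain sees need not map to contiguous machine frames, which is why the helpers above walk pfn_to_mfn() across the buffer. A small illustrative check ('buf' is a hypothetical kernel buffer, not from the patch):

	/* A 0x300-byte region starting at page offset 0xf00 crosses one page
	 * boundary, so it is only a valid single DMA target if the two
	 * underlying machine frames happen to be adjacent: */
	if (range_straddles_page_boundary(__pa(buf), 0x300))
		printk(KERN_DEBUG "buffer not machine-contiguous, needs bouncing\n");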
8102 +
8103 +int
8104 +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
8105 + enum dma_data_direction direction)
8106 +{
8107 + int i, rc;
8108 +
8109 + if (direction == DMA_NONE)
8110 + BUG();
8111 + WARN_ON(nents == 0 || sg[0].length == 0);
8112 +
8113 + if (swiotlb) {
8114 + rc = swiotlb_map_sg(hwdev, sg, nents, direction);
8115 + } else {
8116 + for (i = 0; i < nents; i++ ) {
8117 + BUG_ON(!sg[i].page);
8118 + sg[i].dma_address =
8119 + gnttab_dma_map_page(sg[i].page) + sg[i].offset;
8120 + sg[i].dma_length = sg[i].length;
8121 + IOMMU_BUG_ON(address_needs_mapping(
8122 + hwdev, sg[i].dma_address));
8123 + IOMMU_BUG_ON(range_straddles_page_boundary(
8124 + page_to_pseudophys(sg[i].page) + sg[i].offset,
8125 + sg[i].length));
8126 + }
8127 + rc = nents;
8128 + }
8129 +
8130 + flush_write_buffers();
8131 + return rc;
8132 +}
8133 +EXPORT_SYMBOL(dma_map_sg);
8134 +
8135 +void
8136 +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
8137 + enum dma_data_direction direction)
8138 +{
8139 + int i;
8140 +
8141 + BUG_ON(direction == DMA_NONE);
8142 + if (swiotlb)
8143 + swiotlb_unmap_sg(hwdev, sg, nents, direction);
8144 + else {
8145 + for (i = 0; i < nents; i++ )
8146 + gnttab_dma_unmap_page(sg[i].dma_address);
8147 + }
8148 +}
8149 +EXPORT_SYMBOL(dma_unmap_sg);
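A minimal usage sketch for the scatter-gather pair above (illustrative only, not from the patch; 'pdev', 'sglist' and 'nents' are hypothetical driver state):

	int n = dma_map_sg(&pdev->dev, sglist, nents, DMA_TO_DEVICE);
	if (n == 0)
		return -EIO;
	/* ... hand sg[i].dma_address / sg[i].dma_length to the device ... */
	dma_unmap_sg(&pdev->dev, sglist, nents, DMA_TO_DEVICE);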
8150 +
8151 +#ifdef CONFIG_HIGHMEM
8152 +dma_addr_t
8153 +dma_map_page(struct device *dev, struct page *page, unsigned long offset,
8154 + size_t size, enum dma_data_direction direction)
8155 +{
8156 + dma_addr_t dma_addr;
8157 +
8158 + BUG_ON(direction == DMA_NONE);
8159 +
8160 + if (swiotlb) {
8161 + dma_addr = swiotlb_map_page(
8162 + dev, page, offset, size, direction);
8163 + } else {
8164 + dma_addr = gnttab_dma_map_page(page) + offset;
8165 + IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr));
8166 + }
8167 +
8168 + return dma_addr;
8169 +}
8170 +EXPORT_SYMBOL(dma_map_page);
8171 +
8172 +void
8173 +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
8174 + enum dma_data_direction direction)
8175 +{
8176 + BUG_ON(direction == DMA_NONE);
8177 + if (swiotlb)
8178 + swiotlb_unmap_page(dev, dma_address, size, direction);
8179 + else
8180 + gnttab_dma_unmap_page(dma_address);
8181 +}
8182 +EXPORT_SYMBOL(dma_unmap_page);
8183 +#endif /* CONFIG_HIGHMEM */
8184 +
8185 +int
8186 +dma_mapping_error(dma_addr_t dma_addr)
8187 +{
8188 + if (swiotlb)
8189 + return swiotlb_dma_mapping_error(dma_addr);
8190 + return 0;
8191 +}
8192 +EXPORT_SYMBOL(dma_mapping_error);
8193 +
8194 +int
8195 +dma_supported(struct device *dev, u64 mask)
8196 +{
8197 + if (swiotlb)
8198 + return swiotlb_dma_supported(dev, mask);
8199 + /*
8200 + * By default we'll BUG when an infeasible DMA is requested, and
8201 + * request swiotlb=force (see IOMMU_BUG_ON).
8202 + */
8203 + return 1;
8204 +}
8205 +EXPORT_SYMBOL(dma_supported);
8206 +
8207 +void *dma_alloc_coherent(struct device *dev, size_t size,
8208 + dma_addr_t *dma_handle, gfp_t gfp)
8209 +{
8210 + void *ret;
8211 + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
8212 + unsigned int order = get_order(size);
8213 + unsigned long vstart;
8214 + u64 mask;
8215 +
8216 + /* ignore region specifiers */
8217 + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
8218 +
8219 + if (mem) {
8220 + int page = bitmap_find_free_region(mem->bitmap, mem->size,
8221 + order);
8222 + if (page >= 0) {
8223 + *dma_handle = mem->device_base + (page << PAGE_SHIFT);
8224 + ret = mem->virt_base + (page << PAGE_SHIFT);
8225 + memset(ret, 0, size);
8226 + return ret;
8227 + }
8228 + if (mem->flags & DMA_MEMORY_EXCLUSIVE)
8229 + return NULL;
8230 + }
8231 +
8232 + if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
8233 + gfp |= GFP_DMA;
8234 +
8235 + vstart = __get_free_pages(gfp, order);
8236 + ret = (void *)vstart;
8237 +
8238 + if (dev != NULL && dev->coherent_dma_mask)
8239 + mask = dev->coherent_dma_mask;
8240 + else
8241 + mask = 0xffffffff;
8242 +
8243 + if (ret != NULL) {
8244 + if (xen_create_contiguous_region(vstart, order,
8245 + fls64(mask)) != 0) {
8246 + free_pages(vstart, order);
8247 + return NULL;
8248 + }
8249 + memset(ret, 0, size);
8250 + *dma_handle = virt_to_bus(ret);
8251 + }
8252 + return ret;
8253 +}
8254 +EXPORT_SYMBOL(dma_alloc_coherent);
8255 +
8256 +void dma_free_coherent(struct device *dev, size_t size,
8257 + void *vaddr, dma_addr_t dma_handle)
8258 +{
8259 + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
8260 + int order = get_order(size);
8261 +
8262 + if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
8263 + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
8264 +
8265 + bitmap_release_region(mem->bitmap, page, order);
8266 + } else {
8267 + xen_destroy_contiguous_region((unsigned long)vaddr, order);
8268 + free_pages((unsigned long)vaddr, order);
8269 + }
8270 +}
8271 +EXPORT_SYMBOL(dma_free_coherent);
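A minimal usage sketch for the coherent allocator above (illustrative only, not from the patch; 'pdev' is a hypothetical PCI device):

	dma_addr_t ring_dma;
	void *ring = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, &ring_dma, GFP_KERNEL);
	if (!ring)
		return -ENOMEM;
	/* ... give ring_dma to the hardware, access ring through the CPU pointer ... */
	dma_free_coherent(&pdev->dev, PAGE_SIZE, ring, ring_dma);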
8272 +
8273 +#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
8274 +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
8275 + dma_addr_t device_addr, size_t size, int flags)
8276 +{
8277 + void __iomem *mem_base;
8278 + int pages = size >> PAGE_SHIFT;
8279 + int bitmap_size = (pages + 31)/32;
8280 +
8281 + if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
8282 + goto out;
8283 + if (!size)
8284 + goto out;
8285 + if (dev->dma_mem)
8286 + goto out;
8287 +
8288 + /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
8289 +
8290 + mem_base = ioremap(bus_addr, size);
8291 + if (!mem_base)
8292 + goto out;
8293 +
8294 + dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
8295 + if (!dev->dma_mem)
8296 + goto out;
8297 + memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
8298 + dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
8299 + if (!dev->dma_mem->bitmap)
8300 + goto free1_out;
8301 + memset(dev->dma_mem->bitmap, 0, bitmap_size);
8302 +
8303 + dev->dma_mem->virt_base = mem_base;
8304 + dev->dma_mem->device_base = device_addr;
8305 + dev->dma_mem->size = pages;
8306 + dev->dma_mem->flags = flags;
8307 +
8308 + if (flags & DMA_MEMORY_MAP)
8309 + return DMA_MEMORY_MAP;
8310 +
8311 + return DMA_MEMORY_IO;
8312 +
8313 + free1_out:
8314 + kfree(dev->dma_mem->bitmap);
8315 + out:
8316 + return 0;
8317 +}
8318 +EXPORT_SYMBOL(dma_declare_coherent_memory);
8319 +
8320 +void dma_release_declared_memory(struct device *dev)
8321 +{
8322 + struct dma_coherent_mem *mem = dev->dma_mem;
8323 +
8324 + if(!mem)
8325 + return;
8326 + dev->dma_mem = NULL;
8327 + iounmap(mem->virt_base);
8328 + kfree(mem->bitmap);
8329 + kfree(mem);
8330 +}
8331 +EXPORT_SYMBOL(dma_release_declared_memory);
8332 +
8333 +void *dma_mark_declared_memory_occupied(struct device *dev,
8334 + dma_addr_t device_addr, size_t size)
8335 +{
8336 + struct dma_coherent_mem *mem = dev->dma_mem;
8337 + int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
8338 + int pos, err;
8339 +
8340 + if (!mem)
8341 + return ERR_PTR(-EINVAL);
8342 +
8343 + pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
8344 + err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
8345 + if (err != 0)
8346 + return ERR_PTR(err);
8347 + return mem->virt_base + (pos << PAGE_SHIFT);
8348 +}
8349 +EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
8350 +#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
8351 +
8352 +dma_addr_t
8353 +dma_map_single(struct device *dev, void *ptr, size_t size,
8354 + enum dma_data_direction direction)
8355 +{
8356 + dma_addr_t dma;
8357 +
8358 + if (direction == DMA_NONE)
8359 + BUG();
8360 + WARN_ON(size == 0);
8361 +
8362 + if (swiotlb) {
8363 + dma = swiotlb_map_single(dev, ptr, size, direction);
8364 + } else {
8365 + dma = gnttab_dma_map_page(virt_to_page(ptr)) +
8366 + offset_in_page(ptr);
8367 + IOMMU_BUG_ON(range_straddles_page_boundary(__pa(ptr), size));
8368 + IOMMU_BUG_ON(address_needs_mapping(dev, dma));
8369 + }
8370 +
8371 + flush_write_buffers();
8372 + return dma;
8373 +}
8374 +EXPORT_SYMBOL(dma_map_single);
8375 +
8376 +void
8377 +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
8378 + enum dma_data_direction direction)
8379 +{
8380 + if (direction == DMA_NONE)
8381 + BUG();
8382 + if (swiotlb)
8383 + swiotlb_unmap_single(dev, dma_addr, size, direction);
8384 + else
8385 + gnttab_dma_unmap_page(dma_addr);
8386 +}
8387 +EXPORT_SYMBOL(dma_unmap_single);
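A minimal usage sketch for the streaming single-buffer path above, including the error check exported earlier in this file (illustrative only, not from the patch; 'pdev', 'buf' and 'len' are hypothetical):

	dma_addr_t handle = dma_map_single(&pdev->dev, buf, len, DMA_FROM_DEVICE);
	if (dma_mapping_error(handle))
		return -EIO;
	/* ... device writes into buf by DMA ... */
	dma_unmap_single(&pdev->dev, handle, len, DMA_FROM_DEVICE);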
8388 +
8389 +void
8390 +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
8391 + enum dma_data_direction direction)
8392 +{
8393 + if (swiotlb)
8394 + swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction);
8395 +}
8396 +EXPORT_SYMBOL(dma_sync_single_for_cpu);
8397 +
8398 +void
8399 +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
8400 + enum dma_data_direction direction)
8401 +{
8402 + if (swiotlb)
8403 + swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
8404 +}
8405 +EXPORT_SYMBOL(dma_sync_single_for_device);
8406 Index: head-2008-11-25/arch/x86/kernel/process_32-xen.c
8407 ===================================================================
8408 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
8409 +++ head-2008-11-25/arch/x86/kernel/process_32-xen.c 2008-07-21 11:00:32.000000000 +0200
8410 @@ -0,0 +1,877 @@
8411 +/*
8412 + * linux/arch/i386/kernel/process.c
8413 + *
8414 + * Copyright (C) 1995 Linus Torvalds
8415 + *
8416 + * Pentium III FXSR, SSE support
8417 + * Gareth Hughes <gareth@valinux.com>, May 2000
8418 + */
8419 +
8420 +/*
8421 + * This file handles the architecture-dependent parts of process handling..
8422 + */
8423 +
8424 +#include <stdarg.h>
8425 +
8426 +#include <linux/cpu.h>
8427 +#include <linux/errno.h>
8428 +#include <linux/sched.h>
8429 +#include <linux/fs.h>
8430 +#include <linux/kernel.h>
8431 +#include <linux/mm.h>
8432 +#include <linux/elfcore.h>
8433 +#include <linux/smp.h>
8434 +#include <linux/smp_lock.h>
8435 +#include <linux/stddef.h>
8436 +#include <linux/slab.h>
8437 +#include <linux/vmalloc.h>
8438 +#include <linux/user.h>
8439 +#include <linux/a.out.h>
8440 +#include <linux/interrupt.h>
8441 +#include <linux/utsname.h>
8442 +#include <linux/delay.h>
8443 +#include <linux/reboot.h>
8444 +#include <linux/init.h>
8445 +#include <linux/mc146818rtc.h>
8446 +#include <linux/module.h>
8447 +#include <linux/kallsyms.h>
8448 +#include <linux/ptrace.h>
8449 +#include <linux/random.h>
8450 +
8451 +#include <asm/uaccess.h>
8452 +#include <asm/pgtable.h>
8453 +#include <asm/system.h>
8454 +#include <asm/io.h>
8455 +#include <asm/ldt.h>
8456 +#include <asm/processor.h>
8457 +#include <asm/i387.h>
8458 +#include <asm/desc.h>
8459 +#include <asm/vm86.h>
8460 +#ifdef CONFIG_MATH_EMULATION
8461 +#include <asm/math_emu.h>
8462 +#endif
8463 +
8464 +#include <xen/interface/physdev.h>
8465 +#include <xen/interface/vcpu.h>
8466 +#include <xen/cpu_hotplug.h>
8467 +
8468 +#include <linux/err.h>
8469 +
8470 +#include <asm/tlbflush.h>
8471 +#include <asm/cpu.h>
8472 +
8473 +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
8474 +
8475 +static int hlt_counter;
8476 +
8477 +unsigned long boot_option_idle_override = 0;
8478 +EXPORT_SYMBOL(boot_option_idle_override);
8479 +
8480 +/*
8481 + * Return saved PC of a blocked thread.
8482 + */
8483 +unsigned long thread_saved_pc(struct task_struct *tsk)
8484 +{
8485 + return ((unsigned long *)tsk->thread.esp)[3];
8486 +}
8487 +
8488 +/*
8489 + * Power management idle function, if any.
8490 + */
8491 +void (*pm_idle)(void);
8492 +EXPORT_SYMBOL(pm_idle);
8493 +static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
8494 +
8495 +void disable_hlt(void)
8496 +{
8497 + hlt_counter++;
8498 +}
8499 +
8500 +EXPORT_SYMBOL(disable_hlt);
8501 +
8502 +void enable_hlt(void)
8503 +{
8504 + hlt_counter--;
8505 +}
8506 +
8507 +EXPORT_SYMBOL(enable_hlt);
8508 +
8509 +/*
8510 + * On SMP it's slightly faster (but much more power-consuming!)
8511 + * to poll the ->work.need_resched flag instead of waiting for the
8512 + * cross-CPU IPI to arrive. Use this option with caution.
8513 + */
8514 +static void poll_idle (void)
8515 +{
8516 + local_irq_enable();
8517 +
8518 + asm volatile(
8519 + "2:"
8520 + "testl %0, %1;"
8521 + "rep; nop;"
8522 + "je 2b;"
8523 + : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
8524 +}
8525 +
8526 +static void xen_idle(void)
8527 +{
8528 + local_irq_disable();
8529 +
8530 + if (need_resched())
8531 + local_irq_enable();
8532 + else {
8533 + current_thread_info()->status &= ~TS_POLLING;
8534 + smp_mb__after_clear_bit();
8535 + safe_halt();
8536 + current_thread_info()->status |= TS_POLLING;
8537 + }
8538 +}
8539 +#ifdef CONFIG_APM_MODULE
8540 +EXPORT_SYMBOL(default_idle);
8541 +#endif
8542 +
8543 +#ifdef CONFIG_HOTPLUG_CPU
8544 +extern cpumask_t cpu_initialized;
8545 +static inline void play_dead(void)
8546 +{
8547 + idle_task_exit();
8548 + local_irq_disable();
8549 + cpu_clear(smp_processor_id(), cpu_initialized);
8550 + preempt_enable_no_resched();
8551 + VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
8552 + cpu_bringup();
8553 +}
8554 +#else
8555 +static inline void play_dead(void)
8556 +{
8557 + BUG();
8558 +}
8559 +#endif /* CONFIG_HOTPLUG_CPU */
8560 +
8561 +/*
8562 + * The idle thread. There's no useful work to be
8563 + * done, so just try to conserve power and have a
8564 + * low exit latency (ie sit in a loop waiting for
8565 + * somebody to say that they'd like to reschedule)
8566 + */
8567 +void cpu_idle(void)
8568 +{
8569 + int cpu = smp_processor_id();
8570 +
8571 + current_thread_info()->status |= TS_POLLING;
8572 +
8573 + /* endless idle loop with no priority at all */
8574 + while (1) {
8575 + while (!need_resched()) {
8576 + void (*idle)(void);
8577 +
8578 + if (__get_cpu_var(cpu_idle_state))
8579 + __get_cpu_var(cpu_idle_state) = 0;
8580 +
8581 + rmb();
8582 + idle = xen_idle; /* no alternatives */
8583 +
8584 + if (cpu_is_offline(cpu))
8585 + play_dead();
8586 +
8587 + __get_cpu_var(irq_stat).idle_timestamp = jiffies;
8588 + idle();
8589 + }
8590 + preempt_enable_no_resched();
8591 + schedule();
8592 + preempt_disable();
8593 + }
8594 +}
8595 +
8596 +void cpu_idle_wait(void)
8597 +{
8598 + unsigned int cpu, this_cpu = get_cpu();
8599 + cpumask_t map;
8600 +
8601 + set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
8602 + put_cpu();
8603 +
8604 + cpus_clear(map);
8605 + for_each_online_cpu(cpu) {
8606 + per_cpu(cpu_idle_state, cpu) = 1;
8607 + cpu_set(cpu, map);
8608 + }
8609 +
8610 + __get_cpu_var(cpu_idle_state) = 0;
8611 +
8612 + wmb();
8613 + do {
8614 + ssleep(1);
8615 + for_each_online_cpu(cpu) {
8616 + if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
8617 + cpu_clear(cpu, map);
8618 + }
8619 + cpus_and(map, map, cpu_online_map);
8620 + } while (!cpus_empty(map));
8621 +}
8622 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
8623 +
8624 +void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
8625 +{
8626 +}
8627 +
8628 +static int __init idle_setup (char *str)
8629 +{
8630 + if (!strncmp(str, "poll", 4)) {
8631 + printk("using polling idle threads.\n");
8632 + pm_idle = poll_idle;
8633 + }
8634 +
8635 + boot_option_idle_override = 1;
8636 + return 1;
8637 +}
8638 +
8639 +__setup("idle=", idle_setup);
8640 +
8641 +void show_regs(struct pt_regs * regs)
8642 +{
8643 + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
8644 +
8645 + printk("\n");
8646 + printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
8647 + printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
8648 + print_symbol("EIP is at %s\n", regs->eip);
8649 +
8650 + if (user_mode_vm(regs))
8651 + printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
8652 + printk(" EFLAGS: %08lx %s (%s %.*s)\n",
8653 + regs->eflags, print_tainted(), system_utsname.release,
8654 + (int)strcspn(system_utsname.version, " "),
8655 + system_utsname.version);
8656 + printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
8657 + regs->eax,regs->ebx,regs->ecx,regs->edx);
8658 + printk("ESI: %08lx EDI: %08lx EBP: %08lx",
8659 + regs->esi, regs->edi, regs->ebp);
8660 + printk(" DS: %04x ES: %04x\n",
8661 + 0xffff & regs->xds,0xffff & regs->xes);
8662 +
8663 + cr0 = read_cr0();
8664 + cr2 = read_cr2();
8665 + cr3 = read_cr3();
8666 + cr4 = read_cr4_safe();
8667 + printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
8668 + show_trace(NULL, regs, &regs->esp);
8669 +}
8670 +
8671 +/*
8672 + * This gets run with %ebx containing the
8673 + * function to call, and %edx containing
8674 + * the "args".
8675 + */
8676 +extern void kernel_thread_helper(void);
8677 +__asm__(".section .text\n"
8678 + ".align 4\n"
8679 + "kernel_thread_helper:\n\t"
8680 + "movl %edx,%eax\n\t"
8681 + "pushl %edx\n\t"
8682 + "call *%ebx\n\t"
8683 + "pushl %eax\n\t"
8684 + "call do_exit\n"
8685 + ".previous");
8686 +
8687 +/*
8688 + * Create a kernel thread
8689 + */
8690 +int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
8691 +{
8692 + struct pt_regs regs;
8693 +
8694 + memset(&regs, 0, sizeof(regs));
8695 +
8696 + regs.ebx = (unsigned long) fn;
8697 + regs.edx = (unsigned long) arg;
8698 +
8699 + regs.xds = __USER_DS;
8700 + regs.xes = __USER_DS;
8701 + regs.orig_eax = -1;
8702 + regs.eip = (unsigned long) kernel_thread_helper;
8703 + regs.xcs = GET_KERNEL_CS();
8704 + regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
8705 +
8706 + /* Ok, create the new process.. */
8707 + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
8708 +}
8709 +EXPORT_SYMBOL(kernel_thread);
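A minimal usage sketch for kernel_thread() above (illustrative only, not from the patch; the worker function is hypothetical); the assembly helper routes the function's return value into do_exit():

	static int my_worker(void *data)
	{
		/* do the work; returning here ends up in do_exit() via the helper */
		return 0;
	}

	/* somewhere in init code: */
	int pid = kernel_thread(my_worker, NULL, CLONE_FS | CLONE_FILES | SIGCHLD);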
8710 +
8711 +/*
8712 + * Free current thread data structures etc..
8713 + */
8714 +void exit_thread(void)
8715 +{
8716 + /* The process may have allocated an io port bitmap... nuke it. */
8717 + if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
8718 + struct task_struct *tsk = current;
8719 + struct thread_struct *t = &tsk->thread;
8720 + struct physdev_set_iobitmap set_iobitmap;
8721 + memset(&set_iobitmap, 0, sizeof(set_iobitmap));
8722 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
8723 + &set_iobitmap));
8724 + kfree(t->io_bitmap_ptr);
8725 + t->io_bitmap_ptr = NULL;
8726 + clear_thread_flag(TIF_IO_BITMAP);
8727 + }
8728 +}
8729 +
8730 +void flush_thread(void)
8731 +{
8732 + struct task_struct *tsk = current;
8733 +
8734 + memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
8735 + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
8736 + clear_tsk_thread_flag(tsk, TIF_DEBUG);
8737 + /*
8738 + * Forget coprocessor state..
8739 + */
8740 + clear_fpu(tsk);
8741 + clear_used_math();
8742 +}
8743 +
8744 +void release_thread(struct task_struct *dead_task)
8745 +{
8746 + BUG_ON(dead_task->mm);
8747 + release_vm86_irqs(dead_task);
8748 +}
8749 +
8750 +/*
8751 + * This gets called before we allocate a new thread and copy
8752 + * the current task into it.
8753 + */
8754 +void prepare_to_copy(struct task_struct *tsk)
8755 +{
8756 + unlazy_fpu(tsk);
8757 +}
8758 +
8759 +int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
8760 + unsigned long unused,
8761 + struct task_struct * p, struct pt_regs * regs)
8762 +{
8763 + struct pt_regs * childregs;
8764 + struct task_struct *tsk;
8765 + int err;
8766 +
8767 + childregs = task_pt_regs(p);
8768 + *childregs = *regs;
8769 + childregs->eax = 0;
8770 + childregs->esp = esp;
8771 +
8772 + p->thread.esp = (unsigned long) childregs;
8773 + p->thread.esp0 = (unsigned long) (childregs+1);
8774 +
8775 + p->thread.eip = (unsigned long) ret_from_fork;
8776 +
8777 + savesegment(fs,p->thread.fs);
8778 + savesegment(gs,p->thread.gs);
8779 +
8780 + tsk = current;
8781 + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
8782 + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
8783 + if (!p->thread.io_bitmap_ptr) {
8784 + p->thread.io_bitmap_max = 0;
8785 + return -ENOMEM;
8786 + }
8787 + memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
8788 + IO_BITMAP_BYTES);
8789 + set_tsk_thread_flag(p, TIF_IO_BITMAP);
8790 + }
8791 +
8792 + /*
8793 + * Set a new TLS for the child thread?
8794 + */
8795 + if (clone_flags & CLONE_SETTLS) {
8796 + struct desc_struct *desc;
8797 + struct user_desc info;
8798 + int idx;
8799 +
8800 + err = -EFAULT;
8801 + if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
8802 + goto out;
8803 + err = -EINVAL;
8804 + if (LDT_empty(&info))
8805 + goto out;
8806 +
8807 + idx = info.entry_number;
8808 + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
8809 + goto out;
8810 +
8811 + desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
8812 + desc->a = LDT_entry_a(&info);
8813 + desc->b = LDT_entry_b(&info);
8814 + }
8815 +
8816 + p->thread.iopl = current->thread.iopl;
8817 +
8818 + err = 0;
8819 + out:
8820 + if (err && p->thread.io_bitmap_ptr) {
8821 + kfree(p->thread.io_bitmap_ptr);
8822 + p->thread.io_bitmap_max = 0;
8823 + }
8824 + return err;
8825 +}
8826 +
8827 +/*
8828 + * fill in the user structure for a core dump..
8829 + */
8830 +void dump_thread(struct pt_regs * regs, struct user * dump)
8831 +{
8832 + int i;
8833 +
8834 +/* changed the size calculations - should hopefully work better. lbt */
8835 + dump->magic = CMAGIC;
8836 + dump->start_code = 0;
8837 + dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
8838 + dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
8839 + dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
8840 + dump->u_dsize -= dump->u_tsize;
8841 + dump->u_ssize = 0;
8842 + for (i = 0; i < 8; i++)
8843 + dump->u_debugreg[i] = current->thread.debugreg[i];
8844 +
8845 + if (dump->start_stack < TASK_SIZE)
8846 + dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
8847 +
8848 + dump->regs.ebx = regs->ebx;
8849 + dump->regs.ecx = regs->ecx;
8850 + dump->regs.edx = regs->edx;
8851 + dump->regs.esi = regs->esi;
8852 + dump->regs.edi = regs->edi;
8853 + dump->regs.ebp = regs->ebp;
8854 + dump->regs.eax = regs->eax;
8855 + dump->regs.ds = regs->xds;
8856 + dump->regs.es = regs->xes;
8857 + savesegment(fs,dump->regs.fs);
8858 + savesegment(gs,dump->regs.gs);
8859 + dump->regs.orig_eax = regs->orig_eax;
8860 + dump->regs.eip = regs->eip;
8861 + dump->regs.cs = regs->xcs;
8862 + dump->regs.eflags = regs->eflags;
8863 + dump->regs.esp = regs->esp;
8864 + dump->regs.ss = regs->xss;
8865 +
8866 + dump->u_fpvalid = dump_fpu (regs, &dump->i387);
8867 +}
8868 +EXPORT_SYMBOL(dump_thread);
8869 +
8870 +/*
8871 + * Capture the user space registers if the task is not running (in user space)
8872 + */
8873 +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
8874 +{
8875 + struct pt_regs ptregs = *task_pt_regs(tsk);
8876 + ptregs.xcs &= 0xffff;
8877 + ptregs.xds &= 0xffff;
8878 + ptregs.xes &= 0xffff;
8879 + ptregs.xss &= 0xffff;
8880 +
8881 + elf_core_copy_regs(regs, &ptregs);
8882 +
8883 + return 1;
8884 +}
8885 +
8886 +static noinline void __switch_to_xtra(struct task_struct *next_p)
8887 +{
8888 + struct thread_struct *next;
8889 +
8890 + next = &next_p->thread;
8891 +
8892 + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
8893 + set_debugreg(next->debugreg[0], 0);
8894 + set_debugreg(next->debugreg[1], 1);
8895 + set_debugreg(next->debugreg[2], 2);
8896 + set_debugreg(next->debugreg[3], 3);
8897 + /* no 4 and 5 */
8898 + set_debugreg(next->debugreg[6], 6);
8899 + set_debugreg(next->debugreg[7], 7);
8900 + }
8901 +}
8902 +
8903 +/*
8904 + * This function selects if the context switch from prev to next
8905 + * has to tweak the TSC disable bit in the cr4.
8906 + */
8907 +static inline void disable_tsc(struct task_struct *prev_p,
8908 + struct task_struct *next_p)
8909 +{
8910 + struct thread_info *prev, *next;
8911 +
8912 + /*
8913 + * gcc should eliminate the ->thread_info dereference if
8914 + * has_secure_computing returns 0 at compile time (SECCOMP=n).
8915 + */
8916 + prev = task_thread_info(prev_p);
8917 + next = task_thread_info(next_p);
8918 +
8919 + if (has_secure_computing(prev) || has_secure_computing(next)) {
8920 + /* slow path here */
8921 + if (has_secure_computing(prev) &&
8922 + !has_secure_computing(next)) {
8923 + write_cr4(read_cr4() & ~X86_CR4_TSD);
8924 + } else if (!has_secure_computing(prev) &&
8925 + has_secure_computing(next))
8926 + write_cr4(read_cr4() | X86_CR4_TSD);
8927 + }
8928 +}
8929 +
8930 +/*
8931 + * switch_to(x,y) should switch tasks from x to y.
8932 + *
8933 + * We fsave/fwait so that an exception goes off at the right time
8934 + * (as a call from the fsave or fwait in effect) rather than to
8935 + * the wrong process. Lazy FP saving no longer makes any sense
8936 + * with modern CPUs, and this simplifies a lot of things (SMP
8937 + * and UP become the same).
8938 + *
8939 + * NOTE! We used to use the x86 hardware context switching. The
8940 + * reason for not using it any more becomes apparent when you
8941 + * try to recover gracefully from saved state that is no longer
8942 + * valid (stale segment register values in particular). With the
8943 + * hardware task-switch, there is no way to fix up bad state in
8944 + * a reasonable manner.
8945 + *
8946 + * The fact that Intel documents the hardware task-switching to
8947 + * be slow is a fairly red herring - this code is not noticeably
8948 + * faster. However, there _is_ some room for improvement here,
8949 + * so the performance issues may eventually be a valid point.
8950 + * More important, however, is the fact that this allows us much
8951 + * more flexibility.
8952 + *
8953 + * The return value (in %eax) will be the "prev" task after
8954 + * the task-switch, and shows up in ret_from_fork in entry.S,
8955 + * for example.
8956 + */
8957 +struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
8958 +{
8959 + struct thread_struct *prev = &prev_p->thread,
8960 + *next = &next_p->thread;
8961 + int cpu = smp_processor_id();
8962 +#ifndef CONFIG_X86_NO_TSS
8963 + struct tss_struct *tss = &per_cpu(init_tss, cpu);
8964 +#endif
8965 +#if CONFIG_XEN_COMPAT > 0x030002
8966 + struct physdev_set_iopl iopl_op;
8967 + struct physdev_set_iobitmap iobmp_op;
8968 +#else
8969 + struct physdev_op _pdo[2], *pdo = _pdo;
8970 +#define iopl_op pdo->u.set_iopl
8971 +#define iobmp_op pdo->u.set_iobitmap
8972 +#endif
8973 + multicall_entry_t _mcl[8], *mcl = _mcl;
8974 +
8975 + /* XEN NOTE: FS/GS saved in switch_mm(), not here. */
8976 +
8977 + /*
8978 + * This is basically '__unlazy_fpu', except that we queue a
8979 + * multicall to indicate FPU task switch, rather than
8980 + * synchronously trapping to Xen.
8981 + */
8982 + if (prev_p->thread_info->status & TS_USEDFPU) {
8983 + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
8984 + mcl->op = __HYPERVISOR_fpu_taskswitch;
8985 + mcl->args[0] = 1;
8986 + mcl++;
8987 + }
8988 +#if 0 /* lazy fpu sanity check */
8989 + else BUG_ON(!(read_cr0() & 8));
8990 +#endif
8991 +
8992 + /*
8993 + * Reload esp0.
8994 + * This is load_esp0(tss, next) with a multicall.
8995 + */
8996 + mcl->op = __HYPERVISOR_stack_switch;
8997 + mcl->args[0] = __KERNEL_DS;
8998 + mcl->args[1] = next->esp0;
8999 + mcl++;
9000 +
9001 + /*
9002 + * Load the per-thread Thread-Local Storage descriptor.
9003 + * This is load_TLS(next, cpu) with multicalls.
9004 + */
9005 +#define C(i) do { \
9006 + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
9007 + next->tls_array[i].b != prev->tls_array[i].b)) { \
9008 + mcl->op = __HYPERVISOR_update_descriptor; \
9009 + *(u64 *)&mcl->args[0] = virt_to_machine( \
9010 + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
9011 + *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \
9012 + mcl++; \
9013 + } \
9014 +} while (0)
9015 + C(0); C(1); C(2);
9016 +#undef C
9017 +
9018 + if (unlikely(prev->iopl != next->iopl)) {
9019 + iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
9020 +#if CONFIG_XEN_COMPAT > 0x030002
9021 + mcl->op = __HYPERVISOR_physdev_op;
9022 + mcl->args[0] = PHYSDEVOP_set_iopl;
9023 + mcl->args[1] = (unsigned long)&iopl_op;
9024 +#else
9025 + mcl->op = __HYPERVISOR_physdev_op_compat;
9026 + pdo->cmd = PHYSDEVOP_set_iopl;
9027 + mcl->args[0] = (unsigned long)pdo++;
9028 +#endif
9029 + mcl++;
9030 + }
9031 +
9032 + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
9033 + set_xen_guest_handle(iobmp_op.bitmap,
9034 + (char *)next->io_bitmap_ptr);
9035 + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
9036 +#if CONFIG_XEN_COMPAT > 0x030002
9037 + mcl->op = __HYPERVISOR_physdev_op;
9038 + mcl->args[0] = PHYSDEVOP_set_iobitmap;
9039 + mcl->args[1] = (unsigned long)&iobmp_op;
9040 +#else
9041 + mcl->op = __HYPERVISOR_physdev_op_compat;
9042 + pdo->cmd = PHYSDEVOP_set_iobitmap;
9043 + mcl->args[0] = (unsigned long)pdo++;
9044 +#endif
9045 + mcl++;
9046 + }
9047 +
9048 +#if CONFIG_XEN_COMPAT <= 0x030002
9049 + BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo));
9050 +#endif
9051 + BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
9052 + if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
9053 + BUG();
9054 +
9055 + /*
9056 + * Restore %fs and %gs if needed.
9057 + *
9058 + * Glibc normally makes %fs be zero, and %gs is one of
9059 + * the TLS segments.
9060 + */
9061 + if (unlikely(next->fs))
9062 + loadsegment(fs, next->fs);
9063 +
9064 + if (next->gs)
9065 + loadsegment(gs, next->gs);
9066 +
9067 + /*
9068 + * Now maybe handle debug registers
9069 + */
9070 + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
9071 + __switch_to_xtra(next_p);
9072 +
9073 + disable_tsc(prev_p, next_p);
9074 +
9075 + return prev_p;
9076 +}
9077 +
9078 +asmlinkage int sys_fork(struct pt_regs regs)
9079 +{
9080 + return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
9081 +}
9082 +
9083 +asmlinkage int sys_clone(struct pt_regs regs)
9084 +{
9085 + unsigned long clone_flags;
9086 + unsigned long newsp;
9087 + int __user *parent_tidptr, *child_tidptr;
9088 +
9089 + clone_flags = regs.ebx;
9090 + newsp = regs.ecx;
9091 + parent_tidptr = (int __user *)regs.edx;
9092 + child_tidptr = (int __user *)regs.edi;
9093 + if (!newsp)
9094 + newsp = regs.esp;
9095 + return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
9096 +}
9097 +
9098 +/*
9099 + * This is trivial, and on the face of it looks like it
9100 + * could equally well be done in user mode.
9101 + *
9102 + * Not so, for quite unobvious reasons - register pressure.
9103 + * In user mode vfork() cannot have a stack frame, and if
9104 + * done by calling the "clone()" system call directly, you
9105 + * do not have enough call-clobbered registers to hold all
9106 + * the information you need.
9107 + */
9108 +asmlinkage int sys_vfork(struct pt_regs regs)
9109 +{
9110 + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
9111 +}
9112 +
9113 +/*
9114 + * sys_execve() executes a new program.
9115 + */
9116 +asmlinkage int sys_execve(struct pt_regs regs)
9117 +{
9118 + int error;
9119 + char * filename;
9120 +
9121 + filename = getname((char __user *) regs.ebx);
9122 + error = PTR_ERR(filename);
9123 + if (IS_ERR(filename))
9124 + goto out;
9125 + error = do_execve(filename,
9126 + (char __user * __user *) regs.ecx,
9127 + (char __user * __user *) regs.edx,
9128 + &regs);
9129 + if (error == 0) {
9130 + task_lock(current);
9131 + current->ptrace &= ~PT_DTRACE;
9132 + task_unlock(current);
9133 + /* Make sure we don't return using sysenter.. */
9134 + set_thread_flag(TIF_IRET);
9135 + }
9136 + putname(filename);
9137 +out:
9138 + return error;
9139 +}
9140 +
9141 +#define top_esp (THREAD_SIZE - sizeof(unsigned long))
9142 +#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
9143 +
9144 +unsigned long get_wchan(struct task_struct *p)
9145 +{
9146 + unsigned long ebp, esp, eip;
9147 + unsigned long stack_page;
9148 + int count = 0;
9149 + if (!p || p == current || p->state == TASK_RUNNING)
9150 + return 0;
9151 + stack_page = (unsigned long)task_stack_page(p);
9152 + esp = p->thread.esp;
9153 + if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
9154 + return 0;
9155 + /* include/asm-i386/system.h:switch_to() pushes ebp last. */
9156 + ebp = *(unsigned long *) esp;
9157 + do {
9158 + if (ebp < stack_page || ebp > top_ebp+stack_page)
9159 + return 0;
9160 + eip = *(unsigned long *) (ebp+4);
9161 + if (!in_sched_functions(eip))
9162 + return eip;
9163 + ebp = *(unsigned long *) ebp;
9164 + } while (count++ < 16);
9165 + return 0;
9166 +}
9167 +
9168 +/*
9169 + * sys_alloc_thread_area: get a yet unused TLS descriptor index.
9170 + */
9171 +static int get_free_idx(void)
9172 +{
9173 + struct thread_struct *t = &current->thread;
9174 + int idx;
9175 +
9176 + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
9177 + if (desc_empty(t->tls_array + idx))
9178 + return idx + GDT_ENTRY_TLS_MIN;
9179 + return -ESRCH;
9180 +}
9181 +
9182 +/*
9183 + * Set a given TLS descriptor:
9184 + */
9185 +asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
9186 +{
9187 + struct thread_struct *t = &current->thread;
9188 + struct user_desc info;
9189 + struct desc_struct *desc;
9190 + int cpu, idx;
9191 +
9192 + if (copy_from_user(&info, u_info, sizeof(info)))
9193 + return -EFAULT;
9194 + idx = info.entry_number;
9195 +
9196 + /*
9197 + * index -1 means the kernel should try to find and
9198 + * allocate an empty descriptor:
9199 + */
9200 + if (idx == -1) {
9201 + idx = get_free_idx();
9202 + if (idx < 0)
9203 + return idx;
9204 + if (put_user(idx, &u_info->entry_number))
9205 + return -EFAULT;
9206 + }
9207 +
9208 + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
9209 + return -EINVAL;
9210 +
9211 + desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
9212 +
9213 + /*
9214 + * We must not get preempted while modifying the TLS.
9215 + */
9216 + cpu = get_cpu();
9217 +
9218 + if (LDT_empty(&info)) {
9219 + desc->a = 0;
9220 + desc->b = 0;
9221 + } else {
9222 + desc->a = LDT_entry_a(&info);
9223 + desc->b = LDT_entry_b(&info);
9224 + }
9225 + load_TLS(t, cpu);
9226 +
9227 + put_cpu();
9228 +
9229 + return 0;
9230 +}
9231 +
9232 +/*
9233 + * Get the current Thread-Local Storage area:
9234 + */
9235 +
9236 +#define GET_BASE(desc) ( \
9237 + (((desc)->a >> 16) & 0x0000ffff) | \
9238 + (((desc)->b << 16) & 0x00ff0000) | \
9239 + ( (desc)->b & 0xff000000) )
9240 +
9241 +#define GET_LIMIT(desc) ( \
9242 + ((desc)->a & 0x0ffff) | \
9243 + ((desc)->b & 0xf0000) )
9244 +
9245 +#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
9246 +#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
9247 +#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
9248 +#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
9249 +#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
9250 +#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
9251 +
9252 +asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
9253 +{
9254 + struct user_desc info;
9255 + struct desc_struct *desc;
9256 + int idx;
9257 +
9258 + if (get_user(idx, &u_info->entry_number))
9259 + return -EFAULT;
9260 + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
9261 + return -EINVAL;
9262 +
9263 + memset(&info, 0, sizeof(info));
9264 +
9265 + desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
9266 +
9267 + info.entry_number = idx;
9268 + info.base_addr = GET_BASE(desc);
9269 + info.limit = GET_LIMIT(desc);
9270 + info.seg_32bit = GET_32BIT(desc);
9271 + info.contents = GET_CONTENTS(desc);
9272 + info.read_exec_only = !GET_WRITABLE(desc);
9273 + info.limit_in_pages = GET_LIMIT_PAGES(desc);
9274 + info.seg_not_present = !GET_PRESENT(desc);
9275 + info.useable = GET_USEABLE(desc);
9276 +
9277 + if (copy_to_user(u_info, &info, sizeof(info)))
9278 + return -EFAULT;
9279 + return 0;
9280 +}
9281 +
9282 +unsigned long arch_align_stack(unsigned long sp)
9283 +{
9284 + if (randomize_va_space)
9285 + sp -= get_random_int() % 8192;
9286 + return sp & ~0xf;
9287 +}
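For reference: the GET_BASE()/GET_LIMIT() macros used by sys_get_thread_area() above merely reassemble the base and limit fields that an x86 segment descriptor scatters across its two 32-bit words. The following self-contained user-space sketch performs the same bit extraction; the struct name, helper names and the sample descriptor value are illustrative only and are not taken from the kernel sources.

#include <stdio.h>
#include <stdint.h>

struct desc { uint32_t a, b; };		/* low and high descriptor words */

static uint32_t get_base(const struct desc *d)
{
	return ((d->a >> 16) & 0x0000ffff) |	/* base bits 15..0  */
	       ((d->b << 16) & 0x00ff0000) |	/* base bits 23..16 */
	       ( d->b        & 0xff000000);	/* base bits 31..24 */
}

static uint32_t get_limit(const struct desc *d)
{
	/* raw 20-bit limit field; granularity scaling is ignored here */
	return (d->a & 0x0ffff) | (d->b & 0xf0000);
}

int main(void)
{
	/*
	 * Hand-built value encoding base 0x12345678 and limit 0xabcde;
	 * the access/flag bits are left clear, so this is only good for
	 * exercising the field extraction, not for loading into a GDT.
	 */
	struct desc d = { 0x5678bcde, 0x120a0034 };

	printf("base  = 0x%08x\n", (unsigned int)get_base(&d));
	printf("limit = 0x%05x\n", (unsigned int)get_limit(&d));
	return 0;
}

Expected output is "base  = 0x12345678" and "limit = 0xabcde", matching what GET_BASE() and GET_LIMIT() would report for the same descriptor words.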
9288 Index: head-2008-11-25/arch/x86/kernel/quirks-xen.c
9289 ===================================================================
9290 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
9291 +++ head-2008-11-25/arch/x86/kernel/quirks-xen.c 2008-01-28 12:24:19.000000000 +0100
9292 @@ -0,0 +1,47 @@
9293 +/*
9294 + * This file contains work-arounds for x86 and x86_64 platform bugs.
9295 + */
9296 +#include <linux/pci.h>
9297 +#include <linux/irq.h>
9298 +
9299 +#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
9300 +
9301 +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
9302 +{
9303 + u8 config, rev;
9304 + u32 word;
9305 +
9306 + /* BIOS may enable hardware IRQ balancing for
9307 + * E7520/E7320/E7525 (revision ID 0x9 and below)
9308 + * based platforms.
9309 + * Disable SW irqbalance/affinity on those platforms.
9310 + */
9311 + pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
9312 + if (rev > 0x9)
9313 + return;
9314 +
9315 + printk(KERN_INFO "Intel E7520/7320/7525 detected.");
9316 +
9317 + /* enable access to config space */
9318 + pci_read_config_byte(dev, 0xf4, &config);
9319 + pci_write_config_byte(dev, 0xf4, config|0x2);
9320 +
9321 + /* read xTPR register */
9322 + raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
9323 +
9324 + if (!(word & (1 << 13))) {
9325 + struct xen_platform_op op;
9326 + printk(KERN_INFO "Disabling irq balancing and affinity\n");
9327 + op.cmd = XENPF_platform_quirk;
9328 + op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
9329 + WARN_ON(HYPERVISOR_platform_op(&op));
9330 + }
9331 +
9332 + /* put back the original value for config space */
9333 + if (!(config & 0x2))
9334 + pci_write_config_byte(dev, 0xf4, config);
9335 +}
9336 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
9337 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
9338 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
9339 +#endif
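For reference: the quirk above keys off PCI_CLASS_REVISION, i.e. config-space offset 0x08, whose first byte is the device's Revision ID, and does nothing for steppings newer than 0x9. The sketch below reads that same byte through sysfs from user space; the device address 0000:00:00.0 is only a placeholder for wherever the E7520/E7320/E7525 host bridge actually sits.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* hypothetical device path; adjust to the host bridge in question */
	const char *cfg = "/sys/bus/pci/devices/0000:00:00.0/config";
	uint8_t rev;
	FILE *f = fopen(cfg, "rb");

	if (!f) {
		perror(cfg);
		return 1;
	}
	/* Revision ID lives at config-space offset 0x08 */
	if (fseek(f, 0x08, SEEK_SET) != 0 || fread(&rev, 1, 1, f) != 1) {
		fprintf(stderr, "could not read revision ID from %s\n", cfg);
		fclose(f);
		return 1;
	}
	fclose(f);

	printf("revision ID: 0x%02x (quirk applies only when <= 0x9)\n", rev);
	return 0;
}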
9340 Index: head-2008-11-25/arch/x86/kernel/setup_32-xen.c
9341 ===================================================================
9342 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
9343 +++ head-2008-11-25/arch/x86/kernel/setup_32-xen.c 2008-04-22 15:41:51.000000000 +0200
9344 @@ -0,0 +1,1919 @@
9345 +/*
9346 + * linux/arch/i386/kernel/setup.c
9347 + *
9348 + * Copyright (C) 1995 Linus Torvalds
9349 + *
9350 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
9351 + *
9352 + * Memory region support
9353 + * David Parsons <orc@pell.chi.il.us>, July-August 1999
9354 + *
9355 + * Added E820 sanitization routine (removes overlapping memory regions);
9356 + * Brian Moyle <bmoyle@mvista.com>, February 2001
9357 + *
9358 + * Moved CPU detection code to cpu/${cpu}.c
9359 + * Patrick Mochel <mochel@osdl.org>, March 2002
9360 + *
9361 + * Provisions for empty E820 memory regions (reported by certain BIOSes).
9362 + * Alex Achenbach <xela@slit.de>, December 2002.
9363 + *
9364 + */
9365 +
9366 +/*
9367 + * This file handles the architecture-dependent parts of initialization
9368 + */
9369 +
9370 +#include <linux/sched.h>
9371 +#include <linux/mm.h>
9372 +#include <linux/mmzone.h>
9373 +#include <linux/screen_info.h>
9374 +#include <linux/ioport.h>
9375 +#include <linux/acpi.h>
9376 +#include <linux/apm_bios.h>
9377 +#include <linux/initrd.h>
9378 +#include <linux/bootmem.h>
9379 +#include <linux/seq_file.h>
9380 +#include <linux/platform_device.h>
9381 +#include <linux/console.h>
9382 +#include <linux/mca.h>
9383 +#include <linux/root_dev.h>
9384 +#include <linux/highmem.h>
9385 +#include <linux/module.h>
9386 +#include <linux/efi.h>
9387 +#include <linux/init.h>
9388 +#include <linux/edd.h>
9389 +#include <linux/nodemask.h>
9390 +#include <linux/kernel.h>
9391 +#include <linux/percpu.h>
9392 +#include <linux/notifier.h>
9393 +#include <linux/kexec.h>
9394 +#include <linux/crash_dump.h>
9395 +#include <linux/dmi.h>
9396 +#include <linux/pfn.h>
9397 +
9398 +#include <video/edid.h>
9399 +
9400 +#include <asm/apic.h>
9401 +#include <asm/e820.h>
9402 +#include <asm/mpspec.h>
9403 +#include <asm/setup.h>
9404 +#include <asm/arch_hooks.h>
9405 +#include <asm/sections.h>
9406 +#include <asm/io_apic.h>
9407 +#include <asm/ist.h>
9408 +#include <asm/io.h>
9409 +#include <asm/hypervisor.h>
9410 +#include <xen/interface/physdev.h>
9411 +#include <xen/interface/memory.h>
9412 +#include <xen/features.h>
9413 +#include <xen/firmware.h>
9414 +#include <xen/xencons.h>
9415 +#include <setup_arch.h>
9416 +#include <bios_ebda.h>
9417 +
9418 +#ifdef CONFIG_XEN
9419 +#include <xen/interface/kexec.h>
9420 +#endif
9421 +
9422 +/* Forward Declaration. */
9423 +void __init find_max_pfn(void);
9424 +
9425 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
9426 +static struct notifier_block xen_panic_block = {
9427 + xen_panic_event, NULL, 0 /* try to go last */
9428 +};
9429 +
9430 +extern char hypercall_page[PAGE_SIZE];
9431 +EXPORT_SYMBOL(hypercall_page);
9432 +
9433 +int disable_pse __devinitdata = 0;
9434 +
9435 +/*
9436 + * Machine setup..
9437 + */
9438 +
9439 +#ifdef CONFIG_EFI
9440 +int efi_enabled = 0;
9441 +EXPORT_SYMBOL(efi_enabled);
9442 +#endif
9443 +
9444 +/* cpu data as detected by the assembly code in head.S */
9445 +struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
9446 +/* common cpu data for all cpus */
9447 +struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
9448 +EXPORT_SYMBOL(boot_cpu_data);
9449 +
9450 +unsigned long mmu_cr4_features;
9451 +
9452 +#ifdef CONFIG_ACPI
9453 + int acpi_disabled = 0;
9454 +#else
9455 + int acpi_disabled = 1;
9456 +#endif
9457 +EXPORT_SYMBOL(acpi_disabled);
9458 +
9459 +#ifdef CONFIG_ACPI
9460 +int __initdata acpi_force = 0;
9461 +extern acpi_interrupt_flags acpi_sci_flags;
9462 +#endif
9463 +
9464 +/* for MCA, but anyone else can use it if they want */
9465 +unsigned int machine_id;
9466 +#ifdef CONFIG_MCA
9467 +EXPORT_SYMBOL(machine_id);
9468 +#endif
9469 +unsigned int machine_submodel_id;
9470 +unsigned int BIOS_revision;
9471 +unsigned int mca_pentium_flag;
9472 +
9473 +/* For PCI or other memory-mapped resources */
9474 +unsigned long pci_mem_start = 0x10000000;
9475 +#ifdef CONFIG_PCI
9476 +EXPORT_SYMBOL(pci_mem_start);
9477 +#endif
9478 +
9479 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
9480 +int bootloader_type;
9481 +
9482 +/* user-defined highmem size */
9483 +static unsigned int highmem_pages = -1;
9484 +
9485 +/*
9486 + * Setup options
9487 + */
9488 +struct drive_info_struct { char dummy[32]; } drive_info;
9489 +#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \
9490 + defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
9491 +EXPORT_SYMBOL(drive_info);
9492 +#endif
9493 +struct screen_info screen_info;
9494 +EXPORT_SYMBOL(screen_info);
9495 +struct apm_info apm_info;
9496 +EXPORT_SYMBOL(apm_info);
9497 +struct sys_desc_table_struct {
9498 + unsigned short length;
9499 + unsigned char table[0];
9500 +};
9501 +struct edid_info edid_info;
9502 +EXPORT_SYMBOL_GPL(edid_info);
9503 +#ifndef CONFIG_XEN
9504 +#define copy_edid() (edid_info = EDID_INFO)
9505 +#endif
9506 +struct ist_info ist_info;
9507 +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
9508 + defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
9509 +EXPORT_SYMBOL(ist_info);
9510 +#endif
9511 +struct e820map e820;
9512 +#ifdef CONFIG_XEN
9513 +struct e820map machine_e820;
9514 +#endif
9515 +
9516 +extern void early_cpu_init(void);
9517 +extern void generic_apic_probe(char *);
9518 +extern int root_mountflags;
9519 +
9520 +unsigned long saved_videomode;
9521 +
9522 +#define RAMDISK_IMAGE_START_MASK 0x07FF
9523 +#define RAMDISK_PROMPT_FLAG 0x8000
9524 +#define RAMDISK_LOAD_FLAG 0x4000
9525 +
9526 +static char command_line[COMMAND_LINE_SIZE];
9527 +
9528 +unsigned char __initdata boot_params[PARAM_SIZE];
9529 +
9530 +static struct resource data_resource = {
9531 + .name = "Kernel data",
9532 + .start = 0,
9533 + .end = 0,
9534 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
9535 +};
9536 +
9537 +static struct resource code_resource = {
9538 + .name = "Kernel code",
9539 + .start = 0,
9540 + .end = 0,
9541 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
9542 +};
9543 +
9544 +static struct resource system_rom_resource = {
9545 + .name = "System ROM",
9546 + .start = 0xf0000,
9547 + .end = 0xfffff,
9548 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9549 +};
9550 +
9551 +static struct resource extension_rom_resource = {
9552 + .name = "Extension ROM",
9553 + .start = 0xe0000,
9554 + .end = 0xeffff,
9555 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9556 +};
9557 +
9558 +static struct resource adapter_rom_resources[] = { {
9559 + .name = "Adapter ROM",
9560 + .start = 0xc8000,
9561 + .end = 0,
9562 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9563 +}, {
9564 + .name = "Adapter ROM",
9565 + .start = 0,
9566 + .end = 0,
9567 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9568 +}, {
9569 + .name = "Adapter ROM",
9570 + .start = 0,
9571 + .end = 0,
9572 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9573 +}, {
9574 + .name = "Adapter ROM",
9575 + .start = 0,
9576 + .end = 0,
9577 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9578 +}, {
9579 + .name = "Adapter ROM",
9580 + .start = 0,
9581 + .end = 0,
9582 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9583 +}, {
9584 + .name = "Adapter ROM",
9585 + .start = 0,
9586 + .end = 0,
9587 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9588 +} };
9589 +
9590 +#define ADAPTER_ROM_RESOURCES \
9591 + (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
9592 +
9593 +static struct resource video_rom_resource = {
9594 + .name = "Video ROM",
9595 + .start = 0xc0000,
9596 + .end = 0xc7fff,
9597 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9598 +};
9599 +
9600 +static struct resource video_ram_resource = {
9601 + .name = "Video RAM area",
9602 + .start = 0xa0000,
9603 + .end = 0xbffff,
9604 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
9605 +};
9606 +
9607 +static struct resource standard_io_resources[] = { {
9608 + .name = "dma1",
9609 + .start = 0x0000,
9610 + .end = 0x001f,
9611 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9612 +}, {
9613 + .name = "pic1",
9614 + .start = 0x0020,
9615 + .end = 0x0021,
9616 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9617 +}, {
9618 + .name = "timer0",
9619 + .start = 0x0040,
9620 + .end = 0x0043,
9621 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9622 +}, {
9623 + .name = "timer1",
9624 + .start = 0x0050,
9625 + .end = 0x0053,
9626 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9627 +}, {
9628 + .name = "keyboard",
9629 + .start = 0x0060,
9630 + .end = 0x006f,
9631 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9632 +}, {
9633 + .name = "dma page reg",
9634 + .start = 0x0080,
9635 + .end = 0x008f,
9636 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9637 +}, {
9638 + .name = "pic2",
9639 + .start = 0x00a0,
9640 + .end = 0x00a1,
9641 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9642 +}, {
9643 + .name = "dma2",
9644 + .start = 0x00c0,
9645 + .end = 0x00df,
9646 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9647 +}, {
9648 + .name = "fpu",
9649 + .start = 0x00f0,
9650 + .end = 0x00ff,
9651 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9652 +} };
9653 +
9654 +#define STANDARD_IO_RESOURCES \
9655 + (sizeof standard_io_resources / sizeof standard_io_resources[0])
9656 +
9657 +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
9658 +
9659 +static int __init romchecksum(unsigned char *rom, unsigned long length)
9660 +{
9661 + unsigned char *p, sum = 0;
9662 +
9663 + for (p = rom; p < rom + length; p++)
9664 + sum += *p;
9665 + return sum == 0;
9666 +}
9667 +
9668 +static void __init probe_roms(void)
9669 +{
9670 + unsigned long start, length, upper;
9671 + unsigned char *rom;
9672 + int i;
9673 +
9674 +#ifdef CONFIG_XEN
9675 + /* Nothing to do if not running in dom0. */
9676 + if (!is_initial_xendomain())
9677 + return;
9678 +#endif
9679 +
9680 + /* video rom */
9681 + upper = adapter_rom_resources[0].start;
9682 + for (start = video_rom_resource.start; start < upper; start += 2048) {
9683 + rom = isa_bus_to_virt(start);
9684 + if (!romsignature(rom))
9685 + continue;
9686 +
9687 + video_rom_resource.start = start;
9688 +
9689 + /* 0 < length <= 0x7f * 512, historically */
9690 + length = rom[2] * 512;
9691 +
9692 + /* if checksum okay, trust length byte */
9693 + if (length && romchecksum(rom, length))
9694 + video_rom_resource.end = start + length - 1;
9695 +
9696 + request_resource(&iomem_resource, &video_rom_resource);
9697 + break;
9698 + }
9699 +
9700 + start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
9701 + if (start < upper)
9702 + start = upper;
9703 +
9704 + /* system rom */
9705 + request_resource(&iomem_resource, &system_rom_resource);
9706 + upper = system_rom_resource.start;
9707 +
9708 + /* check for extension rom (ignore length byte!) */
9709 + rom = isa_bus_to_virt(extension_rom_resource.start);
9710 + if (romsignature(rom)) {
9711 + length = extension_rom_resource.end - extension_rom_resource.start + 1;
9712 + if (romchecksum(rom, length)) {
9713 + request_resource(&iomem_resource, &extension_rom_resource);
9714 + upper = extension_rom_resource.start;
9715 + }
9716 + }
9717 +
9718 + /* check for adapter roms on 2k boundaries */
9719 + for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
9720 + rom = isa_bus_to_virt(start);
9721 + if (!romsignature(rom))
9722 + continue;
9723 +
9724 + /* 0 < length <= 0x7f * 512, historically */
9725 + length = rom[2] * 512;
9726 +
9727 + /* but accept any length that fits if checksum okay */
9728 + if (!length || start + length > upper || !romchecksum(rom, length))
9729 + continue;
9730 +
9731 + adapter_rom_resources[i].start = start;
9732 + adapter_rom_resources[i].end = start + length - 1;
9733 + request_resource(&iomem_resource, &adapter_rom_resources[i]);
9734 +
9735 + start = adapter_rom_resources[i++].end & ~2047UL;
9736 + }
9737 +}
9738 +
9739 +/*
9740 + * Point at the empty zero page to start with. We map the real shared_info
9741 + * page as soon as fixmap is up and running.
9742 + */
9743 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
9744 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
9745 +
9746 +unsigned long *phys_to_machine_mapping;
9747 +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
9748 +EXPORT_SYMBOL(phys_to_machine_mapping);
9749 +
9750 +/* Raw start-of-day parameters from the hypervisor. */
9751 +start_info_t *xen_start_info;
9752 +EXPORT_SYMBOL(xen_start_info);
9753 +
9754 +void __init add_memory_region(unsigned long long start,
9755 + unsigned long long size, int type)
9756 +{
9757 + int x;
9758 +
9759 + if (!efi_enabled) {
9760 + x = e820.nr_map;
9761 +
9762 + if (x == E820MAX) {
9763 + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
9764 + return;
9765 + }
9766 +
9767 + e820.map[x].addr = start;
9768 + e820.map[x].size = size;
9769 + e820.map[x].type = type;
9770 + e820.nr_map++;
9771 + }
9772 +} /* add_memory_region */
9773 +
9774 +static void __init limit_regions(unsigned long long size)
9775 +{
9776 + unsigned long long current_addr = 0;
9777 + int i;
9778 +
9779 + if (efi_enabled) {
9780 + efi_memory_desc_t *md;
9781 + void *p;
9782 +
9783 + for (p = memmap.map, i = 0; p < memmap.map_end;
9784 + p += memmap.desc_size, i++) {
9785 + md = p;
9786 + current_addr = md->phys_addr + (md->num_pages << 12);
9787 + if (md->type == EFI_CONVENTIONAL_MEMORY) {
9788 + if (current_addr >= size) {
9789 + md->num_pages -=
9790 + (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
9791 + memmap.nr_map = i + 1;
9792 + return;
9793 + }
9794 + }
9795 + }
9796 + }
9797 + for (i = 0; i < e820.nr_map; i++) {
9798 + current_addr = e820.map[i].addr + e820.map[i].size;
9799 + if (current_addr < size)
9800 + continue;
9801 +
9802 + if (e820.map[i].type != E820_RAM)
9803 + continue;
9804 +
9805 + if (e820.map[i].addr >= size) {
9806 + /*
9807 + * This region starts past the end of the
9808 + * requested size, skip it completely.
9809 + */
9810 + e820.nr_map = i;
9811 + } else {
9812 + e820.nr_map = i + 1;
9813 + e820.map[i].size -= current_addr - size;
9814 + }
9815 + return;
9816 + }
9817 +#ifdef CONFIG_XEN
9818 + if (i==e820.nr_map && current_addr < size) {
9819 + /*
9820 + * The e820 map finished before our requested size so
9821 + * extend the final entry to the requested address.
9822 + */
9823 + --i;
9824 + if (e820.map[i].type == E820_RAM)
9825 + e820.map[i].size -= current_addr - size;
9826 + else
9827 + add_memory_region(current_addr, size - current_addr, E820_RAM);
9828 + }
9829 +#endif
9830 +}
9831 +
9832 +#define E820_DEBUG 1
9833 +
9834 +static void __init print_memory_map(char *who)
9835 +{
9836 + int i;
9837 +
9838 + for (i = 0; i < e820.nr_map; i++) {
9839 + printk(" %s: %016Lx - %016Lx ", who,
9840 + e820.map[i].addr,
9841 + e820.map[i].addr + e820.map[i].size);
9842 + switch (e820.map[i].type) {
9843 + case E820_RAM: printk("(usable)\n");
9844 + break;
9845 + case E820_RESERVED:
9846 + printk("(reserved)\n");
9847 + break;
9848 + case E820_ACPI:
9849 + printk("(ACPI data)\n");
9850 + break;
9851 + case E820_NVS:
9852 + printk("(ACPI NVS)\n");
9853 + break;
9854 + default: printk("type %lu\n", e820.map[i].type);
9855 + break;
9856 + }
9857 + }
9858 +}
9859 +
9860 +/*
9861 + * Sanitize the BIOS e820 map.
9862 + *
9863 + * Some e820 responses include overlapping entries. The following
9864 + * replaces the original e820 map with a new one, removing overlaps.
9865 + *
9866 + */
9867 +struct change_member {
9868 + struct e820entry *pbios; /* pointer to original bios entry */
9869 + unsigned long long addr; /* address for this change point */
9870 +};
9871 +static struct change_member change_point_list[2*E820MAX] __initdata;
9872 +static struct change_member *change_point[2*E820MAX] __initdata;
9873 +static struct e820entry *overlap_list[E820MAX] __initdata;
9874 +static struct e820entry new_bios[E820MAX] __initdata;
9875 +
9876 +int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
9877 +{
9878 + struct change_member *change_tmp;
9879 + unsigned long current_type, last_type;
9880 + unsigned long long last_addr;
9881 + int chgidx, still_changing;
9882 + int overlap_entries;
9883 + int new_bios_entry;
9884 + int old_nr, new_nr, chg_nr;
9885 + int i;
9886 +
9887 + /*
9888 + Visually we're performing the following (1,2,3,4 = memory types)...
9889 +
9890 + Sample memory map (w/overlaps):
9891 + ____22__________________
9892 + ______________________4_
9893 + ____1111________________
9894 + _44_____________________
9895 + 11111111________________
9896 + ____________________33__
9897 + ___________44___________
9898 + __________33333_________
9899 + ______________22________
9900 + ___________________2222_
9901 + _________111111111______
9902 + _____________________11_
9903 + _________________4______
9904 +
9905 + Sanitized equivalent (no overlap):
9906 + 1_______________________
9907 + _44_____________________
9908 + ___1____________________
9909 + ____22__________________
9910 + ______11________________
9911 + _________1______________
9912 + __________3_____________
9913 + ___________44___________
9914 + _____________33_________
9915 + _______________2________
9916 + ________________1_______
9917 + _________________4______
9918 + ___________________2____
9919 + ____________________33__
9920 + ______________________4_
9921 + */
9922 +
9923 + /* if there's only one memory region, don't bother */
9924 + if (*pnr_map < 2)
9925 + return -1;
9926 +
9927 + old_nr = *pnr_map;
9928 +
9929 + /* bail out if we find any unreasonable addresses in bios map */
9930 + for (i=0; i<old_nr; i++)
9931 + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
9932 + return -1;
9933 +
9934 + /* create pointers for initial change-point information (for sorting) */
9935 + for (i=0; i < 2*old_nr; i++)
9936 + change_point[i] = &change_point_list[i];
9937 +
9938 + /* record all known change-points (starting and ending addresses),
9939 + omitting those that are for empty memory regions */
9940 + chgidx = 0;
9941 + for (i=0; i < old_nr; i++) {
9942 + if (biosmap[i].size != 0) {
9943 + change_point[chgidx]->addr = biosmap[i].addr;
9944 + change_point[chgidx++]->pbios = &biosmap[i];
9945 + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
9946 + change_point[chgidx++]->pbios = &biosmap[i];
9947 + }
9948 + }
9949 + chg_nr = chgidx; /* true number of change-points */
9950 +
9951 + /* sort change-point list by memory addresses (low -> high) */
9952 + still_changing = 1;
9953 + while (still_changing) {
9954 + still_changing = 0;
9955 + for (i=1; i < chg_nr; i++) {
9956 + /* if <current_addr> > <last_addr>, swap */
9957 + /* or, if current=<start_addr> & last=<end_addr>, swap */
9958 + if ((change_point[i]->addr < change_point[i-1]->addr) ||
9959 + ((change_point[i]->addr == change_point[i-1]->addr) &&
9960 + (change_point[i]->addr == change_point[i]->pbios->addr) &&
9961 + (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
9962 + )
9963 + {
9964 + change_tmp = change_point[i];
9965 + change_point[i] = change_point[i-1];
9966 + change_point[i-1] = change_tmp;
9967 + still_changing=1;
9968 + }
9969 + }
9970 + }
9971 +
9972 + /* create a new bios memory map, removing overlaps */
9973 + overlap_entries=0; /* number of entries in the overlap table */
9974 + new_bios_entry=0; /* index for creating new bios map entries */
9975 + last_type = 0; /* start with undefined memory type */
9976 + last_addr = 0; /* start with 0 as last starting address */
9977 + /* loop through change-points, determining effect on the new bios map */
9978 + for (chgidx=0; chgidx < chg_nr; chgidx++)
9979 + {
9980 + /* keep track of all overlapping bios entries */
9981 + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
9982 + {
9983 + /* add map entry to overlap list (> 1 entry implies an overlap) */
9984 + overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
9985 + }
9986 + else
9987 + {
9988 + /* remove entry from list (order independent, so swap with last) */
9989 + for (i=0; i<overlap_entries; i++)
9990 + {
9991 + if (overlap_list[i] == change_point[chgidx]->pbios)
9992 + overlap_list[i] = overlap_list[overlap_entries-1];
9993 + }
9994 + overlap_entries--;
9995 + }
9996 + /* if there are overlapping entries, decide which "type" to use */
9997 + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
9998 + current_type = 0;
9999 + for (i=0; i<overlap_entries; i++)
10000 + if (overlap_list[i]->type > current_type)
10001 + current_type = overlap_list[i]->type;
10002 + /* continue building up new bios map based on this information */
10003 + if (current_type != last_type) {
10004 + if (last_type != 0) {
10005 + new_bios[new_bios_entry].size =
10006 + change_point[chgidx]->addr - last_addr;
10007 + /* move forward only if the new size was non-zero */
10008 + if (new_bios[new_bios_entry].size != 0)
10009 + if (++new_bios_entry >= E820MAX)
10010 + break; /* no more space left for new bios entries */
10011 + }
10012 + if (current_type != 0) {
10013 + new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
10014 + new_bios[new_bios_entry].type = current_type;
10015 + last_addr=change_point[chgidx]->addr;
10016 + }
10017 + last_type = current_type;
10018 + }
10019 + }
10020 + new_nr = new_bios_entry; /* retain count for new bios entries */
10021 +
10022 + /* copy new bios mapping into original location */
10023 + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
10024 + *pnr_map = new_nr;
10025 +
10026 + return 0;
10027 +}
10028 +
10029 +/*
10030 + * Copy the BIOS e820 map into a safe place.
10031 + *
10032 + * Sanity-check it while we're at it..
10033 + *
10034 + * If we're lucky and live on a modern system, the setup code
10035 + * will have given us a memory map that we can use to properly
10036 + * set up memory. If we aren't, we'll fake a memory map.
10037 + *
10038 + * We check to see that the memory map contains at least 2 elements
10039 + * before we'll use it, because the detection code in setup.S may
10040 + * not be perfect and most every PC known to man has two memory
10041 + * regions: one from 0 to 640k, and one from 1mb up. (The IBM
10042 + * thinkpad 560x, for example, does not cooperate with the memory
10043 + * detection code.)
10044 + */
10045 +int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
10046 +{
10047 +#ifndef CONFIG_XEN
10048 + /* Only one memory region (or negative)? Ignore it */
10049 + if (nr_map < 2)
10050 + return -1;
10051 +#else
10052 + BUG_ON(nr_map < 1);
10053 +#endif
10054 +
10055 + do {
10056 + unsigned long long start = biosmap->addr;
10057 + unsigned long long size = biosmap->size;
10058 + unsigned long long end = start + size;
10059 + unsigned long type = biosmap->type;
10060 +
10061 + /* Overflow in 64 bits? Ignore the memory map. */
10062 + if (start > end)
10063 + return -1;
10064 +
10065 +#ifndef CONFIG_XEN
10066 + /*
10067 + * Some BIOSes claim RAM in the 640k - 1M region.
10068 + * Not right. Fix it up.
10069 + */
10070 + if (type == E820_RAM) {
10071 + if (start < 0x100000ULL && end > 0xA0000ULL) {
10072 + if (start < 0xA0000ULL)
10073 + add_memory_region(start, 0xA0000ULL-start, type);
10074 + if (end <= 0x100000ULL)
10075 + continue;
10076 + start = 0x100000ULL;
10077 + size = end - start;
10078 + }
10079 + }
10080 +#endif
10081 + add_memory_region(start, size, type);
10082 + } while (biosmap++,--nr_map);
10083 +
10084 +#ifdef CONFIG_XEN
10085 + if (is_initial_xendomain()) {
10086 + struct xen_memory_map memmap;
10087 +
10088 + memmap.nr_entries = E820MAX;
10089 + set_xen_guest_handle(memmap.buffer, machine_e820.map);
10090 +
10091 + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
10092 + BUG();
10093 + machine_e820.nr_map = memmap.nr_entries;
10094 + } else
10095 + machine_e820 = e820;
10096 +#endif
10097 +
10098 + return 0;
10099 +}
10100 +
10101 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
10102 +struct edd edd;
10103 +#ifdef CONFIG_EDD_MODULE
10104 +EXPORT_SYMBOL(edd);
10105 +#endif
10106 +#ifndef CONFIG_XEN
10107 +/**
10108 + * copy_edd() - Copy the BIOS EDD information
10109 + * from boot_params into a safe place.
10110 + *
10111 + */
10112 +static inline void copy_edd(void)
10113 +{
10114 + memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
10115 + memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
10116 + edd.mbr_signature_nr = EDD_MBR_SIG_NR;
10117 + edd.edd_info_nr = EDD_NR;
10118 +}
10119 +#endif
10120 +#else
10121 +static inline void copy_edd(void)
10122 +{
10123 +}
10124 +#endif
10125 +
10126 +static void __init parse_cmdline_early (char ** cmdline_p)
10127 +{
10128 + char c = ' ', *to = command_line, *from = saved_command_line;
10129 + int len = 0, max_cmdline;
10130 + int userdef = 0;
10131 +
10132 + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
10133 + max_cmdline = COMMAND_LINE_SIZE;
10134 + memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
10135 + /* Save unparsed command line copy for /proc/cmdline */
10136 + saved_command_line[max_cmdline-1] = '\0';
10137 +
10138 + for (;;) {
10139 + if (c != ' ')
10140 + goto next_char;
10141 + /*
10142 + * "mem=nopentium" disables the 4MB page tables.
10143 + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
10144 + * to <mem>, overriding the bios size.
10145 + * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
10146 + * <start> to <start>+<mem>, overriding the bios size.
10147 + *
10148 + * HPA tells me bootloaders need to parse mem=, so no new
10149 + * option should be mem= [also see Documentation/i386/boot.txt]
10150 + */
10151 + if (!memcmp(from, "mem=", 4)) {
10152 + if (to != command_line)
10153 + to--;
10154 + if (!memcmp(from+4, "nopentium", 9)) {
10155 + from += 9+4;
10156 + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
10157 + disable_pse = 1;
10158 + } else {
10159 + /* If the user specifies memory size, we
10160 + * limit the BIOS-provided memory map to
10161 + * that size. exactmap can be used to specify
10162 + * the exact map. mem=number can be used to
10163 + * trim the existing memory map.
10164 + */
10165 + unsigned long long mem_size;
10166 +
10167 + mem_size = memparse(from+4, &from);
10168 + limit_regions(mem_size);
10169 + userdef=1;
10170 + }
10171 + }
10172 +
10173 + else if (!memcmp(from, "memmap=", 7)) {
10174 + if (to != command_line)
10175 + to--;
10176 + if (!memcmp(from+7, "exactmap", 8)) {
10177 +#ifdef CONFIG_CRASH_DUMP
10178 + /* If we are doing a crash dump, we
10179 + * still need to know the real mem
10180 + * size before original memory map is
10181 + * reset.
10182 + */
10183 + find_max_pfn();
10184 + saved_max_pfn = max_pfn;
10185 +#endif
10186 + from += 8+7;
10187 + e820.nr_map = 0;
10188 + userdef = 1;
10189 + } else {
10190 + /* If the user specifies memory size, we
10191 + * limit the BIOS-provided memory map to
10192 + * that size. exactmap can be used to specify
10193 + * the exact map. mem=number can be used to
10194 + * trim the existing memory map.
10195 + */
10196 + unsigned long long start_at, mem_size;
10197 +
10198 + mem_size = memparse(from+7, &from);
10199 + if (*from == '@') {
10200 + start_at = memparse(from+1, &from);
10201 + add_memory_region(start_at, mem_size, E820_RAM);
10202 + } else if (*from == '#') {
10203 + start_at = memparse(from+1, &from);
10204 + add_memory_region(start_at, mem_size, E820_ACPI);
10205 + } else if (*from == '$') {
10206 + start_at = memparse(from+1, &from);
10207 + add_memory_region(start_at, mem_size, E820_RESERVED);
10208 + } else {
10209 + limit_regions(mem_size);
10210 + userdef=1;
10211 + }
10212 + }
10213 + }
10214 +
10215 + else if (!memcmp(from, "noexec=", 7))
10216 + noexec_setup(from + 7);
10217 +
10218 +
10219 +#ifdef CONFIG_X86_MPPARSE
10220 + /*
10221 + * If the BIOS enumerates physical processors before logical,
10222 + * maxcpus=N at enumeration-time can be used to disable HT.
10223 + */
10224 + else if (!memcmp(from, "maxcpus=", 8)) {
10225 + extern unsigned int maxcpus;
10226 +
10227 + maxcpus = simple_strtoul(from + 8, NULL, 0);
10228 + }
10229 +#endif
10230 +
10231 +#ifdef CONFIG_ACPI
10232 + /* "acpi=off" disables both ACPI table parsing and interpreter */
10233 + else if (!memcmp(from, "acpi=off", 8)) {
10234 + disable_acpi();
10235 + }
10236 +
10237 + /* acpi=force to over-ride black-list */
10238 + else if (!memcmp(from, "acpi=force", 10)) {
10239 + acpi_force = 1;
10240 + acpi_ht = 1;
10241 + acpi_disabled = 0;
10242 + }
10243 +
10244 + /* acpi=strict disables out-of-spec workarounds */
10245 + else if (!memcmp(from, "acpi=strict", 11)) {
10246 + acpi_strict = 1;
10247 + }
10248 +
10249 + /* Limit ACPI just to boot-time to enable HT */
10250 + else if (!memcmp(from, "acpi=ht", 7)) {
10251 + if (!acpi_force)
10252 + disable_acpi();
10253 + acpi_ht = 1;
10254 + }
10255 +
10256 + /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
10257 + else if (!memcmp(from, "pci=noacpi", 10)) {
10258 + acpi_disable_pci();
10259 + }
10260 + /* "acpi=noirq" disables ACPI interrupt routing */
10261 + else if (!memcmp(from, "acpi=noirq", 10)) {
10262 + acpi_noirq_set();
10263 + }
10264 +
10265 + else if (!memcmp(from, "acpi_sci=edge", 13))
10266 + acpi_sci_flags.trigger = 1;
10267 +
10268 + else if (!memcmp(from, "acpi_sci=level", 14))
10269 + acpi_sci_flags.trigger = 3;
10270 +
10271 + else if (!memcmp(from, "acpi_sci=high", 13))
10272 + acpi_sci_flags.polarity = 1;
10273 +
10274 + else if (!memcmp(from, "acpi_sci=low", 12))
10275 + acpi_sci_flags.polarity = 3;
10276 +
10277 +#ifdef CONFIG_X86_IO_APIC
10278 + else if (!memcmp(from, "acpi_skip_timer_override", 24))
10279 + acpi_skip_timer_override = 1;
10280 +
10281 + if (!memcmp(from, "disable_timer_pin_1", 19))
10282 + disable_timer_pin_1 = 1;
10283 + if (!memcmp(from, "enable_timer_pin_1", 18))
10284 + disable_timer_pin_1 = -1;
10285 +
10286 + /* disable IO-APIC */
10287 + else if (!memcmp(from, "noapic", 6))
10288 + disable_ioapic_setup();
10289 +#endif /* CONFIG_X86_IO_APIC */
10290 +#endif /* CONFIG_ACPI */
10291 +
10292 +#ifdef CONFIG_X86_LOCAL_APIC
10293 + /* enable local APIC */
10294 + else if (!memcmp(from, "lapic", 5))
10295 + lapic_enable();
10296 +
10297 + /* disable local APIC */
10298 + else if (!memcmp(from, "nolapic", 6))
10299 + lapic_disable();
10300 +#endif /* CONFIG_X86_LOCAL_APIC */
10301 +
10302 +#ifdef CONFIG_KEXEC
10303 + /* crashkernel=size@addr specifies the location to reserve for
10304 + * a crash kernel. By reserving this memory we guarantee
10305 + * that linux never sets it up as a DMA target.
10306 + * Useful for holding code to do something appropriate
10307 + * after a kernel panic.
10308 + */
10309 + else if (!memcmp(from, "crashkernel=", 12)) {
10310 +#ifndef CONFIG_XEN
10311 + unsigned long size, base;
10312 + size = memparse(from+12, &from);
10313 + if (*from == '@') {
10314 + base = memparse(from+1, &from);
10315 + /* FIXME: Do I want a sanity check
10316 + * to validate the memory range?
10317 + */
10318 + crashk_res.start = base;
10319 + crashk_res.end = base + size - 1;
10320 + }
10321 +#else
10322 + printk("Ignoring crashkernel command line, "
10323 + "parameter will be supplied by xen\n");
10324 +#endif
10325 + }
10326 +#endif
10327 +#ifdef CONFIG_PROC_VMCORE
10328 + /* elfcorehdr= specifies the location of elf core header
10329 + * stored by the crashed kernel.
10330 + */
10331 + else if (!memcmp(from, "elfcorehdr=", 11))
10332 + elfcorehdr_addr = memparse(from+11, &from);
10333 +#endif
10334 +
10335 + /*
10336 + * highmem=size forces highmem to be exactly 'size' bytes.
10337 + * This works even on boxes that have no highmem otherwise.
10338 + * This also works to reduce highmem size on bigger boxes.
10339 + */
10340 + else if (!memcmp(from, "highmem=", 8))
10341 + highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
10342 +
10343 + /*
10344 + * vmalloc=size forces the vmalloc area to be exactly 'size'
10345 + * bytes. This can be used to increase (or decrease) the
10346 + * vmalloc area - the default is 128m.
10347 + */
10348 + else if (!memcmp(from, "vmalloc=", 8))
10349 + __VMALLOC_RESERVE = memparse(from+8, &from);
10350 +
10351 + next_char:
10352 + c = *(from++);
10353 + if (!c)
10354 + break;
10355 + if (COMMAND_LINE_SIZE <= ++len)
10356 + break;
10357 + *(to++) = c;
10358 + }
10359 + *to = '\0';
10360 + *cmdline_p = command_line;
10361 + if (userdef) {
10362 + printk(KERN_INFO "user-defined physical RAM map:\n");
10363 + print_memory_map("user");
10364 + }
10365 +}
10366 +
10367 +/*
10368 + * Callback for efi_memory_walk.
10369 + */
10370 +static int __init
10371 +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
10372 +{
10373 + unsigned long *max_pfn = arg, pfn;
10374 +
10375 + if (start < end) {
10376 + pfn = PFN_UP(end -1);
10377 + if (pfn > *max_pfn)
10378 + *max_pfn = pfn;
10379 + }
10380 + return 0;
10381 +}
10382 +
10383 +static int __init
10384 +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
10385 +{
10386 + memory_present(0, start, end);
10387 + return 0;
10388 +}
10389 +
10390 +/*
10391 + * This function checks if any part of the range <start,end> is mapped
10392 + * with type.
10393 + */
10394 +int
10395 +e820_any_mapped(u64 start, u64 end, unsigned type)
10396 +{
10397 + int i;
10398 +
10399 +#ifndef CONFIG_XEN
10400 + for (i = 0; i < e820.nr_map; i++) {
10401 + const struct e820entry *ei = &e820.map[i];
10402 +#else
10403 + if (!is_initial_xendomain())
10404 + return 0;
10405 + for (i = 0; i < machine_e820.nr_map; ++i) {
10406 + const struct e820entry *ei = &machine_e820.map[i];
10407 +#endif
10408 +
10409 + if (type && ei->type != type)
10410 + continue;
10411 + if (ei->addr >= end || ei->addr + ei->size <= start)
10412 + continue;
10413 + return 1;
10414 + }
10415 + return 0;
10416 +}
10417 +EXPORT_SYMBOL_GPL(e820_any_mapped);
10418 +
10419 + /*
10420 + * This function checks if the entire range <start,end> is mapped with type.
10421 + *
10422 + * Note: this function only works correctly if the e820 table is sorted and
10423 + * non-overlapping, which is the case
10424 + */
10425 +int __init
10426 +e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
10427 +{
10428 + u64 start = s;
10429 + u64 end = e;
10430 + int i;
10431 +
10432 +#ifndef CONFIG_XEN
10433 + for (i = 0; i < e820.nr_map; i++) {
10434 + struct e820entry *ei = &e820.map[i];
10435 +#else
10436 + if (!is_initial_xendomain())
10437 + return 0;
10438 + for (i = 0; i < machine_e820.nr_map; ++i) {
10439 + const struct e820entry *ei = &machine_e820.map[i];
10440 +#endif
10441 + if (type && ei->type != type)
10442 + continue;
10443 + /* does the region (or part of it) overlap the current region? */
10444 + if (ei->addr >= end || ei->addr + ei->size <= start)
10445 + continue;
10446 + /* if the region is at the beginning of <start,end> we move
10447 + * start to the end of the region, since that part is already covered
10448 + */
10449 + if (ei->addr <= start)
10450 + start = ei->addr + ei->size;
10451 + /* if start is now at or beyond end, we're done, full
10452 + * coverage */
10453 + if (start >= end)
10454 + return 1; /* we're done */
10455 + }
10456 + return 0;
10457 +}
10458 +
10459 +/*
10460 + * Find the highest page frame number we have available
10461 + */
10462 +void __init find_max_pfn(void)
10463 +{
10464 + int i;
10465 +
10466 + max_pfn = 0;
10467 + if (efi_enabled) {
10468 + efi_memmap_walk(efi_find_max_pfn, &max_pfn);
10469 + efi_memmap_walk(efi_memory_present_wrapper, NULL);
10470 + return;
10471 + }
10472 +
10473 + for (i = 0; i < e820.nr_map; i++) {
10474 + unsigned long start, end;
10475 + /* RAM? */
10476 + if (e820.map[i].type != E820_RAM)
10477 + continue;
10478 + start = PFN_UP(e820.map[i].addr);
10479 + end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
10480 + if (start >= end)
10481 + continue;
10482 + if (end > max_pfn)
10483 + max_pfn = end;
10484 + memory_present(0, start, end);
10485 + }
10486 +}
10487 +
10488 +/*
10489 + * Determine low and high memory ranges:
10490 + */
10491 +unsigned long __init find_max_low_pfn(void)
10492 +{
10493 + unsigned long max_low_pfn;
10494 +
10495 + max_low_pfn = max_pfn;
10496 + if (max_low_pfn > MAXMEM_PFN) {
10497 + if (highmem_pages == -1)
10498 + highmem_pages = max_pfn - MAXMEM_PFN;
10499 + if (highmem_pages + MAXMEM_PFN < max_pfn)
10500 + max_pfn = MAXMEM_PFN + highmem_pages;
10501 + if (highmem_pages + MAXMEM_PFN > max_pfn) {
10502 + printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
10503 + highmem_pages = 0;
10504 + }
10505 + max_low_pfn = MAXMEM_PFN;
10506 +#ifndef CONFIG_HIGHMEM
10507 + /* Maximum memory usable is what is directly addressable */
10508 + printk(KERN_WARNING "Warning only %ldMB will be used.\n",
10509 + MAXMEM>>20);
10510 + if (max_pfn > MAX_NONPAE_PFN)
10511 + printk(KERN_WARNING "Use a PAE enabled kernel.\n");
10512 + else
10513 + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
10514 + max_pfn = MAXMEM_PFN;
10515 +#else /* !CONFIG_HIGHMEM */
10516 +#ifndef CONFIG_X86_PAE
10517 + if (max_pfn > MAX_NONPAE_PFN) {
10518 + max_pfn = MAX_NONPAE_PFN;
10519 + printk(KERN_WARNING "Warning only 4GB will be used.\n");
10520 + printk(KERN_WARNING "Use a PAE enabled kernel.\n");
10521 + }
10522 +#endif /* !CONFIG_X86_PAE */
10523 +#endif /* !CONFIG_HIGHMEM */
10524 + } else {
10525 + if (highmem_pages == -1)
10526 + highmem_pages = 0;
10527 +#ifdef CONFIG_HIGHMEM
10528 + if (highmem_pages >= max_pfn) {
10529 + printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
10530 + highmem_pages = 0;
10531 + }
10532 + if (highmem_pages) {
10533 + if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
10534 + printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
10535 + highmem_pages = 0;
10536 + }
10537 + max_low_pfn -= highmem_pages;
10538 + }
10539 +#else
10540 + if (highmem_pages)
10541 + printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
10542 +#endif
10543 + }
10544 + return max_low_pfn;
10545 +}
10546 +
10547 +/*
10548 + * Free all available memory for boot time allocation. Used
10549 + * as a callback function by efi_memory_walk()
10550 + */
10551 +
10552 +static int __init
10553 +free_available_memory(unsigned long start, unsigned long end, void *arg)
10554 +{
10555 + /* check max_low_pfn */
10556 + if (start >= (max_low_pfn << PAGE_SHIFT))
10557 + return 0;
10558 + if (end >= (max_low_pfn << PAGE_SHIFT))
10559 + end = max_low_pfn << PAGE_SHIFT;
10560 + if (start < end)
10561 + free_bootmem(start, end - start);
10562 +
10563 + return 0;
10564 +}
10565 +/*
10566 + * Register fully available low RAM pages with the bootmem allocator.
10567 + */
10568 +static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
10569 +{
10570 + int i;
10571 +
10572 + if (efi_enabled) {
10573 + efi_memmap_walk(free_available_memory, NULL);
10574 + return;
10575 + }
10576 + for (i = 0; i < e820.nr_map; i++) {
10577 + unsigned long curr_pfn, last_pfn, size;
10578 + /*
10579 + * Reserve usable low memory
10580 + */
10581 + if (e820.map[i].type != E820_RAM)
10582 + continue;
10583 + /*
10584 + * We are rounding up the start address of usable memory:
10585 + */
10586 + curr_pfn = PFN_UP(e820.map[i].addr);
10587 + if (curr_pfn >= max_low_pfn)
10588 + continue;
10589 + /*
10590 + * ... and at the end of the usable range downwards:
10591 + */
10592 + last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
10593 +
10594 +#ifdef CONFIG_XEN
10595 + /*
10596 + * Truncate to the number of actual pages currently
10597 + * present.
10598 + */
10599 + if (last_pfn > xen_start_info->nr_pages)
10600 + last_pfn = xen_start_info->nr_pages;
10601 +#endif
10602 +
10603 + if (last_pfn > max_low_pfn)
10604 + last_pfn = max_low_pfn;
10605 +
10606 + /*
10607 + * .. finally, did all the rounding and playing
10608 + * around just make the area go away?
10609 + */
10610 + if (last_pfn <= curr_pfn)
10611 + continue;
10612 +
10613 + size = last_pfn - curr_pfn;
10614 + free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
10615 + }
10616 +}
10617 +
10618 +#ifndef CONFIG_XEN
10619 +/*
10620 + * workaround for Dell systems that neglect to reserve EBDA
10621 + */
10622 +static void __init reserve_ebda_region(void)
10623 +{
10624 + unsigned int addr;
10625 + addr = get_bios_ebda();
10626 + if (addr)
10627 + reserve_bootmem(addr, PAGE_SIZE);
10628 +}
10629 +#endif
10630 +
10631 +#ifndef CONFIG_NEED_MULTIPLE_NODES
10632 +void __init setup_bootmem_allocator(void);
10633 +static unsigned long __init setup_memory(void)
10634 +{
10635 + /*
10636 + * partially used pages are not usable - thus
10637 + * we are rounding upwards:
10638 + */
10639 + min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
10640 + xen_start_info->nr_pt_frames;
10641 +
10642 + find_max_pfn();
10643 +
10644 + max_low_pfn = find_max_low_pfn();
10645 +
10646 +#ifdef CONFIG_HIGHMEM
10647 + highstart_pfn = highend_pfn = max_pfn;
10648 + if (max_pfn > max_low_pfn) {
10649 + highstart_pfn = max_low_pfn;
10650 + }
10651 + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
10652 + pages_to_mb(highend_pfn - highstart_pfn));
10653 +#endif
10654 + printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
10655 + pages_to_mb(max_low_pfn));
10656 +
10657 + setup_bootmem_allocator();
10658 +
10659 + return max_low_pfn;
10660 +}
10661 +
10662 +void __init zone_sizes_init(void)
10663 +{
10664 + unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
10665 + unsigned int max_dma, low;
10666 +
10667 + max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
10668 + low = max_low_pfn;
10669 +
10670 + if (low < max_dma)
10671 + zones_size[ZONE_DMA] = low;
10672 + else {
10673 + zones_size[ZONE_DMA] = max_dma;
10674 + zones_size[ZONE_NORMAL] = low - max_dma;
10675 +#ifdef CONFIG_HIGHMEM
10676 + zones_size[ZONE_HIGHMEM] = highend_pfn - low;
10677 +#endif
10678 + }
10679 + free_area_init(zones_size);
10680 +}
10681 +#else
10682 +extern unsigned long __init setup_memory(void);
10683 +extern void zone_sizes_init(void);
10684 +#endif /* !CONFIG_NEED_MULTIPLE_NODES */
10685 +
10686 +void __init setup_bootmem_allocator(void)
10687 +{
10688 + unsigned long bootmap_size;
10689 + /*
10690 + * Initialize the boot-time allocator (with low memory only):
10691 + */
10692 + bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
10693 +
10694 + register_bootmem_low_pages(max_low_pfn);
10695 +
10696 + /*
10697 + * Reserve the bootmem bitmap itself as well. We do this in two
10698 + * steps (first step was init_bootmem()) because this catches
10699 + * the (very unlikely) case of us accidentally initializing the
10700 + * bootmem allocator with an invalid RAM area.
10701 + */
10702 + reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
10703 + bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START));
10704 +
10705 +#ifndef CONFIG_XEN
10706 + /*
10707 + * reserve physical page 0 - it's a special BIOS page on many boxes,
10708 + * enabling clean reboots, SMP operation, laptop functions.
10709 + */
10710 + reserve_bootmem(0, PAGE_SIZE);
10711 +
10712 + /* reserve EBDA region, it's a 4K region */
10713 + reserve_ebda_region();
10714 +
10715 + /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
10716 + PCI prefetch into it (errata #56). Usually the page is reserved anyway,
10717 + unless you have no PS/2 mouse plugged in. */
10718 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
10719 + boot_cpu_data.x86 == 6)
10720 + reserve_bootmem(0xa0000 - 4096, 4096);
10721 +
10722 +#ifdef CONFIG_SMP
10723 + /*
10724 + * But first pinch a few for the stack/trampoline stuff
10725 + * FIXME: Don't need the extra page at 4K, but need to fix
10726 + * trampoline before removing it. (see the GDT stuff)
10727 + */
10728 + reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
10729 +#endif
10730 +#ifdef CONFIG_ACPI_SLEEP
10731 + /*
10732 + * Reserve low memory region for sleep support.
10733 + */
10734 + acpi_reserve_bootmem();
10735 +#endif
10736 +#endif /* !CONFIG_XEN */
10737 +
10738 +#ifdef CONFIG_BLK_DEV_INITRD
10739 + if (xen_start_info->mod_start) {
10740 + if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
10741 + /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/
10742 + initrd_start = INITRD_START + PAGE_OFFSET;
10743 + initrd_end = initrd_start+INITRD_SIZE;
10744 + initrd_below_start_ok = 1;
10745 + }
10746 + else {
10747 + printk(KERN_ERR "initrd extends beyond end of memory "
10748 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
10749 + INITRD_START + INITRD_SIZE,
10750 + max_low_pfn << PAGE_SHIFT);
10751 + initrd_start = 0;
10752 + }
10753 + }
10754 +#endif
10755 +#ifdef CONFIG_KEXEC
10756 +#ifdef CONFIG_XEN
10757 + xen_machine_kexec_setup_resources();
10758 +#else
10759 + if (crashk_res.start != crashk_res.end)
10760 + reserve_bootmem(crashk_res.start,
10761 + crashk_res.end - crashk_res.start + 1);
10762 +#endif
10763 +#endif
10764 +}
10765 +
10766 +/*
10767 + * The node 0 pgdat is initialized before all of these because
10768 + * it's needed for bootmem. node>0 pgdats have their virtual
10769 + * space allocated before the pagetables are in place to access
10770 + * them, so they can't be cleared then.
10771 + *
10772 + * This should all compile down to nothing when NUMA is off.
10773 + */
10774 +void __init remapped_pgdat_init(void)
10775 +{
10776 + int nid;
10777 +
10778 + for_each_online_node(nid) {
10779 + if (nid != 0)
10780 + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
10781 + }
10782 +}
10783 +
10784 +/*
10785 + * Request address space for all standard RAM and ROM resources
10786 + * and also for regions reported as reserved by the e820.
10787 + */
10788 +static void __init
10789 +legacy_init_iomem_resources(struct e820entry *e820, int nr_map,
10790 + struct resource *code_resource,
10791 + struct resource *data_resource)
10792 +{
10793 + int i;
10794 +
10795 + probe_roms();
10796 +
10797 + for (i = 0; i < nr_map; i++) {
10798 + struct resource *res;
10799 +#ifndef CONFIG_RESOURCES_64BIT
10800 + if (e820[i].addr + e820[i].size > 0x100000000ULL)
10801 + continue;
10802 +#endif
10803 + res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
10804 + switch (e820[i].type) {
10805 + case E820_RAM: res->name = "System RAM"; break;
10806 + case E820_ACPI: res->name = "ACPI Tables"; break;
10807 + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
10808 + default: res->name = "reserved";
10809 + }
10810 + res->start = e820[i].addr;
10811 + res->end = res->start + e820[i].size - 1;
10812 + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
10813 + if (request_resource(&iomem_resource, res)) {
10814 + kfree(res);
10815 + continue;
10816 + }
10817 + if (e820[i].type == E820_RAM) {
10818 + /*
10819 + * We don't know which RAM region contains kernel data,
10820 + * so we try it repeatedly and let the resource manager
10821 + * test it.
10822 + */
10823 +#ifndef CONFIG_XEN
10824 + request_resource(res, code_resource);
10825 + request_resource(res, data_resource);
10826 +#endif
10827 +#ifdef CONFIG_KEXEC
10828 + if (crashk_res.start != crashk_res.end)
10829 + request_resource(res, &crashk_res);
10830 +#ifdef CONFIG_XEN
10831 + xen_machine_kexec_register_resources(res);
10832 +#endif
10833 +#endif
10834 + }
10835 + }
10836 +}
10837 +
10838 +/*
10839 + * Locate an unused range of the physical address space below 4G which
10840 + * can be used for PCI mappings.
10841 + */
10842 +static void __init
10843 +e820_setup_gap(struct e820entry *e820, int nr_map)
10844 +{
10845 + unsigned long gapstart, gapsize, round;
10846 + unsigned long long last;
10847 + int i;
10848 +
10849 + /*
10850 + * Search for the biggest gap in the low 32 bits of the e820
10851 + * memory space.
10852 + */
10853 + last = 0x100000000ull;
10854 + gapstart = 0x10000000;
10855 + gapsize = 0x400000;
10856 + i = nr_map;
10857 + while (--i >= 0) {
10858 + unsigned long long start = e820[i].addr;
10859 + unsigned long long end = start + e820[i].size;
10860 +
10861 + /*
10862 + * Since "last" is at most 4GB, we know we'll
10863 + * fit in 32 bits if this condition is true
10864 + */
10865 + if (last > end) {
10866 + unsigned long gap = last - end;
10867 +
10868 + if (gap > gapsize) {
10869 + gapsize = gap;
10870 + gapstart = end;
10871 + }
10872 + }
10873 + if (start < last)
10874 + last = start;
10875 + }
10876 +
10877 + /*
10878 + * See how much we want to round up: start off with
10879 + * rounding to the next 1MB area.
10880 + */
10881 + round = 0x100000;
10882 + while ((gapsize >> 4) > round)
10883 + round += round;
10884 + /* Fun with two's complement */
10885 + pci_mem_start = (gapstart + round) & -round;
10886 +
10887 + printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
10888 + pci_mem_start, gapstart, gapsize);
10889 +}
10890 +
10891 +/*
10892 + * Request address space for all standard resources
10893 + *
10894 + * This is called just before pcibios_init(), which is also a
10895 + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
10896 + */
10897 +static int __init request_standard_resources(void)
10898 +{
10899 + int i;
10900 +
10901 + /* Nothing to do if not running in dom0. */
10902 + if (!is_initial_xendomain())
10903 + return 0;
10904 +
10905 + printk("Setting up standard PCI resources\n");
10906 +#ifdef CONFIG_XEN
10907 + legacy_init_iomem_resources(machine_e820.map, machine_e820.nr_map,
10908 + &code_resource, &data_resource);
10909 +#else
10910 + if (efi_enabled)
10911 + efi_initialize_iomem_resources(&code_resource, &data_resource);
10912 + else
10913 + legacy_init_iomem_resources(e820.map, e820.nr_map,
10914 + &code_resource, &data_resource);
10915 +#endif
10916 +
10917 + /* EFI systems may still have VGA */
10918 + request_resource(&iomem_resource, &video_ram_resource);
10919 +
10920 + /* request I/O space for devices used on all i[345]86 PCs */
10921 + for (i = 0; i < STANDARD_IO_RESOURCES; i++)
10922 + request_resource(&ioport_resource, &standard_io_resources[i]);
10923 + return 0;
10924 +}
10925 +
10926 +subsys_initcall(request_standard_resources);
10927 +
10928 +static void __init register_memory(void)
10929 +{
10930 +#ifdef CONFIG_XEN
10931 + if (is_initial_xendomain())
10932 + e820_setup_gap(machine_e820.map, machine_e820.nr_map);
10933 + else
10934 +#endif
10935 + e820_setup_gap(e820.map, e820.nr_map);
10936 +}
10937 +
10938 +#ifdef CONFIG_MCA
10939 +static void set_mca_bus(int x)
10940 +{
10941 + MCA_bus = x;
10942 +}
10943 +#else
10944 +static void set_mca_bus(int x) { }
10945 +#endif
10946 +
10947 +/*
10948 + * Determine if we were loaded by an EFI loader. If so, then we have also been
10949 + * passed the efi memmap, systab, etc., so we should use these data structures
10950 + * for initialization. Note, the efi init code path is determined by the
10951 + * global efi_enabled. This allows the same kernel image to be used on existing
10952 + * systems (with a traditional BIOS) as well as on EFI systems.
10953 + */
10954 +void __init setup_arch(char **cmdline_p)
10955 +{
10956 + int i, j, k, fpp;
10957 + struct physdev_set_iopl set_iopl;
10958 + unsigned long max_low_pfn;
10959 + unsigned long p2m_pages;
10960 +
10961 + /* Force a quick death if the kernel panics (not domain 0). */
10962 + extern int panic_timeout;
10963 + if (!panic_timeout && !is_initial_xendomain())
10964 + panic_timeout = 1;
10965 +
10966 + /* Register a call for panic conditions. */
10967 + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
10968 +
10969 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
10970 + VMASST_TYPE_4gb_segments));
10971 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
10972 + VMASST_TYPE_writable_pagetables));
10973 +
10974 + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
10975 + pre_setup_arch_hook();
10976 + early_cpu_init();
10977 +#ifdef CONFIG_SMP
10978 + prefill_possible_map();
10979 +#endif
10980 +
10981 + /*
10982 + * FIXME: This isn't an official loader_type right
10983 + * now but does currently work with elilo.
10984 + * If we were configured as an EFI kernel, check to make
10985 + * sure that we were loaded correctly from elilo and that
10986 + * the system table is valid. If not, then initialize normally.
10987 + */
10988 +#ifdef CONFIG_EFI
10989 + if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
10990 + efi_enabled = 1;
10991 +#endif
10992 +
10993 + /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
10994 + properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
10995 + */
10996 + ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
10997 + drive_info = DRIVE_INFO;
10998 + screen_info = SCREEN_INFO;
10999 + copy_edid();
11000 + apm_info.bios = APM_BIOS_INFO;
11001 + ist_info = IST_INFO;
11002 + saved_videomode = VIDEO_MODE;
11003 + if( SYS_DESC_TABLE.length != 0 ) {
11004 + set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
11005 + machine_id = SYS_DESC_TABLE.table[0];
11006 + machine_submodel_id = SYS_DESC_TABLE.table[1];
11007 + BIOS_revision = SYS_DESC_TABLE.table[2];
11008 + }
11009 + bootloader_type = LOADER_TYPE;
11010 +
11011 + if (is_initial_xendomain()) {
11012 + const struct dom0_vga_console_info *info =
11013 + (void *)((char *)xen_start_info +
11014 + xen_start_info->console.dom0.info_off);
11015 +
11016 + dom0_init_screen_info(info,
11017 + xen_start_info->console.dom0.info_size);
11018 + xen_start_info->console.domU.mfn = 0;
11019 + xen_start_info->console.domU.evtchn = 0;
11020 + } else
11021 + screen_info.orig_video_isVGA = 0;
11022 +
11023 +#ifdef CONFIG_BLK_DEV_RAM
11024 + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
11025 + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
11026 + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
11027 +#endif
11028 +
11029 + ARCH_SETUP
11030 + if (efi_enabled)
11031 + efi_init();
11032 + else {
11033 + printk(KERN_INFO "BIOS-provided physical RAM map:\n");
11034 + print_memory_map(machine_specific_memory_setup());
11035 + }
11036 +
11037 + copy_edd();
11038 +
11039 + if (!MOUNT_ROOT_RDONLY)
11040 + root_mountflags &= ~MS_RDONLY;
11041 + init_mm.start_code = (unsigned long) _text;
11042 + init_mm.end_code = (unsigned long) _etext;
11043 + init_mm.end_data = (unsigned long) _edata;
11044 + init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
11045 + xen_start_info->nr_pt_frames) << PAGE_SHIFT;
11046 +
11047 + code_resource.start = virt_to_phys(_text);
11048 + code_resource.end = virt_to_phys(_etext)-1;
11049 + data_resource.start = virt_to_phys(_etext);
11050 + data_resource.end = virt_to_phys(_edata)-1;
11051 +
11052 + parse_cmdline_early(cmdline_p);
11053 +
11054 +#ifdef CONFIG_EARLY_PRINTK
11055 + {
11056 + char *s = strstr(*cmdline_p, "earlyprintk=");
11057 + if (s) {
11058 + setup_early_printk(strchr(s, '=') + 1);
11059 + printk("early console enabled\n");
11060 + }
11061 + }
11062 +#endif
11063 +
11064 + max_low_pfn = setup_memory();
11065 +
11066 + /*
11067 + * NOTE: before this point _nobody_ is allowed to allocate
11068 + * any memory using the bootmem allocator. Although the
11069 + * allocator is now initialised, only the first 8MB of the kernel
11070 + * virtual address space has been mapped. All allocations before
11071 + * paging_init() completes must use the alloc_bootmem_low_pages()
11072 + * variant (which allocates DMA'able memory) and care must be taken
11073 + * not to exceed the 8MB limit.
11074 + */
11075 +
11076 +#ifdef CONFIG_SMP
11077 + smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
11078 +#endif
11079 + paging_init();
11080 + remapped_pgdat_init();
11081 + sparse_init();
11082 + zone_sizes_init();
11083 +
11084 +#ifdef CONFIG_X86_FIND_SMP_CONFIG
11085 + /*
11086 + * Find and reserve possible boot-time SMP configuration:
11087 + */
11088 + find_smp_config();
11089 +#endif
11090 +
11091 + p2m_pages = max_pfn;
11092 + if (xen_start_info->nr_pages > max_pfn) {
11093 + /*
11094 + * the max_pfn was shrunk (probably by mem= or highmem=
11095 + * kernel parameter); shrink reservation with the HV
11096 + */
11097 + struct xen_memory_reservation reservation = {
11098 + .address_bits = 0,
11099 + .extent_order = 0,
11100 + .domid = DOMID_SELF
11101 + };
11102 + unsigned int difference;
11103 + int ret;
11104 +
11105 + difference = xen_start_info->nr_pages - max_pfn;
11106 +
11107 + set_xen_guest_handle(reservation.extent_start,
11108 + ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
11109 + reservation.nr_extents = difference;
11110 + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
11111 + &reservation);
11112 + BUG_ON (ret != difference);
11113 + }
11114 + else if (max_pfn > xen_start_info->nr_pages)
11115 + p2m_pages = xen_start_info->nr_pages;
11116 +
11117 + /* Make sure we have a correctly sized P->M table. */
11118 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
11119 + phys_to_machine_mapping = alloc_bootmem_low_pages(
11120 + max_pfn * sizeof(unsigned long));
11121 + memset(phys_to_machine_mapping, ~0,
11122 + max_pfn * sizeof(unsigned long));
11123 + memcpy(phys_to_machine_mapping,
11124 + (unsigned long *)xen_start_info->mfn_list,
11125 + p2m_pages * sizeof(unsigned long));
11126 + free_bootmem(
11127 + __pa(xen_start_info->mfn_list),
11128 + PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
11129 + sizeof(unsigned long))));
11130 +
11131 + /*
11132 + * Initialise the frame-list-list: the list of frames holding the
11133 + * lists of frames that make up the p2m table. Used by save/restore.
11134 + */
11135 + pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
11136 +
11137 + fpp = PAGE_SIZE/sizeof(unsigned long);
11138 + for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
11139 + if ((j % fpp) == 0) {
11140 + k++;
11141 + BUG_ON(k>=16);
11142 + pfn_to_mfn_frame_list[k] =
11143 + alloc_bootmem_low_pages(PAGE_SIZE);
11144 + pfn_to_mfn_frame_list_list[k] =
11145 + virt_to_mfn(pfn_to_mfn_frame_list[k]);
11146 + j=0;
11147 + }
11148 + pfn_to_mfn_frame_list[k][j] =
11149 + virt_to_mfn(&phys_to_machine_mapping[i]);
11150 + }
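 + /*
 +  * With 4KB pages and 4-byte entries fpp is 1024, so the (at most 16)
 +  * frame-list pages recorded above can describe a p2m table covering
 +  * up to 16 * 1024 * 1024 PFNs, i.e. 64GB of pseudo-physical memory.
 +  */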
11151 + HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
11152 + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
11153 + virt_to_mfn(pfn_to_mfn_frame_list_list);
11154 + }
11155 +
11156 + /* Mark all ISA DMA channels in-use - using them wouldn't work. */
11157 + for (i = 0; i < MAX_DMA_CHANNELS; ++i)
11158 + if (i != 4 && request_dma(i, "xen") != 0)
11159 + BUG();
11160 +
11161 + /*
11162 + * NOTE: at this point the bootmem allocator is fully available.
11163 + */
11164 +
11165 + if (is_initial_xendomain())
11166 + dmi_scan_machine();
11167 +
11168 +#ifdef CONFIG_X86_GENERICARCH
11169 + generic_apic_probe(*cmdline_p);
11170 +#endif
11171 + if (efi_enabled)
11172 + efi_map_memmap();
11173 +
11174 + set_iopl.iopl = 1;
11175 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
11176 +
11177 +#ifdef CONFIG_ACPI
11178 + if (!is_initial_xendomain()) {
11179 + printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
11180 + acpi_disabled = 1;
11181 + acpi_ht = 0;
11182 + }
11183 +
11184 + /*
11185 + * Parse the ACPI tables for possible boot-time SMP configuration.
11186 + */
11187 + acpi_boot_table_init();
11188 +#endif
11189 +
11190 +#ifdef CONFIG_X86_IO_APIC
11191 + check_acpi_pci(); /* Checks more than just ACPI actually */
11192 +#endif
11193 +
11194 +#ifdef CONFIG_ACPI
11195 + acpi_boot_init();
11196 +
11197 +#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
11198 + if (def_to_bigsmp)
11199 + printk(KERN_WARNING "More than 8 CPUs detected and "
11200 + "CONFIG_X86_PC cannot handle it.\nUse "
11201 + "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
11202 +#endif
11203 +#endif
11204 +#ifdef CONFIG_X86_LOCAL_APIC
11205 + if (smp_found_config)
11206 + get_smp_config();
11207 +#endif
11208 +
11209 + register_memory();
11210 +
11211 + if (is_initial_xendomain()) {
11212 +#ifdef CONFIG_VT
11213 +#if defined(CONFIG_VGA_CONSOLE)
11214 + if (!efi_enabled ||
11215 + (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
11216 + conswitchp = &vga_con;
11217 +#elif defined(CONFIG_DUMMY_CONSOLE)
11218 + conswitchp = &dummy_con;
11219 +#endif
11220 +#endif
11221 + } else {
11222 +#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
11223 + conswitchp = &dummy_con;
11224 +#endif
11225 + }
11226 + tsc_init();
11227 +}
11228 +
11229 +static int
11230 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
11231 +{
11232 + HYPERVISOR_shutdown(SHUTDOWN_crash);
11233 + /* we're never actually going to get here... */
11234 + return NOTIFY_DONE;
11235 +}
11236 +
11237 +static __init int add_pcspkr(void)
11238 +{
11239 + struct platform_device *pd;
11240 + int ret;
11241 +
11242 + if (!is_initial_xendomain())
11243 + return 0;
11244 +
11245 + pd = platform_device_alloc("pcspkr", -1);
11246 + if (!pd)
11247 + return -ENOMEM;
11248 +
11249 + ret = platform_device_add(pd);
11250 + if (ret)
11251 + platform_device_put(pd);
11252 +
11253 + return ret;
11254 +}
11255 +device_initcall(add_pcspkr);
11256 +
11257 +/*
11258 + * Local Variables:
11259 + * mode:c
11260 + * c-file-style:"k&r"
11261 + * c-basic-offset:8
11262 + * End:
11263 + */
11264 Index: head-2008-11-25/arch/x86/kernel/smp_32-xen.c
11265 ===================================================================
11266 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
11267 +++ head-2008-11-25/arch/x86/kernel/smp_32-xen.c 2007-12-10 08:47:31.000000000 +0100
11268 @@ -0,0 +1,605 @@
11269 +/*
11270 + * Intel SMP support routines.
11271 + *
11272 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
11273 + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
11274 + *
11275 + * This code is released under the GNU General Public License version 2 or
11276 + * later.
11277 + */
11278 +
11279 +#include <linux/init.h>
11280 +
11281 +#include <linux/mm.h>
11282 +#include <linux/delay.h>
11283 +#include <linux/spinlock.h>
11284 +#include <linux/smp_lock.h>
11285 +#include <linux/kernel_stat.h>
11286 +#include <linux/mc146818rtc.h>
11287 +#include <linux/cache.h>
11288 +#include <linux/interrupt.h>
11289 +#include <linux/cpu.h>
11290 +#include <linux/module.h>
11291 +
11292 +#include <asm/mtrr.h>
11293 +#include <asm/tlbflush.h>
11294 +#if 0
11295 +#include <mach_apic.h>
11296 +#endif
11297 +#include <xen/evtchn.h>
11298 +
11299 +/*
11300 + * Some notes on x86 processor bugs affecting SMP operation:
11301 + *
11302 + * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
11303 + * The Linux implications for SMP are handled as follows:
11304 + *
11305 + * Pentium III / [Xeon]
11306 + * None of the E1AP-E3AP errata are visible to the user.
11307 + *
11308 + * E1AP. see PII A1AP
11309 + * E2AP. see PII A2AP
11310 + * E3AP. see PII A3AP
11311 + *
11312 + * Pentium II / [Xeon]
11313 + * None of the A1AP-A3AP errata are visible to the user.
11314 + *
11315 + * A1AP. see PPro 1AP
11316 + * A2AP. see PPro 2AP
11317 + * A3AP. see PPro 7AP
11318 + *
11319 + * Pentium Pro
11320 + * None of 1AP-9AP errata are visible to the normal user,
11321 + * except occasional delivery of 'spurious interrupt' as trap #15.
11322 + * This is very rare and a non-problem.
11323 + *
11324 + * 1AP. Linux maps APIC as non-cacheable
11325 + * 2AP. worked around in hardware
11326 + * 3AP. fixed in C0 and above steppings microcode update.
11327 + * Linux does not use excessive STARTUP_IPIs.
11328 + * 4AP. worked around in hardware
11329 + * 5AP. symmetric IO mode (normal Linux operation) not affected.
11330 + * 'noapic' mode has vector 0xf filled out properly.
11331 + * 6AP. 'noapic' mode might be affected - fixed in later steppings
11332 + * 7AP. We do not assume writes to the LVT deasserting IRQs
11333 + * 8AP. We do not enable low power mode (deep sleep) during MP bootup
11334 + * 9AP. We do not use mixed mode
11335 + *
11336 + * Pentium
11337 + * There is a marginal case where REP MOVS on 100MHz SMP
11338 + * machines with B stepping processors can fail. XXX should provide
11339 + * an L1cache=Writethrough or L1cache=off option.
11340 + *
11341 + * B stepping CPUs may hang. There are hardware workarounds
11342 + * for this. We warn about it in case your board doesn't have the
11343 + * workarounds. Basically that's so I can tell anyone with a B stepping
11344 + * CPU and SMP problems "tough".
11345 + *
11346 + * Specific items [From Pentium Processor Specification Update]
11347 + *
11348 + * 1AP. Linux doesn't use remote read
11349 + * 2AP. Linux doesn't trust APIC errors
11350 + * 3AP. We work around this
11351 + * 4AP. Linux never generated 3 interrupts of the same priority
11352 + * to cause a lost local interrupt.
11353 + * 5AP. Remote read is never used
11354 + * 6AP. not affected - worked around in hardware
11355 + * 7AP. not affected - worked around in hardware
11356 + * 8AP. worked around in hardware - we get explicit CS errors if not
11357 + * 9AP. only 'noapic' mode affected. Might generate spurious
11358 + * interrupts, we log only the first one and count the
11359 + * rest silently.
11360 + * 10AP. not affected - worked around in hardware
11361 + * 11AP. Linux reads the APIC between writes to avoid this, as per
11362 + * the documentation. Make sure you preserve this as it affects
11363 + * the C stepping chips too.
11364 + * 12AP. not affected - worked around in hardware
11365 + * 13AP. not affected - worked around in hardware
11366 + * 14AP. we always deassert INIT during bootup
11367 + * 15AP. not affected - worked around in hardware
11368 + * 16AP. not affected - worked around in hardware
11369 + * 17AP. not affected - worked around in hardware
11370 + * 18AP. not affected - worked around in hardware
11371 + * 19AP. not affected - worked around in BIOS
11372 + *
11373 + * If this sounds worrying, believe me these bugs are either ___RARE___,
11374 + * or are signal timing bugs worked around in hardware and there's
11375 + * about nothing of note with C stepping upwards.
11376 + */
11377 +
11378 +DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
11379 +
11380 +/*
11381 + * the following functions deal with sending IPIs between CPUs.
11382 + *
11383 + * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
11384 + */
11385 +
11386 +static inline int __prepare_ICR (unsigned int shortcut, int vector)
11387 +{
11388 + unsigned int icr = shortcut | APIC_DEST_LOGICAL;
11389 +
11390 + switch (vector) {
11391 + default:
11392 + icr |= APIC_DM_FIXED | vector;
11393 + break;
11394 + case NMI_VECTOR:
11395 + icr |= APIC_DM_NMI;
11396 + break;
11397 + }
11398 + return icr;
11399 +}
11400 +
11401 +static inline int __prepare_ICR2 (unsigned int mask)
11402 +{
11403 + return SET_APIC_DEST_FIELD(mask);
11404 +}
11405 +
11406 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
11407 +
11408 +static inline void __send_IPI_one(unsigned int cpu, int vector)
11409 +{
11410 + int irq = per_cpu(ipi_to_irq, cpu)[vector];
11411 + BUG_ON(irq < 0);
11412 + notify_remote_via_irq(irq);
11413 +}
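+/*
+ * Under Xen an "IPI" is simply an event-channel notification on the
+ * per-CPU ipi_to_irq binding, so no APIC ICR programming takes place
+ * in this path.
+ */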
11414 +
11415 +void __send_IPI_shortcut(unsigned int shortcut, int vector)
11416 +{
11417 + int cpu;
11418 +
11419 + switch (shortcut) {
11420 + case APIC_DEST_SELF:
11421 + __send_IPI_one(smp_processor_id(), vector);
11422 + break;
11423 + case APIC_DEST_ALLBUT:
11424 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
11425 + if (cpu == smp_processor_id())
11426 + continue;
11427 + if (cpu_isset(cpu, cpu_online_map)) {
11428 + __send_IPI_one(cpu, vector);
11429 + }
11430 + }
11431 + break;
11432 + default:
11433 + printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
11434 + vector);
11435 + break;
11436 + }
11437 +}
11438 +
11439 +void fastcall send_IPI_self(int vector)
11440 +{
11441 + __send_IPI_shortcut(APIC_DEST_SELF, vector);
11442 +}
11443 +
11444 +/*
11445 + * This is only used on smaller machines.
11446 + */
11447 +void send_IPI_mask_bitmask(cpumask_t mask, int vector)
11448 +{
11449 + unsigned long flags;
11450 + unsigned int cpu;
11451 +
11452 + local_irq_save(flags);
11453 + WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
11454 +
11455 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
11456 + if (cpu_isset(cpu, mask)) {
11457 + __send_IPI_one(cpu, vector);
11458 + }
11459 + }
11460 +
11461 + local_irq_restore(flags);
11462 +}
11463 +
11464 +void send_IPI_mask_sequence(cpumask_t mask, int vector)
11465 +{
11466 +
11467 + send_IPI_mask_bitmask(mask, vector);
11468 +}
11469 +
11470 +#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
11471 +
11472 +#if 0 /* XEN */
11473 +/*
11474 + * Smarter SMP flushing macros.
11475 + * c/o Linus Torvalds.
11476 + *
11477 + * These mean you can really definitely utterly forget about
11478 + * writing to user space from interrupts. (It's not allowed anyway).
11479 + *
11480 + * Optimizations Manfred Spraul <manfred@colorfullife.com>
11481 + */
11482 +
11483 +static cpumask_t flush_cpumask;
11484 +static struct mm_struct * flush_mm;
11485 +static unsigned long flush_va;
11486 +static DEFINE_SPINLOCK(tlbstate_lock);
11487 +#define FLUSH_ALL 0xffffffff
11488 +
11489 +/*
11490 + * We cannot call mmdrop() because we are in interrupt context,
11491 + * instead update mm->cpu_vm_mask.
11492 + *
11493 + * We need to reload %cr3 since the page tables may be going
11494 + * away from under us..
11495 + */
11496 +static inline void leave_mm (unsigned long cpu)
11497 +{
11498 + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
11499 + BUG();
11500 + cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
11501 + load_cr3(swapper_pg_dir);
11502 +}
11503 +
11504 +/*
11505 + *
11506 + * The flush IPI assumes that a thread switch happens in this order:
11507 + * [cpu0: the cpu that switches]
11508 + * 1) switch_mm() either 1a) or 1b)
11509 + * 1a) thread switch to a different mm
11510 + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
11511 + * Stop ipi delivery for the old mm. This is not synchronized with
11512 + * the other cpus, but smp_invalidate_interrupt ignores flush ipis
11513 + * for the wrong mm, and in the worst case we perform a superfluous
11514 + * tlb flush.
11515 + * 1a2) set cpu_tlbstate to TLBSTATE_OK
11516 + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
11517 + * was in lazy tlb mode.
11518 + * 1a3) update cpu_tlbstate[].active_mm
11519 + * Now cpu0 accepts tlb flushes for the new mm.
11520 + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
11521 + * Now the other cpus will send tlb flush ipis.
11522 + * 1a4) change cr3.
11523 + * 1b) thread switch without mm change
11524 + * cpu_tlbstate[].active_mm is correct, cpu0 already handles
11525 + * flush ipis.
11526 + * 1b1) set cpu_tlbstate to TLBSTATE_OK
11527 + * 1b2) test_and_set the cpu bit in cpu_vm_mask.
11528 + * Atomically set the bit [other cpus will start sending flush ipis],
11529 + * and test the bit.
11530 + * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
11531 + * 2) switch %%esp, ie current
11532 + *
11533 + * The interrupt must handle 2 special cases:
11534 + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
11535 + * - the cpu performs speculative tlb reads, i.e. even if the cpu only
11536 + * runs in kernel space, the cpu could load tlb entries for user space
11537 + * pages.
11538 + *
11539 + * The good news is that cpu_tlbstate is local to each cpu, no
11540 + * write/read ordering problems.
11541 + */
11542 +
11543 +/*
11544 + * TLB flush IPI:
11545 + *
11546 + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
11547 + * 2) Leave the mm if we are in the lazy tlb mode.
11548 + */
11549 +
11550 +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
11551 + struct pt_regs *regs)
11552 +{
11553 + unsigned long cpu;
11554 +
11555 + cpu = get_cpu();
11556 +
11557 + if (!cpu_isset(cpu, flush_cpumask))
11558 + goto out;
11559 + /*
11560 + * This was a BUG() but until someone can quote me the
11561 + * line from the intel manual that guarantees an IPI to
11562 + * multiple CPUs is retried _only_ on the erroring CPUs
11563 + * it's staying as a return
11564 + *
11565 + * BUG();
11566 + */
11567 +
11568 + if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
11569 + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
11570 + if (flush_va == FLUSH_ALL)
11571 + local_flush_tlb();
11572 + else
11573 + __flush_tlb_one(flush_va);
11574 + } else
11575 + leave_mm(cpu);
11576 + }
11577 + smp_mb__before_clear_bit();
11578 + cpu_clear(cpu, flush_cpumask);
11579 + smp_mb__after_clear_bit();
11580 +out:
11581 + put_cpu_no_resched();
11582 +
11583 + return IRQ_HANDLED;
11584 +}
11585 +
11586 +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
11587 + unsigned long va)
11588 +{
11589 + /*
11590 + * A couple of (to be removed) sanity checks:
11591 + *
11592 + * - current CPU must not be in mask
11593 + * - mask must exist :)
11594 + */
11595 + BUG_ON(cpus_empty(cpumask));
11596 + BUG_ON(cpu_isset(smp_processor_id(), cpumask));
11597 + BUG_ON(!mm);
11598 +
11599 + /* If a CPU which we ran on has gone down, OK. */
11600 + cpus_and(cpumask, cpumask, cpu_online_map);
11601 + if (cpus_empty(cpumask))
11602 + return;
11603 +
11604 + /*
11605 + * I'm not happy about this global shared spinlock in the
11606 + * MM hot path, but we'll see how contended it is.
11607 + * Temporarily this turns IRQs off, so that lockups are
11608 + * detected by the NMI watchdog.
11609 + */
11610 + spin_lock(&tlbstate_lock);
11611 +
11612 + flush_mm = mm;
11613 + flush_va = va;
11614 +#if NR_CPUS <= BITS_PER_LONG
11615 + atomic_set_mask(cpumask, &flush_cpumask);
11616 +#else
11617 + {
11618 + int k;
11619 + unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
11620 + unsigned long *cpu_mask = (unsigned long *)&cpumask;
11621 + for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
11622 + atomic_set_mask(cpu_mask[k], &flush_mask[k]);
11623 + }
11624 +#endif
11625 + /*
11626 + * We have to send the IPI only to
11627 + * CPUs affected.
11628 + */
11629 + send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
11630 +
11631 + while (!cpus_empty(flush_cpumask))
11632 + /* nothing. lockup detection does not belong here */
11633 + mb();
11634 +
11635 + flush_mm = NULL;
11636 + flush_va = 0;
11637 + spin_unlock(&tlbstate_lock);
11638 +}
11639 +
11640 +void flush_tlb_current_task(void)
11641 +{
11642 + struct mm_struct *mm = current->mm;
11643 + cpumask_t cpu_mask;
11644 +
11645 + preempt_disable();
11646 + cpu_mask = mm->cpu_vm_mask;
11647 + cpu_clear(smp_processor_id(), cpu_mask);
11648 +
11649 + local_flush_tlb();
11650 + if (!cpus_empty(cpu_mask))
11651 + flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
11652 + preempt_enable();
11653 +}
11654 +
11655 +void flush_tlb_mm (struct mm_struct * mm)
11656 +{
11657 + cpumask_t cpu_mask;
11658 +
11659 + preempt_disable();
11660 + cpu_mask = mm->cpu_vm_mask;
11661 + cpu_clear(smp_processor_id(), cpu_mask);
11662 +
11663 + if (current->active_mm == mm) {
11664 + if (current->mm)
11665 + local_flush_tlb();
11666 + else
11667 + leave_mm(smp_processor_id());
11668 + }
11669 + if (!cpus_empty(cpu_mask))
11670 + flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
11671 +
11672 + preempt_enable();
11673 +}
11674 +
11675 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
11676 +{
11677 + struct mm_struct *mm = vma->vm_mm;
11678 + cpumask_t cpu_mask;
11679 +
11680 + preempt_disable();
11681 + cpu_mask = mm->cpu_vm_mask;
11682 + cpu_clear(smp_processor_id(), cpu_mask);
11683 +
11684 + if (current->active_mm == mm) {
11685 + if(current->mm)
11686 + __flush_tlb_one(va);
11687 + else
11688 + leave_mm(smp_processor_id());
11689 + }
11690 +
11691 + if (!cpus_empty(cpu_mask))
11692 + flush_tlb_others(cpu_mask, mm, va);
11693 +
11694 + preempt_enable();
11695 +}
11696 +EXPORT_SYMBOL(flush_tlb_page);
11697 +
11698 +static void do_flush_tlb_all(void* info)
11699 +{
11700 + unsigned long cpu = smp_processor_id();
11701 +
11702 + __flush_tlb_all();
11703 + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
11704 + leave_mm(cpu);
11705 +}
11706 +
11707 +void flush_tlb_all(void)
11708 +{
11709 + on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
11710 +}
11711 +
11712 +#endif /* XEN */
11713 +
11714 +/*
11715 + * This function sends a 'reschedule' IPI to another CPU.
11716 + * It goes straight through and wastes no time serializing
11717 + * anything. Worst case is that we lose a reschedule ...
11718 + */
11719 +void smp_send_reschedule(int cpu)
11720 +{
11721 + WARN_ON(cpu_is_offline(cpu));
11722 + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
11723 +}
11724 +
11725 +/*
11726 + * Structure and data for smp_call_function(). This is designed to minimise
11727 + * static memory requirements. It also looks cleaner.
11728 + */
11729 +static DEFINE_SPINLOCK(call_lock);
11730 +
11731 +struct call_data_struct {
11732 + void (*func) (void *info);
11733 + void *info;
11734 + atomic_t started;
11735 + atomic_t finished;
11736 + int wait;
11737 +};
11738 +
11739 +void lock_ipi_call_lock(void)
11740 +{
11741 + spin_lock_irq(&call_lock);
11742 +}
11743 +
11744 +void unlock_ipi_call_lock(void)
11745 +{
11746 + spin_unlock_irq(&call_lock);
11747 +}
11748 +
11749 +static struct call_data_struct *call_data;
11750 +
11751 +/**
11752 + * smp_call_function(): Run a function on all other CPUs.
11753 + * @func: The function to run. This must be fast and non-blocking.
11754 + * @info: An arbitrary pointer to pass to the function.
11755 + * @nonatomic: currently unused.
11756 + * @wait: If true, wait (atomically) until function has completed on other CPUs.
11757 + *
11758 + * Returns 0 on success, else a negative status code. Does not return until
11759 + * remote CPUs are nearly ready to execute <<func>> or have already executed it.
11760 + *
11761 + * You must not call this function with disabled interrupts or from a
11762 + * hardware interrupt handler or from a bottom half handler.
11763 + */
11764 +int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
11765 + int wait)
11766 +{
11767 + struct call_data_struct data;
11768 + int cpus;
11769 +
11770 + /* Holding any lock stops cpus from going down. */
11771 + spin_lock(&call_lock);
11772 + cpus = num_online_cpus() - 1;
11773 + if (!cpus) {
11774 + spin_unlock(&call_lock);
11775 + return 0;
11776 + }
11777 +
11778 + /* Can deadlock when called with interrupts disabled */
11779 + WARN_ON(irqs_disabled());
11780 +
11781 + data.func = func;
11782 + data.info = info;
11783 + atomic_set(&data.started, 0);
11784 + data.wait = wait;
11785 + if (wait)
11786 + atomic_set(&data.finished, 0);
11787 +
11788 + call_data = &data;
11789 + mb();
11790 +
11791 + /* Send a message to all other CPUs and wait for them to respond */
11792 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
11793 +
11794 + /* Wait for response */
11795 + while (atomic_read(&data.started) != cpus)
11796 + cpu_relax();
11797 +
11798 + if (wait)
11799 + while (atomic_read(&data.finished) != cpus)
11800 + cpu_relax();
11801 + spin_unlock(&call_lock);
11802 +
11803 + return 0;
11804 +}
11805 +EXPORT_SYMBOL(smp_call_function);
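+/*
+ * Typical caller: smp_send_stop() below, which passes wait=0 since the
+ * stopped CPUs never return from stop_this_cpu().
+ */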
11806 +
11807 +static void stop_this_cpu (void * dummy)
11808 +{
11809 + /*
11810 + * Remove this CPU:
11811 + */
11812 + cpu_clear(smp_processor_id(), cpu_online_map);
11813 + local_irq_disable();
11814 + disable_all_local_evtchn();
11815 + if (cpu_data[smp_processor_id()].hlt_works_ok)
11816 + for(;;) halt();
11817 + for (;;);
11818 +}
11819 +
11820 +/*
11821 + * This function calls the 'stop' function on all other CPUs in the system.
11822 + */
11823 +
11824 +void smp_send_stop(void)
11825 +{
11826 + smp_call_function(stop_this_cpu, NULL, 1, 0);
11827 +
11828 + local_irq_disable();
11829 + disable_all_local_evtchn();
11830 + local_irq_enable();
11831 +}
11832 +
11833 +/*
11834 + * Reschedule callback. Nothing to do,
11835 + * all the work is done automatically when
11836 + * we return from the interrupt.
11837 + */
11838 +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
11839 + struct pt_regs *regs)
11840 +{
11841 +
11842 + return IRQ_HANDLED;
11843 +}
11844 +
11845 +#include <linux/kallsyms.h>
11846 +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
11847 + struct pt_regs *regs)
11848 +{
11849 + void (*func) (void *info) = call_data->func;
11850 + void *info = call_data->info;
11851 + int wait = call_data->wait;
11852 +
11853 + /*
11854 + * Notify initiating CPU that I've grabbed the data and am
11855 + * about to execute the function
11856 + */
11857 + mb();
11858 + atomic_inc(&call_data->started);
11859 + /*
11860 + * At this point the info structure may be out of scope unless wait==1
11861 + */
11862 + irq_enter();
11863 + (*func)(info);
11864 + irq_exit();
11865 +
11866 + if (wait) {
11867 + mb();
11868 + atomic_inc(&call_data->finished);
11869 + }
11870 +
11871 + return IRQ_HANDLED;
11872 +}
11873 +
11874 Index: head-2008-11-25/arch/x86/kernel/time_32-xen.c
11875 ===================================================================
11876 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
11877 +++ head-2008-11-25/arch/x86/kernel/time_32-xen.c 2008-09-01 12:07:31.000000000 +0200
11878 @@ -0,0 +1,1209 @@
11879 +/*
11880 + * linux/arch/i386/kernel/time.c
11881 + *
11882 + * Copyright (C) 1991, 1992, 1995 Linus Torvalds
11883 + *
11884 + * This file contains the PC-specific time handling details:
11885 + * reading the RTC at bootup, etc..
11886 + * 1994-07-02 Alan Modra
11887 + * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
11888 + * 1995-03-26 Markus Kuhn
11889 + * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
11890 + * precision CMOS clock update
11891 + * 1996-05-03 Ingo Molnar
11892 + * fixed time warps in do_[slow|fast]_gettimeoffset()
11893 + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
11894 + * "A Kernel Model for Precision Timekeeping" by Dave Mills
11895 + * 1998-09-05 (Various)
11896 + * More robust do_fast_gettimeoffset() algorithm implemented
11897 + * (works with APM, Cyrix 6x86MX and Centaur C6),
11898 + * monotonic gettimeofday() with fast_get_timeoffset(),
11899 + * drift-proof precision TSC calibration on boot
11900 + * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
11901 + * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
11902 + * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
11903 + * 1998-12-16 Andrea Arcangeli
11904 + * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
11905 + * because it was not accounting lost_ticks.
11906 + * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
11907 + * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
11908 + * serialize accesses to xtime/lost_ticks).
11909 + */
11910 +
11911 +#include <linux/errno.h>
11912 +#include <linux/sched.h>
11913 +#include <linux/kernel.h>
11914 +#include <linux/param.h>
11915 +#include <linux/string.h>
11916 +#include <linux/mm.h>
11917 +#include <linux/interrupt.h>
11918 +#include <linux/time.h>
11919 +#include <linux/delay.h>
11920 +#include <linux/init.h>
11921 +#include <linux/smp.h>
11922 +#include <linux/module.h>
11923 +#include <linux/sysdev.h>
11924 +#include <linux/bcd.h>
11925 +#include <linux/efi.h>
11926 +#include <linux/mca.h>
11927 +#include <linux/sysctl.h>
11928 +#include <linux/percpu.h>
11929 +#include <linux/kernel_stat.h>
11930 +#include <linux/posix-timers.h>
11931 +#include <linux/cpufreq.h>
11932 +
11933 +#include <asm/io.h>
11934 +#include <asm/smp.h>
11935 +#include <asm/irq.h>
11936 +#include <asm/msr.h>
11937 +#include <asm/delay.h>
11938 +#include <asm/mpspec.h>
11939 +#include <asm/uaccess.h>
11940 +#include <asm/processor.h>
11941 +#include <asm/timer.h>
11942 +#include <asm/sections.h>
11943 +
11944 +#include "mach_time.h"
11945 +
11946 +#include <linux/timex.h>
11947 +
11948 +#include <asm/hpet.h>
11949 +
11950 +#include <asm/arch_hooks.h>
11951 +
11952 +#include <xen/evtchn.h>
11953 +#include <xen/interface/vcpu.h>
11954 +
11955 +#if defined (__i386__)
11956 +#include <asm/i8259.h>
11957 +#endif
11958 +
11959 +int pit_latch_buggy; /* extern */
11960 +
11961 +#if defined(__x86_64__)
11962 +unsigned long vxtime_hz = PIT_TICK_RATE;
11963 +struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
11964 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
11965 +unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
11966 +struct timespec __xtime __section_xtime;
11967 +struct timezone __sys_tz __section_sys_tz;
11968 +#endif
11969 +
11970 +unsigned int cpu_khz; /* Detected as we calibrate the TSC */
11971 +EXPORT_SYMBOL(cpu_khz);
11972 +
11973 +extern unsigned long wall_jiffies;
11974 +
11975 +DEFINE_SPINLOCK(rtc_lock);
11976 +EXPORT_SYMBOL(rtc_lock);
11977 +
11978 +extern struct init_timer_opts timer_tsc_init;
11979 +extern struct timer_opts timer_tsc;
11980 +#define timer_none timer_tsc
11981 +
11982 +/* These are periodically updated in shared_info, and then copied here. */
11983 +struct shadow_time_info {
11984 + u64 tsc_timestamp; /* TSC at last update of time vals. */
11985 + u64 system_timestamp; /* Time, in nanosecs, since boot. */
11986 + u32 tsc_to_nsec_mul;
11987 + u32 tsc_to_usec_mul;
11988 + int tsc_shift;
11989 + u32 version;
11990 +};
11991 +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
11992 +static struct timespec shadow_tv;
11993 +static u32 shadow_tv_version;
11994 +
11995 +static struct timeval monotonic_tv;
11996 +static spinlock_t monotonic_lock = SPIN_LOCK_UNLOCKED;
11997 +
11998 +/* Keep track of last time we did processing/updating of jiffies and xtime. */
11999 +static u64 processed_system_time; /* System time (ns) at last processing. */
12000 +static DEFINE_PER_CPU(u64, processed_system_time);
12001 +
12002 +/* How much CPU time was spent blocked and how much was 'stolen'? */
12003 +static DEFINE_PER_CPU(u64, processed_stolen_time);
12004 +static DEFINE_PER_CPU(u64, processed_blocked_time);
12005 +
12006 +/* Current runstate of each CPU (updated automatically by the hypervisor). */
12007 +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
12008 +
12009 +/* Must be signed, as it's compared with s64 quantities which can be -ve. */
12010 +#define NS_PER_TICK (1000000000LL/HZ)
12011 +
12012 +static void __clock_was_set(void *unused)
12013 +{
12014 + clock_was_set();
12015 +}
12016 +static DECLARE_WORK(clock_was_set_work, __clock_was_set, NULL);
12017 +
12018 +/*
12019 + * GCC 4.3 can turn loops over an induction variable into division. We do
12020 + * not support arbitrary 64-bit division, and so must break the induction.
12021 + */
12022 +#define clobber_induction_variable(v) asm ( "" : "+r" (v) )
12023 +
12024 +static inline void __normalize_time(time_t *sec, s64 *nsec)
12025 +{
12026 + while (*nsec >= NSEC_PER_SEC) {
12027 + clobber_induction_variable(*nsec);
12028 + (*nsec) -= NSEC_PER_SEC;
12029 + (*sec)++;
12030 + }
12031 + while (*nsec < 0) {
12032 + clobber_induction_variable(*nsec);
12033 + (*nsec) += NSEC_PER_SEC;
12034 + (*sec)--;
12035 + }
12036 +}
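+/* e.g. sec = 5, nsec = 2300000000 normalizes to sec = 7, nsec = 300000000. */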
12037 +
12038 +/* Does this guest OS track Xen time, or set its wall clock independently? */
12039 +static int independent_wallclock = 0;
12040 +static int __init __independent_wallclock(char *str)
12041 +{
12042 + independent_wallclock = 1;
12043 + return 1;
12044 +}
12045 +__setup("independent_wallclock", __independent_wallclock);
12046 +
12047 +/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
12048 +static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
12049 +static int __init __permitted_clock_jitter(char *str)
12050 +{
12051 + permitted_clock_jitter = simple_strtoul(str, NULL, 0);
12052 + return 1;
12053 +}
12054 +__setup("permitted_clock_jitter=", __permitted_clock_jitter);
12055 +
12056 +#if 0
12057 +static void delay_tsc(unsigned long loops)
12058 +{
12059 + unsigned long bclock, now;
12060 +
12061 + rdtscl(bclock);
12062 + do {
12063 + rep_nop();
12064 + rdtscl(now);
12065 + } while ((now - bclock) < loops);
12066 +}
12067 +
12068 +struct timer_opts timer_tsc = {
12069 + .name = "tsc",
12070 + .delay = delay_tsc,
12071 +};
12072 +#endif
12073 +
12074 +/*
12075 + * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
12076 + * yielding a 64-bit result.
12077 + */
12078 +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
12079 +{
12080 + u64 product;
12081 +#ifdef __i386__
12082 + u32 tmp1, tmp2;
12083 +#endif
12084 +
12085 + if (shift < 0)
12086 + delta >>= -shift;
12087 + else
12088 + delta <<= shift;
12089 +
12090 +#ifdef __i386__
12091 + __asm__ (
12092 + "mul %5 ; "
12093 + "mov %4,%%eax ; "
12094 + "mov %%edx,%4 ; "
12095 + "mul %5 ; "
12096 + "xor %5,%5 ; "
12097 + "add %4,%%eax ; "
12098 + "adc %5,%%edx ; "
12099 + : "=A" (product), "=r" (tmp1), "=r" (tmp2)
12100 + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
12101 +#else
12102 + __asm__ (
12103 + "mul %%rdx ; shrd $32,%%rdx,%%rax"
12104 + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
12105 +#endif
12106 +
12107 + return product;
12108 +}
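+/*
+ * i.e. scale_delta(delta, frac, shift) evaluates
+ * ((delta << shift) * frac) >> 32 (shifting right for a negative shift)
+ * without needing a full 64x64->128-bit multiply on i386.
+ */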
12109 +
12110 +#if 0 /* defined (__i386__) */
12111 +int read_current_timer(unsigned long *timer_val)
12112 +{
12113 + rdtscl(*timer_val);
12114 + return 0;
12115 +}
12116 +#endif
12117 +
12118 +void init_cpu_khz(void)
12119 +{
12120 + u64 __cpu_khz = 1000000ULL << 32;
12121 + struct vcpu_time_info *info = &vcpu_info(0)->time;
12122 + do_div(__cpu_khz, info->tsc_to_system_mul);
12123 + if (info->tsc_shift < 0)
12124 + cpu_khz = __cpu_khz << -info->tsc_shift;
12125 + else
12126 + cpu_khz = __cpu_khz >> info->tsc_shift;
12127 +}
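+/*
+ * Xen converts TSC deltas to nanoseconds as
+ * ((delta * 2^tsc_shift) * tsc_to_system_mul) >> 32 (see scale_delta()
+ * below), so inverting that relation gives
+ * cpu_khz = (10^6 * 2^32 / tsc_to_system_mul) * 2^-tsc_shift.
+ */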
12128 +
12129 +static u64 get_nsec_offset(struct shadow_time_info *shadow)
12130 +{
12131 + u64 now, delta;
12132 + rdtscll(now);
12133 + delta = now - shadow->tsc_timestamp;
12134 + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
12135 +}
12136 +
12137 +static unsigned long get_usec_offset(struct shadow_time_info *shadow)
12138 +{
12139 + u64 now, delta;
12140 + rdtscll(now);
12141 + delta = now - shadow->tsc_timestamp;
12142 + return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
12143 +}
12144 +
12145 +static void __update_wallclock(time_t sec, long nsec)
12146 +{
12147 + long wtm_nsec, xtime_nsec;
12148 + time_t wtm_sec, xtime_sec;
12149 + u64 tmp, wc_nsec;
12150 +
12151 + /* Adjust wall-clock time base based on wall_jiffies ticks. */
12152 + wc_nsec = processed_system_time;
12153 + wc_nsec += sec * (u64)NSEC_PER_SEC;
12154 + wc_nsec += nsec;
12155 + wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
12156 +
12157 + /* Split wallclock base into seconds and nanoseconds. */
12158 + tmp = wc_nsec;
12159 + xtime_nsec = do_div(tmp, 1000000000);
12160 + xtime_sec = (time_t)tmp;
12161 +
12162 + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
12163 + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
12164 +
12165 + set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
12166 + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
12167 +
12168 + ntp_clear();
12169 +}
12170 +
12171 +static void update_wallclock(void)
12172 +{
12173 + shared_info_t *s = HYPERVISOR_shared_info;
12174 +
12175 + do {
12176 + shadow_tv_version = s->wc_version;
12177 + rmb();
12178 + shadow_tv.tv_sec = s->wc_sec;
12179 + shadow_tv.tv_nsec = s->wc_nsec;
12180 + rmb();
12181 + } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
12182 +
12183 + if (!independent_wallclock)
12184 + __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
12185 +}
12186 +
12187 +/*
12188 + * Reads a consistent set of time-base values from Xen, into a shadow data
12189 + * area.
12190 + */
12191 +static void get_time_values_from_xen(unsigned int cpu)
12192 +{
12193 + struct vcpu_time_info *src;
12194 + struct shadow_time_info *dst;
12195 + unsigned long flags;
12196 + u32 pre_version, post_version;
12197 +
12198 + src = &vcpu_info(cpu)->time;
12199 + dst = &per_cpu(shadow_time, cpu);
12200 +
12201 + local_irq_save(flags);
12202 +
12203 + do {
12204 + pre_version = dst->version = src->version;
12205 + rmb();
12206 + dst->tsc_timestamp = src->tsc_timestamp;
12207 + dst->system_timestamp = src->system_time;
12208 + dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
12209 + dst->tsc_shift = src->tsc_shift;
12210 + rmb();
12211 + post_version = src->version;
12212 + } while ((pre_version & 1) | (pre_version ^ post_version));
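+ /*
+  * An odd version, or a version change between the two reads, means
+  * Xen was updating the fields while we copied them; retry until a
+  * stable, even version is observed.
+  */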
12213 +
12214 + dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
12215 +
12216 + local_irq_restore(flags);
12217 +}
12218 +
12219 +static inline int time_values_up_to_date(unsigned int cpu)
12220 +{
12221 + struct vcpu_time_info *src;
12222 + struct shadow_time_info *dst;
12223 +
12224 + src = &vcpu_info(cpu)->time;
12225 + dst = &per_cpu(shadow_time, cpu);
12226 +
12227 + rmb();
12228 + return (dst->version == src->version);
12229 +}
12230 +
12231 +/*
12232 + * This is a special lock that is owned by the CPU and holds the index
12233 + * register we are working with. It is required for NMI access to the
12234 + * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
12235 + */
12236 +volatile unsigned long cmos_lock = 0;
12237 +EXPORT_SYMBOL(cmos_lock);
12238 +
12239 +/* Routines for accessing the CMOS RAM/RTC. */
12240 +unsigned char rtc_cmos_read(unsigned char addr)
12241 +{
12242 + unsigned char val;
12243 + lock_cmos_prefix(addr);
12244 + outb_p(addr, RTC_PORT(0));
12245 + val = inb_p(RTC_PORT(1));
12246 + lock_cmos_suffix(addr);
12247 + return val;
12248 +}
12249 +EXPORT_SYMBOL(rtc_cmos_read);
12250 +
12251 +void rtc_cmos_write(unsigned char val, unsigned char addr)
12252 +{
12253 + lock_cmos_prefix(addr);
12254 + outb_p(addr, RTC_PORT(0));
12255 + outb_p(val, RTC_PORT(1));
12256 + lock_cmos_suffix(addr);
12257 +}
12258 +EXPORT_SYMBOL(rtc_cmos_write);
12259 +
12260 +/*
12261 + * This version of gettimeofday has microsecond resolution
12262 + * and better than microsecond precision on fast x86 machines with TSC.
12263 + */
12264 +void do_gettimeofday(struct timeval *tv)
12265 +{
12266 + unsigned long seq;
12267 + unsigned long usec, sec;
12268 + unsigned long flags;
12269 + s64 nsec;
12270 + unsigned int cpu;
12271 + struct shadow_time_info *shadow;
12272 + u32 local_time_version;
12273 +
12274 + cpu = get_cpu();
12275 + shadow = &per_cpu(shadow_time, cpu);
12276 +
12277 + do {
12278 + unsigned long lost;
12279 +
12280 + local_time_version = shadow->version;
12281 + seq = read_seqbegin(&xtime_lock);
12282 +
12283 + usec = get_usec_offset(shadow);
12284 + lost = jiffies - wall_jiffies;
12285 +
12286 + if (unlikely(lost))
12287 + usec += lost * (USEC_PER_SEC / HZ);
12288 +
12289 + sec = xtime.tv_sec;
12290 + usec += (xtime.tv_nsec / NSEC_PER_USEC);
12291 +
12292 + nsec = shadow->system_timestamp - processed_system_time;
12293 + __normalize_time(&sec, &nsec);
12294 + usec += (long)nsec / NSEC_PER_USEC;
12295 +
12296 + if (unlikely(!time_values_up_to_date(cpu))) {
12297 + /*
12298 + * We may have blocked for a long time,
12299 + * rendering our calculations invalid
12300 + * (e.g. the time delta may have
12301 + * overflowed). Detect that and recalculate
12302 + * with fresh values.
12303 + */
12304 + get_time_values_from_xen(cpu);
12305 + continue;
12306 + }
12307 + } while (read_seqretry(&xtime_lock, seq) ||
12308 + (local_time_version != shadow->version));
12309 +
12310 + put_cpu();
12311 +
12312 + while (usec >= USEC_PER_SEC) {
12313 + usec -= USEC_PER_SEC;
12314 + sec++;
12315 + }
12316 +
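+ /*
+  * Clamp against the last value we handed out so that gettimeofday()
+  * never appears to step backwards, even across CPUs.
+  */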
12317 + spin_lock_irqsave(&monotonic_lock, flags);
12318 + if ((sec > monotonic_tv.tv_sec) ||
12319 + ((sec == monotonic_tv.tv_sec) && (usec > monotonic_tv.tv_usec)))
12320 + {
12321 + monotonic_tv.tv_sec = sec;
12322 + monotonic_tv.tv_usec = usec;
12323 + } else {
12324 + sec = monotonic_tv.tv_sec;
12325 + usec = monotonic_tv.tv_usec;
12326 + }
12327 + spin_unlock_irqrestore(&monotonic_lock, flags);
12328 +
12329 + tv->tv_sec = sec;
12330 + tv->tv_usec = usec;
12331 +}
12332 +
12333 +EXPORT_SYMBOL(do_gettimeofday);
12334 +
12335 +int do_settimeofday(struct timespec *tv)
12336 +{
12337 + time_t sec;
12338 + s64 nsec;
12339 + unsigned int cpu;
12340 + struct shadow_time_info *shadow;
12341 + struct xen_platform_op op;
12342 +
12343 + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
12344 + return -EINVAL;
12345 +
12346 + cpu = get_cpu();
12347 + shadow = &per_cpu(shadow_time, cpu);
12348 +
12349 + write_seqlock_irq(&xtime_lock);
12350 +
12351 + /*
12352 + * If we were blocked for a long time our time delta may have
12353 + * overflowed and our shadow time values would be stale; in that
12354 + * case, retry with fresh ones.
12355 + */
12356 + for (;;) {
12357 + nsec = tv->tv_nsec - get_nsec_offset(shadow);
12358 + if (time_values_up_to_date(cpu))
12359 + break;
12360 + get_time_values_from_xen(cpu);
12361 + }
12362 + sec = tv->tv_sec;
12363 + __normalize_time(&sec, &nsec);
12364 +
12365 + if (is_initial_xendomain() && !independent_wallclock) {
12366 + op.cmd = XENPF_settime;
12367 + op.u.settime.secs = sec;
12368 + op.u.settime.nsecs = nsec;
12369 + op.u.settime.system_time = shadow->system_timestamp;
12370 + WARN_ON(HYPERVISOR_platform_op(&op));
12371 + update_wallclock();
12372 + } else if (independent_wallclock) {
12373 + nsec -= shadow->system_timestamp;
12374 + __normalize_time(&sec, &nsec);
12375 + __update_wallclock(sec, nsec);
12376 + }
12377 +
12378 + /* Reset monotonic gettimeofday() timeval. */
12379 + spin_lock(&monotonic_lock);
12380 + monotonic_tv.tv_sec = 0;
12381 + monotonic_tv.tv_usec = 0;
12382 + spin_unlock(&monotonic_lock);
12383 +
12384 + write_sequnlock_irq(&xtime_lock);
12385 +
12386 + put_cpu();
12387 +
12388 + clock_was_set();
12389 + return 0;
12390 +}
12391 +
12392 +EXPORT_SYMBOL(do_settimeofday);
12393 +
12394 +static void sync_xen_wallclock(unsigned long dummy);
12395 +static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
12396 +static void sync_xen_wallclock(unsigned long dummy)
12397 +{
12398 + time_t sec;
12399 + s64 nsec;
12400 + struct xen_platform_op op;
12401 +
12402 + if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
12403 + return;
12404 +
12405 + write_seqlock_irq(&xtime_lock);
12406 +
12407 + sec = xtime.tv_sec;
12408 + nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
12409 + __normalize_time(&sec, &nsec);
12410 +
12411 + op.cmd = XENPF_settime;
12412 + op.u.settime.secs = sec;
12413 + op.u.settime.nsecs = nsec;
12414 + op.u.settime.system_time = processed_system_time;
12415 + WARN_ON(HYPERVISOR_platform_op(&op));
12416 +
12417 + update_wallclock();
12418 +
12419 + write_sequnlock_irq(&xtime_lock);
12420 +
12421 + /* Once per minute. */
12422 + mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
12423 +}
12424 +
12425 +static int set_rtc_mmss(unsigned long nowtime)
12426 +{
12427 + int retval;
12428 + unsigned long flags;
12429 +
12430 + if (independent_wallclock || !is_initial_xendomain())
12431 + return 0;
12432 +
12433 + /* gets recalled with irq locally disabled */
12434 + /* XXX - does irqsave resolve this? -johnstul */
12435 + spin_lock_irqsave(&rtc_lock, flags);
12436 + if (efi_enabled)
12437 + retval = efi_set_rtc_mmss(nowtime);
12438 + else
12439 + retval = mach_set_rtc_mmss(nowtime);
12440 + spin_unlock_irqrestore(&rtc_lock, flags);
12441 +
12442 + return retval;
12443 +}
12444 +
12445 +/* monotonic_clock(): returns # of nanoseconds passed since time_init()
12446 + * Note: This function is required to return accurate
12447 + * time even in the absence of multiple timer ticks.
12448 + */
12449 +unsigned long long monotonic_clock(void)
12450 +{
12451 + unsigned int cpu = get_cpu();
12452 + struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
12453 + u64 time;
12454 + u32 local_time_version;
12455 +
12456 + do {
12457 + local_time_version = shadow->version;
12458 + barrier();
12459 + time = shadow->system_timestamp + get_nsec_offset(shadow);
12460 + if (!time_values_up_to_date(cpu))
12461 + get_time_values_from_xen(cpu);
12462 + barrier();
12463 + } while (local_time_version != shadow->version);
12464 +
12465 + put_cpu();
12466 +
12467 + return time;
12468 +}
12469 +EXPORT_SYMBOL(monotonic_clock);
12470 +
12471 +#ifdef __x86_64__
12472 +unsigned long long sched_clock(void)
12473 +{
12474 + return monotonic_clock();
12475 +}
12476 +#endif
12477 +
12478 +#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
12479 +unsigned long profile_pc(struct pt_regs *regs)
12480 +{
12481 + unsigned long pc = instruction_pointer(regs);
12482 +
12483 +#ifdef __x86_64__
12484 + /* Assume the lock function has either no stack frame or only a single word.
12485 + This checks if the address on the stack looks like a kernel text address.
12486 + There is a small window for false hits, but in that case the tick
12487 + is just accounted to the spinlock function.
12488 + Better would be to write these functions in assembler again
12489 + and check exactly. */
12490 + if (!user_mode_vm(regs) && in_lock_functions(pc)) {
12491 + char *v = *(char **)regs->rsp;
12492 + if ((v >= _stext && v <= _etext) ||
12493 + (v >= _sinittext && v <= _einittext) ||
12494 + (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
12495 + return (unsigned long)v;
12496 + return ((unsigned long *)regs->rsp)[1];
12497 + }
12498 +#else
12499 + if (!user_mode_vm(regs) && in_lock_functions(pc))
12500 + return *(unsigned long *)(regs->ebp + 4);
12501 +#endif
12502 +
12503 + return pc;
12504 +}
12505 +EXPORT_SYMBOL(profile_pc);
12506 +#endif
12507 +
12508 +/*
12509 + * This is the same as the above, except we _also_ save the current
12510 + * Time Stamp Counter value at the time of the timer interrupt, so that
12511 + * we can later estimate the time of day more exactly.
12512 + */
12513 +irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
12514 +{
12515 + s64 delta, delta_cpu, stolen, blocked;
12516 + u64 sched_time;
12517 + unsigned int i, cpu = smp_processor_id();
12518 + struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
12519 + struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
12520 +
12521 + /*
12522 + * Here we are in the timer irq handler. We just have irqs locally
12523 + * disabled but we don't know if the timer_bh is running on the other
12524 + * CPU. We need to avoid an SMP race with it. NOTE: we don't need
12525 + * the irq version of write_lock because, as just said, we have irqs
12526 + * locally disabled. -arca
12527 + */
12528 + write_seqlock(&xtime_lock);
12529 +
12530 + do {
12531 + get_time_values_from_xen(cpu);
12532 +
12533 + /* Obtain a consistent snapshot of elapsed wallclock cycles. */
12534 + delta = delta_cpu =
12535 + shadow->system_timestamp + get_nsec_offset(shadow);
12536 + delta -= processed_system_time;
12537 + delta_cpu -= per_cpu(processed_system_time, cpu);
12538 +
12539 + /*
12540 + * Obtain a consistent snapshot of stolen/blocked cycles. We
12541 + * can use state_entry_time to detect if we get preempted here.
12542 + */
12543 + do {
12544 + sched_time = runstate->state_entry_time;
12545 + barrier();
12546 + stolen = runstate->time[RUNSTATE_runnable] +
12547 + runstate->time[RUNSTATE_offline] -
12548 + per_cpu(processed_stolen_time, cpu);
12549 + blocked = runstate->time[RUNSTATE_blocked] -
12550 + per_cpu(processed_blocked_time, cpu);
12551 + barrier();
12552 + } while (sched_time != runstate->state_entry_time);
12553 + } while (!time_values_up_to_date(cpu));
12554 +
12555 + if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
12556 + unlikely(delta_cpu < -(s64)permitted_clock_jitter))
12557 + && printk_ratelimit()) {
12558 + printk("Timer ISR/%u: Time went backwards: "
12559 + "delta=%lld delta_cpu=%lld shadow=%lld "
12560 + "off=%lld processed=%lld cpu_processed=%lld\n",
12561 + cpu, delta, delta_cpu, shadow->system_timestamp,
12562 + (s64)get_nsec_offset(shadow),
12563 + processed_system_time,
12564 + per_cpu(processed_system_time, cpu));
12565 + for (i = 0; i < num_online_cpus(); i++)
12566 + printk(" %d: %lld\n", i,
12567 + per_cpu(processed_system_time, i));
12568 + }
12569 +
12570 + /* System-wide jiffy work. */
12571 + while (delta >= NS_PER_TICK) {
12572 + delta -= NS_PER_TICK;
12573 + processed_system_time += NS_PER_TICK;
12574 + do_timer(regs);
12575 + }
12576 +
12577 + if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
12578 + update_wallclock();
12579 + if (keventd_up())
12580 + schedule_work(&clock_was_set_work);
12581 + }
12582 +
12583 + write_sequnlock(&xtime_lock);
12584 +
12585 + /*
12586 + * Account stolen ticks.
12587 + * HACK: Passing NULL to account_steal_time()
12588 + * ensures that the ticks are accounted as stolen.
12589 + */
12590 + if ((stolen > 0) && (delta_cpu > 0)) {
12591 + delta_cpu -= stolen;
12592 + if (unlikely(delta_cpu < 0))
12593 + stolen += delta_cpu; /* clamp local-time progress */
12594 + do_div(stolen, NS_PER_TICK);
12595 + per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
12596 + per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
12597 + account_steal_time(NULL, (cputime_t)stolen);
12598 + }
12599 +
12600 + /*
12601 + * Account blocked ticks.
12602 + * HACK: Passing idle_task to account_steal_time()
12603 + * ensures that the ticks are accounted as idle/wait.
12604 + */
12605 + if ((blocked > 0) && (delta_cpu > 0)) {
12606 + delta_cpu -= blocked;
12607 + if (unlikely(delta_cpu < 0))
12608 + blocked += delta_cpu; /* clamp local-time progress */
12609 + do_div(blocked, NS_PER_TICK);
12610 + per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
12611 + per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
12612 + account_steal_time(idle_task(cpu), (cputime_t)blocked);
12613 + }
12614 +
12615 + /* Account user/system ticks. */
12616 + if (delta_cpu > 0) {
12617 + do_div(delta_cpu, NS_PER_TICK);
12618 + per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
12619 + if (user_mode_vm(regs))
12620 + account_user_time(current, (cputime_t)delta_cpu);
12621 + else
12622 + account_system_time(current, HARDIRQ_OFFSET,
12623 + (cputime_t)delta_cpu);
12624 + }
12625 +
12626 + /* Offlined for more than a few seconds? Avoid lockup warnings. */
12627 + if (stolen > 5*HZ)
12628 + touch_softlockup_watchdog();
12629 +
12630 + /* Local timer processing (see update_process_times()). */
12631 + run_local_timers();
12632 + if (rcu_pending(cpu))
12633 + rcu_check_callbacks(cpu, user_mode_vm(regs));
12634 + scheduler_tick();
12635 + run_posix_cpu_timers(current);
12636 + profile_tick(CPU_PROFILING, regs);
12637 +
12638 + return IRQ_HANDLED;
12639 +}
12640 +
12641 +static void init_missing_ticks_accounting(unsigned int cpu)
12642 +{
12643 + struct vcpu_register_runstate_memory_area area;
12644 + struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
12645 + int rc;
12646 +
12647 + memset(runstate, 0, sizeof(*runstate));
12648 +
12649 + area.addr.v = runstate;
12650 + rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
12651 + WARN_ON(rc && rc != -ENOSYS);
12652 +
12653 + per_cpu(processed_blocked_time, cpu) =
12654 + runstate->time[RUNSTATE_blocked];
12655 + per_cpu(processed_stolen_time, cpu) =
12656 + runstate->time[RUNSTATE_runnable] +
12657 + runstate->time[RUNSTATE_offline];
12658 +}
12659 +
12660 +/* not static: needed by APM */
12661 +unsigned long get_cmos_time(void)
12662 +{
12663 + unsigned long retval;
12664 + unsigned long flags;
12665 +
12666 + spin_lock_irqsave(&rtc_lock, flags);
12667 +
12668 + if (efi_enabled)
12669 + retval = efi_get_time();
12670 + else
12671 + retval = mach_get_cmos_time();
12672 +
12673 + spin_unlock_irqrestore(&rtc_lock, flags);
12674 +
12675 + return retval;
12676 +}
12677 +EXPORT_SYMBOL(get_cmos_time);
12678 +
12679 +static void sync_cmos_clock(unsigned long dummy);
12680 +
12681 +static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
12682 +
12683 +static void sync_cmos_clock(unsigned long dummy)
12684 +{
12685 + struct timeval now, next;
12686 + int fail = 1;
12687 +
12688 + /*
12689 + * If we have an externally synchronized Linux clock, then update
12690 + * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
12691 + * called as close as possible to 500 ms before the new second starts.
12692 + * This code is run on a timer. If the clock is set, that timer
12693 + * may not expire at the correct time. Thus, we adjust...
12694 + */
12695 + if (!ntp_synced())
12696 + /*
12697 + * Not synced, exit, do not restart a timer (if one is
12698 + * running, let it run out).
12699 + */
12700 + return;
12701 +
12702 + do_gettimeofday(&now);
12703 + if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
12704 + now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
12705 + fail = set_rtc_mmss(now.tv_sec);
12706 +
12707 + next.tv_usec = USEC_AFTER - now.tv_usec;
12708 + if (next.tv_usec <= 0)
12709 + next.tv_usec += USEC_PER_SEC;
12710 +
12711 + if (!fail)
12712 + next.tv_sec = 659;
12713 + else
12714 + next.tv_sec = 0;
12715 +
12716 + if (next.tv_usec >= USEC_PER_SEC) {
12717 + next.tv_sec++;
12718 + next.tv_usec -= USEC_PER_SEC;
12719 + }
12720 + mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
12721 +}
12722 +
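The arithmetic above aims each RTC write roughly 500 ms into a second (that is, about 500 ms before the next second starts) and, after a successful write, waits about 11 minutes (659 s plus the sub-second adjustment) before trying again. A standalone sketch of that calculation follows; USEC_AFTER here is an assumed value used only for the illustration.

    #include <stdio.h>

    #define USEC_PER_SEC 1000000L
    #define USEC_AFTER   500000L    /* assumed: aim ~500 ms into the second */

    /* Compute the delay until the next CMOS sync attempt. */
    static void next_sync(long now_usec, int write_succeeded,
                          long *sec, long *usec)
    {
        *usec = USEC_AFTER - now_usec;
        if (*usec <= 0)
            *usec += USEC_PER_SEC;

        *sec = write_succeeded ? 659 : 0;    /* ~11 minutes, or retry soon */

        if (*usec >= USEC_PER_SEC) {
            (*sec)++;
            *usec -= USEC_PER_SEC;
        }
    }

    int main(void)
    {
        long sec, usec;

        next_sync(731000, 1, &sec, &usec);   /* current time is x.731000 s */
        printf("next attempt in %ld.%06ld s\n", sec, usec);
        return 0;
    }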
12723 +void notify_arch_cmos_timer(void)
12724 +{
12725 + mod_timer(&sync_cmos_timer, jiffies + 1);
12726 + mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
12727 +}
12728 +
12729 +static int timer_resume(struct sys_device *dev)
12730 +{
12731 + extern void time_resume(void);
12732 + time_resume();
12733 + return 0;
12734 +}
12735 +
12736 +static struct sysdev_class timer_sysclass = {
12737 + .resume = timer_resume,
12738 + set_kset_name("timer"),
12739 +};
12740 +
12741 +
12742 +/* XXX this driverfs stuff should probably go elsewhere later -john */
12743 +static struct sys_device device_timer = {
12744 + .id = 0,
12745 + .cls = &timer_sysclass,
12746 +};
12747 +
12748 +static int time_init_device(void)
12749 +{
12750 + int error = sysdev_class_register(&timer_sysclass);
12751 + if (!error)
12752 + error = sysdev_register(&device_timer);
12753 + return error;
12754 +}
12755 +
12756 +device_initcall(time_init_device);
12757 +
12758 +#ifdef CONFIG_HPET_TIMER
12759 +extern void (*late_time_init)(void);
12760 +/* Duplicate of time_init() below, with hpet_enable part added */
12761 +static void __init hpet_time_init(void)
12762 +{
12763 + xtime.tv_sec = get_cmos_time();
12764 + xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
12765 + set_normalized_timespec(&wall_to_monotonic,
12766 + -xtime.tv_sec, -xtime.tv_nsec);
12767 +
12768 + if ((hpet_enable() >= 0) && hpet_use_timer) {
12769 + printk("Using HPET for base-timer\n");
12770 + }
12771 +
12772 + time_init_hook();
12773 +}
12774 +#endif
12775 +
12776 +/* Dynamically-mapped IRQ. */
12777 +DEFINE_PER_CPU(int, timer_irq);
12778 +
12779 +extern void (*late_time_init)(void);
12780 +static void setup_cpu0_timer_irq(void)
12781 +{
12782 + per_cpu(timer_irq, 0) =
12783 + bind_virq_to_irqhandler(
12784 + VIRQ_TIMER,
12785 + 0,
12786 + timer_interrupt,
12787 + SA_INTERRUPT,
12788 + "timer0",
12789 + NULL);
12790 + BUG_ON(per_cpu(timer_irq, 0) < 0);
12791 +}
12792 +
12793 +static struct vcpu_set_periodic_timer xen_set_periodic_tick = {
12794 + .period_ns = NS_PER_TICK
12795 +};
12796 +
12797 +void __init time_init(void)
12798 +{
12799 +#ifdef CONFIG_HPET_TIMER
12800 + if (is_hpet_capable()) {
12801 + /*
12802 + * HPET initialization needs to do memory-mapped io. So, let
12803 + * us do a late initialization after mem_init().
12804 + */
12805 + late_time_init = hpet_time_init;
12806 + return;
12807 + }
12808 +#endif
12809 +
12810 + switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, 0,
12811 + &xen_set_periodic_tick)) {
12812 + case 0:
12813 +#if CONFIG_XEN_COMPAT <= 0x030004
12814 + case -ENOSYS:
12815 +#endif
12816 + break;
12817 + default:
12818 + BUG();
12819 + }
12820 +
12821 + get_time_values_from_xen(0);
12822 +
12823 + processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
12824 + per_cpu(processed_system_time, 0) = processed_system_time;
12825 + init_missing_ticks_accounting(0);
12826 +
12827 + update_wallclock();
12828 +
12829 + init_cpu_khz();
12830 + printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
12831 + cpu_khz / 1000, cpu_khz % 1000);
12832 +
12833 +#if defined(__x86_64__)
12834 + vxtime.mode = VXTIME_TSC;
12835 + vxtime.quot = (1000000L << 32) / vxtime_hz;
12836 + vxtime.tsc_quot = (1000L << 32) / cpu_khz;
12837 + sync_core();
12838 + rdtscll(vxtime.last_tsc);
12839 +#endif
12840 +
12841 + /* Cannot request_irq() until kmem is initialised. */
12842 + late_time_init = setup_cpu0_timer_irq;
12843 +}
12844 +
12845 +/* Convert jiffies to system time. */
12846 +u64 jiffies_to_st(unsigned long j)
12847 +{
12848 + unsigned long seq;
12849 + long delta;
12850 + u64 st;
12851 +
12852 + do {
12853 + seq = read_seqbegin(&xtime_lock);
12854 + delta = j - jiffies;
12855 + if (delta < 1) {
12856 + /* Triggers in some wrap-around cases, but that's okay:
12857 + * we just end up with a shorter timeout. */
12858 + st = processed_system_time + NS_PER_TICK;
12859 + } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
12860 + /* Very long timeout means there is no pending timer.
12861 + * We indicate this to Xen by passing zero timeout. */
12862 + st = 0;
12863 + } else {
12864 + st = processed_system_time + delta * (u64)NS_PER_TICK;
12865 + }
12866 + } while (read_seqretry(&xtime_lock, seq));
12867 +
12868 + return st;
12869 +}
12870 +EXPORT_SYMBOL(jiffies_to_st);
12871 +
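jiffies_to_st() maps a jiffies value onto Xen system time: a timeout that is already due is pushed one tick past processed_system_time, and an absurdly large delta (no pending timer) is encoded as 0. A simplified, lock-free sketch of the same mapping follows, with the shared state passed in explicitly; the names below are assumptions of the sketch, not symbols from the patch.

    #include <stdint.h>
    #include <stdio.h>

    #define BITS_PER_LONG (8 * (int)sizeof(long))

    /* Map a jiffies value onto an absolute "system time" in nanoseconds. */
    static uint64_t map_jiffies(unsigned long j, unsigned long jiffies_now,
                                uint64_t processed_ns, uint64_t ns_per_tick)
    {
        long delta = (long)(j - jiffies_now);

        if (delta < 1)
            return processed_ns + ns_per_tick;        /* already due */
        if (((unsigned long)delta >> (BITS_PER_LONG - 3)) != 0)
            return 0;                                 /* "no pending timer" */
        return processed_ns + (uint64_t)delta * ns_per_tick;
    }

    int main(void)
    {
        /* 10 ticks from now, 10 ms ticks, 5 s of time already processed. */
        printf("%llu\n", (unsigned long long)
               map_jiffies(1010, 1000, 5000000000ULL, 10000000ULL));
        return 0;
    }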
12872 +/*
12873 + * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
12874 + * These functions are based on implementations from arch/s390/kernel/time.c
12875 + */
12876 +static void stop_hz_timer(void)
12877 +{
12878 + struct vcpu_set_singleshot_timer singleshot;
12879 + unsigned int cpu = smp_processor_id();
12880 + unsigned long j;
12881 + int rc;
12882 +
12883 + cpu_set(cpu, nohz_cpu_mask);
12884 +
12885 +	/* See matching smp_mb in rcu_start_batch in rcupdate.c.  These mbs
12886 +	 * ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a
12887 +	 * value of rcp->cur that matches rdp->quiescbatch and allows us to
12888 +	 * stop the hz timer then the cpumasks created for subsequent values
12889 +	 * of cur in rcu_start_batch are guaranteed to pick up the updated
12890 +	 * nohz_cpu_mask and so will not depend on this cpu. */
12891 +
12892 + smp_mb();
12893 +
12894 + /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
12895 + if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
12896 + (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
12897 + cpu_clear(cpu, nohz_cpu_mask);
12898 + j = jiffies + 1;
12899 + }
12900 +
12901 + singleshot.timeout_abs_ns = jiffies_to_st(j) + NS_PER_TICK/2;
12902 + singleshot.flags = 0;
12903 + rc = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &singleshot);
12904 +#if CONFIG_XEN_COMPAT <= 0x030004
12905 + if (rc) {
12906 + BUG_ON(rc != -ENOSYS);
12907 + rc = HYPERVISOR_set_timer_op(singleshot.timeout_abs_ns);
12908 + }
12909 +#endif
12910 + BUG_ON(rc);
12911 +}
12912 +
12913 +static void start_hz_timer(void)
12914 +{
12915 + cpu_clear(smp_processor_id(), nohz_cpu_mask);
12916 +}
12917 +
12918 +void raw_safe_halt(void)
12919 +{
12920 + stop_hz_timer();
12921 + /* Blocking includes an implicit local_irq_enable(). */
12922 + HYPERVISOR_block();
12923 + start_hz_timer();
12924 +}
12925 +EXPORT_SYMBOL(raw_safe_halt);
12926 +
12927 +void halt(void)
12928 +{
12929 + if (irqs_disabled())
12930 + VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
12931 +}
12932 +EXPORT_SYMBOL(halt);
12933 +
12934 +/* No locking required. Interrupts are disabled on all CPUs. */
12935 +void time_resume(void)
12936 +{
12937 + unsigned int cpu;
12938 +
12939 + init_cpu_khz();
12940 +
12941 + for_each_online_cpu(cpu) {
12942 + switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
12943 + &xen_set_periodic_tick)) {
12944 + case 0:
12945 +#if CONFIG_XEN_COMPAT <= 0x030004
12946 + case -ENOSYS:
12947 +#endif
12948 + break;
12949 + default:
12950 + BUG();
12951 + }
12952 + get_time_values_from_xen(cpu);
12953 + per_cpu(processed_system_time, cpu) =
12954 + per_cpu(shadow_time, 0).system_timestamp;
12955 + init_missing_ticks_accounting(cpu);
12956 + }
12957 +
12958 + processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
12959 +
12960 + update_wallclock();
12961 +}
12962 +
12963 +#ifdef CONFIG_SMP
12964 +static char timer_name[NR_CPUS][15];
12965 +
12966 +int __cpuinit local_setup_timer(unsigned int cpu)
12967 +{
12968 + int seq, irq;
12969 +
12970 + BUG_ON(cpu == 0);
12971 +
12972 + switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
12973 + &xen_set_periodic_tick)) {
12974 + case 0:
12975 +#if CONFIG_XEN_COMPAT <= 0x030004
12976 + case -ENOSYS:
12977 +#endif
12978 + break;
12979 + default:
12980 + BUG();
12981 + }
12982 +
12983 + do {
12984 + seq = read_seqbegin(&xtime_lock);
12985 + /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
12986 + per_cpu(processed_system_time, cpu) =
12987 + per_cpu(shadow_time, 0).system_timestamp;
12988 + init_missing_ticks_accounting(cpu);
12989 + } while (read_seqretry(&xtime_lock, seq));
12990 +
12991 + sprintf(timer_name[cpu], "timer%u", cpu);
12992 + irq = bind_virq_to_irqhandler(VIRQ_TIMER,
12993 + cpu,
12994 + timer_interrupt,
12995 + SA_INTERRUPT,
12996 + timer_name[cpu],
12997 + NULL);
12998 + if (irq < 0)
12999 + return irq;
13000 + per_cpu(timer_irq, cpu) = irq;
13001 +
13002 + return 0;
13003 +}
13004 +
13005 +void __cpuexit local_teardown_timer(unsigned int cpu)
13006 +{
13007 + BUG_ON(cpu == 0);
13008 + unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
13009 +}
13010 +#endif
13011 +
13012 +#ifdef CONFIG_CPU_FREQ
13013 +static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
13014 + void *data)
13015 +{
13016 + struct cpufreq_freqs *freq = data;
13017 + struct xen_platform_op op;
13018 +
13019 + if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
13020 + return 0;
13021 +
13022 + if (val == CPUFREQ_PRECHANGE)
13023 + return 0;
13024 +
13025 + op.cmd = XENPF_change_freq;
13026 + op.u.change_freq.flags = 0;
13027 + op.u.change_freq.cpu = freq->cpu;
13028 + op.u.change_freq.freq = (u64)freq->new * 1000;
13029 + WARN_ON(HYPERVISOR_platform_op(&op));
13030 +
13031 + return 0;
13032 +}
13033 +
13034 +static struct notifier_block time_cpufreq_notifier_block = {
13035 + .notifier_call = time_cpufreq_notifier
13036 +};
13037 +
13038 +static int __init cpufreq_time_setup(void)
13039 +{
13040 +	if (cpufreq_register_notifier(&time_cpufreq_notifier_block,
13041 + CPUFREQ_TRANSITION_NOTIFIER)) {
13042 + printk(KERN_ERR "failed to set up cpufreq notifier\n");
13043 + return -ENODEV;
13044 + }
13045 + return 0;
13046 +}
13047 +
13048 +core_initcall(cpufreq_time_setup);
13049 +#endif
13050 +
13051 +/*
13052 + * /proc/sys/xen: This really belongs in another file. It can stay here for
13053 + * now however.
13054 + */
13055 +static ctl_table xen_subtable[] = {
13056 + {
13057 + .ctl_name = 1,
13058 + .procname = "independent_wallclock",
13059 + .data = &independent_wallclock,
13060 + .maxlen = sizeof(independent_wallclock),
13061 + .mode = 0644,
13062 + .proc_handler = proc_dointvec
13063 + },
13064 + {
13065 + .ctl_name = 2,
13066 + .procname = "permitted_clock_jitter",
13067 + .data = &permitted_clock_jitter,
13068 + .maxlen = sizeof(permitted_clock_jitter),
13069 + .mode = 0644,
13070 + .proc_handler = proc_doulongvec_minmax
13071 + },
13072 + { 0 }
13073 +};
13074 +static ctl_table xen_table[] = {
13075 + {
13076 + .ctl_name = 123,
13077 + .procname = "xen",
13078 + .mode = 0555,
13079 + .child = xen_subtable},
13080 + { 0 }
13081 +};
13082 +static int __init xen_sysctl_init(void)
13083 +{
13084 + (void)register_sysctl_table(xen_table, 0);
13085 + return 0;
13086 +}
13087 +__initcall(xen_sysctl_init);
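The table above exposes the two tunables as /proc/sys/xen/independent_wallclock and /proc/sys/xen/permitted_clock_jitter. From userland they behave like any other procfs integer; a minimal reader is sketched below (it assumes the patch is applied and procfs is mounted, and simply reports an error otherwise).

    #include <stdio.h>

    int main(void)
    {
        const char *path = "/proc/sys/xen/independent_wallclock";
        FILE *f = fopen(path, "r");
        int val;

        if (!f) {
            perror(path);    /* not a Xen domU, or the patch is not applied */
            return 1;
        }
        if (fscanf(f, "%d", &val) == 1)
            printf("independent_wallclock = %d\n", val);
        fclose(f);
        return 0;
    }

Writing works the same way, for example echo 1 > /proc/sys/xen/independent_wallclock as root, and is routed through proc_dointvec() above.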
13088 Index: head-2008-11-25/arch/x86/kernel/traps_32-xen.c
13089 ===================================================================
13090 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
13091 +++ head-2008-11-25/arch/x86/kernel/traps_32-xen.c 2008-04-02 12:34:02.000000000 +0200
13092 @@ -0,0 +1,1190 @@
13093 +/*
13094 + * linux/arch/i386/traps.c
13095 + *
13096 + * Copyright (C) 1991, 1992 Linus Torvalds
13097 + *
13098 + * Pentium III FXSR, SSE support
13099 + * Gareth Hughes <gareth@valinux.com>, May 2000
13100 + */
13101 +
13102 +/*
13103 + * 'Traps.c' handles hardware traps and faults after we have saved some
13104 + * state in 'asm.s'.
13105 + */
13106 +#include <linux/sched.h>
13107 +#include <linux/kernel.h>
13108 +#include <linux/string.h>
13109 +#include <linux/errno.h>
13110 +#include <linux/timer.h>
13111 +#include <linux/mm.h>
13112 +#include <linux/init.h>
13113 +#include <linux/delay.h>
13114 +#include <linux/spinlock.h>
13115 +#include <linux/interrupt.h>
13116 +#include <linux/highmem.h>
13117 +#include <linux/kallsyms.h>
13118 +#include <linux/ptrace.h>
13119 +#include <linux/utsname.h>
13120 +#include <linux/kprobes.h>
13121 +#include <linux/kexec.h>
13122 +#include <linux/unwind.h>
13123 +
13124 +#ifdef CONFIG_EISA
13125 +#include <linux/ioport.h>
13126 +#include <linux/eisa.h>
13127 +#endif
13128 +
13129 +#ifdef CONFIG_MCA
13130 +#include <linux/mca.h>
13131 +#endif
13132 +
13133 +#include <asm/processor.h>
13134 +#include <asm/system.h>
13135 +#include <asm/uaccess.h>
13136 +#include <asm/io.h>
13137 +#include <asm/atomic.h>
13138 +#include <asm/debugreg.h>
13139 +#include <asm/desc.h>
13140 +#include <asm/i387.h>
13141 +#include <asm/nmi.h>
13142 +#include <asm/unwind.h>
13143 +#include <asm/smp.h>
13144 +#include <asm/arch_hooks.h>
13145 +#include <asm/kdebug.h>
13146 +
13147 +#include <linux/module.h>
13148 +
13149 +#include "mach_traps.h"
13150 +
13151 +asmlinkage int system_call(void);
13152 +
13153 +struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
13154 + { 0, 0 }, { 0, 0 } };
13155 +
13156 +/* Do we ignore FPU interrupts ? */
13157 +char ignore_fpu_irq = 0;
13158 +
13159 +#ifndef CONFIG_X86_NO_IDT
13160 +/*
13161 + * The IDT has to be page-aligned to simplify the Pentium
13162 + * F0 0F bug workaround.. We have a special link segment
13163 + * for this.
13164 + */
13165 +struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
13166 +#endif
13167 +
13168 +asmlinkage void divide_error(void);
13169 +asmlinkage void debug(void);
13170 +asmlinkage void nmi(void);
13171 +asmlinkage void int3(void);
13172 +asmlinkage void overflow(void);
13173 +asmlinkage void bounds(void);
13174 +asmlinkage void invalid_op(void);
13175 +asmlinkage void device_not_available(void);
13176 +asmlinkage void coprocessor_segment_overrun(void);
13177 +asmlinkage void invalid_TSS(void);
13178 +asmlinkage void segment_not_present(void);
13179 +asmlinkage void stack_segment(void);
13180 +asmlinkage void general_protection(void);
13181 +asmlinkage void page_fault(void);
13182 +asmlinkage void coprocessor_error(void);
13183 +asmlinkage void simd_coprocessor_error(void);
13184 +asmlinkage void alignment_check(void);
13185 +#ifndef CONFIG_XEN
13186 +asmlinkage void spurious_interrupt_bug(void);
13187 +#else
13188 +asmlinkage void fixup_4gb_segment(void);
13189 +#endif
13190 +asmlinkage void machine_check(void);
13191 +
13192 +static int kstack_depth_to_print = 24;
13193 +#ifdef CONFIG_STACK_UNWIND
13194 +static int call_trace = 1;
13195 +#else
13196 +#define call_trace (-1)
13197 +#endif
13198 +ATOMIC_NOTIFIER_HEAD(i386die_chain);
13199 +
13200 +int register_die_notifier(struct notifier_block *nb)
13201 +{
13202 + vmalloc_sync_all();
13203 + return atomic_notifier_chain_register(&i386die_chain, nb);
13204 +}
13205 +EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
13206 +
13207 +int unregister_die_notifier(struct notifier_block *nb)
13208 +{
13209 + return atomic_notifier_chain_unregister(&i386die_chain, nb);
13210 +}
13211 +EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
13212 +
13213 +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
13214 +{
13215 + return p > (void *)tinfo &&
13216 + p < (void *)tinfo + THREAD_SIZE - 3;
13217 +}
13218 +
13219 +/*
13220 + * Print one address/symbol entry per line.
13221 + */
13222 +static inline void print_addr_and_symbol(unsigned long addr, char *log_lvl)
13223 +{
13224 + printk(" [<%08lx>] ", addr);
13225 +
13226 + print_symbol("%s\n", addr);
13227 +}
13228 +
13229 +static inline unsigned long print_context_stack(struct thread_info *tinfo,
13230 + unsigned long *stack, unsigned long ebp,
13231 + char *log_lvl)
13232 +{
13233 + unsigned long addr;
13234 +
13235 +#ifdef CONFIG_FRAME_POINTER
13236 + while (valid_stack_ptr(tinfo, (void *)ebp)) {
13237 + addr = *(unsigned long *)(ebp + 4);
13238 + print_addr_and_symbol(addr, log_lvl);
13239 + /*
13240 + * break out of recursive entries (such as
13241 + * end_of_stack_stop_unwind_function):
13242 + */
13243 + if (ebp == *(unsigned long *)ebp)
13244 + break;
13245 + ebp = *(unsigned long *)ebp;
13246 + }
13247 +#else
13248 + while (valid_stack_ptr(tinfo, stack)) {
13249 + addr = *stack++;
13250 + if (__kernel_text_address(addr))
13251 + print_addr_and_symbol(addr, log_lvl);
13252 + }
13253 +#endif
13254 + return ebp;
13255 +}
13256 +
13257 +static asmlinkage int
13258 +show_trace_unwind(struct unwind_frame_info *info, void *log_lvl)
13259 +{
13260 + int n = 0;
13261 +
13262 + while (unwind(info) == 0 && UNW_PC(info)) {
13263 + n++;
13264 + print_addr_and_symbol(UNW_PC(info), log_lvl);
13265 + if (arch_unw_user_mode(info))
13266 + break;
13267 + }
13268 + return n;
13269 +}
13270 +
13271 +static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
13272 + unsigned long *stack, char *log_lvl)
13273 +{
13274 + unsigned long ebp;
13275 +
13276 + if (!task)
13277 + task = current;
13278 +
13279 + if (call_trace >= 0) {
13280 + int unw_ret = 0;
13281 + struct unwind_frame_info info;
13282 +
13283 + if (regs) {
13284 + if (unwind_init_frame_info(&info, task, regs) == 0)
13285 + unw_ret = show_trace_unwind(&info, log_lvl);
13286 + } else if (task == current)
13287 + unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl);
13288 + else {
13289 + if (unwind_init_blocked(&info, task) == 0)
13290 + unw_ret = show_trace_unwind(&info, log_lvl);
13291 + }
13292 + if (unw_ret > 0) {
13293 + if (call_trace == 1 && !arch_unw_user_mode(&info)) {
13294 + print_symbol("DWARF2 unwinder stuck at %s\n",
13295 + UNW_PC(&info));
13296 + if (UNW_SP(&info) >= PAGE_OFFSET) {
13297 + printk("Leftover inexact backtrace:\n");
13298 + stack = (void *)UNW_SP(&info);
13299 + } else
13300 + printk("Full inexact backtrace again:\n");
13301 + } else if (call_trace >= 1)
13302 + return;
13303 + else
13304 + printk("Full inexact backtrace again:\n");
13305 + } else
13306 + printk("Inexact backtrace:\n");
13307 + }
13308 +
13309 + if (task == current) {
13310 + /* Grab ebp right from our regs */
13311 + asm ("movl %%ebp, %0" : "=r" (ebp) : );
13312 + } else {
13313 + /* ebp is the last reg pushed by switch_to */
13314 + ebp = *(unsigned long *) task->thread.esp;
13315 + }
13316 +
13317 + while (1) {
13318 + struct thread_info *context;
13319 + context = (struct thread_info *)
13320 + ((unsigned long)stack & (~(THREAD_SIZE - 1)));
13321 + ebp = print_context_stack(context, stack, ebp, log_lvl);
13322 + stack = (unsigned long*)context->previous_esp;
13323 + if (!stack)
13324 + break;
13325 + printk("%s =======================\n", log_lvl);
13326 + }
13327 +}
13328 +
13329 +void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack)
13330 +{
13331 + show_trace_log_lvl(task, regs, stack, "");
13332 +}
13333 +
13334 +static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
13335 + unsigned long *esp, char *log_lvl)
13336 +{
13337 + unsigned long *stack;
13338 + int i;
13339 +
13340 + if (esp == NULL) {
13341 + if (task)
13342 + esp = (unsigned long*)task->thread.esp;
13343 + else
13344 + esp = (unsigned long *)&esp;
13345 + }
13346 +
13347 + stack = esp;
13348 + for(i = 0; i < kstack_depth_to_print; i++) {
13349 + if (kstack_end(stack))
13350 + break;
13351 + if (i && ((i % 8) == 0))
13352 + printk("\n%s ", log_lvl);
13353 + printk("%08lx ", *stack++);
13354 + }
13355 + printk("\n%sCall Trace:\n", log_lvl);
13356 + show_trace_log_lvl(task, regs, esp, log_lvl);
13357 +}
13358 +
13359 +void show_stack(struct task_struct *task, unsigned long *esp)
13360 +{
13361 + printk(" ");
13362 + show_stack_log_lvl(task, NULL, esp, "");
13363 +}
13364 +
13365 +/*
13366 + * The architecture-independent dump_stack generator
13367 + */
13368 +void dump_stack(void)
13369 +{
13370 + unsigned long stack;
13371 +
13372 + show_trace(current, NULL, &stack);
13373 +}
13374 +
13375 +EXPORT_SYMBOL(dump_stack);
13376 +
13377 +void show_registers(struct pt_regs *regs)
13378 +{
13379 + int i;
13380 + int in_kernel = 1;
13381 + unsigned long esp;
13382 + unsigned short ss;
13383 +
13384 + esp = (unsigned long) (&regs->esp);
13385 + savesegment(ss, ss);
13386 + if (user_mode_vm(regs)) {
13387 + in_kernel = 0;
13388 + esp = regs->esp;
13389 + ss = regs->xss & 0xffff;
13390 + }
13391 + print_modules();
13392 + printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n"
13393 + "EFLAGS: %08lx (%s %.*s) \n",
13394 + smp_processor_id(), 0xffff & regs->xcs, regs->eip,
13395 + print_tainted(), regs->eflags, system_utsname.release,
13396 + (int)strcspn(system_utsname.version, " "),
13397 + system_utsname.version);
13398 + print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
13399 + printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
13400 + regs->eax, regs->ebx, regs->ecx, regs->edx);
13401 + printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
13402 + regs->esi, regs->edi, regs->ebp, esp);
13403 + printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n",
13404 + regs->xds & 0xffff, regs->xes & 0xffff, ss);
13405 + printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
13406 + TASK_COMM_LEN, current->comm, current->pid,
13407 + current_thread_info(), current, current->thread_info);
13408 + /*
13409 + * When in-kernel, we also print out the stack and code at the
13410 + * time of the fault..
13411 + */
13412 + if (in_kernel) {
13413 + u8 __user *eip;
13414 +
13415 + printk("\n" KERN_EMERG "Stack: ");
13416 + show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
13417 +
13418 + printk(KERN_EMERG "Code: ");
13419 +
13420 + eip = (u8 __user *)regs->eip - 43;
13421 + for (i = 0; i < 64; i++, eip++) {
13422 + unsigned char c;
13423 +
13424 + if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
13425 + printk(" Bad EIP value.");
13426 + break;
13427 + }
13428 + if (eip == (u8 __user *)regs->eip)
13429 + printk("<%02x> ", c);
13430 + else
13431 + printk("%02x ", c);
13432 + }
13433 + }
13434 + printk("\n");
13435 +}
13436 +
13437 +static void handle_BUG(struct pt_regs *regs)
13438 +{
13439 + unsigned long eip = regs->eip;
13440 + unsigned short ud2;
13441 +
13442 + if (eip < PAGE_OFFSET)
13443 + return;
13444 + if (__get_user(ud2, (unsigned short __user *)eip))
13445 + return;
13446 + if (ud2 != 0x0b0f)
13447 + return;
13448 +
13449 + printk(KERN_EMERG "------------[ cut here ]------------\n");
13450 +
13451 +#ifdef CONFIG_DEBUG_BUGVERBOSE
13452 + do {
13453 + unsigned short line;
13454 + char *file;
13455 + char c;
13456 +
13457 + if (__get_user(line, (unsigned short __user *)(eip + 2)))
13458 + break;
13459 + if (__get_user(file, (char * __user *)(eip + 4)) ||
13460 + (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
13461 + file = "<bad filename>";
13462 +
13463 + printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
13464 + return;
13465 + } while (0);
13466 +#endif
13467 + printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n");
13468 +}
13469 +
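handle_BUG() recognises a BUG() site by the two-byte ud2 opcode (0x0f 0x0b, read above as the halfword 0x0b0f) and, under CONFIG_DEBUG_BUGVERBOSE, expects a 16-bit line number followed by a file-name pointer directly after it. The sketch below decodes the same layout from a hand-built buffer; the buffer contents and file name are invented for the illustration.

    #include <stdio.h>
    #include <string.h>

    /* Decode a BUG() site laid out as: ud2 (0x0f 0x0b), u16 line, char *file. */
    static void decode_bug(const unsigned char *site)
    {
        unsigned short line;
        const char *file;

        /* The kernel reads these two bytes as the halfword 0x0b0f, since
         * x86 is little-endian; comparing bytes keeps the sketch portable. */
        if (site[0] != 0x0f || site[1] != 0x0b) {
            puts("not a BUG() site");
            return;
        }
        memcpy(&line, site + 2, sizeof(line));
        memcpy(&file, site + 4, sizeof(file));
        printf("kernel BUG at %s:%u!\n", file, (unsigned)line);
    }

    int main(void)
    {
        const char *filename = "drivers/example.c";   /* hypothetical path */
        unsigned char site[2 + sizeof(unsigned short) + sizeof(const char *)];
        unsigned short line = 42;

        site[0] = 0x0f;
        site[1] = 0x0b;
        memcpy(site + 2, &line, sizeof(line));
        memcpy(site + 4, &filename, sizeof(filename));
        decode_bug(site);
        return 0;
    }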
13470 +/* This is gone through when something in the kernel
13471 + * has done something bad and is about to be terminated.
13472 +*/
13473 +void die(const char * str, struct pt_regs * regs, long err)
13474 +{
13475 + static struct {
13476 + spinlock_t lock;
13477 + u32 lock_owner;
13478 + int lock_owner_depth;
13479 + } die = {
13480 + .lock = SPIN_LOCK_UNLOCKED,
13481 + .lock_owner = -1,
13482 + .lock_owner_depth = 0
13483 + };
13484 + static int die_counter;
13485 + unsigned long flags;
13486 +
13487 + oops_enter();
13488 +
13489 + if (die.lock_owner != raw_smp_processor_id()) {
13490 + console_verbose();
13491 + spin_lock_irqsave(&die.lock, flags);
13492 + die.lock_owner = smp_processor_id();
13493 + die.lock_owner_depth = 0;
13494 + bust_spinlocks(1);
13495 + }
13496 + else
13497 + local_save_flags(flags);
13498 +
13499 + if (++die.lock_owner_depth < 3) {
13500 + int nl = 0;
13501 + unsigned long esp;
13502 + unsigned short ss;
13503 +
13504 + handle_BUG(regs);
13505 + printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
13506 +#ifdef CONFIG_PREEMPT
13507 + printk(KERN_EMERG "PREEMPT ");
13508 + nl = 1;
13509 +#endif
13510 +#ifdef CONFIG_SMP
13511 + if (!nl)
13512 + printk(KERN_EMERG);
13513 + printk("SMP ");
13514 + nl = 1;
13515 +#endif
13516 +#ifdef CONFIG_DEBUG_PAGEALLOC
13517 + if (!nl)
13518 + printk(KERN_EMERG);
13519 + printk("DEBUG_PAGEALLOC");
13520 + nl = 1;
13521 +#endif
13522 + if (nl)
13523 + printk("\n");
13524 + if (notify_die(DIE_OOPS, str, regs, err,
13525 + current->thread.trap_no, SIGSEGV) !=
13526 + NOTIFY_STOP) {
13527 + show_registers(regs);
13528 + /* Executive summary in case the oops scrolled away */
13529 + esp = (unsigned long) (&regs->esp);
13530 + savesegment(ss, ss);
13531 + if (user_mode(regs)) {
13532 + esp = regs->esp;
13533 + ss = regs->xss & 0xffff;
13534 + }
13535 + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
13536 + print_symbol("%s", regs->eip);
13537 + printk(" SS:ESP %04x:%08lx\n", ss, esp);
13538 + }
13539 + else
13540 + regs = NULL;
13541 + } else
13542 + printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
13543 +
13544 + bust_spinlocks(0);
13545 + die.lock_owner = -1;
13546 + spin_unlock_irqrestore(&die.lock, flags);
13547 +
13548 + if (!regs)
13549 + return;
13550 +
13551 + if (kexec_should_crash(current))
13552 + crash_kexec(regs);
13553 +
13554 + if (in_interrupt())
13555 + panic("Fatal exception in interrupt");
13556 +
13557 + if (panic_on_oops)
13558 + panic("Fatal exception");
13559 +
13560 + oops_exit();
13561 + do_exit(SIGSEGV);
13562 +}
13563 +
13564 +static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
13565 +{
13566 + if (!user_mode_vm(regs))
13567 + die(str, regs, err);
13568 +}
13569 +
13570 +static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
13571 + struct pt_regs * regs, long error_code,
13572 + siginfo_t *info)
13573 +{
13574 + struct task_struct *tsk = current;
13575 + tsk->thread.error_code = error_code;
13576 + tsk->thread.trap_no = trapnr;
13577 +
13578 + if (regs->eflags & VM_MASK) {
13579 + if (vm86)
13580 + goto vm86_trap;
13581 + goto trap_signal;
13582 + }
13583 +
13584 + if (!user_mode(regs))
13585 + goto kernel_trap;
13586 +
13587 + trap_signal: {
13588 + if (info)
13589 + force_sig_info(signr, info, tsk);
13590 + else
13591 + force_sig(signr, tsk);
13592 + return;
13593 + }
13594 +
13595 + kernel_trap: {
13596 + if (!fixup_exception(regs))
13597 + die(str, regs, error_code);
13598 + return;
13599 + }
13600 +
13601 + vm86_trap: {
13602 + int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
13603 + if (ret) goto trap_signal;
13604 + return;
13605 + }
13606 +}
13607 +
13608 +#define DO_ERROR(trapnr, signr, str, name) \
13609 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
13610 +{ \
13611 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
13612 + == NOTIFY_STOP) \
13613 + return; \
13614 + do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
13615 +}
13616 +
13617 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
13618 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
13619 +{ \
13620 + siginfo_t info; \
13621 + info.si_signo = signr; \
13622 + info.si_errno = 0; \
13623 + info.si_code = sicode; \
13624 + info.si_addr = (void __user *)siaddr; \
13625 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
13626 + == NOTIFY_STOP) \
13627 + return; \
13628 + do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
13629 +}
13630 +
13631 +#define DO_VM86_ERROR(trapnr, signr, str, name) \
13632 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
13633 +{ \
13634 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
13635 + == NOTIFY_STOP) \
13636 + return; \
13637 + do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
13638 +}
13639 +
13640 +#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
13641 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
13642 +{ \
13643 + siginfo_t info; \
13644 + info.si_signo = signr; \
13645 + info.si_errno = 0; \
13646 + info.si_code = sicode; \
13647 + info.si_addr = (void __user *)siaddr; \
13648 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
13649 + == NOTIFY_STOP) \
13650 + return; \
13651 + do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
13652 +}
13653 +
13654 +DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
13655 +#ifndef CONFIG_KPROBES
13656 +DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
13657 +#endif
13658 +DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
13659 +DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
13660 +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip)
13661 +DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
13662 +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
13663 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
13664 +DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
13665 +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
13666 +DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
13667 +
13668 +fastcall void __kprobes do_general_protection(struct pt_regs * regs,
13669 + long error_code)
13670 +{
13671 + current->thread.error_code = error_code;
13672 + current->thread.trap_no = 13;
13673 +
13674 + if (regs->eflags & VM_MASK)
13675 + goto gp_in_vm86;
13676 +
13677 + if (!user_mode(regs))
13678 + goto gp_in_kernel;
13679 +
13680 + current->thread.error_code = error_code;
13681 + current->thread.trap_no = 13;
13682 + force_sig(SIGSEGV, current);
13683 + return;
13684 +
13685 +gp_in_vm86:
13686 + local_irq_enable();
13687 + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
13688 + return;
13689 +
13690 +gp_in_kernel:
13691 + if (!fixup_exception(regs)) {
13692 + if (notify_die(DIE_GPF, "general protection fault", regs,
13693 + error_code, 13, SIGSEGV) == NOTIFY_STOP)
13694 + return;
13695 + die("general protection fault", regs, error_code);
13696 + }
13697 +}
13698 +
13699 +static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
13700 +{
13701 + printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying "
13702 + "to continue\n");
13703 + printk(KERN_EMERG "You probably have a hardware problem with your RAM "
13704 + "chips\n");
13705 +
13706 + /* Clear and disable the memory parity error line. */
13707 + clear_mem_error(reason);
13708 +}
13709 +
13710 +static void io_check_error(unsigned char reason, struct pt_regs * regs)
13711 +{
13712 + printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
13713 + show_registers(regs);
13714 +
13715 + /* Re-enable the IOCK line, wait for a few seconds */
13716 + clear_io_check_error(reason);
13717 +}
13718 +
13719 +static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
13720 +{
13721 +#ifdef CONFIG_MCA
13722 + /* Might actually be able to figure out what the guilty party
13723 + * is. */
13724 + if( MCA_bus ) {
13725 + mca_handle_nmi();
13726 + return;
13727 + }
13728 +#endif
13729 + printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
13730 + reason, smp_processor_id());
13731 + printk("Dazed and confused, but trying to continue\n");
13732 + printk("Do you have a strange power saving mode enabled?\n");
13733 +}
13734 +
13735 +static DEFINE_SPINLOCK(nmi_print_lock);
13736 +
13737 +void die_nmi (struct pt_regs *regs, const char *msg)
13738 +{
13739 + if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
13740 + NOTIFY_STOP)
13741 + return;
13742 +
13743 + spin_lock(&nmi_print_lock);
13744 + /*
13745 +	 * We are in trouble anyway, let's at least try
13746 + * to get a message out.
13747 + */
13748 + bust_spinlocks(1);
13749 + printk(KERN_EMERG "%s", msg);
13750 + printk(" on CPU%d, eip %08lx, registers:\n",
13751 + smp_processor_id(), regs->eip);
13752 + show_registers(regs);
13753 + printk(KERN_EMERG "console shuts up ...\n");
13754 + console_silent();
13755 + spin_unlock(&nmi_print_lock);
13756 + bust_spinlocks(0);
13757 +
13758 +	/* If we are in the kernel we are probably nested up pretty badly
13759 +	 * and might as well get out now while we still can.
13760 + */
13761 + if (!user_mode_vm(regs)) {
13762 + current->thread.trap_no = 2;
13763 + crash_kexec(regs);
13764 + }
13765 +
13766 + do_exit(SIGSEGV);
13767 +}
13768 +
13769 +static void default_do_nmi(struct pt_regs * regs)
13770 +{
13771 + unsigned char reason = 0;
13772 +
13773 + /* Only the BSP gets external NMIs from the system. */
13774 + if (!smp_processor_id())
13775 + reason = get_nmi_reason();
13776 +
13777 + if (!(reason & 0xc0)) {
13778 + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
13779 + == NOTIFY_STOP)
13780 + return;
13781 +#ifdef CONFIG_X86_LOCAL_APIC
13782 + /*
13783 + * Ok, so this is none of the documented NMI sources,
13784 + * so it must be the NMI watchdog.
13785 + */
13786 + if (nmi_watchdog) {
13787 + nmi_watchdog_tick(regs);
13788 + return;
13789 + }
13790 +#endif
13791 + unknown_nmi_error(reason, regs);
13792 + return;
13793 + }
13794 + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
13795 + return;
13796 + if (reason & 0x80)
13797 + mem_parity_error(reason, regs);
13798 + if (reason & 0x40)
13799 + io_check_error(reason, regs);
13800 + /*
13801 + * Reassert NMI in case it became active meanwhile
13802 + * as it's edge-triggered.
13803 + */
13804 + reassert_nmi();
13805 +}
13806 +
13807 +static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
13808 +{
13809 + return 0;
13810 +}
13811 +
13812 +static nmi_callback_t nmi_callback = dummy_nmi_callback;
13813 +
13814 +fastcall void do_nmi(struct pt_regs * regs, long error_code)
13815 +{
13816 + int cpu;
13817 +
13818 + nmi_enter();
13819 +
13820 + cpu = smp_processor_id();
13821 +
13822 + ++nmi_count(cpu);
13823 +
13824 + if (!rcu_dereference(nmi_callback)(regs, cpu))
13825 + default_do_nmi(regs);
13826 +
13827 + nmi_exit();
13828 +}
13829 +
13830 +void set_nmi_callback(nmi_callback_t callback)
13831 +{
13832 + vmalloc_sync_all();
13833 + rcu_assign_pointer(nmi_callback, callback);
13834 +}
13835 +EXPORT_SYMBOL_GPL(set_nmi_callback);
13836 +
13837 +void unset_nmi_callback(void)
13838 +{
13839 + nmi_callback = dummy_nmi_callback;
13840 +}
13841 +EXPORT_SYMBOL_GPL(unset_nmi_callback);
13842 +
13843 +#ifdef CONFIG_KPROBES
13844 +fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
13845 +{
13846 + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
13847 + == NOTIFY_STOP)
13848 + return;
13849 + /* This is an interrupt gate, because kprobes wants interrupts
13850 + disabled. Normal trap handlers don't. */
13851 + restore_interrupts(regs);
13852 + do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
13853 +}
13854 +#endif
13855 +
13856 +/*
13857 + * Our handling of the processor debug registers is non-trivial.
13858 + * We do not clear them on entry and exit from the kernel. Therefore
13859 + * it is possible to get a watchpoint trap here from inside the kernel.
13860 + * However, the code in ./ptrace.c has ensured that the user can
13861 + * only set watchpoints on userspace addresses. Therefore the in-kernel
13862 + * watchpoint trap can only occur in code which is reading/writing
13863 + * from user space. Such code must not hold kernel locks (since it
13864 + * can equally take a page fault), therefore it is safe to call
13865 + * force_sig_info even though that claims and releases locks.
13866 + *
13867 + * Code in ./signal.c ensures that the debug control register
13868 + * is restored before we deliver any signal, and therefore that
13869 + * user code runs with the correct debug control register even though
13870 + * we clear it here.
13871 + *
13872 + * Being careful here means that we don't have to be as careful in a
13873 + * lot of more complicated places (task switching can be a bit lazy
13874 + * about restoring all the debug state, and ptrace doesn't have to
13875 + * find every occurrence of the TF bit that could be saved away even
13876 + * by user code)
13877 + */
13878 +fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
13879 +{
13880 + unsigned int condition;
13881 + struct task_struct *tsk = current;
13882 +
13883 + get_debugreg(condition, 6);
13884 +
13885 + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
13886 + SIGTRAP) == NOTIFY_STOP)
13887 + return;
13888 + /* It's safe to allow irq's after DR6 has been saved */
13889 + if (regs->eflags & X86_EFLAGS_IF)
13890 + local_irq_enable();
13891 +
13892 + /* Mask out spurious debug traps due to lazy DR7 setting */
13893 + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
13894 + if (!tsk->thread.debugreg[7])
13895 + goto clear_dr7;
13896 + }
13897 +
13898 + if (regs->eflags & VM_MASK)
13899 + goto debug_vm86;
13900 +
13901 + /* Save debug status register where ptrace can see it */
13902 + tsk->thread.debugreg[6] = condition;
13903 +
13904 + /*
13905 + * Single-stepping through TF: make sure we ignore any events in
13906 + * kernel space (but re-enable TF when returning to user mode).
13907 + */
13908 + if (condition & DR_STEP) {
13909 + /*
13910 + * We already checked v86 mode above, so we can
13911 + * check for kernel mode by just checking the CPL
13912 + * of CS.
13913 + */
13914 + if (!user_mode(regs))
13915 + goto clear_TF_reenable;
13916 + }
13917 +
13918 + /* Ok, finally something we can handle */
13919 + send_sigtrap(tsk, regs, error_code);
13920 +
13921 + /* Disable additional traps. They'll be re-enabled when
13922 + * the signal is delivered.
13923 + */
13924 +clear_dr7:
13925 + set_debugreg(0, 7);
13926 + return;
13927 +
13928 +debug_vm86:
13929 + handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
13930 + return;
13931 +
13932 +clear_TF_reenable:
13933 + set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
13934 + regs->eflags &= ~TF_MASK;
13935 + return;
13936 +}
13937 +
13938 +/*
13939 + * Note that we play around with the 'TS' bit in an attempt to get
13940 + * the correct behaviour even in the presence of the asynchronous
13941 + * IRQ13 behaviour
13942 + */
13943 +void math_error(void __user *eip)
13944 +{
13945 + struct task_struct * task;
13946 + siginfo_t info;
13947 + unsigned short cwd, swd;
13948 +
13949 + /*
13950 + * Save the info for the exception handler and clear the error.
13951 + */
13952 + task = current;
13953 + save_init_fpu(task);
13954 + task->thread.trap_no = 16;
13955 + task->thread.error_code = 0;
13956 + info.si_signo = SIGFPE;
13957 + info.si_errno = 0;
13958 + info.si_code = __SI_FAULT;
13959 + info.si_addr = eip;
13960 + /*
13961 + * (~cwd & swd) will mask out exceptions that are not set to unmasked
13962 + * status. 0x3f is the exception bits in these regs, 0x200 is the
13963 + * C1 reg you need in case of a stack fault, 0x040 is the stack
13964 + * fault bit. We should only be taking one exception at a time,
13965 + * so if this combination doesn't produce any single exception,
13966 +	 * then we have a bad program that isn't synchronizing its FPU usage
13967 + * and it will suffer the consequences since we won't be able to
13968 + * fully reproduce the context of the exception
13969 + */
13970 + cwd = get_fpu_cwd(task);
13971 + swd = get_fpu_swd(task);
13972 + switch (swd & ~cwd & 0x3f) {
13973 + case 0x000: /* No unmasked exception */
13974 + return;
13975 + default: /* Multiple exceptions */
13976 + break;
13977 + case 0x001: /* Invalid Op */
13978 + /*
13979 + * swd & 0x240 == 0x040: Stack Underflow
13980 + * swd & 0x240 == 0x240: Stack Overflow
13981 + * User must clear the SF bit (0x40) if set
13982 + */
13983 + info.si_code = FPE_FLTINV;
13984 + break;
13985 + case 0x002: /* Denormalize */
13986 + case 0x010: /* Underflow */
13987 + info.si_code = FPE_FLTUND;
13988 + break;
13989 + case 0x004: /* Zero Divide */
13990 + info.si_code = FPE_FLTDIV;
13991 + break;
13992 + case 0x008: /* Overflow */
13993 + info.si_code = FPE_FLTOVF;
13994 + break;
13995 + case 0x020: /* Precision */
13996 + info.si_code = FPE_FLTRES;
13997 + break;
13998 + }
13999 + force_sig_info(SIGFPE, &info, task);
14000 +}
14001 +
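The long comment in math_error() explains how swd & ~cwd & 0x3f isolates the one unmasked x87 exception that was actually raised. A small sketch of that decode follows; the FPE_* names here are local stand-ins defined for the example, not the kernel's siginfo constants.

    #include <stdio.h>

    /* Local stand-ins for the siginfo codes used by the patch. */
    enum { FPE_NONE, FPE_FLTINV, FPE_FLTUND, FPE_FLTDIV, FPE_FLTOVF, FPE_FLTRES };

    /* cwd: control word (mask bits), swd: status word (exception bits). */
    static int decode_x87(unsigned short cwd, unsigned short swd)
    {
        switch (swd & ~cwd & 0x3f) {
        case 0x001: return FPE_FLTINV;    /* invalid operation */
        case 0x002:                       /* denormal          */
        case 0x010: return FPE_FLTUND;    /* underflow         */
        case 0x004: return FPE_FLTDIV;    /* divide by zero    */
        case 0x008: return FPE_FLTOVF;    /* overflow          */
        case 0x020: return FPE_FLTRES;    /* precision         */
        default:    return FPE_NONE;      /* none, or several at once */
        }
    }

    int main(void)
    {
        /* Control word 0x037b unmasks only divide-by-zero (bit 2 clear);
         * status word 0x0004 says that exception was raised. */
        printf("%d\n", decode_x87(0x037b, 0x0004));
        return 0;
    }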
14002 +fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
14003 +{
14004 + ignore_fpu_irq = 1;
14005 + math_error((void __user *)regs->eip);
14006 +}
14007 +
14008 +static void simd_math_error(void __user *eip)
14009 +{
14010 + struct task_struct * task;
14011 + siginfo_t info;
14012 + unsigned short mxcsr;
14013 +
14014 + /*
14015 + * Save the info for the exception handler and clear the error.
14016 + */
14017 + task = current;
14018 + save_init_fpu(task);
14019 + task->thread.trap_no = 19;
14020 + task->thread.error_code = 0;
14021 + info.si_signo = SIGFPE;
14022 + info.si_errno = 0;
14023 + info.si_code = __SI_FAULT;
14024 + info.si_addr = eip;
14025 + /*
14026 + * The SIMD FPU exceptions are handled a little differently, as there
14027 + * is only a single status/control register. Thus, to determine which
14028 + * unmasked exception was caught we must mask the exception mask bits
14029 + * at 0x1f80, and then use these to mask the exception bits at 0x3f.
14030 + */
14031 + mxcsr = get_fpu_mxcsr(task);
14032 + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
14033 + case 0x000:
14034 + default:
14035 + break;
14036 + case 0x001: /* Invalid Op */
14037 + info.si_code = FPE_FLTINV;
14038 + break;
14039 + case 0x002: /* Denormalize */
14040 + case 0x010: /* Underflow */
14041 + info.si_code = FPE_FLTUND;
14042 + break;
14043 + case 0x004: /* Zero Divide */
14044 + info.si_code = FPE_FLTDIV;
14045 + break;
14046 + case 0x008: /* Overflow */
14047 + info.si_code = FPE_FLTOVF;
14048 + break;
14049 + case 0x020: /* Precision */
14050 + info.si_code = FPE_FLTRES;
14051 + break;
14052 + }
14053 + force_sig_info(SIGFPE, &info, task);
14054 +}
14055 +
14056 +fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
14057 + long error_code)
14058 +{
14059 + if (cpu_has_xmm) {
14060 + /* Handle SIMD FPU exceptions on PIII+ processors. */
14061 + ignore_fpu_irq = 1;
14062 + simd_math_error((void __user *)regs->eip);
14063 + } else {
14064 + /*
14065 + * Handle strange cache flush from user space exception
14066 + * in all other cases. This is undocumented behaviour.
14067 + */
14068 + if (regs->eflags & VM_MASK) {
14069 + handle_vm86_fault((struct kernel_vm86_regs *)regs,
14070 + error_code);
14071 + return;
14072 + }
14073 + current->thread.trap_no = 19;
14074 + current->thread.error_code = error_code;
14075 + die_if_kernel("cache flush denied", regs, error_code);
14076 + force_sig(SIGSEGV, current);
14077 + }
14078 +}
14079 +
14080 +#ifndef CONFIG_XEN
14081 +fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
14082 + long error_code)
14083 +{
14084 +#if 0
14085 + /* No need to warn about this any longer. */
14086 + printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
14087 +#endif
14088 +}
14089 +
14090 +fastcall void setup_x86_bogus_stack(unsigned char * stk)
14091 +{
14092 + unsigned long *switch16_ptr, *switch32_ptr;
14093 + struct pt_regs *regs;
14094 + unsigned long stack_top, stack_bot;
14095 + unsigned short iret_frame16_off;
14096 + int cpu = smp_processor_id();
14097 + /* reserve the space on 32bit stack for the magic switch16 pointer */
14098 + memmove(stk, stk + 8, sizeof(struct pt_regs));
14099 + switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
14100 + regs = (struct pt_regs *)stk;
14101 + /* now the switch32 on 16bit stack */
14102 + stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
14103 + stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
14104 + switch32_ptr = (unsigned long *)(stack_top - 8);
14105 + iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
14106 + /* copy iret frame on 16bit stack */
14107 + memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
14108 + /* fill in the switch pointers */
14109 + switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
14110 + switch16_ptr[1] = __ESPFIX_SS;
14111 + switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
14112 + 8 - CPU_16BIT_STACK_SIZE;
14113 + switch32_ptr[1] = __KERNEL_DS;
14114 +}
14115 +
14116 +fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
14117 +{
14118 + unsigned long *switch32_ptr;
14119 + unsigned char *stack16, *stack32;
14120 + unsigned long stack_top, stack_bot;
14121 + int len;
14122 + int cpu = smp_processor_id();
14123 + stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
14124 + stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
14125 + switch32_ptr = (unsigned long *)(stack_top - 8);
14126 + /* copy the data from 16bit stack to 32bit stack */
14127 + len = CPU_16BIT_STACK_SIZE - 8 - sp;
14128 + stack16 = (unsigned char *)(stack_bot + sp);
14129 + stack32 = (unsigned char *)
14130 + (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
14131 + memcpy(stack32, stack16, len);
14132 + return stack32;
14133 +}
14134 +#endif
14135 +
14136 +/*
14137 + * 'math_state_restore()' saves the current math information in the
14138 + * old math state array, and gets the new ones from the current task
14139 + *
14140 + * Careful.. There are problems with IBM-designed IRQ13 behaviour.
14141 + * Don't touch unless you *really* know how it works.
14142 + *
14143 + * Must be called with kernel preemption disabled (in this case,
14144 + * local interrupts are disabled at the call-site in entry.S).
14145 + */
14146 +asmlinkage void math_state_restore(struct pt_regs regs)
14147 +{
14148 + struct thread_info *thread = current_thread_info();
14149 + struct task_struct *tsk = thread->task;
14150 +
14151 + /* NB. 'clts' is done for us by Xen during virtual trap. */
14152 + if (!tsk_used_math(tsk))
14153 + init_fpu(tsk);
14154 + restore_fpu(tsk);
14155 + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
14156 +}
14157 +
14158 +#ifndef CONFIG_MATH_EMULATION
14159 +
14160 +asmlinkage void math_emulate(long arg)
14161 +{
14162 + printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
14163 + printk(KERN_EMERG "killing %s.\n",current->comm);
14164 + force_sig(SIGFPE,current);
14165 + schedule();
14166 +}
14167 +
14168 +#endif /* CONFIG_MATH_EMULATION */
14169 +
14170 +#ifdef CONFIG_X86_F00F_BUG
14171 +void __init trap_init_f00f_bug(void)
14172 +{
14173 + __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
14174 +
14175 + /*
14176 + * Update the IDT descriptor and reload the IDT so that
14177 + * it uses the read-only mapped virtual address.
14178 + */
14179 + idt_descr.address = fix_to_virt(FIX_F00F_IDT);
14180 + load_idt(&idt_descr);
14181 +}
14182 +#endif
14183 +
14184 +
14185 +/*
14186 + * NB. All these are "trap gates" (i.e. events_mask isn't set) except
14187 + * for those that specify <dpl>|4 in the second field.
14188 + */
14189 +static trap_info_t __cpuinitdata trap_table[] = {
14190 + { 0, 0, __KERNEL_CS, (unsigned long)divide_error },
14191 + { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
14192 + { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
14193 + { 4, 3, __KERNEL_CS, (unsigned long)overflow },
14194 + { 5, 0, __KERNEL_CS, (unsigned long)bounds },
14195 + { 6, 0, __KERNEL_CS, (unsigned long)invalid_op },
14196 + { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available },
14197 + { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
14198 + { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS },
14199 + { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present },
14200 + { 12, 0, __KERNEL_CS, (unsigned long)stack_segment },
14201 + { 13, 0, __KERNEL_CS, (unsigned long)general_protection },
14202 + { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault },
14203 + { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment },
14204 + { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error },
14205 + { 17, 0, __KERNEL_CS, (unsigned long)alignment_check },
14206 +#ifdef CONFIG_X86_MCE
14207 + { 18, 0, __KERNEL_CS, (unsigned long)machine_check },
14208 +#endif
14209 + { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
14210 + { SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)system_call },
14211 + { 0, 0, 0, 0 }
14212 +};
14213 +
14214 +void __init trap_init(void)
14215 +{
14216 + int ret;
14217 +
14218 + ret = HYPERVISOR_set_trap_table(trap_table);
14219 + if (ret)
14220 + printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
14221 +
14222 + if (cpu_has_fxsr) {
14223 + /*
14224 + * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
14225 + * Generates a compile-time "error: zero width for bit-field" if
14226 + * the alignment is wrong.
14227 + */
14228 + struct fxsrAlignAssert {
14229 + int _:!(offsetof(struct task_struct,
14230 + thread.i387.fxsave) & 15);
14231 + };
14232 +
14233 + printk(KERN_INFO "Enabling fast FPU save and restore... ");
14234 + set_in_cr4(X86_CR4_OSFXSR);
14235 + printk("done.\n");
14236 + }
14237 + if (cpu_has_xmm) {
14238 + printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
14239 + "support... ");
14240 + set_in_cr4(X86_CR4_OSXMMEXCPT);
14241 + printk("done.\n");
14242 + }
14243 +
14244 + /*
14245 + * Should be a barrier for any external CPU state.
14246 + */
14247 + cpu_init();
14248 +}
14249 +
14250 +void __cpuinit smp_trap_init(trap_info_t *trap_ctxt)
14251 +{
14252 + const trap_info_t *t = trap_table;
14253 +
14254 + for (t = trap_table; t->address; t++) {
14255 + trap_ctxt[t->vector].flags = t->flags;
14256 + trap_ctxt[t->vector].cs = t->cs;
14257 + trap_ctxt[t->vector].address = t->address;
14258 + }
14259 +}
14260 +
14261 +static int __init kstack_setup(char *s)
14262 +{
14263 + kstack_depth_to_print = simple_strtoul(s, NULL, 0);
14264 + return 1;
14265 +}
14266 +__setup("kstack=", kstack_setup);
14267 +
14268 +#ifdef CONFIG_STACK_UNWIND
14269 +static int __init call_trace_setup(char *s)
14270 +{
14271 + if (strcmp(s, "old") == 0)
14272 + call_trace = -1;
14273 + else if (strcmp(s, "both") == 0)
14274 + call_trace = 0;
14275 + else if (strcmp(s, "newfallback") == 0)
14276 + call_trace = 1;
14277 +	else if (strcmp(s, "new") == 0)
14278 + call_trace = 2;
14279 + return 1;
14280 +}
14281 +__setup("call_trace=", call_trace_setup);
14282 +#endif
14283 Index: head-2008-11-25/arch/x86/mach-xen/Makefile
14284 ===================================================================
14285 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
14286 +++ head-2008-11-25/arch/x86/mach-xen/Makefile 2007-06-12 13:12:48.000000000 +0200
14287 @@ -0,0 +1,5 @@
14288 +#
14289 +# Makefile for the linux kernel.
14290 +#
14291 +
14292 +obj-y := setup.o
14293 Index: head-2008-11-25/arch/x86/mach-xen/setup.c
14294 ===================================================================
14295 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
14296 +++ head-2008-11-25/arch/x86/mach-xen/setup.c 2008-04-02 12:34:02.000000000 +0200
14297 @@ -0,0 +1,158 @@
14298 +/*
14299 + * Machine specific setup for generic
14300 + */
14301 +
14302 +#include <linux/mm.h>
14303 +#include <linux/smp.h>
14304 +#include <linux/init.h>
14305 +#include <linux/interrupt.h>
14306 +#include <linux/module.h>
14307 +#include <asm/acpi.h>
14308 +#include <asm/arch_hooks.h>
14309 +#include <asm/e820.h>
14310 +#include <asm/setup.h>
14311 +#include <asm/fixmap.h>
14312 +
14313 +#include <xen/interface/callback.h>
14314 +#include <xen/interface/memory.h>
14315 +
14316 +#ifdef CONFIG_HOTPLUG_CPU
14317 +#define DEFAULT_SEND_IPI (1)
14318 +#else
14319 +#define DEFAULT_SEND_IPI (0)
14320 +#endif
14321 +
14322 +int no_broadcast=DEFAULT_SEND_IPI;
14323 +
14324 +static __init int no_ipi_broadcast(char *str)
14325 +{
14326 + get_option(&str, &no_broadcast);
14327 + printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
14328 + "IPI Broadcast");
14329 + return 1;
14330 +}
14331 +
14332 +__setup("no_ipi_broadcast", no_ipi_broadcast);
14333 +
14334 +static int __init print_ipi_mode(void)
14335 +{
14336 + printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
14337 + "Shortcut");
14338 + return 0;
14339 +}
14340 +
14341 +late_initcall(print_ipi_mode);
14342 +
14343 +/**
14344 + * machine_specific_memory_setup - Hook for machine specific memory setup.
14345 + *
14346 + * Description:
14347 + * This is included late in kernel/setup.c so that it can make
14348 + * use of all of the static functions.
14349 + **/
14350 +
14351 +char * __init machine_specific_memory_setup(void)
14352 +{
14353 + int rc;
14354 + struct xen_memory_map memmap;
14355 + /*
14356 + * This is rather large for a stack variable but this early in
14357 +	 * the boot process we know we have plenty of slack space.
14358 + */
14359 + struct e820entry map[E820MAX];
14360 +
14361 + memmap.nr_entries = E820MAX;
14362 + set_xen_guest_handle(memmap.buffer, map);
14363 +
14364 + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
14365 + if ( rc == -ENOSYS ) {
14366 + memmap.nr_entries = 1;
14367 + map[0].addr = 0ULL;
14368 + map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
14369 + /* 8MB slack (to balance backend allocations). */
14370 + map[0].size += 8ULL << 20;
14371 + map[0].type = E820_RAM;
14372 + rc = 0;
14373 + }
14374 + BUG_ON(rc);
14375 +
14376 + sanitize_e820_map(map, (char *)&memmap.nr_entries);
14377 +
14378 + BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
14379 +
14380 + return "Xen";
14381 +}
14382 +
14383 +
14384 +extern void hypervisor_callback(void);
14385 +extern void failsafe_callback(void);
14386 +extern void nmi(void);
14387 +
14388 +unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
14389 +EXPORT_SYMBOL(machine_to_phys_mapping);
14390 +unsigned int machine_to_phys_order;
14391 +EXPORT_SYMBOL(machine_to_phys_order);
14392 +
14393 +void __init pre_setup_arch_hook(void)
14394 +{
14395 + struct xen_machphys_mapping mapping;
14396 + unsigned long machine_to_phys_nr_ents;
14397 + struct xen_platform_parameters pp;
14398 +
14399 + init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base;
14400 +
14401 + setup_xen_features();
14402 +
14403 + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
14404 + set_fixaddr_top(pp.virt_start);
14405 +
14406 + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
14407 + machine_to_phys_mapping = (unsigned long *)mapping.v_start;
14408 + machine_to_phys_nr_ents = mapping.max_mfn + 1;
14409 + } else
14410 + machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
14411 + machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
14412 +
14413 + if (!xen_feature(XENFEAT_auto_translated_physmap))
14414 + phys_to_machine_mapping =
14415 + (unsigned long *)xen_start_info->mfn_list;
14416 +}
14417 +
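machine_to_phys_order is sized above as fls(machine_to_phys_nr_ents - 1), i.e. the number of bits needed to index the machine-to-physical table (ceil(log2(n)) for n > 1). A quick user-space check of that identity follows; the loop-based fls() is a portable stand-in assumed for the sketch, not the kernel's implementation.

    #include <stdio.h>

    /* Portable stand-in for the kernel's fls(): 1-based index of the
     * highest set bit, with fls(0) == 0. */
    static int fls_portable(unsigned long x)
    {
        int r = 0;

        while (x) {
            x >>= 1;
            r++;
        }
        return r;
    }

    int main(void)
    {
        unsigned long nr_ents[] = { 1, 2, 3, 4, 1024, 1025 };
        unsigned int i;

        for (i = 0; i < sizeof(nr_ents) / sizeof(nr_ents[0]); i++)
            printf("nr_ents=%lu -> order=%d\n",
                   nr_ents[i], fls_portable(nr_ents[i] - 1));
        return 0;
    }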
14418 +void __init machine_specific_arch_setup(void)
14419 +{
14420 + int ret;
14421 + static struct callback_register __initdata event = {
14422 + .type = CALLBACKTYPE_event,
14423 + .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
14424 + };
14425 + static struct callback_register __initdata failsafe = {
14426 + .type = CALLBACKTYPE_failsafe,
14427 + .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
14428 + };
14429 + static struct callback_register __initdata nmi_cb = {
14430 + .type = CALLBACKTYPE_nmi,
14431 + .address = { __KERNEL_CS, (unsigned long)nmi },
14432 + };
14433 +
14434 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
14435 + if (ret == 0)
14436 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
14437 +#if CONFIG_XEN_COMPAT <= 0x030002
14438 + if (ret == -ENOSYS)
14439 + ret = HYPERVISOR_set_callbacks(
14440 + event.address.cs, event.address.eip,
14441 + failsafe.address.cs, failsafe.address.eip);
14442 +#endif
14443 + BUG_ON(ret);
14444 +
14445 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
14446 +#if CONFIG_XEN_COMPAT <= 0x030002
14447 + if (ret == -ENOSYS) {
14448 + static struct xennmi_callback __initdata cb = {
14449 + .handler_address = (unsigned long)nmi
14450 + };
14451 +
14452 + HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
14453 + }
14454 +#endif
14455 +}
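
machine_specific_arch_setup() above uses a common compatibility idiom: try the newer CALLBACKOP_register interface first and fall back to the legacy call only when the hypervisor returns -ENOSYS. A hedged, self-contained model of that idiom with stubbed-out hypercalls (the stub names and return values are invented):

#include <errno.h>
#include <stdio.h>

/* Stubs standing in for hypercalls; the return values are illustrative. */
static int new_interface(void)    { return -ENOSYS; } /* pre-3.0.3 hypervisor */
static int legacy_interface(void) { return 0; }

int main(void)
{
	int ret = new_interface();

	if (ret == -ENOSYS)                      /* old hypervisor: fall back */
		ret = legacy_interface();
	printf("callback registration %s\n", ret ? "failed" : "succeeded");
	return ret ? 1 : 0;
}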
14456 Index: head-2008-11-25/arch/x86/lib/scrub.c
14457 ===================================================================
14458 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
14459 +++ head-2008-11-25/arch/x86/lib/scrub.c 2008-02-08 12:30:51.000000000 +0100
14460 @@ -0,0 +1,21 @@
14461 +#include <asm/cpufeature.h>
14462 +#include <asm/page.h>
14463 +#include <asm/processor.h>
14464 +
14465 +void scrub_pages(void *v, unsigned int count)
14466 +{
14467 + if (likely(cpu_has_xmm2)) {
14468 + unsigned long n = count * (PAGE_SIZE / sizeof(long) / 4);
14469 +
14470 + for (; n--; v += sizeof(long) * 4)
14471 + asm("movnti %1,(%0)\n\t"
14472 + "movnti %1,%c2(%0)\n\t"
14473 + "movnti %1,2*%c2(%0)\n\t"
14474 + "movnti %1,3*%c2(%0)\n\t"
14475 + : : "r" (v), "r" (0L), "i" (sizeof(long))
14476 + : "memory");
14477 + asm volatile("sfence" : : : "memory");
14478 + } else
14479 + for (; count--; v += PAGE_SIZE)
14480 + clear_page(v);
14481 +}
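
The SSE2 branch of scrub_pages() above stores four longs per loop iteration with movnti, so the iteration count covers exactly count pages. A quick arithmetic check in portable C, assuming a 4096-byte page:

#include <stdio.h>

#define PAGE_SIZE 4096UL                         /* assumed x86 page size */

int main(void)
{
	unsigned long count = 3;                 /* hypothetical page count */
	unsigned long iters = count * (PAGE_SIZE / sizeof(long) / 4);
	unsigned long bytes = iters * sizeof(long) * 4;

	/* Each iteration writes 4 longs, so bytes == count * PAGE_SIZE. */
	printf("%lu iterations scrub %lu bytes (%lu pages)\n",
	       iters, bytes, bytes / PAGE_SIZE);
	return 0;
}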
14482 Index: head-2008-11-25/arch/x86/mm/fault_32-xen.c
14483 ===================================================================
14484 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
14485 +++ head-2008-11-25/arch/x86/mm/fault_32-xen.c 2007-12-10 08:47:31.000000000 +0100
14486 @@ -0,0 +1,779 @@
14487 +/*
14488 + * linux/arch/i386/mm/fault.c
14489 + *
14490 + * Copyright (C) 1995 Linus Torvalds
14491 + */
14492 +
14493 +#include <linux/signal.h>
14494 +#include <linux/sched.h>
14495 +#include <linux/kernel.h>
14496 +#include <linux/errno.h>
14497 +#include <linux/string.h>
14498 +#include <linux/types.h>
14499 +#include <linux/ptrace.h>
14500 +#include <linux/mman.h>
14501 +#include <linux/mm.h>
14502 +#include <linux/smp.h>
14503 +#include <linux/smp_lock.h>
14504 +#include <linux/interrupt.h>
14505 +#include <linux/init.h>
14506 +#include <linux/tty.h>
14507 +#include <linux/vt_kern.h> /* For unblank_screen() */
14508 +#include <linux/highmem.h>
14509 +#include <linux/module.h>
14510 +#include <linux/kprobes.h>
14511 +
14512 +#include <asm/system.h>
14513 +#include <asm/uaccess.h>
14514 +#include <asm/desc.h>
14515 +#include <asm/kdebug.h>
14516 +
14517 +extern void die(const char *,struct pt_regs *,long);
14518 +
14519 +#ifdef CONFIG_KPROBES
14520 +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
14521 +int register_page_fault_notifier(struct notifier_block *nb)
14522 +{
14523 + vmalloc_sync_all();
14524 + return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
14525 +}
14526 +
14527 +int unregister_page_fault_notifier(struct notifier_block *nb)
14528 +{
14529 + return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
14530 +}
14531 +
14532 +static inline int notify_page_fault(enum die_val val, const char *str,
14533 + struct pt_regs *regs, long err, int trap, int sig)
14534 +{
14535 + struct die_args args = {
14536 + .regs = regs,
14537 + .str = str,
14538 + .err = err,
14539 + .trapnr = trap,
14540 + .signr = sig
14541 + };
14542 + return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
14543 +}
14544 +#else
14545 +static inline int notify_page_fault(enum die_val val, const char *str,
14546 + struct pt_regs *regs, long err, int trap, int sig)
14547 +{
14548 + return NOTIFY_DONE;
14549 +}
14550 +#endif
14551 +
14552 +
14553 +/*
14554 + * Unlock any spinlocks which will prevent us from getting the
14555 + * message out
14556 + */
14557 +void bust_spinlocks(int yes)
14558 +{
14559 + int loglevel_save = console_loglevel;
14560 +
14561 + if (yes) {
14562 + oops_in_progress = 1;
14563 + return;
14564 + }
14565 +#ifdef CONFIG_VT
14566 + unblank_screen();
14567 +#endif
14568 + oops_in_progress = 0;
14569 + /*
14570 + * OK, the message is on the console. Now we call printk()
14571 + * without oops_in_progress set so that printk will give klogd
14572 + * a poke. Hold onto your hats...
14573 + */
14574 + console_loglevel = 15; /* NMI oopser may have shut the console up */
14575 + printk(" ");
14576 + console_loglevel = loglevel_save;
14577 +}
14578 +
14579 +/*
14580 + * Return EIP plus the CS segment base. The segment limit is also
14581 + * adjusted, clamped to the kernel/user address space (whichever is
14582 + * appropriate), and returned in *eip_limit.
14583 + *
14584 + * The segment is checked, because it might have been changed by another
14585 + * task between the original faulting instruction and here.
14586 + *
14587 + * If CS is no longer a valid code segment, or if EIP is beyond the
14588 + * limit, or if it is a kernel address when CS is not a kernel segment,
14589 + * then the returned value will be greater than *eip_limit.
14590 + *
14591 + * This is slow, but is very rarely executed.
14592 + */
14593 +static inline unsigned long get_segment_eip(struct pt_regs *regs,
14594 + unsigned long *eip_limit)
14595 +{
14596 + unsigned long eip = regs->eip;
14597 + unsigned seg = regs->xcs & 0xffff;
14598 + u32 seg_ar, seg_limit, base, *desc;
14599 +
14600 + /* Unlikely, but must come before segment checks. */
14601 + if (unlikely(regs->eflags & VM_MASK)) {
14602 + base = seg << 4;
14603 + *eip_limit = base + 0xffff;
14604 + return base + (eip & 0xffff);
14605 + }
14606 +
14607 + /* The standard kernel/user address space limit. */
14608 + *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
14609 +
14610 + /* By far the most common cases. */
14611 + if (likely(seg == __USER_CS || seg == GET_KERNEL_CS()))
14612 + return eip;
14613 +
14614 + /* Check the segment exists, is within the current LDT/GDT size,
14615 + that kernel/user (ring 0..3) has the appropriate privilege,
14616 + that it's a code segment, and get the limit. */
14617 + __asm__ ("larl %3,%0; lsll %3,%1"
14618 + : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
14619 + if ((~seg_ar & 0x9800) || eip > seg_limit) {
14620 + *eip_limit = 0;
14621 + return 1; /* So that returned eip > *eip_limit. */
14622 + }
14623 +
14624 + /* Get the GDT/LDT descriptor base.
14625 + When you look for races in this code remember that
14626 + LDT and other horrors are only used in user space. */
14627 + if (seg & (1<<2)) {
14628 + /* Must lock the LDT while reading it. */
14629 + down(&current->mm->context.sem);
14630 + desc = current->mm->context.ldt;
14631 + desc = (void *)desc + (seg & ~7);
14632 + } else {
14633 + /* Must disable preemption while reading the GDT. */
14634 + desc = (u32 *)get_cpu_gdt_table(get_cpu());
14635 + desc = (void *)desc + (seg & ~7);
14636 + }
14637 +
14638 + /* Decode the code segment base from the descriptor */
14639 + base = get_desc_base((unsigned long *)desc);
14640 +
14641 + if (seg & (1<<2)) {
14642 + up(&current->mm->context.sem);
14643 + } else
14644 + put_cpu();
14645 +
14646 + /* Adjust EIP and segment limit, and clamp at the kernel limit.
14647 + It's legitimate for segments to wrap at 0xffffffff. */
14648 + seg_limit += base;
14649 + if (seg_limit < *eip_limit && seg_limit >= base)
14650 + *eip_limit = seg_limit;
14651 + return eip + base;
14652 +}
14653 +
14654 +/*
14655 + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
14656 + * Check that here and ignore it.
14657 + */
14658 +static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
14659 +{
14660 + unsigned long limit;
14661 + unsigned long instr = get_segment_eip (regs, &limit);
14662 + int scan_more = 1;
14663 + int prefetch = 0;
14664 + int i;
14665 +
14666 + for (i = 0; scan_more && i < 15; i++) {
14667 + unsigned char opcode;
14668 + unsigned char instr_hi;
14669 + unsigned char instr_lo;
14670 +
14671 + if (instr > limit)
14672 + break;
14673 + if (__get_user(opcode, (unsigned char __user *) instr))
14674 + break;
14675 +
14676 + instr_hi = opcode & 0xf0;
14677 + instr_lo = opcode & 0x0f;
14678 + instr++;
14679 +
14680 + switch (instr_hi) {
14681 + case 0x20:
14682 + case 0x30:
14683 + /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
14684 + scan_more = ((instr_lo & 7) == 0x6);
14685 + break;
14686 +
14687 + case 0x60:
14688 + /* 0x64 thru 0x67 are valid prefixes in all modes. */
14689 + scan_more = (instr_lo & 0xC) == 0x4;
14690 + break;
14691 + case 0xF0:
14692 + /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
14693 + scan_more = !instr_lo || (instr_lo>>1) == 1;
14694 + break;
14695 + case 0x00:
14696 + /* Prefetch instruction is 0x0F0D or 0x0F18 */
14697 + scan_more = 0;
14698 + if (instr > limit)
14699 + break;
14700 + if (__get_user(opcode, (unsigned char __user *) instr))
14701 + break;
14702 + prefetch = (instr_lo == 0xF) &&
14703 + (opcode == 0x0D || opcode == 0x18);
14704 + break;
14705 + default:
14706 + scan_more = 0;
14707 + break;
14708 + }
14709 + }
14710 + return prefetch;
14711 +}
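
__is_prefetch() above walks the instruction stream at the faulting EIP, skipping legal prefix bytes until it finds (or rules out) the two-byte prefetch opcodes 0x0F 0x0D / 0x0F 0x18. A self-contained model of the same scan over an in-memory buffer; the sample bytes are an illustrative prefetchnta encoding:

#include <stdio.h>

/* Simplified model: returns 1 if the byte sequence is (optional prefixes
 * followed by) a prefetch opcode 0x0F 0x0D or 0x0F 0x18. */
static int is_prefetch_sketch(const unsigned char *instr, int len)
{
	int i, scan_more = 1, prefetch = 0;

	for (i = 0; scan_more && i < len && i < 15; i++) {
		unsigned char op = instr[i];
		unsigned char hi = op & 0xf0, lo = op & 0x0f;

		switch (hi) {
		case 0x20: case 0x30:            /* 0x26/0x2E/0x36/0x3E overrides */
			scan_more = ((lo & 7) == 0x6);
			break;
		case 0x60:                       /* 0x64..0x67 prefixes */
			scan_more = ((lo & 0xC) == 0x4);
			break;
		case 0xF0:                       /* lock / rep prefixes */
			scan_more = !lo || (lo >> 1) == 1;
			break;
		case 0x00:                       /* possible two-byte opcode */
			scan_more = 0;
			if (lo == 0xF && i + 1 < len)
				prefetch = (instr[i + 1] == 0x0D ||
					    instr[i + 1] == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

int main(void)
{
	const unsigned char insn[] = { 0x0F, 0x18, 0x00 };  /* prefetchnta (%eax) */

	printf("prefetch detected: %d\n", is_prefetch_sketch(insn, 3));
	return 0;
}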
14712 +
14713 +static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
14714 + unsigned long error_code)
14715 +{
14716 + if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
14717 + boot_cpu_data.x86 >= 6)) {
14718 + /* Catch an obscure case of prefetch inside an NX page. */
14719 + if (nx_enabled && (error_code & 16))
14720 + return 0;
14721 + return __is_prefetch(regs, addr);
14722 + }
14723 + return 0;
14724 +}
14725 +
14726 +static noinline void force_sig_info_fault(int si_signo, int si_code,
14727 + unsigned long address, struct task_struct *tsk)
14728 +{
14729 + siginfo_t info;
14730 +
14731 + info.si_signo = si_signo;
14732 + info.si_errno = 0;
14733 + info.si_code = si_code;
14734 + info.si_addr = (void __user *)address;
14735 + force_sig_info(si_signo, &info, tsk);
14736 +}
14737 +
14738 +fastcall void do_invalid_op(struct pt_regs *, unsigned long);
14739 +
14740 +#ifdef CONFIG_X86_PAE
14741 +static void dump_fault_path(unsigned long address)
14742 +{
14743 + unsigned long *p, page;
14744 + unsigned long mfn;
14745 +
14746 + page = read_cr3();
14747 + p = (unsigned long *)__va(page);
14748 + p += (address >> 30) * 2;
14749 + printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
14750 + if (p[0] & _PAGE_PRESENT) {
14751 + mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
14752 + page = mfn_to_pfn(mfn) << PAGE_SHIFT;
14753 + p = (unsigned long *)__va(page);
14754 + address &= 0x3fffffff;
14755 + p += (address >> 21) * 2;
14756 + printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n",
14757 + page, p[1], p[0]);
14758 + mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
14759 +#ifdef CONFIG_HIGHPTE
14760 + if (mfn_to_pfn(mfn) >= highstart_pfn)
14761 + return;
14762 +#endif
14763 + if (p[0] & _PAGE_PRESENT) {
14764 + page = mfn_to_pfn(mfn) << PAGE_SHIFT;
14765 + p = (unsigned long *) __va(page);
14766 + address &= 0x001fffff;
14767 + p += (address >> 12) * 2;
14768 + printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
14769 + page, p[1], p[0]);
14770 + }
14771 + }
14772 +}
14773 +#else
14774 +static void dump_fault_path(unsigned long address)
14775 +{
14776 + unsigned long page;
14777 +
14778 + page = read_cr3();
14779 + page = ((unsigned long *) __va(page))[address >> 22];
14780 + if (oops_may_print())
14781 + printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
14782 + machine_to_phys(page));
14783 + /*
14784 + * We must not directly access the pte in the highpte
14785 + * case if the page table is located in highmem.
14786 + * And let's rather not kmap-atomic the pte, just in case
14787 + * it's allocated already.
14788 + */
14789 +#ifdef CONFIG_HIGHPTE
14790 + if ((page >> PAGE_SHIFT) >= highstart_pfn)
14791 + return;
14792 +#endif
14793 + if ((page & 1) && oops_may_print()) {
14794 + page &= PAGE_MASK;
14795 + address &= 0x003ff000;
14796 + page = machine_to_phys(page);
14797 + page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
14798 + printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
14799 + machine_to_phys(page));
14800 + }
14801 +}
14802 +#endif
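
For the non-PAE dump_fault_path() variant just above, the fault address is split 10/10/12: the top ten bits index the pde, the next ten the pte. A tiny sketch of that index arithmetic with an invented fault address:

#include <stdio.h>

int main(void)
{
	unsigned long address = 0xc0123456UL;            /* hypothetical fault address */
	unsigned long pde_idx = address >> 22;                 /* top 10 bits */
	unsigned long pte_idx = (address & 0x003ff000) >> 12;  /* next 10 bits */

	printf("pde index %lu, pte index %lu\n", pde_idx, pte_idx);
	return 0;
}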
14803 +
14804 +static int spurious_fault(struct pt_regs *regs,
14805 + unsigned long address,
14806 + unsigned long error_code)
14807 +{
14808 + pgd_t *pgd;
14809 + pud_t *pud;
14810 + pmd_t *pmd;
14811 + pte_t *pte;
14812 +
14813 + /* Reserved-bit violation or user access to kernel space? */
14814 + if (error_code & 0x0c)
14815 + return 0;
14816 +
14817 + pgd = init_mm.pgd + pgd_index(address);
14818 + if (!pgd_present(*pgd))
14819 + return 0;
14820 +
14821 + pud = pud_offset(pgd, address);
14822 + if (!pud_present(*pud))
14823 + return 0;
14824 +
14825 + pmd = pmd_offset(pud, address);
14826 + if (!pmd_present(*pmd))
14827 + return 0;
14828 +
14829 + pte = pte_offset_kernel(pmd, address);
14830 + if (!pte_present(*pte))
14831 + return 0;
14832 + if ((error_code & 0x02) && !pte_write(*pte))
14833 + return 0;
14834 +#ifdef CONFIG_X86_PAE
14835 + if ((error_code & 0x10) && (__pte_val(*pte) & _PAGE_NX))
14836 + return 0;
14837 +#endif
14838 +
14839 + return 1;
14840 +}
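
spurious_fault() above and do_page_fault() further below both key off the hardware page-fault error code. A hedged decoder for those bits (bit 0 protection, bit 1 write, bit 2 user, bit 3 reserved bit, bit 4 instruction fetch); the sample codes are invented:

#include <stdio.h>

static void decode_fault(unsigned long error_code)
{
	printf("%s %s access from %s mode%s%s\n",
	       (error_code & 1) ? "protection" : "not-present",
	       (error_code & 2) ? "write" : "read",
	       (error_code & 4) ? "user" : "kernel",
	       (error_code & 8) ? ", reserved bit set" : "",
	       (error_code & 16) ? ", instruction fetch" : "");
}

int main(void)
{
	decode_fault(0x06);    /* user-mode write to a not-present page */
	decode_fault(0x11);    /* kernel instruction fetch, protection fault */
	return 0;
}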
14841 +
14842 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
14843 +{
14844 + unsigned index = pgd_index(address);
14845 + pgd_t *pgd_k;
14846 + pud_t *pud, *pud_k;
14847 + pmd_t *pmd, *pmd_k;
14848 +
14849 + pgd += index;
14850 + pgd_k = init_mm.pgd + index;
14851 +
14852 + if (!pgd_present(*pgd_k))
14853 + return NULL;
14854 +
14855 + /*
14856 + * set_pgd(pgd, *pgd_k); here would be useless on PAE
14857 + * and redundant with the set_pmd() on non-PAE. As would
14858 + * set_pud.
14859 + */
14860 +
14861 + pud = pud_offset(pgd, address);
14862 + pud_k = pud_offset(pgd_k, address);
14863 + if (!pud_present(*pud_k))
14864 + return NULL;
14865 +
14866 + pmd = pmd_offset(pud, address);
14867 + pmd_k = pmd_offset(pud_k, address);
14868 + if (!pmd_present(*pmd_k))
14869 + return NULL;
14870 + if (!pmd_present(*pmd))
14871 +#if CONFIG_XEN_COMPAT > 0x030002
14872 + set_pmd(pmd, *pmd_k);
14873 +#else
14874 + /*
14875 + * When running on older Xen we must launder *pmd_k through
14876 + * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
14877 + */
14878 + set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
14879 +#endif
14880 + else
14881 + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
14882 + return pmd_k;
14883 +}
14884 +
14885 +/*
14886 + * Handle a fault on the vmalloc or module mapping area
14887 + *
14888 + * This assumes no large pages in there.
14889 + */
14890 +static inline int vmalloc_fault(unsigned long address)
14891 +{
14892 + unsigned long pgd_paddr;
14893 + pmd_t *pmd_k;
14894 + pte_t *pte_k;
14895 + /*
14896 + * Synchronize this task's top level page-table
14897 + * with the 'reference' page table.
14898 + *
14899 + * Do _not_ use "current" here. We might be inside
14900 + * an interrupt in the middle of a task switch..
14901 + */
14902 + pgd_paddr = read_cr3();
14903 + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
14904 + if (!pmd_k)
14905 + return -1;
14906 + pte_k = pte_offset_kernel(pmd_k, address);
14907 + if (!pte_present(*pte_k))
14908 + return -1;
14909 + return 0;
14910 +}
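
vmalloc_fault() only copies a single top-level entry because the faulting task's pgd and init_mm.pgd are indexed by the same value. A minimal sketch of that index computation, assuming the non-PAE layout (PGDIR_SHIFT of 22) and an invented vmalloc address:

#include <stdio.h>

#define PGDIR_SHIFT 22                           /* assumed non-PAE i386 layout */
#define pgd_index(addr) ((addr) >> PGDIR_SHIFT)

int main(void)
{
	unsigned long vmalloc_addr = 0xe0800000UL;   /* hypothetical address */

	/* vmalloc_sync_one() copies exactly this one slot from init_mm.pgd. */
	printf("pgd slot %lu\n", pgd_index(vmalloc_addr));
	return 0;
}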
14911 +
14912 +/*
14913 + * This routine handles page faults. It determines the address,
14914 + * and the problem, and then passes it off to one of the appropriate
14915 + * routines.
14916 + *
14917 + * error_code:
14918 + * bit 0 == 0 means no page found, 1 means protection fault
14919 + * bit 1 == 0 means read, 1 means write
14920 + * bit 2 == 0 means kernel, 1 means user-mode
14921 + * bit 3 == 1 means use of reserved bit detected
14922 + * bit 4 == 1 means fault was an instruction fetch
14923 + */
14924 +fastcall void __kprobes do_page_fault(struct pt_regs *regs,
14925 + unsigned long error_code)
14926 +{
14927 + struct task_struct *tsk;
14928 + struct mm_struct *mm;
14929 + struct vm_area_struct * vma;
14930 + unsigned long address;
14931 + int write, si_code;
14932 +
14933 + /* get the address */
14934 + address = read_cr2();
14935 +
14936 + /* Set the "privileged fault" bit to something sane. */
14937 + error_code &= ~4;
14938 + error_code |= (regs->xcs & 2) << 1;
14939 + if (regs->eflags & X86_EFLAGS_VM)
14940 + error_code |= 4;
14941 +
14942 + tsk = current;
14943 +
14944 + si_code = SEGV_MAPERR;
14945 +
14946 + /*
14947 + * We fault-in kernel-space virtual memory on-demand. The
14948 + * 'reference' page table is init_mm.pgd.
14949 + *
14950 + * NOTE! We MUST NOT take any locks for this case. We may
14951 + * be in an interrupt or a critical region, and should
14952 + * only copy the information from the master page table,
14953 + * nothing more.
14954 + *
14955 + * This verifies that the fault happens in kernel space
14956 + * (error_code & 4) == 0, and that the fault was not a
14957 + * protection error (error_code & 9) == 0.
14958 + */
14959 + if (unlikely(address >= TASK_SIZE)) {
14960 +#ifdef CONFIG_XEN
14961 + /* Faults in hypervisor area can never be patched up. */
14962 + if (address >= hypervisor_virt_start)
14963 + goto bad_area_nosemaphore;
14964 +#endif
14965 + if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
14966 + return;
14967 + /* Can take a spurious fault if mapping changes R/O -> R/W. */
14968 + if (spurious_fault(regs, address, error_code))
14969 + return;
14970 + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
14971 + SIGSEGV) == NOTIFY_STOP)
14972 + return;
14973 + /*
14974 + * Don't take the mm semaphore here. If we fixup a prefetch
14975 + * fault we could otherwise deadlock.
14976 + */
14977 + goto bad_area_nosemaphore;
14978 + }
14979 +
14980 + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
14981 + SIGSEGV) == NOTIFY_STOP)
14982 + return;
14983 +
14984 + /* It's safe to allow irq's after cr2 has been saved and the vmalloc
14985 + fault has been handled. */
14986 + if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
14987 + local_irq_enable();
14988 +
14989 + mm = tsk->mm;
14990 +
14991 + /*
14992 + * If we're in an interrupt, have no user context or are running in an
14993 + * atomic region then we must not take the fault..
14994 + */
14995 + if (in_atomic() || !mm)
14996 + goto bad_area_nosemaphore;
14997 +
14998 + /* When running in the kernel we expect faults to occur only to
14999 + * addresses in user space. All other faults represent errors in the
15000 + * kernel and should generate an OOPS. Unfortunately, in the case of an
15001 + * erroneous fault occurring in a code path which already holds mmap_sem
15002 + * we will deadlock attempting to validate the fault against the
15003 + * address space. Luckily the kernel only validly references user
15004 + * space from well defined areas of code, which are listed in the
15005 + * exceptions table.
15006 + *
15007 + * As the vast majority of faults will be valid we will only perform
15008 + * the source reference check when there is a possibility of a deadlock.
15009 + * Attempt to lock the address space, if we cannot we then validate the
15010 + * source. If this is invalid we can skip the address space check,
15011 + * thus avoiding the deadlock.
15012 + */
15013 + if (!down_read_trylock(&mm->mmap_sem)) {
15014 + if ((error_code & 4) == 0 &&
15015 + !search_exception_tables(regs->eip))
15016 + goto bad_area_nosemaphore;
15017 + down_read(&mm->mmap_sem);
15018 + }
15019 +
15020 + vma = find_vma(mm, address);
15021 + if (!vma)
15022 + goto bad_area;
15023 + if (vma->vm_start <= address)
15024 + goto good_area;
15025 + if (!(vma->vm_flags & VM_GROWSDOWN))
15026 + goto bad_area;
15027 + if (error_code & 4) {
15028 + /*
15029 + * Accessing the stack below %esp is always a bug.
15030 + * The large cushion allows instructions like enter
15031 + * and pusha to work. ("enter $65535,$31" pushes
15032 + * 32 pointers and then decrements %esp by 65535.)
15033 + */
15034 + if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
15035 + goto bad_area;
15036 + }
15037 + if (expand_stack(vma, address))
15038 + goto bad_area;
15039 +/*
15040 + * Ok, we have a good vm_area for this memory access, so
15041 + * we can handle it..
15042 + */
15043 +good_area:
15044 + si_code = SEGV_ACCERR;
15045 + write = 0;
15046 + switch (error_code & 3) {
15047 + default: /* 3: write, present */
15048 +#ifdef TEST_VERIFY_AREA
15049 + if (regs->cs == GET_KERNEL_CS())
15050 + printk("WP fault at %08lx\n", regs->eip);
15051 +#endif
15052 + /* fall through */
15053 + case 2: /* write, not present */
15054 + if (!(vma->vm_flags & VM_WRITE))
15055 + goto bad_area;
15056 + write++;
15057 + break;
15058 + case 1: /* read, present */
15059 + goto bad_area;
15060 + case 0: /* read, not present */
15061 + if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
15062 + goto bad_area;
15063 + }
15064 +
15065 + survive:
15066 + /*
15067 + * If for any reason at all we couldn't handle the fault,
15068 + * make sure we exit gracefully rather than endlessly redo
15069 + * the fault.
15070 + */
15071 + switch (handle_mm_fault(mm, vma, address, write)) {
15072 + case VM_FAULT_MINOR:
15073 + tsk->min_flt++;
15074 + break;
15075 + case VM_FAULT_MAJOR:
15076 + tsk->maj_flt++;
15077 + break;
15078 + case VM_FAULT_SIGBUS:
15079 + goto do_sigbus;
15080 + case VM_FAULT_OOM:
15081 + goto out_of_memory;
15082 + default:
15083 + BUG();
15084 + }
15085 +
15086 + /*
15087 + * Did it hit the DOS screen memory VA from vm86 mode?
15088 + */
15089 + if (regs->eflags & VM_MASK) {
15090 + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
15091 + if (bit < 32)
15092 + tsk->thread.screen_bitmap |= 1 << bit;
15093 + }
15094 + up_read(&mm->mmap_sem);
15095 + return;
15096 +
15097 +/*
15098 + * Something tried to access memory that isn't in our memory map..
15099 + * Fix it, but check if it's kernel or user first..
15100 + */
15101 +bad_area:
15102 + up_read(&mm->mmap_sem);
15103 +
15104 +bad_area_nosemaphore:
15105 + /* User mode accesses just cause a SIGSEGV */
15106 + if (error_code & 4) {
15107 + /*
15108 + * Valid to do another page fault here because this one came
15109 + * from user space.
15110 + */
15111 + if (is_prefetch(regs, address, error_code))
15112 + return;
15113 +
15114 + tsk->thread.cr2 = address;
15115 + /* Kernel addresses are always protection faults */
15116 + tsk->thread.error_code = error_code | (address >= TASK_SIZE);
15117 + tsk->thread.trap_no = 14;
15118 + force_sig_info_fault(SIGSEGV, si_code, address, tsk);
15119 + return;
15120 + }
15121 +
15122 +#ifdef CONFIG_X86_F00F_BUG
15123 + /*
15124 + * Pentium F0 0F C7 C8 bug workaround.
15125 + */
15126 + if (boot_cpu_data.f00f_bug) {
15127 + unsigned long nr;
15128 +
15129 + nr = (address - idt_descr.address) >> 3;
15130 +
15131 + if (nr == 6) {
15132 + do_invalid_op(regs, 0);
15133 + return;
15134 + }
15135 + }
15136 +#endif
15137 +
15138 +no_context:
15139 + /* Are we prepared to handle this kernel fault? */
15140 + if (fixup_exception(regs))
15141 + return;
15142 +
15143 + /*
15144 + * Valid to do another page fault here, because if this fault
15145 + * had been triggered by is_prefetch fixup_exception would have
15146 + * handled it.
15147 + */
15148 + if (is_prefetch(regs, address, error_code))
15149 + return;
15150 +
15151 +/*
15152 + * Oops. The kernel tried to access some bad page. We'll have to
15153 + * terminate things with extreme prejudice.
15154 + */
15155 +
15156 + bust_spinlocks(1);
15157 +
15158 + if (oops_may_print()) {
15159 + #ifdef CONFIG_X86_PAE
15160 + if (error_code & 16) {
15161 + pte_t *pte = lookup_address(address);
15162 +
15163 + if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
15164 + printk(KERN_CRIT "kernel tried to execute "
15165 + "NX-protected page - exploit attempt? "
15166 + "(uid: %d)\n", current->uid);
15167 + }
15168 + #endif
15169 + if (address < PAGE_SIZE)
15170 + printk(KERN_ALERT "BUG: unable to handle kernel NULL "
15171 + "pointer dereference");
15172 + else
15173 + printk(KERN_ALERT "BUG: unable to handle kernel paging"
15174 + " request");
15175 + printk(" at virtual address %08lx\n",address);
15176 + printk(KERN_ALERT " printing eip:\n");
15177 + printk("%08lx\n", regs->eip);
15178 + }
15179 + dump_fault_path(address);
15180 + tsk->thread.cr2 = address;
15181 + tsk->thread.trap_no = 14;
15182 + tsk->thread.error_code = error_code;
15183 + die("Oops", regs, error_code);
15184 + bust_spinlocks(0);
15185 + do_exit(SIGKILL);
15186 +
15187 +/*
15188 + * We ran out of memory, or some other thing happened to us that made
15189 + * us unable to handle the page fault gracefully.
15190 + */
15191 +out_of_memory:
15192 + up_read(&mm->mmap_sem);
15193 + if (tsk->pid == 1) {
15194 + yield();
15195 + down_read(&mm->mmap_sem);
15196 + goto survive;
15197 + }
15198 + printk("VM: killing process %s\n", tsk->comm);
15199 + if (error_code & 4)
15200 + do_exit(SIGKILL);
15201 + goto no_context;
15202 +
15203 +do_sigbus:
15204 + up_read(&mm->mmap_sem);
15205 +
15206 + /* Kernel mode? Handle exceptions or die */
15207 + if (!(error_code & 4))
15208 + goto no_context;
15209 +
15210 + /* User space => ok to do another page fault */
15211 + if (is_prefetch(regs, address, error_code))
15212 + return;
15213 +
15214 + tsk->thread.cr2 = address;
15215 + tsk->thread.error_code = error_code;
15216 + tsk->thread.trap_no = 14;
15217 + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
15218 +}
15219 +
15220 +#if !HAVE_SHARED_KERNEL_PMD
15221 +void vmalloc_sync_all(void)
15222 +{
15223 + /*
15224 + * Note that races in the updates of insync and start aren't
15225 + * problematic: insync can only get set bits added, and updates to
15226 + * start are only improving performance (without affecting correctness
15227 + * if undone).
15228 + * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
15229 + * This change works just fine with 2-level paging too.
15230 + */
15231 +#define sync_index(a) ((a) >> PMD_SHIFT)
15232 + static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
15233 + static unsigned long start = TASK_SIZE;
15234 + unsigned long address;
15235 +
15236 + BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
15237 + for (address = start;
15238 + address >= TASK_SIZE && address < hypervisor_virt_start;
15239 + address += 1UL << PMD_SHIFT) {
15240 + if (!test_bit(sync_index(address), insync)) {
15241 + unsigned long flags;
15242 + struct page *page;
15243 +
15244 + spin_lock_irqsave(&pgd_lock, flags);
15245 + /* XEN: failure path assumes non-empty pgd_list. */
15246 + if (unlikely(!pgd_list)) {
15247 + spin_unlock_irqrestore(&pgd_lock, flags);
15248 + return;
15249 + }
15250 + for (page = pgd_list; page; page =
15251 + (struct page *)page->index)
15252 + if (!vmalloc_sync_one(page_address(page),
15253 + address)) {
15254 + BUG_ON(page != pgd_list);
15255 + break;
15256 + }
15257 + spin_unlock_irqrestore(&pgd_lock, flags);
15258 + if (!page)
15259 + set_bit(sync_index(address), insync);
15260 + }
15261 + if (address == start && test_bit(sync_index(address), insync))
15262 + start = address + (1UL << PMD_SHIFT);
15263 + }
15264 +}
15265 +#endif
15266 Index: head-2008-11-25/arch/x86/mm/highmem_32-xen.c
15267 ===================================================================
15268 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
15269 +++ head-2008-11-25/arch/x86/mm/highmem_32-xen.c 2008-10-29 09:55:56.000000000 +0100
15270 @@ -0,0 +1,183 @@
15271 +#include <linux/highmem.h>
15272 +#include <linux/module.h>
15273 +
15274 +void *kmap(struct page *page)
15275 +{
15276 + might_sleep();
15277 + if (!PageHighMem(page))
15278 + return page_address(page);
15279 + return kmap_high(page);
15280 +}
15281 +
15282 +void kunmap(struct page *page)
15283 +{
15284 + if (in_interrupt())
15285 + BUG();
15286 + if (!PageHighMem(page))
15287 + return;
15288 + kunmap_high(page);
15289 +}
15290 +
15291 +/*
15292 + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
15293 + * no global lock is needed and because the kmap code must perform a global TLB
15294 + * invalidation when the kmap pool wraps.
15295 + *
15296 + * However when holding an atomic kmap it is not legal to sleep, so atomic
15297 + * kmaps are appropriate for short, tight code paths only.
15298 + */
15299 +static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
15300 +{
15301 + enum fixed_addresses idx;
15302 + unsigned long vaddr;
15303 +
15304 + /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
15305 + inc_preempt_count();
15306 + if (!PageHighMem(page))
15307 + return page_address(page);
15308 +
15309 + idx = type + KM_TYPE_NR*smp_processor_id();
15310 + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
15311 +#ifdef CONFIG_DEBUG_HIGHMEM
15312 + if (!pte_none(*(kmap_pte-idx)))
15313 + BUG();
15314 +#endif
15315 + set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
15316 +
15317 + return (void*) vaddr;
15318 +}
15319 +
15320 +void *kmap_atomic(struct page *page, enum km_type type)
15321 +{
15322 + return __kmap_atomic(page, type, kmap_prot);
15323 +}
15324 +
15325 +/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
15326 +void *kmap_atomic_pte(struct page *page, enum km_type type)
15327 +{
15328 + return __kmap_atomic(page, type,
15329 + test_bit(PG_pinned, &page->flags)
15330 + ? PAGE_KERNEL_RO : kmap_prot);
15331 +}
15332 +
15333 +void kunmap_atomic(void *kvaddr, enum km_type type)
15334 +{
15335 +#if defined(CONFIG_DEBUG_HIGHMEM) || defined(CONFIG_XEN)
15336 + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
15337 + enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
15338 +
15339 + if (vaddr < FIXADDR_START) { // FIXME
15340 + dec_preempt_count();
15341 + preempt_check_resched();
15342 + return;
15343 + }
15344 +#endif
15345 +
15346 +#if defined(CONFIG_DEBUG_HIGHMEM)
15347 + if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
15348 + BUG();
15349 +
15350 + /*
15351 + * force other mappings to Oops if they try to access
15352 + * this pte without first remapping it
15353 + */
15354 + pte_clear(&init_mm, vaddr, kmap_pte-idx);
15355 + __flush_tlb_one(vaddr);
15356 +#elif defined(CONFIG_XEN)
15357 + /*
15358 + * We must ensure there are no dangling pagetable references when
15359 + * returning memory to Xen (decrease_reservation).
15360 + * XXX TODO: We could make this faster by only zapping when
15361 + * kmap_flush_unused is called but that is trickier and more invasive.
15362 + */
15363 + pte_clear(&init_mm, vaddr, kmap_pte-idx);
15364 +#endif
15365 +
15366 + dec_preempt_count();
15367 + preempt_check_resched();
15368 +}
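
__kmap_atomic() above picks a per-CPU fixmap slot from the kmap type and the CPU number. A self-contained sketch of that slot arithmetic; FIXADDR_TOP, KM_TYPE_NR and FIX_KMAP_BEGIN are illustrative stand-ins for the real constants:

#include <stdio.h>

#define FIXADDR_TOP    0xfffff000UL              /* illustrative value */
#define PAGE_SHIFT     12
#define __fix_to_virt(x) (FIXADDR_TOP - ((unsigned long)(x) << PAGE_SHIFT))
#define KM_TYPE_NR     14                        /* illustrative slots per CPU */
#define FIX_KMAP_BEGIN 32                        /* illustrative fixmap index */

int main(void)
{
	int type = 2, cpu = 1;                   /* hypothetical kmap slot/CPU */
	unsigned long idx = type + KM_TYPE_NR * cpu;
	unsigned long vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);

	/* Each CPU owns its own window of KM_TYPE_NR page-sized slots. */
	printf("cpu %d, type %d -> fixmap vaddr 0x%lx\n", cpu, type, vaddr);
	return 0;
}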
15369 +
15370 +/* This is the same as kmap_atomic() but can map memory that doesn't
15371 + * have a struct page associated with it.
15372 + */
15373 +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
15374 +{
15375 + enum fixed_addresses idx;
15376 + unsigned long vaddr;
15377 +
15378 + inc_preempt_count();
15379 +
15380 + idx = type + KM_TYPE_NR*smp_processor_id();
15381 + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
15382 + set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
15383 + __flush_tlb_one(vaddr);
15384 +
15385 + return (void*) vaddr;
15386 +}
15387 +
15388 +struct page *kmap_atomic_to_page(void *ptr)
15389 +{
15390 + unsigned long idx, vaddr = (unsigned long)ptr;
15391 + pte_t *pte;
15392 +
15393 + if (vaddr < FIXADDR_START)
15394 + return virt_to_page(ptr);
15395 +
15396 + idx = virt_to_fix(vaddr);
15397 + pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
15398 + return pte_page(*pte);
15399 +}
15400 +
15401 +void clear_highpage(struct page *page)
15402 +{
15403 + void *kaddr;
15404 +
15405 + if (likely(xen_feature(XENFEAT_highmem_assist))
15406 + && PageHighMem(page)) {
15407 + struct mmuext_op meo;
15408 +
15409 + meo.cmd = MMUEXT_CLEAR_PAGE;
15410 + meo.arg1.mfn = pfn_to_mfn(page_to_pfn(page));
15411 + if (HYPERVISOR_mmuext_op(&meo, 1, NULL, DOMID_SELF) == 0)
15412 + return;
15413 + }
15414 +
15415 + kaddr = kmap_atomic(page, KM_USER0);
15416 + clear_page(kaddr);
15417 + kunmap_atomic(kaddr, KM_USER0);
15418 +}
15419 +
15420 +void copy_highpage(struct page *to, struct page *from)
15421 +{
15422 + void *vfrom, *vto;
15423 +
15424 + if (likely(xen_feature(XENFEAT_highmem_assist))
15425 + && (PageHighMem(from) || PageHighMem(to))) {
15426 + unsigned long from_pfn = page_to_pfn(from);
15427 + unsigned long to_pfn = page_to_pfn(to);
15428 + struct mmuext_op meo;
15429 +
15430 + meo.cmd = MMUEXT_COPY_PAGE;
15431 + meo.arg1.mfn = pfn_to_mfn(to_pfn);
15432 + meo.arg2.src_mfn = pfn_to_mfn(from_pfn);
15433 + if (mfn_to_pfn(meo.arg2.src_mfn) == from_pfn
15434 + && mfn_to_pfn(meo.arg1.mfn) == to_pfn
15435 + && HYPERVISOR_mmuext_op(&meo, 1, NULL, DOMID_SELF) == 0)
15436 + return;
15437 + }
15438 +
15439 + vfrom = kmap_atomic(from, KM_USER0);
15440 + vto = kmap_atomic(to, KM_USER1);
15441 + copy_page(vto, vfrom);
15442 + kunmap_atomic(vfrom, KM_USER0);
15443 + kunmap_atomic(vto, KM_USER1);
15444 +}
15445 +
15446 +EXPORT_SYMBOL(kmap);
15447 +EXPORT_SYMBOL(kunmap);
15448 +EXPORT_SYMBOL(kmap_atomic);
15449 +EXPORT_SYMBOL(kmap_atomic_pte);
15450 +EXPORT_SYMBOL(kunmap_atomic);
15451 +EXPORT_SYMBOL(kmap_atomic_to_page);
15452 +EXPORT_SYMBOL(clear_highpage);
15453 +EXPORT_SYMBOL(copy_highpage);
15454 Index: head-2008-11-25/arch/x86/mm/hypervisor.c
15455 ===================================================================
15456 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
15457 +++ head-2008-11-25/arch/x86/mm/hypervisor.c 2008-10-29 09:55:56.000000000 +0100
15458 @@ -0,0 +1,547 @@
15459 +/******************************************************************************
15460 + * mm/hypervisor.c
15461 + *
15462 + * Update page tables via the hypervisor.
15463 + *
15464 + * Copyright (c) 2002-2004, K A Fraser
15465 + *
15466 + * This program is free software; you can redistribute it and/or
15467 + * modify it under the terms of the GNU General Public License version 2
15468 + * as published by the Free Software Foundation; or, when distributed
15469 + * separately from the Linux kernel or incorporated into other
15470 + * software packages, subject to the following license:
15471 + *
15472 + * Permission is hereby granted, free of charge, to any person obtaining a copy
15473 + * of this source file (the "Software"), to deal in the Software without
15474 + * restriction, including without limitation the rights to use, copy, modify,
15475 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
15476 + * and to permit persons to whom the Software is furnished to do so, subject to
15477 + * the following conditions:
15478 + *
15479 + * The above copyright notice and this permission notice shall be included in
15480 + * all copies or substantial portions of the Software.
15481 + *
15482 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15483 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15484 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15485 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15486 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
15487 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
15488 + * IN THE SOFTWARE.
15489 + */
15490 +
15491 +#include <linux/sched.h>
15492 +#include <linux/mm.h>
15493 +#include <linux/vmalloc.h>
15494 +#include <asm/page.h>
15495 +#include <asm/pgtable.h>
15496 +#include <asm/hypervisor.h>
15497 +#include <xen/balloon.h>
15498 +#include <xen/features.h>
15499 +#include <xen/interface/memory.h>
15500 +#include <linux/module.h>
15501 +#include <linux/percpu.h>
15502 +#include <asm/tlbflush.h>
15503 +#include <linux/highmem.h>
15504 +
15505 +void xen_l1_entry_update(pte_t *ptr, pte_t val)
15506 +{
15507 + mmu_update_t u;
15508 +#ifdef CONFIG_HIGHPTE
15509 + u.ptr = ((unsigned long)ptr >= (unsigned long)high_memory) ?
15510 + arbitrary_virt_to_machine(ptr) : virt_to_machine(ptr);
15511 +#else
15512 + u.ptr = virt_to_machine(ptr);
15513 +#endif
15514 + u.val = __pte_val(val);
15515 + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
15516 +}
15517 +EXPORT_SYMBOL_GPL(xen_l1_entry_update);
15518 +
15519 +void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
15520 +{
15521 + mmu_update_t u;
15522 + u.ptr = virt_to_machine(ptr);
15523 + u.val = __pmd_val(val);
15524 + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
15525 +}
15526 +
15527 +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
15528 +void xen_l3_entry_update(pud_t *ptr, pud_t val)
15529 +{
15530 + mmu_update_t u;
15531 + u.ptr = virt_to_machine(ptr);
15532 + u.val = __pud_val(val);
15533 + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
15534 +}
15535 +#endif
15536 +
15537 +#ifdef CONFIG_X86_64
15538 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
15539 +{
15540 + mmu_update_t u;
15541 + u.ptr = virt_to_machine(ptr);
15542 + u.val = __pgd_val(val);
15543 + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
15544 +}
15545 +#endif /* CONFIG_X86_64 */
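
Each of the xen_lN_entry_update() helpers above fills one (ptr, val) pair: ptr is the machine address of the page-table entry to change and val its new contents. A hedged user-space model of that request layout (the structure mirrors the public mmu_update interface, but the addresses and flag bits here are invented):

#include <stdio.h>
#include <stdint.h>

/* One request per page-table entry, as in the public Xen mmu_update ABI. */
typedef struct { uint64_t ptr; uint64_t val; } mmu_update_sketch_t;

int main(void)
{
	mmu_update_sketch_t u;

	u.ptr = 0x12345000ULL;                   /* machine address of the PTE */
	u.val = 0x67890000ULL | 0x67;            /* new PTE: frame | flag bits */
	printf("mmu_update: ptr=0x%llx val=0x%llx\n",
	       (unsigned long long)u.ptr, (unsigned long long)u.val);
	return 0;
}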
15546 +
15547 +void xen_pt_switch(unsigned long ptr)
15548 +{
15549 + struct mmuext_op op;
15550 + op.cmd = MMUEXT_NEW_BASEPTR;
15551 + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
15552 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15553 +}
15554 +
15555 +void xen_new_user_pt(unsigned long ptr)
15556 +{
15557 + struct mmuext_op op;
15558 + op.cmd = MMUEXT_NEW_USER_BASEPTR;
15559 + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
15560 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15561 +}
15562 +
15563 +void xen_tlb_flush(void)
15564 +{
15565 + struct mmuext_op op;
15566 + op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
15567 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15568 +}
15569 +EXPORT_SYMBOL(xen_tlb_flush);
15570 +
15571 +void xen_invlpg(unsigned long ptr)
15572 +{
15573 + struct mmuext_op op;
15574 + op.cmd = MMUEXT_INVLPG_LOCAL;
15575 + op.arg1.linear_addr = ptr & PAGE_MASK;
15576 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15577 +}
15578 +EXPORT_SYMBOL(xen_invlpg);
15579 +
15580 +#ifdef CONFIG_SMP
15581 +
15582 +void xen_tlb_flush_all(void)
15583 +{
15584 + struct mmuext_op op;
15585 + op.cmd = MMUEXT_TLB_FLUSH_ALL;
15586 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15587 +}
15588 +
15589 +void xen_tlb_flush_mask(cpumask_t *mask)
15590 +{
15591 + struct mmuext_op op;
15592 + if ( cpus_empty(*mask) )
15593 + return;
15594 + op.cmd = MMUEXT_TLB_FLUSH_MULTI;
15595 + set_xen_guest_handle(op.arg2.vcpumask, mask->bits);
15596 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15597 +}
15598 +
15599 +void xen_invlpg_all(unsigned long ptr)
15600 +{
15601 + struct mmuext_op op;
15602 + op.cmd = MMUEXT_INVLPG_ALL;
15603 + op.arg1.linear_addr = ptr & PAGE_MASK;
15604 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15605 +}
15606 +
15607 +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr)
15608 +{
15609 + struct mmuext_op op;
15610 + if ( cpus_empty(*mask) )
15611 + return;
15612 + op.cmd = MMUEXT_INVLPG_MULTI;
15613 + op.arg1.linear_addr = ptr & PAGE_MASK;
15614 + set_xen_guest_handle(op.arg2.vcpumask, mask->bits);
15615 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15616 +}
15617 +
15618 +#endif /* CONFIG_SMP */
15619 +
15620 +void xen_pgd_pin(unsigned long ptr)
15621 +{
15622 + struct mmuext_op op;
15623 +#ifdef CONFIG_X86_64
15624 + op.cmd = MMUEXT_PIN_L4_TABLE;
15625 +#elif defined(CONFIG_X86_PAE)
15626 + op.cmd = MMUEXT_PIN_L3_TABLE;
15627 +#else
15628 + op.cmd = MMUEXT_PIN_L2_TABLE;
15629 +#endif
15630 + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
15631 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15632 +}
15633 +
15634 +void xen_pgd_unpin(unsigned long ptr)
15635 +{
15636 + struct mmuext_op op;
15637 + op.cmd = MMUEXT_UNPIN_TABLE;
15638 + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
15639 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15640 +}
15641 +
15642 +void xen_set_ldt(const void *ptr, unsigned int ents)
15643 +{
15644 + struct mmuext_op op;
15645 + op.cmd = MMUEXT_SET_LDT;
15646 + op.arg1.linear_addr = (unsigned long)ptr;
15647 + op.arg2.nr_ents = ents;
15648 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15649 +}
15650 +
15651 +/* Protected by balloon_lock. */
15652 +#define MAX_CONTIG_ORDER 9 /* 2MB */
15653 +static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
15654 +static unsigned long limited_frames[1<<MAX_CONTIG_ORDER];
15655 +static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER];
15656 +
15657 +/* Ensure multi-page extents are contiguous in machine memory. */
15658 +int xen_create_contiguous_region(
15659 + unsigned long vstart, unsigned int order, unsigned int address_bits)
15660 +{
15661 + unsigned long *in_frames = discontig_frames, out_frame;
15662 + unsigned long frame, flags;
15663 + unsigned int i;
15664 + int rc, success;
15665 + struct xen_memory_exchange exchange = {
15666 + .in = {
15667 + .nr_extents = 1UL << order,
15668 + .extent_order = 0,
15669 + .domid = DOMID_SELF
15670 + },
15671 + .out = {
15672 + .nr_extents = 1,
15673 + .extent_order = order,
15674 + .address_bits = address_bits,
15675 + .domid = DOMID_SELF
15676 + }
15677 + };
15678 +
15679 + /*
15680 + * Currently an auto-translated guest will not perform I/O, nor will
15681 + * it require PAE page directories below 4GB. Therefore any calls to
15682 + * this function are redundant and can be ignored.
15683 + */
15684 + if (xen_feature(XENFEAT_auto_translated_physmap))
15685 + return 0;
15686 +
15687 + if (unlikely(order > MAX_CONTIG_ORDER))
15688 + return -ENOMEM;
15689 +
15690 + set_xen_guest_handle(exchange.in.extent_start, in_frames);
15691 + set_xen_guest_handle(exchange.out.extent_start, &out_frame);
15692 +
15693 + scrub_pages((void *)vstart, 1 << order);
15694 +
15695 + balloon_lock(flags);
15696 +
15697 + /* 1. Zap current PTEs, remembering MFNs. */
15698 + for (i = 0; i < (1U<<order); i++) {
15699 + in_frames[i] = pfn_to_mfn((__pa(vstart) >> PAGE_SHIFT) + i);
15700 + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
15701 + __pte_ma(0), 0);
15702 + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
15703 + INVALID_P2M_ENTRY);
15704 + }
15705 + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
15706 + BUG();
15707 +
15708 + /* 2. Get a new contiguous memory extent. */
15709 + out_frame = __pa(vstart) >> PAGE_SHIFT;
15710 + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
15711 + success = (exchange.nr_exchanged == (1UL << order));
15712 + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
15713 + BUG_ON(success && (rc != 0));
15714 +#if CONFIG_XEN_COMPAT <= 0x030002
15715 + if (unlikely(rc == -ENOSYS)) {
15716 + /* Compatibility when XENMEM_exchange is unsupported. */
15717 + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
15718 + &exchange.in) != (1UL << order))
15719 + BUG();
15720 + success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
15721 + &exchange.out) == 1);
15722 + if (!success) {
15723 + /* Couldn't get special memory: fall back to normal. */
15724 + for (i = 0; i < (1U<<order); i++)
15725 + in_frames[i] = (__pa(vstart)>>PAGE_SHIFT) + i;
15726 + if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
15727 + &exchange.in) != (1UL<<order))
15728 + BUG();
15729 + }
15730 + }
15731 +#endif
15732 +
15733 + /* 3. Map the new extent in place of old pages. */
15734 + for (i = 0; i < (1U<<order); i++) {
15735 + frame = success ? (out_frame + i) : in_frames[i];
15736 + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
15737 + pfn_pte_ma(frame, PAGE_KERNEL), 0);
15738 + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
15739 + }
15740 +
15741 + cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
15742 + ? UVMF_TLB_FLUSH|UVMF_ALL
15743 + : UVMF_INVLPG|UVMF_ALL;
15744 + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
15745 + BUG();
15746 +
15747 + balloon_unlock(flags);
15748 +
15749 + return success ? 0 : -ENOMEM;
15750 +}
15751 +EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
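
The exchange above trades 2^order single-page extents for one order-sized extent, so success is judged by nr_exchanged reaching 2^order. A small sketch of that sizing with an invented order:

#include <stdio.h>

int main(void)
{
	unsigned int order = 3;                      /* hypothetical 32 KB region */
	unsigned long in_extents  = 1UL << order;    /* hand back 8 single pages */
	unsigned long out_extents = 1;               /* receive 1 order-3 extent */

	printf("exchange: give %lu pages, expect %lu extent of order %u\n",
	       in_extents, out_extents, order);
	printf("success when nr_exchanged == %lu\n", in_extents);
	return 0;
}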
15752 +
15753 +void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
15754 +{
15755 + unsigned long *out_frames = discontig_frames, in_frame;
15756 + unsigned long frame, flags;
15757 + unsigned int i;
15758 + int rc, success;
15759 + struct xen_memory_exchange exchange = {
15760 + .in = {
15761 + .nr_extents = 1,
15762 + .extent_order = order,
15763 + .domid = DOMID_SELF
15764 + },
15765 + .out = {
15766 + .nr_extents = 1UL << order,
15767 + .extent_order = 0,
15768 + .domid = DOMID_SELF
15769 + }
15770 + };
15771 +
15772 + if (xen_feature(XENFEAT_auto_translated_physmap))
15773 + return;
15774 +
15775 + if (unlikely(order > MAX_CONTIG_ORDER))
15776 + return;
15777 +
15778 + set_xen_guest_handle(exchange.in.extent_start, &in_frame);
15779 + set_xen_guest_handle(exchange.out.extent_start, out_frames);
15780 +
15781 + scrub_pages((void *)vstart, 1 << order);
15782 +
15783 + balloon_lock(flags);
15784 +
15785 + /* 1. Find start MFN of contiguous extent. */
15786 + in_frame = pfn_to_mfn(__pa(vstart) >> PAGE_SHIFT);
15787 +
15788 + /* 2. Zap current PTEs. */
15789 + for (i = 0; i < (1U<<order); i++) {
15790 + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
15791 + __pte_ma(0), 0);
15792 + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
15793 + INVALID_P2M_ENTRY);
15794 + out_frames[i] = (__pa(vstart) >> PAGE_SHIFT) + i;
15795 + }
15796 + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
15797 + BUG();
15798 +
15799 + /* 3. Do the exchange for non-contiguous MFNs. */
15800 + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
15801 + success = (exchange.nr_exchanged == 1);
15802 + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
15803 + BUG_ON(success && (rc != 0));
15804 +#if CONFIG_XEN_COMPAT <= 0x030002
15805 + if (unlikely(rc == -ENOSYS)) {
15806 + /* Compatibility when XENMEM_exchange is unsupported. */
15807 + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
15808 + &exchange.in) != 1)
15809 + BUG();
15810 + if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
15811 + &exchange.out) != (1UL << order))
15812 + BUG();
15813 + success = 1;
15814 + }
15815 +#endif
15816 +
15817 + /* 4. Map new pages in place of old pages. */
15818 + for (i = 0; i < (1U<<order); i++) {
15819 + frame = success ? out_frames[i] : (in_frame + i);
15820 + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
15821 + pfn_pte_ma(frame, PAGE_KERNEL), 0);
15822 + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
15823 + }
15824 +
15825 + cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
15826 + ? UVMF_TLB_FLUSH|UVMF_ALL
15827 + : UVMF_INVLPG|UVMF_ALL;
15828 + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
15829 + BUG();
15830 +
15831 + balloon_unlock(flags);
15832 +}
15833 +EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
15834 +
15835 +int xen_limit_pages_to_max_mfn(
15836 + struct page *pages, unsigned int order, unsigned int address_bits)
15837 +{
15838 + unsigned long flags, frame;
15839 + unsigned long *in_frames = discontig_frames, *out_frames = limited_frames;
15840 + struct page *page;
15841 + unsigned int i, n, nr_mcl;
15842 + int rc, success;
15843 + DECLARE_BITMAP(limit_map, 1 << MAX_CONTIG_ORDER);
15844 +
15845 + struct xen_memory_exchange exchange = {
15846 + .in = {
15847 + .extent_order = 0,
15848 + .domid = DOMID_SELF
15849 + },
15850 + .out = {
15851 + .extent_order = 0,
15852 + .address_bits = address_bits,
15853 + .domid = DOMID_SELF
15854 + }
15855 + };
15856 +
15857 + if (xen_feature(XENFEAT_auto_translated_physmap))
15858 + return 0;
15859 +
15860 + if (unlikely(order > MAX_CONTIG_ORDER))
15861 + return -ENOMEM;
15862 +
15863 + bitmap_zero(limit_map, 1U << order);
15864 + set_xen_guest_handle(exchange.in.extent_start, in_frames);
15865 + set_xen_guest_handle(exchange.out.extent_start, out_frames);
15866 +
15867 + /* 0. Scrub the pages. */
15868 + for (i = 0, n = 0; i < 1U<<order ; i++) {
15869 + page = &pages[i];
15870 + if (!(pfn_to_mfn(page_to_pfn(page)) >> (address_bits - PAGE_SHIFT)))
15871 + continue;
15872 + __set_bit(i, limit_map);
15873 +
15874 + if (!PageHighMem(page))
15875 + scrub_pages(page_address(page), 1);
15876 +#ifdef CONFIG_XEN_SCRUB_PAGES
15877 + else {
15878 + scrub_pages(kmap(page), 1);
15879 + kunmap(page);
15880 + ++n;
15881 + }
15882 +#endif
15883 + }
15884 + if (bitmap_empty(limit_map, 1U << order))
15885 + return 0;
15886 +
15887 + if (n)
15888 + kmap_flush_unused();
15889 +
15890 + balloon_lock(flags);
15891 +
15892 + /* 1. Zap current PTEs (if any), remembering MFNs. */
15893 + for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
15894 + if(!test_bit(i, limit_map))
15895 + continue;
15896 + page = &pages[i];
15897 +
15898 + out_frames[n] = page_to_pfn(page);
15899 + in_frames[n] = pfn_to_mfn(out_frames[n]);
15900 +
15901 + if (!PageHighMem(page))
15902 + MULTI_update_va_mapping(cr_mcl + nr_mcl++,
15903 + (unsigned long)page_address(page),
15904 + __pte_ma(0), 0);
15905 +
15906 + set_phys_to_machine(out_frames[n], INVALID_P2M_ENTRY);
15907 + ++n;
15908 + }
15909 + if (nr_mcl && HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
15910 + BUG();
15911 +
15912 + /* 2. Get new memory below the required limit. */
15913 + exchange.in.nr_extents = n;
15914 + exchange.out.nr_extents = n;
15915 + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
15916 + success = (exchange.nr_exchanged == n);
15917 + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
15918 + BUG_ON(success && (rc != 0));
15919 +#if CONFIG_XEN_COMPAT <= 0x030002
15920 + if (unlikely(rc == -ENOSYS)) {
15921 + /* Compatibility when XENMEM_exchange is unsupported. */
15922 + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
15923 + &exchange.in) != n)
15924 + BUG();
15925 + if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
15926 + &exchange.out) != n)
15927 + BUG();
15928 + success = 1;
15929 + }
15930 +#endif
15931 +
15932 + /* 3. Map the new pages in place of old pages. */
15933 + for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
15934 + if(!test_bit(i, limit_map))
15935 + continue;
15936 + page = &pages[i];
15937 +
15938 + frame = success ? out_frames[n] : in_frames[n];
15939 +
15940 + if (!PageHighMem(page))
15941 + MULTI_update_va_mapping(cr_mcl + nr_mcl++,
15942 + (unsigned long)page_address(page),
15943 + pfn_pte_ma(frame, PAGE_KERNEL), 0);
15944 +
15945 + set_phys_to_machine(page_to_pfn(page), frame);
15946 + ++n;
15947 + }
15948 + if (nr_mcl) {
15949 + cr_mcl[nr_mcl - 1].args[MULTI_UVMFLAGS_INDEX] = order
15950 + ? UVMF_TLB_FLUSH|UVMF_ALL
15951 + : UVMF_INVLPG|UVMF_ALL;
15952 + if (HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
15953 + BUG();
15954 + }
15955 +
15956 + balloon_unlock(flags);
15957 +
15958 + return success ? 0 : -ENOMEM;
15959 +}
15960 +EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn);
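
xen_limit_pages_to_max_mfn() above only exchanges frames whose machine address would not fit under the requested address_bits. A hedged sketch of that test with an invented MFN and a 32-bit limit:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned int address_bits = 32;          /* e.g. a 32-bit DMA mask */
	unsigned long mfn = 0x123456UL;          /* hypothetical machine frame */

	/* Frames with bits set above (address_bits - PAGE_SHIFT) need replacing. */
	int needs_exchange = (mfn >> (address_bits - PAGE_SHIFT)) != 0;
	printf("mfn 0x%lx %s the %u-bit limit\n", mfn,
	       needs_exchange ? "exceeds" : "fits within", address_bits);
	return 0;
}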
15961 +
15962 +#ifdef __i386__
15963 +int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
15964 +{
15965 + __u32 *lp = (__u32 *)((char *)ldt + entry * 8);
15966 + maddr_t mach_lp = arbitrary_virt_to_machine(lp);
15967 + return HYPERVISOR_update_descriptor(
15968 + mach_lp, (u64)entry_a | ((u64)entry_b<<32));
15969 +}
15970 +#endif
15971 +
15972 +#define MAX_BATCHED_FULL_PTES 32
15973 +
15974 +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
15975 + unsigned long addr, unsigned long end, pgprot_t newprot)
15976 +{
15977 + int rc = 0, i = 0;
15978 + mmu_update_t u[MAX_BATCHED_FULL_PTES];
15979 + pte_t *pte;
15980 + spinlock_t *ptl;
15981 +
15982 + if (!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))
15983 + return 0;
15984 +
15985 + pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
15986 + do {
15987 + if (pte_present(*pte)) {
15988 + u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
15989 + | ((unsigned long)pte & ~PAGE_MASK)
15990 + | MMU_PT_UPDATE_PRESERVE_AD;
15991 + u[i].val = __pte_val(pte_modify(*pte, newprot));
15992 + if (++i == MAX_BATCHED_FULL_PTES) {
15993 + if ((rc = HYPERVISOR_mmu_update(
15994 + &u[0], i, NULL, DOMID_SELF)) != 0)
15995 + break;
15996 + i = 0;
15997 + }
15998 + }
15999 + } while (pte++, addr += PAGE_SIZE, addr != end);
16000 + if (i)
16001 + rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF);
16002 + pte_unmap_unlock(pte - 1, ptl);
16003 + BUG_ON(rc && rc != -ENOSYS);
16004 + return !rc;
16005 +}
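
xen_change_pte_range() above batches up to MAX_BATCHED_FULL_PTES updates per hypercall, flushing whenever the batch fills and once more for the partial tail. A self-contained model of that batching pattern; flush_batch() is a stub standing in for HYPERVISOR_mmu_update and the PTE count is invented:

#include <stdio.h>

#define MAX_BATCH 32

/* Stub standing in for the hypercall that applies a batch of updates. */
static int flush_batch(int n)
{
	printf("flushing %d updates\n", n);
	return 0;
}

int main(void)
{
	int i = 0, total = 70;                   /* hypothetical number of PTEs */

	for (int pte = 0; pte < total; pte++) {
		if (++i == MAX_BATCH) {          /* batch full: apply it now */
			flush_batch(i);
			i = 0;
		}
	}
	if (i)                                   /* apply the partial tail batch */
		flush_batch(i);
	return 0;
}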
16006 Index: head-2008-11-25/arch/x86/mm/init_32-xen.c
16007 ===================================================================
16008 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
16009 +++ head-2008-11-25/arch/x86/mm/init_32-xen.c 2008-10-29 09:55:56.000000000 +0100
16010 @@ -0,0 +1,840 @@
16011 +/*
16012 + * linux/arch/i386/mm/init.c
16013 + *
16014 + * Copyright (C) 1995 Linus Torvalds
16015 + *
16016 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
16017 + */
16018 +
16019 +#include <linux/module.h>
16020 +#include <linux/signal.h>
16021 +#include <linux/sched.h>
16022 +#include <linux/kernel.h>
16023 +#include <linux/errno.h>
16024 +#include <linux/string.h>
16025 +#include <linux/types.h>
16026 +#include <linux/ptrace.h>
16027 +#include <linux/mman.h>
16028 +#include <linux/mm.h>
16029 +#include <linux/hugetlb.h>
16030 +#include <linux/swap.h>
16031 +#include <linux/smp.h>
16032 +#include <linux/init.h>
16033 +#include <linux/highmem.h>
16034 +#include <linux/pagemap.h>
16035 +#include <linux/poison.h>
16036 +#include <linux/bootmem.h>
16037 +#include <linux/slab.h>
16038 +#include <linux/proc_fs.h>
16039 +#include <linux/efi.h>
16040 +#include <linux/memory_hotplug.h>
16041 +#include <linux/initrd.h>
16042 +#include <linux/cpumask.h>
16043 +#include <linux/dma-mapping.h>
16044 +#include <linux/scatterlist.h>
16045 +
16046 +#include <asm/processor.h>
16047 +#include <asm/system.h>
16048 +#include <asm/uaccess.h>
16049 +#include <asm/pgtable.h>
16050 +#include <asm/dma.h>
16051 +#include <asm/fixmap.h>
16052 +#include <asm/e820.h>
16053 +#include <asm/apic.h>
16054 +#include <asm/tlb.h>
16055 +#include <asm/tlbflush.h>
16056 +#include <asm/sections.h>
16057 +#include <asm/hypervisor.h>
16058 +#include <asm/swiotlb.h>
16059 +
16060 +unsigned int __VMALLOC_RESERVE = 128 << 20;
16061 +
16062 +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
16063 +unsigned long highstart_pfn, highend_pfn;
16064 +
16065 +static int noinline do_test_wp_bit(void);
16066 +
16067 +/*
16068 + * Creates a middle page table and puts a pointer to it in the
16069 + * given global directory entry. This only returns the gd entry
16070 + * in non-PAE compilation mode, since the middle layer is folded.
16071 + */
16072 +static pmd_t * __init one_md_table_init(pgd_t *pgd)
16073 +{
16074 + pud_t *pud;
16075 + pmd_t *pmd_table;
16076 +
16077 +#ifdef CONFIG_X86_PAE
16078 + pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
16079 + make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
16080 + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
16081 + pud = pud_offset(pgd, 0);
16082 + if (pmd_table != pmd_offset(pud, 0))
16083 + BUG();
16084 +#else
16085 + pud = pud_offset(pgd, 0);
16086 + pmd_table = pmd_offset(pud, 0);
16087 +#endif
16088 +
16089 + return pmd_table;
16090 +}
16091 +
16092 +/*
16093 + * Create a page table and place a pointer to it in a middle page
16094 + * directory entry.
16095 + */
16096 +static pte_t * __init one_page_table_init(pmd_t *pmd)
16097 +{
16098 + if (pmd_none(*pmd)) {
16099 + pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
16100 + make_lowmem_page_readonly(page_table,
16101 + XENFEAT_writable_page_tables);
16102 + set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
16103 + if (page_table != pte_offset_kernel(pmd, 0))
16104 + BUG();
16105 +
16106 + return page_table;
16107 + }
16108 +
16109 + return pte_offset_kernel(pmd, 0);
16110 +}
16111 +
16112 +/*
16113 + * This function initializes a certain range of kernel virtual memory
16114 + * with new bootmem page tables, wherever page tables are missing in
16115 + * the given range.
16116 + */
16117 +
16118 +/*
16119 + * NOTE: The pagetables are allocated contiguously in physical memory,
16120 + * so we can cache the place of the first one and move around without
16121 + * checking the pgd every time.
16122 + */
16123 +static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
16124 +{
16125 + pgd_t *pgd;
16126 + pud_t *pud;
16127 + pmd_t *pmd;
16128 + int pgd_idx, pmd_idx;
16129 + unsigned long vaddr;
16130 +
16131 + vaddr = start;
16132 + pgd_idx = pgd_index(vaddr);
16133 + pmd_idx = pmd_index(vaddr);
16134 + pgd = pgd_base + pgd_idx;
16135 +
16136 + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
16137 + if (pgd_none(*pgd))
16138 + one_md_table_init(pgd);
16139 + pud = pud_offset(pgd, vaddr);
16140 + pmd = pmd_offset(pud, vaddr);
16141 + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
16142 + if (vaddr < hypervisor_virt_start && pmd_none(*pmd))
16143 + one_page_table_init(pmd);
16144 +
16145 + vaddr += PMD_SIZE;
16146 + }
16147 + pmd_idx = 0;
16148 + }
16149 +}
16150 +
16151 +static inline int is_kernel_text(unsigned long addr)
16152 +{
16153 + if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
16154 + return 1;
16155 + return 0;
16156 +}
16157 +
16158 +/*
16159 + * This maps the physical memory to kernel virtual address space, a total
16160 + * of max_low_pfn pages, by creating page tables starting from address
16161 + * PAGE_OFFSET.
16162 + */
16163 +static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
16164 +{
16165 + unsigned long pfn;
16166 + pgd_t *pgd;
16167 + pmd_t *pmd;
16168 + pte_t *pte;
16169 + int pgd_idx, pmd_idx, pte_ofs;
16170 +
16171 + unsigned long max_ram_pfn = xen_start_info->nr_pages;
16172 + if (max_ram_pfn > max_low_pfn)
16173 + max_ram_pfn = max_low_pfn;
16174 +
16175 + pgd_idx = pgd_index(PAGE_OFFSET);
16176 + pgd = pgd_base + pgd_idx;
16177 + pfn = 0;
16178 + pmd_idx = pmd_index(PAGE_OFFSET);
16179 + pte_ofs = pte_index(PAGE_OFFSET);
16180 +
16181 + for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
16182 +#ifdef CONFIG_XEN
16183 + /*
16184 +		 * Native Linux does not have PAE paging enabled yet at
16185 +		 * this point.  When running as a Xen domain we are already
16186 +		 * in PAE mode, thus we can't simply hook in an empty
16187 +		 * pmd.  That would kill the mappings we are currently
16188 +		 * using ...
16189 + */
16190 + pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
16191 +#else
16192 + pmd = one_md_table_init(pgd);
16193 +#endif
16194 + if (pfn >= max_low_pfn)
16195 + continue;
16196 + pmd += pmd_idx;
16197 + for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
16198 + unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
16199 + if (address >= hypervisor_virt_start)
16200 + continue;
16201 +
16202 + /* Map with big pages if possible, otherwise create normal page tables. */
16203 + if (cpu_has_pse) {
16204 + unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
16205 +
16206 + if (is_kernel_text(address) || is_kernel_text(address2))
16207 + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
16208 + else
16209 + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
16210 + pfn += PTRS_PER_PTE;
16211 + } else {
16212 + pte = one_page_table_init(pmd);
16213 +
16214 + pte += pte_ofs;
16215 + for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
16216 + /* XEN: Only map initial RAM allocation. */
16217 + if ((pfn >= max_ram_pfn) || pte_present(*pte))
16218 + continue;
16219 + if (is_kernel_text(address))
16220 + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
16221 + else
16222 + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
16223 + }
16224 + pte_ofs = 0;
16225 + }
16226 + }
16227 + pmd_idx = 0;
16228 + }
16229 +}
16230 +
16231 +#ifndef CONFIG_XEN
16232 +
16233 +static inline int page_kills_ppro(unsigned long pagenr)
16234 +{
16235 + if (pagenr >= 0x70000 && pagenr <= 0x7003F)
16236 + return 1;
16237 + return 0;
16238 +}
16239 +
16240 +#else
16241 +
16242 +#define page_kills_ppro(p) 0
16243 +
16244 +#endif
16245 +
16246 +extern int is_available_memory(efi_memory_desc_t *);
16247 +
16248 +int page_is_ram(unsigned long pagenr)
16249 +{
16250 + int i;
16251 + unsigned long addr, end;
16252 +
16253 + if (efi_enabled) {
16254 + efi_memory_desc_t *md;
16255 + void *p;
16256 +
16257 + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
16258 + md = p;
16259 + if (!is_available_memory(md))
16260 + continue;
16261 + addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
16262 + end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
16263 +
16264 + if ((pagenr >= addr) && (pagenr < end))
16265 + return 1;
16266 + }
16267 + return 0;
16268 + }
16269 +
16270 + for (i = 0; i < e820.nr_map; i++) {
16271 +
16272 + if (e820.map[i].type != E820_RAM) /* not usable memory */
16273 + continue;
16274 + /*
16275 + * !!!FIXME!!! Some BIOSen report areas as RAM that
16276 +		 * are not, notably the 640k->1MB area. We need a sanity
16277 + * check here.
16278 + */
16279 + addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
16280 + end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
16281 + if ((pagenr >= addr) && (pagenr < end))
16282 + return 1;
16283 + }
16284 + return 0;
16285 +}
16286 +
16287 +#ifdef CONFIG_HIGHMEM
16288 +pte_t *kmap_pte;
16289 +pgprot_t kmap_prot;
16290 +
16291 +#define kmap_get_fixmap_pte(vaddr) \
16292 + pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
16293 +
16294 +static void __init kmap_init(void)
16295 +{
16296 + unsigned long kmap_vstart;
16297 +
16298 + /* cache the first kmap pte */
16299 + kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
16300 + kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
16301 +
16302 + kmap_prot = PAGE_KERNEL;
16303 +}
16304 +
16305 +static void __init permanent_kmaps_init(pgd_t *pgd_base)
16306 +{
16307 + pgd_t *pgd;
16308 + pud_t *pud;
16309 + pmd_t *pmd;
16310 + pte_t *pte;
16311 + unsigned long vaddr;
16312 +
16313 + vaddr = PKMAP_BASE;
16314 + page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
16315 +
16316 + pgd = swapper_pg_dir + pgd_index(vaddr);
16317 + pud = pud_offset(pgd, vaddr);
16318 + pmd = pmd_offset(pud, vaddr);
16319 + pte = pte_offset_kernel(pmd, vaddr);
16320 + pkmap_page_table = pte;
16321 +}
16322 +
16323 +static void __meminit free_new_highpage(struct page *page, int pfn)
16324 +{
16325 + init_page_count(page);
16326 + if (pfn < xen_start_info->nr_pages)
16327 + __free_page(page);
16328 + totalhigh_pages++;
16329 +}
16330 +
16331 +void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
16332 +{
16333 + if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
16334 + ClearPageReserved(page);
16335 + free_new_highpage(page, pfn);
16336 + } else
16337 + SetPageReserved(page);
16338 +}
16339 +
16340 +static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
16341 +{
16342 + free_new_highpage(page, pfn);
16343 + totalram_pages++;
16344 +#ifdef CONFIG_FLATMEM
16345 + max_mapnr = max(pfn, max_mapnr);
16346 +#endif
16347 + num_physpages++;
16348 + return 0;
16349 +}
16350 +
16351 +/*
16352 + * The NUMA case is not currently handled.
16353 + * We assume a single node, and that any memory
16354 + * that has been added dynamically and would be
16355 + * onlined here is in HIGHMEM.
16356 + */
16357 +void online_page(struct page *page)
16358 +{
16359 + ClearPageReserved(page);
16360 + add_one_highpage_hotplug(page, page_to_pfn(page));
16361 +}
16362 +
16363 +
16364 +#ifdef CONFIG_NUMA
16365 +extern void set_highmem_pages_init(int);
16366 +#else
16367 +static void __init set_highmem_pages_init(int bad_ppro)
16368 +{
16369 + int pfn;
16370 + for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
16371 + add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
16372 + totalram_pages += totalhigh_pages;
16373 +}
16374 +#endif /* CONFIG_NUMA */
16375 +
16376 +#else
16377 +#define kmap_init() do { } while (0)
16378 +#define permanent_kmaps_init(pgd_base) do { } while (0)
16379 +#define set_highmem_pages_init(bad_ppro) do { } while (0)
16380 +#endif /* CONFIG_HIGHMEM */
16381 +
16382 +unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
16383 +EXPORT_SYMBOL(__PAGE_KERNEL);
16384 +unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
16385 +
16386 +#ifdef CONFIG_NUMA
16387 +extern void __init remap_numa_kva(void);
16388 +#else
16389 +#define remap_numa_kva() do {} while (0)
16390 +#endif
16391 +
16392 +pgd_t *swapper_pg_dir;
16393 +
16394 +static void __init pagetable_init (void)
16395 +{
16396 + unsigned long vaddr;
16397 + pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
16398 +
16399 + /* Enable PSE if available */
16400 + if (cpu_has_pse) {
16401 + set_in_cr4(X86_CR4_PSE);
16402 + }
16403 +
16404 + /* Enable PGE if available */
16405 + if (cpu_has_pge) {
16406 + set_in_cr4(X86_CR4_PGE);
16407 + __PAGE_KERNEL |= _PAGE_GLOBAL;
16408 + __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
16409 + }
16410 +
16411 + kernel_physical_mapping_init(pgd_base);
16412 + remap_numa_kva();
16413 +
16414 + /*
16415 + * Fixed mappings, only the page table structure has to be
16416 + * created - mappings will be set by set_fixmap():
16417 + */
16418 + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
16419 + page_table_range_init(vaddr, hypervisor_virt_start, pgd_base);
16420 +
16421 + permanent_kmaps_init(pgd_base);
16422 +}
16423 +
16424 +#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP)
16425 +/*
16426 + * Swap suspend & friends need this for resume because things like the intel-agp
16427 + * driver might have split up a kernel 4MB mapping.
16428 + */
16429 +char __nosavedata swsusp_pg_dir[PAGE_SIZE]
16430 + __attribute__ ((aligned (PAGE_SIZE)));
16431 +
16432 +static inline void save_pg_dir(void)
16433 +{
16434 + memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
16435 +}
16436 +#else
16437 +static inline void save_pg_dir(void)
16438 +{
16439 +}
16440 +#endif
16441 +
16442 +void zap_low_mappings (void)
16443 +{
16444 + int i;
16445 +
16446 + save_pg_dir();
16447 +
16448 + /*
16449 + * Zap initial low-memory mappings.
16450 + *
16451 + * Note that "pgd_clear()" doesn't do it for
16452 + * us, because pgd_clear() is a no-op on i386.
16453 + */
16454 + for (i = 0; i < USER_PTRS_PER_PGD; i++)
16455 +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
16456 + set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
16457 +#else
16458 + set_pgd(swapper_pg_dir+i, __pgd(0));
16459 +#endif
16460 + flush_tlb_all();
16461 +}
16462 +
16463 +static int disable_nx __initdata = 0;
16464 +u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
16465 +EXPORT_SYMBOL(__supported_pte_mask);
16466 +
16467 +/*
16468 + * noexec = on|off
16469 + *
16470 + * Control non executable mappings.
16471 + *
16472 + * on Enable
16473 + * off Disable
16474 + */
16475 +void __init noexec_setup(const char *str)
16476 +{
16477 + if (!strncmp(str, "on",2) && cpu_has_nx) {
16478 + __supported_pte_mask |= _PAGE_NX;
16479 + disable_nx = 0;
16480 + } else if (!strncmp(str,"off",3)) {
16481 + disable_nx = 1;
16482 + __supported_pte_mask &= ~_PAGE_NX;
16483 + }
16484 +}
16485 +
16486 +int nx_enabled = 0;
16487 +#ifdef CONFIG_X86_PAE
16488 +
16489 +static void __init set_nx(void)
16490 +{
16491 + unsigned int v[4], l, h;
16492 +
16493 + if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
16494 + cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
16495 + if ((v[3] & (1 << 20)) && !disable_nx) {
16496 + rdmsr(MSR_EFER, l, h);
16497 + l |= EFER_NX;
16498 + wrmsr(MSR_EFER, l, h);
16499 + nx_enabled = 1;
16500 + __supported_pte_mask |= _PAGE_NX;
16501 + }
16502 + }
16503 +}
16504 +
16505 +/*
16506 + * Enables/disables executability of a given kernel page and
16507 + * returns the previous setting.
16508 + */
16509 +int __init set_kernel_exec(unsigned long vaddr, int enable)
16510 +{
16511 + pte_t *pte;
16512 + int ret = 1;
16513 +
16514 + if (!nx_enabled)
16515 + goto out;
16516 +
16517 + pte = lookup_address(vaddr);
16518 + BUG_ON(!pte);
16519 +
16520 + if (!pte_exec_kernel(*pte))
16521 + ret = 0;
16522 +
16523 + if (enable)
16524 + pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
16525 + else
16526 + pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
16527 + __flush_tlb_all();
16528 +out:
16529 + return ret;
16530 +}
16531 +
16532 +#endif
16533 +
16534 +/*
16535 + * paging_init() sets up the page tables - note that the first 8MB are
16536 + * already mapped by head.S.
16537 + *
16538 + * This routine also unmaps the page at virtual kernel address 0, so
16539 + * that we can trap those pesky NULL-reference errors in the kernel.
16540 + */
16541 +void __init paging_init(void)
16542 +{
16543 + int i;
16544 +
16545 +#ifdef CONFIG_X86_PAE
16546 + set_nx();
16547 + if (nx_enabled)
16548 + printk("NX (Execute Disable) protection: active\n");
16549 +#endif
16550 +
16551 + pagetable_init();
16552 +
16553 +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
16554 + /*
16555 + * We will bail out later - printk doesn't work right now so
16556 + * the user would just see a hanging kernel.
16557 +	 * When running as a Xen domain we are already in PAE mode at
16558 + * this point.
16559 + */
16560 + if (cpu_has_pae)
16561 + set_in_cr4(X86_CR4_PAE);
16562 +#endif
16563 + __flush_tlb_all();
16564 +
16565 + kmap_init();
16566 +
16567 + /* Switch to the real shared_info page, and clear the
16568 + * dummy page. */
16569 + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
16570 + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
16571 + memset(empty_zero_page, 0, sizeof(empty_zero_page));
16572 +
16573 +	/* Set up the mapping of the lowest 1 MB */
16574 + for (i = 0; i < NR_FIX_ISAMAPS; i++)
16575 + if (is_initial_xendomain())
16576 + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
16577 + else
16578 + __set_fixmap(FIX_ISAMAP_BEGIN - i,
16579 + virt_to_machine(empty_zero_page),
16580 + PAGE_KERNEL_RO);
16581 +}
16582 +
16583 +/*
16584 + * Test if the WP bit works in supervisor mode. It isn't supported on 386's
16585 + * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
16586 + * used to involve black magic jumps to work around some nasty CPU bugs,
16587 + * but fortunately the switch to using exceptions got rid of all that.
16588 + */
16589 +
16590 +static void __init test_wp_bit(void)
16591 +{
16592 + printk("Checking if this processor honours the WP bit even in supervisor mode... ");
16593 +
16594 + /* Any page-aligned address will do, the test is non-destructive */
16595 + __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
16596 + boot_cpu_data.wp_works_ok = do_test_wp_bit();
16597 + clear_fixmap(FIX_WP_TEST);
16598 +
16599 + if (!boot_cpu_data.wp_works_ok) {
16600 + printk("No.\n");
16601 +#ifdef CONFIG_X86_WP_WORKS_OK
16602 + panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
16603 +#endif
16604 + } else {
16605 + printk("Ok.\n");
16606 + }
16607 +}
16608 +
16609 +static void __init set_max_mapnr_init(void)
16610 +{
16611 +#ifdef CONFIG_HIGHMEM
16612 + num_physpages = highend_pfn;
16613 +#else
16614 + num_physpages = max_low_pfn;
16615 +#endif
16616 +#ifdef CONFIG_FLATMEM
16617 + max_mapnr = num_physpages;
16618 +#endif
16619 +}
16620 +
16621 +static struct kcore_list kcore_mem, kcore_vmalloc;
16622 +
16623 +void __init mem_init(void)
16624 +{
16625 + extern int ppro_with_ram_bug(void);
16626 + int codesize, reservedpages, datasize, initsize;
16627 + int tmp;
16628 + int bad_ppro;
16629 + unsigned long pfn;
16630 +
16631 +#if defined(CONFIG_SWIOTLB)
16632 + swiotlb_init();
16633 +#endif
16634 +
16635 +#ifdef CONFIG_FLATMEM
16636 + if (!mem_map)
16637 + BUG();
16638 +#endif
16639 +
16640 + bad_ppro = ppro_with_ram_bug();
16641 +
16642 +#ifdef CONFIG_HIGHMEM
16643 + /* check that fixmap and pkmap do not overlap */
16644 + if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
16645 + printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
16646 + printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
16647 + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
16648 + BUG();
16649 + }
16650 +#endif
16651 +
16652 + set_max_mapnr_init();
16653 +
16654 +#ifdef CONFIG_HIGHMEM
16655 + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
16656 +#else
16657 + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
16658 +#endif
16659 + printk("vmalloc area: %lx-%lx, maxmem %lx\n",
16660 + VMALLOC_START,VMALLOC_END,MAXMEM);
16661 + BUG_ON(VMALLOC_START > VMALLOC_END);
16662 +
16663 + /* this will put all low memory onto the freelists */
16664 + totalram_pages += free_all_bootmem();
16665 + /* XEN: init and count low-mem pages outside initial allocation. */
16666 + for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) {
16667 + ClearPageReserved(pfn_to_page(pfn));
16668 + init_page_count(pfn_to_page(pfn));
16669 + totalram_pages++;
16670 + }
16671 +
16672 + reservedpages = 0;
16673 + for (tmp = 0; tmp < max_low_pfn; tmp++)
16674 + /*
16675 + * Only count reserved RAM pages
16676 + */
16677 + if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
16678 + reservedpages++;
16679 +
16680 + set_highmem_pages_init(bad_ppro);
16681 +
16682 + codesize = (unsigned long) &_etext - (unsigned long) &_text;
16683 + datasize = (unsigned long) &_edata - (unsigned long) &_etext;
16684 + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
16685 +
16686 + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
16687 + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
16688 + VMALLOC_END-VMALLOC_START);
16689 +
16690 + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
16691 + (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
16692 + num_physpages << (PAGE_SHIFT-10),
16693 + codesize >> 10,
16694 + reservedpages << (PAGE_SHIFT-10),
16695 + datasize >> 10,
16696 + initsize >> 10,
16697 + (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
16698 + );
16699 +
16700 +#ifdef CONFIG_X86_PAE
16701 + if (!cpu_has_pae)
16702 + panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
16703 +#endif
16704 + if (boot_cpu_data.wp_works_ok < 0)
16705 + test_wp_bit();
16706 +
16707 + /*
16708 +	 * Subtle. SMP is doing its boot stuff late (because it has to
16709 + * fork idle threads) - but it also needs low mappings for the
16710 + * protected-mode entry to work. We zap these entries only after
16711 + * the WP-bit has been tested.
16712 + */
16713 +#ifndef CONFIG_SMP
16714 + zap_low_mappings();
16715 +#endif
16716 +
16717 + set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags);
16718 +}
16719 +
16720 +/*
16721 + * This is for the non-NUMA, single-node SMP system case.
16722 + * Specifically, in the case of x86, we will always add
16723 + * memory to highmem for now.
16724 + */
16725 +#ifdef CONFIG_MEMORY_HOTPLUG
16726 +#ifndef CONFIG_NEED_MULTIPLE_NODES
16727 +int arch_add_memory(int nid, u64 start, u64 size)
16728 +{
16729 + struct pglist_data *pgdata = &contig_page_data;
16730 + struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
16731 + unsigned long start_pfn = start >> PAGE_SHIFT;
16732 + unsigned long nr_pages = size >> PAGE_SHIFT;
16733 +
16734 + return __add_pages(zone, start_pfn, nr_pages);
16735 +}
16736 +
16737 +int remove_memory(u64 start, u64 size)
16738 +{
16739 + return -EINVAL;
16740 +}
16741 +#endif
16742 +#endif
16743 +
16744 +kmem_cache_t *pgd_cache;
16745 +kmem_cache_t *pmd_cache;
16746 +
16747 +void __init pgtable_cache_init(void)
16748 +{
16749 + if (PTRS_PER_PMD > 1) {
16750 + pmd_cache = kmem_cache_create("pmd",
16751 + PTRS_PER_PMD*sizeof(pmd_t),
16752 + PTRS_PER_PMD*sizeof(pmd_t),
16753 + 0,
16754 + pmd_ctor,
16755 + NULL);
16756 + if (!pmd_cache)
16757 + panic("pgtable_cache_init(): cannot create pmd cache");
16758 + }
16759 + pgd_cache = kmem_cache_create("pgd",
16760 +#ifndef CONFIG_XEN
16761 + PTRS_PER_PGD*sizeof(pgd_t),
16762 + PTRS_PER_PGD*sizeof(pgd_t),
16763 +#else
16764 + PAGE_SIZE,
16765 + PAGE_SIZE,
16766 +#endif
16767 + 0,
16768 + pgd_ctor,
16769 + PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
16770 + if (!pgd_cache)
16771 + panic("pgtable_cache_init(): Cannot create pgd cache");
16772 +}
16773 +
16774 +/*
16775 + * This function cannot be __init, since exceptions don't work in that
16776 + * section. Put this after the callers, so that it cannot be inlined.
16777 + */
16778 +static int noinline do_test_wp_bit(void)
16779 +{
16780 + char tmp_reg;
16781 + int flag;
16782 +
16783 + __asm__ __volatile__(
16784 + " movb %0,%1 \n"
16785 + "1: movb %1,%0 \n"
16786 + " xorl %2,%2 \n"
16787 + "2: \n"
16788 + ".section __ex_table,\"a\"\n"
16789 + " .align 4 \n"
16790 + " .long 1b,2b \n"
16791 + ".previous \n"
16792 + :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
16793 + "=q" (tmp_reg),
16794 + "=r" (flag)
16795 + :"2" (1)
16796 + :"memory");
16797 +
16798 + return flag;
16799 +}
16800 +
16801 +#ifdef CONFIG_DEBUG_RODATA
16802 +
16803 +void mark_rodata_ro(void)
16804 +{
16805 + unsigned long addr = (unsigned long)__start_rodata;
16806 +
16807 + for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
16808 + change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
16809 +
16810 + printk("Write protecting the kernel read-only data: %uk\n",
16811 + (__end_rodata - __start_rodata) >> 10);
16812 +
16813 + /*
16814 + * change_page_attr() requires a global_flush_tlb() call after it.
16815 + * We do this after the printk so that if something went wrong in the
16816 +	 * change, the printk at least gets out to give a better debug hint
16817 +	 * about who the culprit is.
16818 + */
16819 + global_flush_tlb();
16820 +}
16821 +#endif
16822 +
16823 +void free_init_pages(char *what, unsigned long begin, unsigned long end)
16824 +{
16825 + unsigned long addr;
16826 +
16827 + for (addr = begin; addr < end; addr += PAGE_SIZE) {
16828 + ClearPageReserved(virt_to_page(addr));
16829 + init_page_count(virt_to_page(addr));
16830 + memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
16831 + free_page(addr);
16832 + totalram_pages++;
16833 + }
16834 + printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
16835 +}
16836 +
16837 +void free_initmem(void)
16838 +{
16839 + free_init_pages("unused kernel memory",
16840 + (unsigned long)(&__init_begin),
16841 + (unsigned long)(&__init_end));
16842 +}
16843 +
16844 +#ifdef CONFIG_BLK_DEV_INITRD
16845 +void free_initrd_mem(unsigned long start, unsigned long end)
16846 +{
16847 + free_init_pages("initrd memory", start, end);
16848 +}
16849 +#endif
16850 +
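
For reference, the address-splitting arithmetic that page_table_range_init() and kernel_physical_mapping_init() above depend on can be reproduced in a small stand-alone program. This is only an illustrative sketch, not part of the patch; the constants assume the usual non-PAE i386 layout (4 KiB pages, 4 MiB pgd entries, PAGE_OFFSET at 0xC0000000) and the helper names are local to the example.

    /* Illustrative only: mimics pgd_index()/pte_index() for non-PAE i386. */
    #include <stdio.h>

    #define PAGE_SHIFT	12
    #define PGDIR_SHIFT	22			/* non-PAE: the pmd level is folded */
    #define PTRS_PER_PGD	1024
    #define PTRS_PER_PTE	1024
    #define PAGE_OFFSET	0xC0000000UL

    static unsigned long pgd_index(unsigned long vaddr)
    {
    	return (vaddr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
    }

    static unsigned long pte_index(unsigned long vaddr)
    {
    	return (vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
    }

    int main(void)
    {
    	/* Arbitrary lowmem address: PAGE_OFFSET + 5 MB + a small offset. */
    	unsigned long vaddr = PAGE_OFFSET + (5UL << 20) + 0x123;

    	printf("vaddr 0x%08lx -> pgd index %lu, pte index %lu\n",
    	       vaddr, pgd_index(vaddr), pte_index(vaddr));
    	return 0;
    }
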
16851 Index: head-2008-11-25/arch/x86/mm/ioremap_32-xen.c
16852 ===================================================================
16853 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
16854 +++ head-2008-11-25/arch/x86/mm/ioremap_32-xen.c 2008-04-02 12:34:02.000000000 +0200
16855 @@ -0,0 +1,443 @@
16856 +/*
16857 + * arch/i386/mm/ioremap.c
16858 + *
16859 + * Re-map IO memory to kernel address space so that we can access it.
16860 + * This is needed for high PCI addresses that aren't mapped in the
16861 + * 640k-1MB IO memory area on PC's
16862 + *
16863 + * (C) Copyright 1995 1996 Linus Torvalds
16864 + */
16865 +
16866 +#include <linux/vmalloc.h>
16867 +#include <linux/init.h>
16868 +#include <linux/slab.h>
16869 +#include <linux/module.h>
16870 +#include <asm/io.h>
16871 +#include <asm/fixmap.h>
16872 +#include <asm/cacheflush.h>
16873 +#include <asm/tlbflush.h>
16874 +#include <asm/pgtable.h>
16875 +#include <asm/pgalloc.h>
16876 +
16877 +#define ISA_START_ADDRESS 0x0
16878 +#define ISA_END_ADDRESS 0x100000
16879 +
16880 +static int direct_remap_area_pte_fn(pte_t *pte,
16881 + struct page *pmd_page,
16882 + unsigned long address,
16883 + void *data)
16884 +{
16885 + mmu_update_t **v = (mmu_update_t **)data;
16886 +
16887 + BUG_ON(!pte_none(*pte));
16888 +
16889 + (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
16890 + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
16891 + (*v)++;
16892 +
16893 + return 0;
16894 +}
16895 +
16896 +static int __direct_remap_pfn_range(struct mm_struct *mm,
16897 + unsigned long address,
16898 + unsigned long mfn,
16899 + unsigned long size,
16900 + pgprot_t prot,
16901 + domid_t domid)
16902 +{
16903 + int rc;
16904 + unsigned long i, start_address;
16905 + mmu_update_t *u, *v, *w;
16906 +
16907 + u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
16908 + if (u == NULL)
16909 + return -ENOMEM;
16910 +
16911 + start_address = address;
16912 +
16913 + flush_cache_all();
16914 +
16915 + for (i = 0; i < size; i += PAGE_SIZE) {
16916 + if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
16917 + /* Flush a full batch after filling in the PTE ptrs. */
16918 + rc = apply_to_page_range(mm, start_address,
16919 + address - start_address,
16920 + direct_remap_area_pte_fn, &w);
16921 + if (rc)
16922 + goto out;
16923 + rc = -EFAULT;
16924 + if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
16925 + goto out;
16926 + v = w = u;
16927 + start_address = address;
16928 + }
16929 +
16930 + /*
16931 + * Fill in the machine address: PTE ptr is done later by
16932 + * apply_to_page_range().
16933 + */
16934 + v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
16935 +
16936 + mfn++;
16937 + address += PAGE_SIZE;
16938 + v++;
16939 + }
16940 +
16941 + if (v != u) {
16942 + /* Final batch. */
16943 + rc = apply_to_page_range(mm, start_address,
16944 + address - start_address,
16945 + direct_remap_area_pte_fn, &w);
16946 + if (rc)
16947 + goto out;
16948 + rc = -EFAULT;
16949 + if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
16950 + goto out;
16951 + }
16952 +
16953 + rc = 0;
16954 +
16955 + out:
16956 + flush_tlb_all();
16957 +
16958 + free_page((unsigned long)u);
16959 +
16960 + return rc;
16961 +}
16962 +
16963 +int direct_remap_pfn_range(struct vm_area_struct *vma,
16964 + unsigned long address,
16965 + unsigned long mfn,
16966 + unsigned long size,
16967 + pgprot_t prot,
16968 + domid_t domid)
16969 +{
16970 + if (xen_feature(XENFEAT_auto_translated_physmap))
16971 + return remap_pfn_range(vma, address, mfn, size, prot);
16972 +
16973 + if (domid == DOMID_SELF)
16974 + return -EINVAL;
16975 +
16976 + vma->vm_flags |= VM_IO | VM_RESERVED;
16977 +
16978 + vma->vm_mm->context.has_foreign_mappings = 1;
16979 +
16980 + return __direct_remap_pfn_range(
16981 + vma->vm_mm, address, mfn, size, prot, domid);
16982 +}
16983 +EXPORT_SYMBOL(direct_remap_pfn_range);
16984 +
16985 +int direct_kernel_remap_pfn_range(unsigned long address,
16986 + unsigned long mfn,
16987 + unsigned long size,
16988 + pgprot_t prot,
16989 + domid_t domid)
16990 +{
16991 + return __direct_remap_pfn_range(
16992 + &init_mm, address, mfn, size, prot, domid);
16993 +}
16994 +EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
16995 +
16996 +static int lookup_pte_fn(
16997 + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
16998 +{
16999 + uint64_t *ptep = (uint64_t *)data;
17000 + if (ptep)
17001 + *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
17002 + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
17003 + return 0;
17004 +}
17005 +
17006 +int create_lookup_pte_addr(struct mm_struct *mm,
17007 + unsigned long address,
17008 + uint64_t *ptep)
17009 +{
17010 + return apply_to_page_range(mm, address, PAGE_SIZE,
17011 + lookup_pte_fn, ptep);
17012 +}
17013 +
17014 +EXPORT_SYMBOL(create_lookup_pte_addr);
17015 +
17016 +static int noop_fn(
17017 + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
17018 +{
17019 + return 0;
17020 +}
17021 +
17022 +int touch_pte_range(struct mm_struct *mm,
17023 + unsigned long address,
17024 + unsigned long size)
17025 +{
17026 + return apply_to_page_range(mm, address, size, noop_fn, NULL);
17027 +}
17028 +
17029 +EXPORT_SYMBOL(touch_pte_range);
17030 +
17031 +/*
17032 + * Does @address reside within a non-highmem page that is local to this virtual
17033 + * machine (i.e., not an I/O page, nor a memory page belonging to another VM)?
17034 + * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
17035 + * why this works.
17036 + */
17037 +static inline int is_local_lowmem(unsigned long address)
17038 +{
17039 + extern unsigned long max_low_pfn;
17040 + return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
17041 +}
17042 +
17043 +/*
17044 + * Generic mapping function (not visible outside):
17045 + */
17046 +
17047 +/*
17048 + * Remap an arbitrary physical address space into the kernel virtual
17049 + * address space. Needed when the kernel wants to access high addresses
17050 + * directly.
17051 + *
17052 + * NOTE! We need to allow non-page-aligned mappings too: we will obviously
17053 + * have to convert them into an offset in a page-aligned mapping, but the
17054 + * caller shouldn't need to know that small detail.
17055 + */
17056 +void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
17057 +{
17058 + void __iomem * addr;
17059 + struct vm_struct * area;
17060 + unsigned long offset, last_addr;
17061 + domid_t domid = DOMID_IO;
17062 +
17063 + /* Don't allow wraparound or zero size */
17064 + last_addr = phys_addr + size - 1;
17065 + if (!size || last_addr < phys_addr)
17066 + return NULL;
17067 +
17068 + /*
17069 +	 * Don't remap the low PCI/ISA area; it's always mapped.
17070 + */
17071 + if (is_initial_xendomain() &&
17072 + phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
17073 + return (void __iomem *) isa_bus_to_virt(phys_addr);
17074 +
17075 + /*
17076 + * Don't allow anybody to remap normal RAM that we're using..
17077 + */
17078 + if (is_local_lowmem(phys_addr)) {
17079 + char *t_addr, *t_end;
17080 + struct page *page;
17081 +
17082 + t_addr = bus_to_virt(phys_addr);
17083 + t_end = t_addr + (size - 1);
17084 +
17085 + for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
17086 + if(!PageReserved(page))
17087 + return NULL;
17088 +
17089 + domid = DOMID_SELF;
17090 + }
17091 +
17092 + /*
17093 + * Mappings have to be page-aligned
17094 + */
17095 + offset = phys_addr & ~PAGE_MASK;
17096 + phys_addr &= PAGE_MASK;
17097 + size = PAGE_ALIGN(last_addr+1) - phys_addr;
17098 +
17099 + /*
17100 + * Ok, go for it..
17101 + */
17102 + area = get_vm_area(size, VM_IOREMAP | (flags << 20));
17103 + if (!area)
17104 + return NULL;
17105 + area->phys_addr = phys_addr;
17106 + addr = (void __iomem *) area->addr;
17107 + flags |= _KERNPG_TABLE;
17108 + if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
17109 + phys_addr>>PAGE_SHIFT,
17110 + size, __pgprot(flags), domid)) {
17111 + vunmap((void __force *) addr);
17112 + return NULL;
17113 + }
17114 + return (void __iomem *) (offset + (char __iomem *)addr);
17115 +}
17116 +EXPORT_SYMBOL(__ioremap);
17117 +
17118 +/**
17119 + * ioremap_nocache - map bus memory into CPU space
17120 + * @offset: bus address of the memory
17121 + * @size: size of the resource to map
17122 + *
17123 + * ioremap_nocache performs a platform specific sequence of operations to
17124 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
17125 + * writew/writel functions and the other mmio helpers. The returned
17126 + * address is not guaranteed to be usable directly as a virtual
17127 + * address.
17128 + *
17129 + * This version of ioremap ensures that the memory is marked uncachable
17130 + * on the CPU as well as honouring existing caching rules from things like
17131 + * the PCI bus. Note that there are other caches and buffers on many
17132 + * busses. In particular driver authors should read up on PCI writes
17133 + *
17134 + * It's useful if some control registers are in such an area and
17135 + * write combining or read caching is not desirable:
17136 + *
17137 + * Must be freed with iounmap.
17138 + */
17139 +
17140 +void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
17141 +{
17142 + unsigned long last_addr;
17143 + void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
17144 + if (!p)
17145 + return p;
17146 +
17147 + /* Guaranteed to be > phys_addr, as per __ioremap() */
17148 + last_addr = phys_addr + size - 1;
17149 +
17150 + if (is_local_lowmem(last_addr)) {
17151 + struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
17152 + unsigned long npages;
17153 +
17154 + phys_addr &= PAGE_MASK;
17155 +
17156 + /* This might overflow and become zero.. */
17157 + last_addr = PAGE_ALIGN(last_addr);
17158 +
17159 + /* .. but that's ok, because modulo-2**n arithmetic will make
17160 + * the page-aligned "last - first" come out right.
17161 + */
17162 + npages = (last_addr - phys_addr) >> PAGE_SHIFT;
17163 +
17164 + if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
17165 + iounmap(p);
17166 + p = NULL;
17167 + }
17168 + global_flush_tlb();
17169 + }
17170 +
17171 + return p;
17172 +}
17173 +EXPORT_SYMBOL(ioremap_nocache);
17174 +
17175 +/**
17176 + * iounmap - Free a IO remapping
17177 + * @addr: virtual address from ioremap_*
17178 + *
17179 + * Caller must ensure there is only one unmapping for the same pointer.
17180 + */
17181 +void iounmap(volatile void __iomem *addr)
17182 +{
17183 + struct vm_struct *p, *o;
17184 +
17185 + if ((void __force *)addr <= high_memory)
17186 + return;
17187 +
17188 + /*
17189 + * __ioremap special-cases the PCI/ISA range by not instantiating a
17190 + * vm_area and by simply returning an address into the kernel mapping
17191 + * of ISA space. So handle that here.
17192 + */
17193 + if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
17194 + return;
17195 +
17196 + addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
17197 +
17198 + /* Use the vm area unlocked, assuming the caller
17199 + ensures there isn't another iounmap for the same address
17200 + in parallel. Reuse of the virtual address is prevented by
17201 + leaving it in the global lists until we're done with it.
17202 + cpa takes care of the direct mappings. */
17203 + read_lock(&vmlist_lock);
17204 + for (p = vmlist; p; p = p->next) {
17205 + if (p->addr == addr)
17206 + break;
17207 + }
17208 + read_unlock(&vmlist_lock);
17209 +
17210 + if (!p) {
17211 + printk("iounmap: bad address %p\n", addr);
17212 + dump_stack();
17213 + return;
17214 + }
17215 +
17216 + /* Reset the direct mapping. Can block */
17217 + if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
17218 + /* p->size includes the guard page, but cpa doesn't like that */
17219 + change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
17220 + (p->size - PAGE_SIZE) >> PAGE_SHIFT,
17221 + PAGE_KERNEL);
17222 + global_flush_tlb();
17223 + }
17224 +
17225 + /* Finally remove it */
17226 + o = remove_vm_area((void *)addr);
17227 + BUG_ON(p != o || o == NULL);
17228 + kfree(p);
17229 +}
17230 +EXPORT_SYMBOL(iounmap);
17231 +
17232 +void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
17233 +{
17234 + unsigned long offset, last_addr;
17235 + unsigned int nrpages;
17236 + enum fixed_addresses idx;
17237 +
17238 + /* Don't allow wraparound or zero size */
17239 + last_addr = phys_addr + size - 1;
17240 + if (!size || last_addr < phys_addr)
17241 + return NULL;
17242 +
17243 + /*
17244 +	 * Don't remap the low PCI/ISA area; it's always mapped.
17245 + */
17246 + if (is_initial_xendomain() &&
17247 + phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
17248 + return isa_bus_to_virt(phys_addr);
17249 +
17250 + /*
17251 + * Mappings have to be page-aligned
17252 + */
17253 + offset = phys_addr & ~PAGE_MASK;
17254 + phys_addr &= PAGE_MASK;
17255 + size = PAGE_ALIGN(last_addr) - phys_addr;
17256 +
17257 + /*
17258 + * Mappings have to fit in the FIX_BTMAP area.
17259 + */
17260 + nrpages = size >> PAGE_SHIFT;
17261 + if (nrpages > NR_FIX_BTMAPS)
17262 + return NULL;
17263 +
17264 + /*
17265 + * Ok, go for it..
17266 + */
17267 + idx = FIX_BTMAP_BEGIN;
17268 + while (nrpages > 0) {
17269 + set_fixmap(idx, phys_addr);
17270 + phys_addr += PAGE_SIZE;
17271 + --idx;
17272 + --nrpages;
17273 + }
17274 + return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
17275 +}
17276 +
17277 +void __init bt_iounmap(void *addr, unsigned long size)
17278 +{
17279 + unsigned long virt_addr;
17280 + unsigned long offset;
17281 + unsigned int nrpages;
17282 + enum fixed_addresses idx;
17283 +
17284 + virt_addr = (unsigned long)addr;
17285 + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
17286 + return;
17287 + if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
17288 + return;
17289 + offset = virt_addr & ~PAGE_MASK;
17290 + nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
17291 +
17292 + idx = FIX_BTMAP_BEGIN;
17293 + while (nrpages > 0) {
17294 + clear_fixmap(idx);
17295 + --idx;
17296 + --nrpages;
17297 + }
17298 +}
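
As a usage note for the ioremap_nocache()/iounmap() pair implemented above, the sketch below shows the calling pattern a driver of this era would typically follow. It is not part of the patch; the physical address, offsets and register meanings are made-up placeholders for illustration.

    /* Illustrative only: typical driver-side use of ioremap_nocache()/iounmap(). */
    #include <linux/kernel.h>
    #include <linux/types.h>
    #include <linux/errno.h>
    #include <asm/io.h>

    #define EXAMPLE_MMIO_PHYS	0xfeb00000UL	/* hypothetical device register window */
    #define EXAMPLE_MMIO_SIZE	0x1000UL

    static int example_map_registers(void)
    {
    	void __iomem *regs;
    	u32 id;

    	/* Map the register window uncached; NULL means the mapping failed. */
    	regs = ioremap_nocache(EXAMPLE_MMIO_PHYS, EXAMPLE_MMIO_SIZE);
    	if (!regs)
    		return -ENOMEM;

    	id = readl(regs);		/* read a (hypothetical) ID register */
    	writel(0x1, regs + 0x4);	/* poke a (hypothetical) control register */
    	printk(KERN_INFO "example: device id %08x\n", id);

    	iounmap(regs);			/* must balance the earlier ioremap */
    	return 0;
    }
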
17299 Index: head-2008-11-25/arch/x86/mm/pgtable_32-xen.c
17300 ===================================================================
17301 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
17302 +++ head-2008-11-25/arch/x86/mm/pgtable_32-xen.c 2007-10-09 11:48:25.000000000 +0200
17303 @@ -0,0 +1,725 @@
17304 +/*
17305 + * linux/arch/i386/mm/pgtable.c
17306 + */
17307 +
17308 +#include <linux/sched.h>
17309 +#include <linux/kernel.h>
17310 +#include <linux/errno.h>
17311 +#include <linux/mm.h>
17312 +#include <linux/swap.h>
17313 +#include <linux/smp.h>
17314 +#include <linux/highmem.h>
17315 +#include <linux/slab.h>
17316 +#include <linux/pagemap.h>
17317 +#include <linux/spinlock.h>
17318 +#include <linux/module.h>
17319 +
17320 +#include <asm/system.h>
17321 +#include <asm/pgtable.h>
17322 +#include <asm/pgalloc.h>
17323 +#include <asm/fixmap.h>
17324 +#include <asm/e820.h>
17325 +#include <asm/tlb.h>
17326 +#include <asm/tlbflush.h>
17327 +#include <asm/io.h>
17328 +#include <asm/mmu_context.h>
17329 +
17330 +#include <xen/features.h>
17331 +#include <asm/hypervisor.h>
17332 +
17333 +static void pgd_test_and_unpin(pgd_t *pgd);
17334 +
17335 +void show_mem(void)
17336 +{
17337 + int total = 0, reserved = 0;
17338 + int shared = 0, cached = 0;
17339 + int highmem = 0;
17340 + struct page *page;
17341 + pg_data_t *pgdat;
17342 + unsigned long i;
17343 + unsigned long flags;
17344 +
17345 + printk(KERN_INFO "Mem-info:\n");
17346 + show_free_areas();
17347 + printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
17348 + for_each_online_pgdat(pgdat) {
17349 + pgdat_resize_lock(pgdat, &flags);
17350 + for (i = 0; i < pgdat->node_spanned_pages; ++i) {
17351 + page = pgdat_page_nr(pgdat, i);
17352 + total++;
17353 + if (PageHighMem(page))
17354 + highmem++;
17355 + if (PageReserved(page))
17356 + reserved++;
17357 + else if (PageSwapCache(page))
17358 + cached++;
17359 + else if (page_count(page))
17360 + shared += page_count(page) - 1;
17361 + }
17362 + pgdat_resize_unlock(pgdat, &flags);
17363 + }
17364 + printk(KERN_INFO "%d pages of RAM\n", total);
17365 + printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
17366 + printk(KERN_INFO "%d reserved pages\n", reserved);
17367 + printk(KERN_INFO "%d pages shared\n", shared);
17368 + printk(KERN_INFO "%d pages swap cached\n", cached);
17369 +
17370 + printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
17371 + printk(KERN_INFO "%lu pages writeback\n",
17372 + global_page_state(NR_WRITEBACK));
17373 + printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
17374 + printk(KERN_INFO "%lu pages slab\n", global_page_state(NR_SLAB));
17375 + printk(KERN_INFO "%lu pages pagetables\n",
17376 + global_page_state(NR_PAGETABLE));
17377 +}
17378 +
17379 +/*
17380 + * Associate a large virtual page frame with a given physical page frame
17381 + * and protection flags for that frame. pfn is for the base of the page,
17382 + * vaddr is what the page gets mapped to - both must be properly aligned.
17383 + * The pmd must already be instantiated. Assumes PAE mode.
17384 + */
17385 +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
17386 +{
17387 + pgd_t *pgd;
17388 + pud_t *pud;
17389 + pmd_t *pmd;
17390 +
17391 + if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
17392 + printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
17393 + return; /* BUG(); */
17394 + }
17395 + if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
17396 + printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
17397 + return; /* BUG(); */
17398 + }
17399 + pgd = swapper_pg_dir + pgd_index(vaddr);
17400 + if (pgd_none(*pgd)) {
17401 + printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
17402 + return; /* BUG(); */
17403 + }
17404 + pud = pud_offset(pgd, vaddr);
17405 + pmd = pmd_offset(pud, vaddr);
17406 + set_pmd(pmd, pfn_pmd(pfn, flags));
17407 + /*
17408 + * It's enough to flush this one mapping.
17409 + * (PGE mappings get flushed as well)
17410 + */
17411 + __flush_tlb_one(vaddr);
17412 +}
17413 +
17414 +static int nr_fixmaps = 0;
17415 +unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
17416 +unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE);
17417 +EXPORT_SYMBOL(__FIXADDR_TOP);
17418 +
17419 +void __init set_fixaddr_top(unsigned long top)
17420 +{
17421 + BUG_ON(nr_fixmaps > 0);
17422 + hypervisor_virt_start = top;
17423 + __FIXADDR_TOP = hypervisor_virt_start - 2 * PAGE_SIZE;
17424 +}
17425 +
17426 +void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
17427 +{
17428 + unsigned long address = __fix_to_virt(idx);
17429 + pte_t pte;
17430 +
17431 + if (idx >= __end_of_fixed_addresses) {
17432 + BUG();
17433 + return;
17434 + }
17435 + switch (idx) {
17436 + case FIX_WP_TEST:
17437 + case FIX_VDSO:
17438 + pte = pfn_pte(phys >> PAGE_SHIFT, flags);
17439 + break;
17440 + default:
17441 + pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
17442 + break;
17443 + }
17444 + if (HYPERVISOR_update_va_mapping(address, pte,
17445 + UVMF_INVLPG|UVMF_ALL))
17446 + BUG();
17447 + nr_fixmaps++;
17448 +}
17449 +
17450 +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
17451 +{
17452 + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
17453 + if (pte)
17454 + make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
17455 + return pte;
17456 +}
17457 +
17458 +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
17459 +{
17460 + struct page *pte;
17461 +
17462 +#ifdef CONFIG_HIGHPTE
17463 + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
17464 +#else
17465 + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
17466 +#endif
17467 + if (pte) {
17468 + SetPageForeign(pte, pte_free);
17469 + init_page_count(pte);
17470 + }
17471 + return pte;
17472 +}
17473 +
17474 +void pte_free(struct page *pte)
17475 +{
17476 + unsigned long pfn = page_to_pfn(pte);
17477 +
17478 + if (!PageHighMem(pte)) {
17479 + unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT);
17480 +
17481 + if (!pte_write(*virt_to_ptep(va)))
17482 + if (HYPERVISOR_update_va_mapping(
17483 + va, pfn_pte(pfn, PAGE_KERNEL), 0))
17484 + BUG();
17485 + } else
17486 + clear_bit(PG_pinned, &pte->flags);
17487 +
17488 + ClearPageForeign(pte);
17489 + init_page_count(pte);
17490 +
17491 + __free_page(pte);
17492 +}
17493 +
17494 +void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
17495 +{
17496 + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
17497 +}
17498 +
17499 +/*
17500 + * List of all pgd's needed for non-PAE so it can invalidate entries
17501 + * in both cached and uncached pgd's; not needed for PAE since the
17502 + * kernel pmd is shared. If PAE were not to share the pmd a similar
17503 + * tactic would be needed. This is essentially codepath-based locking
17504 + * against pageattr.c; it is the unique case in which a valid change
17505 + * of kernel pagetables can't be lazily synchronized by vmalloc faults.
17506 + * vmalloc faults work because attached pagetables are never freed.
17507 + * The locking scheme was chosen on the basis of manfred's
17508 + * recommendations and having no core impact whatsoever.
17509 + * -- wli
17510 + */
17511 +DEFINE_SPINLOCK(pgd_lock);
17512 +struct page *pgd_list;
17513 +
17514 +static inline void pgd_list_add(pgd_t *pgd)
17515 +{
17516 + struct page *page = virt_to_page(pgd);
17517 + page->index = (unsigned long)pgd_list;
17518 + if (pgd_list)
17519 + set_page_private(pgd_list, (unsigned long)&page->index);
17520 + pgd_list = page;
17521 + set_page_private(page, (unsigned long)&pgd_list);
17522 +}
17523 +
17524 +static inline void pgd_list_del(pgd_t *pgd)
17525 +{
17526 + struct page *next, **pprev, *page = virt_to_page(pgd);
17527 + next = (struct page *)page->index;
17528 + pprev = (struct page **)page_private(page);
17529 + *pprev = next;
17530 + if (next)
17531 + set_page_private(next, (unsigned long)pprev);
17532 +}
17533 +
17534 +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
17535 +{
17536 + unsigned long flags;
17537 +
17538 + if (PTRS_PER_PMD > 1) {
17539 + if (HAVE_SHARED_KERNEL_PMD)
17540 + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
17541 + swapper_pg_dir + USER_PTRS_PER_PGD,
17542 + KERNEL_PGD_PTRS);
17543 + } else {
17544 + spin_lock_irqsave(&pgd_lock, flags);
17545 + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
17546 + swapper_pg_dir + USER_PTRS_PER_PGD,
17547 + KERNEL_PGD_PTRS);
17548 + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
17549 + pgd_list_add(pgd);
17550 + spin_unlock_irqrestore(&pgd_lock, flags);
17551 + }
17552 +}
17553 +
17554 +/* never called when PTRS_PER_PMD > 1 */
17555 +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
17556 +{
17557 + unsigned long flags; /* can be called from interrupt context */
17558 +
17559 + spin_lock_irqsave(&pgd_lock, flags);
17560 + pgd_list_del(pgd);
17561 + spin_unlock_irqrestore(&pgd_lock, flags);
17562 +
17563 + pgd_test_and_unpin(pgd);
17564 +}
17565 +
17566 +pgd_t *pgd_alloc(struct mm_struct *mm)
17567 +{
17568 + int i;
17569 + pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
17570 + pmd_t **pmd;
17571 + unsigned long flags;
17572 +
17573 + pgd_test_and_unpin(pgd);
17574 +
17575 + if (PTRS_PER_PMD == 1 || !pgd)
17576 + return pgd;
17577 +
17578 + if (HAVE_SHARED_KERNEL_PMD) {
17579 + for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
17580 + pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
17581 + if (!pmd)
17582 + goto out_oom;
17583 + set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
17584 + }
17585 + return pgd;
17586 + }
17587 +
17588 + /*
17589 + * We can race save/restore (if we sleep during a GFP_KERNEL memory
17590 + * allocation). We therefore store virtual addresses of pmds as they
17591 + * do not change across save/restore, and poke the machine addresses
17592 + * into the pgdir under the pgd_lock.
17593 + */
17594 + pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
17595 + if (!pmd) {
17596 + kmem_cache_free(pgd_cache, pgd);
17597 + return NULL;
17598 + }
17599 +
17600 + /* Allocate pmds, remember virtual addresses. */
17601 + for (i = 0; i < PTRS_PER_PGD; ++i) {
17602 + pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
17603 + if (!pmd[i])
17604 + goto out_oom;
17605 + }
17606 +
17607 + spin_lock_irqsave(&pgd_lock, flags);
17608 +
17609 + /* Protect against save/restore: move below 4GB under pgd_lock. */
17610 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
17611 + int rc = xen_create_contiguous_region(
17612 + (unsigned long)pgd, 0, 32);
17613 + if (rc) {
17614 + spin_unlock_irqrestore(&pgd_lock, flags);
17615 + goto out_oom;
17616 + }
17617 + }
17618 +
17619 + /* Copy kernel pmd contents and write-protect the new pmds. */
17620 + for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
17621 + unsigned long v = (unsigned long)i << PGDIR_SHIFT;
17622 + pgd_t *kpgd = pgd_offset_k(v);
17623 + pud_t *kpud = pud_offset(kpgd, v);
17624 + pmd_t *kpmd = pmd_offset(kpud, v);
17625 + memcpy(pmd[i], kpmd, PAGE_SIZE);
17626 + make_lowmem_page_readonly(
17627 + pmd[i], XENFEAT_writable_page_tables);
17628 + }
17629 +
17630 +	/* It is safe to poke machine addresses of pmds under the pgd_lock. */
17631 + for (i = 0; i < PTRS_PER_PGD; i++)
17632 + set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i])));
17633 +
17634 + /* Ensure this pgd gets picked up and pinned on save/restore. */
17635 + pgd_list_add(pgd);
17636 +
17637 + spin_unlock_irqrestore(&pgd_lock, flags);
17638 +
17639 + kfree(pmd);
17640 +
17641 + return pgd;
17642 +
17643 +out_oom:
17644 + if (HAVE_SHARED_KERNEL_PMD) {
17645 + for (i--; i >= 0; i--)
17646 + kmem_cache_free(pmd_cache,
17647 + (void *)__va(pgd_val(pgd[i])-1));
17648 + } else {
17649 + for (i--; i >= 0; i--)
17650 + kmem_cache_free(pmd_cache, pmd[i]);
17651 + kfree(pmd);
17652 + }
17653 + kmem_cache_free(pgd_cache, pgd);
17654 + return NULL;
17655 +}
17656 +
17657 +void pgd_free(pgd_t *pgd)
17658 +{
17659 + int i;
17660 +
17661 + /*
17662 + * After this the pgd should not be pinned for the duration of this
17663 + * function's execution. We should never sleep and thus never race:
17664 + * 1. User pmds will not become write-protected under our feet due
17665 + * to a concurrent mm_pin_all().
17666 + * 2. The machine addresses in PGD entries will not become invalid
17667 + * due to a concurrent save/restore.
17668 + */
17669 + pgd_test_and_unpin(pgd);
17670 +
17671 + /* in the PAE case user pgd entries are overwritten before usage */
17672 + if (PTRS_PER_PMD > 1) {
17673 + for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
17674 + pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
17675 + kmem_cache_free(pmd_cache, pmd);
17676 + }
17677 +
17678 + if (!HAVE_SHARED_KERNEL_PMD) {
17679 + unsigned long flags;
17680 + spin_lock_irqsave(&pgd_lock, flags);
17681 + pgd_list_del(pgd);
17682 + spin_unlock_irqrestore(&pgd_lock, flags);
17683 +
17684 + for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
17685 + pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
17686 + make_lowmem_page_writable(
17687 + pmd, XENFEAT_writable_page_tables);
17688 + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
17689 + kmem_cache_free(pmd_cache, pmd);
17690 + }
17691 +
17692 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
17693 + xen_destroy_contiguous_region(
17694 + (unsigned long)pgd, 0);
17695 + }
17696 + }
17697 +
17698 + /* in the non-PAE case, free_pgtables() clears user pgd entries */
17699 + kmem_cache_free(pgd_cache, pgd);
17700 +}
17701 +
17702 +void make_lowmem_page_readonly(void *va, unsigned int feature)
17703 +{
17704 + pte_t *pte;
17705 + int rc;
17706 +
17707 + if (xen_feature(feature))
17708 + return;
17709 +
17710 + pte = virt_to_ptep(va);
17711 + rc = HYPERVISOR_update_va_mapping(
17712 + (unsigned long)va, pte_wrprotect(*pte), 0);
17713 + BUG_ON(rc);
17714 +}
17715 +
17716 +void make_lowmem_page_writable(void *va, unsigned int feature)
17717 +{
17718 + pte_t *pte;
17719 + int rc;
17720 +
17721 + if (xen_feature(feature))
17722 + return;
17723 +
17724 + pte = virt_to_ptep(va);
17725 + rc = HYPERVISOR_update_va_mapping(
17726 + (unsigned long)va, pte_mkwrite(*pte), 0);
17727 + BUG_ON(rc);
17728 +}
17729 +
17730 +void make_page_readonly(void *va, unsigned int feature)
17731 +{
17732 + pte_t *pte;
17733 + int rc;
17734 +
17735 + if (xen_feature(feature))
17736 + return;
17737 +
17738 + pte = virt_to_ptep(va);
17739 + rc = HYPERVISOR_update_va_mapping(
17740 + (unsigned long)va, pte_wrprotect(*pte), 0);
17741 + if (rc) /* fallback? */
17742 + xen_l1_entry_update(pte, pte_wrprotect(*pte));
17743 + if ((unsigned long)va >= (unsigned long)high_memory) {
17744 + unsigned long pfn = pte_pfn(*pte);
17745 +#ifdef CONFIG_HIGHMEM
17746 + if (pfn >= highstart_pfn)
17747 + kmap_flush_unused(); /* flush stale writable kmaps */
17748 + else
17749 +#endif
17750 + make_lowmem_page_readonly(
17751 + phys_to_virt(pfn << PAGE_SHIFT), feature);
17752 + }
17753 +}
17754 +
17755 +void make_page_writable(void *va, unsigned int feature)
17756 +{
17757 + pte_t *pte;
17758 + int rc;
17759 +
17760 + if (xen_feature(feature))
17761 + return;
17762 +
17763 + pte = virt_to_ptep(va);
17764 + rc = HYPERVISOR_update_va_mapping(
17765 + (unsigned long)va, pte_mkwrite(*pte), 0);
17766 + if (rc) /* fallback? */
17767 + xen_l1_entry_update(pte, pte_mkwrite(*pte));
17768 + if ((unsigned long)va >= (unsigned long)high_memory) {
17769 + unsigned long pfn = pte_pfn(*pte);
17770 +#ifdef CONFIG_HIGHMEM
17771 + if (pfn < highstart_pfn)
17772 +#endif
17773 + make_lowmem_page_writable(
17774 + phys_to_virt(pfn << PAGE_SHIFT), feature);
17775 + }
17776 +}
17777 +
17778 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
17779 +{
17780 + if (xen_feature(feature))
17781 + return;
17782 +
17783 + while (nr-- != 0) {
17784 + make_page_readonly(va, feature);
17785 + va = (void *)((unsigned long)va + PAGE_SIZE);
17786 + }
17787 +}
17788 +
17789 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
17790 +{
17791 + if (xen_feature(feature))
17792 + return;
17793 +
17794 + while (nr-- != 0) {
17795 + make_page_writable(va, feature);
17796 + va = (void *)((unsigned long)va + PAGE_SIZE);
17797 + }
17798 +}
17799 +
17800 +static void _pin_lock(struct mm_struct *mm, int lock) {
17801 + if (lock)
17802 + spin_lock(&mm->page_table_lock);
17803 +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
17804 + /* While mm->page_table_lock protects us against insertions and
17805 + * removals of higher level page table pages, it doesn't protect
17806 +	 * against updates of PTEs. Such updates, however, require the
17807 +	 * pte pages to be in a consistent state (unpinned+writable or
17808 +	 * pinned+readonly). The pinning and attribute changes, however,
17809 +	 * cannot be done atomically, which is why such updates must be
17810 +	 * prevented from happening concurrently.
17811 +	 * Note that no pte lock can ever be acquired elsewhere nesting
17812 +	 * with an already acquired one in the same mm, or with the mm's
17813 +	 * page_table_lock already held, as that would break in the
17814 +	 * non-split case (where all of these actually resolve to the
17815 +	 * one page_table_lock). Thus acquiring all of them here is not
17816 +	 * going to result in deadlocks, and the order of acquires
17817 + * doesn't matter.
17818 + */
17819 + {
17820 + pgd_t *pgd = mm->pgd;
17821 + unsigned g;
17822 +
17823 + for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
17824 + pud_t *pud;
17825 + unsigned u;
17826 +
17827 + if (pgd_none(*pgd))
17828 + continue;
17829 + pud = pud_offset(pgd, 0);
17830 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
17831 + pmd_t *pmd;
17832 + unsigned m;
17833 +
17834 + if (pud_none(*pud))
17835 + continue;
17836 + pmd = pmd_offset(pud, 0);
17837 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
17838 + spinlock_t *ptl;
17839 +
17840 + if (pmd_none(*pmd))
17841 + continue;
17842 + ptl = pte_lockptr(0, pmd);
17843 + if (lock)
17844 + spin_lock(ptl);
17845 + else
17846 + spin_unlock(ptl);
17847 + }
17848 + }
17849 + }
17850 + }
17851 +#endif
17852 + if (!lock)
17853 + spin_unlock(&mm->page_table_lock);
17854 +}
17855 +#define pin_lock(mm) _pin_lock(mm, 1)
17856 +#define pin_unlock(mm) _pin_lock(mm, 0)
17857 +
17858 +#define PIN_BATCH 4
17859 +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
17860 +
17861 +static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
17862 + unsigned int cpu, unsigned seq)
17863 +{
17864 + unsigned long pfn = page_to_pfn(page);
17865 +
17866 + if (PageHighMem(page)) {
17867 + if (pgprot_val(flags) & _PAGE_RW)
17868 + clear_bit(PG_pinned, &page->flags);
17869 + else
17870 + set_bit(PG_pinned, &page->flags);
17871 + } else {
17872 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
17873 + (unsigned long)__va(pfn << PAGE_SHIFT),
17874 + pfn_pte(pfn, flags), 0);
17875 + if (unlikely(++seq == PIN_BATCH)) {
17876 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
17877 + PIN_BATCH, NULL)))
17878 + BUG();
17879 + seq = 0;
17880 + }
17881 + }
17882 +
17883 + return seq;
17884 +}
17885 +
17886 +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
17887 +{
17888 + pgd_t *pgd = pgd_base;
17889 + pud_t *pud;
17890 + pmd_t *pmd;
17891 + int g, u, m;
17892 + unsigned int cpu, seq;
17893 +
17894 + if (xen_feature(XENFEAT_auto_translated_physmap))
17895 + return;
17896 +
17897 + cpu = get_cpu();
17898 +
17899 + for (g = 0, seq = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
17900 + if (pgd_none(*pgd))
17901 + continue;
17902 + pud = pud_offset(pgd, 0);
17903 + if (PTRS_PER_PUD > 1) /* not folded */
17904 + seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
17905 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
17906 + if (pud_none(*pud))
17907 + continue;
17908 + pmd = pmd_offset(pud, 0);
17909 + if (PTRS_PER_PMD > 1) /* not folded */
17910 + seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
17911 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
17912 + if (pmd_none(*pmd))
17913 + continue;
17914 + seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
17915 + }
17916 + }
17917 + }
17918 +
17919 + if (likely(seq != 0)) {
17920 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
17921 + (unsigned long)pgd_base,
17922 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
17923 + UVMF_TLB_FLUSH);
17924 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
17925 + seq + 1, NULL)))
17926 + BUG();
17927 + } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
17928 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
17929 + UVMF_TLB_FLUSH))
17930 + BUG();
17931 +
17932 + put_cpu();
17933 +}
17934 +
17935 +static void __pgd_pin(pgd_t *pgd)
17936 +{
17937 + pgd_walk(pgd, PAGE_KERNEL_RO);
17938 + kmap_flush_unused();
17939 + xen_pgd_pin(__pa(pgd));
17940 + set_bit(PG_pinned, &virt_to_page(pgd)->flags);
17941 +}
17942 +
17943 +static void __pgd_unpin(pgd_t *pgd)
17944 +{
17945 + xen_pgd_unpin(__pa(pgd));
17946 + pgd_walk(pgd, PAGE_KERNEL);
17947 + clear_bit(PG_pinned, &virt_to_page(pgd)->flags);
17948 +}
17949 +
17950 +static void pgd_test_and_unpin(pgd_t *pgd)
17951 +{
17952 + if (test_bit(PG_pinned, &virt_to_page(pgd)->flags))
17953 + __pgd_unpin(pgd);
17954 +}
17955 +
17956 +void mm_pin(struct mm_struct *mm)
17957 +{
17958 + if (xen_feature(XENFEAT_writable_page_tables))
17959 + return;
17960 + pin_lock(mm);
17961 + __pgd_pin(mm->pgd);
17962 + pin_unlock(mm);
17963 +}
17964 +
17965 +void mm_unpin(struct mm_struct *mm)
17966 +{
17967 + if (xen_feature(XENFEAT_writable_page_tables))
17968 + return;
17969 + pin_lock(mm);
17970 + __pgd_unpin(mm->pgd);
17971 + pin_unlock(mm);
17972 +}
17973 +
17974 +void mm_pin_all(void)
17975 +{
17976 + struct page *page;
17977 + unsigned long flags;
17978 +
17979 + if (xen_feature(XENFEAT_writable_page_tables))
17980 + return;
17981 +
17982 + /*
17983 + * Allow uninterrupted access to the pgd_list. Also protects
17984 + * __pgd_pin() by disabling preemption.
17985 + * All other CPUs must be at a safe point (e.g., in stop_machine
17986 + * or offlined entirely).
17987 + */
17988 + spin_lock_irqsave(&pgd_lock, flags);
17989 + for (page = pgd_list; page; page = (struct page *)page->index) {
17990 + if (!test_bit(PG_pinned, &page->flags))
17991 + __pgd_pin((pgd_t *)page_address(page));
17992 + }
17993 + spin_unlock_irqrestore(&pgd_lock, flags);
17994 +}
17995 +
17996 +void _arch_dup_mmap(struct mm_struct *mm)
17997 +{
17998 + if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags))
17999 + mm_pin(mm);
18000 +}
18001 +
18002 +void _arch_exit_mmap(struct mm_struct *mm)
18003 +{
18004 + struct task_struct *tsk = current;
18005 +
18006 + task_lock(tsk);
18007 +
18008 + /*
18009 + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
18010 + * *much* faster this way, as no tlb flushes means bigger wrpt batches.
18011 + */
18012 + if (tsk->active_mm == mm) {
18013 + tsk->active_mm = &init_mm;
18014 + atomic_inc(&init_mm.mm_count);
18015 +
18016 + switch_mm(mm, &init_mm, tsk);
18017 +
18018 + atomic_dec(&mm->mm_count);
18019 + BUG_ON(atomic_read(&mm->mm_count) == 0);
18020 + }
18021 +
18022 + task_unlock(tsk);
18023 +
18024 + if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) &&
18025 + (atomic_read(&mm->mm_count) == 1) &&
18026 + !mm->context.has_foreign_mappings)
18027 + mm_unpin(mm);
18028 +}
18029 Index: head-2008-11-25/arch/x86/oprofile/xenoprof.c
18030 ===================================================================
18031 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
18032 +++ head-2008-11-25/arch/x86/oprofile/xenoprof.c 2008-01-28 12:24:19.000000000 +0100
18033 @@ -0,0 +1,179 @@
18034 +/**
18035 + * @file xenoprof.c
18036 + *
18037 + * @remark Copyright 2002 OProfile authors
18038 + * @remark Read the file COPYING
18039 + *
18040 + * @author John Levon <levon@movementarian.org>
18041 + *
18042 + * Modified by Aravind Menon and Jose Renato Santos for Xen
18043 + * These modifications are:
18044 + * Copyright (C) 2005 Hewlett-Packard Co.
18045 + *
18046 + * x86-specific part
18047 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
18048 + * VA Linux Systems Japan K.K.
18049 + */
18050 +
18051 +#include <linux/init.h>
18052 +#include <linux/oprofile.h>
18053 +#include <linux/sched.h>
18054 +#include <asm/pgtable.h>
18055 +
18056 +#include <xen/driver_util.h>
18057 +#include <xen/interface/xen.h>
18058 +#include <xen/interface/xenoprof.h>
18059 +#include <xen/xenoprof.h>
18060 +#include "op_counter.h"
18061 +
18062 +static unsigned int num_events = 0;
18063 +
18064 +void __init xenoprof_arch_init_counter(struct xenoprof_init *init)
18065 +{
18066 + num_events = init->num_events;
18067 + /* just in case - make sure we do not overflow event list
18068 + (i.e. counter_config list) */
18069 + if (num_events > OP_MAX_COUNTER) {
18070 + num_events = OP_MAX_COUNTER;
18071 + init->num_events = num_events;
18072 + }
18073 +}
18074 +
18075 +void xenoprof_arch_counter(void)
18076 +{
18077 + int i;
18078 + struct xenoprof_counter counter;
18079 +
18080 + for (i=0; i<num_events; i++) {
18081 + counter.ind = i;
18082 + counter.count = (uint64_t)counter_config[i].count;
18083 + counter.enabled = (uint32_t)counter_config[i].enabled;
18084 + counter.event = (uint32_t)counter_config[i].event;
18085 + counter.kernel = (uint32_t)counter_config[i].kernel;
18086 + counter.user = (uint32_t)counter_config[i].user;
18087 + counter.unit_mask = (uint64_t)counter_config[i].unit_mask;
18088 + WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_counter,
18089 + &counter));
18090 + }
18091 +}
18092 +
18093 +void xenoprof_arch_start(void)
18094 +{
18095 + /* nothing */
18096 +}
18097 +
18098 +void xenoprof_arch_stop(void)
18099 +{
18100 + /* nothing */
18101 +}
18102 +
18103 +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer * sbuf)
18104 +{
18105 + if (sbuf->buffer) {
18106 + vunmap(sbuf->buffer);
18107 + sbuf->buffer = NULL;
18108 + }
18109 +}
18110 +
18111 +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer * get_buffer,
18112 + struct xenoprof_shared_buffer * sbuf)
18113 +{
18114 + int npages, ret;
18115 + struct vm_struct *area;
18116 +
18117 + sbuf->buffer = NULL;
18118 + if ( (ret = HYPERVISOR_xenoprof_op(XENOPROF_get_buffer, get_buffer)) )
18119 + return ret;
18120 +
18121 + npages = (get_buffer->bufsize * get_buffer->nbuf - 1) / PAGE_SIZE + 1;
18122 +
18123 + area = alloc_vm_area(npages * PAGE_SIZE);
18124 + if (area == NULL)
18125 + return -ENOMEM;
18126 +
18127 + if ( (ret = direct_kernel_remap_pfn_range(
18128 + (unsigned long)area->addr,
18129 + get_buffer->buf_gmaddr >> PAGE_SHIFT,
18130 + npages * PAGE_SIZE, __pgprot(_KERNPG_TABLE),
18131 + DOMID_SELF)) ) {
18132 + vunmap(area->addr);
18133 + return ret;
18134 + }
18135 +
18136 + sbuf->buffer = area->addr;
18137 + return ret;
18138 +}
18139 +
18140 +int xenoprof_arch_set_passive(struct xenoprof_passive * pdomain,
18141 + struct xenoprof_shared_buffer * sbuf)
18142 +{
18143 + int ret;
18144 + int npages;
18145 + struct vm_struct *area;
18146 + pgprot_t prot = __pgprot(_KERNPG_TABLE);
18147 +
18148 + sbuf->buffer = NULL;
18149 + ret = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, pdomain);
18150 + if (ret)
18151 + goto out;
18152 +
18153 + npages = (pdomain->bufsize * pdomain->nbuf - 1) / PAGE_SIZE + 1;
18154 +
18155 + area = alloc_vm_area(npages * PAGE_SIZE);
18156 + if (area == NULL) {
18157 + ret = -ENOMEM;
18158 + goto out;
18159 + }
18160 +
18161 + ret = direct_kernel_remap_pfn_range(
18162 + (unsigned long)area->addr,
18163 + pdomain->buf_gmaddr >> PAGE_SHIFT,
18164 + npages * PAGE_SIZE, prot, DOMID_SELF);
18165 + if (ret) {
18166 + vunmap(area->addr);
18167 + goto out;
18168 + }
18169 + sbuf->buffer = area->addr;
18170 +
18171 +out:
18172 + return ret;
18173 +}
18174 +
18175 +struct op_counter_config counter_config[OP_MAX_COUNTER];
18176 +
18177 +int xenoprof_create_files(struct super_block * sb, struct dentry * root)
18178 +{
18179 + unsigned int i;
18180 +
18181 + for (i = 0; i < num_events; ++i) {
18182 + struct dentry * dir;
18183 + char buf[2];
18184 +
18185 + snprintf(buf, 2, "%d", i);
18186 + dir = oprofilefs_mkdir(sb, root, buf);
18187 + oprofilefs_create_ulong(sb, dir, "enabled",
18188 + &counter_config[i].enabled);
18189 + oprofilefs_create_ulong(sb, dir, "event",
18190 + &counter_config[i].event);
18191 + oprofilefs_create_ulong(sb, dir, "count",
18192 + &counter_config[i].count);
18193 + oprofilefs_create_ulong(sb, dir, "unit_mask",
18194 + &counter_config[i].unit_mask);
18195 + oprofilefs_create_ulong(sb, dir, "kernel",
18196 + &counter_config[i].kernel);
18197 + oprofilefs_create_ulong(sb, dir, "user",
18198 + &counter_config[i].user);
18199 + }
18200 +
18201 + return 0;
18202 +}
18203 +
18204 +int __init oprofile_arch_init(struct oprofile_operations * ops)
18205 +{
18206 + return xenoprofile_init(ops);
18207 +}
18208 +
18209 +void oprofile_arch_exit(void)
18210 +{
18211 + xenoprofile_exit();
18212 +}
18213 Index: head-2008-11-25/arch/x86/pci/irq-xen.c
18214 ===================================================================
18215 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
18216 +++ head-2008-11-25/arch/x86/pci/irq-xen.c 2008-03-06 08:54:32.000000000 +0100
18217 @@ -0,0 +1,1211 @@
18218 +/*
18219 + * Low-Level PCI Support for PC -- Routing of Interrupts
18220 + *
18221 + * (c) 1999--2000 Martin Mares <mj@ucw.cz>
18222 + */
18223 +
18224 +#include <linux/types.h>
18225 +#include <linux/kernel.h>
18226 +#include <linux/pci.h>
18227 +#include <linux/init.h>
18228 +#include <linux/slab.h>
18229 +#include <linux/interrupt.h>
18230 +#include <linux/dmi.h>
18231 +#include <asm/io.h>
18232 +#include <asm/smp.h>
18233 +#include <asm/io_apic.h>
18234 +#include <linux/irq.h>
18235 +#include <linux/acpi.h>
18236 +
18237 +#include "pci.h"
18238 +
18239 +#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
18240 +#define PIRQ_VERSION 0x0100
18241 +
18242 +static int broken_hp_bios_irq9;
18243 +static int acer_tm360_irqrouting;
18244 +
18245 +static struct irq_routing_table *pirq_table;
18246 +
18247 +static int pirq_enable_irq(struct pci_dev *dev);
18248 +
18249 +/*
18250 + * Never use: 0, 1, 2 (timer, keyboard, and cascade)
18251 + * Avoid using: 13, 14 and 15 (FP error and IDE).
18252 + * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse)
18253 + */
18254 +unsigned int pcibios_irq_mask = 0xfff8;
18255 +
18256 +static int pirq_penalty[16] = {
18257 + 1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
18258 + 0, 0, 0, 0, 1000, 100000, 100000, 100000
18259 +};
18260 +
18261 +struct irq_router {
18262 + char *name;
18263 + u16 vendor, device;
18264 + int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
18265 + int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
18266 +};
18267 +
18268 +struct irq_router_handler {
18269 + u16 vendor;
18270 + int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
18271 +};
18272 +
18273 +int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL;
18274 +void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
18275 +
18276 +/*
18277 + * Check passed address for the PCI IRQ Routing Table signature
18278 + * and perform checksum verification.
18279 + */
18280 +
18281 +static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
18282 +{
18283 + struct irq_routing_table *rt;
18284 + int i;
18285 + u8 sum;
18286 +
18287 + rt = (struct irq_routing_table *) addr;
18288 + if (rt->signature != PIRQ_SIGNATURE ||
18289 + rt->version != PIRQ_VERSION ||
18290 + rt->size % 16 ||
18291 + rt->size < sizeof(struct irq_routing_table))
18292 + return NULL;
18293 + sum = 0;
18294 + for (i=0; i < rt->size; i++)
18295 + sum += addr[i];
18296 + if (!sum) {
18297 + DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
18298 + return rt;
18299 + }
18300 + return NULL;
18301 +}
18302 +
18303 +
18304 +
18305 +/*
18306 + * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
18307 + */
18308 +
18309 +static struct irq_routing_table * __init pirq_find_routing_table(void)
18310 +{
18311 + u8 *addr;
18312 + struct irq_routing_table *rt;
18313 +
18314 +#ifdef CONFIG_XEN
18315 + if (!is_initial_xendomain())
18316 + return NULL;
18317 +#endif
18318 + if (pirq_table_addr) {
18319 + rt = pirq_check_routing_table((u8 *) isa_bus_to_virt(pirq_table_addr));
18320 + if (rt)
18321 + return rt;
18322 + printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
18323 + }
18324 + for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
18325 + rt = pirq_check_routing_table(addr);
18326 + if (rt)
18327 + return rt;
18328 + }
18329 + return NULL;
18330 +}
18331 +
18332 +/*
18333 + * If we have a IRQ routing table, use it to search for peer host
18334 + * bridges. It's a gross hack, but since there are no other known
18335 + * ways how to get a list of buses, we have to go this way.
18336 + */
18337 +
18338 +static void __init pirq_peer_trick(void)
18339 +{
18340 + struct irq_routing_table *rt = pirq_table;
18341 + u8 busmap[256];
18342 + int i;
18343 + struct irq_info *e;
18344 +
18345 + memset(busmap, 0, sizeof(busmap));
18346 + for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
18347 + e = &rt->slots[i];
18348 +#ifdef DEBUG
18349 + {
18350 + int j;
18351 + DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
18352 + for(j=0; j<4; j++)
18353 + DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
18354 + DBG("\n");
18355 + }
18356 +#endif
18357 + busmap[e->bus] = 1;
18358 + }
18359 + for(i = 1; i < 256; i++) {
18360 + if (!busmap[i] || pci_find_bus(0, i))
18361 + continue;
18362 + if (pci_scan_bus(i, &pci_root_ops, NULL))
18363 + printk(KERN_INFO "PCI: Discovered primary peer bus %02x [IRQ]\n", i);
18364 + }
18365 + pcibios_last_bus = -1;
18366 +}
18367 +
18368 +/*
18369 + * Code for querying and setting of IRQ routes on various interrupt routers.
18370 + */
18371 +
18372 +void eisa_set_level_irq(unsigned int irq)
18373 +{
18374 + unsigned char mask = 1 << (irq & 7);
18375 + unsigned int port = 0x4d0 + (irq >> 3);
18376 + unsigned char val;
18377 + static u16 eisa_irq_mask;
18378 +
18379 + if (irq >= 16 || (1 << irq) & eisa_irq_mask)
18380 + return;
18381 +
18382 + eisa_irq_mask |= (1 << irq);
18383 + printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
18384 + val = inb(port);
18385 + if (!(val & mask)) {
18386 + DBG(KERN_DEBUG " -> edge");
18387 + outb(val | mask, port);
18388 + }
18389 +}
18390 +
18391 +/*
18392 + * Common IRQ routing practice: nybbles in config space,
18393 + * offset by some magic constant.
18394 + */
18395 +static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr)
18396 +{
18397 + u8 x;
18398 + unsigned reg = offset + (nr >> 1);
18399 +
18400 + pci_read_config_byte(router, reg, &x);
18401 + return (nr & 1) ? (x >> 4) : (x & 0xf);
18402 +}
18403 +
18404 +static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
18405 +{
18406 + u8 x;
18407 + unsigned reg = offset + (nr >> 1);
18408 +
18409 + pci_read_config_byte(router, reg, &x);
18410 + x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
18411 + pci_write_config_byte(router, reg, x);
18412 +}
18413 +
18414 +/*
18415 + * ALI pirq entries are damn ugly, and completely undocumented.
18416 + * This has been figured out from pirq tables, and it's not a pretty
18417 + * picture.
18418 + */
18419 +static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18420 +{
18421 + static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
18422 +
18423 + return irqmap[read_config_nybble(router, 0x48, pirq-1)];
18424 +}
18425 +
18426 +static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18427 +{
18428 + static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
18429 + unsigned int val = irqmap[irq];
18430 +
18431 + if (val) {
18432 + write_config_nybble(router, 0x48, pirq-1, val);
18433 + return 1;
18434 + }
18435 + return 0;
18436 +}
18437 +
18438 +/*
18439 + * The Intel PIIX4 pirq rules are fairly simple: "pirq" is
18440 + * just a pointer to the config space.
18441 + */
18442 +static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18443 +{
18444 + u8 x;
18445 +
18446 + pci_read_config_byte(router, pirq, &x);
18447 + return (x < 16) ? x : 0;
18448 +}
18449 +
18450 +static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18451 +{
18452 + pci_write_config_byte(router, pirq, irq);
18453 + return 1;
18454 +}
18455 +
18456 +/*
18457 + * The VIA pirq rules are nibble-based, like ALI,
18458 + * but without the ugly irq number munging.
18459 + * However, PIRQD is in the upper instead of lower 4 bits.
18460 + */
18461 +static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18462 +{
18463 + return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq);
18464 +}
18465 +
18466 +static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18467 +{
18468 + write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq);
18469 + return 1;
18470 +}
18471 +
18472 +/*
18473 + * The VIA pirq rules are nibble-based, like ALI,
18474 + * but without the ugly irq number munging.
18475 + * However, for 82C586, nibble map is different .
18476 + */
18477 +static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18478 +{
18479 + static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18480 + return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
18481 +}
18482 +
18483 +static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18484 +{
18485 + static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18486 + write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
18487 + return 1;
18488 +}
18489 +
18490 +/*
18491 + * ITE 8330G pirq rules are nibble-based
18492 + * FIXME: pirqmap may be { 1, 0, 3, 2 },
18493 + * 2+3 are both mapped to irq 9 on my system
18494 + */
18495 +static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18496 +{
18497 + static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
18498 + return read_config_nybble(router,0x43, pirqmap[pirq-1]);
18499 +}
18500 +
18501 +static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18502 +{
18503 + static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
18504 + write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
18505 + return 1;
18506 +}
18507 +
18508 +/*
18509 + * OPTI: high four bits are nibble pointer..
18510 + * I wonder what the low bits do?
18511 + */
18512 +static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18513 +{
18514 + return read_config_nybble(router, 0xb8, pirq >> 4);
18515 +}
18516 +
18517 +static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18518 +{
18519 + write_config_nybble(router, 0xb8, pirq >> 4, irq);
18520 + return 1;
18521 +}
18522 +
18523 +/*
18524 + * Cyrix: nibble offset 0x5C
18525 + * 0x5C bits 7:4 is INTB bits 3:0 is INTA
18526 + * 0x5D bits 7:4 is INTD bits 3:0 is INTC
18527 + */
18528 +static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18529 +{
18530 + return read_config_nybble(router, 0x5C, (pirq-1)^1);
18531 +}
18532 +
18533 +static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18534 +{
18535 + write_config_nybble(router, 0x5C, (pirq-1)^1, irq);
18536 + return 1;
18537 +}
18538 +
18539 +/*
18540 + * PIRQ routing for SiS 85C503 router used in several SiS chipsets.
18541 + * We have to deal with the following issues here:
18542 + * - vendors have different ideas about the meaning of link values
18543 + * - some onboard devices (integrated in the chipset) have special
18544 + * links and are thus routed differently (i.e. not via PCI INTA-INTD)
18545 + * - different revision of the router have a different layout for
18546 + * the routing registers, particularly for the onchip devices
18547 + *
18548 + * For all routing registers the common thing is we have one byte
18549 + * per routeable link which is defined as:
18550 + * bit 7 IRQ mapping enabled (0) or disabled (1)
18551 + * bits [6:4] reserved (sometimes used for onchip devices)
18552 + * bits [3:0] IRQ to map to
18553 + * allowed: 3-7, 9-12, 14-15
18554 + * reserved: 0, 1, 2, 8, 13
18555 + *
18556 + * The config-space registers located at 0x41/0x42/0x43/0x44 are
18557 + * always used to route the normal PCI INT A/B/C/D respectively.
18558 + * Apparently there are systems implementing PCI routing table using
18559 + * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
18560 + * We try our best to handle both link mappings.
18561 + *
18562 + * Currently (2003-05-21) it appears most SiS chipsets follow the
18563 + * definition of routing registers from the SiS-5595 southbridge.
18564 + * According to the SiS 5595 datasheets the revision id's of the
18565 + * router (ISA-bridge) should be 0x01 or 0xb0.
18566 + *
18567 + * Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1.
18568 + * Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets.
18569 + * They seem to work with the current routing code. However there is
18570 + * some concern because of the two USB-OHCI HCs (original SiS 5595
18571 + * had only one). YMMV.
18572 + *
18573 + * Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1:
18574 + *
18575 + * 0x61: IDEIRQ:
18576 + * bits [6:5] must be written 01
18577 + * bit 4 channel-select primary (0), secondary (1)
18578 + *
18579 + * 0x62: USBIRQ:
18580 + * bit 6 OHCI function disabled (0), enabled (1)
18581 + *
18582 + * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
18583 + *
18584 + * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
18585 + *
18586 + * We support USBIRQ (in addition to INTA-INTD) and keep the
18587 + * IDE, ACPI and DAQ routing untouched as set by the BIOS.
18588 + *
18589 + * Currently the only reported exception is the new SiS 65x chipset
18590 + * which includes the SiS 69x southbridge. Here we have the 85C503
18591 + * router revision 0x04 and there are changes in the register layout
18592 + * mostly related to the different USB HCs with USB 2.0 support.
18593 + *
18594 + * Onchip routing for router rev-id 0x04 (try-and-error observation)
18595 + *
18596 + * 0x60/0x61/0x62/0x63: 1xEHCI and 3xOHCI (companion) USB-HCs
18597 + * bit 6-4 are probably unused, not like 5595
18598 + */
18599 +
18600 +#define PIRQ_SIS_IRQ_MASK 0x0f
18601 +#define PIRQ_SIS_IRQ_DISABLE 0x80
18602 +#define PIRQ_SIS_USB_ENABLE 0x40
18603 +
18604 +static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18605 +{
18606 + u8 x;
18607 + int reg;
18608 +
18609 + reg = pirq;
18610 + if (reg >= 0x01 && reg <= 0x04)
18611 + reg += 0x40;
18612 + pci_read_config_byte(router, reg, &x);
18613 + return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK);
18614 +}
18615 +
18616 +static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18617 +{
18618 + u8 x;
18619 + int reg;
18620 +
18621 + reg = pirq;
18622 + if (reg >= 0x01 && reg <= 0x04)
18623 + reg += 0x40;
18624 + pci_read_config_byte(router, reg, &x);
18625 + x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
18626 + x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE;
18627 + pci_write_config_byte(router, reg, x);
18628 + return 1;
18629 +}
18630 +
18631 +
18632 +/*
18633 + * VLSI: nibble offset 0x74 - educated guess due to routing table and
18634 + * config space of VLSI 82C534 PCI-bridge/router (1004:0102)
18635 + * Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard
18636 + * devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6
18637 + * for the busbridge to the docking station.
18638 + */
18639 +
18640 +static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18641 +{
18642 + if (pirq > 8) {
18643 + printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
18644 + return 0;
18645 + }
18646 + return read_config_nybble(router, 0x74, pirq-1);
18647 +}
18648 +
18649 +static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18650 +{
18651 + if (pirq > 8) {
18652 + printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
18653 + return 0;
18654 + }
18655 + write_config_nybble(router, 0x74, pirq-1, irq);
18656 + return 1;
18657 +}
18658 +
18659 +/*
18660 + * ServerWorks: PCI interrupts mapped to system IRQ lines through Index
18661 + * and Redirect I/O registers (0x0c00 and 0x0c01). The Index register
18662 + * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a. The Redirect
18663 + * register is a straight binary coding of desired PIC IRQ (low nibble).
18664 + *
18665 + * The 'link' value in the PIRQ table is already in the correct format
18666 + * for the Index register. There are some special index values:
18667 + * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1,
18668 + * and 0x03 for SMBus.
18669 + */
18670 +static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18671 +{
18672 + outb_p(pirq, 0xc00);
18673 + return inb(0xc01) & 0xf;
18674 +}
18675 +
18676 +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18677 +{
18678 + outb_p(pirq, 0xc00);
18679 + outb_p(irq, 0xc01);
18680 + return 1;
18681 +}
18682 +
18683 +/* Support for AMD756 PCI IRQ Routing
18684 + * Jhon H. Caicedo <jhcaiced@osso.org.co>
18685 + * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced)
18686 + * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced)
18687 + * The AMD756 pirq rules are nibble-based
18688 + * offset 0x56 0-3 PIRQA 4-7 PIRQB
18689 + * offset 0x57 0-3 PIRQC 4-7 PIRQD
18690 + */
18691 +static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18692 +{
18693 + u8 irq;
18694 + irq = 0;
18695 + if (pirq <= 4)
18696 + {
18697 + irq = read_config_nybble(router, 0x56, pirq - 1);
18698 + }
18699 + printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
18700 + dev->vendor, dev->device, pirq, irq);
18701 + return irq;
18702 +}
18703 +
18704 +static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18705 +{
18706 + printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
18707 + dev->vendor, dev->device, pirq, irq);
18708 + if (pirq <= 4)
18709 + {
18710 + write_config_nybble(router, 0x56, pirq - 1, irq);
18711 + }
18712 + return 1;
18713 +}
18714 +
18715 +#ifdef CONFIG_PCI_BIOS
18716 +
18717 +static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18718 +{
18719 + struct pci_dev *bridge;
18720 + int pin = pci_get_interrupt_pin(dev, &bridge);
18721 + return pcibios_set_irq_routing(bridge, pin, irq);
18722 +}
18723 +
18724 +#endif
18725 +
18726 +static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18727 +{
18728 + static struct pci_device_id __initdata pirq_440gx[] = {
18729 + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
18730 + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
18731 + { },
18732 + };
18733 +
18734 + /* 440GX has a proprietary PIRQ router -- don't use it */
18735 + if (pci_dev_present(pirq_440gx))
18736 + return 0;
18737 +
18738 + switch(device)
18739 + {
18740 + case PCI_DEVICE_ID_INTEL_82371FB_0:
18741 + case PCI_DEVICE_ID_INTEL_82371SB_0:
18742 + case PCI_DEVICE_ID_INTEL_82371AB_0:
18743 + case PCI_DEVICE_ID_INTEL_82371MX:
18744 + case PCI_DEVICE_ID_INTEL_82443MX_0:
18745 + case PCI_DEVICE_ID_INTEL_82801AA_0:
18746 + case PCI_DEVICE_ID_INTEL_82801AB_0:
18747 + case PCI_DEVICE_ID_INTEL_82801BA_0:
18748 + case PCI_DEVICE_ID_INTEL_82801BA_10:
18749 + case PCI_DEVICE_ID_INTEL_82801CA_0:
18750 + case PCI_DEVICE_ID_INTEL_82801CA_12:
18751 + case PCI_DEVICE_ID_INTEL_82801DB_0:
18752 + case PCI_DEVICE_ID_INTEL_82801E_0:
18753 + case PCI_DEVICE_ID_INTEL_82801EB_0:
18754 + case PCI_DEVICE_ID_INTEL_ESB_1:
18755 + case PCI_DEVICE_ID_INTEL_ICH6_0:
18756 + case PCI_DEVICE_ID_INTEL_ICH6_1:
18757 + case PCI_DEVICE_ID_INTEL_ICH7_0:
18758 + case PCI_DEVICE_ID_INTEL_ICH7_1:
18759 + case PCI_DEVICE_ID_INTEL_ICH7_30:
18760 + case PCI_DEVICE_ID_INTEL_ICH7_31:
18761 + case PCI_DEVICE_ID_INTEL_ESB2_0:
18762 + case PCI_DEVICE_ID_INTEL_ICH8_0:
18763 + case PCI_DEVICE_ID_INTEL_ICH8_1:
18764 + case PCI_DEVICE_ID_INTEL_ICH8_2:
18765 + case PCI_DEVICE_ID_INTEL_ICH8_3:
18766 + case PCI_DEVICE_ID_INTEL_ICH8_4:
18767 + case PCI_DEVICE_ID_INTEL_ICH9_0:
18768 + case PCI_DEVICE_ID_INTEL_ICH9_1:
18769 + case PCI_DEVICE_ID_INTEL_ICH9_2:
18770 + case PCI_DEVICE_ID_INTEL_ICH9_3:
18771 + case PCI_DEVICE_ID_INTEL_ICH9_4:
18772 + case PCI_DEVICE_ID_INTEL_ICH9_5:
18773 + r->name = "PIIX/ICH";
18774 + r->get = pirq_piix_get;
18775 + r->set = pirq_piix_set;
18776 + return 1;
18777 + }
18778 + return 0;
18779 +}
18780 +
18781 +static __init int via_router_probe(struct irq_router *r,
18782 + struct pci_dev *router, u16 device)
18783 +{
18784 + /* FIXME: We should move some of the quirk fixup stuff here */
18785 +
18786 + /*
18787 + * work arounds for some buggy BIOSes
18788 + */
18789 + if (device == PCI_DEVICE_ID_VIA_82C586_0) {
18790 + switch(router->device) {
18791 + case PCI_DEVICE_ID_VIA_82C686:
18792 + /*
18793 + * Asus k7m bios wrongly reports 82C686A
18794 + * as 586-compatible
18795 + */
18796 + device = PCI_DEVICE_ID_VIA_82C686;
18797 + break;
18798 + case PCI_DEVICE_ID_VIA_8235:
18799 + /**
18800 + * Asus a7v-x bios wrongly reports 8235
18801 + * as 586-compatible
18802 + */
18803 + device = PCI_DEVICE_ID_VIA_8235;
18804 + break;
18805 + }
18806 + }
18807 +
18808 + switch(device) {
18809 + case PCI_DEVICE_ID_VIA_82C586_0:
18810 + r->name = "VIA";
18811 + r->get = pirq_via586_get;
18812 + r->set = pirq_via586_set;
18813 + return 1;
18814 + case PCI_DEVICE_ID_VIA_82C596:
18815 + case PCI_DEVICE_ID_VIA_82C686:
18816 + case PCI_DEVICE_ID_VIA_8231:
18817 + case PCI_DEVICE_ID_VIA_8233A:
18818 + case PCI_DEVICE_ID_VIA_8235:
18819 + case PCI_DEVICE_ID_VIA_8237:
18820 + /* FIXME: add new ones for 8233/5 */
18821 + r->name = "VIA";
18822 + r->get = pirq_via_get;
18823 + r->set = pirq_via_set;
18824 + return 1;
18825 + }
18826 + return 0;
18827 +}
18828 +
18829 +static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18830 +{
18831 + switch(device)
18832 + {
18833 + case PCI_DEVICE_ID_VLSI_82C534:
18834 + r->name = "VLSI 82C534";
18835 + r->get = pirq_vlsi_get;
18836 + r->set = pirq_vlsi_set;
18837 + return 1;
18838 + }
18839 + return 0;
18840 +}
18841 +
18842 +
18843 +static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18844 +{
18845 + switch(device)
18846 + {
18847 + case PCI_DEVICE_ID_SERVERWORKS_OSB4:
18848 + case PCI_DEVICE_ID_SERVERWORKS_CSB5:
18849 + r->name = "ServerWorks";
18850 + r->get = pirq_serverworks_get;
18851 + r->set = pirq_serverworks_set;
18852 + return 1;
18853 + }
18854 + return 0;
18855 +}
18856 +
18857 +static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18858 +{
18859 + if (device != PCI_DEVICE_ID_SI_503)
18860 + return 0;
18861 +
18862 + r->name = "SIS";
18863 + r->get = pirq_sis_get;
18864 + r->set = pirq_sis_set;
18865 + return 1;
18866 +}
18867 +
18868 +static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18869 +{
18870 + switch(device)
18871 + {
18872 + case PCI_DEVICE_ID_CYRIX_5520:
18873 + r->name = "NatSemi";
18874 + r->get = pirq_cyrix_get;
18875 + r->set = pirq_cyrix_set;
18876 + return 1;
18877 + }
18878 + return 0;
18879 +}
18880 +
18881 +static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18882 +{
18883 + switch(device)
18884 + {
18885 + case PCI_DEVICE_ID_OPTI_82C700:
18886 + r->name = "OPTI";
18887 + r->get = pirq_opti_get;
18888 + r->set = pirq_opti_set;
18889 + return 1;
18890 + }
18891 + return 0;
18892 +}
18893 +
18894 +static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18895 +{
18896 + switch(device)
18897 + {
18898 + case PCI_DEVICE_ID_ITE_IT8330G_0:
18899 + r->name = "ITE";
18900 + r->get = pirq_ite_get;
18901 + r->set = pirq_ite_set;
18902 + return 1;
18903 + }
18904 + return 0;
18905 +}
18906 +
18907 +static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18908 +{
18909 + switch(device)
18910 + {
18911 + case PCI_DEVICE_ID_AL_M1533:
18912 + case PCI_DEVICE_ID_AL_M1563:
18913 + printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
18914 + r->name = "ALI";
18915 + r->get = pirq_ali_get;
18916 + r->set = pirq_ali_set;
18917 + return 1;
18918 + }
18919 + return 0;
18920 +}
18921 +
18922 +static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18923 +{
18924 + switch(device)
18925 + {
18926 + case PCI_DEVICE_ID_AMD_VIPER_740B:
18927 + r->name = "AMD756";
18928 + break;
18929 + case PCI_DEVICE_ID_AMD_VIPER_7413:
18930 + r->name = "AMD766";
18931 + break;
18932 + case PCI_DEVICE_ID_AMD_VIPER_7443:
18933 + r->name = "AMD768";
18934 + break;
18935 + default:
18936 + return 0;
18937 + }
18938 + r->get = pirq_amd756_get;
18939 + r->set = pirq_amd756_set;
18940 + return 1;
18941 +}
18942 +
18943 +static __initdata struct irq_router_handler pirq_routers[] = {
18944 + { PCI_VENDOR_ID_INTEL, intel_router_probe },
18945 + { PCI_VENDOR_ID_AL, ali_router_probe },
18946 + { PCI_VENDOR_ID_ITE, ite_router_probe },
18947 + { PCI_VENDOR_ID_VIA, via_router_probe },
18948 + { PCI_VENDOR_ID_OPTI, opti_router_probe },
18949 + { PCI_VENDOR_ID_SI, sis_router_probe },
18950 + { PCI_VENDOR_ID_CYRIX, cyrix_router_probe },
18951 + { PCI_VENDOR_ID_VLSI, vlsi_router_probe },
18952 + { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe },
18953 + { PCI_VENDOR_ID_AMD, amd_router_probe },
18954 + /* Someone with docs needs to add the ATI Radeon IGP */
18955 + { 0, NULL }
18956 +};
18957 +static struct irq_router pirq_router;
18958 +static struct pci_dev *pirq_router_dev;
18959 +
18960 +
18961 +/*
18962 + * FIXME: should we have an option to say "generic for
18963 + * chipset" ?
18964 + */
18965 +
18966 +static void __init pirq_find_router(struct irq_router *r)
18967 +{
18968 + struct irq_routing_table *rt = pirq_table;
18969 + struct irq_router_handler *h;
18970 +
18971 +#ifdef CONFIG_PCI_BIOS
18972 + if (!rt->signature) {
18973 + printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n");
18974 + r->set = pirq_bios_set;
18975 + r->name = "BIOS";
18976 + return;
18977 + }
18978 +#endif
18979 +
18980 + /* Default unless a driver reloads it */
18981 + r->name = "default";
18982 + r->get = NULL;
18983 + r->set = NULL;
18984 +
18985 + DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
18986 + rt->rtr_vendor, rt->rtr_device);
18987 +
18988 + pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn);
18989 + if (!pirq_router_dev) {
18990 + DBG(KERN_DEBUG "PCI: Interrupt router not found at "
18991 + "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
18992 + return;
18993 + }
18994 +
18995 + for( h = pirq_routers; h->vendor; h++) {
18996 + /* First look for a router match */
18997 + if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
18998 + break;
18999 + /* Fall back to a device match */
19000 + if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
19001 + break;
19002 + }
19003 + printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
19004 + pirq_router.name,
19005 + pirq_router_dev->vendor,
19006 + pirq_router_dev->device,
19007 + pci_name(pirq_router_dev));
19008 +}
19009 +
19010 +static struct irq_info *pirq_get_info(struct pci_dev *dev)
19011 +{
19012 + struct irq_routing_table *rt = pirq_table;
19013 + int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
19014 + struct irq_info *info;
19015 +
19016 + for (info = rt->slots; entries--; info++)
19017 + if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
19018 + return info;
19019 + return NULL;
19020 +}
19021 +
19022 +static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
19023 +{
19024 + u8 pin;
19025 + struct irq_info *info;
19026 + int i, pirq, newirq;
19027 + int irq = 0;
19028 + u32 mask;
19029 + struct irq_router *r = &pirq_router;
19030 + struct pci_dev *dev2 = NULL;
19031 + char *msg = NULL;
19032 +
19033 + /* Find IRQ pin */
19034 + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
19035 + if (!pin) {
19036 + DBG(KERN_DEBUG " -> no interrupt pin\n");
19037 + return 0;
19038 + }
19039 + pin = pin - 1;
19040 +
19041 + /* Find IRQ routing entry */
19042 +
19043 + if (!pirq_table)
19044 + return 0;
19045 +
19046 + DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
19047 + info = pirq_get_info(dev);
19048 + if (!info) {
19049 + DBG(" -> not found in routing table\n" KERN_DEBUG);
19050 + return 0;
19051 + }
19052 + pirq = info->irq[pin].link;
19053 + mask = info->irq[pin].bitmap;
19054 + if (!pirq) {
19055 + DBG(" -> not routed\n" KERN_DEBUG);
19056 + return 0;
19057 + }
19058 + DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
19059 + mask &= pcibios_irq_mask;
19060 +
19061 + /* Work around broken HP Pavilion Notebooks which assign USB to
19062 + IRQ 9 even though it is actually wired to IRQ 11 */
19063 +
19064 + if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) {
19065 + dev->irq = 11;
19066 + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11);
19067 + r->set(pirq_router_dev, dev, pirq, 11);
19068 + }
19069 +
19070 + /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
19071 + if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
19072 + pirq = 0x68;
19073 + mask = 0x400;
19074 + dev->irq = r->get(pirq_router_dev, dev, pirq);
19075 + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
19076 + }
19077 +
19078 + /*
19079 + * Find the best IRQ to assign: use the one
19080 + * reported by the device if possible.
19081 + */
19082 + newirq = dev->irq;
19083 + if (newirq && !((1 << newirq) & mask)) {
19084 + if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
19085 + else printk("\n" KERN_WARNING
19086 + "PCI: IRQ %i for device %s doesn't match PIRQ mask "
19087 + "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
19088 + pci_name(dev));
19089 + }
19090 + if (!newirq && assign) {
19091 + for (i = 0; i < 16; i++) {
19092 + if (!(mask & (1 << i)))
19093 + continue;
19094 + if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
19095 + newirq = i;
19096 + }
19097 + }
19098 + DBG(" -> newirq=%d", newirq);
19099 +
19100 + /* Check if it is hardcoded */
19101 + if ((pirq & 0xf0) == 0xf0) {
19102 + irq = pirq & 0xf;
19103 + DBG(" -> hardcoded IRQ %d\n", irq);
19104 + msg = "Hardcoded";
19105 + } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
19106 + ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
19107 + DBG(" -> got IRQ %d\n", irq);
19108 + msg = "Found";
19109 + eisa_set_level_irq(irq);
19110 + } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
19111 + DBG(" -> assigning IRQ %d", newirq);
19112 + if (r->set(pirq_router_dev, dev, pirq, newirq)) {
19113 + eisa_set_level_irq(newirq);
19114 + DBG(" ... OK\n");
19115 + msg = "Assigned";
19116 + irq = newirq;
19117 + }
19118 + }
19119 +
19120 + if (!irq) {
19121 + DBG(" ... failed\n");
19122 + if (newirq && mask == (1 << newirq)) {
19123 + msg = "Guessed";
19124 + irq = newirq;
19125 + } else
19126 + return 0;
19127 + }
19128 + printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
19129 +
19130 + /* Update IRQ for all devices with the same pirq value */
19131 + while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
19132 + pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
19133 + if (!pin)
19134 + continue;
19135 + pin--;
19136 + info = pirq_get_info(dev2);
19137 + if (!info)
19138 + continue;
19139 + if (info->irq[pin].link == pirq) {
19140 + /* We refuse to override the dev->irq information. Give a warning! */
19141 + if ( dev2->irq && dev2->irq != irq && \
19142 + (!(pci_probe & PCI_USE_PIRQ_MASK) || \
19143 + ((1 << dev2->irq) & mask)) ) {
19144 +#ifndef CONFIG_PCI_MSI
19145 + printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
19146 + pci_name(dev2), dev2->irq, irq);
19147 +#endif
19148 + continue;
19149 + }
19150 + dev2->irq = irq;
19151 + pirq_penalty[irq]++;
19152 + if (dev != dev2)
19153 + printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
19154 + }
19155 + }
19156 + return 1;
19157 +}
19158 +
19159 +static void __init pcibios_fixup_irqs(void)
19160 +{
19161 + struct pci_dev *dev = NULL;
19162 + u8 pin;
19163 +
19164 + DBG(KERN_DEBUG "PCI: IRQ fixup\n");
19165 + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
19166 + /*
19167 + * If the BIOS has set an out of range IRQ number, just ignore it.
19168 + * Also keep track of which IRQ's are already in use.
19169 + */
19170 + if (dev->irq >= 16) {
19171 + DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
19172 + dev->irq = 0;
19173 + }
19174 + /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
19175 + if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
19176 + pirq_penalty[dev->irq] = 0;
19177 + pirq_penalty[dev->irq]++;
19178 + }
19179 +
19180 + dev = NULL;
19181 + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
19182 + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
19183 +#ifdef CONFIG_X86_IO_APIC
19184 + /*
19185 + * Recalculate IRQ numbers if we use the I/O APIC.
19186 + */
19187 + if (io_apic_assign_pci_irqs)
19188 + {
19189 + int irq;
19190 +
19191 + if (pin) {
19192 + pin--; /* interrupt pins are numbered starting from 1 */
19193 + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
19194 + /*
19195 + * Busses behind bridges are typically not listed in the MP-table.
19196 + * In this case we have to look up the IRQ based on the parent bus,
19197 + * parent slot, and pin number. The SMP code detects such bridged
19198 + * busses itself so we should get into this branch reliably.
19199 + */
19200 + if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
19201 + struct pci_dev * bridge = dev->bus->self;
19202 +
19203 + pin = (pin + PCI_SLOT(dev->devfn)) % 4;
19204 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
19205 + PCI_SLOT(bridge->devfn), pin);
19206 + if (irq >= 0)
19207 + printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
19208 + pci_name(bridge), 'A' + pin, irq);
19209 + }
19210 + if (irq >= 0) {
19211 + if (use_pci_vector() &&
19212 + !platform_legacy_irq(irq))
19213 + irq = IO_APIC_VECTOR(irq);
19214 +
19215 + printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
19216 + pci_name(dev), 'A' + pin, irq);
19217 + dev->irq = irq;
19218 + }
19219 + }
19220 + }
19221 +#endif
19222 + /*
19223 + * Still no IRQ? Try to lookup one...
19224 + */
19225 + if (pin && !dev->irq)
19226 + pcibios_lookup_irq(dev, 0);
19227 + }
19228 +}
19229 +
19230 +/*
19231 + * Work around broken HP Pavilion Notebooks which assign USB to
19232 + * IRQ 9 even though it is actually wired to IRQ 11
19233 + */
19234 +static int __init fix_broken_hp_bios_irq9(struct dmi_system_id *d)
19235 +{
19236 + if (!broken_hp_bios_irq9) {
19237 + broken_hp_bios_irq9 = 1;
19238 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
19239 + }
19240 + return 0;
19241 +}
19242 +
19243 +/*
19244 + * Work around broken Acer TravelMate 360 Notebooks which assign
19245 + * Cardbus to IRQ 11 even though it is actually wired to IRQ 10
19246 + */
19247 +static int __init fix_acer_tm360_irqrouting(struct dmi_system_id *d)
19248 +{
19249 + if (!acer_tm360_irqrouting) {
19250 + acer_tm360_irqrouting = 1;
19251 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
19252 + }
19253 + return 0;
19254 +}
19255 +
19256 +static struct dmi_system_id __initdata pciirq_dmi_table[] = {
19257 + {
19258 + .callback = fix_broken_hp_bios_irq9,
19259 + .ident = "HP Pavilion N5400 Series Laptop",
19260 + .matches = {
19261 + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
19262 + DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
19263 + DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
19264 + DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
19265 + },
19266 + },
19267 + {
19268 + .callback = fix_acer_tm360_irqrouting,
19269 + .ident = "Acer TravelMate 36x Laptop",
19270 + .matches = {
19271 + DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
19272 + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
19273 + },
19274 + },
19275 + { }
19276 +};
19277 +
19278 +static int __init pcibios_irq_init(void)
19279 +{
19280 + DBG(KERN_DEBUG "PCI: IRQ init\n");
19281 +
19282 + if (pcibios_enable_irq || raw_pci_ops == NULL)
19283 + return 0;
19284 +
19285 + dmi_check_system(pciirq_dmi_table);
19286 +
19287 + pirq_table = pirq_find_routing_table();
19288 +
19289 +#ifdef CONFIG_PCI_BIOS
19290 + if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN))
19291 + pirq_table = pcibios_get_irq_routing_table();
19292 +#endif
19293 + if (pirq_table) {
19294 + pirq_peer_trick();
19295 + pirq_find_router(&pirq_router);
19296 + if (pirq_table->exclusive_irqs) {
19297 + int i;
19298 + for (i=0; i<16; i++)
19299 + if (!(pirq_table->exclusive_irqs & (1 << i)))
19300 + pirq_penalty[i] += 100;
19301 + }
19302 + /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
19303 + if (io_apic_assign_pci_irqs)
19304 + pirq_table = NULL;
19305 + }
19306 +
19307 + pcibios_enable_irq = pirq_enable_irq;
19308 +
19309 + pcibios_fixup_irqs();
19310 + return 0;
19311 +}
19312 +
19313 +subsys_initcall(pcibios_irq_init);
19314 +
19315 +
19316 +static void pirq_penalize_isa_irq(int irq, int active)
19317 +{
19318 + /*
19319 + * If any ISAPnP device reports an IRQ in its list of possible
19320 + * IRQ's, we try to avoid assigning it to PCI devices.
19321 + */
19322 + if (irq < 16) {
19323 + if (active)
19324 + pirq_penalty[irq] += 1000;
19325 + else
19326 + pirq_penalty[irq] += 100;
19327 + }
19328 +}
19329 +
19330 +void pcibios_penalize_isa_irq(int irq, int active)
19331 +{
19332 +#ifdef CONFIG_ACPI
19333 + if (!acpi_noirq)
19334 + acpi_penalize_isa_irq(irq, active);
19335 + else
19336 +#endif
19337 + pirq_penalize_isa_irq(irq, active);
19338 +}
19339 +
19340 +static int pirq_enable_irq(struct pci_dev *dev)
19341 +{
19342 + u8 pin;
19343 + struct pci_dev *temp_dev;
19344 +
19345 + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
19346 + if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
19347 + char *msg = "";
19348 +
19349 + pin--; /* interrupt pins are numbered starting from 1 */
19350 +
19351 + if (io_apic_assign_pci_irqs) {
19352 + int irq;
19353 +
19354 + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
19355 + /*
19356 + * Busses behind bridges are typically not listed in the MP-table.
19357 + * In this case we have to look up the IRQ based on the parent bus,
19358 + * parent slot, and pin number. The SMP code detects such bridged
19359 + * busses itself so we should get into this branch reliably.
19360 + */
19361 + temp_dev = dev;
19362 + while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
19363 + struct pci_dev * bridge = dev->bus->self;
19364 +
19365 + pin = (pin + PCI_SLOT(dev->devfn)) % 4;
19366 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
19367 + PCI_SLOT(bridge->devfn), pin);
19368 + if (irq >= 0)
19369 + printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
19370 + pci_name(bridge), 'A' + pin, irq);
19371 + dev = bridge;
19372 + }
19373 + dev = temp_dev;
19374 + if (irq >= 0) {
19375 +#ifdef CONFIG_PCI_MSI
19376 + if (!platform_legacy_irq(irq))
19377 + irq = IO_APIC_VECTOR(irq);
19378 +#endif
19379 + printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
19380 + pci_name(dev), 'A' + pin, irq);
19381 + dev->irq = irq;
19382 + return 0;
19383 + } else
19384 + msg = " Probably buggy MP table.";
19385 + } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
19386 + msg = "";
19387 + else
19388 + msg = " Please try using pci=biosirq.";
19389 +
19390 + /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
19391 + if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
19392 + return 0;
19393 +
19394 + printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
19395 + 'A' + pin, pci_name(dev), msg);
19396 + }
19397 + return 0;
19398 +}
19399 +
19400 +int pci_vector_resources(int last, int nr_released)
19401 +{
19402 + int count = nr_released;
19403 +
19404 + int next = last;
19405 + int offset = (last % 8);
19406 +
19407 + while (next < FIRST_SYSTEM_VECTOR) {
19408 + next += 8;
19409 +#ifdef CONFIG_X86_64
19410 + if (next == IA32_SYSCALL_VECTOR)
19411 + continue;
19412 +#else
19413 + if (next == SYSCALL_VECTOR)
19414 + continue;
19415 +#endif
19416 + count++;
19417 + if (next >= FIRST_SYSTEM_VECTOR) {
19418 + if (offset%8) {
19419 + next = FIRST_DEVICE_VECTOR + offset;
19420 + offset++;
19421 + continue;
19422 + }
19423 + count--;
19424 + }
19425 + }
19426 +
19427 + return count;
19428 +}
19429 Index: head-2008-11-25/arch/x86/pci/pcifront.c
19430 ===================================================================
19431 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
19432 +++ head-2008-11-25/arch/x86/pci/pcifront.c 2007-06-12 13:12:49.000000000 +0200
19433 @@ -0,0 +1,55 @@
19434 +/*
19435 + * PCI Frontend Stub - puts some "dummy" functions in to the Linux x86 PCI core
19436 + * to support the Xen PCI Frontend's operation
19437 + *
19438 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
19439 + */
19440 +#include <linux/module.h>
19441 +#include <linux/init.h>
19442 +#include <linux/pci.h>
19443 +#include <asm/acpi.h>
19444 +#include "pci.h"
19445 +
19446 +static int pcifront_enable_irq(struct pci_dev *dev)
19447 +{
19448 + u8 irq;
19449 + pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);
19450 + dev->irq = irq;
19451 +
19452 + return 0;
19453 +}
19454 +
19455 +extern u8 pci_cache_line_size;
19456 +
19457 +static int __init pcifront_x86_stub_init(void)
19458 +{
19459 + struct cpuinfo_x86 *c = &boot_cpu_data;
19460 +
19461 + /* Only install our method if we haven't found real hardware already */
19462 + if (raw_pci_ops)
19463 + return 0;
19464 +
19465 + printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
19466 +
19467 + /* Copied from arch/i386/pci/common.c */
19468 + pci_cache_line_size = 32 >> 2;
19469 + if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
19470 + pci_cache_line_size = 64 >> 2; /* K7 & K8 */
19471 + else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
19472 + pci_cache_line_size = 128 >> 2; /* P4 */
19473 +
19474 + /* On x86, we need to disable the normal IRQ routing table and
19475 + * just ask the backend
19476 + */
19477 + pcibios_enable_irq = pcifront_enable_irq;
19478 + pcibios_disable_irq = NULL;
19479 +
19480 +#ifdef CONFIG_ACPI
19481 + /* Keep ACPI out of the picture */
19482 + acpi_noirq = 1;
19483 +#endif
19484 +
19485 + return 0;
19486 +}
19487 +
19488 +arch_initcall(pcifront_x86_stub_init);
19489 Index: head-2008-11-25/arch/x86/ia32/ia32entry-xen.S
19490 ===================================================================
19491 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
19492 +++ head-2008-11-25/arch/x86/ia32/ia32entry-xen.S 2008-04-02 12:34:02.000000000 +0200
19493 @@ -0,0 +1,666 @@
19494 +/*
19495 + * Compatibility mode system call entry point for x86-64.
19496 + *
19497 + * Copyright 2000-2002 Andi Kleen, SuSE Labs.
19498 + */
19499 +
19500 +#include <asm/dwarf2.h>
19501 +#include <asm/calling.h>
19502 +#include <asm/asm-offsets.h>
19503 +#include <asm/current.h>
19504 +#include <asm/errno.h>
19505 +#include <asm/ia32_unistd.h>
19506 +#include <asm/thread_info.h>
19507 +#include <asm/segment.h>
19508 +#include <asm/vsyscall32.h>
19509 +#include <asm/irqflags.h>
19510 +#include <linux/linkage.h>
19511 +
19512 +#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
19513 +
19514 + .macro IA32_ARG_FIXUP noebp=0
19515 + movl %edi,%r8d
19516 + .if \noebp
19517 + .else
19518 + movl %ebp,%r9d
19519 + .endif
19520 + xchg %ecx,%esi
19521 + movl %ebx,%edi
19522 + movl %edx,%edx /* zero extension */
19523 + .endm
19524 +
19525 + /* clobbers %eax */
19526 + .macro CLEAR_RREGS
19527 + xorl %eax,%eax
19528 + movq %rax,R11(%rsp)
19529 + movq %rax,R10(%rsp)
19530 + movq %rax,R9(%rsp)
19531 + movq %rax,R8(%rsp)
19532 + .endm
19533 +
19534 + .macro LOAD_ARGS32 offset
19535 + movl \offset(%rsp),%r11d
19536 + movl \offset+8(%rsp),%r10d
19537 + movl \offset+16(%rsp),%r9d
19538 + movl \offset+24(%rsp),%r8d
19539 + movl \offset+40(%rsp),%ecx
19540 + movl \offset+48(%rsp),%edx
19541 + movl \offset+56(%rsp),%esi
19542 + movl \offset+64(%rsp),%edi
19543 + movl \offset+72(%rsp),%eax
19544 + .endm
19545 +
19546 + .macro CFI_STARTPROC32 simple
19547 + CFI_STARTPROC \simple
19548 + CFI_UNDEFINED r8
19549 + CFI_UNDEFINED r9
19550 + CFI_UNDEFINED r10
19551 + CFI_UNDEFINED r11
19552 + CFI_UNDEFINED r12
19553 + CFI_UNDEFINED r13
19554 + CFI_UNDEFINED r14
19555 + CFI_UNDEFINED r15
19556 + .endm
19557 +
19558 +/*
19559 + * 32bit SYSENTER instruction entry.
19560 + *
19561 + * Arguments:
19562 + * %eax System call number.
19563 + * %ebx Arg1
19564 + * %ecx Arg2
19565 + * %edx Arg3
19566 + * %esi Arg4
19567 + * %edi Arg5
19568 + * %ebp user stack
19569 + * 0(%ebp) Arg6
19570 + *
19571 + * Interrupts on.
19572 + *
19573 + * This is purely a fast path. For anything complicated we use the int 0x80
19574 + * path below. Set up a complete hardware stack frame to share code
19575 + * with the int 0x80 path.
19576 + */
19577 +ENTRY(ia32_sysenter_target)
19578 + CFI_STARTPROC32 simple
19579 + CFI_DEF_CFA rsp,SS+8-RIP+16
19580 + /*CFI_REL_OFFSET ss,SS-RIP+16*/
19581 + CFI_REL_OFFSET rsp,RSP-RIP+16
19582 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
19583 + /*CFI_REL_OFFSET cs,CS-RIP+16*/
19584 + CFI_REL_OFFSET rip,RIP-RIP+16
19585 + CFI_REL_OFFSET r11,8
19586 + CFI_REL_OFFSET rcx,0
19587 + movq 8(%rsp),%r11
19588 + CFI_RESTORE r11
19589 + popq %rcx
19590 + CFI_ADJUST_CFA_OFFSET -8
19591 + CFI_RESTORE rcx
19592 + movl %ebp,%ebp /* zero extension */
19593 + movl %eax,%eax
19594 + movl $__USER32_DS,40(%rsp)
19595 + movq %rbp,32(%rsp)
19596 + movl $__USER32_CS,16(%rsp)
19597 + movl $VSYSCALL32_SYSEXIT,8(%rsp)
19598 + movq %rax,(%rsp)
19599 + cld
19600 + SAVE_ARGS 0,0,0
19601 + /* no need to do an access_ok check here because rbp has been
19602 + 32bit zero extended */
19603 +1: movl (%rbp),%r9d
19604 + .section __ex_table,"a"
19605 + .quad 1b,ia32_badarg
19606 + .previous
19607 + GET_THREAD_INFO(%r10)
19608 + orl $TS_COMPAT,threadinfo_status(%r10)
19609 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
19610 + jnz sysenter_tracesys
19611 +sysenter_do_call:
19612 + cmpl $(IA32_NR_syscalls-1),%eax
19613 + ja ia32_badsys
19614 + IA32_ARG_FIXUP 1
19615 + call *ia32_sys_call_table(,%rax,8)
19616 + movq %rax,RAX-ARGOFFSET(%rsp)
19617 + jmp int_ret_from_sys_call
19618 +
19619 +sysenter_tracesys:
19620 + SAVE_REST
19621 + CLEAR_RREGS
19622 + movq $-ENOSYS,RAX(%rsp) /* really needed? */
19623 + movq %rsp,%rdi /* &pt_regs -> arg1 */
19624 + call syscall_trace_enter
19625 + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
19626 + RESTORE_REST
19627 + movl %ebp, %ebp
19628 + /* no need to do an access_ok check here because rbp has been
19629 + 32bit zero extended */
19630 +1: movl (%rbp),%r9d
19631 + .section __ex_table,"a"
19632 + .quad 1b,ia32_badarg
19633 + .previous
19634 + jmp sysenter_do_call
19635 + CFI_ENDPROC
19636 +ENDPROC(ia32_sysenter_target)
19637 +
19638 +/*
19639 + * 32bit SYSCALL instruction entry.
19640 + *
19641 + * Arguments:
19642 + * %eax System call number.
19643 + * %ebx Arg1
19644 + * %ecx return EIP
19645 + * %edx Arg3
19646 + * %esi Arg4
19647 + * %edi Arg5
19648 + * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
19649 + * %esp user stack
19650 + * 0(%esp) Arg6
19651 + *
19652 + * Interrupts on.
19653 + *
19654 + * This is purely a fast path. For anything complicated we use the int 0x80
19655 + * path below. Set up a complete hardware stack frame to share code
19656 + * with the int 0x80 path.
19657 + */
19658 +ENTRY(ia32_cstar_target)
19659 + CFI_STARTPROC32 simple
19660 + CFI_DEF_CFA rsp,SS+8-RIP+16
19661 + /*CFI_REL_OFFSET ss,SS-RIP+16*/
19662 + CFI_REL_OFFSET rsp,RSP-RIP+16
19663 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
19664 + /*CFI_REL_OFFSET cs,CS-RIP+16*/
19665 + CFI_REL_OFFSET rip,RIP-RIP+16
19666 + movl %eax,%eax /* zero extension */
19667 + movl RSP-RIP+16(%rsp),%r8d
19668 + SAVE_ARGS -8,1,1
19669 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
19670 + movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
19671 + movl %ebp,%ecx
19672 + movl $__USER32_CS,CS-ARGOFFSET(%rsp)
19673 + movl $__USER32_DS,SS-ARGOFFSET(%rsp)
19674 + /* no need to do an access_ok check here because r8 has been
19675 + 32bit zero extended */
19676 + /* hardware stack frame is complete now */
19677 +1: movl (%r8),%r9d
19678 + .section __ex_table,"a"
19679 + .quad 1b,ia32_badarg
19680 + .previous
19681 + GET_THREAD_INFO(%r10)
19682 + orl $TS_COMPAT,threadinfo_status(%r10)
19683 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
19684 + jnz cstar_tracesys
19685 +cstar_do_call:
19686 + cmpl $IA32_NR_syscalls-1,%eax
19687 + ja ia32_badsys
19688 + IA32_ARG_FIXUP 1
19689 + call *ia32_sys_call_table(,%rax,8)
19690 + movq %rax,RAX-ARGOFFSET(%rsp)
19691 + jmp int_ret_from_sys_call
19692 +
19693 +cstar_tracesys:
19694 + SAVE_REST
19695 + CLEAR_RREGS
19696 + movq $-ENOSYS,RAX(%rsp) /* really needed? */
19697 + movq %rsp,%rdi /* &pt_regs -> arg1 */
19698 + call syscall_trace_enter
19699 + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
19700 + RESTORE_REST
19701 + movl RSP-ARGOFFSET(%rsp), %r8d
19702 + /* no need to do an access_ok check here because r8 has been
19703 + 32bit zero extended */
19704 +1: movl (%r8),%r9d
19705 + .section __ex_table,"a"
19706 + .quad 1b,ia32_badarg
19707 + .previous
19708 + jmp cstar_do_call
19709 +END(ia32_cstar_target)
19710 +
19711 +ia32_badarg:
19712 + movq $-EFAULT,%rax
19713 + jmp ia32_sysret
19714 + CFI_ENDPROC
19715 +
19716 +/*
19717 + * Emulated IA32 system calls via int 0x80.
19718 + *
19719 + * Arguments:
19720 + * %eax System call number.
19721 + * %ebx Arg1
19722 + * %ecx Arg2
19723 + * %edx Arg3
19724 + * %esi Arg4
19725 + * %edi Arg5
19726 + * %ebp Arg6 [note: not saved in the stack frame, should not be touched]
19727 + *
19728 + * Notes:
19729 + * Uses the same stack frame as the x86-64 version.
19730 + * All registers except %eax must be saved (but ptrace may violate that)
19731 + * Arguments are zero extended. For system calls that want sign extension and
19732 + * take long arguments a wrapper is needed. Most calls can just be called
19733 + * directly.
19734 + * Assumes it is only called from user space and entered with interrupts on.
19735 + */
19736 +
19737 +ENTRY(ia32_syscall)
19738 + CFI_STARTPROC simple
19739 + CFI_DEF_CFA rsp,SS+8-RIP+16
19740 + /*CFI_REL_OFFSET ss,SS-RIP+16*/
19741 + CFI_REL_OFFSET rsp,RSP-RIP+16
19742 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
19743 + /*CFI_REL_OFFSET cs,CS-RIP+16*/
19744 + CFI_REL_OFFSET rip,RIP-RIP+16
19745 + CFI_REL_OFFSET r11,8
19746 + CFI_REL_OFFSET rcx,0
19747 + movq 8(%rsp),%r11
19748 + CFI_RESTORE r11
19749 + popq %rcx
19750 + CFI_ADJUST_CFA_OFFSET -8
19751 + CFI_RESTORE rcx
19752 + movl %eax,%eax
19753 + movq %rax,(%rsp)
19754 + cld
19755 + /* note the registers are not zero extended to the sf.
19756 + this could be a problem. */
19757 + SAVE_ARGS 0,0,1
19758 + GET_THREAD_INFO(%r10)
19759 + orl $TS_COMPAT,threadinfo_status(%r10)
19760 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
19761 + jnz ia32_tracesys
19762 +ia32_do_syscall:
19763 + cmpl $(IA32_NR_syscalls-1),%eax
19764 + ja ia32_badsys
19765 + IA32_ARG_FIXUP
19766 + call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
19767 +ia32_sysret:
19768 + movq %rax,RAX-ARGOFFSET(%rsp)
19769 + jmp int_ret_from_sys_call
19770 +
19771 +ia32_tracesys:
19772 + SAVE_REST
19773 + movq $-ENOSYS,RAX(%rsp) /* really needed? */
19774 + movq %rsp,%rdi /* &pt_regs -> arg1 */
19775 + call syscall_trace_enter
19776 + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
19777 + RESTORE_REST
19778 + jmp ia32_do_syscall
19779 +END(ia32_syscall)
19780 +
19781 +ia32_badsys:
19782 + movq $0,ORIG_RAX-ARGOFFSET(%rsp)
19783 + movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
19784 + jmp int_ret_from_sys_call
19785 +
19786 +quiet_ni_syscall:
19787 + movq $-ENOSYS,%rax
19788 + ret
19789 + CFI_ENDPROC
19790 +
19791 + .macro PTREGSCALL label, func, arg
19792 + .globl \label
19793 +\label:
19794 + leaq \func(%rip),%rax
19795 + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
19796 + jmp ia32_ptregs_common
19797 + .endm
19798 +
19799 + CFI_STARTPROC32
19800 +
19801 + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
19802 + PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
19803 + PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
19804 + PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
19805 + PTREGSCALL stub32_execve, sys32_execve, %rcx
19806 + PTREGSCALL stub32_fork, sys_fork, %rdi
19807 + PTREGSCALL stub32_clone, sys32_clone, %rdx
19808 + PTREGSCALL stub32_vfork, sys_vfork, %rdi
19809 + PTREGSCALL stub32_iopl, sys_iopl, %rsi
19810 + PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
19811 +
19812 +ENTRY(ia32_ptregs_common)
19813 + popq %r11
19814 + CFI_ENDPROC
19815 + CFI_STARTPROC32 simple
19816 + CFI_DEF_CFA rsp,SS+8-ARGOFFSET
19817 + CFI_REL_OFFSET rax,RAX-ARGOFFSET
19818 + CFI_REL_OFFSET rcx,RCX-ARGOFFSET
19819 + CFI_REL_OFFSET rdx,RDX-ARGOFFSET
19820 + CFI_REL_OFFSET rsi,RSI-ARGOFFSET
19821 + CFI_REL_OFFSET rdi,RDI-ARGOFFSET
19822 + CFI_REL_OFFSET rip,RIP-ARGOFFSET
19823 +/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/
19824 +/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
19825 + CFI_REL_OFFSET rsp,RSP-ARGOFFSET
19826 +/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
19827 + SAVE_REST
19828 + call *%rax
19829 + RESTORE_REST
19830 + jmp ia32_sysret /* misbalances the return cache */
19831 + CFI_ENDPROC
19832 +END(ia32_ptregs_common)
19833 +
19834 + .section .rodata,"a"
19835 + .align 8
19836 +ia32_sys_call_table:
19837 + .quad sys_restart_syscall
19838 + .quad sys_exit
19839 + .quad stub32_fork
19840 + .quad sys_read
19841 + .quad sys_write
19842 + .quad compat_sys_open /* 5 */
19843 + .quad sys_close
19844 + .quad sys32_waitpid
19845 + .quad sys_creat
19846 + .quad sys_link
19847 + .quad sys_unlink /* 10 */
19848 + .quad stub32_execve
19849 + .quad sys_chdir
19850 + .quad compat_sys_time
19851 + .quad sys_mknod
19852 + .quad sys_chmod /* 15 */
19853 + .quad sys_lchown16
19854 + .quad quiet_ni_syscall /* old break syscall holder */
19855 + .quad sys_stat
19856 + .quad sys32_lseek
19857 + .quad sys_getpid /* 20 */
19858 + .quad compat_sys_mount /* mount */
19859 + .quad sys_oldumount /* old_umount */
19860 + .quad sys_setuid16
19861 + .quad sys_getuid16
19862 + .quad compat_sys_stime /* stime */ /* 25 */
19863 + .quad sys32_ptrace /* ptrace */
19864 + .quad sys_alarm
19865 + .quad sys_fstat /* (old)fstat */
19866 + .quad sys_pause
19867 + .quad compat_sys_utime /* 30 */
19868 + .quad quiet_ni_syscall /* old stty syscall holder */
19869 + .quad quiet_ni_syscall /* old gtty syscall holder */
19870 + .quad sys_access
19871 + .quad sys_nice
19872 + .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */
19873 + .quad sys_sync
19874 + .quad sys32_kill
19875 + .quad sys_rename
19876 + .quad sys_mkdir
19877 + .quad sys_rmdir /* 40 */
19878 + .quad sys_dup
19879 + .quad sys32_pipe
19880 + .quad compat_sys_times
19881 + .quad quiet_ni_syscall /* old prof syscall holder */
19882 + .quad sys_brk /* 45 */
19883 + .quad sys_setgid16
19884 + .quad sys_getgid16
19885 + .quad sys_signal
19886 + .quad sys_geteuid16
19887 + .quad sys_getegid16 /* 50 */
19888 + .quad sys_acct
19889 + .quad sys_umount /* new_umount */
19890 + .quad quiet_ni_syscall /* old lock syscall holder */
19891 + .quad compat_sys_ioctl
19892 + .quad compat_sys_fcntl64 /* 55 */
19893 + .quad quiet_ni_syscall /* old mpx syscall holder */
19894 + .quad sys_setpgid
19895 + .quad quiet_ni_syscall /* old ulimit syscall holder */
19896 + .quad sys32_olduname
19897 + .quad sys_umask /* 60 */
19898 + .quad sys_chroot
19899 + .quad sys32_ustat
19900 + .quad sys_dup2
19901 + .quad sys_getppid
19902 + .quad sys_getpgrp /* 65 */
19903 + .quad sys_setsid
19904 + .quad sys32_sigaction
19905 + .quad sys_sgetmask
19906 + .quad sys_ssetmask
19907 + .quad sys_setreuid16 /* 70 */
19908 + .quad sys_setregid16
19909 + .quad stub32_sigsuspend
19910 + .quad compat_sys_sigpending
19911 + .quad sys_sethostname
19912 + .quad compat_sys_setrlimit /* 75 */
19913 + .quad compat_sys_old_getrlimit /* old_getrlimit */
19914 + .quad compat_sys_getrusage
19915 + .quad sys32_gettimeofday
19916 + .quad sys32_settimeofday
19917 + .quad sys_getgroups16 /* 80 */
19918 + .quad sys_setgroups16
19919 + .quad sys32_old_select
19920 + .quad sys_symlink
19921 + .quad sys_lstat
19922 + .quad sys_readlink /* 85 */
19923 +#ifdef CONFIG_IA32_AOUT
19924 + .quad sys_uselib
19925 +#else
19926 + .quad quiet_ni_syscall
19927 +#endif
19928 + .quad sys_swapon
19929 + .quad sys_reboot
19930 + .quad compat_sys_old_readdir
19931 + .quad sys32_mmap /* 90 */
19932 + .quad sys_munmap
19933 + .quad sys_truncate
19934 + .quad sys_ftruncate
19935 + .quad sys_fchmod
19936 + .quad sys_fchown16 /* 95 */
19937 + .quad sys_getpriority
19938 + .quad sys_setpriority
19939 + .quad quiet_ni_syscall /* old profil syscall holder */
19940 + .quad compat_sys_statfs
19941 + .quad compat_sys_fstatfs /* 100 */
19942 + .quad sys_ioperm
19943 + .quad compat_sys_socketcall
19944 + .quad sys_syslog
19945 + .quad compat_sys_setitimer
19946 + .quad compat_sys_getitimer /* 105 */
19947 + .quad compat_sys_newstat
19948 + .quad compat_sys_newlstat
19949 + .quad compat_sys_newfstat
19950 + .quad sys32_uname
19951 + .quad stub32_iopl /* 110 */
19952 + .quad sys_vhangup
19953 + .quad quiet_ni_syscall /* old "idle" system call */
19954 + .quad sys32_vm86_warning /* vm86old */
19955 + .quad compat_sys_wait4
19956 + .quad sys_swapoff /* 115 */
19957 + .quad sys32_sysinfo
19958 + .quad sys32_ipc
19959 + .quad sys_fsync
19960 + .quad stub32_sigreturn
19961 + .quad stub32_clone /* 120 */
19962 + .quad sys_setdomainname
19963 + .quad sys_uname
19964 + .quad sys_modify_ldt
19965 + .quad compat_sys_adjtimex
19966 + .quad sys32_mprotect /* 125 */
19967 + .quad compat_sys_sigprocmask
19968 + .quad quiet_ni_syscall /* create_module */
19969 + .quad sys_init_module
19970 + .quad sys_delete_module
19971 + .quad quiet_ni_syscall /* 130 get_kernel_syms */
19972 + .quad sys_quotactl
19973 + .quad sys_getpgid
19974 + .quad sys_fchdir
19975 + .quad quiet_ni_syscall /* bdflush */
19976 + .quad sys_sysfs /* 135 */
19977 + .quad sys_personality
19978 + .quad quiet_ni_syscall /* for afs_syscall */
19979 + .quad sys_setfsuid16
19980 + .quad sys_setfsgid16
19981 + .quad sys_llseek /* 140 */
19982 + .quad compat_sys_getdents
19983 + .quad compat_sys_select
19984 + .quad sys_flock
19985 + .quad sys_msync
19986 + .quad compat_sys_readv /* 145 */
19987 + .quad compat_sys_writev
19988 + .quad sys_getsid
19989 + .quad sys_fdatasync
19990 + .quad sys32_sysctl /* sysctl */
19991 + .quad sys_mlock /* 150 */
19992 + .quad sys_munlock
19993 + .quad sys_mlockall
19994 + .quad sys_munlockall
19995 + .quad sys_sched_setparam
19996 + .quad sys_sched_getparam /* 155 */
19997 + .quad sys_sched_setscheduler
19998 + .quad sys_sched_getscheduler
19999 + .quad sys_sched_yield
20000 + .quad sys_sched_get_priority_max
20001 + .quad sys_sched_get_priority_min /* 160 */
20002 + .quad sys_sched_rr_get_interval
20003 + .quad compat_sys_nanosleep
20004 + .quad sys_mremap
20005 + .quad sys_setresuid16
20006 + .quad sys_getresuid16 /* 165 */
20007 + .quad sys32_vm86_warning /* vm86 */
20008 + .quad quiet_ni_syscall /* query_module */
20009 + .quad sys_poll
20010 + .quad compat_sys_nfsservctl
20011 + .quad sys_setresgid16 /* 170 */
20012 + .quad sys_getresgid16
20013 + .quad sys_prctl
20014 + .quad stub32_rt_sigreturn
20015 + .quad sys32_rt_sigaction
20016 + .quad sys32_rt_sigprocmask /* 175 */
20017 + .quad sys32_rt_sigpending
20018 + .quad compat_sys_rt_sigtimedwait
20019 + .quad sys32_rt_sigqueueinfo
20020 + .quad stub32_rt_sigsuspend
20021 + .quad sys32_pread /* 180 */
20022 + .quad sys32_pwrite
20023 + .quad sys_chown16
20024 + .quad sys_getcwd
20025 + .quad sys_capget
20026 + .quad sys_capset
20027 + .quad stub32_sigaltstack
20028 + .quad sys32_sendfile
20029 + .quad quiet_ni_syscall /* streams1 */
20030 + .quad quiet_ni_syscall /* streams2 */
20031 + .quad stub32_vfork /* 190 */
20032 + .quad compat_sys_getrlimit
20033 + .quad sys32_mmap2
20034 + .quad sys32_truncate64
20035 + .quad sys32_ftruncate64
20036 + .quad sys32_stat64 /* 195 */
20037 + .quad sys32_lstat64
20038 + .quad sys32_fstat64
20039 + .quad sys_lchown
20040 + .quad sys_getuid
20041 + .quad sys_getgid /* 200 */
20042 + .quad sys_geteuid
20043 + .quad sys_getegid
20044 + .quad sys_setreuid
20045 + .quad sys_setregid
20046 + .quad sys_getgroups /* 205 */
20047 + .quad sys_setgroups
20048 + .quad sys_fchown
20049 + .quad sys_setresuid
20050 + .quad sys_getresuid
20051 + .quad sys_setresgid /* 210 */
20052 + .quad sys_getresgid
20053 + .quad sys_chown
20054 + .quad sys_setuid
20055 + .quad sys_setgid
20056 + .quad sys_setfsuid /* 215 */
20057 + .quad sys_setfsgid
20058 + .quad sys_pivot_root
20059 + .quad sys_mincore
20060 + .quad sys_madvise
20061 + .quad compat_sys_getdents64 /* 220 getdents64 */
20062 + .quad compat_sys_fcntl64
20063 + .quad quiet_ni_syscall /* tux */
20064 + .quad quiet_ni_syscall /* security */
20065 + .quad sys_gettid
20066 + .quad sys_readahead /* 225 */
20067 + .quad sys_setxattr
20068 + .quad sys_lsetxattr
20069 + .quad sys_fsetxattr
20070 + .quad sys_getxattr
20071 + .quad sys_lgetxattr /* 230 */
20072 + .quad sys_fgetxattr
20073 + .quad sys_listxattr
20074 + .quad sys_llistxattr
20075 + .quad sys_flistxattr
20076 + .quad sys_removexattr /* 235 */
20077 + .quad sys_lremovexattr
20078 + .quad sys_fremovexattr
20079 + .quad sys_tkill
20080 + .quad sys_sendfile64
20081 + .quad compat_sys_futex /* 240 */
20082 + .quad compat_sys_sched_setaffinity
20083 + .quad compat_sys_sched_getaffinity
20084 + .quad sys32_set_thread_area
20085 + .quad sys32_get_thread_area
20086 + .quad compat_sys_io_setup /* 245 */
20087 + .quad sys_io_destroy
20088 + .quad compat_sys_io_getevents
20089 + .quad compat_sys_io_submit
20090 + .quad sys_io_cancel
20091 + .quad sys_fadvise64 /* 250 */
20092 + .quad quiet_ni_syscall /* free_huge_pages */
20093 + .quad sys_exit_group
20094 + .quad sys32_lookup_dcookie
20095 + .quad sys_epoll_create
20096 + .quad sys_epoll_ctl /* 255 */
20097 + .quad sys_epoll_wait
20098 + .quad sys_remap_file_pages
20099 + .quad sys_set_tid_address
20100 + .quad compat_sys_timer_create
20101 + .quad compat_sys_timer_settime /* 260 */
20102 + .quad compat_sys_timer_gettime
20103 + .quad sys_timer_getoverrun
20104 + .quad sys_timer_delete
20105 + .quad compat_sys_clock_settime
20106 + .quad compat_sys_clock_gettime /* 265 */
20107 + .quad compat_sys_clock_getres
20108 + .quad compat_sys_clock_nanosleep
20109 + .quad compat_sys_statfs64
20110 + .quad compat_sys_fstatfs64
20111 + .quad sys_tgkill /* 270 */
20112 + .quad compat_sys_utimes
20113 + .quad sys32_fadvise64_64
20114 + .quad quiet_ni_syscall /* sys_vserver */
20115 + .quad sys_mbind
20116 + .quad compat_sys_get_mempolicy /* 275 */
20117 + .quad sys_set_mempolicy
20118 + .quad compat_sys_mq_open
20119 + .quad sys_mq_unlink
20120 + .quad compat_sys_mq_timedsend
20121 + .quad compat_sys_mq_timedreceive /* 280 */
20122 + .quad compat_sys_mq_notify
20123 + .quad compat_sys_mq_getsetattr
20124 + .quad compat_sys_kexec_load /* reserved for kexec */
20125 + .quad compat_sys_waitid
20126 + .quad quiet_ni_syscall /* 285: sys_altroot */
20127 + .quad sys_add_key
20128 + .quad sys_request_key
20129 + .quad sys_keyctl
20130 + .quad sys_ioprio_set
20131 + .quad sys_ioprio_get /* 290 */
20132 + .quad sys_inotify_init
20133 + .quad sys_inotify_add_watch
20134 + .quad sys_inotify_rm_watch
20135 + .quad sys_migrate_pages
20136 + .quad compat_sys_openat /* 295 */
20137 + .quad sys_mkdirat
20138 + .quad sys_mknodat
20139 + .quad sys_fchownat
20140 + .quad compat_sys_futimesat
20141 + .quad sys32_fstatat /* 300 */
20142 + .quad sys_unlinkat
20143 + .quad sys_renameat
20144 + .quad sys_linkat
20145 + .quad sys_symlinkat
20146 + .quad sys_readlinkat /* 305 */
20147 + .quad sys_fchmodat
20148 + .quad sys_faccessat
20149 + .quad quiet_ni_syscall /* pselect6 for now */
20150 + .quad quiet_ni_syscall /* ppoll for now */
20151 + .quad sys_unshare /* 310 */
20152 + .quad compat_sys_set_robust_list
20153 + .quad compat_sys_get_robust_list
20154 + .quad sys_splice
20155 + .quad sys_sync_file_range
20156 + .quad sys_tee
20157 + .quad compat_sys_vmsplice
20158 + .quad compat_sys_move_pages
20159 +ia32_syscall_end:
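The dispatch used by all three entry paths above ("cmpl $(IA32_NR_syscalls-1),%eax; ja ia32_badsys; call *ia32_sys_call_table(,%rax,8)") is a bounds-checked indirect call through the 8-byte-per-entry table that follows. A minimal stand-alone C sketch of the same pattern, for illustration only (it is not part of the patch; the handler names and table size here are invented):

/* Bounds-checked table dispatch, mirroring
 * "cmpl $(IA32_NR_syscalls-1),%eax; ja ia32_badsys;
 *  call *ia32_sys_call_table(,%rax,8)".
 * Handler names and table contents are invented for the example. */
#include <stdio.h>
#include <errno.h>

typedef long (*syscall_fn)(long, long, long);

static long demo_exit(long a, long b, long c)  { return 0; }
static long demo_read(long a, long b, long c)  { return b; }
static long demo_write(long a, long b, long c) { return c; }

static syscall_fn demo_call_table[] = { demo_exit, demo_read, demo_write };
#define DEMO_NR_syscalls (sizeof(demo_call_table) / sizeof(demo_call_table[0]))

static long demo_dispatch(unsigned long nr, long a, long b, long c)
{
	if (nr > DEMO_NR_syscalls - 1)		/* "ja ia32_badsys" */
		return -ENOSYS;
	return demo_call_table[nr](a, b, c);	/* "call *table(,%rax,8)" */
}

int main(void)
{
	printf("%ld %ld\n", demo_dispatch(2, 1, 2, 3), demo_dispatch(99, 0, 0, 0));
	return 0;
}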
20160 Index: head-2008-11-25/arch/x86/kernel/acpi/sleep_64-xen.c
20161 ===================================================================
20162 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
20163 +++ head-2008-11-25/arch/x86/kernel/acpi/sleep_64-xen.c 2008-04-15 09:29:41.000000000 +0200
20164 @@ -0,0 +1,146 @@
20165 +/*
20166 + * acpi.c - Architecture-Specific Low-Level ACPI Support
20167 + *
20168 + * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
20169 + * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
20170 + * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
20171 + * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
20172 + * Copyright (C) 2003 Pavel Machek, SuSE Labs
20173 + *
20174 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20175 + *
20176 + * This program is free software; you can redistribute it and/or modify
20177 + * it under the terms of the GNU General Public License as published by
20178 + * the Free Software Foundation; either version 2 of the License, or
20179 + * (at your option) any later version.
20180 + *
20181 + * This program is distributed in the hope that it will be useful,
20182 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
20183 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20184 + * GNU General Public License for more details.
20185 + *
20186 + * You should have received a copy of the GNU General Public License
20187 + * along with this program; if not, write to the Free Software
20188 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20189 + *
20190 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20191 + */
20192 +
20193 +#include <linux/kernel.h>
20194 +#include <linux/init.h>
20195 +#include <linux/types.h>
20196 +#include <linux/stddef.h>
20197 +#include <linux/slab.h>
20198 +#include <linux/pci.h>
20199 +#include <linux/bootmem.h>
20200 +#include <linux/acpi.h>
20201 +#include <linux/cpumask.h>
20202 +
20203 +#include <asm/mpspec.h>
20204 +#include <asm/io.h>
20205 +#include <asm/apic.h>
20206 +#include <asm/apicdef.h>
20207 +#include <asm/page.h>
20208 +#include <asm/pgtable.h>
20209 +#include <asm/pgalloc.h>
20210 +#include <asm/io_apic.h>
20211 +#include <asm/proto.h>
20212 +#include <asm/tlbflush.h>
20213 +
20214 +/* --------------------------------------------------------------------------
20215 + Low-Level Sleep Support
20216 + -------------------------------------------------------------------------- */
20217 +
20218 +#ifdef CONFIG_ACPI_SLEEP
20219 +
20220 +#ifndef CONFIG_ACPI_PV_SLEEP
20221 +/* address in low memory of the wakeup routine. */
20222 +unsigned long acpi_wakeup_address = 0;
20223 +unsigned long acpi_video_flags;
20224 +extern char wakeup_start, wakeup_end;
20225 +
20226 +extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
20227 +
20228 +static pgd_t low_ptr;
20229 +
20230 +static void init_low_mapping(void)
20231 +{
20232 + pgd_t *slot0 = pgd_offset(current->mm, 0UL);
20233 + low_ptr = *slot0;
20234 + set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET));
20235 + WARN_ON(num_online_cpus() != 1);
20236 + local_flush_tlb();
20237 +}
20238 +#endif
20239 +
20240 +/**
20241 + * acpi_save_state_mem - save kernel state
20242 + *
20243 + * Create an identity mapped page table and copy the wakeup routine to
20244 + * low memory.
20245 + */
20246 +int acpi_save_state_mem(void)
20247 +{
20248 +#ifndef CONFIG_ACPI_PV_SLEEP
20249 + init_low_mapping();
20250 +
20251 + memcpy((void *)acpi_wakeup_address, &wakeup_start,
20252 + &wakeup_end - &wakeup_start);
20253 + acpi_copy_wakeup_routine(acpi_wakeup_address);
20254 +#endif
20255 + return 0;
20256 +}
20257 +
20258 +/*
20259 + * acpi_restore_state
20260 + */
20261 +void acpi_restore_state_mem(void)
20262 +{
20263 +#ifndef CONFIG_ACPI_PV_SLEEP
20264 + set_pgd(pgd_offset(current->mm, 0UL), low_ptr);
20265 + local_flush_tlb();
20266 +#endif
20267 +}
20268 +
20269 +/**
20270 + * acpi_reserve_bootmem - do _very_ early ACPI initialisation
20271 + *
20272 + * We allocate a page in low memory for the wakeup
20273 + * routine for when we come back from a sleep state. The
20274 + * runtime allocator allows specification of <16M pages, but not
20275 + * <1M pages.
20276 + */
20277 +void __init acpi_reserve_bootmem(void)
20278 +{
20279 +#ifndef CONFIG_ACPI_PV_SLEEP
20280 + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
20281 + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE)
20282 + printk(KERN_CRIT
20283 + "ACPI: Wakeup code way too big, will crash on attempt to suspend\n");
20284 +#endif
20285 +}
20286 +
20287 +#ifndef CONFIG_ACPI_PV_SLEEP
20288 +static int __init acpi_sleep_setup(char *str)
20289 +{
20290 + while ((str != NULL) && (*str != '\0')) {
20291 + if (strncmp(str, "s3_bios", 7) == 0)
20292 + acpi_video_flags = 1;
20293 + if (strncmp(str, "s3_mode", 7) == 0)
20294 + acpi_video_flags |= 2;
20295 + str = strchr(str, ',');
20296 + if (str != NULL)
20297 + str += strspn(str, ", \t");
20298 + }
20299 +
20300 + return 1;
20301 +}
20302 +
20303 +__setup("acpi_sleep=", acpi_sleep_setup);
20304 +#endif /* CONFIG_ACPI_PV_SLEEP */
20305 +
20306 +#endif /*CONFIG_ACPI_SLEEP */
20307 +
20308 +void acpi_pci_link_exit(void)
20309 +{
20310 +}
20311 Index: head-2008-11-25/arch/x86/kernel/apic_64-xen.c
20312 ===================================================================
20313 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
20314 +++ head-2008-11-25/arch/x86/kernel/apic_64-xen.c 2007-06-12 13:13:01.000000000 +0200
20315 @@ -0,0 +1,197 @@
20316 +/*
20317 + * Local APIC handling, local APIC timers
20318 + *
20319 + * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
20320 + *
20321 + * Fixes
20322 + * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
20323 + * thanks to Eric Gilmore
20324 + * and Rolf G. Tews
20325 + * for testing these extensively.
20326 + * Maciej W. Rozycki : Various updates and fixes.
20327 + * Mikael Pettersson : Power Management for UP-APIC.
20328 + * Pavel Machek and
20329 + * Mikael Pettersson : PM converted to driver model.
20330 + */
20331 +
20332 +#include <linux/init.h>
20333 +
20334 +#include <linux/mm.h>
20335 +#include <linux/delay.h>
20336 +#include <linux/bootmem.h>
20337 +#include <linux/smp_lock.h>
20338 +#include <linux/interrupt.h>
20339 +#include <linux/mc146818rtc.h>
20340 +#include <linux/kernel_stat.h>
20341 +#include <linux/sysdev.h>
20342 +#include <linux/module.h>
20343 +
20344 +#include <asm/atomic.h>
20345 +#include <asm/smp.h>
20346 +#include <asm/mtrr.h>
20347 +#include <asm/mpspec.h>
20348 +#include <asm/desc.h>
20349 +#include <asm/arch_hooks.h>
20350 +#include <asm/hpet.h>
20351 +#include <asm/idle.h>
20352 +
20353 +int apic_verbosity;
20354 +
20355 +/*
20356 + * 'what should we do if we get a hw irq event on an illegal vector'.
20357 + * each architecture has to answer this themselves.
20358 + */
20359 +void ack_bad_irq(unsigned int irq)
20360 +{
20361 + printk("unexpected IRQ trap at vector %02x\n", irq);
20362 + /*
20363 + * Currently unexpected vectors happen only on SMP and APIC.
20364 + * We _must_ ack these because every local APIC has only N
20365 + * irq slots per priority level, and a 'hanging, unacked' IRQ
20366 + * holds up an irq slot - in excessive cases (when multiple
20367 + * unexpected vectors occur) that might lock up the APIC
20368 + * completely.
20369 + * But don't ack when the APIC is disabled. -AK
20370 + */
20371 + if (!disable_apic)
20372 + ack_APIC_irq();
20373 +}
20374 +
20375 +int setup_profiling_timer(unsigned int multiplier)
20376 +{
20377 + return -EINVAL;
20378 +}
20379 +
20380 +void smp_local_timer_interrupt(struct pt_regs *regs)
20381 +{
20382 + profile_tick(CPU_PROFILING, regs);
20383 +#ifndef CONFIG_XEN
20384 +#ifdef CONFIG_SMP
20385 + update_process_times(user_mode(regs));
20386 +#endif
20387 +#endif
20388 + /*
20389 + * We take the 'long' return path, and there every subsystem
20390 + * grabs the appropriate locks (kernel lock/ irq lock).
20391 + *
20392 + * we might want to decouple profiling from the 'long path',
20393 + * and do the profiling totally in assembly.
20394 + *
20395 + * Currently this isn't too much of an issue (performance wise),
20396 + * we can take more than 100K local irqs per second on a 100 MHz P5.
20397 + */
20398 +}
20399 +
20400 +/*
20401 + * Local APIC timer interrupt. This is the most natural way for doing
20402 + * local interrupts, but local timer interrupts can be emulated by
20403 + * broadcast interrupts too. [in case the hw doesn't support APIC timers]
20404 + *
20405 + * [ if a single-CPU system runs an SMP kernel then we call the local
20406 + * interrupt as well. Thus we cannot inline the local irq ... ]
20407 + */
20408 +void smp_apic_timer_interrupt(struct pt_regs *regs)
20409 +{
20410 + /*
20411 + * the NMI deadlock-detector uses this.
20412 + */
20413 + add_pda(apic_timer_irqs, 1);
20414 +
20415 + /*
20416 + * NOTE! We'd better ACK the irq immediately,
20417 + * because timer handling can be slow.
20418 + */
20419 + ack_APIC_irq();
20420 + /*
20421 + * update_process_times() expects us to have done irq_enter().
20422 + * Besides, if we don't timer interrupts ignore the global
20423 + * interrupt lock, which is the WrongThing (tm) to do.
20424 + */
20425 + exit_idle();
20426 + irq_enter();
20427 + smp_local_timer_interrupt(regs);
20428 + irq_exit();
20429 +}
20430 +
20431 +/*
20432 + * This interrupt should _never_ happen with our APIC/SMP architecture
20433 + */
20434 +asmlinkage void smp_spurious_interrupt(void)
20435 +{
20436 + unsigned int v;
20437 + exit_idle();
20438 + irq_enter();
20439 + /*
20440 + * Check if this really is a spurious interrupt and ACK it
20441 + * if it is a vectored one. Just in case...
20442 + * Spurious interrupts should not be ACKed.
20443 + */
20444 + v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
20445 + if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
20446 + ack_APIC_irq();
20447 +
20448 +#if 0
20449 + static unsigned long last_warning;
20450 + static unsigned long skipped;
20451 +
20452 + /* see sw-dev-man vol 3, chapter 7.4.13.5 */
20453 + if (time_before(last_warning+30*HZ,jiffies)) {
20454 + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n",
20455 + smp_processor_id(), skipped);
20456 + last_warning = jiffies;
20457 + skipped = 0;
20458 + } else {
20459 + skipped++;
20460 + }
20461 +#endif
20462 + irq_exit();
20463 +}
20464 +
20465 +/*
20466 + * This interrupt should never happen with our APIC/SMP architecture
20467 + */
20468 +
20469 +asmlinkage void smp_error_interrupt(void)
20470 +{
20471 + unsigned int v, v1;
20472 +
20473 + exit_idle();
20474 + irq_enter();
20475 + /* First tickle the hardware, only then report what went on. -- REW */
20476 + v = apic_read(APIC_ESR);
20477 + apic_write(APIC_ESR, 0);
20478 + v1 = apic_read(APIC_ESR);
20479 + ack_APIC_irq();
20480 + atomic_inc(&irq_err_count);
20481 +
20482 + /* Here is what the APIC error bits mean:
20483 + 0: Send CS error
20484 + 1: Receive CS error
20485 + 2: Send accept error
20486 + 3: Receive accept error
20487 + 4: Reserved
20488 + 5: Send illegal vector
20489 + 6: Received illegal vector
20490 + 7: Illegal register address
20491 + */
20492 + printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
20493 + smp_processor_id(), v , v1);
20494 + irq_exit();
20495 +}
20496 +
20497 +int disable_apic;
20498 +
20499 +/*
20500 + * This initializes the IO-APIC and APIC hardware if this is
20501 + * a UP kernel.
20502 + */
20503 +int __init APIC_init_uniprocessor (void)
20504 +{
20505 +#ifdef CONFIG_X86_IO_APIC
20506 + if (smp_found_config)
20507 + if (!skip_ioapic_setup && nr_ioapics)
20508 + setup_IO_APIC();
20509 +#endif
20510 +
20511 + return 1;
20512 +}
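The lookup in smp_spurious_interrupt() above, "apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1))", relies on the 256-bit ISR being spread over eight 32-bit registers spaced 0x10 bytes apart, so a vector's register offset is (vector / 32) * 0x10, which is what (vector & ~0x1f) >> 1 computes. A small stand-alone C sketch of that arithmetic (illustrative only; the sample vector value is invented):

/* Offset/bit arithmetic for the sparse APIC ISR layout used above. */
#include <stdio.h>

int main(void)
{
	unsigned int vector = 0xff;			/* example spurious vector */
	unsigned int offset = (vector & ~0x1fU) >> 1;	/* byte offset from APIC_ISR */
	unsigned int bit    = vector & 0x1fU;		/* bit within that register */

	printf("vector 0x%02x -> APIC_ISR+0x%02x, bit %u\n", vector, offset, bit);
	/* same result written the long way: */
	printf("check: 0x%02x, %u\n", (vector / 32) * 0x10, vector % 32);
	return 0;
}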
20513 Index: head-2008-11-25/arch/x86/kernel/e820_64-xen.c
20514 ===================================================================
20515 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
20516 +++ head-2008-11-25/arch/x86/kernel/e820_64-xen.c 2008-04-22 19:56:27.000000000 +0200
20517 @@ -0,0 +1,798 @@
20518 +/*
20519 + * Handle the memory map.
20520 + * The functions here do the job until bootmem takes over.
20521 + *
20522 + * Getting sanitize_e820_map() in sync with i386 version by applying change:
20523 + * - Provisions for empty E820 memory regions (reported by certain BIOSes).
20524 + * Alex Achenbach <xela@slit.de>, December 2002.
20525 + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
20526 + *
20527 + */
20528 +#include <linux/kernel.h>
20529 +#include <linux/types.h>
20530 +#include <linux/init.h>
20531 +#include <linux/bootmem.h>
20532 +#include <linux/ioport.h>
20533 +#include <linux/string.h>
20534 +#include <linux/kexec.h>
20535 +#include <linux/module.h>
20536 +
20537 +#include <asm/pgtable.h>
20538 +#include <asm/page.h>
20539 +#include <asm/e820.h>
20540 +#include <asm/proto.h>
20541 +#include <asm/bootsetup.h>
20542 +#include <asm/sections.h>
20543 +#include <xen/interface/memory.h>
20544 +
20545 +/*
20546 + * PFN of last memory page.
20547 + */
20548 +unsigned long end_pfn;
20549 +EXPORT_SYMBOL(end_pfn);
20550 +
20551 +/*
20552 + * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
20553 + * The direct mapping extends to end_pfn_map, so that we can directly access
20554 + * apertures, ACPI and other tables without having to play with fixmaps.
20555 + */
20556 +unsigned long end_pfn_map;
20557 +
20558 +/*
20559 + * Last pfn which the user wants to use.
20560 + */
20561 +unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;
20562 +
20563 +extern struct resource code_resource, data_resource;
20564 +
20565 +#ifdef CONFIG_XEN
20566 +extern struct e820map machine_e820;
20567 +#endif
20568 +
20569 +/* Check for some hardcoded bad areas that early boot is not allowed to touch */
20570 +static inline int bad_addr(unsigned long *addrp, unsigned long size)
20571 +{
20572 + unsigned long addr = *addrp, last = addr + size;
20573 +
20574 +#ifndef CONFIG_XEN
20575 + /* various gunk below that needed for SMP startup */
20576 + if (addr < 0x8000) {
20577 + *addrp = 0x8000;
20578 + return 1;
20579 + }
20580 +
20581 + /* direct mapping tables of the kernel */
20582 + if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
20583 + *addrp = table_end << PAGE_SHIFT;
20584 + return 1;
20585 + }
20586 +
20587 + /* initrd */
20588 +#ifdef CONFIG_BLK_DEV_INITRD
20589 + if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
20590 + addr < INITRD_START+INITRD_SIZE) {
20591 + *addrp = INITRD_START + INITRD_SIZE;
20592 + return 1;
20593 + }
20594 +#endif
20595 + /* kernel code + 640k memory hole (later should not be needed, but
20596 + be paranoid for now) */
20597 + if (last >= 640*1024 && addr < 1024*1024) {
20598 + *addrp = 1024*1024;
20599 + return 1;
20600 + }
20601 + if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) {
20602 + *addrp = __pa_symbol(&_end);
20603 + return 1;
20604 + }
20605 +
20606 + if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
20607 + *addrp = ebda_addr + ebda_size;
20608 + return 1;
20609 + }
20610 +
20611 + /* XXX ramdisk image here? */
20612 +#else
20613 + if (last < (table_end<<PAGE_SHIFT)) {
20614 + *addrp = table_end << PAGE_SHIFT;
20615 + return 1;
20616 + }
20617 +#endif
20618 + return 0;
20619 +}
20620 +
20621 +/*
20622 + * This function checks if any part of the range <start,end> is mapped
20623 + * with type.
20624 + */
20625 +int e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
20626 +{
20627 + int i;
20628 +
20629 +#ifndef CONFIG_XEN
20630 + for (i = 0; i < e820.nr_map; i++) {
20631 + struct e820entry *ei = &e820.map[i];
20632 +#else
20633 + if (!is_initial_xendomain())
20634 + return 0;
20635 + for (i = 0; i < machine_e820.nr_map; i++) {
20636 + const struct e820entry *ei = &machine_e820.map[i];
20637 +#endif
20638 +
20639 + if (type && ei->type != type)
20640 + continue;
20641 + if (ei->addr >= end || ei->addr + ei->size <= start)
20642 + continue;
20643 + return 1;
20644 + }
20645 + return 0;
20646 +}
20647 +EXPORT_SYMBOL_GPL(e820_any_mapped);
20648 +
20649 +/*
20650 + * This function checks if the entire range <start,end> is mapped with type.
20651 + *
20652 + * Note: this function only works correct if the e820 table is sorted and
20653 + * not-overlapping, which is the case
20654 + */
20655 +int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
20656 +{
20657 + int i;
20658 +
20659 +#ifndef CONFIG_XEN
20660 + for (i = 0; i < e820.nr_map; i++) {
20661 + struct e820entry *ei = &e820.map[i];
20662 +#else
20663 + if (!is_initial_xendomain())
20664 + return 0;
20665 + for (i = 0; i < machine_e820.nr_map; i++) {
20666 + const struct e820entry *ei = &machine_e820.map[i];
20667 +#endif
20668 +
20669 + if (type && ei->type != type)
20670 + continue;
20671 + /* is the region (part) in overlap with the current region ?*/
20672 + if (ei->addr >= end || ei->addr + ei->size <= start)
20673 + continue;
20674 +
20675 + /* if the region is at the beginning of <start,end> we move
20676 + * start to the end of the region since it's ok until there
20677 + */
20678 + if (ei->addr <= start)
20679 + start = ei->addr + ei->size;
20680 + /* if start is now at or beyond end, we're done, full coverage */
20681 + if (start >= end)
20682 + return 1; /* we're done */
20683 + }
20684 + return 0;
20685 +}
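The loop in e820_all_mapped() above works by advancing "start" past every entry that begins at or below it; full coverage is proven once "start" reaches "end". A stand-alone C sketch of that coverage test under the same sorted, non-overlapping assumption (illustrative only; the sample map and the omission of the type filter are simplifications, not part of the patch):

/* Coverage test in the style of e820_all_mapped(): advance "start"
 * through a sorted, non-overlapping range list.  Sample map invented. */
#include <stdio.h>

struct range { unsigned long addr, size; };

static int all_mapped(const struct range *map, int n,
		      unsigned long start, unsigned long end)
{
	int i;

	for (i = 0; i < n; i++) {
		if (map[i].addr >= end || map[i].addr + map[i].size <= start)
			continue;		/* no overlap with [start,end) */
		if (map[i].addr <= start)
			start = map[i].addr + map[i].size;
		if (start >= end)
			return 1;		/* fully covered */
	}
	return 0;
}

int main(void)
{
	static const struct range map[] = {
		{ 0x0,      0xa0000    },	/* 0 - 640k  */
		{ 0x100000, 0x3ff00000 },	/* 1MB - 1GB */
	};

	printf("%d\n", all_mapped(map, 2, 0x100000, 0x200000));	/* 1 */
	printf("%d\n", all_mapped(map, 2, 0x90000,  0x110000));	/* 0: 640k-1M hole */
	return 0;
}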
20686 +
20687 +/*
20688 + * Find a free area in a specific range.
20689 + */
20690 +unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
20691 +{
20692 + int i;
20693 + for (i = 0; i < e820.nr_map; i++) {
20694 + struct e820entry *ei = &e820.map[i];
20695 + unsigned long addr = ei->addr, last;
20696 + if (ei->type != E820_RAM)
20697 + continue;
20698 + if (addr < start)
20699 + addr = start;
20700 + if (addr > ei->addr + ei->size)
20701 + continue;
20702 + while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
20703 + ;
20704 + last = addr + size;
20705 + if (last > ei->addr + ei->size)
20706 + continue;
20707 + if (last > end)
20708 + continue;
20709 + return addr;
20710 + }
20711 + return -1UL;
20712 +}
20713 +
20714 +/*
20715 + * Free bootmem based on the e820 table for a node.
20716 + */
20717 +void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
20718 +{
20719 + int i;
20720 + for (i = 0; i < e820.nr_map; i++) {
20721 + struct e820entry *ei = &e820.map[i];
20722 + unsigned long last, addr;
20723 +
20724 + if (ei->type != E820_RAM ||
20725 + ei->addr+ei->size <= start ||
20726 + ei->addr >= end)
20727 + continue;
20728 +
20729 + addr = round_up(ei->addr, PAGE_SIZE);
20730 + if (addr < start)
20731 + addr = start;
20732 +
20733 + last = round_down(ei->addr + ei->size, PAGE_SIZE);
20734 + if (last >= end)
20735 + last = end;
20736 +
20737 + if (last > addr && last-addr >= PAGE_SIZE)
20738 + free_bootmem_node(pgdat, addr, last-addr);
20739 + }
20740 +}
20741 +
20742 +/*
20743 + * Find the highest page frame number we have available
20744 + */
20745 +unsigned long __init e820_end_of_ram(void)
20746 +{
20747 + int i;
20748 + unsigned long end_pfn = 0;
20749 +
20750 + for (i = 0; i < e820.nr_map; i++) {
20751 + struct e820entry *ei = &e820.map[i];
20752 + unsigned long start, end;
20753 +
20754 + start = round_up(ei->addr, PAGE_SIZE);
20755 + end = round_down(ei->addr + ei->size, PAGE_SIZE);
20756 + if (start >= end)
20757 + continue;
20758 + if (ei->type == E820_RAM) {
20759 + if (end > end_pfn<<PAGE_SHIFT)
20760 + end_pfn = end>>PAGE_SHIFT;
20761 + } else {
20762 + if (end > end_pfn_map<<PAGE_SHIFT)
20763 + end_pfn_map = end>>PAGE_SHIFT;
20764 + }
20765 + }
20766 +
20767 + if (end_pfn > end_pfn_map)
20768 + end_pfn_map = end_pfn;
20769 + if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
20770 + end_pfn_map = MAXMEM>>PAGE_SHIFT;
20771 + if (end_pfn > end_user_pfn)
20772 + end_pfn = end_user_pfn;
20773 + if (end_pfn > end_pfn_map)
20774 + end_pfn = end_pfn_map;
20775 +
20776 + return end_pfn;
20777 +}
20778 +
20779 +/*
20780 + * Compute how much memory is missing in a range.
20781 + * Unlike the other functions in this file the arguments are in page numbers.
20782 + */
20783 +unsigned long __init
20784 +e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
20785 +{
20786 + unsigned long ram = 0;
20787 + unsigned long start = start_pfn << PAGE_SHIFT;
20788 + unsigned long end = end_pfn << PAGE_SHIFT;
20789 + int i;
20790 + for (i = 0; i < e820.nr_map; i++) {
20791 + struct e820entry *ei = &e820.map[i];
20792 + unsigned long last, addr;
20793 +
20794 + if (ei->type != E820_RAM ||
20795 + ei->addr+ei->size <= start ||
20796 + ei->addr >= end)
20797 + continue;
20798 +
20799 + addr = round_up(ei->addr, PAGE_SIZE);
20800 + if (addr < start)
20801 + addr = start;
20802 +
20803 + last = round_down(ei->addr + ei->size, PAGE_SIZE);
20804 + if (last >= end)
20805 + last = end;
20806 +
20807 + if (last > addr)
20808 + ram += last - addr;
20809 + }
20810 + return ((end - start) - ram) >> PAGE_SHIFT;
20811 +}
20812 +
20813 +/*
20814 + * Mark e820 reserved areas as busy for the resource manager.
20815 + */
20816 +void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
20817 +{
20818 + int i;
20819 + for (i = 0; i < nr_map; i++) {
20820 + struct resource *res;
20821 + res = alloc_bootmem_low(sizeof(struct resource));
20822 + switch (e820[i].type) {
20823 + case E820_RAM: res->name = "System RAM"; break;
20824 + case E820_ACPI: res->name = "ACPI Tables"; break;
20825 + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
20826 + default: res->name = "reserved";
20827 + }
20828 + res->start = e820[i].addr;
20829 + res->end = res->start + e820[i].size - 1;
20830 + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
20831 + request_resource(&iomem_resource, res);
20832 + if (e820[i].type == E820_RAM) {
20833 + /*
20834 + * We don't know which RAM region contains kernel data,
20835 + * so we try it repeatedly and let the resource manager
20836 + * test it.
20837 + */
20838 +#ifndef CONFIG_XEN
20839 + request_resource(res, &code_resource);
20840 + request_resource(res, &data_resource);
20841 +#endif
20842 +#ifdef CONFIG_KEXEC
20843 + if (crashk_res.start != crashk_res.end)
20844 + request_resource(res, &crashk_res);
20845 +#ifdef CONFIG_XEN
20846 + xen_machine_kexec_register_resources(res);
20847 +#endif
20848 +#endif
20849 + }
20850 + }
20851 +}
20852 +
20853 +/*
20854 + * Add a memory region to the kernel e820 map.
20855 + */
20856 +void __init add_memory_region(unsigned long start, unsigned long size, int type)
20857 +{
20858 + int x = e820.nr_map;
20859 +
20860 + if (x == E820MAX) {
20861 + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
20862 + return;
20863 + }
20864 +
20865 + e820.map[x].addr = start;
20866 + e820.map[x].size = size;
20867 + e820.map[x].type = type;
20868 + e820.nr_map++;
20869 +}
20870 +
20871 +void __init e820_print_map(char *who)
20872 +{
20873 + int i;
20874 +
20875 + for (i = 0; i < e820.nr_map; i++) {
20876 + printk(" %s: %016Lx - %016Lx ", who,
20877 + (unsigned long long) e820.map[i].addr,
20878 + (unsigned long long) (e820.map[i].addr + e820.map[i].size));
20879 + switch (e820.map[i].type) {
20880 + case E820_RAM: printk("(usable)\n");
20881 + break;
20882 + case E820_RESERVED:
20883 + printk("(reserved)\n");
20884 + break;
20885 + case E820_ACPI:
20886 + printk("(ACPI data)\n");
20887 + break;
20888 + case E820_NVS:
20889 + printk("(ACPI NVS)\n");
20890 + break;
20891 + default: printk("type %u\n", e820.map[i].type);
20892 + break;
20893 + }
20894 + }
20895 +}
20896 +
20897 +/*
20898 + * Sanitize the BIOS e820 map.
20899 + *
20900 + * Some e820 responses include overlapping entries. The following
20901 + * replaces the original e820 map with a new one, removing overlaps.
20902 + *
20903 + */
20904 +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
20905 +{
20906 + struct change_member {
20907 + struct e820entry *pbios; /* pointer to original bios entry */
20908 + unsigned long long addr; /* address for this change point */
20909 + };
20910 + static struct change_member change_point_list[2*E820MAX] __initdata;
20911 + static struct change_member *change_point[2*E820MAX] __initdata;
20912 + static struct e820entry *overlap_list[E820MAX] __initdata;
20913 + static struct e820entry new_bios[E820MAX] __initdata;
20914 + struct change_member *change_tmp;
20915 + unsigned long current_type, last_type;
20916 + unsigned long long last_addr;
20917 + int chgidx, still_changing;
20918 + int overlap_entries;
20919 + int new_bios_entry;
20920 + int old_nr, new_nr, chg_nr;
20921 + int i;
20922 +
20923 + /*
20924 + Visually we're performing the following (1,2,3,4 = memory types)...
20925 +
20926 + Sample memory map (w/overlaps):
20927 + ____22__________________
20928 + ______________________4_
20929 + ____1111________________
20930 + _44_____________________
20931 + 11111111________________
20932 + ____________________33__
20933 + ___________44___________
20934 + __________33333_________
20935 + ______________22________
20936 + ___________________2222_
20937 + _________111111111______
20938 + _____________________11_
20939 + _________________4______
20940 +
20941 + Sanitized equivalent (no overlap):
20942 + 1_______________________
20943 + _44_____________________
20944 + ___1____________________
20945 + ____22__________________
20946 + ______11________________
20947 + _________1______________
20948 + __________3_____________
20949 + ___________44___________
20950 + _____________33_________
20951 + _______________2________
20952 + ________________1_______
20953 + _________________4______
20954 + ___________________2____
20955 + ____________________33__
20956 + ______________________4_
20957 + */
20958 +
20959 + /* if there's only one memory region, don't bother */
20960 + if (*pnr_map < 2)
20961 + return -1;
20962 +
20963 + old_nr = *pnr_map;
20964 +
20965 + /* bail out if we find any unreasonable addresses in bios map */
20966 + for (i=0; i<old_nr; i++)
20967 + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
20968 + return -1;
20969 +
20970 + /* create pointers for initial change-point information (for sorting) */
20971 + for (i=0; i < 2*old_nr; i++)
20972 + change_point[i] = &change_point_list[i];
20973 +
20974 + /* record all known change-points (starting and ending addresses),
20975 + omitting those that are for empty memory regions */
20976 + chgidx = 0;
20977 + for (i=0; i < old_nr; i++) {
20978 + if (biosmap[i].size != 0) {
20979 + change_point[chgidx]->addr = biosmap[i].addr;
20980 + change_point[chgidx++]->pbios = &biosmap[i];
20981 + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
20982 + change_point[chgidx++]->pbios = &biosmap[i];
20983 + }
20984 + }
20985 + chg_nr = chgidx;
20986 +
20987 + /* sort change-point list by memory addresses (low -> high) */
20988 + still_changing = 1;
20989 + while (still_changing) {
20990 + still_changing = 0;
20991 + for (i=1; i < chg_nr; i++) {
20992 + /* if <current_addr> > <last_addr>, swap */
20993 + /* or, if current=<start_addr> & last=<end_addr>, swap */
20994 + if ((change_point[i]->addr < change_point[i-1]->addr) ||
20995 + ((change_point[i]->addr == change_point[i-1]->addr) &&
20996 + (change_point[i]->addr == change_point[i]->pbios->addr) &&
20997 + (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
20998 + )
20999 + {
21000 + change_tmp = change_point[i];
21001 + change_point[i] = change_point[i-1];
21002 + change_point[i-1] = change_tmp;
21003 + still_changing=1;
21004 + }
21005 + }
21006 + }
21007 +
21008 + /* create a new bios memory map, removing overlaps */
21009 + overlap_entries=0; /* number of entries in the overlap table */
21010 + new_bios_entry=0; /* index for creating new bios map entries */
21011 + last_type = 0; /* start with undefined memory type */
21012 + last_addr = 0; /* start with 0 as last starting address */
21013 + /* loop through change-points, determining affect on the new bios map */
21014 + for (chgidx=0; chgidx < chg_nr; chgidx++)
21015 + {
21016 + /* keep track of all overlapping bios entries */
21017 + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
21018 + {
21019 + /* add map entry to overlap list (> 1 entry implies an overlap) */
21020 + overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
21021 + }
21022 + else
21023 + {
21024 + /* remove entry from list (order independent, so swap with last) */
21025 + for (i=0; i<overlap_entries; i++)
21026 + {
21027 + if (overlap_list[i] == change_point[chgidx]->pbios)
21028 + overlap_list[i] = overlap_list[overlap_entries-1];
21029 + }
21030 + overlap_entries--;
21031 + }
21032 + /* if there are overlapping entries, decide which "type" to use */
21033 + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
21034 + current_type = 0;
21035 + for (i=0; i<overlap_entries; i++)
21036 + if (overlap_list[i]->type > current_type)
21037 + current_type = overlap_list[i]->type;
21038 + /* continue building up new bios map based on this information */
21039 + if (current_type != last_type) {
21040 + if (last_type != 0) {
21041 + new_bios[new_bios_entry].size =
21042 + change_point[chgidx]->addr - last_addr;
21043 + /* move forward only if the new size was non-zero */
21044 + if (new_bios[new_bios_entry].size != 0)
21045 + if (++new_bios_entry >= E820MAX)
21046 + break; /* no more space left for new bios entries */
21047 + }
21048 + if (current_type != 0) {
21049 + new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
21050 + new_bios[new_bios_entry].type = current_type;
21051 + last_addr=change_point[chgidx]->addr;
21052 + }
21053 + last_type = current_type;
21054 + }
21055 + }
21056 + new_nr = new_bios_entry; /* retain count for new bios entries */
21057 +
21058 + /* copy new bios mapping into original location */
21059 + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
21060 + *pnr_map = new_nr;
21061 +
21062 + return 0;
21063 +}
21064 +
21065 +/*
21066 + * Copy the BIOS e820 map into a safe place.
21067 + *
21068 + * Sanity-check it while we're at it..
21069 + *
21070 + * If we're lucky and live on a modern system, the setup code
21071 + * will have given us a memory map that we can use to properly
21072 + * set up memory. If we aren't, we'll fake a memory map.
21073 + *
21074 + * We check to see that the memory map contains at least 2 elements
21075 + * before we'll use it, because the detection code in setup.S may
21076 + * not be perfect and most every PC known to man has two memory
21077 + * regions: one from 0 to 640k, and one from 1mb up. (The IBM
21078 + * thinkpad 560x, for example, does not cooperate with the memory
21079 + * detection code.)
21080 + */
21081 +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
21082 +{
21083 +#ifndef CONFIG_XEN
21084 + /* Only one memory region (or negative)? Ignore it */
21085 + if (nr_map < 2)
21086 + return -1;
21087 +#else
21088 + BUG_ON(nr_map < 1);
21089 +#endif
21090 +
21091 + do {
21092 + unsigned long start = biosmap->addr;
21093 + unsigned long size = biosmap->size;
21094 + unsigned long end = start + size;
21095 + unsigned long type = biosmap->type;
21096 +
21097 + /* Overflow in 64 bits? Ignore the memory map. */
21098 + if (start > end)
21099 + return -1;
21100 +
21101 +#ifndef CONFIG_XEN
21102 + /*
21103 + * Some BIOSes claim RAM in the 640k - 1M region.
21104 + * Not right. Fix it up.
21105 + *
21106 + * This should be removed on Hammer which is supposed to not
21107 + * have non e820 covered ISA mappings there, but I had some strange
21108 + * problems so it stays for now. -AK
21109 + */
21110 + if (type == E820_RAM) {
21111 + if (start < 0x100000ULL && end > 0xA0000ULL) {
21112 + if (start < 0xA0000ULL)
21113 + add_memory_region(start, 0xA0000ULL-start, type);
21114 + if (end <= 0x100000ULL)
21115 + continue;
21116 + start = 0x100000ULL;
21117 + size = end - start;
21118 + }
21119 + }
21120 +#endif
21121 +
21122 + add_memory_region(start, size, type);
21123 + } while (biosmap++,--nr_map);
21124 +
21125 +#ifdef CONFIG_XEN
21126 + if (is_initial_xendomain()) {
21127 + struct xen_memory_map memmap;
21128 +
21129 + memmap.nr_entries = E820MAX;
21130 + set_xen_guest_handle(memmap.buffer, machine_e820.map);
21131 +
21132 + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
21133 + BUG();
21134 + machine_e820.nr_map = memmap.nr_entries;
21135 + } else
21136 + machine_e820 = e820;
21137 +#endif
21138 +
21139 + return 0;
21140 +}
21141 +
21142 +#ifndef CONFIG_XEN
21143 +void __init setup_memory_region(void)
21144 +{
21145 + char *who = "BIOS-e820";
21146 +
21147 + /*
21148 + * Try to copy the BIOS-supplied E820-map.
21149 + *
21150 + * Otherwise fake a memory map; one section from 0k->640k,
21151 + * the next section from 1mb->appropriate_mem_k
21152 + */
21153 + sanitize_e820_map(E820_MAP, &E820_MAP_NR);
21154 + if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
21155 + unsigned long mem_size;
21156 +
21157 + /* compare results from other methods and take the greater */
21158 + if (ALT_MEM_K < EXT_MEM_K) {
21159 + mem_size = EXT_MEM_K;
21160 + who = "BIOS-88";
21161 + } else {
21162 + mem_size = ALT_MEM_K;
21163 + who = "BIOS-e801";
21164 + }
21165 +
21166 + e820.nr_map = 0;
21167 + add_memory_region(0, LOWMEMSIZE(), E820_RAM);
21168 + add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
21169 + }
21170 + printk(KERN_INFO "BIOS-provided physical RAM map:\n");
21171 + e820_print_map(who);
21172 +}
21173 +
21174 +#else /* CONFIG_XEN */
21175 +
21176 +void __init setup_memory_region(void)
21177 +{
21178 + int rc;
21179 + struct xen_memory_map memmap;
21180 + /*
21181 + * This is rather large for a stack variable but this early in
21182 + * the boot process we know we have plenty slack space.
21183 + */
21184 + struct e820entry map[E820MAX];
21185 +
21186 + memmap.nr_entries = E820MAX;
21187 + set_xen_guest_handle(memmap.buffer, map);
21188 +
21189 + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
21190 + if ( rc == -ENOSYS ) {
21191 + memmap.nr_entries = 1;
21192 + map[0].addr = 0ULL;
21193 + map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
21194 + /* 8MB slack (to balance backend allocations). */
21195 + map[0].size += 8 << 20;
21196 + map[0].type = E820_RAM;
21197 + rc = 0;
21198 + }
21199 + BUG_ON(rc);
21200 +
21201 + sanitize_e820_map(map, (char *)&memmap.nr_entries);
21202 +
21203 + BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
21204 +
21205 + printk(KERN_INFO "BIOS-provided physical RAM map:\n");
21206 + e820_print_map("Xen");
21207 +}
21208 +#endif
21209 +
21210 +void __init parse_memopt(char *p, char **from)
21211 +{
21212 + int i;
21213 + unsigned long current_end;
21214 + unsigned long end;
21215 +
21216 + end_user_pfn = memparse(p, from);
21217 + end_user_pfn >>= PAGE_SHIFT;
21218 +
21219 + end = end_user_pfn<<PAGE_SHIFT;
21220 + i = e820.nr_map-1;
21221 + current_end = e820.map[i].addr + e820.map[i].size;
21222 +
21223 + if (current_end < end) {
21224 + /*
21225 + * The e820 map ends before our requested size so
21226 + * extend the final entry to the requested address.
21227 + */
21228 + if (e820.map[i].type == E820_RAM)
21229 + e820.map[i].size = end - e820.map[i].addr;
21230 + else
21231 + add_memory_region(current_end, end - current_end, E820_RAM);
21232 + }
21233 +}
21234 +
21235 +void __init parse_memmapopt(char *p, char **from)
21236 +{
21237 + unsigned long long start_at, mem_size;
21238 +
21239 + mem_size = memparse(p, from);
21240 + p = *from;
21241 + if (*p == '@') {
21242 + start_at = memparse(p+1, from);
21243 + add_memory_region(start_at, mem_size, E820_RAM);
21244 + } else if (*p == '#') {
21245 + start_at = memparse(p+1, from);
21246 + add_memory_region(start_at, mem_size, E820_ACPI);
21247 + } else if (*p == '$') {
21248 + start_at = memparse(p+1, from);
21249 + add_memory_region(start_at, mem_size, E820_RESERVED);
21250 + } else {
21251 + end_user_pfn = (mem_size >> PAGE_SHIFT);
21252 + }
21253 + p = *from;
21254 +}
21255 +
21256 +unsigned long pci_mem_start = 0xaeedbabe;
21257 +EXPORT_SYMBOL(pci_mem_start);
21258 +
21259 +/*
21260 + * Search for the biggest gap in the low 32 bits of the e820
21261 + * memory space. We pass this space to PCI to assign MMIO resources
21262 + * for hotplug or unconfigured devices in.
21263 + * Hopefully the BIOS let enough space left.
21264 + */
21265 +__init void e820_setup_gap(struct e820entry *e820, int nr_map)
21266 +{
21267 + unsigned long gapstart, gapsize, round;
21268 + unsigned long last;
21269 + int i;
21270 + int found = 0;
21271 +
21272 + last = 0x100000000ull;
21273 + gapstart = 0x10000000;
21274 + gapsize = 0x400000;
21275 + i = nr_map;
21276 + while (--i >= 0) {
21277 + unsigned long long start = e820[i].addr;
21278 + unsigned long long end = start + e820[i].size;
21279 +
21280 + /*
21281 + * Since "last" is at most 4GB, we know we'll
21282 + * fit in 32 bits if this condition is true
21283 + */
21284 + if (last > end) {
21285 + unsigned long gap = last - end;
21286 +
21287 + if (gap > gapsize) {
21288 + gapsize = gap;
21289 + gapstart = end;
21290 + found = 1;
21291 + }
21292 + }
21293 + if (start < last)
21294 + last = start;
21295 + }
21296 +
21297 + if (!found) {
21298 + gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
21299 + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
21300 + KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
21301 + }
21302 +
21303 + /*
21304 + * See how much we want to round up: start off with
21305 + * rounding to the next 1MB area.
21306 + */
21307 + round = 0x100000;
21308 + while ((gapsize >> 4) > round)
21309 + round += round;
21310 + /* Fun with two's complement */
21311 + pci_mem_start = (gapstart + round) & -round;
21312 +
21313 + printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
21314 + pci_mem_start, gapstart, gapsize);
21315 +}
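The "fun with two's complement" at the end of e820_setup_gap() picks "round" as the smallest power of two (at least 1MB) not smaller than gapsize/16, then aligns gapstart up with (gapstart + round) & -round, using -round == ~(round - 1). A stand-alone C sketch of that rounding (illustrative only; the gap values are invented):

/* Rounding as done at the end of e820_setup_gap() above. */
#include <stdio.h>

int main(void)
{
	unsigned long gapstart = 0xcff00000UL;	/* invented example gap */
	unsigned long gapsize  = 0x30000000UL;
	unsigned long round    = 0x100000UL;	/* start at 1MB */

	while ((gapsize >> 4) > round)
		round += round;			/* double until >= gapsize/16 */

	printf("round         = 0x%lx\n", round);
	printf("pci_mem_start = 0x%lx\n", (gapstart + round) & -round);
	return 0;
}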
21316 Index: head-2008-11-25/arch/x86/kernel/early_printk-xen.c
21317 ===================================================================
21318 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
21319 +++ head-2008-11-25/arch/x86/kernel/early_printk-xen.c 2007-06-12 13:13:01.000000000 +0200
21320 @@ -0,0 +1,302 @@
21321 +#include <linux/console.h>
21322 +#include <linux/kernel.h>
21323 +#include <linux/init.h>
21324 +#include <linux/string.h>
21325 +#include <linux/screen_info.h>
21326 +#include <asm/io.h>
21327 +#include <asm/processor.h>
21328 +#include <asm/fcntl.h>
21329 +
21330 +/* Simple VGA output */
21331 +
21332 +#ifdef __i386__
21333 +#include <asm/setup.h>
21334 +#define VGABASE (__ISA_IO_base + 0xb8000)
21335 +#else
21336 +#include <asm/bootsetup.h>
21337 +#define VGABASE ((void __iomem *)0xffffffff800b8000UL)
21338 +#endif
21339 +
21340 +#ifndef CONFIG_XEN
21341 +static int max_ypos = 25, max_xpos = 80;
21342 +static int current_ypos = 25, current_xpos = 0;
21343 +
21344 +static void early_vga_write(struct console *con, const char *str, unsigned n)
21345 +{
21346 + char c;
21347 + int i, k, j;
21348 +
21349 + while ((c = *str++) != '\0' && n-- > 0) {
21350 + if (current_ypos >= max_ypos) {
21351 + /* scroll 1 line up */
21352 + for (k = 1, j = 0; k < max_ypos; k++, j++) {
21353 + for (i = 0; i < max_xpos; i++) {
21354 + writew(readw(VGABASE+2*(max_xpos*k+i)),
21355 + VGABASE + 2*(max_xpos*j + i));
21356 + }
21357 + }
21358 + for (i = 0; i < max_xpos; i++)
21359 + writew(0x720, VGABASE + 2*(max_xpos*j + i));
21360 + current_ypos = max_ypos-1;
21361 + }
21362 + if (c == '\n') {
21363 + current_xpos = 0;
21364 + current_ypos++;
21365 + } else if (c != '\r') {
21366 + writew(((0x7 << 8) | (unsigned short) c),
21367 + VGABASE + 2*(max_xpos*current_ypos +
21368 + current_xpos++));
21369 + if (current_xpos >= max_xpos) {
21370 + current_xpos = 0;
21371 + current_ypos++;
21372 + }
21373 + }
21374 + }
21375 +}
21376 +
21377 +static struct console early_vga_console = {
21378 + .name = "earlyvga",
21379 + .write = early_vga_write,
21380 + .flags = CON_PRINTBUFFER,
21381 + .index = -1,
21382 +};
21383 +
21384 +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
21385 +
21386 +static int early_serial_base = 0x3f8; /* ttyS0 */
21387 +
21388 +#define XMTRDY 0x20
21389 +
21390 +#define DLAB 0x80
21391 +
21392 +#define TXR 0 /* Transmit register (WRITE) */
21393 +#define RXR 0 /* Receive register (READ) */
21394 +#define IER 1 /* Interrupt Enable */
21395 +#define IIR 2 /* Interrupt ID */
21396 +#define FCR 2 /* FIFO control */
21397 +#define LCR 3 /* Line control */
21398 +#define MCR 4 /* Modem control */
21399 +#define LSR 5 /* Line Status */
21400 +#define MSR 6 /* Modem Status */
21401 +#define DLL 0 /* Divisor Latch Low */
21402 +#define DLH 1 /* Divisor latch High */
21403 +
21404 +static int early_serial_putc(unsigned char ch)
21405 +{
21406 + unsigned timeout = 0xffff;
21407 + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
21408 + cpu_relax();
21409 + outb(ch, early_serial_base + TXR);
21410 + return timeout ? 0 : -1;
21411 +}
21412 +
21413 +static void early_serial_write(struct console *con, const char *s, unsigned n)
21414 +{
21415 + while (*s && n-- > 0) {
21416 + early_serial_putc(*s);
21417 + if (*s == '\n')
21418 + early_serial_putc('\r');
21419 + s++;
21420 + }
21421 +}
21422 +
21423 +#define DEFAULT_BAUD 9600
21424 +
21425 +static __init void early_serial_init(char *s)
21426 +{
21427 + unsigned char c;
21428 + unsigned divisor;
21429 + unsigned baud = DEFAULT_BAUD;
21430 + char *e;
21431 +
21432 + if (*s == ',')
21433 + ++s;
21434 +
21435 + if (*s) {
21436 + unsigned port;
21437 + if (!strncmp(s,"0x",2)) {
21438 + early_serial_base = simple_strtoul(s, &e, 16);
21439 + } else {
21440 + static int bases[] = { 0x3f8, 0x2f8 };
21441 +
21442 + if (!strncmp(s,"ttyS",4))
21443 + s += 4;
21444 + port = simple_strtoul(s, &e, 10);
21445 + if (port > 1 || s == e)
21446 + port = 0;
21447 + early_serial_base = bases[port];
21448 + }
21449 + s += strcspn(s, ",");
21450 + if (*s == ',')
21451 + s++;
21452 + }
21453 +
21454 + outb(0x3, early_serial_base + LCR); /* 8n1 */
21455 + outb(0, early_serial_base + IER); /* no interrupt */
21456 + outb(0, early_serial_base + FCR); /* no fifo */
21457 + outb(0x3, early_serial_base + MCR); /* DTR + RTS */
21458 +
21459 + if (*s) {
21460 + baud = simple_strtoul(s, &e, 0);
21461 + if (baud == 0 || s == e)
21462 + baud = DEFAULT_BAUD;
21463 + }
21464 +
21465 + divisor = 115200 / baud;
21466 + c = inb(early_serial_base + LCR);
21467 + outb(c | DLAB, early_serial_base + LCR);
21468 + outb(divisor & 0xff, early_serial_base + DLL);
21469 + outb((divisor >> 8) & 0xff, early_serial_base + DLH);
21470 + outb(c & ~DLAB, early_serial_base + LCR);
21471 +}
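For reference, the divisor programmed into DLL/DLH above is simply 115200 / baud; a minimal user-space sketch of that arithmetic (the printed values are illustrative, nothing here touches real hardware):

#include <stdio.h>

/* A 16550-style UART divides a 115200 Hz base rate by the divisor written
 * to DLL (low byte) and DLH (high byte) while DLAB is set, exactly as
 * early_serial_init() does above. */
int main(void)
{
	unsigned bauds[] = { 9600, 38400, 115200 };

	for (unsigned i = 0; i < 3; i++) {
		unsigned divisor = 115200 / bauds[i];

		printf("baud %6u -> divisor %3u (DLL=0x%02x, DLH=0x%02x)\n",
		       bauds[i], divisor, divisor & 0xff, (divisor >> 8) & 0xff);
	}
	return 0;
}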
21472 +
21473 +#else /* CONFIG_XEN */
21474 +
21475 +static void
21476 +early_serial_write(struct console *con, const char *s, unsigned count)
21477 +{
21478 + int n;
21479 +
21480 + while (count > 0) {
21481 + n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s);
21482 + if (n <= 0)
21483 + break;
21484 + count -= n;
21485 + s += n;
21486 + }
21487 +}
21488 +
21489 +static __init void early_serial_init(char *s)
21490 +{
21491 +}
21492 +
21493 +/*
21494 + * No early VGA console on Xen, as we do not have convenient ISA-space
21495 + * mappings. Someone should fix this for domain 0. For now, use fake serial.
21496 + */
21497 +#define early_vga_console early_serial_console
21498 +
21499 +#endif
21500 +
21501 +static struct console early_serial_console = {
21502 + .name = "earlyser",
21503 + .write = early_serial_write,
21504 + .flags = CON_PRINTBUFFER,
21505 + .index = -1,
21506 +};
21507 +
21508 +/* Console interface to a host file on AMD's SimNow! */
21509 +
21510 +static int simnow_fd;
21511 +
21512 +enum {
21513 + MAGIC1 = 0xBACCD00A,
21514 + MAGIC2 = 0xCA110000,
21515 + XOPEN = 5,
21516 + XWRITE = 4,
21517 +};
21518 +
21519 +static noinline long simnow(long cmd, long a, long b, long c)
21520 +{
21521 + long ret;
21522 + asm volatile("cpuid" :
21523 + "=a" (ret) :
21524 + "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
21525 + return ret;
21526 +}
21527 +
21528 +void __init simnow_init(char *str)
21529 +{
21530 + char *fn = "klog";
21531 + if (*str == '=')
21532 + fn = ++str;
21533 + /* error ignored */
21534 + simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
21535 +}
21536 +
21537 +static void simnow_write(struct console *con, const char *s, unsigned n)
21538 +{
21539 + simnow(XWRITE, simnow_fd, (unsigned long)s, n);
21540 +}
21541 +
21542 +static struct console simnow_console = {
21543 + .name = "simnow",
21544 + .write = simnow_write,
21545 + .flags = CON_PRINTBUFFER,
21546 + .index = -1,
21547 +};
21548 +
21549 +/* Direct interface for emergencies */
21550 +struct console *early_console = &early_vga_console;
21551 +static int early_console_initialized = 0;
21552 +
21553 +void early_printk(const char *fmt, ...)
21554 +{
21555 + char buf[512];
21556 + int n;
21557 + va_list ap;
21558 +
21559 + va_start(ap,fmt);
21560 + n = vscnprintf(buf,512,fmt,ap);
21561 + early_console->write(early_console,buf,n);
21562 + va_end(ap);
21563 +}
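The function above follows a simple pattern: format into a fixed 512-byte buffer, then hand the bytes to whichever console is currently selected. A stand-alone sketch of the same shape (the mini_console type and names are invented for this example; they are not kernel APIs):

#include <stdarg.h>
#include <stdio.h>

/* Invented miniature console abstraction mirroring the shape of the code above. */
struct mini_console {
	const char *name;
	void (*write)(struct mini_console *con, const char *buf, unsigned n);
};

static void stdout_write(struct mini_console *con, const char *buf, unsigned n)
{
	fwrite(buf, 1, n, stdout);
}

static struct mini_console demo_console = { "demo", stdout_write };
static struct mini_console *cur_console = &demo_console;

static void mini_early_printk(const char *fmt, ...)
{
	char buf[512];
	va_list ap;
	int n;

	va_start(ap, fmt);
	n = vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);
	if (n < 0)
		return;
	if (n > (int)sizeof(buf) - 1)
		n = sizeof(buf) - 1;	/* user-space vsnprintf reports the untruncated length */
	cur_console->write(cur_console, buf, n);
}

int main(void)
{
	mini_early_printk("hello from the %s console\n", cur_console->name);
	return 0;
}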
21564 +
21565 +static int __initdata keep_early;
21566 +
21567 +int __init setup_early_printk(char *opt)
21568 +{
21569 + char *space;
21570 + char buf[256];
21571 +
21572 + if (early_console_initialized)
21573 + return 1;
21574 +
21575 + strlcpy(buf,opt,sizeof(buf));
21576 + space = strchr(buf, ' ');
21577 + if (space)
21578 + *space = 0;
21579 +
21580 + if (strstr(buf,"keep"))
21581 + keep_early = 1;
21582 +
21583 + if (!strncmp(buf, "serial", 6)) {
21584 + early_serial_init(buf + 6);
21585 + early_console = &early_serial_console;
21586 + } else if (!strncmp(buf, "ttyS", 4)) {
21587 + early_serial_init(buf);
21588 + early_console = &early_serial_console;
21589 + } else if (!strncmp(buf, "vga", 3)
21590 +#ifndef CONFIG_XEN
21591 + && SCREEN_INFO.orig_video_isVGA == 1) {
21592 + max_xpos = SCREEN_INFO.orig_video_cols;
21593 + max_ypos = SCREEN_INFO.orig_video_lines;
21594 + current_ypos = SCREEN_INFO.orig_y;
21595 +#else
21596 + || !strncmp(buf, "xen", 3)) {
21597 +#endif
21598 + early_console = &early_vga_console;
21599 + } else if (!strncmp(buf, "simnow", 6)) {
21600 + simnow_init(buf + 6);
21601 + early_console = &simnow_console;
21602 + keep_early = 1;
21603 + }
21604 + early_console_initialized = 1;
21605 + register_console(early_console);
21606 + return 0;
21607 +}
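The dispatcher above selects a console purely by option prefix ("serial", "ttyS", "vga", "xen" in the CONFIG_XEN build, "simnow"), with "keep" recognized anywhere in the option string. A rough user-space sketch of that strncmp-based dispatch, for illustration only (the real semantics, e.g. the SCREEN_INFO check for "vga", live in the code above):

#include <stdio.h>
#include <string.h>

static const char *classify_earlyprintk(const char *opt)
{
	if (!strncmp(opt, "serial", 6) || !strncmp(opt, "ttyS", 4))
		return "serial";
	if (!strncmp(opt, "vga", 3) || !strncmp(opt, "xen", 3))
		return "vga/xen";
	if (!strncmp(opt, "simnow", 6))
		return "simnow";
	return "unknown";
}

int main(void)
{
	const char *samples[] = { "serial,ttyS0,9600,keep", "ttyS1,115200",
				  "vga", "xen", "simnow=klog" };

	for (int i = 0; i < 5; i++)
		printf("earlyprintk=%-22s -> %-7s keep=%d\n", samples[i],
		       classify_earlyprintk(samples[i]),
		       strstr(samples[i], "keep") != NULL);
	return 0;
}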
21608 +
21609 +void __init disable_early_printk(void)
21610 +{
21611 + if (!early_console_initialized || !early_console)
21612 + return;
21613 + if (!keep_early) {
21614 + printk("disabling early console\n");
21615 + unregister_console(early_console);
21616 + early_console_initialized = 0;
21617 + } else {
21618 + printk("keeping early console\n");
21619 + }
21620 +}
21621 +
21622 +__setup("earlyprintk=", setup_early_printk);
21623 Index: head-2008-11-25/arch/x86/kernel/entry_64-xen.S
21624 ===================================================================
21625 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
21626 +++ head-2008-11-25/arch/x86/kernel/entry_64-xen.S 2008-10-29 09:55:56.000000000 +0100
21627 @@ -0,0 +1,1322 @@
21628 +/*
21629 + * linux/arch/x86_64/entry.S
21630 + *
21631 + * Copyright (C) 1991, 1992 Linus Torvalds
21632 + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
21633 + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
21634 + *
21635 + * $Id$
21636 + *
21637 + * Jun Nakajima <jun.nakajima@intel.com>
21638 + * Asit Mallick <asit.k.mallick@intel.com>
21639 + * Modified for Xen
21640 + */
21641 +
21642 +/*
21643 + * entry.S contains the system-call and fault low-level handling routines.
21644 + *
21645 + * NOTE: This code handles signal-recognition, which happens every time
21646 + * after an interrupt and after each system call.
21647 + *
21648 + * Normal syscalls and interrupts don't save a full stack frame; this is
21649 + * only done for syscall tracing, signals or fork/exec et al.
21650 + *
21651 + * A note on terminology:
21652 + * - top of stack: Architecture defined interrupt frame from SS to RIP
21653 + * at the top of the kernel process stack.
21654 + * - partial stack frame: partially saved registers up to R11.
21655 + * - full stack frame: like the partial stack frame, but with all registers saved.
21656 + *
21657 + * TODO:
21658 + * - schedule it carefully for the final hardware.
21659 + */
21660 +
21661 +#define ASSEMBLY 1
21662 +#include <linux/linkage.h>
21663 +#include <asm/segment.h>
21664 +#include <asm/smp.h>
21665 +#include <asm/cache.h>
21666 +#include <asm/errno.h>
21667 +#include <asm/dwarf2.h>
21668 +#include <asm/calling.h>
21669 +#include <asm/asm-offsets.h>
21670 +#include <asm/msr.h>
21671 +#include <asm/unistd.h>
21672 +#include <asm/thread_info.h>
21673 +#include <asm/hw_irq.h>
21674 +#include <asm/page.h>
21675 +#include <asm/irqflags.h>
21676 +#include <asm/errno.h>
21677 +#include <xen/interface/arch-x86_64.h>
21678 +#include <xen/interface/features.h>
21679 +
21680 +#include "xen_entry.S"
21681 +
21682 + .code64
21683 +
21684 +#ifndef CONFIG_PREEMPT
21685 +#define retint_kernel retint_restore_args
21686 +#endif
21687 +
21688 +
21689 +.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
21690 +#ifdef CONFIG_TRACE_IRQFLAGS
21691 + bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
21692 + jnc 1f
21693 + TRACE_IRQS_ON
21694 +1:
21695 +#endif
21696 +.endm
21697 +
21698 +NMI_MASK = 0x80000000
21699 +
21700 +/*
21701 + * C code is not supposed to know about undefined top of stack. Every time
21702 + * a C function with a pt_regs argument is called from the SYSCALL based
21703 + * fast path FIXUP_TOP_OF_STACK is needed.
21704 + * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
21705 + * manipulation.
21706 + */
21707 +
21708 + /* %rsp:at FRAMEEND */
21709 + .macro FIXUP_TOP_OF_STACK tmp
21710 + movq $__USER_CS,CS(%rsp)
21711 + movq $-1,RCX(%rsp)
21712 + .endm
21713 +
21714 + .macro RESTORE_TOP_OF_STACK tmp,offset=0
21715 + .endm
21716 +
21717 + .macro FAKE_STACK_FRAME child_rip
21718 + /* push in order ss, rsp, eflags, cs, rip */
21719 + xorl %eax, %eax
21720 + pushq %rax /* ss */
21721 + CFI_ADJUST_CFA_OFFSET 8
21722 + /*CFI_REL_OFFSET ss,0*/
21723 + pushq %rax /* rsp */
21724 + CFI_ADJUST_CFA_OFFSET 8
21725 + CFI_REL_OFFSET rsp,0
21726 + pushq $(1<<9) /* eflags - interrupts on */
21727 + CFI_ADJUST_CFA_OFFSET 8
21728 + /*CFI_REL_OFFSET rflags,0*/
21729 + pushq $__KERNEL_CS /* cs */
21730 + CFI_ADJUST_CFA_OFFSET 8
21731 + /*CFI_REL_OFFSET cs,0*/
21732 + pushq \child_rip /* rip */
21733 + CFI_ADJUST_CFA_OFFSET 8
21734 + CFI_REL_OFFSET rip,0
21735 + pushq %rax /* orig rax */
21736 + CFI_ADJUST_CFA_OFFSET 8
21737 + .endm
21738 +
21739 + .macro UNFAKE_STACK_FRAME
21740 + addq $8*6, %rsp
21741 + CFI_ADJUST_CFA_OFFSET -(6*8)
21742 + .endm
21743 +
21744 + .macro CFI_DEFAULT_STACK start=1,adj=0
21745 + .if \start
21746 + CFI_STARTPROC simple
21747 + CFI_DEF_CFA rsp,SS+8 - \adj*ARGOFFSET
21748 + .else
21749 + CFI_DEF_CFA_OFFSET SS+8 - \adj*ARGOFFSET
21750 + .endif
21751 + .if \adj == 0
21752 + CFI_REL_OFFSET r15,R15
21753 + CFI_REL_OFFSET r14,R14
21754 + CFI_REL_OFFSET r13,R13
21755 + CFI_REL_OFFSET r12,R12
21756 + CFI_REL_OFFSET rbp,RBP
21757 + CFI_REL_OFFSET rbx,RBX
21758 + .endif
21759 + CFI_REL_OFFSET r11,R11 - \adj*ARGOFFSET
21760 + CFI_REL_OFFSET r10,R10 - \adj*ARGOFFSET
21761 + CFI_REL_OFFSET r9,R9 - \adj*ARGOFFSET
21762 + CFI_REL_OFFSET r8,R8 - \adj*ARGOFFSET
21763 + CFI_REL_OFFSET rax,RAX - \adj*ARGOFFSET
21764 + CFI_REL_OFFSET rcx,RCX - \adj*ARGOFFSET
21765 + CFI_REL_OFFSET rdx,RDX - \adj*ARGOFFSET
21766 + CFI_REL_OFFSET rsi,RSI - \adj*ARGOFFSET
21767 + CFI_REL_OFFSET rdi,RDI - \adj*ARGOFFSET
21768 + CFI_REL_OFFSET rip,RIP - \adj*ARGOFFSET
21769 + /*CFI_REL_OFFSET cs,CS - \adj*ARGOFFSET*/
21770 + /*CFI_REL_OFFSET rflags,EFLAGS - \adj*ARGOFFSET*/
21771 + CFI_REL_OFFSET rsp,RSP - \adj*ARGOFFSET
21772 + /*CFI_REL_OFFSET ss,SS - \adj*ARGOFFSET*/
21773 + .endm
21774 +
21775 + /*
21776 + * Must be consistent with the definition in arch-x86/xen-x86_64.h:
21777 + * struct iret_context {
21778 + * u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
21779 + * };
21780 + * with rax, r11, and rcx being taken care of in the hypercall stub.
21781 + */
21782 + .macro HYPERVISOR_IRET flag
21783 + testb $3,1*8(%rsp)
21784 + jnz 2f
21785 + testl $NMI_MASK,2*8(%rsp)
21786 + jnz 2f
21787 +
21788 + cmpb $0,(xen_features+XENFEAT_supervisor_mode_kernel)(%rip)
21789 + jne 1f
21790 +
21791 + /* Direct iret to kernel space. Correct CS and SS. */
21792 + orl $3,1*8(%rsp)
21793 + orl $3,4*8(%rsp)
21794 +1: iretq
21795 +
21796 +2: /* Slow iret via hypervisor. */
21797 + andl $~NMI_MASK, 2*8(%rsp)
21798 + pushq $\flag
21799 + jmp hypercall_page + (__HYPERVISOR_iret * 32)
21800 + .endm
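Restating the layout quoted in the comment as a C struct makes the indexing above easier to follow: at the testb/orl instructions %rsp points at rip (so cs sits at 1*8, rflags at 2*8 and ss at 4*8), the flag value pushed before the hypercall supplies the 'flags' slot just below rip, and rax/r11/rcx are handled in the hypercall stub, as the comment says. A small sketch (the struct is copied from the comment; the offsetof arithmetic is only for illustration):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Quoted from the comment above (arch-x86/xen-x86_64.h). */
struct iret_context {
	uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
};

int main(void)
{
	size_t rip = offsetof(struct iret_context, rip);

	/* Matches the 1*8, 2*8 and 4*8 displacements used by HYPERVISOR_IRET. */
	printf("cs     at rip+%zu\n", offsetof(struct iret_context, cs) - rip);
	printf("rflags at rip+%zu\n", offsetof(struct iret_context, rflags) - rip);
	printf("ss     at rip+%zu\n", offsetof(struct iret_context, ss) - rip);
	return 0;
}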
21801 +
21802 +/*
21803 + * A newly forked process directly context switches into this.
21804 + */
21805 +/* rdi: prev */
21806 +ENTRY(ret_from_fork)
21807 + CFI_DEFAULT_STACK
21808 + push kernel_eflags(%rip)
21809 + CFI_ADJUST_CFA_OFFSET 4
21810 + popf # reset kernel eflags
21811 + CFI_ADJUST_CFA_OFFSET -4
21812 + call schedule_tail
21813 + GET_THREAD_INFO(%rcx)
21814 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
21815 + jnz rff_trace
21816 +rff_action:
21817 + RESTORE_REST
21818 + testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
21819 + je int_ret_from_sys_call
21820 + testl $_TIF_IA32,threadinfo_flags(%rcx)
21821 + jnz int_ret_from_sys_call
21822 + RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
21823 + jmp ret_from_sys_call
21824 +rff_trace:
21825 + movq %rsp,%rdi
21826 + call syscall_trace_leave
21827 + GET_THREAD_INFO(%rcx)
21828 + jmp rff_action
21829 + CFI_ENDPROC
21830 +END(ret_from_fork)
21831 +
21832 +/*
21833 + * initial frame state for interrupts and exceptions
21834 + */
21835 + .macro _frame ref
21836 + CFI_STARTPROC simple
21837 + CFI_DEF_CFA rsp,SS+8-\ref
21838 + /*CFI_REL_OFFSET ss,SS-\ref*/
21839 + CFI_REL_OFFSET rsp,RSP-\ref
21840 + /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
21841 + /*CFI_REL_OFFSET cs,CS-\ref*/
21842 + CFI_REL_OFFSET rip,RIP-\ref
21843 + .endm
21844 +
21845 +/*
21846 + * System call entry. Up to 6 arguments in registers are supported.
21847 + *
21848 + * SYSCALL does not save anything on the stack and does not change the
21849 + * stack pointer.
21850 + */
21851 +
21852 +/*
21853 + * Register setup:
21854 + * rax system call number
21855 + * rdi arg0
21856 + * rcx return address for syscall/sysret, C arg3
21857 + * rsi arg1
21858 + * rdx arg2
21859 + * r10 arg3 (--> moved to rcx for C)
21860 + * r8 arg4
21861 + * r9 arg5
21862 + * r11 eflags for syscall/sysret, temporary for C
21863 + * r12-r15,rbp,rbx saved by C code, not touched.
21864 + *
21865 + * Interrupts are enabled on entry.
21866 + * Only called from user space.
21867 + *
21868 + * XXX if we had a free scratch register we could save the RSP into the stack frame
21869 + * and report it properly in ps. Unfortunately we don't have one.
21870 + *
21871 + * When the user can change the frame, always force IRET. That is because
21872 + * IRET deals with non-canonical addresses better. SYSRET has trouble
21873 + * with them due to bugs in both AMD and Intel CPUs.
21874 + */
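As a user-space illustration of the register convention tabulated above: this is the generic x86-64 SYSCALL ABI rather than anything Xen-specific, write(2) is used only as an example, and a fourth argument would go in %r10, which is why system_call below moves %r10 into %rcx before calling the C handler.

#include <unistd.h>

/* Raw 3-argument syscall: rax = number, rdi/rsi/rdx = args; the SYSCALL
 * instruction clobbers rcx (return address) and r11 (saved rflags). */
static long raw_syscall3(long nr, long a0, long a1, long a2)
{
	long ret;

	__asm__ volatile ("syscall"
			  : "=a" (ret)
			  : "0" (nr), "D" (a0), "S" (a1), "d" (a2)
			  : "rcx", "r11", "memory");
	return ret;
}

int main(void)
{
	static const char msg[] = "hello via raw syscall\n";

	/* 1 is __NR_write on x86-64. */
	return raw_syscall3(1, STDOUT_FILENO, (long)msg, sizeof(msg) - 1) < 0;
}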
21875 +
21876 +ENTRY(system_call)
21877 + _frame (RIP-0x10)
21878 + SAVE_ARGS -8,0
21879 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
21880 + GET_THREAD_INFO(%rcx)
21881 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
21882 + CFI_REMEMBER_STATE
21883 + jnz tracesys
21884 + cmpq $__NR_syscall_max,%rax
21885 + ja badsys
21886 + movq %r10,%rcx
21887 + call *sys_call_table(,%rax,8) # XXX: rip relative
21888 + movq %rax,RAX-ARGOFFSET(%rsp)
21889 +/*
21890 + * Syscall return path ending with SYSRET (fast path)
21891 + * Has incomplete stack frame and undefined top of stack.
21892 + */
21893 + .globl ret_from_sys_call
21894 +ret_from_sys_call:
21895 + movl $_TIF_ALLWORK_MASK,%edi
21896 + /* edi: flagmask */
21897 +sysret_check:
21898 + GET_THREAD_INFO(%rcx)
21899 + XEN_BLOCK_EVENTS(%rsi)
21900 + TRACE_IRQS_OFF
21901 + movl threadinfo_flags(%rcx),%edx
21902 + andl %edi,%edx
21903 + CFI_REMEMBER_STATE
21904 + jnz sysret_careful
21905 + /*
21906 + * sysretq will re-enable interrupts:
21907 + */
21908 + TRACE_IRQS_ON
21909 + XEN_UNBLOCK_EVENTS(%rsi)
21910 + RESTORE_ARGS 0,8,0
21911 + HYPERVISOR_IRET VGCF_IN_SYSCALL
21912 +
21913 + /* Handle reschedules */
21914 + /* edx: work, edi: workmask */
21915 +sysret_careful:
21916 + CFI_RESTORE_STATE
21917 + bt $TIF_NEED_RESCHED,%edx
21918 + jnc sysret_signal
21919 + TRACE_IRQS_ON
21920 + XEN_UNBLOCK_EVENTS(%rsi)
21921 + pushq %rdi
21922 + CFI_ADJUST_CFA_OFFSET 8
21923 + call schedule
21924 + popq %rdi
21925 + CFI_ADJUST_CFA_OFFSET -8
21926 + jmp sysret_check
21927 +
21928 + /* Handle a signal */
21929 +sysret_signal:
21930 + TRACE_IRQS_ON
21931 +/* sti */
21932 + XEN_UNBLOCK_EVENTS(%rsi)
21933 + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
21934 + jz 1f
21935 +
21936 + /* Really a signal */
21937 + /* edx: work flags (arg3) */
21938 + leaq do_notify_resume(%rip),%rax
21939 + leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
21940 + xorl %esi,%esi # oldset -> arg2
21941 + call ptregscall_common
21942 +1: movl $_TIF_NEED_RESCHED,%edi
21943 + /* Use IRET because user could have changed frame. This
21944 + works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
21945 + XEN_BLOCK_EVENTS(%rsi)
21946 + TRACE_IRQS_OFF
21947 + jmp int_with_check
21948 +
21949 +badsys:
21950 + movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
21951 + jmp ret_from_sys_call
21952 +
21953 + /* Do syscall tracing */
21954 +tracesys:
21955 + CFI_RESTORE_STATE
21956 + SAVE_REST
21957 + movq $-ENOSYS,RAX(%rsp)
21958 + FIXUP_TOP_OF_STACK %rdi
21959 + movq %rsp,%rdi
21960 + call syscall_trace_enter
21961 + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
21962 + RESTORE_REST
21963 + cmpq $__NR_syscall_max,%rax
21964 + ja 1f
21965 + movq %r10,%rcx /* fixup for C */
21966 + call *sys_call_table(,%rax,8)
21967 +1: movq %rax,RAX-ARGOFFSET(%rsp)
21968 + /* Use IRET because user could have changed frame */
21969 + jmp int_ret_from_sys_call
21970 + CFI_ENDPROC
21971 +END(system_call)
21972 +
21973 +/*
21974 + * Syscall return path ending with IRET.
21975 + * Has correct top of stack, but partial stack frame.
21976 + */
21977 +ENTRY(int_ret_from_sys_call)
21978 + CFI_STARTPROC simple
21979 + CFI_DEF_CFA rsp,SS+8-ARGOFFSET
21980 + /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/
21981 + CFI_REL_OFFSET rsp,RSP-ARGOFFSET
21982 + /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
21983 + /*CFI_REL_OFFSET cs,CS-ARGOFFSET*/
21984 + CFI_REL_OFFSET rip,RIP-ARGOFFSET
21985 + CFI_REL_OFFSET rdx,RDX-ARGOFFSET
21986 + CFI_REL_OFFSET rcx,RCX-ARGOFFSET
21987 + CFI_REL_OFFSET rax,RAX-ARGOFFSET
21988 + CFI_REL_OFFSET rdi,RDI-ARGOFFSET
21989 + CFI_REL_OFFSET rsi,RSI-ARGOFFSET
21990 + CFI_REL_OFFSET r8,R8-ARGOFFSET
21991 + CFI_REL_OFFSET r9,R9-ARGOFFSET
21992 + CFI_REL_OFFSET r10,R10-ARGOFFSET
21993 + CFI_REL_OFFSET r11,R11-ARGOFFSET
21994 + XEN_BLOCK_EVENTS(%rsi)
21995 + TRACE_IRQS_OFF
21996 + testb $3,CS-ARGOFFSET(%rsp)
21997 + jnz 1f
21998 + /* Need to set the proper %ss (not NULL) for ring 3 iretq */
21999 + movl $__KERNEL_DS,SS-ARGOFFSET(%rsp)
22000 +	jmp retint_restore_args		# return from ring-3 kernel
22001 +1:
22002 + movl $_TIF_ALLWORK_MASK,%edi
22003 + /* edi: mask to check */
22004 +int_with_check:
22005 + GET_THREAD_INFO(%rcx)
22006 + movl threadinfo_flags(%rcx),%edx
22007 + andl %edi,%edx
22008 + jnz int_careful
22009 + andl $~TS_COMPAT,threadinfo_status(%rcx)
22010 + jmp retint_restore_args
22011 +
22012 + /* Either reschedule or signal or syscall exit tracking needed. */
22013 + /* First do a reschedule test. */
22014 + /* edx: work, edi: workmask */
22015 +int_careful:
22016 + bt $TIF_NEED_RESCHED,%edx
22017 + jnc int_very_careful
22018 + TRACE_IRQS_ON
22019 +/* sti */
22020 + XEN_UNBLOCK_EVENTS(%rsi)
22021 + pushq %rdi
22022 + CFI_ADJUST_CFA_OFFSET 8
22023 + call schedule
22024 + popq %rdi
22025 + CFI_ADJUST_CFA_OFFSET -8
22026 + XEN_BLOCK_EVENTS(%rsi)
22027 + TRACE_IRQS_OFF
22028 + jmp int_with_check
22029 +
22030 + /* handle signals and tracing -- both require a full stack frame */
22031 +int_very_careful:
22032 + TRACE_IRQS_ON
22033 +/* sti */
22034 + XEN_UNBLOCK_EVENTS(%rsi)
22035 + SAVE_REST
22036 + /* Check for syscall exit trace */
22037 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
22038 + jz int_signal
22039 + pushq %rdi
22040 + CFI_ADJUST_CFA_OFFSET 8
22041 + leaq 8(%rsp),%rdi # &ptregs -> arg1
22042 + call syscall_trace_leave
22043 + popq %rdi
22044 + CFI_ADJUST_CFA_OFFSET -8
22045 + andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
22046 + XEN_BLOCK_EVENTS(%rsi)
22047 + TRACE_IRQS_OFF
22048 + jmp int_restore_rest
22049 +
22050 +int_signal:
22051 + testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
22052 + jz 1f
22053 + movq %rsp,%rdi # &ptregs -> arg1
22054 + xorl %esi,%esi # oldset -> arg2
22055 + call do_notify_resume
22056 +1: movl $_TIF_NEED_RESCHED,%edi
22057 +int_restore_rest:
22058 + RESTORE_REST
22059 + XEN_BLOCK_EVENTS(%rsi)
22060 + TRACE_IRQS_OFF
22061 + jmp int_with_check
22062 + CFI_ENDPROC
22063 +END(int_ret_from_sys_call)
22064 +
22065 +/*
22066 + * Certain special system calls need to save a complete stack frame.
22067 + */
22068 +
22069 + .macro PTREGSCALL label,func,arg
22070 + .globl \label
22071 +\label:
22072 + leaq \func(%rip),%rax
22073 + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
22074 + jmp ptregscall_common
22075 +END(\label)
22076 + .endm
22077 +
22078 + CFI_STARTPROC
22079 +
22080 + PTREGSCALL stub_clone, sys_clone, %r8
22081 + PTREGSCALL stub_fork, sys_fork, %rdi
22082 + PTREGSCALL stub_vfork, sys_vfork, %rdi
22083 + PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
22084 + PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
22085 + PTREGSCALL stub_iopl, sys_iopl, %rsi
22086 +
22087 +ENTRY(ptregscall_common)
22088 + popq %r11
22089 + CFI_ADJUST_CFA_OFFSET -8
22090 + CFI_REGISTER rip, r11
22091 + SAVE_REST
22092 + movq %r11, %r15
22093 + CFI_REGISTER rip, r15
22094 + FIXUP_TOP_OF_STACK %r11
22095 + call *%rax
22096 + RESTORE_TOP_OF_STACK %r11
22097 + movq %r15, %r11
22098 + CFI_REGISTER rip, r11
22099 + RESTORE_REST
22100 + pushq %r11
22101 + CFI_ADJUST_CFA_OFFSET 8
22102 + CFI_REL_OFFSET rip, 0
22103 + ret
22104 + CFI_ENDPROC
22105 +END(ptregscall_common)
22106 +
22107 +ENTRY(stub_execve)
22108 + CFI_STARTPROC
22109 + popq %r11
22110 + CFI_ADJUST_CFA_OFFSET -8
22111 + CFI_REGISTER rip, r11
22112 + SAVE_REST
22113 + FIXUP_TOP_OF_STACK %r11
22114 + call sys_execve
22115 + RESTORE_TOP_OF_STACK %r11
22116 + movq %rax,RAX(%rsp)
22117 + RESTORE_REST
22118 + jmp int_ret_from_sys_call
22119 + CFI_ENDPROC
22120 +END(stub_execve)
22121 +
22122 +/*
22123 + * sigreturn is special because it needs to restore all registers on return.
22124 + * This cannot be done with SYSRET, so use the IRET return path instead.
22125 + */
22126 +ENTRY(stub_rt_sigreturn)
22127 + CFI_STARTPROC
22128 + addq $8, %rsp
22129 + CFI_ADJUST_CFA_OFFSET -8
22130 + SAVE_REST
22131 + movq %rsp,%rdi
22132 + FIXUP_TOP_OF_STACK %r11
22133 + call sys_rt_sigreturn
22134 + movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
22135 + RESTORE_REST
22136 + jmp int_ret_from_sys_call
22137 + CFI_ENDPROC
22138 +END(stub_rt_sigreturn)
22139 +
22140 +/* initial frame state for interrupts (and exceptions without error code) */
22141 +#define INTR_FRAME _frame (RIP-0x10); \
22142 + CFI_REL_OFFSET rcx,0; \
22143 + CFI_REL_OFFSET r11,8
22144 +
22145 +/* initial frame state for exceptions with error code (and interrupts with
22146 + vector already pushed) */
22147 +#define XCPT_FRAME _frame (RIP-0x18); \
22148 + CFI_REL_OFFSET rcx,0; \
22149 + CFI_REL_OFFSET r11,8
22150 +
22151 +/*
22152 + * Interrupt exit.
22153 + *
22154 + */
22155 +
22156 +retint_check:
22157 + CFI_DEFAULT_STACK adj=1
22158 + movl threadinfo_flags(%rcx),%edx
22159 + andl %edi,%edx
22160 + CFI_REMEMBER_STATE
22161 + jnz retint_careful
22162 +retint_restore_args:
22163 + movl EFLAGS-REST_SKIP(%rsp), %eax
22164 + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
22165 + XEN_GET_VCPU_INFO(%rsi)
22166 + andb evtchn_upcall_mask(%rsi),%al
22167 + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
22168 + jnz restore_all_enable_events # != 0 => enable event delivery
22169 + XEN_PUT_VCPU_INFO(%rsi)
22170 +
22171 + RESTORE_ARGS 0,8,0
22172 + HYPERVISOR_IRET 0
22173 +
22174 + /* edi: workmask, edx: work */
22175 +retint_careful:
22176 + CFI_RESTORE_STATE
22177 + bt $TIF_NEED_RESCHED,%edx
22178 + jnc retint_signal
22179 + TRACE_IRQS_ON
22180 + XEN_UNBLOCK_EVENTS(%rsi)
22181 +/* sti */
22182 + pushq %rdi
22183 + CFI_ADJUST_CFA_OFFSET 8
22184 + call schedule
22185 + popq %rdi
22186 + CFI_ADJUST_CFA_OFFSET -8
22187 + GET_THREAD_INFO(%rcx)
22188 + XEN_BLOCK_EVENTS(%rsi)
22189 +/* cli */
22190 + TRACE_IRQS_OFF
22191 + jmp retint_check
22192 +
22193 +retint_signal:
22194 + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
22195 + jz retint_restore_args
22196 + TRACE_IRQS_ON
22197 + XEN_UNBLOCK_EVENTS(%rsi)
22198 + SAVE_REST
22199 + movq $-1,ORIG_RAX(%rsp)
22200 + xorl %esi,%esi # oldset
22201 + movq %rsp,%rdi # &pt_regs
22202 + call do_notify_resume
22203 + RESTORE_REST
22204 + XEN_BLOCK_EVENTS(%rsi)
22205 + TRACE_IRQS_OFF
22206 + movl $_TIF_NEED_RESCHED,%edi
22207 + GET_THREAD_INFO(%rcx)
22208 + jmp retint_check
22209 +
22210 +#ifdef CONFIG_PREEMPT
22211 + /* Returning to kernel space. Check if we need preemption */
22212 + /* rcx: threadinfo. interrupts off. */
22213 + .p2align
22214 +retint_kernel:
22215 + cmpl $0,threadinfo_preempt_count(%rcx)
22216 + jnz retint_restore_args
22217 + bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
22218 + jnc retint_restore_args
22219 + bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
22220 + jnc retint_restore_args
22221 + call preempt_schedule_irq
22222 + jmp retint_kernel /* check again */
22223 +#endif
22224 +
22225 + CFI_ENDPROC
22226 +END(retint_check)
22227 +
22228 +#ifndef CONFIG_XEN
22229 +/*
22230 + * APIC interrupts.
22231 + */
22232 + .macro apicinterrupt num,func
22233 + INTR_FRAME
22234 + pushq $~(\num)
22235 + CFI_ADJUST_CFA_OFFSET 8
22236 + interrupt \func
22237 + jmp error_entry
22238 + CFI_ENDPROC
22239 + .endm
22240 +
22241 +ENTRY(thermal_interrupt)
22242 + apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
22243 +END(thermal_interrupt)
22244 +
22245 +ENTRY(threshold_interrupt)
22246 + apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
22247 +END(threshold_interrupt)
22248 +
22249 +#ifdef CONFIG_SMP
22250 +ENTRY(reschedule_interrupt)
22251 + apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
22252 +END(reschedule_interrupt)
22253 +
22254 + .macro INVALIDATE_ENTRY num
22255 +ENTRY(invalidate_interrupt\num)
22256 + apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
22257 +END(invalidate_interrupt\num)
22258 + .endm
22259 +
22260 + INVALIDATE_ENTRY 0
22261 + INVALIDATE_ENTRY 1
22262 + INVALIDATE_ENTRY 2
22263 + INVALIDATE_ENTRY 3
22264 + INVALIDATE_ENTRY 4
22265 + INVALIDATE_ENTRY 5
22266 + INVALIDATE_ENTRY 6
22267 + INVALIDATE_ENTRY 7
22268 +
22269 +ENTRY(call_function_interrupt)
22270 + apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
22271 +END(call_function_interrupt)
22272 +#endif
22273 +
22274 +#ifdef CONFIG_X86_LOCAL_APIC
22275 +ENTRY(apic_timer_interrupt)
22276 + apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
22277 +END(apic_timer_interrupt)
22278 +
22279 +ENTRY(error_interrupt)
22280 + apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
22281 +END(error_interrupt)
22282 +
22283 +ENTRY(spurious_interrupt)
22284 + apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
22285 +END(spurious_interrupt)
22286 +#endif
22287 +#endif /* !CONFIG_XEN */
22288 +
22289 +/*
22290 + * Exception entry points.
22291 + */
22292 + .macro zeroentry sym
22293 + INTR_FRAME
22294 + movq (%rsp),%rcx
22295 + CFI_RESTORE rcx
22296 + movq 8(%rsp),%r11
22297 + CFI_RESTORE r11
22298 + addq $0x10,%rsp /* skip rcx and r11 */
22299 + CFI_ADJUST_CFA_OFFSET -0x10
22300 + pushq $0 /* push error code/oldrax */
22301 + CFI_ADJUST_CFA_OFFSET 8
22302 + pushq %rax /* push real oldrax to the rdi slot */
22303 + CFI_ADJUST_CFA_OFFSET 8
22304 + CFI_REL_OFFSET rax,0
22305 + leaq \sym(%rip),%rax
22306 + jmp error_entry
22307 + CFI_ENDPROC
22308 + .endm
22309 +
22310 + .macro errorentry sym
22311 + XCPT_FRAME
22312 + movq (%rsp),%rcx
22313 + CFI_RESTORE rcx
22314 + movq 8(%rsp),%r11
22315 + CFI_RESTORE r11
22316 + addq $0x10,%rsp /* rsp points to the error code */
22317 + CFI_ADJUST_CFA_OFFSET -0x10
22318 + pushq %rax
22319 + CFI_ADJUST_CFA_OFFSET 8
22320 + CFI_REL_OFFSET rax,0
22321 + leaq \sym(%rip),%rax
22322 + jmp error_entry
22323 + CFI_ENDPROC
22324 + .endm
22325 +
22326 +#if 0 /* not XEN */
22327 + /* error code is on the stack already */
22328 + /* handle NMI like exceptions that can happen everywhere */
22329 + .macro paranoidentry sym, ist=0, irqtrace=1
22330 + movq (%rsp),%rcx
22331 + movq 8(%rsp),%r11
22332 + addq $0x10,%rsp /* skip rcx and r11 */
22333 + SAVE_ALL
22334 + cld
22335 +#if 0 /* not XEN */
22336 + movl $1,%ebx
22337 + movl $MSR_GS_BASE,%ecx
22338 + rdmsr
22339 + testl %edx,%edx
22340 + js 1f
22341 + swapgs
22342 + xorl %ebx,%ebx
22343 +1:
22344 +#endif
22345 + .if \ist
22346 + movq %gs:pda_data_offset, %rbp
22347 + .endif
22348 + movq %rsp,%rdi
22349 + movq ORIG_RAX(%rsp),%rsi
22350 + movq $-1,ORIG_RAX(%rsp)
22351 + .if \ist
22352 + subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
22353 + .endif
22354 + call \sym
22355 + .if \ist
22356 + addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
22357 + .endif
22358 +/* cli */
22359 + XEN_BLOCK_EVENTS(%rsi)
22360 + .if \irqtrace
22361 + TRACE_IRQS_OFF
22362 + .endif
22363 + .endm
22364 +
22365 + /*
22366 + * "Paranoid" exit path from exception stack.
22367 + * Paranoid because this is used by NMIs and cannot take
22368 + * any kernel state for granted.
22369 + * We don't do kernel preemption checks here, because only
22370 + * NMI should be common and it does not enable IRQs and
22371 + * cannot get reschedule ticks.
22372 + *
22373 + * "trace" is 0 for the NMI handler only, because irq-tracing
22374 + * is fundamentally NMI-unsafe. (we cannot change the soft and
22375 + * hard flags at once, atomically)
22376 + */
22377 + .macro paranoidexit trace=1
22378 + /* ebx: no swapgs flag */
22379 +paranoid_exit\trace:
22380 + testl %ebx,%ebx /* swapgs needed? */
22381 + jnz paranoid_restore\trace
22382 + testl $3,CS(%rsp)
22383 + jnz paranoid_userspace\trace
22384 +paranoid_swapgs\trace:
22385 + TRACE_IRQS_IRETQ 0
22386 + swapgs
22387 +paranoid_restore\trace:
22388 + RESTORE_ALL 8
22389 + iretq
22390 +paranoid_userspace\trace:
22391 + GET_THREAD_INFO(%rcx)
22392 + movl threadinfo_flags(%rcx),%ebx
22393 + andl $_TIF_WORK_MASK,%ebx
22394 + jz paranoid_swapgs\trace
22395 + movq %rsp,%rdi /* &pt_regs */
22396 + call sync_regs
22397 + movq %rax,%rsp /* switch stack for scheduling */
22398 + testl $_TIF_NEED_RESCHED,%ebx
22399 + jnz paranoid_schedule\trace
22400 + movl %ebx,%edx /* arg3: thread flags */
22401 + .if \trace
22402 + TRACE_IRQS_ON
22403 + .endif
22404 + sti
22405 + xorl %esi,%esi /* arg2: oldset */
22406 + movq %rsp,%rdi /* arg1: &pt_regs */
22407 + call do_notify_resume
22408 + cli
22409 + .if \trace
22410 + TRACE_IRQS_OFF
22411 + .endif
22412 + jmp paranoid_userspace\trace
22413 +paranoid_schedule\trace:
22414 + .if \trace
22415 + TRACE_IRQS_ON
22416 + .endif
22417 + sti
22418 + call schedule
22419 + cli
22420 + .if \trace
22421 + TRACE_IRQS_OFF
22422 + .endif
22423 + jmp paranoid_userspace\trace
22424 + CFI_ENDPROC
22425 + .endm
22426 +#endif
22427 +
22428 +/*
22429 + * Exception entry point. This expects an error code/orig_rax on the stack
22430 + * and the exception handler in %rax.
22431 + */
22432 +ENTRY(error_entry)
22433 + _frame RDI
22434 + CFI_REL_OFFSET rax,0
22435 + /* rdi slot contains rax, oldrax contains error code */
22436 + cld
22437 + subq $14*8,%rsp
22438 + CFI_ADJUST_CFA_OFFSET (14*8)
22439 + movq %rsi,13*8(%rsp)
22440 + CFI_REL_OFFSET rsi,RSI
22441 + movq 14*8(%rsp),%rsi /* load rax from rdi slot */
22442 + CFI_REGISTER rax,rsi
22443 + movq %rdx,12*8(%rsp)
22444 + CFI_REL_OFFSET rdx,RDX
22445 + movq %rcx,11*8(%rsp)
22446 + CFI_REL_OFFSET rcx,RCX
22447 + movq %rsi,10*8(%rsp) /* store rax */
22448 + CFI_REL_OFFSET rax,RAX
22449 + movq %r8, 9*8(%rsp)
22450 + CFI_REL_OFFSET r8,R8
22451 + movq %r9, 8*8(%rsp)
22452 + CFI_REL_OFFSET r9,R9
22453 + movq %r10,7*8(%rsp)
22454 + CFI_REL_OFFSET r10,R10
22455 + movq %r11,6*8(%rsp)
22456 + CFI_REL_OFFSET r11,R11
22457 + movq %rbx,5*8(%rsp)
22458 + CFI_REL_OFFSET rbx,RBX
22459 + movq %rbp,4*8(%rsp)
22460 + CFI_REL_OFFSET rbp,RBP
22461 + movq %r12,3*8(%rsp)
22462 + CFI_REL_OFFSET r12,R12
22463 + movq %r13,2*8(%rsp)
22464 + CFI_REL_OFFSET r13,R13
22465 + movq %r14,1*8(%rsp)
22466 + CFI_REL_OFFSET r14,R14
22467 + movq %r15,(%rsp)
22468 + CFI_REL_OFFSET r15,R15
22469 +#if 0
22470 + cmpl $__KERNEL_CS,CS(%rsp)
22471 + CFI_REMEMBER_STATE
22472 + je error_kernelspace
22473 +#endif
22474 +error_call_handler:
22475 + movq %rdi, RDI(%rsp)
22476 + CFI_REL_OFFSET rdi,RDI
22477 + movq %rsp,%rdi
22478 + movq ORIG_RAX(%rsp),%rsi # get error code
22479 + movq $-1,ORIG_RAX(%rsp)
22480 + call *%rax
22481 +error_exit:
22482 + RESTORE_REST
22483 +/* cli */
22484 + XEN_BLOCK_EVENTS(%rsi)
22485 + TRACE_IRQS_OFF
22486 + GET_THREAD_INFO(%rcx)
22487 + testb $3,CS-ARGOFFSET(%rsp)
22488 + jz retint_kernel
22489 + movl threadinfo_flags(%rcx),%edx
22490 + movl $_TIF_WORK_MASK,%edi
22491 + andl %edi,%edx
22492 + jnz retint_careful
22493 + /*
22494 + * The iret might restore flags:
22495 + */
22496 + TRACE_IRQS_IRETQ
22497 + jmp retint_restore_args
22498 +
22499 +#if 0
22500 + /*
22501 + * We need to re-write the logic here because we don't do iretq
22502 + * to return to user mode. It's still possible that we get a trap/fault
22503 + * in the kernel (when accessing buffers pointed to by system calls,
22504 + * for example).
22505 + *
22506 + */
22507 + CFI_RESTORE_STATE
22508 +error_kernelspace:
22509 + incl %ebx
22510 + /* There are two places in the kernel that can potentially fault with
22511 + usergs. Handle them here. The exception handlers after
22512 + iret run with kernel gs again, so don't set the user space flag.
22513 +	   B-stepping K8s sometimes report a truncated RIP for IRET
22514 + exceptions returning to compat mode. Check for these here too. */
22515 + leaq iret_label(%rip),%rbp
22516 + cmpq %rbp,RIP(%rsp)
22517 + je error_swapgs
22518 + movl %ebp,%ebp /* zero extend */
22519 + cmpq %rbp,RIP(%rsp)
22520 + je error_swapgs
22521 + cmpq $gs_change,RIP(%rsp)
22522 + je error_swapgs
22523 + jmp error_sti
22524 +#endif
22525 + CFI_ENDPROC
22526 +END(error_entry)
22527 +
22528 +ENTRY(hypervisor_callback)
22529 + zeroentry do_hypervisor_callback
22530 +END(hypervisor_callback)
22531 +
22532 +/*
22533 + * Copied from arch/xen/i386/kernel/entry.S
22534 + */
22535 +# A note on the "critical region" in our callback handler.
22536 +# We want to avoid stacking callback handlers due to events occurring
22537 +# during handling of the last event. To do this, we keep events disabled
22538 +# until we've done all processing. HOWEVER, we must enable events before
22539 +# popping the stack frame (can't be done atomically) and so it would still
22540 +# be possible to get enough handler activations to overflow the stack.
22541 +# Although unlikely, bugs of that kind are hard to track down, so we'd
22542 +# like to avoid the possibility.
22543 +# So, on entry to the handler we detect whether we interrupted an
22544 +# existing activation in its critical region -- if so, we pop the current
22545 +# activation and restart the handler using the previous one.
22546 +ENTRY(do_hypervisor_callback)   # do_hypervisor_callback(struct pt_regs *)
22547 + CFI_STARTPROC
22548 +# Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
22549 +# see the correct pointer to the pt_regs.
22550 + movq %rdi, %rsp # we don't return, adjust the stack frame
22551 + CFI_ENDPROC
22552 + CFI_DEFAULT_STACK
22553 +11: incl %gs:pda_irqcount
22554 + movq %rsp,%rbp
22555 + CFI_DEF_CFA_REGISTER rbp
22556 + cmovzq %gs:pda_irqstackptr,%rsp
22557 + pushq %rbp # backlink for old unwinder
22558 + call evtchn_do_upcall
22559 + popq %rsp
22560 + CFI_DEF_CFA_REGISTER rsp
22561 + decl %gs:pda_irqcount
22562 + jmp error_exit
22563 + CFI_ENDPROC
22564 +END(do_hypervisor_callback)
22565 +
22566 +#ifdef CONFIG_X86_LOCAL_APIC
22567 +KPROBE_ENTRY(nmi)
22568 + zeroentry do_nmi_callback
22569 +ENTRY(do_nmi_callback)
22570 + CFI_STARTPROC
22571 + addq $8, %rsp
22572 + CFI_ENDPROC
22573 + CFI_DEFAULT_STACK
22574 + call do_nmi
22575 + orl $NMI_MASK,EFLAGS(%rsp)
22576 + RESTORE_REST
22577 + XEN_BLOCK_EVENTS(%rsi)
22578 + TRACE_IRQS_OFF
22579 + GET_THREAD_INFO(%rcx)
22580 + jmp retint_restore_args
22581 + CFI_ENDPROC
22582 + .previous .text
22583 +END(nmi)
22584 +#endif
22585 +
22586 + ALIGN
22587 +restore_all_enable_events:
22588 + CFI_DEFAULT_STACK adj=1
22589 + TRACE_IRQS_ON
22590 + XEN_UNBLOCK_EVENTS(%rsi) # %rsi is already set up...
22591 +
22592 +scrit: /**** START OF CRITICAL REGION ****/
22593 + XEN_TEST_PENDING(%rsi)
22594 + CFI_REMEMBER_STATE
22595 + jnz 14f # process more events if necessary...
22596 + XEN_PUT_VCPU_INFO(%rsi)
22597 + RESTORE_ARGS 0,8,0
22598 + HYPERVISOR_IRET 0
22599 +
22600 + CFI_RESTORE_STATE
22601 +14: XEN_LOCKED_BLOCK_EVENTS(%rsi)
22602 + XEN_PUT_VCPU_INFO(%rsi)
22603 + SAVE_REST
22604 + movq %rsp,%rdi # set the argument again
22605 + jmp 11b
22606 + CFI_ENDPROC
22607 +ecrit: /**** END OF CRITICAL REGION ****/
22608 +# At this point, unlike on x86-32, we don't do the fixup: it would only
22609 +# complicate the code, and the stack frame is more complex on x86-64.
22610 +# When the kernel is interrupted in the critical section, it simply
22611 +# does an IRET, and everything is restored at that point, i.e. execution
22612 +# resumes at the interrupted instruction with the same context.
22613 +
22614 +# Hypervisor uses this for application faults while it executes.
22615 +# We get here for two reasons:
22616 +# 1. Fault while reloading DS, ES, FS or GS
22617 +# 2. Fault while executing IRET
22618 +# Category 1 we do not need to fix up as Xen has already reloaded all segment
22619 +# registers that could be reloaded and zeroed the others.
22620 +# Category 2 we fix up by killing the current process. We cannot use the
22621 +# normal Linux return path in this case because if we use the IRET hypercall
22622 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
22623 +# We distinguish between categories by comparing each saved segment register
22624 +# with its current contents: any discrepancy means we are in category 1.
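The classification described above, restated as a C sketch. The frame layout is inferred from the comparisons in the code (0x10..0x28(%rsp) checked against %ds/%es/%fs/%gs), so the struct below is hypothetical and purely illustrative:

#include <stdint.h>

/* Hypothetical view of the failsafe frame: rcx, r11, then the saved segment
 * selectors, followed by the usual iret frame (rip, cs, rflags, rsp, ss). */
struct failsafe_frame {
	uint64_t rcx, r11;
	uint64_t ds, es, fs, gs;
};

static int failsafe_category(const struct failsafe_frame *f,
			     uint16_t cur_ds, uint16_t cur_es,
			     uint16_t cur_fs, uint16_t cur_gs)
{
	if (cur_ds != (uint16_t)f->ds || cur_es != (uint16_t)f->es ||
	    cur_fs != (uint16_t)f->fs || cur_gs != (uint16_t)f->gs)
		return 1;	/* a segment reload faulted; Xen zeroed it, retry the IRET */
	return 2;		/* everything matches: the IRET itself faulted, kill the task */
}

int main(void)
{
	struct failsafe_frame f = { 0, 0, 0x2b, 0x2b, 0, 0 };

	/* Example: the live %es no longer matches the saved one -> category 1. */
	return failsafe_category(&f, 0x2b, 0x00, 0, 0) != 1;
}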
22625 +ENTRY(failsafe_callback)
22626 + _frame (RIP-0x30)
22627 + CFI_REL_OFFSET rcx, 0
22628 + CFI_REL_OFFSET r11, 8
22629 + movw %ds,%cx
22630 + cmpw %cx,0x10(%rsp)
22631 + CFI_REMEMBER_STATE
22632 + jne 1f
22633 + movw %es,%cx
22634 + cmpw %cx,0x18(%rsp)
22635 + jne 1f
22636 + movw %fs,%cx
22637 + cmpw %cx,0x20(%rsp)
22638 + jne 1f
22639 + movw %gs,%cx
22640 + cmpw %cx,0x28(%rsp)
22641 + jne 1f
22642 + /* All segments match their saved values => Category 2 (Bad IRET). */
22643 + movq (%rsp),%rcx
22644 + CFI_RESTORE rcx
22645 + movq 8(%rsp),%r11
22646 + CFI_RESTORE r11
22647 + addq $0x30,%rsp
22648 + CFI_ADJUST_CFA_OFFSET -0x30
22649 + movq $11,%rdi /* SIGSEGV */
22650 + jmp do_exit
22651 + CFI_RESTORE_STATE
22652 +1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
22653 + movq (%rsp),%rcx
22654 + CFI_RESTORE rcx
22655 + movq 8(%rsp),%r11
22656 + CFI_RESTORE r11
22657 + addq $0x30,%rsp
22658 + CFI_ADJUST_CFA_OFFSET -0x30
22659 + pushq $0
22660 + CFI_ADJUST_CFA_OFFSET 8
22661 + SAVE_ALL
22662 + jmp error_exit
22663 + CFI_ENDPROC
22664 +#if 0
22665 + .section __ex_table,"a"
22666 + .align 8
22667 + .quad gs_change,bad_gs
22668 + .previous
22669 + .section .fixup,"ax"
22670 + /* running with kernelgs */
22671 +bad_gs:
22672 +/* swapgs */ /* switch back to user gs */
22673 + xorl %eax,%eax
22674 + movl %eax,%gs
22675 + jmp 2b
22676 + .previous
22677 +#endif
22678 +
22679 +/*
22680 + * Create a kernel thread.
22681 + *
22682 + * C extern interface:
22683 + * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
22684 + *
22685 + * asm input arguments:
22686 + * rdi: fn, rsi: arg, rdx: flags
22687 + */
22688 +ENTRY(kernel_thread)
22689 + CFI_STARTPROC
22690 + FAKE_STACK_FRAME $child_rip
22691 + SAVE_ALL
22692 +
22693 + # rdi: flags, rsi: usp, rdx: will be &pt_regs
22694 + movq %rdx,%rdi
22695 + orq kernel_thread_flags(%rip),%rdi
22696 + movq $-1, %rsi
22697 + movq %rsp, %rdx
22698 +
22699 + xorl %r8d,%r8d
22700 + xorl %r9d,%r9d
22701 +
22702 + # clone now
22703 + call do_fork
22704 + movq %rax,RAX(%rsp)
22705 + xorl %edi,%edi
22706 +
22707 + /*
22708 +	 * It isn't worth checking for a reschedule here,
22709 +	 * so internally to the x86_64 port you can rely on kernel_thread()
22710 +	 * not rescheduling the child before returning; this avoids the need
22711 +	 * for hacks, for example to fork off the per-CPU idle tasks.
22712 + * [Hopefully no generic code relies on the reschedule -AK]
22713 + */
22714 + RESTORE_ALL
22715 + UNFAKE_STACK_FRAME
22716 + ret
22717 + CFI_ENDPROC
22718 +ENDPROC(kernel_thread)
22719 +
22720 +child_rip:
22721 + pushq $0 # fake return address
22722 + CFI_STARTPROC
22723 + /*
22724 + * Here we are in the child and the registers are set as they were
22725 + * at kernel_thread() invocation in the parent.
22726 + */
22727 + movq %rdi, %rax
22728 + movq %rsi, %rdi
22729 + call *%rax
22730 + # exit
22731 + xorl %edi, %edi
22732 + call do_exit
22733 + CFI_ENDPROC
22734 +ENDPROC(child_rip)
22735 +
22736 +/*
22737 + * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
22738 + *
22739 + * C extern interface:
22740 + * extern long execve(char *name, char **argv, char **envp)
22741 + *
22742 + * asm input arguments:
22743 + * rdi: name, rsi: argv, rdx: envp
22744 + *
22745 + * We want to fall back into:
22746 + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
22747 + *
22748 + * do_sys_execve asm fallback arguments:
22749 + * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
22750 + */
22751 +ENTRY(execve)
22752 + CFI_STARTPROC
22753 + FAKE_STACK_FRAME $0
22754 + SAVE_ALL
22755 + call sys_execve
22756 + movq %rax, RAX(%rsp)
22757 + RESTORE_REST
22758 + testq %rax,%rax
22759 + jne 1f
22760 + jmp int_ret_from_sys_call
22761 +1: RESTORE_ARGS
22762 + UNFAKE_STACK_FRAME
22763 + ret
22764 + CFI_ENDPROC
22765 +ENDPROC(execve)
22766 +
22767 +KPROBE_ENTRY(page_fault)
22768 + errorentry do_page_fault
22769 +END(page_fault)
22770 + .previous .text
22771 +
22772 +ENTRY(coprocessor_error)
22773 + zeroentry do_coprocessor_error
22774 +END(coprocessor_error)
22775 +
22776 +ENTRY(simd_coprocessor_error)
22777 + zeroentry do_simd_coprocessor_error
22778 +END(simd_coprocessor_error)
22779 +
22780 +ENTRY(device_not_available)
22781 + zeroentry math_state_restore
22782 +END(device_not_available)
22783 +
22784 + /* runs on exception stack */
22785 +KPROBE_ENTRY(debug)
22786 +/* INTR_FRAME
22787 + pushq $0
22788 + CFI_ADJUST_CFA_OFFSET 8 */
22789 + zeroentry do_debug
22790 +/* paranoidexit
22791 + CFI_ENDPROC */
22792 +END(debug)
22793 + .previous .text
22794 +
22795 +#if 0
22796 + /* runs on exception stack */
22797 +KPROBE_ENTRY(nmi)
22798 + INTR_FRAME
22799 + pushq $-1
22800 + CFI_ADJUST_CFA_OFFSET 8
22801 + paranoidentry do_nmi, 0, 0
22802 +#ifdef CONFIG_TRACE_IRQFLAGS
22803 + paranoidexit 0
22804 +#else
22805 + jmp paranoid_exit1
22806 + CFI_ENDPROC
22807 +#endif
22808 +END(nmi)
22809 + .previous .text
22810 +#endif
22811 +
22812 +KPROBE_ENTRY(int3)
22813 +/* INTR_FRAME
22814 + pushq $0
22815 + CFI_ADJUST_CFA_OFFSET 8 */
22816 + zeroentry do_int3
22817 +/* jmp paranoid_exit1
22818 + CFI_ENDPROC */
22819 +END(int3)
22820 + .previous .text
22821 +
22822 +ENTRY(overflow)
22823 + zeroentry do_overflow
22824 +END(overflow)
22825 +
22826 +ENTRY(bounds)
22827 + zeroentry do_bounds
22828 +END(bounds)
22829 +
22830 +ENTRY(invalid_op)
22831 + zeroentry do_invalid_op
22832 +END(invalid_op)
22833 +
22834 +ENTRY(coprocessor_segment_overrun)
22835 + zeroentry do_coprocessor_segment_overrun
22836 +END(coprocessor_segment_overrun)
22837 +
22838 +ENTRY(reserved)
22839 + zeroentry do_reserved
22840 +END(reserved)
22841 +
22842 +#if 0
22843 + /* runs on exception stack */
22844 +ENTRY(double_fault)
22845 + XCPT_FRAME
22846 + paranoidentry do_double_fault
22847 + jmp paranoid_exit1
22848 + CFI_ENDPROC
22849 +END(double_fault)
22850 +#endif
22851 +
22852 +ENTRY(invalid_TSS)
22853 + errorentry do_invalid_TSS
22854 +END(invalid_TSS)
22855 +
22856 +ENTRY(segment_not_present)
22857 + errorentry do_segment_not_present
22858 +END(segment_not_present)
22859 +
22860 + /* runs on exception stack */
22861 +ENTRY(stack_segment)
22862 +/* XCPT_FRAME
22863 + paranoidentry do_stack_segment */
22864 + errorentry do_stack_segment
22865 +/* jmp paranoid_exit1
22866 + CFI_ENDPROC */
22867 +END(stack_segment)
22868 +
22869 +KPROBE_ENTRY(general_protection)
22870 + errorentry do_general_protection
22871 +END(general_protection)
22872 + .previous .text
22873 +
22874 +ENTRY(alignment_check)
22875 + errorentry do_alignment_check
22876 +END(alignment_check)
22877 +
22878 +ENTRY(divide_error)
22879 + zeroentry do_divide_error
22880 +END(divide_error)
22881 +
22882 +ENTRY(spurious_interrupt_bug)
22883 + zeroentry do_spurious_interrupt_bug
22884 +END(spurious_interrupt_bug)
22885 +
22886 +#ifdef CONFIG_X86_MCE
22887 + /* runs on exception stack */
22888 +ENTRY(machine_check)
22889 + INTR_FRAME
22890 + pushq $0
22891 + CFI_ADJUST_CFA_OFFSET 8
22892 + paranoidentry do_machine_check
22893 + jmp paranoid_exit1
22894 + CFI_ENDPROC
22895 +END(machine_check)
22896 +#endif
22897 +
22898 +/* Call softirq on interrupt stack. Interrupts are off. */
22899 +ENTRY(call_softirq)
22900 + CFI_STARTPROC
22901 + push %rbp
22902 + CFI_ADJUST_CFA_OFFSET 8
22903 + CFI_REL_OFFSET rbp,0
22904 + mov %rsp,%rbp
22905 + CFI_DEF_CFA_REGISTER rbp
22906 + incl %gs:pda_irqcount
22907 + cmove %gs:pda_irqstackptr,%rsp
22908 + push %rbp # backlink for old unwinder
22909 + call __do_softirq
22910 + leaveq
22911 + CFI_DEF_CFA_REGISTER rsp
22912 + CFI_ADJUST_CFA_OFFSET -8
22913 + decl %gs:pda_irqcount
22914 + ret
22915 + CFI_ENDPROC
22916 +ENDPROC(call_softirq)
22917 +
22918 +#ifdef CONFIG_STACK_UNWIND
22919 +ENTRY(arch_unwind_init_running)
22920 + CFI_STARTPROC
22921 + movq %r15, R15(%rdi)
22922 + movq %r14, R14(%rdi)
22923 + xchgq %rsi, %rdx
22924 + movq %r13, R13(%rdi)
22925 + movq %r12, R12(%rdi)
22926 + xorl %eax, %eax
22927 + movq %rbp, RBP(%rdi)
22928 + movq %rbx, RBX(%rdi)
22929 + movq (%rsp), %rcx
22930 + movq %rax, R11(%rdi)
22931 + movq %rax, R10(%rdi)
22932 + movq %rax, R9(%rdi)
22933 + movq %rax, R8(%rdi)
22934 + movq %rax, RAX(%rdi)
22935 + movq %rax, RCX(%rdi)
22936 + movq %rax, RDX(%rdi)
22937 + movq %rax, RSI(%rdi)
22938 + movq %rax, RDI(%rdi)
22939 + movq %rax, ORIG_RAX(%rdi)
22940 + movq %rcx, RIP(%rdi)
22941 + leaq 8(%rsp), %rcx
22942 + movq $__KERNEL_CS, CS(%rdi)
22943 + movq %rax, EFLAGS(%rdi)
22944 + movq %rcx, RSP(%rdi)
22945 + movq $__KERNEL_DS, SS(%rdi)
22946 + jmpq *%rdx
22947 + CFI_ENDPROC
22948 +ENDPROC(arch_unwind_init_running)
22949 +#endif
22950 Index: head-2008-11-25/arch/x86/kernel/genapic_64-xen.c
22951 ===================================================================
22952 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
22953 +++ head-2008-11-25/arch/x86/kernel/genapic_64-xen.c 2007-06-12 13:13:01.000000000 +0200
22954 @@ -0,0 +1,143 @@
22955 +/*
22956 + * Copyright 2004 James Cleverdon, IBM.
22957 + * Subject to the GNU Public License, v.2
22958 + *
22959 + * Generic APIC sub-arch probe layer.
22960 + *
22961 + * Hacked for x86-64 by James Cleverdon from i386 architecture code by
22962 + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
22963 + * James Cleverdon.
22964 + */
22965 +#include <linux/threads.h>
22966 +#include <linux/cpumask.h>
22967 +#include <linux/string.h>
22968 +#include <linux/kernel.h>
22969 +#include <linux/ctype.h>
22970 +#include <linux/init.h>
22971 +#include <linux/module.h>
22972 +
22973 +#include <asm/smp.h>
22974 +#include <asm/ipi.h>
22975 +
22976 +#if defined(CONFIG_ACPI)
22977 +#include <acpi/acpi_bus.h>
22978 +#endif
22979 +
22980 +/* which logical CPU number maps to which CPU (physical APIC ID) */
22981 +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
22982 +EXPORT_SYMBOL(x86_cpu_to_apicid);
22983 +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
22984 +
22985 +extern struct genapic apic_cluster;
22986 +extern struct genapic apic_flat;
22987 +extern struct genapic apic_physflat;
22988 +
22989 +#ifndef CONFIG_XEN
22990 +struct genapic *genapic = &apic_flat;
22991 +#else
22992 +extern struct genapic apic_xen;
22993 +struct genapic *genapic = &apic_xen;
22994 +#endif
22995 +
22996 +
22997 +/*
22998 + * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
22999 + */
23000 +void __init clustered_apic_check(void)
23001 +{
23002 +#ifndef CONFIG_XEN
23003 + long i;
23004 + u8 clusters, max_cluster;
23005 + u8 id;
23006 + u8 cluster_cnt[NUM_APIC_CLUSTERS];
23007 + int max_apic = 0;
23008 +
23009 +#if defined(CONFIG_ACPI)
23010 + /*
23011 + * Some x86_64 machines use physical APIC mode regardless of how many
23012 + * procs/clusters are present (x86_64 ES7000 is an example).
23013 + */
23014 + if (acpi_fadt.revision > FADT2_REVISION_ID)
23015 + if (acpi_fadt.force_apic_physical_destination_mode) {
23016 + genapic = &apic_cluster;
23017 + goto print;
23018 + }
23019 +#endif
23020 +
23021 + memset(cluster_cnt, 0, sizeof(cluster_cnt));
23022 + for (i = 0; i < NR_CPUS; i++) {
23023 + id = bios_cpu_apicid[i];
23024 + if (id == BAD_APICID)
23025 + continue;
23026 + if (id > max_apic)
23027 + max_apic = id;
23028 + cluster_cnt[APIC_CLUSTERID(id)]++;
23029 + }
23030 +
23031 + /* Don't use clustered mode on AMD platforms. */
23032 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
23033 + genapic = &apic_physflat;
23034 +#ifndef CONFIG_HOTPLUG_CPU
23035 + /* In the CPU hotplug case we cannot use broadcast mode
23036 + because that opens a race when a CPU is removed.
23037 + Stay at physflat mode in this case.
23038 + It is bad to do this unconditionally though. Once
23039 + we have ACPI platform support for CPU hotplug
23040 +		   we should detect hotplug capability from ACPI tables and
23041 + only do this when really needed. -AK */
23042 + if (max_apic <= 8)
23043 + genapic = &apic_flat;
23044 +#endif
23045 + goto print;
23046 + }
23047 +
23048 + clusters = 0;
23049 + max_cluster = 0;
23050 +
23051 + for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
23052 + if (cluster_cnt[i] > 0) {
23053 + ++clusters;
23054 + if (cluster_cnt[i] > max_cluster)
23055 + max_cluster = cluster_cnt[i];
23056 + }
23057 + }
23058 +
23059 + /*
23060 + * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
23061 + * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
23062 + * else physical mode.
23063 + * (We don't use lowest priority delivery + HW APIC IRQ steering, so
23064 + * can ignore the clustered logical case and go straight to physical.)
23065 + */
23066 + if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
23067 +#ifdef CONFIG_HOTPLUG_CPU
23068 + /* Don't use APIC shortcuts in CPU hotplug to avoid races */
23069 + genapic = &apic_physflat;
23070 +#else
23071 + genapic = &apic_flat;
23072 +#endif
23073 + } else
23074 + genapic = &apic_cluster;
23075 +
23076 +print:
23077 +#else
23078 + /* hardcode to xen apic functions */
23079 + genapic = &apic_xen;
23080 +#endif
23081 + printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
23082 +}
23083 +
23084 +/* Same for both flat and clustered. */
23085 +
23086 +#ifdef CONFIG_XEN
23087 +extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
23088 +#endif
23089 +
23090 +void send_IPI_self(int vector)
23091 +{
23092 +#ifndef CONFIG_XEN
23093 + __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
23094 +#else
23095 + xen_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
23096 +#endif
23097 +}
23098 Index: head-2008-11-25/arch/x86/kernel/genapic_xen_64.c
23099 ===================================================================
23100 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
23101 +++ head-2008-11-25/arch/x86/kernel/genapic_xen_64.c 2007-06-12 13:13:01.000000000 +0200
23102 @@ -0,0 +1,161 @@
23103 +/*
23104 + * Copyright 2004 James Cleverdon, IBM.
23105 + * Subject to the GNU Public License, v.2
23106 + *
23107 + * Xen APIC subarch code. Maximum 8 CPUs, logical delivery.
23108 + *
23109 + * Hacked for x86-64 by James Cleverdon from i386 architecture code by
23110 + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
23111 + * James Cleverdon.
23112 + *
23113 + * Hacked to pieces for Xen by Chris Wright.
23114 + */
23115 +#include <linux/threads.h>
23116 +#include <linux/cpumask.h>
23117 +#include <linux/string.h>
23118 +#include <linux/kernel.h>
23119 +#include <linux/ctype.h>
23120 +#include <linux/init.h>
23121 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
23122 +#include <asm/smp.h>
23123 +#include <asm/ipi.h>
23124 +#else
23125 +#include <asm/apic.h>
23126 +#include <asm/apicdef.h>
23127 +#include <asm/genapic.h>
23128 +#endif
23129 +#include <xen/evtchn.h>
23130 +
23131 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
23132 +
23133 +static inline void __send_IPI_one(unsigned int cpu, int vector)
23134 +{
23135 + int irq = per_cpu(ipi_to_irq, cpu)[vector];
23136 + BUG_ON(irq < 0);
23137 + notify_remote_via_irq(irq);
23138 +}
23139 +
23140 +void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
23141 +{
23142 + int cpu;
23143 +
23144 + switch (shortcut) {
23145 + case APIC_DEST_SELF:
23146 + __send_IPI_one(smp_processor_id(), vector);
23147 + break;
23148 + case APIC_DEST_ALLBUT:
23149 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
23150 + if (cpu == smp_processor_id())
23151 + continue;
23152 + if (cpu_isset(cpu, cpu_online_map)) {
23153 + __send_IPI_one(cpu, vector);
23154 + }
23155 + }
23156 + break;
23157 + case APIC_DEST_ALLINC:
23158 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
23159 + if (cpu_isset(cpu, cpu_online_map)) {
23160 + __send_IPI_one(cpu, vector);
23161 + }
23162 + }
23163 + break;
23164 + default:
23165 + printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
23166 + vector);
23167 + break;
23168 + }
23169 +}
23170 +
23171 +static cpumask_t xen_target_cpus(void)
23172 +{
23173 + return cpu_online_map;
23174 +}
23175 +
23176 +/*
23177 + * Set up the logical destination ID.
23178 + * Do nothing, not called now.
23179 + */
23180 +static void xen_init_apic_ldr(void)
23181 +{
23182 + Dprintk("%s\n", __FUNCTION__);
23183 + return;
23184 +}
23185 +
23186 +static void xen_send_IPI_allbutself(int vector)
23187 +{
23188 + /*
23189 +	 * If there are no other CPUs in the system then
23190 +	 * we get an APIC send error if we try to broadcast.
23191 +	 * Thus we have to avoid sending IPIs in this case.
23192 + */
23193 + Dprintk("%s\n", __FUNCTION__);
23194 + if (num_online_cpus() > 1)
23195 + xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
23196 +}
23197 +
23198 +static void xen_send_IPI_all(int vector)
23199 +{
23200 + Dprintk("%s\n", __FUNCTION__);
23201 + xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
23202 +}
23203 +
23204 +static void xen_send_IPI_mask(cpumask_t cpumask, int vector)
23205 +{
23206 + unsigned long mask = cpus_addr(cpumask)[0];
23207 + unsigned int cpu;
23208 + unsigned long flags;
23209 +
23210 + Dprintk("%s\n", __FUNCTION__);
23211 + local_irq_save(flags);
23212 + WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
23213 +
23214 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
23215 + if (cpu_isset(cpu, cpumask)) {
23216 + __send_IPI_one(cpu, vector);
23217 + }
23218 + }
23219 + local_irq_restore(flags);
23220 +}
23221 +
23222 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
23223 +static int xen_apic_id_registered(void)
23224 +{
23225 + /* better be set */
23226 + Dprintk("%s\n", __FUNCTION__);
23227 + return physid_isset(smp_processor_id(), phys_cpu_present_map);
23228 +}
23229 +#endif
23230 +
23231 +static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
23232 +{
23233 + Dprintk("%s\n", __FUNCTION__);
23234 + return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
23235 +}
23236 +
23237 +static unsigned int phys_pkg_id(int index_msb)
23238 +{
23239 + u32 ebx;
23240 +
23241 + Dprintk("%s\n", __FUNCTION__);
23242 + ebx = cpuid_ebx(1);
23243 + return ((ebx >> 24) & 0xFF) >> index_msb;
23244 +}
23245 +
23246 +struct genapic apic_xen = {
23247 + .name = "xen",
23248 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
23249 + .int_delivery_mode = dest_LowestPrio,
23250 +#endif
23251 + .int_dest_mode = (APIC_DEST_LOGICAL != 0),
23252 + .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
23253 + .target_cpus = xen_target_cpus,
23254 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
23255 + .apic_id_registered = xen_apic_id_registered,
23256 +#endif
23257 + .init_apic_ldr = xen_init_apic_ldr,
23258 + .send_IPI_all = xen_send_IPI_all,
23259 + .send_IPI_allbutself = xen_send_IPI_allbutself,
23260 + .send_IPI_mask = xen_send_IPI_mask,
23261 + .cpu_mask_to_apicid = xen_cpu_mask_to_apicid,
23262 + .phys_pkg_id = phys_pkg_id,
23263 +};
23264 Index: head-2008-11-25/arch/x86/kernel/head_64-xen.S
23265 ===================================================================
23266 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
23267 +++ head-2008-11-25/arch/x86/kernel/head_64-xen.S 2007-08-06 15:10:49.000000000 +0200
23268 @@ -0,0 +1,214 @@
23269 +/*
23270 + * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
23271 + *
23272 + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
23273 + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
23274 + * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
23275 + * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
23276 + *
23277 + * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $
23278 + *
23279 + * Jun Nakajima <jun.nakajima@intel.com>
23280 + * Modified for Xen
23281 + */
23282 +
23283 +
23284 +#include <linux/linkage.h>
23285 +#include <linux/threads.h>
23286 +#include <linux/init.h>
23287 +#include <linux/elfnote.h>
23288 +#include <asm/desc.h>
23289 +#include <asm/segment.h>
23290 +#include <asm/page.h>
23291 +#include <asm/msr.h>
23292 +#include <asm/cache.h>
23293 +#include <asm/dwarf2.h>
23294 +#include <xen/interface/elfnote.h>
23295 +
23296 + .section .bootstrap.text, "ax", @progbits
23297 + .code64
23298 + .globl startup_64
23299 +startup_64:
23300 + movq $(init_thread_union+THREAD_SIZE-8),%rsp
23301 +
23302 + /* rsi is pointer to startup info structure.
23303 + pass it to C */
23304 + movq %rsi,%rdi
23305 + pushq $0 # fake return address
23306 + jmp x86_64_start_kernel
23307 +
23308 +#ifdef CONFIG_ACPI_SLEEP
23309 +.org 0xf00
23310 + .globl pGDT32
23311 +pGDT32:
23312 + .word gdt_end-cpu_gdt_table-1
23313 + .long cpu_gdt_table-__START_KERNEL_map
23314 +#endif
23315 +ENTRY(stext)
23316 +ENTRY(_stext)
23317 +
23318 + $page = 0
23319 +#define NEXT_PAGE(name) \
23320 + $page = $page + 1; \
23321 + .org $page * 0x1000; \
23322 + phys_##name = $page * 0x1000 + __PHYSICAL_START; \
23323 +ENTRY(name)
23324 +
23325 +NEXT_PAGE(init_level4_pgt)
23326 + /* This gets initialized in x86_64_start_kernel */
23327 + .fill 512,8,0
23328 +NEXT_PAGE(init_level4_user_pgt)
23329 + /*
23330 + * We update two pgd entries to make kernel and user pgd consistent
23331 + * at pgd_populate(). It can be used for kernel modules. So we place
23332 + * this page here for those cases to avoid memory corruption.
23333 + * We also use this page to establish the initial mapping for the
23334 + * vsyscall area.
23335 + */
23336 + .fill 512,8,0
23337 +
23338 +NEXT_PAGE(level3_kernel_pgt)
23339 + .fill 512,8,0
23340 +
23341 + /*
23342 + * This is used for vsyscall area mapping as we have a different
23343 + * level4 page table for user.
23344 + */
23345 +NEXT_PAGE(level3_user_pgt)
23346 + .fill 512,8,0
23347 +
23348 +NEXT_PAGE(level2_kernel_pgt)
23349 + .fill 512,8,0
23350 +
23351 +NEXT_PAGE(hypercall_page)
23352 + CFI_STARTPROC
23353 + .rept 0x1000 / 0x20
23354 + .skip 1 /* push %rcx */
23355 + CFI_ADJUST_CFA_OFFSET 8
23356 + CFI_REL_OFFSET rcx,0
23357 + .skip 2 /* push %r11 */
23358 + CFI_ADJUST_CFA_OFFSET 8
23359 + CFI_REL_OFFSET rcx,0
23360 + .skip 5 /* mov $#,%eax */
23361 + .skip 2 /* syscall */
23362 + .skip 2 /* pop %r11 */
23363 + CFI_ADJUST_CFA_OFFSET -8
23364 + CFI_RESTORE r11
23365 + .skip 1 /* pop %rcx */
23366 + CFI_ADJUST_CFA_OFFSET -8
23367 + CFI_RESTORE rcx
23368 + .align 0x20,0 /* ret */
23369 + .endr
23370 + CFI_ENDPROC
23371 +
23372 +#undef NEXT_PAGE
23373 +
23374 + .data
23375 +/* Just dummy symbol to allow compilation. Not used in sleep path */
23376 +#ifdef CONFIG_ACPI_SLEEP
23377 + .align PAGE_SIZE
23378 +ENTRY(wakeup_level4_pgt)
23379 + .fill 512,8,0
23380 +#endif
23381 +
23382 + .data
23383 +
23384 + .align 16
23385 + .globl cpu_gdt_descr
23386 +cpu_gdt_descr:
23387 + .word gdt_end-cpu_gdt_table-1
23388 +gdt:
23389 + .quad cpu_gdt_table
23390 +#ifdef CONFIG_SMP
23391 + .rept NR_CPUS-1
23392 + .word 0
23393 + .quad 0
23394 + .endr
23395 +#endif
23396 +
23397 +/* We need valid kernel segments for data and code in long mode too
23398 + * IRET will check the segment types kkeil 2000/10/28
23399 + * Also sysret mandates a special GDT layout
23400 + */
23401 +
23402 + .section .data.page_aligned, "aw"
23403 + .align PAGE_SIZE
23404 +
23405 +/* The TLS descriptors are currently at a different place compared to i386.
23406 + Hopefully nobody expects them at a fixed place (Wine?) */
23407 +
23408 +ENTRY(cpu_gdt_table)
23409 + .quad 0x0000000000000000 /* NULL descriptor */
23410 + .quad 0x0 /* unused */
23411 + .quad 0x00af9a000000ffff /* __KERNEL_CS */
23412 + .quad 0x00cf92000000ffff /* __KERNEL_DS */
23413 + .quad 0x00cffa000000ffff /* __USER32_CS */
23414 + .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */
23415 + .quad 0x00affa000000ffff /* __USER_CS */
23416 + .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
23417 + .quad 0,0 /* TSS */
23418 + .quad 0,0 /* LDT */
23419 + .quad 0,0,0 /* three TLS descriptors */
23420 + .quad 0 /* unused */
23421 +gdt_end:
23422 + /* asm/segment.h:GDT_ENTRIES must match this */
23423 + /* This should be a multiple of the cache line size */
23424 + /* GDTs of other CPUs are now dynamically allocated */
23425 +
23426 + /* zero the remaining page */
23427 + .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
23428 +
23429 + .section .bss.page_aligned, "aw", @nobits
23430 + .align PAGE_SIZE
23431 +ENTRY(empty_zero_page)
23432 + .skip PAGE_SIZE
23433 +
23434 +#if CONFIG_XEN_COMPAT <= 0x030002
23435 +/*
23436 + * __xen_guest information
23437 + */
23438 +.macro utoh value
23439 + .if (\value) < 0 || (\value) >= 0x10
23440 + utoh (((\value)>>4)&0x0fffffffffffffff)
23441 + .endif
23442 + .if ((\value) & 0xf) < 10
23443 + .byte '0' + ((\value) & 0xf)
23444 + .else
23445 + .byte 'A' + ((\value) & 0xf) - 10
23446 + .endif
23447 +.endm
23448 +
23449 +.section __xen_guest
23450 + .ascii "GUEST_OS=linux,GUEST_VER=2.6"
23451 + .ascii ",XEN_VER=xen-3.0"
23452 + .ascii ",VIRT_BASE=0x"
23453 + utoh __START_KERNEL_map
23454 + .ascii ",ELF_PADDR_OFFSET=0x"
23455 + utoh __START_KERNEL_map
23456 + .ascii ",VIRT_ENTRY=0x"
23457 + utoh (__START_KERNEL_map + __PHYSICAL_START)
23458 + .ascii ",HYPERCALL_PAGE=0x"
23459 + utoh (phys_hypercall_page >> PAGE_SHIFT)
23460 + .ascii ",FEATURES=writable_page_tables"
23461 + .ascii "|writable_descriptor_tables"
23462 + .ascii "|auto_translated_physmap"
23463 + .ascii "|supervisor_mode_kernel"
23464 + .ascii ",LOADER=generic"
23465 + .byte 0
23466 +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */
23467 +
23468 + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux")
23469 + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6")
23470 + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0")
23471 + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .quad, __START_KERNEL_map)
23472 +#if CONFIG_XEN_COMPAT <= 0x030002
23473 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad, __START_KERNEL_map)
23474 +#else
23475 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad, 0)
23476 +#endif
23477 + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad, startup_64)
23478 + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad, hypercall_page)
23479 + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad, _PAGE_PRESENT,_PAGE_PRESENT)
23480 + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
23481 + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic")
23482 + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1)
23483 Index: head-2008-11-25/arch/x86/kernel/head64-xen.c
23484 ===================================================================
23485 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
23486 +++ head-2008-11-25/arch/x86/kernel/head64-xen.c 2007-06-12 13:13:01.000000000 +0200
23487 @@ -0,0 +1,162 @@
23488 +/*
23489 + * linux/arch/x86_64/kernel/head64.c -- prepare to run common code
23490 + *
23491 + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
23492 + *
23493 + * Jun Nakajima <jun.nakajima@intel.com>
23494 + * Modified for Xen.
23495 + */
23496 +
23497 +#include <linux/init.h>
23498 +#include <linux/linkage.h>
23499 +#include <linux/types.h>
23500 +#include <linux/kernel.h>
23501 +#include <linux/string.h>
23502 +#include <linux/percpu.h>
23503 +#include <linux/module.h>
23504 +
23505 +#include <asm/processor.h>
23506 +#include <asm/proto.h>
23507 +#include <asm/smp.h>
23508 +#include <asm/bootsetup.h>
23509 +#include <asm/setup.h>
23510 +#include <asm/desc.h>
23511 +#include <asm/pgtable.h>
23512 +#include <asm/sections.h>
23513 +
23514 +unsigned long start_pfn;
23515 +
23516 +/* Don't add a printk in there. printk relies on the PDA which is not initialized
23517 + yet. */
23518 +#if 0
23519 +static void __init clear_bss(void)
23520 +{
23521 + memset(__bss_start, 0,
23522 + (unsigned long) __bss_stop - (unsigned long) __bss_start);
23523 +}
23524 +#endif
23525 +
23526 +#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
23527 +#define OLD_CL_MAGIC_ADDR 0x90020
23528 +#define OLD_CL_MAGIC 0xA33F
23529 +#define OLD_CL_BASE_ADDR 0x90000
23530 +#define OLD_CL_OFFSET 0x90022
23531 +
23532 +extern char saved_command_line[];
23533 +
23534 +static void __init copy_bootdata(char *real_mode_data)
23535 +{
23536 +#ifndef CONFIG_XEN
23537 + int new_data;
23538 + char * command_line;
23539 +
23540 + memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
23541 + new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
23542 + if (!new_data) {
23543 + if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
23544 + printk("so old bootloader that it does not support commandline?!\n");
23545 + return;
23546 + }
23547 + new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
23548 + printk("old bootloader convention, maybe loadlin?\n");
23549 + }
23550 + command_line = (char *) ((u64)(new_data));
23551 + memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
23552 +#else
23553 + int max_cmdline;
23554 +
23555 + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
23556 + max_cmdline = COMMAND_LINE_SIZE;
23557 + memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
23558 + saved_command_line[max_cmdline-1] = '\0';
23559 +#endif
23560 + printk("Bootdata ok (command line is %s)\n", saved_command_line);
23561 +}
23562 +
23563 +static void __init setup_boot_cpu_data(void)
23564 +{
23565 + unsigned int dummy, eax;
23566 +
23567 + /* get vendor info */
23568 + cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
23569 + (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
23570 + (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
23571 + (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
23572 +
23573 + /* get cpu type */
23574 + cpuid(1, &eax, &dummy, &dummy,
23575 + (unsigned int *) &boot_cpu_data.x86_capability);
23576 + boot_cpu_data.x86 = (eax >> 8) & 0xf;
23577 + boot_cpu_data.x86_model = (eax >> 4) & 0xf;
23578 + boot_cpu_data.x86_mask = eax & 0xf;
23579 +}
23580 +
23581 +#include <xen/interface/memory.h>
23582 +unsigned long *machine_to_phys_mapping;
23583 +EXPORT_SYMBOL(machine_to_phys_mapping);
23584 +unsigned int machine_to_phys_order;
23585 +EXPORT_SYMBOL(machine_to_phys_order);
23586 +
23587 +void __init x86_64_start_kernel(char * real_mode_data)
23588 +{
23589 + struct xen_machphys_mapping mapping;
23590 + unsigned long machine_to_phys_nr_ents;
23591 + char *s;
23592 + int i;
23593 +
23594 + setup_xen_features();
23595 +
23596 + xen_start_info = (struct start_info *)real_mode_data;
23597 + if (!xen_feature(XENFEAT_auto_translated_physmap))
23598 + phys_to_machine_mapping =
23599 + (unsigned long *)xen_start_info->mfn_list;
23600 + start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
23601 + xen_start_info->nr_pt_frames;
23602 +
23603 + machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
23604 + machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
23605 + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
23606 + machine_to_phys_mapping = (unsigned long *)mapping.v_start;
23607 + machine_to_phys_nr_ents = mapping.max_mfn + 1;
23608 + }
23609 + while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
23610 + machine_to_phys_order++;
23611 +
23612 +#if 0
23613 + for (i = 0; i < 256; i++)
23614 + set_intr_gate(i, early_idt_handler);
23615 + asm volatile("lidt %0" :: "m" (idt_descr));
23616 +#endif
23617 +
23618 + /*
23619 + * This must be called really, really early:
23620 + */
23621 + lockdep_init();
23622 +
23623 + for (i = 0; i < NR_CPUS; i++)
23624 + cpu_pda(i) = &boot_cpu_pda[i];
23625 +
23626 + pda_init(0);
23627 + copy_bootdata(real_mode_data);
23628 +#ifdef CONFIG_SMP
23629 + cpu_set(0, cpu_online_map);
23630 +#endif
23631 + s = strstr(saved_command_line, "earlyprintk=");
23632 + if (s != NULL)
23633 + setup_early_printk(strchr(s, '=') + 1);
23634 +#ifdef CONFIG_NUMA
23635 + s = strstr(saved_command_line, "numa=");
23636 + if (s != NULL)
23637 + numa_setup(s+5);
23638 +#endif
23639 +#ifdef CONFIG_X86_IO_APIC
23640 + if (strstr(saved_command_line, "disableapic"))
23641 + disable_apic = 1;
23642 +#endif
23643 + /* You need early console to see that */
23644 + if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
23645 + panic("Kernel too big for kernel mapping\n");
23646 +
23647 + setup_boot_cpu_data();
23648 + start_kernel();
23649 +}
23650 Index: head-2008-11-25/arch/x86/kernel/io_apic_64-xen.c
23651 ===================================================================
23652 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
23653 +++ head-2008-11-25/arch/x86/kernel/io_apic_64-xen.c 2008-11-25 12:22:34.000000000 +0100
23654 @@ -0,0 +1,2268 @@
23655 +/*
23656 + * Intel IO-APIC support for multi-Pentium hosts.
23657 + *
23658 + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
23659 + *
23660 + * Many thanks to Stig Venaas for trying out countless experimental
23661 + * patches and reporting/debugging problems patiently!
23662 + *
23663 + * (c) 1999, Multiple IO-APIC support, developed by
23664 + * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
23665 + * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
23666 + * further tested and cleaned up by Zach Brown <zab@redhat.com>
23667 + * and Ingo Molnar <mingo@redhat.com>
23668 + *
23669 + * Fixes
23670 + * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
23671 + * thanks to Eric Gilmore
23672 + * and Rolf G. Tews
23673 + * for testing these extensively
23674 + * Paul Diefenbaugh : Added full ACPI support
23675 + */
23676 +
23677 +#include <linux/mm.h>
23678 +#include <linux/interrupt.h>
23679 +#include <linux/init.h>
23680 +#include <linux/delay.h>
23681 +#include <linux/sched.h>
23682 +#include <linux/smp_lock.h>
23683 +#include <linux/mc146818rtc.h>
23684 +#include <linux/acpi.h>
23685 +#include <linux/sysdev.h>
23686 +#ifdef CONFIG_ACPI
23687 +#include <acpi/acpi_bus.h>
23688 +#endif
23689 +
23690 +#include <asm/io.h>
23691 +#include <asm/smp.h>
23692 +#include <asm/desc.h>
23693 +#include <asm/proto.h>
23694 +#include <asm/mach_apic.h>
23695 +#include <asm/acpi.h>
23696 +#include <asm/dma.h>
23697 +#include <asm/nmi.h>
23698 +
23699 +#define __apicdebuginit __init
23700 +
23701 +int sis_apic_bug; /* not actually supported, dummy for compile */
23702 +
23703 +static int no_timer_check;
23704 +
23705 +int disable_timer_pin_1 __initdata;
23706 +
23707 +#ifndef CONFIG_XEN
23708 +int timer_over_8254 __initdata = 0;
23709 +
23710 +/* Where if anywhere is the i8259 connect in external int mode */
23711 +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
23712 +#endif
23713 +
23714 +static DEFINE_SPINLOCK(ioapic_lock);
23715 +static DEFINE_SPINLOCK(vector_lock);
23716 +
23717 +/*
23718 + * # of IRQ routing registers
23719 + */
23720 +int nr_ioapic_registers[MAX_IO_APICS];
23721 +
23722 +/*
23723 + * Rough estimation of how many shared IRQs there are, can
23724 + * be changed anytime.
23725 + */
23726 +#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
23727 +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
23728 +
23729 +/*
23730 + * This is performance-critical, we want to do it O(1)
23731 + *
23732 + * the indexing order of this array favors 1:1 mappings
23733 + * between pins and IRQs.
23734 + */
23735 +
23736 +static struct irq_pin_list {
23737 + short apic, pin, next;
23738 +} irq_2_pin[PIN_MAP_SIZE];
23739 +
23740 +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
23741 +#ifdef CONFIG_PCI_MSI
23742 +#define vector_to_irq(vector) \
23743 + (platform_legacy_irq(vector) ? vector : vector_irq[vector])
23744 +#else
23745 +#define vector_to_irq(vector) (vector)
23746 +#endif
23747 +
23748 +#ifdef CONFIG_XEN
23749 +
23750 +#include <xen/interface/xen.h>
23751 +#include <xen/interface/physdev.h>
23752 +#include <xen/evtchn.h>
23753 +
23754 +/* Fake i8259 */
23755 +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
23756 +#define disable_8259A_irq(_irq) ((void)0)
23757 +#define i8259A_irq_pending(_irq) (0)
23758 +
23759 +unsigned long io_apic_irqs;
23760 +
23761 +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
23762 +{
23763 + struct physdev_apic apic_op;
23764 + int ret;
23765 +
23766 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
23767 + apic_op.reg = reg;
23768 + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
23769 + if (ret)
23770 + return ret;
23771 + return apic_op.value;
23772 +}
23773 +
23774 +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
23775 +{
23776 + struct physdev_apic apic_op;
23777 +
23778 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
23779 + apic_op.reg = reg;
23780 + apic_op.value = value;
23781 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
23782 +}
23783 +
23784 +#define io_apic_read(a,r) xen_io_apic_read(a,r)
23785 +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
23786 +
23787 +#define clear_IO_APIC() ((void)0)
23788 +
23789 +#else
23790 +
23791 +#ifdef CONFIG_SMP
23792 +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
23793 +{
23794 + unsigned long flags;
23795 + unsigned int dest;
23796 + cpumask_t tmp;
23797 +
23798 + cpus_and(tmp, mask, cpu_online_map);
23799 + if (cpus_empty(tmp))
23800 + tmp = TARGET_CPUS;
23801 +
23802 + cpus_and(mask, tmp, CPU_MASK_ALL);
23803 +
23804 + dest = cpu_mask_to_apicid(mask);
23805 +
23806 + /*
23807 + * Only the high 8 bits are valid.
23808 + */
23809 + dest = SET_APIC_LOGICAL_ID(dest);
23810 +
23811 + spin_lock_irqsave(&ioapic_lock, flags);
23812 + __DO_ACTION(1, = dest, )
23813 + set_irq_info(irq, mask);
23814 + spin_unlock_irqrestore(&ioapic_lock, flags);
23815 +}
23816 +#endif
23817 +
23818 +#endif /* !CONFIG_XEN */
23819 +
23820 +/*
23821 + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
23822 + * shared ISA-space IRQs, so we have to support them. We are super
23823 + * fast in the common case, and fast for shared ISA-space IRQs.
23824 + */
23825 +static void add_pin_to_irq(unsigned int irq, int apic, int pin)
23826 +{
23827 + static int first_free_entry = NR_IRQS;
23828 + struct irq_pin_list *entry = irq_2_pin + irq;
23829 +
23830 + BUG_ON(irq >= NR_IRQS);
23831 + while (entry->next)
23832 + entry = irq_2_pin + entry->next;
23833 +
23834 + if (entry->pin != -1) {
23835 + entry->next = first_free_entry;
23836 + entry = irq_2_pin + entry->next;
23837 + if (++first_free_entry >= PIN_MAP_SIZE)
23838 + panic("io_apic.c: ran out of irq_2_pin entries!");
23839 + }
23840 + entry->apic = apic;
23841 + entry->pin = pin;
23842 +}
23843 +
23844 +#ifndef CONFIG_XEN
23845 +#define __DO_ACTION(R, ACTION, FINAL) \
23846 + \
23847 +{ \
23848 + int pin; \
23849 + struct irq_pin_list *entry = irq_2_pin + irq; \
23850 + \
23851 + BUG_ON(irq >= NR_IRQS); \
23852 + for (;;) { \
23853 + unsigned int reg; \
23854 + pin = entry->pin; \
23855 + if (pin == -1) \
23856 + break; \
23857 + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
23858 + reg ACTION; \
23859 + io_apic_modify(entry->apic, reg); \
23860 + if (!entry->next) \
23861 + break; \
23862 + entry = irq_2_pin + entry->next; \
23863 + } \
23864 + FINAL; \
23865 +}
23866 +
23867 +#define DO_ACTION(name,R,ACTION, FINAL) \
23868 + \
23869 + static void name##_IO_APIC_irq (unsigned int irq) \
23870 + __DO_ACTION(R, ACTION, FINAL)
23871 +
23872 +DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
23873 + /* mask = 1 */
23874 +DO_ACTION( __unmask, 0, &= 0xfffeffff, )
23875 + /* mask = 0 */
23876 +
23877 +static void mask_IO_APIC_irq (unsigned int irq)
23878 +{
23879 + unsigned long flags;
23880 +
23881 + spin_lock_irqsave(&ioapic_lock, flags);
23882 + __mask_IO_APIC_irq(irq);
23883 + spin_unlock_irqrestore(&ioapic_lock, flags);
23884 +}
23885 +
23886 +static void unmask_IO_APIC_irq (unsigned int irq)
23887 +{
23888 + unsigned long flags;
23889 +
23890 + spin_lock_irqsave(&ioapic_lock, flags);
23891 + __unmask_IO_APIC_irq(irq);
23892 + spin_unlock_irqrestore(&ioapic_lock, flags);
23893 +}
23894 +
23895 +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
23896 +{
23897 + struct IO_APIC_route_entry entry;
23898 + unsigned long flags;
23899 +
23900 + /* Check delivery_mode to be sure we're not clearing an SMI pin */
23901 + spin_lock_irqsave(&ioapic_lock, flags);
23902 + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
23903 + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
23904 + spin_unlock_irqrestore(&ioapic_lock, flags);
23905 + if (entry.delivery_mode == dest_SMI)
23906 + return;
23907 + /*
23908 + * Disable it in the IO-APIC irq-routing table:
23909 + */
23910 + memset(&entry, 0, sizeof(entry));
23911 + entry.mask = 1;
23912 + spin_lock_irqsave(&ioapic_lock, flags);
23913 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
23914 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
23915 + spin_unlock_irqrestore(&ioapic_lock, flags);
23916 +}
23917 +
23918 +static void clear_IO_APIC (void)
23919 +{
23920 + int apic, pin;
23921 +
23922 + for (apic = 0; apic < nr_ioapics; apic++)
23923 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
23924 + clear_IO_APIC_pin(apic, pin);
23925 +}
23926 +
23927 +#endif /* !CONFIG_XEN */
23928 +
23929 +static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
23930 +
23931 +/*
23932 + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
23933 + * specific CPU-side IRQs.
23934 + */
23935 +
23936 +#define MAX_PIRQS 8
23937 +static int pirq_entries [MAX_PIRQS];
23938 +static int pirqs_enabled;
23939 +int skip_ioapic_setup;
23940 +int ioapic_force;
23941 +
23942 +/* dummy parsing: see setup.c */
23943 +
23944 +static int __init disable_ioapic_setup(char *str)
23945 +{
23946 + skip_ioapic_setup = 1;
23947 + return 1;
23948 +}
23949 +
23950 +static int __init enable_ioapic_setup(char *str)
23951 +{
23952 + ioapic_force = 1;
23953 + skip_ioapic_setup = 0;
23954 + return 1;
23955 +}
23956 +
23957 +__setup("noapic", disable_ioapic_setup);
23958 +__setup("apic", enable_ioapic_setup);
23959 +
23960 +#ifndef CONFIG_XEN
23961 +static int __init setup_disable_8254_timer(char *s)
23962 +{
23963 + timer_over_8254 = -1;
23964 + return 1;
23965 +}
23966 +static int __init setup_enable_8254_timer(char *s)
23967 +{
23968 + timer_over_8254 = 2;
23969 + return 1;
23970 +}
23971 +
23972 +__setup("disable_8254_timer", setup_disable_8254_timer);
23973 +__setup("enable_8254_timer", setup_enable_8254_timer);
23974 +#endif /* !CONFIG_XEN */
23975 +
23976 +#include <asm/pci-direct.h>
23977 +#include <linux/pci_ids.h>
23978 +#include <linux/pci.h>
23979 +
23980 +
23981 +#ifdef CONFIG_ACPI
23982 +
23983 +static int nvidia_hpet_detected __initdata;
23984 +
23985 +static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
23986 +{
23987 + nvidia_hpet_detected = 1;
23988 + return 0;
23989 +}
23990 +#endif
23991 +
23992 +/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
23993 + off. Check for an Nvidia or VIA PCI bridge and turn it off.
23994 + Use pci direct infrastructure because this runs before the PCI subsystem.
23995 +
23996 + Can be overwritten with "apic"
23997 +
23998 + And another hack to disable the IOMMU on VIA chipsets.
23999 +
24000 + ... and others. Really should move this somewhere else.
24001 +
24002 + Kludge-O-Rama. */
24003 +void __init check_ioapic(void)
24004 +{
24005 + int num,slot,func;
24006 + /* Poor man's PCI discovery */
24007 + for (num = 0; num < 32; num++) {
24008 + for (slot = 0; slot < 32; slot++) {
24009 + for (func = 0; func < 8; func++) {
24010 + u32 class;
24011 + u32 vendor;
24012 + u8 type;
24013 + class = read_pci_config(num,slot,func,
24014 + PCI_CLASS_REVISION);
24015 + if (class == 0xffffffff)
24016 + break;
24017 +
24018 + if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
24019 + continue;
24020 +
24021 + vendor = read_pci_config(num, slot, func,
24022 + PCI_VENDOR_ID);
24023 + vendor &= 0xffff;
24024 + switch (vendor) {
24025 + case PCI_VENDOR_ID_VIA:
24026 +#ifdef CONFIG_IOMMU
24027 + if ((end_pfn > MAX_DMA32_PFN ||
24028 + force_iommu) &&
24029 + !iommu_aperture_allowed) {
24030 + printk(KERN_INFO
24031 + "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n");
24032 + iommu_aperture_disabled = 1;
24033 + }
24034 +#endif
24035 + return;
24036 + case PCI_VENDOR_ID_NVIDIA:
24037 +#ifdef CONFIG_ACPI
24038 + /*
24039 + * All timer overrides on Nvidia are
24040 + * wrong unless HPET is enabled.
24041 + */
24042 + nvidia_hpet_detected = 0;
24043 + acpi_table_parse(ACPI_HPET,
24044 + nvidia_hpet_check);
24045 + if (nvidia_hpet_detected == 0) {
24046 + acpi_skip_timer_override = 1;
24047 + printk(KERN_INFO "Nvidia board "
24048 + "detected. Ignoring ACPI "
24049 + "timer override.\n");
24050 + }
24051 +#endif
24052 + /* RED-PEN skip them on mptables too? */
24053 + return;
24054 + case PCI_VENDOR_ID_ATI:
24055 +
24056 + /* This should be actually default, but
24057 + for 2.6.16 let's do it for ATI only where
24058 + it's really needed. */
24059 +#ifndef CONFIG_XEN
24060 + if (timer_over_8254 == 1) {
24061 + timer_over_8254 = 0;
24062 + printk(KERN_INFO
24063 + "ATI board detected. Disabling timer routing over 8254.\n");
24064 + }
24065 +#endif
24066 + return;
24067 + }
24068 +
24069 +
24070 + /* No multi-function device? */
24071 + type = read_pci_config_byte(num,slot,func,
24072 + PCI_HEADER_TYPE);
24073 + if (!(type & 0x80))
24074 + break;
24075 + }
24076 + }
24077 + }
24078 +}
24079 +
24080 +static int __init ioapic_pirq_setup(char *str)
24081 +{
24082 + int i, max;
24083 + int ints[MAX_PIRQS+1];
24084 +
24085 + get_options(str, ARRAY_SIZE(ints), ints);
24086 +
24087 + for (i = 0; i < MAX_PIRQS; i++)
24088 + pirq_entries[i] = -1;
24089 +
24090 + pirqs_enabled = 1;
24091 + apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
24092 + max = MAX_PIRQS;
24093 + if (ints[0] < MAX_PIRQS)
24094 + max = ints[0];
24095 +
24096 + for (i = 0; i < max; i++) {
24097 + apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
24098 + /*
24099 + * PIRQs are mapped upside down, usually.
24100 + */
24101 + pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
24102 + }
24103 + return 1;
24104 +}
24105 +
24106 +__setup("pirq=", ioapic_pirq_setup);
24107 +
24108 +/*
24109 + * Find the IRQ entry number of a certain pin.
24110 + */
24111 +static int find_irq_entry(int apic, int pin, int type)
24112 +{
24113 + int i;
24114 +
24115 + for (i = 0; i < mp_irq_entries; i++)
24116 + if (mp_irqs[i].mpc_irqtype == type &&
24117 + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
24118 + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
24119 + mp_irqs[i].mpc_dstirq == pin)
24120 + return i;
24121 +
24122 + return -1;
24123 +}
24124 +
24125 +#ifndef CONFIG_XEN
24126 +/*
24127 + * Find the pin to which IRQ[irq] (ISA) is connected
24128 + */
24129 +static int __init find_isa_irq_pin(int irq, int type)
24130 +{
24131 + int i;
24132 +
24133 + for (i = 0; i < mp_irq_entries; i++) {
24134 + int lbus = mp_irqs[i].mpc_srcbus;
24135 +
24136 + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
24137 + mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
24138 + mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
24139 + (mp_irqs[i].mpc_irqtype == type) &&
24140 + (mp_irqs[i].mpc_srcbusirq == irq))
24141 +
24142 + return mp_irqs[i].mpc_dstirq;
24143 + }
24144 + return -1;
24145 +}
24146 +
24147 +static int __init find_isa_irq_apic(int irq, int type)
24148 +{
24149 + int i;
24150 +
24151 + for (i = 0; i < mp_irq_entries; i++) {
24152 + int lbus = mp_irqs[i].mpc_srcbus;
24153 +
24154 + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
24155 + mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
24156 + mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
24157 + (mp_irqs[i].mpc_irqtype == type) &&
24158 + (mp_irqs[i].mpc_srcbusirq == irq))
24159 + break;
24160 + }
24161 + if (i < mp_irq_entries) {
24162 + int apic;
24163 + for(apic = 0; apic < nr_ioapics; apic++) {
24164 + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
24165 + return apic;
24166 + }
24167 + }
24168 +
24169 + return -1;
24170 +}
24171 +#endif
24172 +
24173 +/*
24174 + * Find a specific PCI IRQ entry.
24175 + * Not an __init, possibly needed by modules
24176 + */
24177 +static int pin_2_irq(int idx, int apic, int pin);
24178 +
24179 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
24180 +{
24181 + int apic, i, best_guess = -1;
24182 +
24183 + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
24184 + bus, slot, pin);
24185 + if (mp_bus_id_to_pci_bus[bus] == -1) {
24186 + apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
24187 + return -1;
24188 + }
24189 + for (i = 0; i < mp_irq_entries; i++) {
24190 + int lbus = mp_irqs[i].mpc_srcbus;
24191 +
24192 + for (apic = 0; apic < nr_ioapics; apic++)
24193 + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
24194 + mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
24195 + break;
24196 +
24197 + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
24198 + !mp_irqs[i].mpc_irqtype &&
24199 + (bus == lbus) &&
24200 + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
24201 + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
24202 +
24203 + if (!(apic || IO_APIC_IRQ(irq)))
24204 + continue;
24205 +
24206 + if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
24207 + return irq;
24208 + /*
24209 + * Use the first all-but-pin matching entry as a
24210 + * best-guess fuzzy result for broken mptables.
24211 + */
24212 + if (best_guess < 0)
24213 + best_guess = irq;
24214 + }
24215 + }
24216 + BUG_ON(best_guess >= NR_IRQS);
24217 + return best_guess;
24218 +}
24219 +
24220 +/*
24221 + * EISA Edge/Level control register, ELCR
24222 + */
24223 +static int EISA_ELCR(unsigned int irq)
24224 +{
24225 + if (irq < 16) {
24226 + unsigned int port = 0x4d0 + (irq >> 3);
24227 + return (inb(port) >> (irq & 7)) & 1;
24228 + }
24229 + apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
24230 + return 0;
24231 +}
24232 +
24233 +/* EISA interrupts are always polarity zero and can be edge or level
24234 + * trigger depending on the ELCR value. If an interrupt is listed as
24235 + * EISA conforming in the MP table, that means its trigger type must
24236 + * be read in from the ELCR */
24237 +
24238 +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
24239 +#define default_EISA_polarity(idx) (0)
24240 +
24241 +/* ISA interrupts are always polarity zero edge triggered,
24242 + * when listed as conforming in the MP table. */
24243 +
24244 +#define default_ISA_trigger(idx) (0)
24245 +#define default_ISA_polarity(idx) (0)
24246 +
24247 +/* PCI interrupts are always polarity one level triggered,
24248 + * when listed as conforming in the MP table. */
24249 +
24250 +#define default_PCI_trigger(idx) (1)
24251 +#define default_PCI_polarity(idx) (1)
24252 +
24253 +/* MCA interrupts are always polarity zero level triggered,
24254 + * when listed as conforming in the MP table. */
24255 +
24256 +#define default_MCA_trigger(idx) (1)
24257 +#define default_MCA_polarity(idx) (0)
24258 +
24259 +static int __init MPBIOS_polarity(int idx)
24260 +{
24261 + int bus = mp_irqs[idx].mpc_srcbus;
24262 + int polarity;
24263 +
24264 + /*
24265 + * Determine IRQ line polarity (high active or low active):
24266 + */
24267 + switch (mp_irqs[idx].mpc_irqflag & 3)
24268 + {
24269 + case 0: /* conforms, ie. bus-type dependent polarity */
24270 + {
24271 + switch (mp_bus_id_to_type[bus])
24272 + {
24273 + case MP_BUS_ISA: /* ISA pin */
24274 + {
24275 + polarity = default_ISA_polarity(idx);
24276 + break;
24277 + }
24278 + case MP_BUS_EISA: /* EISA pin */
24279 + {
24280 + polarity = default_EISA_polarity(idx);
24281 + break;
24282 + }
24283 + case MP_BUS_PCI: /* PCI pin */
24284 + {
24285 + polarity = default_PCI_polarity(idx);
24286 + break;
24287 + }
24288 + case MP_BUS_MCA: /* MCA pin */
24289 + {
24290 + polarity = default_MCA_polarity(idx);
24291 + break;
24292 + }
24293 + default:
24294 + {
24295 + printk(KERN_WARNING "broken BIOS!!\n");
24296 + polarity = 1;
24297 + break;
24298 + }
24299 + }
24300 + break;
24301 + }
24302 + case 1: /* high active */
24303 + {
24304 + polarity = 0;
24305 + break;
24306 + }
24307 + case 2: /* reserved */
24308 + {
24309 + printk(KERN_WARNING "broken BIOS!!\n");
24310 + polarity = 1;
24311 + break;
24312 + }
24313 + case 3: /* low active */
24314 + {
24315 + polarity = 1;
24316 + break;
24317 + }
24318 + default: /* invalid */
24319 + {
24320 + printk(KERN_WARNING "broken BIOS!!\n");
24321 + polarity = 1;
24322 + break;
24323 + }
24324 + }
24325 + return polarity;
24326 +}
24327 +
24328 +static int MPBIOS_trigger(int idx)
24329 +{
24330 + int bus = mp_irqs[idx].mpc_srcbus;
24331 + int trigger;
24332 +
24333 + /*
24334 + * Determine IRQ trigger mode (edge or level sensitive):
24335 + */
24336 + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
24337 + {
24338 + case 0: /* conforms, ie. bus-type dependent */
24339 + {
24340 + switch (mp_bus_id_to_type[bus])
24341 + {
24342 + case MP_BUS_ISA: /* ISA pin */
24343 + {
24344 + trigger = default_ISA_trigger(idx);
24345 + break;
24346 + }
24347 + case MP_BUS_EISA: /* EISA pin */
24348 + {
24349 + trigger = default_EISA_trigger(idx);
24350 + break;
24351 + }
24352 + case MP_BUS_PCI: /* PCI pin */
24353 + {
24354 + trigger = default_PCI_trigger(idx);
24355 + break;
24356 + }
24357 + case MP_BUS_MCA: /* MCA pin */
24358 + {
24359 + trigger = default_MCA_trigger(idx);
24360 + break;
24361 + }
24362 + default:
24363 + {
24364 + printk(KERN_WARNING "broken BIOS!!\n");
24365 + trigger = 1;
24366 + break;
24367 + }
24368 + }
24369 + break;
24370 + }
24371 + case 1: /* edge */
24372 + {
24373 + trigger = 0;
24374 + break;
24375 + }
24376 + case 2: /* reserved */
24377 + {
24378 + printk(KERN_WARNING "broken BIOS!!\n");
24379 + trigger = 1;
24380 + break;
24381 + }
24382 + case 3: /* level */
24383 + {
24384 + trigger = 1;
24385 + break;
24386 + }
24387 + default: /* invalid */
24388 + {
24389 + printk(KERN_WARNING "broken BIOS!!\n");
24390 + trigger = 0;
24391 + break;
24392 + }
24393 + }
24394 + return trigger;
24395 +}
24396 +
24397 +static inline int irq_polarity(int idx)
24398 +{
24399 + return MPBIOS_polarity(idx);
24400 +}
24401 +
24402 +static inline int irq_trigger(int idx)
24403 +{
24404 + return MPBIOS_trigger(idx);
24405 +}
24406 +
24407 +static int next_irq = 16;
24408 +
24409 +/*
24410 + * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ
24411 + * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
24412 + * from ACPI, which can reach 800 in large boxen.
24413 + *
24414 + * Compact the sparse GSI space into a sequential IRQ series and reuse
24415 + * vectors if possible.
24416 + */
24417 +int gsi_irq_sharing(int gsi)
24418 +{
24419 + int i, tries, vector;
24420 +
24421 + BUG_ON(gsi >= NR_IRQ_VECTORS);
24422 +
24423 + if (platform_legacy_irq(gsi))
24424 + return gsi;
24425 +
24426 + if (gsi_2_irq[gsi] != 0xFF)
24427 + return (int)gsi_2_irq[gsi];
24428 +
24429 + tries = NR_IRQS;
24430 + try_again:
24431 + vector = assign_irq_vector(gsi);
24432 +
24433 + /*
24434 + * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
24435 + * use of vector and if found, return that IRQ. However, we never want
24436 + * to share legacy IRQs, which usually have a different trigger mode
24437 + * than PCI.
24438 + */
24439 + for (i = 0; i < NR_IRQS; i++)
24440 + if (IO_APIC_VECTOR(i) == vector)
24441 + break;
24442 + if (platform_legacy_irq(i)) {
24443 + if (--tries >= 0) {
24444 + IO_APIC_VECTOR(i) = 0;
24445 + goto try_again;
24446 + }
24447 + panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
24448 + }
24449 + if (i < NR_IRQS) {
24450 + gsi_2_irq[gsi] = i;
24451 + printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
24452 + gsi, vector, i);
24453 + return i;
24454 + }
24455 +
24456 + i = next_irq++;
24457 + BUG_ON(i >= NR_IRQS);
24458 + gsi_2_irq[gsi] = i;
24459 + IO_APIC_VECTOR(i) = vector;
24460 + printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
24461 + gsi, vector, i);
24462 + return i;
24463 +}
24464 +
24465 +static int pin_2_irq(int idx, int apic, int pin)
24466 +{
24467 + int irq, i;
24468 + int bus = mp_irqs[idx].mpc_srcbus;
24469 +
24470 + /*
24471 + * Debugging check, we are in big trouble if this message pops up!
24472 + */
24473 + if (mp_irqs[idx].mpc_dstirq != pin)
24474 + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
24475 +
24476 + switch (mp_bus_id_to_type[bus])
24477 + {
24478 + case MP_BUS_ISA: /* ISA pin */
24479 + case MP_BUS_EISA:
24480 + case MP_BUS_MCA:
24481 + {
24482 + irq = mp_irqs[idx].mpc_srcbusirq;
24483 + break;
24484 + }
24485 + case MP_BUS_PCI: /* PCI pin */
24486 + {
24487 + /*
24488 + * PCI IRQs are mapped in order
24489 + */
24490 + i = irq = 0;
24491 + while (i < apic)
24492 + irq += nr_ioapic_registers[i++];
24493 + irq += pin;
24494 + irq = gsi_irq_sharing(irq);
24495 + break;
24496 + }
24497 + default:
24498 + {
24499 + printk(KERN_ERR "unknown bus type %d.\n",bus);
24500 + irq = 0;
24501 + break;
24502 + }
24503 + }
24504 + BUG_ON(irq >= NR_IRQS);
24505 +
24506 + /*
24507 + * PCI IRQ command line redirection. Yes, limits are hardcoded.
24508 + */
24509 + if ((pin >= 16) && (pin <= 23)) {
24510 + if (pirq_entries[pin-16] != -1) {
24511 + if (!pirq_entries[pin-16]) {
24512 + apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
24513 + } else {
24514 + irq = pirq_entries[pin-16];
24515 + apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
24516 + pin-16, irq);
24517 + }
24518 + }
24519 + }
24520 + BUG_ON(irq >= NR_IRQS);
24521 + return irq;
24522 +}
24523 +
24524 +static inline int IO_APIC_irq_trigger(int irq)
24525 +{
24526 + int apic, idx, pin;
24527 +
24528 + for (apic = 0; apic < nr_ioapics; apic++) {
24529 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
24530 + idx = find_irq_entry(apic,pin,mp_INT);
24531 + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
24532 + return irq_trigger(idx);
24533 + }
24534 + }
24535 + /*
24536 + * nonexistent IRQs are edge default
24537 + */
24538 + return 0;
24539 +}
24540 +
24541 +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
24542 +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
24543 +
24544 +int assign_irq_vector(int irq)
24545 +{
24546 + unsigned long flags;
24547 + int vector;
24548 + struct physdev_irq irq_op;
24549 +
24550 + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
24551 +
24552 + if (irq < PIRQ_BASE || irq - PIRQ_BASE > NR_PIRQS)
24553 + return -EINVAL;
24554 +
24555 + spin_lock_irqsave(&vector_lock, flags);
24556 +
24557 + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
24558 + spin_unlock_irqrestore(&vector_lock, flags);
24559 + return IO_APIC_VECTOR(irq);
24560 + }
24561 +
24562 + irq_op.irq = irq;
24563 + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
24564 + spin_unlock_irqrestore(&vector_lock, flags);
24565 + return -ENOSPC;
24566 + }
24567 +
24568 + vector = irq_op.vector;
24569 + vector_irq[vector] = irq;
24570 + if (irq != AUTO_ASSIGN)
24571 + IO_APIC_VECTOR(irq) = vector;
24572 +
24573 + spin_unlock_irqrestore(&vector_lock, flags);
24574 +
24575 + return vector;
24576 +}
24577 +
24578 +extern void (*interrupt[NR_IRQS])(void);
24579 +#ifndef CONFIG_XEN
24580 +static struct hw_interrupt_type ioapic_level_type;
24581 +static struct hw_interrupt_type ioapic_edge_type;
24582 +
24583 +#define IOAPIC_AUTO -1
24584 +#define IOAPIC_EDGE 0
24585 +#define IOAPIC_LEVEL 1
24586 +
24587 +static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
24588 +{
24589 + unsigned idx;
24590 +
24591 + idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
24592 +
24593 + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
24594 + trigger == IOAPIC_LEVEL)
24595 + irq_desc[idx].chip = &ioapic_level_type;
24596 + else
24597 + irq_desc[idx].chip = &ioapic_edge_type;
24598 + set_intr_gate(vector, interrupt[idx]);
24599 +}
24600 +#else
24601 +#define ioapic_register_intr(irq, vector, trigger) evtchn_register_pirq(irq)
24602 +#endif /* !CONFIG_XEN */
24603 +
24604 +static void __init setup_IO_APIC_irqs(void)
24605 +{
24606 + struct IO_APIC_route_entry entry;
24607 + int apic, pin, idx, irq, first_notcon = 1, vector;
24608 + unsigned long flags;
24609 +
24610 + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
24611 +
24612 + for (apic = 0; apic < nr_ioapics; apic++) {
24613 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
24614 +
24615 + /*
24616 + * add it to the IO-APIC irq-routing table:
24617 + */
24618 + memset(&entry,0,sizeof(entry));
24619 +
24620 + entry.delivery_mode = INT_DELIVERY_MODE;
24621 + entry.dest_mode = INT_DEST_MODE;
24622 + entry.mask = 0; /* enable IRQ */
24623 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
24624 +
24625 + idx = find_irq_entry(apic,pin,mp_INT);
24626 + if (idx == -1) {
24627 + if (first_notcon) {
24628 + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
24629 + first_notcon = 0;
24630 + } else
24631 + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
24632 + continue;
24633 + }
24634 +
24635 + entry.trigger = irq_trigger(idx);
24636 + entry.polarity = irq_polarity(idx);
24637 +
24638 + if (irq_trigger(idx)) {
24639 + entry.trigger = 1;
24640 + entry.mask = 1;
24641 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
24642 + }
24643 +
24644 + irq = pin_2_irq(idx, apic, pin);
24645 + add_pin_to_irq(irq, apic, pin);
24646 +
24647 + if (/* !apic && */ !IO_APIC_IRQ(irq))
24648 + continue;
24649 +
24650 + if (IO_APIC_IRQ(irq)) {
24651 + vector = assign_irq_vector(irq);
24652 + entry.vector = vector;
24653 +
24654 + ioapic_register_intr(irq, vector, IOAPIC_AUTO);
24655 + if (!apic && (irq < 16))
24656 + disable_8259A_irq(irq);
24657 + }
24658 + spin_lock_irqsave(&ioapic_lock, flags);
24659 + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
24660 + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
24661 + set_native_irq_info(irq, TARGET_CPUS);
24662 + spin_unlock_irqrestore(&ioapic_lock, flags);
24663 + }
24664 + }
24665 +
24666 + if (!first_notcon)
24667 + apic_printk(APIC_VERBOSE," not connected.\n");
24668 +}
24669 +
24670 +#ifndef CONFIG_XEN
24671 +/*
24672 + * Set up the 8259A-master output pin as broadcast to all
24673 + * CPUs.
24674 + */
24675 +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
24676 +{
24677 + struct IO_APIC_route_entry entry;
24678 + unsigned long flags;
24679 +
24680 + memset(&entry,0,sizeof(entry));
24681 +
24682 + disable_8259A_irq(0);
24683 +
24684 + /* mask LVT0 */
24685 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
24686 +
24687 + /*
24688 + * We use logical delivery to get the timer IRQ
24689 + * to the first CPU.
24690 + */
24691 + entry.dest_mode = INT_DEST_MODE;
24692 + entry.mask = 0; /* unmask IRQ now */
24693 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
24694 + entry.delivery_mode = INT_DELIVERY_MODE;
24695 + entry.polarity = 0;
24696 + entry.trigger = 0;
24697 + entry.vector = vector;
24698 +
24699 + /*
24700 + * The timer IRQ doesn't have to know that behind the
24701 + * scene we have a 8259A-master in AEOI mode ...
24702 + */
24703 + irq_desc[0].chip = &ioapic_edge_type;
24704 +
24705 + /*
24706 + * Add it to the IO-APIC irq-routing table:
24707 + */
24708 + spin_lock_irqsave(&ioapic_lock, flags);
24709 + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
24710 + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
24711 + spin_unlock_irqrestore(&ioapic_lock, flags);
24712 +
24713 + enable_8259A_irq(0);
24714 +}
24715 +
24716 +void __init UNEXPECTED_IO_APIC(void)
24717 +{
24718 +}
24719 +
24720 +void __apicdebuginit print_IO_APIC(void)
24721 +{
24722 + int apic, i;
24723 + union IO_APIC_reg_00 reg_00;
24724 + union IO_APIC_reg_01 reg_01;
24725 + union IO_APIC_reg_02 reg_02;
24726 + unsigned long flags;
24727 +
24728 + if (apic_verbosity == APIC_QUIET)
24729 + return;
24730 +
24731 + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
24732 + for (i = 0; i < nr_ioapics; i++)
24733 + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
24734 + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
24735 +
24736 + /*
24737 + * We are a bit conservative about what we expect. We have to
24738 + * know about every hardware change ASAP.
24739 + */
24740 + printk(KERN_INFO "testing the IO APIC.......................\n");
24741 +
24742 + for (apic = 0; apic < nr_ioapics; apic++) {
24743 +
24744 + spin_lock_irqsave(&ioapic_lock, flags);
24745 + reg_00.raw = io_apic_read(apic, 0);
24746 + reg_01.raw = io_apic_read(apic, 1);
24747 + if (reg_01.bits.version >= 0x10)
24748 + reg_02.raw = io_apic_read(apic, 2);
24749 + spin_unlock_irqrestore(&ioapic_lock, flags);
24750 +
24751 + printk("\n");
24752 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
24753 + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
24754 + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
24755 + if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
24756 + UNEXPECTED_IO_APIC();
24757 +
24758 + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
24759 + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
24760 + if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
24761 + (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
24762 + (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
24763 + (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
24764 + (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
24765 + (reg_01.bits.entries != 0x2E) &&
24766 + (reg_01.bits.entries != 0x3F) &&
24767 + (reg_01.bits.entries != 0x03)
24768 + )
24769 + UNEXPECTED_IO_APIC();
24770 +
24771 + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
24772 + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
24773 + if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
24774 + (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
24775 + (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
24776 + (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
24777 + (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
24778 + (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
24779 + )
24780 + UNEXPECTED_IO_APIC();
24781 + if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
24782 + UNEXPECTED_IO_APIC();
24783 +
24784 + if (reg_01.bits.version >= 0x10) {
24785 + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
24786 + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
24787 + if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
24788 + UNEXPECTED_IO_APIC();
24789 + }
24790 +
24791 + printk(KERN_DEBUG ".... IRQ redirection table:\n");
24792 +
24793 + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
24794 + " Stat Dest Deli Vect: \n");
24795 +
24796 + for (i = 0; i <= reg_01.bits.entries; i++) {
24797 + struct IO_APIC_route_entry entry;
24798 +
24799 + spin_lock_irqsave(&ioapic_lock, flags);
24800 + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
24801 + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
24802 + spin_unlock_irqrestore(&ioapic_lock, flags);
24803 +
24804 + printk(KERN_DEBUG " %02x %03X %02X ",
24805 + i,
24806 + entry.dest.logical.logical_dest,
24807 + entry.dest.physical.physical_dest
24808 + );
24809 +
24810 + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
24811 + entry.mask,
24812 + entry.trigger,
24813 + entry.irr,
24814 + entry.polarity,
24815 + entry.delivery_status,
24816 + entry.dest_mode,
24817 + entry.delivery_mode,
24818 + entry.vector
24819 + );
24820 + }
24821 + }
24822 + if (use_pci_vector())
24823 + printk(KERN_INFO "Using vector-based indexing\n");
24824 + printk(KERN_DEBUG "IRQ to pin mappings:\n");
24825 + for (i = 0; i < NR_IRQS; i++) {
24826 + struct irq_pin_list *entry = irq_2_pin + i;
24827 + if (entry->pin < 0)
24828 + continue;
24829 + if (use_pci_vector() && !platform_legacy_irq(i))
24830 + printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
24831 + else
24832 + printk(KERN_DEBUG "IRQ%d ", i);
24833 + for (;;) {
24834 + printk("-> %d:%d", entry->apic, entry->pin);
24835 + if (!entry->next)
24836 + break;
24837 + entry = irq_2_pin + entry->next;
24838 + }
24839 + printk("\n");
24840 + }
24841 +
24842 + printk(KERN_INFO ".................................... done.\n");
24843 +
24844 + return;
24845 +}
24846 +
24847 +static __apicdebuginit void print_APIC_bitfield (int base)
24848 +{
24849 + unsigned int v;
24850 + int i, j;
24851 +
24852 + if (apic_verbosity == APIC_QUIET)
24853 + return;
24854 +
24855 + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
24856 + for (i = 0; i < 8; i++) {
24857 + v = apic_read(base + i*0x10);
24858 + for (j = 0; j < 32; j++) {
24859 + if (v & (1<<j))
24860 + printk("1");
24861 + else
24862 + printk("0");
24863 + }
24864 + printk("\n");
24865 + }
24866 +}
24867 +
24868 +void __apicdebuginit print_local_APIC(void * dummy)
24869 +{
24870 + unsigned int v, ver, maxlvt;
24871 +
24872 + if (apic_verbosity == APIC_QUIET)
24873 + return;
24874 +
24875 + printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
24876 + smp_processor_id(), hard_smp_processor_id());
24877 + v = apic_read(APIC_ID);
24878 + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
24879 + v = apic_read(APIC_LVR);
24880 + printk(KERN_INFO "... APIC VERSION: %08x\n", v);
24881 + ver = GET_APIC_VERSION(v);
24882 + maxlvt = get_maxlvt();
24883 +
24884 + v = apic_read(APIC_TASKPRI);
24885 + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
24886 +
24887 + v = apic_read(APIC_ARBPRI);
24888 + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
24889 + v & APIC_ARBPRI_MASK);
24890 + v = apic_read(APIC_PROCPRI);
24891 + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
24892 +
24893 + v = apic_read(APIC_EOI);
24894 + printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
24895 + v = apic_read(APIC_RRR);
24896 + printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
24897 + v = apic_read(APIC_LDR);
24898 + printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
24899 + v = apic_read(APIC_DFR);
24900 + printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
24901 + v = apic_read(APIC_SPIV);
24902 + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
24903 +
24904 + printk(KERN_DEBUG "... APIC ISR field:\n");
24905 + print_APIC_bitfield(APIC_ISR);
24906 + printk(KERN_DEBUG "... APIC TMR field:\n");
24907 + print_APIC_bitfield(APIC_TMR);
24908 + printk(KERN_DEBUG "... APIC IRR field:\n");
24909 + print_APIC_bitfield(APIC_IRR);
24910 +
24911 + v = apic_read(APIC_ESR);
24912 + printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
24913 +
24914 + v = apic_read(APIC_ICR);
24915 + printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
24916 + v = apic_read(APIC_ICR2);
24917 + printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
24918 +
24919 + v = apic_read(APIC_LVTT);
24920 + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
24921 +
24922 + if (maxlvt > 3) { /* PC is LVT#4. */
24923 + v = apic_read(APIC_LVTPC);
24924 + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
24925 + }
24926 + v = apic_read(APIC_LVT0);
24927 + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
24928 + v = apic_read(APIC_LVT1);
24929 + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
24930 +
24931 + if (maxlvt > 2) { /* ERR is LVT#3. */
24932 + v = apic_read(APIC_LVTERR);
24933 + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
24934 + }
24935 +
24936 + v = apic_read(APIC_TMICT);
24937 + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
24938 + v = apic_read(APIC_TMCCT);
24939 + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
24940 + v = apic_read(APIC_TDCR);
24941 + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
24942 + printk("\n");
24943 +}
24944 +
24945 +void print_all_local_APICs (void)
24946 +{
24947 + on_each_cpu(print_local_APIC, NULL, 1, 1);
24948 +}
24949 +
24950 +void __apicdebuginit print_PIC(void)
24951 +{
24952 + unsigned int v;
24953 + unsigned long flags;
24954 +
24955 + if (apic_verbosity == APIC_QUIET)
24956 + return;
24957 +
24958 + printk(KERN_DEBUG "\nprinting PIC contents\n");
24959 +
24960 + spin_lock_irqsave(&i8259A_lock, flags);
24961 +
24962 + v = inb(0xa1) << 8 | inb(0x21);
24963 + printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
24964 +
24965 + v = inb(0xa0) << 8 | inb(0x20);
24966 + printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
24967 +
24968 + outb(0x0b,0xa0);
24969 + outb(0x0b,0x20);
24970 + v = inb(0xa0) << 8 | inb(0x20);
24971 + outb(0x0a,0xa0);
24972 + outb(0x0a,0x20);
24973 +
24974 + spin_unlock_irqrestore(&i8259A_lock, flags);
24975 +
24976 + printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
24977 +
24978 + v = inb(0x4d1) << 8 | inb(0x4d0);
24979 + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
24980 +}
24981 +#endif /* !CONFIG_XEN */
24982 +
24983 +static void __init enable_IO_APIC(void)
24984 +{
24985 + union IO_APIC_reg_01 reg_01;
24986 +#ifndef CONFIG_XEN
24987 + int i8259_apic, i8259_pin;
24988 +#endif
24989 + int i, apic;
24990 + unsigned long flags;
24991 +
24992 + for (i = 0; i < PIN_MAP_SIZE; i++) {
24993 + irq_2_pin[i].pin = -1;
24994 + irq_2_pin[i].next = 0;
24995 + }
24996 + if (!pirqs_enabled)
24997 + for (i = 0; i < MAX_PIRQS; i++)
24998 + pirq_entries[i] = -1;
24999 +
25000 + /*
25001 + * The number of IO-APIC IRQ registers (== #pins):
25002 + */
25003 + for (apic = 0; apic < nr_ioapics; apic++) {
25004 + spin_lock_irqsave(&ioapic_lock, flags);
25005 + reg_01.raw = io_apic_read(apic, 1);
25006 + spin_unlock_irqrestore(&ioapic_lock, flags);
25007 + nr_ioapic_registers[apic] = reg_01.bits.entries+1;
25008 + }
25009 +#ifndef CONFIG_XEN
25010 + for(apic = 0; apic < nr_ioapics; apic++) {
25011 + int pin;
25012 + /* See if any of the pins is in ExtINT mode */
25013 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
25014 + struct IO_APIC_route_entry entry;
25015 + spin_lock_irqsave(&ioapic_lock, flags);
25016 + *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
25017 + *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
25018 + spin_unlock_irqrestore(&ioapic_lock, flags);
25019 +
25020 +
25021 + /* If the interrupt line is enabled and in ExtInt mode
25022 + * I have found the pin where the i8259 is connected.
25023 + */
25024 + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
25025 + ioapic_i8259.apic = apic;
25026 + ioapic_i8259.pin = pin;
25027 + goto found_i8259;
25028 + }
25029 + }
25030 + }
25031 + found_i8259:
25032 + /* Look to see what if the MP table has reported the ExtINT */
25033 + i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
25034 + i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
25035 + /* Trust the MP table if nothing is setup in the hardware */
25036 + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
25037 + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
25038 + ioapic_i8259.pin = i8259_pin;
25039 + ioapic_i8259.apic = i8259_apic;
25040 + }
25041 + /* Complain if the MP table and the hardware disagree */
25042 + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
25043 + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
25044 + {
25045 + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
25046 + }
25047 +#endif
25048 +
25049 + /*
25050 + * Do not trust the IO-APIC being empty at bootup
25051 + */
25052 + clear_IO_APIC();
25053 +}
25054 +
25055 +/*
25056 + * Not an __init, needed by the reboot code
25057 + */
25058 +void disable_IO_APIC(void)
25059 +{
25060 + /*
25061 + * Clear the IO-APIC before rebooting:
25062 + */
25063 + clear_IO_APIC();
25064 +
25065 +#ifndef CONFIG_XEN
25066 + /*
25067 + * If the i8259 is routed through an IOAPIC
25068 + * Put that IOAPIC in virtual wire mode
25069 + * so legacy interrupts can be delivered.
25070 + */
25071 + if (ioapic_i8259.pin != -1) {
25072 + struct IO_APIC_route_entry entry;
25073 + unsigned long flags;
25074 +
25075 + memset(&entry, 0, sizeof(entry));
25076 + entry.mask = 0; /* Enabled */
25077 + entry.trigger = 0; /* Edge */
25078 + entry.irr = 0;
25079 + entry.polarity = 0; /* High */
25080 + entry.delivery_status = 0;
25081 + entry.dest_mode = 0; /* Physical */
25082 + entry.delivery_mode = dest_ExtINT; /* ExtInt */
25083 + entry.vector = 0;
25084 + entry.dest.physical.physical_dest =
25085 + GET_APIC_ID(apic_read(APIC_ID));
25086 +
25087 + /*
25088 + * Add it to the IO-APIC irq-routing table:
25089 + */
25090 + spin_lock_irqsave(&ioapic_lock, flags);
25091 + io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
25092 + *(((int *)&entry)+1));
25093 + io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
25094 + *(((int *)&entry)+0));
25095 + spin_unlock_irqrestore(&ioapic_lock, flags);
25096 + }
25097 +
25098 + disconnect_bsp_APIC(ioapic_i8259.pin != -1);
25099 +#endif
25100 +}
25101 +
25102 +/*
25103 + * function to set the IO-APIC physical IDs based on the
25104 + * values stored in the MPC table.
25105 + *
25106 + * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
25107 + */
25108 +
25109 +#ifndef CONFIG_XEN
25110 +static void __init setup_ioapic_ids_from_mpc (void)
25111 +{
25112 + union IO_APIC_reg_00 reg_00;
25113 + int apic;
25114 + int i;
25115 + unsigned char old_id;
25116 + unsigned long flags;
25117 +
25118 + /*
25119 + * Set the IOAPIC ID to the value stored in the MPC table.
25120 + */
25121 + for (apic = 0; apic < nr_ioapics; apic++) {
25122 +
25123 + /* Read the register 0 value */
25124 + spin_lock_irqsave(&ioapic_lock, flags);
25125 + reg_00.raw = io_apic_read(apic, 0);
25126 + spin_unlock_irqrestore(&ioapic_lock, flags);
25127 +
25128 + old_id = mp_ioapics[apic].mpc_apicid;
25129 +
25130 +
25131 + printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
25132 +
25133 +
25134 + /*
25135 + * We need to adjust the IRQ routing table
25136 + * if the ID changed.
25137 + */
25138 + if (old_id != mp_ioapics[apic].mpc_apicid)
25139 + for (i = 0; i < mp_irq_entries; i++)
25140 + if (mp_irqs[i].mpc_dstapic == old_id)
25141 + mp_irqs[i].mpc_dstapic
25142 + = mp_ioapics[apic].mpc_apicid;
25143 +
25144 + /*
25145 + * Read the right value from the MPC table and
25146 + * write it into the ID register.
25147 + */
25148 + apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
25149 + mp_ioapics[apic].mpc_apicid);
25150 +
25151 + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
25152 + spin_lock_irqsave(&ioapic_lock, flags);
25153 + io_apic_write(apic, 0, reg_00.raw);
25154 + spin_unlock_irqrestore(&ioapic_lock, flags);
25155 +
25156 + /*
25157 + * Sanity check
25158 + */
25159 + spin_lock_irqsave(&ioapic_lock, flags);
25160 + reg_00.raw = io_apic_read(apic, 0);
25161 + spin_unlock_irqrestore(&ioapic_lock, flags);
25162 + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
25163 + printk("could not set ID!\n");
25164 + else
25165 + apic_printk(APIC_VERBOSE," ok.\n");
25166 + }
25167 +}
25168 +#else
25169 +static void __init setup_ioapic_ids_from_mpc(void) { }
25170 +#endif
25171 +
25172 +/*
25173 + * There is a nasty bug in some older SMP boards, their mptable lies
25174 + * about the timer IRQ. We do the following to work around the situation:
25175 + *
25176 + * - timer IRQ defaults to IO-APIC IRQ
25177 + * - if this function detects that timer IRQs are defunct, then we fall
25178 + * back to ISA timer IRQs
25179 + */
25180 +#ifndef CONFIG_XEN
25181 +static int __init timer_irq_works(void)
25182 +{
25183 + unsigned long t1 = jiffies;
25184 +
25185 + local_irq_enable();
25186 + /* Let ten ticks pass... */
25187 + mdelay((10 * 1000) / HZ);
25188 +
25189 + /*
25190 + * Expect a few ticks at least, to be sure some possible
25191 + * glue logic does not lock up after the first one or two
25192 + * ticks in a non-ExtINT mode. Also the local APIC
25193 + * might have cached one ExtINT interrupt. Finally, at
25194 + * least one tick may be lost due to delays.
25195 + */
25196 +
25197 + /* jiffies wrap? */
25198 + if (jiffies - t1 > 4)
25199 + return 1;
25200 + return 0;
25201 +}
25202 +
25203 +/*
25204 + * In the SMP+IOAPIC case it might happen that there is an unspecified
25205 + * number of pending IRQ events left unhandled. These cases are very rare,
25206 + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
25207 + * better to do it this way as thus we do not have to be aware of
25208 + * 'pending' interrupts in the IRQ path, except at this point.
25209 + */
25210 +/*
25211 + * Edge-triggered interrupts need to resend any interrupt
25212 + * that was delayed, but this is now handled in the device-
25213 + * independent code.
25214 + */
25215 +
25216 +/*
25217 + * Starting up an edge-triggered IO-APIC interrupt is
25218 + * nasty - we need to make sure that we get the edge.
25219 + * If it is already asserted for some reason, we need
25220 + * to return 1 to indicate that it was pending.
25221 + *
25222 + * This is not complete - we should be able to fake
25223 + * an edge even if it isn't on the 8259A...
25224 + */
25225 +
25226 +static unsigned int startup_edge_ioapic_irq(unsigned int irq)
25227 +{
25228 + int was_pending = 0;
25229 + unsigned long flags;
25230 +
25231 + spin_lock_irqsave(&ioapic_lock, flags);
25232 + if (irq < 16) {
25233 + disable_8259A_irq(irq);
25234 + if (i8259A_irq_pending(irq))
25235 + was_pending = 1;
25236 + }
25237 + __unmask_IO_APIC_irq(irq);
25238 + spin_unlock_irqrestore(&ioapic_lock, flags);
25239 +
25240 + return was_pending;
25241 +}
25242 +
25243 +/*
25244 + * Once we have recorded IRQ_PENDING already, we can mask the
25245 + * interrupt for real. This prevents IRQ storms from unhandled
25246 + * devices.
25247 + */
25248 +static void ack_edge_ioapic_irq(unsigned int irq)
25249 +{
25250 + move_irq(irq);
25251 + if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
25252 + == (IRQ_PENDING | IRQ_DISABLED))
25253 + mask_IO_APIC_irq(irq);
25254 + ack_APIC_irq();
25255 +}
25256 +
25257 +/*
25258 + * Level triggered interrupts can just be masked,
25259 + * and shutting down and starting up the interrupt
25260 + * is the same as enabling and disabling them -- except
25261 + * with a startup need to return a "was pending" value.
25262 + *
25263 + * Level triggered interrupts are special because we
25264 + * do not touch any IO-APIC register while handling
25265 + * them. We ack the APIC in the end-IRQ handler, not
25266 + * in the start-IRQ-handler. Protection against reentrance
25267 + * from the same interrupt is still provided, both by the
25268 + * generic IRQ layer and by the fact that an unacked local
25269 + * APIC does not accept IRQs.
25270 + */
25271 +static unsigned int startup_level_ioapic_irq (unsigned int irq)
25272 +{
25273 + unmask_IO_APIC_irq(irq);
25274 +
25275 + return 0; /* don't check for pending */
25276 +}
25277 +
25278 +static void end_level_ioapic_irq (unsigned int irq)
25279 +{
25280 + move_irq(irq);
25281 + ack_APIC_irq();
25282 +}
25283 +
25284 +#ifdef CONFIG_PCI_MSI
25285 +static unsigned int startup_edge_ioapic_vector(unsigned int vector)
25286 +{
25287 + int irq = vector_to_irq(vector);
25288 +
25289 + return startup_edge_ioapic_irq(irq);
25290 +}
25291 +
25292 +static void ack_edge_ioapic_vector(unsigned int vector)
25293 +{
25294 + int irq = vector_to_irq(vector);
25295 +
25296 + move_native_irq(vector);
25297 + ack_edge_ioapic_irq(irq);
25298 +}
25299 +
25300 +static unsigned int startup_level_ioapic_vector (unsigned int vector)
25301 +{
25302 + int irq = vector_to_irq(vector);
25303 +
25304 + return startup_level_ioapic_irq (irq);
25305 +}
25306 +
25307 +static void end_level_ioapic_vector (unsigned int vector)
25308 +{
25309 + int irq = vector_to_irq(vector);
25310 +
25311 + move_native_irq(vector);
25312 + end_level_ioapic_irq(irq);
25313 +}
25314 +
25315 +static void mask_IO_APIC_vector (unsigned int vector)
25316 +{
25317 + int irq = vector_to_irq(vector);
25318 +
25319 + mask_IO_APIC_irq(irq);
25320 +}
25321 +
25322 +static void unmask_IO_APIC_vector (unsigned int vector)
25323 +{
25324 + int irq = vector_to_irq(vector);
25325 +
25326 + unmask_IO_APIC_irq(irq);
25327 +}
25328 +
25329 +#ifdef CONFIG_SMP
25330 +static void set_ioapic_affinity_vector (unsigned int vector,
25331 + cpumask_t cpu_mask)
25332 +{
25333 + int irq = vector_to_irq(vector);
25334 +
25335 + set_native_irq_info(vector, cpu_mask);
25336 + set_ioapic_affinity_irq(irq, cpu_mask);
25337 +}
25338 +#endif // CONFIG_SMP
25339 +#endif // CONFIG_PCI_MSI
25340 +
25341 +static int ioapic_retrigger(unsigned int irq)
25342 +{
25343 + send_IPI_self(IO_APIC_VECTOR(irq));
25344 +
25345 + return 1;
25346 +}
25347 +
25348 +/*
25349 + * Level and edge triggered IO-APIC interrupts need different handling,
25350 + * so we use two separate IRQ descriptors. Edge triggered IRQs can be
25351 + * handled with the level-triggered descriptor, but that one has slightly
25352 + * more overhead. Level-triggered interrupts cannot be handled with the
25353 + * edge-triggered handler, without risking IRQ storms and other ugly
25354 + * races.
25355 + */
25356 +
25357 +static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
25358 + .typename = "IO-APIC-edge",
25359 + .startup = startup_edge_ioapic,
25360 + .shutdown = shutdown_edge_ioapic,
25361 + .enable = enable_edge_ioapic,
25362 + .disable = disable_edge_ioapic,
25363 + .ack = ack_edge_ioapic,
25364 + .end = end_edge_ioapic,
25365 +#ifdef CONFIG_SMP
25366 + .set_affinity = set_ioapic_affinity,
25367 +#endif
25368 + .retrigger = ioapic_retrigger,
25369 +};
25370 +
25371 +static struct hw_interrupt_type ioapic_level_type __read_mostly = {
25372 + .typename = "IO-APIC-level",
25373 + .startup = startup_level_ioapic,
25374 + .shutdown = shutdown_level_ioapic,
25375 + .enable = enable_level_ioapic,
25376 + .disable = disable_level_ioapic,
25377 + .ack = mask_and_ack_level_ioapic,
25378 + .end = end_level_ioapic,
25379 +#ifdef CONFIG_SMP
25380 + .set_affinity = set_ioapic_affinity,
25381 +#endif
25382 + .retrigger = ioapic_retrigger,
25383 +};
25384 +#endif /* !CONFIG_XEN */
25385 +
25386 +static inline void init_IO_APIC_traps(void)
25387 +{
25388 + int irq;
25389 +
25390 + /*
25391 + * NOTE! The local APIC isn't very good at handling
25392 + * multiple interrupts at the same interrupt level.
25393 + * As the interrupt level is determined by taking the
25394 + * vector number and shifting that right by 4, we
25395 + * want to spread these out a bit so that they don't
25396 + * all fall in the same interrupt level.
25397 + *
25398 + * Also, we've got to be careful not to trash gate
25399 + * 0x80, because int 0x80 is hm, kind of importantish. ;)
25400 + */
25401 + for (irq = 0; irq < NR_IRQS ; irq++) {
25402 + int tmp = irq;
25403 + if (use_pci_vector()) {
25404 + if (!platform_legacy_irq(tmp))
25405 + if ((tmp = vector_to_irq(tmp)) == -1)
25406 + continue;
25407 + }
25408 + if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
25409 + /*
25410 + * Hmm.. We don't have an entry for this,
25411 + * so default to an old-fashioned 8259
25412 + * interrupt if we can..
25413 + */
25414 + if (irq < 16)
25415 + make_8259A_irq(irq);
25416 +#ifndef CONFIG_XEN
25417 + else
25418 + /* Strange. Oh, well.. */
25419 + irq_desc[irq].chip = &no_irq_type;
25420 +#endif
25421 + }
25422 + }
25423 +}
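Editor's note: the comment above about spreading vectors refers to the local APIC priority scheme, where the priority class of an interrupt is simply the upper nibble of its vector (the "shift right by 4" the comment mentions). A minimal stand-alone sketch of that mapping, illustrative only and not part of the patch:

#include <stdio.h>

int main(void)
{
        unsigned int vectors[] = { 0x31, 0x3e, 0x41 };
        int i;

        for (i = 0; i < 3; i++)
                printf("vector 0x%02X -> priority level %u\n",
                       vectors[i], vectors[i] >> 4);    /* prints 3, 3, 4 */
        return 0;
}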
25424 +
25425 +#ifndef CONFIG_XEN
25426 +static void enable_lapic_irq (unsigned int irq)
25427 +{
25428 + unsigned long v;
25429 +
25430 + v = apic_read(APIC_LVT0);
25431 + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
25432 +}
25433 +
25434 +static void disable_lapic_irq (unsigned int irq)
25435 +{
25436 + unsigned long v;
25437 +
25438 + v = apic_read(APIC_LVT0);
25439 + apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
25440 +}
25441 +
25442 +static void ack_lapic_irq (unsigned int irq)
25443 +{
25444 + ack_APIC_irq();
25445 +}
25446 +
25447 +static void end_lapic_irq (unsigned int i) { /* nothing */ }
25448 +
25449 +static struct hw_interrupt_type lapic_irq_type __read_mostly = {
25450 + .typename = "local-APIC-edge",
25451 + .startup = NULL, /* startup_irq() not used for IRQ0 */
25452 + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
25453 + .enable = enable_lapic_irq,
25454 + .disable = disable_lapic_irq,
25455 + .ack = ack_lapic_irq,
25456 + .end = end_lapic_irq,
25457 +};
25458 +
25459 +static void setup_nmi (void)
25460 +{
25461 + /*
25462 + * Dirty trick to enable the NMI watchdog ...
25463 + * We put the 8259A master into AEOI mode and
25464 + * unmask on all local APICs LVT0 as NMI.
25465 + *
25466 + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
25467 + * is from Maciej W. Rozycki - so we do not have to EOI from
25468 + * the NMI handler or the timer interrupt.
25469 + */
25470 + printk(KERN_INFO "activating NMI Watchdog ...");
25471 +
25472 + enable_NMI_through_LVT0(NULL);
25473 +
25474 + printk(" done.\n");
25475 +}
25476 +
25477 +/*
25478 + * This looks a bit hackish but it's about the only way of sending
25479 + * a few INTA cycles to 8259As and any associated glue logic. ICR does
25480 + * not support the ExtINT mode, unfortunately. We need to send these
25481 + * cycles as some i82489DX-based boards have glue logic that keeps the
25482 + * 8259A interrupt line asserted until INTA. --macro
25483 + */
25484 +static inline void unlock_ExtINT_logic(void)
25485 +{
25486 + int apic, pin, i;
25487 + struct IO_APIC_route_entry entry0, entry1;
25488 + unsigned char save_control, save_freq_select;
25489 + unsigned long flags;
25490 +
25491 + pin = find_isa_irq_pin(8, mp_INT);
25492 + apic = find_isa_irq_apic(8, mp_INT);
25493 + if (pin == -1)
25494 + return;
25495 +
25496 + spin_lock_irqsave(&ioapic_lock, flags);
25497 + *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
25498 + *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
25499 + spin_unlock_irqrestore(&ioapic_lock, flags);
25500 + clear_IO_APIC_pin(apic, pin);
25501 +
25502 + memset(&entry1, 0, sizeof(entry1));
25503 +
25504 + entry1.dest_mode = 0; /* physical delivery */
25505 + entry1.mask = 0; /* unmask IRQ now */
25506 + entry1.dest.physical.physical_dest = hard_smp_processor_id();
25507 + entry1.delivery_mode = dest_ExtINT;
25508 + entry1.polarity = entry0.polarity;
25509 + entry1.trigger = 0;
25510 + entry1.vector = 0;
25511 +
25512 + spin_lock_irqsave(&ioapic_lock, flags);
25513 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
25514 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
25515 + spin_unlock_irqrestore(&ioapic_lock, flags);
25516 +
25517 + save_control = CMOS_READ(RTC_CONTROL);
25518 + save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
25519 + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
25520 + RTC_FREQ_SELECT);
25521 + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
25522 +
25523 + i = 100;
25524 + while (i-- > 0) {
25525 + mdelay(10);
25526 + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
25527 + i -= 10;
25528 + }
25529 +
25530 + CMOS_WRITE(save_control, RTC_CONTROL);
25531 + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
25532 + clear_IO_APIC_pin(apic, pin);
25533 +
25534 + spin_lock_irqsave(&ioapic_lock, flags);
25535 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
25536 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
25537 + spin_unlock_irqrestore(&ioapic_lock, flags);
25538 +}
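Editor's note: a small illustrative aside, not part of the patch. The CMOS_WRITE above selects RTC periodic rate 6, which by the standard MC146818 rate formula gives 32768 >> (6 - 1) = 1024 Hz, so the 10 ms polling loop is essentially guaranteed to see RTC_PF set several times while the temporary ExtINT entry is installed:

#include <stdio.h>

int main(void)
{
        int rate = 6;                           /* rate-select value written above */
        int hz = 32768 >> (rate - 1);           /* MC146818 periodic-rate formula */

        printf("RTC periodic interrupt rate: %d Hz\n", hz);     /* 1024 */
        return 0;
}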
25539 +
25540 +int timer_uses_ioapic_pin_0;
25541 +
25542 +/*
25543 + * This code may look a bit paranoid, but it's supposed to cooperate with
25544 + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
25545 + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
25546 + * fanatically on his truly buggy board.
25547 + *
25548 + * FIXME: really need to revamp this for modern platforms only.
25549 + */
25550 +static inline void check_timer(void)
25551 +{
25552 + int apic1, pin1, apic2, pin2;
25553 + int vector;
25554 +
25555 + /*
25556 + * get/set the timer IRQ vector:
25557 + */
25558 + disable_8259A_irq(0);
25559 + vector = assign_irq_vector(0);
25560 + set_intr_gate(vector, interrupt[0]);
25561 +
25562 + /*
25563 + * Subtle, code in do_timer_interrupt() expects an AEOI
25564 + * mode for the 8259A whenever interrupts are routed
25565 + * through I/O APICs. Also IRQ0 has to be enabled in
25566 + * the 8259A which implies the virtual wire has to be
25567 + * disabled in the local APIC.
25568 + */
25569 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
25570 + init_8259A(1);
25571 + if (timer_over_8254 > 0)
25572 + enable_8259A_irq(0);
25573 +
25574 + pin1 = find_isa_irq_pin(0, mp_INT);
25575 + apic1 = find_isa_irq_apic(0, mp_INT);
25576 + pin2 = ioapic_i8259.pin;
25577 + apic2 = ioapic_i8259.apic;
25578 +
25579 + if (pin1 == 0)
25580 + timer_uses_ioapic_pin_0 = 1;
25581 +
25582 + apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
25583 + vector, apic1, pin1, apic2, pin2);
25584 +
25585 + if (pin1 != -1) {
25586 + /*
25587 + * Ok, does IRQ0 through the IOAPIC work?
25588 + */
25589 + unmask_IO_APIC_irq(0);
25590 + if (!no_timer_check && timer_irq_works()) {
25591 + nmi_watchdog_default();
25592 + if (nmi_watchdog == NMI_IO_APIC) {
25593 + disable_8259A_irq(0);
25594 + setup_nmi();
25595 + enable_8259A_irq(0);
25596 + }
25597 + if (disable_timer_pin_1 > 0)
25598 + clear_IO_APIC_pin(0, pin1);
25599 + return;
25600 + }
25601 + clear_IO_APIC_pin(apic1, pin1);
25602 + apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
25603 + "connected to IO-APIC\n");
25604 + }
25605 +
25606 + apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
25607 + "through the 8259A ... ");
25608 + if (pin2 != -1) {
25609 + apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
25610 + apic2, pin2);
25611 + /*
25612 + * legacy devices should be connected to IO APIC #0
25613 + */
25614 + setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
25615 + if (timer_irq_works()) {
25616 + apic_printk(APIC_VERBOSE," works.\n");
25617 + nmi_watchdog_default();
25618 + if (nmi_watchdog == NMI_IO_APIC) {
25619 + setup_nmi();
25620 + }
25621 + return;
25622 + }
25623 + /*
25624 + * Cleanup, just in case ...
25625 + */
25626 + clear_IO_APIC_pin(apic2, pin2);
25627 + }
25628 + apic_printk(APIC_VERBOSE," failed.\n");
25629 +
25630 + if (nmi_watchdog == NMI_IO_APIC) {
25631 + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
25632 + nmi_watchdog = 0;
25633 + }
25634 +
25635 + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
25636 +
25637 + disable_8259A_irq(0);
25638 + irq_desc[0].chip = &lapic_irq_type;
25639 + apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
25640 + enable_8259A_irq(0);
25641 +
25642 + if (timer_irq_works()) {
25643 + apic_printk(APIC_VERBOSE," works.\n");
25644 + return;
25645 + }
25646 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
25647 + apic_printk(APIC_VERBOSE," failed.\n");
25648 +
25649 + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
25650 +
25651 + init_8259A(0);
25652 + make_8259A_irq(0);
25653 + apic_write(APIC_LVT0, APIC_DM_EXTINT);
25654 +
25655 + unlock_ExtINT_logic();
25656 +
25657 + if (timer_irq_works()) {
25658 + apic_printk(APIC_VERBOSE," works.\n");
25659 + return;
25660 + }
25661 + apic_printk(APIC_VERBOSE," failed :(.\n");
25662 + panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
25663 +}
25664 +#else
25665 +#define check_timer() ((void)0)
25666 +int timer_uses_ioapic_pin_0 = 0;
25667 +#endif /* !CONFIG_XEN */
25668 +
25669 +static int __init notimercheck(char *s)
25670 +{
25671 + no_timer_check = 1;
25672 + return 1;
25673 +}
25674 +__setup("no_timer_check", notimercheck);
25675 +
25676 +/*
25677 + *
25678 + * IRQs that are handled by the PIC in the MPS IOAPIC case.
25679 + * - IRQ2 is the cascade IRQ, and cannot be an IO-APIC IRQ.
25680 + * Linux doesn't really care, as it's not actually used
25681 + * for any interrupt handling anyway.
25682 + */
25683 +#define PIC_IRQS (1<<2)
25684 +
25685 +void __init setup_IO_APIC(void)
25686 +{
25687 + enable_IO_APIC();
25688 +
25689 + if (acpi_ioapic)
25690 + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
25691 + else
25692 + io_apic_irqs = ~PIC_IRQS;
25693 +
25694 + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
25695 +
25696 + /*
25697 + * Set up the IO-APIC IRQ routing table.
25698 + */
25699 + if (!acpi_ioapic)
25700 + setup_ioapic_ids_from_mpc();
25701 +#ifndef CONFIG_XEN
25702 + sync_Arb_IDs();
25703 +#endif /* !CONFIG_XEN */
25704 + setup_IO_APIC_irqs();
25705 + init_IO_APIC_traps();
25706 + check_timer();
25707 + if (!acpi_ioapic)
25708 + print_IO_APIC();
25709 +}
25710 +
25711 +struct sysfs_ioapic_data {
25712 + struct sys_device dev;
25713 + struct IO_APIC_route_entry entry[0];
25714 +};
25715 +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
25716 +
25717 +static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
25718 +{
25719 + struct IO_APIC_route_entry *entry;
25720 + struct sysfs_ioapic_data *data;
25721 + unsigned long flags;
25722 + int i;
25723 +
25724 + data = container_of(dev, struct sysfs_ioapic_data, dev);
25725 + entry = data->entry;
25726 + spin_lock_irqsave(&ioapic_lock, flags);
25727 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
25728 + *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
25729 + *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
25730 + }
25731 + spin_unlock_irqrestore(&ioapic_lock, flags);
25732 +
25733 + return 0;
25734 +}
25735 +
25736 +static int ioapic_resume(struct sys_device *dev)
25737 +{
25738 + struct IO_APIC_route_entry *entry;
25739 + struct sysfs_ioapic_data *data;
25740 + unsigned long flags;
25741 + union IO_APIC_reg_00 reg_00;
25742 + int i;
25743 +
25744 + data = container_of(dev, struct sysfs_ioapic_data, dev);
25745 + entry = data->entry;
25746 +
25747 + spin_lock_irqsave(&ioapic_lock, flags);
25748 + reg_00.raw = io_apic_read(dev->id, 0);
25749 + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
25750 + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
25751 + io_apic_write(dev->id, 0, reg_00.raw);
25752 + }
25753 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
25754 + io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
25755 + io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
25756 + }
25757 + spin_unlock_irqrestore(&ioapic_lock, flags);
25758 +
25759 + return 0;
25760 +}
25761 +
25762 +static struct sysdev_class ioapic_sysdev_class = {
25763 + set_kset_name("ioapic"),
25764 +#ifndef CONFIG_XEN
25765 + .suspend = ioapic_suspend,
25766 + .resume = ioapic_resume,
25767 +#endif
25768 +};
25769 +
25770 +static int __init ioapic_init_sysfs(void)
25771 +{
25772 + struct sys_device * dev;
25773 + int i, size, error = 0;
25774 +
25775 + error = sysdev_class_register(&ioapic_sysdev_class);
25776 + if (error)
25777 + return error;
25778 +
25779 + for (i = 0; i < nr_ioapics; i++ ) {
25780 + size = sizeof(struct sys_device) + nr_ioapic_registers[i]
25781 + * sizeof(struct IO_APIC_route_entry);
25782 + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
25783 + if (!mp_ioapic_data[i]) {
25784 + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
25785 + continue;
25786 + }
25787 + memset(mp_ioapic_data[i], 0, size);
25788 + dev = &mp_ioapic_data[i]->dev;
25789 + dev->id = i;
25790 + dev->cls = &ioapic_sysdev_class;
25791 + error = sysdev_register(dev);
25792 + if (error) {
25793 + kfree(mp_ioapic_data[i]);
25794 + mp_ioapic_data[i] = NULL;
25795 + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
25796 + continue;
25797 + }
25798 + }
25799 +
25800 + return 0;
25801 +}
25802 +
25803 +device_initcall(ioapic_init_sysfs);
25804 +
25805 +/* --------------------------------------------------------------------------
25806 + ACPI-based IOAPIC Configuration
25807 + -------------------------------------------------------------------------- */
25808 +
25809 +#ifdef CONFIG_ACPI
25810 +
25811 +#define IO_APIC_MAX_ID 0xFE
25812 +
25813 +int __init io_apic_get_version (int ioapic)
25814 +{
25815 + union IO_APIC_reg_01 reg_01;
25816 + unsigned long flags;
25817 +
25818 + spin_lock_irqsave(&ioapic_lock, flags);
25819 + reg_01.raw = io_apic_read(ioapic, 1);
25820 + spin_unlock_irqrestore(&ioapic_lock, flags);
25821 +
25822 + return reg_01.bits.version;
25823 +}
25824 +
25825 +
25826 +int __init io_apic_get_redir_entries (int ioapic)
25827 +{
25828 + union IO_APIC_reg_01 reg_01;
25829 + unsigned long flags;
25830 +
25831 + spin_lock_irqsave(&ioapic_lock, flags);
25832 + reg_01.raw = io_apic_read(ioapic, 1);
25833 + spin_unlock_irqrestore(&ioapic_lock, flags);
25834 +
25835 + return reg_01.bits.entries;
25836 +}
25837 +
25838 +
25839 +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
25840 +{
25841 + struct IO_APIC_route_entry entry;
25842 + unsigned long flags;
25843 +
25844 + if (!IO_APIC_IRQ(irq)) {
25845 + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
25846 + ioapic);
25847 + return -EINVAL;
25848 + }
25849 +
25850 + /*
25851 + * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
25852 + * Note that we mask (disable) IRQs now -- these get enabled when the
25853 + * corresponding device driver registers for this IRQ.
25854 + */
25855 +
25856 + memset(&entry,0,sizeof(entry));
25857 +
25858 + entry.delivery_mode = INT_DELIVERY_MODE;
25859 + entry.dest_mode = INT_DEST_MODE;
25860 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
25861 + entry.trigger = edge_level;
25862 + entry.polarity = active_high_low;
25863 + entry.mask = 1; /* Disabled (masked) */
25864 +
25865 + irq = gsi_irq_sharing(irq);
25866 + /*
25867 + * IRQs < 16 are already in the irq_2_pin[] map
25868 + */
25869 + if (irq >= 16)
25870 + add_pin_to_irq(irq, ioapic, pin);
25871 +
25872 + entry.vector = assign_irq_vector(irq);
25873 +
25874 + apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
25875 + "IRQ %d Mode:%i Active:%i)\n", ioapic,
25876 + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
25877 + edge_level, active_high_low);
25878 +
25879 + ioapic_register_intr(irq, entry.vector, edge_level);
25880 +
25881 + if (!ioapic && (irq < 16))
25882 + disable_8259A_irq(irq);
25883 +
25884 + spin_lock_irqsave(&ioapic_lock, flags);
25885 + io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
25886 + io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
25887 + set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
25888 + spin_unlock_irqrestore(&ioapic_lock, flags);
25889 +
25890 + return 0;
25891 +}
25892 +
25893 +#endif /* CONFIG_ACPI */
25894 +
25895 +
25896 +#ifndef CONFIG_XEN
25897 +/*
25898 + * This function currently is only a helper for the i386 smp boot process where
25899 + * we need to reprogram the ioredtbls to cater for the cpus which have come online,
25900 + * so the mask in all cases should simply be TARGET_CPUS.
25901 + */
25902 +#ifdef CONFIG_SMP
25903 +void __init setup_ioapic_dest(void)
25904 +{
25905 + int pin, ioapic, irq, irq_entry;
25906 +
25907 + if (skip_ioapic_setup == 1)
25908 + return;
25909 +
25910 + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
25911 + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
25912 + irq_entry = find_irq_entry(ioapic, pin, mp_INT);
25913 + if (irq_entry == -1)
25914 + continue;
25915 + irq = pin_2_irq(irq_entry, ioapic, pin);
25916 + set_ioapic_affinity_irq(irq, TARGET_CPUS);
25917 + }
25918 +
25919 + }
25920 +}
25921 +#endif
25922 +#endif /* !CONFIG_XEN */
25923 Index: head-2008-11-25/arch/x86/kernel/ioport_64-xen.c
25924 ===================================================================
25925 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
25926 +++ head-2008-11-25/arch/x86/kernel/ioport_64-xen.c 2008-01-28 12:24:19.000000000 +0100
25927 @@ -0,0 +1,100 @@
25928 +/*
25929 + * linux/arch/x86_64/kernel/ioport.c
25930 + *
25931 + * This contains the io-permission bitmap code - written by obz, with changes
25932 + * by Linus.
25933 + */
25934 +
25935 +#include <linux/sched.h>
25936 +#include <linux/kernel.h>
25937 +#include <linux/capability.h>
25938 +#include <linux/errno.h>
25939 +#include <linux/types.h>
25940 +#include <linux/ioport.h>
25941 +#include <linux/mm.h>
25942 +#include <linux/smp.h>
25943 +#include <linux/smp_lock.h>
25944 +#include <linux/stddef.h>
25945 +#include <linux/slab.h>
25946 +#include <linux/thread_info.h>
25947 +#include <xen/interface/physdev.h>
25948 +
25949 +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
25950 +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
25951 +{
25952 + int i;
25953 +
25954 + if (new_value)
25955 + for (i = base; i < base + extent; i++)
25956 + __set_bit(i, bitmap);
25957 + else
25958 + for (i = base; i < base + extent; i++)
25959 + clear_bit(i, bitmap);
25960 +}
25961 +
25962 +/*
25963 + * This changes the I/O permission bitmap in the current task.
25964 + */
25965 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
25966 +{
25967 + struct thread_struct * t = &current->thread;
25968 + unsigned long *bitmap;
25969 + struct physdev_set_iobitmap set_iobitmap;
25970 +
25971 + if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
25972 + return -EINVAL;
25973 + if (turn_on && !capable(CAP_SYS_RAWIO))
25974 + return -EPERM;
25975 +
25976 + /*
25977 + * If it's the first ioperm() call in this thread's lifetime, set the
25978 + * IO bitmap up. ioperm() is much less timing critical than clone(),
25979 + * this is why we delay this operation until now:
25980 + */
25981 + if (!t->io_bitmap_ptr) {
25982 + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
25983 + if (!bitmap)
25984 + return -ENOMEM;
25985 +
25986 + memset(bitmap, 0xff, IO_BITMAP_BYTES);
25987 + t->io_bitmap_ptr = bitmap;
25988 +
25989 + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
25990 + set_iobitmap.nr_ports = IO_BITMAP_BITS;
25991 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
25992 + &set_iobitmap));
25993 + }
25994 +
25995 + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
25996 +
25997 + return 0;
25998 +}
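Editor's note: a minimal user-space sketch, not part of the patch, of how the delayed bitmap allocation above is normally triggered. Port 0x378 (legacy parallel port) is only an illustrative choice, and the program needs CAP_SYS_RAWIO:

#include <stdio.h>
#include <sys/io.h>             /* ioperm(), outb() -- glibc, x86 only */

int main(void)
{
        if (ioperm(0x378, 4, 1)) {      /* first call allocates the I/O bitmap */
                perror("ioperm");
                return 1;
        }
        outb(0x00, 0x378);              /* direct port access is now permitted */
        ioperm(0x378, 4, 0);            /* revoke it again */
        return 0;
}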
25999 +
26000 +/*
26001 + * sys_iopl has to be used when you want to access the IO ports
26002 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
26003 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
26004 + *
26005 + */
26006 +
26007 +asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs)
26008 +{
26009 + unsigned int old_iopl = current->thread.iopl;
26010 + struct physdev_set_iopl set_iopl;
26011 +
26012 + if (new_iopl > 3)
26013 + return -EINVAL;
26014 +
26015 + /* Need "raw I/O" privileges for direct port access. */
26016 + if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO))
26017 + return -EPERM;
26018 +
26019 + /* Change our version of the privilege levels. */
26020 + current->thread.iopl = new_iopl;
26021 +
26022 + /* Force the change at ring 0. */
26023 + set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl;
26024 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
26025 +
26026 + return 0;
26027 +}
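Editor's note: the corresponding user-space view of sys_iopl(), illustrative only and not part of the patch. Unlike ioperm() it opens all 65536 ports at once, and on Xen the new level is propagated to the hypervisor through PHYSDEVOP_set_iopl as shown above:

#include <stdio.h>
#include <sys/io.h>             /* iopl() -- glibc, x86 only */

int main(void)
{
        if (iopl(3)) {                  /* needs CAP_SYS_RAWIO */
                perror("iopl");
                return 1;
        }
        /* arbitrary port I/O is allowed from here on */
        iopl(0);                        /* drop the privilege again */
        return 0;
}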
26028 Index: head-2008-11-25/arch/x86/kernel/irq_64-xen.c
26029 ===================================================================
26030 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
26031 +++ head-2008-11-25/arch/x86/kernel/irq_64-xen.c 2008-10-29 09:55:56.000000000 +0100
26032 @@ -0,0 +1,197 @@
26033 +/*
26034 + * linux/arch/x86_64/kernel/irq.c
26035 + *
26036 + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
26037 + *
26038 + * This file contains the lowest level x86_64-specific interrupt
26039 + * entry and irq statistics code. All the remaining irq logic is
26040 + * done by the generic kernel/irq/ code and in the
26041 + * x86_64-specific irq controller code. (e.g. i8259.c and
26042 + * io_apic.c.)
26043 + */
26044 +
26045 +#include <linux/kernel_stat.h>
26046 +#include <linux/interrupt.h>
26047 +#include <linux/seq_file.h>
26048 +#include <linux/module.h>
26049 +#include <linux/delay.h>
26050 +#include <asm/uaccess.h>
26051 +#include <asm/io_apic.h>
26052 +#include <asm/idle.h>
26053 +
26054 +atomic_t irq_err_count;
26055 +#ifdef CONFIG_X86_IO_APIC
26056 +#ifdef APIC_MISMATCH_DEBUG
26057 +atomic_t irq_mis_count;
26058 +#endif
26059 +#endif
26060 +
26061 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
26062 +/*
26063 + * Probabilistic stack overflow check:
26064 + *
26065 + * Only check the stack in process context, because everything else
26066 + * runs on the big interrupt stacks. Checking reliably is too expensive,
26067 + * so we just check from interrupts.
26068 + */
26069 +static inline void stack_overflow_check(struct pt_regs *regs)
26070 +{
26071 + u64 curbase = (u64) current->thread_info;
26072 + static unsigned long warned = -60*HZ;
26073 +
26074 + if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
26075 + regs->rsp < curbase + sizeof(struct thread_info) + 128 &&
26076 + time_after(jiffies, warned + 60*HZ)) {
26077 + printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
26078 + current->comm, curbase, regs->rsp);
26079 + show_stack(NULL,NULL);
26080 + warned = jiffies;
26081 + }
26082 +}
26083 +#endif
26084 +
26085 +/*
26086 + * Generic, controller-independent functions:
26087 + */
26088 +
26089 +int show_interrupts(struct seq_file *p, void *v)
26090 +{
26091 + int i = *(loff_t *) v, j;
26092 + struct irqaction * action;
26093 + unsigned long flags;
26094 +
26095 + if (i == 0) {
26096 + seq_printf(p, " ");
26097 + for_each_online_cpu(j)
26098 + seq_printf(p, "CPU%-8d",j);
26099 + seq_putc(p, '\n');
26100 + }
26101 +
26102 + if (i < NR_IRQS) {
26103 + spin_lock_irqsave(&irq_desc[i].lock, flags);
26104 + action = irq_desc[i].action;
26105 + if (!action)
26106 + goto skip;
26107 + seq_printf(p, "%3d: ",i);
26108 +#ifndef CONFIG_SMP
26109 + seq_printf(p, "%10u ", kstat_irqs(i));
26110 +#else
26111 + for_each_online_cpu(j)
26112 + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
26113 +#endif
26114 + seq_printf(p, " %14s", irq_desc[i].chip->typename);
26115 +
26116 + seq_printf(p, " %s", action->name);
26117 + for (action=action->next; action; action = action->next)
26118 + seq_printf(p, ", %s", action->name);
26119 + seq_putc(p, '\n');
26120 +skip:
26121 + spin_unlock_irqrestore(&irq_desc[i].lock, flags);
26122 + } else if (i == NR_IRQS) {
26123 + seq_printf(p, "NMI: ");
26124 + for_each_online_cpu(j)
26125 + seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
26126 + seq_putc(p, '\n');
26127 +#ifdef CONFIG_X86_LOCAL_APIC
26128 + seq_printf(p, "LOC: ");
26129 + for_each_online_cpu(j)
26130 + seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
26131 + seq_putc(p, '\n');
26132 +#endif
26133 + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
26134 +#ifdef CONFIG_X86_IO_APIC
26135 +#ifdef APIC_MISMATCH_DEBUG
26136 + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
26137 +#endif
26138 +#endif
26139 + }
26140 + return 0;
26141 +}
26142 +
26143 +/*
26144 + * do_IRQ handles all normal device IRQ's (the special
26145 + * SMP cross-CPU interrupts have their own specific
26146 + * handlers).
26147 + */
26148 +asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
26149 +{
26150 + /* high bit used in ret_from_ code */
26151 + unsigned irq = ~regs->orig_rax;
26152 +
26153 + if (unlikely(irq >= NR_IRQS)) {
26154 + printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
26155 + __FUNCTION__, irq);
26156 + BUG();
26157 + }
26158 +
26159 + /*exit_idle();*/
26160 + /*irq_enter();*/
26161 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
26162 + stack_overflow_check(regs);
26163 +#endif
26164 + __do_IRQ(irq, regs);
26165 + /*irq_exit();*/
26166 +
26167 + return 1;
26168 +}
26169 +
26170 +#ifdef CONFIG_HOTPLUG_CPU
26171 +void fixup_irqs(cpumask_t map)
26172 +{
26173 + unsigned int irq;
26174 + static int warned;
26175 +
26176 + for (irq = 0; irq < NR_IRQS; irq++) {
26177 + cpumask_t mask;
26178 + if (irq == 2)
26179 + continue;
26180 +
26181 + cpus_and(mask, irq_desc[irq].affinity, map);
26182 + if (any_online_cpu(mask) == NR_CPUS) {
26183 + /*printk("Breaking affinity for irq %i\n", irq);*/
26184 + mask = map;
26185 + }
26186 + if (irq_desc[irq].chip->set_affinity)
26187 + irq_desc[irq].chip->set_affinity(irq, mask);
26188 + else if (irq_desc[irq].action && !(warned++))
26189 + printk("Cannot set affinity for irq %i\n", irq);
26190 + }
26191 +
26192 + /* That doesn't seem sufficient. Give it 1ms. */
26193 + local_irq_enable();
26194 + mdelay(1);
26195 + local_irq_disable();
26196 +}
26197 +#endif
26198 +
26199 +extern void call_softirq(void);
26200 +
26201 +asmlinkage void do_softirq(void)
26202 +{
26203 + __u32 pending;
26204 + unsigned long flags;
26205 +
26206 + if (in_interrupt())
26207 + return;
26208 +
26209 + local_irq_save(flags);
26210 + pending = local_softirq_pending();
26211 + /* Switch to interrupt stack */
26212 + if (pending) {
26213 + call_softirq();
26214 + WARN_ON_ONCE(softirq_count());
26215 + }
26216 + local_irq_restore(flags);
26217 +}
26218 +EXPORT_SYMBOL(do_softirq);
26219 +
26220 +#ifndef CONFIG_X86_LOCAL_APIC
26221 +/*
26222 + * 'what should we do if we get a hw irq event on an illegal vector'.
26223 + * Each architecture has to answer this itself.
26224 + */
26225 +void ack_bad_irq(unsigned int irq)
26226 +{
26227 + printk("unexpected IRQ trap at vector %02x\n", irq);
26228 +}
26229 +#endif
26230 Index: head-2008-11-25/arch/x86/kernel/ldt_64-xen.c
26231 ===================================================================
26232 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
26233 +++ head-2008-11-25/arch/x86/kernel/ldt_64-xen.c 2007-06-12 13:13:01.000000000 +0200
26234 @@ -0,0 +1,282 @@
26235 +/*
26236 + * linux/arch/x86_64/kernel/ldt.c
26237 + *
26238 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
26239 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
26240 + * Copyright (C) 2002 Andi Kleen
26241 + *
26242 + * This handles calls from both 32bit and 64bit mode.
26243 + */
26244 +
26245 +#include <linux/errno.h>
26246 +#include <linux/sched.h>
26247 +#include <linux/string.h>
26248 +#include <linux/mm.h>
26249 +#include <linux/smp.h>
26250 +#include <linux/smp_lock.h>
26251 +#include <linux/vmalloc.h>
26252 +#include <linux/slab.h>
26253 +
26254 +#include <asm/uaccess.h>
26255 +#include <asm/system.h>
26256 +#include <asm/ldt.h>
26257 +#include <asm/desc.h>
26258 +#include <asm/proto.h>
26259 +#include <asm/pgalloc.h>
26260 +
26261 +#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
26262 +static void flush_ldt(void *null)
26263 +{
26264 + if (current->active_mm)
26265 + load_LDT(&current->active_mm->context);
26266 +}
26267 +#endif
26268 +
26269 +static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
26270 +{
26271 + void *oldldt;
26272 + void *newldt;
26273 + unsigned oldsize;
26274 +
26275 + if (mincount <= (unsigned)pc->size)
26276 + return 0;
26277 + oldsize = pc->size;
26278 + mincount = (mincount+511)&(~511);
26279 + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
26280 + newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
26281 + else
26282 + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
26283 +
26284 + if (!newldt)
26285 + return -ENOMEM;
26286 +
26287 + if (oldsize)
26288 + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
26289 + oldldt = pc->ldt;
26290 + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
26291 + wmb();
26292 + pc->ldt = newldt;
26293 + wmb();
26294 + pc->size = mincount;
26295 + wmb();
26296 + if (reload) {
26297 +#ifdef CONFIG_SMP
26298 + cpumask_t mask;
26299 +
26300 + preempt_disable();
26301 +#endif
26302 + make_pages_readonly(
26303 + pc->ldt,
26304 + (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
26305 + XENFEAT_writable_descriptor_tables);
26306 + load_LDT(pc);
26307 +#ifdef CONFIG_SMP
26308 + mask = cpumask_of_cpu(smp_processor_id());
26309 + if (!cpus_equal(current->mm->cpu_vm_mask, mask))
26310 + smp_call_function(flush_ldt, NULL, 1, 1);
26311 + preempt_enable();
26312 +#endif
26313 + }
26314 + if (oldsize) {
26315 + make_pages_writable(
26316 + oldldt,
26317 + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
26318 + XENFEAT_writable_descriptor_tables);
26319 + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
26320 + vfree(oldldt);
26321 + else
26322 + kfree(oldldt);
26323 + }
26324 + return 0;
26325 +}
26326 +
26327 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
26328 +{
26329 + int err = alloc_ldt(new, old->size, 0);
26330 + if (err < 0)
26331 + return err;
26332 + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
26333 + make_pages_readonly(
26334 + new->ldt,
26335 + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
26336 + XENFEAT_writable_descriptor_tables);
26337 + return 0;
26338 +}
26339 +
26340 +/*
26341 + * we do not have to muck with descriptors here, that is
26342 + * done in switch_mm() as needed.
26343 + */
26344 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
26345 +{
26346 + struct mm_struct * old_mm;
26347 + int retval = 0;
26348 +
26349 + memset(&mm->context, 0, sizeof(mm->context));
26350 + init_MUTEX(&mm->context.sem);
26351 + old_mm = current->mm;
26352 + if (old_mm && old_mm->context.size > 0) {
26353 + down(&old_mm->context.sem);
26354 + retval = copy_ldt(&mm->context, &old_mm->context);
26355 + up(&old_mm->context.sem);
26356 + }
26357 + if (retval == 0) {
26358 + spin_lock(&mm_unpinned_lock);
26359 + list_add(&mm->context.unpinned, &mm_unpinned);
26360 + spin_unlock(&mm_unpinned_lock);
26361 + }
26362 + return retval;
26363 +}
26364 +
26365 +/*
26366 + *
26367 + * Don't touch the LDT register - we're already in the next thread.
26368 + */
26369 +void destroy_context(struct mm_struct *mm)
26370 +{
26371 + if (mm->context.size) {
26372 + if (mm == current->active_mm)
26373 + clear_LDT();
26374 + make_pages_writable(
26375 + mm->context.ldt,
26376 + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
26377 + XENFEAT_writable_descriptor_tables);
26378 + if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
26379 + vfree(mm->context.ldt);
26380 + else
26381 + kfree(mm->context.ldt);
26382 + mm->context.size = 0;
26383 + }
26384 + if (!mm->context.pinned) {
26385 + spin_lock(&mm_unpinned_lock);
26386 + list_del(&mm->context.unpinned);
26387 + spin_unlock(&mm_unpinned_lock);
26388 + }
26389 +}
26390 +
26391 +static int read_ldt(void __user * ptr, unsigned long bytecount)
26392 +{
26393 + int err;
26394 + unsigned long size;
26395 + struct mm_struct * mm = current->mm;
26396 +
26397 + if (!mm->context.size)
26398 + return 0;
26399 + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
26400 + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
26401 +
26402 + down(&mm->context.sem);
26403 + size = mm->context.size*LDT_ENTRY_SIZE;
26404 + if (size > bytecount)
26405 + size = bytecount;
26406 +
26407 + err = 0;
26408 + if (copy_to_user(ptr, mm->context.ldt, size))
26409 + err = -EFAULT;
26410 + up(&mm->context.sem);
26411 + if (err < 0)
26412 + goto error_return;
26413 + if (size != bytecount) {
26414 + /* zero-fill the rest */
26415 + if (clear_user(ptr+size, bytecount-size) != 0) {
26416 + err = -EFAULT;
26417 + goto error_return;
26418 + }
26419 + }
26420 + return bytecount;
26421 +error_return:
26422 + return err;
26423 +}
26424 +
26425 +static int read_default_ldt(void __user * ptr, unsigned long bytecount)
26426 +{
26427 + /* Arbitrary number */
26428 + /* x86-64 default LDT is all zeros */
26429 + if (bytecount > 128)
26430 + bytecount = 128;
26431 + if (clear_user(ptr, bytecount))
26432 + return -EFAULT;
26433 + return bytecount;
26434 +}
26435 +
26436 +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
26437 +{
26438 + struct task_struct *me = current;
26439 + struct mm_struct * mm = me->mm;
26440 + __u32 entry_1, entry_2, *lp;
26441 + unsigned long mach_lp;
26442 + int error;
26443 + struct user_desc ldt_info;
26444 +
26445 + error = -EINVAL;
26446 +
26447 + if (bytecount != sizeof(ldt_info))
26448 + goto out;
26449 + error = -EFAULT;
26450 + if (copy_from_user(&ldt_info, ptr, bytecount))
26451 + goto out;
26452 +
26453 + error = -EINVAL;
26454 + if (ldt_info.entry_number >= LDT_ENTRIES)
26455 + goto out;
26456 + if (ldt_info.contents == 3) {
26457 + if (oldmode)
26458 + goto out;
26459 + if (ldt_info.seg_not_present == 0)
26460 + goto out;
26461 + }
26462 +
26463 + down(&mm->context.sem);
26464 + if (ldt_info.entry_number >= (unsigned)mm->context.size) {
26465 + error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
26466 + if (error < 0)
26467 + goto out_unlock;
26468 + }
26469 +
26470 + lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
26471 + mach_lp = arbitrary_virt_to_machine(lp);
26472 +
26473 + /* Allow LDTs to be cleared by the user. */
26474 + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
26475 + if (oldmode || LDT_empty(&ldt_info)) {
26476 + entry_1 = 0;
26477 + entry_2 = 0;
26478 + goto install;
26479 + }
26480 + }
26481 +
26482 + entry_1 = LDT_entry_a(&ldt_info);
26483 + entry_2 = LDT_entry_b(&ldt_info);
26484 + if (oldmode)
26485 + entry_2 &= ~(1 << 20);
26486 +
26487 + /* Install the new entry ... */
26488 +install:
26489 + error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
26490 +
26491 +out_unlock:
26492 + up(&mm->context.sem);
26493 +out:
26494 + return error;
26495 +}
26496 +
26497 +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
26498 +{
26499 + int ret = -ENOSYS;
26500 +
26501 + switch (func) {
26502 + case 0:
26503 + ret = read_ldt(ptr, bytecount);
26504 + break;
26505 + case 1:
26506 + ret = write_ldt(ptr, bytecount, 1);
26507 + break;
26508 + case 2:
26509 + ret = read_default_ldt(ptr, bytecount);
26510 + break;
26511 + case 0x11:
26512 + ret = write_ldt(ptr, bytecount, 0);
26513 + break;
26514 + }
26515 + return ret;
26516 +}
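Editor's note: a hedged user-space sketch, not part of the patch, of the func == 0 read path above, issued through the raw syscall because glibc provides no modify_ldt() wrapper:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/ldt.h>            /* LDT_ENTRY_SIZE */

int main(void)
{
        unsigned char buf[LDT_ENTRY_SIZE * 16];
        long n = syscall(SYS_modify_ldt, 0, buf, sizeof(buf));

        /* read_ldt() returns the number of bytes copied; 0 for an empty LDT */
        printf("read %ld bytes of LDT\n", n);
        return 0;
}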
26517 Index: head-2008-11-25/arch/x86/kernel/mpparse_64-xen.c
26518 ===================================================================
26519 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
26520 +++ head-2008-11-25/arch/x86/kernel/mpparse_64-xen.c 2007-06-12 13:13:01.000000000 +0200
26521 @@ -0,0 +1,1011 @@
26522 +/*
26523 + * Intel Multiprocessor Specification 1.1 and 1.4
26524 + * compliant MP-table parsing routines.
26525 + *
26526 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
26527 + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
26528 + *
26529 + * Fixes
26530 + * Erich Boleyn : MP v1.4 and additional changes.
26531 + * Alan Cox : Added EBDA scanning
26532 + * Ingo Molnar : various cleanups and rewrites
26533 + * Maciej W. Rozycki: Bits for default MP configurations
26534 + * Paul Diefenbaugh: Added full ACPI support
26535 + */
26536 +
26537 +#include <linux/mm.h>
26538 +#include <linux/init.h>
26539 +#include <linux/delay.h>
26540 +#include <linux/bootmem.h>
26541 +#include <linux/smp_lock.h>
26542 +#include <linux/kernel_stat.h>
26543 +#include <linux/mc146818rtc.h>
26544 +#include <linux/acpi.h>
26545 +#include <linux/module.h>
26546 +
26547 +#include <asm/smp.h>
26548 +#include <asm/mtrr.h>
26549 +#include <asm/mpspec.h>
26550 +#include <asm/pgalloc.h>
26551 +#include <asm/io_apic.h>
26552 +#include <asm/proto.h>
26553 +#include <asm/acpi.h>
26554 +
26555 +/* Have we found an MP table */
26556 +int smp_found_config;
26557 +unsigned int __initdata maxcpus = NR_CPUS;
26558 +
26559 +int acpi_found_madt;
26560 +
26561 +/*
26562 + * Various Linux-internal data structures created from the
26563 + * MP-table.
26564 + */
26565 +unsigned char apic_version [MAX_APICS];
26566 +unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
26567 +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
26568 +
26569 +static int mp_current_pci_id = 0;
26570 +/* I/O APIC entries */
26571 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
26572 +
26573 +/* # of MP IRQ source entries */
26574 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
26575 +
26576 +/* MP IRQ source entries */
26577 +int mp_irq_entries;
26578 +
26579 +int nr_ioapics;
26580 +int pic_mode;
26581 +unsigned long mp_lapic_addr = 0;
26582 +
26583 +
26584 +
26585 +/* Processor that is doing the boot up */
26586 +unsigned int boot_cpu_id = -1U;
26587 +/* Internal processor count */
26588 +unsigned int num_processors __initdata = 0;
26589 +
26590 +unsigned disabled_cpus __initdata;
26591 +
26592 +/* Bitmask of physically existing CPUs */
26593 +physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
26594 +
26595 +/* ACPI MADT entry parsing functions */
26596 +#ifdef CONFIG_ACPI
26597 +extern struct acpi_boot_flags acpi_boot;
26598 +#ifdef CONFIG_X86_LOCAL_APIC
26599 +extern int acpi_parse_lapic (acpi_table_entry_header *header);
26600 +extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
26601 +extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
26602 +#endif /*CONFIG_X86_LOCAL_APIC*/
26603 +#ifdef CONFIG_X86_IO_APIC
26604 +extern int acpi_parse_ioapic (acpi_table_entry_header *header);
26605 +#endif /*CONFIG_X86_IO_APIC*/
26606 +#endif /*CONFIG_ACPI*/
26607 +
26608 +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
26609 +
26610 +
26611 +/*
26612 + * Intel MP BIOS table parsing routines:
26613 + */
26614 +
26615 +/*
26616 + * Checksum an MP configuration block.
26617 + */
26618 +
26619 +static int __init mpf_checksum(unsigned char *mp, int len)
26620 +{
26621 + int sum = 0;
26622 +
26623 + while (len--)
26624 + sum += *mp++;
26625 +
26626 + return sum & 0xFF;
26627 +}
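Editor's note: mpf_checksum() relies on the MP specification rule that all bytes of a valid configuration table sum to zero modulo 256. A tiny stand-alone sketch with made-up data, not part of the patch:

#include <stdio.h>

static int checksum(const unsigned char *p, int len)
{
        int sum = 0;

        while (len--)
                sum += *p++;
        return sum & 0xFF;              /* 0 means the table is consistent */
}

int main(void)
{
        unsigned char tbl[4] = { 0x12, 0x34, 0x56, 0x00 };

        tbl[3] = (unsigned char)(0x100 - ((0x12 + 0x34 + 0x56) & 0xFF));
        printf("checksum = %d\n", checksum(tbl, 4));    /* prints 0 */
        return 0;
}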
26628 +
26629 +#ifndef CONFIG_XEN
26630 +static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
26631 +{
26632 + int cpu;
26633 + unsigned char ver;
26634 + cpumask_t tmp_map;
26635 +
26636 + if (!(m->mpc_cpuflag & CPU_ENABLED)) {
26637 + disabled_cpus++;
26638 + return;
26639 + }
26640 +
26641 + printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
26642 + m->mpc_apicid,
26643 + (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
26644 + (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
26645 + m->mpc_apicver);
26646 +
26647 + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
26648 + Dprintk(" Bootup CPU\n");
26649 + boot_cpu_id = m->mpc_apicid;
26650 + }
26651 + if (num_processors >= NR_CPUS) {
26652 + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
26653 + " Processor ignored.\n", NR_CPUS);
26654 + return;
26655 + }
26656 +
26657 + num_processors++;
26658 + cpus_complement(tmp_map, cpu_present_map);
26659 + cpu = first_cpu(tmp_map);
26660 +
26661 +#if MAX_APICS < 255
26662 + if ((int)m->mpc_apicid > MAX_APICS) {
26663 + printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
26664 + m->mpc_apicid, MAX_APICS);
26665 + return;
26666 + }
26667 +#endif
26668 + ver = m->mpc_apicver;
26669 +
26670 + physid_set(m->mpc_apicid, phys_cpu_present_map);
26671 + /*
26672 + * Validate version
26673 + */
26674 + if (ver == 0x0) {
26675 + printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
26676 + ver = 0x10;
26677 + }
26678 + apic_version[m->mpc_apicid] = ver;
26679 + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
26680 + /*
26681 + * bios_cpu_apicid is required to have processors listed
26682 + * in same order as logical cpu numbers. Hence the first
26683 + * entry is BSP, and so on.
26684 + */
26685 + cpu = 0;
26686 + }
26687 + bios_cpu_apicid[cpu] = m->mpc_apicid;
26688 + x86_cpu_to_apicid[cpu] = m->mpc_apicid;
26689 +
26690 + cpu_set(cpu, cpu_possible_map);
26691 + cpu_set(cpu, cpu_present_map);
26692 +}
26693 +#else
26694 +static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
26695 +{
26696 + num_processors++;
26697 +}
26698 +#endif /* CONFIG_XEN */
26699 +
26700 +static void __init MP_bus_info (struct mpc_config_bus *m)
26701 +{
26702 + char str[7];
26703 +
26704 + memcpy(str, m->mpc_bustype, 6);
26705 + str[6] = 0;
26706 + Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
26707 +
26708 + if (strncmp(str, "ISA", 3) == 0) {
26709 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
26710 + } else if (strncmp(str, "EISA", 4) == 0) {
26711 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
26712 + } else if (strncmp(str, "PCI", 3) == 0) {
26713 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
26714 + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
26715 + mp_current_pci_id++;
26716 + } else if (strncmp(str, "MCA", 3) == 0) {
26717 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
26718 + } else {
26719 + printk(KERN_ERR "Unknown bustype %s\n", str);
26720 + }
26721 +}
26722 +
26723 +static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
26724 +{
26725 + if (!(m->mpc_flags & MPC_APIC_USABLE))
26726 + return;
26727 +
26728 + printk("I/O APIC #%d Version %d at 0x%X.\n",
26729 + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
26730 + if (nr_ioapics >= MAX_IO_APICS) {
26731 + printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
26732 + MAX_IO_APICS, nr_ioapics);
26733 + panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
26734 + }
26735 + if (!m->mpc_apicaddr) {
26736 + printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
26737 + " found in MP table, skipping!\n");
26738 + return;
26739 + }
26740 + mp_ioapics[nr_ioapics] = *m;
26741 + nr_ioapics++;
26742 +}
26743 +
26744 +static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
26745 +{
26746 + mp_irqs [mp_irq_entries] = *m;
26747 + Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
26748 + " IRQ %02x, APIC ID %x, APIC INT %02x\n",
26749 + m->mpc_irqtype, m->mpc_irqflag & 3,
26750 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
26751 + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
26752 + if (++mp_irq_entries >= MAX_IRQ_SOURCES)
26753 + panic("Max # of irq sources exceeded!!\n");
26754 +}
26755 +
26756 +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
26757 +{
26758 + Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
26759 + " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
26760 + m->mpc_irqtype, m->mpc_irqflag & 3,
26761 + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
26762 + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
26763 + /*
26764 + * Well it seems all SMP boards in existence
26765 + * use ExtINT/LVT1 == LINT0 and
26766 + * NMI/LVT2 == LINT1 - the following check
26767 + * will show us if this assumptions is false.
26768 + * Until then we do not have to add baggage.
26769 + */
26770 + if ((m->mpc_irqtype == mp_ExtINT) &&
26771 + (m->mpc_destapiclint != 0))
26772 + BUG();
26773 + if ((m->mpc_irqtype == mp_NMI) &&
26774 + (m->mpc_destapiclint != 1))
26775 + BUG();
26776 +}
26777 +
26778 +/*
26779 + * Read/parse the MPC
26780 + */
26781 +
26782 +static int __init smp_read_mpc(struct mp_config_table *mpc)
26783 +{
26784 + char str[16];
26785 + int count=sizeof(*mpc);
26786 + unsigned char *mpt=((unsigned char *)mpc)+count;
26787 +
26788 + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
26789 + printk("SMP mptable: bad signature [%c%c%c%c]!\n",
26790 + mpc->mpc_signature[0],
26791 + mpc->mpc_signature[1],
26792 + mpc->mpc_signature[2],
26793 + mpc->mpc_signature[3]);
26794 + return 0;
26795 + }
26796 + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
26797 + printk("SMP mptable: checksum error!\n");
26798 + return 0;
26799 + }
26800 + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
26801 + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
26802 + mpc->mpc_spec);
26803 + return 0;
26804 + }
26805 + if (!mpc->mpc_lapic) {
26806 + printk(KERN_ERR "SMP mptable: null local APIC address!\n");
26807 + return 0;
26808 + }
26809 + memcpy(str,mpc->mpc_oem,8);
26810 + str[8]=0;
26811 + printk(KERN_INFO "OEM ID: %s ",str);
26812 +
26813 + memcpy(str,mpc->mpc_productid,12);
26814 + str[12]=0;
26815 + printk("Product ID: %s ",str);
26816 +
26817 + printk("APIC at: 0x%X\n",mpc->mpc_lapic);
26818 +
26819 + /* save the local APIC address, it might be non-default */
26820 + if (!acpi_lapic)
26821 + mp_lapic_addr = mpc->mpc_lapic;
26822 +
26823 + /*
26824 + * Now process the configuration blocks.
26825 + */
26826 + while (count < mpc->mpc_length) {
26827 + switch(*mpt) {
26828 + case MP_PROCESSOR:
26829 + {
26830 + struct mpc_config_processor *m=
26831 + (struct mpc_config_processor *)mpt;
26832 + if (!acpi_lapic)
26833 + MP_processor_info(m);
26834 + mpt += sizeof(*m);
26835 + count += sizeof(*m);
26836 + break;
26837 + }
26838 + case MP_BUS:
26839 + {
26840 + struct mpc_config_bus *m=
26841 + (struct mpc_config_bus *)mpt;
26842 + MP_bus_info(m);
26843 + mpt += sizeof(*m);
26844 + count += sizeof(*m);
26845 + break;
26846 + }
26847 + case MP_IOAPIC:
26848 + {
26849 + struct mpc_config_ioapic *m=
26850 + (struct mpc_config_ioapic *)mpt;
26851 + MP_ioapic_info(m);
26852 + mpt+=sizeof(*m);
26853 + count+=sizeof(*m);
26854 + break;
26855 + }
26856 + case MP_INTSRC:
26857 + {
26858 + struct mpc_config_intsrc *m=
26859 + (struct mpc_config_intsrc *)mpt;
26860 +
26861 + MP_intsrc_info(m);
26862 + mpt+=sizeof(*m);
26863 + count+=sizeof(*m);
26864 + break;
26865 + }
26866 + case MP_LINTSRC:
26867 + {
26868 + struct mpc_config_lintsrc *m=
26869 + (struct mpc_config_lintsrc *)mpt;
26870 + MP_lintsrc_info(m);
26871 + mpt+=sizeof(*m);
26872 + count+=sizeof(*m);
26873 + break;
26874 + }
26875 + }
26876 + }
26877 + clustered_apic_check();
26878 + if (!num_processors)
26879 + printk(KERN_ERR "SMP mptable: no processors registered!\n");
26880 + return num_processors;
26881 +}
26882 +
26883 +static int __init ELCR_trigger(unsigned int irq)
26884 +{
26885 + unsigned int port;
26886 +
26887 + port = 0x4d0 + (irq >> 3);
26888 + return (inb(port) >> (irq & 7)) & 1;
26889 +}
26890 +
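ELCR_trigger() above reads one bit per ISA IRQ out of the two ELCR registers at I/O ports 0x4d0 and 0x4d1: the port index is irq >> 3 and the bit is irq & 7, with a set bit meaning level-triggered. A small user-space sketch of the same bit selection, operating on a saved copy of the registers instead of inb() (the register contents are made up for illustration):

#include <stdio.h>

/* Same port/bit selection as ELCR_trigger(), but over a saved copy of the
 * two 8-bit ELCR registers rather than the real hardware ports. */
static int elcr_trigger(const unsigned char elcr[2], unsigned int irq)
{
	return (elcr[irq >> 3] >> (irq & 7)) & 1;	/* 1 = level, 0 = edge */
}

int main(void)
{
	/* Hypothetical register contents: IRQ 5 and IRQ 10 level triggered. */
	unsigned char elcr[2] = { 1 << 5, 1 << 2 };
	unsigned int irq;

	for (irq = 0; irq < 16; irq++)
		printf("IRQ%2u: %s\n", irq, elcr_trigger(elcr, irq) ? "level" : "edge");
	return 0;
}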
26891 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
26892 +{
26893 + struct mpc_config_intsrc intsrc;
26894 + int i;
26895 + int ELCR_fallback = 0;
26896 +
26897 + intsrc.mpc_type = MP_INTSRC;
26898 + intsrc.mpc_irqflag = 0; /* conforming */
26899 + intsrc.mpc_srcbus = 0;
26900 + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
26901 +
26902 + intsrc.mpc_irqtype = mp_INT;
26903 +
26904 + /*
26905 + * If true, we have an ISA/PCI system with no IRQ entries
26906 + * in the MP table. To prevent the PCI interrupts from being set up
26907 + * incorrectly, we try to use the ELCR. The sanity check to see if
26908 + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
26909 + * never be level sensitive, so we simply see if the ELCR agrees.
26910 + * If it does, we assume it's valid.
26911 + */
26912 + if (mpc_default_type == 5) {
26913 + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
26914 +
26915 + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
26916 + printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
26917 + else {
26918 + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
26919 + ELCR_fallback = 1;
26920 + }
26921 + }
26922 +
26923 + for (i = 0; i < 16; i++) {
26924 + switch (mpc_default_type) {
26925 + case 2:
26926 + if (i == 0 || i == 13)
26927 + continue; /* IRQ0 & IRQ13 not connected */
26928 + /* fall through */
26929 + default:
26930 + if (i == 2)
26931 + continue; /* IRQ2 is never connected */
26932 + }
26933 +
26934 + if (ELCR_fallback) {
26935 + /*
26936 + * If the ELCR indicates a level-sensitive interrupt, we
26937 + * copy that information over to the MP table in the
26938 + * irqflag field (level sensitive, active high polarity).
26939 + */
26940 + if (ELCR_trigger(i))
26941 + intsrc.mpc_irqflag = 13;
26942 + else
26943 + intsrc.mpc_irqflag = 0;
26944 + }
26945 +
26946 + intsrc.mpc_srcbusirq = i;
26947 + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
26948 + MP_intsrc_info(&intsrc);
26949 + }
26950 +
26951 + intsrc.mpc_irqtype = mp_ExtINT;
26952 + intsrc.mpc_srcbusirq = 0;
26953 + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
26954 + MP_intsrc_info(&intsrc);
26955 +}
26956 +
26957 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
26958 +{
26959 + struct mpc_config_processor processor;
26960 + struct mpc_config_bus bus;
26961 + struct mpc_config_ioapic ioapic;
26962 + struct mpc_config_lintsrc lintsrc;
26963 + int linttypes[2] = { mp_ExtINT, mp_NMI };
26964 + int i;
26965 +
26966 + /*
26967 + * local APIC has default address
26968 + */
26969 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
26970 +
26971 + /*
26972 + * 2 CPUs, numbered 0 & 1.
26973 + */
26974 + processor.mpc_type = MP_PROCESSOR;
26975 + /* Either an integrated APIC or a discrete 82489DX. */
26976 + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
26977 + processor.mpc_cpuflag = CPU_ENABLED;
26978 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
26979 + (boot_cpu_data.x86_model << 4) |
26980 + boot_cpu_data.x86_mask;
26981 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
26982 + processor.mpc_reserved[0] = 0;
26983 + processor.mpc_reserved[1] = 0;
26984 + for (i = 0; i < 2; i++) {
26985 + processor.mpc_apicid = i;
26986 + MP_processor_info(&processor);
26987 + }
26988 +
26989 + bus.mpc_type = MP_BUS;
26990 + bus.mpc_busid = 0;
26991 + switch (mpc_default_type) {
26992 + default:
26993 + printk(KERN_ERR "???\nUnknown standard configuration %d\n",
26994 + mpc_default_type);
26995 + /* fall through */
26996 + case 1:
26997 + case 5:
26998 + memcpy(bus.mpc_bustype, "ISA ", 6);
26999 + break;
27000 + case 2:
27001 + case 6:
27002 + case 3:
27003 + memcpy(bus.mpc_bustype, "EISA ", 6);
27004 + break;
27005 + case 4:
27006 + case 7:
27007 + memcpy(bus.mpc_bustype, "MCA ", 6);
27008 + }
27009 + MP_bus_info(&bus);
27010 + if (mpc_default_type > 4) {
27011 + bus.mpc_busid = 1;
27012 + memcpy(bus.mpc_bustype, "PCI ", 6);
27013 + MP_bus_info(&bus);
27014 + }
27015 +
27016 + ioapic.mpc_type = MP_IOAPIC;
27017 + ioapic.mpc_apicid = 2;
27018 + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
27019 + ioapic.mpc_flags = MPC_APIC_USABLE;
27020 + ioapic.mpc_apicaddr = 0xFEC00000;
27021 + MP_ioapic_info(&ioapic);
27022 +
27023 + /*
27024 + * We set up most of the low 16 IO-APIC pins according to MPS rules.
27025 + */
27026 + construct_default_ioirq_mptable(mpc_default_type);
27027 +
27028 + lintsrc.mpc_type = MP_LINTSRC;
27029 + lintsrc.mpc_irqflag = 0; /* conforming */
27030 + lintsrc.mpc_srcbusid = 0;
27031 + lintsrc.mpc_srcbusirq = 0;
27032 + lintsrc.mpc_destapic = MP_APIC_ALL;
27033 + for (i = 0; i < 2; i++) {
27034 + lintsrc.mpc_irqtype = linttypes[i];
27035 + lintsrc.mpc_destapiclint = i;
27036 + MP_lintsrc_info(&lintsrc);
27037 + }
27038 +}
27039 +
27040 +static struct intel_mp_floating *mpf_found;
27041 +
27042 +/*
27043 + * Scan the memory blocks for an SMP configuration block.
27044 + */
27045 +void __init get_smp_config (void)
27046 +{
27047 + struct intel_mp_floating *mpf = mpf_found;
27048 +
27049 + /*
27050 + * ACPI supports both logical (e.g. Hyper-Threading) and physical
27051 +	 * processors, whereas MPS only supports physical.
27052 + */
27053 + if (acpi_lapic && acpi_ioapic) {
27054 + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
27055 + return;
27056 + }
27057 + else if (acpi_lapic)
27058 + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
27059 +
27060 + printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
27061 + if (mpf->mpf_feature2 & (1<<7)) {
27062 + printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
27063 + pic_mode = 1;
27064 + } else {
27065 + printk(KERN_INFO " Virtual Wire compatibility mode.\n");
27066 + pic_mode = 0;
27067 + }
27068 +
27069 + /*
27070 + * Now see if we need to read further.
27071 + */
27072 + if (mpf->mpf_feature1 != 0) {
27073 +
27074 + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
27075 + construct_default_ISA_mptable(mpf->mpf_feature1);
27076 +
27077 + } else if (mpf->mpf_physptr) {
27078 +
27079 + /*
27080 + * Read the physical hardware table. Anything here will
27081 + * override the defaults.
27082 + */
27083 + if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
27084 + smp_found_config = 0;
27085 + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
27086 + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
27087 + return;
27088 + }
27089 + /*
27090 + * If there are no explicit MP IRQ entries, then we are
27091 + * broken. We set up most of the low 16 IO-APIC pins to
27092 + * ISA defaults and hope it will work.
27093 + */
27094 + if (!mp_irq_entries) {
27095 + struct mpc_config_bus bus;
27096 +
27097 + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
27098 +
27099 + bus.mpc_type = MP_BUS;
27100 + bus.mpc_busid = 0;
27101 + memcpy(bus.mpc_bustype, "ISA ", 6);
27102 + MP_bus_info(&bus);
27103 +
27104 + construct_default_ioirq_mptable(0);
27105 + }
27106 +
27107 + } else
27108 + BUG();
27109 +
27110 + printk(KERN_INFO "Processors: %d\n", num_processors);
27111 + /*
27112 + * Only use the first configuration found.
27113 + */
27114 +}
27115 +
27116 +static int __init smp_scan_config (unsigned long base, unsigned long length)
27117 +{
27118 + extern void __bad_mpf_size(void);
27119 + unsigned int *bp = isa_bus_to_virt(base);
27120 + struct intel_mp_floating *mpf;
27121 +
27122 + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
27123 + if (sizeof(*mpf) != 16)
27124 + __bad_mpf_size();
27125 +
27126 + while (length > 0) {
27127 + mpf = (struct intel_mp_floating *)bp;
27128 + if ((*bp == SMP_MAGIC_IDENT) &&
27129 + (mpf->mpf_length == 1) &&
27130 + !mpf_checksum((unsigned char *)bp, 16) &&
27131 + ((mpf->mpf_specification == 1)
27132 + || (mpf->mpf_specification == 4)) ) {
27133 +
27134 + smp_found_config = 1;
27135 + mpf_found = mpf;
27136 + return 1;
27137 + }
27138 + bp += 4;
27139 + length -= 16;
27140 + }
27141 + return 0;
27142 +}
27143 +
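smp_scan_config() above walks a memory range in 16-byte steps looking for the MP floating pointer structure: the "_MP_" signature, a length of one 16-byte unit, a spec revision of 1 or 4, and a byte checksum of zero. A stand-alone sketch of that scan over synthetic data (the offsets used for the length, revision and checksum bytes follow the MP 1.4 layout, but the buffer and values are purely illustrative):

#include <stdio.h>
#include <string.h>

/* Sum of all bytes modulo 256; zero means the checksum is valid,
 * matching the sense of the mpf_checksum() check above. */
static unsigned char byte_sum(const unsigned char *p, int len)
{
	unsigned char sum = 0;

	while (len--)
		sum += *p++;
	return sum;
}

int main(void)
{
	unsigned char area[64] = { 0 };
	unsigned char *mpf = area + 32;
	int off;

	memcpy(mpf, "_MP_", 4);				/* signature */
	mpf[8] = 1;					/* length: 1 (16-byte units) */
	mpf[9] = 4;					/* specification: MP 1.4 */
	mpf[10] = (unsigned char)-byte_sum(mpf, 16);	/* fix up checksum byte */

	for (off = 0; off + 16 <= (int)sizeof(area); off += 16)
		if (!memcmp(area + off, "_MP_", 4) && !byte_sum(area + off, 16))
			printf("floating pointer found at offset %d\n", off);
	return 0;
}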
27144 +void __init find_intel_smp (void)
27145 +{
27146 + unsigned int address;
27147 +
27148 + /*
27149 +	 * FIXME: Linux assumes you have 640K of base RAM..
27150 + * this continues the error...
27151 + *
27152 + * 1) Scan the bottom 1K for a signature
27153 + * 2) Scan the top 1K of base RAM
27154 +	 * 3) Scan the 64K of BIOS
27155 + */
27156 + if (smp_scan_config(0x0,0x400) ||
27157 + smp_scan_config(639*0x400,0x400) ||
27158 + smp_scan_config(0xF0000,0x10000))
27159 + return;
27160 + /*
27161 + * If it is an SMP machine we should know now, unless the
27162 + * configuration is in an EISA/MCA bus machine with an
27163 +	 * extended BIOS data area.
27164 + *
27165 +	 * There is a real-mode segmented pointer pointing to the
27166 +	 * 4K EBDA area at 0x40E; calculate and scan it here.
27167 + *
27168 + * NOTE! There are Linux loaders that will corrupt the EBDA
27169 + * area, and as such this kind of SMP config may be less
27170 + * trustworthy, simply because the SMP table may have been
27171 + * stomped on during early boot. These loaders are buggy and
27172 + * should be fixed.
27173 + */
27174 +
27175 + address = *(unsigned short *)phys_to_virt(0x40E);
27176 + address <<= 4;
27177 + if (smp_scan_config(address, 0x1000))
27178 + return;
27179 +
27180 + /* If we have come this far, we did not find an MP table */
27181 + printk(KERN_INFO "No mptable found.\n");
27182 +}
27183 +
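The EBDA lookup above reads a 16-bit real-mode segment from physical address 0x40E and shifts it left by four to get the physical base of the extended BIOS data area. A tiny sketch of that conversion with a made-up segment value:

#include <stdio.h>

int main(void)
{
	unsigned short ebda_segment = 0x9FC0;	/* hypothetical BIOS value */
	unsigned long ebda_phys = (unsigned long)ebda_segment << 4;

	printf("EBDA at physical 0x%lx, scan 0x1000 bytes from there\n", ebda_phys);
	return 0;
}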
27184 +/*
27185 + * - Intel MP Configuration Table
27186 + */
27187 +void __init find_smp_config (void)
27188 +{
27189 +#ifdef CONFIG_X86_LOCAL_APIC
27190 + find_intel_smp();
27191 +#endif
27192 +}
27193 +
27194 +
27195 +/* --------------------------------------------------------------------------
27196 + ACPI-based MP Configuration
27197 + -------------------------------------------------------------------------- */
27198 +
27199 +#ifdef CONFIG_ACPI
27200 +
27201 +void __init mp_register_lapic_address (
27202 + u64 address)
27203 +{
27204 +#ifndef CONFIG_XEN
27205 + mp_lapic_addr = (unsigned long) address;
27206 +
27207 + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
27208 +
27209 + if (boot_cpu_id == -1U)
27210 + boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
27211 +
27212 + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
27213 +#endif
27214 +}
27215 +
27216 +
27217 +void __cpuinit mp_register_lapic (
27218 + u8 id,
27219 + u8 enabled)
27220 +{
27221 + struct mpc_config_processor processor;
27222 + int boot_cpu = 0;
27223 +
27224 + if (id >= MAX_APICS) {
27225 + printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
27226 + id, MAX_APICS);
27227 + return;
27228 + }
27229 +
27230 + if (id == boot_cpu_physical_apicid)
27231 + boot_cpu = 1;
27232 +
27233 +#ifndef CONFIG_XEN
27234 + processor.mpc_type = MP_PROCESSOR;
27235 + processor.mpc_apicid = id;
27236 + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
27237 + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
27238 + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
27239 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
27240 + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
27241 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
27242 + processor.mpc_reserved[0] = 0;
27243 + processor.mpc_reserved[1] = 0;
27244 +#endif
27245 +
27246 + MP_processor_info(&processor);
27247 +}
27248 +
27249 +#ifdef CONFIG_X86_IO_APIC
27250 +
27251 +#define MP_ISA_BUS 0
27252 +#define MP_MAX_IOAPIC_PIN 127
27253 +
27254 +static struct mp_ioapic_routing {
27255 + int apic_id;
27256 + int gsi_start;
27257 + int gsi_end;
27258 + u32 pin_programmed[4];
27259 +} mp_ioapic_routing[MAX_IO_APICS];
27260 +
27261 +
27262 +static int mp_find_ioapic (
27263 + int gsi)
27264 +{
27265 + int i = 0;
27266 +
27267 + /* Find the IOAPIC that manages this GSI. */
27268 + for (i = 0; i < nr_ioapics; i++) {
27269 + if ((gsi >= mp_ioapic_routing[i].gsi_start)
27270 + && (gsi <= mp_ioapic_routing[i].gsi_end))
27271 + return i;
27272 + }
27273 +
27274 + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
27275 +
27276 + return -1;
27277 +}
27278 +
27279 +
27280 +void __init mp_register_ioapic (
27281 + u8 id,
27282 + u32 address,
27283 + u32 gsi_base)
27284 +{
27285 + int idx = 0;
27286 +
27287 + if (nr_ioapics >= MAX_IO_APICS) {
27288 + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
27289 + "(found %d)\n", MAX_IO_APICS, nr_ioapics);
27290 + panic("Recompile kernel with bigger MAX_IO_APICS!\n");
27291 + }
27292 + if (!address) {
27293 + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
27294 + " found in MADT table, skipping!\n");
27295 + return;
27296 + }
27297 +
27298 + idx = nr_ioapics++;
27299 +
27300 + mp_ioapics[idx].mpc_type = MP_IOAPIC;
27301 + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
27302 + mp_ioapics[idx].mpc_apicaddr = address;
27303 +
27304 +#ifndef CONFIG_XEN
27305 + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
27306 +#endif
27307 + mp_ioapics[idx].mpc_apicid = id;
27308 + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
27309 +
27310 + /*
27311 + * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
27312 + * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
27313 + */
27314 + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
27315 + mp_ioapic_routing[idx].gsi_start = gsi_base;
27316 + mp_ioapic_routing[idx].gsi_end = gsi_base +
27317 + io_apic_get_redir_entries(idx);
27318 +
27319 + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
27320 + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
27321 + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
27322 + mp_ioapic_routing[idx].gsi_start,
27323 + mp_ioapic_routing[idx].gsi_end);
27324 +
27325 + return;
27326 +}
27327 +
27328 +
27329 +void __init mp_override_legacy_irq (
27330 + u8 bus_irq,
27331 + u8 polarity,
27332 + u8 trigger,
27333 + u32 gsi)
27334 +{
27335 + struct mpc_config_intsrc intsrc;
27336 + int ioapic = -1;
27337 + int pin = -1;
27338 +
27339 + /*
27340 + * Convert 'gsi' to 'ioapic.pin'.
27341 + */
27342 + ioapic = mp_find_ioapic(gsi);
27343 + if (ioapic < 0)
27344 + return;
27345 + pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
27346 +
27347 + /*
27348 + * TBD: This check is for faulty timer entries, where the override
27349 + * erroneously sets the trigger to level, resulting in a HUGE
27350 + * increase of timer interrupts!
27351 + */
27352 + if ((bus_irq == 0) && (trigger == 3))
27353 + trigger = 1;
27354 +
27355 + intsrc.mpc_type = MP_INTSRC;
27356 + intsrc.mpc_irqtype = mp_INT;
27357 + intsrc.mpc_irqflag = (trigger << 2) | polarity;
27358 + intsrc.mpc_srcbus = MP_ISA_BUS;
27359 + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
27360 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
27361 + intsrc.mpc_dstirq = pin; /* INTIN# */
27362 +
27363 + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
27364 + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
27365 + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
27366 + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
27367 +
27368 + mp_irqs[mp_irq_entries] = intsrc;
27369 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
27370 + panic("Max # of irq sources exceeded!\n");
27371 +
27372 + return;
27373 +}
27374 +
27375 +
27376 +void __init mp_config_acpi_legacy_irqs (void)
27377 +{
27378 + struct mpc_config_intsrc intsrc;
27379 + int i = 0;
27380 + int ioapic = -1;
27381 +
27382 + /*
27383 + * Fabricate the legacy ISA bus (bus #31).
27384 + */
27385 + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
27386 + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
27387 +
27388 + /*
27389 + * Locate the IOAPIC that manages the ISA IRQs (0-15).
27390 + */
27391 + ioapic = mp_find_ioapic(0);
27392 + if (ioapic < 0)
27393 + return;
27394 +
27395 + intsrc.mpc_type = MP_INTSRC;
27396 + intsrc.mpc_irqflag = 0; /* Conforming */
27397 + intsrc.mpc_srcbus = MP_ISA_BUS;
27398 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
27399 +
27400 + /*
27401 +	 * Use the default configuration for IRQs 0-15, unless
27402 + * overridden by (MADT) interrupt source override entries.
27403 + */
27404 + for (i = 0; i < 16; i++) {
27405 + int idx;
27406 +
27407 + for (idx = 0; idx < mp_irq_entries; idx++) {
27408 + struct mpc_config_intsrc *irq = mp_irqs + idx;
27409 +
27410 + /* Do we already have a mapping for this ISA IRQ? */
27411 + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
27412 + break;
27413 +
27414 +			/* Do we already have a mapping for this IOAPIC pin? */
27415 + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
27416 + (irq->mpc_dstirq == i))
27417 + break;
27418 + }
27419 +
27420 + if (idx != mp_irq_entries) {
27421 + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
27422 + continue; /* IRQ already used */
27423 + }
27424 +
27425 + intsrc.mpc_irqtype = mp_INT;
27426 + intsrc.mpc_srcbusirq = i; /* Identity mapped */
27427 + intsrc.mpc_dstirq = i;
27428 +
27429 + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
27430 + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
27431 + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
27432 + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
27433 + intsrc.mpc_dstirq);
27434 +
27435 + mp_irqs[mp_irq_entries] = intsrc;
27436 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
27437 + panic("Max # of irq sources exceeded!\n");
27438 + }
27439 +
27440 + return;
27441 +}
27442 +
27443 +#define MAX_GSI_NUM 4096
27444 +
27445 +int mp_register_gsi(u32 gsi, int triggering, int polarity)
27446 +{
27447 + int ioapic = -1;
27448 + int ioapic_pin = 0;
27449 + int idx, bit = 0;
27450 + static int pci_irq = 16;
27451 + /*
27452 + * Mapping between Global System Interrupts, which
27453 + * represent all possible interrupts, to the IRQs
27454 + * assigned to actual devices.
27455 + */
27456 + static int gsi_to_irq[MAX_GSI_NUM];
27457 +
27458 + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
27459 + return gsi;
27460 +
27461 + /* Don't set up the ACPI SCI because it's already set up */
27462 + if (acpi_fadt.sci_int == gsi)
27463 + return gsi;
27464 +
27465 + ioapic = mp_find_ioapic(gsi);
27466 + if (ioapic < 0) {
27467 + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
27468 + return gsi;
27469 + }
27470 +
27471 + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
27472 +
27473 + /*
27474 + * Avoid pin reprogramming. PRTs typically include entries
27475 + * with redundant pin->gsi mappings (but unique PCI devices);
27476 + * we only program the IOAPIC on the first.
27477 + */
27478 + bit = ioapic_pin % 32;
27479 + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
27480 + if (idx > 3) {
27481 + printk(KERN_ERR "Invalid reference to IOAPIC pin "
27482 + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
27483 + ioapic_pin);
27484 + return gsi;
27485 + }
27486 + if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
27487 + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
27488 + mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
27489 + return gsi_to_irq[gsi];
27490 + }
27491 +
27492 + mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
27493 +
27494 + if (triggering == ACPI_LEVEL_SENSITIVE) {
27495 + /*
27496 + * For PCI devices assign IRQs in order, avoiding gaps
27497 + * due to unused I/O APIC pins.
27498 + */
27499 + int irq = gsi;
27500 + if (gsi < MAX_GSI_NUM) {
27501 + /*
27502 + * Retain the VIA chipset work-around (gsi > 15), but
27503 +			 * avoid a problem where the 8254 timer (IRQ0) is set up
27504 + * via an override (so it's not on pin 0 of the ioapic),
27505 + * and at the same time, the pin 0 interrupt is a PCI
27506 + * type. The gsi > 15 test could cause these two pins
27507 + * to be shared as IRQ0, and they are not shareable.
27508 + * So test for this condition, and if necessary, avoid
27509 + * the pin collision.
27510 + */
27511 + if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
27512 + gsi = pci_irq++;
27513 + /*
27514 + * Don't assign IRQ used by ACPI SCI
27515 + */
27516 + if (gsi == acpi_fadt.sci_int)
27517 + gsi = pci_irq++;
27518 + gsi_to_irq[irq] = gsi;
27519 + } else {
27520 + printk(KERN_ERR "GSI %u is too high\n", gsi);
27521 + return gsi;
27522 + }
27523 + }
27524 +
27525 + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
27526 + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
27527 + polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
27528 + return gsi;
27529 +}
27530 +
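mp_register_gsi() above tracks up to 128 IO-APIC pins in the four 32-bit pin_programmed words, effectively using idx = pin / 32 and bit = pin % 32, so each pin is only routed once. A stand-alone sketch of that bookkeeping (claim_pin and the sample pin numbers are illustrative):

#include <stdio.h>

#define MAX_PIN 127

static unsigned int pin_programmed[4];

/* Returns 1 the first time a pin is claimed, 0 when it was already
 * programmed, -1 when the pin number is out of range. */
static int claim_pin(int pin)
{
	int idx = pin / 32, bit = pin % 32;

	if (pin < 0 || pin > MAX_PIN)
		return -1;
	if (pin_programmed[idx] & (1u << bit))
		return 0;
	pin_programmed[idx] |= 1u << bit;
	return 1;
}

int main(void)
{
	printf("%d %d %d\n", claim_pin(5), claim_pin(5), claim_pin(100));
	return 0;
}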
27531 +#endif /*CONFIG_X86_IO_APIC*/
27532 +#endif /*CONFIG_ACPI*/
27533 Index: head-2008-11-25/arch/x86/kernel/process_64-xen.c
27534 ===================================================================
27535 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
27536 +++ head-2008-11-25/arch/x86/kernel/process_64-xen.c 2008-08-07 12:44:36.000000000 +0200
27537 @@ -0,0 +1,848 @@
27538 +/*
27539 + * linux/arch/x86-64/kernel/process.c
27540 + *
27541 + * Copyright (C) 1995 Linus Torvalds
27542 + *
27543 + * Pentium III FXSR, SSE support
27544 + * Gareth Hughes <gareth@valinux.com>, May 2000
27545 + *
27546 + * X86-64 port
27547 + * Andi Kleen.
27548 + *
27549 + * CPU hotplug support - ashok.raj@intel.com
27550 + *
27551 + * Jun Nakajima <jun.nakajima@intel.com>
27552 + * Modified for Xen
27553 + */
27554 +
27555 +/*
27556 + * This file handles the architecture-dependent parts of process handling..
27557 + */
27558 +
27559 +#include <stdarg.h>
27560 +
27561 +#include <linux/cpu.h>
27562 +#include <linux/errno.h>
27563 +#include <linux/sched.h>
27564 +#include <linux/kernel.h>
27565 +#include <linux/mm.h>
27566 +#include <linux/elfcore.h>
27567 +#include <linux/smp.h>
27568 +#include <linux/slab.h>
27569 +#include <linux/user.h>
27570 +#include <linux/module.h>
27571 +#include <linux/a.out.h>
27572 +#include <linux/interrupt.h>
27573 +#include <linux/delay.h>
27574 +#include <linux/ptrace.h>
27575 +#include <linux/utsname.h>
27576 +#include <linux/random.h>
27577 +#include <linux/notifier.h>
27578 +#include <linux/kprobes.h>
27579 +
27580 +#include <asm/uaccess.h>
27581 +#include <asm/pgtable.h>
27582 +#include <asm/system.h>
27583 +#include <asm/io.h>
27584 +#include <asm/processor.h>
27585 +#include <asm/i387.h>
27586 +#include <asm/mmu_context.h>
27587 +#include <asm/pda.h>
27588 +#include <asm/prctl.h>
27589 +#include <asm/kdebug.h>
27590 +#include <xen/interface/platform.h>
27591 +#include <xen/interface/physdev.h>
27592 +#include <xen/interface/vcpu.h>
27593 +#include <asm/desc.h>
27594 +#include <asm/proto.h>
27595 +#include <asm/hardirq.h>
27596 +#include <asm/ia32.h>
27597 +#include <asm/idle.h>
27598 +
27599 +#include <xen/cpu_hotplug.h>
27600 +
27601 +asmlinkage extern void ret_from_fork(void);
27602 +
27603 +unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
27604 +
27605 +unsigned long boot_option_idle_override = 0;
27606 +EXPORT_SYMBOL(boot_option_idle_override);
27607 +
27608 +/*
27609 + * Power management idle function, if any.
27610 + */
27611 +void (*pm_idle)(void);
27612 +EXPORT_SYMBOL(pm_idle);
27613 +static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
27614 +
27615 +static ATOMIC_NOTIFIER_HEAD(idle_notifier);
27616 +
27617 +void idle_notifier_register(struct notifier_block *n)
27618 +{
27619 + atomic_notifier_chain_register(&idle_notifier, n);
27620 +}
27621 +EXPORT_SYMBOL_GPL(idle_notifier_register);
27622 +
27623 +void idle_notifier_unregister(struct notifier_block *n)
27624 +{
27625 + atomic_notifier_chain_unregister(&idle_notifier, n);
27626 +}
27627 +EXPORT_SYMBOL(idle_notifier_unregister);
27628 +
27629 +enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
27630 +static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
27631 +
27632 +void enter_idle(void)
27633 +{
27634 + __get_cpu_var(idle_state) = CPU_IDLE;
27635 + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
27636 +}
27637 +
27638 +static void __exit_idle(void)
27639 +{
27640 + __get_cpu_var(idle_state) = CPU_NOT_IDLE;
27641 + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
27642 +}
27643 +
27644 +/* Called from interrupts to signify idle end */
27645 +void exit_idle(void)
27646 +{
27647 + if (current->pid | read_pda(irqcount))
27648 + return;
27649 + __exit_idle();
27650 +}
27651 +
27652 +/*
27653 + * On SMP it's slightly faster (but much more power-consuming!)
27654 + * to poll the ->need_resched flag instead of waiting for the
27655 + * cross-CPU IPI to arrive. Use this option with caution.
27656 + */
27657 +static void poll_idle (void)
27658 +{
27659 + local_irq_enable();
27660 +
27661 + asm volatile(
27662 + "2:"
27663 + "testl %0,%1;"
27664 + "rep; nop;"
27665 + "je 2b;"
27666 + : :
27667 + "i" (_TIF_NEED_RESCHED),
27668 + "m" (current_thread_info()->flags));
27669 +}
27670 +
27671 +static void xen_idle(void)
27672 +{
27673 + local_irq_disable();
27674 +
27675 + if (need_resched())
27676 + local_irq_enable();
27677 + else {
27678 + current_thread_info()->status &= ~TS_POLLING;
27679 + smp_mb__after_clear_bit();
27680 + safe_halt();
27681 + current_thread_info()->status |= TS_POLLING;
27682 + }
27683 +}
27684 +
27685 +#ifdef CONFIG_HOTPLUG_CPU
27686 +static inline void play_dead(void)
27687 +{
27688 + idle_task_exit();
27689 + local_irq_disable();
27690 + cpu_clear(smp_processor_id(), cpu_initialized);
27691 + preempt_enable_no_resched();
27692 + VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
27693 + cpu_bringup();
27694 +}
27695 +#else
27696 +static inline void play_dead(void)
27697 +{
27698 + BUG();
27699 +}
27700 +#endif /* CONFIG_HOTPLUG_CPU */
27701 +
27702 +/*
27703 + * The idle thread. There's no useful work to be
27704 + * done, so just try to conserve power and have a
27705 + * low exit latency (ie sit in a loop waiting for
27706 + * somebody to say that they'd like to reschedule)
27707 + */
27708 +void cpu_idle (void)
27709 +{
27710 + current_thread_info()->status |= TS_POLLING;
27711 + /* endless idle loop with no priority at all */
27712 + while (1) {
27713 + while (!need_resched()) {
27714 + void (*idle)(void);
27715 +
27716 + if (__get_cpu_var(cpu_idle_state))
27717 + __get_cpu_var(cpu_idle_state) = 0;
27718 + rmb();
27719 + idle = xen_idle; /* no alternatives */
27720 + if (cpu_is_offline(smp_processor_id()))
27721 + play_dead();
27722 + enter_idle();
27723 + idle();
27724 + __exit_idle();
27725 + }
27726 +
27727 + preempt_enable_no_resched();
27728 + schedule();
27729 + preempt_disable();
27730 + }
27731 +}
27732 +
27733 +void cpu_idle_wait(void)
27734 +{
27735 + unsigned int cpu, this_cpu = get_cpu();
27736 + cpumask_t map;
27737 +
27738 + set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
27739 + put_cpu();
27740 +
27741 + cpus_clear(map);
27742 + for_each_online_cpu(cpu) {
27743 + per_cpu(cpu_idle_state, cpu) = 1;
27744 + cpu_set(cpu, map);
27745 + }
27746 +
27747 + __get_cpu_var(cpu_idle_state) = 0;
27748 +
27749 + wmb();
27750 + do {
27751 + ssleep(1);
27752 + for_each_online_cpu(cpu) {
27753 + if (cpu_isset(cpu, map) &&
27754 + !per_cpu(cpu_idle_state, cpu))
27755 + cpu_clear(cpu, map);
27756 + }
27757 + cpus_and(map, map, cpu_online_map);
27758 + } while (!cpus_empty(map));
27759 +}
27760 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
27761 +
27762 +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
27763 +{
27764 +}
27765 +
27766 +static int __init idle_setup (char *str)
27767 +{
27768 + if (!strncmp(str, "poll", 4)) {
27769 + printk("using polling idle threads.\n");
27770 + pm_idle = poll_idle;
27771 + }
27772 +
27773 + boot_option_idle_override = 1;
27774 + return 1;
27775 +}
27776 +
27777 +__setup("idle=", idle_setup);
27778 +
27779 +/* Prints also some state that isn't saved in the pt_regs */
27780 +void __show_regs(struct pt_regs * regs)
27781 +{
27782 + unsigned long fs, gs, shadowgs;
27783 + unsigned int fsindex,gsindex;
27784 + unsigned int ds,cs,es;
27785 +
27786 + printk("\n");
27787 + print_modules();
27788 + printk("Pid: %d, comm: %.20s %s %s %.*s\n",
27789 + current->pid, current->comm, print_tainted(),
27790 + system_utsname.release,
27791 + (int)strcspn(system_utsname.version, " "),
27792 + system_utsname.version);
27793 + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
27794 + printk_address(regs->rip);
27795 + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
27796 + regs->eflags);
27797 + printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
27798 + regs->rax, regs->rbx, regs->rcx);
27799 + printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
27800 + regs->rdx, regs->rsi, regs->rdi);
27801 + printk("RBP: %016lx R08: %016lx R09: %016lx\n",
27802 + regs->rbp, regs->r8, regs->r9);
27803 + printk("R10: %016lx R11: %016lx R12: %016lx\n",
27804 + regs->r10, regs->r11, regs->r12);
27805 + printk("R13: %016lx R14: %016lx R15: %016lx\n",
27806 + regs->r13, regs->r14, regs->r15);
27807 +
27808 + asm("mov %%ds,%0" : "=r" (ds));
27809 + asm("mov %%cs,%0" : "=r" (cs));
27810 + asm("mov %%es,%0" : "=r" (es));
27811 + asm("mov %%fs,%0" : "=r" (fsindex));
27812 + asm("mov %%gs,%0" : "=r" (gsindex));
27813 +
27814 + rdmsrl(MSR_FS_BASE, fs);
27815 + rdmsrl(MSR_GS_BASE, gs);
27816 + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
27817 +
27818 + printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
27819 + fs,fsindex,gs,gsindex,shadowgs);
27820 + printk("CS: %04x DS: %04x ES: %04x\n", cs, ds, es);
27821 +
27822 +}
27823 +
27824 +void show_regs(struct pt_regs *regs)
27825 +{
27826 + printk("CPU %d:", smp_processor_id());
27827 + __show_regs(regs);
27828 + show_trace(NULL, regs, (void *)(regs + 1));
27829 +}
27830 +
27831 +/*
27832 + * Free current thread data structures etc..
27833 + */
27834 +void exit_thread(void)
27835 +{
27836 + struct task_struct *me = current;
27837 + struct thread_struct *t = &me->thread;
27838 +
27839 + if (me->thread.io_bitmap_ptr) {
27840 +#ifndef CONFIG_X86_NO_TSS
27841 + struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
27842 +#endif
27843 +#ifdef CONFIG_XEN
27844 + struct physdev_set_iobitmap iobmp_op;
27845 + memset(&iobmp_op, 0, sizeof(iobmp_op));
27846 +#endif
27847 +
27848 + kfree(t->io_bitmap_ptr);
27849 + t->io_bitmap_ptr = NULL;
27850 + /*
27851 + * Careful, clear this in the TSS too:
27852 + */
27853 +#ifndef CONFIG_X86_NO_TSS
27854 + memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
27855 + put_cpu();
27856 +#endif
27857 +#ifdef CONFIG_XEN
27858 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
27859 + &iobmp_op));
27860 +#endif
27861 + t->io_bitmap_max = 0;
27862 + }
27863 +}
27864 +
27865 +void load_gs_index(unsigned gs)
27866 +{
27867 + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
27868 +}
27869 +
27870 +void flush_thread(void)
27871 +{
27872 + struct task_struct *tsk = current;
27873 + struct thread_info *t = current_thread_info();
27874 +
27875 + if (t->flags & _TIF_ABI_PENDING) {
27876 + t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
27877 + if (t->flags & _TIF_IA32)
27878 + current_thread_info()->status |= TS_COMPAT;
27879 + }
27880 +
27881 + tsk->thread.debugreg0 = 0;
27882 + tsk->thread.debugreg1 = 0;
27883 + tsk->thread.debugreg2 = 0;
27884 + tsk->thread.debugreg3 = 0;
27885 + tsk->thread.debugreg6 = 0;
27886 + tsk->thread.debugreg7 = 0;
27887 + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
27888 + /*
27889 + * Forget coprocessor state..
27890 + */
27891 + clear_fpu(tsk);
27892 + clear_used_math();
27893 +}
27894 +
27895 +void release_thread(struct task_struct *dead_task)
27896 +{
27897 + if (dead_task->mm) {
27898 + if (dead_task->mm->context.size) {
27899 + printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
27900 + dead_task->comm,
27901 + dead_task->mm->context.ldt,
27902 + dead_task->mm->context.size);
27903 + BUG();
27904 + }
27905 + }
27906 +}
27907 +
27908 +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
27909 +{
27910 + struct user_desc ud = {
27911 + .base_addr = addr,
27912 + .limit = 0xfffff,
27913 + .seg_32bit = 1,
27914 + .limit_in_pages = 1,
27915 + .useable = 1,
27916 + };
27917 + struct n_desc_struct *desc = (void *)t->thread.tls_array;
27918 + desc += tls;
27919 + desc->a = LDT_entry_a(&ud);
27920 + desc->b = LDT_entry_b(&ud);
27921 +}
27922 +
27923 +static inline u32 read_32bit_tls(struct task_struct *t, int tls)
27924 +{
27925 + struct desc_struct *desc = (void *)t->thread.tls_array;
27926 + desc += tls;
27927 + return desc->base0 |
27928 + (((u32)desc->base1) << 16) |
27929 + (((u32)desc->base2) << 24);
27930 +}
27931 +
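set_32bit_tls() and read_32bit_tls() above split and reassemble a 32-bit base address across the base0/base1/base2 fields of an x86 descriptor (bits 0-15, 16-23 and 24-31). A minimal sketch of that reassembly using a fake descriptor type (the struct below is illustrative, not the kernel's desc_struct):

#include <stdio.h>
#include <stdint.h>

struct fake_desc {		/* illustrative layout of just the base fields */
	uint16_t base0;		/* base bits 0-15  */
	uint8_t  base1;		/* base bits 16-23 */
	uint8_t  base2;		/* base bits 24-31 */
};

static uint32_t desc_base(const struct fake_desc *d)
{
	return d->base0 | ((uint32_t)d->base1 << 16) | ((uint32_t)d->base2 << 24);
}

int main(void)
{
	uint32_t base = 0x12345678;
	struct fake_desc d = { base & 0xffff, (base >> 16) & 0xff, base >> 24 };

	printf("reassembled base = 0x%x\n", (unsigned)desc_base(&d));
	return 0;
}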
27932 +/*
27933 + * This gets called before we allocate a new thread and copy
27934 + * the current task into it.
27935 + */
27936 +void prepare_to_copy(struct task_struct *tsk)
27937 +{
27938 + unlazy_fpu(tsk);
27939 +}
27940 +
27941 +int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
27942 + unsigned long unused,
27943 + struct task_struct * p, struct pt_regs * regs)
27944 +{
27945 + int err;
27946 + struct pt_regs * childregs;
27947 + struct task_struct *me = current;
27948 +
27949 + childregs = ((struct pt_regs *)
27950 + (THREAD_SIZE + task_stack_page(p))) - 1;
27951 + *childregs = *regs;
27952 +
27953 + childregs->rax = 0;
27954 + childregs->rsp = rsp;
27955 + if (rsp == ~0UL)
27956 + childregs->rsp = (unsigned long)childregs;
27957 +
27958 + p->thread.rsp = (unsigned long) childregs;
27959 + p->thread.rsp0 = (unsigned long) (childregs+1);
27960 + p->thread.userrsp = me->thread.userrsp;
27961 +
27962 + set_tsk_thread_flag(p, TIF_FORK);
27963 +
27964 + p->thread.fs = me->thread.fs;
27965 + p->thread.gs = me->thread.gs;
27966 +
27967 + asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
27968 + asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
27969 + asm("mov %%es,%0" : "=m" (p->thread.es));
27970 + asm("mov %%ds,%0" : "=m" (p->thread.ds));
27971 +
27972 + if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
27973 + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
27974 + if (!p->thread.io_bitmap_ptr) {
27975 + p->thread.io_bitmap_max = 0;
27976 + return -ENOMEM;
27977 + }
27978 + memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
27979 + IO_BITMAP_BYTES);
27980 + }
27981 +
27982 + /*
27983 + * Set a new TLS for the child thread?
27984 + */
27985 + if (clone_flags & CLONE_SETTLS) {
27986 +#ifdef CONFIG_IA32_EMULATION
27987 + if (test_thread_flag(TIF_IA32))
27988 + err = ia32_child_tls(p, childregs);
27989 + else
27990 +#endif
27991 + err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
27992 + if (err)
27993 + goto out;
27994 + }
27995 + p->thread.iopl = current->thread.iopl;
27996 +
27997 + err = 0;
27998 +out:
27999 + if (err && p->thread.io_bitmap_ptr) {
28000 + kfree(p->thread.io_bitmap_ptr);
28001 + p->thread.io_bitmap_max = 0;
28002 + }
28003 + return err;
28004 +}
28005 +
28006 +static inline void __save_init_fpu( struct task_struct *tsk )
28007 +{
28008 + asm volatile( "rex64 ; fxsave %0 ; fnclex"
28009 + : "=m" (tsk->thread.i387.fxsave));
28010 + tsk->thread_info->status &= ~TS_USEDFPU;
28011 +}
28012 +
28013 +/*
28014 + * switch_to(x,y) should switch tasks from x to y.
28015 + *
28016 + * This could still be optimized:
28017 + * - fold all the options into a flag word and test it with a single test.
28018 + * - could test fs/gs bitsliced
28019 + *
28020 + * Kprobes not supported here. Set the probe on schedule instead.
28021 + */
28022 +__kprobes struct task_struct *
28023 +__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
28024 +{
28025 + struct thread_struct *prev = &prev_p->thread,
28026 + *next = &next_p->thread;
28027 + int cpu = smp_processor_id();
28028 +#ifndef CONFIG_X86_NO_TSS
28029 + struct tss_struct *tss = &per_cpu(init_tss, cpu);
28030 +#endif
28031 +#if CONFIG_XEN_COMPAT > 0x030002
28032 + struct physdev_set_iopl iopl_op;
28033 + struct physdev_set_iobitmap iobmp_op;
28034 +#else
28035 + struct physdev_op _pdo[2], *pdo = _pdo;
28036 +#define iopl_op pdo->u.set_iopl
28037 +#define iobmp_op pdo->u.set_iobitmap
28038 +#endif
28039 + multicall_entry_t _mcl[8], *mcl = _mcl;
28040 +
28041 + /*
28042 + * This is basically '__unlazy_fpu', except that we queue a
28043 + * multicall to indicate FPU task switch, rather than
28044 + * synchronously trapping to Xen.
28045 + * The AMD workaround requires it to be after DS reload, or
28046 + * after DS has been cleared, which we do in __prepare_arch_switch.
28047 + */
28048 + if (prev_p->thread_info->status & TS_USEDFPU) {
28049 + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
28050 + mcl->op = __HYPERVISOR_fpu_taskswitch;
28051 + mcl->args[0] = 1;
28052 + mcl++;
28053 + }
28054 +
28055 + /*
28056 + * Reload esp0, LDT and the page table pointer:
28057 + */
28058 + mcl->op = __HYPERVISOR_stack_switch;
28059 + mcl->args[0] = __KERNEL_DS;
28060 + mcl->args[1] = next->rsp0;
28061 + mcl++;
28062 +
28063 + /*
28064 + * Load the per-thread Thread-Local Storage descriptor.
28065 + * This is load_TLS(next, cpu) with multicalls.
28066 + */
28067 +#define C(i) do { \
28068 + if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \
28069 + mcl->op = __HYPERVISOR_update_descriptor; \
28070 + mcl->args[0] = virt_to_machine( \
28071 + &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \
28072 + mcl->args[1] = next->tls_array[i]; \
28073 + mcl++; \
28074 + } \
28075 +} while (0)
28076 + C(0); C(1); C(2);
28077 +#undef C
28078 +
28079 + if (unlikely(prev->iopl != next->iopl)) {
28080 + iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
28081 +#if CONFIG_XEN_COMPAT > 0x030002
28082 + mcl->op = __HYPERVISOR_physdev_op;
28083 + mcl->args[0] = PHYSDEVOP_set_iopl;
28084 + mcl->args[1] = (unsigned long)&iopl_op;
28085 +#else
28086 + mcl->op = __HYPERVISOR_physdev_op_compat;
28087 + pdo->cmd = PHYSDEVOP_set_iopl;
28088 + mcl->args[0] = (unsigned long)pdo++;
28089 +#endif
28090 + mcl++;
28091 + }
28092 +
28093 + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
28094 + set_xen_guest_handle(iobmp_op.bitmap,
28095 + (char *)next->io_bitmap_ptr);
28096 + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
28097 +#if CONFIG_XEN_COMPAT > 0x030002
28098 + mcl->op = __HYPERVISOR_physdev_op;
28099 + mcl->args[0] = PHYSDEVOP_set_iobitmap;
28100 + mcl->args[1] = (unsigned long)&iobmp_op;
28101 +#else
28102 + mcl->op = __HYPERVISOR_physdev_op_compat;
28103 + pdo->cmd = PHYSDEVOP_set_iobitmap;
28104 + mcl->args[0] = (unsigned long)pdo++;
28105 +#endif
28106 + mcl++;
28107 + }
28108 +
28109 +#if CONFIG_XEN_COMPAT <= 0x030002
28110 + BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo));
28111 +#endif
28112 + BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
28113 + if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
28114 + BUG();
28115 +
28116 + /*
28117 + * Switch DS and ES.
28118 + * This won't pick up thread selector changes, but I guess that is ok.
28119 + */
28120 + if (unlikely(next->es))
28121 + loadsegment(es, next->es);
28122 +
28123 + if (unlikely(next->ds))
28124 + loadsegment(ds, next->ds);
28125 +
28126 + /*
28127 + * Switch FS and GS.
28128 + */
28129 + if (unlikely(next->fsindex))
28130 + loadsegment(fs, next->fsindex);
28131 +
28132 + if (next->fs)
28133 + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs));
28134 +
28135 + if (unlikely(next->gsindex))
28136 + load_gs_index(next->gsindex);
28137 +
28138 + if (next->gs)
28139 + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs));
28140 +
28141 + /*
28142 + * Switch the PDA context.
28143 + */
28144 + prev->userrsp = read_pda(oldrsp);
28145 + write_pda(oldrsp, next->userrsp);
28146 + write_pda(pcurrent, next_p);
28147 + write_pda(kernelstack,
28148 + task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
28149 +
28150 + /*
28151 + * Now maybe reload the debug registers
28152 + */
28153 + if (unlikely(next->debugreg7)) {
28154 + set_debugreg(next->debugreg0, 0);
28155 + set_debugreg(next->debugreg1, 1);
28156 + set_debugreg(next->debugreg2, 2);
28157 + set_debugreg(next->debugreg3, 3);
28158 + /* no 4 and 5 */
28159 + set_debugreg(next->debugreg6, 6);
28160 + set_debugreg(next->debugreg7, 7);
28161 + }
28162 +
28163 + return prev_p;
28164 +}
28165 +
28166 +/*
28167 + * sys_execve() executes a new program.
28168 + */
28169 +asmlinkage
28170 +long sys_execve(char __user *name, char __user * __user *argv,
28171 + char __user * __user *envp, struct pt_regs regs)
28172 +{
28173 + long error;
28174 + char * filename;
28175 +
28176 + filename = getname(name);
28177 + error = PTR_ERR(filename);
28178 + if (IS_ERR(filename))
28179 + return error;
28180 + error = do_execve(filename, argv, envp, &regs);
28181 + if (error == 0) {
28182 + task_lock(current);
28183 + current->ptrace &= ~PT_DTRACE;
28184 + task_unlock(current);
28185 + }
28186 + putname(filename);
28187 + return error;
28188 +}
28189 +
28190 +void set_personality_64bit(void)
28191 +{
28192 + /* inherit personality from parent */
28193 +
28194 + /* Make sure to be in 64bit mode */
28195 + clear_thread_flag(TIF_IA32);
28196 +
28197 + /* TBD: overwrites user setup. Should have two bits.
28198 +	   But 64-bit processes have always behaved this way,
28199 +	   so it's not too bad. The main problem is just that
28200 +	   32-bit children are affected again. */
28201 + current->personality &= ~READ_IMPLIES_EXEC;
28202 +}
28203 +
28204 +asmlinkage long sys_fork(struct pt_regs *regs)
28205 +{
28206 + return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
28207 +}
28208 +
28209 +asmlinkage long
28210 +sys_clone(unsigned long clone_flags, unsigned long newsp,
28211 + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
28212 +{
28213 + if (!newsp)
28214 + newsp = regs->rsp;
28215 + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
28216 +}
28217 +
28218 +/*
28219 + * This is trivial, and on the face of it looks like it
28220 + * could equally well be done in user mode.
28221 + *
28222 + * Not so, for quite unobvious reasons - register pressure.
28223 + * In user mode vfork() cannot have a stack frame, and if
28224 + * done by calling the "clone()" system call directly, you
28225 + * do not have enough call-clobbered registers to hold all
28226 + * the information you need.
28227 + */
28228 +asmlinkage long sys_vfork(struct pt_regs *regs)
28229 +{
28230 + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
28231 + NULL, NULL);
28232 +}
28233 +
28234 +unsigned long get_wchan(struct task_struct *p)
28235 +{
28236 + unsigned long stack;
28237 + u64 fp,rip;
28238 + int count = 0;
28239 +
28240 + if (!p || p == current || p->state==TASK_RUNNING)
28241 + return 0;
28242 + stack = (unsigned long)task_stack_page(p);
28243 + if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
28244 + return 0;
28245 + fp = *(u64 *)(p->thread.rsp);
28246 + do {
28247 + if (fp < (unsigned long)stack ||
28248 + fp > (unsigned long)stack+THREAD_SIZE)
28249 + return 0;
28250 + rip = *(u64 *)(fp+8);
28251 + if (!in_sched_functions(rip))
28252 + return rip;
28253 + fp = *(u64 *)fp;
28254 + } while (count++ < 16);
28255 + return 0;
28256 +}
28257 +
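get_wchan() above walks the saved frame-pointer chain: the return address lives at fp + 8 and the previous frame pointer at *fp, bounded by the task's stack page and a 16-frame limit. A rough user-space sketch of that walk over a synthetic stack (all addresses and "rip" values here are made up):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t stack[8] = { 0 };
	uint64_t fp;
	int count = 0;

	/* Frame 1 at &stack[4]: saved fp points to frame 0, rip = 0x1111. */
	stack[4] = (uint64_t)(uintptr_t)&stack[0];
	stack[5] = 0x1111;
	/* Frame 0 at &stack[0]: end of chain (saved fp = 0), rip = 0x2222. */
	stack[0] = 0;
	stack[1] = 0x2222;

	fp = (uint64_t)(uintptr_t)&stack[4];
	while (fp && count++ < 16) {
		uint64_t rip = *(uint64_t *)(uintptr_t)(fp + 8);

		printf("frame %d: rip = 0x%llx\n", count, (unsigned long long)rip);
		fp = *(uint64_t *)(uintptr_t)fp;	/* follow saved frame pointer */
	}
	return 0;
}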
28258 +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
28259 +{
28260 + int ret = 0;
28261 + int doit = task == current;
28262 + int cpu;
28263 +
28264 + switch (code) {
28265 + case ARCH_SET_GS:
28266 + if (addr >= TASK_SIZE_OF(task))
28267 + return -EPERM;
28268 + cpu = get_cpu();
28269 + /* handle small bases via the GDT because that's faster to
28270 + switch. */
28271 + if (addr <= 0xffffffff) {
28272 + set_32bit_tls(task, GS_TLS, addr);
28273 + if (doit) {
28274 + load_TLS(&task->thread, cpu);
28275 + load_gs_index(GS_TLS_SEL);
28276 + }
28277 + task->thread.gsindex = GS_TLS_SEL;
28278 + task->thread.gs = 0;
28279 + } else {
28280 + task->thread.gsindex = 0;
28281 + task->thread.gs = addr;
28282 + if (doit) {
28283 + load_gs_index(0);
28284 + ret = HYPERVISOR_set_segment_base(
28285 + SEGBASE_GS_USER, addr);
28286 + }
28287 + }
28288 + put_cpu();
28289 + break;
28290 + case ARCH_SET_FS:
28291 + /* Not strictly needed for fs, but do it for symmetry
28292 + with gs */
28293 + if (addr >= TASK_SIZE_OF(task))
28294 + return -EPERM;
28295 + cpu = get_cpu();
28296 + /* handle small bases via the GDT because that's faster to
28297 + switch. */
28298 + if (addr <= 0xffffffff) {
28299 + set_32bit_tls(task, FS_TLS, addr);
28300 + if (doit) {
28301 + load_TLS(&task->thread, cpu);
28302 + asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
28303 + }
28304 + task->thread.fsindex = FS_TLS_SEL;
28305 + task->thread.fs = 0;
28306 + } else {
28307 + task->thread.fsindex = 0;
28308 + task->thread.fs = addr;
28309 + if (doit) {
28310 + /* set the selector to 0 to not confuse
28311 + __switch_to */
28312 + asm volatile("movl %0,%%fs" :: "r" (0));
28313 + ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
28314 + addr);
28315 + }
28316 + }
28317 + put_cpu();
28318 + break;
28319 + case ARCH_GET_FS: {
28320 + unsigned long base;
28321 + if (task->thread.fsindex == FS_TLS_SEL)
28322 + base = read_32bit_tls(task, FS_TLS);
28323 + else if (doit)
28324 + rdmsrl(MSR_FS_BASE, base);
28325 + else
28326 + base = task->thread.fs;
28327 + ret = put_user(base, (unsigned long __user *)addr);
28328 + break;
28329 + }
28330 + case ARCH_GET_GS: {
28331 + unsigned long base;
28332 + unsigned gsindex;
28333 + if (task->thread.gsindex == GS_TLS_SEL)
28334 + base = read_32bit_tls(task, GS_TLS);
28335 + else if (doit) {
28336 + asm("movl %%gs,%0" : "=r" (gsindex));
28337 + if (gsindex)
28338 + rdmsrl(MSR_KERNEL_GS_BASE, base);
28339 + else
28340 + base = task->thread.gs;
28341 + }
28342 + else
28343 + base = task->thread.gs;
28344 + ret = put_user(base, (unsigned long __user *)addr);
28345 + break;
28346 + }
28347 +
28348 + default:
28349 + ret = -EINVAL;
28350 + break;
28351 + }
28352 +
28353 + return ret;
28354 +}
28355 +
28356 +long sys_arch_prctl(int code, unsigned long addr)
28357 +{
28358 + return do_arch_prctl(current, code, addr);
28359 +}
28360 +
28361 +/*
28362 + * Capture the user space registers if the task is not running (in user space)
28363 + */
28364 +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
28365 +{
28366 + struct pt_regs *pp, ptregs;
28367 +
28368 + pp = task_pt_regs(tsk);
28369 +
28370 + ptregs = *pp;
28371 + ptregs.cs &= 0xffff;
28372 + ptregs.ss &= 0xffff;
28373 +
28374 + elf_core_copy_regs(regs, &ptregs);
28375 +
28376 + boot_option_idle_override = 1;
28377 + return 1;
28378 +}
28379 +
28380 +unsigned long arch_align_stack(unsigned long sp)
28381 +{
28382 + if (randomize_va_space)
28383 + sp -= get_random_int() % 8192;
28384 + return sp & ~0xf;
28385 +}
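arch_align_stack() above randomizes the start of the user stack by subtracting an offset below 8 KiB and then clearing the low four bits to keep 16-byte alignment. A tiny sketch of the same arithmetic with a fixed stand-in for get_random_int():

#include <stdio.h>

static unsigned long align_stack(unsigned long sp, unsigned long rnd)
{
	sp -= rnd % 8192;	/* random shift below 8 KiB */
	return sp & ~0xfUL;	/* keep 16-byte alignment */
}

int main(void)
{
	/* The starting sp and "random" value are illustrative only. */
	printf("0x%lx\n", align_stack(0x7ffffffff000UL, 12345));
	return 0;
}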
28386 Index: head-2008-11-25/arch/x86/kernel/setup_64-xen.c
28387 ===================================================================
28388 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
28389 +++ head-2008-11-25/arch/x86/kernel/setup_64-xen.c 2008-04-22 15:41:51.000000000 +0200
28390 @@ -0,0 +1,1652 @@
28391 +/*
28392 + * linux/arch/x86-64/kernel/setup.c
28393 + *
28394 + * Copyright (C) 1995 Linus Torvalds
28395 + *
28396 + * Nov 2001 Dave Jones <davej@suse.de>
28397 + * Forked from i386 setup code.
28398 + */
28399 +
28400 +/*
28401 + * This file handles the architecture-dependent parts of initialization
28402 + */
28403 +
28404 +#include <linux/errno.h>
28405 +#include <linux/sched.h>
28406 +#include <linux/kernel.h>
28407 +#include <linux/mm.h>
28408 +#include <linux/stddef.h>
28409 +#include <linux/unistd.h>
28410 +#include <linux/ptrace.h>
28411 +#include <linux/slab.h>
28412 +#include <linux/user.h>
28413 +#include <linux/a.out.h>
28414 +#include <linux/screen_info.h>
28415 +#include <linux/ioport.h>
28416 +#include <linux/delay.h>
28417 +#include <linux/init.h>
28418 +#include <linux/initrd.h>
28419 +#include <linux/highmem.h>
28420 +#include <linux/bootmem.h>
28421 +#include <linux/module.h>
28422 +#include <asm/processor.h>
28423 +#include <linux/console.h>
28424 +#include <linux/seq_file.h>
28425 +#include <linux/crash_dump.h>
28426 +#include <linux/root_dev.h>
28427 +#include <linux/pci.h>
28428 +#include <linux/acpi.h>
28429 +#include <linux/kallsyms.h>
28430 +#include <linux/edd.h>
28431 +#include <linux/mmzone.h>
28432 +#include <linux/kexec.h>
28433 +#include <linux/cpufreq.h>
28434 +#include <linux/dmi.h>
28435 +#include <linux/dma-mapping.h>
28436 +#include <linux/ctype.h>
28437 +
28438 +#include <asm/mtrr.h>
28439 +#include <asm/uaccess.h>
28440 +#include <asm/system.h>
28441 +#include <asm/io.h>
28442 +#include <asm/smp.h>
28443 +#include <asm/msr.h>
28444 +#include <asm/desc.h>
28445 +#include <video/edid.h>
28446 +#include <asm/e820.h>
28447 +#include <asm/dma.h>
28448 +#include <asm/mpspec.h>
28449 +#include <asm/mmu_context.h>
28450 +#include <asm/bootsetup.h>
28451 +#include <asm/proto.h>
28452 +#include <asm/setup.h>
28453 +#include <asm/mach_apic.h>
28454 +#include <asm/numa.h>
28455 +#include <asm/sections.h>
28456 +#include <asm/dmi.h>
28457 +#ifdef CONFIG_XEN
28458 +#include <linux/percpu.h>
28459 +#include <xen/interface/physdev.h>
28460 +#include "setup_arch_pre.h"
28461 +#include <asm/hypervisor.h>
28462 +#include <xen/interface/nmi.h>
28463 +#include <xen/features.h>
28464 +#include <xen/firmware.h>
28465 +#include <xen/xencons.h>
28466 +#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
28467 +#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
28468 +#include <asm/mach-xen/setup_arch_post.h>
28469 +#include <xen/interface/memory.h>
28470 +
28471 +#ifdef CONFIG_XEN
28472 +#include <xen/interface/kexec.h>
28473 +#endif
28474 +
28475 +extern unsigned long start_pfn;
28476 +extern struct edid_info edid_info;
28477 +
28478 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
28479 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
28480 +
28481 +extern char hypercall_page[PAGE_SIZE];
28482 +EXPORT_SYMBOL(hypercall_page);
28483 +
28484 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
28485 +static struct notifier_block xen_panic_block = {
28486 + xen_panic_event, NULL, 0 /* try to go last */
28487 +};
28488 +
28489 +unsigned long *phys_to_machine_mapping;
28490 +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
28491 +
28492 +EXPORT_SYMBOL(phys_to_machine_mapping);
28493 +
28494 +DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
28495 +DEFINE_PER_CPU(int, nr_multicall_ents);
28496 +
28497 +/* Raw start-of-day parameters from the hypervisor. */
28498 +start_info_t *xen_start_info;
28499 +EXPORT_SYMBOL(xen_start_info);
28500 +#endif
28501 +
28502 +/*
28503 + * Machine setup..
28504 + */
28505 +
28506 +struct cpuinfo_x86 boot_cpu_data __read_mostly;
28507 +EXPORT_SYMBOL(boot_cpu_data);
28508 +
28509 +unsigned long mmu_cr4_features;
28510 +
28511 +int acpi_disabled;
28512 +EXPORT_SYMBOL(acpi_disabled);
28513 +#ifdef CONFIG_ACPI
28514 +extern int __initdata acpi_ht;
28515 +extern acpi_interrupt_flags acpi_sci_flags;
28516 +int __initdata acpi_force = 0;
28517 +#endif
28518 +
28519 +int acpi_numa __initdata;
28520 +
28521 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
28522 +int bootloader_type;
28523 +
28524 +unsigned long saved_video_mode;
28525 +
28526 +/*
28527 + * Early DMI memory
28528 + */
28529 +int dmi_alloc_index;
28530 +char dmi_alloc_data[DMI_MAX_DATA];
28531 +
28532 +/*
28533 + * Setup options
28534 + */
28535 +struct screen_info screen_info;
28536 +EXPORT_SYMBOL(screen_info);
28537 +struct sys_desc_table_struct {
28538 + unsigned short length;
28539 + unsigned char table[0];
28540 +};
28541 +
28542 +struct edid_info edid_info;
28543 +EXPORT_SYMBOL_GPL(edid_info);
28544 +struct e820map e820;
28545 +#ifdef CONFIG_XEN
28546 +struct e820map machine_e820;
28547 +#endif
28548 +
28549 +extern int root_mountflags;
28550 +
28551 +char command_line[COMMAND_LINE_SIZE];
28552 +
28553 +struct resource standard_io_resources[] = {
28554 + { .name = "dma1", .start = 0x00, .end = 0x1f,
28555 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28556 + { .name = "pic1", .start = 0x20, .end = 0x21,
28557 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28558 + { .name = "timer0", .start = 0x40, .end = 0x43,
28559 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28560 + { .name = "timer1", .start = 0x50, .end = 0x53,
28561 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28562 + { .name = "keyboard", .start = 0x60, .end = 0x6f,
28563 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28564 + { .name = "dma page reg", .start = 0x80, .end = 0x8f,
28565 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28566 + { .name = "pic2", .start = 0xa0, .end = 0xa1,
28567 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28568 + { .name = "dma2", .start = 0xc0, .end = 0xdf,
28569 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28570 + { .name = "fpu", .start = 0xf0, .end = 0xff,
28571 + .flags = IORESOURCE_BUSY | IORESOURCE_IO }
28572 +};
28573 +
28574 +#define STANDARD_IO_RESOURCES \
28575 + (sizeof standard_io_resources / sizeof standard_io_resources[0])
28576 +
28577 +#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
28578 +
28579 +struct resource data_resource = {
28580 + .name = "Kernel data",
28581 + .start = 0,
28582 + .end = 0,
28583 + .flags = IORESOURCE_RAM,
28584 +};
28585 +struct resource code_resource = {
28586 + .name = "Kernel code",
28587 + .start = 0,
28588 + .end = 0,
28589 + .flags = IORESOURCE_RAM,
28590 +};
28591 +
28592 +#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
28593 +
28594 +static struct resource system_rom_resource = {
28595 + .name = "System ROM",
28596 + .start = 0xf0000,
28597 + .end = 0xfffff,
28598 + .flags = IORESOURCE_ROM,
28599 +};
28600 +
28601 +static struct resource extension_rom_resource = {
28602 + .name = "Extension ROM",
28603 + .start = 0xe0000,
28604 + .end = 0xeffff,
28605 + .flags = IORESOURCE_ROM,
28606 +};
28607 +
28608 +static struct resource adapter_rom_resources[] = {
28609 + { .name = "Adapter ROM", .start = 0xc8000, .end = 0,
28610 + .flags = IORESOURCE_ROM },
28611 + { .name = "Adapter ROM", .start = 0, .end = 0,
28612 + .flags = IORESOURCE_ROM },
28613 + { .name = "Adapter ROM", .start = 0, .end = 0,
28614 + .flags = IORESOURCE_ROM },
28615 + { .name = "Adapter ROM", .start = 0, .end = 0,
28616 + .flags = IORESOURCE_ROM },
28617 + { .name = "Adapter ROM", .start = 0, .end = 0,
28618 + .flags = IORESOURCE_ROM },
28619 + { .name = "Adapter ROM", .start = 0, .end = 0,
28620 + .flags = IORESOURCE_ROM }
28621 +};
28622 +
28623 +#define ADAPTER_ROM_RESOURCES \
28624 + (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
28625 +
28626 +static struct resource video_rom_resource = {
28627 + .name = "Video ROM",
28628 + .start = 0xc0000,
28629 + .end = 0xc7fff,
28630 + .flags = IORESOURCE_ROM,
28631 +};
28632 +
28633 +static struct resource video_ram_resource = {
28634 + .name = "Video RAM area",
28635 + .start = 0xa0000,
28636 + .end = 0xbffff,
28637 + .flags = IORESOURCE_RAM,
28638 +};
28639 +
28640 +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
28641 +
28642 +static int __init romchecksum(unsigned char *rom, unsigned long length)
28643 +{
28644 + unsigned char *p, sum = 0;
28645 +
28646 + for (p = rom; p < rom + length; p++)
28647 + sum += *p;
28648 + return sum == 0;
28649 +}
28650 +
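probe_roms() below accepts an option ROM when it starts with the signature word 0xaa55 (the bytes 0x55 0xAA on little-endian x86), takes its length from byte 2 in 512-byte units, and requires all bytes of the image to sum to zero modulo 256, which is what romchecksum() above verifies. A self-contained sketch over a synthetic ROM image (the buffer contents and checksum fix-up are illustrative):

#include <stdio.h>
#include <string.h>

/* Combined signature + checksum test in the spirit of romsignature()
 * and romchecksum() above. */
static int rom_ok(const unsigned char *rom, unsigned long len)
{
	unsigned char sum = 0;
	unsigned long i;

	if (rom[0] != 0x55 || rom[1] != 0xAA)
		return 0;			/* missing signature */
	for (i = 0; i < len; i++)
		sum += rom[i];
	return sum == 0;			/* checksum must be zero */
}

int main(void)
{
	unsigned char rom[512];
	unsigned char sum = 0;
	int i;

	memset(rom, 0, sizeof(rom));
	rom[0] = 0x55;
	rom[1] = 0xAA;
	rom[2] = 1;				/* length = 1 * 512 bytes */
	for (i = 0; i < 511; i++)
		sum += rom[i];
	rom[511] = (unsigned char)-sum;		/* fix up checksum byte */

	printf("rom valid: %d\n", rom_ok(rom, rom[2] * 512));
	return 0;
}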
28651 +static void __init probe_roms(void)
28652 +{
28653 + unsigned long start, length, upper;
28654 + unsigned char *rom;
28655 + int i;
28656 +
28657 +#ifdef CONFIG_XEN
28658 + /* Nothing to do if not running in dom0. */
28659 + if (!is_initial_xendomain())
28660 + return;
28661 +#endif
28662 +
28663 + /* video rom */
28664 + upper = adapter_rom_resources[0].start;
28665 + for (start = video_rom_resource.start; start < upper; start += 2048) {
28666 + rom = isa_bus_to_virt(start);
28667 + if (!romsignature(rom))
28668 + continue;
28669 +
28670 + video_rom_resource.start = start;
28671 +
28672 + /* 0 < length <= 0x7f * 512, historically */
28673 + length = rom[2] * 512;
28674 +
28675 + /* if checksum okay, trust length byte */
28676 + if (length && romchecksum(rom, length))
28677 + video_rom_resource.end = start + length - 1;
28678 +
28679 + request_resource(&iomem_resource, &video_rom_resource);
28680 + break;
28681 + }
28682 +
28683 + start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
28684 + if (start < upper)
28685 + start = upper;
28686 +
28687 + /* system rom */
28688 + request_resource(&iomem_resource, &system_rom_resource);
28689 + upper = system_rom_resource.start;
28690 +
28691 + /* check for extension rom (ignore length byte!) */
28692 + rom = isa_bus_to_virt(extension_rom_resource.start);
28693 + if (romsignature(rom)) {
28694 + length = extension_rom_resource.end - extension_rom_resource.start + 1;
28695 + if (romchecksum(rom, length)) {
28696 + request_resource(&iomem_resource, &extension_rom_resource);
28697 + upper = extension_rom_resource.start;
28698 + }
28699 + }
28700 +
28701 + /* check for adapter roms on 2k boundaries */
28702 + for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
28703 + rom = isa_bus_to_virt(start);
28704 + if (!romsignature(rom))
28705 + continue;
28706 +
28707 + /* 0 < length <= 0x7f * 512, historically */
28708 + length = rom[2] * 512;
28709 +
28710 + /* but accept any length that fits if checksum okay */
28711 + if (!length || start + length > upper || !romchecksum(rom, length))
28712 + continue;
28713 +
28714 + adapter_rom_resources[i].start = start;
28715 + adapter_rom_resources[i].end = start + length - 1;
28716 + request_resource(&iomem_resource, &adapter_rom_resources[i]);
28717 +
28718 + start = adapter_rom_resources[i++].end & ~2047UL;
28719 + }
28720 +}
28721 +
28722 +/* Check for full argument with no trailing characters */
28723 +static int fullarg(char *p, char *arg)
28724 +{
28725 + int l = strlen(arg);
28726 + return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l]));
28727 +}
28728 +
28729 +static __init void parse_cmdline_early (char ** cmdline_p)
28730 +{
28731 + char c = ' ', *to = command_line, *from = COMMAND_LINE;
28732 + int len = 0;
28733 + int userdef = 0;
28734 +
28735 + for (;;) {
28736 + if (c != ' ')
28737 + goto next_char;
28738 +
28739 +#ifdef CONFIG_SMP
28740 + /*
28741 + * If the BIOS enumerates physical processors before logical,
28742 + * maxcpus=N at enumeration-time can be used to disable HT.
28743 + */
28744 + else if (!memcmp(from, "maxcpus=", 8)) {
28745 + extern unsigned int maxcpus;
28746 +
28747 + maxcpus = simple_strtoul(from + 8, NULL, 0);
28748 + }
28749 +#endif
28750 +#ifdef CONFIG_ACPI
28751 + /* "acpi=off" disables both ACPI table parsing and interpreter init */
28752 + if (fullarg(from,"acpi=off"))
28753 + disable_acpi();
28754 +
28755 + if (fullarg(from, "acpi=force")) {
28756 + /* add later when we do DMI horrors: */
28757 + acpi_force = 1;
28758 + acpi_disabled = 0;
28759 + }
28760 +
28761 + /* acpi=ht just means: do ACPI MADT parsing
28762 + at bootup, but don't enable the full ACPI interpreter */
28763 + if (fullarg(from, "acpi=ht")) {
28764 + if (!acpi_force)
28765 + disable_acpi();
28766 + acpi_ht = 1;
28767 + }
28768 + else if (fullarg(from, "pci=noacpi"))
28769 + acpi_disable_pci();
28770 + else if (fullarg(from, "acpi=noirq"))
28771 + acpi_noirq_set();
28772 +
28773 + else if (fullarg(from, "acpi_sci=edge"))
28774 + acpi_sci_flags.trigger = 1;
28775 + else if (fullarg(from, "acpi_sci=level"))
28776 + acpi_sci_flags.trigger = 3;
28777 + else if (fullarg(from, "acpi_sci=high"))
28778 + acpi_sci_flags.polarity = 1;
28779 + else if (fullarg(from, "acpi_sci=low"))
28780 + acpi_sci_flags.polarity = 3;
28781 +
28782 + /* acpi=strict disables out-of-spec workarounds */
28783 + else if (fullarg(from, "acpi=strict")) {
28784 + acpi_strict = 1;
28785 + }
28786 +#ifdef CONFIG_X86_IO_APIC
28787 + else if (fullarg(from, "acpi_skip_timer_override"))
28788 + acpi_skip_timer_override = 1;
28789 +#endif
28790 +#endif
28791 +
28792 +#ifndef CONFIG_XEN
28793 + if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) {
28794 + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
28795 + disable_apic = 1;
28796 + }
28797 +
28798 + if (fullarg(from, "noapic"))
28799 + skip_ioapic_setup = 1;
28800 +
28801 + if (fullarg(from,"apic")) {
28802 + skip_ioapic_setup = 0;
28803 + ioapic_force = 1;
28804 + }
28805 +#endif
28806 +
28807 + if (!memcmp(from, "mem=", 4))
28808 + parse_memopt(from+4, &from);
28809 +
28810 + if (!memcmp(from, "memmap=", 7)) {
28811 + /* exactmap option is for user-defined memory */
28812 + if (!memcmp(from+7, "exactmap", 8)) {
28813 +#ifdef CONFIG_CRASH_DUMP
28814 + /* If we are doing a crash dump, we
28815 + * still need to know the real memory
28816 + * size before the original memory map is
28817 + * reset.
28818 + */
28819 + saved_max_pfn = e820_end_of_ram();
28820 +#endif
28821 + from += 8+7;
28822 + end_pfn_map = 0;
28823 + e820.nr_map = 0;
28824 + userdef = 1;
28825 + }
28826 + else {
28827 + parse_memmapopt(from+7, &from);
28828 + userdef = 1;
28829 + }
28830 + }
28831 +
28832 +#ifdef CONFIG_NUMA
28833 + if (!memcmp(from, "numa=", 5))
28834 + numa_setup(from+5);
28835 +#endif
28836 +
28837 + if (!memcmp(from,"iommu=",6)) {
28838 + iommu_setup(from+6);
28839 + }
28840 +
28841 + if (fullarg(from,"oops=panic"))
28842 + panic_on_oops = 1;
28843 +
28844 + if (!memcmp(from, "noexec=", 7))
28845 + nonx_setup(from + 7);
28846 +
28847 +#ifdef CONFIG_KEXEC
28848 + /* crashkernel=size@addr specifies the location to reserve for
28849 + * a crash kernel. By reserving this memory we guarantee
28850 + * that Linux never sets it up as a DMA target.
28851 + * Useful for holding code to do something appropriate
28852 + * after a kernel panic.
28853 + */
28854 + else if (!memcmp(from, "crashkernel=", 12)) {
28855 +#ifndef CONFIG_XEN
28856 + unsigned long size, base;
28857 + size = memparse(from+12, &from);
28858 + if (*from == '@') {
28859 + base = memparse(from+1, &from);
28860 + /* FIXME: Do I want a sanity check
28861 + * to validate the memory range?
28862 + */
28863 + crashk_res.start = base;
28864 + crashk_res.end = base + size - 1;
28865 + }
28866 +#else
28867 + printk("Ignoring crashkernel command line, "
28868 + "parameter will be supplied by xen\n");
28869 +#endif
28870 + }
28871 +#endif
28872 +
28873 +#ifdef CONFIG_PROC_VMCORE
28874 + /* elfcorehdr= specifies the location of the ELF core header
28875 + * stored by the crashed kernel. This option is passed
28876 + * by the kexec loader to the capture kernel.
28877 + */
28878 + else if(!memcmp(from, "elfcorehdr=", 11))
28879 + elfcorehdr_addr = memparse(from+11, &from);
28880 +#endif
28881 +
28882 +#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
28883 + else if (!memcmp(from, "additional_cpus=", 16))
28884 + setup_additional_cpus(from+16);
28885 +#endif
28886 +
28887 + next_char:
28888 + c = *(from++);
28889 + if (!c)
28890 + break;
28891 + if (COMMAND_LINE_SIZE <= ++len)
28892 + break;
28893 + *(to++) = c;
28894 + }
28895 + if (userdef) {
28896 + printk(KERN_INFO "user-defined physical RAM map:\n");
28897 + e820_print_map("user");
28898 + }
28899 + *to = '\0';
28900 + *cmdline_p = command_line;
28901 +}
28902 +
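Several of the options handled above (mem=, memmap=, crashkernel=) lean on memparse(), which reads a number with an optional K/M/G suffix and advances the parse cursor. A rough user-space approximation of how a crashkernel=size@addr value is split; the strtoull-based parse_size() merely stands in for the kernel's memparse(), and the "64M@16M" string is a made-up example:

    #include <stdio.h>
    #include <stdlib.h>

    /* Parse a number with an optional K/M/G suffix, advancing *cur. */
    static unsigned long long parse_size(const char *s, const char **cur)
    {
        char *end;
        unsigned long long v = strtoull(s, &end, 0);

        switch (*end) {
        case 'G': case 'g': v <<= 10;   /* fall through */
        case 'M': case 'm': v <<= 10;   /* fall through */
        case 'K': case 'k': v <<= 10; end++; break;
        }
        *cur = end;
        return v;
    }

    int main(void)
    {
        const char *arg = "64M@16M";    /* hypothetical crashkernel= value */
        const char *p;
        unsigned long long size = parse_size(arg, &p), base = 0;

        if (*p == '@')                  /* optional @addr part */
            base = parse_size(p + 1, &p);
        printf("size=%llu base=%llu\n", size, base);
        return 0;
    }
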
28903 +#ifndef CONFIG_NUMA
28904 +static void __init
28905 +contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
28906 +{
28907 + unsigned long bootmap_size, bootmap;
28908 +
28909 + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
28910 + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
28911 + if (bootmap == -1L)
28912 + panic("Cannot find bootmem map of size %ld\n",bootmap_size);
28913 + bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
28914 +#ifdef CONFIG_XEN
28915 + e820_bootmem_free(NODE_DATA(0), 0, xen_start_info->nr_pages<<PAGE_SHIFT);
28916 +#else
28917 + e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
28918 +#endif
28919 + reserve_bootmem(bootmap, bootmap_size);
28920 +}
28921 +#endif
28922 +
28923 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
28924 +struct edd edd;
28925 +#ifdef CONFIG_EDD_MODULE
28926 +EXPORT_SYMBOL(edd);
28927 +#endif
28928 +#ifndef CONFIG_XEN
28929 +/**
28930 + * copy_edd() - Copy the BIOS EDD information
28931 + * from boot_params into a safe place.
28932 + *
28933 + */
28934 +static inline void copy_edd(void)
28935 +{
28936 + memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
28937 + memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
28938 + edd.mbr_signature_nr = EDD_MBR_SIG_NR;
28939 + edd.edd_info_nr = EDD_NR;
28940 +}
28941 +#endif
28942 +#else
28943 +static inline void copy_edd(void)
28944 +{
28945 +}
28946 +#endif
28947 +
28948 +#ifndef CONFIG_XEN
28949 +#define EBDA_ADDR_POINTER 0x40E
28950 +
28951 +unsigned __initdata ebda_addr;
28952 +unsigned __initdata ebda_size;
28953 +
28954 +static void discover_ebda(void)
28955 +{
28956 + /*
28957 + * there is a real-mode segmented pointer pointing to the
28958 + * 4K EBDA area at 0x40E
28959 + */
28960 + ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER;
28961 + ebda_addr <<= 4;
28962 +
28963 + ebda_size = *(unsigned short *)(unsigned long)ebda_addr;
28964 +
28965 + /* Round EBDA up to pages */
28966 + if (ebda_size == 0)
28967 + ebda_size = 1;
28968 + ebda_size <<= 10;
28969 + ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
28970 + if (ebda_size > 64*1024)
28971 + ebda_size = 64*1024;
28972 +}
28973 +#else
28974 +#define discover_ebda() ((void)0)
28975 +#endif
28976 +
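discover_ebda() above turns the real-mode segment word stored at 0x40E into a physical address (segment << 4), reads the EBDA size in KiB from the first word of that area, rounds the region up to whole pages and caps it at 64 KiB. The same arithmetic in isolation, using made-up input values and an assumed 4096-byte page size:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    int main(void)
    {
        unsigned long ebda_seg  = 0x9fc0;        /* hypothetical word at 0x40E */
        unsigned long ebda_kib  = 1;             /* hypothetical first word of the EBDA */
        unsigned long ebda_addr = ebda_seg << 4; /* 0x9fc00 */
        unsigned long ebda_size = (ebda_kib ? ebda_kib : 1) << 10;

        /* round up to whole pages, counting the offset within the first page */
        ebda_size = (ebda_size + (ebda_addr & ~PAGE_MASK) + PAGE_SIZE - 1) & PAGE_MASK;
        if (ebda_size > 64 * 1024)
            ebda_size = 64 * 1024;

        printf("EBDA at %#lx, reserving %lu bytes\n", ebda_addr, ebda_size);
        return 0;
    }
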
28977 +void __init setup_arch(char **cmdline_p)
28978 +{
28979 +#ifdef CONFIG_XEN
28980 + /* Register a call for panic conditions. */
28981 + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
28982 +
28983 + ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
28984 + screen_info = SCREEN_INFO;
28985 +
28986 + if (is_initial_xendomain()) {
28987 + const struct dom0_vga_console_info *info =
28988 + (void *)((char *)xen_start_info +
28989 + xen_start_info->console.dom0.info_off);
28990 +
28991 + dom0_init_screen_info(info,
28992 + xen_start_info->console.dom0.info_size);
28993 + xen_start_info->console.domU.mfn = 0;
28994 + xen_start_info->console.domU.evtchn = 0;
28995 + } else
28996 + screen_info.orig_video_isVGA = 0;
28997 +
28998 + copy_edid();
28999 +
29000 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
29001 + VMASST_TYPE_writable_pagetables));
29002 +
29003 + ARCH_SETUP
29004 +#else
29005 + ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
29006 + screen_info = SCREEN_INFO;
29007 + edid_info = EDID_INFO;
29008 +#endif /* !CONFIG_XEN */
29009 + saved_video_mode = SAVED_VIDEO_MODE;
29010 + bootloader_type = LOADER_TYPE;
29011 +
29012 +#ifdef CONFIG_BLK_DEV_RAM
29013 + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
29014 + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
29015 + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
29016 +#endif
29017 + setup_memory_region();
29018 + copy_edd();
29019 +
29020 + if (!MOUNT_ROOT_RDONLY)
29021 + root_mountflags &= ~MS_RDONLY;
29022 + init_mm.start_code = (unsigned long) &_text;
29023 + init_mm.end_code = (unsigned long) &_etext;
29024 + init_mm.end_data = (unsigned long) &_edata;
29025 + init_mm.brk = (unsigned long) &_end;
29026 +
29027 + code_resource.start = virt_to_phys(&_text);
29028 + code_resource.end = virt_to_phys(&_etext)-1;
29029 + data_resource.start = virt_to_phys(&_etext);
29030 + data_resource.end = virt_to_phys(&_edata)-1;
29031 +
29032 + parse_cmdline_early(cmdline_p);
29033 +
29034 + early_identify_cpu(&boot_cpu_data);
29035 +
29036 + /*
29037 + * partially used pages are not usable - thus
29038 + * we are rounding upwards:
29039 + */
29040 + end_pfn = e820_end_of_ram();
29041 + num_physpages = end_pfn; /* for pfn_valid */
29042 +
29043 + check_efer();
29044 +
29045 + discover_ebda();
29046 +
29047 + init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
29048 +
29049 + if (is_initial_xendomain())
29050 + dmi_scan_machine();
29051 +
29052 +#ifdef CONFIG_ACPI_NUMA
29053 + /*
29054 + * Parse SRAT to discover nodes.
29055 + */
29056 + acpi_numa_init();
29057 +#endif
29058 +
29059 +#ifdef CONFIG_NUMA
29060 + numa_initmem_init(0, end_pfn);
29061 +#else
29062 + contig_initmem_init(0, end_pfn);
29063 +#endif
29064 +
29065 +#ifdef CONFIG_XEN
29066 + /*
29067 + * Reserve kernel, physmap, start info, initial page tables, and
29068 + * direct mapping.
29069 + */
29070 + reserve_bootmem_generic(__pa_symbol(&_text),
29071 + (table_end << PAGE_SHIFT) - __pa_symbol(&_text));
29072 +#else
29073 + /* Reserve direct mapping */
29074 + reserve_bootmem_generic(table_start << PAGE_SHIFT,
29075 + (table_end - table_start) << PAGE_SHIFT);
29076 +
29077 + /* reserve kernel */
29078 + reserve_bootmem_generic(__pa_symbol(&_text),
29079 + __pa_symbol(&_end) - __pa_symbol(&_text));
29080 +
29081 + /*
29082 + * reserve physical page 0 - it's a special BIOS page on many boxes,
29083 + * enabling clean reboots, SMP operation, laptop functions.
29084 + */
29085 + reserve_bootmem_generic(0, PAGE_SIZE);
29086 +
29087 + /* reserve ebda region */
29088 + if (ebda_addr)
29089 + reserve_bootmem_generic(ebda_addr, ebda_size);
29090 +
29091 +#ifdef CONFIG_SMP
29092 + /*
29093 + * But first pinch a few for the stack/trampoline stuff
29094 + * FIXME: Don't need the extra page at 4K, but need to fix
29095 + * trampoline before removing it. (see the GDT stuff)
29096 + */
29097 + reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
29098 +
29099 + /* Reserve SMP trampoline */
29100 + reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
29101 +#endif
29102 +#endif
29103 +
29104 +#ifdef CONFIG_ACPI_SLEEP
29105 + /*
29106 + * Reserve low memory region for sleep support.
29107 + */
29108 + acpi_reserve_bootmem();
29109 +#endif
29110 +#ifdef CONFIG_XEN
29111 +#ifdef CONFIG_BLK_DEV_INITRD
29112 + if (xen_start_info->mod_start) {
29113 + if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
29114 + /*reserve_bootmem_generic(INITRD_START, INITRD_SIZE);*/
29115 + initrd_start = INITRD_START + PAGE_OFFSET;
29116 + initrd_end = initrd_start+INITRD_SIZE;
29117 + initrd_below_start_ok = 1;
29118 + } else {
29119 + printk(KERN_ERR "initrd extends beyond end of memory "
29120 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
29121 + (unsigned long)(INITRD_START + INITRD_SIZE),
29122 + (unsigned long)(end_pfn << PAGE_SHIFT));
29123 + initrd_start = 0;
29124 + }
29125 + }
29126 +#endif
29127 +#else /* CONFIG_XEN */
29128 +#ifdef CONFIG_BLK_DEV_INITRD
29129 + if (LOADER_TYPE && INITRD_START) {
29130 + if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
29131 + reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
29132 + initrd_start =
29133 + INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
29134 + initrd_end = initrd_start+INITRD_SIZE;
29135 + }
29136 + else {
29137 + printk(KERN_ERR "initrd extends beyond end of memory "
29138 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
29139 + (unsigned long)(INITRD_START + INITRD_SIZE),
29140 + (unsigned long)(end_pfn << PAGE_SHIFT));
29141 + initrd_start = 0;
29142 + }
29143 + }
29144 +#endif
29145 +#endif /* !CONFIG_XEN */
29146 +#ifdef CONFIG_KEXEC
29147 +#ifdef CONFIG_XEN
29148 + xen_machine_kexec_setup_resources();
29149 +#else
29150 + if (crashk_res.start != crashk_res.end) {
29151 + reserve_bootmem_generic(crashk_res.start,
29152 + crashk_res.end - crashk_res.start + 1);
29153 + }
29154 +#endif
29155 +#endif
29156 +
29157 + paging_init();
29158 +#ifdef CONFIG_X86_LOCAL_APIC
29159 + /*
29160 + * Find and reserve possible boot-time SMP configuration:
29161 + */
29162 + find_smp_config();
29163 +#endif
29164 +#ifdef CONFIG_XEN
29165 + {
29166 + int i, j, k, fpp;
29167 + unsigned long p2m_pages;
29168 +
29169 + p2m_pages = end_pfn;
29170 + if (xen_start_info->nr_pages > end_pfn) {
29171 + /*
29172 + * the end_pfn was shrunk (probably by mem= or highmem=
29173 + * kernel parameter); shrink reservation with the HV
29174 + */
29175 + struct xen_memory_reservation reservation = {
29176 + .address_bits = 0,
29177 + .extent_order = 0,
29178 + .domid = DOMID_SELF
29179 + };
29180 + unsigned int difference;
29181 + int ret;
29182 +
29183 + difference = xen_start_info->nr_pages - end_pfn;
29184 +
29185 + set_xen_guest_handle(reservation.extent_start,
29186 + ((unsigned long *)xen_start_info->mfn_list) + end_pfn);
29187 + reservation.nr_extents = difference;
29188 + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
29189 + &reservation);
29190 + BUG_ON (ret != difference);
29191 + }
29192 + else if (end_pfn > xen_start_info->nr_pages)
29193 + p2m_pages = xen_start_info->nr_pages;
29194 +
29195 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
29196 + /* Make sure we have a large enough P->M table. */
29197 + phys_to_machine_mapping = alloc_bootmem_pages(
29198 + end_pfn * sizeof(unsigned long));
29199 + memset(phys_to_machine_mapping, ~0,
29200 + end_pfn * sizeof(unsigned long));
29201 + memcpy(phys_to_machine_mapping,
29202 + (unsigned long *)xen_start_info->mfn_list,
29203 + p2m_pages * sizeof(unsigned long));
29204 + free_bootmem(
29205 + __pa(xen_start_info->mfn_list),
29206 + PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
29207 + sizeof(unsigned long))));
29208 +
29209 + /*
29210 + * Initialise the list of frames that points to the list of
29211 + * frames making up the p2m table. Used by
29212 + * save/restore.
29213 + */
29214 + pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
29215 +
29216 + fpp = PAGE_SIZE/sizeof(unsigned long);
29217 + for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
29218 + if ((j % fpp) == 0) {
29219 + k++;
29220 + BUG_ON(k>=fpp);
29221 + pfn_to_mfn_frame_list[k] =
29222 + alloc_bootmem_pages(PAGE_SIZE);
29223 + pfn_to_mfn_frame_list_list[k] =
29224 + virt_to_mfn(pfn_to_mfn_frame_list[k]);
29225 + j=0;
29226 + }
29227 + pfn_to_mfn_frame_list[k][j] =
29228 + virt_to_mfn(&phys_to_machine_mapping[i]);
29229 + }
29230 + HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
29231 + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
29232 + virt_to_mfn(pfn_to_mfn_frame_list_list);
29233 + }
29234 +
29235 + /* Mark all ISA DMA channels in-use - using them wouldn't work. */
29236 + for (i = 0; i < MAX_DMA_CHANNELS; ++i)
29237 + if (i != 4 && request_dma(i, "xen") != 0)
29238 + BUG();
29239 + }
29240 +
29241 + if (!is_initial_xendomain()) {
29242 + acpi_disabled = 1;
29243 +#ifdef CONFIG_ACPI
29244 + acpi_ht = 0;
29245 +#endif
29246 + }
29247 +#endif
29248 +
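The p2m bookkeeping set up in the CONFIG_XEN block above is a two-level structure: each page of phys_to_machine_mapping covers fpp = PAGE_SIZE/sizeof(unsigned long) pfns, each pfn_to_mfn_frame_list page holds the machine frames of fpp such p2m pages, and the single frame-list-list page holds the frames of those list pages. A small sketch of just the index arithmetic (8-byte entries and 4096-byte pages assumed; this is one reading of the loop above, not code from the patch):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define FPP (PAGE_SIZE / sizeof(unsigned long))   /* 512 on x86-64 */

    int main(void)
    {
        unsigned long pfn = 1000000;              /* hypothetical guest pfn      */

        unsigned long p2m_page  = pfn / FPP;      /* which p2m table page        */
        unsigned long list_page = p2m_page / FPP; /* which frame-list page       */
        unsigned long list_slot = p2m_page % FPP; /* slot within that list page  */

        printf("pfn %lu -> p2m page %lu (frame-list page %lu, slot %lu)\n",
               pfn, p2m_page, list_page, list_slot);
        return 0;
    }
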
29249 +#ifndef CONFIG_XEN
29250 + check_ioapic();
29251 +#endif
29252 +
29253 + zap_low_mappings(0);
29254 +
29255 + /*
29256 + * Set this early so we don't allocate cpu0
29257 + * if the MADT list doesn't list the BSP first.
29258 + * mpparse.c/MP_processor_info() allocates logical cpu numbers.
29259 + */
29260 + cpu_set(0, cpu_present_map);
29261 +#ifdef CONFIG_ACPI
29262 + /*
29263 + * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
29264 + * Call this early for SRAT node setup.
29265 + */
29266 + acpi_boot_table_init();
29267 +
29268 + /*
29269 + * Read APIC and some other early information from ACPI tables.
29270 + */
29271 + acpi_boot_init();
29272 +#endif
29273 +
29274 + init_cpu_to_node();
29275 +
29276 +#ifdef CONFIG_X86_LOCAL_APIC
29277 + /*
29278 + * get boot-time SMP configuration:
29279 + */
29280 + if (smp_found_config)
29281 + get_smp_config();
29282 +#ifndef CONFIG_XEN
29283 + init_apic_mappings();
29284 +#endif
29285 +#endif
29286 +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
29287 + prefill_possible_map();
29288 +#endif
29289 +
29290 + /*
29291 + * Request address space for all standard RAM and ROM resources
29292 + * and also for regions reported as reserved by the e820.
29293 + */
29294 + probe_roms();
29295 +#ifdef CONFIG_XEN
29296 + if (is_initial_xendomain())
29297 + e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
29298 +#else
29299 + e820_reserve_resources(e820.map, e820.nr_map);
29300 +#endif
29301 +
29302 + request_resource(&iomem_resource, &video_ram_resource);
29303 +
29304 + {
29305 + unsigned i;
29306 + /* request I/O space for devices used on all i[345]86 PCs */
29307 + for (i = 0; i < STANDARD_IO_RESOURCES; i++)
29308 + request_resource(&ioport_resource, &standard_io_resources[i]);
29309 + }
29310 +
29311 +#ifdef CONFIG_XEN
29312 + if (is_initial_xendomain())
29313 + e820_setup_gap(machine_e820.map, machine_e820.nr_map);
29314 +#else
29315 + e820_setup_gap(e820.map, e820.nr_map);
29316 +#endif
29317 +
29318 +#ifdef CONFIG_XEN
29319 + {
29320 + struct physdev_set_iopl set_iopl;
29321 +
29322 + set_iopl.iopl = 1;
29323 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
29324 +
29325 + if (is_initial_xendomain()) {
29326 +#ifdef CONFIG_VT
29327 +#if defined(CONFIG_VGA_CONSOLE)
29328 + conswitchp = &vga_con;
29329 +#elif defined(CONFIG_DUMMY_CONSOLE)
29330 + conswitchp = &dummy_con;
29331 +#endif
29332 +#endif
29333 + } else {
29334 +#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
29335 + conswitchp = &dummy_con;
29336 +#endif
29337 + }
29338 + }
29339 +#else /* CONFIG_XEN */
29340 +
29341 +#ifdef CONFIG_VT
29342 +#if defined(CONFIG_VGA_CONSOLE)
29343 + conswitchp = &vga_con;
29344 +#elif defined(CONFIG_DUMMY_CONSOLE)
29345 + conswitchp = &dummy_con;
29346 +#endif
29347 +#endif
29348 +
29349 +#endif /* !CONFIG_XEN */
29350 +}
29351 +
29352 +#ifdef CONFIG_XEN
29353 +static int
29354 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
29355 +{
29356 + HYPERVISOR_shutdown(SHUTDOWN_crash);
29357 + /* we're never actually going to get here... */
29358 + return NOTIFY_DONE;
29359 +}
29360 +#endif /* CONFIG_XEN */
29361 +
29362 +
29363 +static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
29364 +{
29365 + unsigned int *v;
29366 +
29367 + if (c->extended_cpuid_level < 0x80000004)
29368 + return 0;
29369 +
29370 + v = (unsigned int *) c->x86_model_id;
29371 + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
29372 + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
29373 + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
29374 + c->x86_model_id[48] = 0;
29375 + return 1;
29376 +}
29377 +
29378 +
29379 +static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
29380 +{
29381 + unsigned int n, dummy, eax, ebx, ecx, edx;
29382 +
29383 + n = c->extended_cpuid_level;
29384 +
29385 + if (n >= 0x80000005) {
29386 + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
29387 + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
29388 + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
29389 + c->x86_cache_size=(ecx>>24)+(edx>>24);
29390 + /* On K8 L1 TLB is inclusive, so don't count it */
29391 + c->x86_tlbsize = 0;
29392 + }
29393 +
29394 + if (n >= 0x80000006) {
29395 + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
29396 + ecx = cpuid_ecx(0x80000006);
29397 + c->x86_cache_size = ecx >> 16;
29398 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
29399 +
29400 + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
29401 + c->x86_cache_size, ecx & 0xFF);
29402 + }
29403 +
29404 + if (n >= 0x80000007)
29405 + cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
29406 + if (n >= 0x80000008) {
29407 + cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
29408 + c->x86_virt_bits = (eax >> 8) & 0xff;
29409 + c->x86_phys_bits = eax & 0xff;
29410 + }
29411 +}
29412 +
29413 +#ifdef CONFIG_NUMA
29414 +static int nearby_node(int apicid)
29415 +{
29416 + int i;
29417 + for (i = apicid - 1; i >= 0; i--) {
29418 + int node = apicid_to_node[i];
29419 + if (node != NUMA_NO_NODE && node_online(node))
29420 + return node;
29421 + }
29422 + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
29423 + int node = apicid_to_node[i];
29424 + if (node != NUMA_NO_NODE && node_online(node))
29425 + return node;
29426 + }
29427 + return first_node(node_online_map); /* Shouldn't happen */
29428 +}
29429 +#endif
29430 +
29431 +/*
29432 + * On an AMD dual-core setup the lower bits of the APIC id distinguish the cores.
29433 + * Assumes the number of cores is a power of two.
29434 + */
29435 +static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
29436 +{
29437 +#ifdef CONFIG_SMP
29438 + unsigned bits;
29439 +#ifdef CONFIG_NUMA
29440 + int cpu = smp_processor_id();
29441 + int node = 0;
29442 + unsigned apicid = hard_smp_processor_id();
29443 +#endif
29444 + unsigned ecx = cpuid_ecx(0x80000008);
29445 +
29446 + c->x86_max_cores = (ecx & 0xff) + 1;
29447 +
29448 + /* CPU telling us the core id bits shift? */
29449 + bits = (ecx >> 12) & 0xF;
29450 +
29451 + /* Otherwise recompute */
29452 + if (bits == 0) {
29453 + while ((1 << bits) < c->x86_max_cores)
29454 + bits++;
29455 + }
29456 +
29457 + /* Low order bits define the core id (index of core in socket) */
29458 + c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
29459 + /* Convert the APIC ID into the socket ID */
29460 + c->phys_proc_id = phys_pkg_id(bits);
29461 +
29462 +#ifdef CONFIG_NUMA
29463 + node = c->phys_proc_id;
29464 + if (apicid_to_node[apicid] != NUMA_NO_NODE)
29465 + node = apicid_to_node[apicid];
29466 + if (!node_online(node)) {
29467 + /* Two possibilities here:
29468 + - The CPU is missing memory and no node was created.
29469 + In that case try picking one from a nearby CPU
29470 + - The APIC IDs differ from the HyperTransport node IDs
29471 + which the K8 northbridge parsing fills in.
29472 + Assume they are all increased by a constant offset,
29473 + but in the same order as the HT nodeids.
29474 + If that doesn't result in a usable node fall back to the
29475 + path for the previous case. */
29476 + int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits);
29477 + if (ht_nodeid >= 0 &&
29478 + apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
29479 + node = apicid_to_node[ht_nodeid];
29480 + /* Pick a nearby node */
29481 + if (!node_online(node))
29482 + node = nearby_node(apicid);
29483 + }
29484 + numa_set_node(cpu, node);
29485 +
29486 + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
29487 +#endif
29488 +#endif
29489 +}
29490 +
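amd_detect_cmp() above takes the core count and an optional core-id bit width from CPUID 0x80000008 ECX, falls back to rounding the core count up to a power of two when the width field is zero, and then splits the initial APIC id into a core id (low bits) and a socket id (remaining bits). A standalone sketch of that bit slicing with made-up register values:

    #include <stdio.h>

    int main(void)
    {
        unsigned ecx    = 0x00000003;   /* hypothetical CPUID 0x80000008 ECX */
        unsigned apicid = 0x05;         /* hypothetical initial APIC id      */

        unsigned max_cores = (ecx & 0xff) + 1;   /* 4 cores per package      */
        unsigned bits = (ecx >> 12) & 0xf;       /* core-id bit width hint   */

        if (bits == 0)                           /* fall back: ceil(log2)    */
            while ((1u << bits) < max_cores)
                bits++;

        unsigned core_id = apicid & ((1u << bits) - 1); /* low bits: core    */
        unsigned pkg_id  = apicid >> bits;              /* rest: socket      */

        printf("cores=%u bits=%u core_id=%u pkg_id=%u\n",
               max_cores, bits, core_id, pkg_id);
        return 0;
    }
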
29491 +static void __init init_amd(struct cpuinfo_x86 *c)
29492 +{
29493 + unsigned level;
29494 +
29495 +#ifdef CONFIG_SMP
29496 + unsigned long value;
29497 +
29498 + /*
29499 + * Disable TLB flush filter by setting HWCR.FFDIS on K8
29500 + * bit 6 of msr C001_0015
29501 + *
29502 + * Errata 63 for SH-B3 steppings
29503 + * Errata 122 for all steppings (F+ have it disabled by default)
29504 + */
29505 + if (c->x86 == 15) {
29506 + rdmsrl(MSR_K8_HWCR, value);
29507 + value |= 1 << 6;
29508 + wrmsrl(MSR_K8_HWCR, value);
29509 + }
29510 +#endif
29511 +
29512 + /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
29513 + 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
29514 + clear_bit(0*32+31, &c->x86_capability);
29515 +
29516 + /* On C+ stepping K8 rep microcode works well for copy/memset */
29517 + level = cpuid_eax(1);
29518 + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
29519 + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
29520 +
29521 + /* Enable workaround for FXSAVE leak */
29522 + if (c->x86 >= 6)
29523 + set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
29524 +
29525 + level = get_model_name(c);
29526 + if (!level) {
29527 + switch (c->x86) {
29528 + case 15:
29529 + /* Should distinguish models here, but this is only
29530 + a fallback anyway. */
29531 + strcpy(c->x86_model_id, "Hammer");
29532 + break;
29533 + }
29534 + }
29535 + display_cacheinfo(c);
29536 +
29537 + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
29538 + if (c->x86_power & (1<<8))
29539 + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
29540 +
29541 + /* Multi core CPU? */
29542 + if (c->extended_cpuid_level >= 0x80000008)
29543 + amd_detect_cmp(c);
29544 +
29545 + /* Fix cpuid4 emulation for more */
29546 + num_cache_leaves = 3;
29547 +}
29548 +
29549 +static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
29550 +{
29551 +#ifdef CONFIG_SMP
29552 + u32 eax, ebx, ecx, edx;
29553 + int index_msb, core_bits;
29554 +
29555 + cpuid(1, &eax, &ebx, &ecx, &edx);
29556 +
29557 +
29558 + if (!cpu_has(c, X86_FEATURE_HT))
29559 + return;
29560 + if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
29561 + goto out;
29562 +
29563 + smp_num_siblings = (ebx & 0xff0000) >> 16;
29564 +
29565 + if (smp_num_siblings == 1) {
29566 + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
29567 + } else if (smp_num_siblings > 1 ) {
29568 +
29569 + if (smp_num_siblings > NR_CPUS) {
29570 + printk(KERN_WARNING "CPU: Unsupported number of siblings %d\n", smp_num_siblings);
29571 + smp_num_siblings = 1;
29572 + return;
29573 + }
29574 +
29575 + index_msb = get_count_order(smp_num_siblings);
29576 + c->phys_proc_id = phys_pkg_id(index_msb);
29577 +
29578 + smp_num_siblings = smp_num_siblings / c->x86_max_cores;
29579 +
29580 + index_msb = get_count_order(smp_num_siblings) ;
29581 +
29582 + core_bits = get_count_order(c->x86_max_cores);
29583 +
29584 + c->cpu_core_id = phys_pkg_id(index_msb) &
29585 + ((1 << core_bits) - 1);
29586 + }
29587 +out:
29588 + if ((c->x86_max_cores * smp_num_siblings) > 1) {
29589 + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
29590 + printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id);
29591 + }
29592 +
29593 +#endif
29594 +}
29595 +
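detect_ht() above uses get_count_order() — the number of bits needed to hold a count — to decide how many low APIC-id bits belong to the thread and core fields before slicing out phys_proc_id and cpu_core_id. A tiny illustration with made-up counts (8 threads and 4 cores per package, so 2 threads per core):

    #include <stdio.h>

    /* Smallest n with (1 << n) >= x, roughly what get_count_order() returns. */
    static unsigned count_order(unsigned x)
    {
        unsigned n = 0;
        while ((1u << n) < x)
            n++;
        return n;
    }

    int main(void)
    {
        unsigned siblings = 8, cores = 4;    /* hypothetical topology        */
        unsigned apicid   = 0x0b;            /* hypothetical initial APIC id */

        unsigned pkg_shift = count_order(siblings);            /* 3 */
        unsigned thr_bits  = count_order(siblings / cores);    /* 1 */
        unsigned core_bits = count_order(cores);               /* 2 */

        printf("package %u, core %u\n",
               apicid >> pkg_shift,
               (apicid >> thr_bits) & ((1u << core_bits) - 1));
        return 0;
    }
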
29596 +/*
29597 + * find out the number of processor cores on the die
29598 + */
29599 +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
29600 +{
29601 + unsigned int eax, t;
29602 +
29603 + if (c->cpuid_level < 4)
29604 + return 1;
29605 +
29606 + cpuid_count(4, 0, &eax, &t, &t, &t);
29607 +
29608 + if (eax & 0x1f)
29609 + return ((eax >> 26) + 1);
29610 + else
29611 + return 1;
29612 +}
29613 +
29614 +static void srat_detect_node(void)
29615 +{
29616 +#ifdef CONFIG_NUMA
29617 + unsigned node;
29618 + int cpu = smp_processor_id();
29619 + int apicid = hard_smp_processor_id();
29620 +
29621 + /* For now, don't do the funky fallback heuristics the AMD
29622 + version employs. */
29623 + node = apicid_to_node[apicid];
29624 + if (node == NUMA_NO_NODE)
29625 + node = first_node(node_online_map);
29626 + numa_set_node(cpu, node);
29627 +
29628 + if (acpi_numa > 0)
29629 + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
29630 +#endif
29631 +}
29632 +
29633 +static void __cpuinit init_intel(struct cpuinfo_x86 *c)
29634 +{
29635 + /* Cache sizes */
29636 + unsigned n;
29637 +
29638 + init_intel_cacheinfo(c);
29639 + if (c->cpuid_level > 9 ) {
29640 + unsigned eax = cpuid_eax(10);
29641 + /* Check for version and the number of counters */
29642 + if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
29643 + set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
29644 + }
29645 +
29646 + n = c->extended_cpuid_level;
29647 + if (n >= 0x80000008) {
29648 + unsigned eax = cpuid_eax(0x80000008);
29649 + c->x86_virt_bits = (eax >> 8) & 0xff;
29650 + c->x86_phys_bits = eax & 0xff;
29651 + /* CPUID workaround for Intel 0F34 CPU */
29652 + if (c->x86_vendor == X86_VENDOR_INTEL &&
29653 + c->x86 == 0xF && c->x86_model == 0x3 &&
29654 + c->x86_mask == 0x4)
29655 + c->x86_phys_bits = 36;
29656 + }
29657 +
29658 + if (c->x86 == 15)
29659 + c->x86_cache_alignment = c->x86_clflush_size * 2;
29660 + if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
29661 + (c->x86 == 0x6 && c->x86_model >= 0x0e))
29662 + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
29663 + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
29664 + c->x86_max_cores = intel_num_cpu_cores(c);
29665 +
29666 + srat_detect_node();
29667 +}
29668 +
29669 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
29670 +{
29671 + char *v = c->x86_vendor_id;
29672 +
29673 + if (!strcmp(v, "AuthenticAMD"))
29674 + c->x86_vendor = X86_VENDOR_AMD;
29675 + else if (!strcmp(v, "GenuineIntel"))
29676 + c->x86_vendor = X86_VENDOR_INTEL;
29677 + else
29678 + c->x86_vendor = X86_VENDOR_UNKNOWN;
29679 +}
29680 +
29681 +struct cpu_model_info {
29682 + int vendor;
29683 + int family;
29684 + char *model_names[16];
29685 +};
29686 +
29687 +/* Do some early cpuid on the boot CPU to get some parameters that are
29688 + needed before check_bugs. Everything advanced is in identify_cpu
29689 + below. */
29690 +void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
29691 +{
29692 + u32 tfms;
29693 +
29694 + c->loops_per_jiffy = loops_per_jiffy;
29695 + c->x86_cache_size = -1;
29696 + c->x86_vendor = X86_VENDOR_UNKNOWN;
29697 + c->x86_model = c->x86_mask = 0; /* So far unknown... */
29698 + c->x86_vendor_id[0] = '\0'; /* Unset */
29699 + c->x86_model_id[0] = '\0'; /* Unset */
29700 + c->x86_clflush_size = 64;
29701 + c->x86_cache_alignment = c->x86_clflush_size;
29702 + c->x86_max_cores = 1;
29703 + c->extended_cpuid_level = 0;
29704 + memset(&c->x86_capability, 0, sizeof c->x86_capability);
29705 +
29706 + /* Get vendor name */
29707 + cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
29708 + (unsigned int *)&c->x86_vendor_id[0],
29709 + (unsigned int *)&c->x86_vendor_id[8],
29710 + (unsigned int *)&c->x86_vendor_id[4]);
29711 +
29712 + get_cpu_vendor(c);
29713 +
29714 + /* Initialize the standard set of capabilities */
29715 + /* Note that the vendor-specific code below might override */
29716 +
29717 + /* Intel-defined flags: level 0x00000001 */
29718 + if (c->cpuid_level >= 0x00000001) {
29719 + __u32 misc;
29720 + cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
29721 + &c->x86_capability[0]);
29722 + c->x86 = (tfms >> 8) & 0xf;
29723 + c->x86_model = (tfms >> 4) & 0xf;
29724 + c->x86_mask = tfms & 0xf;
29725 + if (c->x86 == 0xf)
29726 + c->x86 += (tfms >> 20) & 0xff;
29727 + if (c->x86 >= 0x6)
29728 + c->x86_model += ((tfms >> 16) & 0xF) << 4;
29729 + if (c->x86_capability[0] & (1<<19))
29730 + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
29731 + } else {
29732 + /* Have CPUID level 0 only - unheard of */
29733 + c->x86 = 4;
29734 + }
29735 +
29736 +#ifdef CONFIG_SMP
29737 + c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
29738 +#endif
29739 +}
29740 +
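early_identify_cpu() above decodes the CPUID leaf 1 signature in EAX: bits 3-0 are the stepping, 7-4 the base model, 11-8 the base family, with the extended family added when the base family is 0xF and the extended model prepended for families of 6 and above. The same decode run on a made-up signature value:

    #include <stdio.h>

    int main(void)
    {
        unsigned tfms = 0x00020f12;      /* hypothetical CPUID.1:EAX value */

        unsigned family   = (tfms >> 8) & 0xf;
        unsigned model    = (tfms >> 4) & 0xf;
        unsigned stepping = tfms & 0xf;

        if (family == 0xf)               /* extended family is additive       */
            family += (tfms >> 20) & 0xff;
        if (family >= 0x6)               /* extended model is the high nibble */
            model += ((tfms >> 16) & 0xf) << 4;

        printf("family %#x, model %#x, stepping %#x\n",
               family, model, stepping);
        return 0;
    }
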
29741 +/*
29742 + * This does the hard work of actually picking apart the CPU stuff...
29743 + */
29744 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
29745 +{
29746 + int i;
29747 + u32 xlvl;
29748 +
29749 + early_identify_cpu(c);
29750 +
29751 + /* AMD-defined flags: level 0x80000001 */
29752 + xlvl = cpuid_eax(0x80000000);
29753 + c->extended_cpuid_level = xlvl;
29754 + if ((xlvl & 0xffff0000) == 0x80000000) {
29755 + if (xlvl >= 0x80000001) {
29756 + c->x86_capability[1] = cpuid_edx(0x80000001);
29757 + c->x86_capability[6] = cpuid_ecx(0x80000001);
29758 + }
29759 + if (xlvl >= 0x80000004)
29760 + get_model_name(c); /* Default name */
29761 + }
29762 +
29763 + /* Transmeta-defined flags: level 0x80860001 */
29764 + xlvl = cpuid_eax(0x80860000);
29765 + if ((xlvl & 0xffff0000) == 0x80860000) {
29766 + /* For now, don't set x86_cpuid_level here, to avoid confusion. */
29767 + if (xlvl >= 0x80860001)
29768 + c->x86_capability[2] = cpuid_edx(0x80860001);
29769 + }
29770 +
29771 + c->apicid = phys_pkg_id(0);
29772 +
29773 + /*
29774 + * Vendor-specific initialization. In this section we
29775 + * canonicalize the feature flags, meaning if there are
29776 + * features a certain CPU supports which CPUID doesn't
29777 + * tell us, CPUID claiming incorrect flags, or other bugs,
29778 + * we handle them here.
29779 + *
29780 + * At the end of this section, c->x86_capability better
29781 + * indicate the features this CPU genuinely supports!
29782 + */
29783 + switch (c->x86_vendor) {
29784 + case X86_VENDOR_AMD:
29785 + init_amd(c);
29786 + break;
29787 +
29788 + case X86_VENDOR_INTEL:
29789 + init_intel(c);
29790 + break;
29791 +
29792 + case X86_VENDOR_UNKNOWN:
29793 + default:
29794 + display_cacheinfo(c);
29795 + break;
29796 + }
29797 +
29798 + select_idle_routine(c);
29799 + detect_ht(c);
29800 +
29801 + /*
29802 + * On SMP, boot_cpu_data holds the common feature set between
29803 + * all CPUs; so make sure that we indicate which features are
29804 + * common between the CPUs. The first time this routine gets
29805 + * executed, c == &boot_cpu_data.
29806 + */
29807 + if (c != &boot_cpu_data) {
29808 + /* AND the already accumulated flags with these */
29809 + for (i = 0 ; i < NCAPINTS ; i++)
29810 + boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
29811 + }
29812 +
29813 +#ifdef CONFIG_X86_MCE
29814 + mcheck_init(c);
29815 +#endif
29816 + if (c == &boot_cpu_data)
29817 + mtrr_bp_init();
29818 + else
29819 + mtrr_ap_init();
29820 +#ifdef CONFIG_NUMA
29821 + numa_add_cpu(smp_processor_id());
29822 +#endif
29823 +}
29824 +
29825 +
29826 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
29827 +{
29828 + if (c->x86_model_id[0])
29829 + printk("%s", c->x86_model_id);
29830 +
29831 + if (c->x86_mask || c->cpuid_level >= 0)
29832 + printk(" stepping %02x\n", c->x86_mask);
29833 + else
29834 + printk("\n");
29835 +}
29836 +
29837 +/*
29838 + * Get CPU information for use by the procfs.
29839 + */
29840 +
29841 +static int show_cpuinfo(struct seq_file *m, void *v)
29842 +{
29843 + struct cpuinfo_x86 *c = v;
29844 +
29845 + /*
29846 + * These flag bits must match the definitions in <asm/cpufeature.h>.
29847 + * NULL means this bit is undefined or reserved; either way it doesn't
29848 + * have meaning as far as Linux is concerned. Note that it's important
29849 + * to realize there is a difference between this table and CPUID -- if
29850 + * applications want to get the raw CPUID data, they should access
29851 + * /dev/cpu/<cpu_nr>/cpuid instead.
29852 + */
29853 + static char *x86_cap_flags[] = {
29854 + /* Intel-defined */
29855 + "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
29856 + "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
29857 + "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
29858 + "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL,
29859 +
29860 + /* AMD-defined */
29861 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29862 + NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
29863 + NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
29864 + NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow",
29865 +
29866 + /* Transmeta-defined */
29867 + "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
29868 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29869 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29870 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29871 +
29872 + /* Other (Linux-defined) */
29873 + "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL,
29874 + "constant_tsc", NULL, NULL,
29875 + "up", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29876 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29877 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29878 +
29879 + /* Intel-defined (#2) */
29880 + "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
29881 + "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
29882 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29883 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29884 +
29885 + /* VIA/Cyrix/Centaur-defined */
29886 + NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
29887 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29888 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29889 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29890 +
29891 + /* AMD-defined (#2) */
29892 + "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL,
29893 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29894 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29895 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29896 + };
29897 + static char *x86_power_flags[] = {
29898 + "ts", /* temperature sensor */
29899 + "fid", /* frequency id control */
29900 + "vid", /* voltage id control */
29901 + "ttp", /* thermal trip */
29902 + "tm",
29903 + "stc",
29904 + NULL,
29905 + /* nothing */ /* constant_tsc - moved to flags */
29906 + };
29907 +
29908 +
29909 +#ifdef CONFIG_SMP
29910 + if (!cpu_online(c-cpu_data))
29911 + return 0;
29912 +#endif
29913 +
29914 + seq_printf(m,"processor\t: %u\n"
29915 + "vendor_id\t: %s\n"
29916 + "cpu family\t: %d\n"
29917 + "model\t\t: %d\n"
29918 + "model name\t: %s\n",
29919 + (unsigned)(c-cpu_data),
29920 + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
29921 + c->x86,
29922 + (int)c->x86_model,
29923 + c->x86_model_id[0] ? c->x86_model_id : "unknown");
29924 +
29925 + if (c->x86_mask || c->cpuid_level >= 0)
29926 + seq_printf(m, "stepping\t: %d\n", c->x86_mask);
29927 + else
29928 + seq_printf(m, "stepping\t: unknown\n");
29929 +
29930 + if (cpu_has(c,X86_FEATURE_TSC)) {
29931 + unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
29932 + if (!freq)
29933 + freq = cpu_khz;
29934 + seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
29935 + freq / 1000, (freq % 1000));
29936 + }
29937 +
29938 + /* Cache size */
29939 + if (c->x86_cache_size >= 0)
29940 + seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
29941 +
29942 +#ifdef CONFIG_SMP
29943 + if (smp_num_siblings * c->x86_max_cores > 1) {
29944 + int cpu = c - cpu_data;
29945 + seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
29946 + seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
29947 + seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
29948 + seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
29949 + }
29950 +#endif
29951 +
29952 + seq_printf(m,
29953 + "fpu\t\t: yes\n"
29954 + "fpu_exception\t: yes\n"
29955 + "cpuid level\t: %d\n"
29956 + "wp\t\t: yes\n"
29957 + "flags\t\t:",
29958 + c->cpuid_level);
29959 +
29960 + {
29961 + int i;
29962 + for ( i = 0 ; i < 32*NCAPINTS ; i++ )
29963 + if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
29964 + seq_printf(m, " %s", x86_cap_flags[i]);
29965 + }
29966 +
29967 + seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
29968 + c->loops_per_jiffy/(500000/HZ),
29969 + (c->loops_per_jiffy/(5000/HZ)) % 100);
29970 +
29971 + if (c->x86_tlbsize > 0)
29972 + seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
29973 + seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
29974 + seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
29975 +
29976 + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
29977 + c->x86_phys_bits, c->x86_virt_bits);
29978 +
29979 + seq_printf(m, "power management:");
29980 + {
29981 + unsigned i;
29982 + for (i = 0; i < 32; i++)
29983 + if (c->x86_power & (1 << i)) {
29984 + if (i < ARRAY_SIZE(x86_power_flags) &&
29985 + x86_power_flags[i])
29986 + seq_printf(m, "%s%s",
29987 + x86_power_flags[i][0]?" ":"",
29988 + x86_power_flags[i]);
29989 + else
29990 + seq_printf(m, " [%d]", i);
29991 + }
29992 + }
29993 +
29994 + seq_printf(m, "\n\n");
29995 +
29996 + return 0;
29997 +}
29998 +
29999 +static void *c_start(struct seq_file *m, loff_t *pos)
30000 +{
30001 + return *pos < NR_CPUS ? cpu_data + *pos : NULL;
30002 +}
30003 +
30004 +static void *c_next(struct seq_file *m, void *v, loff_t *pos)
30005 +{
30006 + ++*pos;
30007 + return c_start(m, pos);
30008 +}
30009 +
30010 +static void c_stop(struct seq_file *m, void *v)
30011 +{
30012 +}
30013 +
30014 +struct seq_operations cpuinfo_op = {
30015 + .start =c_start,
30016 + .next = c_next,
30017 + .stop = c_stop,
30018 + .show = show_cpuinfo,
30019 +};
30020 +
30021 +#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE)
30022 +#include <linux/platform_device.h>
30023 +static __init int add_pcspkr(void)
30024 +{
30025 + struct platform_device *pd;
30026 + int ret;
30027 +
30028 + if (!is_initial_xendomain())
30029 + return 0;
30030 +
30031 + pd = platform_device_alloc("pcspkr", -1);
30032 + if (!pd)
30033 + return -ENOMEM;
30034 +
30035 + ret = platform_device_add(pd);
30036 + if (ret)
30037 + platform_device_put(pd);
30038 +
30039 + return ret;
30040 +}
30041 +device_initcall(add_pcspkr);
30042 +#endif
30043 Index: head-2008-11-25/arch/x86/kernel/setup64-xen.c
30044 ===================================================================
30045 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
30046 +++ head-2008-11-25/arch/x86/kernel/setup64-xen.c 2008-01-28 12:24:19.000000000 +0100
30047 @@ -0,0 +1,367 @@
30048 +/*
30049 + * X86-64 specific CPU setup.
30050 + * Copyright (C) 1995 Linus Torvalds
30051 + * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
30052 + * See setup.c for older changelog.
30053 + *
30054 + * Jun Nakajima <jun.nakajima@intel.com>
30055 + * Modified for Xen
30056 + *
30057 + */
30058 +#include <linux/init.h>
30059 +#include <linux/kernel.h>
30060 +#include <linux/sched.h>
30061 +#include <linux/string.h>
30062 +#include <linux/bootmem.h>
30063 +#include <linux/bitops.h>
30064 +#include <linux/module.h>
30065 +#include <asm/bootsetup.h>
30066 +#include <asm/pda.h>
30067 +#include <asm/pgtable.h>
30068 +#include <asm/processor.h>
30069 +#include <asm/desc.h>
30070 +#include <asm/atomic.h>
30071 +#include <asm/mmu_context.h>
30072 +#include <asm/smp.h>
30073 +#include <asm/i387.h>
30074 +#include <asm/percpu.h>
30075 +#include <asm/proto.h>
30076 +#include <asm/sections.h>
30077 +#ifdef CONFIG_XEN
30078 +#include <asm/hypervisor.h>
30079 +#endif
30080 +
30081 +char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
30082 +
30083 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
30084 +
30085 +struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
30086 +EXPORT_SYMBOL(_cpu_pda);
30087 +struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
30088 +
30089 +#ifndef CONFIG_X86_NO_IDT
30090 +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
30091 +#endif
30092 +
30093 +char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
30094 +
30095 +unsigned long __supported_pte_mask __read_mostly = ~0UL;
30096 +EXPORT_SYMBOL(__supported_pte_mask);
30097 +static int do_not_nx __cpuinitdata = 0;
30098 +
30099 +/* noexec=on|off
30100 +Control non-executable mappings for 64-bit processes.
30101 +
30102 +on Enable (default)
30103 +off Disable
30104 +*/
30105 +int __init nonx_setup(char *str)
30106 +{
30107 + if (!strncmp(str, "on", 2)) {
30108 + __supported_pte_mask |= _PAGE_NX;
30109 + do_not_nx = 0;
30110 + } else if (!strncmp(str, "off", 3)) {
30111 + do_not_nx = 1;
30112 + __supported_pte_mask &= ~_PAGE_NX;
30113 + }
30114 + return 1;
30115 +}
30116 +__setup("noexec=", nonx_setup); /* parsed early actually */
30117 +
30118 +int force_personality32 = 0;
30119 +
30120 +/* noexec32=on|off
30121 +Control non-executable heap for 32-bit processes.
30122 +To control the stack too, use noexec=off
30123 +
30124 +on PROT_READ does not imply PROT_EXEC for 32bit processes
30125 +off PROT_READ implies PROT_EXEC (default)
30126 +*/
30127 +static int __init nonx32_setup(char *str)
30128 +{
30129 + if (!strcmp(str, "on"))
30130 + force_personality32 &= ~READ_IMPLIES_EXEC;
30131 + else if (!strcmp(str, "off"))
30132 + force_personality32 |= READ_IMPLIES_EXEC;
30133 + return 1;
30134 +}
30135 +__setup("noexec32=", nonx32_setup);
30136 +
30137 +/*
30138 + * Great future plan:
30139 + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
30140 + * Always point %gs to its beginning
30141 + */
30142 +void __init setup_per_cpu_areas(void)
30143 +{
30144 + int i;
30145 + unsigned long size;
30146 +
30147 +#ifdef CONFIG_HOTPLUG_CPU
30148 + prefill_possible_map();
30149 +#endif
30150 +
30151 + /* Copy section for each CPU (we discard the original) */
30152 + size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
30153 +#ifdef CONFIG_MODULES
30154 + if (size < PERCPU_ENOUGH_ROOM)
30155 + size = PERCPU_ENOUGH_ROOM;
30156 +#endif
30157 +
30158 + for_each_cpu_mask (i, cpu_possible_map) {
30159 + char *ptr;
30160 +
30161 + if (!NODE_DATA(cpu_to_node(i))) {
30162 + printk("cpu with no node %d, num_online_nodes %d\n",
30163 + i, num_online_nodes());
30164 + ptr = alloc_bootmem(size);
30165 + } else {
30166 + ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
30167 + }
30168 + if (!ptr)
30169 + panic("Cannot allocate cpu data for CPU %d\n", i);
30170 + cpu_pda(i)->data_offset = ptr - __per_cpu_start;
30171 + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
30172 + }
30173 +}
30174 +
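setup_per_cpu_areas() above gives every possible CPU a private copy of the original per-CPU data section and records where that copy lives, relative to the original, in cpu_pda(i)->data_offset; a per-CPU access then just adds that offset to the variable's link-time address. A schematic of that offset trick (not the kernel's actual macros; the section and variable here are stand-ins):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define NR_CPUS 4

    static char percpu_section[64];      /* stand-in for .data.percpu        */
    static long data_offset[NR_CPUS];    /* like cpu_pda(i)->data_offset     */

    /* Resolve a per-CPU variable: link-time address plus the CPU's offset. */
    static void *per_cpu_ptr_sketch(void *var, int cpu)
    {
        return (char *)var + data_offset[cpu];
    }

    int main(void)
    {
        /* One private copy of the section per CPU, as in the loop above.
         * (The cross-object pointer difference mirrors the kernel trick;
         * strict ISO C would frown on it.) */
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
            char *copy = malloc(sizeof(percpu_section));
            memcpy(copy, percpu_section, sizeof(percpu_section));
            data_offset[cpu] = copy - percpu_section;
        }

        int *var = (int *)&percpu_section[16];   /* a "per-CPU variable" slot */
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            *(int *)per_cpu_ptr_sketch(var, cpu) = 100 + cpu;
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            printf("cpu%d sees %d\n", cpu, *(int *)per_cpu_ptr_sketch(var, cpu));
        return 0;
    }
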
30175 +#ifdef CONFIG_XEN
30176 +static void switch_pt(void)
30177 +{
30178 + xen_pt_switch(__pa_symbol(init_level4_pgt));
30179 + xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
30180 +}
30181 +
30182 +static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
30183 +{
30184 + unsigned long frames[16];
30185 + unsigned long va;
30186 + int f;
30187 +
30188 + for (va = gdt_descr->address, f = 0;
30189 + va < gdt_descr->address + gdt_descr->size;
30190 + va += PAGE_SIZE, f++) {
30191 + frames[f] = virt_to_mfn(va);
30192 + make_page_readonly(
30193 + (void *)va, XENFEAT_writable_descriptor_tables);
30194 + }
30195 + if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) /
30196 + sizeof (struct desc_struct)))
30197 + BUG();
30198 +}
30199 +#else
30200 +static void switch_pt(void)
30201 +{
30202 + asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
30203 +}
30204 +
30205 +static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
30206 +{
30207 + asm volatile("lgdt %0" :: "m" (*gdt_descr));
30208 + asm volatile("lidt %0" :: "m" (idt_descr));
30209 +}
30210 +#endif
30211 +
30212 +void pda_init(int cpu)
30213 +{
30214 + struct x8664_pda *pda = cpu_pda(cpu);
30215 +
30216 + /* Set up data that may be needed in __get_free_pages early */
30217 + asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
30218 +#ifndef CONFIG_XEN
30219 + wrmsrl(MSR_GS_BASE, pda);
30220 +#else
30221 + if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
30222 + (unsigned long)pda))
30223 + BUG();
30224 +#endif
30225 + pda->cpunumber = cpu;
30226 + pda->irqcount = -1;
30227 + pda->kernelstack =
30228 + (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
30229 + pda->active_mm = &init_mm;
30230 + pda->mmu_state = 0;
30231 +
30232 + if (cpu == 0) {
30233 +#ifdef CONFIG_XEN
30234 + xen_init_pt();
30235 +#endif
30236 + /* others are initialized in smpboot.c */
30237 + pda->pcurrent = &init_task;
30238 + pda->irqstackptr = boot_cpu_stack;
30239 + } else {
30240 + pda->irqstackptr = (char *)
30241 + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
30242 + if (!pda->irqstackptr)
30243 + panic("cannot allocate irqstack for cpu %d", cpu);
30244 + }
30245 +
30246 + switch_pt();
30247 +
30248 + pda->irqstackptr += IRQSTACKSIZE-64;
30249 +}
30250 +
30251 +#ifndef CONFIG_X86_NO_TSS
30252 +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
30253 +__attribute__((section(".bss.page_aligned")));
30254 +#endif
30255 +
30256 +/* May not be marked __init: used by software suspend */
30257 +void syscall_init(void)
30258 +{
30259 +#ifndef CONFIG_XEN
30260 + /*
30261 + * LSTAR and STAR live in a somewhat strange symbiosis.
30262 + * They both write to the same internal register. STAR allows setting CS/DS,
30263 + * but only a 32-bit target. LSTAR sets the 64-bit rip.
30264 + */
30265 + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
30266 + wrmsrl(MSR_LSTAR, system_call);
30267 +
30268 + /* Flags to clear on syscall */
30269 + wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
30270 +#endif
30271 +#ifdef CONFIG_IA32_EMULATION
30272 + syscall32_cpu_init ();
30273 +#endif
30274 +}
30275 +
30276 +void __cpuinit check_efer(void)
30277 +{
30278 + unsigned long efer;
30279 +
30280 + rdmsrl(MSR_EFER, efer);
30281 + if (!(efer & EFER_NX) || do_not_nx) {
30282 + __supported_pte_mask &= ~_PAGE_NX;
30283 + }
30284 +}
30285 +
30286 +unsigned long kernel_eflags;
30287 +
30288 +/*
30289 + * cpu_init() initializes state that is per-CPU. Some data is already
30290 + * initialized (naturally) in the bootstrap process, such as the GDT
30291 + * and IDT. We reload them nevertheless, this function acts as a
30292 + * 'CPU state barrier', nothing should get across.
30293 + * A lot of state is already set up in PDA init.
30294 + */
30295 +void __cpuinit cpu_init (void)
30296 +{
30297 + int cpu = stack_smp_processor_id();
30298 +#ifndef CONFIG_X86_NO_TSS
30299 + struct tss_struct *t = &per_cpu(init_tss, cpu);
30300 + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
30301 + unsigned long v;
30302 + char *estacks = NULL;
30303 + unsigned i;
30304 +#endif
30305 + struct task_struct *me;
30306 +
30307 + /* CPU 0 is initialised in head64.c */
30308 + if (cpu != 0) {
30309 + pda_init(cpu);
30310 + zap_low_mappings(cpu);
30311 + }
30312 +#ifndef CONFIG_X86_NO_TSS
30313 + else
30314 + estacks = boot_exception_stacks;
30315 +#endif
30316 +
30317 + me = current;
30318 +
30319 + if (cpu_test_and_set(cpu, cpu_initialized))
30320 + panic("CPU#%d already initialized!\n", cpu);
30321 +
30322 + printk("Initializing CPU#%d\n", cpu);
30323 +
30324 + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
30325 +
30326 + /*
30327 + * Initialize the per-CPU GDT with the boot GDT,
30328 + * and set up the GDT descriptor:
30329 + */
30330 +#ifndef CONFIG_XEN
30331 + if (cpu)
30332 + memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
30333 +#endif
30334 +
30335 + cpu_gdt_descr[cpu].size = GDT_SIZE;
30336 + cpu_gdt_init(&cpu_gdt_descr[cpu]);
30337 +
30338 + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
30339 + syscall_init();
30340 +
30341 + wrmsrl(MSR_FS_BASE, 0);
30342 + wrmsrl(MSR_KERNEL_GS_BASE, 0);
30343 + barrier();
30344 +
30345 + check_efer();
30346 +
30347 +#ifndef CONFIG_X86_NO_TSS
30348 + /*
30349 + * set up and load the per-CPU TSS
30350 + */
30351 + for (v = 0; v < N_EXCEPTION_STACKS; v++) {
30352 + if (cpu) {
30353 + static const unsigned int order[N_EXCEPTION_STACKS] = {
30354 + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
30355 + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
30356 + };
30357 +
30358 + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
30359 + if (!estacks)
30360 + panic("Cannot allocate exception stack %ld %d\n",
30361 + v, cpu);
30362 + }
30363 + switch (v + 1) {
30364 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
30365 + case DEBUG_STACK:
30366 + cpu_pda(cpu)->debugstack = (unsigned long)estacks;
30367 + estacks += DEBUG_STKSZ;
30368 + break;
30369 +#endif
30370 + default:
30371 + estacks += EXCEPTION_STKSZ;
30372 + break;
30373 + }
30374 + orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
30375 + }
30376 +
30377 + t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
30378 + /*
30379 + * <= is required because the CPU will access up to
30380 + * 8 bits beyond the end of the IO permission bitmap.
30381 + */
30382 + for (i = 0; i <= IO_BITMAP_LONGS; i++)
30383 + t->io_bitmap[i] = ~0UL;
30384 +#endif
30385 +
30386 + atomic_inc(&init_mm.mm_count);
30387 + me->active_mm = &init_mm;
30388 + if (me->mm)
30389 + BUG();
30390 + enter_lazy_tlb(&init_mm, me);
30391 +
30392 +#ifndef CONFIG_X86_NO_TSS
30393 + set_tss_desc(cpu, t);
30394 +#endif
30395 +#ifndef CONFIG_XEN
30396 + load_TR_desc();
30397 +#endif
30398 + load_LDT(&init_mm.context);
30399 +
30400 + /*
30401 + * Clear all 6 debug registers:
30402 + */
30403 +
30404 + set_debugreg(0UL, 0);
30405 + set_debugreg(0UL, 1);
30406 + set_debugreg(0UL, 2);
30407 + set_debugreg(0UL, 3);
30408 + set_debugreg(0UL, 6);
30409 + set_debugreg(0UL, 7);
30410 +
30411 + fpu_init();
30412 +
30413 + raw_local_save_flags(kernel_eflags);
30414 +}
30415 Index: head-2008-11-25/arch/x86/kernel/smp_64-xen.c
30416 ===================================================================
30417 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
30418 +++ head-2008-11-25/arch/x86/kernel/smp_64-xen.c 2008-04-02 12:34:02.000000000 +0200
30419 @@ -0,0 +1,575 @@
30420 +/*
30421 + * Intel SMP support routines.
30422 + *
30423 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
30424 + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
30425 + * (c) 2002,2003 Andi Kleen, SuSE Labs.
30426 + *
30427 + * This code is released under the GNU General Public License version 2 or
30428 + * later.
30429 + */
30430 +
30431 +#include <linux/init.h>
30432 +
30433 +#include <linux/mm.h>
30434 +#include <linux/delay.h>
30435 +#include <linux/spinlock.h>
30436 +#include <linux/smp_lock.h>
30437 +#include <linux/smp.h>
30438 +#include <linux/kernel_stat.h>
30439 +#include <linux/mc146818rtc.h>
30440 +#include <linux/interrupt.h>
30441 +
30442 +#include <asm/mtrr.h>
30443 +#include <asm/pgalloc.h>
30444 +#include <asm/tlbflush.h>
30445 +#include <asm/mach_apic.h>
30446 +#include <asm/mmu_context.h>
30447 +#include <asm/proto.h>
30448 +#include <asm/apicdef.h>
30449 +#include <asm/idle.h>
30450 +#ifdef CONFIG_XEN
30451 +#include <xen/evtchn.h>
30452 +#endif
30453 +
30454 +#ifndef CONFIG_XEN
30455 +/*
30456 + * Smarter SMP flushing macros.
30457 + * c/o Linus Torvalds.
30458 + *
30459 + * These mean you can really definitely utterly forget about
30460 + * writing to user space from interrupts. (It's not allowed anyway).
30461 + *
30462 + * Optimizations Manfred Spraul <manfred@colorfullife.com>
30463 + *
30464 + * More scalable flush, from Andi Kleen
30465 + *
30466 + * To avoid global state use 8 different call vectors.
30467 + * Each CPU uses a specific vector to trigger flushes on other
30468 + * CPUs. Depending on the received vector the target CPUs look into
30469 + * the right per cpu variable for the flush data.
30470 + *
30471 + * With more than 8 CPUs they are hashed to the 8 available
30472 + * vectors. The limited global vector space forces us to this right now.
30473 + * In future when interrupts are split into per CPU domains this could be
30474 + * fixed, at the cost of triggering multiple IPIs in some cases.
30475 + */
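As a rough worked example of the hashing described above (illustrative only, not part of the patch; the CPU number is made up, the identifiers are the ones used further down in this file):

/* With NUM_INVALIDATE_TLB_VECTORS == 8, CPU 11 hashes onto slot 3 and
 * therefore shares flush_state slot 3 and its IPI vector with CPU 3. */
int sender = 11 % NUM_INVALIDATE_TLB_VECTORS;		/* == 3 */
int vector = INVALIDATE_TLB_VECTOR_START + sender;	/* vector this sender raises */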
30476 +
30477 +union smp_flush_state {
30478 + struct {
30479 + cpumask_t flush_cpumask;
30480 + struct mm_struct *flush_mm;
30481 + unsigned long flush_va;
30482 +#define FLUSH_ALL -1ULL
30483 + spinlock_t tlbstate_lock;
30484 + };
30485 + char pad[SMP_CACHE_BYTES];
30486 +} ____cacheline_aligned;
30487 +
30488 +/* State is put into the per CPU data section, but padded
30489 + to a full cache line because other CPUs can access it and we don't
30490 + want false sharing in the per cpu data segment. */
30491 +static DEFINE_PER_CPU(union smp_flush_state, flush_state);
30492 +
30493 +/*
30494 + * We cannot call mmdrop() because we are in interrupt context,
30495 + * instead update mm->cpu_vm_mask.
30496 + */
30497 +static inline void leave_mm(unsigned long cpu)
30498 +{
30499 + if (read_pda(mmu_state) == TLBSTATE_OK)
30500 + BUG();
30501 + cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
30502 + load_cr3(swapper_pg_dir);
30503 +}
30504 +
30505 +/*
30506 + *
30507 + * The flush IPI assumes that a thread switch happens in this order:
30508 + * [cpu0: the cpu that switches]
30509 + * 1) switch_mm() either 1a) or 1b)
30510 + * 1a) thread switch to a different mm
30511 + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
30512 + * Stop ipi delivery for the old mm. This is not synchronized with
30513 + * 	the other cpus, but smp_invalidate_interrupt ignores flush ipis
30514 + * for the wrong mm, and in the worst case we perform a superfluous
30515 + * tlb flush.
30516 + * 1a2) set cpu mmu_state to TLBSTATE_OK
30517 + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
30518 + * was in lazy tlb mode.
30519 + * 1a3) update cpu active_mm
30520 + * Now cpu0 accepts tlb flushes for the new mm.
30521 + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
30522 + * Now the other cpus will send tlb flush ipis.
30523 + * 1a4) change cr3.
30524 + * 1b) thread switch without mm change
30525 + * cpu active_mm is correct, cpu0 already handles
30526 + * flush ipis.
30527 + * 1b1) set cpu mmu_state to TLBSTATE_OK
30528 + * 1b2) test_and_set the cpu bit in cpu_vm_mask.
30529 + * Atomically set the bit [other cpus will start sending flush ipis],
30530 + * and test the bit.
30531 + * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
30532 + * 2) switch %%esp, ie current
30533 + *
30534 + * The interrupt must handle 2 special cases:
30535 + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
30536 + * - the cpu performs speculative tlb reads, i.e. even if the cpu only
30537 + * runs in kernel space, the cpu could load tlb entries for user space
30538 + * pages.
30539 + *
30540 + * The good news is that cpu mmu_state is local to each cpu, no
30541 + * write/read ordering problems.
30542 + */
30543 +
30544 +/*
30545 + * TLB flush IPI:
30546 + *
30547 + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
30548 + * 2) Leave the mm if we are in the lazy tlb mode.
30549 + *
30550 + * Interrupts are disabled.
30551 + */
30552 +
30553 +asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
30554 +{
30555 + int cpu;
30556 + int sender;
30557 + union smp_flush_state *f;
30558 +
30559 + cpu = smp_processor_id();
30560 + /*
30561 + * orig_rax contains the negated interrupt vector.
30562 + * Use that to determine where the sender put the data.
30563 + */
30564 + sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
30565 + f = &per_cpu(flush_state, sender);
30566 +
30567 + if (!cpu_isset(cpu, f->flush_cpumask))
30568 + goto out;
30569 + /*
30570 + * This was a BUG() but until someone can quote me the
30571 + * line from the intel manual that guarantees an IPI to
30572 + * multiple CPUs is retried _only_ on the erroring CPUs
30573 + * 	its staying as a return
30574 + *
30575 + * BUG();
30576 + */
30577 +
30578 + if (f->flush_mm == read_pda(active_mm)) {
30579 + if (read_pda(mmu_state) == TLBSTATE_OK) {
30580 + if (f->flush_va == FLUSH_ALL)
30581 + local_flush_tlb();
30582 + else
30583 + __flush_tlb_one(f->flush_va);
30584 + } else
30585 + leave_mm(cpu);
30586 + }
30587 +out:
30588 + ack_APIC_irq();
30589 + cpu_clear(cpu, f->flush_cpumask);
30590 +}
30591 +
30592 +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
30593 + unsigned long va)
30594 +{
30595 + int sender;
30596 + union smp_flush_state *f;
30597 +
30598 + /* Caller has disabled preemption */
30599 + sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
30600 + f = &per_cpu(flush_state, sender);
30601 +
30602 + /* Could avoid this lock when
30603 + num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
30604 + probably not worth checking this for a cache-hot lock. */
30605 + spin_lock(&f->tlbstate_lock);
30606 +
30607 + f->flush_mm = mm;
30608 + f->flush_va = va;
30609 + cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
30610 +
30611 + /*
30612 + * We have to send the IPI only to
30613 + * CPUs affected.
30614 + */
30615 + send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
30616 +
30617 + while (!cpus_empty(f->flush_cpumask))
30618 + cpu_relax();
30619 +
30620 + f->flush_mm = NULL;
30621 + f->flush_va = 0;
30622 + spin_unlock(&f->tlbstate_lock);
30623 +}
30624 +
30625 +int __cpuinit init_smp_flush(void)
30626 +{
30627 + int i;
30628 + for_each_cpu_mask(i, cpu_possible_map) {
30629 + spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
30630 + }
30631 + return 0;
30632 +}
30633 +
30634 +core_initcall(init_smp_flush);
30635 +
30636 +void flush_tlb_current_task(void)
30637 +{
30638 + struct mm_struct *mm = current->mm;
30639 + cpumask_t cpu_mask;
30640 +
30641 + preempt_disable();
30642 + cpu_mask = mm->cpu_vm_mask;
30643 + cpu_clear(smp_processor_id(), cpu_mask);
30644 +
30645 + local_flush_tlb();
30646 + if (!cpus_empty(cpu_mask))
30647 + flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
30648 + preempt_enable();
30649 +}
30650 +EXPORT_SYMBOL(flush_tlb_current_task);
30651 +
30652 +void flush_tlb_mm (struct mm_struct * mm)
30653 +{
30654 + cpumask_t cpu_mask;
30655 +
30656 + preempt_disable();
30657 + cpu_mask = mm->cpu_vm_mask;
30658 + cpu_clear(smp_processor_id(), cpu_mask);
30659 +
30660 + if (current->active_mm == mm) {
30661 + if (current->mm)
30662 + local_flush_tlb();
30663 + else
30664 + leave_mm(smp_processor_id());
30665 + }
30666 + if (!cpus_empty(cpu_mask))
30667 + flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
30668 +
30669 + preempt_enable();
30670 +}
30671 +EXPORT_SYMBOL(flush_tlb_mm);
30672 +
30673 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
30674 +{
30675 + struct mm_struct *mm = vma->vm_mm;
30676 + cpumask_t cpu_mask;
30677 +
30678 + preempt_disable();
30679 + cpu_mask = mm->cpu_vm_mask;
30680 + cpu_clear(smp_processor_id(), cpu_mask);
30681 +
30682 + if (current->active_mm == mm) {
30683 +		if (current->mm)
30684 + __flush_tlb_one(va);
30685 + else
30686 + leave_mm(smp_processor_id());
30687 + }
30688 +
30689 + if (!cpus_empty(cpu_mask))
30690 + flush_tlb_others(cpu_mask, mm, va);
30691 +
30692 + preempt_enable();
30693 +}
30694 +EXPORT_SYMBOL(flush_tlb_page);
30695 +
30696 +static void do_flush_tlb_all(void* info)
30697 +{
30698 + unsigned long cpu = smp_processor_id();
30699 +
30700 + __flush_tlb_all();
30701 + if (read_pda(mmu_state) == TLBSTATE_LAZY)
30702 + leave_mm(cpu);
30703 +}
30704 +
30705 +void flush_tlb_all(void)
30706 +{
30707 + on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
30708 +}
30709 +#endif /* Xen */
30710 +
30711 +/*
30712 + * this function sends a 'reschedule' IPI to another CPU.
30713 + * it goes straight through and wastes no time serializing
30714 + * anything. Worst case is that we lose a reschedule ...
30715 + */
30716 +
30717 +void smp_send_reschedule(int cpu)
30718 +{
30719 + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
30720 +}
30721 +
30722 +/*
30723 + * Structure and data for smp_call_function(). This is designed to minimise
30724 + * static memory requirements. It also looks cleaner.
30725 + */
30726 +static DEFINE_SPINLOCK(call_lock);
30727 +
30728 +struct call_data_struct {
30729 + void (*func) (void *info);
30730 + void *info;
30731 + atomic_t started;
30732 + atomic_t finished;
30733 + int wait;
30734 +};
30735 +
30736 +static struct call_data_struct * call_data;
30737 +
30738 +void lock_ipi_call_lock(void)
30739 +{
30740 + spin_lock_irq(&call_lock);
30741 +}
30742 +
30743 +void unlock_ipi_call_lock(void)
30744 +{
30745 + spin_unlock_irq(&call_lock);
30746 +}
30747 +
30748 +/*
30749 + * this function sends a 'generic call function' IPI to one other CPU
30750 + * in the system.
30751 + *
30752 + * cpu is a standard Linux logical CPU number.
30753 + */
30754 +static void
30755 +__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
30756 + int nonatomic, int wait)
30757 +{
30758 + struct call_data_struct data;
30759 + int cpus = 1;
30760 +
30761 + data.func = func;
30762 + data.info = info;
30763 + atomic_set(&data.started, 0);
30764 + data.wait = wait;
30765 + if (wait)
30766 + atomic_set(&data.finished, 0);
30767 +
30768 + call_data = &data;
30769 + wmb();
30770 +	/* Send a message to the target CPU and wait for it to respond */
30771 + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
30772 +
30773 + /* Wait for response */
30774 + while (atomic_read(&data.started) != cpus)
30775 + cpu_relax();
30776 +
30777 + if (!wait)
30778 + return;
30779 +
30780 + while (atomic_read(&data.finished) != cpus)
30781 + cpu_relax();
30782 +}
30783 +
30784 +/*
30785 + * smp_call_function_single - Run a function on another CPU
30786 + * @func: The function to run. This must be fast and non-blocking.
30787 + * @info: An arbitrary pointer to pass to the function.
30788 + * @nonatomic: Currently unused.
30789 + * @wait: If true, wait until function has completed on other CPUs.
30790 + *
30791 + * Returns 0 on success, else a negative status code.
30792 + *
30793 + * Does not return until the remote CPU is nearly ready to execute <func>
30794 + * or has already executed it.
30795 + */
30796 +
30797 +int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
30798 + int nonatomic, int wait)
30799 +{
30800 + /* prevent preemption and reschedule on another processor */
30801 + int me = get_cpu();
30802 + if (cpu == me) {
30803 + WARN_ON(1);
30804 + put_cpu();
30805 + return -EBUSY;
30806 + }
30807 + spin_lock_bh(&call_lock);
30808 + __smp_call_function_single(cpu, func, info, nonatomic, wait);
30809 + spin_unlock_bh(&call_lock);
30810 + put_cpu();
30811 + return 0;
30812 +}
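A minimal usage sketch of smp_call_function_single() as defined above (illustrative only; the callback and CPU number are hypothetical). The callback runs on the target CPU from the IPI path, so it must not sleep:

static void example_drain(void *unused)
{
	/* runs on the chosen CPU, invoked from the call-function IPI */
}

static void example_run_on_cpu2(void)
{
	int err = smp_call_function_single(2, example_drain, NULL, 0, 1);
	if (err)
		printk(KERN_WARNING "CPU 2 did not run example_drain: %d\n", err);
}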
30813 +
30814 +/*
30815 + * this function sends a 'generic call function' IPI to all other CPUs
30816 + * in the system.
30817 + */
30818 +static void __smp_call_function (void (*func) (void *info), void *info,
30819 + int nonatomic, int wait)
30820 +{
30821 + struct call_data_struct data;
30822 + int cpus = num_online_cpus()-1;
30823 +
30824 + if (!cpus)
30825 + return;
30826 +
30827 + data.func = func;
30828 + data.info = info;
30829 + atomic_set(&data.started, 0);
30830 + data.wait = wait;
30831 + if (wait)
30832 + atomic_set(&data.finished, 0);
30833 +
30834 + call_data = &data;
30835 + wmb();
30836 + /* Send a message to all other CPUs and wait for them to respond */
30837 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
30838 +
30839 + /* Wait for response */
30840 + while (atomic_read(&data.started) != cpus)
30841 + cpu_relax();
30842 +
30843 + if (!wait)
30844 + return;
30845 +
30846 + while (atomic_read(&data.finished) != cpus)
30847 + cpu_relax();
30848 +}
30849 +
30850 +/*
30851 + * smp_call_function - run a function on all other CPUs.
30852 + * @func: The function to run. This must be fast and non-blocking.
30853 + * @info: An arbitrary pointer to pass to the function.
30854 + * @nonatomic: currently unused.
30855 + * @wait: If true, wait (atomically) until function has completed on other
30856 + * CPUs.
30857 + *
30858 + * Returns 0 on success, else a negative status code. Does not return until
30859 + * remote CPUs are nearly ready to execute func or have already executed it.
30860 + *
30861 + * You must not call this function with disabled interrupts or from a
30862 + * hardware interrupt handler or from a bottom half handler.
30863 + * Actually there are a few legal cases, like panic.
30864 + */
30865 +int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
30866 + int wait)
30867 +{
30868 + spin_lock(&call_lock);
30869 +	__smp_call_function(func, info, nonatomic, wait);
30870 + spin_unlock(&call_lock);
30871 + return 0;
30872 +}
30873 +EXPORT_SYMBOL(smp_call_function);
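And a corresponding sketch for the all-CPU variant (illustrative only; the counter and callback are hypothetical). As the comment above requires, it is called from process context with interrupts enabled:

static atomic_t example_hits = ATOMIC_INIT(0);

static void example_hit(void *unused)
{
	atomic_inc(&example_hits);		/* runs once on every other online CPU */
}

static void example_hit_all_cpus(void)
{
	smp_call_function(example_hit, NULL, 0, 1);	/* wait for all remote CPUs to finish */
	example_hit(NULL);				/* cover the local CPU as well */
}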
30874 +
30875 +void smp_stop_cpu(void)
30876 +{
30877 + unsigned long flags;
30878 + /*
30879 + * Remove this CPU:
30880 + */
30881 + cpu_clear(smp_processor_id(), cpu_online_map);
30882 + local_irq_save(flags);
30883 + disable_all_local_evtchn();
30884 + local_irq_restore(flags);
30885 +}
30886 +
30887 +static void smp_really_stop_cpu(void *dummy)
30888 +{
30889 + smp_stop_cpu();
30890 + for (;;)
30891 + halt();
30892 +}
30893 +
30894 +void smp_send_stop(void)
30895 +{
30896 + int nolock = 0;
30897 +#ifndef CONFIG_XEN
30898 + if (reboot_force)
30899 + return;
30900 +#endif
30901 + /* Don't deadlock on the call lock in panic */
30902 + if (!spin_trylock(&call_lock)) {
30903 + /* ignore locking because we have panicked anyways */
30904 + nolock = 1;
30905 + }
30906 + __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
30907 + if (!nolock)
30908 + spin_unlock(&call_lock);
30909 +
30910 + local_irq_disable();
30911 + disable_all_local_evtchn();
30912 + local_irq_enable();
30913 +}
30914 +
30915 +/*
30916 + * Reschedule call back. Nothing to do,
30917 + * all the work is done automatically when
30918 + * we return from the interrupt.
30919 + */
30920 +#ifndef CONFIG_XEN
30921 +asmlinkage void smp_reschedule_interrupt(void)
30922 +#else
30923 +asmlinkage irqreturn_t smp_reschedule_interrupt(void)
30924 +#endif
30925 +{
30926 +#ifndef CONFIG_XEN
30927 + ack_APIC_irq();
30928 +#else
30929 + return IRQ_HANDLED;
30930 +#endif
30931 +}
30932 +
30933 +#ifndef CONFIG_XEN
30934 +asmlinkage void smp_call_function_interrupt(void)
30935 +#else
30936 +asmlinkage irqreturn_t smp_call_function_interrupt(void)
30937 +#endif
30938 +{
30939 + void (*func) (void *info) = call_data->func;
30940 + void *info = call_data->info;
30941 + int wait = call_data->wait;
30942 +
30943 +#ifndef CONFIG_XEN
30944 + ack_APIC_irq();
30945 +#endif
30946 + /*
30947 + * Notify initiating CPU that I've grabbed the data and am
30948 + * about to execute the function
30949 + */
30950 + mb();
30951 + atomic_inc(&call_data->started);
30952 + /*
30953 + * At this point the info structure may be out of scope unless wait==1
30954 + */
30955 + exit_idle();
30956 + irq_enter();
30957 + (*func)(info);
30958 + irq_exit();
30959 + if (wait) {
30960 + mb();
30961 + atomic_inc(&call_data->finished);
30962 + }
30963 +#ifdef CONFIG_XEN
30964 + return IRQ_HANDLED;
30965 +#endif
30966 +}
30967 +
30968 +int safe_smp_processor_id(void)
30969 +{
30970 +#ifdef CONFIG_XEN
30971 + return smp_processor_id();
30972 +#else
30973 + unsigned apicid, i;
30974 +
30975 + if (disable_apic)
30976 + return 0;
30977 +
30978 + apicid = hard_smp_processor_id();
30979 + if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid)
30980 + return apicid;
30981 +
30982 + for (i = 0; i < NR_CPUS; ++i) {
30983 + if (x86_cpu_to_apicid[i] == apicid)
30984 + return i;
30985 + }
30986 +
30987 + /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI,
30988 + * or called too early. Either way, we must be CPU 0. */
30989 + if (x86_cpu_to_apicid[0] == BAD_APICID)
30990 + return 0;
30991 +
30992 + return 0; /* Should not happen */
30993 +#endif
30994 +}
30995 Index: head-2008-11-25/arch/x86/kernel/traps_64-xen.c
30996 ===================================================================
30997 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
30998 +++ head-2008-11-25/arch/x86/kernel/traps_64-xen.c 2008-04-02 12:34:02.000000000 +0200
30999 @@ -0,0 +1,1173 @@
31000 +/*
31001 + * linux/arch/x86-64/traps.c
31002 + *
31003 + * Copyright (C) 1991, 1992 Linus Torvalds
31004 + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
31005 + *
31006 + * Pentium III FXSR, SSE support
31007 + * Gareth Hughes <gareth@valinux.com>, May 2000
31008 + */
31009 +
31010 +/*
31011 + * 'Traps.c' handles hardware traps and faults after we have saved some
31012 + * state in 'entry.S'.
31013 + */
31014 +#include <linux/sched.h>
31015 +#include <linux/kernel.h>
31016 +#include <linux/string.h>
31017 +#include <linux/errno.h>
31018 +#include <linux/ptrace.h>
31019 +#include <linux/timer.h>
31020 +#include <linux/mm.h>
31021 +#include <linux/init.h>
31022 +#include <linux/delay.h>
31023 +#include <linux/spinlock.h>
31024 +#include <linux/interrupt.h>
31025 +#include <linux/module.h>
31026 +#include <linux/moduleparam.h>
31027 +#include <linux/nmi.h>
31028 +#include <linux/kprobes.h>
31029 +#include <linux/kexec.h>
31030 +#include <linux/unwind.h>
31031 +
31032 +#include <asm/system.h>
31033 +#include <asm/uaccess.h>
31034 +#include <asm/io.h>
31035 +#include <asm/atomic.h>
31036 +#include <asm/debugreg.h>
31037 +#include <asm/desc.h>
31038 +#include <asm/i387.h>
31039 +#include <asm/kdebug.h>
31040 +#include <asm/processor.h>
31041 +#include <asm/unwind.h>
31042 +#include <asm/smp.h>
31043 +#include <asm/pgalloc.h>
31044 +#include <asm/pda.h>
31045 +#include <asm/proto.h>
31046 +#include <asm/nmi.h>
31047 +
31048 +asmlinkage void divide_error(void);
31049 +asmlinkage void debug(void);
31050 +asmlinkage void nmi(void);
31051 +asmlinkage void int3(void);
31052 +asmlinkage void overflow(void);
31053 +asmlinkage void bounds(void);
31054 +asmlinkage void invalid_op(void);
31055 +asmlinkage void device_not_available(void);
31056 +asmlinkage void double_fault(void);
31057 +asmlinkage void coprocessor_segment_overrun(void);
31058 +asmlinkage void invalid_TSS(void);
31059 +asmlinkage void segment_not_present(void);
31060 +asmlinkage void stack_segment(void);
31061 +asmlinkage void general_protection(void);
31062 +asmlinkage void page_fault(void);
31063 +asmlinkage void coprocessor_error(void);
31064 +asmlinkage void simd_coprocessor_error(void);
31065 +asmlinkage void reserved(void);
31066 +asmlinkage void alignment_check(void);
31067 +asmlinkage void machine_check(void);
31068 +asmlinkage void spurious_interrupt_bug(void);
31069 +
31070 +ATOMIC_NOTIFIER_HEAD(die_chain);
31071 +EXPORT_SYMBOL(die_chain);
31072 +
31073 +int register_die_notifier(struct notifier_block *nb)
31074 +{
31075 + vmalloc_sync_all();
31076 + return atomic_notifier_chain_register(&die_chain, nb);
31077 +}
31078 +EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
31079 +
31080 +int unregister_die_notifier(struct notifier_block *nb)
31081 +{
31082 + return atomic_notifier_chain_unregister(&die_chain, nb);
31083 +}
31084 +EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
31085 +
31086 +static inline void conditional_sti(struct pt_regs *regs)
31087 +{
31088 + if (regs->eflags & X86_EFLAGS_IF)
31089 + local_irq_enable();
31090 +}
31091 +
31092 +static inline void preempt_conditional_sti(struct pt_regs *regs)
31093 +{
31094 + preempt_disable();
31095 + if (regs->eflags & X86_EFLAGS_IF)
31096 + local_irq_enable();
31097 +}
31098 +
31099 +static inline void preempt_conditional_cli(struct pt_regs *regs)
31100 +{
31101 + if (regs->eflags & X86_EFLAGS_IF)
31102 + local_irq_disable();
31103 + /* Make sure to not schedule here because we could be running
31104 + on an exception stack. */
31105 + preempt_enable_no_resched();
31106 +}
31107 +
31108 +static int kstack_depth_to_print = 12;
31109 +#ifdef CONFIG_STACK_UNWIND
31110 +static int call_trace = 1;
31111 +#else
31112 +#define call_trace (-1)
31113 +#endif
31114 +
31115 +#ifdef CONFIG_KALLSYMS
31116 +# include <linux/kallsyms.h>
31117 +void printk_address(unsigned long address)
31118 +{
31119 + unsigned long offset = 0, symsize;
31120 + const char *symname;
31121 + char *modname;
31122 + char *delim = ":";
31123 + char namebuf[128];
31124 +
31125 + symname = kallsyms_lookup(address, &symsize, &offset,
31126 + &modname, namebuf);
31127 + if (!symname) {
31128 + printk(" [<%016lx>]\n", address);
31129 + return;
31130 + }
31131 + if (!modname)
31132 + modname = delim = "";
31133 + printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
31134 + address, delim, modname, delim, symname, offset, symsize);
31135 +}
31136 +#else
31137 +void printk_address(unsigned long address)
31138 +{
31139 + printk(" [<%016lx>]\n", address);
31140 +}
31141 +#endif
31142 +
31143 +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
31144 + unsigned *usedp, const char **idp)
31145 +{
31146 +#ifndef CONFIG_X86_NO_TSS
31147 + static char ids[][8] = {
31148 + [DEBUG_STACK - 1] = "#DB",
31149 + [NMI_STACK - 1] = "NMI",
31150 + [DOUBLEFAULT_STACK - 1] = "#DF",
31151 + [STACKFAULT_STACK - 1] = "#SS",
31152 + [MCE_STACK - 1] = "#MC",
31153 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
31154 + [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
31155 +#endif
31156 + };
31157 + unsigned k;
31158 +
31159 + /*
31160 + * Iterate over all exception stacks, and figure out whether
31161 + * 'stack' is in one of them:
31162 + */
31163 + for (k = 0; k < N_EXCEPTION_STACKS; k++) {
31164 + unsigned long end;
31165 +
31166 + /*
31167 + * set 'end' to the end of the exception stack.
31168 + */
31169 + switch (k + 1) {
31170 + /*
31171 +		 * TODO: this block is not needed, I think, because
31172 + * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK]
31173 + * properly too.
31174 + */
31175 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
31176 + case DEBUG_STACK:
31177 + end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
31178 + break;
31179 +#endif
31180 + default:
31181 + end = per_cpu(orig_ist, cpu).ist[k];
31182 + break;
31183 + }
31184 + /*
31185 + * Is 'stack' above this exception frame's end?
31186 + * If yes then skip to the next frame.
31187 + */
31188 + if (stack >= end)
31189 + continue;
31190 + /*
31191 + * Is 'stack' above this exception frame's start address?
31192 + * If yes then we found the right frame.
31193 + */
31194 + if (stack >= end - EXCEPTION_STKSZ) {
31195 + /*
31196 + * Make sure we only iterate through an exception
31197 + * stack once. If it comes up for the second time
31198 + * then there's something wrong going on - just
31199 + * break out and return NULL:
31200 + */
31201 + if (*usedp & (1U << k))
31202 + break;
31203 + *usedp |= 1U << k;
31204 + *idp = ids[k];
31205 + return (unsigned long *)end;
31206 + }
31207 + /*
31208 + * If this is a debug stack, and if it has a larger size than
31209 + * the usual exception stacks, then 'stack' might still
31210 + * be within the lower portion of the debug stack:
31211 + */
31212 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
31213 + if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
31214 + unsigned j = N_EXCEPTION_STACKS - 1;
31215 +
31216 + /*
31217 + * Black magic. A large debug stack is composed of
31218 + * multiple exception stack entries, which we
31219 +			 * iterate through now. Don't look:
31220 + */
31221 + do {
31222 + ++j;
31223 + end -= EXCEPTION_STKSZ;
31224 + ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
31225 + } while (stack < end - EXCEPTION_STKSZ);
31226 + if (*usedp & (1U << j))
31227 + break;
31228 + *usedp |= 1U << j;
31229 + *idp = ids[j];
31230 + return (unsigned long *)end;
31231 + }
31232 +#endif
31233 + }
31234 +#endif
31235 + return NULL;
31236 +}
31237 +
31238 +static int show_trace_unwind(struct unwind_frame_info *info, void *context)
31239 +{
31240 + int n = 0;
31241 +
31242 + while (unwind(info) == 0 && UNW_PC(info)) {
31243 + n++;
31244 + printk_address(UNW_PC(info));
31245 + if (arch_unw_user_mode(info))
31246 + break;
31247 + }
31248 + return n;
31249 +}
31250 +
31251 +/*
31252 + * x86-64 can have up to three kernel stacks:
31253 + * process stack
31254 + * interrupt stack
31255 + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
31256 + */
31257 +
31258 +void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack)
31259 +{
31260 + const unsigned cpu = safe_smp_processor_id();
31261 + unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
31262 + unsigned used = 0;
31263 +
31264 + printk("\nCall Trace:\n");
31265 +
31266 + if (!tsk)
31267 + tsk = current;
31268 +
31269 + if (call_trace >= 0) {
31270 + int unw_ret = 0;
31271 + struct unwind_frame_info info;
31272 +
31273 + if (regs) {
31274 + if (unwind_init_frame_info(&info, tsk, regs) == 0)
31275 + unw_ret = show_trace_unwind(&info, NULL);
31276 + } else if (tsk == current)
31277 + unw_ret = unwind_init_running(&info, show_trace_unwind, NULL);
31278 + else {
31279 + if (unwind_init_blocked(&info, tsk) == 0)
31280 + unw_ret = show_trace_unwind(&info, NULL);
31281 + }
31282 + if (unw_ret > 0) {
31283 + if (call_trace == 1 && !arch_unw_user_mode(&info)) {
31284 + print_symbol("DWARF2 unwinder stuck at %s\n",
31285 + UNW_PC(&info));
31286 + if ((long)UNW_SP(&info) < 0) {
31287 + printk("Leftover inexact backtrace:\n");
31288 + stack = (unsigned long *)UNW_SP(&info);
31289 + } else
31290 + printk("Full inexact backtrace again:\n");
31291 + } else if (call_trace >= 1)
31292 + return;
31293 + else
31294 + printk("Full inexact backtrace again:\n");
31295 + } else
31296 + printk("Inexact backtrace:\n");
31297 + }
31298 +
31299 + /*
31300 + * Print function call entries within a stack. 'cond' is the
31301 + * "end of stackframe" condition, that the 'stack++'
31302 + * iteration will eventually trigger.
31303 + */
31304 +#define HANDLE_STACK(cond) \
31305 + do while (cond) { \
31306 + unsigned long addr = *stack++; \
31307 + if (kernel_text_address(addr)) { \
31308 + /* \
31309 + * If the address is either in the text segment of the \
31310 + * kernel, or in the region which contains vmalloc'ed \
31311 + * memory, it *may* be the address of a calling \
31312 + * routine; if so, print it so that someone tracing \
31313 + * down the cause of the crash will be able to figure \
31314 + * out the call path that was taken. \
31315 + */ \
31316 + printk_address(addr); \
31317 + } \
31318 + } while (0)
31319 +
31320 + /*
31321 + * Print function call entries in all stacks, starting at the
31322 +	 * current stack address. If the stacks consist of nested
31323 +	 * exceptions, each nested stack is walked and printed in turn.
31324 + */
31325 + for ( ; ; ) {
31326 + const char *id;
31327 + unsigned long *estack_end;
31328 + estack_end = in_exception_stack(cpu, (unsigned long)stack,
31329 + &used, &id);
31330 +
31331 + if (estack_end) {
31332 + printk(" <%s>", id);
31333 + HANDLE_STACK (stack < estack_end);
31334 + printk(" <EOE>");
31335 + /*
31336 + * We link to the next stack via the
31337 + * second-to-last pointer (index -2 to end) in the
31338 + * exception stack:
31339 + */
31340 + stack = (unsigned long *) estack_end[-2];
31341 + continue;
31342 + }
31343 + if (irqstack_end) {
31344 + unsigned long *irqstack;
31345 + irqstack = irqstack_end -
31346 + (IRQSTACKSIZE - 64) / sizeof(*irqstack);
31347 +
31348 + if (stack >= irqstack && stack < irqstack_end) {
31349 + printk(" <IRQ>");
31350 + HANDLE_STACK (stack < irqstack_end);
31351 + /*
31352 + * We link to the next stack (which would be
31353 + * the process stack normally) the last
31354 + * pointer (index -1 to end) in the IRQ stack:
31355 + */
31356 + stack = (unsigned long *) (irqstack_end[-1]);
31357 + irqstack_end = NULL;
31358 + printk(" <EOI>");
31359 + continue;
31360 + }
31361 + }
31362 + break;
31363 + }
31364 +
31365 + /*
31366 + * This prints the process stack:
31367 + */
31368 + HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
31369 +#undef HANDLE_STACK
31370 +
31371 + printk("\n");
31372 +}
31373 +
31374 +static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp)
31375 +{
31376 + unsigned long *stack;
31377 + int i;
31378 + const int cpu = safe_smp_processor_id();
31379 + unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
31380 + unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
31381 +
31382 + // debugging aid: "show_stack(NULL, NULL);" prints the
31383 + // back trace for this cpu.
31384 +
31385 + if (rsp == NULL) {
31386 + if (tsk)
31387 + rsp = (unsigned long *)tsk->thread.rsp;
31388 + else
31389 + rsp = (unsigned long *)&rsp;
31390 + }
31391 +
31392 + stack = rsp;
31393 +	for (i = 0; i < kstack_depth_to_print; i++) {
31394 + if (stack >= irqstack && stack <= irqstack_end) {
31395 + if (stack == irqstack_end) {
31396 + stack = (unsigned long *) (irqstack_end[-1]);
31397 + printk(" <EOI> ");
31398 + }
31399 + } else {
31400 + if (((long) stack & (THREAD_SIZE-1)) == 0)
31401 + break;
31402 + }
31403 + if (i && ((i % 4) == 0))
31404 + printk("\n");
31405 + printk(" %016lx", *stack++);
31406 + touch_nmi_watchdog();
31407 + }
31408 + show_trace(tsk, regs, rsp);
31409 +}
31410 +
31411 +void show_stack(struct task_struct *tsk, unsigned long * rsp)
31412 +{
31413 + _show_stack(tsk, NULL, rsp);
31414 +}
31415 +
31416 +/*
31417 + * The architecture-independent dump_stack generator
31418 + */
31419 +void dump_stack(void)
31420 +{
31421 + unsigned long dummy;
31422 + show_trace(NULL, NULL, &dummy);
31423 +}
31424 +
31425 +EXPORT_SYMBOL(dump_stack);
31426 +
31427 +void show_registers(struct pt_regs *regs)
31428 +{
31429 + int i;
31430 + int in_kernel = !user_mode(regs);
31431 + unsigned long rsp;
31432 + const int cpu = safe_smp_processor_id();
31433 + struct task_struct *cur = cpu_pda(cpu)->pcurrent;
31434 +
31435 + rsp = regs->rsp;
31436 +
31437 + printk("CPU %d ", cpu);
31438 + __show_regs(regs);
31439 + printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
31440 + cur->comm, cur->pid, task_thread_info(cur), cur);
31441 +
31442 + /*
31443 + * When in-kernel, we also print out the stack and code at the
31444 + * time of the fault..
31445 + */
31446 + if (in_kernel) {
31447 +
31448 + printk("Stack: ");
31449 + _show_stack(NULL, regs, (unsigned long*)rsp);
31450 +
31451 + printk("\nCode: ");
31452 + if (regs->rip < PAGE_OFFSET)
31453 + goto bad;
31454 +
31455 + for (i=0; i<20; i++) {
31456 + unsigned char c;
31457 + if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
31458 +bad:
31459 + printk(" Bad RIP value.");
31460 + break;
31461 + }
31462 + printk("%02x ", c);
31463 + }
31464 + }
31465 + printk("\n");
31466 +}
31467 +
31468 +void handle_BUG(struct pt_regs *regs)
31469 +{
31470 + struct bug_frame f;
31471 + long len;
31472 + const char *prefix = "";
31473 +
31474 + if (user_mode(regs))
31475 + return;
31476 + if (__copy_from_user(&f, (const void __user *) regs->rip,
31477 + sizeof(struct bug_frame)))
31478 + return;
31479 + if (f.filename >= 0 ||
31480 + f.ud2[0] != 0x0f || f.ud2[1] != 0x0b)
31481 + return;
31482 + len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1;
31483 + if (len < 0 || len >= PATH_MAX)
31484 + f.filename = (int)(long)"unmapped filename";
31485 + else if (len > 50) {
31486 + f.filename += len - 50;
31487 + prefix = "...";
31488 + }
31489 + printk("----------- [cut here ] --------- [please bite here ] ---------\n");
31490 + printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line);
31491 +}
31492 +
31493 +#ifdef CONFIG_BUG
31494 +void out_of_line_bug(void)
31495 +{
31496 + BUG();
31497 +}
31498 +EXPORT_SYMBOL(out_of_line_bug);
31499 +#endif
31500 +
31501 +static DEFINE_SPINLOCK(die_lock);
31502 +static int die_owner = -1;
31503 +static unsigned int die_nest_count;
31504 +
31505 +unsigned __kprobes long oops_begin(void)
31506 +{
31507 + int cpu = safe_smp_processor_id();
31508 + unsigned long flags;
31509 +
31510 + /* racy, but better than risking deadlock. */
31511 + local_irq_save(flags);
31512 + if (!spin_trylock(&die_lock)) {
31513 + if (cpu == die_owner)
31514 + /* nested oops. should stop eventually */;
31515 + else
31516 + spin_lock(&die_lock);
31517 + }
31518 + die_nest_count++;
31519 + die_owner = cpu;
31520 + console_verbose();
31521 + bust_spinlocks(1);
31522 + return flags;
31523 +}
31524 +
31525 +void __kprobes oops_end(unsigned long flags)
31526 +{
31527 + die_owner = -1;
31528 + bust_spinlocks(0);
31529 + die_nest_count--;
31530 + if (die_nest_count)
31531 + /* We still own the lock */
31532 + local_irq_restore(flags);
31533 + else
31534 + /* Nest count reaches zero, release the lock. */
31535 + spin_unlock_irqrestore(&die_lock, flags);
31536 + if (panic_on_oops)
31537 + panic("Fatal exception");
31538 +}
31539 +
31540 +void __kprobes __die(const char * str, struct pt_regs * regs, long err)
31541 +{
31542 + static int die_counter;
31543 +	printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
31544 +#ifdef CONFIG_PREEMPT
31545 + printk("PREEMPT ");
31546 +#endif
31547 +#ifdef CONFIG_SMP
31548 + printk("SMP ");
31549 +#endif
31550 +#ifdef CONFIG_DEBUG_PAGEALLOC
31551 + printk("DEBUG_PAGEALLOC");
31552 +#endif
31553 + printk("\n");
31554 + notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
31555 + show_registers(regs);
31556 + /* Executive summary in case the oops scrolled away */
31557 + printk(KERN_ALERT "RIP ");
31558 + printk_address(regs->rip);
31559 + printk(" RSP <%016lx>\n", regs->rsp);
31560 + if (kexec_should_crash(current))
31561 + crash_kexec(regs);
31562 +}
31563 +
31564 +void die(const char * str, struct pt_regs * regs, long err)
31565 +{
31566 + unsigned long flags = oops_begin();
31567 +
31568 + handle_BUG(regs);
31569 + __die(str, regs, err);
31570 + oops_end(flags);
31571 + do_exit(SIGSEGV);
31572 +}
31573 +
31574 +#ifdef CONFIG_X86_LOCAL_APIC
31575 +void __kprobes die_nmi(char *str, struct pt_regs *regs)
31576 +{
31577 + unsigned long flags = oops_begin();
31578 +
31579 + /*
31580 +	 * We are in trouble anyway, let's at least try
31581 + * to get a message out.
31582 + */
31583 + printk(str, safe_smp_processor_id());
31584 + show_registers(regs);
31585 + if (kexec_should_crash(current))
31586 + crash_kexec(regs);
31587 + if (panic_on_timeout || panic_on_oops)
31588 + panic("nmi watchdog");
31589 + printk("console shuts up ...\n");
31590 + oops_end(flags);
31591 + nmi_exit();
31592 + local_irq_enable();
31593 + do_exit(SIGSEGV);
31594 +}
31595 +#endif
31596 +
31597 +static void __kprobes do_trap(int trapnr, int signr, char *str,
31598 + struct pt_regs * regs, long error_code,
31599 + siginfo_t *info)
31600 +{
31601 + struct task_struct *tsk = current;
31602 +
31603 + tsk->thread.error_code = error_code;
31604 + tsk->thread.trap_no = trapnr;
31605 +
31606 + if (user_mode(regs)) {
31607 + if (exception_trace && unhandled_signal(tsk, signr))
31608 + printk(KERN_INFO
31609 + "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
31610 + tsk->comm, tsk->pid, str,
31611 + regs->rip, regs->rsp, error_code);
31612 +
31613 + if (info)
31614 + force_sig_info(signr, info, tsk);
31615 + else
31616 + force_sig(signr, tsk);
31617 + return;
31618 + }
31619 +
31620 +
31621 + /* kernel trap */
31622 + {
31623 + const struct exception_table_entry *fixup;
31624 + fixup = search_exception_tables(regs->rip);
31625 + if (fixup)
31626 + regs->rip = fixup->fixup;
31627 + else
31628 + die(str, regs, error_code);
31629 + return;
31630 + }
31631 +}
31632 +
31633 +#define DO_ERROR(trapnr, signr, str, name) \
31634 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
31635 +{ \
31636 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
31637 + == NOTIFY_STOP) \
31638 + return; \
31639 + conditional_sti(regs); \
31640 + do_trap(trapnr, signr, str, regs, error_code, NULL); \
31641 +}
31642 +
31643 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
31644 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
31645 +{ \
31646 + siginfo_t info; \
31647 + info.si_signo = signr; \
31648 + info.si_errno = 0; \
31649 + info.si_code = sicode; \
31650 + info.si_addr = (void __user *)siaddr; \
31651 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
31652 + == NOTIFY_STOP) \
31653 + return; \
31654 + conditional_sti(regs); \
31655 + do_trap(trapnr, signr, str, regs, error_code, &info); \
31656 +}
31657 +
31658 +DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
31659 +DO_ERROR( 4, SIGSEGV, "overflow", overflow)
31660 +DO_ERROR( 5, SIGSEGV, "bounds", bounds)
31661 +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
31662 +DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
31663 +DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
31664 +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
31665 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
31666 +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
31667 +DO_ERROR(18, SIGSEGV, "reserved", reserved)
31668 +
31669 +/* Runs on IST stack */
31670 +asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
31671 +{
31672 + if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
31673 + 12, SIGBUS) == NOTIFY_STOP)
31674 + return;
31675 + preempt_conditional_sti(regs);
31676 + do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
31677 + preempt_conditional_cli(regs);
31678 +}
31679 +
31680 +asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
31681 +{
31682 + static const char str[] = "double fault";
31683 + struct task_struct *tsk = current;
31684 +
31685 +	/* Return not checked because a double fault cannot be ignored */
31686 + notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
31687 +
31688 + tsk->thread.error_code = error_code;
31689 + tsk->thread.trap_no = 8;
31690 +
31691 + /* This is always a kernel trap and never fixable (and thus must
31692 + never return). */
31693 + for (;;)
31694 + die(str, regs, error_code);
31695 +}
31696 +
31697 +asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
31698 + long error_code)
31699 +{
31700 + struct task_struct *tsk = current;
31701 +
31702 + conditional_sti(regs);
31703 +
31704 + tsk->thread.error_code = error_code;
31705 + tsk->thread.trap_no = 13;
31706 +
31707 + if (user_mode(regs)) {
31708 + if (exception_trace && unhandled_signal(tsk, SIGSEGV))
31709 + printk(KERN_INFO
31710 + "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
31711 + tsk->comm, tsk->pid,
31712 + regs->rip, regs->rsp, error_code);
31713 +
31714 + force_sig(SIGSEGV, tsk);
31715 + return;
31716 + }
31717 +
31718 + /* kernel gp */
31719 + {
31720 + const struct exception_table_entry *fixup;
31721 + fixup = search_exception_tables(regs->rip);
31722 + if (fixup) {
31723 + regs->rip = fixup->fixup;
31724 + return;
31725 + }
31726 + if (notify_die(DIE_GPF, "general protection fault", regs,
31727 + error_code, 13, SIGSEGV) == NOTIFY_STOP)
31728 + return;
31729 + die("general protection fault", regs, error_code);
31730 + }
31731 +}
31732 +
31733 +static __kprobes void
31734 +mem_parity_error(unsigned char reason, struct pt_regs * regs)
31735 +{
31736 + printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
31737 + printk("You probably have a hardware problem with your RAM chips\n");
31738 +
31739 +#if 0 /* XEN */
31740 + /* Clear and disable the memory parity error line. */
31741 + reason = (reason & 0xf) | 4;
31742 + outb(reason, 0x61);
31743 +#endif /* XEN */
31744 +}
31745 +
31746 +static __kprobes void
31747 +io_check_error(unsigned char reason, struct pt_regs * regs)
31748 +{
31749 + printk("NMI: IOCK error (debug interrupt?)\n");
31750 + show_registers(regs);
31751 +
31752 +#if 0 /* XEN */
31753 + /* Re-enable the IOCK line, wait for a few seconds */
31754 + reason = (reason & 0xf) | 8;
31755 + outb(reason, 0x61);
31756 + mdelay(2000);
31757 + reason &= ~8;
31758 + outb(reason, 0x61);
31759 +#endif /* XEN */
31760 +}
31761 +
31762 +static __kprobes void
31763 +unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
31764 +{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
31765 + printk("Dazed and confused, but trying to continue\n");
31766 + printk("Do you have a strange power saving mode enabled?\n");
31767 +}
31768 +
31769 +/* Runs on IST stack. This code must keep interrupts off all the time.
31770 + Nested NMIs are prevented by the CPU. */
31771 +asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
31772 +{
31773 + unsigned char reason = 0;
31774 + int cpu;
31775 +
31776 + cpu = smp_processor_id();
31777 +
31778 + /* Only the BSP gets external NMIs from the system. */
31779 + if (!cpu)
31780 + reason = get_nmi_reason();
31781 +
31782 + if (!(reason & 0xc0)) {
31783 + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
31784 + == NOTIFY_STOP)
31785 + return;
31786 +#ifdef CONFIG_X86_LOCAL_APIC
31787 + /*
31788 + * Ok, so this is none of the documented NMI sources,
31789 + * so it must be the NMI watchdog.
31790 + */
31791 + if (nmi_watchdog > 0) {
31792 + nmi_watchdog_tick(regs,reason);
31793 + return;
31794 + }
31795 +#endif
31796 + unknown_nmi_error(reason, regs);
31797 + return;
31798 + }
31799 + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
31800 + return;
31801 +
31802 + /* AK: following checks seem to be broken on modern chipsets. FIXME */
31803 +
31804 + if (reason & 0x80)
31805 + mem_parity_error(reason, regs);
31806 + if (reason & 0x40)
31807 + io_check_error(reason, regs);
31808 +}
31809 +
31810 +/* runs on IST stack. */
31811 +asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
31812 +{
31813 + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
31814 + return;
31815 + }
31816 + preempt_conditional_sti(regs);
31817 + do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
31818 + preempt_conditional_cli(regs);
31819 +}
31820 +
31821 +/* Help handler running on IST stack to switch back to user stack
31822 + for scheduling or signal handling. The actual stack switch is done in
31823 + entry.S */
31824 +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
31825 +{
31826 + struct pt_regs *regs = eregs;
31827 + /* Did already sync */
31828 + if (eregs == (struct pt_regs *)eregs->rsp)
31829 + ;
31830 + /* Exception from user space */
31831 + else if (user_mode(eregs))
31832 + regs = task_pt_regs(current);
31833 + /* Exception from kernel and interrupts are enabled. Move to
31834 + kernel process stack. */
31835 + else if (eregs->eflags & X86_EFLAGS_IF)
31836 + regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
31837 + if (eregs != regs)
31838 + *regs = *eregs;
31839 + return regs;
31840 +}
31841 +
31842 +/* runs on IST stack. */
31843 +asmlinkage void __kprobes do_debug(struct pt_regs * regs,
31844 + unsigned long error_code)
31845 +{
31846 + unsigned long condition;
31847 + struct task_struct *tsk = current;
31848 + siginfo_t info;
31849 +
31850 + get_debugreg(condition, 6);
31851 +
31852 + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
31853 + SIGTRAP) == NOTIFY_STOP)
31854 + return;
31855 +
31856 + preempt_conditional_sti(regs);
31857 +
31858 + /* Mask out spurious debug traps due to lazy DR7 setting */
31859 + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
31860 + if (!tsk->thread.debugreg7) {
31861 + goto clear_dr7;
31862 + }
31863 + }
31864 +
31865 + tsk->thread.debugreg6 = condition;
31866 +
31867 + /* Mask out spurious TF errors due to lazy TF clearing */
31868 + if (condition & DR_STEP) {
31869 + /*
31870 + * The TF error should be masked out only if the current
31871 + * process is not traced and if the TRAP flag has been set
31872 + * previously by a tracing process (condition detected by
31873 + * the PT_DTRACE flag); remember that the i386 TRAP flag
31874 + * can be modified by the process itself in user mode,
31875 + * allowing programs to debug themselves without the ptrace()
31876 + * interface.
31877 + */
31878 + if (!user_mode(regs))
31879 + goto clear_TF_reenable;
31880 + /*
31881 + * Was the TF flag set by a debugger? If so, clear it now,
31882 + * so that register information is correct.
31883 + */
31884 + if (tsk->ptrace & PT_DTRACE) {
31885 + regs->eflags &= ~TF_MASK;
31886 + tsk->ptrace &= ~PT_DTRACE;
31887 + }
31888 + }
31889 +
31890 + /* Ok, finally something we can handle */
31891 + tsk->thread.trap_no = 1;
31892 + tsk->thread.error_code = error_code;
31893 + info.si_signo = SIGTRAP;
31894 + info.si_errno = 0;
31895 + info.si_code = TRAP_BRKPT;
31896 + info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
31897 + force_sig_info(SIGTRAP, &info, tsk);
31898 +
31899 +clear_dr7:
31900 + set_debugreg(0UL, 7);
31901 + preempt_conditional_cli(regs);
31902 + return;
31903 +
31904 +clear_TF_reenable:
31905 + set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
31906 + regs->eflags &= ~TF_MASK;
31907 + preempt_conditional_cli(regs);
31908 +}
31909 +
31910 +static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
31911 +{
31912 + const struct exception_table_entry *fixup;
31913 + fixup = search_exception_tables(regs->rip);
31914 + if (fixup) {
31915 + regs->rip = fixup->fixup;
31916 + return 1;
31917 + }
31918 + notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
31919 + /* Illegal floating point operation in the kernel */
31920 + current->thread.trap_no = trapnr;
31921 + die(str, regs, 0);
31922 + return 0;
31923 +}
31924 +
31925 +/*
31926 + * Note that we play around with the 'TS' bit in an attempt to get
31927 + * the correct behaviour even in the presence of the asynchronous
31928 + * IRQ13 behaviour
31929 + */
31930 +asmlinkage void do_coprocessor_error(struct pt_regs *regs)
31931 +{
31932 + void __user *rip = (void __user *)(regs->rip);
31933 + struct task_struct * task;
31934 + siginfo_t info;
31935 + unsigned short cwd, swd;
31936 +
31937 + conditional_sti(regs);
31938 + if (!user_mode(regs) &&
31939 + kernel_math_error(regs, "kernel x87 math error", 16))
31940 + return;
31941 +
31942 + /*
31943 + * Save the info for the exception handler and clear the error.
31944 + */
31945 + task = current;
31946 + save_init_fpu(task);
31947 + task->thread.trap_no = 16;
31948 + task->thread.error_code = 0;
31949 + info.si_signo = SIGFPE;
31950 + info.si_errno = 0;
31951 + info.si_code = __SI_FAULT;
31952 + info.si_addr = rip;
31953 + /*
31954 + * (~cwd & swd) will mask out exceptions that are not set to unmasked
31955 + * status. 0x3f is the exception bits in these regs, 0x200 is the
31956 + * C1 reg you need in case of a stack fault, 0x040 is the stack
31957 + * fault bit. We should only be taking one exception at a time,
31958 + * so if this combination doesn't produce any single exception,
31959 + * then we have a bad program that isn't synchronizing its FPU usage
31960 + * and it will suffer the consequences since we won't be able to
31961 + * fully reproduce the context of the exception
31962 + */
31963 + cwd = get_fpu_cwd(task);
31964 + swd = get_fpu_swd(task);
31965 + switch (swd & ~cwd & 0x3f) {
31966 + case 0x000:
31967 + default:
31968 + break;
31969 + case 0x001: /* Invalid Op */
31970 + /*
31971 + * swd & 0x240 == 0x040: Stack Underflow
31972 + * swd & 0x240 == 0x240: Stack Overflow
31973 + * User must clear the SF bit (0x40) if set
31974 + */
31975 + info.si_code = FPE_FLTINV;
31976 + break;
31977 + case 0x002: /* Denormalize */
31978 + case 0x010: /* Underflow */
31979 + info.si_code = FPE_FLTUND;
31980 + break;
31981 + case 0x004: /* Zero Divide */
31982 + info.si_code = FPE_FLTDIV;
31983 + break;
31984 + case 0x008: /* Overflow */
31985 + info.si_code = FPE_FLTOVF;
31986 + break;
31987 + case 0x020: /* Precision */
31988 + info.si_code = FPE_FLTRES;
31989 + break;
31990 + }
31991 + force_sig_info(SIGFPE, &info, task);
31992 +}
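A worked example of the (~cwd & swd) masking described in the comment above, with made-up register values (illustrative only, not part of the patch):

/* The default x87 control word is 0x037f; clearing bit 2 (ZM) unmasks
 * divide-by-zero, and an actual zero divide sets bit 2 (ZE) in the
 * status word. */
unsigned short cwd = 0x037b;	/* 0x037f with ZM cleared */
unsigned short swd = 0x0004;	/* ZE set                 */
/* swd & ~cwd & 0x3f == 0x004, so the switch above selects FPE_FLTDIV. */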
31993 +
31994 +asmlinkage void bad_intr(void)
31995 +{
31996 + printk("bad interrupt");
31997 +}
31998 +
31999 +asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
32000 +{
32001 + void __user *rip = (void __user *)(regs->rip);
32002 + struct task_struct * task;
32003 + siginfo_t info;
32004 + unsigned short mxcsr;
32005 +
32006 + conditional_sti(regs);
32007 + if (!user_mode(regs) &&
32008 + kernel_math_error(regs, "kernel simd math error", 19))
32009 + return;
32010 +
32011 + /*
32012 + * Save the info for the exception handler and clear the error.
32013 + */
32014 + task = current;
32015 + save_init_fpu(task);
32016 + task->thread.trap_no = 19;
32017 + task->thread.error_code = 0;
32018 + info.si_signo = SIGFPE;
32019 + info.si_errno = 0;
32020 + info.si_code = __SI_FAULT;
32021 + info.si_addr = rip;
32022 + /*
32023 + * The SIMD FPU exceptions are handled a little differently, as there
32024 + * is only a single status/control register. Thus, to determine which
32025 + * unmasked exception was caught we must mask the exception mask bits
32026 + * at 0x1f80, and then use these to mask the exception bits at 0x3f.
32027 + */
32028 + mxcsr = get_fpu_mxcsr(task);
32029 + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
32030 + case 0x000:
32031 + default:
32032 + break;
32033 + case 0x001: /* Invalid Op */
32034 + info.si_code = FPE_FLTINV;
32035 + break;
32036 + case 0x002: /* Denormalize */
32037 + case 0x010: /* Underflow */
32038 + info.si_code = FPE_FLTUND;
32039 + break;
32040 + case 0x004: /* Zero Divide */
32041 + info.si_code = FPE_FLTDIV;
32042 + break;
32043 + case 0x008: /* Overflow */
32044 + info.si_code = FPE_FLTOVF;
32045 + break;
32046 + case 0x020: /* Precision */
32047 + info.si_code = FPE_FLTRES;
32048 + break;
32049 + }
32050 + force_sig_info(SIGFPE, &info, task);
32051 +}
32052 +
32053 +asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
32054 +{
32055 +}
32056 +
32057 +#if 0
32058 +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
32059 +{
32060 +}
32061 +#endif
32062 +
32063 +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
32064 +{
32065 +}
32066 +
32067 +/*
32068 + * 'math_state_restore()' saves the current math information in the
32069 + * old math state array, and gets the new ones from the current task
32070 + *
32071 + * Careful.. There are problems with IBM-designed IRQ13 behaviour.
32072 + * Don't touch unless you *really* know how it works.
32073 + */
32074 +asmlinkage void math_state_restore(void)
32075 +{
32076 + struct task_struct *me = current;
32077 + /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
32078 +
32079 + if (!used_math())
32080 + init_fpu(me);
32081 + restore_fpu_checking(&me->thread.i387.fxsave);
32082 + task_thread_info(me)->status |= TS_USEDFPU;
32083 +}
32084 +
32085 +
32086 +/*
32087 + * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
32088 + * specify <dpl>|4 in the second field.
32089 + */
32090 +static trap_info_t __cpuinitdata trap_table[] = {
32091 + { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error },
32092 + { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
32093 + { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
32094 + { 4, 3|4, __KERNEL_CS, (unsigned long)overflow },
32095 + { 5, 0|4, __KERNEL_CS, (unsigned long)bounds },
32096 + { 6, 0|4, __KERNEL_CS, (unsigned long)invalid_op },
32097 + { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available },
32098 + { 9, 0|4, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun},
32099 + { 10, 0|4, __KERNEL_CS, (unsigned long)invalid_TSS },
32100 + { 11, 0|4, __KERNEL_CS, (unsigned long)segment_not_present },
32101 + { 12, 0|4, __KERNEL_CS, (unsigned long)stack_segment },
32102 + { 13, 0|4, __KERNEL_CS, (unsigned long)general_protection },
32103 + { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault },
32104 + { 15, 0|4, __KERNEL_CS, (unsigned long)spurious_interrupt_bug },
32105 + { 16, 0|4, __KERNEL_CS, (unsigned long)coprocessor_error },
32106 + { 17, 0|4, __KERNEL_CS, (unsigned long)alignment_check },
32107 +#ifdef CONFIG_X86_MCE
32108 + { 18, 0|4, __KERNEL_CS, (unsigned long)machine_check },
32109 +#endif
32110 + { 19, 0|4, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
32111 +#ifdef CONFIG_IA32_EMULATION
32112 + { IA32_SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)ia32_syscall},
32113 +#endif
32114 + { 0, 0, 0, 0 }
32115 +};
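As a reading aid for the <dpl>|4 convention noted above the table, a sketch decoding one entry (illustrative only; it relies solely on the trap_info_t fields this file already uses):

/* Decode of the int3 entry { 3, 3|4, __KERNEL_CS, (unsigned long)int3 }: */
const trap_info_t *t_int3 = &trap_table[2];
unsigned int dpl    = t_int3->flags & 3;	/* 3: may be raised from user mode            */
unsigned int masked = t_int3->flags & 4;	/* set: event delivery stays masked on entry, */
						/* i.e. the vector acts as an interrupt gate */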
32116 +
32117 +void __init trap_init(void)
32118 +{
32119 + int ret;
32120 +
32121 + ret = HYPERVISOR_set_trap_table(trap_table);
32122 + if (ret)
32123 + printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
32124 +
32125 + /*
32126 + * Should be a barrier for any external CPU state.
32127 + */
32128 + cpu_init();
32129 +}
32130 +
32131 +void __cpuinit smp_trap_init(trap_info_t *trap_ctxt)
32132 +{
32133 + const trap_info_t *t = trap_table;
32134 +
32135 + for (t = trap_table; t->address; t++) {
32136 + trap_ctxt[t->vector].flags = t->flags;
32137 + trap_ctxt[t->vector].cs = t->cs;
32138 + trap_ctxt[t->vector].address = t->address;
32139 + }
32140 +}
32141 +
32142 +
32143 +/* Actual parsing is done early in setup.c. */
32144 +static int __init oops_dummy(char *s)
32145 +{
32146 + panic_on_oops = 1;
32147 + return 1;
32148 +}
32149 +__setup("oops=", oops_dummy);
32150 +
32151 +static int __init kstack_setup(char *s)
32152 +{
32153 +	kstack_depth_to_print = simple_strtoul(s, NULL, 0);
32154 + return 1;
32155 +}
32156 +__setup("kstack=", kstack_setup);
32157 +
32158 +#ifdef CONFIG_STACK_UNWIND
32159 +static int __init call_trace_setup(char *s)
32160 +{
32161 + if (strcmp(s, "old") == 0)
32162 + call_trace = -1;
32163 + else if (strcmp(s, "both") == 0)
32164 + call_trace = 0;
32165 + else if (strcmp(s, "newfallback") == 0)
32166 + call_trace = 1;
32167 + else if (strcmp(s, "new") == 0)
32168 + call_trace = 2;
32169 + return 1;
32170 +}
32171 +__setup("call_trace=", call_trace_setup);
32172 +#endif
32173 Index: head-2008-11-25/arch/x86/kernel/vsyscall_64-xen.c
32174 ===================================================================
32175 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
32176 +++ head-2008-11-25/arch/x86/kernel/vsyscall_64-xen.c 2007-06-18 08:38:13.000000000 +0200
32177 @@ -0,0 +1,227 @@
32178 +/*
32179 + * linux/arch/x86_64/kernel/vsyscall.c
32180 + *
32181 + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
32182 + * Copyright 2003 Andi Kleen, SuSE Labs.
32183 + *
32184 + * Thanks to hpa@transmeta.com for some useful hints.
32185 + * Special thanks to Ingo Molnar for his early experience with
32186 + * a different vsyscall implementation for Linux/IA32 and for the name.
32187 + *
32188 + * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
32189 + * at virtual address -10Mbyte+1024bytes etc... There are at max 4
32190 + * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
32191 + * jumping out of line if necessary. We cannot add more with this
32192 + * mechanism because older kernels won't return -ENOSYS.
32193 + * If we want more than four we need a vDSO.
32194 + *
32195 + * Note: the concept clashes with user mode linux. If you use UML and
32196 + * want per guest time just set the kernel.vsyscall64 sysctl to 0.
32197 + */
32198 +
32199 +#include <linux/time.h>
32200 +#include <linux/init.h>
32201 +#include <linux/kernel.h>
32202 +#include <linux/timer.h>
32203 +#include <linux/seqlock.h>
32204 +#include <linux/jiffies.h>
32205 +#include <linux/sysctl.h>
32206 +
32207 +#include <asm/vsyscall.h>
32208 +#include <asm/pgtable.h>
32209 +#include <asm/page.h>
32210 +#include <asm/fixmap.h>
32211 +#include <asm/errno.h>
32212 +#include <asm/io.h>
32213 +
32214 +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
32215 +
32216 +int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
32217 +seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
32218 +
32219 +#include <asm/unistd.h>
32220 +
32221 +static __always_inline void timeval_normalize(struct timeval * tv)
32222 +{
32223 + time_t __sec;
32224 +
32225 + __sec = tv->tv_usec / 1000000;
32226 + if (__sec) {
32227 + tv->tv_usec %= 1000000;
32228 + tv->tv_sec += __sec;
32229 + }
32230 +}
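A quick usage sketch of the helper above, with illustrative values:

struct timeval tv = { .tv_sec = 1, .tv_usec = 2500000 };
timeval_normalize(&tv);		/* tv is now { .tv_sec = 3, .tv_usec = 500000 } */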
32231 +
32232 +static __always_inline void do_vgettimeofday(struct timeval * tv)
32233 +{
32234 + long sequence, t;
32235 + unsigned long sec, usec;
32236 +
32237 + do {
32238 + sequence = read_seqbegin(&__xtime_lock);
32239 +
32240 + sec = __xtime.tv_sec;
32241 + usec = (__xtime.tv_nsec / 1000) +
32242 + (__jiffies - __wall_jiffies) * (1000000 / HZ);
32243 +
32244 + if (__vxtime.mode != VXTIME_HPET) {
32245 + t = get_cycles_sync();
32246 + if (t < __vxtime.last_tsc)
32247 + t = __vxtime.last_tsc;
32248 + usec += ((t - __vxtime.last_tsc) *
32249 + __vxtime.tsc_quot) >> 32;
32250 + /* See comment in x86_64 do_gettimeofday. */
32251 + } else {
32252 + usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
32253 + __vxtime.last) * __vxtime.quot) >> 32;
32254 + }
32255 + } while (read_seqretry(&__xtime_lock, sequence));
32256 +
32257 + tv->tv_sec = sec + usec / 1000000;
32258 + tv->tv_usec = usec % 1000000;
32259 +}
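
The read_seqbegin()/read_seqretry() loop above is the standard seqlock reader pattern: retry whenever a writer was active (odd sequence count) or the count changed while the values were being copied. A simplified userspace analogue with C11 atomics is sketched below; writer_update() and reader_snapshot() are hypothetical names, and a production seqlock additionally needs memory barriers around the plain data accesses:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned seq;            /* even: idle, odd: writer active */
static unsigned long shared_sec, shared_usec;

static void writer_update(unsigned long sec, unsigned long usec)
{
        atomic_fetch_add(&seq, 1);      /* becomes odd: readers will retry */
        shared_sec = sec;
        shared_usec = usec;
        atomic_fetch_add(&seq, 1);      /* even again: snapshot is stable */
}

static void reader_snapshot(unsigned long *sec, unsigned long *usec)
{
        unsigned start;

        do {
                start = atomic_load(&seq);
                *sec = shared_sec;
                *usec = shared_usec;
        } while ((start & 1) || atomic_load(&seq) != start);
}

int main(void)
{
        unsigned long s, u;

        writer_update(1000, 250000);
        reader_snapshot(&s, &u);
        printf("%lu.%06lu\n", s, u);
        return 0;
}
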
32260 +
32261 +/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
32262 +static __always_inline void do_get_tz(struct timezone * tz)
32263 +{
32264 + *tz = __sys_tz;
32265 +}
32266 +
32267 +static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
32268 +{
32269 + int ret;
32270 + asm volatile("vsysc2: syscall"
32271 + : "=a" (ret)
32272 + : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
32273 + return ret;
32274 +}
32275 +
32276 +static __always_inline long time_syscall(long *t)
32277 +{
32278 + long secs;
32279 + asm volatile("vsysc1: syscall"
32280 + : "=a" (secs)
32281 + : "0" (__NR_time),"D" (t) : __syscall_clobber);
32282 + return secs;
32283 +}
32284 +
32285 +int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
32286 +{
32287 + if (!__sysctl_vsyscall)
32288 + return gettimeofday(tv,tz);
32289 + if (tv)
32290 + do_vgettimeofday(tv);
32291 + if (tz)
32292 + do_get_tz(tz);
32293 + return 0;
32294 +}
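
When the fast path is disabled, the function falls back to the vsysc2 stub, i.e. a plain syscall instruction. The same fallback can be demonstrated from userspace on x86-64; in this sketch raw_gettimeofday() is a hypothetical helper, and the clobber list reflects the x86-64 syscall ABI (rcx and r11 are trashed by the instruction):

#include <stdio.h>
#include <sys/time.h>
#include <sys/syscall.h>

static long raw_gettimeofday(struct timeval *tv, struct timezone *tz)
{
        long ret;

        __asm__ volatile("syscall"
                         : "=a" (ret)
                         : "0" ((long)SYS_gettimeofday), "D" (tv), "S" (tz)
                         : "rcx", "r11", "memory");
        return ret;
}

int main(void)
{
        struct timeval tv;

        if (raw_gettimeofday(&tv, NULL) == 0)
                printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);
        return 0;
}
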
32295 +
32296 +/* This will break when the xtime seconds get inaccurate, but that is
32297 + * unlikely */
32298 +time_t __vsyscall(1) vtime(time_t *t)
32299 +{
32300 + if (!__sysctl_vsyscall)
32301 + return time_syscall(t);
32302 + else if (t)
32303 + *t = __xtime.tv_sec;
32304 + return __xtime.tv_sec;
32305 +}
32306 +
32307 +long __vsyscall(2) venosys_0(void)
32308 +{
32309 + return -ENOSYS;
32310 +}
32311 +
32312 +long __vsyscall(3) venosys_1(void)
32313 +{
32314 + return -ENOSYS;
32315 +}
32316 +
32317 +#ifdef CONFIG_SYSCTL
32318 +
32319 +#define SYSCALL 0x050f
32320 +#define NOP2 0x9090
32321 +
32322 +/*
32323 + * NOP out syscall in vsyscall page when not needed.
32324 + */
32325 +static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
32326 + void __user *buffer, size_t *lenp, loff_t *ppos)
32327 +{
32328 + extern u16 vsysc1, vsysc2;
32329 + u16 *map1, *map2;
32330 + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
32331 + if (!write)
32332 + return ret;
32333 + /* gcc has some trouble with __va(__pa()), so just do it this
32334 + way. */
32335 + map1 = ioremap(__pa_symbol(&vsysc1), 2);
32336 + if (!map1)
32337 + return -ENOMEM;
32338 + map2 = ioremap(__pa_symbol(&vsysc2), 2);
32339 + if (!map2) {
32340 + ret = -ENOMEM;
32341 + goto out;
32342 + }
32343 + if (!sysctl_vsyscall) {
32344 + *map1 = SYSCALL;
32345 + *map2 = SYSCALL;
32346 + } else {
32347 + *map1 = NOP2;
32348 + *map2 = NOP2;
32349 + }
32350 + iounmap(map2);
32351 +out:
32352 + iounmap(map1);
32353 + return ret;
32354 +}
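
The SYSCALL and NOP2 constants used above are 16-bit little-endian views of the two bytes being patched: 0x050f stores as 0f 05 (the syscall opcode) and 0x9090 as two nops. A tiny host-side check (assumes a little-endian x86 machine; illustrative only):

#include <stdio.h>
#include <string.h>

int main(void)
{
        unsigned short site = 0x050f;           /* SYSCALL above */
        unsigned char bytes[2];

        memcpy(bytes, &site, 2);
        printf("0x050f -> %02x %02x (syscall)\n", bytes[0], bytes[1]);

        site = 0x9090;                          /* NOP2 above */
        memcpy(bytes, &site, 2);
        printf("0x9090 -> %02x %02x (nop; nop)\n", bytes[0], bytes[1]);
        return 0;
}
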
32355 +
32356 +static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
32357 + void __user *oldval, size_t __user *oldlenp,
32358 + void __user *newval, size_t newlen,
32359 + void **context)
32360 +{
32361 + return -ENOSYS;
32362 +}
32363 +
32364 +static ctl_table kernel_table2[] = {
32365 + { .ctl_name = 99, .procname = "vsyscall64",
32366 + .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
32367 + .strategy = vsyscall_sysctl_nostrat,
32368 + .proc_handler = vsyscall_sysctl_change },
32369 + { 0, }
32370 +};
32371 +
32372 +static ctl_table kernel_root_table2[] = {
32373 + { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
32374 + .child = kernel_table2 },
32375 + { 0 },
32376 +};
32377 +
32378 +#endif
32379 +
32380 +static void __init map_vsyscall(void)
32381 +{
32382 + extern char __vsyscall_0;
32383 + unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
32384 +
32385 + __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
32386 +}
32387 +
32388 +static int __init vsyscall_init(void)
32389 +{
32390 + BUG_ON(((unsigned long) &vgettimeofday !=
32391 + VSYSCALL_ADDR(__NR_vgettimeofday)));
32392 + BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
32393 + BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
32394 + map_vsyscall();
32395 +#ifdef CONFIG_XEN
32396 + sysctl_vsyscall = 0; /* disable vgettimeofday() */
32397 +#endif
32398 +#ifdef CONFIG_SYSCTL
32399 + register_sysctl_table(kernel_root_table2, 0);
32400 +#endif
32401 + return 0;
32402 +}
32403 +
32404 +__initcall(vsyscall_init);
32405 Index: head-2008-11-25/arch/x86/kernel/xen_entry_64.S
32406 ===================================================================
32407 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
32408 +++ head-2008-11-25/arch/x86/kernel/xen_entry_64.S 2008-04-02 12:34:02.000000000 +0200
32409 @@ -0,0 +1,36 @@
32410 +/*
32411 + * Copied from arch/xen/i386/kernel/entry.S
32412 + */
32413 +/* Offsets into shared_info_t. */
32414 +#define evtchn_upcall_pending /* 0 */
32415 +#define evtchn_upcall_mask 1
32416 +
32417 +#define sizeof_vcpu_shift 6
32418 +
32419 +#ifdef CONFIG_SMP
32420 +//#define preempt_disable(reg) incl threadinfo_preempt_count(reg)
32421 +//#define preempt_enable(reg) decl threadinfo_preempt_count(reg)
32422 +#define preempt_disable(reg)
32423 +#define preempt_enable(reg)
32424 +#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \
32425 + movq %gs:pda_cpunumber,reg ; \
32426 + shl $32, reg ; \
32427 + shr $32-sizeof_vcpu_shift,reg ; \
32428 + addq HYPERVISOR_shared_info,reg
32429 +#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \
32430 +#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
32431 +#else
32432 +#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
32433 +#define XEN_PUT_VCPU_INFO(reg)
32434 +#define XEN_PUT_VCPU_INFO_fixup
32435 +#endif
32436 +
32437 +#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg)
32438 +#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
32439 +#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
32440 + XEN_LOCKED_BLOCK_EVENTS(reg) ; \
32441 + XEN_PUT_VCPU_INFO(reg)
32442 +#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
32443 + XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \
32444 + XEN_PUT_VCPU_INFO(reg)
32445 +#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg)
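
For reference, the shl $32 / shr $(32 - sizeof_vcpu_shift) pair in XEN_GET_VCPU_INFO truncates the CPU number to 32 bits and multiplies it by 1 << sizeof_vcpu_shift (64 bytes per vcpu_info), giving the byte offset of that vcpu's slot inside shared_info. The same arithmetic as standalone C (sketch; vcpu_info_offset() is a hypothetical name):

#include <stdio.h>

#define SIZEOF_VCPU_SHIFT 6             /* one vcpu_info slot is 64 bytes */

static unsigned long vcpu_info_offset(unsigned long cpu)
{
        cpu <<= 32;                     /* discard any high bits */
        cpu >>= 32 - SIZEOF_VCPU_SHIFT; /* == (original cpu & 0xffffffff) << 6 */
        return cpu;
}

int main(void)
{
        for (unsigned long cpu = 0; cpu < 4; cpu++)
                printf("cpu %lu -> offset %lu\n", cpu, vcpu_info_offset(cpu));
        return 0;
}
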
32446 Index: head-2008-11-25/arch/x86/mm/fault_64-xen.c
32447 ===================================================================
32448 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
32449 +++ head-2008-11-25/arch/x86/mm/fault_64-xen.c 2007-11-02 17:34:23.000000000 +0100
32450 @@ -0,0 +1,724 @@
32451 +/*
32452 + * linux/arch/x86-64/mm/fault.c
32453 + *
32454 + * Copyright (C) 1995 Linus Torvalds
32455 + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
32456 + */
32457 +
32458 +#include <linux/signal.h>
32459 +#include <linux/sched.h>
32460 +#include <linux/kernel.h>
32461 +#include <linux/errno.h>
32462 +#include <linux/string.h>
32463 +#include <linux/types.h>
32464 +#include <linux/ptrace.h>
32465 +#include <linux/mman.h>
32466 +#include <linux/mm.h>
32467 +#include <linux/smp.h>
32468 +#include <linux/smp_lock.h>
32469 +#include <linux/interrupt.h>
32470 +#include <linux/init.h>
32471 +#include <linux/tty.h>
32472 +#include <linux/vt_kern.h> /* For unblank_screen() */
32473 +#include <linux/compiler.h>
32474 +#include <linux/module.h>
32475 +#include <linux/kprobes.h>
32476 +
32477 +#include <asm/system.h>
32478 +#include <asm/uaccess.h>
32479 +#include <asm/pgalloc.h>
32480 +#include <asm/smp.h>
32481 +#include <asm/tlbflush.h>
32482 +#include <asm/proto.h>
32483 +#include <asm/kdebug.h>
32484 +#include <asm-generic/sections.h>
32485 +
32486 +/* Page fault error code bits */
32487 +#define PF_PROT (1<<0) /* or no page found */
32488 +#define PF_WRITE (1<<1)
32489 +#define PF_USER (1<<2)
32490 +#define PF_RSVD (1<<3)
32491 +#define PF_INSTR (1<<4)
32492 +
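
The PF_* bits above encode what the hardware reported about the fault; the handler's later tests simply mask them. A small standalone decoder (illustrative only):

#include <stdio.h>

#define PF_PROT  (1 << 0)       /* 0: not-present, 1: protection */
#define PF_WRITE (1 << 1)
#define PF_USER  (1 << 2)
#define PF_RSVD  (1 << 3)
#define PF_INSTR (1 << 4)

static void decode(unsigned long err)
{
        printf("%#lx: %s, %s, %s%s%s\n", err,
               err & PF_PROT  ? "protection" : "not-present",
               err & PF_WRITE ? "write"      : "read",
               err & PF_USER  ? "user"       : "kernel",
               err & PF_RSVD  ? ", reserved-bit" : "",
               err & PF_INSTR ? ", instruction-fetch" : "");
}

int main(void)
{
        decode(0x6);    /* user write to a not-present page */
        decode(0x11);   /* kernel instruction fetch hitting a protection fault */
        return 0;
}
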
32493 +#ifdef CONFIG_KPROBES
32494 +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
32495 +
32496 +/* Hook to register for page fault notifications */
32497 +int register_page_fault_notifier(struct notifier_block *nb)
32498 +{
32499 + vmalloc_sync_all();
32500 + return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
32501 +}
32502 +
32503 +int unregister_page_fault_notifier(struct notifier_block *nb)
32504 +{
32505 + return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
32506 +}
32507 +
32508 +static inline int notify_page_fault(enum die_val val, const char *str,
32509 + struct pt_regs *regs, long err, int trap, int sig)
32510 +{
32511 + struct die_args args = {
32512 + .regs = regs,
32513 + .str = str,
32514 + .err = err,
32515 + .trapnr = trap,
32516 + .signr = sig
32517 + };
32518 + return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
32519 +}
32520 +#else
32521 +static inline int notify_page_fault(enum die_val val, const char *str,
32522 + struct pt_regs *regs, long err, int trap, int sig)
32523 +{
32524 + return NOTIFY_DONE;
32525 +}
32526 +#endif
32527 +
32528 +void bust_spinlocks(int yes)
32529 +{
32530 + int loglevel_save = console_loglevel;
32531 + if (yes) {
32532 + oops_in_progress = 1;
32533 + } else {
32534 +#ifdef CONFIG_VT
32535 + unblank_screen();
32536 +#endif
32537 + oops_in_progress = 0;
32538 + /*
32539 + * OK, the message is on the console. Now we call printk()
32540 + * without oops_in_progress set so that printk will give klogd
32541 + * a poke. Hold onto your hats...
32542 + */
32543 + console_loglevel = 15; /* NMI oopser may have shut the console up */
32544 + printk(" ");
32545 + console_loglevel = loglevel_save;
32546 + }
32547 +}
32548 +
32549 +/* Sometimes the CPU reports invalid exceptions on prefetch.
32550 + Check that here and ignore.
32551 + Opcode checker based on code by Richard Brunner */
32552 +static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
32553 + unsigned long error_code)
32554 +{
32555 + unsigned char *instr;
32556 + int scan_more = 1;
32557 + int prefetch = 0;
32558 + unsigned char *max_instr;
32559 +
32560 + /* If it was an exec fault, ignore */
32561 + if (error_code & PF_INSTR)
32562 + return 0;
32563 +
32564 + instr = (unsigned char *)convert_rip_to_linear(current, regs);
32565 + max_instr = instr + 15;
32566 +
32567 + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
32568 + return 0;
32569 +
32570 + while (scan_more && instr < max_instr) {
32571 + unsigned char opcode;
32572 + unsigned char instr_hi;
32573 + unsigned char instr_lo;
32574 +
32575 + if (__get_user(opcode, instr))
32576 + break;
32577 +
32578 + instr_hi = opcode & 0xf0;
32579 + instr_lo = opcode & 0x0f;
32580 + instr++;
32581 +
32582 + switch (instr_hi) {
32583 + case 0x20:
32584 + case 0x30:
32585 + /* Values 0x26,0x2E,0x36,0x3E are valid x86
32586 + prefixes. In long mode, the CPU will signal
32587 + invalid opcode if some of these prefixes are
32588 + present so we will never get here anyway */
32589 + scan_more = ((instr_lo & 7) == 0x6);
32590 + break;
32591 +
32592 + case 0x40:
32593 + /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
32594 + Need to figure out under what instruction mode the
32595 + instruction was issued ... */
32596 + /* Could check the LDT for lm, but for now it's good
32597 + enough to assume that long mode only uses well known
32598 + segments or kernel. */
32599 + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
32600 + break;
32601 +
32602 + case 0x60:
32603 + /* 0x64 thru 0x67 are valid prefixes in all modes. */
32604 + scan_more = (instr_lo & 0xC) == 0x4;
32605 + break;
32606 + case 0xF0:
32607 + /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
32608 + scan_more = !instr_lo || (instr_lo>>1) == 1;
32609 + break;
32610 + case 0x00:
32611 + /* Prefetch instruction is 0x0F0D or 0x0F18 */
32612 + scan_more = 0;
32613 + if (__get_user(opcode, instr))
32614 + break;
32615 + prefetch = (instr_lo == 0xF) &&
32616 + (opcode == 0x0D || opcode == 0x18);
32617 + break;
32618 + default:
32619 + scan_more = 0;
32620 + break;
32621 + }
32622 + }
32623 + return prefetch;
32624 +}
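
A simplified standalone rendering of that opcode walk is sketched below; looks_like_prefetch() is a hypothetical helper that keeps the prefix-skipping idea but drops the user-mode and code-segment checks the real handler performs:

#include <stdio.h>

static int looks_like_prefetch(const unsigned char *instr, int len)
{
        for (int i = 0; i + 1 < len && i < 15; i++) {
                unsigned char op = instr[i];

                /* skip the prefixes tolerated above: segment overrides
                 * 0x26/0x2E/0x36/0x3E, REX 0x40-0x4F, 0x64-0x67, lock/rep */
                if ((op & 0xf0) == 0x20 || (op & 0xf0) == 0x30) {
                        if ((op & 7) == 6)
                                continue;
                        return 0;
                }
                if ((op & 0xf0) == 0x40)
                        continue;
                if (op >= 0x64 && op <= 0x67)
                        continue;
                if (op == 0xF0 || op == 0xF2 || op == 0xF3)
                        continue;

                /* PREFETCH is 0F 0D, PREFETCHh is 0F 18 */
                return op == 0x0F && (instr[i + 1] == 0x0D || instr[i + 1] == 0x18);
        }
        return 0;
}

int main(void)
{
        unsigned char prefetchnta[] = { 0x0F, 0x18, 0x00 };     /* prefetchnta (%rax) */
        unsigned char mov[]         = { 0x48, 0x89, 0xC3 };     /* mov %rax,%rbx */

        printf("%d %d\n", looks_like_prefetch(prefetchnta, 3),
               looks_like_prefetch(mov, 3));                    /* prints: 1 0 */
        return 0;
}
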
32625 +
32626 +static int bad_address(void *p)
32627 +{
32628 + unsigned long dummy;
32629 + return __get_user(dummy, (unsigned long *)p);
32630 +}
32631 +
32632 +void dump_pagetable(unsigned long address)
32633 +{
32634 + pgd_t *pgd;
32635 + pud_t *pud;
32636 + pmd_t *pmd;
32637 + pte_t *pte;
32638 +
32639 + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
32640 + pgd += pgd_index(address);
32641 + if (bad_address(pgd)) goto bad;
32642 + printk("PGD %lx ", pgd_val(*pgd));
32643 + if (!pgd_present(*pgd)) goto ret;
32644 +
32645 + pud = pud_offset(pgd, address);
32646 + if (bad_address(pud)) goto bad;
32647 + printk("PUD %lx ", pud_val(*pud));
32648 + if (!pud_present(*pud)) goto ret;
32649 +
32650 + pmd = pmd_offset(pud, address);
32651 + if (bad_address(pmd)) goto bad;
32652 + printk("PMD %lx ", pmd_val(*pmd));
32653 + if (!pmd_present(*pmd)) goto ret;
32654 +
32655 + pte = pte_offset_kernel(pmd, address);
32656 + if (bad_address(pte)) goto bad;
32657 + printk("PTE %lx", pte_val(*pte));
32658 +ret:
32659 + printk("\n");
32660 + return;
32661 +bad:
32662 + printk("BAD\n");
32663 +}
32664 +
32665 +static const char errata93_warning[] =
32666 +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
32667 +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
32668 +KERN_ERR "******* Please consider a BIOS update.\n"
32669 +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
32670 +
32671 +/* Workaround for K8 erratum #93 & buggy BIOS.
32672 + BIOS SMM functions are required to use a specific workaround
32673 + to avoid corruption of the 64bit RIP register on C stepping K8.
32674 + A lot of BIOS that didn't get tested properly miss this.
32675 + The OS sees this as a page fault with the upper 32bits of RIP cleared.
32676 + Try to work around it here.
32677 + Note we only handle faults in kernel here. */
32678 +
32679 +static int is_errata93(struct pt_regs *regs, unsigned long address)
32680 +{
32681 + static int warned;
32682 + if (address != regs->rip)
32683 + return 0;
32684 + if ((address >> 32) != 0)
32685 + return 0;
32686 + address |= 0xffffffffUL << 32;
32687 + if ((address >= (u64)_stext && address <= (u64)_etext) ||
32688 + (address >= MODULES_VADDR && address <= MODULES_END)) {
32689 + if (!warned) {
32690 + printk(errata93_warning);
32691 + warned = 1;
32692 + }
32693 + regs->rip = address;
32694 + return 1;
32695 + }
32696 + return 0;
32697 +}
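
The fixup itself is just a sign-extension of the truncated RIP: OR the lost upper 32 bits back in, then check whether the result lands in kernel text or the modules range. A one-liner illustration (sketch only):

#include <stdio.h>

int main(void)
{
        unsigned long truncated = 0x80123456UL;                 /* upper half lost */
        unsigned long repaired  = truncated | (0xffffffffUL << 32);

        printf("%#lx -> %#lx\n", truncated, repaired);          /* 0xffffffff80123456 */
        return 0;
}
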
32698 +
32699 +int unhandled_signal(struct task_struct *tsk, int sig)
32700 +{
32701 + if (tsk->pid == 1)
32702 + return 1;
32703 + if (tsk->ptrace & PT_PTRACED)
32704 + return 0;
32705 + return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
32706 + (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
32707 +}
32708 +
32709 +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
32710 + unsigned long error_code)
32711 +{
32712 + unsigned long flags = oops_begin();
32713 + struct task_struct *tsk;
32714 +
32715 + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
32716 + current->comm, address);
32717 + dump_pagetable(address);
32718 + tsk = current;
32719 + tsk->thread.cr2 = address;
32720 + tsk->thread.trap_no = 14;
32721 + tsk->thread.error_code = error_code;
32722 + __die("Bad pagetable", regs, error_code);
32723 + oops_end(flags);
32724 + do_exit(SIGKILL);
32725 +}
32726 +
32727 +/*
32728 + * Handle a fault on the vmalloc area
32729 + *
32730 + * This assumes no large pages in there.
32731 + */
32732 +static int vmalloc_fault(unsigned long address)
32733 +{
32734 + pgd_t *pgd, *pgd_ref;
32735 + pud_t *pud, *pud_ref;
32736 + pmd_t *pmd, *pmd_ref;
32737 + pte_t *pte, *pte_ref;
32738 +
32739 + /* Copy kernel mappings over when needed. This can also
32740 + happen within a race in page table update. In the latter
32741 + case just flush. */
32742 +
32743 + /* On Xen the line below does not always work. Needs investigating! */
32744 + /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
32745 + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
32746 + pgd += pgd_index(address);
32747 + pgd_ref = pgd_offset_k(address);
32748 + if (pgd_none(*pgd_ref))
32749 + return -1;
32750 + if (pgd_none(*pgd))
32751 + set_pgd(pgd, *pgd_ref);
32752 + else
32753 + BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
32754 +
32755 + /* Below here mismatches are bugs because these lower tables
32756 + are shared */
32757 +
32758 + pud = pud_offset(pgd, address);
32759 + pud_ref = pud_offset(pgd_ref, address);
32760 + if (pud_none(*pud_ref))
32761 + return -1;
32762 + if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
32763 + BUG();
32764 + pmd = pmd_offset(pud, address);
32765 + pmd_ref = pmd_offset(pud_ref, address);
32766 + if (pmd_none(*pmd_ref))
32767 + return -1;
32768 + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
32769 + BUG();
32770 + pte_ref = pte_offset_kernel(pmd_ref, address);
32771 + if (!pte_present(*pte_ref))
32772 + return -1;
32773 + pte = pte_offset_kernel(pmd, address);
32774 + /* Don't use pte_page here, because the mappings can point
32775 + outside mem_map, and the NUMA hash lookup cannot handle
32776 + that. */
32777 + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
32778 + BUG();
32779 + return 0;
32780 +}
32781 +
32782 +int page_fault_trace = 0;
32783 +int exception_trace = 1;
32784 +
32785 +
32786 +#define MEM_VERBOSE 1
32787 +
32788 +#ifdef MEM_VERBOSE
32789 +#define MEM_LOG(_f, _a...) \
32790 + printk("fault.c:[%d]-> " _f "\n", \
32791 + __LINE__ , ## _a )
32792 +#else
32793 +#define MEM_LOG(_f, _a...) ((void)0)
32794 +#endif
32795 +
32796 +static int spurious_fault(struct pt_regs *regs,
32797 + unsigned long address,
32798 + unsigned long error_code)
32799 +{
32800 + pgd_t *pgd;
32801 + pud_t *pud;
32802 + pmd_t *pmd;
32803 + pte_t *pte;
32804 +
32805 +#ifdef CONFIG_XEN
32806 + /* Faults in hypervisor area are never spurious. */
32807 + if ((address >= HYPERVISOR_VIRT_START) &&
32808 + (address < HYPERVISOR_VIRT_END))
32809 + return 0;
32810 +#endif
32811 +
32812 + /* Reserved-bit violation or user access to kernel space? */
32813 + if (error_code & (PF_RSVD|PF_USER))
32814 + return 0;
32815 +
32816 + pgd = init_mm.pgd + pgd_index(address);
32817 + if (!pgd_present(*pgd))
32818 + return 0;
32819 +
32820 + pud = pud_offset(pgd, address);
32821 + if (!pud_present(*pud))
32822 + return 0;
32823 +
32824 + pmd = pmd_offset(pud, address);
32825 + if (!pmd_present(*pmd))
32826 + return 0;
32827 +
32828 + pte = pte_offset_kernel(pmd, address);
32829 + if (!pte_present(*pte))
32830 + return 0;
32831 + if ((error_code & PF_WRITE) && !pte_write(*pte))
32832 + return 0;
32833 + if ((error_code & PF_INSTR) && (__pte_val(*pte) & _PAGE_NX))
32834 + return 0;
32835 +
32836 + return 1;
32837 +}
32838 +
32839 +/*
32840 + * This routine handles page faults. It determines the address,
32841 + * and the problem, and then passes it off to one of the appropriate
32842 + * routines.
32843 + */
32844 +asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
32845 + unsigned long error_code)
32846 +{
32847 + struct task_struct *tsk;
32848 + struct mm_struct *mm;
32849 + struct vm_area_struct * vma;
32850 + unsigned long address;
32851 + const struct exception_table_entry *fixup;
32852 + int write;
32853 + unsigned long flags;
32854 + siginfo_t info;
32855 +
32856 + if (!user_mode(regs))
32857 + error_code &= ~PF_USER; /* means kernel */
32858 +
32859 + tsk = current;
32860 + mm = tsk->mm;
32861 + prefetchw(&mm->mmap_sem);
32862 +
32863 + /* get the address */
32864 + address = current_vcpu_info()->arch.cr2;
32865 +
32866 + info.si_code = SEGV_MAPERR;
32867 +
32868 +
32869 + /*
32870 + * We fault-in kernel-space virtual memory on-demand. The
32871 + * 'reference' page table is init_mm.pgd.
32872 + *
32873 + * NOTE! We MUST NOT take any locks for this case. We may
32874 + * be in an interrupt or a critical region, and should
32875 + * only copy the information from the master page table,
32876 + * nothing more.
32877 + *
32878 + * This verifies that the fault happens in kernel space
32879 + * (error_code & 4) == 0, and that the fault was not a
32880 + * protection error (error_code & 9) == 0.
32881 + */
32882 + if (unlikely(address >= TASK_SIZE64)) {
32883 + /*
32884 + * Don't check for the module range here: its PML4
32885 + * is always initialized because it's shared with the main
32886 + * kernel text. Only vmalloc may need PML4 syncups.
32887 + */
32888 + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
32889 + ((address >= VMALLOC_START && address < VMALLOC_END))) {
32890 + if (vmalloc_fault(address) >= 0)
32891 + return;
32892 + }
32893 + /* Can take a spurious fault if mapping changes R/O -> R/W. */
32894 + if (spurious_fault(regs, address, error_code))
32895 + return;
32896 + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
32897 + SIGSEGV) == NOTIFY_STOP)
32898 + return;
32899 + /*
32900 + * Don't take the mm semaphore here. If we fixup a prefetch
32901 + * fault we could otherwise deadlock.
32902 + */
32903 + goto bad_area_nosemaphore;
32904 + }
32905 +
32906 + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
32907 + SIGSEGV) == NOTIFY_STOP)
32908 + return;
32909 +
32910 + if (likely(regs->eflags & X86_EFLAGS_IF))
32911 + local_irq_enable();
32912 +
32913 + if (unlikely(page_fault_trace))
32914 + printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
32915 + regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
32916 +
32917 + if (unlikely(error_code & PF_RSVD))
32918 + pgtable_bad(address, regs, error_code);
32919 +
32920 + /*
32921 + * If we're in an interrupt or have no user
32922 + * context, we must not take the fault..
32923 + */
32924 + if (unlikely(in_atomic() || !mm))
32925 + goto bad_area_nosemaphore;
32926 +
32927 + again:
32928 + /* When running in the kernel we expect faults to occur only to
32929 + * addresses in user space. All other faults represent errors in the
32930 + * kernel and should generate an OOPS. Unfortunately, in the case of an
32931 + * erroneous fault occurring in a code path which already holds mmap_sem
32932 + * we will deadlock attempting to validate the fault against the
32933 + * address space. Luckily the kernel only validly references user
32934 + * space from well defined areas of code, which are listed in the
32935 + * exceptions table.
32936 + *
32937 + * As the vast majority of faults will be valid we will only perform
32938 + * the source reference check when there is a possibility of a deadlock.
32939 + * Attempt to lock the address space, if we cannot we then validate the
32940 + * source. If this is invalid we can skip the address space check,
32941 + * thus avoiding the deadlock.
32942 + */
32943 + if (!down_read_trylock(&mm->mmap_sem)) {
32944 + if ((error_code & PF_USER) == 0 &&
32945 + !search_exception_tables(regs->rip))
32946 + goto bad_area_nosemaphore;
32947 + down_read(&mm->mmap_sem);
32948 + }
32949 +
32950 + vma = find_vma(mm, address);
32951 + if (!vma)
32952 + goto bad_area;
32953 + if (likely(vma->vm_start <= address))
32954 + goto good_area;
32955 + if (!(vma->vm_flags & VM_GROWSDOWN))
32956 + goto bad_area;
32957 + if (error_code & 4) {
32958 + /* Allow userspace just enough access below the stack pointer
32959 + * to let the 'enter' instruction work.
32960 + */
32961 + if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
32962 + goto bad_area;
32963 + }
32964 + if (expand_stack(vma, address))
32965 + goto bad_area;
32966 +/*
32967 + * Ok, we have a good vm_area for this memory access, so
32968 + * we can handle it..
32969 + */
32970 +good_area:
32971 + info.si_code = SEGV_ACCERR;
32972 + write = 0;
32973 + switch (error_code & (PF_PROT|PF_WRITE)) {
32974 + default: /* 3: write, present */
32975 + /* fall through */
32976 + case PF_WRITE: /* write, not present */
32977 + if (!(vma->vm_flags & VM_WRITE))
32978 + goto bad_area;
32979 + write++;
32980 + break;
32981 + case PF_PROT: /* read, present */
32982 + goto bad_area;
32983 + case 0: /* read, not present */
32984 + if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
32985 + goto bad_area;
32986 + }
32987 +
32988 + /*
32989 + * If for any reason at all we couldn't handle the fault,
32990 + * make sure we exit gracefully rather than endlessly redo
32991 + * the fault.
32992 + */
32993 + switch (handle_mm_fault(mm, vma, address, write)) {
32994 + case VM_FAULT_MINOR:
32995 + tsk->min_flt++;
32996 + break;
32997 + case VM_FAULT_MAJOR:
32998 + tsk->maj_flt++;
32999 + break;
33000 + case VM_FAULT_SIGBUS:
33001 + goto do_sigbus;
33002 + default:
33003 + goto out_of_memory;
33004 + }
33005 +
33006 + up_read(&mm->mmap_sem);
33007 + return;
33008 +
33009 +/*
33010 + * Something tried to access memory that isn't in our memory map..
33011 + * Fix it, but check if it's kernel or user first..
33012 + */
33013 +bad_area:
33014 + up_read(&mm->mmap_sem);
33015 +
33016 +bad_area_nosemaphore:
33017 + /* User mode accesses just cause a SIGSEGV */
33018 + if (error_code & PF_USER) {
33019 + if (is_prefetch(regs, address, error_code))
33020 + return;
33021 +
33022 + /* Work around K8 erratum #100: K8 in compat mode
33023 + occasionally jumps to illegal addresses >4GB. We
33024 + catch this here in the page fault handler because
33025 + these addresses are not reachable. Just detect this
33026 + case and return. Any code segment in LDT is
33027 + compatibility mode. */
33028 + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
33029 + (address >> 32))
33030 + return;
33031 +
33032 + if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
33033 + printk(
33034 + "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
33035 + tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
33036 + tsk->comm, tsk->pid, address, regs->rip,
33037 + regs->rsp, error_code);
33038 + }
33039 +
33040 + tsk->thread.cr2 = address;
33041 + /* Kernel addresses are always protection faults */
33042 + tsk->thread.error_code = error_code | (address >= TASK_SIZE);
33043 + tsk->thread.trap_no = 14;
33044 + info.si_signo = SIGSEGV;
33045 + info.si_errno = 0;
33046 + /* info.si_code has been set above */
33047 + info.si_addr = (void __user *)address;
33048 + force_sig_info(SIGSEGV, &info, tsk);
33049 + return;
33050 + }
33051 +
33052 +no_context:
33053 +
33054 + /* Are we prepared to handle this kernel fault? */
33055 + fixup = search_exception_tables(regs->rip);
33056 + if (fixup) {
33057 + regs->rip = fixup->fixup;
33058 + return;
33059 + }
33060 +
33061 + /*
33062 + * Hall of shame of CPU/BIOS bugs.
33063 + */
33064 +
33065 + if (is_prefetch(regs, address, error_code))
33066 + return;
33067 +
33068 + if (is_errata93(regs, address))
33069 + return;
33070 +
33071 +/*
33072 + * Oops. The kernel tried to access some bad page. We'll have to
33073 + * terminate things with extreme prejudice.
33074 + */
33075 +
33076 + flags = oops_begin();
33077 +
33078 + if (address < PAGE_SIZE)
33079 + printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
33080 + else
33081 + printk(KERN_ALERT "Unable to handle kernel paging request");
33082 + printk(" at %016lx RIP: \n" KERN_ALERT,address);
33083 + printk_address(regs->rip);
33084 + dump_pagetable(address);
33085 + tsk->thread.cr2 = address;
33086 + tsk->thread.trap_no = 14;
33087 + tsk->thread.error_code = error_code;
33088 + __die("Oops", regs, error_code);
33089 + /* Executive summary in case the body of the oops scrolled away */
33090 + printk(KERN_EMERG "CR2: %016lx\n", address);
33091 + oops_end(flags);
33092 + do_exit(SIGKILL);
33093 +
33094 +/*
33095 + * We ran out of memory, or some other thing happened to us that made
33096 + * us unable to handle the page fault gracefully.
33097 + */
33098 +out_of_memory:
33099 + up_read(&mm->mmap_sem);
33100 + if (current->pid == 1) {
33101 + yield();
33102 + goto again;
33103 + }
33104 + printk("VM: killing process %s\n", tsk->comm);
33105 + if (error_code & 4)
33106 + do_exit(SIGKILL);
33107 + goto no_context;
33108 +
33109 +do_sigbus:
33110 + up_read(&mm->mmap_sem);
33111 +
33112 + /* Kernel mode? Handle exceptions or die */
33113 + if (!(error_code & PF_USER))
33114 + goto no_context;
33115 +
33116 + tsk->thread.cr2 = address;
33117 + tsk->thread.error_code = error_code;
33118 + tsk->thread.trap_no = 14;
33119 + info.si_signo = SIGBUS;
33120 + info.si_errno = 0;
33121 + info.si_code = BUS_ADRERR;
33122 + info.si_addr = (void __user *)address;
33123 + force_sig_info(SIGBUS, &info, tsk);
33124 + return;
33125 +}
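
The down_read_trylock() dance near the top of this function (take the lock opportunistically, and only when that fails validate regs->rip against the exception tables before blocking) is a general trylock-then-validate pattern. A loose userspace analogue with POSIX rwlocks is sketched below; validate() is a hypothetical stand-in for search_exception_tables(), and the program needs -pthread to build:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

static int validate(void)
{
        return 1;       /* "is the caller on the approved list?" */
}

static int lock_for_fault(void)
{
        if (pthread_rwlock_tryrdlock(&mmap_sem) != 0) {
                if (!validate())
                        return -1;      /* bail out, like bad_area_nosemaphore */
                pthread_rwlock_rdlock(&mmap_sem);
        }
        return 0;
}

int main(void)
{
        if (lock_for_fault() == 0) {
                puts("got mmap_sem for reading");
                pthread_rwlock_unlock(&mmap_sem);
        }
        return 0;
}
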
33126 +
33127 +DEFINE_SPINLOCK(pgd_lock);
33128 +struct page *pgd_list;
33129 +
33130 +void vmalloc_sync_all(void)
33131 +{
33132 + /* Note that races in the updates of insync and start aren't
33133 + problematic:
33134 + insync can only get set bits added, and updates to start are only
33135 + improving performance (without affecting correctness if undone). */
33136 + static DECLARE_BITMAP(insync, PTRS_PER_PGD);
33137 + static unsigned long start = VMALLOC_START & PGDIR_MASK;
33138 + unsigned long address;
33139 +
33140 + for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
33141 + if (!test_bit(pgd_index(address), insync)) {
33142 + const pgd_t *pgd_ref = pgd_offset_k(address);
33143 + struct page *page;
33144 +
33145 + if (pgd_none(*pgd_ref))
33146 + continue;
33147 + spin_lock(&pgd_lock);
33148 + for (page = pgd_list; page;
33149 + page = (struct page *)page->index) {
33150 + pgd_t *pgd;
33151 + pgd = (pgd_t *)page_address(page) + pgd_index(address);
33152 + if (pgd_none(*pgd))
33153 + set_pgd(pgd, *pgd_ref);
33154 + else
33155 + BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
33156 + }
33157 + spin_unlock(&pgd_lock);
33158 + set_bit(pgd_index(address), insync);
33159 + }
33160 + if (address == start)
33161 + start = address + PGDIR_SIZE;
33162 + }
33163 + /* Check that there is no need to do the same for the modules area. */
33164 + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
33165 + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
33166 + (__START_KERNEL & PGDIR_MASK)));
33167 +}
33168 +
33169 +static int __init enable_pagefaulttrace(char *str)
33170 +{
33171 + page_fault_trace = 1;
33172 + return 1;
33173 +}
33174 +__setup("pagefaulttrace", enable_pagefaulttrace);
33175 Index: head-2008-11-25/arch/x86/mm/init_64-xen.c
33176 ===================================================================
33177 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
33178 +++ head-2008-11-25/arch/x86/mm/init_64-xen.c 2008-10-29 09:55:56.000000000 +0100
33179 @@ -0,0 +1,1206 @@
33180 +/*
33181 + * linux/arch/x86_64/mm/init.c
33182 + *
33183 + * Copyright (C) 1995 Linus Torvalds
33184 + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
33185 + * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
33186 + *
33187 + * Jun Nakajima <jun.nakajima@intel.com>
33188 + * Modified for Xen.
33189 + */
33190 +
33191 +#include <linux/signal.h>
33192 +#include <linux/sched.h>
33193 +#include <linux/kernel.h>
33194 +#include <linux/errno.h>
33195 +#include <linux/string.h>
33196 +#include <linux/types.h>
33197 +#include <linux/ptrace.h>
33198 +#include <linux/mman.h>
33199 +#include <linux/mm.h>
33200 +#include <linux/swap.h>
33201 +#include <linux/smp.h>
33202 +#include <linux/init.h>
33203 +#include <linux/pagemap.h>
33204 +#include <linux/bootmem.h>
33205 +#include <linux/proc_fs.h>
33206 +#include <linux/pci.h>
33207 +#include <linux/poison.h>
33208 +#include <linux/dma-mapping.h>
33209 +#include <linux/module.h>
33210 +#include <linux/memory_hotplug.h>
33211 +
33212 +#include <asm/processor.h>
33213 +#include <asm/system.h>
33214 +#include <asm/uaccess.h>
33215 +#include <asm/pgtable.h>
33216 +#include <asm/pgalloc.h>
33217 +#include <asm/dma.h>
33218 +#include <asm/fixmap.h>
33219 +#include <asm/e820.h>
33220 +#include <asm/apic.h>
33221 +#include <asm/tlb.h>
33222 +#include <asm/mmu_context.h>
33223 +#include <asm/proto.h>
33224 +#include <asm/smp.h>
33225 +#include <asm/sections.h>
33226 +
33227 +#include <xen/features.h>
33228 +
33229 +#ifndef Dprintk
33230 +#define Dprintk(x...)
33231 +#endif
33232 +
33233 +struct dma_mapping_ops* dma_ops;
33234 +EXPORT_SYMBOL(dma_ops);
33235 +
33236 +#if CONFIG_XEN_COMPAT <= 0x030002
33237 +unsigned int __kernel_page_user;
33238 +EXPORT_SYMBOL(__kernel_page_user);
33239 +#endif
33240 +
33241 +int after_bootmem;
33242 +
33243 +static unsigned long dma_reserve __initdata;
33244 +
33245 +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
33246 +extern unsigned long start_pfn;
33247 +
33248 +/*
33249 + * Use this until direct mapping is established, i.e. before __va() is
33250 + * available in init_memory_mapping().
33251 + */
33252 +
33253 +#define addr_to_page(addr, page) \
33254 + (addr) &= PHYSICAL_PAGE_MASK; \
33255 + (page) = ((unsigned long *) ((unsigned long) \
33256 + (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
33257 + __START_KERNEL_map)))
33258 +
33259 +static void __meminit early_make_page_readonly(void *va, unsigned int feature)
33260 +{
33261 + unsigned long addr, _va = (unsigned long)va;
33262 + pte_t pte, *ptep;
33263 + unsigned long *page = (unsigned long *) init_level4_pgt;
33264 +
33265 + BUG_ON(after_bootmem);
33266 +
33267 + if (xen_feature(feature))
33268 + return;
33269 +
33270 + addr = (unsigned long) page[pgd_index(_va)];
33271 + addr_to_page(addr, page);
33272 +
33273 + addr = page[pud_index(_va)];
33274 + addr_to_page(addr, page);
33275 +
33276 + addr = page[pmd_index(_va)];
33277 + addr_to_page(addr, page);
33278 +
33279 + ptep = (pte_t *) &page[pte_index(_va)];
33280 +
33281 + pte.pte = ptep->pte & ~_PAGE_RW;
33282 + if (HYPERVISOR_update_va_mapping(_va, pte, 0))
33283 + BUG();
33284 +}
33285 +
33286 +static void __make_page_readonly(void *va)
33287 +{
33288 + pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
33289 + unsigned long addr = (unsigned long) va;
33290 +
33291 + pgd = pgd_offset_k(addr);
33292 + pud = pud_offset(pgd, addr);
33293 + pmd = pmd_offset(pud, addr);
33294 + ptep = pte_offset_kernel(pmd, addr);
33295 +
33296 + pte.pte = ptep->pte & ~_PAGE_RW;
33297 + if (HYPERVISOR_update_va_mapping(addr, pte, 0))
33298 + xen_l1_entry_update(ptep, pte); /* fallback */
33299 +
33300 + if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
33301 + __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT));
33302 +}
33303 +
33304 +static void __make_page_writable(void *va)
33305 +{
33306 + pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
33307 + unsigned long addr = (unsigned long) va;
33308 +
33309 + pgd = pgd_offset_k(addr);
33310 + pud = pud_offset(pgd, addr);
33311 + pmd = pmd_offset(pud, addr);
33312 + ptep = pte_offset_kernel(pmd, addr);
33313 +
33314 + pte.pte = ptep->pte | _PAGE_RW;
33315 + if (HYPERVISOR_update_va_mapping(addr, pte, 0))
33316 + xen_l1_entry_update(ptep, pte); /* fallback */
33317 +
33318 + if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
33319 + __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT));
33320 +}
33321 +
33322 +void make_page_readonly(void *va, unsigned int feature)
33323 +{
33324 + if (!xen_feature(feature))
33325 + __make_page_readonly(va);
33326 +}
33327 +
33328 +void make_page_writable(void *va, unsigned int feature)
33329 +{
33330 + if (!xen_feature(feature))
33331 + __make_page_writable(va);
33332 +}
33333 +
33334 +void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
33335 +{
33336 + if (xen_feature(feature))
33337 + return;
33338 +
33339 + while (nr-- != 0) {
33340 + __make_page_readonly(va);
33341 + va = (void*)((unsigned long)va + PAGE_SIZE);
33342 + }
33343 +}
33344 +
33345 +void make_pages_writable(void *va, unsigned nr, unsigned int feature)
33346 +{
33347 + if (xen_feature(feature))
33348 + return;
33349 +
33350 + while (nr-- != 0) {
33351 + __make_page_writable(va);
33352 + va = (void*)((unsigned long)va + PAGE_SIZE);
33353 + }
33354 +}
33355 +
33356 +/*
33357 + * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
33358 + * physical space so we can cache the place of the first one and move
33359 + * around without checking the pgd every time.
33360 + */
33361 +
33362 +void show_mem(void)
33363 +{
33364 + long i, total = 0, reserved = 0;
33365 + long shared = 0, cached = 0;
33366 + pg_data_t *pgdat;
33367 + struct page *page;
33368 +
33369 + printk(KERN_INFO "Mem-info:\n");
33370 + show_free_areas();
33371 + printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
33372 +
33373 + for_each_online_pgdat(pgdat) {
33374 + for (i = 0; i < pgdat->node_spanned_pages; ++i) {
33375 + page = pfn_to_page(pgdat->node_start_pfn + i);
33376 + total++;
33377 + if (PageReserved(page))
33378 + reserved++;
33379 + else if (PageSwapCache(page))
33380 + cached++;
33381 + else if (page_count(page))
33382 + shared += page_count(page) - 1;
33383 + }
33384 + }
33385 + printk(KERN_INFO "%lu pages of RAM\n", total);
33386 + printk(KERN_INFO "%lu reserved pages\n",reserved);
33387 + printk(KERN_INFO "%lu pages shared\n",shared);
33388 + printk(KERN_INFO "%lu pages swap cached\n",cached);
33389 +}
33390 +
33391 +
33392 +static __init void *spp_getpage(void)
33393 +{
33394 + void *ptr;
33395 + if (after_bootmem)
33396 + ptr = (void *) get_zeroed_page(GFP_ATOMIC);
33397 + else if (start_pfn < table_end) {
33398 + ptr = __va(start_pfn << PAGE_SHIFT);
33399 + start_pfn++;
33400 + memset(ptr, 0, PAGE_SIZE);
33401 + } else
33402 + ptr = alloc_bootmem_pages(PAGE_SIZE);
33403 + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
33404 + panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
33405 +
33406 + Dprintk("spp_getpage %p\n", ptr);
33407 + return ptr;
33408 +}
33409 +
33410 +#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
33411 +#define pud_offset_u(address) (level3_user_pgt + pud_index(address))
33412 +
33413 +static __init void set_pte_phys(unsigned long vaddr,
33414 + unsigned long phys, pgprot_t prot, int user_mode)
33415 +{
33416 + pgd_t *pgd;
33417 + pud_t *pud;
33418 + pmd_t *pmd;
33419 + pte_t *pte, new_pte;
33420 +
33421 + Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
33422 +
33423 + pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
33424 + if (pgd_none(*pgd)) {
33425 + printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
33426 + return;
33427 + }
33428 + pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
33429 + if (pud_none(*pud)) {
33430 + pmd = (pmd_t *) spp_getpage();
33431 + make_page_readonly(pmd, XENFEAT_writable_page_tables);
33432 + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
33433 + if (pmd != pmd_offset(pud, 0)) {
33434 + printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
33435 + return;
33436 + }
33437 + }
33438 + pmd = pmd_offset(pud, vaddr);
33439 + if (pmd_none(*pmd)) {
33440 + pte = (pte_t *) spp_getpage();
33441 + make_page_readonly(pte, XENFEAT_writable_page_tables);
33442 + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
33443 + if (pte != pte_offset_kernel(pmd, 0)) {
33444 + printk("PAGETABLE BUG #02!\n");
33445 + return;
33446 + }
33447 + }
33448 + if (pgprot_val(prot))
33449 + new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
33450 + else
33451 + new_pte = __pte(0);
33452 +
33453 + pte = pte_offset_kernel(pmd, vaddr);
33454 + if (!pte_none(*pte) && __pte_val(new_pte) &&
33455 + __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
33456 + pte_ERROR(*pte);
33457 + set_pte(pte, new_pte);
33458 +
33459 + /*
33460 + * It's enough to flush this one mapping.
33461 + * (PGE mappings get flushed as well)
33462 + */
33463 + __flush_tlb_one(vaddr);
33464 +}
33465 +
33466 +static __init void set_pte_phys_ma(unsigned long vaddr,
33467 + unsigned long phys, pgprot_t prot)
33468 +{
33469 + pgd_t *pgd;
33470 + pud_t *pud;
33471 + pmd_t *pmd;
33472 + pte_t *pte, new_pte;
33473 +
33474 + Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
33475 +
33476 + pgd = pgd_offset_k(vaddr);
33477 + if (pgd_none(*pgd)) {
33478 + printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
33479 + return;
33480 + }
33481 + pud = pud_offset(pgd, vaddr);
33482 + if (pud_none(*pud)) {
33483 +
33484 + pmd = (pmd_t *) spp_getpage();
33485 + make_page_readonly(pmd, XENFEAT_writable_page_tables);
33486 + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
33487 + if (pmd != pmd_offset(pud, 0)) {
33488 + printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
33489 + return;
33490 + }
33491 + }
33492 + pmd = pmd_offset(pud, vaddr);
33493 + if (pmd_none(*pmd)) {
33494 + pte = (pte_t *) spp_getpage();
33495 + make_page_readonly(pte, XENFEAT_writable_page_tables);
33496 + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
33497 + if (pte != pte_offset_kernel(pmd, 0)) {
33498 + printk("PAGETABLE BUG #02!\n");
33499 + return;
33500 + }
33501 + }
33502 + new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
33503 +
33504 + pte = pte_offset_kernel(pmd, vaddr);
33505 + if (!pte_none(*pte) && __pte_val(new_pte) &&
33506 +#ifdef CONFIG_ACPI
33507 + /* __acpi_map_table() fails to properly call clear_fixmap() */
33508 + (vaddr < __fix_to_virt(FIX_ACPI_END) ||
33509 + vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
33510 +#endif
33511 + __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
33512 + pte_ERROR(*pte);
33513 + set_pte(pte, new_pte);
33514 +
33515 + /*
33516 + * It's enough to flush this one mapping.
33517 + * (PGE mappings get flushed as well)
33518 + */
33519 + __flush_tlb_one(vaddr);
33520 +}
33521 +
33522 +/* NOTE: this is meant to be run only at boot */
33523 +void __init
33524 +__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
33525 +{
33526 + unsigned long address = __fix_to_virt(idx);
33527 +
33528 + if (idx >= __end_of_fixed_addresses) {
33529 + printk("Invalid __set_fixmap\n");
33530 + return;
33531 + }
33532 + switch (idx) {
33533 + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
33534 + set_pte_phys(address, phys, prot, 0);
33535 + set_pte_phys(address, phys, prot, 1);
33536 + break;
33537 + default:
33538 + set_pte_phys_ma(address, phys, prot);
33539 + break;
33540 + }
33541 +}
33542 +
33543 +unsigned long __initdata table_start, table_end;
33544 +
33545 +static __meminit void *alloc_static_page(unsigned long *phys)
33546 +{
33547 + unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
33548 +
33549 + if (after_bootmem) {
33550 + void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
33551 +
33552 + *phys = __pa(adr);
33553 + return adr;
33554 + }
33555 +
33556 + *phys = start_pfn << PAGE_SHIFT;
33557 + start_pfn++;
33558 + memset((void *)va, 0, PAGE_SIZE);
33559 + return (void *)va;
33560 +}
33561 +
33562 +#define PTE_SIZE PAGE_SIZE
33563 +
33564 +static inline int make_readonly(unsigned long paddr)
33565 +{
33566 + extern char __vsyscall_0;
33567 + int readonly = 0;
33568 +
33569 + /* Make new page tables read-only. */
33570 + if (!xen_feature(XENFEAT_writable_page_tables)
33571 + && (paddr >= (table_start << PAGE_SHIFT))
33572 + && (paddr < (table_end << PAGE_SHIFT)))
33573 + readonly = 1;
33574 + /* Make old page tables read-only. */
33575 + if (!xen_feature(XENFEAT_writable_page_tables)
33576 + && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
33577 + && (paddr < (start_pfn << PAGE_SHIFT)))
33578 + readonly = 1;
33579 +
33580 + /*
33581 + * No need for writable mapping of kernel image. This also ensures that
33582 + * page and descriptor tables embedded inside don't have writable
33583 + * mappings. Exclude the vsyscall area here, allowing alternative
33584 + * instruction patching to work.
33585 + */
33586 + if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end))
33587 + && !(paddr >= __pa_symbol(&__vsyscall_0)
33588 + && paddr < __pa_symbol(&__vsyscall_0) + PAGE_SIZE))
33589 + readonly = 1;
33590 +
33591 + return readonly;
33592 +}
33593 +
33594 +#ifndef CONFIG_XEN
33595 +/* Must run before zap_low_mappings */
33596 +__init void *early_ioremap(unsigned long addr, unsigned long size)
33597 +{
33598 + unsigned long map = round_down(addr, LARGE_PAGE_SIZE);
33599 +
33600 + /* actually usually some more */
33601 + if (size >= LARGE_PAGE_SIZE) {
33602 + printk("SMBIOS area too long %lu\n", size);
33603 + return NULL;
33604 + }
33605 + set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
33606 + map += LARGE_PAGE_SIZE;
33607 + set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
33608 + __flush_tlb();
33609 + return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
33610 +}
33611 +
33612 +/* To avoid virtual aliases later */
33613 +__init void early_iounmap(void *addr, unsigned long size)
33614 +{
33615 + if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
33616 + printk("early_iounmap: bad address %p\n", addr);
33617 + set_pmd(temp_mappings[0].pmd, __pmd(0));
33618 + set_pmd(temp_mappings[1].pmd, __pmd(0));
33619 + __flush_tlb();
33620 +}
33621 +#endif
33622 +
33623 +static void __meminit
33624 +phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
33625 +{
33626 + int i, k;
33627 +
33628 + for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
33629 + unsigned long pte_phys;
33630 + pte_t *pte, *pte_save;
33631 +
33632 + if (address >= end)
33633 + break;
33634 + pte = alloc_static_page(&pte_phys);
33635 + pte_save = pte;
33636 + for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
33637 + unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;
33638 +
33639 + if (address >= (after_bootmem
33640 + ? end
33641 + : xen_start_info->nr_pages << PAGE_SHIFT))
33642 + pteval = 0;
33643 + else if (make_readonly(address))
33644 + pteval &= ~_PAGE_RW;
33645 + set_pte(pte, __pte(pteval & __supported_pte_mask));
33646 + }
33647 + if (!after_bootmem) {
33648 + early_make_page_readonly(pte_save, XENFEAT_writable_page_tables);
33649 + *pmd = __pmd(pte_phys | _KERNPG_TABLE);
33650 + } else {
33651 + make_page_readonly(pte_save, XENFEAT_writable_page_tables);
33652 + set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
33653 + }
33654 + }
33655 +}
33656 +
33657 +static void __meminit
33658 +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
33659 +{
33660 + pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
33661 +
33662 + if (pmd_none(*pmd)) {
33663 + spin_lock(&init_mm.page_table_lock);
33664 + phys_pmd_init(pmd, address, end);
33665 + spin_unlock(&init_mm.page_table_lock);
33666 + __flush_tlb_all();
33667 + }
33668 +}
33669 +
33670 +static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
33671 +{
33672 + long i = pud_index(address);
33673 +
33674 + pud = pud + i;
33675 +
33676 + if (after_bootmem && pud_val(*pud)) {
33677 + phys_pmd_update(pud, address, end);
33678 + return;
33679 + }
33680 +
33681 + for (; i < PTRS_PER_PUD; pud++, i++) {
33682 + unsigned long paddr, pmd_phys;
33683 + pmd_t *pmd;
33684 +
33685 + paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
33686 + if (paddr >= end)
33687 + break;
33688 +
33689 + pmd = alloc_static_page(&pmd_phys);
33690 +
33691 + spin_lock(&init_mm.page_table_lock);
33692 + *pud = __pud(pmd_phys | _KERNPG_TABLE);
33693 + phys_pmd_init(pmd, paddr, end);
33694 + spin_unlock(&init_mm.page_table_lock);
33695 +
33696 + early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
33697 + }
33698 + __flush_tlb();
33699 +}
33700 +
33701 +void __init xen_init_pt(void)
33702 +{
33703 + unsigned long addr, *page;
33704 +
33705 + /* Find the initial pte page that was built for us. */
33706 + page = (unsigned long *)xen_start_info->pt_base;
33707 + addr = page[pgd_index(__START_KERNEL_map)];
33708 + addr_to_page(addr, page);
33709 + addr = page[pud_index(__START_KERNEL_map)];
33710 + addr_to_page(addr, page);
33711 +
33712 +#if CONFIG_XEN_COMPAT <= 0x030002
33713 + /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER
33714 + in kernel PTEs. We check that here. */
33715 + if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) {
33716 + unsigned long *pg;
33717 + pte_t pte;
33718 +
33719 + /* Mess with the initial mapping of page 0. It's not needed. */
33720 + BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map);
33721 + addr = page[pmd_index(__START_KERNEL_map)];
33722 + addr_to_page(addr, pg);
33723 + pte.pte = pg[pte_index(__START_KERNEL_map)];
33724 + BUG_ON(!(pte.pte & _PAGE_PRESENT));
33725 +
33726 + /* If _PAGE_USER isn't set, we obviously do not need it. */
33727 + if (pte.pte & _PAGE_USER) {
33728 + /* _PAGE_USER is needed, but is it set implicitly? */
33729 + pte.pte &= ~_PAGE_USER;
33730 + if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map,
33731 + pte, 0) != 0) ||
33732 + !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER))
33733 + /* We need to explicitly specify _PAGE_USER. */
33734 + __kernel_page_user = _PAGE_USER;
33735 + }
33736 + }
33737 +#endif
33738 +
33739 + /* Construct mapping of initial pte page in our own directories. */
33740 + init_level4_pgt[pgd_index(__START_KERNEL_map)] =
33741 + __pgd(__pa_symbol(level3_kernel_pgt) | _PAGE_TABLE);
33742 + level3_kernel_pgt[pud_index(__START_KERNEL_map)] =
33743 + __pud(__pa_symbol(level2_kernel_pgt) | _PAGE_TABLE);
33744 + memcpy(level2_kernel_pgt, page, PAGE_SIZE);
33745 +
33746 + __user_pgd(init_level4_pgt)[pgd_index(VSYSCALL_START)] =
33747 + __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
33748 +
33749 + early_make_page_readonly(init_level4_pgt,
33750 + XENFEAT_writable_page_tables);
33751 + early_make_page_readonly(__user_pgd(init_level4_pgt),
33752 + XENFEAT_writable_page_tables);
33753 + early_make_page_readonly(level3_kernel_pgt,
33754 + XENFEAT_writable_page_tables);
33755 + early_make_page_readonly(level3_user_pgt,
33756 + XENFEAT_writable_page_tables);
33757 + early_make_page_readonly(level2_kernel_pgt,
33758 + XENFEAT_writable_page_tables);
33759 +
33760 + if (!xen_feature(XENFEAT_writable_page_tables)) {
33761 + xen_pgd_pin(__pa_symbol(init_level4_pgt));
33762 + xen_pgd_pin(__pa_symbol(__user_pgd(init_level4_pgt)));
33763 + }
33764 +}
33765 +
33766 +static void __init extend_init_mapping(unsigned long tables_space)
33767 +{
33768 + unsigned long va = __START_KERNEL_map;
33769 + unsigned long phys, addr, *pte_page;
33770 + pmd_t *pmd;
33771 + pte_t *pte, new_pte;
33772 + unsigned long *page = (unsigned long *)init_level4_pgt;
33773 +
33774 + addr = page[pgd_index(va)];
33775 + addr_to_page(addr, page);
33776 + addr = page[pud_index(va)];
33777 + addr_to_page(addr, page);
33778 +
33779 + /* Kill mapping of low 1MB. */
33780 + while (va < (unsigned long)&_text) {
33781 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
33782 + BUG();
33783 + va += PAGE_SIZE;
33784 + }
33785 +
33786 + /* Ensure init mappings cover kernel text/data and initial tables. */
33787 + while (va < (__START_KERNEL_map
33788 + + (start_pfn << PAGE_SHIFT)
33789 + + tables_space)) {
33790 + pmd = (pmd_t *)&page[pmd_index(va)];
33791 + if (pmd_none(*pmd)) {
33792 + pte_page = alloc_static_page(&phys);
33793 + early_make_page_readonly(
33794 + pte_page, XENFEAT_writable_page_tables);
33795 + set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
33796 + } else {
33797 + addr = page[pmd_index(va)];
33798 + addr_to_page(addr, pte_page);
33799 + }
33800 + pte = (pte_t *)&pte_page[pte_index(va)];
33801 + if (pte_none(*pte)) {
33802 + new_pte = pfn_pte(
33803 + (va - __START_KERNEL_map) >> PAGE_SHIFT,
33804 + __pgprot(_KERNPG_TABLE));
33805 + xen_l1_entry_update(pte, new_pte);
33806 + }
33807 + va += PAGE_SIZE;
33808 + }
33809 +
33810 + /* Finally, blow away any spurious initial mappings. */
33811 + while (1) {
33812 + pmd = (pmd_t *)&page[pmd_index(va)];
33813 + if (pmd_none(*pmd))
33814 + break;
33815 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
33816 + BUG();
33817 + va += PAGE_SIZE;
33818 + }
33819 +}
33820 +
33821 +static void __init find_early_table_space(unsigned long end)
33822 +{
33823 + unsigned long puds, pmds, ptes, tables;
33824 +
33825 + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
33826 + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
33827 + ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
33828 +
33829 + tables = round_up(puds * 8, PAGE_SIZE) +
33830 + round_up(pmds * 8, PAGE_SIZE) +
33831 + round_up(ptes * 8, PAGE_SIZE);
33832 +
33833 + extend_init_mapping(tables);
33834 +
33835 + table_start = start_pfn;
33836 + table_end = table_start + (tables>>PAGE_SHIFT);
33837 +
33838 + early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
33839 + end, table_start << PAGE_SHIFT,
33840 + (table_start << PAGE_SHIFT) + tables);
33841 +}
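
The sizing arithmetic above reserves one 8-byte entry per PUD, PMD and PTE needed to map end bytes of memory, rounding each level up to whole pages. The same computation in standalone form (constants assume 4 KiB pages and the usual 4-level shifts; illustrative only):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define PUD_SHIFT  30
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static unsigned long round_up_to_page(unsigned long x)
{
        return (x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

int main(void)
{
        unsigned long end  = 1UL << 32;                         /* map 4 GiB */
        unsigned long puds = (end + (1UL << PUD_SHIFT) - 1) >> PUD_SHIFT;
        unsigned long pmds = (end + (1UL << PMD_SHIFT) - 1) >> PMD_SHIFT;
        unsigned long ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
        unsigned long tables = round_up_to_page(puds * 8) +
                               round_up_to_page(pmds * 8) +
                               round_up_to_page(ptes * 8);

        printf("%lu PUDs, %lu PMDs, %lu PTEs -> %lu bytes of tables\n",
               puds, pmds, ptes, tables);
        return 0;
}
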
33842 +
33843 +static void xen_finish_init_mapping(void)
33844 +{
33845 + unsigned long i, start, end;
33846 +
33847 + /* Re-vector virtual addresses pointing into the initial
33848 + mapping to the just-established permanent ones. */
33849 + xen_start_info = __va(__pa(xen_start_info));
33850 + xen_start_info->pt_base = (unsigned long)
33851 + __va(__pa(xen_start_info->pt_base));
33852 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
33853 + phys_to_machine_mapping =
33854 + __va(__pa(xen_start_info->mfn_list));
33855 + xen_start_info->mfn_list = (unsigned long)
33856 + phys_to_machine_mapping;
33857 + }
33858 + if (xen_start_info->mod_start)
33859 + xen_start_info->mod_start = (unsigned long)
33860 + __va(__pa(xen_start_info->mod_start));
33861 +
33862 + /* Destroy the Xen-created mappings beyond the kernel image as
33863 + * well as the temporary mappings created above. Prevents
33864 + * overlap with modules area (if init mapping is very big).
33865 + */
33866 + start = PAGE_ALIGN((unsigned long)_end);
33867 + end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
33868 + for (; start < end; start += PAGE_SIZE)
33869 + if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
33870 + BUG();
33871 +
33872 + /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
33873 + table_end = ~0UL;
33874 +
33875 + /*
33876 + * Prefetch pte's for the bt_ioremap() area. It gets used before the
33877 + * boot-time allocator is online, so allocate-on-demand would fail.
33878 + */
33879 + for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
33880 + __set_fixmap(i, 0, __pgprot(0));
33881 +
33882 + /* Switch to the real shared_info page, and clear the dummy page. */
33883 + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
33884 + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
33885 + memset(empty_zero_page, 0, sizeof(empty_zero_page));
33886 +
33887 + /* Set up mapping of lowest 1MB of physical memory. */
33888 + for (i = 0; i < NR_FIX_ISAMAPS; i++)
33889 + if (is_initial_xendomain())
33890 + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
33891 + else
33892 + __set_fixmap(FIX_ISAMAP_BEGIN - i,
33893 + virt_to_mfn(empty_zero_page)
33894 + << PAGE_SHIFT,
33895 + PAGE_KERNEL_RO);
33896 +
33897 + /* Disable the 'start_pfn' allocator. */
33898 + table_end = start_pfn;
33899 +}
33900 +
33901 +/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
33902 + This runs before bootmem is initialized and gets pages directly from the
33903 + physical memory. To access them they are temporarily mapped. */
33904 +void __meminit init_memory_mapping(unsigned long start, unsigned long end)
33905 +{
33906 + unsigned long next;
33907 +
33908 + Dprintk("init_memory_mapping\n");
33909 +
33910 + /*
33911 + * Find space for the kernel direct mapping tables.
33912 + * Later we should allocate these tables in the local node of the memory
33913 + * mapped. Unfortunately this is done currently before the nodes are
33914 + * discovered.
33915 + */
33916 + if (!after_bootmem)
33917 + find_early_table_space(end);
33918 +
33919 + start = (unsigned long)__va(start);
33920 + end = (unsigned long)__va(end);
33921 +
33922 + for (; start < end; start = next) {
33923 + unsigned long pud_phys;
33924 + pgd_t *pgd = pgd_offset_k(start);
33925 + pud_t *pud;
33926 +
33927 + if (after_bootmem)
33928 + pud = pud_offset(pgd, start & PGDIR_MASK);
33929 + else
33930 + pud = alloc_static_page(&pud_phys);
33931 + next = start + PGDIR_SIZE;
33932 + if (next > end)
33933 + next = end;
33934 + phys_pud_init(pud, __pa(start), __pa(next));
33935 + if (!after_bootmem) {
33936 + early_make_page_readonly(pud, XENFEAT_writable_page_tables);
33937 + set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
33938 + }
33939 + }
33940 +
33941 + if (!after_bootmem) {
33942 + BUG_ON(start_pfn != table_end);
33943 + xen_finish_init_mapping();
33944 + }
33945 +
33946 + __flush_tlb_all();
33947 +}
33948 +
33949 +void __cpuinit zap_low_mappings(int cpu)
33950 +{
33951 + /* this is not required for Xen */
33952 +#if 0
33953 + swap_low_mappings();
33954 +#endif
33955 +}
33956 +
33957 +/* Compute zone sizes for the DMA and DMA32 zones in a node. */
33958 +__init void
33959 +size_zones(unsigned long *z, unsigned long *h,
33960 + unsigned long start_pfn, unsigned long end_pfn)
33961 +{
33962 + int i;
33963 + unsigned long w;
33964 +
33965 + for (i = 0; i < MAX_NR_ZONES; i++)
33966 + z[i] = 0;
33967 +
33968 + if (start_pfn < MAX_DMA_PFN)
33969 + z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
33970 + if (start_pfn < MAX_DMA32_PFN) {
33971 + unsigned long dma32_pfn = MAX_DMA32_PFN;
33972 + if (dma32_pfn > end_pfn)
33973 + dma32_pfn = end_pfn;
33974 + z[ZONE_DMA32] = dma32_pfn - start_pfn;
33975 + }
33976 + z[ZONE_NORMAL] = end_pfn - start_pfn;
33977 +
33978 + /* Remove lower zones from higher ones. */
33979 + w = 0;
33980 + for (i = 0; i < MAX_NR_ZONES; i++) {
33981 + if (z[i])
33982 + z[i] -= w;
33983 + w += z[i];
33984 + }
33985 +
33986 + /* Compute holes */
33987 + w = start_pfn;
33988 + for (i = 0; i < MAX_NR_ZONES; i++) {
33989 + unsigned long s = w;
33990 + w += z[i];
33991 + h[i] = e820_hole_size(s, w);
33992 + }
33993 +
33994 +	/* Add the space needed for mem_map to the holes too. */
33995 + for (i = 0; i < MAX_NR_ZONES; i++)
33996 + h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
33997 +
33998 + /* The 16MB DMA zone has the kernel and other misc mappings.
33999 + Account them too */
34000 + if (h[ZONE_DMA]) {
34001 + h[ZONE_DMA] += dma_reserve;
34002 + if (h[ZONE_DMA] >= z[ZONE_DMA]) {
34003 + printk(KERN_WARNING
34004 + "Kernel too large and filling up ZONE_DMA?\n");
34005 + h[ZONE_DMA] = z[ZONE_DMA];
34006 + }
34007 + }
34008 +}
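/*
 * Worked example (editor's illustration, not part of the patch), assuming a
 * single node with start_pfn = 0 and end_pfn = 0x200000 (8GB of 4K pages):
 * the cumulative sizes computed above are
 *	z[ZONE_DMA]    = MAX_DMA_PFN   = 0x1000   (16MB)
 *	z[ZONE_DMA32]  = MAX_DMA32_PFN = 0x100000 (4GB)
 *	z[ZONE_NORMAL] = end_pfn       = 0x200000 (8GB)
 * and the "remove lower zones from higher ones" loop turns them into
 * non-overlapping spans:
 *	z[ZONE_DMA]    = 0x1000
 *	z[ZONE_DMA32]  = 0x100000 - 0x1000   = 0xff000
 *	z[ZONE_NORMAL] = 0x200000 - 0x100000 = 0x100000
 * The hole pass then subtracts the e820 gaps and the per-zone mem_map cost
 * from each span, and dma_reserve is charged to ZONE_DMA.
 */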
34009 +
34010 +#ifndef CONFIG_NUMA
34011 +void __init paging_init(void)
34012 +{
34013 + unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
34014 +
34015 + memory_present(0, 0, end_pfn);
34016 + sparse_init();
34017 + size_zones(zones, holes, 0, end_pfn);
34018 + free_area_init_node(0, NODE_DATA(0), zones,
34019 + __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
34020 +
34021 + init_mm.context.pinned = 1;
34022 +}
34023 +#endif
34024 +
34025 +/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
34026 + from the CPU leading to inconsistent cache lines. address and size
34027 + must be aligned to 2MB boundaries.
34028 + Does nothing when the mapping doesn't exist. */
34029 +void __init clear_kernel_mapping(unsigned long address, unsigned long size)
34030 +{
34031 + unsigned long end = address + size;
34032 +
34033 + BUG_ON(address & ~LARGE_PAGE_MASK);
34034 + BUG_ON(size & ~LARGE_PAGE_MASK);
34035 +
34036 + for (; address < end; address += LARGE_PAGE_SIZE) {
34037 + pgd_t *pgd = pgd_offset_k(address);
34038 + pud_t *pud;
34039 + pmd_t *pmd;
34040 + if (pgd_none(*pgd))
34041 + continue;
34042 + pud = pud_offset(pgd, address);
34043 + if (pud_none(*pud))
34044 + continue;
34045 + pmd = pmd_offset(pud, address);
34046 + if (!pmd || pmd_none(*pmd))
34047 + continue;
34048 + if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) {
34049 + /* Could handle this, but it should not happen currently. */
34050 + printk(KERN_ERR
34051 + "clear_kernel_mapping: mapping has been split. will leak memory\n");
34052 + pmd_ERROR(*pmd);
34053 + }
34054 + set_pmd(pmd, __pmd(0));
34055 + }
34056 + __flush_tlb_all();
34057 +}
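/*
 * Minimal usage sketch (editor's illustration, not part of the patch): a
 * caller such as the GART/aperture setup hands clear_kernel_mapping() a
 * 2MB-aligned virtual range in the direct mapping.  The base and size
 * below are hypothetical; the only requirement is 2MB alignment of both.
 */
static void __init example_unmap_aperture(void)
{
	unsigned long aper_base = 0x80000000UL;	/* hypothetical, 2MB aligned */
	unsigned long aper_size = 64UL << 20;	/* 64MB, multiple of 2MB */

	/* Drop the kernel linear mapping of the aperture so it cannot be
	 * cached while the uncached GART alias is in use. */
	clear_kernel_mapping((unsigned long)__va(aper_base), aper_size);
}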
34058 +
34059 +/*
34060 + * Memory hotplug specific functions
34061 + */
34062 +void online_page(struct page *page)
34063 +{
34064 + ClearPageReserved(page);
34065 + init_page_count(page);
34066 + __free_page(page);
34067 + totalram_pages++;
34068 + num_physpages++;
34069 +}
34070 +
34071 +#ifdef CONFIG_MEMORY_HOTPLUG
34072 +/*
34073 + * XXX: memory_add_physaddr_to_nid() is meant to find the node id for a
34074 + *	physical address via the sysfs probe interface. When ACPI notifies a
34075 + *	hot-add event, the node id can be determined by searching the DSDT,
34076 + *	but the probe interface carries no node id, so return node 0 for now.
34077 + */
34078 +#ifdef CONFIG_NUMA
34079 +int memory_add_physaddr_to_nid(u64 start)
34080 +{
34081 + return 0;
34082 +}
34083 +#endif
34084 +
34085 +/*
34086 + * Memory is always added to the NORMAL zone. This means you will never get
34087 + * additional DMA/DMA32 memory.
34088 + */
34089 +int arch_add_memory(int nid, u64 start, u64 size)
34090 +{
34091 + struct pglist_data *pgdat = NODE_DATA(nid);
34092 + struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
34093 + unsigned long start_pfn = start >> PAGE_SHIFT;
34094 + unsigned long nr_pages = size >> PAGE_SHIFT;
34095 + int ret;
34096 +
34097 + ret = __add_pages(zone, start_pfn, nr_pages);
34098 + if (ret)
34099 + goto error;
34100 +
34101 + init_memory_mapping(start, (start + size -1));
34102 +
34103 + return ret;
34104 +error:
34105 + printk("%s: Problem encountered in __add_pages!\n", __func__);
34106 + return ret;
34107 +}
34108 +EXPORT_SYMBOL_GPL(arch_add_memory);
34109 +
34110 +int remove_memory(u64 start, u64 size)
34111 +{
34112 + return -EINVAL;
34113 +}
34114 +EXPORT_SYMBOL_GPL(remove_memory);
34115 +
34116 +#else /* CONFIG_MEMORY_HOTPLUG */
34117 +/*
34118 + * Memory hot-add without sparsemem: the mem_maps have been allocated in
34119 + * advance, so just online the pages.
34120 + */
34121 +int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
34122 +{
34123 + int err = -EIO;
34124 + unsigned long pfn;
34125 + unsigned long total = 0, mem = 0;
34126 + for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
34127 + if (pfn_valid(pfn)) {
34128 + online_page(pfn_to_page(pfn));
34129 + err = 0;
34130 + mem++;
34131 + }
34132 + total++;
34133 + }
34134 + if (!err) {
34135 + z->spanned_pages += total;
34136 + z->present_pages += mem;
34137 + z->zone_pgdat->node_spanned_pages += total;
34138 + z->zone_pgdat->node_present_pages += mem;
34139 + }
34140 + return err;
34141 +}
34142 +#endif /* CONFIG_MEMORY_HOTPLUG */
34143 +
34144 +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
34145 + kcore_vsyscall;
34146 +
34147 +void __init mem_init(void)
34148 +{
34149 + long codesize, reservedpages, datasize, initsize;
34150 + unsigned long pfn;
34151 +
34152 + pci_iommu_alloc();
34153 +
34154 + /* How many end-of-memory variables you have, grandma! */
34155 + max_low_pfn = end_pfn;
34156 + max_pfn = end_pfn;
34157 + num_physpages = end_pfn;
34158 + high_memory = (void *) __va(end_pfn * PAGE_SIZE);
34159 +
34160 + /* clear the zero-page */
34161 + memset(empty_zero_page, 0, PAGE_SIZE);
34162 +
34163 + reservedpages = 0;
34164 +
34165 + /* this will put all low memory onto the freelists */
34166 +#ifdef CONFIG_NUMA
34167 + totalram_pages = numa_free_all_bootmem();
34168 +#else
34169 + totalram_pages = free_all_bootmem();
34170 +#endif
34171 + /* XEN: init and count pages outside initial allocation. */
34172 + for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
34173 + ClearPageReserved(pfn_to_page(pfn));
34174 + init_page_count(pfn_to_page(pfn));
34175 + totalram_pages++;
34176 + }
34177 + reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
34178 +
34179 + after_bootmem = 1;
34180 +
34181 + codesize = (unsigned long) &_etext - (unsigned long) &_text;
34182 + datasize = (unsigned long) &_edata - (unsigned long) &_etext;
34183 + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
34184 +
34185 + /* Register memory areas for /proc/kcore */
34186 + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
34187 + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
34188 + VMALLOC_END-VMALLOC_START);
34189 + kclist_add(&kcore_kernel, &_stext, _end - _stext);
34190 + kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
34191 + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
34192 + VSYSCALL_END - VSYSCALL_START);
34193 +
34194 + printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
34195 + (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
34196 + end_pfn << (PAGE_SHIFT-10),
34197 + codesize >> 10,
34198 + reservedpages << (PAGE_SHIFT-10),
34199 + datasize >> 10,
34200 + initsize >> 10);
34201 +
34202 +#ifndef CONFIG_XEN
34203 +#ifdef CONFIG_SMP
34204 + /*
34205 + * Sync boot_level4_pgt mappings with the init_level4_pgt
34206 + * except for the low identity mappings which are already zapped
34207 + * in init_level4_pgt. This sync-up is essential for AP's bringup
34208 + */
34209 + memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
34210 +#endif
34211 +#endif
34212 +}
34213 +
34214 +void free_init_pages(char *what, unsigned long begin, unsigned long end)
34215 +{
34216 + unsigned long addr;
34217 +
34218 + if (begin >= end)
34219 + return;
34220 +
34221 + printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
34222 + for (addr = begin; addr < end; addr += PAGE_SIZE) {
34223 + ClearPageReserved(virt_to_page(addr));
34224 + init_page_count(virt_to_page(addr));
34225 + memset((void *)(addr & ~(PAGE_SIZE-1)),
34226 + POISON_FREE_INITMEM, PAGE_SIZE);
34227 + if (addr >= __START_KERNEL_map) {
34228 + /* make_readonly() reports all kernel addresses. */
34229 + __make_page_writable(__va(__pa(addr)));
34230 + if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
34231 + pgd_t *pgd = pgd_offset_k(addr);
34232 + pud_t *pud = pud_offset(pgd, addr);
34233 + pmd_t *pmd = pmd_offset(pud, addr);
34234 + pte_t *pte = pte_offset_kernel(pmd, addr);
34235 +
34236 + xen_l1_entry_update(pte, __pte(0)); /* fallback */
34237 + }
34238 + }
34239 + free_page(addr);
34240 + totalram_pages++;
34241 + }
34242 +}
34243 +
34244 +void free_initmem(void)
34245 +{
34246 + memset(__initdata_begin, POISON_FREE_INITDATA,
34247 + __initdata_end - __initdata_begin);
34248 + free_init_pages("unused kernel memory",
34249 + (unsigned long)(&__init_begin),
34250 + (unsigned long)(&__init_end));
34251 +}
34252 +
34253 +#ifdef CONFIG_DEBUG_RODATA
34254 +
34255 +void mark_rodata_ro(void)
34256 +{
34257 + unsigned long addr = (unsigned long)__start_rodata;
34258 +
34259 + for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
34260 + change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
34261 +
34262 + printk ("Write protecting the kernel read-only data: %luk\n",
34263 + (__end_rodata - __start_rodata) >> 10);
34264 +
34265 + /*
34266 + * change_page_attr_addr() requires a global_flush_tlb() call after it.
34267 + * We do this after the printk so that if something went wrong in the
34268 + * change, the printk gets out at least to give a better debug hint
34269 + * of who is the culprit.
34270 + */
34271 + global_flush_tlb();
34272 +}
34273 +#endif
34274 +
34275 +#ifdef CONFIG_BLK_DEV_INITRD
34276 +void free_initrd_mem(unsigned long start, unsigned long end)
34277 +{
34278 + free_init_pages("initrd memory", start, end);
34279 +}
34280 +#endif
34281 +
34282 +void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
34283 +{
34284 + /* Should check here against the e820 map to avoid double free */
34285 +#ifdef CONFIG_NUMA
34286 + int nid = phys_to_nid(phys);
34287 + reserve_bootmem_node(NODE_DATA(nid), phys, len);
34288 +#else
34289 + reserve_bootmem(phys, len);
34290 +#endif
34291 + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
34292 + dma_reserve += len / PAGE_SIZE;
34293 +}
34294 +
34295 +int kern_addr_valid(unsigned long addr)
34296 +{
34297 + unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
34298 + pgd_t *pgd;
34299 + pud_t *pud;
34300 + pmd_t *pmd;
34301 + pte_t *pte;
34302 +
34303 + if (above != 0 && above != -1UL)
34304 + return 0;
34305 +
34306 + pgd = pgd_offset_k(addr);
34307 + if (pgd_none(*pgd))
34308 + return 0;
34309 +
34310 + pud = pud_offset(pgd, addr);
34311 + if (pud_none(*pud))
34312 + return 0;
34313 +
34314 + pmd = pmd_offset(pud, addr);
34315 + if (pmd_none(*pmd))
34316 + return 0;
34317 + if (pmd_large(*pmd))
34318 + return pfn_valid(pmd_pfn(*pmd));
34319 +
34320 + pte = pte_offset_kernel(pmd, addr);
34321 + if (pte_none(*pte))
34322 + return 0;
34323 + return pfn_valid(pte_pfn(*pte));
34324 +}
34325 +
34326 +#ifdef CONFIG_SYSCTL
34327 +#include <linux/sysctl.h>
34328 +
34329 +extern int exception_trace, page_fault_trace;
34330 +
34331 +static ctl_table debug_table2[] = {
34332 + { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
34333 + proc_dointvec },
34334 + { 0, }
34335 +};
34336 +
34337 +static ctl_table debug_root_table2[] = {
34338 + { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
34339 + .child = debug_table2 },
34340 + { 0 },
34341 +};
34342 +
34343 +static __init int x8664_sysctl_init(void)
34344 +{
34345 + register_sysctl_table(debug_root_table2, 1);
34346 + return 0;
34347 +}
34348 +__initcall(x8664_sysctl_init);
34349 +#endif
34350 +
34351 +/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
34352 + covers the 64bit vsyscall page now. 32bit has a real VMA now and does
34353 + not need special handling anymore. */
34354 +
34355 +static struct vm_area_struct gate_vma = {
34356 + .vm_start = VSYSCALL_START,
34357 + .vm_end = VSYSCALL_END,
34358 + .vm_page_prot = PAGE_READONLY
34359 +};
34360 +
34361 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
34362 +{
34363 +#ifdef CONFIG_IA32_EMULATION
34364 + if (test_tsk_thread_flag(tsk, TIF_IA32))
34365 + return NULL;
34366 +#endif
34367 + return &gate_vma;
34368 +}
34369 +
34370 +int in_gate_area(struct task_struct *task, unsigned long addr)
34371 +{
34372 + struct vm_area_struct *vma = get_gate_vma(task);
34373 + if (!vma)
34374 + return 0;
34375 + return (addr >= vma->vm_start) && (addr < vma->vm_end);
34376 +}
34377 +
34378 +/* Use this when you have no reliable task/vma, typically from interrupt
34379 + * context. It is less reliable than using the task's vma and may give
34380 + * false positives.
34381 + */
34382 +int in_gate_area_no_task(unsigned long addr)
34383 +{
34384 + return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
34385 +}
34386 Index: head-2008-11-25/arch/x86/mm/pageattr_64-xen.c
34387 ===================================================================
34388 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
34389 +++ head-2008-11-25/arch/x86/mm/pageattr_64-xen.c 2008-07-21 11:00:32.000000000 +0200
34390 @@ -0,0 +1,502 @@
34391 +/*
34392 + * Copyright 2002 Andi Kleen, SuSE Labs.
34393 + * Thanks to Ben LaHaise for precious feedback.
34394 + */
34395 +
34396 +#include <linux/mm.h>
34397 +#include <linux/sched.h>
34398 +#include <linux/highmem.h>
34399 +#include <linux/module.h>
34400 +#include <linux/slab.h>
34401 +#include <asm/uaccess.h>
34402 +#include <asm/processor.h>
34403 +#include <asm/tlbflush.h>
34404 +#include <asm/io.h>
34405 +
34406 +#ifdef CONFIG_XEN
34407 +#include <asm/pgalloc.h>
34408 +#include <asm/mmu_context.h>
34409 +
34410 +LIST_HEAD(mm_unpinned);
34411 +DEFINE_SPINLOCK(mm_unpinned_lock);
34412 +
34413 +static void _pin_lock(struct mm_struct *mm, int lock) {
34414 + if (lock)
34415 + spin_lock(&mm->page_table_lock);
34416 +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
34417 + /* While mm->page_table_lock protects us against insertions and
34418 + * removals of higher level page table pages, it doesn't protect
34419 + * against updates of pte-s. Such updates, however, require the
34420 + * pte pages to be in consistent state (unpinned+writable or
34421 + * pinned+readonly). The pinning and attribute changes, however
34422 + * cannot be done atomically, which is why such updates must be
34423 + * prevented from happening concurrently.
34424 + * Note that no pte lock can ever elsewhere be acquired nesting
34425 + * with an already acquired one in the same mm, or with the mm's
34426 + * page_table_lock already acquired, as that would break in the
34427 + * non-split case (where all these are actually resolving to the
34428 + * one page_table_lock). Thus acquiring all of them here is not
34429 +	 * going to result in deadlocks, and the order of acquires
34430 + * doesn't matter.
34431 + */
34432 + {
34433 + pgd_t *pgd = mm->pgd;
34434 + unsigned g;
34435 +
34436 + for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
34437 + pud_t *pud;
34438 + unsigned u;
34439 +
34440 + if (pgd_none(*pgd))
34441 + continue;
34442 + pud = pud_offset(pgd, 0);
34443 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
34444 + pmd_t *pmd;
34445 + unsigned m;
34446 +
34447 + if (pud_none(*pud))
34448 + continue;
34449 + pmd = pmd_offset(pud, 0);
34450 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
34451 + spinlock_t *ptl;
34452 +
34453 + if (pmd_none(*pmd))
34454 + continue;
34455 + ptl = pte_lockptr(0, pmd);
34456 + if (lock)
34457 + spin_lock(ptl);
34458 + else
34459 + spin_unlock(ptl);
34460 + }
34461 + }
34462 + }
34463 + }
34464 +#endif
34465 + if (!lock)
34466 + spin_unlock(&mm->page_table_lock);
34467 +}
34468 +#define pin_lock(mm) _pin_lock(mm, 1)
34469 +#define pin_unlock(mm) _pin_lock(mm, 0)
34470 +
34471 +#define PIN_BATCH 8
34472 +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
34473 +
34474 +static inline unsigned int mm_walk_set_prot(void *pt, pgprot_t flags,
34475 + unsigned int cpu, unsigned int seq)
34476 +{
34477 + struct page *page = virt_to_page(pt);
34478 + unsigned long pfn = page_to_pfn(page);
34479 +
34480 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
34481 + (unsigned long)__va(pfn << PAGE_SHIFT),
34482 + pfn_pte(pfn, flags), 0);
34483 + if (unlikely(++seq == PIN_BATCH)) {
34484 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
34485 + PIN_BATCH, NULL)))
34486 + BUG();
34487 + seq = 0;
34488 + }
34489 +
34490 + return seq;
34491 +}
34492 +
34493 +static void mm_walk(struct mm_struct *mm, pgprot_t flags)
34494 +{
34495 + pgd_t *pgd;
34496 + pud_t *pud;
34497 + pmd_t *pmd;
34498 + pte_t *pte;
34499 + int g,u,m;
34500 + unsigned int cpu, seq;
34501 + multicall_entry_t *mcl;
34502 +
34503 + pgd = mm->pgd;
34504 + cpu = get_cpu();
34505 +
34506 + /*
34507 + * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
34508 + * be the 'current' task's pagetables (e.g., current may be 32-bit,
34509 + * but the pagetables may be for a 64-bit task).
34510 + * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
34511 + * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
34512 + */
34513 + for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
34514 + if (pgd_none(*pgd))
34515 + continue;
34516 + pud = pud_offset(pgd, 0);
34517 + if (PTRS_PER_PUD > 1) /* not folded */
34518 + seq = mm_walk_set_prot(pud,flags,cpu,seq);
34519 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
34520 + if (pud_none(*pud))
34521 + continue;
34522 + pmd = pmd_offset(pud, 0);
34523 + if (PTRS_PER_PMD > 1) /* not folded */
34524 + seq = mm_walk_set_prot(pmd,flags,cpu,seq);
34525 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
34526 + if (pmd_none(*pmd))
34527 + continue;
34528 + pte = pte_offset_kernel(pmd,0);
34529 + seq = mm_walk_set_prot(pte,flags,cpu,seq);
34530 + }
34531 + }
34532 + }
34533 +
34534 + mcl = per_cpu(pb_mcl, cpu);
34535 + if (unlikely(seq > PIN_BATCH - 2)) {
34536 + if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
34537 + BUG();
34538 + seq = 0;
34539 + }
34540 + MULTI_update_va_mapping(mcl + seq,
34541 + (unsigned long)__user_pgd(mm->pgd),
34542 + pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, flags),
34543 + 0);
34544 + MULTI_update_va_mapping(mcl + seq + 1,
34545 + (unsigned long)mm->pgd,
34546 + pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, flags),
34547 + UVMF_TLB_FLUSH);
34548 + if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
34549 + BUG();
34550 +
34551 + put_cpu();
34552 +}
34553 +
34554 +void mm_pin(struct mm_struct *mm)
34555 +{
34556 + if (xen_feature(XENFEAT_writable_page_tables))
34557 + return;
34558 +
34559 + pin_lock(mm);
34560 +
34561 + mm_walk(mm, PAGE_KERNEL_RO);
34562 + xen_pgd_pin(__pa(mm->pgd)); /* kernel */
34563 + xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */
34564 + mm->context.pinned = 1;
34565 + spin_lock(&mm_unpinned_lock);
34566 + list_del(&mm->context.unpinned);
34567 + spin_unlock(&mm_unpinned_lock);
34568 +
34569 + pin_unlock(mm);
34570 +}
34571 +
34572 +void mm_unpin(struct mm_struct *mm)
34573 +{
34574 + if (xen_feature(XENFEAT_writable_page_tables))
34575 + return;
34576 +
34577 + pin_lock(mm);
34578 +
34579 + xen_pgd_unpin(__pa(mm->pgd));
34580 + xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
34581 + mm_walk(mm, PAGE_KERNEL);
34582 + mm->context.pinned = 0;
34583 + spin_lock(&mm_unpinned_lock);
34584 + list_add(&mm->context.unpinned, &mm_unpinned);
34585 + spin_unlock(&mm_unpinned_lock);
34586 +
34587 + pin_unlock(mm);
34588 +}
34589 +
34590 +void mm_pin_all(void)
34591 +{
34592 + if (xen_feature(XENFEAT_writable_page_tables))
34593 + return;
34594 +
34595 + /*
34596 + * Allow uninterrupted access to the mm_unpinned list. We don't
34597 + * actually take the mm_unpinned_lock as it is taken inside mm_pin().
34598 + * All other CPUs must be at a safe point (e.g., in stop_machine
34599 + * or offlined entirely).
34600 + */
34601 + preempt_disable();
34602 + while (!list_empty(&mm_unpinned))
34603 + mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
34604 + context.unpinned));
34605 + preempt_enable();
34606 +}
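/*
 * Illustrative sketch (not part of the patch) of the calling convention the
 * comment above describes: mm_pin_all() is invoked while every other CPU is
 * quiesced, e.g. from a stop_machine callback on the suspend path.  The
 * callback name is hypothetical.
 */
static int example_pin_everything(void *unused)
{
	/* Runs with all other CPUs stopped, so the mm_unpinned list is stable. */
	mm_pin_all();
	return 0;
}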
34607 +
34608 +void _arch_dup_mmap(struct mm_struct *mm)
34609 +{
34610 + if (!mm->context.pinned)
34611 + mm_pin(mm);
34612 +}
34613 +
34614 +void _arch_exit_mmap(struct mm_struct *mm)
34615 +{
34616 + struct task_struct *tsk = current;
34617 +
34618 + task_lock(tsk);
34619 +
34620 + /*
34621 + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
34622 + * *much* faster this way, as no tlb flushes means bigger wrpt batches.
34623 + */
34624 + if (tsk->active_mm == mm) {
34625 + tsk->active_mm = &init_mm;
34626 + atomic_inc(&init_mm.mm_count);
34627 +
34628 + switch_mm(mm, &init_mm, tsk);
34629 +
34630 + atomic_dec(&mm->mm_count);
34631 + BUG_ON(atomic_read(&mm->mm_count) == 0);
34632 + }
34633 +
34634 + task_unlock(tsk);
34635 +
34636 + if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) &&
34637 + !mm->context.has_foreign_mappings )
34638 + mm_unpin(mm);
34639 +}
34640 +
34641 +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
34642 +{
34643 + struct page *pte;
34644 +
34645 + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
34646 + if (pte) {
34647 + SetPageForeign(pte, pte_free);
34648 + init_page_count(pte);
34649 + }
34650 + return pte;
34651 +}
34652 +
34653 +void pte_free(struct page *pte)
34654 +{
34655 + unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
34656 +
34657 + if (!pte_write(*virt_to_ptep(va)))
34658 + if (HYPERVISOR_update_va_mapping(
34659 + va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0))
34660 + BUG();
34661 +
34662 + ClearPageForeign(pte);
34663 + init_page_count(pte);
34664 +
34665 + __free_page(pte);
34666 +}
34667 +#endif /* CONFIG_XEN */
34668 +
34669 +pte_t *lookup_address(unsigned long address)
34670 +{
34671 + pgd_t *pgd = pgd_offset_k(address);
34672 + pud_t *pud;
34673 + pmd_t *pmd;
34674 + pte_t *pte;
34675 + if (pgd_none(*pgd))
34676 + return NULL;
34677 + pud = pud_offset(pgd, address);
34678 + if (!pud_present(*pud))
34679 + return NULL;
34680 + pmd = pmd_offset(pud, address);
34681 + if (!pmd_present(*pmd))
34682 + return NULL;
34683 + if (pmd_large(*pmd))
34684 + return (pte_t *)pmd;
34685 + pte = pte_offset_kernel(pmd, address);
34686 + if (pte && !pte_present(*pte))
34687 + pte = NULL;
34688 + return pte;
34689 +}
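/*
 * Illustrative sketch (not part of the patch): lookup_address() hands back
 * the pte (or the pmd cast to a pte_t * for a 2MB mapping) currently backing
 * a kernel virtual address, or NULL if there is none.  The helper name is
 * hypothetical.
 */
static int example_addr_is_readonly(unsigned long addr)
{
	pte_t *pte = lookup_address(addr);

	return pte && !pte_write(*pte);
}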
34690 +
34691 +static struct page *split_large_page(unsigned long address, pgprot_t prot,
34692 + pgprot_t ref_prot)
34693 +{
34694 + int i;
34695 + unsigned long addr;
34696 + struct page *base = alloc_pages(GFP_KERNEL, 0);
34697 + pte_t *pbase;
34698 + if (!base)
34699 + return NULL;
34700 + /*
34701 + * page_private is used to track the number of entries in
34702 +	 * the page table page that have non-standard attributes.
34703 + */
34704 + SetPagePrivate(base);
34705 + page_private(base) = 0;
34706 +
34707 + address = __pa(address);
34708 + addr = address & LARGE_PAGE_MASK;
34709 + pbase = (pte_t *)page_address(base);
34710 + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
34711 + pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
34712 + addr == address ? prot : ref_prot);
34713 + }
34714 + return base;
34715 +}
34716 +
34717 +
34718 +static void flush_kernel_map(void *address)
34719 +{
34720 + if (0 && address && cpu_has_clflush) {
34721 + /* is this worth it? */
34722 + int i;
34723 + for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
34724 + asm volatile("clflush (%0)" :: "r" (address + i));
34725 + } else
34726 + asm volatile("wbinvd":::"memory");
34727 + if (address)
34728 + __flush_tlb_one(address);
34729 + else
34730 + __flush_tlb_all();
34731 +}
34732 +
34733 +
34734 +static inline void flush_map(unsigned long address)
34735 +{
34736 + on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
34737 +}
34738 +
34739 +static struct page *deferred_pages; /* protected by init_mm.mmap_sem */
34740 +
34741 +static inline void save_page(struct page *fpage)
34742 +{
34743 + fpage->lru.next = (struct list_head *)deferred_pages;
34744 + deferred_pages = fpage;
34745 +}
34746 +
34747 +/*
34748 + * No more special protections in this 2/4MB area - revert to a
34749 + * large page again.
34750 + */
34751 +static void revert_page(unsigned long address, pgprot_t ref_prot)
34752 +{
34753 + pgd_t *pgd;
34754 + pud_t *pud;
34755 + pmd_t *pmd;
34756 + pte_t large_pte;
34757 +
34758 + pgd = pgd_offset_k(address);
34759 + BUG_ON(pgd_none(*pgd));
34760 + pud = pud_offset(pgd,address);
34761 + BUG_ON(pud_none(*pud));
34762 + pmd = pmd_offset(pud, address);
34763 + BUG_ON(__pmd_val(*pmd) & _PAGE_PSE);
34764 + pgprot_val(ref_prot) |= _PAGE_PSE;
34765 + large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
34766 + set_pte((pte_t *)pmd, large_pte);
34767 +}
34768 +
34769 +static int
34770 +__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
34771 + pgprot_t ref_prot)
34772 +{
34773 + pte_t *kpte;
34774 + struct page *kpte_page;
34775 + unsigned kpte_flags;
34776 + pgprot_t ref_prot2;
34777 + kpte = lookup_address(address);
34778 + if (!kpte) return 0;
34779 + kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
34780 + kpte_flags = pte_val(*kpte);
34781 + if (pgprot_val(prot) != pgprot_val(ref_prot)) {
34782 + if ((kpte_flags & _PAGE_PSE) == 0) {
34783 + set_pte(kpte, pfn_pte(pfn, prot));
34784 + } else {
34785 + /*
34786 + * split_large_page will take the reference for this
34787 + * change_page_attr on the split page.
34788 + */
34789 +
34790 + struct page *split;
34791 + ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
34792 +
34793 + split = split_large_page(address, prot, ref_prot2);
34794 + if (!split)
34795 + return -ENOMEM;
34796 + set_pte(kpte,mk_pte(split, ref_prot2));
34797 + kpte_page = split;
34798 + }
34799 + page_private(kpte_page)++;
34800 + } else if ((kpte_flags & _PAGE_PSE) == 0) {
34801 + set_pte(kpte, pfn_pte(pfn, ref_prot));
34802 + BUG_ON(page_private(kpte_page) == 0);
34803 + page_private(kpte_page)--;
34804 + } else
34805 + BUG();
34806 +
34807 + /* on x86-64 the direct mapping set at boot is not using 4k pages */
34808 + /*
34809 + * ..., but the XEN guest kernels (currently) do:
34810 + * If the pte was reserved, it means it was created at boot
34811 + * time (not via split_large_page) and in turn we must not
34812 + * replace it with a large page.
34813 + */
34814 +#ifndef CONFIG_XEN
34815 + BUG_ON(PageReserved(kpte_page));
34816 +#else
34817 + if (PageReserved(kpte_page))
34818 + return 0;
34819 +#endif
34820 +
34821 + if (page_private(kpte_page) == 0) {
34822 + save_page(kpte_page);
34823 + revert_page(address, ref_prot);
34824 + }
34825 + return 0;
34826 +}
34827 +
34828 +/*
34829 + * Change the page attributes of a page in the linear mapping.
34830 + *
34831 + * This should be used when a page is mapped with a different caching policy
34832 + * than write-back somewhere - some CPUs do not like it when mappings with
34833 + * different caching policies exist. This changes the page attributes of the
34834 + * kernel linear mapping too.
34835 + *
34836 + * The caller needs to ensure that there are no conflicting mappings elsewhere.
34837 + * This function only deals with the kernel linear map.
34838 + *
34839 + * Caller must call global_flush_tlb() after this.
34840 + */
34841 +int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
34842 +{
34843 + int err = 0;
34844 + int i;
34845 +
34846 + down_write(&init_mm.mmap_sem);
34847 + for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
34848 + unsigned long pfn = __pa(address) >> PAGE_SHIFT;
34849 +
34850 + err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
34851 + if (err)
34852 + break;
34853 + /* Handle kernel mapping too which aliases part of the
34854 + * lowmem */
34855 + if (__pa(address) < KERNEL_TEXT_SIZE) {
34856 + unsigned long addr2;
34857 + pgprot_t prot2 = prot;
34858 + addr2 = __START_KERNEL_map + __pa(address);
34859 + pgprot_val(prot2) &= ~_PAGE_NX;
34860 + err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
34861 + }
34862 + }
34863 + up_write(&init_mm.mmap_sem);
34864 + return err;
34865 +}
34866 +
34867 +/* Don't call this for MMIO areas that may not have a mem_map entry */
34868 +int change_page_attr(struct page *page, int numpages, pgprot_t prot)
34869 +{
34870 + unsigned long addr = (unsigned long)page_address(page);
34871 + return change_page_attr_addr(addr, numpages, prot);
34872 +}
34873 +
34874 +void global_flush_tlb(void)
34875 +{
34876 + struct page *dpage;
34877 +
34878 + down_read(&init_mm.mmap_sem);
34879 + dpage = xchg(&deferred_pages, NULL);
34880 + up_read(&init_mm.mmap_sem);
34881 +
34882 + flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0);
34883 + while (dpage) {
34884 + struct page *tmp = dpage;
34885 + dpage = (struct page *)dpage->lru.next;
34886 + ClearPagePrivate(tmp);
34887 + __free_page(tmp);
34888 + }
34889 +}
34890 +
34891 +EXPORT_SYMBOL(change_page_attr);
34892 +EXPORT_SYMBOL(global_flush_tlb);
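/*
 * Minimal usage sketch (editor's illustration, not part of the patch): a
 * driver that needs an uncached view of a kernel page changes the
 * linear-mapping attribute and then performs the single deferred flush that
 * the comment on change_page_attr_addr() requires.  The helper name is
 * hypothetical.
 */
static int example_make_page_uncached(struct page *pg)
{
	int rc = change_page_attr(pg, 1, PAGE_KERNEL_NOCACHE);

	if (!rc)
		global_flush_tlb();	/* commit the deferred attribute change */
	return rc;
}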
34893 Index: head-2008-11-25/drivers/pci/msi-xen.c
34894 ===================================================================
34895 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
34896 +++ head-2008-11-25/drivers/pci/msi-xen.c 2008-10-13 13:43:45.000000000 +0200
34897 @@ -0,0 +1,809 @@
34898 +/*
34899 + * File: msi.c
34900 + * Purpose: PCI Message Signaled Interrupt (MSI)
34901 + *
34902 + * Copyright (C) 2003-2004 Intel
34903 + * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)
34904 + */
34905 +
34906 +#include <linux/mm.h>
34907 +#include <linux/irq.h>
34908 +#include <linux/interrupt.h>
34909 +#include <linux/init.h>
34910 +#include <linux/ioport.h>
34911 +#include <linux/smp_lock.h>
34912 +#include <linux/pci.h>
34913 +#include <linux/proc_fs.h>
34914 +
34915 +#include <xen/evtchn.h>
34916 +
34917 +#include <asm/errno.h>
34918 +#include <asm/io.h>
34919 +#include <asm/smp.h>
34920 +
34921 +#include "pci.h"
34922 +#include "msi.h"
34923 +
34924 +static int pci_msi_enable = 1;
34925 +
34926 +static struct msi_ops *msi_ops;
34927 +
34928 +int msi_register(struct msi_ops *ops)
34929 +{
34930 + msi_ops = ops;
34931 + return 0;
34932 +}
34933 +
34934 +static LIST_HEAD(msi_dev_head);
34935 +DEFINE_SPINLOCK(msi_dev_lock);
34936 +
34937 +struct msi_dev_list {
34938 + struct pci_dev *dev;
34939 + struct list_head list;
34940 + spinlock_t pirq_list_lock;
34941 + struct list_head pirq_list_head;
34942 +};
34943 +
34944 +struct msi_pirq_entry {
34945 + struct list_head list;
34946 + int pirq;
34947 + int entry_nr;
34948 +};
34949 +
34950 +static struct msi_dev_list *get_msi_dev_pirq_list(struct pci_dev *dev)
34951 +{
34952 + struct msi_dev_list *msi_dev_list, *ret = NULL;
34953 + unsigned long flags;
34954 +
34955 + spin_lock_irqsave(&msi_dev_lock, flags);
34956 +
34957 + list_for_each_entry(msi_dev_list, &msi_dev_head, list)
34958 + if ( msi_dev_list->dev == dev )
34959 + ret = msi_dev_list;
34960 +
34961 + if ( ret ) {
34962 + spin_unlock_irqrestore(&msi_dev_lock, flags);
34963 + return ret;
34964 + }
34965 +
34966 +	/* No msi_dev has been allocated for this device yet. */
34967 + ret = kzalloc(sizeof(struct msi_dev_list), GFP_ATOMIC);
34968 +
34969 + /* Failed to allocate msi_dev structure */
34970 + if ( !ret ) {
34971 + spin_unlock_irqrestore(&msi_dev_lock, flags);
34972 + return NULL;
34973 + }
34974 +
34975 + ret->dev = dev;
34976 + spin_lock_init(&ret->pirq_list_lock);
34977 + INIT_LIST_HEAD(&ret->pirq_list_head);
34978 + list_add_tail(&ret->list, &msi_dev_head);
34979 + spin_unlock_irqrestore(&msi_dev_lock, flags);
34980 + return ret;
34981 +}
34982 +
34983 +static int attach_pirq_entry(int pirq, int entry_nr,
34984 + struct msi_dev_list *msi_dev_entry)
34985 +{
34986 + struct msi_pirq_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
34987 + unsigned long flags;
34988 +
34989 + if (!entry)
34990 + return -ENOMEM;
34991 + entry->pirq = pirq;
34992 + entry->entry_nr = entry_nr;
34993 + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
34994 + list_add_tail(&entry->list, &msi_dev_entry->pirq_list_head);
34995 + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
34996 + return 0;
34997 +}
34998 +
34999 +static void detach_pirq_entry(int entry_nr,
35000 + struct msi_dev_list *msi_dev_entry)
35001 +{
35002 + unsigned long flags;
35003 + struct msi_pirq_entry *pirq_entry;
35004 +
35005 + list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) {
35006 + if (pirq_entry->entry_nr == entry_nr) {
35007 + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
35008 + list_del(&pirq_entry->list);
35009 + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
35010 + kfree(pirq_entry);
35011 + return;
35012 + }
35013 + }
35014 +}
35015 +
35016 +/*
35017 + * pciback will provide the device's owner
35018 + */
35019 +static int (*get_owner)(struct pci_dev *dev);
35020 +
35021 +int register_msi_get_owner(int (*func)(struct pci_dev *dev))
35022 +{
35023 + if (get_owner) {
35024 +		printk(KERN_WARNING "msi_get_owner is already registered\n");
35025 + return -EEXIST;
35026 + }
35027 + get_owner = func;
35028 + return 0;
35029 +}
35030 +
35031 +int unregister_msi_get_owner(int (*func)(struct pci_dev *dev))
35032 +{
35033 + if (get_owner != func)
35034 + return -EINVAL;
35035 + get_owner = NULL;
35036 + return 0;
35037 +}
35038 +
35039 +static int msi_get_dev_owner(struct pci_dev *dev)
35040 +{
35041 + int owner;
35042 +
35043 + BUG_ON(!is_initial_xendomain());
35044 + if (get_owner && (owner = get_owner(dev)) >= 0) {
35045 +		printk(KERN_INFO "owner of dev %x is %x\n",
35046 + dev->devfn, owner);
35047 + return owner;
35048 + }
35049 +
35050 + return DOMID_SELF;
35051 +}
35052 +
35053 +static int msi_unmap_pirq(struct pci_dev *dev, int pirq)
35054 +{
35055 + struct physdev_unmap_pirq unmap;
35056 + int rc;
35057 +
35058 + unmap.domid = msi_get_dev_owner(dev);
35059 +	/* See the comments in msi_map_pirq_to_vector(): the input parameter pirq
35060 +	 * means an irq number only if the device belongs to dom0 itself.
35061 + */
35062 + unmap.pirq = (unmap.domid != DOMID_SELF)
35063 + ? pirq : evtchn_get_xen_pirq(pirq);
35064 +
35065 + if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap)))
35066 + printk(KERN_WARNING "unmap irq %x failed\n", pirq);
35067 +
35068 + if (rc < 0)
35069 + return rc;
35070 +
35071 + if (unmap.domid == DOMID_SELF)
35072 + evtchn_map_pirq(pirq, 0);
35073 +
35074 + return 0;
35075 +}
35076 +
35077 +static u64 find_table_base(struct pci_dev *dev, int pos)
35078 +{
35079 + u8 bar;
35080 + u32 reg;
35081 + unsigned long flags;
35082 +
35083 + pci_read_config_dword(dev, msix_table_offset_reg(pos), &reg);
35084 + bar = reg & PCI_MSIX_FLAGS_BIRMASK;
35085 +
35086 + flags = pci_resource_flags(dev, bar);
35087 + if (flags & (IORESOURCE_DISABLED | IORESOURCE_UNSET | IORESOURCE_BUSY))
35088 + return 0;
35089 +
35090 + return pci_resource_start(dev, bar);
35091 +}
35092 +
35093 +/*
35094 + * Protected by msi_lock
35095 + */
35096 +static int msi_map_pirq_to_vector(struct pci_dev *dev, int pirq,
35097 + int entry_nr, u64 table_base)
35098 +{
35099 + struct physdev_map_pirq map_irq;
35100 + int rc;
35101 + domid_t domid = DOMID_SELF;
35102 +
35103 + domid = msi_get_dev_owner(dev);
35104 +
35105 + map_irq.domid = domid;
35106 + map_irq.type = MAP_PIRQ_TYPE_MSI;
35107 + map_irq.index = -1;
35108 + map_irq.pirq = pirq < 0 ? -1 : evtchn_get_xen_pirq(pirq);
35109 + map_irq.bus = dev->bus->number;
35110 + map_irq.devfn = dev->devfn;
35111 + map_irq.entry_nr = entry_nr;
35112 + map_irq.table_base = table_base;
35113 +
35114 + if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq)))
35115 + printk(KERN_WARNING "map irq failed\n");
35116 +
35117 + if (rc < 0)
35118 + return rc;
35119 + /* This happens when MSI support is not enabled in Xen. */
35120 + if (rc == 0 && map_irq.pirq < 0)
35121 + return -ENOSYS;
35122 +
35123 + BUG_ON(map_irq.pirq <= 0);
35124 +
35125 + /* If mapping of this particular MSI is on behalf of another domain,
35126 + * we do not need to get an irq in dom0. This also implies:
35127 + * dev->irq in dom0 will be 'Xen pirq' if this device belongs to
35128 +	 * dev->irq in dom0 will be a 'Xen pirq' if this device belongs
35129 +	 * to another domain, and will be a 'Linux irq' if it belongs to dom0.
35130 + return ((domid != DOMID_SELF) ?
35131 + map_irq.pirq : evtchn_map_pirq(pirq, map_irq.pirq));
35132 +}
35133 +
35134 +static int msi_map_vector(struct pci_dev *dev, int entry_nr, u64 table_base)
35135 +{
35136 + return msi_map_pirq_to_vector(dev, -1, entry_nr, table_base);
35137 +}
35138 +
35139 +static int msi_init(void)
35140 +{
35141 + static int status = 0;
35142 +
35143 + if (pci_msi_quirk) {
35144 + pci_msi_enable = 0;
35145 + printk(KERN_WARNING "PCI: MSI quirk detected. MSI disabled.\n");
35146 + status = -EINVAL;
35147 + }
35148 +
35149 + return status;
35150 +}
35151 +
35152 +void pci_scan_msi_device(struct pci_dev *dev) { }
35153 +
35154 +void disable_msi_mode(struct pci_dev *dev, int pos, int type)
35155 +{
35156 + u16 control;
35157 +
35158 + pci_read_config_word(dev, msi_control_reg(pos), &control);
35159 + if (type == PCI_CAP_ID_MSI) {
35160 +		/* Clear the MSI enable bit */
35161 + msi_disable(control);
35162 + pci_write_config_word(dev, msi_control_reg(pos), control);
35163 + dev->msi_enabled = 0;
35164 + } else {
35165 + msix_disable(control);
35166 + pci_write_config_word(dev, msi_control_reg(pos), control);
35167 + dev->msix_enabled = 0;
35168 + }
35169 + if (pci_find_capability(dev, PCI_CAP_ID_EXP)) {
35170 + /* PCI Express Endpoint device detected */
35171 + pci_intx(dev, 1); /* enable intx */
35172 + }
35173 +}
35174 +
35175 +static void enable_msi_mode(struct pci_dev *dev, int pos, int type)
35176 +{
35177 + u16 control;
35178 +
35179 + pci_read_config_word(dev, msi_control_reg(pos), &control);
35180 + if (type == PCI_CAP_ID_MSI) {
35181 + /* Set enabled bits to single MSI & enable MSI_enable bit */
35182 + msi_enable(control, 1);
35183 + pci_write_config_word(dev, msi_control_reg(pos), control);
35184 + dev->msi_enabled = 1;
35185 + } else {
35186 + msix_enable(control);
35187 + pci_write_config_word(dev, msi_control_reg(pos), control);
35188 + dev->msix_enabled = 1;
35189 + }
35190 + if (pci_find_capability(dev, PCI_CAP_ID_EXP)) {
35191 + /* PCI Express Endpoint device detected */
35192 + pci_intx(dev, 0); /* disable intx */
35193 + }
35194 +}
35195 +
35196 +#ifdef CONFIG_PM
35197 +int pci_save_msi_state(struct pci_dev *dev)
35198 +{
35199 + int pos;
35200 +
35201 + pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35202 + if (pos <= 0 || dev->no_msi)
35203 + return 0;
35204 +
35205 + if (!dev->msi_enabled)
35206 + return 0;
35207 +
35208 + /* Restore dev->irq to its default pin-assertion vector */
35209 + msi_unmap_pirq(dev, dev->irq);
35210 + /* Disable MSI mode */
35211 + disable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
35212 + /* Set the flags for use of restore */
35213 + dev->msi_enabled = 1;
35214 + return 0;
35215 +}
35216 +
35217 +void pci_restore_msi_state(struct pci_dev *dev)
35218 +{
35219 + int pos, pirq;
35220 +
35221 + pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35222 + if (pos <= 0)
35223 + return;
35224 +
35225 + if (!dev->msi_enabled)
35226 + return;
35227 +
35228 + pirq = msi_map_pirq_to_vector(dev, dev->irq, 0, 0);
35229 + if (pirq < 0)
35230 + return;
35231 + enable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
35232 +}
35233 +
35234 +int pci_save_msix_state(struct pci_dev *dev)
35235 +{
35236 + int pos;
35237 + unsigned long flags;
35238 + struct msi_dev_list *msi_dev_entry;
35239 + struct msi_pirq_entry *pirq_entry, *tmp;
35240 +
35241 + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35242 + if (pos <= 0 || dev->no_msi)
35243 + return 0;
35244 +
35245 + /* save the capability */
35246 + if (!dev->msix_enabled)
35247 + return 0;
35248 +
35249 + msi_dev_entry = get_msi_dev_pirq_list(dev);
35250 +
35251 + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
35252 + list_for_each_entry_safe(pirq_entry, tmp,
35253 + &msi_dev_entry->pirq_list_head, list)
35254 + msi_unmap_pirq(dev, pirq_entry->pirq);
35255 + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
35256 +
35257 + disable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
35258 + /* Set the flags for use of restore */
35259 + dev->msix_enabled = 1;
35260 +
35261 + return 0;
35262 +}
35263 +
35264 +void pci_restore_msix_state(struct pci_dev *dev)
35265 +{
35266 + int pos;
35267 + unsigned long flags;
35268 + u64 table_base;
35269 + struct msi_dev_list *msi_dev_entry;
35270 + struct msi_pirq_entry *pirq_entry, *tmp;
35271 +
35272 + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35273 + if (pos <= 0)
35274 + return;
35275 +
35276 + if (!dev->msix_enabled)
35277 + return;
35278 +
35279 + msi_dev_entry = get_msi_dev_pirq_list(dev);
35280 + table_base = find_table_base(dev, pos);
35281 + if (!table_base)
35282 + return;
35283 +
35284 + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
35285 + list_for_each_entry_safe(pirq_entry, tmp,
35286 + &msi_dev_entry->pirq_list_head, list) {
35287 + int rc = msi_map_pirq_to_vector(dev, pirq_entry->pirq,
35288 + pirq_entry->entry_nr, table_base);
35289 + if (rc < 0)
35290 + printk(KERN_WARNING
35291 + "%s: re-mapping irq #%d (pirq%d) failed: %d\n",
35292 + pci_name(dev), pirq_entry->entry_nr,
35293 + pirq_entry->pirq, rc);
35294 + }
35295 + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
35296 +
35297 + enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
35298 +}
35299 +#endif
35300 +
35301 +/**
35302 + * msi_capability_init - configure device's MSI capability structure
35303 + * @dev: pointer to the pci_dev data structure of MSI device function
35304 + *
35305 + * Set up the MSI capability structure of the device function with a
35306 + * single MSI vector, regardless of whether the device function is
35307 + * capable of handling multiple messages. A return of zero indicates
35308 + * successful setup of entry zero with the new MSI vector; non-zero otherwise.
35309 + **/
35310 +static int msi_capability_init(struct pci_dev *dev)
35311 +{
35312 + int pos, pirq;
35313 + u16 control;
35314 +
35315 + pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35316 + pci_read_config_word(dev, msi_control_reg(pos), &control);
35317 +
35318 + pirq = msi_map_vector(dev, 0, 0);
35319 + if (pirq < 0)
35320 + return -EBUSY;
35321 +
35322 + dev->irq = pirq;
35323 + /* Set MSI enabled bits */
35324 + enable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
35325 + dev->msi_enabled = 1;
35326 +
35327 + return 0;
35328 +}
35329 +
35330 +/**
35331 + * msix_capability_init - configure device's MSI-X capability
35332 + * @dev: pointer to the pci_dev data structure of MSI-X device function
35333 + * @entries: pointer to an array of struct msix_entry entries
35334 + * @nvec: number of @entries
35335 + *
35336 + * Set up the MSI-X capability structure of the device function with the
35337 + * requested MSI-X entries. A return of zero indicates the successful setup
35338 + * of the requested MSI-X entries with allocated vectors; non-zero otherwise.
35339 + **/
35340 +static int msix_capability_init(struct pci_dev *dev,
35341 + struct msix_entry *entries, int nvec)
35342 +{
35343 + u64 table_base;
35344 + int pirq, i, j, mapped, pos;
35345 + struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev);
35346 + struct msi_pirq_entry *pirq_entry;
35347 +
35348 + if (!msi_dev_entry)
35349 + return -ENOMEM;
35350 +
35351 + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35352 + table_base = find_table_base(dev, pos);
35353 + if (!table_base)
35354 + return -ENODEV;
35355 +
35356 + /* MSI-X Table Initialization */
35357 + for (i = 0; i < nvec; i++) {
35358 + mapped = 0;
35359 + list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) {
35360 + if (pirq_entry->entry_nr == entries[i].entry) {
35361 +			printk(KERN_WARNING "msix entry %d for dev %02x:%02x:%01x was "
35362 +				"not freed before being acquired again\n", entries[i].entry,
35363 + dev->bus->number, PCI_SLOT(dev->devfn),
35364 + PCI_FUNC(dev->devfn));
35365 + (entries + i)->vector = pirq_entry->pirq;
35366 + mapped = 1;
35367 + break;
35368 + }
35369 + }
35370 + if (mapped)
35371 + continue;
35372 + pirq = msi_map_vector(dev, entries[i].entry, table_base);
35373 + if (pirq < 0)
35374 + break;
35375 + attach_pirq_entry(pirq, entries[i].entry, msi_dev_entry);
35376 + (entries + i)->vector = pirq;
35377 + }
35378 +
35379 + if (i != nvec) {
35380 + for (j = --i; j >= 0; j--) {
35381 + msi_unmap_pirq(dev, entries[j].vector);
35382 + detach_pirq_entry(entries[j].entry, msi_dev_entry);
35383 + entries[j].vector = 0;
35384 + }
35385 + return -EBUSY;
35386 + }
35387 +
35388 + enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
35389 + dev->msix_enabled = 1;
35390 +
35391 + return 0;
35392 +}
35393 +
35394 +/**
35395 + * pci_enable_msi - configure device's MSI capability structure
35396 + * @dev: pointer to the pci_dev data structure of MSI device function
35397 + *
35398 + * Set up the MSI capability structure of the device function with a
35399 + * single MSI vector when its driver requests MSI mode to be enabled on
35400 + * the hardware device function. A return of zero indicates successful
35401 + * setup of entry zero with the new MSI vector; a non-zero return
35402 + * indicates failure.
35403 + **/
35404 +extern int pci_frontend_enable_msi(struct pci_dev *dev);
35405 +int pci_enable_msi(struct pci_dev* dev)
35406 +{
35407 + struct pci_bus *bus;
35408 + int pos, temp, status = -EINVAL;
35409 +
35410 + if (!pci_msi_enable || !dev)
35411 + return status;
35412 +
35413 + if (dev->no_msi)
35414 + return status;
35415 +
35416 + for (bus = dev->bus; bus; bus = bus->parent)
35417 + if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI)
35418 + return -EINVAL;
35419 +
35420 + status = msi_init();
35421 + if (status < 0)
35422 + return status;
35423 +
35424 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
35425 + if (!is_initial_xendomain())
35426 + {
35427 + int ret;
35428 +
35429 + temp = dev->irq;
35430 + ret = pci_frontend_enable_msi(dev);
35431 + if (ret)
35432 + return ret;
35433 +
35434 + dev->irq = evtchn_map_pirq(-1, dev->irq);
35435 + dev->irq_old = temp;
35436 +
35437 + return ret;
35438 + }
35439 +#endif
35440 +
35441 + temp = dev->irq;
35442 +
35443 + pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35444 + if (!pos)
35445 + return -EINVAL;
35446 +
35447 +	/* Check whether the driver already requested MSI-X vectors */
35448 + if (dev->msix_enabled) {
35449 + printk(KERN_INFO "PCI: %s: Can't enable MSI. "
35450 + "Device already has MSI-X vectors assigned\n",
35451 + pci_name(dev));
35452 + dev->irq = temp;
35453 + return -EINVAL;
35454 + }
35455 +
35456 + status = msi_capability_init(dev);
35457 + if ( !status )
35458 + dev->irq_old = temp;
35459 + else
35460 + dev->irq = temp;
35461 +
35462 + return status;
35463 +}
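/*
 * Minimal driver-side sketch (editor's illustration, not part of the patch):
 * enable MSI where available and fall back to the pin-based interrupt
 * otherwise.  The "foo" helper name is hypothetical.
 */
static int foo_setup_irq(struct pci_dev *pdev)
{
	int rc = pci_enable_msi(pdev);

	if (rc)
		dev_info(&pdev->dev, "MSI unavailable, using INTx\n");
	/* In either case pdev->irq now holds the vector/irq to hand to
	 * request_irq(); pci_disable_msi(pdev) undoes the setup. */
	return 0;
}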
35464 +
35465 +extern void pci_frontend_disable_msi(struct pci_dev* dev);
35466 +void pci_disable_msi(struct pci_dev* dev)
35467 +{
35468 + int pos;
35469 + int pirq;
35470 +
35471 + if (!pci_msi_enable)
35472 + return;
35473 + if (!dev)
35474 + return;
35475 +
35476 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
35477 + if (!is_initial_xendomain()) {
35478 + evtchn_map_pirq(dev->irq, 0);
35479 + pci_frontend_disable_msi(dev);
35480 + dev->irq = dev->irq_old;
35481 + return;
35482 + }
35483 +#endif
35484 +
35485 + pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35486 + if (!pos)
35487 + return;
35488 +
35489 + pirq = dev->irq;
35490 + /* Restore dev->irq to its default pin-assertion vector */
35491 + dev->irq = dev->irq_old;
35492 + msi_unmap_pirq(dev, pirq);
35493 +
35494 + /* Disable MSI mode */
35495 + disable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
35496 +}
35497 +
35498 +/**
35499 + * pci_enable_msix - configure device's MSI-X capability structure
35500 + * @dev: pointer to the pci_dev data structure of MSI-X device function
35501 + * @entries: pointer to an array of MSI-X entries
35502 + * @nvec: number of MSI-X vectors requested for allocation by device driver
35503 + *
35504 + * Set up the MSI-X capability structure of the device function with the
35505 + * number of requested vectors when its driver requests MSI-X mode to be
35506 + * enabled on the hardware device function. A return of zero indicates
35507 + * successful configuration of the MSI-X capability structure with newly
35508 + * allocated MSI-X vectors. A return of < 0 indicates a failure, while a
35509 + * return of > 0 indicates that the driver requested more vectors than are
35510 + * available; the driver should use the returned value to re-send its
35511 + * request.
35512 + **/
35513 +extern int pci_frontend_enable_msix(struct pci_dev *dev,
35514 + struct msix_entry *entries, int nvec);
35515 +int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec)
35516 +{
35517 + struct pci_bus *bus;
35518 + int status, pos, nr_entries;
35519 + int i, j, temp;
35520 + u16 control;
35521 +
35522 + if (!pci_msi_enable || !dev || !entries)
35523 + return -EINVAL;
35524 +
35525 + if (dev->no_msi)
35526 + return -EINVAL;
35527 +
35528 + for (bus = dev->bus; bus; bus = bus->parent)
35529 + if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI)
35530 + return -EINVAL;
35531 +
35532 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
35533 + if (!is_initial_xendomain()) {
35534 + struct msi_dev_list *msi_dev_entry;
35535 + struct msi_pirq_entry *pirq_entry;
35536 + int ret, irq;
35537 +
35538 + ret = pci_frontend_enable_msix(dev, entries, nvec);
35539 + if (ret) {
35540 +			printk("pci_frontend_enable_msix failed: %x\n", ret);
35541 + return ret;
35542 + }
35543 +
35544 + msi_dev_entry = get_msi_dev_pirq_list(dev);
35545 + for (i = 0; i < nvec; i++) {
35546 + int mapped = 0;
35547 +
35548 + list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) {
35549 + if (pirq_entry->entry_nr == entries[i].entry) {
35550 + irq = pirq_entry->pirq;
35551 + BUG_ON(entries[i].vector != evtchn_get_xen_pirq(irq));
35552 + entries[i].vector = irq;
35553 + mapped = 1;
35554 + break;
35555 + }
35556 + }
35557 + if (mapped)
35558 + continue;
35559 + irq = evtchn_map_pirq(-1, entries[i].vector);
35560 + attach_pirq_entry(irq, entries[i].entry, msi_dev_entry);
35561 + entries[i].vector = irq;
35562 + }
35563 + return 0;
35564 + }
35565 +#endif
35566 +
35567 + status = msi_init();
35568 + if (status < 0)
35569 + return status;
35570 +
35571 + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35572 + if (!pos)
35573 + return -EINVAL;
35574 +
35575 + pci_read_config_word(dev, msi_control_reg(pos), &control);
35576 + nr_entries = multi_msix_capable(control);
35577 + if (nvec > nr_entries)
35578 + return -EINVAL;
35579 +
35580 + /* Check for any invalid entries */
35581 + for (i = 0; i < nvec; i++) {
35582 + if (entries[i].entry >= nr_entries)
35583 + return -EINVAL; /* invalid entry */
35584 + for (j = i + 1; j < nvec; j++) {
35585 + if (entries[i].entry == entries[j].entry)
35586 + return -EINVAL; /* duplicate entry */
35587 + }
35588 + }
35589 +
35590 + temp = dev->irq;
35591 +	/* Check whether the driver already requested an MSI vector */
35592 + if (dev->msi_enabled) {
35593 + printk(KERN_INFO "PCI: %s: Can't enable MSI-X. "
35594 + "Device already has an MSI vector assigned\n",
35595 + pci_name(dev));
35596 + dev->irq = temp;
35597 + return -EINVAL;
35598 + }
35599 +
35600 + status = msix_capability_init(dev, entries, nvec);
35601 +
35602 + if ( !status )
35603 + dev->irq_old = temp;
35604 + else
35605 + dev->irq = temp;
35606 +
35607 + return status;
35608 +}
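/*
 * Minimal driver-side sketch (editor's illustration, not part of the patch):
 * request a hypothetical four MSI-X vectors following the contract in the
 * kernel-doc above.  The "foo" helper name is a placeholder.
 */
static int foo_setup_msix(struct pci_dev *pdev)
{
	struct msix_entry entries[4];
	int i, rc;

	for (i = 0; i < 4; i++)
		entries[i].entry = i;		/* MSI-X table slots 0..3 */

	rc = pci_enable_msix(pdev, entries, 4);
	if (rc == 0) {
		/* entries[i].vector now holds the irq for each slot */
		return 0;
	}
	/* Per the documentation above: rc < 0 is a hard failure, rc > 0 means
	 * fewer vectors are available and the driver may retry with rc. */
	return rc;
}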
35609 +
35610 +extern void pci_frontend_disable_msix(struct pci_dev* dev);
35611 +void pci_disable_msix(struct pci_dev* dev)
35612 +{
35613 + int pos;
35614 + u16 control;
35615 +
35616 +
35617 + if (!pci_msi_enable)
35618 + return;
35619 + if (!dev)
35620 + return;
35621 +
35622 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
35623 + if (!is_initial_xendomain()) {
35624 + struct msi_dev_list *msi_dev_entry;
35625 + struct msi_pirq_entry *pirq_entry, *tmp;
35626 +
35627 + pci_frontend_disable_msix(dev);
35628 +
35629 + msi_dev_entry = get_msi_dev_pirq_list(dev);
35630 + list_for_each_entry_safe(pirq_entry, tmp,
35631 + &msi_dev_entry->pirq_list_head, list) {
35632 + evtchn_map_pirq(pirq_entry->pirq, 0);
35633 + list_del(&pirq_entry->list);
35634 + kfree(pirq_entry);
35635 + }
35636 +
35637 + dev->irq = dev->irq_old;
35638 + return;
35639 + }
35640 +#endif
35641 +
35642 + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35643 + if (!pos)
35644 + return;
35645 +
35646 + pci_read_config_word(dev, msi_control_reg(pos), &control);
35647 + if (!(control & PCI_MSIX_FLAGS_ENABLE))
35648 + return;
35649 +
35650 + msi_remove_pci_irq_vectors(dev);
35651 +
35652 + /* Disable MSI mode */
35653 + disable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
35654 +}
35655 +
35656 +/**
35657 + * msi_remove_pci_irq_vectors - reclaim MSI(X) vectors to unused state
35658 + * @dev: pointer to the pci_dev data structure of MSI(X) device function
35659 + *
35660 + * Called during hotplug remove, when the device function is
35661 + * hot-removed. All previously assigned MSI/MSI-X vectors, if
35662 + * allocated for this device function, are reclaimed to the unused
35663 + * state and may be reused later on.
35664 + **/
35665 +void msi_remove_pci_irq_vectors(struct pci_dev* dev)
35666 +{
35667 + unsigned long flags;
35668 + struct msi_dev_list *msi_dev_entry;
35669 + struct msi_pirq_entry *pirq_entry, *tmp;
35670 +
35671 + if (!pci_msi_enable || !dev)
35672 + return;
35673 +
35674 + msi_dev_entry = get_msi_dev_pirq_list(dev);
35675 +
35676 + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
35677 + if (!list_empty(&msi_dev_entry->pirq_list_head))
35678 + {
35679 + printk(KERN_WARNING "msix pirqs for dev %02x:%02x:%01x are not freed \
35680 +		printk(KERN_WARNING "msix pirqs for dev %02x:%02x:%01x were not freed "
35681 +			"before being acquired again\n", dev->bus->number, PCI_SLOT(dev->devfn),
35682 + list_for_each_entry_safe(pirq_entry, tmp,
35683 + &msi_dev_entry->pirq_list_head, list) {
35684 + msi_unmap_pirq(dev, pirq_entry->pirq);
35685 + list_del(&pirq_entry->list);
35686 + kfree(pirq_entry);
35687 + }
35688 + }
35689 + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
35690 + dev->irq = dev->irq_old;
35691 +}
35692 +
35693 +void pci_no_msi(void)
35694 +{
35695 + pci_msi_enable = 0;
35696 +}
35697 +
35698 +EXPORT_SYMBOL(pci_enable_msi);
35699 +EXPORT_SYMBOL(pci_disable_msi);
35700 +EXPORT_SYMBOL(pci_enable_msix);
35701 +EXPORT_SYMBOL(pci_disable_msix);
35702 +#ifdef CONFIG_XEN
35703 +EXPORT_SYMBOL(register_msi_get_owner);
35704 +EXPORT_SYMBOL(unregister_msi_get_owner);
35705 +#endif
35706 +
35707 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/agp.h
35708 ===================================================================
35709 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
35710 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/agp.h 2007-06-22 09:08:06.000000000 +0200
35711 @@ -0,0 +1,44 @@
35712 +#ifndef AGP_H
35713 +#define AGP_H 1
35714 +
35715 +#include <asm/pgtable.h>
35716 +#include <asm/cacheflush.h>
35717 +#include <asm/system.h>
35718 +
35719 +/*
35720 + * Functions to keep the agpgart mappings coherent with the MMU.
35721 + * The GART gives the CPU a physical alias of pages in memory. The alias region is
35722 + * mapped uncacheable. Make sure there are no conflicting mappings
35723 + * with different cachability attributes for the same page. This avoids
35724 + * data corruption on some CPUs.
35725 + */
35726 +
35727 +/* Caller's responsibility to call global_flush_tlb() for
35728 + * performance reasons */
35729 +#define map_page_into_agp(page) ( \
35730 + xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \
35731 + ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE))
35732 +#define unmap_page_from_agp(page) ( \
35733 + xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \
35734 + /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
35735 + change_page_attr(page, 1, PAGE_KERNEL))
35736 +#define flush_agp_mappings() global_flush_tlb()
35737 +
35738 +/* Could use CLFLUSH here if the cpu supports it. But then it would
35739 + need to be called for each cacheline of the whole page so it may not be
35740 + worth it. Would need a page for it. */
35741 +#define flush_agp_cache() wbinvd()
35742 +
35743 +/* Convert a physical address to an address suitable for the GART. */
35744 +#define phys_to_gart(x) phys_to_machine(x)
35745 +#define gart_to_phys(x) machine_to_phys(x)
35746 +
35747 +/* GATT allocation. Returns/accepts GATT kernel virtual address. */
35748 +#define alloc_gatt_pages(order) ({ \
35749 + char *_t; dma_addr_t _d; \
35750 + _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL); \
35751 + _t; })
35752 +#define free_gatt_pages(table, order) \
35753 + dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table))
35754 +
35755 +#endif
35756 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/desc_32.h
35757 ===================================================================
35758 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
35759 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/desc_32.h 2008-01-28 12:24:19.000000000 +0100
35760 @@ -0,0 +1,166 @@
35761 +#ifndef __ARCH_DESC_H
35762 +#define __ARCH_DESC_H
35763 +
35764 +#include <asm/ldt.h>
35765 +#include <asm/segment.h>
35766 +
35767 +#define CPU_16BIT_STACK_SIZE 1024
35768 +
35769 +#ifndef __ASSEMBLY__
35770 +
35771 +#include <linux/preempt.h>
35772 +#include <linux/smp.h>
35773 +
35774 +#include <asm/mmu.h>
35775 +
35776 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
35777 +
35778 +DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
35779 +
35780 +struct Xgt_desc_struct {
35781 + unsigned short size;
35782 + unsigned long address __attribute__((packed));
35783 + unsigned short pad;
35784 +} __attribute__ ((packed));
35785 +
35786 +extern struct Xgt_desc_struct idt_descr;
35787 +DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
35788 +
35789 +
35790 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
35791 +{
35792 + return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
35793 +}
35794 +
35795 +#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
35796 +#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
35797 +
35798 +#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
35799 +#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
35800 +#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
35801 +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
35802 +
35803 +#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
35804 +#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
35805 +#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
35806 +#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
35807 +
35808 +/*
35809 + * This is the ldt that every process will get unless we need
35810 + * something other than this.
35811 + */
35812 +extern struct desc_struct default_ldt[];
35813 +extern void set_intr_gate(unsigned int irq, void * addr);
35814 +
35815 +#define _set_tssldt_desc(n,addr,limit,type) \
35816 +__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
35817 + "movw %w1,2(%2)\n\t" \
35818 + "rorl $16,%1\n\t" \
35819 + "movb %b1,4(%2)\n\t" \
35820 + "movb %4,5(%2)\n\t" \
35821 + "movb $0,6(%2)\n\t" \
35822 + "movb %h1,7(%2)\n\t" \
35823 + "rorl $16,%1" \
35824 + : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
35825 +
35826 +#ifndef CONFIG_X86_NO_TSS
35827 +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
35828 +{
35829 + _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
35830 + offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
35831 +}
35832 +
35833 +#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
35834 +#endif
35835 +
35836 +static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
35837 +{
35838 + _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
35839 +}
35840 +
35841 +#define LDT_entry_a(info) \
35842 + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
35843 +
35844 +#define LDT_entry_b(info) \
35845 + (((info)->base_addr & 0xff000000) | \
35846 + (((info)->base_addr & 0x00ff0000) >> 16) | \
35847 + ((info)->limit & 0xf0000) | \
35848 + (((info)->read_exec_only ^ 1) << 9) | \
35849 + ((info)->contents << 10) | \
35850 + (((info)->seg_not_present ^ 1) << 15) | \
35851 + ((info)->seg_32bit << 22) | \
35852 + ((info)->limit_in_pages << 23) | \
35853 + ((info)->useable << 20) | \
35854 + 0x7000)
35855 +
35856 +#define LDT_empty(info) (\
35857 + (info)->base_addr == 0 && \
35858 + (info)->limit == 0 && \
35859 + (info)->contents == 0 && \
35860 + (info)->read_exec_only == 1 && \
35861 + (info)->seg_32bit == 0 && \
35862 + (info)->limit_in_pages == 0 && \
35863 + (info)->seg_not_present == 1 && \
35864 + (info)->useable == 0 )
35865 +
35866 +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
35867 +
35868 +#if TLS_SIZE != 24
35869 +# error update this code.
35870 +#endif
35871 +
35872 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
35873 +{
35874 +#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \
35875 + *(u64 *)&t->tls_array[i])) \
35876 + BUG();
35877 + C(0); C(1); C(2);
35878 +#undef C
35879 +}
35880 +
35881 +static inline void clear_LDT(void)
35882 +{
35883 + int cpu = get_cpu();
35884 +
35885 + /*
35886 + * NB. We load the default_ldt for lcall7/27 handling on demand, as
35887 + * it slows down context switching. No one uses it anyway.
35888 + */
35889 + cpu = cpu; /* XXX avoid compiler warning */
35890 + xen_set_ldt(NULL, 0);
35891 + put_cpu();
35892 +}
35893 +
35894 +/*
35895 + * load one particular LDT into the current CPU
35896 + */
35897 +static inline void load_LDT_nolock(mm_context_t *pc, int cpu)
35898 +{
35899 + void *segments = pc->ldt;
35900 + int count = pc->size;
35901 +
35902 + if (likely(!count))
35903 + segments = NULL;
35904 +
35905 + xen_set_ldt(segments, count);
35906 +}
35907 +
35908 +static inline void load_LDT(mm_context_t *pc)
35909 +{
35910 + int cpu = get_cpu();
35911 + load_LDT_nolock(pc, cpu);
35912 + put_cpu();
35913 +}
35914 +
35915 +static inline unsigned long get_desc_base(unsigned long *desc)
35916 +{
35917 + unsigned long base;
35918 + base = ((desc[0] >> 16) & 0x0000ffff) |
35919 + ((desc[1] << 16) & 0x00ff0000) |
35920 + (desc[1] & 0xff000000);
35921 + return base;
35922 +}
35923 +
35924 +#endif /* !__ASSEMBLY__ */
35925 +
35926 +#endif
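The LDT_entry_a()/LDT_entry_b() macros and get_desc_base() above pack and unpack the base and limit fields that an x86 segment descriptor scatters across two 32-bit words. A standalone sketch that mirrors the same shifts (access-rights bits left out for clarity; not the kernel code itself) and verifies the round trip:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same bit layout as LDT_entry_a()/LDT_entry_b() in desc_32.h above. */
static void pack_desc(uint32_t base, uint32_t limit, uint32_t desc[2])
{
        desc[0] = ((base & 0x0000ffff) << 16) | (limit & 0x0ffff);
        desc[1] = (base & 0xff000000) |
                  ((base & 0x00ff0000) >> 16) |
                  (limit & 0xf0000);
}

/* Mirror of get_desc_base(): reassemble the scattered base field. */
static uint32_t desc_base(const uint32_t desc[2])
{
        return ((desc[0] >> 16) & 0x0000ffff) |
               ((desc[1] << 16) & 0x00ff0000) |
               (desc[1] & 0xff000000);
}

int main(void)
{
        uint32_t desc[2];

        pack_desc(0x12345678, 0xabcde, desc);
        printf("a=%08x b=%08x base=%08x\n", desc[0], desc[1], desc_base(desc));
        assert(desc_base(desc) == 0x12345678);
        return 0;
}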
35927 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_32.h
35928 ===================================================================
35929 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
35930 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2008-04-02 12:34:02.000000000 +0200
35931 @@ -0,0 +1,151 @@
35932 +#ifndef _ASM_I386_DMA_MAPPING_H
35933 +#define _ASM_I386_DMA_MAPPING_H
35934 +
35935 +/*
35936 + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
35937 + * documentation.
35938 + */
35939 +
35940 +#include <linux/mm.h>
35941 +#include <asm/cache.h>
35942 +#include <asm/io.h>
35943 +#include <asm/scatterlist.h>
35944 +#include <asm/swiotlb.h>
35945 +
35946 +static inline int
35947 +address_needs_mapping(struct device *hwdev, dma_addr_t addr)
35948 +{
35949 + dma_addr_t mask = 0xffffffff;
35950 + /* If the device has a mask, use it, otherwise default to 32 bits */
35951 + if (hwdev && hwdev->dma_mask)
35952 + mask = *hwdev->dma_mask;
35953 + return (addr & ~mask) != 0;
35954 +}
35955 +
35956 +extern int range_straddles_page_boundary(paddr_t p, size_t size);
35957 +
35958 +#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
35959 +#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
35960 +
35961 +void *dma_alloc_coherent(struct device *dev, size_t size,
35962 + dma_addr_t *dma_handle, gfp_t flag);
35963 +
35964 +void dma_free_coherent(struct device *dev, size_t size,
35965 + void *vaddr, dma_addr_t dma_handle);
35966 +
35967 +extern dma_addr_t
35968 +dma_map_single(struct device *dev, void *ptr, size_t size,
35969 + enum dma_data_direction direction);
35970 +
35971 +extern void
35972 +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
35973 + enum dma_data_direction direction);
35974 +
35975 +extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
35976 + int nents, enum dma_data_direction direction);
35977 +extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg,
35978 + int nents, enum dma_data_direction direction);
35979 +
35980 +#ifdef CONFIG_HIGHMEM
35981 +extern dma_addr_t
35982 +dma_map_page(struct device *dev, struct page *page, unsigned long offset,
35983 + size_t size, enum dma_data_direction direction);
35984 +
35985 +extern void
35986 +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
35987 + enum dma_data_direction direction);
35988 +#else
35989 +#define dma_map_page(dev, page, offset, size, dir) \
35990 + dma_map_single(dev, page_address(page) + (offset), (size), (dir))
35991 +#define dma_unmap_page dma_unmap_single
35992 +#endif
35993 +
35994 +extern void
35995 +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
35996 + enum dma_data_direction direction);
35997 +
35998 +extern void
35999 +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
36000 + enum dma_data_direction direction);
36001 +
36002 +static inline void
36003 +dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
36004 + unsigned long offset, size_t size,
36005 + enum dma_data_direction direction)
36006 +{
36007 + dma_sync_single_for_cpu(dev, dma_handle+offset, size, direction);
36008 +}
36009 +
36010 +static inline void
36011 +dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
36012 + unsigned long offset, size_t size,
36013 + enum dma_data_direction direction)
36014 +{
36015 + dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
36016 +}
36017 +
36018 +static inline void
36019 +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
36020 + enum dma_data_direction direction)
36021 +{
36022 + if (swiotlb)
36023 + swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
36024 + flush_write_buffers();
36025 +}
36026 +
36027 +static inline void
36028 +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
36029 + enum dma_data_direction direction)
36030 +{
36031 + if (swiotlb)
36032 + swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
36033 + flush_write_buffers();
36034 +}
36035 +
36036 +extern int
36037 +dma_mapping_error(dma_addr_t dma_addr);
36038 +
36039 +extern int
36040 +dma_supported(struct device *dev, u64 mask);
36041 +
36042 +static inline int
36043 +dma_set_mask(struct device *dev, u64 mask)
36044 +{
36045 + if(!dev->dma_mask || !dma_supported(dev, mask))
36046 + return -EIO;
36047 +
36048 + *dev->dma_mask = mask;
36049 +
36050 + return 0;
36051 +}
36052 +
36053 +static inline int
36054 +dma_get_cache_alignment(void)
36055 +{
36056 + /* no easy way to get cache size on all x86, so return the
36057 + * maximum possible, to be safe */
36058 + return (1 << INTERNODE_CACHE_SHIFT);
36059 +}
36060 +
36061 +#define dma_is_consistent(d) (1)
36062 +
36063 +static inline void
36064 +dma_cache_sync(void *vaddr, size_t size,
36065 + enum dma_data_direction direction)
36066 +{
36067 + flush_write_buffers();
36068 +}
36069 +
36070 +#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
36071 +extern int
36072 +dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
36073 + dma_addr_t device_addr, size_t size, int flags);
36074 +
36075 +extern void
36076 +dma_release_declared_memory(struct device *dev);
36077 +
36078 +extern void *
36079 +dma_mark_declared_memory_occupied(struct device *dev,
36080 + dma_addr_t device_addr, size_t size);
36081 +
36082 +#endif
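address_needs_mapping() above decides whether a bus address is reachable by a device by testing it against the device's DMA mask, defaulting to 32 bits when none is set. A standalone sketch of the same test, using a hypothetical fake_device struct rather than the kernel's struct device:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t dma_addr_t;

struct fake_device {                    /* hypothetical stand-in for struct device */
        dma_addr_t *dma_mask;           /* NULL means "no mask set" */
};

/* Same test as address_needs_mapping(): an address needs bounce-buffer
 * help iff it has bits set above the device mask. */
static bool needs_mapping(const struct fake_device *dev, dma_addr_t addr)
{
        dma_addr_t mask = 0xffffffff;   /* default: 32-bit device */

        if (dev && dev->dma_mask)
                mask = *dev->dma_mask;
        return (addr & ~mask) != 0;
}

int main(void)
{
        dma_addr_t mask64 = ~0ULL;
        struct fake_device dev32 = { .dma_mask = NULL };
        struct fake_device dev64 = { .dma_mask = &mask64 };

        printf("%d\n", needs_mapping(&dev32, 0x1ffffffffULL)); /* 1: above 4 GiB */
        printf("%d\n", needs_mapping(&dev64, 0x1ffffffffULL)); /* 0: fits mask */
        return 0;
}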
36083 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_32.h
36084 ===================================================================
36085 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
36086 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_32.h 2007-06-12 13:14:02.000000000 +0200
36087 @@ -0,0 +1,155 @@
36088 +/*
36089 + * fixmap.h: compile-time virtual memory allocation
36090 + *
36091 + * This file is subject to the terms and conditions of the GNU General Public
36092 + * License. See the file "COPYING" in the main directory of this archive
36093 + * for more details.
36094 + *
36095 + * Copyright (C) 1998 Ingo Molnar
36096 + *
36097 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
36098 + */
36099 +
36100 +#ifndef _ASM_FIXMAP_H
36101 +#define _ASM_FIXMAP_H
36102 +
36103 +
36104 +/* used by vmalloc.c, vsyscall.lds.S.
36105 + *
36106 + * Leave one empty page between vmalloc'ed areas and
36107 + * the start of the fixmap.
36108 + */
36109 +extern unsigned long __FIXADDR_TOP;
36110 +
36111 +#ifndef __ASSEMBLY__
36112 +#include <linux/kernel.h>
36113 +#include <asm/acpi.h>
36114 +#include <asm/apicdef.h>
36115 +#include <asm/page.h>
36116 +#ifdef CONFIG_HIGHMEM
36117 +#include <linux/threads.h>
36118 +#include <asm/kmap_types.h>
36119 +#endif
36120 +
36121 +/*
36122 + * Here we define all the compile-time 'special' virtual
36123 + * addresses. The point is to have a constant address at
36124 + * compile time, but to set the physical address only
36125 + * in the boot process. We allocate these special addresses
36126 + * from the end of virtual memory (0xfffff000) backwards.
36127 + * Also this lets us do fail-safe vmalloc(): we
36128 + * can guarantee that these special addresses and
36129 + * vmalloc()-ed addresses never overlap.
36130 + *
36131 + * these 'compile-time allocated' memory buffers are
36132 + * fixed-size 4k pages (or larger if used with an increment
36133 + * higher than 1); use fixmap_set(idx,phys) to associate
36134 + * physical memory with fixmap indices.
36135 + *
36136 + * TLB entries of such buffers will not be flushed across
36137 + * task switches.
36138 + */
36139 +enum fixed_addresses {
36140 + FIX_HOLE,
36141 + FIX_VDSO,
36142 +#ifdef CONFIG_X86_LOCAL_APIC
36143 + FIX_APIC_BASE, /* local (CPU) APIC -- required for SMP or not */
36144 +#endif
36145 +#ifdef CONFIG_X86_IO_APIC
36146 + FIX_IO_APIC_BASE_0,
36147 + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
36148 +#endif
36149 +#ifdef CONFIG_X86_VISWS_APIC
36150 + FIX_CO_CPU, /* Cobalt timer */
36151 + FIX_CO_APIC, /* Cobalt APIC Redirection Table */
36152 + FIX_LI_PCIA, /* Lithium PCI Bridge A */
36153 + FIX_LI_PCIB, /* Lithium PCI Bridge B */
36154 +#endif
36155 +#ifdef CONFIG_X86_F00F_BUG
36156 + FIX_F00F_IDT, /* Virtual mapping for IDT */
36157 +#endif
36158 +#ifdef CONFIG_X86_CYCLONE_TIMER
36159 + FIX_CYCLONE_TIMER, /* cyclone timer register */
36160 +#endif
36161 +#ifdef CONFIG_HIGHMEM
36162 + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
36163 + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
36164 +#endif
36165 +#ifdef CONFIG_ACPI
36166 + FIX_ACPI_BEGIN,
36167 + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
36168 +#endif
36169 +#ifdef CONFIG_PCI_MMCONFIG
36170 + FIX_PCIE_MCFG,
36171 +#endif
36172 + FIX_SHARED_INFO,
36173 +#define NR_FIX_ISAMAPS 256
36174 + FIX_ISAMAP_END,
36175 + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
36176 + __end_of_permanent_fixed_addresses,
36177 + /* temporary boot-time mappings, used before ioremap() is functional */
36178 +#define NR_FIX_BTMAPS 16
36179 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
36180 + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
36181 + FIX_WP_TEST,
36182 + __end_of_fixed_addresses
36183 +};
36184 +
36185 +extern void set_fixaddr_top(unsigned long top);
36186 +
36187 +extern void __set_fixmap(enum fixed_addresses idx,
36188 + maddr_t phys, pgprot_t flags);
36189 +
36190 +#define set_fixmap(idx, phys) \
36191 + __set_fixmap(idx, phys, PAGE_KERNEL)
36192 +/*
36193 + * Some hardware wants to get fixmapped without caching.
36194 + */
36195 +#define set_fixmap_nocache(idx, phys) \
36196 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
36197 +
36198 +#define clear_fixmap(idx) \
36199 + __set_fixmap(idx, 0, __pgprot(0))
36200 +
36201 +#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
36202 +
36203 +#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
36204 +#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
36205 +#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
36206 +#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
36207 +
36208 +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
36209 +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
36210 +
36211 +extern void __this_fixmap_does_not_exist(void);
36212 +
36213 +/*
36214 + * 'index to address' translation. If anyone tries to use the idx
36215 + * directly without translation, we catch the bug with a NULL-dereference
36216 + * kernel oops. Illegal ranges of incoming indices are caught too.
36217 + */
36218 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
36219 +{
36220 + /*
36221 + * this branch gets completely eliminated after inlining,
36222 + * except when someone tries to use fixaddr indices in an
36223 + * illegal way. (such as mixing up address types or using
36224 + * out-of-range indices).
36225 + *
36226 + * If it doesn't get removed, the linker will complain
36227 + * loudly with a reasonably clear error message..
36228 + */
36229 + if (idx >= __end_of_fixed_addresses)
36230 + __this_fixmap_does_not_exist();
36231 +
36232 + return __fix_to_virt(idx);
36233 +}
36234 +
36235 +static inline unsigned long virt_to_fix(const unsigned long vaddr)
36236 +{
36237 + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
36238 + return __virt_to_fix(vaddr);
36239 +}
36240 +
36241 +#endif /* !__ASSEMBLY__ */
36242 +#endif
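__fix_to_virt() above turns a fixmap index into a virtual address by counting pages downward from FIXADDR_TOP, and __virt_to_fix() reverses that. A standalone sketch of the same arithmetic; FIXADDR_TOP = 0xfffff000 and PAGE_SHIFT = 12 are illustrative constants taken from the comment above, whereas the real __FIXADDR_TOP is a variable the kernel may lower at boot via set_fixaddr_top():

#include <stdio.h>

#define PAGE_SHIFT   12
#define PAGE_SIZE    (1UL << PAGE_SHIFT)
#define PAGE_MASK    (~(PAGE_SIZE - 1))

/* Assumed value for the sketch only. */
#define FIXADDR_TOP  0xfffff000UL

/* Mirrors __fix_to_virt()/__virt_to_fix(): index 0 is the page at
 * FIXADDR_TOP, index 1 the page below it, and so on downwards. */
#define fix_to_virt(idx)  (FIXADDR_TOP - ((unsigned long)(idx) << PAGE_SHIFT))
#define virt_to_fix(va)   ((FIXADDR_TOP - ((va) & PAGE_MASK)) >> PAGE_SHIFT)

int main(void)
{
        unsigned int idx;

        for (idx = 0; idx < 4; idx++) {
                unsigned long va = fix_to_virt(idx);

                printf("idx %u -> va %#lx -> idx %lu\n",
                       idx, va, virt_to_fix(va));
        }
        return 0;
}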
36243 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/gnttab_dma.h
36244 ===================================================================
36245 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
36246 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/gnttab_dma.h 2007-08-06 15:10:49.000000000 +0200
36247 @@ -0,0 +1,41 @@
36248 +/*
36249 + * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
36250 + * Copyright (c) 2007 Isaku Yamahata <yamahata at valinux co jp>
36251 + * VA Linux Systems Japan K.K.
36252 + *
36253 + * This program is free software; you can redistribute it and/or modify
36254 + * it under the terms of the GNU General Public License as published by
36255 + * the Free Software Foundation; either version 2 of the License, or
36256 + * (at your option) any later version.
36257 + *
36258 + * This program is distributed in the hope that it will be useful,
36259 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
36260 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
36261 + * GNU General Public License for more details.
36262 + *
36263 + * You should have received a copy of the GNU General Public License
36264 + * along with this program; if not, write to the Free Software
36265 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
36266 + */
36267 +
36268 +#ifndef _ASM_I386_GNTTAB_DMA_H
36269 +#define _ASM_I386_GNTTAB_DMA_H
36270 +
36271 +static inline int gnttab_dma_local_pfn(struct page *page)
36272 +{
36273 + /* Has it become a local MFN? */
36274 + return pfn_valid(mfn_to_local_pfn(pfn_to_mfn(page_to_pfn(page))));
36275 +}
36276 +
36277 +static inline maddr_t gnttab_dma_map_page(struct page *page)
36278 +{
36279 + __gnttab_dma_map_page(page);
36280 + return ((maddr_t)pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT);
36281 +}
36282 +
36283 +static inline void gnttab_dma_unmap_page(maddr_t maddr)
36284 +{
36285 + __gnttab_dma_unmap_page(virt_to_page(bus_to_virt(maddr)));
36286 +}
36287 +
36288 +#endif /* _ASM_I386_GNTTAB_DMA_H */
36289 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/highmem.h
36290 ===================================================================
36291 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
36292 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/highmem.h 2008-10-29 09:55:56.000000000 +0100
36293 @@ -0,0 +1,97 @@
36294 +/*
36295 + * highmem.h: virtual kernel memory mappings for high memory
36296 + *
36297 + * Used in CONFIG_HIGHMEM systems for memory pages which
36298 + * are not addressable by direct kernel virtual addresses.
36299 + *
36300 + * Copyright (C) 1999 Gerhard Wichert, Siemens AG
36301 + * Gerhard.Wichert@pdb.siemens.de
36302 + *
36303 + *
36304 + * Redesigned the x86 32-bit VM architecture to deal with
36305 + * up to 16 Terabyte physical memory. With current x86 CPUs
36306 + * we now support up to 64 Gigabytes physical RAM.
36307 + *
36308 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
36309 + */
36310 +
36311 +#ifndef _ASM_HIGHMEM_H
36312 +#define _ASM_HIGHMEM_H
36313 +
36314 +#ifdef __KERNEL__
36315 +
36316 +#include <linux/interrupt.h>
36317 +#include <linux/threads.h>
36318 +#include <asm/kmap_types.h>
36319 +#include <asm/tlbflush.h>
36320 +
36321 +/* declarations for highmem.c */
36322 +extern unsigned long highstart_pfn, highend_pfn;
36323 +
36324 +extern pte_t *kmap_pte;
36325 +extern pgprot_t kmap_prot;
36326 +extern pte_t *pkmap_page_table;
36327 +
36328 +/*
36329 + * Right now we initialize only a single pte table. It can be extended
36330 + * easily; subsequent pte tables have to be allocated in one physical
36331 + * chunk of RAM.
36332 + */
36333 +#ifdef CONFIG_X86_PAE
36334 +#define LAST_PKMAP 512
36335 +#else
36336 +#define LAST_PKMAP 1024
36337 +#endif
36338 +/*
36339 + * Ordering is:
36340 + *
36341 + * FIXADDR_TOP
36342 + * fixed_addresses
36343 + * FIXADDR_START
36344 + * temp fixed addresses
36345 + * FIXADDR_BOOT_START
36346 + * Persistent kmap area
36347 + * PKMAP_BASE
36348 + * VMALLOC_END
36349 + * Vmalloc area
36350 + * VMALLOC_START
36351 + * high_memory
36352 + */
36353 +#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
36354 +#define LAST_PKMAP_MASK (LAST_PKMAP-1)
36355 +#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
36356 +#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
36357 +
36358 +extern void * FASTCALL(kmap_high(struct page *page));
36359 +extern void FASTCALL(kunmap_high(struct page *page));
36360 +
36361 +void *kmap(struct page *page);
36362 +void kunmap(struct page *page);
36363 +void *kmap_atomic(struct page *page, enum km_type type);
36364 +void *kmap_atomic_pte(struct page *page, enum km_type type);
36365 +void kunmap_atomic(void *kvaddr, enum km_type type);
36366 +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
36367 +struct page *kmap_atomic_to_page(void *ptr);
36368 +
36369 +#define flush_cache_kmaps() do { } while (0)
36370 +
36371 +void clear_highpage(struct page *);
36372 +static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
36373 +{
36374 + clear_highpage(page);
36375 +}
36376 +#define __HAVE_ARCH_CLEAR_HIGHPAGE
36377 +#define __HAVE_ARCH_CLEAR_USER_HIGHPAGE
36378 +
36379 +void copy_highpage(struct page *to, struct page *from);
36380 +static inline void copy_user_highpage(struct page *to, struct page *from,
36381 + unsigned long vaddr)
36382 +{
36383 + copy_highpage(to, from);
36384 +}
36385 +#define __HAVE_ARCH_COPY_HIGHPAGE
36386 +#define __HAVE_ARCH_COPY_USER_HIGHPAGE
36387 +
36388 +#endif /* __KERNEL__ */
36389 +
36390 +#endif /* _ASM_HIGHMEM_H */
36391 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_32.h
36392 ===================================================================
36393 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
36394 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_32.h 2008-11-25 12:22:34.000000000 +0100
36395 @@ -0,0 +1,409 @@
36396 +/******************************************************************************
36397 + * hypercall.h
36398 + *
36399 + * Linux-specific hypervisor handling.
36400 + *
36401 + * Copyright (c) 2002-2004, K A Fraser
36402 + *
36403 + * This program is free software; you can redistribute it and/or
36404 + * modify it under the terms of the GNU General Public License version 2
36405 + * as published by the Free Software Foundation; or, when distributed
36406 + * separately from the Linux kernel or incorporated into other
36407 + * software packages, subject to the following license:
36408 + *
36409 + * Permission is hereby granted, free of charge, to any person obtaining a copy
36410 + * of this source file (the "Software"), to deal in the Software without
36411 + * restriction, including without limitation the rights to use, copy, modify,
36412 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
36413 + * and to permit persons to whom the Software is furnished to do so, subject to
36414 + * the following conditions:
36415 + *
36416 + * The above copyright notice and this permission notice shall be included in
36417 + * all copies or substantial portions of the Software.
36418 + *
36419 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
36420 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36421 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36422 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36423 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
36424 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
36425 + * IN THE SOFTWARE.
36426 + */
36427 +
36428 +#ifndef __HYPERCALL_H__
36429 +#define __HYPERCALL_H__
36430 +
36431 +#include <linux/string.h> /* memcpy() */
36432 +#include <linux/stringify.h>
36433 +
36434 +#ifndef __HYPERVISOR_H__
36435 +# error "please don't include this file directly"
36436 +#endif
36437 +
36438 +#ifdef CONFIG_XEN
36439 +#define HYPERCALL_STR(name) \
36440 + "call hypercall_page + ("__stringify(__HYPERVISOR_##name)" * 32)"
36441 +#else
36442 +#define HYPERCALL_STR(name) \
36443 + "mov hypercall_stubs,%%eax; " \
36444 + "add $("__stringify(__HYPERVISOR_##name)" * 32),%%eax; "\
36445 + "call *%%eax"
36446 +#endif
36447 +
36448 +#define _hypercall0(type, name) \
36449 +({ \
36450 + type __res; \
36451 + asm volatile ( \
36452 + HYPERCALL_STR(name) \
36453 + : "=a" (__res) \
36454 + : \
36455 + : "memory" ); \
36456 + __res; \
36457 +})
36458 +
36459 +#define _hypercall1(type, name, a1) \
36460 +({ \
36461 + type __res; \
36462 + long __ign1; \
36463 + asm volatile ( \
36464 + HYPERCALL_STR(name) \
36465 + : "=a" (__res), "=b" (__ign1) \
36466 + : "1" ((long)(a1)) \
36467 + : "memory" ); \
36468 + __res; \
36469 +})
36470 +
36471 +#define _hypercall2(type, name, a1, a2) \
36472 +({ \
36473 + type __res; \
36474 + long __ign1, __ign2; \
36475 + asm volatile ( \
36476 + HYPERCALL_STR(name) \
36477 + : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \
36478 + : "1" ((long)(a1)), "2" ((long)(a2)) \
36479 + : "memory" ); \
36480 + __res; \
36481 +})
36482 +
36483 +#define _hypercall3(type, name, a1, a2, a3) \
36484 +({ \
36485 + type __res; \
36486 + long __ign1, __ign2, __ign3; \
36487 + asm volatile ( \
36488 + HYPERCALL_STR(name) \
36489 + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
36490 + "=d" (__ign3) \
36491 + : "1" ((long)(a1)), "2" ((long)(a2)), \
36492 + "3" ((long)(a3)) \
36493 + : "memory" ); \
36494 + __res; \
36495 +})
36496 +
36497 +#define _hypercall4(type, name, a1, a2, a3, a4) \
36498 +({ \
36499 + type __res; \
36500 + long __ign1, __ign2, __ign3, __ign4; \
36501 + asm volatile ( \
36502 + HYPERCALL_STR(name) \
36503 + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
36504 + "=d" (__ign3), "=S" (__ign4) \
36505 + : "1" ((long)(a1)), "2" ((long)(a2)), \
36506 + "3" ((long)(a3)), "4" ((long)(a4)) \
36507 + : "memory" ); \
36508 + __res; \
36509 +})
36510 +
36511 +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
36512 +({ \
36513 + type __res; \
36514 + long __ign1, __ign2, __ign3, __ign4, __ign5; \
36515 + asm volatile ( \
36516 + HYPERCALL_STR(name) \
36517 + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
36518 + "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \
36519 + : "1" ((long)(a1)), "2" ((long)(a2)), \
36520 + "3" ((long)(a3)), "4" ((long)(a4)), \
36521 + "5" ((long)(a5)) \
36522 + : "memory" ); \
36523 + __res; \
36524 +})
36525 +
36526 +static inline int __must_check
36527 +HYPERVISOR_set_trap_table(
36528 + const trap_info_t *table)
36529 +{
36530 + return _hypercall1(int, set_trap_table, table);
36531 +}
36532 +
36533 +static inline int __must_check
36534 +HYPERVISOR_mmu_update(
36535 + mmu_update_t *req, unsigned int count, unsigned int *success_count,
36536 + domid_t domid)
36537 +{
36538 + return _hypercall4(int, mmu_update, req, count, success_count, domid);
36539 +}
36540 +
36541 +static inline int __must_check
36542 +HYPERVISOR_mmuext_op(
36543 + struct mmuext_op *op, unsigned int count, unsigned int *success_count,
36544 + domid_t domid)
36545 +{
36546 + return _hypercall4(int, mmuext_op, op, count, success_count, domid);
36547 +}
36548 +
36549 +static inline int __must_check
36550 +HYPERVISOR_set_gdt(
36551 + unsigned long *frame_list, unsigned int entries)
36552 +{
36553 + return _hypercall2(int, set_gdt, frame_list, entries);
36554 +}
36555 +
36556 +static inline int __must_check
36557 +HYPERVISOR_stack_switch(
36558 + unsigned long ss, unsigned long esp)
36559 +{
36560 + return _hypercall2(int, stack_switch, ss, esp);
36561 +}
36562 +
36563 +static inline int __must_check
36564 +HYPERVISOR_set_callbacks(
36565 + unsigned long event_selector, unsigned long event_address,
36566 + unsigned long failsafe_selector, unsigned long failsafe_address)
36567 +{
36568 + return _hypercall4(int, set_callbacks,
36569 + event_selector, event_address,
36570 + failsafe_selector, failsafe_address);
36571 +}
36572 +
36573 +static inline int
36574 +HYPERVISOR_fpu_taskswitch(
36575 + int set)
36576 +{
36577 + return _hypercall1(int, fpu_taskswitch, set);
36578 +}
36579 +
36580 +static inline int __must_check
36581 +HYPERVISOR_sched_op_compat(
36582 + int cmd, unsigned long arg)
36583 +{
36584 + return _hypercall2(int, sched_op_compat, cmd, arg);
36585 +}
36586 +
36587 +static inline int __must_check
36588 +HYPERVISOR_sched_op(
36589 + int cmd, void *arg)
36590 +{
36591 + return _hypercall2(int, sched_op, cmd, arg);
36592 +}
36593 +
36594 +static inline long __must_check
36595 +HYPERVISOR_set_timer_op(
36596 + u64 timeout)
36597 +{
36598 + unsigned long timeout_hi = (unsigned long)(timeout>>32);
36599 + unsigned long timeout_lo = (unsigned long)timeout;
36600 + return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
36601 +}
36602 +
36603 +static inline int __must_check
36604 +HYPERVISOR_platform_op(
36605 + struct xen_platform_op *platform_op)
36606 +{
36607 + platform_op->interface_version = XENPF_INTERFACE_VERSION;
36608 + return _hypercall1(int, platform_op, platform_op);
36609 +}
36610 +
36611 +static inline int __must_check
36612 +HYPERVISOR_set_debugreg(
36613 + unsigned int reg, unsigned long value)
36614 +{
36615 + return _hypercall2(int, set_debugreg, reg, value);
36616 +}
36617 +
36618 +static inline unsigned long __must_check
36619 +HYPERVISOR_get_debugreg(
36620 + unsigned int reg)
36621 +{
36622 + return _hypercall1(unsigned long, get_debugreg, reg);
36623 +}
36624 +
36625 +static inline int __must_check
36626 +HYPERVISOR_update_descriptor(
36627 + u64 ma, u64 desc)
36628 +{
36629 + return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
36630 +}
36631 +
36632 +static inline int __must_check
36633 +HYPERVISOR_memory_op(
36634 + unsigned int cmd, void *arg)
36635 +{
36636 + return _hypercall2(int, memory_op, cmd, arg);
36637 +}
36638 +
36639 +static inline int __must_check
36640 +HYPERVISOR_multicall(
36641 + multicall_entry_t *call_list, unsigned int nr_calls)
36642 +{
36643 + return _hypercall2(int, multicall, call_list, nr_calls);
36644 +}
36645 +
36646 +static inline int __must_check
36647 +HYPERVISOR_update_va_mapping(
36648 + unsigned long va, pte_t new_val, unsigned long flags)
36649 +{
36650 + unsigned long pte_hi = 0;
36651 +#ifdef CONFIG_X86_PAE
36652 + pte_hi = new_val.pte_high;
36653 +#endif
36654 + return _hypercall4(int, update_va_mapping, va,
36655 + new_val.pte_low, pte_hi, flags);
36656 +}
36657 +
36658 +static inline int __must_check
36659 +HYPERVISOR_event_channel_op(
36660 + int cmd, void *arg)
36661 +{
36662 + int rc = _hypercall2(int, event_channel_op, cmd, arg);
36663 +
36664 +#if CONFIG_XEN_COMPAT <= 0x030002
36665 + if (unlikely(rc == -ENOSYS)) {
36666 + struct evtchn_op op;
36667 + op.cmd = cmd;
36668 + memcpy(&op.u, arg, sizeof(op.u));
36669 + rc = _hypercall1(int, event_channel_op_compat, &op);
36670 + memcpy(arg, &op.u, sizeof(op.u));
36671 + }
36672 +#endif
36673 +
36674 + return rc;
36675 +}
36676 +
36677 +static inline int __must_check
36678 +HYPERVISOR_xen_version(
36679 + int cmd, void *arg)
36680 +{
36681 + return _hypercall2(int, xen_version, cmd, arg);
36682 +}
36683 +
36684 +static inline int __must_check
36685 +HYPERVISOR_console_io(
36686 + int cmd, unsigned int count, char *str)
36687 +{
36688 + return _hypercall3(int, console_io, cmd, count, str);
36689 +}
36690 +
36691 +static inline int __must_check
36692 +HYPERVISOR_physdev_op(
36693 + int cmd, void *arg)
36694 +{
36695 + int rc = _hypercall2(int, physdev_op, cmd, arg);
36696 +
36697 +#if CONFIG_XEN_COMPAT <= 0x030002
36698 + if (unlikely(rc == -ENOSYS)) {
36699 + struct physdev_op op;
36700 + op.cmd = cmd;
36701 + memcpy(&op.u, arg, sizeof(op.u));
36702 + rc = _hypercall1(int, physdev_op_compat, &op);
36703 + memcpy(arg, &op.u, sizeof(op.u));
36704 + }
36705 +#endif
36706 +
36707 + return rc;
36708 +}
36709 +
36710 +static inline int __must_check
36711 +HYPERVISOR_grant_table_op(
36712 + unsigned int cmd, void *uop, unsigned int count)
36713 +{
36714 + return _hypercall3(int, grant_table_op, cmd, uop, count);
36715 +}
36716 +
36717 +static inline int __must_check
36718 +HYPERVISOR_update_va_mapping_otherdomain(
36719 + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
36720 +{
36721 + unsigned long pte_hi = 0;
36722 +#ifdef CONFIG_X86_PAE
36723 + pte_hi = new_val.pte_high;
36724 +#endif
36725 + return _hypercall5(int, update_va_mapping_otherdomain, va,
36726 + new_val.pte_low, pte_hi, flags, domid);
36727 +}
36728 +
36729 +static inline int __must_check
36730 +HYPERVISOR_vm_assist(
36731 + unsigned int cmd, unsigned int type)
36732 +{
36733 + return _hypercall2(int, vm_assist, cmd, type);
36734 +}
36735 +
36736 +static inline int __must_check
36737 +HYPERVISOR_vcpu_op(
36738 + int cmd, unsigned int vcpuid, void *extra_args)
36739 +{
36740 + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
36741 +}
36742 +
36743 +static inline int __must_check
36744 +HYPERVISOR_suspend(
36745 + unsigned long srec)
36746 +{
36747 + struct sched_shutdown sched_shutdown = {
36748 + .reason = SHUTDOWN_suspend
36749 + };
36750 +
36751 + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
36752 + &sched_shutdown, srec);
36753 +
36754 +#if CONFIG_XEN_COMPAT <= 0x030002
36755 + if (rc == -ENOSYS)
36756 + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
36757 + SHUTDOWN_suspend, srec);
36758 +#endif
36759 +
36760 + return rc;
36761 +}
36762 +
36763 +#if CONFIG_XEN_COMPAT <= 0x030002
36764 +static inline int
36765 +HYPERVISOR_nmi_op(
36766 + unsigned long op, void *arg)
36767 +{
36768 + return _hypercall2(int, nmi_op, op, arg);
36769 +}
36770 +#endif
36771 +
36772 +#ifndef CONFIG_XEN
36773 +static inline unsigned long __must_check
36774 +HYPERVISOR_hvm_op(
36775 + int op, void *arg)
36776 +{
36777 + return _hypercall2(unsigned long, hvm_op, op, arg);
36778 +}
36779 +#endif
36780 +
36781 +static inline int __must_check
36782 +HYPERVISOR_callback_op(
36783 + int cmd, const void *arg)
36784 +{
36785 + return _hypercall2(int, callback_op, cmd, arg);
36786 +}
36787 +
36788 +static inline int __must_check
36789 +HYPERVISOR_xenoprof_op(
36790 + int op, void *arg)
36791 +{
36792 + return _hypercall2(int, xenoprof_op, op, arg);
36793 +}
36794 +
36795 +static inline int __must_check
36796 +HYPERVISOR_kexec_op(
36797 + unsigned long op, void *args)
36798 +{
36799 + return _hypercall2(int, kexec_op, op, args);
36800 +}
36801 +
36802 +
36803 +
36804 +#endif /* __HYPERCALL_H__ */
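On 32-bit x86 the hypercall arguments travel in 32-bit registers, which is why HYPERVISOR_set_timer_op() and HYPERVISOR_update_descriptor() above split each u64 into a low and a high word before handing it to _hypercall2()/_hypercall4(). A standalone sketch of that split and of the reassembly the receiving side conceptually performs; the hypercall machinery itself is not reproduced here:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Split a 64-bit value the way HYPERVISOR_set_timer_op() does before
 * passing it to _hypercall2(): low word first, high word second. */
static void split_u64(uint64_t val, uint32_t *lo, uint32_t *hi)
{
        *lo = (uint32_t)val;
        *hi = (uint32_t)(val >> 32);
}

/* What the receiver conceptually does with the two words. */
static uint64_t join_u64(uint32_t lo, uint32_t hi)
{
        return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
        uint64_t timeout = 0x0123456789abcdefULL;
        uint32_t lo, hi;

        split_u64(timeout, &lo, &hi);
        printf("lo=%08x hi=%08x\n", lo, hi);
        assert(join_u64(lo, hi) == timeout);
        return 0;
}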
36805 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/hypervisor.h
36806 ===================================================================
36807 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
36808 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/hypervisor.h 2008-02-20 09:32:49.000000000 +0100
36809 @@ -0,0 +1,259 @@
36810 +/******************************************************************************
36811 + * hypervisor.h
36812 + *
36813 + * Linux-specific hypervisor handling.
36814 + *
36815 + * Copyright (c) 2002-2004, K A Fraser
36816 + *
36817 + * This program is free software; you can redistribute it and/or
36818 + * modify it under the terms of the GNU General Public License version 2
36819 + * as published by the Free Software Foundation; or, when distributed
36820 + * separately from the Linux kernel or incorporated into other
36821 + * software packages, subject to the following license:
36822 + *
36823 + * Permission is hereby granted, free of charge, to any person obtaining a copy
36824 + * of this source file (the "Software"), to deal in the Software without
36825 + * restriction, including without limitation the rights to use, copy, modify,
36826 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
36827 + * and to permit persons to whom the Software is furnished to do so, subject to
36828 + * the following conditions:
36829 + *
36830 + * The above copyright notice and this permission notice shall be included in
36831 + * all copies or substantial portions of the Software.
36832 + *
36833 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
36834 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36835 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36836 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36837 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
36838 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
36839 + * IN THE SOFTWARE.
36840 + */
36841 +
36842 +#ifndef __HYPERVISOR_H__
36843 +#define __HYPERVISOR_H__
36844 +
36845 +#include <linux/types.h>
36846 +#include <linux/kernel.h>
36847 +#include <linux/version.h>
36848 +#include <linux/errno.h>
36849 +#include <xen/interface/xen.h>
36850 +#include <xen/interface/platform.h>
36851 +#include <xen/interface/event_channel.h>
36852 +#include <xen/interface/physdev.h>
36853 +#include <xen/interface/sched.h>
36854 +#include <xen/interface/nmi.h>
36855 +#include <asm/ptrace.h>
36856 +#include <asm/page.h>
36857 +#if defined(__i386__)
36858 +# ifdef CONFIG_X86_PAE
36859 +# include <asm-generic/pgtable-nopud.h>
36860 +# else
36861 +# include <asm-generic/pgtable-nopmd.h>
36862 +# endif
36863 +#elif defined(__x86_64__) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
36864 +# include <asm-generic/pgtable-nopud.h>
36865 +#endif
36866 +
36867 +extern shared_info_t *HYPERVISOR_shared_info;
36868 +
36869 +#define vcpu_info(cpu) (HYPERVISOR_shared_info->vcpu_info + (cpu))
36870 +#ifdef CONFIG_SMP
36871 +#define current_vcpu_info() vcpu_info(smp_processor_id())
36872 +#else
36873 +#define current_vcpu_info() vcpu_info(0)
36874 +#endif
36875 +
36876 +#ifdef CONFIG_X86_32
36877 +extern unsigned long hypervisor_virt_start;
36878 +#endif
36879 +
36880 +/* arch/xen/i386/kernel/setup.c */
36881 +extern start_info_t *xen_start_info;
36882 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
36883 +#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
36884 +#else
36885 +#define is_initial_xendomain() 0
36886 +#endif
36887 +
36888 +/* arch/xen/kernel/evtchn.c */
36889 +/* Force a proper event-channel callback from Xen. */
36890 +void force_evtchn_callback(void);
36891 +
36892 +/* arch/xen/kernel/process.c */
36893 +void xen_cpu_idle (void);
36894 +
36895 +/* arch/xen/i386/kernel/hypervisor.c */
36896 +void do_hypervisor_callback(struct pt_regs *regs);
36897 +
36898 +/* arch/xen/i386/mm/hypervisor.c */
36899 +/*
36900 + * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should be already
36901 + * be MACHINE addresses.
36902 + */
36903 +
36904 +void xen_pt_switch(unsigned long ptr);
36905 +void xen_new_user_pt(unsigned long ptr); /* x86_64 only */
36906 +void xen_load_gs(unsigned int selector); /* x86_64 only */
36907 +void xen_tlb_flush(void);
36908 +void xen_invlpg(unsigned long ptr);
36909 +
36910 +void xen_l1_entry_update(pte_t *ptr, pte_t val);
36911 +void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
36912 +void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */
36913 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */
36914 +void xen_pgd_pin(unsigned long ptr);
36915 +void xen_pgd_unpin(unsigned long ptr);
36916 +
36917 +void xen_set_ldt(const void *ptr, unsigned int ents);
36918 +
36919 +#ifdef CONFIG_SMP
36920 +#include <linux/cpumask.h>
36921 +void xen_tlb_flush_all(void);
36922 +void xen_invlpg_all(unsigned long ptr);
36923 +void xen_tlb_flush_mask(cpumask_t *mask);
36924 +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr);
36925 +#endif
36926 +
36927 +/* Returns zero on success else negative errno. */
36928 +int xen_create_contiguous_region(
36929 + unsigned long vstart, unsigned int order, unsigned int address_bits);
36930 +void xen_destroy_contiguous_region(
36931 + unsigned long vstart, unsigned int order);
36932 +
36933 +struct page;
36934 +
36935 +int xen_limit_pages_to_max_mfn(
36936 + struct page *pages, unsigned int order, unsigned int address_bits);
36937 +
36938 +/* Turn jiffies into Xen system time. */
36939 +u64 jiffies_to_st(unsigned long jiffies);
36940 +
36941 +#ifdef CONFIG_XEN_SCRUB_PAGES
36942 +void scrub_pages(void *, unsigned int);
36943 +#else
36944 +#define scrub_pages(_p,_n) ((void)0)
36945 +#endif
36946 +
36947 +#include <xen/hypercall.h>
36948 +
36949 +#if defined(CONFIG_X86_64)
36950 +#define MULTI_UVMFLAGS_INDEX 2
36951 +#define MULTI_UVMDOMID_INDEX 3
36952 +#else
36953 +#define MULTI_UVMFLAGS_INDEX 3
36954 +#define MULTI_UVMDOMID_INDEX 4
36955 +#endif
36956 +
36957 +#ifdef CONFIG_XEN
36958 +#define is_running_on_xen() 1
36959 +#else
36960 +extern char *hypercall_stubs;
36961 +#define is_running_on_xen() (!!hypercall_stubs)
36962 +#endif
36963 +
36964 +static inline int
36965 +HYPERVISOR_yield(
36966 + void)
36967 +{
36968 + int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
36969 +
36970 +#if CONFIG_XEN_COMPAT <= 0x030002
36971 + if (rc == -ENOSYS)
36972 + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
36973 +#endif
36974 +
36975 + return rc;
36976 +}
36977 +
36978 +static inline int
36979 +HYPERVISOR_block(
36980 + void)
36981 +{
36982 + int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
36983 +
36984 +#if CONFIG_XEN_COMPAT <= 0x030002
36985 + if (rc == -ENOSYS)
36986 + rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0);
36987 +#endif
36988 +
36989 + return rc;
36990 +}
36991 +
36992 +static inline void /*__noreturn*/
36993 +HYPERVISOR_shutdown(
36994 + unsigned int reason)
36995 +{
36996 + struct sched_shutdown sched_shutdown = {
36997 + .reason = reason
36998 + };
36999 +
37000 + VOID(HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown));
37001 +#if CONFIG_XEN_COMPAT <= 0x030002
37002 + VOID(HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason));
37003 +#endif
37004 + /* Don't recurse needlessly. */
37005 + BUG_ON(reason != SHUTDOWN_crash);
37006 + for(;;);
37007 +}
37008 +
37009 +static inline int __must_check
37010 +HYPERVISOR_poll(
37011 + evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
37012 +{
37013 + int rc;
37014 + struct sched_poll sched_poll = {
37015 + .nr_ports = nr_ports,
37016 + .timeout = jiffies_to_st(timeout)
37017 + };
37018 + set_xen_guest_handle(sched_poll.ports, ports);
37019 +
37020 + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
37021 +#if CONFIG_XEN_COMPAT <= 0x030002
37022 + if (rc == -ENOSYS)
37023 + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
37024 +#endif
37025 +
37026 + return rc;
37027 +}
37028 +
37029 +#ifdef CONFIG_XEN
37030 +
37031 +static inline void
37032 +MULTI_update_va_mapping(
37033 + multicall_entry_t *mcl, unsigned long va,
37034 + pte_t new_val, unsigned long flags)
37035 +{
37036 + mcl->op = __HYPERVISOR_update_va_mapping;
37037 + mcl->args[0] = va;
37038 +#if defined(CONFIG_X86_64)
37039 + mcl->args[1] = new_val.pte;
37040 +#elif defined(CONFIG_X86_PAE)
37041 + mcl->args[1] = new_val.pte_low;
37042 + mcl->args[2] = new_val.pte_high;
37043 +#else
37044 + mcl->args[1] = new_val.pte_low;
37045 + mcl->args[2] = 0;
37046 +#endif
37047 + mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
37048 +}
37049 +
37050 +static inline void
37051 +MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd,
37052 + void *uop, unsigned int count)
37053 +{
37054 + mcl->op = __HYPERVISOR_grant_table_op;
37055 + mcl->args[0] = cmd;
37056 + mcl->args[1] = (unsigned long)uop;
37057 + mcl->args[2] = count;
37058 +}
37059 +
37060 +#else /* !defined(CONFIG_XEN) */
37061 +
37062 +/* Multicalls not supported for HVM guests. */
37063 +#define MULTI_update_va_mapping(a,b,c,d) ((void)0)
37064 +#define MULTI_grant_table_op(a,b,c,d) ((void)0)
37065 +
37066 +#endif
37067 +
37068 +#endif /* __HYPERVISOR_H__ */
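Several wrappers above (HYPERVISOR_yield(), HYPERVISOR_block(), HYPERVISOR_poll(), HYPERVISOR_suspend()) follow the same shape: issue the current hypercall and, when built with CONFIG_XEN_COMPAT <= 0x030002, retry the older *_compat variant if the hypervisor answers -ENOSYS. A standalone sketch of that fallback pattern; the two functions and the SCHEDOP_yield value below are placeholders standing in for the real hypercalls:

#include <errno.h>
#include <stdio.h>

#define SCHEDOP_yield 0                 /* placeholder value for the sketch */

/* Stubs standing in for the hypercalls: pretend the hypervisor is too
 * old to know the new scheduler op. */
static int new_sched_op(int cmd, void *arg)
{
        (void)cmd; (void)arg;
        return -ENOSYS;
}

static int compat_sched_op(int cmd, unsigned long arg)
{
        (void)cmd; (void)arg;
        return 0;
}

/* Same shape as HYPERVISOR_yield() above: prefer the new interface,
 * fall back to the compat one only on -ENOSYS. */
static int do_yield(void)
{
        int rc = new_sched_op(SCHEDOP_yield, NULL);

        if (rc == -ENOSYS)
                rc = compat_sched_op(SCHEDOP_yield, 0);
        return rc;
}

int main(void)
{
        printf("yield -> %d\n", do_yield());
        return 0;
}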
37069 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/io_32.h
37070 ===================================================================
37071 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
37072 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/io_32.h 2007-08-16 18:07:01.000000000 +0200
37073 @@ -0,0 +1,389 @@
37074 +#ifndef _ASM_IO_H
37075 +#define _ASM_IO_H
37076 +
37077 +#include <linux/string.h>
37078 +#include <linux/compiler.h>
37079 +
37080 +/*
37081 + * This file contains the definitions for the x86 IO instructions
37082 + * inb/inw/inl/outb/outw/outl and the "string versions" of the same
37083 + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
37084 + * versions of the single-IO instructions (inb_p/inw_p/..).
37085 + *
37086 + * This file is not meant to be obfuscating: it's just complicated
37087 + * to (a) handle it all in a way that makes gcc able to optimize it
37088 + * as well as possible and (b) trying to avoid writing the same thing
37089 + * over and over again with slight variations and possibly making a
37090 + * mistake somewhere.
37091 + */
37092 +
37093 +/*
37094 + * Thanks to James van Artsdalen for a better timing-fix than
37095 + * the two short jumps: using outb's to a nonexistent port seems
37096 + * to guarantee better timings even on fast machines.
37097 + *
37098 + * On the other hand, I'd like to be sure of a non-existent port:
37099 + * I feel a bit unsafe about using 0x80 (should be safe, though)
37100 + *
37101 + * Linus
37102 + */
37103 +
37104 + /*
37105 + * Bit simplified and optimized by Jan Hubicka
37106 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
37107 + *
37108 + * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
37109 + * isa_read[wl] and isa_write[wl] fixed
37110 + * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
37111 + */
37112 +
37113 +#define IO_SPACE_LIMIT 0xffff
37114 +
37115 +#define XQUAD_PORTIO_BASE 0xfe400000
37116 +#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
37117 +
37118 +#ifdef __KERNEL__
37119 +
37120 +#include <asm-generic/iomap.h>
37121 +
37122 +#include <linux/vmalloc.h>
37123 +#include <asm/fixmap.h>
37124 +
37125 +/*
37126 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
37127 + * access
37128 + */
37129 +#define xlate_dev_mem_ptr(p) __va(p)
37130 +
37131 +/*
37132 + * Convert a virtual cached pointer to an uncached pointer
37133 + */
37134 +#define xlate_dev_kmem_ptr(p) p
37135 +
37136 +/**
37137 + * virt_to_phys - map virtual addresses to physical
37138 + * @address: address to remap
37139 + *
37140 + * The returned physical address is the physical (CPU) mapping for
37141 + * the memory address given. It is only valid to use this function on
37142 + * addresses directly mapped or allocated via kmalloc.
37143 + *
37144 + * This function does not give bus mappings for DMA transfers. In
37145 + * almost all conceivable cases a device driver should not be using
37146 + * this function
37147 + */
37148 +
37149 +static inline unsigned long virt_to_phys(volatile void * address)
37150 +{
37151 + return __pa(address);
37152 +}
37153 +
37154 +/**
37155 + * phys_to_virt - map physical address to virtual
37156 + * @address: address to remap
37157 + *
37158 + * The returned virtual address is a current CPU mapping for
37159 + * the memory address given. It is only valid to use this function on
37160 + * addresses that have a kernel mapping
37161 + *
37162 + * This function does not handle bus mappings for DMA transfers. In
37163 + * almost all conceivable cases a device driver should not be using
37164 + * this function
37165 + */
37166 +
37167 +static inline void * phys_to_virt(unsigned long address)
37168 +{
37169 + return __va(address);
37170 +}
37171 +
37172 +/*
37173 + * Change "struct page" to physical address.
37174 + */
37175 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
37176 +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
37177 +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
37178 +
37179 +#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \
37180 + (unsigned long) bio_offset((bio)))
37181 +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
37182 + (unsigned long) (bv)->bv_offset)
37183 +
37184 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
37185 + (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
37186 + ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
37187 + bvec_to_pseudophys((vec2))))
37188 +
37189 +extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
37190 +
37191 +/**
37192 + * ioremap - map bus memory into CPU space
37193 + * @offset: bus address of the memory
37194 + * @size: size of the resource to map
37195 + *
37196 + * ioremap performs a platform specific sequence of operations to
37197 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
37198 + * writew/writel functions and the other mmio helpers. The returned
37199 + * address is not guaranteed to be usable directly as a virtual
37200 + * address.
37201 + */
37202 +
37203 +static inline void __iomem * ioremap(unsigned long offset, unsigned long size)
37204 +{
37205 + return __ioremap(offset, size, 0);
37206 +}
37207 +
37208 +extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size);
37209 +extern void iounmap(volatile void __iomem *addr);
37210 +
37211 +/*
37212 + * bt_ioremap() and bt_iounmap() are for temporary early boot-time
37213 + * mappings, before the real ioremap() is functional.
37214 + * A boot-time mapping is currently limited to at most 16 pages.
37215 + */
37216 +extern void *bt_ioremap(unsigned long offset, unsigned long size);
37217 +extern void bt_iounmap(void *addr, unsigned long size);
37218 +
37219 +/* Use early IO mappings for DMI because it's initialized early */
37220 +#define dmi_ioremap bt_ioremap
37221 +#define dmi_iounmap bt_iounmap
37222 +#define dmi_alloc alloc_bootmem
37223 +
37224 +/*
37225 + * ISA I/O bus memory addresses are 1:1 with the physical address.
37226 + */
37227 +#define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); })
37228 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
37229 +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
37230 +
37231 +/*
37232 + * However PCI ones are not necessarily 1:1 and therefore these interfaces
37233 + * are forbidden in portable PCI drivers.
37234 + *
37235 + * Allow them on x86 for legacy drivers, though.
37236 + */
37237 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
37238 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
37239 +
37240 +/*
37241 + * readX/writeX() are used to access memory mapped devices. On some
37242 + * architectures the memory mapped IO stuff needs to be accessed
37243 + * differently. On the x86 architecture, we just read/write the
37244 + * memory location directly.
37245 + */
37246 +
37247 +static inline unsigned char readb(const volatile void __iomem *addr)
37248 +{
37249 + return *(volatile unsigned char __force *) addr;
37250 +}
37251 +static inline unsigned short readw(const volatile void __iomem *addr)
37252 +{
37253 + return *(volatile unsigned short __force *) addr;
37254 +}
37255 +static inline unsigned int readl(const volatile void __iomem *addr)
37256 +{
37257 + return *(volatile unsigned int __force *) addr;
37258 +}
37259 +#define readb_relaxed(addr) readb(addr)
37260 +#define readw_relaxed(addr) readw(addr)
37261 +#define readl_relaxed(addr) readl(addr)
37262 +#define __raw_readb readb
37263 +#define __raw_readw readw
37264 +#define __raw_readl readl
37265 +
37266 +static inline void writeb(unsigned char b, volatile void __iomem *addr)
37267 +{
37268 + *(volatile unsigned char __force *) addr = b;
37269 +}
37270 +static inline void writew(unsigned short b, volatile void __iomem *addr)
37271 +{
37272 + *(volatile unsigned short __force *) addr = b;
37273 +}
37274 +static inline void writel(unsigned int b, volatile void __iomem *addr)
37275 +{
37276 + *(volatile unsigned int __force *) addr = b;
37277 +}
37278 +#define __raw_writeb writeb
37279 +#define __raw_writew writew
37280 +#define __raw_writel writel
37281 +
37282 +#define mmiowb()
37283 +
37284 +static inline void memset_io(volatile void __iomem *addr, unsigned char val, int count)
37285 +{
37286 + memset((void __force *) addr, val, count);
37287 +}
37288 +static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
37289 +{
37290 + __memcpy(dst, (void __force *) src, count);
37291 +}
37292 +static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int count)
37293 +{
37294 + __memcpy((void __force *) dst, src, count);
37295 +}
37296 +
37297 +/*
37298 + * ISA space is 'always mapped' on a typical x86 system, no need to
37299 + * explicitly ioremap() it. The fact that the ISA IO space is mapped
37300 + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
37301 + * are physical addresses. The following constant pointer can be
37302 + * used as the IO-area pointer (it can be iounmapped as well, so the
37303 + * analogy with PCI is quite large):
37304 + */
37305 +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
37306 +
37307 +/*
37308 + * Again, i386 does not require mem IO specific function.
37309 + */
37310 +
37311 +#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(b),(c),(d))
37312 +
37313 +/**
37314 + * check_signature - find BIOS signatures
37315 + * @io_addr: mmio address to check
37316 + * @signature: signature block
37317 + * @length: length of signature
37318 + *
37319 + * Perform a signature comparison with the mmio address io_addr. This
37320 + * address should have been obtained by ioremap.
37321 + * Returns 1 on a match.
37322 + */
37323 +
37324 +static inline int check_signature(volatile void __iomem * io_addr,
37325 + const unsigned char *signature, int length)
37326 +{
37327 + int retval = 0;
37328 + do {
37329 + if (readb(io_addr) != *signature)
37330 + goto out;
37331 + io_addr++;
37332 + signature++;
37333 + length--;
37334 + } while (length);
37335 + retval = 1;
37336 +out:
37337 + return retval;
37338 +}
37339 +
37340 +/*
37341 + * Cache management
37342 + *
37343 + * This is needed for two cases:
37344 + * 1. Out of order aware processors
37345 + * 2. Accidentally out of order processors (PPro errata #51)
37346 + */
37347 +
37348 +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
37349 +
37350 +static inline void flush_write_buffers(void)
37351 +{
37352 + __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
37353 +}
37354 +
37355 +#define dma_cache_inv(_start,_size) flush_write_buffers()
37356 +#define dma_cache_wback(_start,_size) flush_write_buffers()
37357 +#define dma_cache_wback_inv(_start,_size) flush_write_buffers()
37358 +
37359 +#else
37360 +
37361 +/* Nothing to do */
37362 +
37363 +#define dma_cache_inv(_start,_size) do { } while (0)
37364 +#define dma_cache_wback(_start,_size) do { } while (0)
37365 +#define dma_cache_wback_inv(_start,_size) do { } while (0)
37366 +#define flush_write_buffers()
37367 +
37368 +#endif
37369 +
37370 +#endif /* __KERNEL__ */
37371 +
37372 +#ifdef SLOW_IO_BY_JUMPING
37373 +#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:"
37374 +#else
37375 +#define __SLOW_DOWN_IO "outb %%al,$0x80;"
37376 +#endif
37377 +
37378 +static inline void slow_down_io(void) {
37379 + __asm__ __volatile__(
37380 + __SLOW_DOWN_IO
37381 +#ifdef REALLY_SLOW_IO
37382 + __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
37383 +#endif
37384 + : : );
37385 +}
37386 +
37387 +#ifdef CONFIG_X86_NUMAQ
37388 +extern void *xquad_portio; /* Where the IO area was mapped */
37389 +#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
37390 +#define __BUILDIO(bwl,bw,type) \
37391 +static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \
37392 + if (xquad_portio) \
37393 + write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \
37394 + else \
37395 + out##bwl##_local(value, port); \
37396 +} \
37397 +static inline void out##bwl(unsigned type value, int port) { \
37398 + out##bwl##_quad(value, port, 0); \
37399 +} \
37400 +static inline unsigned type in##bwl##_quad(int port, int quad) { \
37401 + if (xquad_portio) \
37402 + return read##bwl(XQUAD_PORT_ADDR(port, quad)); \
37403 + else \
37404 + return in##bwl##_local(port); \
37405 +} \
37406 +static inline unsigned type in##bwl(int port) { \
37407 + return in##bwl##_quad(port, 0); \
37408 +}
37409 +#else
37410 +#define __BUILDIO(bwl,bw,type) \
37411 +static inline void out##bwl(unsigned type value, int port) { \
37412 + out##bwl##_local(value, port); \
37413 +} \
37414 +static inline unsigned type in##bwl(int port) { \
37415 + return in##bwl##_local(port); \
37416 +}
37417 +#endif
37418 +
37419 +
37420 +#define BUILDIO(bwl,bw,type) \
37421 +static inline void out##bwl##_local(unsigned type value, int port) { \
37422 + __asm__ __volatile__("out" #bwl " %" #bw "0, %w1" : : "a"(value), "Nd"(port)); \
37423 +} \
37424 +static inline unsigned type in##bwl##_local(int port) { \
37425 + unsigned type value; \
37426 + __asm__ __volatile__("in" #bwl " %w1, %" #bw "0" : "=a"(value) : "Nd"(port)); \
37427 + return value; \
37428 +} \
37429 +static inline void out##bwl##_local_p(unsigned type value, int port) { \
37430 + out##bwl##_local(value, port); \
37431 + slow_down_io(); \
37432 +} \
37433 +static inline unsigned type in##bwl##_local_p(int port) { \
37434 + unsigned type value = in##bwl##_local(port); \
37435 + slow_down_io(); \
37436 + return value; \
37437 +} \
37438 +__BUILDIO(bwl,bw,type) \
37439 +static inline void out##bwl##_p(unsigned type value, int port) { \
37440 + out##bwl(value, port); \
37441 + slow_down_io(); \
37442 +} \
37443 +static inline unsigned type in##bwl##_p(int port) { \
37444 + unsigned type value = in##bwl(port); \
37445 + slow_down_io(); \
37446 + return value; \
37447 +} \
37448 +static inline void outs##bwl(int port, const void *addr, unsigned long count) { \
37449 + __asm__ __volatile__("rep; outs" #bwl : "+S"(addr), "+c"(count) : "d"(port)); \
37450 +} \
37451 +static inline void ins##bwl(int port, void *addr, unsigned long count) { \
37452 + __asm__ __volatile__("rep; ins" #bwl : "+D"(addr), "+c"(count) : "d"(port)); \
37453 +}
37454 +
37455 +BUILDIO(b,b,char)
37456 +BUILDIO(w,w,short)
37457 +BUILDIO(l,,int)
37458 +
37459 +/* We will be supplying our own /dev/mem implementation */
37460 +#define ARCH_HAS_DEV_MEM
37461 +
37462 +#endif
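
A minimal user-space sketch of the token-pasting pattern used by BUILDIO/__BUILDIO above, which generates the whole in{b,w,l}/out{b,w,l} accessor family from one macro. A flat array stands in for the real x86 port space and for the "in"/"out" instructions; the sim_ports name and the demo main() are illustrative assumptions, not kernel code.

#include <stdio.h>

static unsigned int sim_ports[65536];          /* fake port space (assumption) */

#define BUILDIO(bwl, type)                                              \
static inline void out##bwl(unsigned type value, int port) {            \
        sim_ports[port] = value;               /* stands in for "out" */\
}                                                                       \
static inline unsigned type in##bwl(int port) {                         \
        return (unsigned type)sim_ports[port]; /* stands in for "in"  */\
}

BUILDIO(b, char)    /* generates outb()/inb() */
BUILDIO(w, short)   /* generates outw()/inw() */
BUILDIO(l, int)     /* generates outl()/inl() */

int main(void)
{
        outb(0xAB, 0x80);                      /* write a byte to "port" 0x80 */
        printf("inb(0x80) = 0x%x\n", inb(0x80));
        return 0;
}
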
37463 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_32.h
37464 ===================================================================
37465 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
37466 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_32.h 2007-06-12 13:14:02.000000000 +0200
37467 @@ -0,0 +1,127 @@
37468 +/*
37469 + * include/asm-i386/irqflags.h
37470 + *
37471 + * IRQ flags handling
37472 + *
37473 + * This file gets included from lowlevel asm headers too, to provide
37474 + * wrapped versions of the local_irq_*() APIs, based on the
37475 + * raw_local_irq_*() functions from the lowlevel headers.
37476 + */
37477 +#ifndef _ASM_IRQFLAGS_H
37478 +#define _ASM_IRQFLAGS_H
37479 +
37480 +#ifndef __ASSEMBLY__
37481 +
37482 +/*
37483 + * The use of 'barrier' in the following reflects their use as local-lock
37484 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ the following
37485 + * critical operations are executed. All critical operations must complete
37486 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
37487 + * includes these barriers, for example.
37488 + */
37489 +
37490 +#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
37491 +
37492 +#define raw_local_save_flags(flags) \
37493 + do { (flags) = __raw_local_save_flags(); } while (0)
37494 +
37495 +#define raw_local_irq_restore(x) \
37496 +do { \
37497 + vcpu_info_t *_vcpu; \
37498 + barrier(); \
37499 + _vcpu = current_vcpu_info(); \
37500 + if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
37501 + barrier(); /* unmask then check (avoid races) */ \
37502 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
37503 + force_evtchn_callback(); \
37504 + } \
37505 +} while (0)
37506 +
37507 +#define raw_local_irq_disable() \
37508 +do { \
37509 + current_vcpu_info()->evtchn_upcall_mask = 1; \
37510 + barrier(); \
37511 +} while (0)
37512 +
37513 +#define raw_local_irq_enable() \
37514 +do { \
37515 + vcpu_info_t *_vcpu; \
37516 + barrier(); \
37517 + _vcpu = current_vcpu_info(); \
37518 + _vcpu->evtchn_upcall_mask = 0; \
37519 + barrier(); /* unmask then check (avoid races) */ \
37520 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
37521 + force_evtchn_callback(); \
37522 +} while (0)
37523 +
37524 +/*
37525 + * Used in the idle loop; sti takes one instruction cycle
37526 + * to complete:
37527 + */
37528 +void raw_safe_halt(void);
37529 +
37530 +/*
37531 + * Used when interrupts are already enabled or to
37532 + * shutdown the processor:
37533 + */
37534 +void halt(void);
37535 +
37536 +static inline int raw_irqs_disabled_flags(unsigned long flags)
37537 +{
37538 + return (flags != 0);
37539 +}
37540 +
37541 +#define raw_irqs_disabled() \
37542 +({ \
37543 + unsigned long flags = __raw_local_save_flags(); \
37544 + \
37545 + raw_irqs_disabled_flags(flags); \
37546 +})
37547 +
37548 +/*
37549 + * For spinlocks, etc:
37550 + */
37551 +#define __raw_local_irq_save() \
37552 +({ \
37553 + unsigned long flags = __raw_local_save_flags(); \
37554 + \
37555 + raw_local_irq_disable(); \
37556 + \
37557 + flags; \
37558 +})
37559 +
37560 +#define raw_local_irq_save(flags) \
37561 + do { (flags) = __raw_local_irq_save(); } while (0)
37562 +
37563 +#endif /* __ASSEMBLY__ */
37564 +
37565 +/*
37566 + * Do the CPU's IRQ-state tracing from assembly code. We call a
37567 + * C function, so save all the C-clobbered registers:
37568 + */
37569 +#ifdef CONFIG_TRACE_IRQFLAGS
37570 +
37571 +# define TRACE_IRQS_ON \
37572 + pushl %eax; \
37573 + pushl %ecx; \
37574 + pushl %edx; \
37575 + call trace_hardirqs_on; \
37576 + popl %edx; \
37577 + popl %ecx; \
37578 + popl %eax;
37579 +
37580 +# define TRACE_IRQS_OFF \
37581 + pushl %eax; \
37582 + pushl %ecx; \
37583 + pushl %edx; \
37584 + call trace_hardirqs_off; \
37585 + popl %edx; \
37586 + popl %ecx; \
37587 + popl %eax;
37588 +
37589 +#else
37590 +# define TRACE_IRQS_ON
37591 +# define TRACE_IRQS_OFF
37592 +#endif
37593 +
37594 +#endif
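
A minimal user-space sketch of the "virtual IRQ flags" scheme defined above: interrupts are masked by writing a byte in shared memory rather than by executing cli/sti, and re-enabling must re-check for events that arrived while masked. The struct and the pending-callback stub are illustrative stand-ins for vcpu_info_t and force_evtchn_callback(); they are assumptions, not kernel code.

#include <stdio.h>

struct fake_vcpu_info {
        unsigned char evtchn_upcall_pending;   /* event waiting to be delivered */
        unsigned char evtchn_upcall_mask;      /* 1 = "interrupts disabled"     */
} vcpu;

static void deliver_pending_events(void)       /* models force_evtchn_callback() */
{
        printf("delivering pending events\n");
        vcpu.evtchn_upcall_pending = 0;
}

static unsigned long irq_save(void)            /* models __raw_local_irq_save() */
{
        unsigned long flags = vcpu.evtchn_upcall_mask;
        vcpu.evtchn_upcall_mask = 1;           /* disable */
        return flags;
}

static void irq_restore(unsigned long flags)   /* models raw_local_irq_restore() */
{
        vcpu.evtchn_upcall_mask = (unsigned char)flags;
        if (flags == 0 && vcpu.evtchn_upcall_pending)
                deliver_pending_events();      /* unmask, then check (avoid races) */
}

int main(void)
{
        unsigned long flags = irq_save();      /* critical section begins          */
        vcpu.evtchn_upcall_pending = 1;        /* an "event" arrives while masked  */
        irq_restore(flags);                    /* re-enable: pending event is run  */
        return 0;
}
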
37595 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_32.h
37596 ===================================================================
37597 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
37598 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_32.h 2008-04-02 12:34:02.000000000 +0200
37599 @@ -0,0 +1,193 @@
37600 +#ifndef _I386_MADDR_H
37601 +#define _I386_MADDR_H
37602 +
37603 +#include <xen/features.h>
37604 +#include <xen/interface/xen.h>
37605 +
37606 +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
37607 +#define INVALID_P2M_ENTRY (~0UL)
37608 +#define FOREIGN_FRAME_BIT (1UL<<31)
37609 +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
37610 +
37611 +/* Definitions for machine and pseudophysical addresses. */
37612 +#ifdef CONFIG_X86_PAE
37613 +typedef unsigned long long paddr_t;
37614 +typedef unsigned long long maddr_t;
37615 +#else
37616 +typedef unsigned long paddr_t;
37617 +typedef unsigned long maddr_t;
37618 +#endif
37619 +
37620 +#ifdef CONFIG_XEN
37621 +
37622 +extern unsigned long *phys_to_machine_mapping;
37623 +extern unsigned long max_mapnr;
37624 +
37625 +#undef machine_to_phys_mapping
37626 +extern unsigned long *machine_to_phys_mapping;
37627 +extern unsigned int machine_to_phys_order;
37628 +
37629 +static inline unsigned long pfn_to_mfn(unsigned long pfn)
37630 +{
37631 + if (xen_feature(XENFEAT_auto_translated_physmap))
37632 + return pfn;
37633 + BUG_ON(max_mapnr && pfn >= max_mapnr);
37634 + return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
37635 +}
37636 +
37637 +static inline int phys_to_machine_mapping_valid(unsigned long pfn)
37638 +{
37639 + if (xen_feature(XENFEAT_auto_translated_physmap))
37640 + return 1;
37641 + BUG_ON(max_mapnr && pfn >= max_mapnr);
37642 + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
37643 +}
37644 +
37645 +static inline unsigned long mfn_to_pfn(unsigned long mfn)
37646 +{
37647 + unsigned long pfn;
37648 +
37649 + if (xen_feature(XENFEAT_auto_translated_physmap))
37650 + return mfn;
37651 +
37652 + if (unlikely((mfn >> machine_to_phys_order) != 0))
37653 + return max_mapnr;
37654 +
37655 + /* The array access can fail (e.g., device space beyond end of RAM). */
37656 + asm (
37657 + "1: movl %1,%0\n"
37658 + "2:\n"
37659 + ".section .fixup,\"ax\"\n"
37660 + "3: movl %2,%0\n"
37661 + " jmp 2b\n"
37662 + ".previous\n"
37663 + ".section __ex_table,\"a\"\n"
37664 + " .align 4\n"
37665 + " .long 1b,3b\n"
37666 + ".previous"
37667 + : "=r" (pfn)
37668 + : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
37669 +
37670 + return pfn;
37671 +}
37672 +
37673 +/*
37674 + * We detect special mappings in one of two ways:
37675 + * 1. If the MFN is an I/O page then Xen will set the m2p entry
37676 + * to be outside our maximum possible pseudophys range.
37677 + * 2. If the MFN belongs to a different domain then we will certainly
37678 + * not have MFN in our p2m table. Conversely, if the page is ours,
37679 + * then we'll have p2m(m2p(MFN))==MFN.
37680 + * If we detect a special mapping then it doesn't have a 'struct page'.
37681 + * We force !pfn_valid() by returning an out-of-range pointer.
37682 + *
37683 + * NB. These checks require that, for any MFN that is not in our reservation,
37684 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
37685 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
37686 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
37687 + *
37688 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
37689 + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
37690 + * require. In all the cases we care about, the FOREIGN_FRAME bit is
37691 + * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
37692 + */
37693 +static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
37694 +{
37695 + unsigned long pfn = mfn_to_pfn(mfn);
37696 + if ((pfn < max_mapnr)
37697 + && !xen_feature(XENFEAT_auto_translated_physmap)
37698 + && (phys_to_machine_mapping[pfn] != mfn))
37699 + return max_mapnr; /* force !pfn_valid() */
37700 + return pfn;
37701 +}
37702 +
37703 +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
37704 +{
37705 + BUG_ON(max_mapnr && pfn >= max_mapnr);
37706 + if (xen_feature(XENFEAT_auto_translated_physmap)) {
37707 + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
37708 + return;
37709 + }
37710 + phys_to_machine_mapping[pfn] = mfn;
37711 +}
37712 +
37713 +static inline maddr_t phys_to_machine(paddr_t phys)
37714 +{
37715 + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
37716 + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
37717 + return machine;
37718 +}
37719 +
37720 +static inline paddr_t machine_to_phys(maddr_t machine)
37721 +{
37722 + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
37723 + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
37724 + return phys;
37725 +}
37726 +
37727 +#ifdef CONFIG_X86_PAE
37728 +static inline paddr_t pte_phys_to_machine(paddr_t phys)
37729 +{
37730 + /*
37731 + * In PAE mode, the NX bit needs to be dealt with in the value
37732 + * passed to pfn_to_mfn(). On x86_64, we need to mask it off,
37733 + * but for i386 the conversion to ulong for the argument will
37734 + * clip it off.
37735 + */
37736 + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
37737 + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK);
37738 + return machine;
37739 +}
37740 +
37741 +static inline paddr_t pte_machine_to_phys(maddr_t machine)
37742 +{
37743 + /*
37744 + * In PAE mode, the NX bit needs to be dealt with in the value
37745 + * passed to mfn_to_pfn(). On x86_64, we need to mask it off,
37746 + * but for i386 the conversion to ulong for the argument will
37747 + * clip it off.
37748 + */
37749 + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
37750 + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
37751 + return phys;
37752 +}
37753 +#endif
37754 +
37755 +#ifdef CONFIG_X86_PAE
37756 +#define __pte_ma(x) ((pte_t) { (x), (maddr_t)(x) >> 32 } )
37757 +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
37758 +{
37759 + pte_t pte;
37760 +
37761 + pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
37762 + (pgprot_val(pgprot) >> 32);
37763 + pte.pte_high &= (__supported_pte_mask >> 32);
37764 + pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
37765 + __supported_pte_mask;
37766 + return pte;
37767 +}
37768 +#else
37769 +#define __pte_ma(x) ((pte_t) { (x) } )
37770 +#define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
37771 +#endif
37772 +
37773 +#else /* !CONFIG_XEN */
37774 +
37775 +#define pfn_to_mfn(pfn) (pfn)
37776 +#define mfn_to_pfn(mfn) (mfn)
37777 +#define mfn_to_local_pfn(mfn) (mfn)
37778 +#define set_phys_to_machine(pfn, mfn) ((void)0)
37779 +#define phys_to_machine_mapping_valid(pfn) (1)
37780 +#define phys_to_machine(phys) ((maddr_t)(phys))
37781 +#define machine_to_phys(mach) ((paddr_t)(mach))
37782 +#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot)
37783 +#define __pte_ma(x) __pte(x)
37784 +
37785 +#endif /* !CONFIG_XEN */
37786 +
37787 +/* VIRT <-> MACHINE conversion */
37788 +#define virt_to_machine(v) (phys_to_machine(__pa(v)))
37789 +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
37790 +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
37791 +
37792 +#endif /* _I386_MADDR_H */
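
A minimal user-space sketch of the pseudo-physical to machine translation declared above: phys_to_machine() translates the page frame through the p2m table and keeps the in-page offset. The four-entry table and the constants are illustrative assumptions standing in for phys_to_machine_mapping.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

static unsigned long p2m[] = { 0x100, 0x2c7, 0x009, 0x513 };   /* pfn -> mfn */

static unsigned long pfn_to_mfn(unsigned long pfn)
{
        return p2m[pfn];               /* the real code also masks FOREIGN_FRAME_BIT */
}

static unsigned long phys_to_machine(unsigned long phys)
{
        unsigned long machine = pfn_to_mfn(phys >> PAGE_SHIFT);
        return (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);  /* keep in-page offset */
}

int main(void)
{
        unsigned long phys = (2UL << PAGE_SHIFT) | 0x123;      /* pfn 2, offset 0x123 */
        printf("phys 0x%lx -> machine 0x%lx\n", phys, phys_to_machine(phys));
        /* expected: machine frame 0x009 -> 0x9123 */
        return 0;
}
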
37793 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_32.h
37794 ===================================================================
37795 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
37796 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_32.h 2007-06-12 13:14:02.000000000 +0200
37797 @@ -0,0 +1,108 @@
37798 +#ifndef __I386_SCHED_H
37799 +#define __I386_SCHED_H
37800 +
37801 +#include <asm/desc.h>
37802 +#include <asm/atomic.h>
37803 +#include <asm/pgalloc.h>
37804 +#include <asm/tlbflush.h>
37805 +
37806 +/*
37807 + * Used for LDT copy/destruction.
37808 + */
37809 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
37810 +void destroy_context(struct mm_struct *mm);
37811 +
37812 +
37813 +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
37814 +{
37815 +#if 0 /* XEN: no lazy tlb */
37816 + unsigned cpu = smp_processor_id();
37817 + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
37818 + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
37819 +#endif
37820 +}
37821 +
37822 +#define prepare_arch_switch(next) __prepare_arch_switch()
37823 +
37824 +static inline void __prepare_arch_switch(void)
37825 +{
37826 + /*
37827 + * Save away %fs and %gs. No need to save %es and %ds, as those
37828 + * are always kernel segments while inside the kernel. Must
37829 + * happen before reload of cr3/ldt (i.e., not in __switch_to).
37830 + */
37831 + asm volatile ( "mov %%fs,%0 ; mov %%gs,%1"
37832 + : "=m" (current->thread.fs),
37833 + "=m" (current->thread.gs));
37834 + asm volatile ( "movl %0,%%fs ; movl %0,%%gs"
37835 + : : "r" (0) );
37836 +}
37837 +
37838 +extern void mm_pin(struct mm_struct *mm);
37839 +extern void mm_unpin(struct mm_struct *mm);
37840 +void mm_pin_all(void);
37841 +
37842 +static inline void switch_mm(struct mm_struct *prev,
37843 + struct mm_struct *next,
37844 + struct task_struct *tsk)
37845 +{
37846 + int cpu = smp_processor_id();
37847 + struct mmuext_op _op[2], *op = _op;
37848 +
37849 + if (likely(prev != next)) {
37850 + BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
37851 + !test_bit(PG_pinned, &virt_to_page(next->pgd)->flags));
37852 +
37853 + /* stop flush ipis for the previous mm */
37854 + cpu_clear(cpu, prev->cpu_vm_mask);
37855 +#if 0 /* XEN: no lazy tlb */
37856 + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
37857 + per_cpu(cpu_tlbstate, cpu).active_mm = next;
37858 +#endif
37859 + cpu_set(cpu, next->cpu_vm_mask);
37860 +
37861 + /* Re-load page tables: load_cr3(next->pgd) */
37862 + op->cmd = MMUEXT_NEW_BASEPTR;
37863 + op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
37864 + op++;
37865 +
37866 + /*
37867 + * load the LDT, if the LDT is different:
37868 + */
37869 + if (unlikely(prev->context.ldt != next->context.ldt)) {
37870 + /* load_LDT_nolock(&next->context, cpu) */
37871 + op->cmd = MMUEXT_SET_LDT;
37872 + op->arg1.linear_addr = (unsigned long)next->context.ldt;
37873 + op->arg2.nr_ents = next->context.size;
37874 + op++;
37875 + }
37876 +
37877 + BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
37878 + }
37879 +#if 0 /* XEN: no lazy tlb */
37880 + else {
37881 + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
37882 + BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
37883 +
37884 + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
37885 + /* We were in lazy tlb mode and leave_mm disabled
37886 + * tlb flush IPI delivery. We must reload %cr3.
37887 + */
37888 + load_cr3(next->pgd);
37889 + load_LDT_nolock(&next->context, cpu);
37890 + }
37891 + }
37892 +#endif
37893 +}
37894 +
37895 +#define deactivate_mm(tsk, mm) \
37896 + asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
37897 +
37898 +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
37899 +{
37900 + if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
37901 + mm_pin(next);
37902 + switch_mm(prev, next, NULL);
37903 +}
37904 +
37905 +#endif
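
A minimal user-space sketch of the op batching done in switch_mm() above: rather than issuing one hypercall per MMU change, the code fills a small array of operations (base pointer switch, optional LDT load) and submits them in a single call. The command numbers, struct layout and "hypervisor" stub are illustrative stand-ins for mmuext_op and HYPERVISOR_mmuext_op().

#include <stdio.h>

enum { NEW_BASEPTR = 1, SET_LDT = 2 };          /* models MMUEXT_* commands */

struct mmu_op {
        int cmd;
        unsigned long arg;
};

static int submit_mmu_ops(struct mmu_op *ops, int count)   /* models HYPERVISOR_mmuext_op() */
{
        for (int i = 0; i < count; i++)
                printf("op %d: cmd=%d arg=0x%lx\n", i, ops[i].cmd, ops[i].arg);
        return 0;
}

int main(void)
{
        struct mmu_op batch[2], *op = batch;
        int ldt_changed = 1;                    /* pretend the next mm has a new LDT */

        op->cmd = NEW_BASEPTR;                  /* re-load page tables (cr3 switch)  */
        op->arg = 0x9000;                       /* would be pfn_to_mfn(__pa(next->pgd)) */
        op++;

        if (ldt_changed) {
                op->cmd = SET_LDT;              /* load the new LDT in the same batch */
                op->arg = 0xb000;
                op++;
        }

        return submit_mmu_ops(batch, (int)(op - batch));    /* one submission for both */
}
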
37906 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pci_32.h
37907 ===================================================================
37908 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
37909 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pci_32.h 2007-09-14 11:14:51.000000000 +0200
37910 @@ -0,0 +1,148 @@
37911 +#ifndef __i386_PCI_H
37912 +#define __i386_PCI_H
37913 +
37914 +
37915 +#ifdef __KERNEL__
37916 +#include <linux/mm.h> /* for struct page */
37917 +
37918 +/* Can be used to override the logic in pci_scan_bus for skipping
37919 + already-configured bus numbers - to be used for buggy BIOSes
37920 + or architectures with incomplete PCI setup by the loader */
37921 +
37922 +#ifdef CONFIG_PCI
37923 +extern unsigned int pcibios_assign_all_busses(void);
37924 +#else
37925 +#define pcibios_assign_all_busses() 0
37926 +#endif
37927 +
37928 +#include <asm/hypervisor.h>
37929 +#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain())
37930 +
37931 +extern unsigned long pci_mem_start;
37932 +#define PCIBIOS_MIN_IO 0x1000
37933 +#define PCIBIOS_MIN_MEM (pci_mem_start)
37934 +
37935 +#define PCIBIOS_MIN_CARDBUS_IO 0x4000
37936 +
37937 +void pcibios_config_init(void);
37938 +struct pci_bus * pcibios_scan_root(int bus);
37939 +
37940 +void pcibios_set_master(struct pci_dev *dev);
37941 +void pcibios_penalize_isa_irq(int irq, int active);
37942 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
37943 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
37944 +
37945 +/* Dynamic DMA mapping stuff.
37946 + * i386 has everything mapped statically.
37947 + */
37948 +
37949 +#include <linux/types.h>
37950 +#include <linux/slab.h>
37951 +#include <asm/scatterlist.h>
37952 +#include <linux/string.h>
37953 +#include <asm/io.h>
37954 +
37955 +struct pci_dev;
37956 +
37957 +#ifdef CONFIG_SWIOTLB
37958 +
37959 +
37960 +/* On Xen we use SWIOTLB instead of blk-specific bounce buffers. */
37961 +#define PCI_DMA_BUS_IS_PHYS (0)
37962 +
37963 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
37964 + dma_addr_t ADDR_NAME;
37965 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
37966 + __u32 LEN_NAME;
37967 +#define pci_unmap_addr(PTR, ADDR_NAME) \
37968 + ((PTR)->ADDR_NAME)
37969 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
37970 + (((PTR)->ADDR_NAME) = (VAL))
37971 +#define pci_unmap_len(PTR, LEN_NAME) \
37972 + ((PTR)->LEN_NAME)
37973 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
37974 + (((PTR)->LEN_NAME) = (VAL))
37975 +
37976 +#else
37977 +
37978 +/* The PCI address space does equal the physical memory
37979 + * address space. The networking and block device layers use
37980 + * this boolean for bounce buffer decisions.
37981 + */
37982 +#define PCI_DMA_BUS_IS_PHYS (1)
37983 +
37984 +/* pci_unmap_{page,single} is a nop so... */
37985 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
37986 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
37987 +#define pci_unmap_addr(PTR, ADDR_NAME) (0)
37988 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
37989 +#define pci_unmap_len(PTR, LEN_NAME) (0)
37990 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
37991 +
37992 +#endif
37993 +
37994 +/* This is always fine. */
37995 +#define pci_dac_dma_supported(pci_dev, mask) (1)
37996 +
37997 +static inline dma64_addr_t
37998 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
37999 +{
38000 + return ((dma64_addr_t) page_to_phys(page) +
38001 + (dma64_addr_t) offset);
38002 +}
38003 +
38004 +static inline struct page *
38005 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
38006 +{
38007 + return pfn_to_page(dma_addr >> PAGE_SHIFT);
38008 +}
38009 +
38010 +static inline unsigned long
38011 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
38012 +{
38013 + return (dma_addr & ~PAGE_MASK);
38014 +}
38015 +
38016 +static inline void
38017 +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
38018 +{
38019 +}
38020 +
38021 +static inline void
38022 +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
38023 +{
38024 + flush_write_buffers();
38025 +}
38026 +
38027 +#define HAVE_PCI_MMAP
38028 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
38029 + enum pci_mmap_state mmap_state, int write_combine);
38030 +
38031 +
38032 +static inline void pcibios_add_platform_entries(struct pci_dev *dev)
38033 +{
38034 +}
38035 +
38036 +#ifdef CONFIG_PCI
38037 +static inline void pci_dma_burst_advice(struct pci_dev *pdev,
38038 + enum pci_dma_burst_strategy *strat,
38039 + unsigned long *strategy_parameter)
38040 +{
38041 + *strat = PCI_DMA_BURST_INFINITY;
38042 + *strategy_parameter = ~0UL;
38043 +}
38044 +#endif
38045 +
38046 +#endif /* __KERNEL__ */
38047 +
38048 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
38049 +#include <xen/pcifront.h>
38050 +#endif /* CONFIG_XEN_PCIDEV_FRONTEND */
38051 +
38052 +/* implement the pci_ DMA API in terms of the generic device dma_ one */
38053 +#include <asm-generic/pci-dma-compat.h>
38054 +
38055 +/* generic pci stuff */
38056 +#include <asm-generic/pci.h>
38057 +
38058 +#endif /* __i386_PCI_H */
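
A minimal sketch of the DECLARE_PCI_UNMAP_* pattern used above: when unmapping is a real operation (the SWIOTLB case), the macros declare storage for the DMA handle; when unmapping is a no-op, they expand to nothing and the driver pays no space cost. The struct, field names and values below are illustrative assumptions.

#include <stdio.h>

#define NEED_UNMAP_STATE 1                      /* flip to 0 to model the no-op case */

#if NEED_UNMAP_STATE
#define DECLARE_UNMAP_ADDR(name)       unsigned long long name;
#define unmap_addr_set(ptr, name, val) ((ptr)->name = (val))
#define unmap_addr(ptr, name)          ((ptr)->name)
#else
#define DECLARE_UNMAP_ADDR(name)
#define unmap_addr_set(ptr, name, val) do { } while (0)
#define unmap_addr(ptr, name)          (0)
#endif

struct rx_buffer {
        void *data;
        DECLARE_UNMAP_ADDR(dma)                 /* present only when needed */
};

int main(void)
{
        struct rx_buffer buf = { 0 };
        unmap_addr_set(&buf, dma, 0x12345678ULL);
        printf("stored dma handle: 0x%llx\n", (unsigned long long)unmap_addr(&buf, dma));
        return 0;
}
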
38059 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_32.h
38060 ===================================================================
38061 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
38062 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_32.h 2008-07-21 11:00:33.000000000 +0200
38063 @@ -0,0 +1,59 @@
38064 +#ifndef _I386_PGALLOC_H
38065 +#define _I386_PGALLOC_H
38066 +
38067 +#include <asm/fixmap.h>
38068 +#include <linux/threads.h>
38069 +#include <linux/mm.h> /* for struct page */
38070 +#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
38071 +
38072 +#define pmd_populate_kernel(mm, pmd, pte) \
38073 + set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
38074 +
38075 +#define pmd_populate(mm, pmd, pte) \
38076 +do { \
38077 + unsigned long pfn = page_to_pfn(pte); \
38078 + if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) { \
38079 + if (!PageHighMem(pte)) \
38080 + BUG_ON(HYPERVISOR_update_va_mapping( \
38081 + (unsigned long)__va(pfn << PAGE_SHIFT), \
38082 + pfn_pte(pfn, PAGE_KERNEL_RO), 0)); \
38083 + else if (!test_and_set_bit(PG_pinned, &pte->flags)) \
38084 + kmap_flush_unused(); \
38085 + set_pmd(pmd, \
38086 + __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT))); \
38087 + } else \
38088 + *(pmd) = __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT)); \
38089 +} while (0)
38090 +
38091 +/*
38092 + * Allocate and free page tables.
38093 + */
38094 +extern pgd_t *pgd_alloc(struct mm_struct *);
38095 +extern void pgd_free(pgd_t *pgd);
38096 +
38097 +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
38098 +extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
38099 +
38100 +static inline void pte_free_kernel(pte_t *pte)
38101 +{
38102 + make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
38103 + free_page((unsigned long)pte);
38104 +}
38105 +
38106 +extern void pte_free(struct page *pte);
38107 +
38108 +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
38109 +
38110 +#ifdef CONFIG_X86_PAE
38111 +/*
38112 + * In the PAE case we free the pmds as part of the pgd.
38113 + */
38114 +#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); })
38115 +#define pmd_free(x) do { } while (0)
38116 +#define __pmd_free_tlb(tlb,x) do { } while (0)
38117 +#define pud_populate(mm, pmd, pte) BUG()
38118 +#endif
38119 +
38120 +#define check_pgt_cache() do { } while (0)
38121 +
38122 +#endif /* _I386_PGALLOC_H */
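
A minimal user-space sketch of the rule that pmd_populate() enforces above: once an address space is pinned, any page table page hooked into it must first be downgraded to read-only, and only then linked into the pmd. The flag values, arrays and helper names are illustrative stand-ins for the hypervisor mechanisms (HYPERVISOR_update_va_mapping and set_pmd).

#include <stdio.h>

#define NPAGES 8
enum { RW = 0, RO = 1 };

static int page_prot[NPAGES];                   /* models the kernel mapping of each page */
static int pmd[NPAGES];                         /* fake pmd: slot -> pte page number      */

static void pmd_populate(int pinned, int slot, int pte_page)
{
        if (pinned)
                page_prot[pte_page] = RO;       /* models the PAGE_KERNEL_RO remapping */
        pmd[slot] = pte_page;                   /* models set_pmd(pmd, __pmd(...))     */
}

int main(void)
{
        pmd_populate(0, 0, 3);                  /* unpinned mm: pte page 3 stays writable  */
        pmd_populate(1, 1, 5);                  /* pinned mm: pte page 5 becomes read-only */
        printf("page 3: %s, page 5: %s\n",
               page_prot[3] == RO ? "RO" : "RW",
               page_prot[5] == RO ? "RO" : "RW");
        return 0;
}
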
38123 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h
38124 ===================================================================
38125 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
38126 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h 2007-06-12 13:14:02.000000000 +0200
38127 @@ -0,0 +1,24 @@
38128 +#ifndef _I386_PGTABLE_3LEVEL_DEFS_H
38129 +#define _I386_PGTABLE_3LEVEL_DEFS_H
38130 +
38131 +#define HAVE_SHARED_KERNEL_PMD 0
38132 +
38133 +/*
38134 + * PGDIR_SHIFT determines what a top-level page table entry can map
38135 + */
38136 +#define PGDIR_SHIFT 30
38137 +#define PTRS_PER_PGD 4
38138 +
38139 +/*
38140 + * PMD_SHIFT determines the size of the area a middle-level
38141 + * page table can map
38142 + */
38143 +#define PMD_SHIFT 21
38144 +#define PTRS_PER_PMD 512
38145 +
38146 +/*
38147 + * entries per page directory level
38148 + */
38149 +#define PTRS_PER_PTE 512
38150 +
38151 +#endif /* _I386_PGTABLE_3LEVEL_DEFS_H */
38152 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level.h
38153 ===================================================================
38154 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
38155 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level.h 2008-04-02 12:34:02.000000000 +0200
38156 @@ -0,0 +1,211 @@
38157 +#ifndef _I386_PGTABLE_3LEVEL_H
38158 +#define _I386_PGTABLE_3LEVEL_H
38159 +
38160 +#include <asm-generic/pgtable-nopud.h>
38161 +
38162 +/*
38163 + * Intel Physical Address Extension (PAE) Mode - three-level page
38164 + * tables on PPro+ CPUs.
38165 + *
38166 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
38167 + */
38168 +
38169 +#define pte_ERROR(e) \
38170 + printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", __FILE__, __LINE__, \
38171 + &(e), __pte_val(e), pte_pfn(e))
38172 +#define pmd_ERROR(e) \
38173 + printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
38174 + &(e), __pmd_val(e), (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
38175 +#define pgd_ERROR(e) \
38176 + printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
38177 + &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
38178 +
38179 +#define pud_none(pud) 0
38180 +#define pud_bad(pud) 0
38181 +#define pud_present(pud) 1
38182 +
38183 +/*
38184 + * Is the pte executable?
38185 + */
38186 +static inline int pte_x(pte_t pte)
38187 +{
38188 + return !(__pte_val(pte) & _PAGE_NX);
38189 +}
38190 +
38191 +/*
38192 + * All present user-pages with !NX bit are user-executable:
38193 + */
38194 +static inline int pte_exec(pte_t pte)
38195 +{
38196 + return pte_user(pte) && pte_x(pte);
38197 +}
38198 +/*
38199 + * All present pages with !NX bit are kernel-executable:
38200 + */
38201 +static inline int pte_exec_kernel(pte_t pte)
38202 +{
38203 + return pte_x(pte);
38204 +}
38205 +
38206 +/* Rules for using set_pte: the pte being assigned *must* be
38207 + * either not present or in a state where the hardware will
38208 + * not attempt to update the pte. In places where this is
38209 + * not possible, use pte_get_and_clear to obtain the old pte
38210 + * value and then use set_pte to update it. -ben
38211 + */
38212 +#define __HAVE_ARCH_SET_PTE_ATOMIC
38213 +
38214 +static inline void set_pte(pte_t *ptep, pte_t pte)
38215 +{
38216 + ptep->pte_high = pte.pte_high;
38217 + smp_wmb();
38218 + ptep->pte_low = pte.pte_low;
38219 +}
38220 +#define set_pte_atomic(pteptr,pteval) \
38221 + set_64bit((unsigned long long *)(pteptr),__pte_val(pteval))
38222 +
38223 +#define set_pte_at(_mm,addr,ptep,pteval) do { \
38224 + if (((_mm) != current->mm && (_mm) != &init_mm) || \
38225 + HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
38226 + set_pte((ptep), (pteval)); \
38227 +} while (0)
38228 +
38229 +#define set_pte_at_sync(_mm,addr,ptep,pteval) do { \
38230 + if (((_mm) != current->mm && (_mm) != &init_mm) || \
38231 + HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
38232 + set_pte((ptep), (pteval)); \
38233 + xen_invlpg((addr)); \
38234 + } \
38235 +} while (0)
38236 +
38237 +#define set_pmd(pmdptr,pmdval) \
38238 + xen_l2_entry_update((pmdptr), (pmdval))
38239 +#define set_pud(pudptr,pudval) \
38240 + xen_l3_entry_update((pudptr), (pudval))
38241 +
38242 +/*
38243 + * Pentium-II erratum A13: in PAE mode we explicitly have to flush
38244 + * the TLB via cr3 if the top-level pgd is changed...
38245 + * We do not let the generic code free and clear pgd entries due to
38246 + * this erratum.
38247 + */
38248 +static inline void pud_clear (pud_t * pud) { }
38249 +
38250 +#define pud_page(pud) \
38251 +((struct page *) __va(pud_val(pud) & PAGE_MASK))
38252 +
38253 +#define pud_page_kernel(pud) \
38254 +((unsigned long) __va(pud_val(pud) & PAGE_MASK))
38255 +
38256 +
38257 +/* Find an entry in the second-level page table.. */
38258 +#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
38259 + pmd_index(address))
38260 +
38261 +static inline int pte_none(pte_t pte)
38262 +{
38263 + return !(pte.pte_low | pte.pte_high);
38264 +}
38265 +
38266 +/*
38267 + * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
38268 + * entry, so clear the bottom half first and enforce ordering with a compiler
38269 + * barrier.
38270 + */
38271 +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
38272 +{
38273 + if ((mm != current->mm && mm != &init_mm)
38274 + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
38275 + ptep->pte_low = 0;
38276 + smp_wmb();
38277 + ptep->pte_high = 0;
38278 + }
38279 +}
38280 +
38281 +#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
38282 +
38283 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
38284 +{
38285 + pte_t pte = *ptep;
38286 + if (!pte_none(pte)) {
38287 + if ((mm != &init_mm) ||
38288 + HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
38289 + uint64_t val = __pte_val(pte);
38290 + if (__cmpxchg64(ptep, val, 0) != val) {
38291 + /* xchg acts as a barrier before the setting of the high bits */
38292 + pte.pte_low = xchg(&ptep->pte_low, 0);
38293 + pte.pte_high = ptep->pte_high;
38294 + ptep->pte_high = 0;
38295 + }
38296 + }
38297 + }
38298 + return pte;
38299 +}
38300 +
38301 +#define ptep_clear_flush(vma, addr, ptep) \
38302 +({ \
38303 + pte_t *__ptep = (ptep); \
38304 + pte_t __res = *__ptep; \
38305 + if (!pte_none(__res) && \
38306 + ((vma)->vm_mm != current->mm || \
38307 + HYPERVISOR_update_va_mapping(addr, __pte(0), \
38308 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
38309 + UVMF_INVLPG|UVMF_MULTI))) { \
38310 + __ptep->pte_low = 0; \
38311 + smp_wmb(); \
38312 + __ptep->pte_high = 0; \
38313 + flush_tlb_page(vma, addr); \
38314 + } \
38315 + __res; \
38316 +})
38317 +
38318 +static inline int pte_same(pte_t a, pte_t b)
38319 +{
38320 + return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
38321 +}
38322 +
38323 +#define pte_page(x) pfn_to_page(pte_pfn(x))
38324 +
38325 +#define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \
38326 + ((_pte).pte_high << (32-PAGE_SHIFT)))
38327 +#define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \
38328 + __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
38329 +#define pte_pfn(_pte) ((_pte).pte_low & _PAGE_IO ? max_mapnr : \
38330 + (_pte).pte_low & _PAGE_PRESENT ? \
38331 + mfn_to_local_pfn(__pte_mfn(_pte)) : \
38332 + __pte_mfn(_pte))
38333 +
38334 +extern unsigned long long __supported_pte_mask;
38335 +
38336 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
38337 +{
38338 + return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
38339 + pgprot_val(pgprot)) & __supported_pte_mask);
38340 +}
38341 +
38342 +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
38343 +{
38344 + return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
38345 + pgprot_val(pgprot)) & __supported_pte_mask);
38346 +}
38347 +
38348 +/*
38349 + * Bits 0, 6 and 7 are taken in the low part of the pte,
38350 + * put the 32 bits of offset into the high part.
38351 + */
38352 +#define pte_to_pgoff(pte) ((pte).pte_high)
38353 +#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
38354 +#define PTE_FILE_MAX_BITS 32
38355 +
38356 +/* Encode and de-code a swap entry */
38357 +#define __swp_type(x) (((x).val) & 0x1f)
38358 +#define __swp_offset(x) ((x).val >> 5)
38359 +#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
38360 +#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
38361 +#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val })
38362 +
38363 +#define __pmd_free_tlb(tlb, x) do { } while (0)
38364 +
38365 +void vmalloc_sync_all(void);
38366 +
38367 +#endif /* _I386_PGTABLE_3LEVEL_H */
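
A minimal user-space sketch of the two-word PAE pte update ordering used by set_pte()/pte_clear() above: the low word carries the present bit, so a set writes high-then-low and a clear writes low-then-high, with a barrier in between. The struct and the GCC-style barrier() macro are illustrative stand-ins (on UP x86 the write barrier is only a compiler barrier).

#include <stdio.h>

#define barrier() __asm__ __volatile__("" ::: "memory")     /* compiler barrier */

typedef struct { unsigned int pte_low, pte_high; } pte64_t;

static void set_pte(pte64_t *ptep, pte64_t pte)
{
        ptep->pte_high = pte.pte_high;          /* upper half first: not yet present */
        barrier();
        ptep->pte_low = pte.pte_low;            /* present bit becomes visible last  */
}

static void clear_pte(pte64_t *ptep)
{
        ptep->pte_low = 0;                      /* drop the present bit first */
        barrier();
        ptep->pte_high = 0;
}

int main(void)
{
        pte64_t slot = { 0, 0 };
        pte64_t val = { 0x1067 /* low: flags incl. present */, 0x1 /* high: pfn bits */ };

        set_pte(&slot, val);
        printf("pte = %08x:%08x\n", slot.pte_high, slot.pte_low);
        clear_pte(&slot);
        printf("pte = %08x:%08x\n", slot.pte_high, slot.pte_low);
        return 0;
}
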
38368 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_32.h
38369 ===================================================================
38370 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
38371 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_32.h 2008-07-21 11:00:33.000000000 +0200
38372 @@ -0,0 +1,537 @@
38373 +#ifndef _I386_PGTABLE_H
38374 +#define _I386_PGTABLE_H
38375 +
38376 +#include <asm/hypervisor.h>
38377 +
38378 +/*
38379 + * The Linux memory management assumes a three-level page table setup. On
38380 + * the i386, we use that, but "fold" the mid level into the top-level page
38381 + * table, so that we physically have the same two-level page table as the
38382 + * i386 mmu expects.
38383 + *
38384 + * This file contains the functions and defines necessary to modify and use
38385 + * the i386 page table tree.
38386 + */
38387 +#ifndef __ASSEMBLY__
38388 +#include <asm/processor.h>
38389 +#include <asm/fixmap.h>
38390 +#include <linux/threads.h>
38391 +
38392 +#ifndef _I386_BITOPS_H
38393 +#include <asm/bitops.h>
38394 +#endif
38395 +
38396 +#include <linux/slab.h>
38397 +#include <linux/list.h>
38398 +#include <linux/spinlock.h>
38399 +
38400 +/* Is this pagetable pinned? */
38401 +#define PG_pinned PG_arch_1
38402 +
38403 +struct mm_struct;
38404 +struct vm_area_struct;
38405 +
38406 +/*
38407 + * ZERO_PAGE is a global shared page that is always zero: used
38408 + * for zero-mapped memory areas etc..
38409 + */
38410 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
38411 +extern unsigned long empty_zero_page[1024];
38412 +extern pgd_t *swapper_pg_dir;
38413 +extern kmem_cache_t *pgd_cache;
38414 +extern kmem_cache_t *pmd_cache;
38415 +extern spinlock_t pgd_lock;
38416 +extern struct page *pgd_list;
38417 +
38418 +void pmd_ctor(void *, kmem_cache_t *, unsigned long);
38419 +void pgd_ctor(void *, kmem_cache_t *, unsigned long);
38420 +void pgd_dtor(void *, kmem_cache_t *, unsigned long);
38421 +void pgtable_cache_init(void);
38422 +void paging_init(void);
38423 +
38424 +/*
38425 + * The Linux x86 paging architecture is 'compile-time dual-mode', it
38426 + * implements both the traditional 2-level x86 page tables and the
38427 + * newer 3-level PAE-mode page tables.
38428 + */
38429 +#ifdef CONFIG_X86_PAE
38430 +# include <asm/pgtable-3level-defs.h>
38431 +# define PMD_SIZE (1UL << PMD_SHIFT)
38432 +# define PMD_MASK (~(PMD_SIZE-1))
38433 +#else
38434 +# include <asm/pgtable-2level-defs.h>
38435 +#endif
38436 +
38437 +#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
38438 +#define PGDIR_MASK (~(PGDIR_SIZE-1))
38439 +
38440 +#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
38441 +#define FIRST_USER_ADDRESS 0
38442 +
38443 +#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
38444 +#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
38445 +
38446 +#define TWOLEVEL_PGDIR_SHIFT 22
38447 +#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
38448 +#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
38449 +
38450 +/* Just any arbitrary offset to the start of the vmalloc VM area: the
38451 + * current 8MB value just means that there will be an 8MB "hole" after the
38452 + * physical memory until the kernel virtual memory starts. That means that
38453 + * any out-of-bounds memory accesses will hopefully be caught.
38454 + * The vmalloc() routines leave a hole of 4kB between each vmalloced
38455 + * area for the same reason. ;)
38456 + */
38457 +#define VMALLOC_OFFSET (8*1024*1024)
38458 +#define VMALLOC_START (((unsigned long) high_memory + vmalloc_earlyreserve + \
38459 + 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
38460 +#ifdef CONFIG_HIGHMEM
38461 +# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
38462 +#else
38463 +# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
38464 +#endif
38465 +
38466 +/*
38467 + * _PAGE_PSE set in the page directory entry just means that
38468 + * the page directory entry points directly to a 4MB-aligned block of
38469 + * memory.
38470 + */
38471 +#define _PAGE_BIT_PRESENT 0
38472 +#define _PAGE_BIT_RW 1
38473 +#define _PAGE_BIT_USER 2
38474 +#define _PAGE_BIT_PWT 3
38475 +#define _PAGE_BIT_PCD 4
38476 +#define _PAGE_BIT_ACCESSED 5
38477 +#define _PAGE_BIT_DIRTY 6
38478 +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */
38479 +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
38480 +/*#define _PAGE_BIT_UNUSED1 9*/ /* available for programmer */
38481 +#define _PAGE_BIT_UNUSED2 10
38482 +#define _PAGE_BIT_UNUSED3 11
38483 +#define _PAGE_BIT_NX 63
38484 +
38485 +#define _PAGE_PRESENT 0x001
38486 +#define _PAGE_RW 0x002
38487 +#define _PAGE_USER 0x004
38488 +#define _PAGE_PWT 0x008
38489 +#define _PAGE_PCD 0x010
38490 +#define _PAGE_ACCESSED 0x020
38491 +#define _PAGE_DIRTY 0x040
38492 +#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. */
38493 +#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */
38494 +/*#define _PAGE_UNUSED1 0x200*/ /* available for programmer */
38495 +#define _PAGE_UNUSED2 0x400
38496 +#define _PAGE_UNUSED3 0x800
38497 +
38498 +/* If _PAGE_PRESENT is clear, we use these: */
38499 +#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
38500 +#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE;
38501 + pte_present gives true */
38502 +#ifdef CONFIG_X86_PAE
38503 +#define _PAGE_NX (1ULL<<_PAGE_BIT_NX)
38504 +#else
38505 +#define _PAGE_NX 0
38506 +#endif
38507 +
38508 +/* Mapped page is I/O or foreign and has no associated page struct. */
38509 +#define _PAGE_IO 0x200
38510 +
38511 +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
38512 +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
38513 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
38514 +
38515 +#define PAGE_NONE \
38516 + __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
38517 +#define PAGE_SHARED \
38518 + __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
38519 +
38520 +#define PAGE_SHARED_EXEC \
38521 + __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
38522 +#define PAGE_COPY_NOEXEC \
38523 + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
38524 +#define PAGE_COPY_EXEC \
38525 + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
38526 +#define PAGE_COPY \
38527 + PAGE_COPY_NOEXEC
38528 +#define PAGE_READONLY \
38529 + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
38530 +#define PAGE_READONLY_EXEC \
38531 + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
38532 +
38533 +#define _PAGE_KERNEL \
38534 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
38535 +#define _PAGE_KERNEL_EXEC \
38536 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
38537 +
38538 +extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
38539 +#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
38540 +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
38541 +#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
38542 +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
38543 +
38544 +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
38545 +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
38546 +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
38547 +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
38548 +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
38549 +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
38550 +
38551 +/*
38552 + * The i386 can't do page protection for execute, and treats it
38553 + * the same as read. Also, write permissions imply read permissions.
38554 + * This is the closest we can get.
38555 + */
38556 +#define __P000 PAGE_NONE
38557 +#define __P001 PAGE_READONLY
38558 +#define __P010 PAGE_COPY
38559 +#define __P011 PAGE_COPY
38560 +#define __P100 PAGE_READONLY_EXEC
38561 +#define __P101 PAGE_READONLY_EXEC
38562 +#define __P110 PAGE_COPY_EXEC
38563 +#define __P111 PAGE_COPY_EXEC
38564 +
38565 +#define __S000 PAGE_NONE
38566 +#define __S001 PAGE_READONLY
38567 +#define __S010 PAGE_SHARED
38568 +#define __S011 PAGE_SHARED
38569 +#define __S100 PAGE_READONLY_EXEC
38570 +#define __S101 PAGE_READONLY_EXEC
38571 +#define __S110 PAGE_SHARED_EXEC
38572 +#define __S111 PAGE_SHARED_EXEC
38573 +
38574 +/*
38575 + * Define this if things work differently on an i386 and an i486:
38576 + * it will (on an i486) warn about kernel memory accesses that are
38577 + * done without an 'access_ok(VERIFY_WRITE,..)'
38578 + */
38579 +#undef TEST_ACCESS_OK
38580 +
38581 +/* The boot page tables (all created as a single array) */
38582 +extern unsigned long pg0[];
38583 +
38584 +#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
38585 +
38586 +/* To avoid harmful races, pmd_none(x) should check only the lower word when PAE is enabled */
38587 +#define pmd_none(x) (!(unsigned long)__pmd_val(x))
38588 +#if CONFIG_XEN_COMPAT <= 0x030002
38589 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since writable
38590 + page tables (wr.p.t.) can temporarily clear it. */
38591 +#define pmd_present(x) (__pmd_val(x))
38592 +#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
38593 +#else
38594 +#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
38595 +#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
38596 +#endif
38597 +
38598 +
38599 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
38600 +
38601 +/*
38602 + * The following only work if pte_present() is true.
38603 + * Undefined behaviour if not..
38604 + */
38605 +static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
38606 +static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
38607 +static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; }
38608 +static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; }
38609 +static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; }
38610 +static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; }
38611 +
38612 +/*
38613 + * The following only works if pte_present() is not true.
38614 + */
38615 +static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; }
38616 +
38617 +static inline pte_t pte_rdprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; }
38618 +static inline pte_t pte_exprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; }
38619 +static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
38620 +static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
38621 +static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; }
38622 +static inline pte_t pte_mkread(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; }
38623 +static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; }
38624 +static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; }
38625 +static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
38626 +static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
38627 +static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; }
38628 +
38629 +#ifdef CONFIG_X86_PAE
38630 +# include <asm/pgtable-3level.h>
38631 +#else
38632 +# include <asm/pgtable-2level.h>
38633 +#endif
38634 +
38635 +#define ptep_test_and_clear_dirty(vma, addr, ptep) \
38636 +({ \
38637 + pte_t __pte = *(ptep); \
38638 + int __ret = pte_dirty(__pte); \
38639 + if (__ret) { \
38640 + __pte = pte_mkclean(__pte); \
38641 + if ((vma)->vm_mm != current->mm || \
38642 + HYPERVISOR_update_va_mapping(addr, __pte, 0)) \
38643 + (ptep)->pte_low = __pte.pte_low; \
38644 + } \
38645 + __ret; \
38646 +})
38647 +
38648 +#define ptep_test_and_clear_young(vma, addr, ptep) \
38649 +({ \
38650 + pte_t __pte = *(ptep); \
38651 + int __ret = pte_young(__pte); \
38652 + if (__ret) \
38653 + __pte = pte_mkold(__pte); \
38654 + if ((vma)->vm_mm != current->mm || \
38655 + HYPERVISOR_update_va_mapping(addr, __pte, 0)) \
38656 + (ptep)->pte_low = __pte.pte_low; \
38657 + __ret; \
38658 +})
38659 +
38660 +#define ptep_get_and_clear_full(mm, addr, ptep, full) \
38661 + ((full) ? ({ \
38662 + pte_t __res = *(ptep); \
38663 + if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) \
38664 + xen_l1_entry_update(ptep, __pte(0)); \
38665 + else \
38666 + *(ptep) = __pte(0); \
38667 + __res; \
38668 + }) : \
38669 + ptep_get_and_clear(mm, addr, ptep))
38670 +
38671 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
38672 +{
38673 + pte_t pte = *ptep;
38674 + if (pte_write(pte))
38675 + set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
38676 +}
38677 +
38678 +/*
38679 + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
38680 + *
38681 + * dst - pointer to pgd range anywhere on a pgd page
38682 + * src - ""
38683 + * count - the number of pgds to copy.
38684 + *
38685 + * dst and src can be on the same page, but the range must not overlap,
38686 + * and must not cross a page boundary.
38687 + */
38688 +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
38689 +{
38690 + memcpy(dst, src, count * sizeof(pgd_t));
38691 +}
38692 +
38693 +/*
38694 + * Macro to mark a page protection value as "uncacheable". On processors which do not support
38695 + * it, this is a no-op.
38696 + */
38697 +#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \
38698 + ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))
38699 +
38700 +/*
38701 + * Conversion functions: convert a page and protection to a page entry,
38702 + * and a page entry and page directory to the page they refer to.
38703 + */
38704 +
38705 +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
38706 +
38707 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
38708 +{
38709 + /*
38710 + * Since this might change the present bit (which controls whether
38711 + * a pte_t object has undergone p2m translation), we must use
38712 + * pte_val() on the input pte and __pte() for the return value.
38713 + */
38714 + paddr_t pteval = pte_val(pte);
38715 +
38716 + pteval &= _PAGE_CHG_MASK;
38717 + pteval |= pgprot_val(newprot);
38718 +#ifdef CONFIG_X86_PAE
38719 + pteval &= __supported_pte_mask;
38720 +#endif
38721 + return __pte(pteval);
38722 +}
38723 +
38724 +#define pmd_large(pmd) \
38725 +((__pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
38726 +
38727 +/*
38728 + * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
38729 + *
38730 + * this macro returns the index of the entry in the pgd page which would
38731 + * control the given virtual address
38732 + */
38733 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
38734 +#define pgd_index_k(addr) pgd_index(addr)
38735 +
38736 +/*
38737 + * pgd_offset() returns a (pgd_t *)
38738 + * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
38739 + */
38740 +#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
38741 +
38742 +/*
38743 + * a shortcut which implies the use of the kernel's pgd, instead
38744 + * of a process's
38745 + */
38746 +#define pgd_offset_k(address) pgd_offset(&init_mm, address)
38747 +
38748 +/*
38749 + * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
38750 + *
38751 + * this macro returns the index of the entry in the pmd page which would
38752 + * control the given virtual address
38753 + */
38754 +#define pmd_index(address) \
38755 + (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
38756 +
38757 +/*
38758 + * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
38759 + *
38760 + * this macro returns the index of the entry in the pte page which would
38761 + * control the given virtual address
38762 + */
38763 +#define pte_index(address) \
38764 + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
38765 +#define pte_offset_kernel(dir, address) \
38766 + ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address))
38767 +
38768 +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
38769 +
38770 +#define pmd_page_kernel(pmd) \
38771 + ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
38772 +
38773 +/*
38774 + * Helper function that returns the kernel pagetable entry controlling
38775 + * the virtual address 'address'. NULL means no pagetable entry present.
38776 + * NOTE: the return type is pte_t but if the pmd is PSE then we return it
38777 + * as a pte too.
38778 + */
38779 +extern pte_t *lookup_address(unsigned long address);
38780 +
38781 +/*
38782 + * Make a given kernel text page executable/non-executable.
38783 + * Returns the previous executability setting of that page (which
38784 + * is used to restore the previous state). Used by the SMP bootup code.
38785 + * NOTE: this is an __init function for security reasons.
38786 + */
38787 +#ifdef CONFIG_X86_PAE
38788 + extern int set_kernel_exec(unsigned long vaddr, int enable);
38789 +#else
38790 + static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
38791 +#endif
38792 +
38793 +extern void noexec_setup(const char *str);
38794 +
38795 +#if defined(CONFIG_HIGHPTE)
38796 +#define pte_offset_map(dir, address) \
38797 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \
38798 + pte_index(address))
38799 +#define pte_offset_map_nested(dir, address) \
38800 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + \
38801 + pte_index(address))
38802 +#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
38803 +#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
38804 +#else
38805 +#define pte_offset_map(dir, address) \
38806 + ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
38807 +#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
38808 +#define pte_unmap(pte) do { } while (0)
38809 +#define pte_unmap_nested(pte) do { } while (0)
38810 +#endif
38811 +
38812 +#define __HAVE_ARCH_PTEP_ESTABLISH
38813 +#define ptep_establish(vma, address, ptep, pteval) \
38814 + do { \
38815 + if ( likely((vma)->vm_mm == current->mm) ) { \
38816 + BUG_ON(HYPERVISOR_update_va_mapping(address, \
38817 + pteval, \
38818 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
38819 + UVMF_INVLPG|UVMF_MULTI)); \
38820 + } else { \
38821 + xen_l1_entry_update(ptep, pteval); \
38822 + flush_tlb_page(vma, address); \
38823 + } \
38824 + } while (0)
38825 +
38826 +/*
38827 + * The i386 doesn't have any external MMU info: the kernel page
38828 + * tables contain all the necessary information.
38829 + *
38830 + * Also, we only update the dirty/accessed state if we set
38831 + * the dirty bit by hand in the kernel, since the hardware
38832 + * will do the accessed bit for us, and we don't want to
38833 + * race with other CPU's that might be updating the dirty
38834 + * bit at the same time.
38835 + */
38836 +#define update_mmu_cache(vma,address,pte) do { } while (0)
38837 +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
38838 +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
38839 + do { \
38840 + if (dirty) \
38841 + ptep_establish(vma, address, ptep, entry); \
38842 + } while (0)
38843 +
38844 +#include <xen/features.h>
38845 +void make_lowmem_page_readonly(void *va, unsigned int feature);
38846 +void make_lowmem_page_writable(void *va, unsigned int feature);
38847 +void make_page_readonly(void *va, unsigned int feature);
38848 +void make_page_writable(void *va, unsigned int feature);
38849 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
38850 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
38851 +
38852 +#define virt_to_ptep(va) \
38853 +({ \
38854 + pte_t *__ptep = lookup_address((unsigned long)(va)); \
38855 + BUG_ON(!__ptep || !pte_present(*__ptep)); \
38856 + __ptep; \
38857 +})
38858 +
38859 +#define arbitrary_virt_to_machine(va) \
38860 + (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
38861 + | ((unsigned long)(va) & (PAGE_SIZE - 1)))
38862 +
38863 +#endif /* !__ASSEMBLY__ */
38864 +
38865 +#ifdef CONFIG_FLATMEM
38866 +#define kern_addr_valid(addr) (1)
38867 +#endif /* CONFIG_FLATMEM */
38868 +
38869 +int direct_remap_pfn_range(struct vm_area_struct *vma,
38870 + unsigned long address,
38871 + unsigned long mfn,
38872 + unsigned long size,
38873 + pgprot_t prot,
38874 + domid_t domid);
38875 +int direct_kernel_remap_pfn_range(unsigned long address,
38876 + unsigned long mfn,
38877 + unsigned long size,
38878 + pgprot_t prot,
38879 + domid_t domid);
38880 +int create_lookup_pte_addr(struct mm_struct *mm,
38881 + unsigned long address,
38882 + uint64_t *ptep);
38883 +int touch_pte_range(struct mm_struct *mm,
38884 + unsigned long address,
38885 + unsigned long size);
38886 +
38887 +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
38888 + unsigned long addr, unsigned long end, pgprot_t newprot);
38889 +
38890 +#define arch_change_pte_range(mm, pmd, addr, end, newprot) \
38891 + xen_change_pte_range(mm, pmd, addr, end, newprot)
38892 +
38893 +#define io_remap_pfn_range(vma,from,pfn,size,prot) \
38894 +direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
38895 +
38896 +#define MK_IOSPACE_PFN(space, pfn) (pfn)
38897 +#define GET_IOSPACE(pfn) 0
38898 +#define GET_PFN(pfn) (pfn)
38899 +
38900 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
38901 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
38902 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
38903 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
38904 +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
38905 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
38906 +#define __HAVE_ARCH_PTE_SAME
38907 +#include <asm-generic/pgtable.h>
38908 +
38909 +#endif /* _I386_PGTABLE_H */
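A minimal usage sketch, not part of the hunk above: arbitrary_virt_to_machine() combines the MFN taken from the PTE with the in-page offset, which is the form of address a Xen hypercall expects instead of a pseudo-physical one. my_buffer_maddr() is a hypothetical helper; maddr_t is assumed to be provided by the Xen page/maddr headers.

static maddr_t my_buffer_maddr(void *kbuf)
{
	/* kbuf must be a directly mapped kernel address with a present PTE,
	 * or the BUG_ON() inside virt_to_ptep() will trigger. */
	return arbitrary_virt_to_machine(kbuf);
}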
38910 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/processor_32.h
38911 ===================================================================
38912 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
38913 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/processor_32.h 2008-01-28 12:24:19.000000000 +0100
38914 @@ -0,0 +1,743 @@
38915 +/*
38916 + * include/asm-i386/processor.h
38917 + *
38918 + * Copyright (C) 1994 Linus Torvalds
38919 + */
38920 +
38921 +#ifndef __ASM_I386_PROCESSOR_H
38922 +#define __ASM_I386_PROCESSOR_H
38923 +
38924 +#include <asm/vm86.h>
38925 +#include <asm/math_emu.h>
38926 +#include <asm/segment.h>
38927 +#include <asm/page.h>
38928 +#include <asm/types.h>
38929 +#include <asm/sigcontext.h>
38930 +#include <asm/cpufeature.h>
38931 +#include <asm/msr.h>
38932 +#include <asm/system.h>
38933 +#include <linux/cache.h>
38934 +#include <linux/threads.h>
38935 +#include <asm/percpu.h>
38936 +#include <linux/cpumask.h>
38937 +#include <xen/interface/physdev.h>
38938 +
38939 +/* flag for disabling the tsc */
38940 +extern int tsc_disable;
38941 +
38942 +struct desc_struct {
38943 + unsigned long a,b;
38944 +};
38945 +
38946 +#define desc_empty(desc) \
38947 + (!((desc)->a | (desc)->b))
38948 +
38949 +#define desc_equal(desc1, desc2) \
38950 + (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
38951 +/*
38952 + * Default implementation of macro that returns current
38953 + * instruction pointer ("program counter").
38954 + */
38955 +#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
38956 +
38957 +/*
38958 + * CPU type and hardware bug flags. Kept separately for each CPU.
38959 + * Members of this structure are referenced in head.S, so think twice
38960 + * before touching them. [mj]
38961 + */
38962 +
38963 +struct cpuinfo_x86 {
38964 + __u8 x86; /* CPU family */
38965 + __u8 x86_vendor; /* CPU vendor */
38966 + __u8 x86_model;
38967 + __u8 x86_mask;
38968 + char wp_works_ok; /* It doesn't on 386's */
38969 + char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
38970 + char hard_math;
38971 + char rfu;
38972 + int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
38973 + unsigned long x86_capability[NCAPINTS];
38974 + char x86_vendor_id[16];
38975 + char x86_model_id[64];
38976 + int x86_cache_size; /* in KB - valid for CPUS which support this
38977 + call */
38978 + int x86_cache_alignment; /* In bytes */
38979 + char fdiv_bug;
38980 + char f00f_bug;
38981 + char coma_bug;
38982 + char pad0;
38983 + int x86_power;
38984 + unsigned long loops_per_jiffy;
38985 +#ifdef CONFIG_SMP
38986 + cpumask_t llc_shared_map; /* cpus sharing the last level cache */
38987 +#endif
38988 + unsigned char x86_max_cores; /* cpuid returned max cores value */
38989 + unsigned char apicid;
38990 +#ifdef CONFIG_SMP
38991 + unsigned char booted_cores; /* number of cores as seen by OS */
38992 + __u8 phys_proc_id; /* Physical processor id. */
38993 + __u8 cpu_core_id; /* Core id */
38994 +#endif
38995 +} __attribute__((__aligned__(SMP_CACHE_BYTES)));
38996 +
38997 +#define X86_VENDOR_INTEL 0
38998 +#define X86_VENDOR_CYRIX 1
38999 +#define X86_VENDOR_AMD 2
39000 +#define X86_VENDOR_UMC 3
39001 +#define X86_VENDOR_NEXGEN 4
39002 +#define X86_VENDOR_CENTAUR 5
39003 +#define X86_VENDOR_RISE 6
39004 +#define X86_VENDOR_TRANSMETA 7
39005 +#define X86_VENDOR_NSC 8
39006 +#define X86_VENDOR_NUM 9
39007 +#define X86_VENDOR_UNKNOWN 0xff
39008 +
39009 +/*
39010 + * capabilities of CPUs
39011 + */
39012 +
39013 +extern struct cpuinfo_x86 boot_cpu_data;
39014 +extern struct cpuinfo_x86 new_cpu_data;
39015 +#ifndef CONFIG_X86_NO_TSS
39016 +extern struct tss_struct doublefault_tss;
39017 +DECLARE_PER_CPU(struct tss_struct, init_tss);
39018 +#endif
39019 +
39020 +#ifdef CONFIG_SMP
39021 +extern struct cpuinfo_x86 cpu_data[];
39022 +#define current_cpu_data cpu_data[smp_processor_id()]
39023 +#else
39024 +#define cpu_data (&boot_cpu_data)
39025 +#define current_cpu_data boot_cpu_data
39026 +#endif
39027 +
39028 +extern int cpu_llc_id[NR_CPUS];
39029 +extern char ignore_fpu_irq;
39030 +
39031 +extern void identify_cpu(struct cpuinfo_x86 *);
39032 +extern void print_cpu_info(struct cpuinfo_x86 *);
39033 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
39034 +extern unsigned short num_cache_leaves;
39035 +
39036 +#ifdef CONFIG_X86_HT
39037 +extern void detect_ht(struct cpuinfo_x86 *c);
39038 +#else
39039 +static inline void detect_ht(struct cpuinfo_x86 *c) {}
39040 +#endif
39041 +
39042 +/*
39043 + * EFLAGS bits
39044 + */
39045 +#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
39046 +#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
39047 +#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
39048 +#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
39049 +#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
39050 +#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
39051 +#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
39052 +#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
39053 +#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
39054 +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
39055 +#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
39056 +#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
39057 +#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
39058 +#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
39059 +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
39060 +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
39061 +#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
39062 +
39063 +/*
39064 + * Generic CPUID function
39065 + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
39066 + * resulting in stale register contents being returned.
39067 + */
39068 +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
39069 +{
39070 + __asm__(XEN_CPUID
39071 + : "=a" (*eax),
39072 + "=b" (*ebx),
39073 + "=c" (*ecx),
39074 + "=d" (*edx)
39075 + : "0" (op), "c"(0));
39076 +}
39077 +
39078 +/* Some CPUID calls want 'count' to be placed in ecx */
39079 +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
39080 + int *edx)
39081 +{
39082 + __asm__(XEN_CPUID
39083 + : "=a" (*eax),
39084 + "=b" (*ebx),
39085 + "=c" (*ecx),
39086 + "=d" (*edx)
39087 + : "0" (op), "c" (count));
39088 +}
39089 +
39090 +/*
39091 + * CPUID functions returning a single datum
39092 + */
39093 +static inline unsigned int cpuid_eax(unsigned int op)
39094 +{
39095 + unsigned int eax;
39096 +
39097 + __asm__(XEN_CPUID
39098 + : "=a" (eax)
39099 + : "0" (op)
39100 + : "bx", "cx", "dx");
39101 + return eax;
39102 +}
39103 +static inline unsigned int cpuid_ebx(unsigned int op)
39104 +{
39105 + unsigned int eax, ebx;
39106 +
39107 + __asm__(XEN_CPUID
39108 + : "=a" (eax), "=b" (ebx)
39109 + : "0" (op)
39110 + : "cx", "dx" );
39111 + return ebx;
39112 +}
39113 +static inline unsigned int cpuid_ecx(unsigned int op)
39114 +{
39115 + unsigned int eax, ecx;
39116 +
39117 + __asm__(XEN_CPUID
39118 + : "=a" (eax), "=c" (ecx)
39119 + : "0" (op)
39120 + : "bx", "dx" );
39121 + return ecx;
39122 +}
39123 +static inline unsigned int cpuid_edx(unsigned int op)
39124 +{
39125 + unsigned int eax, edx;
39126 +
39127 + __asm__(XEN_CPUID
39128 + : "=a" (eax), "=d" (edx)
39129 + : "0" (op)
39130 + : "bx", "cx");
39131 + return edx;
39132 +}
39133 +
39134 +#define load_cr3(pgdir) write_cr3(__pa(pgdir))
39135 +
39136 +/*
39137 + * Intel CPU features in CR4
39138 + */
39139 +#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
39140 +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
39141 +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
39142 +#define X86_CR4_DE 0x0008 /* enable debugging extensions */
39143 +#define X86_CR4_PSE 0x0010 /* enable page size extensions */
39144 +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
39145 +#define X86_CR4_MCE 0x0040 /* Machine check enable */
39146 +#define X86_CR4_PGE 0x0080 /* enable global pages */
39147 +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
39148 +#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
39149 +#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
39150 +
39151 +/*
39152 + * Save the cr4 feature set we're using (ie
39153 + * Pentium 4MB enable and PPro Global page
39154 + * enable), so that any CPUs that boot up
39155 + * after us can get the correct flags.
39156 + */
39157 +extern unsigned long mmu_cr4_features;
39158 +
39159 +static inline void set_in_cr4 (unsigned long mask)
39160 +{
39161 + unsigned cr4;
39162 + mmu_cr4_features |= mask;
39163 + cr4 = read_cr4();
39164 + cr4 |= mask;
39165 + write_cr4(cr4);
39166 +}
39167 +
39168 +static inline void clear_in_cr4 (unsigned long mask)
39169 +{
39170 + unsigned cr4;
39171 + mmu_cr4_features &= ~mask;
39172 + cr4 = read_cr4();
39173 + cr4 &= ~mask;
39174 + write_cr4(cr4);
39175 +}
39176 +
39177 +/*
39178 + * NSC/Cyrix CPU configuration register indexes
39179 + */
39180 +
39181 +#define CX86_PCR0 0x20
39182 +#define CX86_GCR 0xb8
39183 +#define CX86_CCR0 0xc0
39184 +#define CX86_CCR1 0xc1
39185 +#define CX86_CCR2 0xc2
39186 +#define CX86_CCR3 0xc3
39187 +#define CX86_CCR4 0xe8
39188 +#define CX86_CCR5 0xe9
39189 +#define CX86_CCR6 0xea
39190 +#define CX86_CCR7 0xeb
39191 +#define CX86_PCR1 0xf0
39192 +#define CX86_DIR0 0xfe
39193 +#define CX86_DIR1 0xff
39194 +#define CX86_ARR_BASE 0xc4
39195 +#define CX86_RCR_BASE 0xdc
39196 +
39197 +/*
39198 + * NSC/Cyrix CPU indexed register access macros
39199 + */
39200 +
39201 +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
39202 +
39203 +#define setCx86(reg, data) do { \
39204 + outb((reg), 0x22); \
39205 + outb((data), 0x23); \
39206 +} while (0)
39207 +
39208 +/* Stop speculative execution */
39209 +static inline void sync_core(void)
39210 +{
39211 + int tmp;
39212 + asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
39213 +}
39214 +
39215 +static inline void __monitor(const void *eax, unsigned long ecx,
39216 + unsigned long edx)
39217 +{
39218 + /* "monitor %eax,%ecx,%edx;" */
39219 + asm volatile(
39220 + ".byte 0x0f,0x01,0xc8;"
39221 + : :"a" (eax), "c" (ecx), "d"(edx));
39222 +}
39223 +
39224 +static inline void __mwait(unsigned long eax, unsigned long ecx)
39225 +{
39226 + /* "mwait %eax,%ecx;" */
39227 + asm volatile(
39228 + ".byte 0x0f,0x01,0xc9;"
39229 + : :"a" (eax), "c" (ecx));
39230 +}
39231 +
39232 +/* from system description table in BIOS. Mostly for MCA use, but
39233 +others may find it useful. */
39234 +extern unsigned int machine_id;
39235 +extern unsigned int machine_submodel_id;
39236 +extern unsigned int BIOS_revision;
39237 +extern unsigned int mca_pentium_flag;
39238 +
39239 +/* Boot loader type from the setup header */
39240 +extern int bootloader_type;
39241 +
39242 +/*
39243 + * User space process size: 3GB (default).
39244 + */
39245 +#define TASK_SIZE (PAGE_OFFSET)
39246 +
39247 +/* This decides where the kernel will search for a free chunk of vm
39248 + * space during mmap's.
39249 + */
39250 +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
39251 +
39252 +#define HAVE_ARCH_PICK_MMAP_LAYOUT
39253 +
39254 +/*
39255 + * Size of io_bitmap.
39256 + */
39257 +#define IO_BITMAP_BITS 65536
39258 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
39259 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
39260 +#ifndef CONFIG_X86_NO_TSS
39261 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
39262 +#endif
39263 +#define INVALID_IO_BITMAP_OFFSET 0x8000
39264 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
39265 +
39266 +struct i387_fsave_struct {
39267 + long cwd;
39268 + long swd;
39269 + long twd;
39270 + long fip;
39271 + long fcs;
39272 + long foo;
39273 + long fos;
39274 + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
39275 + long status; /* software status information */
39276 +};
39277 +
39278 +struct i387_fxsave_struct {
39279 + unsigned short cwd;
39280 + unsigned short swd;
39281 + unsigned short twd;
39282 + unsigned short fop;
39283 + long fip;
39284 + long fcs;
39285 + long foo;
39286 + long fos;
39287 + long mxcsr;
39288 + long mxcsr_mask;
39289 + long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
39290 + long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
39291 + long padding[56];
39292 +} __attribute__ ((aligned (16)));
39293 +
39294 +struct i387_soft_struct {
39295 + long cwd;
39296 + long swd;
39297 + long twd;
39298 + long fip;
39299 + long fcs;
39300 + long foo;
39301 + long fos;
39302 + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
39303 + unsigned char ftop, changed, lookahead, no_update, rm, alimit;
39304 + struct info *info;
39305 + unsigned long entry_eip;
39306 +};
39307 +
39308 +union i387_union {
39309 + struct i387_fsave_struct fsave;
39310 + struct i387_fxsave_struct fxsave;
39311 + struct i387_soft_struct soft;
39312 +};
39313 +
39314 +typedef struct {
39315 + unsigned long seg;
39316 +} mm_segment_t;
39317 +
39318 +struct thread_struct;
39319 +
39320 +#ifndef CONFIG_X86_NO_TSS
39321 +struct tss_struct {
39322 + unsigned short back_link,__blh;
39323 + unsigned long esp0;
39324 + unsigned short ss0,__ss0h;
39325 + unsigned long esp1;
39326 + unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
39327 + unsigned long esp2;
39328 + unsigned short ss2,__ss2h;
39329 + unsigned long __cr3;
39330 + unsigned long eip;
39331 + unsigned long eflags;
39332 + unsigned long eax,ecx,edx,ebx;
39333 + unsigned long esp;
39334 + unsigned long ebp;
39335 + unsigned long esi;
39336 + unsigned long edi;
39337 + unsigned short es, __esh;
39338 + unsigned short cs, __csh;
39339 + unsigned short ss, __ssh;
39340 + unsigned short ds, __dsh;
39341 + unsigned short fs, __fsh;
39342 + unsigned short gs, __gsh;
39343 + unsigned short ldt, __ldth;
39344 + unsigned short trace, io_bitmap_base;
39345 + /*
39346 + * The extra 1 is there because the CPU will access an
39347 + * additional byte beyond the end of the IO permission
39348 + * bitmap. The extra byte must be all 1 bits, and must
39349 + * be within the limit.
39350 + */
39351 + unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
39352 + /*
39353 + * Cache the current maximum and the last task that used the bitmap:
39354 + */
39355 + unsigned long io_bitmap_max;
39356 + struct thread_struct *io_bitmap_owner;
39357 + /*
39358 + * pads the TSS to be cacheline-aligned (size is 0x100)
39359 + */
39360 + unsigned long __cacheline_filler[35];
39361 + /*
39362 + * .. and then another 0x100 bytes for emergency kernel stack
39363 + */
39364 + unsigned long stack[64];
39365 +} __attribute__((packed));
39366 +#endif
39367 +
39368 +#define ARCH_MIN_TASKALIGN 16
39369 +
39370 +struct thread_struct {
39371 +/* cached TLS descriptors. */
39372 + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
39373 + unsigned long esp0;
39374 + unsigned long sysenter_cs;
39375 + unsigned long eip;
39376 + unsigned long esp;
39377 + unsigned long fs;
39378 + unsigned long gs;
39379 +/* Hardware debugging registers */
39380 + unsigned long debugreg[8]; /* %%db0-7 debug registers */
39381 +/* fault info */
39382 + unsigned long cr2, trap_no, error_code;
39383 +/* floating point info */
39384 + union i387_union i387;
39385 +/* virtual 86 mode info */
39386 + struct vm86_struct __user * vm86_info;
39387 + unsigned long screen_bitmap;
39388 + unsigned long v86flags, v86mask, saved_esp0;
39389 + unsigned int saved_fs, saved_gs;
39390 +/* IO permissions */
39391 + unsigned long *io_bitmap_ptr;
39392 + unsigned long iopl;
39393 +/* max allowed port in the bitmap, in bytes: */
39394 + unsigned long io_bitmap_max;
39395 +};
39396 +
39397 +#define INIT_THREAD { \
39398 + .vm86_info = NULL, \
39399 + .sysenter_cs = __KERNEL_CS, \
39400 + .io_bitmap_ptr = NULL, \
39401 +}
39402 +
39403 +#ifndef CONFIG_X86_NO_TSS
39404 +/*
39405 + * Note that the .io_bitmap member must be extra-big. This is because
39406 + * the CPU will access an additional byte beyond the end of the IO
39407 + * permission bitmap. The extra byte must be all 1 bits, and must
39408 + * be within the limit.
39409 + */
39410 +#define INIT_TSS { \
39411 + .esp0 = sizeof(init_stack) + (long)&init_stack, \
39412 + .ss0 = __KERNEL_DS, \
39413 + .ss1 = __KERNEL_CS, \
39414 + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
39415 + .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
39416 +}
39417 +
39418 +static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
39419 +{
39420 + tss->esp0 = thread->esp0;
39421 + /* This can only happen when SEP is enabled, no need to test "SEP"arately */
39422 + if (unlikely(tss->ss1 != thread->sysenter_cs)) {
39423 + tss->ss1 = thread->sysenter_cs;
39424 + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
39425 + }
39426 +}
39427 +#define load_esp0(tss, thread) \
39428 + __load_esp0(tss, thread)
39429 +#else
39430 +#define load_esp0(tss, thread) do { \
39431 + if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
39432 + BUG(); \
39433 +} while (0)
39434 +#endif
39435 +
39436 +#define start_thread(regs, new_eip, new_esp) do { \
39437 + __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \
39438 + set_fs(USER_DS); \
39439 + regs->xds = __USER_DS; \
39440 + regs->xes = __USER_DS; \
39441 + regs->xss = __USER_DS; \
39442 + regs->xcs = __USER_CS; \
39443 + regs->eip = new_eip; \
39444 + regs->esp = new_esp; \
39445 +} while (0)
39446 +
39447 +/*
39448 + * These special macros can be used to get or set a debugging register
39449 + */
39450 +#define get_debugreg(var, register) \
39451 + (var) = HYPERVISOR_get_debugreg((register))
39452 +#define set_debugreg(value, register) \
39453 + WARN_ON(HYPERVISOR_set_debugreg((register), (value)))
39454 +
39455 +/*
39456 + * Set IOPL bits in EFLAGS from given mask
39457 + */
39458 +static inline void set_iopl_mask(unsigned mask)
39459 +{
39460 + struct physdev_set_iopl set_iopl;
39461 +
39462 + /* Force the change at ring 0. */
39463 + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
39464 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
39465 +}
39466 +
39467 +/* Forward declaration, a strange C thing */
39468 +struct task_struct;
39469 +struct mm_struct;
39470 +
39471 +/* Free all resources held by a thread. */
39472 +extern void release_thread(struct task_struct *);
39473 +
39474 +/* Prepare to copy thread state - unlazy all lazy status */
39475 +extern void prepare_to_copy(struct task_struct *tsk);
39476 +
39477 +/*
39478 + * create a kernel thread without removing it from tasklists
39479 + */
39480 +extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
39481 +
39482 +extern unsigned long thread_saved_pc(struct task_struct *tsk);
39483 +void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack);
39484 +
39485 +unsigned long get_wchan(struct task_struct *p);
39486 +
39487 +#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
39488 +#define KSTK_TOP(info) \
39489 +({ \
39490 + unsigned long *__ptr = (unsigned long *)(info); \
39491 + (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
39492 +})
39493 +
39494 +/*
39495 + * The below -8 is to reserve 8 bytes on top of the ring0 stack.
39496 + * This is necessary to guarantee that the entire "struct pt_regs"
39497 + * is accessible even if the CPU hasn't stored the SS/ESP registers
39498 + * on the stack (interrupt gate does not save these registers
39499 + * when switching to the same priv ring).
39500 + * Therefore beware: accessing the xss/esp fields of the
39501 + * "struct pt_regs" is possible, but they may contain
39502 + * completely wrong values.
39503 + */
39504 +#define task_pt_regs(task) \
39505 +({ \
39506 + struct pt_regs *__regs__; \
39507 + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
39508 + __regs__ - 1; \
39509 +})
39510 +
39511 +#define KSTK_EIP(task) (task_pt_regs(task)->eip)
39512 +#define KSTK_ESP(task) (task_pt_regs(task)->esp)
39513 +
39514 +
39515 +struct microcode_header {
39516 + unsigned int hdrver;
39517 + unsigned int rev;
39518 + unsigned int date;
39519 + unsigned int sig;
39520 + unsigned int cksum;
39521 + unsigned int ldrver;
39522 + unsigned int pf;
39523 + unsigned int datasize;
39524 + unsigned int totalsize;
39525 + unsigned int reserved[3];
39526 +};
39527 +
39528 +struct microcode {
39529 + struct microcode_header hdr;
39530 + unsigned int bits[0];
39531 +};
39532 +
39533 +typedef struct microcode microcode_t;
39534 +typedef struct microcode_header microcode_header_t;
39535 +
39536 +/* microcode format is extended from prescott processors */
39537 +struct extended_signature {
39538 + unsigned int sig;
39539 + unsigned int pf;
39540 + unsigned int cksum;
39541 +};
39542 +
39543 +struct extended_sigtable {
39544 + unsigned int count;
39545 + unsigned int cksum;
39546 + unsigned int reserved[3];
39547 + struct extended_signature sigs[0];
39548 +};
39549 +
39550 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
39551 +static inline void rep_nop(void)
39552 +{
39553 + __asm__ __volatile__("rep;nop": : :"memory");
39554 +}
39555 +
39556 +#define cpu_relax() rep_nop()
39557 +
39558 +/* generic versions from gas */
39559 +#define GENERIC_NOP1 ".byte 0x90\n"
39560 +#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
39561 +#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
39562 +#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
39563 +#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
39564 +#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
39565 +#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
39566 +#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
39567 +
39568 +/* Opteron nops */
39569 +#define K8_NOP1 GENERIC_NOP1
39570 +#define K8_NOP2 ".byte 0x66,0x90\n"
39571 +#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
39572 +#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
39573 +#define K8_NOP5 K8_NOP3 K8_NOP2
39574 +#define K8_NOP6 K8_NOP3 K8_NOP3
39575 +#define K8_NOP7 K8_NOP4 K8_NOP3
39576 +#define K8_NOP8 K8_NOP4 K8_NOP4
39577 +
39578 +/* K7 nops */
39579 +/* uses eax dependencies (arbitrary choice) */
39580 +#define K7_NOP1 GENERIC_NOP1
39581 +#define K7_NOP2 ".byte 0x8b,0xc0\n"
39582 +#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
39583 +#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
39584 +#define K7_NOP5 K7_NOP4 ASM_NOP1
39585 +#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
39586 +#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
39587 +#define K7_NOP8 K7_NOP7 ASM_NOP1
39588 +
39589 +#ifdef CONFIG_MK8
39590 +#define ASM_NOP1 K8_NOP1
39591 +#define ASM_NOP2 K8_NOP2
39592 +#define ASM_NOP3 K8_NOP3
39593 +#define ASM_NOP4 K8_NOP4
39594 +#define ASM_NOP5 K8_NOP5
39595 +#define ASM_NOP6 K8_NOP6
39596 +#define ASM_NOP7 K8_NOP7
39597 +#define ASM_NOP8 K8_NOP8
39598 +#elif defined(CONFIG_MK7)
39599 +#define ASM_NOP1 K7_NOP1
39600 +#define ASM_NOP2 K7_NOP2
39601 +#define ASM_NOP3 K7_NOP3
39602 +#define ASM_NOP4 K7_NOP4
39603 +#define ASM_NOP5 K7_NOP5
39604 +#define ASM_NOP6 K7_NOP6
39605 +#define ASM_NOP7 K7_NOP7
39606 +#define ASM_NOP8 K7_NOP8
39607 +#else
39608 +#define ASM_NOP1 GENERIC_NOP1
39609 +#define ASM_NOP2 GENERIC_NOP2
39610 +#define ASM_NOP3 GENERIC_NOP3
39611 +#define ASM_NOP4 GENERIC_NOP4
39612 +#define ASM_NOP5 GENERIC_NOP5
39613 +#define ASM_NOP6 GENERIC_NOP6
39614 +#define ASM_NOP7 GENERIC_NOP7
39615 +#define ASM_NOP8 GENERIC_NOP8
39616 +#endif
39617 +
39618 +#define ASM_NOP_MAX 8
39619 +
39620 +/* Prefetch instructions for Pentium III and AMD Athlon */
39621 +/* It's not worth caring about 3dnow! prefetches for the K6
39622 + because they are microcoded there and very slow.
39623 + However, we don't do prefetches for pre-XP Athlons currently;
39624 + that should be fixed. */
39625 +#define ARCH_HAS_PREFETCH
39626 +static inline void prefetch(const void *x)
39627 +{
39628 + alternative_input(ASM_NOP4,
39629 + "prefetchnta (%1)",
39630 + X86_FEATURE_XMM,
39631 + "r" (x));
39632 +}
39633 +
39634 +#define ARCH_HAS_PREFETCH
39635 +#define ARCH_HAS_PREFETCHW
39636 +#define ARCH_HAS_SPINLOCK_PREFETCH
39637 +
39638 +/* 3dnow! prefetch to get an exclusive cache line. Useful for
39639 + spinlocks to avoid one state transition in the cache coherency protocol. */
39640 +static inline void prefetchw(const void *x)
39641 +{
39642 + alternative_input(ASM_NOP4,
39643 + "prefetchw (%1)",
39644 + X86_FEATURE_3DNOW,
39645 + "r" (x));
39646 +}
39647 +#define spin_lock_prefetch(x) prefetchw(x)
39648 +
39649 +extern void select_idle_routine(const struct cpuinfo_x86 *c);
39650 +
39651 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
39652 +
39653 +extern unsigned long boot_option_idle_override;
39654 +extern void enable_sep_cpu(void);
39655 +extern int sysenter_setup(void);
39656 +
39657 +#endif /* __ASM_I386_PROCESSOR_H */
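A minimal usage sketch, not part of the hunk above, of the XEN_CPUID wrappers declared in this header: leaf 0 returns the vendor string in EBX, EDX, ECX in that order. my_read_vendor() is a hypothetical helper and memcpy() is assumed to be available via <linux/string.h>.

static void my_read_vendor(char vendor[13])
{
	unsigned int eax, ebx, ecx, edx;

	cpuid(0, &eax, &ebx, &ecx, &edx);	/* leaf 0: vendor string */
	memcpy(vendor + 0, &ebx, 4);
	memcpy(vendor + 4, &edx, 4);
	memcpy(vendor + 8, &ecx, 4);
	vendor[12] = '\0';
}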
39658 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/segment_32.h
39659 ===================================================================
39660 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
39661 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/segment_32.h 2007-06-12 13:14:02.000000000 +0200
39662 @@ -0,0 +1,117 @@
39663 +#ifndef _ASM_SEGMENT_H
39664 +#define _ASM_SEGMENT_H
39665 +
39666 +/*
39667 + * The layout of the per-CPU GDT under Linux:
39668 + *
39669 + * 0 - null
39670 + * 1 - reserved
39671 + * 2 - reserved
39672 + * 3 - reserved
39673 + *
39674 + * 4 - unused <==== new cacheline
39675 + * 5 - unused
39676 + *
39677 + * ------- start of TLS (Thread-Local Storage) segments:
39678 + *
39679 + * 6 - TLS segment #1 [ glibc's TLS segment ]
39680 + * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
39681 + * 8 - TLS segment #3
39682 + * 9 - reserved
39683 + * 10 - reserved
39684 + * 11 - reserved
39685 + *
39686 + * ------- start of kernel segments:
39687 + *
39688 + * 12 - kernel code segment <==== new cacheline
39689 + * 13 - kernel data segment
39690 + * 14 - default user CS
39691 + * 15 - default user DS
39692 + * 16 - TSS
39693 + * 17 - LDT
39694 + * 18 - PNPBIOS support (16->32 gate)
39695 + * 19 - PNPBIOS support
39696 + * 20 - PNPBIOS support
39697 + * 21 - PNPBIOS support
39698 + * 22 - PNPBIOS support
39699 + * 23 - APM BIOS support
39700 + * 24 - APM BIOS support
39701 + * 25 - APM BIOS support
39702 + *
39703 + * 26 - ESPFIX small SS
39704 + * 27 - unused
39705 + * 28 - unused
39706 + * 29 - unused
39707 + * 30 - unused
39708 + * 31 - TSS for double fault handler
39709 + */
39710 +#define GDT_ENTRY_TLS_ENTRIES 3
39711 +#define GDT_ENTRY_TLS_MIN 6
39712 +#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
39713 +
39714 +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
39715 +
39716 +#define GDT_ENTRY_DEFAULT_USER_CS 14
39717 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
39718 +
39719 +#define GDT_ENTRY_DEFAULT_USER_DS 15
39720 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
39721 +
39722 +#define GDT_ENTRY_KERNEL_BASE 12
39723 +
39724 +#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
39725 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
39726 +#define GET_KERNEL_CS() (__KERNEL_CS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
39727 +
39728 +#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
39729 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
39730 +#define GET_KERNEL_DS() (__KERNEL_DS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
39731 +
39732 +#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
39733 +#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
39734 +
39735 +#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
39736 +#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
39737 +
39738 +#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
39739 +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
39740 +
39741 +#define GDT_ENTRY_DOUBLEFAULT_TSS 31
39742 +
39743 +/*
39744 + * The GDT has 32 entries
39745 + */
39746 +#define GDT_ENTRIES 32
39747 +
39748 +#define GDT_SIZE (GDT_ENTRIES * 8)
39749 +
39750 +/* Simple and small GDT entries for booting only */
39751 +
39752 +#define GDT_ENTRY_BOOT_CS 2
39753 +#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
39754 +
39755 +#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
39756 +#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
39757 +
39758 +/* The PnP BIOS entries in the GDT */
39759 +#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
39760 +#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
39761 +#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
39762 +#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
39763 +#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
39764 +
39765 +/* The PnP BIOS selectors */
39766 +#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
39767 +#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
39768 +#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
39769 +#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
39770 +#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
39771 +
39772 +/*
39773 + * The interrupt descriptor table has room for 256 idt's,
39774 + * the global descriptor table is dependent on the number
39775 + * of tasks we can have..
39776 + */
39777 +#define IDT_ENTRIES 256
39778 +
39779 +#endif
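For reference, the selector values implied by the layout above: a selector is the GDT index times 8 with the requested privilege level in the low two bits, so __KERNEL_CS = 12*8 = 0x60, __KERNEL_DS = 13*8 = 0x68, __USER_CS = 14*8 + 3 = 0x73 and __USER_DS = 15*8 + 3 = 0x7b. GET_KERNEL_CS()/GET_KERNEL_DS() additionally OR in RPL 1 when the kernel runs in ring 1 under Xen, i.e. whenever XENFEAT_supervisor_mode_kernel is not set.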
39780 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/smp_32.h
39781 ===================================================================
39782 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
39783 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/smp_32.h 2007-06-12 13:14:02.000000000 +0200
39784 @@ -0,0 +1,103 @@
39785 +#ifndef __ASM_SMP_H
39786 +#define __ASM_SMP_H
39787 +
39788 +/*
39789 + * We need the APIC definitions automatically as part of 'smp.h'
39790 + */
39791 +#ifndef __ASSEMBLY__
39792 +#include <linux/kernel.h>
39793 +#include <linux/threads.h>
39794 +#include <linux/cpumask.h>
39795 +#endif
39796 +
39797 +#ifdef CONFIG_X86_LOCAL_APIC
39798 +#ifndef __ASSEMBLY__
39799 +#include <asm/fixmap.h>
39800 +#include <asm/bitops.h>
39801 +#include <asm/mpspec.h>
39802 +#ifdef CONFIG_X86_IO_APIC
39803 +#include <asm/io_apic.h>
39804 +#endif
39805 +#include <asm/apic.h>
39806 +#endif
39807 +#endif
39808 +
39809 +#define BAD_APICID 0xFFu
39810 +#ifdef CONFIG_SMP
39811 +#ifndef __ASSEMBLY__
39812 +
39813 +/*
39814 + * Private routines/data
39815 + */
39816 +
39817 +extern void smp_alloc_memory(void);
39818 +extern int pic_mode;
39819 +extern int smp_num_siblings;
39820 +extern cpumask_t cpu_sibling_map[];
39821 +extern cpumask_t cpu_core_map[];
39822 +
39823 +extern void (*mtrr_hook) (void);
39824 +extern void zap_low_mappings (void);
39825 +extern void lock_ipi_call_lock(void);
39826 +extern void unlock_ipi_call_lock(void);
39827 +
39828 +#define MAX_APICID 256
39829 +extern u8 x86_cpu_to_apicid[];
39830 +
39831 +#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
39832 +
39833 +#ifdef CONFIG_HOTPLUG_CPU
39834 +extern void cpu_exit_clear(void);
39835 +extern void cpu_uninit(void);
39836 +#endif
39837 +
39838 +/*
39839 + * This function is needed by all SMP systems. It must _always_ be valid
39840 + * from the initial startup. We map APIC_BASE very early in page_setup(),
39841 + * so this is correct in the x86 case.
39842 + */
39843 +#define raw_smp_processor_id() (current_thread_info()->cpu)
39844 +
39845 +extern cpumask_t cpu_possible_map;
39846 +#define cpu_callin_map cpu_possible_map
39847 +
39848 +/* We don't mark CPUs online until __cpu_up(), so we need another measure */
39849 +static inline int num_booting_cpus(void)
39850 +{
39851 + return cpus_weight(cpu_possible_map);
39852 +}
39853 +
39854 +#ifdef CONFIG_X86_LOCAL_APIC
39855 +
39856 +#ifdef APIC_DEFINITION
39857 +extern int hard_smp_processor_id(void);
39858 +#else
39859 +#include <mach_apicdef.h>
39860 +static inline int hard_smp_processor_id(void)
39861 +{
39862 + /* we don't want to mark this access volatile - bad code generation */
39863 + return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
39864 +}
39865 +#endif
39866 +
39867 +static __inline int logical_smp_processor_id(void)
39868 +{
39869 + /* we don't want to mark this access volatile - bad code generation */
39870 + return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
39871 +}
39872 +
39873 +#endif
39874 +
39875 +extern int __cpu_disable(void);
39876 +extern void __cpu_die(unsigned int cpu);
39877 +extern void prefill_possible_map(void);
39878 +#endif /* !__ASSEMBLY__ */
39879 +
39880 +#else /* CONFIG_SMP */
39881 +
39882 +#define cpu_physical_id(cpu) boot_cpu_physical_apicid
39883 +
39884 +#define NO_PROC_ID 0xFF /* No processor magic marker */
39885 +
39886 +#endif
39887 +#endif
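A minimal usage sketch, not part of the hunk above, assuming CONFIG_SMP: raw_smp_processor_id() reads the CPU number cached in thread_info, and num_booting_cpus() counts the possible map. my_report_cpu() is a hypothetical helper; printk()/KERN_INFO come from <linux/kernel.h>, which this header already includes.

static void my_report_cpu(void)
{
	printk(KERN_INFO "running on cpu %d of %d booting cpus\n",
	       raw_smp_processor_id(), num_booting_cpus());
}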
39888 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/swiotlb_32.h
39889 ===================================================================
39890 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
39891 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/swiotlb_32.h 2007-06-12 13:14:02.000000000 +0200
39892 @@ -0,0 +1,43 @@
39893 +#ifndef _ASM_SWIOTLB_H
39894 +#define _ASM_SWIOTLB_H 1
39895 +
39896 +/* SWIOTLB interface */
39897 +
39898 +extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, size_t size,
39899 + int dir);
39900 +extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
39901 + size_t size, int dir);
39902 +extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
39903 + dma_addr_t dev_addr,
39904 + size_t size, int dir);
39905 +extern void swiotlb_sync_single_for_device(struct device *hwdev,
39906 + dma_addr_t dev_addr,
39907 + size_t size, int dir);
39908 +extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
39909 + struct scatterlist *sg, int nelems,
39910 + int dir);
39911 +extern void swiotlb_sync_sg_for_device(struct device *hwdev,
39912 + struct scatterlist *sg, int nelems,
39913 + int dir);
39914 +extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
39915 + int nents, int direction);
39916 +extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
39917 + int nents, int direction);
39918 +extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr);
39919 +#ifdef CONFIG_HIGHMEM
39920 +extern dma_addr_t swiotlb_map_page(struct device *hwdev, struct page *page,
39921 + unsigned long offset, size_t size,
39922 + enum dma_data_direction direction);
39923 +extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
39924 + size_t size, enum dma_data_direction direction);
39925 +#endif
39926 +extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
39927 +extern void swiotlb_init(void);
39928 +
39929 +#ifdef CONFIG_SWIOTLB
39930 +extern int swiotlb;
39931 +#else
39932 +#define swiotlb 0
39933 +#endif
39934 +
39935 +#endif
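A minimal usage sketch, not part of the hunk above, of the streaming interface declared here. my_map_for_device_read() is a hypothetical helper; DMA_TO_DEVICE is assumed to come from <linux/dma-mapping.h>, and hwdev/buf/len from the caller.

static dma_addr_t my_map_for_device_read(struct device *hwdev, void *buf,
					 size_t len)
{
	/* May copy 'buf' into a bounce buffer if it is not device-addressable;
	 * must be paired with swiotlb_unmap_single() when the DMA completes. */
	return swiotlb_map_single(hwdev, buf, len, DMA_TO_DEVICE);
}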
39936 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/synch_bitops.h
39937 ===================================================================
39938 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
39939 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/synch_bitops.h 2008-04-02 12:34:02.000000000 +0200
39940 @@ -0,0 +1,126 @@
39941 +#ifndef __XEN_SYNCH_BITOPS_H__
39942 +#define __XEN_SYNCH_BITOPS_H__
39943 +
39944 +/*
39945 + * Copyright 1992, Linus Torvalds.
39946 + * Heavily modified to provide guaranteed strong synchronisation
39947 + * when communicating with Xen or other guest OSes running on other CPUs.
39948 + */
39949 +
39950 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
39951 +#include <xen/platform-compat.h>
39952 +#endif
39953 +
39954 +#define ADDR (*(volatile long *) addr)
39955 +
39956 +static __inline__ void synch_set_bit(int nr, volatile void * addr)
39957 +{
39958 + __asm__ __volatile__ (
39959 + "lock btsl %1,%0"
39960 + : "+m" (ADDR) : "Ir" (nr) : "memory" );
39961 +}
39962 +
39963 +static __inline__ void synch_clear_bit(int nr, volatile void * addr)
39964 +{
39965 + __asm__ __volatile__ (
39966 + "lock btrl %1,%0"
39967 + : "+m" (ADDR) : "Ir" (nr) : "memory" );
39968 +}
39969 +
39970 +static __inline__ void synch_change_bit(int nr, volatile void * addr)
39971 +{
39972 + __asm__ __volatile__ (
39973 + "lock btcl %1,%0"
39974 + : "+m" (ADDR) : "Ir" (nr) : "memory" );
39975 +}
39976 +
39977 +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
39978 +{
39979 + int oldbit;
39980 + __asm__ __volatile__ (
39981 + "lock btsl %2,%1\n\tsbbl %0,%0"
39982 + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
39983 + return oldbit;
39984 +}
39985 +
39986 +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
39987 +{
39988 + int oldbit;
39989 + __asm__ __volatile__ (
39990 + "lock btrl %2,%1\n\tsbbl %0,%0"
39991 + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
39992 + return oldbit;
39993 +}
39994 +
39995 +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
39996 +{
39997 + int oldbit;
39998 +
39999 + __asm__ __volatile__ (
40000 + "lock btcl %2,%1\n\tsbbl %0,%0"
40001 + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
40002 + return oldbit;
40003 +}
40004 +
40005 +struct __synch_xchg_dummy { unsigned long a[100]; };
40006 +#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x))
40007 +
40008 +#define synch_cmpxchg(ptr, old, new) \
40009 +((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\
40010 + (unsigned long)(old), \
40011 + (unsigned long)(new), \
40012 + sizeof(*(ptr))))
40013 +
40014 +static inline unsigned long __synch_cmpxchg(volatile void *ptr,
40015 + unsigned long old,
40016 + unsigned long new, int size)
40017 +{
40018 + unsigned long prev;
40019 + switch (size) {
40020 + case 1:
40021 + __asm__ __volatile__("lock; cmpxchgb %b1,%2"
40022 + : "=a"(prev)
40023 + : "q"(new), "m"(*__synch_xg(ptr)),
40024 + "0"(old)
40025 + : "memory");
40026 + return prev;
40027 + case 2:
40028 + __asm__ __volatile__("lock; cmpxchgw %w1,%2"
40029 + : "=a"(prev)
40030 + : "r"(new), "m"(*__synch_xg(ptr)),
40031 + "0"(old)
40032 + : "memory");
40033 + return prev;
40034 +#ifdef CONFIG_X86_64
40035 + case 4:
40036 + __asm__ __volatile__("lock; cmpxchgl %k1,%2"
40037 + : "=a"(prev)
40038 + : "r"(new), "m"(*__synch_xg(ptr)),
40039 + "0"(old)
40040 + : "memory");
40041 + return prev;
40042 + case 8:
40043 + __asm__ __volatile__("lock; cmpxchgq %1,%2"
40044 + : "=a"(prev)
40045 + : "r"(new), "m"(*__synch_xg(ptr)),
40046 + "0"(old)
40047 + : "memory");
40048 + return prev;
40049 +#else
40050 + case 4:
40051 + __asm__ __volatile__("lock; cmpxchgl %1,%2"
40052 + : "=a"(prev)
40053 + : "r"(new), "m"(*__synch_xg(ptr)),
40054 + "0"(old)
40055 + : "memory");
40056 + return prev;
40057 +#endif
40058 + }
40059 + return old;
40060 +}
40061 +
40062 +#define synch_test_bit test_bit
40063 +
40064 +#define synch_cmpxchg_subword synch_cmpxchg
40065 +
40066 +#endif /* __XEN_SYNCH_BITOPS_H__ */
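A minimal usage sketch, not part of the hunk above: the synch_* bitops use a locked read-modify-write even on UP kernels, which is what makes them safe on words shared with the hypervisor or another domain. my_claim_slot() and shared_flags are hypothetical names.

static int my_claim_slot(volatile void *shared_flags, int slot)
{
	/* Non-zero return means the other side had already set the bit. */
	return synch_test_and_set_bit(slot, shared_flags);
}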
40067 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/system_32.h
40068 ===================================================================
40069 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40070 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/system_32.h 2007-06-12 13:14:02.000000000 +0200
40071 @@ -0,0 +1,488 @@
40072 +#ifndef __ASM_SYSTEM_H
40073 +#define __ASM_SYSTEM_H
40074 +
40075 +#include <linux/kernel.h>
40076 +#include <asm/segment.h>
40077 +#include <asm/cpufeature.h>
40078 +#include <linux/bitops.h> /* for LOCK_PREFIX */
40079 +#include <asm/synch_bitops.h>
40080 +#include <asm/hypervisor.h>
40081 +
40082 +#ifdef __KERNEL__
40083 +
40084 +struct task_struct; /* one of the stranger aspects of C forward declarations.. */
40085 +extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
40086 +
40087 +/*
40088 + * Saving eflags is important. It switches not only IOPL between tasks,
40089 + * it also protects other tasks from NT leaking through sysenter etc.
40090 + */
40091 +#define switch_to(prev,next,last) do { \
40092 + unsigned long esi,edi; \
40093 + asm volatile("pushfl\n\t" /* Save flags */ \
40094 + "pushl %%ebp\n\t" \
40095 + "movl %%esp,%0\n\t" /* save ESP */ \
40096 + "movl %5,%%esp\n\t" /* restore ESP */ \
40097 + "movl $1f,%1\n\t" /* save EIP */ \
40098 + "pushl %6\n\t" /* restore EIP */ \
40099 + "jmp __switch_to\n" \
40100 + "1:\t" \
40101 + "popl %%ebp\n\t" \
40102 + "popfl" \
40103 + :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \
40104 + "=a" (last),"=S" (esi),"=D" (edi) \
40105 + :"m" (next->thread.esp),"m" (next->thread.eip), \
40106 + "2" (prev), "d" (next)); \
40107 +} while (0)
40108 +
40109 +#define _set_base(addr,base) do { unsigned long __pr; \
40110 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
40111 + "rorl $16,%%edx\n\t" \
40112 + "movb %%dl,%2\n\t" \
40113 + "movb %%dh,%3" \
40114 + :"=&d" (__pr) \
40115 + :"m" (*((addr)+2)), \
40116 + "m" (*((addr)+4)), \
40117 + "m" (*((addr)+7)), \
40118 + "0" (base) \
40119 + ); } while(0)
40120 +
40121 +#define _set_limit(addr,limit) do { unsigned long __lr; \
40122 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
40123 + "rorl $16,%%edx\n\t" \
40124 + "movb %2,%%dh\n\t" \
40125 + "andb $0xf0,%%dh\n\t" \
40126 + "orb %%dh,%%dl\n\t" \
40127 + "movb %%dl,%2" \
40128 + :"=&d" (__lr) \
40129 + :"m" (*(addr)), \
40130 + "m" (*((addr)+6)), \
40131 + "0" (limit) \
40132 + ); } while(0)
40133 +
40134 +#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
40135 +#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
40136 +
40137 +/*
40138 + * Load a segment. Fall back on loading the zero
40139 + * segment if something goes wrong..
40140 + */
40141 +#define loadsegment(seg,value) \
40142 + asm volatile("\n" \
40143 + "1:\t" \
40144 + "mov %0,%%" #seg "\n" \
40145 + "2:\n" \
40146 + ".section .fixup,\"ax\"\n" \
40147 + "3:\t" \
40148 + "pushl $0\n\t" \
40149 + "popl %%" #seg "\n\t" \
40150 + "jmp 2b\n" \
40151 + ".previous\n" \
40152 + ".section __ex_table,\"a\"\n\t" \
40153 + ".align 4\n\t" \
40154 + ".long 1b,3b\n" \
40155 + ".previous" \
40156 + : :"rm" (value))
40157 +
40158 +/*
40159 + * Save a segment register away
40160 + */
40161 +#define savesegment(seg, value) \
40162 + asm volatile("mov %%" #seg ",%0":"=rm" (value))
40163 +
40164 +#define read_cr0() ({ \
40165 + unsigned int __dummy; \
40166 + __asm__ __volatile__( \
40167 + "movl %%cr0,%0\n\t" \
40168 + :"=r" (__dummy)); \
40169 + __dummy; \
40170 +})
40171 +#define write_cr0(x) \
40172 + __asm__ __volatile__("movl %0,%%cr0": :"r" (x))
40173 +
40174 +#define read_cr2() (current_vcpu_info()->arch.cr2)
40175 +#define write_cr2(x) \
40176 + __asm__ __volatile__("movl %0,%%cr2": :"r" (x))
40177 +
40178 +#define read_cr3() ({ \
40179 + unsigned int __dummy; \
40180 + __asm__ ( \
40181 + "movl %%cr3,%0\n\t" \
40182 + :"=r" (__dummy)); \
40183 + __dummy = xen_cr3_to_pfn(__dummy); \
40184 + mfn_to_pfn(__dummy) << PAGE_SHIFT; \
40185 +})
40186 +#define write_cr3(x) ({ \
40187 + unsigned int __dummy = pfn_to_mfn((x) >> PAGE_SHIFT); \
40188 + __dummy = xen_pfn_to_cr3(__dummy); \
40189 + __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy)); \
40190 +})
40191 +#define read_cr4() ({ \
40192 + unsigned int __dummy; \
40193 + __asm__( \
40194 + "movl %%cr4,%0\n\t" \
40195 + :"=r" (__dummy)); \
40196 + __dummy; \
40197 +})
40198 +#define read_cr4_safe() ({ \
40199 + unsigned int __dummy; \
40200 + /* This could fault if %cr4 does not exist */ \
40201 + __asm__("1: movl %%cr4, %0 \n" \
40202 + "2: \n" \
40203 + ".section __ex_table,\"a\" \n" \
40204 + ".long 1b,2b \n" \
40205 + ".previous \n" \
40206 + : "=r" (__dummy): "0" (0)); \
40207 + __dummy; \
40208 +})
40209 +
40210 +#define write_cr4(x) \
40211 + __asm__ __volatile__("movl %0,%%cr4": :"r" (x))
40212 +
40213 +/*
40214 + * Clear and set 'TS' bit respectively
40215 + */
40216 +#define clts() (HYPERVISOR_fpu_taskswitch(0))
40217 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
40218 +
40219 +#endif /* __KERNEL__ */
40220 +
40221 +#define wbinvd() \
40222 + __asm__ __volatile__ ("wbinvd": : :"memory")
40223 +
40224 +static inline unsigned long get_limit(unsigned long segment)
40225 +{
40226 + unsigned long __limit;
40227 + __asm__("lsll %1,%0"
40228 + :"=r" (__limit):"r" (segment));
40229 + return __limit+1;
40230 +}
40231 +
40232 +#define nop() __asm__ __volatile__ ("nop")
40233 +
40234 +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
40235 +
40236 +#define tas(ptr) (xchg((ptr),1))
40237 +
40238 +struct __xchg_dummy { unsigned long a[100]; };
40239 +#define __xg(x) ((struct __xchg_dummy *)(x))
40240 +
40241 +
40242 +#ifdef CONFIG_X86_CMPXCHG64
40243 +
40244 +/*
40245 + * The semantics of CMPXCHG8B are a bit strange; this is why
40246 + * there is a loop and the loading of %%eax and %%edx has to
40247 + * be inside. This inlines well in most cases, the cached
40248 + * cost is around ~38 cycles. (in the future we might want
40249 + * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
40250 + * might have an implicit FPU-save as a cost, so it's not
40251 + * clear which path to go.)
40252 + *
40253 + * cmpxchg8b must be used with the lock prefix here to allow
40254 + * the instruction to be executed atomically, see page 3-102
40255 + * of the instruction set reference 24319102.pdf. We need
40256 + * the reader side to see the coherent 64bit value.
40257 + */
40258 +static inline void __set_64bit (unsigned long long * ptr,
40259 + unsigned int low, unsigned int high)
40260 +{
40261 + __asm__ __volatile__ (
40262 + "\n1:\t"
40263 + "movl (%0), %%eax\n\t"
40264 + "movl 4(%0), %%edx\n\t"
40265 + "lock cmpxchg8b (%0)\n\t"
40266 + "jnz 1b"
40267 + : /* no outputs */
40268 + : "D"(ptr),
40269 + "b"(low),
40270 + "c"(high)
40271 + : "ax","dx","memory");
40272 +}
40273 +
40274 +static inline void __set_64bit_constant (unsigned long long *ptr,
40275 + unsigned long long value)
40276 +{
40277 + __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL));
40278 +}
40279 +#define ll_low(x) *(((unsigned int*)&(x))+0)
40280 +#define ll_high(x) *(((unsigned int*)&(x))+1)
40281 +
40282 +static inline void __set_64bit_var (unsigned long long *ptr,
40283 + unsigned long long value)
40284 +{
40285 + __set_64bit(ptr,ll_low(value), ll_high(value));
40286 +}
40287 +
40288 +#define set_64bit(ptr,value) \
40289 +(__builtin_constant_p(value) ? \
40290 + __set_64bit_constant(ptr, value) : \
40291 + __set_64bit_var(ptr, value) )
40292 +
40293 +#define _set_64bit(ptr,value) \
40294 +(__builtin_constant_p(value) ? \
40295 + __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \
40296 + __set_64bit(ptr, ll_low(value), ll_high(value)) )
40297 +
40298 +#endif
40299 +
40300 +/*
40301 + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
40302 + * Note 2: xchg has a side effect, so the volatile attribute is necessary,
40303 + * but generally the primitive is invalid; *ptr is an output argument. --ANK
40304 + */
40305 +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
40306 +{
40307 + switch (size) {
40308 + case 1:
40309 + __asm__ __volatile__("xchgb %b0,%1"
40310 + :"=q" (x)
40311 + :"m" (*__xg(ptr)), "0" (x)
40312 + :"memory");
40313 + break;
40314 + case 2:
40315 + __asm__ __volatile__("xchgw %w0,%1"
40316 + :"=r" (x)
40317 + :"m" (*__xg(ptr)), "0" (x)
40318 + :"memory");
40319 + break;
40320 + case 4:
40321 + __asm__ __volatile__("xchgl %0,%1"
40322 + :"=r" (x)
40323 + :"m" (*__xg(ptr)), "0" (x)
40324 + :"memory");
40325 + break;
40326 + }
40327 + return x;
40328 +}
40329 +
40330 +/*
40331 + * Atomic compare and exchange. Compare OLD with MEM, if identical,
40332 + * store NEW in MEM. Return the initial value in MEM. Success is
40333 + * indicated by comparing RETURN with OLD.
40334 + */
40335 +
40336 +#ifdef CONFIG_X86_CMPXCHG
40337 +#define __HAVE_ARCH_CMPXCHG 1
40338 +#define cmpxchg(ptr,o,n)\
40339 + ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
40340 + (unsigned long)(n),sizeof(*(ptr))))
40341 +#endif
40342 +
40343 +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
40344 + unsigned long new, int size)
40345 +{
40346 + unsigned long prev;
40347 + switch (size) {
40348 + case 1:
40349 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
40350 + : "=a"(prev)
40351 + : "q"(new), "m"(*__xg(ptr)), "0"(old)
40352 + : "memory");
40353 + return prev;
40354 + case 2:
40355 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
40356 + : "=a"(prev)
40357 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
40358 + : "memory");
40359 + return prev;
40360 + case 4:
40361 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
40362 + : "=a"(prev)
40363 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
40364 + : "memory");
40365 + return prev;
40366 + }
40367 + return old;
40368 +}
40369 +
40370 +#ifndef CONFIG_X86_CMPXCHG
40371 +/*
40372 + * Building a kernel capable of running on an 80386. It may be necessary to
40373 + * simulate the cmpxchg on the 80386 CPU. For that purpose we define
40374 + * a function for each of the sizes we support.
40375 + */
40376 +
40377 +extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
40378 +extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
40379 +extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
40380 +
40381 +static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
40382 + unsigned long new, int size)
40383 +{
40384 + switch (size) {
40385 + case 1:
40386 + return cmpxchg_386_u8(ptr, old, new);
40387 + case 2:
40388 + return cmpxchg_386_u16(ptr, old, new);
40389 + case 4:
40390 + return cmpxchg_386_u32(ptr, old, new);
40391 + }
40392 + return old;
40393 +}
40394 +
40395 +#define cmpxchg(ptr,o,n) \
40396 +({ \
40397 + __typeof__(*(ptr)) __ret; \
40398 + if (likely(boot_cpu_data.x86 > 3)) \
40399 + __ret = __cmpxchg((ptr), (unsigned long)(o), \
40400 + (unsigned long)(n), sizeof(*(ptr))); \
40401 + else \
40402 + __ret = cmpxchg_386((ptr), (unsigned long)(o), \
40403 + (unsigned long)(n), sizeof(*(ptr))); \
40404 + __ret; \
40405 +})
40406 +#endif
40407 +
40408 +#ifdef CONFIG_X86_CMPXCHG64
40409 +
40410 +static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old,
40411 + unsigned long long new)
40412 +{
40413 + unsigned long long prev;
40414 + __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
40415 + : "=A"(prev)
40416 + : "b"((unsigned long)new),
40417 + "c"((unsigned long)(new >> 32)),
40418 + "m"(*__xg(ptr)),
40419 + "0"(old)
40420 + : "memory");
40421 + return prev;
40422 +}
40423 +
40424 +#define cmpxchg64(ptr,o,n)\
40425 + ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
40426 + (unsigned long long)(n)))
40427 +
40428 +#endif
40429 +
40430 +/*
40431 + * Force strict CPU ordering.
40432 + * And yes, this is required on UP too when we're talking
40433 + * to devices.
40434 + *
40435 + * For now, "wmb()" doesn't actually do anything, as all
40436 + * Intel CPUs follow what Intel calls a *Processor Order*,
40437 + * in which all writes are seen in the program order even
40438 + * outside the CPU.
40439 + *
40440 + * I expect future Intel CPUs to have a weaker ordering,
40441 + * but I'd also expect them to finally get their act together
40442 + * and add some real memory barriers if so.
40443 + *
40444 + * Some non-Intel clones support out-of-order stores. wmb() ceases to be a
40445 + * nop for these.
40446 + */
40447 +
40448 +
40449 +/*
40450 + * Actually only lfence would be needed for mb() because all stores done
40451 + * by the kernel should be already ordered. But keep a full barrier for now.
40452 + */
40453 +
40454 +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
40455 +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
40456 +
40457 +/**
40458 + * read_barrier_depends - Flush all pending reads that subsequent reads
40459 + * depend on.
40460 + *
40461 + * No data-dependent reads from memory-like regions are ever reordered
40462 + * over this barrier. All reads preceding this primitive are guaranteed
40463 + * to access memory (but not necessarily other CPUs' caches) before any
40464 + * reads following this primitive that depend on the data returned by
40465 + * any of the preceding reads. This primitive is much lighter weight than
40466 + * rmb() on most CPUs, and is never heavier weight than is
40467 + * rmb().
40468 + *
40469 + * These ordering constraints are respected by both the local CPU
40470 + * and the compiler.
40471 + *
40472 + * Ordering is not guaranteed by anything other than these primitives,
40473 + * not even by data dependencies. See the documentation for
40474 + * memory_barrier() for examples and URLs to more information.
40475 + *
40476 + * For example, the following code would force ordering (the initial
40477 + * value of "a" is zero, "b" is one, and "p" is "&a"):
40478 + *
40479 + * <programlisting>
40480 + * CPU 0 CPU 1
40481 + *
40482 + * b = 2;
40483 + * memory_barrier();
40484 + * p = &b; q = p;
40485 + * read_barrier_depends();
40486 + * d = *q;
40487 + * </programlisting>
40488 + *
40489 + * because the read of "*q" depends on the read of "p" and these
40490 + * two reads are separated by a read_barrier_depends(). However,
40491 + * the following code, with the same initial values for "a" and "b":
40492 + *
40493 + * <programlisting>
40494 + * CPU 0 CPU 1
40495 + *
40496 + * a = 2;
40497 + * memory_barrier();
40498 + * b = 3; y = b;
40499 + * read_barrier_depends();
40500 + * x = a;
40501 + * </programlisting>
40502 + *
40503 + * does not enforce ordering, since there is no data dependency between
40504 + * the read of "a" and the read of "b". Therefore, on some CPUs, such
40505 + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
40506 + * in cases like this where there are no data dependencies.
40507 + **/
40508 +
40509 +#define read_barrier_depends() do { } while(0)
40510 +
40511 +#ifdef CONFIG_X86_OOSTORE
40512 +/* Actually there are no OOO store capable CPUs for now that do SSE,
40513 + but make it a possibility already. */
40514 +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
40515 +#else
40516 +#define wmb() __asm__ __volatile__ ("": : :"memory")
40517 +#endif
40518 +
40519 +#ifdef CONFIG_SMP
40520 +#define smp_mb() mb()
40521 +#define smp_rmb() rmb()
40522 +#define smp_wmb() wmb()
40523 +#define smp_read_barrier_depends() read_barrier_depends()
40524 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
40525 +#else
40526 +#define smp_mb() barrier()
40527 +#define smp_rmb() barrier()
40528 +#define smp_wmb() barrier()
40529 +#define smp_read_barrier_depends() do { } while(0)
40530 +#define set_mb(var, value) do { var = value; barrier(); } while (0)
40531 +#endif
40532 +
40533 +#include <linux/irqflags.h>
40534 +
40535 +/*
40536 + * disable hlt during certain critical i/o operations
40537 + */
40538 +#define HAVE_DISABLE_HLT
40539 +void disable_hlt(void);
40540 +void enable_hlt(void);
40541 +
40542 +extern int es7000_plat;
40543 +void cpu_idle_wait(void);
40544 +
40545 +/*
40546 + * On SMP systems, when the scheduler does migration-cost autodetection,
40547 + * it needs a way to flush as much of the CPU's caches as possible:
40548 + */
40549 +static inline void sched_cacheflush(void)
40550 +{
40551 + wbinvd();
40552 +}
40553 +
40554 +extern unsigned long arch_align_stack(unsigned long sp);
40555 +extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
40556 +
40557 +void default_idle(void);
40558 +
40559 +#endif
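A minimal usage sketch, not part of the hunk above: the classic compare-and-swap retry loop built on the cmpxchg() defined in this header. my_add_return() is a hypothetical helper.

static unsigned long my_add_return(volatile unsigned long *p, unsigned long delta)
{
	unsigned long old, new;

	do {
		old = *p;
		new = old + delta;
	} while (cmpxchg(p, old, new) != old);	/* retry if someone raced us */

	return new;
}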
40560 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_32.h
40561 ===================================================================
40562 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40563 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_32.h 2007-11-26 16:59:25.000000000 +0100
40564 @@ -0,0 +1,101 @@
40565 +#ifndef _I386_TLBFLUSH_H
40566 +#define _I386_TLBFLUSH_H
40567 +
40568 +#include <linux/mm.h>
40569 +#include <asm/processor.h>
40570 +
40571 +#define __flush_tlb() xen_tlb_flush()
40572 +#define __flush_tlb_global() xen_tlb_flush()
40573 +#define __flush_tlb_all() xen_tlb_flush()
40574 +
40575 +extern unsigned long pgkern_mask;
40576 +
40577 +#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
40578 +
40579 +#define __flush_tlb_single(addr) xen_invlpg(addr)
40580 +
40581 +#define __flush_tlb_one(addr) __flush_tlb_single(addr)
40582 +
40583 +/*
40584 + * TLB flushing:
40585 + *
40586 + * - flush_tlb() flushes the current mm struct TLBs
40587 + * - flush_tlb_all() flushes all processes TLBs
40588 + * - flush_tlb_mm(mm) flushes the specified mm context TLBs
40589 + * - flush_tlb_page(vma, vmaddr) flushes one page
40590 + * - flush_tlb_range(vma, start, end) flushes a range of pages
40591 + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
40592 + * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
40593 + *
40594 + * ..but the i386 has somewhat limited tlb flushing capabilities,
40595 + * and page-granular flushes are available only on i486 and up.
40596 + */
40597 +
40598 +#ifndef CONFIG_SMP
40599 +
40600 +#define flush_tlb() __flush_tlb()
40601 +#define flush_tlb_all() __flush_tlb_all()
40602 +#define local_flush_tlb() __flush_tlb()
40603 +
40604 +static inline void flush_tlb_mm(struct mm_struct *mm)
40605 +{
40606 + if (mm == current->active_mm)
40607 + __flush_tlb();
40608 +}
40609 +
40610 +static inline void flush_tlb_page(struct vm_area_struct *vma,
40611 + unsigned long addr)
40612 +{
40613 + if (vma->vm_mm == current->active_mm)
40614 + __flush_tlb_one(addr);
40615 +}
40616 +
40617 +static inline void flush_tlb_range(struct vm_area_struct *vma,
40618 + unsigned long start, unsigned long end)
40619 +{
40620 + if (vma->vm_mm == current->active_mm)
40621 + __flush_tlb();
40622 +}
40623 +
40624 +#else
40625 +
40626 +#include <asm/smp.h>
40627 +
40628 +#define local_flush_tlb() \
40629 + __flush_tlb()
40630 +
40631 +#define flush_tlb_all xen_tlb_flush_all
40632 +#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
40633 +#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
40634 +#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
40635 +
40636 +#define flush_tlb() flush_tlb_current_task()
40637 +
40638 +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
40639 +{
40640 + flush_tlb_mm(vma->vm_mm);
40641 +}
40642 +
40643 +#define TLBSTATE_OK 1
40644 +#define TLBSTATE_LAZY 2
40645 +
40646 +struct tlb_state
40647 +{
40648 + struct mm_struct *active_mm;
40649 + int state;
40650 + char __cacheline_padding[L1_CACHE_BYTES-8];
40651 +};
40652 +DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
40653 +
40654 +
40655 +#endif
40656 +
40657 +#define flush_tlb_kernel_range(start, end) flush_tlb_all()
40658 +
40659 +static inline void flush_tlb_pgtables(struct mm_struct *mm,
40660 + unsigned long start, unsigned long end)
40661 +{
40662 + /* i386 does not keep any page table caches in TLB */
40663 +}
40664 +
40665 +#endif /* _I386_TLBFLUSH_H */
40666 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/vga.h
40667 ===================================================================
40668 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40669 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/vga.h 2007-06-12 13:14:02.000000000 +0200
40670 @@ -0,0 +1,20 @@
40671 +/*
40672 + * Access to VGA videoram
40673 + *
40674 + * (c) 1998 Martin Mares <mj@ucw.cz>
40675 + */
40676 +
40677 +#ifndef _LINUX_ASM_VGA_H_
40678 +#define _LINUX_ASM_VGA_H_
40679 +
40680 +/*
40681 + * On the PC, we can just recalculate addresses and then
40682 + * access the videoram directly without any black magic.
40683 + */
40684 +
40685 +#define VGA_MAP_MEM(x,s) (unsigned long)isa_bus_to_virt(x)
40686 +
40687 +#define vga_readb(x) (*(x))
40688 +#define vga_writeb(x,y) (*(y) = (x))
40689 +
40690 +#endif
40691 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/xenoprof.h
40692 ===================================================================
40693 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40694 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/xenoprof.h 2007-06-12 13:14:02.000000000 +0200
40695 @@ -0,0 +1,48 @@
40696 +/******************************************************************************
40697 + * asm-i386/mach-xen/asm/xenoprof.h
40698 + *
40699 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
40700 + * VA Linux Systems Japan K.K.
40701 + *
40702 + * This program is free software; you can redistribute it and/or modify
40703 + * it under the terms of the GNU General Public License as published by
40704 + * the Free Software Foundation; either version 2 of the License, or
40705 + * (at your option) any later version.
40706 + *
40707 + * This program is distributed in the hope that it will be useful,
40708 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
40709 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
40710 + * GNU General Public License for more details.
40711 + *
40712 + * You should have received a copy of the GNU General Public License
40713 + * along with this program; if not, write to the Free Software
40714 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
40715 + *
40716 + */
40717 +#ifndef __ASM_XENOPROF_H__
40718 +#define __ASM_XENOPROF_H__
40719 +#ifdef CONFIG_XEN
40720 +
40721 +struct super_block;
40722 +struct dentry;
40723 +int xenoprof_create_files(struct super_block * sb, struct dentry * root);
40724 +#define HAVE_XENOPROF_CREATE_FILES
40725 +
40726 +struct xenoprof_init;
40727 +void xenoprof_arch_init_counter(struct xenoprof_init *init);
40728 +void xenoprof_arch_counter(void);
40729 +void xenoprof_arch_start(void);
40730 +void xenoprof_arch_stop(void);
40731 +
40732 +struct xenoprof_arch_shared_buffer {
40733 + /* nothing */
40734 +};
40735 +struct xenoprof_shared_buffer;
40736 +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf);
40737 +struct xenoprof_get_buffer;
40738 +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf);
40739 +struct xenoprof_passive;
40740 +int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf);
40741 +
40742 +#endif /* CONFIG_XEN */
40743 +#endif /* __ASM_XENOPROF_H__ */
40744 Index: head-2008-11-25/include/asm-x86/mach-xen/irq_vectors.h
40745 ===================================================================
40746 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40747 +++ head-2008-11-25/include/asm-x86/mach-xen/irq_vectors.h 2008-09-25 13:55:32.000000000 +0200
40748 @@ -0,0 +1,125 @@
40749 +/*
40750 + * This file should contain #defines for all of the interrupt vector
40751 + * numbers used by this architecture.
40752 + *
40753 + * In addition, there are some standard defines:
40754 + *
40755 + * FIRST_EXTERNAL_VECTOR:
40756 + * The first free place for external interrupts
40757 + *
40758 + * SYSCALL_VECTOR:
40759 + * The IRQ vector through which a syscall makes the user-to-kernel
40760 + * transition.
40761 + *
40762 + * TIMER_IRQ:
40763 + * The IRQ number the timer interrupt comes in at.
40764 + *
40765 + * NR_IRQS:
40766 + * The total number of interrupt vectors (including all the
40767 + * architecture specific interrupts) needed.
40768 + *
40769 + */
40770 +#ifndef _ASM_IRQ_VECTORS_H
40771 +#define _ASM_IRQ_VECTORS_H
40772 +
40773 +/*
40774 + * IDT vectors usable for external interrupt sources start
40775 + * at 0x20:
40776 + */
40777 +#define FIRST_EXTERNAL_VECTOR 0x20
40778 +
40779 +#define SYSCALL_VECTOR 0x80
40780 +
40781 +/*
40782 + * Vectors 0x20-0x2f are used for ISA interrupts.
40783 + */
40784 +
40785 +#if 0
40786 +/*
40787 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
40788 + *
40789 + * some of the following vectors are 'rare'; they are merged
40790 + * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
40791 + * TLB, reschedule and local APIC vectors are performance-critical.
40792 + *
40793 + * Vectors 0xf0-0xfa are free (reserved for future Linux use).
40794 + */
40795 +#define SPURIOUS_APIC_VECTOR 0xff
40796 +#define ERROR_APIC_VECTOR 0xfe
40797 +#define INVALIDATE_TLB_VECTOR 0xfd
40798 +#define RESCHEDULE_VECTOR 0xfc
40799 +#define CALL_FUNCTION_VECTOR 0xfb
40800 +
40801 +#define THERMAL_APIC_VECTOR 0xf0
40802 +/*
40803 + * Local APIC timer IRQ vector is on a different priority level,
40804 + * to work around the 'lost local interrupt if more than 2 IRQ
40805 + * sources per level' errata.
40806 + */
40807 +#define LOCAL_TIMER_VECTOR 0xef
40808 +#endif
40809 +
40810 +#define SPURIOUS_APIC_VECTOR 0xff
40811 +#define ERROR_APIC_VECTOR 0xfe
40812 +
40813 +/*
40814 + * First APIC vector available to drivers: (vectors 0x30-0xee)
40815 + * we start at 0x31 to spread out vectors evenly between priority
40816 + * levels. (0x80 is the syscall vector)
40817 + */
40818 +#define FIRST_DEVICE_VECTOR 0x31
40819 +#define FIRST_SYSTEM_VECTOR 0xef
40820 +
40821 +/*
40822 + * 16 8259A IRQs, 208 potential APIC interrupt sources.
40823 + * Right now the APIC is used mostly for SMP.
40824 + * 256 vectors is an architectural limit. (we can have
40825 + * more than 256 devices theoretically, but they will
40826 + * have to use shared interrupts)
40827 + * Since vectors 0x00-0x1f are used/reserved for the CPU,
40828 + * the usable vector space is 0x20-0xff (224 vectors)
40829 + */
40830 +
40831 +#define RESCHEDULE_VECTOR 0
40832 +#define CALL_FUNCTION_VECTOR 1
40833 +#define NR_IPIS 2
40834 +
40835 +/*
40836 + * The maximum number of vectors supported by i386 processors
40837 + * is limited to 256. For processors other than i386, NR_VECTORS
40838 + * should be changed accordingly.
40839 + */
40840 +#define NR_VECTORS 256
40841 +
40842 +#define FPU_IRQ 13
40843 +
40844 +#define FIRST_VM86_IRQ 3
40845 +#define LAST_VM86_IRQ 15
40846 +#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
40847 +
40848 +/*
40849 + * The flat IRQ space is divided into two regions:
40850 + * 1. A one-to-one mapping of real physical IRQs. This space is only used
40851 + * if we have physical device-access privilege. This region is at the
40852 + * start of the IRQ space so that existing device drivers do not need
40853 + * to be modified to translate physical IRQ numbers into our IRQ space.
40854 + * 2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
40855 + * are bound using the provided bind/unbind functions.
40856 + */
40857 +
40858 +#define PIRQ_BASE 0
40859 +#if !defined(MAX_IO_APICS)
40860 +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
40861 +#elif NR_CPUS < MAX_IO_APICS
40862 +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
40863 +#else
40864 +# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
40865 +#endif
40866 +
40867 +#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
40868 +#define NR_DYNIRQS 256
40869 +
40870 +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
40871 +#define NR_IRQ_VECTORS NR_IRQS
40872 +
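As a worked illustration of the sizing above (not part of the patch): with hypothetical build-time values NR_CPUS = 8 and MAX_IO_APICS left undefined, NR_PIRQS = 256 + 32 * 8 = 512, DYNIRQ_BASE = 512 and NR_IRQS = 768. A minimal userspace sketch of the same arithmetic:

#include <stdio.h>

#define NR_VECTORS	256
#define NR_CPUS		8		/* hypothetical configuration value */
#define NR_PIRQS	(NR_VECTORS + 32 * NR_CPUS)
#define PIRQ_BASE	0
#define DYNIRQ_BASE	(PIRQ_BASE + NR_PIRQS)
#define NR_DYNIRQS	256
#define NR_IRQS		(NR_PIRQS + NR_DYNIRQS)

int main(void)
{
	/* prints 512, 512, 768 for the values assumed above */
	printf("NR_PIRQS=%d DYNIRQ_BASE=%d NR_IRQS=%d\n",
	       NR_PIRQS, DYNIRQ_BASE, NR_IRQS);
	return 0;
}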
40873 +#endif /* _ASM_IRQ_VECTORS_H */
40874 Index: head-2008-11-25/include/asm-x86/mach-xen/mach_traps.h
40875 ===================================================================
40876 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40877 +++ head-2008-11-25/include/asm-x86/mach-xen/mach_traps.h 2007-06-12 13:14:02.000000000 +0200
40878 @@ -0,0 +1,33 @@
40879 +/*
40880 + * include/asm-xen/asm-i386/mach-xen/mach_traps.h
40881 + *
40882 + * Machine specific NMI handling for Xen
40883 + */
40884 +#ifndef _MACH_TRAPS_H
40885 +#define _MACH_TRAPS_H
40886 +
40887 +#include <linux/bitops.h>
40888 +#include <xen/interface/nmi.h>
40889 +
40890 +static inline void clear_mem_error(unsigned char reason) {}
40891 +static inline void clear_io_check_error(unsigned char reason) {}
40892 +
40893 +static inline unsigned char get_nmi_reason(void)
40894 +{
40895 + shared_info_t *s = HYPERVISOR_shared_info;
40896 + unsigned char reason = 0;
40897 +
40898 + /* construct a value which looks like it came from
40899 + * port 0x61.
40900 + */
40901 + if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
40902 + reason |= 0x40;
40903 + if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
40904 + reason |= 0x80;
40905 +
40906 + return reason;
40907 +}
40908 +
40909 +static inline void reassert_nmi(void) {}
40910 +
40911 +#endif /* !_MACH_TRAPS_H */
40912 Index: head-2008-11-25/include/asm-x86/mach-xen/setup_arch.h
40913 ===================================================================
40914 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40915 +++ head-2008-11-25/include/asm-x86/mach-xen/setup_arch.h 2007-06-12 13:14:02.000000000 +0200
40916 @@ -0,0 +1,5 @@
40917 +/* Hook to call BIOS initialisation function */
40918 +
40919 +#define ARCH_SETUP machine_specific_arch_setup();
40920 +
40921 +void __init machine_specific_arch_setup(void);
40922 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/desc_64.h
40923 ===================================================================
40924 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40925 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/desc_64.h 2008-01-28 12:24:19.000000000 +0100
40926 @@ -0,0 +1,265 @@
40927 +/* Written 2000 by Andi Kleen */
40928 +#ifndef __ARCH_DESC_H
40929 +#define __ARCH_DESC_H
40930 +
40931 +#include <linux/threads.h>
40932 +#include <asm/ldt.h>
40933 +
40934 +#ifndef __ASSEMBLY__
40935 +
40936 +#include <linux/string.h>
40937 +#include <linux/smp.h>
40938 +
40939 +#include <asm/segment.h>
40940 +#include <asm/mmu.h>
40941 +
40942 +// 8 byte segment descriptor
40943 +struct desc_struct {
40944 + u16 limit0;
40945 + u16 base0;
40946 + unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
40947 + unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
40948 +} __attribute__((packed));
40949 +
40950 +struct n_desc_struct {
40951 + unsigned int a,b;
40952 +};
40953 +
40954 +enum {
40955 + GATE_INTERRUPT = 0xE,
40956 + GATE_TRAP = 0xF,
40957 + GATE_CALL = 0xC,
40958 +};
40959 +
40960 +// 16byte gate
40961 +struct gate_struct {
40962 + u16 offset_low;
40963 + u16 segment;
40964 + unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
40965 + u16 offset_middle;
40966 + u32 offset_high;
40967 + u32 zero1;
40968 +} __attribute__((packed));
40969 +
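A small userspace sketch, not part of the patch, that replays the packed gate layout above with <stdint.h> types standing in for u16/u32 and confirms the descriptor really occupies 16 bytes:

#include <stdio.h>
#include <stdint.h>

/* field names copied from the header; only the integer types differ */
struct gate_struct_sketch {
	uint16_t offset_low;
	uint16_t segment;
	unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
	uint16_t offset_middle;
	uint32_t offset_high;
	uint32_t zero1;
} __attribute__((packed));

int main(void)
{
	/* expected output: 16 */
	printf("sizeof(struct gate_struct_sketch) = %zu\n",
	       sizeof(struct gate_struct_sketch));
	return 0;
}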
40970 +#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF)
40971 +#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF)
40972 +#define PTR_HIGH(x) ((unsigned long)(x) >> 32)
40973 +
40974 +enum {
40975 + DESC_TSS = 0x9,
40976 + DESC_LDT = 0x2,
40977 +};
40978 +
40979 +// LDT or TSS descriptor in the GDT. 16 bytes.
40980 +struct ldttss_desc {
40981 + u16 limit0;
40982 + u16 base0;
40983 + unsigned base1 : 8, type : 5, dpl : 2, p : 1;
40984 + unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
40985 + u32 base3;
40986 + u32 zero1;
40987 +} __attribute__((packed));
40988 +
40989 +struct desc_ptr {
40990 + unsigned short size;
40991 + unsigned long address;
40992 +} __attribute__((packed)) ;
40993 +
40994 +extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
40995 +
40996 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
40997 +
40998 +#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
40999 +#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
41000 +
41001 +static inline void clear_LDT(void)
41002 +{
41003 + int cpu = get_cpu();
41004 +
41005 + /*
41006 + * NB. We load the default_ldt for lcall7/27 handling on demand, as
41007 + * it slows down context switching. No one uses it anyway.
41008 + */
41009 + cpu = cpu; /* XXX avoid compiler warning */
41010 + xen_set_ldt(NULL, 0);
41011 + put_cpu();
41012 +}
41013 +
41014 +/*
41015 + * This is the ldt that every process will get unless we need
41016 + * something other than this.
41017 + */
41018 +extern struct desc_struct default_ldt[];
41019 +#ifndef CONFIG_X86_NO_IDT
41020 +extern struct gate_struct idt_table[];
41021 +#endif
41022 +extern struct desc_ptr cpu_gdt_descr[];
41023 +
41024 +/* the cpu gdt accessor */
41025 +#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
41026 +
41027 +static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)
41028 +{
41029 + struct gate_struct s;
41030 + s.offset_low = PTR_LOW(func);
41031 + s.segment = __KERNEL_CS;
41032 + s.ist = ist;
41033 + s.p = 1;
41034 + s.dpl = dpl;
41035 + s.zero0 = 0;
41036 + s.zero1 = 0;
41037 + s.type = type;
41038 + s.offset_middle = PTR_MIDDLE(func);
41039 + s.offset_high = PTR_HIGH(func);
41040 + /* does not need to be atomic because it is only done once at setup time */
41041 + memcpy(adr, &s, 16);
41042 +}
41043 +
41044 +#ifndef CONFIG_X86_NO_IDT
41045 +static inline void set_intr_gate(int nr, void *func)
41046 +{
41047 + BUG_ON((unsigned)nr > 0xFF);
41048 + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
41049 +}
41050 +
41051 +static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
41052 +{
41053 + BUG_ON((unsigned)nr > 0xFF);
41054 + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
41055 +}
41056 +
41057 +static inline void set_system_gate(int nr, void *func)
41058 +{
41059 + BUG_ON((unsigned)nr > 0xFF);
41060 + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
41061 +}
41062 +
41063 +static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
41064 +{
41065 + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
41066 +}
41067 +#endif
41068 +
41069 +static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type,
41070 + unsigned size)
41071 +{
41072 + struct ldttss_desc d;
41073 + memset(&d,0,sizeof(d));
41074 + d.limit0 = size & 0xFFFF;
41075 + d.base0 = PTR_LOW(tss);
41076 + d.base1 = PTR_MIDDLE(tss) & 0xFF;
41077 + d.type = type;
41078 + d.p = 1;
41079 + d.limit1 = (size >> 16) & 0xF;
41080 + d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF;
41081 + d.base3 = PTR_HIGH(tss);
41082 + memcpy(ptr, &d, 16);
41083 +}
41084 +
41085 +#ifndef CONFIG_X86_NO_TSS
41086 +static inline void set_tss_desc(unsigned cpu, void *addr)
41087 +{
41088 + /*
41089 + * sizeof(unsigned long) coming from an extra "long" at the end
41090 + * of the iobitmap. See tss_struct definition in processor.h
41091 + *
41092 + * The -1 is because seg base+limit should point to the address
41093 + * of the last valid byte
41094 + */
41095 + set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS],
41096 + (unsigned long)addr, DESC_TSS,
41097 + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
41098 +}
41099 +#endif
41100 +
41101 +static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
41102 +{
41103 + set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
41104 + DESC_LDT, size * 8 - 1);
41105 +}
41106 +
41107 +static inline void set_seg_base(unsigned cpu, int entry, void *base)
41108 +{
41109 + struct desc_struct *d = &cpu_gdt(cpu)[entry];
41110 + u32 addr = (u32)(u64)base;
41111 + BUG_ON((u64)base >> 32);
41112 + d->base0 = addr & 0xffff;
41113 + d->base1 = (addr >> 16) & 0xff;
41114 + d->base2 = (addr >> 24) & 0xff;
41115 +}
41116 +
41117 +#define LDT_entry_a(info) \
41118 + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
41119 +/* Don't allow setting of the lm bit. It is useless anyway because
41120 + 64bit system calls require __USER_CS. */
41121 +#define LDT_entry_b(info) \
41122 + (((info)->base_addr & 0xff000000) | \
41123 + (((info)->base_addr & 0x00ff0000) >> 16) | \
41124 + ((info)->limit & 0xf0000) | \
41125 + (((info)->read_exec_only ^ 1) << 9) | \
41126 + ((info)->contents << 10) | \
41127 + (((info)->seg_not_present ^ 1) << 15) | \
41128 + ((info)->seg_32bit << 22) | \
41129 + ((info)->limit_in_pages << 23) | \
41130 + ((info)->useable << 20) | \
41131 + /* ((info)->lm << 21) | */ \
41132 + 0x7000)
41133 +
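A hedged userspace sketch of the LDT_entry_a()/LDT_entry_b() packing above; the struct mirrors the relevant fields of struct user_desc from <asm/ldt.h>, and the chosen values (a flat, 32-bit, 4 GiB data segment) are made up purely for illustration:

#include <stdio.h>
#include <stdint.h>

struct user_desc_sketch {
	unsigned int base_addr;
	unsigned int limit;
	unsigned int seg_32bit:1;
	unsigned int contents:2;
	unsigned int read_exec_only:1;
	unsigned int limit_in_pages:1;
	unsigned int seg_not_present:1;
	unsigned int useable:1;
	unsigned int lm:1;
};

int main(void)
{
	struct user_desc_sketch d = {
		.base_addr = 0, .limit = 0xfffff,
		.seg_32bit = 1, .limit_in_pages = 1, .useable = 1,
	};
	/* same bit arithmetic as LDT_entry_a()/LDT_entry_b() above */
	uint32_t a = ((d.base_addr & 0x0000ffff) << 16) | (d.limit & 0x0ffff);
	uint32_t b = (d.base_addr & 0xff000000)
	           | ((d.base_addr & 0x00ff0000) >> 16)
	           | (d.limit & 0xf0000)
	           | ((d.read_exec_only ^ 1) << 9)
	           | (d.contents << 10)
	           | ((d.seg_not_present ^ 1) << 15)
	           | (d.seg_32bit << 22)
	           | (d.limit_in_pages << 23)
	           | (d.useable << 20)
	           | 0x7000;
	printf("entry a=%#010x b=%#010x\n", (unsigned)a, (unsigned)b);
	return 0;
}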
41134 +#define LDT_empty(info) (\
41135 + (info)->base_addr == 0 && \
41136 + (info)->limit == 0 && \
41137 + (info)->contents == 0 && \
41138 + (info)->read_exec_only == 1 && \
41139 + (info)->seg_32bit == 0 && \
41140 + (info)->limit_in_pages == 0 && \
41141 + (info)->seg_not_present == 1 && \
41142 + (info)->useable == 0 && \
41143 + (info)->lm == 0)
41144 +
41145 +#if TLS_SIZE != 24
41146 +# error update this code.
41147 +#endif
41148 +
41149 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
41150 +{
41151 +#if 0
41152 + u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
41153 + gdt[0] = t->tls_array[0];
41154 + gdt[1] = t->tls_array[1];
41155 + gdt[2] = t->tls_array[2];
41156 +#endif
41157 +#define C(i) \
41158 + if (HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), \
41159 + t->tls_array[i])) \
41160 + BUG();
41161 +
41162 + C(0); C(1); C(2);
41163 +#undef C
41164 +}
41165 +
41166 +/*
41167 + * load one particular LDT into the current CPU
41168 + */
41169 +static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
41170 +{
41171 + void *segments = pc->ldt;
41172 + int count = pc->size;
41173 +
41174 + if (likely(!count))
41175 + segments = NULL;
41176 +
41177 + xen_set_ldt(segments, count);
41178 +}
41179 +
41180 +static inline void load_LDT(mm_context_t *pc)
41181 +{
41182 + int cpu = get_cpu();
41183 + load_LDT_nolock(pc, cpu);
41184 + put_cpu();
41185 +}
41186 +
41187 +extern struct desc_ptr idt_descr;
41188 +
41189 +#endif /* !__ASSEMBLY__ */
41190 +
41191 +#endif
41192 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_64.h
41193 ===================================================================
41194 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
41195 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_64.h 2007-06-12 13:14:13.000000000 +0200
41196 @@ -0,0 +1,207 @@
41197 +#ifndef _X8664_DMA_MAPPING_H
41198 +#define _X8664_DMA_MAPPING_H 1
41199 +
41200 +/*
41201 + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
41202 + * documentation.
41203 + */
41204 +
41205 +
41206 +#include <asm/scatterlist.h>
41207 +#include <asm/io.h>
41208 +#include <asm/swiotlb.h>
41209 +
41210 +struct dma_mapping_ops {
41211 + int (*mapping_error)(dma_addr_t dma_addr);
41212 + void* (*alloc_coherent)(struct device *dev, size_t size,
41213 + dma_addr_t *dma_handle, gfp_t gfp);
41214 + void (*free_coherent)(struct device *dev, size_t size,
41215 + void *vaddr, dma_addr_t dma_handle);
41216 + dma_addr_t (*map_single)(struct device *hwdev, void *ptr,
41217 + size_t size, int direction);
41218 + /* like map_single, but doesn't check the device mask */
41219 + dma_addr_t (*map_simple)(struct device *hwdev, char *ptr,
41220 + size_t size, int direction);
41221 + void (*unmap_single)(struct device *dev, dma_addr_t addr,
41222 + size_t size, int direction);
41223 + void (*sync_single_for_cpu)(struct device *hwdev,
41224 + dma_addr_t dma_handle, size_t size,
41225 + int direction);
41226 + void (*sync_single_for_device)(struct device *hwdev,
41227 + dma_addr_t dma_handle, size_t size,
41228 + int direction);
41229 + void (*sync_single_range_for_cpu)(struct device *hwdev,
41230 + dma_addr_t dma_handle, unsigned long offset,
41231 + size_t size, int direction);
41232 + void (*sync_single_range_for_device)(struct device *hwdev,
41233 + dma_addr_t dma_handle, unsigned long offset,
41234 + size_t size, int direction);
41235 + void (*sync_sg_for_cpu)(struct device *hwdev,
41236 + struct scatterlist *sg, int nelems,
41237 + int direction);
41238 + void (*sync_sg_for_device)(struct device *hwdev,
41239 + struct scatterlist *sg, int nelems,
41240 + int direction);
41241 + int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
41242 + int nents, int direction);
41243 + void (*unmap_sg)(struct device *hwdev,
41244 + struct scatterlist *sg, int nents,
41245 + int direction);
41246 + int (*dma_supported)(struct device *hwdev, u64 mask);
41247 + int is_phys;
41248 +};
41249 +
41250 +extern dma_addr_t bad_dma_address;
41251 +extern struct dma_mapping_ops* dma_ops;
41252 +extern int iommu_merge;
41253 +
41254 +static inline int valid_dma_direction(int dma_direction)
41255 +{
41256 + return ((dma_direction == DMA_BIDIRECTIONAL) ||
41257 + (dma_direction == DMA_TO_DEVICE) ||
41258 + (dma_direction == DMA_FROM_DEVICE));
41259 +}
41260 +
41261 +#if 0
41262 +static inline int dma_mapping_error(dma_addr_t dma_addr)
41263 +{
41264 + if (dma_ops->mapping_error)
41265 + return dma_ops->mapping_error(dma_addr);
41266 +
41267 + return (dma_addr == bad_dma_address);
41268 +}
41269 +
41270 +extern void *dma_alloc_coherent(struct device *dev, size_t size,
41271 + dma_addr_t *dma_handle, gfp_t gfp);
41272 +extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
41273 + dma_addr_t dma_handle);
41274 +
41275 +static inline dma_addr_t
41276 +dma_map_single(struct device *hwdev, void *ptr, size_t size,
41277 + int direction)
41278 +{
41279 + BUG_ON(!valid_dma_direction(direction));
41280 + return dma_ops->map_single(hwdev, ptr, size, direction);
41281 +}
41282 +
41283 +static inline void
41284 +dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
41285 + int direction)
41286 +{
41287 + BUG_ON(!valid_dma_direction(direction));
41288 + dma_ops->unmap_single(dev, addr, size, direction);
41289 +}
41290 +
41291 +#define dma_map_page(dev,page,offset,size,dir) \
41292 + dma_map_single((dev), page_address(page)+(offset), (size), (dir))
41293 +
41294 +#define dma_unmap_page dma_unmap_single
41295 +
41296 +static inline void
41297 +dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
41298 + size_t size, int direction)
41299 +{
41300 + BUG_ON(!valid_dma_direction(direction));
41301 + if (dma_ops->sync_single_for_cpu)
41302 + dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
41303 + direction);
41304 + flush_write_buffers();
41305 +}
41306 +
41307 +static inline void
41308 +dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
41309 + size_t size, int direction)
41310 +{
41311 + BUG_ON(!valid_dma_direction(direction));
41312 + if (dma_ops->sync_single_for_device)
41313 + dma_ops->sync_single_for_device(hwdev, dma_handle, size,
41314 + direction);
41315 + flush_write_buffers();
41316 +}
41317 +
41318 +static inline void
41319 +dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
41320 + unsigned long offset, size_t size, int direction)
41321 +{
41322 + BUG_ON(!valid_dma_direction(direction));
41323 + if (dma_ops->sync_single_range_for_cpu) {
41324 + dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction);
41325 + }
41326 +
41327 + flush_write_buffers();
41328 +}
41329 +
41330 +static inline void
41331 +dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
41332 + unsigned long offset, size_t size, int direction)
41333 +{
41334 + BUG_ON(!valid_dma_direction(direction));
41335 + if (dma_ops->sync_single_range_for_device)
41336 + dma_ops->sync_single_range_for_device(hwdev, dma_handle,
41337 + offset, size, direction);
41338 +
41339 + flush_write_buffers();
41340 +}
41341 +
41342 +static inline void
41343 +dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
41344 + int nelems, int direction)
41345 +{
41346 + BUG_ON(!valid_dma_direction(direction));
41347 + if (dma_ops->sync_sg_for_cpu)
41348 + dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
41349 + flush_write_buffers();
41350 +}
41351 +
41352 +static inline void
41353 +dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
41354 + int nelems, int direction)
41355 +{
41356 + BUG_ON(!valid_dma_direction(direction));
41357 + if (dma_ops->sync_sg_for_device) {
41358 + dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
41359 + }
41360 +
41361 + flush_write_buffers();
41362 +}
41363 +
41364 +static inline int
41365 +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction)
41366 +{
41367 + BUG_ON(!valid_dma_direction(direction));
41368 + return dma_ops->map_sg(hwdev, sg, nents, direction);
41369 +}
41370 +
41371 +static inline void
41372 +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
41373 + int direction)
41374 +{
41375 + BUG_ON(!valid_dma_direction(direction));
41376 + dma_ops->unmap_sg(hwdev, sg, nents, direction);
41377 +}
41378 +
41379 +extern int dma_supported(struct device *hwdev, u64 mask);
41380 +
41381 +/* same for gart, swiotlb, and nommu */
41382 +static inline int dma_get_cache_alignment(void)
41383 +{
41384 + return boot_cpu_data.x86_clflush_size;
41385 +}
41386 +
41387 +#define dma_is_consistent(h) 1
41388 +
41389 +extern int dma_set_mask(struct device *dev, u64 mask);
41390 +
41391 +static inline void
41392 +dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir)
41393 +{
41394 + flush_write_buffers();
41395 +}
41396 +
41397 +extern struct device fallback_dev;
41398 +extern int panic_on_overflow;
41399 +#endif
41400 +
41401 +#endif /* _X8664_DMA_MAPPING_H */
41402 +
41403 +#include <asm-i386/mach-xen/asm/dma-mapping.h>
41404 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_64.h
41405 ===================================================================
41406 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
41407 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_64.h 2007-06-12 13:14:13.000000000 +0200
41408 @@ -0,0 +1,112 @@
41409 +/*
41410 + * fixmap.h: compile-time virtual memory allocation
41411 + *
41412 + * This file is subject to the terms and conditions of the GNU General Public
41413 + * License. See the file "COPYING" in the main directory of this archive
41414 + * for more details.
41415 + *
41416 + * Copyright (C) 1998 Ingo Molnar
41417 + */
41418 +
41419 +#ifndef _ASM_FIXMAP_H
41420 +#define _ASM_FIXMAP_H
41421 +
41422 +#include <linux/kernel.h>
41423 +#include <asm/apicdef.h>
41424 +#include <asm/page.h>
41425 +#include <asm/vsyscall.h>
41426 +#include <asm/vsyscall32.h>
41427 +#include <asm/acpi.h>
41428 +
41429 +/*
41430 + * Here we define all the compile-time 'special' virtual
41431 + * addresses. The point is to have a constant address at
41432 + * compile time, but to set the physical address only
41433 + * in the boot process.
41434 + *
41435 + * these 'compile-time allocated' memory buffers are
41436 + * fixed-size 4k pages. (or larger if used with an increment
41437 + * highger than 1) use fixmap_set(idx,phys) to associate
41438 + * physical memory with fixmap indices.
41439 + *
41440 + * TLB entries of such buffers will not be flushed across
41441 + * task switches.
41442 + */
41443 +
41444 +enum fixed_addresses {
41445 + VSYSCALL_LAST_PAGE,
41446 + VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
41447 + VSYSCALL_HPET,
41448 + FIX_HPET_BASE,
41449 +#ifdef CONFIG_X86_LOCAL_APIC
41450 + FIX_APIC_BASE, /* local (CPU) APIC -- required for SMP or not */
41451 +#endif
41452 +#ifdef CONFIG_X86_IO_APIC
41453 + FIX_IO_APIC_BASE_0,
41454 + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
41455 +#endif
41456 +#ifdef CONFIG_ACPI
41457 + FIX_ACPI_BEGIN,
41458 + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
41459 +#endif
41460 + FIX_SHARED_INFO,
41461 +#define NR_FIX_ISAMAPS 256
41462 + FIX_ISAMAP_END,
41463 + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
41464 + __end_of_permanent_fixed_addresses,
41465 + /* temporary boot-time mappings, used before ioremap() is functional */
41466 +#define NR_FIX_BTMAPS 16
41467 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
41468 + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
41469 + __end_of_fixed_addresses
41470 +};
41471 +
41472 +extern void __set_fixmap (enum fixed_addresses idx,
41473 + unsigned long phys, pgprot_t flags);
41474 +
41475 +#define set_fixmap(idx, phys) \
41476 + __set_fixmap(idx, phys, PAGE_KERNEL)
41477 +/*
41478 + * Some hardware wants to get fixmapped without caching.
41479 + */
41480 +#define set_fixmap_nocache(idx, phys) \
41481 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
41482 +
41483 +#define clear_fixmap(idx) \
41484 + __set_fixmap(idx, 0, __pgprot(0))
41485 +
41486 +#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
41487 +#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
41488 +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
41489 +
41490 +/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
41491 +#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
41492 +#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
41493 +
41494 +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
41495 +
41496 +extern void __this_fixmap_does_not_exist(void);
41497 +
41498 +/*
41499 + * 'index to address' translation. If anyone tries to use the idx
41500 + * directly without translation, we catch the bug with a NULL-dereference
41501 + * kernel oops. Illegal ranges of incoming indices are caught too.
41502 + */
41503 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
41504 +{
41505 + /*
41506 + * this branch gets completely eliminated after inlining,
41507 + * except when someone tries to use fixaddr indices in an
41508 + * illegal way. (such as mixing up address types or using
41509 + * out-of-range indices).
41510 + *
41511 + * If it doesn't get removed, the linker will complain
41512 + * loudly with a reasonably clear error message..
41513 + */
41514 + if (idx >= __end_of_fixed_addresses)
41515 + __this_fixmap_does_not_exist();
41516 +
41517 + return __fix_to_virt(idx);
41518 +}
41519 +
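A worked example of the __fix_to_virt() arithmetic above, runnable in userspace; SKETCH_FIXADDR_TOP and SKETCH_PAGE_SHIFT are made-up stand-ins for the real FIXADDR_TOP and PAGE_SHIFT, which come from the headers included above:

#include <stdio.h>

#define SKETCH_PAGE_SHIFT	12
#define SKETCH_FIXADDR_TOP	0xffffffffff600000UL	/* hypothetical value */
#define sketch_fix_to_virt(x) \
	(SKETCH_FIXADDR_TOP - ((unsigned long)(x) << SKETCH_PAGE_SHIFT))

int main(void)
{
	/* higher indices sit at lower addresses, one page apart */
	for (unsigned int idx = 0; idx < 4; idx++)
		printf("fix_to_virt(%u) = %#lx\n", idx, sketch_fix_to_virt(idx));
	return 0;
}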
41520 +#endif
41521 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_64.h
41522 ===================================================================
41523 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
41524 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_64.h 2008-11-25 12:22:34.000000000 +0100
41525 @@ -0,0 +1,408 @@
41526 +/******************************************************************************
41527 + * hypercall.h
41528 + *
41529 + * Linux-specific hypervisor handling.
41530 + *
41531 + * Copyright (c) 2002-2004, K A Fraser
41532 + *
41533 + * 64-bit updates:
41534 + * Benjamin Liu <benjamin.liu@intel.com>
41535 + * Jun Nakajima <jun.nakajima@intel.com>
41536 + *
41537 + * This program is free software; you can redistribute it and/or
41538 + * modify it under the terms of the GNU General Public License version 2
41539 + * as published by the Free Software Foundation; or, when distributed
41540 + * separately from the Linux kernel or incorporated into other
41541 + * software packages, subject to the following license:
41542 + *
41543 + * Permission is hereby granted, free of charge, to any person obtaining a copy
41544 + * of this source file (the "Software"), to deal in the Software without
41545 + * restriction, including without limitation the rights to use, copy, modify,
41546 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
41547 + * and to permit persons to whom the Software is furnished to do so, subject to
41548 + * the following conditions:
41549 + *
41550 + * The above copyright notice and this permission notice shall be included in
41551 + * all copies or substantial portions of the Software.
41552 + *
41553 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41554 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
41555 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41556 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
41557 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
41558 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
41559 + * IN THE SOFTWARE.
41560 + */
41561 +
41562 +#ifndef __HYPERCALL_H__
41563 +#define __HYPERCALL_H__
41564 +
41565 +#include <linux/string.h> /* memcpy() */
41566 +#include <linux/stringify.h>
41567 +
41568 +#ifndef __HYPERVISOR_H__
41569 +# error "please don't include this file directly"
41570 +#endif
41571 +
41572 +#ifdef CONFIG_XEN
41573 +#define HYPERCALL_STR(name) \
41574 + "call hypercall_page + ("__stringify(__HYPERVISOR_##name)" * 32)"
41575 +#else
41576 +#define HYPERCALL_STR(name) \
41577 + "mov $("__stringify(__HYPERVISOR_##name)" * 32),%%eax; "\
41578 + "add hypercall_stubs(%%rip),%%rax; " \
41579 + "call *%%rax"
41580 +#endif
41581 +
41582 +#define _hypercall0(type, name) \
41583 +({ \
41584 + type __res; \
41585 + asm volatile ( \
41586 + HYPERCALL_STR(name) \
41587 + : "=a" (__res) \
41588 + : \
41589 + : "memory" ); \
41590 + __res; \
41591 +})
41592 +
41593 +#define _hypercall1(type, name, a1) \
41594 +({ \
41595 + type __res; \
41596 + long __ign1; \
41597 + asm volatile ( \
41598 + HYPERCALL_STR(name) \
41599 + : "=a" (__res), "=D" (__ign1) \
41600 + : "1" ((long)(a1)) \
41601 + : "memory" ); \
41602 + __res; \
41603 +})
41604 +
41605 +#define _hypercall2(type, name, a1, a2) \
41606 +({ \
41607 + type __res; \
41608 + long __ign1, __ign2; \
41609 + asm volatile ( \
41610 + HYPERCALL_STR(name) \
41611 + : "=a" (__res), "=D" (__ign1), "=S" (__ign2) \
41612 + : "1" ((long)(a1)), "2" ((long)(a2)) \
41613 + : "memory" ); \
41614 + __res; \
41615 +})
41616 +
41617 +#define _hypercall3(type, name, a1, a2, a3) \
41618 +({ \
41619 + type __res; \
41620 + long __ign1, __ign2, __ign3; \
41621 + asm volatile ( \
41622 + HYPERCALL_STR(name) \
41623 + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
41624 + "=d" (__ign3) \
41625 + : "1" ((long)(a1)), "2" ((long)(a2)), \
41626 + "3" ((long)(a3)) \
41627 + : "memory" ); \
41628 + __res; \
41629 +})
41630 +
41631 +#define _hypercall4(type, name, a1, a2, a3, a4) \
41632 +({ \
41633 + type __res; \
41634 + long __ign1, __ign2, __ign3; \
41635 + register long __arg4 asm("r10") = (long)(a4); \
41636 + asm volatile ( \
41637 + HYPERCALL_STR(name) \
41638 + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
41639 + "=d" (__ign3), "+r" (__arg4) \
41640 + : "1" ((long)(a1)), "2" ((long)(a2)), \
41641 + "3" ((long)(a3)) \
41642 + : "memory" ); \
41643 + __res; \
41644 +})
41645 +
41646 +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
41647 +({ \
41648 + type __res; \
41649 + long __ign1, __ign2, __ign3; \
41650 + register long __arg4 asm("r10") = (long)(a4); \
41651 + register long __arg5 asm("r8") = (long)(a5); \
41652 + asm volatile ( \
41653 + HYPERCALL_STR(name) \
41654 + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
41655 + "=d" (__ign3), "+r" (__arg4), "+r" (__arg5) \
41656 + : "1" ((long)(a1)), "2" ((long)(a2)), \
41657 + "3" ((long)(a3)) \
41658 + : "memory" ); \
41659 + __res; \
41660 +})
41661 +
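A hedged, Linux/x86-64-only userspace sketch of the register-constraint pattern the _hypercallN macros rely on: arguments are pinned to %rdi/%rsi/%rdx (with r10 and r8 for the fourth and fifth), the result comes back in %rax, and memory is clobbered. The hypercall page is replaced by a local stub so the sketch runs without a hypervisor; the stub and its behaviour are invented for the example and, like the real hypervisor entry, it preserves every register it does not return in:

#include <stdio.h>

/* stand-in for "call hypercall_page + (__HYPERVISOR_xxx * 32)" */
__asm__(
	".text\n"
	"stub:\n"
	"	lea (%rdi,%rsi,1), %rax\n"	/* rax = a1 + a2 */
	"	add %rdx, %rax\n"		/* rax += a3     */
	"	ret\n");

#define fake_hypercall3(a1, a2, a3)					\
({									\
	long __res, __ign1, __ign2, __ign3;				\
	asm volatile (							\
		"call stub"						\
		: "=a" (__res), "=D" (__ign1), "=S" (__ign2),		\
		  "=d" (__ign3)						\
		: "1" ((long)(a1)), "2" ((long)(a2)), "3" ((long)(a3))	\
		: "memory" );						\
	__res;								\
})

int main(void)
{
	printf("%ld\n", fake_hypercall3(1, 2, 3));	/* prints 6 */
	return 0;
}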
41662 +static inline int __must_check
41663 +HYPERVISOR_set_trap_table(
41664 + const trap_info_t *table)
41665 +{
41666 + return _hypercall1(int, set_trap_table, table);
41667 +}
41668 +
41669 +static inline int __must_check
41670 +HYPERVISOR_mmu_update(
41671 + mmu_update_t *req, unsigned int count, unsigned int *success_count,
41672 + domid_t domid)
41673 +{
41674 + return _hypercall4(int, mmu_update, req, count, success_count, domid);
41675 +}
41676 +
41677 +static inline int __must_check
41678 +HYPERVISOR_mmuext_op(
41679 + struct mmuext_op *op, unsigned int count, unsigned int *success_count,
41680 + domid_t domid)
41681 +{
41682 + return _hypercall4(int, mmuext_op, op, count, success_count, domid);
41683 +}
41684 +
41685 +static inline int __must_check
41686 +HYPERVISOR_set_gdt(
41687 + unsigned long *frame_list, unsigned int entries)
41688 +{
41689 + return _hypercall2(int, set_gdt, frame_list, entries);
41690 +}
41691 +
41692 +static inline int __must_check
41693 +HYPERVISOR_stack_switch(
41694 + unsigned long ss, unsigned long esp)
41695 +{
41696 + return _hypercall2(int, stack_switch, ss, esp);
41697 +}
41698 +
41699 +static inline int __must_check
41700 +HYPERVISOR_set_callbacks(
41701 + unsigned long event_address, unsigned long failsafe_address,
41702 + unsigned long syscall_address)
41703 +{
41704 + return _hypercall3(int, set_callbacks,
41705 + event_address, failsafe_address, syscall_address);
41706 +}
41707 +
41708 +static inline int
41709 +HYPERVISOR_fpu_taskswitch(
41710 + int set)
41711 +{
41712 + return _hypercall1(int, fpu_taskswitch, set);
41713 +}
41714 +
41715 +static inline int __must_check
41716 +HYPERVISOR_sched_op_compat(
41717 + int cmd, unsigned long arg)
41718 +{
41719 + return _hypercall2(int, sched_op_compat, cmd, arg);
41720 +}
41721 +
41722 +static inline int __must_check
41723 +HYPERVISOR_sched_op(
41724 + int cmd, void *arg)
41725 +{
41726 + return _hypercall2(int, sched_op, cmd, arg);
41727 +}
41728 +
41729 +static inline long __must_check
41730 +HYPERVISOR_set_timer_op(
41731 + u64 timeout)
41732 +{
41733 + return _hypercall1(long, set_timer_op, timeout);
41734 +}
41735 +
41736 +static inline int __must_check
41737 +HYPERVISOR_platform_op(
41738 + struct xen_platform_op *platform_op)
41739 +{
41740 + platform_op->interface_version = XENPF_INTERFACE_VERSION;
41741 + return _hypercall1(int, platform_op, platform_op);
41742 +}
41743 +
41744 +static inline int __must_check
41745 +HYPERVISOR_set_debugreg(
41746 + unsigned int reg, unsigned long value)
41747 +{
41748 + return _hypercall2(int, set_debugreg, reg, value);
41749 +}
41750 +
41751 +static inline unsigned long __must_check
41752 +HYPERVISOR_get_debugreg(
41753 + unsigned int reg)
41754 +{
41755 + return _hypercall1(unsigned long, get_debugreg, reg);
41756 +}
41757 +
41758 +static inline int __must_check
41759 +HYPERVISOR_update_descriptor(
41760 + unsigned long ma, unsigned long word)
41761 +{
41762 + return _hypercall2(int, update_descriptor, ma, word);
41763 +}
41764 +
41765 +static inline int __must_check
41766 +HYPERVISOR_memory_op(
41767 + unsigned int cmd, void *arg)
41768 +{
41769 + return _hypercall2(int, memory_op, cmd, arg);
41770 +}
41771 +
41772 +static inline int __must_check
41773 +HYPERVISOR_multicall(
41774 + multicall_entry_t *call_list, unsigned int nr_calls)
41775 +{
41776 + return _hypercall2(int, multicall, call_list, nr_calls);
41777 +}
41778 +
41779 +static inline int __must_check
41780 +HYPERVISOR_update_va_mapping(
41781 + unsigned long va, pte_t new_val, unsigned long flags)
41782 +{
41783 + return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
41784 +}
41785 +
41786 +static inline int __must_check
41787 +HYPERVISOR_event_channel_op(
41788 + int cmd, void *arg)
41789 +{
41790 + int rc = _hypercall2(int, event_channel_op, cmd, arg);
41791 +
41792 +#if CONFIG_XEN_COMPAT <= 0x030002
41793 + if (unlikely(rc == -ENOSYS)) {
41794 + struct evtchn_op op;
41795 + op.cmd = cmd;
41796 + memcpy(&op.u, arg, sizeof(op.u));
41797 + rc = _hypercall1(int, event_channel_op_compat, &op);
41798 + memcpy(arg, &op.u, sizeof(op.u));
41799 + }
41800 +#endif
41801 +
41802 + return rc;
41803 +}
41804 +
41805 +static inline int __must_check
41806 +HYPERVISOR_xen_version(
41807 + int cmd, void *arg)
41808 +{
41809 + return _hypercall2(int, xen_version, cmd, arg);
41810 +}
41811 +
41812 +static inline int __must_check
41813 +HYPERVISOR_console_io(
41814 + int cmd, unsigned int count, char *str)
41815 +{
41816 + return _hypercall3(int, console_io, cmd, count, str);
41817 +}
41818 +
41819 +static inline int __must_check
41820 +HYPERVISOR_physdev_op(
41821 + int cmd, void *arg)
41822 +{
41823 + int rc = _hypercall2(int, physdev_op, cmd, arg);
41824 +
41825 +#if CONFIG_XEN_COMPAT <= 0x030002
41826 + if (unlikely(rc == -ENOSYS)) {
41827 + struct physdev_op op;
41828 + op.cmd = cmd;
41829 + memcpy(&op.u, arg, sizeof(op.u));
41830 + rc = _hypercall1(int, physdev_op_compat, &op);
41831 + memcpy(arg, &op.u, sizeof(op.u));
41832 + }
41833 +#endif
41834 +
41835 + return rc;
41836 +}
41837 +
41838 +static inline int __must_check
41839 +HYPERVISOR_grant_table_op(
41840 + unsigned int cmd, void *uop, unsigned int count)
41841 +{
41842 + return _hypercall3(int, grant_table_op, cmd, uop, count);
41843 +}
41844 +
41845 +static inline int __must_check
41846 +HYPERVISOR_update_va_mapping_otherdomain(
41847 + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
41848 +{
41849 + return _hypercall4(int, update_va_mapping_otherdomain, va,
41850 + new_val.pte, flags, domid);
41851 +}
41852 +
41853 +static inline int __must_check
41854 +HYPERVISOR_vm_assist(
41855 + unsigned int cmd, unsigned int type)
41856 +{
41857 + return _hypercall2(int, vm_assist, cmd, type);
41858 +}
41859 +
41860 +static inline int __must_check
41861 +HYPERVISOR_vcpu_op(
41862 + int cmd, unsigned int vcpuid, void *extra_args)
41863 +{
41864 + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
41865 +}
41866 +
41867 +static inline int __must_check
41868 +HYPERVISOR_set_segment_base(
41869 + int reg, unsigned long value)
41870 +{
41871 + return _hypercall2(int, set_segment_base, reg, value);
41872 +}
41873 +
41874 +static inline int __must_check
41875 +HYPERVISOR_suspend(
41876 + unsigned long srec)
41877 +{
41878 + struct sched_shutdown sched_shutdown = {
41879 + .reason = SHUTDOWN_suspend
41880 + };
41881 +
41882 + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
41883 + &sched_shutdown, srec);
41884 +
41885 +#if CONFIG_XEN_COMPAT <= 0x030002
41886 + if (rc == -ENOSYS)
41887 + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
41888 + SHUTDOWN_suspend, srec);
41889 +#endif
41890 +
41891 + return rc;
41892 +}
41893 +
41894 +#if CONFIG_XEN_COMPAT <= 0x030002
41895 +static inline int
41896 +HYPERVISOR_nmi_op(
41897 + unsigned long op, void *arg)
41898 +{
41899 + return _hypercall2(int, nmi_op, op, arg);
41900 +}
41901 +#endif
41902 +
41903 +#ifndef CONFIG_XEN
41904 +static inline unsigned long __must_check
41905 +HYPERVISOR_hvm_op(
41906 + int op, void *arg)
41907 +{
41908 + return _hypercall2(unsigned long, hvm_op, op, arg);
41909 +}
41910 +#endif
41911 +
41912 +static inline int __must_check
41913 +HYPERVISOR_callback_op(
41914 + int cmd, const void *arg)
41915 +{
41916 + return _hypercall2(int, callback_op, cmd, arg);
41917 +}
41918 +
41919 +static inline int __must_check
41920 +HYPERVISOR_xenoprof_op(
41921 + int op, void *arg)
41922 +{
41923 + return _hypercall2(int, xenoprof_op, op, arg);
41924 +}
41925 +
41926 +static inline int __must_check
41927 +HYPERVISOR_kexec_op(
41928 + unsigned long op, void *args)
41929 +{
41930 + return _hypercall2(int, kexec_op, op, args);
41931 +}
41932 +
41933 +#endif /* __HYPERCALL_H__ */
41934 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/io_64.h
41935 ===================================================================
41936 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
41937 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/io_64.h 2007-08-16 18:07:01.000000000 +0200
41938 @@ -0,0 +1,329 @@
41939 +#ifndef _ASM_IO_H
41940 +#define _ASM_IO_H
41941 +
41942 +#include <asm/fixmap.h>
41943 +
41944 +/*
41945 + * This file contains the definitions for the x86 IO instructions
41946 + * inb/inw/inl/outb/outw/outl and the "string versions" of the same
41947 + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
41948 + * versions of the single-IO instructions (inb_p/inw_p/..).
41949 + *
41950 + * This file is not meant to be obfuscating: it's just complicated
41951 + * to (a) handle it all in a way that makes gcc able to optimize it
41952 + * as well as possible and (b) to avoid writing the same thing
41953 + * over and over again with slight variations and possibly making a
41954 + * mistake somewhere.
41955 + */
41956 +
41957 +/*
41958 + * Thanks to James van Artsdalen for a better timing-fix than
41959 + * the two short jumps: using outb's to a nonexistent port seems
41960 + * to guarantee better timings even on fast machines.
41961 + *
41962 + * On the other hand, I'd like to be sure of a non-existent port:
41963 + * I feel a bit unsafe about using 0x80 (should be safe, though)
41964 + *
41965 + * Linus
41966 + */
41967 +
41968 + /*
41969 + * Bit simplified and optimized by Jan Hubicka
41970 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
41971 + *
41972 + * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
41973 + * isa_read[wl] and isa_write[wl] fixed
41974 + * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
41975 + */
41976 +
41977 +#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
41978 +
41979 +#ifdef REALLY_SLOW_IO
41980 +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
41981 +#else
41982 +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
41983 +#endif
41984 +
41985 +/*
41986 + * Talk about misusing macros..
41987 + */
41988 +#define __OUT1(s,x) \
41989 +static inline void out##s(unsigned x value, unsigned short port) {
41990 +
41991 +#define __OUT2(s,s1,s2) \
41992 +__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
41993 +
41994 +#define __OUT(s,s1,x) \
41995 +__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
41996 +__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \
41997 +
41998 +#define __IN1(s) \
41999 +static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
42000 +
42001 +#define __IN2(s,s1,s2) \
42002 +__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
42003 +
42004 +#define __IN(s,s1,i...) \
42005 +__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
42006 +__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
42007 +
42008 +#define __INS(s) \
42009 +static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
42010 +{ __asm__ __volatile__ ("rep ; ins" #s \
42011 +: "=D" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
42012 +
42013 +#define __OUTS(s) \
42014 +static inline void outs##s(unsigned short port, const void * addr, unsigned long count) \
42015 +{ __asm__ __volatile__ ("rep ; outs" #s \
42016 +: "=S" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
42017 +
42018 +#define RETURN_TYPE unsigned char
42019 +__IN(b,"")
42020 +#undef RETURN_TYPE
42021 +#define RETURN_TYPE unsigned short
42022 +__IN(w,"")
42023 +#undef RETURN_TYPE
42024 +#define RETURN_TYPE unsigned int
42025 +__IN(l,"")
42026 +#undef RETURN_TYPE
42027 +
42028 +__OUT(b,"b",char)
42029 +__OUT(w,"w",short)
42030 +__OUT(l,,int)
42031 +
42032 +__INS(b)
42033 +__INS(w)
42034 +__INS(l)
42035 +
42036 +__OUTS(b)
42037 +__OUTS(w)
42038 +__OUTS(l)
42039 +
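What __IN(b,"") and __OUT(b,"b",char) boil down to, shown as a hedged standalone userspace sketch. It is Linux/x86 only: <sys/io.h> and ioperm(2) are used so the program can actually execute (which needs root), and port 0x80 is chosen because it is the harmless diagnostic port also used by __SLOW_DOWN_IO above:

#include <stdio.h>
#include <sys/io.h>

/* same asm template and constraints that __OUT(b,"b",char) generates */
static inline void my_outb(unsigned char value, unsigned short port)
{
	__asm__ __volatile__("outb %b0,%w1" : : "a"(value), "Nd"(port));
}

/* same asm template and constraints that __IN(b,"") generates */
static inline unsigned char my_inb(unsigned short port)
{
	unsigned char v;
	__asm__ __volatile__("inb %w1,%0" : "=a"(v) : "Nd"(port));
	return v;
}

int main(void)
{
	if (ioperm(0x80, 1, 1)) {
		perror("ioperm (need root)");
		return 1;
	}
	my_outb(0xaa, 0x80);
	printf("port 0x80 reads back 0x%02x\n", my_inb(0x80));
	return 0;
}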
42040 +#define IO_SPACE_LIMIT 0xffff
42041 +
42042 +#if defined(__KERNEL__) && __x86_64__
42043 +
42044 +#include <linux/vmalloc.h>
42045 +
42046 +#ifndef __i386__
42047 +/*
42048 + * Change virtual addresses to physical addresses and vice versa.
42049 + * These are pretty trivial.
42050 + */
42051 +static inline unsigned long virt_to_phys(volatile void * address)
42052 +{
42053 + return __pa(address);
42054 +}
42055 +
42056 +static inline void * phys_to_virt(unsigned long address)
42057 +{
42058 + return __va(address);
42059 +}
42060 +
42061 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
42062 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
42063 +#endif
42064 +
42065 +/*
42066 + * Change "struct page" to physical address.
42067 + */
42068 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
42069 +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
42070 +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
42071 +
42072 +#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \
42073 + (unsigned long) bio_offset((bio)))
42074 +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
42075 + (unsigned long) (bv)->bv_offset)
42076 +
42077 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
42078 + (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
42079 + ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
42080 + bvec_to_pseudophys((vec2))))
42081 +
42082 +#include <asm-generic/iomap.h>
42083 +
42084 +extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags);
42085 +
42086 +static inline void __iomem * ioremap (unsigned long offset, unsigned long size)
42087 +{
42088 + return __ioremap(offset, size, 0);
42089 +}
42090 +
42091 +extern void *bt_ioremap(unsigned long addr, unsigned long size);
42092 +extern void bt_iounmap(void *addr, unsigned long size);
42093 +#define early_ioremap bt_ioremap
42094 +#define early_iounmap bt_iounmap
42095 +
42096 +/*
42097 + * This one maps high-address device memory and turns off caching for that area.
42098 + * It's useful if some control registers are in such an area and write combining
42099 + * or read caching is not desirable:
42100 + */
42101 +extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
42102 +extern void iounmap(volatile void __iomem *addr);
42103 +
42104 +/*
42105 + * ISA I/O bus memory addresses are 1:1 with the physical address.
42106 + */
42107 +
42108 +#define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); })
42109 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
42110 +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
42111 +
42112 +/*
42113 + * However PCI ones are not necessarily 1:1 and therefore these interfaces
42114 + * are forbidden in portable PCI drivers.
42115 + *
42116 + * Allow them on x86 for legacy drivers, though.
42117 + */
42118 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
42119 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
42120 +
42121 +/*
42122 + * readX/writeX() are used to access memory mapped devices. On some
42123 + * architectures the memory mapped IO stuff needs to be accessed
42124 + * differently. On the x86 architecture, we just read/write the
42125 + * memory location directly.
42126 + */
42127 +
42128 +static inline __u8 __readb(const volatile void __iomem *addr)
42129 +{
42130 + return *(__force volatile __u8 *)addr;
42131 +}
42132 +static inline __u16 __readw(const volatile void __iomem *addr)
42133 +{
42134 + return *(__force volatile __u16 *)addr;
42135 +}
42136 +static __always_inline __u32 __readl(const volatile void __iomem *addr)
42137 +{
42138 + return *(__force volatile __u32 *)addr;
42139 +}
42140 +static inline __u64 __readq(const volatile void __iomem *addr)
42141 +{
42142 + return *(__force volatile __u64 *)addr;
42143 +}
42144 +#define readb(x) __readb(x)
42145 +#define readw(x) __readw(x)
42146 +#define readl(x) __readl(x)
42147 +#define readq(x) __readq(x)
42148 +#define readb_relaxed(a) readb(a)
42149 +#define readw_relaxed(a) readw(a)
42150 +#define readl_relaxed(a) readl(a)
42151 +#define readq_relaxed(a) readq(a)
42152 +#define __raw_readb readb
42153 +#define __raw_readw readw
42154 +#define __raw_readl readl
42155 +#define __raw_readq readq
42156 +
42157 +#define mmiowb()
42158 +
42159 +static inline void __writel(__u32 b, volatile void __iomem *addr)
42160 +{
42161 + *(__force volatile __u32 *)addr = b;
42162 +}
42163 +static inline void __writeq(__u64 b, volatile void __iomem *addr)
42164 +{
42165 + *(__force volatile __u64 *)addr = b;
42166 +}
42167 +static inline void __writeb(__u8 b, volatile void __iomem *addr)
42168 +{
42169 + *(__force volatile __u8 *)addr = b;
42170 +}
42171 +static inline void __writew(__u16 b, volatile void __iomem *addr)
42172 +{
42173 + *(__force volatile __u16 *)addr = b;
42174 +}
42175 +#define writeq(val,addr) __writeq((val),(addr))
42176 +#define writel(val,addr) __writel((val),(addr))
42177 +#define writew(val,addr) __writew((val),(addr))
42178 +#define writeb(val,addr) __writeb((val),(addr))
42179 +#define __raw_writeb writeb
42180 +#define __raw_writew writew
42181 +#define __raw_writel writel
42182 +#define __raw_writeq writeq
42183 +
42184 +void __memcpy_fromio(void*,unsigned long,unsigned);
42185 +void __memcpy_toio(unsigned long,const void*,unsigned);
42186 +
42187 +static inline void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned len)
42188 +{
42189 + __memcpy_fromio(to,(unsigned long)from,len);
42190 +}
42191 +static inline void memcpy_toio(volatile void __iomem *to, const void *from, unsigned len)
42192 +{
42193 + __memcpy_toio((unsigned long)to,from,len);
42194 +}
42195 +
42196 +void memset_io(volatile void __iomem *a, int b, size_t c);
42197 +
42198 +/*
42199 + * ISA space is 'always mapped' on a typical x86 system, no need to
42200 + * explicitly ioremap() it. The fact that the ISA IO space is mapped
42201 + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
42202 + * are physical addresses. The following constant pointer can be
42203 + * used as the IO-area pointer (it can be iounmapped as well, so the
42204 + * analogy with PCI is quite close):
42205 + */
42206 +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
42207 +
42208 +/*
42209 + * Again, x86-64 does not require memory-IO-specific functions.
42210 + */
42211 +
42212 +#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void *)(b),(c),(d))
42213 +
42214 +/**
42215 + * check_signature - find BIOS signatures
42216 + * @io_addr: mmio address to check
42217 + * @signature: signature block
42218 + * @length: length of signature
42219 + *
42220 + * Perform a signature comparison with the mmio address io_addr. This
42221 + * address should have been obtained by ioremap.
42222 + * Returns 1 on a match.
42223 + */
42224 +
42225 +static inline int check_signature(void __iomem *io_addr,
42226 + const unsigned char *signature, int length)
42227 +{
42228 + int retval = 0;
42229 + do {
42230 + if (readb(io_addr) != *signature)
42231 + goto out;
42232 + io_addr++;
42233 + signature++;
42234 + length--;
42235 + } while (length);
42236 + retval = 1;
42237 +out:
42238 + return retval;
42239 +}
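/*
 * Usage sketch for check_signature(); the signature string and the ROM
 * window address below are invented for illustration.
 */
static inline int foo_find_bios_signature(void)
{
        static const unsigned char sig[] = "FOO!";      /* hypothetical signature */
        void __iomem *p = ioremap(0xe0000, 0x1000);     /* hypothetical BIOS ROM window */
        int found = 0;

        if (p) {
                found = check_signature(p, sig, sizeof(sig) - 1);
                iounmap(p);
        }
        return found;
}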
42240 +
42241 +/* Nothing to do */
42242 +
42243 +#define dma_cache_inv(_start,_size) do { } while (0)
42244 +#define dma_cache_wback(_start,_size) do { } while (0)
42245 +#define dma_cache_wback_inv(_start,_size) do { } while (0)
42246 +
42247 +#define flush_write_buffers()
42248 +
42249 +extern int iommu_bio_merge;
42250 +#define BIO_VMERGE_BOUNDARY iommu_bio_merge
42251 +
42252 +/*
42253 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
42254 + * access
42255 + */
42256 +#define xlate_dev_mem_ptr(p) __va(p)
42257 +
42258 +/*
42259 + * Convert a virtual cached pointer to an uncached pointer
42260 + */
42261 +#define xlate_dev_kmem_ptr(p) p
42262 +
42263 +#endif /* __KERNEL__ */
42264 +
42265 +#define ARCH_HAS_DEV_MEM
42266 +
42267 +#endif
42268 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_64.h
42269 ===================================================================
42270 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
42271 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_64.h 2007-06-12 13:14:13.000000000 +0200
42272 @@ -0,0 +1,139 @@
42273 +/*
42274 + * include/asm-x86_64/irqflags.h
42275 + *
42276 + * IRQ flags handling
42277 + *
42278 + * This file gets included from lowlevel asm headers too, to provide
42279 + * wrapped versions of the local_irq_*() APIs, based on the
42280 + * raw_local_irq_*() functions from the lowlevel headers.
42281 + */
42282 +#ifndef _ASM_IRQFLAGS_H
42283 +#define _ASM_IRQFLAGS_H
42284 +
42285 +#ifndef __ASSEMBLY__
42286 +/*
42287 + * Interrupt control:
42288 + */
42289 +
42290 +/*
42291 + * The use of 'barrier' in the following reflects their use as local-lock
42292 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
42293 + * critical operations are executed. All critical operations must complete
42294 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
42295 + * includes these barriers, for example.
42296 + */
42297 +
42298 +#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
42299 +
42300 +#define raw_local_save_flags(flags) \
42301 + do { (flags) = __raw_local_save_flags(); } while (0)
42302 +
42303 +#define raw_local_irq_restore(x) \
42304 +do { \
42305 + vcpu_info_t *_vcpu; \
42306 + barrier(); \
42307 + _vcpu = current_vcpu_info(); \
42308 + if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
42309 + barrier(); /* unmask then check (avoid races) */ \
42310 + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
42311 + force_evtchn_callback(); \
42312 + } \
42313 +} while (0)
42314 +
42315 +#ifdef CONFIG_X86_VSMP
42316 +
42317 +/*
42318 + * Interrupt control for the VSMP architecture:
42319 + */
42320 +
42321 +static inline void raw_local_irq_disable(void)
42322 +{
42323 + unsigned long flags = __raw_local_save_flags();
42324 +
42325 + raw_local_irq_restore((flags & ~(1 << 9)) | (1 << 18));
42326 +}
42327 +
42328 +static inline void raw_local_irq_enable(void)
42329 +{
42330 + unsigned long flags = __raw_local_save_flags();
42331 +
42332 + raw_local_irq_restore((flags | (1 << 9)) & ~(1 << 18));
42333 +}
42334 +
42335 +static inline int raw_irqs_disabled_flags(unsigned long flags)
42336 +{
42337 + return !(flags & (1<<9)) || (flags & (1 << 18));
42338 +}
42339 +
42340 +#else /* CONFIG_X86_VSMP */
42341 +
42342 +#define raw_local_irq_disable() \
42343 +do { \
42344 + current_vcpu_info()->evtchn_upcall_mask = 1; \
42345 + barrier(); \
42346 +} while (0)
42347 +
42348 +#define raw_local_irq_enable() \
42349 +do { \
42350 + vcpu_info_t *_vcpu; \
42351 + barrier(); \
42352 + _vcpu = current_vcpu_info(); \
42353 + _vcpu->evtchn_upcall_mask = 0; \
42354 + barrier(); /* unmask then check (avoid races) */ \
42355 + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
42356 + force_evtchn_callback(); \
42357 +} while (0)
42358 +
42359 +static inline int raw_irqs_disabled_flags(unsigned long flags)
42360 +{
42361 + return (flags != 0);
42362 +}
42363 +
42364 +#endif
42365 +
42366 +/*
42367 + * For spinlocks, etc.:
42368 + */
42369 +
42370 +#define __raw_local_irq_save() \
42371 +({ \
42372 + unsigned long flags = __raw_local_save_flags(); \
42373 + \
42374 + raw_local_irq_disable(); \
42375 + \
42376 + flags; \
42377 +})
42378 +
42379 +#define raw_local_irq_save(flags) \
42380 + do { (flags) = __raw_local_irq_save(); } while (0)
42381 +
42382 +#define raw_irqs_disabled() \
42383 +({ \
42384 + unsigned long flags = __raw_local_save_flags(); \
42385 + \
42386 + raw_irqs_disabled_flags(flags); \
42387 +})
42388 +
42389 +/*
42390 + * Used in the idle loop; sti takes one instruction cycle
42391 + * to complete:
42392 + */
42393 +void raw_safe_halt(void);
42394 +
42395 +/*
42396 + * Used when interrupts are already enabled or to
42397 + * shutdown the processor:
42398 + */
42399 +void halt(void);
42400 +
42401 +#else /* __ASSEMBLY__: */
42402 +# ifdef CONFIG_TRACE_IRQFLAGS
42403 +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk
42404 +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk
42405 +# else
42406 +# define TRACE_IRQS_ON
42407 +# define TRACE_IRQS_OFF
42408 +# endif
42409 +#endif
42410 +
42411 +#endif
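For illustration, a minimal sketch of the kind of critical section these helpers are meant for; under Xen, saving and restoring flags manipulates evtchn_upcall_mask rather than the hardware interrupt flag. The counter is made up.

        static unsigned long foo_events;        /* hypothetical counter */

        static void foo_count_event(void)
        {
                unsigned long flags;

                raw_local_irq_save(flags);      /* mask event-channel upcalls */
                foo_events++;
                raw_local_irq_restore(flags);   /* unmask; may call force_evtchn_callback() */
        }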
42412 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_64.h
42413 ===================================================================
42414 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
42415 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_64.h 2007-06-12 13:14:13.000000000 +0200
42416 @@ -0,0 +1,161 @@
42417 +#ifndef _X86_64_MADDR_H
42418 +#define _X86_64_MADDR_H
42419 +
42420 +#include <xen/features.h>
42421 +#include <xen/interface/xen.h>
42422 +
42423 +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
42424 +#define INVALID_P2M_ENTRY (~0UL)
42425 +#define FOREIGN_FRAME_BIT (1UL<<63)
42426 +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
42427 +
42428 +/* Definitions for machine and pseudophysical addresses. */
42429 +typedef unsigned long paddr_t;
42430 +typedef unsigned long maddr_t;
42431 +
42432 +#ifdef CONFIG_XEN
42433 +
42434 +extern unsigned long *phys_to_machine_mapping;
42435 +
42436 +#undef machine_to_phys_mapping
42437 +extern unsigned long *machine_to_phys_mapping;
42438 +extern unsigned int machine_to_phys_order;
42439 +
42440 +static inline unsigned long pfn_to_mfn(unsigned long pfn)
42441 +{
42442 + if (xen_feature(XENFEAT_auto_translated_physmap))
42443 + return pfn;
42444 + BUG_ON(end_pfn && pfn >= end_pfn);
42445 + return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
42446 +}
42447 +
42448 +static inline int phys_to_machine_mapping_valid(unsigned long pfn)
42449 +{
42450 + if (xen_feature(XENFEAT_auto_translated_physmap))
42451 + return 1;
42452 + BUG_ON(end_pfn && pfn >= end_pfn);
42453 + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
42454 +}
42455 +
42456 +static inline unsigned long mfn_to_pfn(unsigned long mfn)
42457 +{
42458 + unsigned long pfn;
42459 +
42460 + if (xen_feature(XENFEAT_auto_translated_physmap))
42461 + return mfn;
42462 +
42463 + if (unlikely((mfn >> machine_to_phys_order) != 0))
42464 + return end_pfn;
42465 +
42466 + /* The array access can fail (e.g., device space beyond end of RAM). */
42467 + asm (
42468 + "1: movq %1,%0\n"
42469 + "2:\n"
42470 + ".section .fixup,\"ax\"\n"
42471 + "3: movq %2,%0\n"
42472 + " jmp 2b\n"
42473 + ".previous\n"
42474 + ".section __ex_table,\"a\"\n"
42475 + " .align 8\n"
42476 + " .quad 1b,3b\n"
42477 + ".previous"
42478 + : "=r" (pfn)
42479 + : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) );
42480 +
42481 + return pfn;
42482 +}
42483 +
42484 +/*
42485 + * We detect special mappings in one of two ways:
42486 + * 1. If the MFN is an I/O page then Xen will set the m2p entry
42487 + * to be outside our maximum possible pseudophys range.
42488 + * 2. If the MFN belongs to a different domain then we will certainly
42489 + * not have MFN in our p2m table. Conversely, if the page is ours,
42490 + * then we'll have p2m(m2p(MFN))==MFN.
42491 + * If we detect a special mapping then it doesn't have a 'struct page'.
42492 + * We force !pfn_valid() by returning an out-of-range pointer.
42493 + *
42494 + * NB. These checks require that, for any MFN that is not in our reservation,
42495 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
42496 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
42497 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
42498 + *
42499 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
42500 + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
42501 + * require. In all the cases we care about, the FOREIGN_FRAME bit is
42502 + * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
42503 + */
42504 +static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
42505 +{
42506 + unsigned long pfn = mfn_to_pfn(mfn);
42507 + if ((pfn < end_pfn)
42508 + && !xen_feature(XENFEAT_auto_translated_physmap)
42509 + && (phys_to_machine_mapping[pfn] != mfn))
42510 + return end_pfn; /* force !pfn_valid() */
42511 + return pfn;
42512 +}
42513 +
42514 +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
42515 +{
42516 + BUG_ON(end_pfn && pfn >= end_pfn);
42517 + if (xen_feature(XENFEAT_auto_translated_physmap)) {
42518 + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
42519 + return;
42520 + }
42521 + phys_to_machine_mapping[pfn] = mfn;
42522 +}
42523 +
42524 +static inline maddr_t phys_to_machine(paddr_t phys)
42525 +{
42526 + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
42527 + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
42528 + return machine;
42529 +}
42530 +
42531 +static inline paddr_t machine_to_phys(maddr_t machine)
42532 +{
42533 + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
42534 + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
42535 + return phys;
42536 +}
42537 +
42538 +static inline paddr_t pte_phys_to_machine(paddr_t phys)
42539 +{
42540 + maddr_t machine;
42541 + machine = pfn_to_mfn((phys & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
42542 + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK);
42543 + return machine;
42544 +}
42545 +
42546 +static inline paddr_t pte_machine_to_phys(maddr_t machine)
42547 +{
42548 + paddr_t phys;
42549 + phys = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
42550 + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
42551 + return phys;
42552 +}
42553 +
42554 +#define __pte_ma(x) ((pte_t) { (x) } )
42555 +#define pfn_pte_ma(pfn, prot) __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
42556 +
42557 +#else /* !CONFIG_XEN */
42558 +
42559 +#define pfn_to_mfn(pfn) (pfn)
42560 +#define mfn_to_pfn(mfn) (mfn)
42561 +#define mfn_to_local_pfn(mfn) (mfn)
42562 +#define set_phys_to_machine(pfn, mfn) ((void)0)
42563 +#define phys_to_machine_mapping_valid(pfn) (1)
42564 +#define phys_to_machine(phys) ((maddr_t)(phys))
42565 +#define machine_to_phys(mach) ((paddr_t)(mach))
42566 +#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot)
42567 +#define __pte_ma(x) __pte(x)
42568 +
42569 +#endif /* !CONFIG_XEN */
42570 +
42571 +/* VIRT <-> MACHINE conversion */
42572 +#define virt_to_machine(v) (phys_to_machine(__pa(v)))
42573 +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
42574 +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
42575 +
42576 +#endif /* _X86_64_MADDR_H */
42577 +
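A worked example of the conversions above, with invented frame numbers: suppose the p2m table maps pfn 0x1234 to mfn 0xabcd and the page is local.

        /*
         * paddr_t phys    = (0x1234UL << PAGE_SHIFT) | 0x56;  -> 0x1234056
         * maddr_t machine = phys_to_machine(phys);            -> 0xabcd056
         * paddr_t back    = machine_to_phys(machine);         -> 0x1234056 again,
         *                                                        since m2p(0xabcd) == 0x1234
         */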
42578 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_64.h
42579 ===================================================================
42580 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
42581 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_64.h 2007-06-12 13:14:13.000000000 +0200
42582 @@ -0,0 +1,136 @@
42583 +#ifndef __X86_64_MMU_CONTEXT_H
42584 +#define __X86_64_MMU_CONTEXT_H
42585 +
42586 +#include <asm/desc.h>
42587 +#include <asm/atomic.h>
42588 +#include <asm/pgalloc.h>
42589 +#include <asm/page.h>
42590 +#include <asm/pda.h>
42591 +#include <asm/pgtable.h>
42592 +#include <asm/tlbflush.h>
42593 +
42594 +/*
42595 + * possibly do the LDT unload here?
42596 + */
42597 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
42598 +void destroy_context(struct mm_struct *mm);
42599 +
42600 +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
42601 +{
42602 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
42603 + if (read_pda(mmu_state) == TLBSTATE_OK)
42604 + write_pda(mmu_state, TLBSTATE_LAZY);
42605 +#endif
42606 +}
42607 +
42608 +#define prepare_arch_switch(next) __prepare_arch_switch()
42609 +
42610 +static inline void __prepare_arch_switch(void)
42611 +{
42612 + /*
42613 + * Save away %es, %ds, %fs and %gs. Must happen before reload
42614 + * of cr3/ldt (i.e., not in __switch_to).
42615 + */
42616 + __asm__ __volatile__ (
42617 + "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
42618 + : "=m" (current->thread.es),
42619 + "=m" (current->thread.ds),
42620 + "=m" (current->thread.fsindex),
42621 + "=m" (current->thread.gsindex) );
42622 +
42623 + if (current->thread.ds)
42624 + __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
42625 +
42626 + if (current->thread.es)
42627 + __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
42628 +
42629 + if (current->thread.fsindex) {
42630 + __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
42631 + current->thread.fs = 0;
42632 + }
42633 +
42634 + if (current->thread.gsindex) {
42635 + load_gs_index(0);
42636 + current->thread.gs = 0;
42637 + }
42638 +}
42639 +
42640 +extern void mm_pin(struct mm_struct *mm);
42641 +extern void mm_unpin(struct mm_struct *mm);
42642 +void mm_pin_all(void);
42643 +
42644 +static inline void load_cr3(pgd_t *pgd)
42645 +{
42646 + asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
42647 + "memory");
42648 +}
42649 +
42650 +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
42651 + struct task_struct *tsk)
42652 +{
42653 + unsigned cpu = smp_processor_id();
42654 + struct mmuext_op _op[3], *op = _op;
42655 +
42656 + if (likely(prev != next)) {
42657 + BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
42658 + !next->context.pinned);
42659 +
42660 + /* stop flush ipis for the previous mm */
42661 + cpu_clear(cpu, prev->cpu_vm_mask);
42662 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
42663 + write_pda(mmu_state, TLBSTATE_OK);
42664 + write_pda(active_mm, next);
42665 +#endif
42666 + cpu_set(cpu, next->cpu_vm_mask);
42667 +
42668 + /* load_cr3(next->pgd) */
42669 + op->cmd = MMUEXT_NEW_BASEPTR;
42670 + op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
42671 + op++;
42672 +
42673 + /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
42674 + op->cmd = MMUEXT_NEW_USER_BASEPTR;
42675 + op->arg1.mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT);
42676 + op++;
42677 +
42678 + if (unlikely(next->context.ldt != prev->context.ldt)) {
42679 + /* load_LDT_nolock(&next->context, cpu) */
42680 + op->cmd = MMUEXT_SET_LDT;
42681 + op->arg1.linear_addr = (unsigned long)next->context.ldt;
42682 + op->arg2.nr_ents = next->context.size;
42683 + op++;
42684 + }
42685 +
42686 + BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
42687 + }
42688 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
42689 + else {
42690 + write_pda(mmu_state, TLBSTATE_OK);
42691 + if (read_pda(active_mm) != next)
42692 + out_of_line_bug();
42693 + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
42694 + /* We were in lazy tlb mode and leave_mm disabled
42695 + * tlb flush IPI delivery. We must reload CR3
42696 + * to make sure we don't use freed page tables.
42697 + */
42698 + load_cr3(next->pgd);
42699 + xen_new_user_pt(__pa(__user_pgd(next->pgd)));
42700 + load_LDT_nolock(&next->context, cpu);
42701 + }
42702 + }
42703 +#endif
42704 +}
42705 +
42706 +#define deactivate_mm(tsk,mm) do { \
42707 + load_gs_index(0); \
42708 + asm volatile("movl %0,%%fs"::"r"(0)); \
42709 +} while(0)
42710 +
42711 +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
42712 +{
42713 + if (!next->context.pinned)
42714 + mm_pin(next);
42715 + switch_mm(prev, next, NULL);
42716 +}
42717 +
42718 +#endif
42719 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/page_64.h
42720 ===================================================================
42721 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
42722 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/page_64.h 2008-04-02 12:34:02.000000000 +0200
42723 @@ -0,0 +1,212 @@
42724 +#ifndef _X86_64_PAGE_H
42725 +#define _X86_64_PAGE_H
42726 +
42727 +/* #include <linux/string.h> */
42728 +#ifndef __ASSEMBLY__
42729 +#include <linux/kernel.h>
42730 +#include <linux/types.h>
42731 +#include <asm/bug.h>
42732 +#endif
42733 +#include <xen/interface/xen.h>
42734 +
42735 +/*
42736 + * Need to repeat this here in order to not include pgtable.h (which in turn
42737 + * depends on definitions made here), but to be able to use the symbolic names
42738 + * below. The preprocessor will warn if the two definitions aren't identical.
42739 + */
42740 +#define _PAGE_PRESENT 0x001
42741 +#define _PAGE_IO 0x200
42742 +
42743 +/* PAGE_SHIFT determines the page size */
42744 +#define PAGE_SHIFT 12
42745 +#ifdef __ASSEMBLY__
42746 +#define PAGE_SIZE (0x1 << PAGE_SHIFT)
42747 +#else
42748 +#define PAGE_SIZE (1UL << PAGE_SHIFT)
42749 +#endif
42750 +#define PAGE_MASK (~(PAGE_SIZE-1))
42751 +
42752 +/* See Documentation/x86_64/mm.txt for a description of the memory map. */
42753 +#define __PHYSICAL_MASK_SHIFT 46
42754 +#define __PHYSICAL_MASK ((1UL << __PHYSICAL_MASK_SHIFT) - 1)
42755 +#define __VIRTUAL_MASK_SHIFT 48
42756 +#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
42757 +
42758 +#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
42759 +
42760 +#define THREAD_ORDER 1
42761 +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
42762 +#define CURRENT_MASK (~(THREAD_SIZE-1))
42763 +
42764 +#define EXCEPTION_STACK_ORDER 0
42765 +#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
42766 +
42767 +#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
42768 +#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
42769 +
42770 +#define IRQSTACK_ORDER 2
42771 +#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
42772 +
42773 +#define STACKFAULT_STACK 1
42774 +#define DOUBLEFAULT_STACK 2
42775 +#define NMI_STACK 3
42776 +#define DEBUG_STACK 4
42777 +#define MCE_STACK 5
42778 +#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
42779 +
42780 +#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
42781 +#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
42782 +
42783 +#define HPAGE_SHIFT PMD_SHIFT
42784 +#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
42785 +#define HPAGE_MASK (~(HPAGE_SIZE - 1))
42786 +#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
42787 +
42788 +#ifdef __KERNEL__
42789 +#ifndef __ASSEMBLY__
42790 +
42791 +extern unsigned long end_pfn;
42792 +
42793 +#include <asm/maddr.h>
42794 +
42795 +void clear_page(void *);
42796 +void copy_page(void *, void *);
42797 +
42798 +#define clear_user_page(page, vaddr, pg) clear_page(page)
42799 +#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
42800 +
42801 +#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
42802 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
42803 +
42804 +/*
42805 + * These are used to make use of C type-checking..
42806 + */
42807 +typedef struct { unsigned long pte; } pte_t;
42808 +typedef struct { unsigned long pmd; } pmd_t;
42809 +typedef struct { unsigned long pud; } pud_t;
42810 +typedef struct { unsigned long pgd; } pgd_t;
42811 +#define PTE_MASK PHYSICAL_PAGE_MASK
42812 +
42813 +typedef struct { unsigned long pgprot; } pgprot_t;
42814 +
42815 +#define __pte_val(x) ((x).pte)
42816 +#define pte_val(x) ((__pte_val(x) & (_PAGE_PRESENT|_PAGE_IO)) \
42817 + == _PAGE_PRESENT ? \
42818 + pte_machine_to_phys(__pte_val(x)) : \
42819 + __pte_val(x))
42820 +
42821 +#define __pmd_val(x) ((x).pmd)
42822 +static inline unsigned long pmd_val(pmd_t x)
42823 +{
42824 + unsigned long ret = __pmd_val(x);
42825 +#if CONFIG_XEN_COMPAT <= 0x030002
42826 + if (ret) ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
42827 +#else
42828 + if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
42829 +#endif
42830 + return ret;
42831 +}
42832 +
42833 +#define __pud_val(x) ((x).pud)
42834 +static inline unsigned long pud_val(pud_t x)
42835 +{
42836 + unsigned long ret = __pud_val(x);
42837 + if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
42838 + return ret;
42839 +}
42840 +
42841 +#define __pgd_val(x) ((x).pgd)
42842 +static inline unsigned long pgd_val(pgd_t x)
42843 +{
42844 + unsigned long ret = __pgd_val(x);
42845 + if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
42846 + return ret;
42847 +}
42848 +
42849 +#define pgprot_val(x) ((x).pgprot)
42850 +
42851 +static inline pte_t __pte(unsigned long x)
42852 +{
42853 + if ((x & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
42854 + x = pte_phys_to_machine(x);
42855 + return ((pte_t) { (x) });
42856 +}
42857 +
42858 +static inline pmd_t __pmd(unsigned long x)
42859 +{
42860 + if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
42861 + return ((pmd_t) { (x) });
42862 +}
42863 +
42864 +static inline pud_t __pud(unsigned long x)
42865 +{
42866 + if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
42867 + return ((pud_t) { (x) });
42868 +}
42869 +
42870 +static inline pgd_t __pgd(unsigned long x)
42871 +{
42872 + if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
42873 + return ((pgd_t) { (x) });
42874 +}
42875 +
42876 +#define __pgprot(x) ((pgprot_t) { (x) } )
42877 +
42878 +#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START)
42879 +#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
42880 +#define __START_KERNEL_map 0xffffffff80000000UL
42881 +#define __PAGE_OFFSET 0xffff880000000000UL
42882 +
42883 +#else
42884 +#define __PHYSICAL_START CONFIG_PHYSICAL_START
42885 +#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
42886 +#define __START_KERNEL_map 0xffffffff80000000
42887 +#define __PAGE_OFFSET 0xffff880000000000
42888 +#endif /* !__ASSEMBLY__ */
42889 +
42890 +#if CONFIG_XEN_COMPAT <= 0x030002
42891 +#undef LOAD_OFFSET
42892 +#define LOAD_OFFSET 0
42893 +#endif
42894 +
42895 +/* to align the pointer to the (next) page boundary */
42896 +#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
42897 +
42898 +#define KERNEL_TEXT_SIZE (40UL*1024*1024)
42899 +#define KERNEL_TEXT_START 0xffffffff80000000UL
42900 +
42901 +#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
42902 +
42903 +/* Note: __pa(&symbol_visible_to_c) should always be replaced with __pa_symbol.
42904 + Otherwise you risk miscompilation. */
42905 +#define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET)
42906 +/* __pa_symbol should be used for C visible symbols.
42907 + This seems to be the official gcc blessed way to do such arithmetic. */
42908 +#define __pa_symbol(x) \
42909 + ({unsigned long v; \
42910 + asm("" : "=r" (v) : "0" (x)); \
42911 + __pa(v); })
42912 +
42913 +#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
42914 +#define __boot_va(x) __va(x)
42915 +#define __boot_pa(x) __pa(x)
42916 +#ifdef CONFIG_FLATMEM
42917 +#define pfn_valid(pfn) ((pfn) < end_pfn)
42918 +#endif
42919 +
42920 +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
42921 +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
42922 +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
42923 +
42924 +#define VM_DATA_DEFAULT_FLAGS \
42925 + (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
42926 + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
42927 +
42928 +#define __HAVE_ARCH_GATE_AREA 1
42929 +
42930 +#include <asm-generic/memory_model.h>
42931 +#include <asm-generic/page.h>
42932 +
42933 +#endif /* __KERNEL__ */
42934 +
42935 +#endif /* _X86_64_PAGE_H */
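For illustration, how the __pa()/__va() arithmetic above works out; the addresses are examples only.

        /*
         * __pa(0xffff880001234000UL) == 0x1234000   (direct mapping, PAGE_OFFSET based)
         * __pa(0xffffffff80201000UL) == 0x201000    (kernel text, __START_KERNEL_map based)
         * __va(0x1234000UL)          == (void *)0xffff880001234000UL
         *
         * __va() always returns a direct-mapping address, so __va(__pa(x)) == x
         * only holds for direct-mapping pointers, not for kernel-text symbols.
         */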
42936 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pci_64.h
42937 ===================================================================
42938 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
42939 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pci_64.h 2007-09-14 11:14:51.000000000 +0200
42940 @@ -0,0 +1,168 @@
42941 +#ifndef __x8664_PCI_H
42942 +#define __x8664_PCI_H
42943 +
42944 +#include <asm/io.h>
42945 +
42946 +#ifdef __KERNEL__
42947 +
42948 +#include <linux/mm.h> /* for struct page */
42949 +
42950 +/* Can be used to override the logic in pci_scan_bus for skipping
42951 + already-configured bus numbers - to be used for buggy BIOSes
42952 + or architectures with incomplete PCI setup by the loader */
42953 +
42954 +#ifdef CONFIG_PCI
42955 +extern unsigned int pcibios_assign_all_busses(void);
42956 +#else
42957 +#define pcibios_assign_all_busses() 0
42958 +#endif
42959 +
42960 +#include <asm/hypervisor.h>
42961 +#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain())
42962 +
42963 +extern unsigned long pci_mem_start;
42964 +#define PCIBIOS_MIN_IO 0x1000
42965 +#define PCIBIOS_MIN_MEM (pci_mem_start)
42966 +
42967 +#define PCIBIOS_MIN_CARDBUS_IO 0x4000
42968 +
42969 +void pcibios_config_init(void);
42970 +struct pci_bus * pcibios_scan_root(int bus);
42971 +extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value);
42972 +extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value);
42973 +
42974 +void pcibios_set_master(struct pci_dev *dev);
42975 +void pcibios_penalize_isa_irq(int irq, int active);
42976 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
42977 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
42978 +
42979 +#include <linux/types.h>
42980 +#include <linux/slab.h>
42981 +#include <asm/scatterlist.h>
42982 +#include <linux/string.h>
42983 +#include <asm/page.h>
42984 +
42985 +extern void pci_iommu_alloc(void);
42986 +extern int iommu_setup(char *opt);
42987 +
42988 +/* The PCI address space does equal the physical memory
42989 + * address space. The networking and block device layers use
42990 + * this boolean for bounce buffer decisions
42991 + *
42992 + * On AMD64 it mostly equals, but we set it to zero if a hardware
42993 + * IOMMU (gart) or software IOMMU (swiotlb) is available.
42994 + */
42995 +#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
42996 +
42997 +#if defined(CONFIG_IOMMU) || defined(CONFIG_CALGARY_IOMMU)
42998 +
42999 +/*
43000 + * x86-64 always supports DAC, but sometimes it is useful to force
43001 + * devices through the IOMMU to get automatic sg list merging.
43002 + * Optional right now.
43003 + */
43004 +extern int iommu_sac_force;
43005 +#define pci_dac_dma_supported(pci_dev, mask) (!iommu_sac_force)
43006 +
43007 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
43008 + dma_addr_t ADDR_NAME;
43009 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
43010 + __u32 LEN_NAME;
43011 +#define pci_unmap_addr(PTR, ADDR_NAME) \
43012 + ((PTR)->ADDR_NAME)
43013 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
43014 + (((PTR)->ADDR_NAME) = (VAL))
43015 +#define pci_unmap_len(PTR, LEN_NAME) \
43016 + ((PTR)->LEN_NAME)
43017 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
43018 + (((PTR)->LEN_NAME) = (VAL))
43019 +
43020 +#elif defined(CONFIG_SWIOTLB)
43021 +
43022 +#define pci_dac_dma_supported(pci_dev, mask) 1
43023 +
43024 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
43025 + dma_addr_t ADDR_NAME;
43026 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
43027 + __u32 LEN_NAME;
43028 +#define pci_unmap_addr(PTR, ADDR_NAME) \
43029 + ((PTR)->ADDR_NAME)
43030 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
43031 + (((PTR)->ADDR_NAME) = (VAL))
43032 +#define pci_unmap_len(PTR, LEN_NAME) \
43033 + ((PTR)->LEN_NAME)
43034 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
43035 + (((PTR)->LEN_NAME) = (VAL))
43036 +
43037 +#else
43038 +/* No IOMMU */
43039 +
43040 +#define pci_dac_dma_supported(pci_dev, mask) 1
43041 +
43042 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
43043 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
43044 +#define pci_unmap_addr(PTR, ADDR_NAME) (0)
43045 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
43046 +#define pci_unmap_len(PTR, LEN_NAME) (0)
43047 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
43048 +
43049 +#endif
43050 +
43051 +#include <asm-generic/pci-dma-compat.h>
43052 +
43053 +static inline dma64_addr_t
43054 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
43055 +{
43056 + return ((dma64_addr_t) page_to_phys(page) +
43057 + (dma64_addr_t) offset);
43058 +}
43059 +
43060 +static inline struct page *
43061 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
43062 +{
43063 + return virt_to_page(__va(dma_addr));
43064 +}
43065 +
43066 +static inline unsigned long
43067 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
43068 +{
43069 + return (dma_addr & ~PAGE_MASK);
43070 +}
43071 +
43072 +static inline void
43073 +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
43074 +{
43075 +}
43076 +
43077 +static inline void
43078 +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
43079 +{
43080 + flush_write_buffers();
43081 +}
43082 +
43083 +#ifdef CONFIG_PCI
43084 +static inline void pci_dma_burst_advice(struct pci_dev *pdev,
43085 + enum pci_dma_burst_strategy *strat,
43086 + unsigned long *strategy_parameter)
43087 +{
43088 + *strat = PCI_DMA_BURST_INFINITY;
43089 + *strategy_parameter = ~0UL;
43090 +}
43091 +#endif
43092 +
43093 +#define HAVE_PCI_MMAP
43094 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
43095 + enum pci_mmap_state mmap_state, int write_combine);
43096 +
43097 +static inline void pcibios_add_platform_entries(struct pci_dev *dev)
43098 +{
43099 +}
43100 +
43101 +#endif /* __KERNEL__ */
43102 +
43103 +/* generic pci stuff */
43104 +#ifdef CONFIG_PCI
43105 +#include <asm-generic/pci.h>
43106 +#endif
43107 +
43108 +#endif /* __x8664_PCI_H */
43109 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_64.h
43110 ===================================================================
43111 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
43112 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_64.h 2007-06-18 08:38:13.000000000 +0200
43113 @@ -0,0 +1,204 @@
43114 +#ifndef _X86_64_PGALLOC_H
43115 +#define _X86_64_PGALLOC_H
43116 +
43117 +#include <asm/fixmap.h>
43118 +#include <asm/pda.h>
43119 +#include <linux/threads.h>
43120 +#include <linux/mm.h>
43121 +#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
43122 +
43123 +#include <xen/features.h>
43124 +void make_page_readonly(void *va, unsigned int feature);
43125 +void make_page_writable(void *va, unsigned int feature);
43126 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
43127 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
43128 +
43129 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
43130 +
43131 +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
43132 +{
43133 + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
43134 +}
43135 +
43136 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
43137 +{
43138 + if (unlikely((mm)->context.pinned)) {
43139 + BUG_ON(HYPERVISOR_update_va_mapping(
43140 + (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
43141 + pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
43142 + set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
43143 + } else {
43144 + *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
43145 + }
43146 +}
43147 +
43148 +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
43149 +{
43150 + if (unlikely((mm)->context.pinned)) {
43151 + BUG_ON(HYPERVISOR_update_va_mapping(
43152 + (unsigned long)pmd,
43153 + pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
43154 + PAGE_KERNEL_RO), 0));
43155 + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
43156 + } else {
43157 + *(pud) = __pud(_PAGE_TABLE | __pa(pmd));
43158 + }
43159 +}
43160 +
43161 +/*
43162 + * We need to use the batch mode here, but pgd_populate() won't
43163 + * be called frequently.
43164 + */
43165 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
43166 +{
43167 + if (unlikely((mm)->context.pinned)) {
43168 + BUG_ON(HYPERVISOR_update_va_mapping(
43169 + (unsigned long)pud,
43170 + pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
43171 + PAGE_KERNEL_RO), 0));
43172 + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
43173 + set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
43174 + } else {
43175 + *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
43176 + *(__user_pgd(pgd)) = *(pgd);
43177 + }
43178 +}
43179 +
43180 +extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr);
43181 +extern void pte_free(struct page *pte);
43182 +
43183 +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
43184 +{
43185 + struct page *pg;
43186 +
43187 + pg = pte_alloc_one(mm, addr);
43188 + return pg ? page_address(pg) : NULL;
43189 +}
43190 +
43191 +static inline void pmd_free(pmd_t *pmd)
43192 +{
43193 + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
43194 + pte_free(virt_to_page(pmd));
43195 +}
43196 +
43197 +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
43198 +{
43199 + struct page *pg;
43200 +
43201 + pg = pte_alloc_one(mm, addr);
43202 + return pg ? page_address(pg) : NULL;
43203 +}
43204 +
43205 +static inline void pud_free(pud_t *pud)
43206 +{
43207 + BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
43208 + pte_free(virt_to_page(pud));
43209 +}
43210 +
43211 +static inline void pgd_list_add(pgd_t *pgd)
43212 +{
43213 + struct page *page = virt_to_page(pgd);
43214 +
43215 + spin_lock(&pgd_lock);
43216 + page->index = (pgoff_t)pgd_list;
43217 + if (pgd_list)
43218 + pgd_list->private = (unsigned long)&page->index;
43219 + pgd_list = page;
43220 + page->private = (unsigned long)&pgd_list;
43221 + spin_unlock(&pgd_lock);
43222 +}
43223 +
43224 +static inline void pgd_list_del(pgd_t *pgd)
43225 +{
43226 + struct page *next, **pprev, *page = virt_to_page(pgd);
43227 +
43228 + spin_lock(&pgd_lock);
43229 + next = (struct page *)page->index;
43230 + pprev = (struct page **)page->private;
43231 + *pprev = next;
43232 + if (next)
43233 + next->private = (unsigned long)pprev;
43234 + spin_unlock(&pgd_lock);
43235 +}
43236 +
43237 +static inline pgd_t *pgd_alloc(struct mm_struct *mm)
43238 +{
43239 + /*
43240 + * We allocate two contiguous pages for kernel and user.
43241 + */
43242 + unsigned boundary;
43243 + pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1);
43244 + if (!pgd)
43245 + return NULL;
43246 + pgd_list_add(pgd);
43247 + /*
43248 + * Copy kernel pointers in from init.
43249 + * Could keep a freelist or slab cache of those because the kernel
43250 + * part never changes.
43251 + */
43252 + boundary = pgd_index(__PAGE_OFFSET);
43253 + memset(pgd, 0, boundary * sizeof(pgd_t));
43254 + memcpy(pgd + boundary,
43255 + init_level4_pgt + boundary,
43256 + (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
43257 +
43258 + memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
43259 + /*
43260 + * Set level3_user_pgt for vsyscall area
43261 + */
43262 + __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
43263 + __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
43264 + return pgd;
43265 +}
43266 +
43267 +static inline void pgd_free(pgd_t *pgd)
43268 +{
43269 + pte_t *ptep = virt_to_ptep(pgd);
43270 +
43271 + if (!pte_write(*ptep)) {
43272 + xen_pgd_unpin(__pa(pgd));
43273 + BUG_ON(HYPERVISOR_update_va_mapping(
43274 + (unsigned long)pgd,
43275 + pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
43276 + 0));
43277 + }
43278 +
43279 + ptep = virt_to_ptep(__user_pgd(pgd));
43280 +
43281 + if (!pte_write(*ptep)) {
43282 + xen_pgd_unpin(__pa(__user_pgd(pgd)));
43283 + BUG_ON(HYPERVISOR_update_va_mapping(
43284 + (unsigned long)__user_pgd(pgd),
43285 + pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT,
43286 + PAGE_KERNEL),
43287 + 0));
43288 + }
43289 +
43290 + pgd_list_del(pgd);
43291 + free_pages((unsigned long)pgd, 1);
43292 +}
43293 +
43294 +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
43295 +{
43296 + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
43297 + if (pte)
43298 + make_page_readonly(pte, XENFEAT_writable_page_tables);
43299 +
43300 + return pte;
43301 +}
43302 +
43303 +/* Should really implement gc for free page table pages. This could be
43304 + done with a reference count in struct page. */
43305 +
43306 +static inline void pte_free_kernel(pte_t *pte)
43307 +{
43308 + BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
43309 + make_page_writable(pte, XENFEAT_writable_page_tables);
43310 + free_page((unsigned long)pte);
43311 +}
43312 +
43313 +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
43314 +#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
43315 +#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
43316 +
43317 +#endif /* _X86_64_PGALLOC_H */
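A brief sketch of the layout pgd_alloc() above sets up, derived from the order-1 allocation and __user_pgd(); illustrative only.

        /*
         * pgd             -> page 0: kernel pgd (made read-only when the mm is pinned)
         * __user_pgd(pgd) -> page 1: user pgd   (pgd + PTRS_PER_PGD entries
         *                            == pgd + 512 * sizeof(pgd_t) == pgd + PAGE_SIZE)
         *
         * This is why pgd_free() unpins and restores write access on both
         * pages before handing the pair back with free_pages(pgd, 1).
         */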
43318 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_64.h
43319 ===================================================================
43320 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
43321 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_64.h 2008-07-21 11:00:33.000000000 +0200
43322 @@ -0,0 +1,583 @@
43323 +#ifndef _X86_64_PGTABLE_H
43324 +#define _X86_64_PGTABLE_H
43325 +
43326 +/*
43327 + * This file contains the functions and defines necessary to modify and use
43328 + * the x86-64 page table tree.
43329 + */
43330 +#include <asm/processor.h>
43331 +#include <asm/fixmap.h>
43332 +#include <asm/bitops.h>
43333 +#include <linux/threads.h>
43334 +#include <linux/sched.h>
43335 +#include <asm/pda.h>
43336 +#ifdef CONFIG_XEN
43337 +#include <asm/hypervisor.h>
43338 +
43339 +extern pud_t level3_user_pgt[512];
43340 +
43341 +extern void xen_init_pt(void);
43342 +
43343 +extern pte_t *lookup_address(unsigned long address);
43344 +
43345 +#define virt_to_ptep(va) \
43346 +({ \
43347 + pte_t *__ptep = lookup_address((unsigned long)(va)); \
43348 + BUG_ON(!__ptep || !pte_present(*__ptep)); \
43349 + __ptep; \
43350 +})
43351 +
43352 +#define arbitrary_virt_to_machine(va) \
43353 + (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
43354 + | ((unsigned long)(va) & (PAGE_SIZE - 1)))
43355 +#endif
43356 +
43357 +extern pud_t level3_kernel_pgt[512];
43358 +extern pud_t level3_physmem_pgt[512];
43359 +extern pud_t level3_ident_pgt[512];
43360 +extern pmd_t level2_kernel_pgt[512];
43361 +extern pgd_t init_level4_pgt[];
43362 +extern pgd_t boot_level4_pgt[];
43363 +extern unsigned long __supported_pte_mask;
43364 +
43365 +#define swapper_pg_dir init_level4_pgt
43366 +
43367 +extern int nonx_setup(char *str);
43368 +extern void paging_init(void);
43369 +extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
43370 +
43371 +extern unsigned long pgkern_mask;
43372 +
43373 +/*
43374 + * ZERO_PAGE is a global shared page that is always zero: used
43375 + * for zero-mapped memory areas etc..
43376 + */
43377 +extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
43378 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
43379 +
43380 +/*
43381 + * PGDIR_SHIFT determines what a top-level page table entry can map
43382 + */
43383 +#define PGDIR_SHIFT 39
43384 +#define PTRS_PER_PGD 512
43385 +
43386 +/*
43387 + * 3rd level page
43388 + */
43389 +#define PUD_SHIFT 30
43390 +#define PTRS_PER_PUD 512
43391 +
43392 +/*
43393 + * PMD_SHIFT determines the size of the area a middle-level
43394 + * page table can map
43395 + */
43396 +#define PMD_SHIFT 21
43397 +#define PTRS_PER_PMD 512
43398 +
43399 +/*
43400 + * entries per page directory level
43401 + */
43402 +#define PTRS_PER_PTE 512
43403 +
43404 +#define pte_ERROR(e) \
43405 + printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
43406 + &(e), __pte_val(e), pte_pfn(e))
43407 +#define pmd_ERROR(e) \
43408 + printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
43409 + &(e), __pmd_val(e), pmd_pfn(e))
43410 +#define pud_ERROR(e) \
43411 + printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
43412 + &(e), __pud_val(e), (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
43413 +#define pgd_ERROR(e) \
43414 + printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
43415 + &(e), __pgd_val(e), (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
43416 +
43417 +#define pgd_none(x) (!__pgd_val(x))
43418 +#define pud_none(x) (!__pud_val(x))
43419 +
43420 +static inline void set_pte(pte_t *dst, pte_t val)
43421 +{
43422 + *dst = val;
43423 +}
43424 +
43425 +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
43426 +#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
43427 +#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
43428 +
43429 +static inline void pud_clear (pud_t * pud)
43430 +{
43431 + set_pud(pud, __pud(0));
43432 +}
43433 +
43434 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
43435 +
43436 +static inline void pgd_clear (pgd_t * pgd)
43437 +{
43438 + set_pgd(pgd, __pgd(0));
43439 + set_pgd(__user_pgd(pgd), __pgd(0));
43440 +}
43441 +
43442 +#define pud_page(pud) \
43443 + ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
43444 +
43445 +#define pte_same(a, b) ((a).pte == (b).pte)
43446 +
43447 +#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
43448 +
43449 +#define PMD_SIZE (1UL << PMD_SHIFT)
43450 +#define PMD_MASK (~(PMD_SIZE-1))
43451 +#define PUD_SIZE (1UL << PUD_SHIFT)
43452 +#define PUD_MASK (~(PUD_SIZE-1))
43453 +#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
43454 +#define PGDIR_MASK (~(PGDIR_SIZE-1))
43455 +
43456 +#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
43457 +#define FIRST_USER_ADDRESS 0
43458 +
43459 +#ifndef __ASSEMBLY__
43460 +#define MAXMEM 0x3fffffffffffUL
43461 +#define VMALLOC_START 0xffffc20000000000UL
43462 +#define VMALLOC_END 0xffffe1ffffffffffUL
43463 +#define MODULES_VADDR 0xffffffff88000000UL
43464 +#define MODULES_END 0xfffffffffff00000UL
43465 +#define MODULES_LEN (MODULES_END - MODULES_VADDR)
43466 +
43467 +#define _PAGE_BIT_PRESENT 0
43468 +#define _PAGE_BIT_RW 1
43469 +#define _PAGE_BIT_USER 2
43470 +#define _PAGE_BIT_PWT 3
43471 +#define _PAGE_BIT_PCD 4
43472 +#define _PAGE_BIT_ACCESSED 5
43473 +#define _PAGE_BIT_DIRTY 6
43474 +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
43475 +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
43476 +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
43477 +
43478 +#define _PAGE_PRESENT 0x001
43479 +#define _PAGE_RW 0x002
43480 +#define _PAGE_USER 0x004
43481 +#define _PAGE_PWT 0x008
43482 +#define _PAGE_PCD 0x010
43483 +#define _PAGE_ACCESSED 0x020
43484 +#define _PAGE_DIRTY 0x040
43485 +#define _PAGE_PSE 0x080 /* 2MB page */
43486 +#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
43487 +#define _PAGE_GLOBAL 0x100 /* Global TLB entry */
43488 +
43489 +#define _PAGE_PROTNONE 0x080 /* If not present */
43490 +#define _PAGE_NX (1UL<<_PAGE_BIT_NX)
43491 +
43492 +/* Mapped page is I/O or foreign and has no associated page struct. */
43493 +#define _PAGE_IO 0x200
43494 +
43495 +#if CONFIG_XEN_COMPAT <= 0x030002
43496 +extern unsigned int __kernel_page_user;
43497 +#else
43498 +#define __kernel_page_user 0
43499 +#endif
43500 +
43501 +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
43502 +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
43503 +
43504 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
43505 +
43506 +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
43507 +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
43508 +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
43509 +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
43510 +#define PAGE_COPY PAGE_COPY_NOEXEC
43511 +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
43512 +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
43513 +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
43514 +#define __PAGE_KERNEL \
43515 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
43516 +#define __PAGE_KERNEL_EXEC \
43517 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
43518 +#define __PAGE_KERNEL_NOCACHE \
43519 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
43520 +#define __PAGE_KERNEL_RO \
43521 + (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
43522 +#define __PAGE_KERNEL_VSYSCALL \
43523 + (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
43524 +#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
43525 + (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
43526 +#define __PAGE_KERNEL_LARGE \
43527 + (__PAGE_KERNEL | _PAGE_PSE)
43528 +#define __PAGE_KERNEL_LARGE_EXEC \
43529 + (__PAGE_KERNEL_EXEC | _PAGE_PSE)
43530 +
43531 +/*
43532 + * We don't support GLOBAL pages in xenolinux64
43533 + */
43534 +#define MAKE_GLOBAL(x) __pgprot((x))
43535 +
43536 +#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
43537 +#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
43538 +#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
43539 +#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
43540 +#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
43541 +#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
43542 +#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
43543 +#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
43544 +
43545 +/* xwr */
43546 +#define __P000 PAGE_NONE
43547 +#define __P001 PAGE_READONLY
43548 +#define __P010 PAGE_COPY
43549 +#define __P011 PAGE_COPY
43550 +#define __P100 PAGE_READONLY_EXEC
43551 +#define __P101 PAGE_READONLY_EXEC
43552 +#define __P110 PAGE_COPY_EXEC
43553 +#define __P111 PAGE_COPY_EXEC
43554 +
43555 +#define __S000 PAGE_NONE
43556 +#define __S001 PAGE_READONLY
43557 +#define __S010 PAGE_SHARED
43558 +#define __S011 PAGE_SHARED
43559 +#define __S100 PAGE_READONLY_EXEC
43560 +#define __S101 PAGE_READONLY_EXEC
43561 +#define __S110 PAGE_SHARED_EXEC
43562 +#define __S111 PAGE_SHARED_EXEC
43563 +
43564 +static inline unsigned long pgd_bad(pgd_t pgd)
43565 +{
43566 + unsigned long val = __pgd_val(pgd);
43567 + val &= ~PTE_MASK;
43568 + val &= ~(_PAGE_USER | _PAGE_DIRTY);
43569 + return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
43570 +}
43571 +
43572 +static inline unsigned long pud_bad(pud_t pud)
43573 +{
43574 + unsigned long val = __pud_val(pud);
43575 + val &= ~PTE_MASK;
43576 + val &= ~(_PAGE_USER | _PAGE_DIRTY);
43577 + return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
43578 +}
43579 +
43580 +#define set_pte_at(_mm,addr,ptep,pteval) do { \
43581 + if (((_mm) != current->mm && (_mm) != &init_mm) || \
43582 + HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
43583 + set_pte((ptep), (pteval)); \
43584 +} while (0)
43585 +
43586 +#define pte_none(x) (!(x).pte)
43587 +#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
43588 +#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
43589 +
43590 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
43591 +
43592 +#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
43593 +#define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
43594 + __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
43595 +#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? end_pfn : \
43596 + (_pte).pte & _PAGE_PRESENT ? \
43597 + mfn_to_local_pfn(__pte_mfn(_pte)) : \
43598 + __pte_mfn(_pte))
43599 +
43600 +#define pte_page(x) pfn_to_page(pte_pfn(x))
43601 +
43602 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
43603 +{
43604 + unsigned long pte = page_nr << PAGE_SHIFT;
43605 + pte |= pgprot_val(pgprot);
43606 + pte &= __supported_pte_mask;
43607 + return __pte(pte);
43608 +}
43609 +
43610 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
43611 +{
43612 + pte_t pte = *ptep;
43613 + if (!pte_none(pte)) {
43614 + if ((mm != &init_mm) ||
43615 + HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
43616 + pte = __pte_ma(xchg(&ptep->pte, 0));
43617 + }
43618 + return pte;
43619 +}
43620 +
43621 +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
43622 +{
43623 + if (full) {
43624 + pte_t pte = *ptep;
43625 + if (mm->context.pinned)
43626 + xen_l1_entry_update(ptep, __pte(0));
43627 + else
43628 + *ptep = __pte(0);
43629 + return pte;
43630 + }
43631 + return ptep_get_and_clear(mm, addr, ptep);
43632 +}
43633 +
43634 +#define ptep_clear_flush(vma, addr, ptep) \
43635 +({ \
43636 + pte_t *__ptep = (ptep); \
43637 + pte_t __res = *__ptep; \
43638 + if (!pte_none(__res) && \
43639 + ((vma)->vm_mm != current->mm || \
43640 + HYPERVISOR_update_va_mapping(addr, __pte(0), \
43641 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
43642 + UVMF_INVLPG|UVMF_MULTI))) { \
43643 + __ptep->pte = 0; \
43644 + flush_tlb_page(vma, addr); \
43645 + } \
43646 + __res; \
43647 +})
43648 +
43649 +/*
43650 + * The following only work if pte_present() is true.
43651 + * Undefined behaviour if not..
43652 + */
43653 +#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
43654 +static inline int pte_user(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
43655 +static inline int pte_read(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
43656 +static inline int pte_exec(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
43657 +static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
43658 +static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
43659 +static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
43660 +static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
43661 +static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
43662 +
43663 +static inline pte_t pte_rdprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; }
43664 +static inline pte_t pte_exprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; }
43665 +static inline pte_t pte_mkclean(pte_t pte) { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
43666 +static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
43667 +static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; }
43668 +static inline pte_t pte_mkread(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
43669 +static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
43670 +static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
43671 +static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
43672 +static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; }
43673 +static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; }
43674 +
43675 +#define ptep_test_and_clear_dirty(vma, addr, ptep) \
43676 +({ \
43677 + pte_t __pte = *(ptep); \
43678 + int __ret = pte_dirty(__pte); \
43679 + if (__ret) \
43680 + set_pte_at((vma)->vm_mm, addr, ptep, pte_mkclean(__pte)); \
43681 + __ret; \
43682 +})
43683 +
43684 +#define ptep_test_and_clear_young(vma, addr, ptep) \
43685 +({ \
43686 + pte_t __pte = *(ptep); \
43687 + int __ret = pte_young(__pte); \
43688 + if (__ret) \
43689 + set_pte_at((vma)->vm_mm, addr, ptep, pte_mkold(__pte)); \
43690 + __ret; \
43691 +})
43692 +
43693 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
43694 +{
43695 + pte_t pte = *ptep;
43696 + if (pte_write(pte))
43697 + set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
43698 +}
43699 +
43700 +/*
43701 + * Macro to mark a page protection value as "uncacheable".
43702 + */
43703 +#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
43704 +
43705 +static inline int pmd_large(pmd_t pte) {
43706 + return (__pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE;
43707 +}
43708 +
43709 +
43710 +/*
43711 + * Conversion functions: convert a page and protection to a page entry,
43712 + * and a page entry and page directory to the page they refer to.
43713 + */
43714 +
43715 +/*
43716 + * Level 4 access.
43717 + * Never use these in the common code.
43718 + */
43719 +#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
43720 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
43721 +#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
43722 +#define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
43723 +#define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
43724 +#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
43725 +
43726 +/* PUD - Level3 access */
43727 +/* to find an entry in a page-table-directory. */
43728 +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
43729 +#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
43730 +#define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
43731 +
43732 +/* PMD - Level 2 access */
43733 +#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
43734 +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
43735 +
43736 +#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
43737 +#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
43738 + pmd_index(address))
43739 +#define pmd_none(x) (!__pmd_val(x))
43740 +#if CONFIG_XEN_COMPAT <= 0x030002
43741 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since writable
43742 + page tables (wr.p.t.) can temporarily clear it. */
43743 +#define pmd_present(x) (__pmd_val(x))
43744 +#else
43745 +#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
43746 +#endif
43747 +#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
43748 +#define pmd_bad(x) ((__pmd_val(x) & ~(PTE_MASK | _PAGE_USER | _PAGE_PRESENT)) \
43749 + != (_KERNPG_TABLE & ~(_PAGE_USER | _PAGE_PRESENT)))
43750 +#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
43751 +#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
43752 +
43753 +#define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
43754 +#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
43755 +#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
43756 +
43757 +/* PTE - Level 1 access. */
43758 +
43759 +/* page, protection -> pte */
43760 +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
43761 +#define mk_pte_huge(entry) (__pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
43762 +
43763 +/* physical address -> PTE */
43764 +static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot)
43765 +{
43766 + unsigned long pteval;
43767 + pteval = physpage | pgprot_val(pgprot);
43768 + return __pte(pteval);
43769 +}
43770 +
43771 +/* Change flags of a PTE */
43772 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
43773 +{
43774 + /*
43775 + * Since this might change the present bit (which controls whether
43776 + * a pte_t object has undergone p2m translation), we must use
43777 + * pte_val() on the input pte and __pte() for the return value.
43778 + */
43779 + unsigned long pteval = pte_val(pte);
43780 +
43781 + pteval &= _PAGE_CHG_MASK;
43782 + pteval |= pgprot_val(newprot);
43783 + pteval &= __supported_pte_mask;
43784 + return __pte(pteval);
43785 +}
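/*
 * Sketch of a typical pte_modify() caller (the protection value is chosen
 * for illustration): downgrade a pte to read-only while keeping the bits
 * preserved by _PAGE_CHG_MASK (accessed/dirty/_PAGE_IO).
 */
static inline pte_t foo_pte_make_readonly(pte_t pte)
{
        return pte_modify(pte, PAGE_READONLY);
}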
43786 +
43787 +#define pte_index(address) \
43788 + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
43789 +#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \
43790 + pte_index(address))
43791 +
43792 +/* x86-64 always has all page tables mapped. */
43793 +#define pte_offset_map(dir,address) pte_offset_kernel(dir,address)
43794 +#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address)
43795 +#define pte_unmap(pte) /* NOP */
43796 +#define pte_unmap_nested(pte) /* NOP */
43797 +
43798 +#define update_mmu_cache(vma,address,pte) do { } while (0)
43799 +
43800 +/*
43801 + * Rules for using ptep_establish: the pte MUST be a user pte, and
43802 + * must be a present->present transition.
43803 + */
43804 +#define __HAVE_ARCH_PTEP_ESTABLISH
43805 +#define ptep_establish(vma, address, ptep, pteval) \
43806 + do { \
43807 + if ( likely((vma)->vm_mm == current->mm) ) { \
43808 + BUG_ON(HYPERVISOR_update_va_mapping(address, \
43809 + pteval, \
43810 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
43811 + UVMF_INVLPG|UVMF_MULTI)); \
43812 + } else { \
43813 + xen_l1_entry_update(ptep, pteval); \
43814 + flush_tlb_page(vma, address); \
43815 + } \
43816 + } while (0)
43817 +
43818 +/* We only update the dirty/accessed state if we set
43819 + * the dirty bit by hand in the kernel, since the hardware
43820 + * will do the accessed bit for us, and we don't want to
43821 + * race with other CPUs that might be updating the dirty
43822 + * bit at the same time. */
43823 +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
43824 +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
43825 + do { \
43826 + if (dirty) \
43827 + ptep_establish(vma, address, ptep, entry); \
43828 + } while (0)
43829 +
43830 +/* Encode and de-code a swap entry */
43831 +#define __swp_type(x) (((x).val >> 1) & 0x3f)
43832 +#define __swp_offset(x) ((x).val >> 8)
43833 +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
43834 +#define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
43835 +#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
43836 +
43837 +extern spinlock_t pgd_lock;
43838 +extern struct page *pgd_list;
43839 +void vmalloc_sync_all(void);
43840 +
43841 +#endif /* !__ASSEMBLY__ */
43842 +
43843 +extern int kern_addr_valid(unsigned long addr);
43844 +
43845 +#define DOMID_LOCAL (0xFFFFU)
43846 +
43847 +struct vm_area_struct;
43848 +
43849 +int direct_remap_pfn_range(struct vm_area_struct *vma,
43850 + unsigned long address,
43851 + unsigned long mfn,
43852 + unsigned long size,
43853 + pgprot_t prot,
43854 + domid_t domid);
43855 +
43856 +int direct_kernel_remap_pfn_range(unsigned long address,
43857 + unsigned long mfn,
43858 + unsigned long size,
43859 + pgprot_t prot,
43860 + domid_t domid);
43861 +
43862 +int create_lookup_pte_addr(struct mm_struct *mm,
43863 + unsigned long address,
43864 + uint64_t *ptep);
43865 +
43866 +int touch_pte_range(struct mm_struct *mm,
43867 + unsigned long address,
43868 + unsigned long size);
43869 +
43870 +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
43871 + unsigned long addr, unsigned long end, pgprot_t newprot);
43872 +
43873 +#define arch_change_pte_range(mm, pmd, addr, end, newprot) \
43874 + xen_change_pte_range(mm, pmd, addr, end, newprot)
43875 +
43876 +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
43877 + direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
43878 +
43879 +#define MK_IOSPACE_PFN(space, pfn) (pfn)
43880 +#define GET_IOSPACE(pfn) 0
43881 +#define GET_PFN(pfn) (pfn)
43882 +
43883 +#define HAVE_ARCH_UNMAPPED_AREA
43884 +
43885 +#define pgtable_cache_init() do { } while (0)
43886 +#define check_pgt_cache() do { } while (0)
43887 +
43888 +#define PAGE_AGP PAGE_KERNEL_NOCACHE
43889 +#define HAVE_PAGE_AGP 1
43890 +
43891 +/* fs/proc/kcore.c */
43892 +#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
43893 +#define kc_offset_to_vaddr(o) \
43894 + (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
43895 +
43896 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
43897 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
43898 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
43899 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
43900 +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
43901 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
43902 +#define __HAVE_ARCH_PTE_SAME
43903 +#include <asm-generic/pgtable.h>
43904 +
43905 +#endif /* _X86_64_PGTABLE_H */
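
The level-by-level accessors above compose into a software page-table walk. As a
minimal sketch (not part of the patch; the helper name lookup_kernel_pte is made up
for illustration), a kernel-virtual address can be resolved to its PTE using only
the macros defined in this header:

static pte_t *lookup_kernel_pte(unsigned long address)
{
	pgd_t *pgd = pgd_offset_k(address);	/* init_level4_pgt + pgd_index() */
	pud_t *pud;
	pmd_t *pmd;

	if (!pgd_present(*pgd))
		return NULL;
	pud = pud_offset(pgd, address);		/* level 3 */
	if (!pud_present(*pud))
		return NULL;
	pmd = pmd_offset(pud, address);		/* level 2 */
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		return NULL;
	return pte_offset_kernel(pmd, address);	/* level 1 entry */
}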
43906 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/processor_64.h
43907 ===================================================================
43908 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
43909 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/processor_64.h 2008-03-06 08:54:32.000000000 +0100
43910 @@ -0,0 +1,502 @@
43911 +/*
43912 + * include/asm-x86_64/processor.h
43913 + *
43914 + * Copyright (C) 1994 Linus Torvalds
43915 + */
43916 +
43917 +#ifndef __ASM_X86_64_PROCESSOR_H
43918 +#define __ASM_X86_64_PROCESSOR_H
43919 +
43920 +#include <asm/segment.h>
43921 +#include <asm/page.h>
43922 +#include <asm/types.h>
43923 +#include <asm/sigcontext.h>
43924 +#include <asm/cpufeature.h>
43925 +#include <linux/threads.h>
43926 +#include <asm/msr.h>
43927 +#include <asm/current.h>
43928 +#include <asm/system.h>
43929 +#include <asm/mmsegment.h>
43930 +#include <asm/percpu.h>
43931 +#include <linux/personality.h>
43932 +#include <linux/cpumask.h>
43933 +
43934 +#define TF_MASK 0x00000100
43935 +#define IF_MASK 0x00000200
43936 +#define IOPL_MASK 0x00003000
43937 +#define NT_MASK 0x00004000
43938 +#define VM_MASK 0x00020000
43939 +#define AC_MASK 0x00040000
43940 +#define VIF_MASK 0x00080000 /* virtual interrupt flag */
43941 +#define VIP_MASK 0x00100000 /* virtual interrupt pending */
43942 +#define ID_MASK 0x00200000
43943 +
43944 +#define desc_empty(desc) \
43945 + (!((desc)->a | (desc)->b))
43946 +
43947 +#define desc_equal(desc1, desc2) \
43948 + (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
43949 +
43950 +/*
43951 + * Default implementation of macro that returns current
43952 + * instruction pointer ("program counter").
43953 + */
43954 +#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
43955 +
43956 +/*
43957 + * CPU type and hardware bug flags. Kept separately for each CPU.
43958 + */
43959 +
43960 +struct cpuinfo_x86 {
43961 + __u8 x86; /* CPU family */
43962 + __u8 x86_vendor; /* CPU vendor */
43963 + __u8 x86_model;
43964 + __u8 x86_mask;
43965 + int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
43966 + __u32 x86_capability[NCAPINTS];
43967 + char x86_vendor_id[16];
43968 + char x86_model_id[64];
43969 + int x86_cache_size; /* in KB */
43970 + int x86_clflush_size;
43971 + int x86_cache_alignment;
43972 + int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined */
43973 + __u8 x86_virt_bits, x86_phys_bits;
43974 + __u8 x86_max_cores; /* cpuid returned max cores value */
43975 + __u32 x86_power;
43976 + __u32 extended_cpuid_level; /* Max extended CPUID function supported */
43977 + unsigned long loops_per_jiffy;
43978 +#ifdef CONFIG_SMP
43979 + cpumask_t llc_shared_map; /* cpus sharing the last level cache */
43980 +#endif
43981 + __u8 apicid;
43982 +#ifdef CONFIG_SMP
43983 + __u8 booted_cores; /* number of cores as seen by OS */
43984 + __u8 phys_proc_id; /* Physical Processor id. */
43985 + __u8 cpu_core_id; /* Core id. */
43986 +#endif
43987 +} ____cacheline_aligned;
43988 +
43989 +#define X86_VENDOR_INTEL 0
43990 +#define X86_VENDOR_CYRIX 1
43991 +#define X86_VENDOR_AMD 2
43992 +#define X86_VENDOR_UMC 3
43993 +#define X86_VENDOR_NEXGEN 4
43994 +#define X86_VENDOR_CENTAUR 5
43995 +#define X86_VENDOR_RISE 6
43996 +#define X86_VENDOR_TRANSMETA 7
43997 +#define X86_VENDOR_NUM 8
43998 +#define X86_VENDOR_UNKNOWN 0xff
43999 +
44000 +#ifdef CONFIG_SMP
44001 +extern struct cpuinfo_x86 cpu_data[];
44002 +#define current_cpu_data cpu_data[smp_processor_id()]
44003 +#else
44004 +#define cpu_data (&boot_cpu_data)
44005 +#define current_cpu_data boot_cpu_data
44006 +#endif
44007 +
44008 +extern char ignore_irq13;
44009 +
44010 +extern void identify_cpu(struct cpuinfo_x86 *);
44011 +extern void print_cpu_info(struct cpuinfo_x86 *);
44012 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
44013 +extern unsigned short num_cache_leaves;
44014 +
44015 +/*
44016 + * EFLAGS bits
44017 + */
44018 +#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
44019 +#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
44020 +#define X86_EFLAGS_AF 0x00000010 /* Auxiliary Carry Flag */
44021 +#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
44022 +#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
44023 +#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
44024 +#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
44025 +#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
44026 +#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
44027 +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
44028 +#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
44029 +#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
44030 +#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
44031 +#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
44032 +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
44033 +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
44034 +#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
44035 +
44036 +/*
44037 + * Intel CPU features in CR4
44038 + */
44039 +#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
44040 +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
44041 +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
44042 +#define X86_CR4_DE 0x0008 /* enable debugging extensions */
44043 +#define X86_CR4_PSE 0x0010 /* enable page size extensions */
44044 +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
44045 +#define X86_CR4_MCE 0x0040 /* Machine check enable */
44046 +#define X86_CR4_PGE 0x0080 /* enable global pages */
44047 +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
44048 +#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
44049 +#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
44050 +
44051 +/*
44052 + * Save the cr4 feature set we're using (ie
44053 + * Pentium 4MB enable and PPro Global page
44054 + * enable), so that any CPUs that boot up
44055 + * after us can get the correct flags.
44056 + */
44057 +extern unsigned long mmu_cr4_features;
44058 +
44059 +static inline void set_in_cr4 (unsigned long mask)
44060 +{
44061 + mmu_cr4_features |= mask;
44062 + __asm__("movq %%cr4,%%rax\n\t"
44063 + "orq %0,%%rax\n\t"
44064 + "movq %%rax,%%cr4\n"
44065 + : : "irg" (mask)
44066 + :"ax");
44067 +}
44068 +
44069 +static inline void clear_in_cr4 (unsigned long mask)
44070 +{
44071 + mmu_cr4_features &= ~mask;
44072 + __asm__("movq %%cr4,%%rax\n\t"
44073 + "andq %0,%%rax\n\t"
44074 + "movq %%rax,%%cr4\n"
44075 + : : "irg" (~mask)
44076 + :"ax");
44077 +}
44078 +
44079 +
44080 +/*
44081 + * User space process size: 47 bits minus one guard page.
44082 + */
44083 +#define TASK_SIZE64 (0x800000000000UL - 4096)
44084 +
44085 +/* This decides where the kernel will search for a free chunk of vm
44086 + * space during mmap's.
44087 + */
44088 +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
44089 +
44090 +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
44091 +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
44092 +
44093 +#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3)
44094 +
44095 +/*
44096 + * Size of io_bitmap.
44097 + */
44098 +#define IO_BITMAP_BITS 65536
44099 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
44100 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
44101 +#ifndef CONFIG_X86_NO_TSS
44102 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
44103 +#endif
44104 +#define INVALID_IO_BITMAP_OFFSET 0x8000
44105 +
44106 +struct i387_fxsave_struct {
44107 + u16 cwd;
44108 + u16 swd;
44109 + u16 twd;
44110 + u16 fop;
44111 + u64 rip;
44112 + u64 rdp;
44113 + u32 mxcsr;
44114 + u32 mxcsr_mask;
44115 + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
44116 + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 128 bytes */
44117 + u32 padding[24];
44118 +} __attribute__ ((aligned (16)));
44119 +
44120 +union i387_union {
44121 + struct i387_fxsave_struct fxsave;
44122 +};
44123 +
44124 +#ifndef CONFIG_X86_NO_TSS
44125 +struct tss_struct {
44126 + u32 reserved1;
44127 + u64 rsp0;
44128 + u64 rsp1;
44129 + u64 rsp2;
44130 + u64 reserved2;
44131 + u64 ist[7];
44132 + u32 reserved3;
44133 + u32 reserved4;
44134 + u16 reserved5;
44135 + u16 io_bitmap_base;
44136 + /*
44137 + * The extra 1 is there because the CPU will access an
44138 + * additional byte beyond the end of the IO permission
44139 + * bitmap. The extra byte must be all 1 bits, and must
44140 + * be within the limit. Thus we have:
44141 + *
44142 + * 128 bytes, the bitmap itself, for ports 0..0x3ff
44143 + * 8 bytes, for an extra "long" of ~0UL
44144 + */
44145 + unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
44146 +} __attribute__((packed)) ____cacheline_aligned;
44147 +
44148 +DECLARE_PER_CPU(struct tss_struct,init_tss);
44149 +#endif
44150 +
44151 +
44152 +extern struct cpuinfo_x86 boot_cpu_data;
44153 +#ifndef CONFIG_X86_NO_TSS
44154 +/* Save the original ist values for checking stack pointers during debugging */
44155 +struct orig_ist {
44156 + unsigned long ist[7];
44157 +};
44158 +DECLARE_PER_CPU(struct orig_ist, orig_ist);
44159 +#endif
44160 +
44161 +#ifdef CONFIG_X86_VSMP
44162 +#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
44163 +#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
44164 +#else
44165 +#define ARCH_MIN_TASKALIGN 16
44166 +#define ARCH_MIN_MMSTRUCT_ALIGN 0
44167 +#endif
44168 +
44169 +struct thread_struct {
44170 + unsigned long rsp0;
44171 + unsigned long rsp;
44172 + unsigned long userrsp; /* Copy from PDA */
44173 + unsigned long fs;
44174 + unsigned long gs;
44175 + unsigned short es, ds, fsindex, gsindex;
44176 +/* Hardware debugging registers */
44177 + unsigned long debugreg0;
44178 + unsigned long debugreg1;
44179 + unsigned long debugreg2;
44180 + unsigned long debugreg3;
44181 + unsigned long debugreg6;
44182 + unsigned long debugreg7;
44183 +/* fault info */
44184 + unsigned long cr2, trap_no, error_code;
44185 +/* floating point info */
44186 + union i387_union i387 __attribute__((aligned(16)));
44187 +/* IO permissions. The bitmap could be moved into the GDT, which would make
44188 + the switch faster for a limited number of ioperm-using tasks. -AK */
44189 + int ioperm;
44190 + unsigned long *io_bitmap_ptr;
44191 + unsigned io_bitmap_max;
44192 +/* cached TLS descriptors. */
44193 + u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
44194 + unsigned int iopl;
44195 +} __attribute__((aligned(16)));
44196 +
44197 +#define INIT_THREAD { \
44198 + .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
44199 +}
44200 +
44201 +#ifndef CONFIG_X86_NO_TSS
44202 +#define INIT_TSS { \
44203 + .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
44204 +}
44205 +#endif
44206 +
44207 +#define INIT_MMAP \
44208 +{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
44209 +
44210 +#define start_thread(regs,new_rip,new_rsp) do { \
44211 + asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
44212 + load_gs_index(0); \
44213 + (regs)->rip = (new_rip); \
44214 + (regs)->rsp = (new_rsp); \
44215 + write_pda(oldrsp, (new_rsp)); \
44216 + (regs)->cs = __USER_CS; \
44217 + (regs)->ss = __USER_DS; \
44218 + (regs)->eflags = 0x200; \
44219 + set_fs(USER_DS); \
44220 +} while(0)
44221 +
44222 +#define get_debugreg(var, register) \
44223 + var = HYPERVISOR_get_debugreg(register)
44224 +#define set_debugreg(value, register) do { \
44225 + if (HYPERVISOR_set_debugreg(register, value)) \
44226 + BUG(); \
44227 +} while (0)
44228 +
44229 +struct task_struct;
44230 +struct mm_struct;
44231 +
44232 +/* Free all resources held by a thread. */
44233 +extern void release_thread(struct task_struct *);
44234 +
44235 +/* Prepare to copy thread state - unlazy all lazy status */
44236 +extern void prepare_to_copy(struct task_struct *tsk);
44237 +
44238 +/*
44239 + * create a kernel thread without removing it from tasklists
44240 + */
44241 +extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
44242 +
44243 +/*
44244 + * Return saved PC of a blocked thread.
44245 + * What is this good for? It will always be the scheduler or ret_from_fork.
44246 + */
44247 +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
44248 +
44249 +extern unsigned long get_wchan(struct task_struct *p);
44250 +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
44251 +#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
44252 +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
44253 +
44254 +
44255 +struct microcode_header {
44256 + unsigned int hdrver;
44257 + unsigned int rev;
44258 + unsigned int date;
44259 + unsigned int sig;
44260 + unsigned int cksum;
44261 + unsigned int ldrver;
44262 + unsigned int pf;
44263 + unsigned int datasize;
44264 + unsigned int totalsize;
44265 + unsigned int reserved[3];
44266 +};
44267 +
44268 +struct microcode {
44269 + struct microcode_header hdr;
44270 + unsigned int bits[0];
44271 +};
44272 +
44273 +typedef struct microcode microcode_t;
44274 +typedef struct microcode_header microcode_header_t;
44275 +
44276 +/* microcode format is extended from prescott processors */
44277 +struct extended_signature {
44278 + unsigned int sig;
44279 + unsigned int pf;
44280 + unsigned int cksum;
44281 +};
44282 +
44283 +struct extended_sigtable {
44284 + unsigned int count;
44285 + unsigned int cksum;
44286 + unsigned int reserved[3];
44287 + struct extended_signature sigs[0];
44288 +};
44289 +
44290 +
44291 +#define ASM_NOP1 K8_NOP1
44292 +#define ASM_NOP2 K8_NOP2
44293 +#define ASM_NOP3 K8_NOP3
44294 +#define ASM_NOP4 K8_NOP4
44295 +#define ASM_NOP5 K8_NOP5
44296 +#define ASM_NOP6 K8_NOP6
44297 +#define ASM_NOP7 K8_NOP7
44298 +#define ASM_NOP8 K8_NOP8
44299 +
44300 +/* Opteron nops */
44301 +#define K8_NOP1 ".byte 0x90\n"
44302 +#define K8_NOP2 ".byte 0x66,0x90\n"
44303 +#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
44304 +#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
44305 +#define K8_NOP5 K8_NOP3 K8_NOP2
44306 +#define K8_NOP6 K8_NOP3 K8_NOP3
44307 +#define K8_NOP7 K8_NOP4 K8_NOP3
44308 +#define K8_NOP8 K8_NOP4 K8_NOP4
44309 +
44310 +#define ASM_NOP_MAX 8
44311 +
44312 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
44313 +static inline void rep_nop(void)
44314 +{
44315 + __asm__ __volatile__("rep;nop": : :"memory");
44316 +}
44317 +
44318 +/* Stop speculative execution */
44319 +static inline void sync_core(void)
44320 +{
44321 + int tmp;
44322 + asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
44323 +}
44324 +
44325 +#define cpu_has_fpu 1
44326 +
44327 +#define ARCH_HAS_PREFETCH
44328 +static inline void prefetch(void *x)
44329 +{
44330 + asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
44331 +}
44332 +
44333 +#define ARCH_HAS_PREFETCHW 1
44334 +static inline void prefetchw(void *x)
44335 +{
44336 + alternative_input("prefetcht0 (%1)",
44337 + "prefetchw (%1)",
44338 + X86_FEATURE_3DNOW,
44339 + "r" (x));
44340 +}
44341 +
44342 +#define ARCH_HAS_SPINLOCK_PREFETCH 1
44343 +
44344 +#define spin_lock_prefetch(x) prefetchw(x)
44345 +
44346 +#define cpu_relax() rep_nop()
44347 +
44348 +/*
44349 + * NSC/Cyrix CPU configuration register indexes
44350 + */
44351 +#define CX86_CCR0 0xc0
44352 +#define CX86_CCR1 0xc1
44353 +#define CX86_CCR2 0xc2
44354 +#define CX86_CCR3 0xc3
44355 +#define CX86_CCR4 0xe8
44356 +#define CX86_CCR5 0xe9
44357 +#define CX86_CCR6 0xea
44358 +#define CX86_CCR7 0xeb
44359 +#define CX86_DIR0 0xfe
44360 +#define CX86_DIR1 0xff
44361 +#define CX86_ARR_BASE 0xc4
44362 +#define CX86_RCR_BASE 0xdc
44363 +
44364 +/*
44365 + * NSC/Cyrix CPU indexed register access macros
44366 + */
44367 +
44368 +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
44369 +
44370 +#define setCx86(reg, data) do { \
44371 + outb((reg), 0x22); \
44372 + outb((data), 0x23); \
44373 +} while (0)
44374 +
44375 +static inline void serialize_cpu(void)
44376 +{
44377 + __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
44378 +}
44379 +
44380 +static inline void __monitor(const void *eax, unsigned long ecx,
44381 + unsigned long edx)
44382 +{
44383 + /* "monitor %eax,%ecx,%edx;" */
44384 + asm volatile(
44385 + ".byte 0x0f,0x01,0xc8;"
44386 + : :"a" (eax), "c" (ecx), "d"(edx));
44387 +}
44388 +
44389 +static inline void __mwait(unsigned long eax, unsigned long ecx)
44390 +{
44391 + /* "mwait %eax,%ecx;" */
44392 + asm volatile(
44393 + ".byte 0x0f,0x01,0xc9;"
44394 + : :"a" (eax), "c" (ecx));
44395 +}
44396 +
44397 +#define stack_current() \
44398 +({ \
44399 + struct thread_info *ti; \
44400 + asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
44401 + ti->task; \
44402 +})
44403 +
44404 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
44405 +
44406 +extern unsigned long boot_option_idle_override;
44407 +/* Boot loader type from the setup header */
44408 +extern int bootloader_type;
44409 +
44410 +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
44411 +
44412 +#endif /* __ASM_X86_64_PROCESSOR_H */
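
__monitor() and __mwait() above emit the raw MONITOR/MWAIT opcodes. A minimal sketch
of the usual arm-then-wait idiom (illustrative only; wait_for_flag is not part of the
patch, real callers must first check CPUID for MWAIT support, and under Xen the
instruction may not be available to the guest at all):

static void wait_for_flag(volatile int *flag)
{
	while (!*flag) {
		__monitor((const void *)flag, 0, 0);	/* arm the monitor on this cache line */
		if (*flag)				/* re-check: a store may have landed already */
			break;
		__mwait(0, 0);				/* doze until the monitored line is written */
	}
}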
44413 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/smp_64.h
44414 ===================================================================
44415 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
44416 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/smp_64.h 2007-06-12 13:14:13.000000000 +0200
44417 @@ -0,0 +1,150 @@
44418 +#ifndef __ASM_SMP_H
44419 +#define __ASM_SMP_H
44420 +
44421 +/*
44422 + * We need the APIC definitions automatically as part of 'smp.h'
44423 + */
44424 +#ifndef __ASSEMBLY__
44425 +#include <linux/threads.h>
44426 +#include <linux/cpumask.h>
44427 +#include <linux/bitops.h>
44428 +extern int disable_apic;
44429 +#endif
44430 +
44431 +#ifdef CONFIG_X86_LOCAL_APIC
44432 +#ifndef __ASSEMBLY__
44433 +#include <asm/fixmap.h>
44434 +#include <asm/mpspec.h>
44435 +#ifdef CONFIG_X86_IO_APIC
44436 +#include <asm/io_apic.h>
44437 +#endif
44438 +#include <asm/apic.h>
44439 +#include <asm/thread_info.h>
44440 +#endif
44441 +#endif
44442 +
44443 +#ifdef CONFIG_SMP
44444 +#ifndef ASSEMBLY
44445 +
44446 +#include <asm/pda.h>
44447 +
44448 +struct pt_regs;
44449 +
44450 +extern cpumask_t cpu_present_mask;
44451 +extern cpumask_t cpu_possible_map;
44452 +extern cpumask_t cpu_online_map;
44453 +extern cpumask_t cpu_initialized;
44454 +
44455 +/*
44456 + * Private routines/data
44457 + */
44458 +
44459 +extern void smp_alloc_memory(void);
44460 +extern volatile unsigned long smp_invalidate_needed;
44461 +extern int pic_mode;
44462 +extern void lock_ipi_call_lock(void);
44463 +extern void unlock_ipi_call_lock(void);
44464 +extern int smp_num_siblings;
44465 +extern void smp_send_reschedule(int cpu);
44466 +void smp_stop_cpu(void);
44467 +extern int smp_call_function_single(int cpuid, void (*func) (void *info),
44468 + void *info, int retry, int wait);
44469 +
44470 +extern cpumask_t cpu_sibling_map[NR_CPUS];
44471 +extern cpumask_t cpu_core_map[NR_CPUS];
44472 +extern u8 cpu_llc_id[NR_CPUS];
44473 +
44474 +#define SMP_TRAMPOLINE_BASE 0x6000
44475 +
44476 +/*
44477 + * On x86 all CPUs are mapped 1:1 to the APIC space.
44478 + * This simplifies scheduling and IPI sending and
44479 + * compresses data structures.
44480 + */
44481 +
44482 +static inline int num_booting_cpus(void)
44483 +{
44484 + return cpus_weight(cpu_possible_map);
44485 +}
44486 +
44487 +#define raw_smp_processor_id() read_pda(cpunumber)
44488 +
44489 +#ifdef CONFIG_X86_LOCAL_APIC
44490 +static inline int hard_smp_processor_id(void)
44491 +{
44492 + /* we don't want to mark this access volatile - bad code generation */
44493 + return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
44494 +}
44495 +#endif
44496 +
44497 +extern int safe_smp_processor_id(void);
44498 +extern int __cpu_disable(void);
44499 +extern void __cpu_die(unsigned int cpu);
44500 +extern void prefill_possible_map(void);
44501 +extern unsigned num_processors;
44502 +extern unsigned disabled_cpus;
44503 +
44504 +#endif /* !ASSEMBLY */
44505 +
44506 +#define NO_PROC_ID 0xFF /* No processor magic marker */
44507 +
44508 +#endif
44509 +
44510 +#ifndef ASSEMBLY
44511 +/*
44512 + * Some lowlevel functions might want to know about
44513 + * the real APIC ID <-> CPU # mapping.
44514 + */
44515 +extern u8 x86_cpu_to_apicid[NR_CPUS]; /* physical ID */
44516 +extern u8 x86_cpu_to_log_apicid[NR_CPUS];
44517 +extern u8 bios_cpu_apicid[];
44518 +
44519 +#ifdef CONFIG_X86_LOCAL_APIC
44520 +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
44521 +{
44522 + return cpus_addr(cpumask)[0];
44523 +}
44524 +
44525 +static inline int cpu_present_to_apicid(int mps_cpu)
44526 +{
44527 + if (mps_cpu < NR_CPUS)
44528 + return (int)bios_cpu_apicid[mps_cpu];
44529 + else
44530 + return BAD_APICID;
44531 +}
44532 +#endif
44533 +
44534 +#endif /* !ASSEMBLY */
44535 +
44536 +#ifndef CONFIG_SMP
44537 +#define stack_smp_processor_id() 0
44538 +#define safe_smp_processor_id() 0
44539 +#define cpu_logical_map(x) (x)
44540 +#else
44541 +#include <asm/thread_info.h>
44542 +#define stack_smp_processor_id() \
44543 +({ \
44544 + struct thread_info *ti; \
44545 + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
44546 + ti->cpu; \
44547 +})
44548 +#endif
44549 +
44550 +#ifndef __ASSEMBLY__
44551 +#ifdef CONFIG_X86_LOCAL_APIC
44552 +static __inline int logical_smp_processor_id(void)
44553 +{
44554 + /* we don't want to mark this access volatile - bad code generation */
44555 + return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
44556 +}
44557 +#endif
44558 +#endif
44559 +
44560 +#ifdef CONFIG_SMP
44561 +#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
44562 +#else
44563 +#define cpu_physical_id(cpu) boot_cpu_id
44564 +#endif
44565 +
44566 +#endif
44567 +
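smp_call_function_single(), declared above, runs a function on one chosen CPU. A
minimal usage sketch (illustrative only; report_cpu and poke_cpu1 are made-up names,
and <linux/kernel.h>/<linux/smp.h> are assumed to be included):

static void report_cpu(void *info)
{
	printk(KERN_INFO "running on CPU %d (APIC id %u)\n",
	       raw_smp_processor_id(),
	       (unsigned int)cpu_physical_id(raw_smp_processor_id()));
}

static void poke_cpu1(void)
{
	/* retry = 0, wait = 1: block until CPU 1 has run report_cpu() */
	if (smp_call_function_single(1, report_cpu, NULL, 0, 1) != 0)
		printk(KERN_WARNING "could not reach CPU 1\n");
}
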
44568 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/system_64.h
44569 ===================================================================
44570 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
44571 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/system_64.h 2007-11-26 16:59:25.000000000 +0100
44572 @@ -0,0 +1,256 @@
44573 +#ifndef __ASM_SYSTEM_H
44574 +#define __ASM_SYSTEM_H
44575 +
44576 +#include <linux/kernel.h>
44577 +#include <asm/segment.h>
44578 +#include <asm/alternative.h>
44579 +
44580 +#include <asm/synch_bitops.h>
44581 +#include <asm/hypervisor.h>
44582 +#include <xen/interface/arch-x86_64.h>
44583 +
44584 +#ifdef __KERNEL__
44585 +
44586 +#define __STR(x) #x
44587 +#define STR(x) __STR(x)
44588 +
44589 +#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
44590 +#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
44591 +
44592 +/* frame pointer must be last for get_wchan */
44593 +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
44594 +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\n\t"
44595 +
44596 +#define __EXTRA_CLOBBER \
44597 + ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
44598 +
44599 +#define switch_to(prev,next,last) \
44600 + asm volatile(SAVE_CONTEXT \
44601 + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
44602 + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
44603 + "call __switch_to\n\t" \
44604 + ".globl thread_return\n" \
44605 + "thread_return:\n\t" \
44606 + "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
44607 + "movq %P[thread_info](%%rsi),%%r8\n\t" \
44608 + LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
44609 + "movq %%rax,%%rdi\n\t" \
44610 + "jc ret_from_fork\n\t" \
44611 + RESTORE_CONTEXT \
44612 + : "=a" (last) \
44613 + : [next] "S" (next), [prev] "D" (prev), \
44614 + [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
44615 + [ti_flags] "i" (offsetof(struct thread_info, flags)),\
44616 + [tif_fork] "i" (TIF_FORK), \
44617 + [thread_info] "i" (offsetof(struct task_struct, thread_info)), \
44618 + [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
44619 + : "memory", "cc" __EXTRA_CLOBBER)
44620 +
44621 +extern void load_gs_index(unsigned);
44622 +
44623 +/*
44624 + * Load a segment. Fall back on loading the zero
44625 + * segment if something goes wrong..
44626 + */
44627 +#define loadsegment(seg,value) \
44628 + asm volatile("\n" \
44629 + "1:\t" \
44630 + "movl %k0,%%" #seg "\n" \
44631 + "2:\n" \
44632 + ".section .fixup,\"ax\"\n" \
44633 + "3:\t" \
44634 + "movl %1,%%" #seg "\n\t" \
44635 + "jmp 2b\n" \
44636 + ".previous\n" \
44637 + ".section __ex_table,\"a\"\n\t" \
44638 + ".align 8\n\t" \
44639 + ".quad 1b,3b\n" \
44640 + ".previous" \
44641 + : :"r" (value), "r" (0))
44642 +
44643 +/*
44644 + * Clear and set 'TS' bit respectively
44645 + */
44646 +#define clts() (HYPERVISOR_fpu_taskswitch(0))
44647 +
44648 +static inline unsigned long read_cr0(void)
44649 +{
44650 + unsigned long cr0;
44651 + asm volatile("movq %%cr0,%0" : "=r" (cr0));
44652 + return cr0;
44653 +}
44654 +
44655 +static inline void write_cr0(unsigned long val)
44656 +{
44657 + asm volatile("movq %0,%%cr0" :: "r" (val));
44658 +}
44659 +
44660 +#define read_cr3() ({ \
44661 + unsigned long __dummy; \
44662 + asm("movq %%cr3,%0" : "=r" (__dummy)); \
44663 + machine_to_phys(__dummy); \
44664 +})
44665 +
44666 +static inline unsigned long read_cr4(void)
44667 +{
44668 + unsigned long cr4;
44669 + asm("movq %%cr4,%0" : "=r" (cr4));
44670 + return cr4;
44671 +}
44672 +
44673 +static inline void write_cr4(unsigned long val)
44674 +{
44675 + asm volatile("movq %0,%%cr4" :: "r" (val));
44676 +}
44677 +
44678 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
44679 +
44680 +#define wbinvd() \
44681 + __asm__ __volatile__ ("wbinvd": : :"memory");
44682 +
44683 +/*
44684 + * On SMP systems, when the scheduler does migration-cost autodetection,
44685 + * it needs a way to flush as much of the CPU's caches as possible.
44686 + */
44687 +static inline void sched_cacheflush(void)
44688 +{
44689 + wbinvd();
44690 +}
44691 +
44692 +#endif /* __KERNEL__ */
44693 +
44694 +#define nop() __asm__ __volatile__ ("nop")
44695 +
44696 +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
44697 +
44698 +#define tas(ptr) (xchg((ptr),1))
44699 +
44700 +#define __xg(x) ((volatile long *)(x))
44701 +
44702 +static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
44703 +{
44704 + *ptr = val;
44705 +}
44706 +
44707 +#define _set_64bit set_64bit
44708 +
44709 +/*
44710 + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
44711 + * Note 2: xchg has side effect, so that attribute volatile is necessary,
44712 + * but generally the primitive is invalid, *ptr is output argument. --ANK
44713 + */
44714 +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
44715 +{
44716 + switch (size) {
44717 + case 1:
44718 + __asm__ __volatile__("xchgb %b0,%1"
44719 + :"=q" (x)
44720 + :"m" (*__xg(ptr)), "0" (x)
44721 + :"memory");
44722 + break;
44723 + case 2:
44724 + __asm__ __volatile__("xchgw %w0,%1"
44725 + :"=r" (x)
44726 + :"m" (*__xg(ptr)), "0" (x)
44727 + :"memory");
44728 + break;
44729 + case 4:
44730 + __asm__ __volatile__("xchgl %k0,%1"
44731 + :"=r" (x)
44732 + :"m" (*__xg(ptr)), "0" (x)
44733 + :"memory");
44734 + break;
44735 + case 8:
44736 + __asm__ __volatile__("xchgq %0,%1"
44737 + :"=r" (x)
44738 + :"m" (*__xg(ptr)), "0" (x)
44739 + :"memory");
44740 + break;
44741 + }
44742 + return x;
44743 +}
44744 +
44745 +/*
44746 + * Atomic compare and exchange. Compare OLD with MEM, if identical,
44747 + * store NEW in MEM. Return the initial value in MEM. Success is
44748 + * indicated by comparing RETURN with OLD.
44749 + */
44750 +
44751 +#define __HAVE_ARCH_CMPXCHG 1
44752 +
44753 +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
44754 + unsigned long new, int size)
44755 +{
44756 + unsigned long prev;
44757 + switch (size) {
44758 + case 1:
44759 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
44760 + : "=a"(prev)
44761 + : "q"(new), "m"(*__xg(ptr)), "0"(old)
44762 + : "memory");
44763 + return prev;
44764 + case 2:
44765 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
44766 + : "=a"(prev)
44767 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
44768 + : "memory");
44769 + return prev;
44770 + case 4:
44771 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2"
44772 + : "=a"(prev)
44773 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
44774 + : "memory");
44775 + return prev;
44776 + case 8:
44777 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2"
44778 + : "=a"(prev)
44779 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
44780 + : "memory");
44781 + return prev;
44782 + }
44783 + return old;
44784 +}
44785 +
44786 +#define cmpxchg(ptr,o,n)\
44787 + ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
44788 + (unsigned long)(n),sizeof(*(ptr))))
44789 +
44790 +#ifdef CONFIG_SMP
44791 +#define smp_mb() mb()
44792 +#define smp_rmb() rmb()
44793 +#define smp_wmb() wmb()
44794 +#define smp_read_barrier_depends() do {} while(0)
44795 +#else
44796 +#define smp_mb() barrier()
44797 +#define smp_rmb() barrier()
44798 +#define smp_wmb() barrier()
44799 +#define smp_read_barrier_depends() do {} while(0)
44800 +#endif
44801 +
44802 +
44803 +/*
44804 + * Force strict CPU ordering.
44805 + * And yes, this is required on UP too when we're talking
44806 + * to devices.
44807 + */
44808 +#define mb() asm volatile("mfence":::"memory")
44809 +#define rmb() asm volatile("lfence":::"memory")
44810 +
44811 +#ifdef CONFIG_UNORDERED_IO
44812 +#define wmb() asm volatile("sfence" ::: "memory")
44813 +#else
44814 +#define wmb() asm volatile("" ::: "memory")
44815 +#endif
44816 +#define read_barrier_depends() do {} while(0)
44817 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
44818 +
44819 +#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
44820 +
44821 +#include <linux/irqflags.h>
44822 +
44823 +void cpu_idle_wait(void);
44824 +
44825 +extern unsigned long arch_align_stack(unsigned long sp);
44826 +extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
44827 +
44828 +#endif
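
cmpxchg() above is the usual building block for small lock-free updates: read the old
value, compute a new one, and retry if another CPU changed the word in between. A
minimal sketch (illustrative only; saturating_inc is not part of the patch):

static unsigned long saturating_inc(unsigned long *counter, unsigned long max)
{
	unsigned long old, new;

	do {
		old = *counter;
		if (old >= max)
			return old;			/* already saturated */
		new = old + 1;
		/* we lost a race if *counter changed since the read above */
	} while (cmpxchg(counter, old, new) != old);

	return new;
}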
44829 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_64.h
44830 ===================================================================
44831 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
44832 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_64.h 2007-11-26 16:59:25.000000000 +0100
44833 @@ -0,0 +1,103 @@
44834 +#ifndef _X8664_TLBFLUSH_H
44835 +#define _X8664_TLBFLUSH_H
44836 +
44837 +#include <linux/mm.h>
44838 +#include <asm/processor.h>
44839 +
44840 +#define __flush_tlb() xen_tlb_flush()
44841 +
44842 +/*
44843 + * Global pages have to be flushed a bit differently. Not a real
44844 + * performance problem because this does not happen often.
44845 + */
44846 +#define __flush_tlb_global() xen_tlb_flush()
44847 +
44848 +
44849 +extern unsigned long pgkern_mask;
44850 +
44851 +#define __flush_tlb_all() __flush_tlb_global()
44852 +
44853 +#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
44854 +
44855 +
44856 +/*
44857 + * TLB flushing:
44858 + *
44859 + * - flush_tlb() flushes the current mm struct TLBs
44860 + * - flush_tlb_all() flushes all processes TLBs
44861 + * - flush_tlb_mm(mm) flushes the specified mm context TLB's
44862 + * - flush_tlb_page(vma, vmaddr) flushes one page
44863 + * - flush_tlb_range(vma, start, end) flushes a range of pages
44864 + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
44865 + * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
44866 + *
44867 + * x86-64 can only flush individual pages or full VMs. For a range flush
44868 + * we always do the full VM. It might be worth testing whether, for a
44869 + * small range, a few INVLPGs in a row are a win.
44870 + */
44871 +
44872 +#ifndef CONFIG_SMP
44873 +
44874 +#define flush_tlb() __flush_tlb()
44875 +#define flush_tlb_all() __flush_tlb_all()
44876 +#define local_flush_tlb() __flush_tlb()
44877 +
44878 +static inline void flush_tlb_mm(struct mm_struct *mm)
44879 +{
44880 + if (mm == current->active_mm)
44881 + __flush_tlb();
44882 +}
44883 +
44884 +static inline void flush_tlb_page(struct vm_area_struct *vma,
44885 + unsigned long addr)
44886 +{
44887 + if (vma->vm_mm == current->active_mm)
44888 + __flush_tlb_one(addr);
44889 +}
44890 +
44891 +static inline void flush_tlb_range(struct vm_area_struct *vma,
44892 + unsigned long start, unsigned long end)
44893 +{
44894 + if (vma->vm_mm == current->active_mm)
44895 + __flush_tlb();
44896 +}
44897 +
44898 +#else
44899 +
44900 +#include <asm/smp.h>
44901 +
44902 +#define local_flush_tlb() \
44903 + __flush_tlb()
44904 +
44905 +#define flush_tlb_all xen_tlb_flush_all
44906 +#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
44907 +#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
44908 +#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
44909 +
44910 +#define flush_tlb() flush_tlb_current_task()
44911 +
44912 +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
44913 +{
44914 + flush_tlb_mm(vma->vm_mm);
44915 +}
44916 +
44917 +#define TLBSTATE_OK 1
44918 +#define TLBSTATE_LAZY 2
44919 +
44920 +/* Roughly an IPI every 20MB with 4k pages for freeing page table
44921 + ranges. Cost is about 42k of memory for each CPU. */
44922 +#define ARCH_FREE_PTE_NR 5350
44923 +
44924 +#endif
44925 +
44926 +#define flush_tlb_kernel_range(start, end) flush_tlb_all()
44927 +
44928 +static inline void flush_tlb_pgtables(struct mm_struct *mm,
44929 + unsigned long start, unsigned long end)
44930 +{
44931 + /* x86_64 does not keep any page table caches in a software TLB.
44932 + The CPUs do in their hardware TLBs, but they are handled
44933 + by the normal TLB flushing algorithms. */
44934 +}
44935 +
44936 +#endif /* _X8664_TLBFLUSH_H */
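
The comment block above maps each flush primitive to the scope it invalidates; the
rule is to flush exactly what was modified. A minimal sketch for the single-page case
(illustrative only; it assumes the ptep_set_wrprotect() helper made available through
pgtable.h in this patch):

/* Write-protect one user page, then drop only its stale TLB entry. */
static void wrprotect_one_page(struct vm_area_struct *vma,
			       unsigned long address, pte_t *ptep)
{
	ptep_set_wrprotect(vma->vm_mm, address, ptep);	/* clear _PAGE_RW */
	flush_tlb_page(vma, address);			/* per-page flush from this header */
}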
44937 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/xor_64.h
44938 ===================================================================
44939 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
44940 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/xor_64.h 2007-06-12 13:14:13.000000000 +0200
44941 @@ -0,0 +1,328 @@
44942 +/*
44943 + * x86-64 changes / gcc fixes from Andi Kleen.
44944 + * Copyright 2002 Andi Kleen, SuSE Labs.
44945 + *
44946 + * This hasn't been optimized for the hammer yet, but there are likely
44947 + * no advantages to be gained from x86-64 here anyway.
44948 + */
44949 +
44950 +typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
44951 +
44952 +/* Doesn't use gcc to save the XMM registers, because there is no easy way to
44953 + tell it to do a clts before the register saving. */
44954 +#define XMMS_SAVE do { \
44955 + preempt_disable(); \
44956 + if (!(current_thread_info()->status & TS_USEDFPU)) \
44957 + clts(); \
44958 + __asm__ __volatile__ ( \
44959 + "movups %%xmm0,(%1) ;\n\t" \
44960 + "movups %%xmm1,0x10(%1) ;\n\t" \
44961 + "movups %%xmm2,0x20(%1) ;\n\t" \
44962 + "movups %%xmm3,0x30(%1) ;\n\t" \
44963 + : "=&r" (cr0) \
44964 + : "r" (xmm_save) \
44965 + : "memory"); \
44966 +} while(0)
44967 +
44968 +#define XMMS_RESTORE do { \
44969 + asm volatile ( \
44970 + "sfence ;\n\t" \
44971 + "movups (%1),%%xmm0 ;\n\t" \
44972 + "movups 0x10(%1),%%xmm1 ;\n\t" \
44973 + "movups 0x20(%1),%%xmm2 ;\n\t" \
44974 + "movups 0x30(%1),%%xmm3 ;\n\t" \
44975 + : \
44976 + : "r" (cr0), "r" (xmm_save) \
44977 + : "memory"); \
44978 + if (!(current_thread_info()->status & TS_USEDFPU)) \
44979 + stts(); \
44980 + preempt_enable(); \
44981 +} while(0)
44982 +
44983 +#define OFFS(x) "16*("#x")"
44984 +#define PF_OFFS(x) "256+16*("#x")"
44985 +#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
44986 +#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
44987 +#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
44988 +#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
44989 +#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
44990 +#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
44991 +#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
44992 +#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
44993 +#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
44994 +#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
44995 +#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
44996 +#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
44997 +#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
44998 +
44999 +
45000 +static void
45001 +xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
45002 +{
45003 + unsigned int lines = bytes >> 8;
45004 + unsigned long cr0;
45005 + xmm_store_t xmm_save[4];
45006 +
45007 + XMMS_SAVE;
45008 +
45009 + asm volatile (
45010 +#undef BLOCK
45011 +#define BLOCK(i) \
45012 + LD(i,0) \
45013 + LD(i+1,1) \
45014 + PF1(i) \
45015 + PF1(i+2) \
45016 + LD(i+2,2) \
45017 + LD(i+3,3) \
45018 + PF0(i+4) \
45019 + PF0(i+6) \
45020 + XO1(i,0) \
45021 + XO1(i+1,1) \
45022 + XO1(i+2,2) \
45023 + XO1(i+3,3) \
45024 + ST(i,0) \
45025 + ST(i+1,1) \
45026 + ST(i+2,2) \
45027 + ST(i+3,3) \
45028 +
45029 +
45030 + PF0(0)
45031 + PF0(2)
45032 +
45033 + " .align 32 ;\n"
45034 + " 1: ;\n"
45035 +
45036 + BLOCK(0)
45037 + BLOCK(4)
45038 + BLOCK(8)
45039 + BLOCK(12)
45040 +
45041 + " addq %[inc], %[p1] ;\n"
45042 + " addq %[inc], %[p2] ;\n"
45043 + " decl %[cnt] ; jnz 1b"
45044 + : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
45045 + : [inc] "r" (256UL)
45046 + : "memory");
45047 +
45048 + XMMS_RESTORE;
45049 +}
45050 +
45051 +static void
45052 +xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
45053 + unsigned long *p3)
45054 +{
45055 + unsigned int lines = bytes >> 8;
45056 + xmm_store_t xmm_save[4];
45057 + unsigned long cr0;
45058 +
45059 + XMMS_SAVE;
45060 +
45061 + __asm__ __volatile__ (
45062 +#undef BLOCK
45063 +#define BLOCK(i) \
45064 + PF1(i) \
45065 + PF1(i+2) \
45066 + LD(i,0) \
45067 + LD(i+1,1) \
45068 + LD(i+2,2) \
45069 + LD(i+3,3) \
45070 + PF2(i) \
45071 + PF2(i+2) \
45072 + PF0(i+4) \
45073 + PF0(i+6) \
45074 + XO1(i,0) \
45075 + XO1(i+1,1) \
45076 + XO1(i+2,2) \
45077 + XO1(i+3,3) \
45078 + XO2(i,0) \
45079 + XO2(i+1,1) \
45080 + XO2(i+2,2) \
45081 + XO2(i+3,3) \
45082 + ST(i,0) \
45083 + ST(i+1,1) \
45084 + ST(i+2,2) \
45085 + ST(i+3,3) \
45086 +
45087 +
45088 + PF0(0)
45089 + PF0(2)
45090 +
45091 + " .align 32 ;\n"
45092 + " 1: ;\n"
45093 +
45094 + BLOCK(0)
45095 + BLOCK(4)
45096 + BLOCK(8)
45097 + BLOCK(12)
45098 +
45099 + " addq %[inc], %[p1] ;\n"
45100 + " addq %[inc], %[p2] ;\n"
45101 + " addq %[inc], %[p3] ;\n"
45102 + " decl %[cnt] ; jnz 1b"
45103 + : [cnt] "+r" (lines),
45104 + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
45105 + : [inc] "r" (256UL)
45106 + : "memory");
45107 + XMMS_RESTORE;
45108 +}
45109 +
45110 +static void
45111 +xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
45112 + unsigned long *p3, unsigned long *p4)
45113 +{
45114 + unsigned int lines = bytes >> 8;
45115 + xmm_store_t xmm_save[4];
45116 + unsigned long cr0;
45117 +
45118 + XMMS_SAVE;
45119 +
45120 + __asm__ __volatile__ (
45121 +#undef BLOCK
45122 +#define BLOCK(i) \
45123 + PF1(i) \
45124 + PF1(i+2) \
45125 + LD(i,0) \
45126 + LD(i+1,1) \
45127 + LD(i+2,2) \
45128 + LD(i+3,3) \
45129 + PF2(i) \
45130 + PF2(i+2) \
45131 + XO1(i,0) \
45132 + XO1(i+1,1) \
45133 + XO1(i+2,2) \
45134 + XO1(i+3,3) \
45135 + PF3(i) \
45136 + PF3(i+2) \
45137 + PF0(i+4) \
45138 + PF0(i+6) \
45139 + XO2(i,0) \
45140 + XO2(i+1,1) \
45141 + XO2(i+2,2) \
45142 + XO2(i+3,3) \
45143 + XO3(i,0) \
45144 + XO3(i+1,1) \
45145 + XO3(i+2,2) \
45146 + XO3(i+3,3) \
45147 + ST(i,0) \
45148 + ST(i+1,1) \
45149 + ST(i+2,2) \
45150 + ST(i+3,3) \
45151 +
45152 +
45153 + PF0(0)
45154 + PF0(2)
45155 +
45156 + " .align 32 ;\n"
45157 + " 1: ;\n"
45158 +
45159 + BLOCK(0)
45160 + BLOCK(4)
45161 + BLOCK(8)
45162 + BLOCK(12)
45163 +
45164 + " addq %[inc], %[p1] ;\n"
45165 + " addq %[inc], %[p2] ;\n"
45166 + " addq %[inc], %[p3] ;\n"
45167 + " addq %[inc], %[p4] ;\n"
45168 + " decl %[cnt] ; jnz 1b"
45169 + : [cnt] "+c" (lines),
45170 + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
45171 + : [inc] "r" (256UL)
45172 + : "memory" );
45173 +
45174 + XMMS_RESTORE;
45175 +}
45176 +
45177 +static void
45178 +xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
45179 + unsigned long *p3, unsigned long *p4, unsigned long *p5)
45180 +{
45181 + unsigned int lines = bytes >> 8;
45182 + xmm_store_t xmm_save[4];
45183 + unsigned long cr0;
45184 +
45185 + XMMS_SAVE;
45186 +
45187 + __asm__ __volatile__ (
45188 +#undef BLOCK
45189 +#define BLOCK(i) \
45190 + PF1(i) \
45191 + PF1(i+2) \
45192 + LD(i,0) \
45193 + LD(i+1,1) \
45194 + LD(i+2,2) \
45195 + LD(i+3,3) \
45196 + PF2(i) \
45197 + PF2(i+2) \
45198 + XO1(i,0) \
45199 + XO1(i+1,1) \
45200 + XO1(i+2,2) \
45201 + XO1(i+3,3) \
45202 + PF3(i) \
45203 + PF3(i+2) \
45204 + XO2(i,0) \
45205 + XO2(i+1,1) \
45206 + XO2(i+2,2) \
45207 + XO2(i+3,3) \
45208 + PF4(i) \
45209 + PF4(i+2) \
45210 + PF0(i+4) \
45211 + PF0(i+6) \
45212 + XO3(i,0) \
45213 + XO3(i+1,1) \
45214 + XO3(i+2,2) \
45215 + XO3(i+3,3) \
45216 + XO4(i,0) \
45217 + XO4(i+1,1) \
45218 + XO4(i+2,2) \
45219 + XO4(i+3,3) \
45220 + ST(i,0) \
45221 + ST(i+1,1) \
45222 + ST(i+2,2) \
45223 + ST(i+3,3) \
45224 +
45225 +
45226 + PF0(0)
45227 + PF0(2)
45228 +
45229 + " .align 32 ;\n"
45230 + " 1: ;\n"
45231 +
45232 + BLOCK(0)
45233 + BLOCK(4)
45234 + BLOCK(8)
45235 + BLOCK(12)
45236 +
45237 + " addq %[inc], %[p1] ;\n"
45238 + " addq %[inc], %[p2] ;\n"
45239 + " addq %[inc], %[p3] ;\n"
45240 + " addq %[inc], %[p4] ;\n"
45241 + " addq %[inc], %[p5] ;\n"
45242 + " decl %[cnt] ; jnz 1b"
45243 + : [cnt] "+c" (lines),
45244 + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
45245 + [p5] "+r" (p5)
45246 + : [inc] "r" (256UL)
45247 + : "memory");
45248 +
45249 + XMMS_RESTORE;
45250 +}
45251 +
45252 +static struct xor_block_template xor_block_sse = {
45253 + .name = "generic_sse",
45254 + .do_2 = xor_sse_2,
45255 + .do_3 = xor_sse_3,
45256 + .do_4 = xor_sse_4,
45257 + .do_5 = xor_sse_5,
45258 +};
45259 +
45260 +#undef XOR_TRY_TEMPLATES
45261 +#define XOR_TRY_TEMPLATES \
45262 + do { \
45263 + xor_speed(&xor_block_sse); \
45264 + } while (0)
45265 +
45266 +/* We force the use of the SSE xor block because it can write around L2.
45267 + We may also be able to load into the L1 only depending on how the cpu
45268 + deals with a load to a line that is being prefetched. */
45269 +#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
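
xor_block_sse above is normally driven through the generic xor layer (xor_speed() via
XOR_TRY_TEMPLATES), but its methods are plain functions: do_2() XORs the second buffer
into the first, 256 bytes per loop iteration. A minimal sketch (illustrative only;
both buffers must be 16-byte aligned for movaps and the length a multiple of 256
bytes):

/* dst ^= src over one 4 KiB page: 4096 bytes = 16 iterations of the SSE block. */
static void xor_one_page(unsigned long *dst, unsigned long *src)
{
	xor_block_sse.do_2(4096, dst, src);
}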
45270 Index: head-2008-11-25/include/asm-x86/mach-xen/mach_time.h
45271 ===================================================================
45272 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45273 +++ head-2008-11-25/include/asm-x86/mach-xen/mach_time.h 2007-06-12 13:14:13.000000000 +0200
45274 @@ -0,0 +1,111 @@
45275 +/*
45276 + * include/asm-i386/mach-default/mach_time.h
45277 + *
45278 + * Machine specific set RTC function for generic.
45279 + * Split out from time.c by Osamu Tomita <tomita@cinet.co.jp>
45280 + */
45281 +#ifndef _MACH_TIME_H
45282 +#define _MACH_TIME_H
45283 +
45284 +#include <asm-i386/mc146818rtc.h>
45285 +
45286 +/* timing window for the set_rtc_mmss() call: 500 ms into the second */
45287 +/* used in arch/i386/time.c::do_timer_interrupt() */
45288 +#define USEC_AFTER 500000
45289 +#define USEC_BEFORE 500000
45290 +
45291 +/*
45292 + * In order to set the CMOS clock precisely, set_rtc_mmss has to be
45293 + * called 500 ms after the second nowtime has started, because when
45294 + * nowtime is written into the registers of the CMOS clock, it will
45295 + * jump to the next second precisely 500 ms later. Check the Motorola
45296 + * MC146818A or Dallas DS12887 data sheet for details.
45297 + *
45298 + * BUG: This routine does not handle hour overflow properly; it just
45299 + * sets the minutes. Usually you'll only notice that after reboot!
45300 + */
45301 +static inline int mach_set_rtc_mmss(unsigned long nowtime)
45302 +{
45303 + int retval = 0;
45304 + int real_seconds, real_minutes, cmos_minutes;
45305 + unsigned char save_control, save_freq_select;
45306 +
45307 + save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */
45308 + CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
45309 +
45310 + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */
45311 + CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
45312 +
45313 + cmos_minutes = CMOS_READ(RTC_MINUTES);
45314 + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
45315 + BCD_TO_BIN(cmos_minutes);
45316 +
45317 + /*
45318 + * since we're only adjusting minutes and seconds,
45319 + * don't interfere with hour overflow. This avoids
45320 + * messing with unknown time zones but requires your
45321 + * RTC not to be off by more than 15 minutes
45322 + */
45323 + real_seconds = nowtime % 60;
45324 + real_minutes = nowtime / 60;
45325 + if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
45326 + real_minutes += 30; /* correct for half hour time zone */
45327 + real_minutes %= 60;
45328 +
45329 + if (abs(real_minutes - cmos_minutes) < 30) {
45330 + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
45331 + BIN_TO_BCD(real_seconds);
45332 + BIN_TO_BCD(real_minutes);
45333 + }
45334 + CMOS_WRITE(real_seconds,RTC_SECONDS);
45335 + CMOS_WRITE(real_minutes,RTC_MINUTES);
45336 + } else {
45337 + printk(KERN_WARNING
45338 + "set_rtc_mmss: can't update from %d to %d\n",
45339 + cmos_minutes, real_minutes);
45340 + retval = -1;
45341 + }
45342 +
45343 + /* The following flags have to be released exactly in this order,
45344 + * otherwise the DS12887 (popular MC146818A clone with integrated
45345 + * battery and quartz) will not reset the oscillator and will not
45346 + * update precisely 500 ms later. You won't find this mentioned in
45347 + * the Dallas Semiconductor data sheets, but who believes data
45348 + * sheets anyway ... -- Markus Kuhn
45349 + */
45350 + CMOS_WRITE(save_control, RTC_CONTROL);
45351 + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
45352 +
45353 + return retval;
45354 +}
45355 +
45356 +static inline unsigned long mach_get_cmos_time(void)
45357 +{
45358 + unsigned int year, mon, day, hour, min, sec;
45359 +
45360 + do {
45361 + sec = CMOS_READ(RTC_SECONDS);
45362 + min = CMOS_READ(RTC_MINUTES);
45363 + hour = CMOS_READ(RTC_HOURS);
45364 + day = CMOS_READ(RTC_DAY_OF_MONTH);
45365 + mon = CMOS_READ(RTC_MONTH);
45366 + year = CMOS_READ(RTC_YEAR);
45367 + } while (sec != CMOS_READ(RTC_SECONDS));
45368 +
45369 + if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
45370 + BCD_TO_BIN(sec);
45371 + BCD_TO_BIN(min);
45372 + BCD_TO_BIN(hour);
45373 + BCD_TO_BIN(day);
45374 + BCD_TO_BIN(mon);
45375 + BCD_TO_BIN(year);
45376 + }
45377 +
45378 + year += 1900;
45379 + if (year < 1970)
45380 + year += 100;
45381 +
45382 + return mktime(year, mon, day, hour, min, sec);
45383 +}
45384 +
45385 +#endif /* !_MACH_TIME_H */
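
mach_set_rtc_mmss() above only behaves correctly when called close to 500 ms after the
start of a second, which is what USEC_AFTER/USEC_BEFORE delimit. A minimal sketch of
that gating (illustrative only; maybe_sync_cmos and the 5 ms slack are made up, the
real timer code derives its slack from the tick length):

/* usec_in_sec is the microsecond offset within the current second. */
static void maybe_sync_cmos(unsigned long now_sec, unsigned long usec_in_sec)
{
	if (usec_in_sec >= USEC_AFTER - 5000 && usec_in_sec <= USEC_BEFORE + 5000)
		mach_set_rtc_mmss(now_sec);	/* we are ~500 ms into the second */
}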
45386 Index: head-2008-11-25/include/asm-x86/mach-xen/mach_timer.h
45387 ===================================================================
45388 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45389 +++ head-2008-11-25/include/asm-x86/mach-xen/mach_timer.h 2007-06-12 13:14:13.000000000 +0200
45390 @@ -0,0 +1,50 @@
45391 +/*
45392 + * include/asm-i386/mach-default/mach_timer.h
45393 + *
45394 + * Machine specific calibrate_tsc() for generic.
45395 + * Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
45396 + */
45397 +/* ------ Calibrate the TSC -------
45398 + * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
45399 + * Too much 64-bit arithmetic here to do this cleanly in C, and for
45400 + * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
45401 + * output busy loop as low as possible. We avoid reading the CTC registers
45402 + * directly because of the awkward 8-bit access mechanism of the 82C54
45403 + * device.
45404 + */
45405 +#ifndef _MACH_TIMER_H
45406 +#define _MACH_TIMER_H
45407 +
45408 +#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */
45409 +#define CALIBRATE_LATCH \
45410 + ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
45411 +
45412 +static inline void mach_prepare_counter(void)
45413 +{
45414 + /* Set the Gate high, disable speaker */
45415 + outb((inb(0x61) & ~0x02) | 0x01, 0x61);
45416 +
45417 + /*
45418 + * Now let's take care of CTC channel 2
45419 + *
45420 + * Set the Gate high, program CTC channel 2 for mode 0,
45421 + * (interrupt on terminal count mode), binary count,
45422 + * load 5 * LATCH count, (LSB and MSB) to begin countdown.
45423 + *
45424 + * Some devices need a delay here.
45425 + */
45426 + outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */
45427 + outb_p(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */
45428 + outb_p(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */
45429 +}
45430 +
45431 +static inline void mach_countup(unsigned long *count_p)
45432 +{
45433 + unsigned long count = 0;
45434 + do {
45435 + count++;
45436 + } while ((inb_p(0x61) & 0x20) == 0);
45437 + *count_p = count;
45438 +}
45439 +
45440 +#endif /* !_MACH_TIMER_H */
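
mach_prepare_counter() and mach_countup() above implement the PIT-gated 30 ms busy
loop used for TSC calibration. A minimal sketch of the measurement around it
(illustrative only; calibrate_tsc_khz is a made-up name and rdtscll() from
<asm/msr.h> is assumed):

static unsigned long calibrate_tsc_khz(void)
{
	unsigned long long start, end;
	unsigned long count;

	mach_prepare_counter();		/* gate PIT channel 2 for CALIBRATE_TIME_MSEC */
	rdtscll(start);
	mach_countup(&count);		/* spin until the 30 ms latch expires */
	rdtscll(end);

	/* TSC ticks per millisecond == TSC frequency in kHz */
	return (unsigned long)((end - start) / CALIBRATE_TIME_MSEC);
}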
45441 Index: head-2008-11-25/include/asm-x86/mach-xen/setup_arch_post.h
45442 ===================================================================
45443 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45444 +++ head-2008-11-25/include/asm-x86/mach-xen/setup_arch_post.h 2007-06-12 13:14:13.000000000 +0200
45445 @@ -0,0 +1,63 @@
45446 +/**
45447 + * machine_specific_* - Hooks for machine specific setup.
45448 + *
45449 + * Description:
45450 + * This is included late in kernel/setup.c so that it can make
45451 + * use of all of the static functions.
45452 + **/
45453 +
45454 +#include <xen/interface/callback.h>
45455 +
45456 +extern void hypervisor_callback(void);
45457 +extern void failsafe_callback(void);
45458 +extern void nmi(void);
45459 +
45460 +static void __init machine_specific_arch_setup(void)
45461 +{
45462 + int ret;
45463 + static struct callback_register __initdata event = {
45464 + .type = CALLBACKTYPE_event,
45465 + .address = (unsigned long) hypervisor_callback,
45466 + };
45467 + static struct callback_register __initdata failsafe = {
45468 + .type = CALLBACKTYPE_failsafe,
45469 + .address = (unsigned long)failsafe_callback,
45470 + };
45471 + static struct callback_register __initdata syscall = {
45472 + .type = CALLBACKTYPE_syscall,
45473 + .address = (unsigned long)system_call,
45474 + };
45475 +#ifdef CONFIG_X86_LOCAL_APIC
45476 + static struct callback_register __initdata nmi_cb = {
45477 + .type = CALLBACKTYPE_nmi,
45478 + .address = (unsigned long)nmi,
45479 + };
45480 +#endif
45481 +
45482 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
45483 + if (ret == 0)
45484 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
45485 + if (ret == 0)
45486 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
45487 +#if CONFIG_XEN_COMPAT <= 0x030002
45488 + if (ret == -ENOSYS)
45489 + ret = HYPERVISOR_set_callbacks(
45490 + event.address,
45491 + failsafe.address,
45492 + syscall.address);
45493 +#endif
45494 + BUG_ON(ret);
45495 +
45496 +#ifdef CONFIG_X86_LOCAL_APIC
45497 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
45498 +#if CONFIG_XEN_COMPAT <= 0x030002
45499 + if (ret == -ENOSYS) {
45500 + static struct xennmi_callback __initdata cb = {
45501 + .handler_address = (unsigned long)nmi
45502 + };
45503 +
45504 + HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
45505 + }
45506 +#endif
45507 +#endif
45508 +}
45509 Index: head-2008-11-25/include/asm-x86/mach-xen/setup_arch_pre.h
45510 ===================================================================
45511 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45512 +++ head-2008-11-25/include/asm-x86/mach-xen/setup_arch_pre.h 2007-06-12 13:14:13.000000000 +0200
45513 @@ -0,0 +1,5 @@
45514 +/* Hook to call BIOS initialisation function */
45515 +
45516 +#define ARCH_SETUP machine_specific_arch_setup();
45517 +
45518 +static void __init machine_specific_arch_setup(void);
45519 Index: head-2008-11-25/include/xen/blkif.h
45520 ===================================================================
45521 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45522 +++ head-2008-11-25/include/xen/blkif.h 2008-07-21 11:00:33.000000000 +0200
45523 @@ -0,0 +1,123 @@
45524 +/*
45525 + * Permission is hereby granted, free of charge, to any person obtaining a copy
45526 + * of this software and associated documentation files (the "Software"), to
45527 + * deal in the Software without restriction, including without limitation the
45528 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
45529 + * sell copies of the Software, and to permit persons to whom the Software is
45530 + * furnished to do so, subject to the following conditions:
45531 + *
45532 + * The above copyright notice and this permission notice shall be included in
45533 + * all copies or substantial portions of the Software.
45534 + *
45535 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45536 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
45537 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45538 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
45539 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
45540 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
45541 + * DEALINGS IN THE SOFTWARE.
45542 + */
45543 +
45544 +#ifndef __XEN_BLKIF_H__
45545 +#define __XEN_BLKIF_H__
45546 +
45547 +#include <xen/interface/io/ring.h>
45548 +#include <xen/interface/io/blkif.h>
45549 +#include <xen/interface/io/protocols.h>
45550 +
45551 +/* Not a real protocol. Used to generate ring structs which contain
45552 + * the elements common to all protocols only. This way we get a
45553 + * compiler-checkable way to use common struct elements, so we can
45554 + * avoid using switch(protocol) in a number of places. */
45555 +struct blkif_common_request {
45556 + char dummy;
45557 +};
45558 +struct blkif_common_response {
45559 + char dummy;
45560 +};
45561 +
45562 +/* i386 protocol version */
45563 +#pragma pack(push, 4)
45564 +struct blkif_x86_32_request {
45565 + uint8_t operation; /* BLKIF_OP_??? */
45566 + uint8_t nr_segments; /* number of segments */
45567 + blkif_vdev_t handle; /* only for read/write requests */
45568 + uint64_t id; /* private guest value, echoed in resp */
45569 + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
45570 + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
45571 +};
45572 +struct blkif_x86_32_response {
45573 + uint64_t id; /* copied from request */
45574 + uint8_t operation; /* copied from request */
45575 + int16_t status; /* BLKIF_RSP_??? */
45576 +};
45577 +typedef struct blkif_x86_32_request blkif_x86_32_request_t;
45578 +typedef struct blkif_x86_32_response blkif_x86_32_response_t;
45579 +#pragma pack(pop)
45580 +
45581 +/* x86_64 protocol version */
45582 +struct blkif_x86_64_request {
45583 + uint8_t operation; /* BLKIF_OP_??? */
45584 + uint8_t nr_segments; /* number of segments */
45585 + blkif_vdev_t handle; /* only for read/write requests */
45586 + uint64_t __attribute__((__aligned__(8))) id;
45587 + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
45588 + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
45589 +};
45590 +struct blkif_x86_64_response {
45591 + uint64_t __attribute__((__aligned__(8))) id;
45592 + uint8_t operation; /* copied from request */
45593 + int16_t status; /* BLKIF_RSP_??? */
45594 +};
45595 +typedef struct blkif_x86_64_request blkif_x86_64_request_t;
45596 +typedef struct blkif_x86_64_response blkif_x86_64_response_t;
45597 +
45598 +DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response);
45599 +DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response);
45600 +DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response);
45601 +
45602 +union blkif_back_rings {
45603 + blkif_back_ring_t native;
45604 + blkif_common_back_ring_t common;
45605 + blkif_x86_32_back_ring_t x86_32;
45606 + blkif_x86_64_back_ring_t x86_64;
45607 +};
45608 +typedef union blkif_back_rings blkif_back_rings_t;
45609 +
45610 +enum blkif_protocol {
45611 + BLKIF_PROTOCOL_NATIVE = 1,
45612 + BLKIF_PROTOCOL_X86_32 = 2,
45613 + BLKIF_PROTOCOL_X86_64 = 3,
45614 +};
45615 +
45616 +static inline void blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src)
45617 +{
45618 + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
45619 + dst->operation = src->operation;
45620 + dst->nr_segments = src->nr_segments;
45621 + dst->handle = src->handle;
45622 + dst->id = src->id;
45623 + dst->sector_number = src->sector_number;
45624 + barrier();
45625 + if (n > dst->nr_segments)
45626 + n = dst->nr_segments;
45627 + for (i = 0; i < n; i++)
45628 + dst->seg[i] = src->seg[i];
45629 +}
45630 +
45631 +static inline void blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src)
45632 +{
45633 + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
45634 + dst->operation = src->operation;
45635 + dst->nr_segments = src->nr_segments;
45636 + dst->handle = src->handle;
45637 + dst->id = src->id;
45638 + dst->sector_number = src->sector_number;
45639 + barrier();
45640 + if (n > dst->nr_segments)
45641 + n = dst->nr_segments;
45642 + for (i = 0; i < n; i++)
45643 + dst->seg[i] = src->seg[i];
45644 +}
45645 +
45646 +#endif /* __XEN_BLKIF_H__ */
45647 Index: head-2008-11-25/include/xen/compat_ioctl.h
45648 ===================================================================
45649 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45650 +++ head-2008-11-25/include/xen/compat_ioctl.h 2007-07-10 09:42:30.000000000 +0200
45651 @@ -0,0 +1,45 @@
45652 +/*
45653 + * This program is free software; you can redistribute it and/or
45654 + * modify it under the terms of the GNU General Public License as
45655 + * published by the Free Software Foundation; either version 2 of the
45656 + * License, or (at your option) any later version.
45657 + *
45658 + * This program is distributed in the hope that it will be useful,
45659 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
45660 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
45661 + * GNU General Public License for more details.
45662 + *
45663 + * You should have received a copy of the GNU General Public License
45664 + * along with this program; if not, write to the Free Software
45665 + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
45666 + *
45667 + * Copyright IBM Corp. 2007
45668 + *
45669 + * Authors: Jimi Xenidis <jimix@watson.ibm.com>
45670 + * Hollis Blanchard <hollisb@us.ibm.com>
45671 + */
45672 +
45673 +#ifndef __LINUX_XEN_COMPAT_H__
45674 +#define __LINUX_XEN_COMPAT_H__
45675 +
45676 +#include <linux/compat.h>
45677 +
45678 +extern int privcmd_ioctl_32(int fd, unsigned int cmd, unsigned long arg);
45679 +struct privcmd_mmap_32 {
45680 + int num;
45681 + domid_t dom;
45682 + compat_uptr_t entry;
45683 +};
45684 +
45685 +struct privcmd_mmapbatch_32 {
45686 + int num; /* number of pages to populate */
45687 + domid_t dom; /* target domain */
45688 + __u64 addr; /* virtual address */
45689 + compat_uptr_t arr; /* array of mfns - top nibble set on err */
45690 +};
45691 +#define IOCTL_PRIVCMD_MMAP_32 \
45692 + _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap_32))
45693 +#define IOCTL_PRIVCMD_MMAPBATCH_32 \
45694 + _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch_32))
45695 +
45696 +#endif /* __LINUX_XEN_COMPAT_H__ */
45697 Index: head-2008-11-25/include/xen/cpu_hotplug.h
45698 ===================================================================
45699 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45700 +++ head-2008-11-25/include/xen/cpu_hotplug.h 2007-08-16 18:07:01.000000000 +0200
45701 @@ -0,0 +1,41 @@
45702 +#ifndef __XEN_CPU_HOTPLUG_H__
45703 +#define __XEN_CPU_HOTPLUG_H__
45704 +
45705 +#include <linux/kernel.h>
45706 +#include <linux/cpumask.h>
45707 +
45708 +#if defined(CONFIG_X86) && defined(CONFIG_SMP)
45709 +extern cpumask_t cpu_initialized_map;
45710 +#endif
45711 +
45712 +#if defined(CONFIG_HOTPLUG_CPU)
45713 +
45714 +int cpu_up_check(unsigned int cpu);
45715 +void init_xenbus_allowed_cpumask(void);
45716 +int smp_suspend(void);
45717 +void smp_resume(void);
45718 +
45719 +void cpu_bringup(void);
45720 +
45721 +#else /* !defined(CONFIG_HOTPLUG_CPU) */
45722 +
45723 +#define cpu_up_check(cpu) (0)
45724 +#define init_xenbus_allowed_cpumask() ((void)0)
45725 +
45726 +static inline int smp_suspend(void)
45727 +{
45728 + if (num_online_cpus() > 1) {
45729 + printk(KERN_WARNING "Can't suspend SMP guests "
45730 + "without CONFIG_HOTPLUG_CPU\n");
45731 + return -EOPNOTSUPP;
45732 + }
45733 + return 0;
45734 +}
45735 +
45736 +static inline void smp_resume(void)
45737 +{
45738 +}
45739 +
45740 +#endif /* !defined(CONFIG_HOTPLUG_CPU) */
45741 +
45742 +#endif /* __XEN_CPU_HOTPLUG_H__ */
45743 Index: head-2008-11-25/include/xen/driver_util.h
45744 ===================================================================
45745 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45746 +++ head-2008-11-25/include/xen/driver_util.h 2007-06-12 13:14:19.000000000 +0200
45747 @@ -0,0 +1,14 @@
45748 +
45749 +#ifndef __ASM_XEN_DRIVER_UTIL_H__
45750 +#define __ASM_XEN_DRIVER_UTIL_H__
45751 +
45752 +#include <linux/vmalloc.h>
45753 +#include <linux/device.h>
45754 +
45755 +/* Allocate/destroy a 'vmalloc' VM area. */
45756 +extern struct vm_struct *alloc_vm_area(unsigned long size);
45757 +extern void free_vm_area(struct vm_struct *area);
45758 +
45759 +extern struct class *get_xen_class(void);
45760 +
45761 +#endif /* __ASM_XEN_DRIVER_UTIL_H__ */
45762 Index: head-2008-11-25/include/xen/evtchn.h
45763 ===================================================================
45764 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45765 +++ head-2008-11-25/include/xen/evtchn.h 2008-09-15 13:40:15.000000000 +0200
45766 @@ -0,0 +1,160 @@
45767 +/******************************************************************************
45768 + * evtchn.h
45769 + *
45770 + * Communication via Xen event channels.
45771 + * Also definitions for the device that demuxes notifications to userspace.
45772 + *
45773 + * Copyright (c) 2004-2005, K A Fraser
45774 + *
45775 + * This program is free software; you can redistribute it and/or
45776 + * modify it under the terms of the GNU General Public License version 2
45777 + * as published by the Free Software Foundation; or, when distributed
45778 + * separately from the Linux kernel or incorporated into other
45779 + * software packages, subject to the following license:
45780 + *
45781 + * Permission is hereby granted, free of charge, to any person obtaining a copy
45782 + * of this source file (the "Software"), to deal in the Software without
45783 + * restriction, including without limitation the rights to use, copy, modify,
45784 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
45785 + * and to permit persons to whom the Software is furnished to do so, subject to
45786 + * the following conditions:
45787 + *
45788 + * The above copyright notice and this permission notice shall be included in
45789 + * all copies or substantial portions of the Software.
45790 + *
45791 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45792 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
45793 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45794 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
45795 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
45796 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
45797 + * IN THE SOFTWARE.
45798 + */
45799 +
45800 +#ifndef __ASM_EVTCHN_H__
45801 +#define __ASM_EVTCHN_H__
45802 +
45803 +#include <linux/interrupt.h>
45804 +#include <asm/hypervisor.h>
45805 +#include <asm/ptrace.h>
45806 +#include <asm/synch_bitops.h>
45807 +#include <xen/interface/event_channel.h>
45808 +#include <linux/smp.h>
45809 +
45810 +/*
45811 + * LOW-LEVEL DEFINITIONS
45812 + */
45813 +
45814 +/*
45815 + * Dynamically bind an event source to an IRQ-like callback handler.
45816 + * On some platforms this may not be implemented via the Linux IRQ subsystem.
45817 + * The IRQ argument passed to the callback handler is the same as returned
45818 + * from the bind call. It may not correspond to a Linux IRQ number.
45819 + * Returns IRQ or negative errno.
45820 + */
45821 +int bind_caller_port_to_irqhandler(
45822 + unsigned int caller_port,
45823 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
45824 + unsigned long irqflags,
45825 + const char *devname,
45826 + void *dev_id);
45827 +int bind_listening_port_to_irqhandler(
45828 + unsigned int remote_domain,
45829 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
45830 + unsigned long irqflags,
45831 + const char *devname,
45832 + void *dev_id);
45833 +int bind_interdomain_evtchn_to_irqhandler(
45834 + unsigned int remote_domain,
45835 + unsigned int remote_port,
45836 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
45837 + unsigned long irqflags,
45838 + const char *devname,
45839 + void *dev_id);
45840 +int bind_virq_to_irqhandler(
45841 + unsigned int virq,
45842 + unsigned int cpu,
45843 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
45844 + unsigned long irqflags,
45845 + const char *devname,
45846 + void *dev_id);
45847 +int bind_ipi_to_irqhandler(
45848 + unsigned int ipi,
45849 + unsigned int cpu,
45850 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
45851 + unsigned long irqflags,
45852 + const char *devname,
45853 + void *dev_id);
45854 +
45855 +/*
45856 + * Common unbind function for all event sources. Takes IRQ to unbind from.
45857 + * Automatically closes the underlying event channel (except for bindings
45858 + * made with bind_caller_port_to_irqhandler()).
45859 + */
45860 +void unbind_from_irqhandler(unsigned int irq, void *dev_id);
45861 +
45862 +void irq_resume(void);
45863 +
45864 +/* Entry point for notifications into Linux subsystems. */
45865 +asmlinkage void evtchn_do_upcall(struct pt_regs *regs);
45866 +
45867 +/* Entry point for notifications into the userland character device. */
45868 +void evtchn_device_upcall(int port);
45869 +
45870 +/* Mark a PIRQ as unavailable for dynamic allocation. */
45871 +void evtchn_register_pirq(int irq);
45872 +/* Map a Xen-supplied PIRQ to a dynamically allocated one. */
45873 +int evtchn_map_pirq(int irq, int xen_pirq);
45874 +/* Look up a Xen-supplied PIRQ for a dynamically allocated one. */
45875 +int evtchn_get_xen_pirq(int irq);
45876 +
45877 +void mask_evtchn(int port);
45878 +void disable_all_local_evtchn(void);
45879 +void unmask_evtchn(int port);
45880 +
45881 +#ifdef CONFIG_SMP
45882 +void rebind_evtchn_to_cpu(int port, unsigned int cpu);
45883 +#else
45884 +#define rebind_evtchn_to_cpu(port, cpu) ((void)0)
45885 +#endif
45886 +
45887 +static inline int test_and_set_evtchn_mask(int port)
45888 +{
45889 + shared_info_t *s = HYPERVISOR_shared_info;
45890 + return synch_test_and_set_bit(port, s->evtchn_mask);
45891 +}
45892 +
45893 +static inline void clear_evtchn(int port)
45894 +{
45895 + shared_info_t *s = HYPERVISOR_shared_info;
45896 + synch_clear_bit(port, s->evtchn_pending);
45897 +}
45898 +
45899 +static inline void notify_remote_via_evtchn(int port)
45900 +{
45901 + struct evtchn_send send = { .port = port };
45902 + VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send));
45903 +}
45904 +
45905 +/*
45906 + * Use these to access the event channel underlying the IRQ handle returned
45907 + * by bind_*_to_irqhandler().
45908 + */
45909 +void notify_remote_via_irq(int irq);
45910 +int irq_to_evtchn_port(int irq);
45911 +
45912 +#define PIRQ_SET_MAPPING 0x0
45913 +#define PIRQ_CLEAR_MAPPING 0x1
45914 +#define PIRQ_GET_MAPPING 0x3
45915 +int pirq_mapstatus(int pirq, int action);
45916 +int set_pirq_hw_action(int pirq, int (*action)(int pirq, int action));
45917 +int clear_pirq_hw_action(int pirq);
45918 +
45919 +#define PIRQ_STARTUP 1
45920 +#define PIRQ_SHUTDOWN 2
45921 +#define PIRQ_ENABLE 3
45922 +#define PIRQ_DISABLE 4
45923 +#define PIRQ_END 5
45924 +#define PIRQ_ACK 6
45925 +
45926 +#endif /* __ASM_EVTCHN_H__ */
45927 Index: head-2008-11-25/include/xen/firmware.h
45928 ===================================================================
45929 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45930 +++ head-2008-11-25/include/xen/firmware.h 2007-07-02 08:16:19.000000000 +0200
45931 @@ -0,0 +1,10 @@
45932 +#ifndef __XEN_FIRMWARE_H__
45933 +#define __XEN_FIRMWARE_H__
45934 +
45935 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
45936 +void copy_edd(void);
45937 +#endif
45938 +
45939 +void copy_edid(void);
45940 +
45941 +#endif /* __XEN_FIRMWARE_H__ */
45942 Index: head-2008-11-25/include/xen/gnttab.h
45943 ===================================================================
45944 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45945 +++ head-2008-11-25/include/xen/gnttab.h 2008-11-04 11:13:10.000000000 +0100
45946 @@ -0,0 +1,164 @@
45947 +/******************************************************************************
45948 + * gnttab.h
45949 + *
45950 + * Two sets of functionality:
45951 + * 1. Granting foreign access to our memory reservation.
45952 + * 2. Accessing others' memory reservations via grant references.
45953 + * (i.e., mechanisms for both sender and recipient of grant references)
45954 + *
45955 + * Copyright (c) 2004-2005, K A Fraser
45956 + * Copyright (c) 2005, Christopher Clark
45957 + *
45958 + * This program is free software; you can redistribute it and/or
45959 + * modify it under the terms of the GNU General Public License version 2
45960 + * as published by the Free Software Foundation; or, when distributed
45961 + * separately from the Linux kernel or incorporated into other
45962 + * software packages, subject to the following license:
45963 + *
45964 + * Permission is hereby granted, free of charge, to any person obtaining a copy
45965 + * of this source file (the "Software"), to deal in the Software without
45966 + * restriction, including without limitation the rights to use, copy, modify,
45967 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
45968 + * and to permit persons to whom the Software is furnished to do so, subject to
45969 + * the following conditions:
45970 + *
45971 + * The above copyright notice and this permission notice shall be included in
45972 + * all copies or substantial portions of the Software.
45973 + *
45974 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45975 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
45976 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45977 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
45978 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
45979 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
45980 + * IN THE SOFTWARE.
45981 + */
45982 +
45983 +#ifndef __ASM_GNTTAB_H__
45984 +#define __ASM_GNTTAB_H__
45985 +
45986 +#include <asm/hypervisor.h>
45987 +#include <asm/maddr.h> /* maddr_t */
45988 +#include <linux/mm.h>
45989 +#include <xen/interface/grant_table.h>
45990 +#include <xen/features.h>
45991 +
45992 +struct gnttab_free_callback {
45993 + struct gnttab_free_callback *next;
45994 + void (*fn)(void *);
45995 + void *arg;
45996 + u16 count;
45997 + u8 queued;
45998 +};
45999 +
46000 +int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
46001 + int flags);
46002 +
46003 +/*
46004 + * End access through the given grant reference, iff the grant entry is no
46005 + * longer in use. Return 1 if the grant entry was freed, 0 if it is still in
46006 + * use.
46007 + */
46008 +int gnttab_end_foreign_access_ref(grant_ref_t ref);
46009 +
46010 +/*
46011 + * Eventually end access through the given grant reference, and once that
46012 + * access has been ended, free the given page too. Access will be ended
46013 + * immediately iff the grant entry is not in use, otherwise it will happen
46014 + * some time later. page may be 0, in which case no freeing will occur.
46015 + */
46016 +void gnttab_end_foreign_access(grant_ref_t ref, unsigned long page);
46017 +
46018 +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
46019 +
46020 +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
46021 +unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
46022 +
46023 +int gnttab_query_foreign_access(grant_ref_t ref);
46024 +
46025 +/*
46026 + * operations on reserved batches of grant references
46027 + */
46028 +int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
46029 +
46030 +void gnttab_free_grant_reference(grant_ref_t ref);
46031 +
46032 +void gnttab_free_grant_references(grant_ref_t head);
46033 +
46034 +int gnttab_empty_grant_references(const grant_ref_t *pprivate_head);
46035 +
46036 +int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
46037 +
46038 +void gnttab_release_grant_reference(grant_ref_t *private_head,
46039 + grant_ref_t release);
46040 +
46041 +void gnttab_request_free_callback(struct gnttab_free_callback *callback,
46042 + void (*fn)(void *), void *arg, u16 count);
46043 +void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
46044 +
46045 +void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
46046 + unsigned long frame, int flags);
46047 +
46048 +void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
46049 + unsigned long pfn);
46050 +
46051 +int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep);
46052 +void __gnttab_dma_map_page(struct page *page);
46053 +static inline void __gnttab_dma_unmap_page(struct page *page)
46054 +{
46055 +}
46056 +
46057 +void gnttab_reset_grant_page(struct page *page);
46058 +
46059 +int gnttab_suspend(void);
46060 +int gnttab_resume(void);
46061 +
46062 +void *arch_gnttab_alloc_shared(unsigned long *frames);
46063 +
46064 +static inline void
46065 +gnttab_set_map_op(struct gnttab_map_grant_ref *map, maddr_t addr,
46066 + uint32_t flags, grant_ref_t ref, domid_t domid)
46067 +{
46068 + if (flags & GNTMAP_contains_pte)
46069 + map->host_addr = addr;
46070 + else if (xen_feature(XENFEAT_auto_translated_physmap))
46071 + map->host_addr = __pa(addr);
46072 + else
46073 + map->host_addr = addr;
46074 +
46075 + map->flags = flags;
46076 + map->ref = ref;
46077 + map->dom = domid;
46078 +}
46079 +
46080 +static inline void
46081 +gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, maddr_t addr,
46082 + uint32_t flags, grant_handle_t handle)
46083 +{
46084 + if (flags & GNTMAP_contains_pte)
46085 + unmap->host_addr = addr;
46086 + else if (xen_feature(XENFEAT_auto_translated_physmap))
46087 + unmap->host_addr = __pa(addr);
46088 + else
46089 + unmap->host_addr = addr;
46090 +
46091 + unmap->handle = handle;
46092 + unmap->dev_bus_addr = 0;
46093 +}
46094 +
46095 +static inline void
46096 +gnttab_set_replace_op(struct gnttab_unmap_and_replace *unmap, maddr_t addr,
46097 + maddr_t new_addr, grant_handle_t handle)
46098 +{
46099 + if (xen_feature(XENFEAT_auto_translated_physmap)) {
46100 + unmap->host_addr = __pa(addr);
46101 + unmap->new_addr = __pa(new_addr);
46102 + } else {
46103 + unmap->host_addr = addr;
46104 + unmap->new_addr = new_addr;
46105 + }
46106 +
46107 + unmap->handle = handle;
46108 +}
46109 +
46110 +#endif /* __ASM_GNTTAB_H__ */
46111 Index: head-2008-11-25/include/xen/hvm.h
46112 ===================================================================
46113 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46114 +++ head-2008-11-25/include/xen/hvm.h 2007-06-12 13:14:19.000000000 +0200
46115 @@ -0,0 +1,23 @@
46116 +/* Simple wrappers around HVM functions */
46117 +#ifndef XEN_HVM_H__
46118 +#define XEN_HVM_H__
46119 +
46120 +#include <xen/interface/hvm/params.h>
46121 +
46122 +static inline unsigned long hvm_get_parameter(int idx)
46123 +{
46124 + struct xen_hvm_param xhv;
46125 + int r;
46126 +
46127 + xhv.domid = DOMID_SELF;
46128 + xhv.index = idx;
46129 + r = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv);
46130 + if (r < 0) {
46131 + printk(KERN_ERR "cannot get hvm parameter %d: %d.\n",
46132 + idx, r);
46133 + return 0;
46134 + }
46135 + return xhv.value;
46136 +}
46137 +
46138 +#endif /* XEN_HVM_H__ */
46139 Index: head-2008-11-25/include/xen/hypercall.h
46140 ===================================================================
46141 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46142 +++ head-2008-11-25/include/xen/hypercall.h 2008-01-28 12:24:19.000000000 +0100
46143 @@ -0,0 +1,30 @@
46144 +#ifndef __XEN_HYPERCALL_H__
46145 +#define __XEN_HYPERCALL_H__
46146 +
46147 +#include <asm/hypercall.h>
46148 +
46149 +static inline int __must_check
46150 +HYPERVISOR_multicall_check(
46151 + multicall_entry_t *call_list, unsigned int nr_calls,
46152 + const unsigned long *rc_list)
46153 +{
46154 + int rc = HYPERVISOR_multicall(call_list, nr_calls);
46155 +
46156 + if (unlikely(rc < 0))
46157 + return rc;
46158 + BUG_ON(rc);
46159 + BUG_ON((int)nr_calls < 0);
46160 +
46161 + for ( ; nr_calls > 0; --nr_calls, ++call_list)
46162 + if (unlikely(call_list->result != (rc_list ? *rc_list++ : 0)))
46163 + return nr_calls;
46164 +
46165 + return 0;
46166 +}
46167 +
46168 +/* A construct to ignore the return value of hypercall wrappers in a few
46169 + * exceptional cases (simply casting the function result to void doesn't
46170 + * avoid the compiler warning): */
46171 +#define VOID(expr) ((void)((expr)?:0))
46172 +
46173 +#endif /* __XEN_HYPERCALL_H__ */
46174 Index: head-2008-11-25/include/xen/hypervisor_sysfs.h
46175 ===================================================================
46176 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46177 +++ head-2008-11-25/include/xen/hypervisor_sysfs.h 2007-06-22 09:08:06.000000000 +0200
46178 @@ -0,0 +1,30 @@
46179 +/*
46180 + * copyright (c) 2006 IBM Corporation
46181 + * Authored by: Mike D. Day <ncmike@us.ibm.com>
46182 + *
46183 + * This program is free software; you can redistribute it and/or modify
46184 + * it under the terms of the GNU General Public License version 2 as
46185 + * published by the Free Software Foundation.
46186 + */
46187 +
46188 +#ifndef _HYP_SYSFS_H_
46189 +#define _HYP_SYSFS_H_
46190 +
46191 +#include <linux/kobject.h>
46192 +#include <linux/sysfs.h>
46193 +
46194 +#define HYPERVISOR_ATTR_RO(_name) \
46195 +static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name)
46196 +
46197 +#define HYPERVISOR_ATTR_RW(_name) \
46198 +static struct hyp_sysfs_attr _name##_attr = \
46199 + __ATTR(_name, 0644, _name##_show, _name##_store)
46200 +
46201 +struct hyp_sysfs_attr {
46202 + struct attribute attr;
46203 + ssize_t (*show)(struct hyp_sysfs_attr *, char *);
46204 + ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
46205 + void *hyp_attr_data;
46206 +};
46207 +
46208 +#endif /* _HYP_SYSFS_H_ */
46209 Index: head-2008-11-25/include/xen/pcifront.h
46210 ===================================================================
46211 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46212 +++ head-2008-11-25/include/xen/pcifront.h 2007-06-18 08:38:13.000000000 +0200
46213 @@ -0,0 +1,83 @@
46214 +/*
46215 + * PCI Frontend - arch-dependent declarations
46216 + *
46217 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
46218 + */
46219 +#ifndef __XEN_ASM_PCIFRONT_H__
46220 +#define __XEN_ASM_PCIFRONT_H__
46221 +
46222 +#include <linux/spinlock.h>
46223 +
46224 +#ifdef __KERNEL__
46225 +
46226 +#ifndef __ia64__
46227 +
46228 +struct pcifront_device;
46229 +struct pci_bus;
46230 +
46231 +struct pcifront_sd {
46232 + int domain;
46233 + struct pcifront_device *pdev;
46234 +};
46235 +
46236 +static inline struct pcifront_device *
46237 +pcifront_get_pdev(struct pcifront_sd *sd)
46238 +{
46239 + return sd->pdev;
46240 +}
46241 +
46242 +static inline void pcifront_init_sd(struct pcifront_sd *sd,
46243 + unsigned int domain, unsigned int bus,
46244 + struct pcifront_device *pdev)
46245 +{
46246 + sd->domain = domain;
46247 + sd->pdev = pdev;
46248 +}
46249 +
46250 +#if defined(CONFIG_PCI_DOMAINS)
46251 +static inline int pci_domain_nr(struct pci_bus *bus)
46252 +{
46253 + struct pcifront_sd *sd = bus->sysdata;
46254 + return sd->domain;
46255 +}
46256 +static inline int pci_proc_domain(struct pci_bus *bus)
46257 +{
46258 + return pci_domain_nr(bus);
46259 +}
46260 +#endif /* CONFIG_PCI_DOMAINS */
46261 +
46262 +static inline void pcifront_setup_root_resources(struct pci_bus *bus,
46263 + struct pcifront_sd *sd)
46264 +{
46265 +}
46266 +
46267 +#else /* __ia64__ */
46268 +
46269 +#include <linux/acpi.h>
46270 +#include <asm/pci.h>
46271 +#define pcifront_sd pci_controller
46272 +
46273 +extern void xen_add_resource(struct pci_controller *, unsigned int,
46274 + unsigned int, struct acpi_resource *);
46275 +extern void xen_pcibios_setup_root_windows(struct pci_bus *,
46276 + struct pci_controller *);
46277 +
46278 +static inline struct pcifront_device *
46279 +pcifront_get_pdev(struct pcifront_sd *sd)
46280 +{
46281 + return (struct pcifront_device *)sd->platform_data;
46282 +}
46283 +
46284 +static inline void pcifront_setup_root_resources(struct pci_bus *bus,
46285 + struct pcifront_sd *sd)
46286 +{
46287 + xen_pcibios_setup_root_windows(bus, sd);
46288 +}
46289 +
46290 +#endif /* __ia64__ */
46291 +
46292 +extern struct rw_semaphore pci_bus_sem;
46293 +
46294 +#endif /* __KERNEL__ */
46295 +
46296 +#endif /* __XEN_ASM_PCIFRONT_H__ */
46297 Index: head-2008-11-25/include/xen/public/evtchn.h
46298 ===================================================================
46299 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46300 +++ head-2008-11-25/include/xen/public/evtchn.h 2007-06-12 13:14:19.000000000 +0200
46301 @@ -0,0 +1,88 @@
46302 +/******************************************************************************
46303 + * evtchn.h
46304 + *
46305 + * Interface to /dev/xen/evtchn.
46306 + *
46307 + * Copyright (c) 2003-2005, K A Fraser
46308 + *
46309 + * This program is free software; you can redistribute it and/or
46310 + * modify it under the terms of the GNU General Public License version 2
46311 + * as published by the Free Software Foundation; or, when distributed
46312 + * separately from the Linux kernel or incorporated into other
46313 + * software packages, subject to the following license:
46314 + *
46315 + * Permission is hereby granted, free of charge, to any person obtaining a copy
46316 + * of this source file (the "Software"), to deal in the Software without
46317 + * restriction, including without limitation the rights to use, copy, modify,
46318 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
46319 + * and to permit persons to whom the Software is furnished to do so, subject to
46320 + * the following conditions:
46321 + *
46322 + * The above copyright notice and this permission notice shall be included in
46323 + * all copies or substantial portions of the Software.
46324 + *
46325 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46326 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46327 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
46328 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46329 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
46330 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
46331 + * IN THE SOFTWARE.
46332 + */
46333 +
46334 +#ifndef __LINUX_PUBLIC_EVTCHN_H__
46335 +#define __LINUX_PUBLIC_EVTCHN_H__
46336 +
46337 +/*
46338 + * Bind a fresh port to VIRQ @virq.
46339 + * Return allocated port.
46340 + */
46341 +#define IOCTL_EVTCHN_BIND_VIRQ \
46342 + _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
46343 +struct ioctl_evtchn_bind_virq {
46344 + unsigned int virq;
46345 +};
46346 +
46347 +/*
46348 + * Bind a fresh port to remote <@remote_domain, @remote_port>.
46349 + * Return allocated port.
46350 + */
46351 +#define IOCTL_EVTCHN_BIND_INTERDOMAIN \
46352 + _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
46353 +struct ioctl_evtchn_bind_interdomain {
46354 + unsigned int remote_domain, remote_port;
46355 +};
46356 +
46357 +/*
46358 + * Allocate a fresh port for binding to @remote_domain.
46359 + * Return allocated port.
46360 + */
46361 +#define IOCTL_EVTCHN_BIND_UNBOUND_PORT \
46362 + _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
46363 +struct ioctl_evtchn_bind_unbound_port {
46364 + unsigned int remote_domain;
46365 +};
46366 +
46367 +/*
46368 + * Unbind previously allocated @port.
46369 + */
46370 +#define IOCTL_EVTCHN_UNBIND \
46371 + _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
46372 +struct ioctl_evtchn_unbind {
46373 + unsigned int port;
46374 +};
46375 +
46376 +/*
46377 + * Send event to previously allocated @port.
46378 + */
46379 +#define IOCTL_EVTCHN_NOTIFY \
46380 + _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
46381 +struct ioctl_evtchn_notify {
46382 + unsigned int port;
46383 +};
46384 +
46385 +/* Clear and reinitialise the event buffer. Clear error condition. */
46386 +#define IOCTL_EVTCHN_RESET \
46387 + _IOC(_IOC_NONE, 'E', 5, 0)
46388 +
46389 +#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
46390 Index: head-2008-11-25/include/xen/public/gntdev.h
46391 ===================================================================
46392 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46393 +++ head-2008-11-25/include/xen/public/gntdev.h 2008-04-02 12:34:02.000000000 +0200
46394 @@ -0,0 +1,119 @@
46395 +/******************************************************************************
46396 + * gntdev.h
46397 + *
46398 + * Interface to /dev/xen/gntdev.
46399 + *
46400 + * Copyright (c) 2007, D G Murray
46401 + *
46402 + * This program is free software; you can redistribute it and/or
46403 + * modify it under the terms of the GNU General Public License version 2
46404 + * as published by the Free Software Foundation; or, when distributed
46405 + * separately from the Linux kernel or incorporated into other
46406 + * software packages, subject to the following license:
46407 + *
46408 + * Permission is hereby granted, free of charge, to any person obtaining a copy
46409 + * of this source file (the "Software"), to deal in the Software without
46410 + * restriction, including without limitation the rights to use, copy, modify,
46411 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
46412 + * and to permit persons to whom the Software is furnished to do so, subject to
46413 + * the following conditions:
46414 + *
46415 + * The above copyright notice and this permission notice shall be included in
46416 + * all copies or substantial portions of the Software.
46417 + *
46418 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46419 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46420 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
46421 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46422 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
46423 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
46424 + * IN THE SOFTWARE.
46425 + */
46426 +
46427 +#ifndef __LINUX_PUBLIC_GNTDEV_H__
46428 +#define __LINUX_PUBLIC_GNTDEV_H__
46429 +
46430 +struct ioctl_gntdev_grant_ref {
46431 + /* The domain ID of the grant to be mapped. */
46432 + uint32_t domid;
46433 + /* The grant reference of the grant to be mapped. */
46434 + uint32_t ref;
46435 +};
46436 +
46437 +/*
46438 + * Inserts the grant references into the mapping table of an instance
46439 + * of gntdev. N.B. This does not perform the mapping, which is deferred
46440 + * until mmap() is called with @index as the offset.
46441 + */
46442 +#define IOCTL_GNTDEV_MAP_GRANT_REF \
46443 +_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
46444 +struct ioctl_gntdev_map_grant_ref {
46445 + /* IN parameters */
46446 + /* The number of grants to be mapped. */
46447 + uint32_t count;
46448 + uint32_t pad;
46449 + /* OUT parameters */
46450 + /* The offset to be used on a subsequent call to mmap(). */
46451 + uint64_t index;
46452 + /* Variable IN parameter. */
46453 + /* Array of grant references, of size @count. */
46454 + struct ioctl_gntdev_grant_ref refs[1];
46455 +};
46456 +
46457 +/*
46458 + * Removes the grant references from the mapping table of an instance of
46459 + * gntdev. N.B. munmap() must be called on the relevant virtual address(es)
46460 + * before this ioctl is called, or an error will result.
46461 + */
46462 +#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
46463 +_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
46464 +struct ioctl_gntdev_unmap_grant_ref {
46465 + /* IN parameters */
46466 + /* The offset was returned by the corresponding map operation. */
46467 + uint64_t index;
46468 + /* The number of pages to be unmapped. */
46469 + uint32_t count;
46470 + uint32_t pad;
46471 +};
46472 +
46473 +/*
46474 + * Returns the offset in the driver's address space that corresponds
46475 + * to @vaddr. This can be used to perform a munmap(), followed by an
46476 + * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
46477 + * the caller. The number of pages that were allocated at the same time as
46478 + * @vaddr is returned in @count.
46479 + *
46480 + * N.B. Where more than one page has been mapped into a contiguous range, the
46481 + * supplied @vaddr must correspond to the start of the range; otherwise
46482 + * an error will result. It is only possible to munmap() the entire
46483 + * contiguously-allocated range at once, and not any subrange thereof.
46484 + */
46485 +#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
46486 +_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
46487 +struct ioctl_gntdev_get_offset_for_vaddr {
46488 + /* IN parameters */
46489 + /* The virtual address of the first mapped page in a range. */
46490 + uint64_t vaddr;
46491 + /* OUT parameters */
46492 + /* The offset that was used in the initial mmap() operation. */
46493 + uint64_t offset;
46494 + /* The number of pages mapped in the VM area that begins at @vaddr. */
46495 + uint32_t count;
46496 + uint32_t pad;
46497 +};
46498 +
46499 +/*
46500 + * Sets the maximum number of grants that may be mapped at once by this gntdev
46501 + * instance.
46502 + *
46503 + * N.B. This must be called before any other ioctl is performed on the device.
46504 + */
46505 +#define IOCTL_GNTDEV_SET_MAX_GRANTS \
46506 +_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
46507 +struct ioctl_gntdev_set_max_grants {
46508 + /* IN parameter */
46509 + /* The maximum number of grants that may be mapped at once. */
46510 + uint32_t count;
46511 +};
46512 +
46513 +#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
46514 Index: head-2008-11-25/include/xen/public/privcmd.h
46515 ===================================================================
46516 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46517 +++ head-2008-11-25/include/xen/public/privcmd.h 2007-06-12 13:14:19.000000000 +0200
46518 @@ -0,0 +1,79 @@
46519 +/******************************************************************************
46520 + * privcmd.h
46521 + *
46522 + * Interface to /proc/xen/privcmd.
46523 + *
46524 + * Copyright (c) 2003-2005, K A Fraser
46525 + *
46526 + * This program is free software; you can redistribute it and/or
46527 + * modify it under the terms of the GNU General Public License version 2
46528 + * as published by the Free Software Foundation; or, when distributed
46529 + * separately from the Linux kernel or incorporated into other
46530 + * software packages, subject to the following license:
46531 + *
46532 + * Permission is hereby granted, free of charge, to any person obtaining a copy
46533 + * of this source file (the "Software"), to deal in the Software without
46534 + * restriction, including without limitation the rights to use, copy, modify,
46535 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
46536 + * and to permit persons to whom the Software is furnished to do so, subject to
46537 + * the following conditions:
46538 + *
46539 + * The above copyright notice and this permission notice shall be included in
46540 + * all copies or substantial portions of the Software.
46541 + *
46542 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46543 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46544 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
46545 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46546 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
46547 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
46548 + * IN THE SOFTWARE.
46549 + */
46550 +
46551 +#ifndef __LINUX_PUBLIC_PRIVCMD_H__
46552 +#define __LINUX_PUBLIC_PRIVCMD_H__
46553 +
46554 +#include <linux/types.h>
46555 +
46556 +#ifndef __user
46557 +#define __user
46558 +#endif
46559 +
46560 +typedef struct privcmd_hypercall
46561 +{
46562 + __u64 op;
46563 + __u64 arg[5];
46564 +} privcmd_hypercall_t;
46565 +
46566 +typedef struct privcmd_mmap_entry {
46567 + __u64 va;
46568 + __u64 mfn;
46569 + __u64 npages;
46570 +} privcmd_mmap_entry_t;
46571 +
46572 +typedef struct privcmd_mmap {
46573 + int num;
46574 + domid_t dom; /* target domain */
46575 + privcmd_mmap_entry_t __user *entry;
46576 +} privcmd_mmap_t;
46577 +
46578 +typedef struct privcmd_mmapbatch {
46579 + int num; /* number of pages to populate */
46580 + domid_t dom; /* target domain */
46581 + __u64 addr; /* virtual address */
46582 + xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
46583 +} privcmd_mmapbatch_t;
46584 +
46585 +/*
46586 + * @cmd: IOCTL_PRIVCMD_HYPERCALL
46587 + * @arg: &privcmd_hypercall_t
46588 + * Return: Value returned from execution of the specified hypercall.
46589 + */
46590 +#define IOCTL_PRIVCMD_HYPERCALL \
46591 + _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t))
46592 +#define IOCTL_PRIVCMD_MMAP \
46593 + _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t))
46594 +#define IOCTL_PRIVCMD_MMAPBATCH \
46595 + _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t))
46596 +
46597 +#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
46598 Index: head-2008-11-25/include/xen/xen_proc.h
46599 ===================================================================
46600 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46601 +++ head-2008-11-25/include/xen/xen_proc.h 2007-06-12 13:14:19.000000000 +0200
46602 @@ -0,0 +1,12 @@
46603 +
46604 +#ifndef __ASM_XEN_PROC_H__
46605 +#define __ASM_XEN_PROC_H__
46606 +
46607 +#include <linux/proc_fs.h>
46608 +
46609 +extern struct proc_dir_entry *create_xen_proc_entry(
46610 + const char *name, mode_t mode);
46611 +extern void remove_xen_proc_entry(
46612 + const char *name);
46613 +
46614 +#endif /* __ASM_XEN_PROC_H__ */
46615 Index: head-2008-11-25/include/xen/xencons.h
46616 ===================================================================
46617 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46618 +++ head-2008-11-25/include/xen/xencons.h 2007-10-15 09:39:38.000000000 +0200
46619 @@ -0,0 +1,17 @@
46620 +#ifndef __ASM_XENCONS_H__
46621 +#define __ASM_XENCONS_H__
46622 +
46623 +struct dom0_vga_console_info;
46624 +void dom0_init_screen_info(const struct dom0_vga_console_info *, size_t);
46625 +
46626 +void xencons_force_flush(void);
46627 +void xencons_resume(void);
46628 +
46629 +/* Interrupt work hooks. Receive data, or kick data out. */
46630 +void xencons_rx(char *buf, unsigned len, struct pt_regs *regs);
46631 +void xencons_tx(void);
46632 +
46633 +int xencons_ring_init(void);
46634 +int xencons_ring_send(const char *data, unsigned len);
46635 +
46636 +#endif /* __ASM_XENCONS_H__ */
46637 Index: head-2008-11-25/include/xen/xenoprof.h
46638 ===================================================================
46639 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46640 +++ head-2008-11-25/include/xen/xenoprof.h 2007-06-12 13:14:19.000000000 +0200
46641 @@ -0,0 +1,42 @@
46642 +/******************************************************************************
46643 + * xen/xenoprof.h
46644 + *
46645 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
46646 + * VA Linux Systems Japan K.K.
46647 + *
46648 + * This program is free software; you can redistribute it and/or modify
46649 + * it under the terms of the GNU General Public License as published by
46650 + * the Free Software Foundation; either version 2 of the License, or
46651 + * (at your option) any later version.
46652 + *
46653 + * This program is distributed in the hope that it will be useful,
46654 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
46655 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
46656 + * GNU General Public License for more details.
46657 + *
46658 + * You should have received a copy of the GNU General Public License
46659 + * along with this program; if not, write to the Free Software
46660 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
46661 + *
46662 + */
46663 +
46664 +#ifndef __XEN_XENOPROF_H__
46665 +#define __XEN_XENOPROF_H__
46666 +#ifdef CONFIG_XEN
46667 +
46668 +#include <asm/xenoprof.h>
46669 +
46670 +struct oprofile_operations;
46671 +int xenoprofile_init(struct oprofile_operations * ops);
46672 +void xenoprofile_exit(void);
46673 +
46674 +struct xenoprof_shared_buffer {
46675 + char *buffer;
46676 + struct xenoprof_arch_shared_buffer arch;
46677 +};
46678 +#else
46679 +#define xenoprofile_init(ops) (-ENOSYS)
46680 +#define xenoprofile_exit() do { } while (0)
46681 +
46682 +#endif /* CONFIG_XEN */
46683 +#endif /* __XEN_XENOPROF_H__ */
46684 Index: head-2008-11-25/lib/swiotlb-xen.c
46685 ===================================================================
46686 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46687 +++ head-2008-11-25/lib/swiotlb-xen.c 2008-09-15 13:40:15.000000000 +0200
46688 @@ -0,0 +1,739 @@
46689 +/*
46690 + * Dynamic DMA mapping support.
46691 + *
46692 + * This implementation is a fallback for platforms that do not support
46693 + * I/O TLBs (aka DMA address translation hardware).
46694 + * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
46695 + * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
46696 + * Copyright (C) 2000, 2003 Hewlett-Packard Co
46697 + * David Mosberger-Tang <davidm@hpl.hp.com>
46698 + * Copyright (C) 2005 Keir Fraser <keir@xensource.com>
46699 + */
46700 +
46701 +#include <linux/cache.h>
46702 +#include <linux/mm.h>
46703 +#include <linux/module.h>
46704 +#include <linux/pci.h>
46705 +#include <linux/spinlock.h>
46706 +#include <linux/string.h>
46707 +#include <linux/types.h>
46708 +#include <linux/ctype.h>
46709 +#include <linux/init.h>
46710 +#include <linux/bootmem.h>
46711 +#include <linux/highmem.h>
46712 +#include <asm/io.h>
46713 +#include <asm/pci.h>
46714 +#include <asm/dma.h>
46715 +#include <asm/uaccess.h>
46716 +#include <xen/gnttab.h>
46717 +#include <xen/interface/memory.h>
46718 +#include <asm-i386/mach-xen/asm/gnttab_dma.h>
46719 +
46720 +int swiotlb;
46721 +EXPORT_SYMBOL(swiotlb);
46722 +
46723 +#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
46724 +
46725 +/*
46726 + * Maximum allowable number of contiguous slabs to map,
46727 + * must be a power of 2. What is the appropriate value?
46728 + * The complexity of {map,unmap}_single is linearly dependent on this value.
46729 + */
46730 +#define IO_TLB_SEGSIZE 128
46731 +
46732 +/*
46733 + * log of the size of each IO TLB slab. The number of slabs is command line
46734 + * controllable.
46735 + */
46736 +#define IO_TLB_SHIFT 11
46737 +
46738 +int swiotlb_force;
46739 +
46740 +static char *iotlb_virt_start;
46741 +static unsigned long iotlb_nslabs;
46742 +
46743 +/*
46744 + * Used to do a quick range check in swiotlb_unmap_single and
46745 + * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
46746 + * API.
46747 + */
46748 +static unsigned long iotlb_pfn_start, iotlb_pfn_end;
46749 +
46750 +/* Does the given dma address reside within the swiotlb aperture? */
46751 +static inline int in_swiotlb_aperture(dma_addr_t dev_addr)
46752 +{
46753 + unsigned long pfn = mfn_to_local_pfn(dev_addr >> PAGE_SHIFT);
46754 + return (pfn_valid(pfn)
46755 + && (pfn >= iotlb_pfn_start)
46756 + && (pfn < iotlb_pfn_end));
46757 +}
46758 +
46759 +/*
46760 + * When the IOMMU overflows we return a fallback buffer. This sets the size.
46761 + */
46762 +static unsigned long io_tlb_overflow = 32*1024;
46763 +
46764 +void *io_tlb_overflow_buffer;
46765 +
46766 +/*
46767 + * This is a free list describing the number of free entries available from
46768 + * each index
46769 + */
46770 +static unsigned int *io_tlb_list;
46771 +static unsigned int io_tlb_index;
46772 +
46773 +/*
46774 + * We need to save away the original address corresponding to a mapped entry
46775 + * for the sync operations.
46776 + */
46777 +static struct phys_addr {
46778 + struct page *page;
46779 + unsigned int offset;
46780 +} *io_tlb_orig_addr;
46781 +
46782 +/*
46783 + * Protect the above data structures in the map and unmap calls
46784 + */
46785 +static DEFINE_SPINLOCK(io_tlb_lock);
46786 +
46787 +static unsigned int dma_bits;
46788 +static unsigned int __initdata max_dma_bits = 32;
46789 +static int __init
46790 +setup_dma_bits(char *str)
46791 +{
46792 + max_dma_bits = simple_strtoul(str, NULL, 0);
46793 + return 0;
46794 +}
46795 +__setup("dma_bits=", setup_dma_bits);
46796 +
46797 +static int __init
46798 +setup_io_tlb_npages(char *str)
46799 +{
46800 + /* Unlike ia64, the size is the aperture size in megabytes, not 'slabs'! */
46801 + if (isdigit(*str)) {
46802 + iotlb_nslabs = simple_strtoul(str, &str, 0) <<
46803 + (20 - IO_TLB_SHIFT);
46804 + iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
46805 + }
46806 + if (*str == ',')
46807 + ++str;
46808 + /*
46809 + * NB. 'force' enables the swiotlb, but doesn't force its use for
46810 + * every DMA like it does on native Linux. 'off' forcibly disables
46811 + * use of the swiotlb.
46812 + */
46813 + if (!strcmp(str, "force"))
46814 + swiotlb_force = 1;
46815 + else if (!strcmp(str, "off"))
46816 + swiotlb_force = -1;
46817 + return 1;
46818 +}
46819 +__setup("swiotlb=", setup_io_tlb_npages);
46820 +/* make io_tlb_overflow tunable too? */
46821 +
46822 +/*
46823 + * Statically reserve bounce buffer space and initialize bounce buffer data
46824 + * structures for the software IO TLB used to implement the PCI DMA API.
46825 + */
46826 +void
46827 +swiotlb_init_with_default_size (size_t default_size)
46828 +{
46829 + unsigned long i, bytes;
46830 + int rc;
46831 +
46832 + if (!iotlb_nslabs) {
46833 + iotlb_nslabs = (default_size >> IO_TLB_SHIFT);
46834 + iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
46835 + }
46836 +
46837 + bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT);
46838 +
46839 + /*
46840 + * Get IO TLB memory from the low pages
46841 + */
46842 + iotlb_virt_start = alloc_bootmem_low_pages(bytes);
46843 + if (!iotlb_virt_start)
46844 + panic("Cannot allocate SWIOTLB buffer!\n");
46845 +
46846 + dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
46847 + for (i = 0; i < iotlb_nslabs; i += IO_TLB_SEGSIZE) {
46848 + do {
46849 + rc = xen_create_contiguous_region(
46850 + (unsigned long)iotlb_virt_start + (i << IO_TLB_SHIFT),
46851 + get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT),
46852 + dma_bits);
46853 + } while (rc && dma_bits++ < max_dma_bits);
46854 + if (rc) {
46855 + if (i == 0)
46856 + panic("No suitable physical memory available for SWIOTLB buffer!\n"
46857 + "Use dom0_mem Xen boot parameter to reserve\n"
46858 + "some DMA memory (e.g., dom0_mem=-128M).\n");
46859 + iotlb_nslabs = i;
46860 + i <<= IO_TLB_SHIFT;
46861 + free_bootmem(__pa(iotlb_virt_start + i), bytes - i);
46862 + bytes = i;
46863 + for (dma_bits = 0; i > 0; i -= IO_TLB_SEGSIZE << IO_TLB_SHIFT) {
46864 + unsigned int bits = fls64(virt_to_bus(iotlb_virt_start + i - 1));
46865 +
46866 + if (bits > dma_bits)
46867 + dma_bits = bits;
46868 + }
46869 + break;
46870 + }
46871 + }
46872 +
46873 + /*
46874 + * Allocate and initialize the free list array. This array is used
46875 + * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE.
46876 + */
46877 + io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int));
46878 + for (i = 0; i < iotlb_nslabs; i++)
46879 + io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
46880 + io_tlb_index = 0;
46881 + io_tlb_orig_addr = alloc_bootmem(
46882 + iotlb_nslabs * sizeof(*io_tlb_orig_addr));
46883 +
46884 + /*
46885 + * Get the overflow emergency buffer
46886 + */
46887 + io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
46888 + if (!io_tlb_overflow_buffer)
46889 + panic("Cannot allocate SWIOTLB overflow buffer!\n");
46890 +
46891 + do {
46892 + rc = xen_create_contiguous_region(
46893 + (unsigned long)io_tlb_overflow_buffer,
46894 + get_order(io_tlb_overflow),
46895 + dma_bits);
46896 + } while (rc && dma_bits++ < max_dma_bits);
46897 + if (rc)
46898 + panic("No suitable physical memory available for SWIOTLB overflow buffer!\n");
46899 +
46900 + iotlb_pfn_start = __pa(iotlb_virt_start) >> PAGE_SHIFT;
46901 + iotlb_pfn_end = iotlb_pfn_start + (bytes >> PAGE_SHIFT);
46902 +
46903 + printk(KERN_INFO "Software IO TLB enabled: \n"
46904 + " Aperture: %lu megabytes\n"
46905 + " Kernel range: %p - %p\n"
46906 + " Address size: %u bits\n",
46907 + bytes >> 20,
46908 + iotlb_virt_start, iotlb_virt_start + bytes,
46909 + dma_bits);
46910 +}
46911 +
46912 +void
46913 +swiotlb_init(void)
46914 +{
46915 + long ram_end;
46916 + size_t defsz = 64 * (1 << 20); /* 64MB default size */
46917 +
46918 + if (swiotlb_force == 1) {
46919 + swiotlb = 1;
46920 + } else if ((swiotlb_force != -1) &&
46921 + is_running_on_xen() &&
46922 + is_initial_xendomain()) {
46923 + /* Domain 0 always has a swiotlb. */
46924 + ram_end = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
46925 + if (ram_end <= 0x7ffff)
46926 + defsz = 2 * (1 << 20); /* 2MB on systems with <2GB RAM. */
46927 + swiotlb = 1;
46928 + }
46929 +
46930 + if (swiotlb)
46931 + swiotlb_init_with_default_size(defsz);
46932 + else
46933 + printk(KERN_INFO "Software IO TLB disabled\n");
46934 +}
46935 +
46936 +/*
46937 + * We use __copy_to_user_inatomic to transfer to the host buffer because the
46938 + * buffer may be mapped read-only (e.g., in blkback driver) but lower-level
46939 + * drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an
46940 + * unnecessary copy from the aperture to the host buffer, and a page fault.
46941 + */
46942 +static void
46943 +__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir)
46944 +{
46945 + if (PageHighMem(buffer.page)) {
46946 + size_t len, bytes;
46947 + char *dev, *host, *kmp;
46948 + len = size;
46949 + while (len != 0) {
46950 + unsigned long flags;
46951 +
46952 + if (((bytes = len) + buffer.offset) > PAGE_SIZE)
46953 + bytes = PAGE_SIZE - buffer.offset;
46954 + local_irq_save(flags); /* protects KM_BOUNCE_READ */
46955 + kmp = kmap_atomic(buffer.page, KM_BOUNCE_READ);
46956 + dev = dma_addr + size - len;
46957 + host = kmp + buffer.offset;
46958 + if (dir == DMA_FROM_DEVICE) {
46959 + if (__copy_to_user_inatomic(host, dev, bytes))
46960 + /* inaccessible */;
46961 + } else
46962 + memcpy(dev, host, bytes);
46963 + kunmap_atomic(kmp, KM_BOUNCE_READ);
46964 + local_irq_restore(flags);
46965 + len -= bytes;
46966 + buffer.page++;
46967 + buffer.offset = 0;
46968 + }
46969 + } else {
46970 + char *host = (char *)phys_to_virt(
46971 + page_to_pseudophys(buffer.page)) + buffer.offset;
46972 + if (dir == DMA_FROM_DEVICE) {
46973 + if (__copy_to_user_inatomic(host, dma_addr, size))
46974 + /* inaccessible */;
46975 + } else if (dir == DMA_TO_DEVICE)
46976 + memcpy(dma_addr, host, size);
46977 + }
46978 +}
46979 +
46980 +/*
46981 + * Allocates bounce buffer and returns its kernel virtual address.
46982 + */
46983 +static void *
46984 +map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir)
46985 +{
46986 + unsigned long flags;
46987 + char *dma_addr;
46988 + unsigned int nslots, stride, index, wrap;
46989 + struct phys_addr slot_buf;
46990 + int i;
46991 +
46992 + /*
46993 + * For mappings greater than a page, we limit the stride (and
46994 + * hence alignment) to a page size.
46995 + */
46996 + nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
46997 + if (size > PAGE_SIZE)
46998 + stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
46999 + else
47000 + stride = 1;
47001 +
47002 + BUG_ON(!nslots);
47003 +
47004 + /*
47005 +	 * Find a run of IO TLB slots large enough to fit this request and
47006 +	 * allocate the bounce buffer from that part of the IO TLB pool.
47007 + */
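+	/*
+	 * io_tlb_list[i] holds the number of contiguous free slots starting
+	 * at slot i (0 means the slot is in use).  Free runs never cross an
+	 * IO_TLB_SEGSIZE boundary, which is why the merge loops here and in
+	 * unmap_single() stop at segment edges.
+	 */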
47008 + spin_lock_irqsave(&io_tlb_lock, flags);
47009 + {
47010 + wrap = index = ALIGN(io_tlb_index, stride);
47011 +
47012 + if (index >= iotlb_nslabs)
47013 + wrap = index = 0;
47014 +
47015 + do {
47016 + /*
47017 +			 * If this slot indicates that 'nslots' contiguous
47018 +			 * buffers are available, allocate the buffers from
47019 +			 * this slot onwards and mark the entries as '0',
47020 +			 * i.e. unavailable.
47021 + */
47022 + if (io_tlb_list[index] >= nslots) {
47023 + int count = 0;
47024 +
47025 + for (i = index; i < (int)(index + nslots); i++)
47026 + io_tlb_list[i] = 0;
47027 + for (i = index - 1;
47028 + (OFFSET(i, IO_TLB_SEGSIZE) !=
47029 + IO_TLB_SEGSIZE -1) && io_tlb_list[i];
47030 + i--)
47031 + io_tlb_list[i] = ++count;
47032 + dma_addr = iotlb_virt_start +
47033 + (index << IO_TLB_SHIFT);
47034 +
47035 + /*
47036 + * Update the indices to avoid searching in
47037 + * the next round.
47038 + */
47039 + io_tlb_index =
47040 + ((index + nslots) < iotlb_nslabs
47041 + ? (index + nslots) : 0);
47042 +
47043 + goto found;
47044 + }
47045 + index += stride;
47046 + if (index >= iotlb_nslabs)
47047 + index = 0;
47048 + } while (index != wrap);
47049 +
47050 + spin_unlock_irqrestore(&io_tlb_lock, flags);
47051 + return NULL;
47052 + }
47053 + found:
47054 + spin_unlock_irqrestore(&io_tlb_lock, flags);
47055 +
47056 + /*
47057 + * Save away the mapping from the original address to the DMA address.
47058 + * This is needed when we sync the memory. Then we sync the buffer if
47059 + * needed.
47060 + */
47061 + slot_buf = buffer;
47062 + for (i = 0; i < nslots; i++) {
47063 + slot_buf.page += slot_buf.offset >> PAGE_SHIFT;
47064 + slot_buf.offset &= PAGE_SIZE - 1;
47065 + io_tlb_orig_addr[index+i] = slot_buf;
47066 + slot_buf.offset += 1 << IO_TLB_SHIFT;
47067 + }
47068 + if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL))
47069 + __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
47070 +
47071 + return dma_addr;
47072 +}
47073 +
47074 +static struct phys_addr dma_addr_to_phys_addr(char *dma_addr)
47075 +{
47076 + int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
47077 + struct phys_addr buffer = io_tlb_orig_addr[index];
47078 + buffer.offset += (long)dma_addr & ((1 << IO_TLB_SHIFT) - 1);
47079 + buffer.page += buffer.offset >> PAGE_SHIFT;
47080 + buffer.offset &= PAGE_SIZE - 1;
47081 + return buffer;
47082 +}
47083 +
47084 +/*
47085 + * dma_addr is the kernel virtual address of the bounce buffer to unmap.
47086 + */
47087 +static void
47088 +unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
47089 +{
47090 + unsigned long flags;
47091 + int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
47092 + int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
47093 + struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr);
47094 +
47095 + /*
47096 + * First, sync the memory before unmapping the entry
47097 + */
47098 + if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))
47099 + __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
47100 +
47101 + /*
47102 + * Return the buffer to the free list by setting the corresponding
47103 +	 * entries to indicate the number of contiguous entries available.
47104 + * While returning the entries to the free list, we merge the entries
47105 + * with slots below and above the pool being returned.
47106 + */
47107 + spin_lock_irqsave(&io_tlb_lock, flags);
47108 + {
47109 + count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
47110 + io_tlb_list[index + nslots] : 0);
47111 + /*
47112 + * Step 1: return the slots to the free list, merging the
47113 +		 * slots with succeeding slots
47114 + */
47115 + for (i = index + nslots - 1; i >= index; i--)
47116 + io_tlb_list[i] = ++count;
47117 + /*
47118 + * Step 2: merge the returned slots with the preceding slots,
47119 +		 * if available (non-zero)
47120 + */
47121 + for (i = index - 1;
47122 + (OFFSET(i, IO_TLB_SEGSIZE) !=
47123 + IO_TLB_SEGSIZE -1) && io_tlb_list[i];
47124 + i--)
47125 + io_tlb_list[i] = ++count;
47126 + }
47127 + spin_unlock_irqrestore(&io_tlb_lock, flags);
47128 +}
47129 +
47130 +static void
47131 +sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
47132 +{
47133 + struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr);
47134 + BUG_ON((dir != DMA_FROM_DEVICE) && (dir != DMA_TO_DEVICE));
47135 + __sync_single(buffer, dma_addr, size, dir);
47136 +}
47137 +
47138 +static void
47139 +swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
47140 +{
47141 + /*
47142 + * Ran out of IOMMU space for this operation. This is very bad.
47143 +	 * Unfortunately the drivers cannot handle this operation properly
47144 +	 * unless they check for pci_dma_mapping_error() (most don't).
47145 +	 * When the mapping is small enough, return a static buffer to limit
47146 + * the damage, or panic when the transfer is too big.
47147 + */
47148 + printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at "
47149 + "device %s\n", (unsigned long)size, dev ? dev->bus_id : "?");
47150 +
47151 + if (size > io_tlb_overflow && do_panic) {
47152 + if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
47153 + panic("PCI-DMA: Memory would be corrupted\n");
47154 + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
47155 + panic("PCI-DMA: Random memory would be DMAed\n");
47156 + }
47157 +}
47158 +
47159 +/*
47160 + * Map a single buffer of the indicated size for DMA in streaming mode. The
47161 + * PCI address to use is returned.
47162 + *
47163 + * Once the device is given the dma address, the device owns this memory until
47164 + * either swiotlb_unmap_single() or swiotlb_sync_single_for_cpu() is performed.
47165 + */
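+/*
+ * Hypothetical caller sketch; start_device_dma() is a made-up placeholder
+ * for the device-specific part, the swiotlb_* calls are the ones defined
+ * in this file:
+ *
+ *	dma_addr_t bus = swiotlb_map_single(dev, buf, len, DMA_TO_DEVICE);
+ *	if (swiotlb_dma_mapping_error(bus))
+ *		return -ENOMEM;
+ *	start_device_dma(bus, len);
+ *	swiotlb_unmap_single(dev, bus, len, DMA_TO_DEVICE);
+ */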
47166 +dma_addr_t
47167 +swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
47168 +{
47169 + dma_addr_t dev_addr = gnttab_dma_map_page(virt_to_page(ptr)) +
47170 + offset_in_page(ptr);
47171 + void *map;
47172 + struct phys_addr buffer;
47173 +
47174 + BUG_ON(dir == DMA_NONE);
47175 +
47176 + /*
47177 + * If the pointer passed in happens to be in the device's DMA window,
47178 + * we can safely return the device addr and not worry about bounce
47179 + * buffering it.
47180 + */
47181 + if (!range_straddles_page_boundary(__pa(ptr), size) &&
47182 + !address_needs_mapping(hwdev, dev_addr))
47183 + return dev_addr;
47184 +
47185 + /*
47186 + * Oh well, have to allocate and map a bounce buffer.
47187 + */
47188 + gnttab_dma_unmap_page(dev_addr);
47189 + buffer.page = virt_to_page(ptr);
47190 + buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
47191 + map = map_single(hwdev, buffer, size, dir);
47192 + if (!map) {
47193 + swiotlb_full(hwdev, size, dir, 1);
47194 + map = io_tlb_overflow_buffer;
47195 + }
47196 +
47197 + dev_addr = virt_to_bus(map);
47198 + return dev_addr;
47199 +}
47200 +
47201 +/*
47202 + * Unmap a single streaming mode DMA translation. The dma_addr and size must
47203 + * match what was provided for in a previous swiotlb_map_single call. All
47204 + * other usages are undefined.
47205 + *
47206 + * After this call, reads by the cpu to the buffer are guaranteed to see
47207 + * whatever the device wrote there.
47208 + */
47209 +void
47210 +swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
47211 + int dir)
47212 +{
47213 + BUG_ON(dir == DMA_NONE);
47214 + if (in_swiotlb_aperture(dev_addr))
47215 + unmap_single(hwdev, bus_to_virt(dev_addr), size, dir);
47216 + else
47217 + gnttab_dma_unmap_page(dev_addr);
47218 +}
47219 +
47220 +/*
47221 + * Make physical memory consistent for a single streaming mode DMA translation
47222 + * after a transfer.
47223 + *
47224 + * If you perform a swiotlb_map_single() but wish to interrogate the buffer
47225 + * using the cpu, yet do not wish to tear down the PCI dma mapping, you must
47226 + * call this function before doing so. At the next point you give the PCI dma
47227 + * address back to the card, you must first perform a
47228 + * swiotlb_sync_single_for_device(), and then the device again owns the buffer.
47229 + */
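+/*
+ * Illustrative sequence for the rule above ('bus' was obtained earlier from
+ * swiotlb_map_single(); the CPU-side step is pseudocode):
+ *
+ *	swiotlb_sync_single_for_cpu(dev, bus, len, DMA_FROM_DEVICE);
+ *	... the CPU examines the buffer contents ...
+ *	swiotlb_sync_single_for_device(dev, bus, len, DMA_FROM_DEVICE);
+ *	... the device may now DMA into the buffer again ...
+ */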
47230 +void
47231 +swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
47232 + size_t size, int dir)
47233 +{
47234 + BUG_ON(dir == DMA_NONE);
47235 + if (in_swiotlb_aperture(dev_addr))
47236 + sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
47237 +}
47238 +
47239 +void
47240 +swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
47241 + size_t size, int dir)
47242 +{
47243 + BUG_ON(dir == DMA_NONE);
47244 + if (in_swiotlb_aperture(dev_addr))
47245 + sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
47246 +}
47247 +
47248 +/*
47249 + * Map a set of buffers described by scatterlist in streaming mode for DMA.
47250 + * This is the scatter-gather version of the above swiotlb_map_single
47251 + * interface. Here the scatter gather list elements are each tagged with the
47252 + * appropriate dma address and length. They are obtained via
47253 + * sg_dma_{address,length}(SG).
47254 + *
47255 + * NOTE: An implementation may be able to use a smaller number of
47256 + * DMA address/length pairs than there are SG table elements.
47257 + * (for example via virtual mapping capabilities)
47258 + * The routine returns the number of addr/length pairs actually
47259 + * used, at most nents.
47260 + *
47261 + * Device ownership issues as mentioned above for swiotlb_map_single are the
47262 + * same here.
47263 + */
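+/*
+ * Hypothetical caller sketch; program_descriptor() is a made-up,
+ * device-specific helper:
+ *
+ *	int i, n = swiotlb_map_sg(dev, sglist, nents, DMA_TO_DEVICE);
+ *	if (!n)
+ *		return -ENOMEM;
+ *	for (i = 0; i < n; i++)
+ *		program_descriptor(sg_dma_address(&sglist[i]),
+ *				   sg_dma_length(&sglist[i]));
+ *	...
+ *	swiotlb_unmap_sg(dev, sglist, nents, DMA_TO_DEVICE);
+ */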
47264 +int
47265 +swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
47266 + int dir)
47267 +{
47268 + struct phys_addr buffer;
47269 + dma_addr_t dev_addr;
47270 + char *map;
47271 + int i;
47272 +
47273 + BUG_ON(dir == DMA_NONE);
47274 +
47275 + for (i = 0; i < nelems; i++, sg++) {
47276 + dev_addr = gnttab_dma_map_page(sg->page) + sg->offset;
47277 +
47278 + if (range_straddles_page_boundary(page_to_pseudophys(sg->page)
47279 + + sg->offset, sg->length)
47280 + || address_needs_mapping(hwdev, dev_addr)) {
47281 + gnttab_dma_unmap_page(dev_addr);
47282 + buffer.page = sg->page;
47283 + buffer.offset = sg->offset;
47284 + map = map_single(hwdev, buffer, sg->length, dir);
47285 + if (!map) {
47286 + /* Don't panic here, we expect map_sg users
47287 + to do proper error handling. */
47288 + swiotlb_full(hwdev, sg->length, dir, 0);
47289 + swiotlb_unmap_sg(hwdev, sg - i, i, dir);
47290 + sg[0].dma_length = 0;
47291 + return 0;
47292 + }
47293 + sg->dma_address = (dma_addr_t)virt_to_bus(map);
47294 + } else
47295 + sg->dma_address = dev_addr;
47296 + sg->dma_length = sg->length;
47297 + }
47298 + return nelems;
47299 +}
47300 +
47301 +/*
47302 + * Unmap a set of streaming mode DMA translations. Again, cpu read rules
47303 + * concerning calls here are the same as for swiotlb_unmap_single() above.
47304 + */
47305 +void
47306 +swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
47307 + int dir)
47308 +{
47309 + int i;
47310 +
47311 + BUG_ON(dir == DMA_NONE);
47312 +
47313 + for (i = 0; i < nelems; i++, sg++)
47314 + if (in_swiotlb_aperture(sg->dma_address))
47315 + unmap_single(hwdev,
47316 + (void *)bus_to_virt(sg->dma_address),
47317 + sg->dma_length, dir);
47318 + else
47319 + gnttab_dma_unmap_page(sg->dma_address);
47320 +}
47321 +
47322 +/*
47323 + * Make physical memory consistent for a set of streaming mode DMA translations
47324 + * after a transfer.
47325 + *
47326 + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
47327 + * and usage.
47328 + */
47329 +void
47330 +swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
47331 + int nelems, int dir)
47332 +{
47333 + int i;
47334 +
47335 + BUG_ON(dir == DMA_NONE);
47336 +
47337 + for (i = 0; i < nelems; i++, sg++)
47338 + if (in_swiotlb_aperture(sg->dma_address))
47339 + sync_single(hwdev,
47340 + (void *)bus_to_virt(sg->dma_address),
47341 + sg->dma_length, dir);
47342 +}
47343 +
47344 +void
47345 +swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
47346 + int nelems, int dir)
47347 +{
47348 + int i;
47349 +
47350 + BUG_ON(dir == DMA_NONE);
47351 +
47352 + for (i = 0; i < nelems; i++, sg++)
47353 + if (in_swiotlb_aperture(sg->dma_address))
47354 + sync_single(hwdev,
47355 + (void *)bus_to_virt(sg->dma_address),
47356 + sg->dma_length, dir);
47357 +}
47358 +
47359 +#ifdef CONFIG_HIGHMEM
47360 +
47361 +dma_addr_t
47362 +swiotlb_map_page(struct device *hwdev, struct page *page,
47363 + unsigned long offset, size_t size,
47364 + enum dma_data_direction direction)
47365 +{
47366 + struct phys_addr buffer;
47367 + dma_addr_t dev_addr;
47368 + char *map;
47369 +
47370 + dev_addr = gnttab_dma_map_page(page) + offset;
47371 + if (address_needs_mapping(hwdev, dev_addr)) {
47372 + gnttab_dma_unmap_page(dev_addr);
47373 + buffer.page = page;
47374 + buffer.offset = offset;
47375 + map = map_single(hwdev, buffer, size, direction);
47376 + if (!map) {
47377 + swiotlb_full(hwdev, size, direction, 1);
47378 + map = io_tlb_overflow_buffer;
47379 + }
47380 + dev_addr = (dma_addr_t)virt_to_bus(map);
47381 + }
47382 +
47383 + return dev_addr;
47384 +}
47385 +
47386 +void
47387 +swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
47388 + size_t size, enum dma_data_direction direction)
47389 +{
47390 + BUG_ON(direction == DMA_NONE);
47391 + if (in_swiotlb_aperture(dma_address))
47392 + unmap_single(hwdev, bus_to_virt(dma_address), size, direction);
47393 + else
47394 + gnttab_dma_unmap_page(dma_address);
47395 +}
47396 +
47397 +#endif
47398 +
47399 +int
47400 +swiotlb_dma_mapping_error(dma_addr_t dma_addr)
47401 +{
47402 + return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
47403 +}
47404 +
47405 +/*
47406 + * Return whether the given PCI device DMA address mask can be supported
47407 + * properly. For example, if your device can only drive the low 24-bits
47408 + * properly. For example, if your device can only drive the low 24 bits
47409 + * this function.
47410 + */
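+/*
+ * For example, if dma_bits ended up as 30 in swiotlb_init_with_default_size()
+ * above, the check below accepts DMA_32BIT_MASK (0xffffffff) but rejects a
+ * 24-bit ISA-style mask of 0x00ffffff, since (1UL << 30) - 1 == 0x3fffffff.
+ */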
47411 +int
47412 +swiotlb_dma_supported(struct device *hwdev, u64 mask)
47413 +{
47414 + return (mask >= ((1UL << dma_bits) - 1));
47415 +}
47416 +
47417 +EXPORT_SYMBOL(swiotlb_init);
47418 +EXPORT_SYMBOL(swiotlb_map_single);
47419 +EXPORT_SYMBOL(swiotlb_unmap_single);
47420 +EXPORT_SYMBOL(swiotlb_map_sg);
47421 +EXPORT_SYMBOL(swiotlb_unmap_sg);
47422 +EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
47423 +EXPORT_SYMBOL(swiotlb_sync_single_for_device);
47424 +EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
47425 +EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
47426 +EXPORT_SYMBOL(swiotlb_dma_mapping_error);
47427 +EXPORT_SYMBOL(swiotlb_dma_supported);
47428 Index: head-2008-11-25/scripts/Makefile.xen.awk
47429 ===================================================================
47430 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
47431 +++ head-2008-11-25/scripts/Makefile.xen.awk 2007-08-06 15:10:49.000000000 +0200
47432 @@ -0,0 +1,34 @@
47433 +BEGIN {
47434 + is_rule = 0
47435 +}
47436 +
47437 +/^[[:space:]]*#/ {
47438 + next
47439 +}
47440 +
47441 +/^[[:space:]]*$/ {
47442 + if (is_rule)
47443 + print("")
47444 + is_rule = 0
47445 + next
47446 +}
47447 +
47448 +/:[[:space:]]*%\.[cS][[:space:]]/ {
47449 + line = gensub(/%.([cS])/, "%-xen.\\1", "g", $0)
47450 + line = gensub(/(single-used-m)/, "xen-\\1", "g", line)
47451 + print line
47452 + is_rule = 1
47453 + next
47454 +}
47455 +
47456 +/^[^\t]$/ {
47457 + if (is_rule)
47458 + print("")
47459 + is_rule = 0
47460 + next
47461 +}
47462 +
47463 +is_rule {
47464 + print $0
47465 + next
47466 +}