2 From: http://xenbits.xensource.com/linux-2.6.18-xen.hg (tip 728:832aac894efd)
3 Patch-mainline: obsolete
4 Acked-by: jbeulich@novell.com
6 List of files having Xen derivatives (perhaps created during the merging
7 of newer kernel versions), for xen-port-patches.py to pick up (i.e. this
8 must be retained here until the XenSource tree has these in the right
10 +++ linux/arch/x86/kernel/acpi/sleep-xen.c
11 +++ linux/arch/x86/kernel/cpu/common_64-xen.c
12 +++ linux/arch/x86/kernel/e820-xen.c
13 +++ linux/arch/x86/kernel/head-xen.c
14 +++ linux/arch/x86/kernel/head32-xen.c
15 +++ linux/arch/x86/kernel/ioport-xen.c
16 +++ linux/arch/x86/kernel/ipi-xen.c
17 +++ linux/arch/x86/kernel/ldt-xen.c
18 +++ linux/arch/x86/kernel/mpparse-xen.c
19 +++ linux/arch/x86/kernel/pci-nommu-xen.c
20 +++ linux/arch/x86/kernel/process-xen.c
21 +++ linux/arch/x86/kernel/setup-xen.c
22 +++ linux/arch/x86/kernel/setup_percpu-xen.c
23 +++ linux/arch/x86/kernel/smp-xen.c
24 +++ linux/arch/x86/mm/fault-xen.c
25 +++ linux/arch/x86/mm/ioremap-xen.c
26 +++ linux/arch/x86/mm/pageattr-xen.c
27 +++ linux/arch/x86/mm/pat-xen.c
28 +++ linux/arch/x86/mm/pgtable-xen.c
29 +++ linux/arch/x86/vdso/vdso32-setup-xen.c
30 +++ linux/drivers/char/mem-xen.c
31 +++ linux/include/asm-x86/mach-xen/asm/desc.h
32 +++ linux/include/asm-x86/mach-xen/asm/dma-mapping.h
33 +++ linux/include/asm-x86/mach-xen/asm/fixmap.h
34 +++ linux/include/asm-x86/mach-xen/asm/io.h
35 +++ linux/include/asm-x86/mach-xen/asm/irq_vectors.h
36 +++ linux/include/asm-x86/mach-xen/asm/irqflags.h
37 +++ linux/include/asm-x86/mach-xen/asm/mmu_context.h
38 +++ linux/include/asm-x86/mach-xen/asm/page.h
39 +++ linux/include/asm-x86/mach-xen/asm/pci.h
40 +++ linux/include/asm-x86/mach-xen/asm/pgalloc.h
41 +++ linux/include/asm-x86/mach-xen/asm/pgtable.h
42 +++ linux/include/asm-x86/mach-xen/asm/processor.h
43 +++ linux/include/asm-x86/mach-xen/asm/segment.h
44 +++ linux/include/asm-x86/mach-xen/asm/smp.h
45 +++ linux/include/asm-x86/mach-xen/asm/spinlock.h
46 +++ linux/include/asm-x86/mach-xen/asm/swiotlb.h
47 +++ linux/include/asm-x86/mach-xen/asm/system.h
48 +++ linux/include/asm-x86/mach-xen/asm/tlbflush.h
49 +++ linux/include/asm-x86/mach-xen/asm/xor.h
51 List of files folded into their native counterparts (and hence removed
52 from this patch so that xen-port-patches.py doesn't needlessly pick them up;
53 for reference, prefixed with the version the removal occurred):
54 2.6.18/include/asm-x86/mach-xen/asm/pgtable-2level.h
55 2.6.18/include/asm-x86/mach-xen/asm/pgtable-2level-defs.h
56 2.6.19/include/asm-x86/mach-xen/asm/ptrace.h
57 2.6.23/arch/x86/kernel/vsyscall-note_32-xen.S
58 2.6.23/include/asm-x86/mach-xen/asm/ptrace_64.h
59 2.6.24/arch/x86/kernel/early_printk_32-xen.c
60 2.6.24/include/asm-x86/mach-xen/asm/arch_hooks_64.h
61 2.6.24/include/asm-x86/mach-xen/asm/bootsetup_64.h
62 2.6.24/include/asm-x86/mach-xen/asm/mmu_32.h
63 2.6.24/include/asm-x86/mach-xen/asm/mmu_64.h
64 2.6.24/include/asm-x86/mach-xen/asm/nmi_64.h
65 2.6.24/include/asm-x86/mach-xen/asm/setup.h
66 2.6.24/include/asm-x86/mach-xen/asm/time_64.h (added in 2.6.20)
67 2.6.25/arch/x86/ia32/syscall32-xen.c
68 2.6.25/arch/x86/ia32/syscall32_syscall-xen.S
69 2.6.25/arch/x86/ia32/vsyscall-int80.S
70 2.6.25/arch/x86/kernel/acpi/boot-xen.c
71 2.6.25/include/asm-x86/mach-xen/asm/msr.h
72 2.6.25/include/asm-x86/mach-xen/asm/page_32.h
73 2.6.25/include/asm-x86/mach-xen/asm/spinlock_32.h
74 2.6.25/include/asm-x86/mach-xen/asm/timer.h (added in 2.6.24)
75 2.6.25/include/asm-x86/mach-xen/asm/timer_64.h
76 2.6.26/arch/x86/kernel/pci-dma_32-xen.c
77 2.6.26/arch/x86/kernel/pci-swiotlb_64-xen.c
78 2.6.26/include/asm-x86/mach-xen/asm/dma-mapping_32.h
79 2.6.26/include/asm-x86/mach-xen/asm/dma-mapping_64.h
80 2.6.26/include/asm-x86/mach-xen/asm/nmi.h (added in 2.6.24)
81 2.6.26/include/asm-x86/mach-xen/asm/scatterlist.h (added in 2.6.24)
82 2.6.26/include/asm-x86/mach-xen/asm/scatterlist_32.h
83 2.6.26/include/xen/xencomm.h
84 2.6.27/arch/x86/kernel/e820_32-xen.c
85 2.6.27/include/asm-x86/mach-xen/asm/e820.h (added in 2.6.24)
86 2.6.27/include/asm-x86/mach-xen/asm/e820_64.h
87 2.6.27/include/asm-x86/mach-xen/asm/hw_irq.h (added in 2.6.24)
88 2.6.27/include/asm-x86/mach-xen/asm/hw_irq_32.h
89 2.6.27/include/asm-x86/mach-xen/asm/hw_irq_64.h
90 2.6.27/include/asm-x86/mach-xen/asm/irq.h (added in 2.6.24)
91 2.6.27/include/asm-x86/mach-xen/asm/irq_64.h
93 Index: head-2008-11-25/arch/x86/kernel/acpi/processor_extcntl_xen.c
94 ===================================================================
95 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
96 +++ head-2008-11-25/arch/x86/kernel/acpi/processor_extcntl_xen.c 2008-10-01 15:43:24.000000000 +0200
99 + * processor_extcntl_xen.c - interface to notify Xen
101 + * Copyright (C) 2008, Intel Corporation
103 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
105 + * This program is free software; you can redistribute it and/or modify
106 + * it under the terms of the GNU General Public License as published by
107 + * the Free Software Foundation; either version 2 of the License, or (at
108 + * your option) any later version.
110 + * This program is distributed in the hope that it will be useful, but
111 + * WITHOUT ANY WARRANTY; without even the implied warranty of
112 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
113 + * General Public License for more details.
115 + * You should have received a copy of the GNU General Public License along
116 + * with this program; if not, write to the Free Software Foundation, Inc.,
117 + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
121 +#include <linux/kernel.h>
122 +#include <linux/init.h>
123 +#include <linux/types.h>
124 +#include <linux/acpi.h>
125 +#include <linux/pm.h>
126 +#include <linux/cpu.h>
128 +#include <linux/cpufreq.h>
129 +#include <acpi/processor.h>
130 +#include <asm/hypercall.h>
132 +static int xen_cx_notifier(struct acpi_processor *pr, int action)
134 + int ret, count = 0, i;
135 + xen_platform_op_t op = {
136 + .cmd = XENPF_set_processor_pminfo,
137 + .interface_version = XENPF_INTERFACE_VERSION,
138 + .u.set_pminfo.id = pr->acpi_id,
139 + .u.set_pminfo.type = XEN_PM_CX,
141 + struct xen_processor_cx *data, *buf;
142 + struct acpi_processor_cx *cx;
144 + if (action == PROCESSOR_PM_CHANGE)
147 + /* Convert to Xen defined structure and hypercall */
148 + buf = kzalloc(pr->power.count * sizeof(struct xen_processor_cx),
154 + for (i = 1; i <= pr->power.count; i++) {
155 + cx = &pr->power.states[i];
156 + /* Skip invalid cstate entry */
160 + data->type = cx->type;
161 + data->latency = cx->latency;
162 + data->power = cx->power;
163 + data->reg.space_id = cx->reg.space_id;
164 + data->reg.bit_width = cx->reg.bit_width;
165 + data->reg.bit_offset = cx->reg.bit_offset;
166 + data->reg.access_size = cx->reg.reserved;
167 + data->reg.address = cx->reg.address;
169 + /* Get dependency relationships */
170 + if (cx->csd_count) {
171 +			printk("Wow! _CSD found. Not supported for now!\n");
176 + set_xen_guest_handle(data->dp, NULL);
184 + printk("No available Cx info for cpu %d\n", pr->acpi_id);
189 + op.u.set_pminfo.power.count = count;
190 + op.u.set_pminfo.power.flags.bm_control = pr->flags.bm_control;
191 + op.u.set_pminfo.power.flags.bm_check = pr->flags.bm_check;
192 + op.u.set_pminfo.power.flags.has_cst = pr->flags.has_cst;
193 + op.u.set_pminfo.power.flags.power_setup_done = pr->flags.power_setup_done;
195 + set_xen_guest_handle(op.u.set_pminfo.power.states, buf);
196 + ret = HYPERVISOR_platform_op(&op);
201 +static int xen_px_notifier(struct acpi_processor *pr, int action)
204 + xen_platform_op_t op = {
205 + .cmd = XENPF_set_processor_pminfo,
206 + .interface_version = XENPF_INTERFACE_VERSION,
207 + .u.set_pminfo.id = pr->acpi_id,
208 + .u.set_pminfo.type = XEN_PM_PX,
210 + struct xen_processor_performance *perf;
211 + struct xen_processor_px *states = NULL;
212 + struct acpi_processor_performance *px;
213 + struct acpi_psd_package *pdomain;
218 + perf = &op.u.set_pminfo.perf;
219 + px = pr->performance;
222 + case PROCESSOR_PM_CHANGE:
223 + /* ppc dynamic handle */
224 + perf->flags = XEN_PX_PPC;
225 + perf->platform_limit = pr->performance_platform_limit;
227 + ret = HYPERVISOR_platform_op(&op);
230 + case PROCESSOR_PM_INIT:
231 + /* px normal init */
232 + perf->flags = XEN_PX_PPC |
238 + perf->platform_limit = pr->performance_platform_limit;
241 + xen_convert_pct_reg(&perf->control_register, &px->control_register);
242 + xen_convert_pct_reg(&perf->status_register, &px->status_register);
245 + perf->state_count = px->state_count;
246 + states = kzalloc(px->state_count*sizeof(xen_processor_px_t),GFP_KERNEL);
249 + xen_convert_pss_states(states, px->states, px->state_count);
250 + set_xen_guest_handle(perf->states, states);
253 + pdomain = &px->domain_info;
254 + xen_convert_psd_pack(&perf->domain_info, pdomain);
255 + if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL)
256 + perf->shared_type = CPUFREQ_SHARED_TYPE_ALL;
257 + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY)
258 + perf->shared_type = CPUFREQ_SHARED_TYPE_ANY;
259 + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL)
260 + perf->shared_type = CPUFREQ_SHARED_TYPE_HW;
267 + ret = HYPERVISOR_platform_op(&op);
278 +static int xen_tx_notifier(struct acpi_processor *pr, int action)
282 +static int xen_hotplug_notifier(struct acpi_processor *pr, int event)
287 +static struct processor_extcntl_ops xen_extcntl_ops = {
288 + .hotplug = xen_hotplug_notifier,
291 +void arch_acpi_processor_init_extcntl(const struct processor_extcntl_ops **ops)
293 + unsigned int pmbits = (xen_start_info->flags & SIF_PM_MASK) >> 8;
297 + if (pmbits & XEN_PROCESSOR_PM_CX)
298 + xen_extcntl_ops.pm_ops[PM_TYPE_IDLE] = xen_cx_notifier;
299 + if (pmbits & XEN_PROCESSOR_PM_PX)
300 + xen_extcntl_ops.pm_ops[PM_TYPE_PERF] = xen_px_notifier;
301 + if (pmbits & XEN_PROCESSOR_PM_TX)
302 + xen_extcntl_ops.pm_ops[PM_TYPE_THR] = xen_tx_notifier;
304 + *ops = &xen_extcntl_ops;
306 +EXPORT_SYMBOL(arch_acpi_processor_init_extcntl);
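All three notifiers above share one hypercall shape: translate the ACPI data
into the Xen layout, publish the buffer through a guest handle, and issue
XENPF_set_processor_pminfo. A minimal sketch of that pattern (illustrative
only; example_pminfo_hypercall is hypothetical, the other names come from the
2.6.18-era Xen headers this patch targets):

	static int example_pminfo_hypercall(struct acpi_processor *pr)
	{
		xen_platform_op_t op = {
			.cmd = XENPF_set_processor_pminfo,
			.interface_version = XENPF_INTERFACE_VERSION,
			.u.set_pminfo.id = pr->acpi_id,
			.u.set_pminfo.type = XEN_PM_CX,
		};
		struct xen_processor_cx *buf;
		int ret;

		/* 1. Allocate a buffer for the translated per-state data. */
		buf = kzalloc(pr->power.count * sizeof(*buf), GFP_KERNEL);
		if (!buf)
			return -ENOMEM;

		/* 2. Convert the ACPI structures to the Xen layout (elided). */

		/* 3. Hand the buffer to Xen via a guest handle. */
		set_xen_guest_handle(op.u.set_pminfo.power.states, buf);
		ret = HYPERVISOR_platform_op(&op);

		kfree(buf);
		return ret;
	}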
307 Index: head-2008-11-25/arch/x86/kernel/acpi/sleep_32-xen.c
308 ===================================================================
309 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
310 +++ head-2008-11-25/arch/x86/kernel/acpi/sleep_32-xen.c 2008-04-15 09:29:41.000000000 +0200
313 + * sleep.c - x86-specific ACPI sleep support.
315 + * Copyright (C) 2001-2003 Patrick Mochel
316 + * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
319 +#include <linux/acpi.h>
320 +#include <linux/bootmem.h>
321 +#include <linux/dmi.h>
322 +#include <linux/cpumask.h>
324 +#include <asm/smp.h>
326 +#ifndef CONFIG_ACPI_PV_SLEEP
327 +/* address in low memory of the wakeup routine. */
328 +unsigned long acpi_wakeup_address = 0;
329 +unsigned long acpi_video_flags;
330 +extern char wakeup_start, wakeup_end;
332 +extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
336 + * acpi_save_state_mem - save kernel state
338 + * Create an identity mapped page table and copy the wakeup routine to
341 +int acpi_save_state_mem(void)
343 +#ifndef CONFIG_ACPI_PV_SLEEP
344 + if (!acpi_wakeup_address)
346 + memcpy((void *)acpi_wakeup_address, &wakeup_start,
347 + &wakeup_end - &wakeup_start);
348 + acpi_copy_wakeup_routine(acpi_wakeup_address);
354 + * acpi_restore_state - undo effects of acpi_save_state_mem
356 +void acpi_restore_state_mem(void)
361 + * acpi_reserve_bootmem - do _very_ early ACPI initialisation
363 + * We allocate a page from the first 1MB of memory for the wakeup
364 + * routine for when we come back from a sleep state. The
365 + * runtime allocator allows specification of <16MB pages, but not
368 +void __init acpi_reserve_bootmem(void)
370 +#ifndef CONFIG_ACPI_PV_SLEEP
371 + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
373 + "ACPI: Wakeup code way too big, S3 disabled.\n");
377 + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
378 + if (!acpi_wakeup_address)
379 + printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
383 +#ifndef CONFIG_ACPI_PV_SLEEP
384 +static int __init acpi_sleep_setup(char *str)
386 + while ((str != NULL) && (*str != '\0')) {
387 + if (strncmp(str, "s3_bios", 7) == 0)
388 + acpi_video_flags = 1;
389 + if (strncmp(str, "s3_mode", 7) == 0)
390 + acpi_video_flags |= 2;
391 + str = strchr(str, ',');
393 + str += strspn(str, ", \t");
398 +__setup("acpi_sleep=", acpi_sleep_setup);
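For reference, the parser above takes a comma-separated list on the kernel
command line, e.g.

	acpi_sleep=s3_bios,s3_mode

which sets both bits of acpi_video_flags (a real-mode BIOS video call plus a
mode reset on resume). Note that s3_bios assigns rather than ORs, so it should
come first if both are wanted.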
400 +static __init int reset_videomode_after_s3(struct dmi_system_id *d)
402 + acpi_video_flags |= 2;
406 +static __initdata struct dmi_system_id acpisleep_dmi_table[] = {
407 + { /* Reset video mode after returning from ACPI S3 sleep */
408 + .callback = reset_videomode_after_s3,
409 + .ident = "Toshiba Satellite 4030cdt",
411 + DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
417 +static int __init acpisleep_dmi_init(void)
419 + dmi_check_system(acpisleep_dmi_table);
423 +core_initcall(acpisleep_dmi_init);
424 +#endif /* CONFIG_ACPI_PV_SLEEP */
425 Index: head-2008-11-25/arch/x86/kernel/apic_32-xen.c
426 ===================================================================
427 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
428 +++ head-2008-11-25/arch/x86/kernel/apic_32-xen.c 2007-06-12 13:12:48.000000000 +0200
431 + * Local APIC handling, local APIC timers
433 + * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
436 + * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
437 + * thanks to Eric Gilmore
439 + * for testing these extensively.
440 + * Maciej W. Rozycki : Various updates and fixes.
441 + * Mikael Pettersson : Power Management for UP-APIC.
443 + * Mikael Pettersson : PM converted to driver model.
446 +#include <linux/init.h>
448 +#include <linux/mm.h>
449 +#include <linux/delay.h>
450 +#include <linux/bootmem.h>
451 +#include <linux/smp_lock.h>
452 +#include <linux/interrupt.h>
453 +#include <linux/mc146818rtc.h>
454 +#include <linux/kernel_stat.h>
455 +#include <linux/sysdev.h>
456 +#include <linux/cpu.h>
457 +#include <linux/module.h>
459 +#include <asm/atomic.h>
460 +#include <asm/smp.h>
461 +#include <asm/mtrr.h>
462 +#include <asm/mpspec.h>
463 +#include <asm/desc.h>
464 +#include <asm/arch_hooks.h>
465 +#include <asm/hpet.h>
466 +#include <asm/i8253.h>
467 +#include <asm/nmi.h>
469 +#include <mach_apic.h>
470 +#include <mach_apicdef.h>
471 +#include <mach_ipi.h>
473 +#include "io_ports.h"
477 + * cpu_mask that denotes the CPUs that need timer interrupts coming in as
478 + * IPIs in place of local APIC timers
480 +static cpumask_t timer_bcast_ipi;
484 + * Knob to control our willingness to enable the local APIC.
486 +int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
494 +static int modern_apic(void)
496 + unsigned int lvr, version;
497 + /* AMD systems use old APIC versions, so check the CPU */
498 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
499 + boot_cpu_data.x86 >= 0xf)
501 + lvr = apic_read(APIC_LVR);
502 + version = GET_APIC_VERSION(lvr);
503 + return version >= 0x14;
505 +#endif /* !CONFIG_XEN */
508 + * 'what should we do if we get a hw irq event on an illegal vector'.
509 + * Each architecture has to answer this itself.
511 +void ack_bad_irq(unsigned int irq)
513 + printk("unexpected IRQ trap at vector %02x\n", irq);
515 + * Currently unexpected vectors happen only on SMP and APIC.
516 + * We _must_ ack these because every local APIC has only N
517 + * irq slots per priority level, and a 'hanging, unacked' IRQ
518 + * holds up an irq slot - in excessive cases (when multiple
519 + * unexpected vectors occur) that might lock up the APIC
521 + * But only ack when the APIC is enabled -AK
527 +int get_physical_broadcast(void)
534 +static void up_apic_timer_interrupt_call(struct pt_regs *regs)
536 + int cpu = smp_processor_id();
539 + * the NMI deadlock-detector uses this.
541 + per_cpu(irq_stat, cpu).apic_timer_irqs++;
543 + smp_local_timer_interrupt(regs);
547 +void smp_send_timer_broadcast_ipi(struct pt_regs *regs)
551 + cpus_and(mask, cpu_online_map, timer_bcast_ipi);
552 + if (!cpus_empty(mask)) {
554 + send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
557 + * We can directly call the apic timer interrupt handler
558 + * in the UP case, minus all irq-related functions.
560 + up_apic_timer_interrupt_call(regs);
566 +int setup_profiling_timer(unsigned int multiplier)
572 + * This initializes the IO-APIC and APIC hardware if this is
575 +int __init APIC_init_uniprocessor (void)
577 +#ifdef CONFIG_X86_IO_APIC
578 + if (smp_found_config)
579 + if (!skip_ioapic_setup && nr_ioapics)
585 Index: head-2008-11-25/arch/x86/kernel/cpu/common-xen.c
586 ===================================================================
587 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
588 +++ head-2008-11-25/arch/x86/kernel/cpu/common-xen.c 2007-12-10 08:47:31.000000000 +0100
590 +#include <linux/init.h>
591 +#include <linux/string.h>
592 +#include <linux/delay.h>
593 +#include <linux/smp.h>
594 +#include <linux/module.h>
595 +#include <linux/percpu.h>
596 +#include <linux/bootmem.h>
597 +#include <asm/semaphore.h>
598 +#include <asm/processor.h>
599 +#include <asm/i387.h>
600 +#include <asm/msr.h>
602 +#include <asm/mmu_context.h>
603 +#include <asm/mtrr.h>
604 +#include <asm/mce.h>
605 +#ifdef CONFIG_X86_LOCAL_APIC
606 +#include <asm/mpspec.h>
607 +#include <asm/apic.h>
608 +#include <mach_apic.h>
611 +#define phys_pkg_id(a,b) a
614 +#include <asm/hypervisor.h>
618 +DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
619 +EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
622 +DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
623 +EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
626 +static int cachesize_override __cpuinitdata = -1;
627 +static int disable_x86_fxsr __cpuinitdata;
628 +static int disable_x86_serial_nr __cpuinitdata = 1;
629 +static int disable_x86_sep __cpuinitdata;
631 +struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
633 +extern int disable_pse;
635 +static void default_init(struct cpuinfo_x86 * c)
637 + /* Not much we can do here... */
638 +	/* Check if it at least has cpuid */
639 + if (c->cpuid_level == -1) {
640 + /* No cpuid. It must be an ancient CPU */
642 + strcpy(c->x86_model_id, "486");
643 + else if (c->x86 == 3)
644 + strcpy(c->x86_model_id, "386");
648 +static struct cpu_dev default_cpu = {
649 + .c_init = default_init,
650 + .c_vendor = "Unknown",
652 +static struct cpu_dev * this_cpu = &default_cpu;
654 +static int __init cachesize_setup(char *str)
656 + get_option (&str, &cachesize_override);
659 +__setup("cachesize=", cachesize_setup);
661 +int __cpuinit get_model_name(struct cpuinfo_x86 *c)
666 + if (cpuid_eax(0x80000000) < 0x80000004)
669 + v = (unsigned int *) c->x86_model_id;
670 + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
671 + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
672 + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
673 + c->x86_model_id[48] = 0;
675 + /* Intel chips right-justify this string for some dumb reason;
676 + undo that brain damage */
677 + p = q = &c->x86_model_id[0];
678 + while ( *p == ' ' )
683 + while ( q <= &c->x86_model_id[48] )
684 + *q++ = '\0'; /* Zero-pad the rest */
691 +void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
693 + unsigned int n, dummy, ecx, edx, l2size;
695 + n = cpuid_eax(0x80000000);
697 + if (n >= 0x80000005) {
698 + cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
699 + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
700 + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
701 + c->x86_cache_size=(ecx>>24)+(edx>>24);
704 +	if (n < 0x80000006)	/* Some chips just have a large L1. */
707 + ecx = cpuid_ecx(0x80000006);
708 + l2size = ecx >> 16;
710 + /* do processor-specific cache resizing */
711 + if (this_cpu->c_size_cache)
712 + l2size = this_cpu->c_size_cache(c,l2size);
714 + /* Allow user to override all this if necessary. */
715 + if (cachesize_override != -1)
716 + l2size = cachesize_override;
719 + return; /* Again, no L2 cache is possible */
721 + c->x86_cache_size = l2size;
723 + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
724 + l2size, ecx & 0xFF);
727 +/* Naming convention should be: <Name> [(<Codename>)] */
728 +/* This table is only used if init_<vendor>() below doesn't set it; */
729 +/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
731 +/* Look up CPU names by table lookup. */
732 +static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
734 + struct cpu_model_info *info;
736 + if ( c->x86_model >= 16 )
737 + return NULL; /* Range check */
742 + info = this_cpu->c_models;
744 + while (info && info->family) {
745 + if (info->family == c->x86)
746 + return info->model_names[c->x86_model];
749 + return NULL; /* Not found */
753 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
755 + char *v = c->x86_vendor_id;
757 + static int printed;
759 + for (i = 0; i < X86_VENDOR_NUM; i++) {
761 + if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
762 + (cpu_devs[i]->c_ident[1] &&
763 + !strcmp(v,cpu_devs[i]->c_ident[1]))) {
766 + this_cpu = cpu_devs[i];
773 + printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
774 + printk(KERN_ERR "CPU: Your system may be unstable.\n");
776 + c->x86_vendor = X86_VENDOR_UNKNOWN;
777 + this_cpu = &default_cpu;
781 +static int __init x86_fxsr_setup(char * s)
783 + disable_x86_fxsr = 1;
786 +__setup("nofxsr", x86_fxsr_setup);
789 +static int __init x86_sep_setup(char * s)
791 + disable_x86_sep = 1;
794 +__setup("nosep", x86_sep_setup);
797 +/* Standard macro to see if a specific flag is changeable */
798 +static inline int flag_is_changeable_p(u32 flag)
812 + : "=&r" (f1), "=&r" (f2)
815 + return ((f1^f2) & flag) != 0;
819 +/* Probe for the CPUID instruction */
820 +static int __cpuinit have_cpuid_p(void)
822 + return flag_is_changeable_p(X86_EFLAGS_ID);
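The asm body of flag_is_changeable_p() is elided in this excerpt; for
reference, the classic EFLAGS-toggle probe it implements looks roughly like
this (a sketch under that assumption, not necessarily the patch's exact
instruction sequence):

	static int eflags_bit_changeable(u32 flag)
	{
		u32 f1, f2;

		asm volatile("pushfl\n\t"	/* save EFLAGS               */
			     "pushfl\n\t"
			     "popl %0\n\t"	/* f1 = EFLAGS               */
			     "movl %0, %1\n\t"	/* keep an original copy     */
			     "xorl %2, %0\n\t"	/* toggle the bit under test */
			     "pushl %0\n\t"
			     "popfl\n\t"	/* try to write it back      */
			     "pushfl\n\t"
			     "popl %0\n\t"	/* f1 = resulting EFLAGS     */
			     "popfl"		/* restore saved EFLAGS      */
			     : "=&r" (f1), "=&r" (f2)
			     : "ir" (flag));

		return ((f1 ^ f2) & flag) != 0;	/* bit flipped => changeable */
	}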
825 +/* Do minimum CPU detection early.
826 + Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
827 + The others are not touched to avoid unwanted side effects.
829 + WARNING: this function is only called on the BP. Don't add code here
830 + that is supposed to run on all CPUs. */
831 +static void __init early_cpu_detect(void)
833 + struct cpuinfo_x86 *c = &boot_cpu_data;
835 + c->x86_cache_alignment = 32;
837 + if (!have_cpuid_p())
840 + /* Get vendor name */
841 + cpuid(0x00000000, &c->cpuid_level,
842 + (int *)&c->x86_vendor_id[0],
843 + (int *)&c->x86_vendor_id[8],
844 + (int *)&c->x86_vendor_id[4]);
846 + get_cpu_vendor(c, 1);
849 + if (c->cpuid_level >= 0x00000001) {
850 + u32 junk, tfms, cap0, misc;
851 + cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
852 + c->x86 = (tfms >> 8) & 15;
853 + c->x86_model = (tfms >> 4) & 15;
855 + c->x86 += (tfms >> 20) & 0xff;
857 + c->x86_model += ((tfms >> 16) & 0xF) << 4;
858 + c->x86_mask = tfms & 15;
859 + if (cap0 & (1<<19))
860 + c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
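A worked example of the decoding above, for a hypothetical CPUID.1 EAX value
of 0x000006f6:

	/* tfms = 0x000006f6:
	 *   stepping (x86_mask) = 0x6f6 & 15        = 6
	 *   model               = (0x6f6 >> 4) & 15 = 15 (extended-model nibble is 0)
	 *   family              = (0x6f6 >> 8) & 15 = 6  (< 0xf, so no extended-family add)
	 * cap0 bit 19 set => cache alignment = CLFLUSH size (EBX[15:8]) * 8 bytes.
	 */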
864 +void __cpuinit generic_identify(struct cpuinfo_x86 * c)
869 + if (have_cpuid_p()) {
870 + /* Get vendor name */
871 + cpuid(0x00000000, &c->cpuid_level,
872 + (int *)&c->x86_vendor_id[0],
873 + (int *)&c->x86_vendor_id[8],
874 + (int *)&c->x86_vendor_id[4]);
876 + get_cpu_vendor(c, 0);
877 + /* Initialize the standard set of capabilities */
878 + /* Note that the vendor-specific code below might override */
880 + /* Intel-defined flags: level 0x00000001 */
881 + if ( c->cpuid_level >= 0x00000001 ) {
882 + u32 capability, excap;
883 + cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
884 + c->x86_capability[0] = capability;
885 + c->x86_capability[4] = excap;
886 + c->x86 = (tfms >> 8) & 15;
887 + c->x86_model = (tfms >> 4) & 15;
889 + c->x86 += (tfms >> 20) & 0xff;
891 + c->x86_model += ((tfms >> 16) & 0xF) << 4;
892 + c->x86_mask = tfms & 15;
893 +#ifdef CONFIG_X86_HT
894 + c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
896 + c->apicid = (ebx >> 24) & 0xFF;
899 + /* Have CPUID level 0 only - unheard of */
903 + /* AMD-defined flags: level 0x80000001 */
904 + xlvl = cpuid_eax(0x80000000);
905 + if ( (xlvl & 0xffff0000) == 0x80000000 ) {
906 + if ( xlvl >= 0x80000001 ) {
907 + c->x86_capability[1] = cpuid_edx(0x80000001);
908 + c->x86_capability[6] = cpuid_ecx(0x80000001);
910 + if ( xlvl >= 0x80000004 )
911 + get_model_name(c); /* Default name */
915 + early_intel_workaround(c);
917 +#ifdef CONFIG_X86_HT
918 + c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
922 +static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
924 + if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
925 + /* Disable processor serial number */
926 + unsigned long lo,hi;
927 + rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
929 + wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
930 + printk(KERN_NOTICE "CPU serial number disabled.\n");
931 + clear_bit(X86_FEATURE_PN, c->x86_capability);
933 + /* Disabling the serial number may affect the cpuid level */
934 + c->cpuid_level = cpuid_eax(0);
938 +static int __init x86_serial_nr_setup(char *s)
940 + disable_x86_serial_nr = 0;
943 +__setup("serialnumber", x86_serial_nr_setup);
948 + * This does the hard work of actually picking apart the CPU stuff...
950 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
954 + c->loops_per_jiffy = loops_per_jiffy;
955 + c->x86_cache_size = -1;
956 + c->x86_vendor = X86_VENDOR_UNKNOWN;
957 + c->cpuid_level = -1; /* CPUID not detected */
958 + c->x86_model = c->x86_mask = 0; /* So far unknown... */
959 + c->x86_vendor_id[0] = '\0'; /* Unset */
960 + c->x86_model_id[0] = '\0'; /* Unset */
961 + c->x86_max_cores = 1;
962 + memset(&c->x86_capability, 0, sizeof c->x86_capability);
964 + if (!have_cpuid_p()) {
965 + /* First of all, decide if this is a 486 or higher */
966 + /* It's a 486 if we can modify the AC flag */
967 + if ( flag_is_changeable_p(X86_EFLAGS_AC) )
973 + generic_identify(c);
975 + printk(KERN_DEBUG "CPU: After generic identify, caps:");
976 + for (i = 0; i < NCAPINTS; i++)
977 + printk(" %08lx", c->x86_capability[i]);
980 + if (this_cpu->c_identify) {
981 + this_cpu->c_identify(c);
983 + printk(KERN_DEBUG "CPU: After vendor identify, caps:");
984 + for (i = 0; i < NCAPINTS; i++)
985 + printk(" %08lx", c->x86_capability[i]);
990 + * Vendor-specific initialization. In this section we
991 + * canonicalize the feature flags, meaning if there are
992 + * features a certain CPU supports which CPUID doesn't
993 + * tell us, CPUID claiming incorrect flags, or other bugs,
994 + * we handle them here.
996 + * At the end of this section, c->x86_capability better
997 + * indicate the features this CPU genuinely supports!
999 + if (this_cpu->c_init)
1000 + this_cpu->c_init(c);
1002 + /* Disable the PN if appropriate */
1003 + squash_the_stupid_serial_number(c);
1006 + * The vendor-specific functions might have changed features. Now
1007 + * we do "generic changes."
1010 + /* TSC disabled? */
1011 + if ( tsc_disable )
1012 + clear_bit(X86_FEATURE_TSC, c->x86_capability);
1014 + /* FXSR disabled? */
1015 + if (disable_x86_fxsr) {
1016 + clear_bit(X86_FEATURE_FXSR, c->x86_capability);
1017 + clear_bit(X86_FEATURE_XMM, c->x86_capability);
1020 + /* SEP disabled? */
1021 + if (disable_x86_sep)
1022 + clear_bit(X86_FEATURE_SEP, c->x86_capability);
1025 + clear_bit(X86_FEATURE_PSE, c->x86_capability);
1027 + /* If the model name is still unset, do table lookup. */
1028 + if ( !c->x86_model_id[0] ) {
1030 + p = table_lookup_model(c);
1032 + strcpy(c->x86_model_id, p);
1034 + /* Last resort... */
1035 + sprintf(c->x86_model_id, "%02x/%02x",
1036 + c->x86, c->x86_model);
1039 + /* Now the feature flags better reflect actual CPU features! */
1041 + printk(KERN_DEBUG "CPU: After all inits, caps:");
1042 + for (i = 0; i < NCAPINTS; i++)
1043 + printk(" %08lx", c->x86_capability[i]);
1047 + * On SMP, boot_cpu_data holds the common feature set between
1048 + * all CPUs; so make sure that we indicate which features are
1049 + * common between the CPUs. The first time this routine gets
1050 + * executed, c == &boot_cpu_data.
1052 + if ( c != &boot_cpu_data ) {
1053 + /* AND the already accumulated flags with these */
1054 + for ( i = 0 ; i < NCAPINTS ; i++ )
1055 + boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
1058 + /* Init Machine Check Exception if available. */
1061 + if (c == &boot_cpu_data)
1065 + if (c == &boot_cpu_data)
1071 +#ifdef CONFIG_X86_HT
1072 +void __cpuinit detect_ht(struct cpuinfo_x86 *c)
1074 + u32 eax, ebx, ecx, edx;
1075 + int index_msb, core_bits;
1077 + cpuid(1, &eax, &ebx, &ecx, &edx);
1079 + if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
1082 + smp_num_siblings = (ebx & 0xff0000) >> 16;
1084 + if (smp_num_siblings == 1) {
1085 + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
1086 + } else if (smp_num_siblings > 1 ) {
1088 + if (smp_num_siblings > NR_CPUS) {
1089 +			printk(KERN_WARNING "CPU: Unsupported number of "
1090 +					"siblings %d", smp_num_siblings);
1091 + smp_num_siblings = 1;
1095 + index_msb = get_count_order(smp_num_siblings);
1096 + c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
1098 + printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
1101 + smp_num_siblings = smp_num_siblings / c->x86_max_cores;
1103 + index_msb = get_count_order(smp_num_siblings) ;
1105 + core_bits = get_count_order(c->x86_max_cores);
1107 + c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
1108 + ((1 << core_bits) - 1);
1110 + if (c->x86_max_cores > 1)
1111 + printk(KERN_INFO "CPU: Processor Core ID: %d\n",
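A worked example for the ID derivation above (hypothetical package with 2
cores of 2 threads each, initial APIC ID taken from EBX[31:24]):

	/* CPUID.1 reports 4 logical siblings per package:
	 *   index_msb = get_count_order(4) = 2  ->  phys_proc_id = apicid >> 2
	 * after smp_num_siblings /= x86_max_cores (= 2):
	 *   index_msb = get_count_order(2) = 1, core_bits = get_count_order(2) = 1
	 *   cpu_core_id = (apicid >> 1) & ((1 << 1) - 1)
	 */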
1117 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1119 + char *vendor = NULL;
1121 + if (c->x86_vendor < X86_VENDOR_NUM)
1122 + vendor = this_cpu->c_vendor;
1123 + else if (c->cpuid_level >= 0)
1124 + vendor = c->x86_vendor_id;
1126 + if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
1127 + printk("%s ", vendor);
1129 + if (!c->x86_model_id[0])
1130 + printk("%d86", c->x86);
1132 + printk("%s", c->x86_model_id);
1134 + if (c->x86_mask || c->cpuid_level >= 0)
1135 + printk(" stepping %02x\n", c->x86_mask);
1140 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
1142 +/* This is hacky. :)
1143 + * We're emulating future behavior.
1144 + * In the future, the cpu-specific init functions will be called implicitly
1145 + * via the magic of initcalls.
1146 + * They will insert themselves into the cpu_devs structure.
1147 + * Then, when cpu_init() is called, we can just iterate over that array.
1150 +extern int intel_cpu_init(void);
1151 +extern int cyrix_init_cpu(void);
1152 +extern int nsc_init_cpu(void);
1153 +extern int amd_init_cpu(void);
1154 +extern int centaur_init_cpu(void);
1155 +extern int transmeta_init_cpu(void);
1156 +extern int rise_init_cpu(void);
1157 +extern int nexgen_init_cpu(void);
1158 +extern int umc_init_cpu(void);
1160 +void __init early_cpu_init(void)
1166 + centaur_init_cpu();
1167 + transmeta_init_cpu();
1169 + nexgen_init_cpu();
1171 + early_cpu_detect();
1173 +#ifdef CONFIG_DEBUG_PAGEALLOC
1174 + /* pse is not compatible with on-the-fly unmapping,
1175 + * disable it even if the cpus claim to support it.
1177 + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
1182 +static void __cpuinit cpu_gdt_init(const struct Xgt_desc_struct *gdt_descr)
1184 + unsigned long frames[16];
1188 + for (va = gdt_descr->address, f = 0;
1189 + va < gdt_descr->address + gdt_descr->size;
1190 + va += PAGE_SIZE, f++) {
1191 + frames[f] = virt_to_mfn(va);
1192 + make_lowmem_page_readonly(
1193 + (void *)va, XENFEAT_writable_descriptor_tables);
1195 + if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) / 8))
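Two details in cpu_gdt_init() worth spelling out (both grounded in the calls
above): gdt_descr->size holds the x86 limit, i.e. bytes minus one, so
(size + 1) / 8 recovers the descriptor count Xen expects; and each backing
page is converted to a machine frame number and made read-only first, because
the hypervisor validates and retains control of descriptor-table pages.

	/* e.g. a GDT with size = GDT_SIZE - 1:
	 *   entries   = ((GDT_SIZE - 1) + 1) / 8 = GDT_SIZE / 8 descriptors
	 *   frames[0] = virt_to_mfn(gdt), after make_lowmem_page_readonly()
	 */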
1200 + * cpu_init() initializes state that is per-CPU. Some data is already
1201 + * initialized (naturally) in the bootstrap process, such as the GDT
1202 + * and IDT. We reload them nevertheless; this function acts as a
1203 + * 'CPU state barrier': nothing should get across.
1205 +void __cpuinit cpu_init(void)
1207 + int cpu = smp_processor_id();
1208 +#ifndef CONFIG_X86_NO_TSS
1209 + struct tss_struct * t = &per_cpu(init_tss, cpu);
1211 +	struct thread_struct *thread = &current->thread;
1212 + struct desc_struct *gdt;
1213 + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
1215 + if (cpu_test_and_set(cpu, cpu_initialized)) {
1216 + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
1217 + for (;;) local_irq_enable();
1219 + printk(KERN_INFO "Initializing CPU#%d\n", cpu);
1221 + if (cpu_has_vme || cpu_has_de)
1222 + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1223 + if (tsc_disable && cpu_has_tsc) {
1224 + printk(KERN_NOTICE "Disabling TSC...\n");
1225 + /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
1226 + clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
1227 + set_in_cr4(X86_CR4_TSD);
1231 + /* The CPU hotplug case */
1232 + if (cpu_gdt_descr->address) {
1233 + gdt = (struct desc_struct *)cpu_gdt_descr->address;
1234 + memset(gdt, 0, PAGE_SIZE);
1238 + * This is a horrible hack to allocate the GDT. The problem
1239 + * is that cpu_init() is called really early for the boot CPU
1240 + * (and hence needs bootmem) but much later for the secondary
1241 + * CPUs, when bootmem will have gone away
1243 + if (NODE_DATA(0)->bdata->node_bootmem_map) {
1244 + gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
1245 + /* alloc_bootmem_pages panics on failure, so no check */
1246 + memset(gdt, 0, PAGE_SIZE);
1248 + gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
1249 + if (unlikely(!gdt)) {
1250 + printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
1252 + local_irq_enable();
1257 + * Initialize the per-CPU GDT with the boot GDT,
1258 + * and set up the GDT descriptor:
1260 + memcpy(gdt, cpu_gdt_table, GDT_SIZE);
1262 + /* Set up GDT entry for 16bit stack */
1263 + *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
1264 + ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
1265 + ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
1266 + (CPU_16BIT_STACK_SIZE - 1);
1268 + cpu_gdt_descr->size = GDT_SIZE - 1;
1269 + cpu_gdt_descr->address = (unsigned long)gdt;
1271 + if (cpu == 0 && cpu_gdt_descr->address == 0) {
1272 + gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
1273 + /* alloc_bootmem_pages panics on failure, so no check */
1274 + memset(gdt, 0, PAGE_SIZE);
1276 + memcpy(gdt, cpu_gdt_table, GDT_SIZE);
1278 + cpu_gdt_descr->size = GDT_SIZE;
1279 + cpu_gdt_descr->address = (unsigned long)gdt;
1283 + cpu_gdt_init(cpu_gdt_descr);
1286 + * Set up and load the per-CPU TSS and LDT
1288 + atomic_inc(&init_mm.mm_count);
1289 + current->active_mm = &init_mm;
1292 + enter_lazy_tlb(&init_mm, current);
1294 + load_esp0(t, thread);
1296 + load_LDT(&init_mm.context);
1298 +#ifdef CONFIG_DOUBLEFAULT
1299 + /* Set up doublefault TSS pointer in the GDT */
1300 + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
1303 + /* Clear %fs and %gs. */
1304 + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
1306 + /* Clear all 6 debug registers: */
1307 + set_debugreg(0, 0);
1308 + set_debugreg(0, 1);
1309 + set_debugreg(0, 2);
1310 + set_debugreg(0, 3);
1311 + set_debugreg(0, 6);
1312 + set_debugreg(0, 7);
1315 + * Force FPU initialization:
1317 + current_thread_info()->status = 0;
1318 + clear_used_math();
1319 + mxcsr_feature_mask_init();
1322 +#ifdef CONFIG_HOTPLUG_CPU
1323 +void __cpuinit cpu_uninit(void)
1325 + int cpu = raw_smp_processor_id();
1326 + cpu_clear(cpu, cpu_initialized);
1328 + /* lazy TLB state */
1329 + per_cpu(cpu_tlbstate, cpu).state = 0;
1330 + per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
1333 Index: head-2008-11-25/arch/x86/kernel/cpu/mtrr/main-xen.c
1334 ===================================================================
1335 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
1336 +++ head-2008-11-25/arch/x86/kernel/cpu/mtrr/main-xen.c 2008-01-28 12:24:18.000000000 +0100
1338 +#include <linux/init.h>
1339 +#include <linux/proc_fs.h>
1340 +#include <linux/ctype.h>
1341 +#include <linux/module.h>
1342 +#include <linux/seq_file.h>
1343 +#include <asm/uaccess.h>
1344 +#include <linux/mutex.h>
1346 +#include <asm/mtrr.h>
1349 +static DEFINE_MUTEX(mtrr_mutex);
1351 +void generic_get_mtrr(unsigned int reg, unsigned long *base,
1352 + unsigned int *size, mtrr_type * type)
1354 + struct xen_platform_op op;
1356 + op.cmd = XENPF_read_memtype;
1357 + op.u.read_memtype.reg = reg;
1358 + if (unlikely(HYPERVISOR_platform_op(&op)))
1359 + memset(&op.u.read_memtype, 0, sizeof(op.u.read_memtype));
1361 + *size = op.u.read_memtype.nr_mfns;
1362 + *base = op.u.read_memtype.mfn;
1363 + *type = op.u.read_memtype.type;
1366 +struct mtrr_ops generic_mtrr_ops = {
1367 + .use_intel_if = 1,
1368 + .get = generic_get_mtrr,
1371 +struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
1372 +unsigned int num_var_ranges;
1373 +unsigned int *usage_table;
1375 +static void __init set_num_var_ranges(void)
1377 + struct xen_platform_op op;
1379 + for (num_var_ranges = 0; ; num_var_ranges++) {
1380 + op.cmd = XENPF_read_memtype;
1381 + op.u.read_memtype.reg = num_var_ranges;
1382 + if (HYPERVISOR_platform_op(&op) != 0)
1387 +static void __init init_table(void)
1391 + max = num_var_ranges;
1392 + if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
1394 + printk(KERN_ERR "mtrr: could not allocate\n");
1397 + for (i = 0; i < max; i++)
1398 + usage_table[i] = 0;
1401 +int mtrr_add_page(unsigned long base, unsigned long size,
1402 + unsigned int type, char increment)
1405 + struct xen_platform_op op;
1407 + mutex_lock(&mtrr_mutex);
1409 + op.cmd = XENPF_add_memtype;
1410 + op.u.add_memtype.mfn = base;
1411 + op.u.add_memtype.nr_mfns = size;
1412 + op.u.add_memtype.type = type;
1413 + error = HYPERVISOR_platform_op(&op);
1415 + mutex_unlock(&mtrr_mutex);
1416 + BUG_ON(error > 0);
1421 + ++usage_table[op.u.add_memtype.reg];
1423 + mutex_unlock(&mtrr_mutex);
1425 + return op.u.add_memtype.reg;
1428 +static int mtrr_check(unsigned long base, unsigned long size)
1430 + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
1431 + printk(KERN_WARNING
1432 + "mtrr: size and base must be multiples of 4 kiB\n");
1434 + "mtrr: size: 0x%lx base: 0x%lx\n", size, base);
1442 +mtrr_add(unsigned long base, unsigned long size, unsigned int type,
1445 + if (mtrr_check(base, size))
1447 + return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
1451 +int mtrr_del_page(int reg, unsigned long base, unsigned long size)
1455 + unsigned long lbase;
1456 + unsigned int lsize;
1457 + int error = -EINVAL;
1458 + struct xen_platform_op op;
1460 + mutex_lock(&mtrr_mutex);
1463 + /* Search for existing MTRR */
1464 + for (i = 0; i < num_var_ranges; ++i) {
1465 +		mtrr_if->get(i, &lbase, &lsize, &ltype);
1466 + if (lbase == base && lsize == size) {
1472 + printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
1477 + if (usage_table[reg] < 1) {
1478 + printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
1481 + if (--usage_table[reg] < 1) {
1482 + op.cmd = XENPF_del_memtype;
1483 + op.u.del_memtype.handle = 0;
1484 + op.u.del_memtype.reg = reg;
1485 + error = HYPERVISOR_platform_op(&op);
1487 + BUG_ON(error > 0);
1493 + mutex_unlock(&mtrr_mutex);
1498 +mtrr_del(int reg, unsigned long base, unsigned long size)
1500 + if (mtrr_check(base, size))
1502 + return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
1505 +EXPORT_SYMBOL(mtrr_add);
1506 +EXPORT_SYMBOL(mtrr_del);
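A usage sketch for the exported pair (the framebuffer address and size here
are made up):

	/* Mark a 4 MiB framebuffer write-combining, then drop the entry.
	 * mtrr_add() returns the register slot, which mtrr_del() takes back. */
	int reg = mtrr_add(0xf8000000, 0x400000, MTRR_TYPE_WRCOMB, 1);
	if (reg >= 0)
		mtrr_del(reg, 0xf8000000, 0x400000);

Under Xen these route through the XENPF_add_memtype/XENPF_del_memtype
hypercalls above instead of touching the MTRR MSRs directly.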
1508 +void __init mtrr_bp_init(void)
1512 +void mtrr_ap_init(void)
1516 +static int __init mtrr_init(void)
1518 + struct cpuinfo_x86 *c = &boot_cpu_data;
1520 + if (!is_initial_xendomain())
1523 + if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
1524 + (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
1525 + (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
1526 + (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
1529 + set_num_var_ranges();
1535 +subsys_initcall(mtrr_init);
1536 Index: head-2008-11-25/arch/x86/kernel/entry_32-xen.S
1537 ===================================================================
1538 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
1539 +++ head-2008-11-25/arch/x86/kernel/entry_32-xen.S 2007-12-10 08:47:31.000000000 +0100
1542 + * linux/arch/i386/entry.S
1544 + * Copyright (C) 1991, 1992 Linus Torvalds
1548 + * entry.S contains the system-call and fault low-level handling routines.
1549 + * This also contains the timer-interrupt handler, as well as all interrupts
1550 + * and faults that can result in a task-switch.
1552 + * NOTE: This code handles signal-recognition, which happens every time
1553 + * after a timer-interrupt and after each system call.
1555 + * I changed all the .align's to 4 (16 byte alignment), as that's faster
1558 + * Stack layout in 'ret_from_system_call':
1559 + * ptrace needs to have all regs on the stack.
1560 + * if the order here is changed, it needs to be
1561 + * updated in fork.c:copy_process, signal.c:do_signal,
1562 + * ptrace.c and ptrace.h
1573 + * 24(%esp) - orig_eax
1576 + * 30(%esp) - %eflags
1577 + * 34(%esp) - %oldesp
1578 + * 38(%esp) - %oldss
1580 + * "current" is in register %ebx during any slow entries.
1583 +#include <linux/linkage.h>
1584 +#include <asm/thread_info.h>
1585 +#include <asm/irqflags.h>
1586 +#include <asm/errno.h>
1587 +#include <asm/segment.h>
1588 +#include <asm/smp.h>
1589 +#include <asm/page.h>
1590 +#include <asm/desc.h>
1591 +#include <asm/dwarf2.h>
1592 +#include "irq_vectors.h"
1593 +#include <xen/interface/xen.h>
1595 +#define nr_syscalls ((syscall_table_size)/4)
1613 +CF_MASK = 0x00000001
1614 +TF_MASK = 0x00000100
1615 +IF_MASK = 0x00000200
1616 +DF_MASK = 0x00000400
1617 +NT_MASK = 0x00004000
1618 +VM_MASK = 0x00020000
1619 +/* Pseudo-eflags. */
1620 +NMI_MASK = 0x80000000
1623 +#define DISABLE_INTERRUPTS cli
1624 +#define ENABLE_INTERRUPTS sti
1626 +/* Offsets into shared_info_t. */
1627 +#define evtchn_upcall_pending /* 0 */
1628 +#define evtchn_upcall_mask 1
1630 +#define sizeof_vcpu_shift 6
1633 +#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
1634 + shl $sizeof_vcpu_shift,%esi ; \
1635 + addl HYPERVISOR_shared_info,%esi
1637 +#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
1640 +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
1641 +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
1642 +#define DISABLE_INTERRUPTS GET_VCPU_INFO ; \
1643 + __DISABLE_INTERRUPTS
1644 +#define ENABLE_INTERRUPTS GET_VCPU_INFO ; \
1645 + __ENABLE_INTERRUPTS
1646 +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
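In C terms, these macros replace cli/sti with per-vCPU event masking in the
shared-info page; a sketch of the equivalent (types from
<xen/interface/xen.h>; sizeof_vcpu_shift = 6 matches the 64-byte struct
vcpu_info the SMP variant indexes by):

	static inline void sketch_xen_irq_disable(void)
	{
		vcpu_info_t *v =
			&HYPERVISOR_shared_info->vcpu_info[smp_processor_id()];

		v->evtchn_upcall_mask = 1;	/* DISABLE_INTERRUPTS */
		/* ENABLE_INTERRUPTS clears the mask instead, and
		 * __TEST_PENDING then checks v->evtchn_upcall_pending to see
		 * whether events arrived while masked. */
	}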
1649 +#ifdef CONFIG_PREEMPT
1650 +#define preempt_stop cli; TRACE_IRQS_OFF
1652 +#define preempt_stop
1653 +#define resume_kernel restore_nocheck
1656 +.macro TRACE_IRQS_IRET
1657 +#ifdef CONFIG_TRACE_IRQFLAGS
1658 + testl $IF_MASK,EFLAGS(%esp) # interrupts off?
1666 +#define resume_userspace_sig check_userspace
1668 +#define resume_userspace_sig resume_userspace
1674 + CFI_ADJUST_CFA_OFFSET 4;\
1675 + /*CFI_REL_OFFSET es, 0;*/\
1677 + CFI_ADJUST_CFA_OFFSET 4;\
1678 + /*CFI_REL_OFFSET ds, 0;*/\
1680 + CFI_ADJUST_CFA_OFFSET 4;\
1681 + CFI_REL_OFFSET eax, 0;\
1683 + CFI_ADJUST_CFA_OFFSET 4;\
1684 + CFI_REL_OFFSET ebp, 0;\
1686 + CFI_ADJUST_CFA_OFFSET 4;\
1687 + CFI_REL_OFFSET edi, 0;\
1689 + CFI_ADJUST_CFA_OFFSET 4;\
1690 + CFI_REL_OFFSET esi, 0;\
1692 + CFI_ADJUST_CFA_OFFSET 4;\
1693 + CFI_REL_OFFSET edx, 0;\
1695 + CFI_ADJUST_CFA_OFFSET 4;\
1696 + CFI_REL_OFFSET ecx, 0;\
1698 + CFI_ADJUST_CFA_OFFSET 4;\
1699 + CFI_REL_OFFSET ebx, 0;\
1700 + movl $(__USER_DS), %edx; \
1704 +#define RESTORE_INT_REGS \
1706 + CFI_ADJUST_CFA_OFFSET -4;\
1709 + CFI_ADJUST_CFA_OFFSET -4;\
1712 + CFI_ADJUST_CFA_OFFSET -4;\
1715 + CFI_ADJUST_CFA_OFFSET -4;\
1718 + CFI_ADJUST_CFA_OFFSET -4;\
1721 + CFI_ADJUST_CFA_OFFSET -4;\
1724 + CFI_ADJUST_CFA_OFFSET -4;\
1727 +#define RESTORE_REGS \
1728 + RESTORE_INT_REGS; \
1730 + CFI_ADJUST_CFA_OFFSET -4;\
1731 + /*CFI_RESTORE ds;*/\
1733 + CFI_ADJUST_CFA_OFFSET -4;\
1734 + /*CFI_RESTORE es;*/\
1735 +.section .fixup,"ax"; \
1736 +3: movl $0,(%esp); \
1738 +4: movl $0,(%esp); \
1741 +.section __ex_table,"a";\
1747 +#define RING0_INT_FRAME \
1748 + CFI_STARTPROC simple;\
1749 + CFI_DEF_CFA esp, 3*4;\
1750 + /*CFI_OFFSET cs, -2*4;*/\
1751 + CFI_OFFSET eip, -3*4
1753 +#define RING0_EC_FRAME \
1754 + CFI_STARTPROC simple;\
1755 + CFI_DEF_CFA esp, 4*4;\
1756 + /*CFI_OFFSET cs, -2*4;*/\
1757 + CFI_OFFSET eip, -3*4
1759 +#define RING0_PTREGS_FRAME \
1760 + CFI_STARTPROC simple;\
1761 + CFI_DEF_CFA esp, OLDESP-EBX;\
1762 + /*CFI_OFFSET cs, CS-OLDESP;*/\
1763 + CFI_OFFSET eip, EIP-OLDESP;\
1764 + /*CFI_OFFSET es, ES-OLDESP;*/\
1765 + /*CFI_OFFSET ds, DS-OLDESP;*/\
1766 + CFI_OFFSET eax, EAX-OLDESP;\
1767 + CFI_OFFSET ebp, EBP-OLDESP;\
1768 + CFI_OFFSET edi, EDI-OLDESP;\
1769 + CFI_OFFSET esi, ESI-OLDESP;\
1770 + CFI_OFFSET edx, EDX-OLDESP;\
1771 + CFI_OFFSET ecx, ECX-OLDESP;\
1772 + CFI_OFFSET ebx, EBX-OLDESP
1774 +ENTRY(ret_from_fork)
1777 + CFI_ADJUST_CFA_OFFSET 4
1778 + call schedule_tail
1779 + GET_THREAD_INFO(%ebp)
1781 + CFI_ADJUST_CFA_OFFSET -4
1782 + pushl $0x0202 # Reset kernel eflags
1783 + CFI_ADJUST_CFA_OFFSET 4
1785 + CFI_ADJUST_CFA_OFFSET -4
1790 + * Return to user mode is not as complex as all this looks,
1791 + * but we want the default path for a system call return to
1792 + * go as quickly as possible which is why some of this is
1793 + * less clear than it otherwise should be.
1796 + # userspace resumption stub bypassing syscall exit tracing
1798 + RING0_PTREGS_FRAME
1799 +ret_from_exception:
1802 + GET_THREAD_INFO(%ebp)
1804 + movl EFLAGS(%esp), %eax # mix EFLAGS and CS
1805 + movb CS(%esp), %al
1806 + testl $(VM_MASK | 2), %eax
1808 +ENTRY(resume_userspace)
1809 + DISABLE_INTERRUPTS # make sure we don't miss an interrupt
1810 + # setting need_resched or sigpending
1811 + # between sampling and the iret
1812 + movl TI_flags(%ebp), %ecx
1813 + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
1814 + # int/exception return?
1818 +#ifdef CONFIG_PREEMPT
1819 +ENTRY(resume_kernel)
1821 + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
1822 + jnz restore_nocheck
1824 + movl TI_flags(%ebp), %ecx # need_resched set ?
1825 + testb $_TIF_NEED_RESCHED, %cl
1827 + testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ?
1829 + call preempt_schedule_irq
1834 +/* SYSENTER_RETURN points to after the "sysenter" instruction in
1835 +   the vsyscall page. See vsyscall-sysenter.S, which defines the symbol. */
1837 + # sysenter call handler stub
1838 +ENTRY(sysenter_entry)
1839 + CFI_STARTPROC simple
1840 + CFI_DEF_CFA esp, 0
1841 + CFI_REGISTER esp, ebp
1842 + movl SYSENTER_stack_esp0(%esp),%esp
1845 + * No need to follow this irqs on/off section: the syscall
1846 + * disabled irqs and here we enable it straight after entry:
1849 + pushl $(__USER_DS)
1850 + CFI_ADJUST_CFA_OFFSET 4
1851 + /*CFI_REL_OFFSET ss, 0*/
1853 + CFI_ADJUST_CFA_OFFSET 4
1854 + CFI_REL_OFFSET esp, 0
1856 + CFI_ADJUST_CFA_OFFSET 4
1857 + pushl $(__USER_CS)
1858 + CFI_ADJUST_CFA_OFFSET 4
1859 + /*CFI_REL_OFFSET cs, 0*/
1861 + * Push current_thread_info()->sysenter_return to the stack.
1862 + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
1863 + * pushed above; +8 corresponds to copy_thread's esp0 setting.
1865 + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
1866 + CFI_ADJUST_CFA_OFFSET 4
1867 + CFI_REL_OFFSET eip, 0
1870 + * Load the potential sixth argument from user stack.
1871 + * Careful about security.
1873 + cmpl $__PAGE_OFFSET-3,%ebp
1875 +1: movl (%ebp),%ebp
1876 +.section __ex_table,"a"
1878 + .long 1b,syscall_fault
1882 + CFI_ADJUST_CFA_OFFSET 4
1884 + GET_THREAD_INFO(%ebp)
1886 + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
1887 + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
1888 + jnz syscall_trace_entry
1889 + cmpl $(nr_syscalls), %eax
1890 + jae syscall_badsys
1891 + call *sys_call_table(,%eax,4)
1892 + movl %eax,EAX(%esp)
1893 + DISABLE_INTERRUPTS
1895 + movl TI_flags(%ebp), %ecx
1896 + testw $_TIF_ALLWORK_MASK, %cx
1897 + jne syscall_exit_work
1898 +/* if something modifies registers it must also disable sysexit */
1899 + movl EIP(%esp), %edx
1900 + movl OLDESP(%esp), %ecx
1904 + __ENABLE_INTERRUPTS
1905 +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/
1907 + jnz 14f # process more events if necessary...
1908 + movl ESI(%esp), %esi
1910 +14: __DISABLE_INTERRUPTS
1912 +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/
1914 + call evtchn_do_upcall
1921 +#endif /* !CONFIG_XEN */
1924 + # pv sysenter call handler stub
1925 +ENTRY(sysenter_entry_pv)
1927 + movl $__USER_DS,16(%esp)
1928 + movl %ebp,12(%esp)
1929 + movl $__USER_CS,4(%esp)
1931 + /* +5*4 is SS:ESP,EFLAGS,CS:EIP. +8 is esp0 setting. */
1932 + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
1934 + * Load the potential sixth argument from user stack.
1935 + * Careful about security.
1937 + cmpl $__PAGE_OFFSET-3,%ebp
1939 +1: movl (%ebp),%ebp
1940 +.section __ex_table,"a"
1942 + .long 1b,syscall_fault
1944 + /* fall through */
1946 +ENDPROC(sysenter_entry_pv)
1948 + # system call handler stub
1950 + RING0_INT_FRAME # can't unwind into user space anyway
1951 + pushl %eax # save orig_eax
1952 + CFI_ADJUST_CFA_OFFSET 4
1954 + GET_THREAD_INFO(%ebp)
1955 + testl $TF_MASK,EFLAGS(%esp)
1957 + orl $_TIF_SINGLESTEP,TI_flags(%ebp)
1959 + # system call tracing in operation / emulation
1960 + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
1961 + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
1962 + jnz syscall_trace_entry
1963 + cmpl $(nr_syscalls), %eax
1964 + jae syscall_badsys
1966 + call *sys_call_table(,%eax,4)
1967 + movl %eax,EAX(%esp) # store the return value
1969 + DISABLE_INTERRUPTS # make sure we don't miss an interrupt
1970 + # setting need_resched or sigpending
1971 + # between sampling and the iret
1973 + movl TI_flags(%ebp), %ecx
1974 + testw $_TIF_ALLWORK_MASK, %cx # current->work
1975 + jne syscall_exit_work
1979 + movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
1980 + # Warning: OLDSS(%esp) contains the wrong/random values if we
1981 + # are returning to the kernel.
1982 + # See comments in process.c:copy_thread() for details.
1983 + movb OLDSS(%esp), %ah
1984 + movb CS(%esp), %al
1985 + andl $(VM_MASK | (4 << 8) | 3), %eax
1986 + cmpl $((4 << 8) | 3), %eax
1987 + CFI_REMEMBER_STATE
1988 + je ldt_ss # returning to user-space with LDT SS
1992 + movl EFLAGS(%esp), %eax
1993 + testl $(VM_MASK|NMI_MASK), %eax
1994 + CFI_REMEMBER_STATE
1995 + jnz hypervisor_iret
1996 + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
1998 + andb evtchn_upcall_mask(%esi),%al
1999 + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
2000 + CFI_REMEMBER_STATE
2001 + jnz restore_all_enable_events # != 0 => enable event delivery
2004 +restore_nocheck_notrace:
2007 + CFI_ADJUST_CFA_OFFSET -4
2009 +.section .fixup,"ax"
2015 + pushl $0 # no error code
2016 + pushl $do_iret_error
2019 +.section __ex_table,"a"
2027 + larl OLDSS(%esp), %eax
2028 + jnz restore_nocheck
2029 + testl $0x00400000, %eax # returning to 32bit stack?
2030 +	jnz restore_nocheck	# all right, normal return
2031 + /* If returning to userspace with 16bit stack,
2032 + * try to fix the higher word of ESP, as the CPU
2033 + * won't restore it.
2034 + * This is an "official" bug of all the x86-compatible
2035 + * CPUs, which we can try to work around to make
2036 + * dosemu and wine happy. */
2037 + subl $8, %esp # reserve space for switch16 pointer
2038 + CFI_ADJUST_CFA_OFFSET 8
2042 + /* Set up the 16bit stack frame with switch32 pointer on top,
2043 + * and a switch16 pointer on top of the current frame. */
2044 + call setup_x86_bogus_stack
2045 + CFI_ADJUST_CFA_OFFSET -8 # frame has moved
2048 + lss 20+4(%esp), %esp # switch to 16bit stack
2050 +.section __ex_table,"a"
2056 +restore_all_enable_events:
2058 + __ENABLE_INTERRUPTS
2059 +scrit: /**** START OF CRITICAL REGION ****/
2061 + jnz 14f # process more events if necessary...
2064 + CFI_ADJUST_CFA_OFFSET -4
2066 +.section __ex_table,"a"
2070 +14: __DISABLE_INTERRUPTS
2073 +ecrit: /**** END OF CRITICAL REGION ****/
2077 + andl $~NMI_MASK, EFLAGS(%esp)
2080 + CFI_ADJUST_CFA_OFFSET -4
2081 + jmp hypercall_page + (__HYPERVISOR_iret * 32)
2085 + # perform work that needs to be done immediately before resumption
2087 + RING0_PTREGS_FRAME # can't unwind into user space anyway
2089 + testb $_TIF_NEED_RESCHED, %cl
2093 + DISABLE_INTERRUPTS # make sure we don't miss an interrupt
2094 + # setting need_resched or sigpending
2095 + # between sampling and the iret
2097 + movl TI_flags(%ebp), %ecx
2098 + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
2099 + # than syscall tracing?
2101 + testb $_TIF_NEED_RESCHED, %cl
2104 +work_notifysig: # deal with pending signals and
2105 + # notify-resume requests
2106 + testl $VM_MASK, EFLAGS(%esp)
2108 + jne work_notifysig_v86 # returning to kernel-space or
2111 + call do_notify_resume
2112 + jmp resume_userspace_sig
2115 +work_notifysig_v86:
2117 + pushl %ecx # save ti_flags for do_notify_resume
2118 + CFI_ADJUST_CFA_OFFSET 4
2119 + call save_v86_state # %eax contains pt_regs pointer
2121 + CFI_ADJUST_CFA_OFFSET -4
2124 + call do_notify_resume
2125 + jmp resume_userspace_sig
2128 + # perform syscall exit tracing
2130 +syscall_trace_entry:
2131 + movl $-ENOSYS,EAX(%esp)
2134 + call do_syscall_trace
2136 + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
2137 + # so must skip actual syscall
2138 + movl ORIG_EAX(%esp), %eax
2139 + cmpl $(nr_syscalls), %eax
2143 + # perform syscall exit tracing
2146 + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
2149 + ENABLE_INTERRUPTS # could let do_syscall_trace() call
2150 + # schedule() instead
2153 + call do_syscall_trace
2154 + jmp resume_userspace
2157 + RING0_INT_FRAME # can't unwind into user space anyway
2159 + pushl %eax # save orig_eax
2160 + CFI_ADJUST_CFA_OFFSET 4
2162 + GET_THREAD_INFO(%ebp)
2163 + movl $-EFAULT,EAX(%esp)
2164 + jmp resume_userspace
2167 + movl $-ENOSYS,EAX(%esp)
2168 + jmp resume_userspace
2172 +#define FIXUP_ESPFIX_STACK \
2173 + movl %esp, %eax; \
2174 + /* switch to 32bit stack using the pointer on top of 16bit stack */ \
2175 + lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
2176 + /* copy data from 16bit stack to 32bit stack */ \
2177 + call fixup_x86_bogus_stack; \
2178 + /* put ESP to the proper location */ \
2180 +#define UNWIND_ESPFIX_STACK \
2182 + CFI_ADJUST_CFA_OFFSET 4; \
2184 + /* see if on 16bit stack */ \
2185 + cmpw $__ESPFIX_SS, %ax; \
2188 + CFI_ADJUST_CFA_OFFSET -4; \
2189 +.section .fixup,"ax"; \
2190 +28: movl $__KERNEL_DS, %eax; \
2193 + /* switch to 32bit stack */ \
2194 + FIXUP_ESPFIX_STACK; \
2199 + * Build the entry stubs and pointer table with
2200 + * some assembler magic.
2207 +ENTRY(irq_entries_start)
2212 + CFI_ADJUST_CFA_OFFSET -4
2214 +1: pushl $~(vector)
2215 + CFI_ADJUST_CFA_OFFSET 4
2216 + jmp common_interrupt
2224 + * the CPU automatically disables interrupts when executing an IRQ vector,
2225 + * so IRQ-flags tracing has to follow that:
2236 +#define BUILD_INTERRUPT(name, nr) \
2238 + RING0_INT_FRAME; \
2240 + CFI_ADJUST_CFA_OFFSET 4; \
2244 + call smp_/**/name; \
2245 + jmp ret_from_intr; \
2248 +/* The include is where all of the SMP etc. interrupts come from */
2249 +#include "entry_arch.h"
2251 +#define UNWIND_ESPFIX_STACK
2254 +ENTRY(divide_error)
2256 + pushl $0 # no error code
2257 + CFI_ADJUST_CFA_OFFSET 4
2258 + pushl $do_divide_error
2259 + CFI_ADJUST_CFA_OFFSET 4
2263 + CFI_ADJUST_CFA_OFFSET 4
2264 + /*CFI_REL_OFFSET ds, 0*/
2266 + CFI_ADJUST_CFA_OFFSET 4
2267 + CFI_REL_OFFSET eax, 0
2270 + CFI_ADJUST_CFA_OFFSET 4
2271 + CFI_REL_OFFSET ebp, 0
2273 + CFI_ADJUST_CFA_OFFSET 4
2274 + CFI_REL_OFFSET edi, 0
2276 + CFI_ADJUST_CFA_OFFSET 4
2277 + CFI_REL_OFFSET esi, 0
2279 + CFI_ADJUST_CFA_OFFSET 4
2280 + CFI_REL_OFFSET edx, 0
2281 + decl %eax # eax = -1
2283 + CFI_ADJUST_CFA_OFFSET 4
2284 + CFI_REL_OFFSET ecx, 0
2286 + CFI_ADJUST_CFA_OFFSET 4
2287 + CFI_REL_OFFSET ebx, 0
2290 + CFI_ADJUST_CFA_OFFSET 4
2291 + /*CFI_REL_OFFSET es, 0*/
2292 + UNWIND_ESPFIX_STACK
2294 + CFI_ADJUST_CFA_OFFSET -4
2295 + /*CFI_REGISTER es, ecx*/
2296 + movl ES(%esp), %edi # get the function address
2297 + movl ORIG_EAX(%esp), %edx # get the error code
2298 + movl %eax, ORIG_EAX(%esp)
2299 + movl %ecx, ES(%esp)
2300 + /*CFI_REL_OFFSET es, ES*/
2301 + movl $(__USER_DS), %ecx
2304 + movl %esp,%eax # pt_regs pointer
2306 + jmp ret_from_exception
2310 +# A note on the "critical region" in our callback handler.
2311 +# We want to avoid stacking callback handlers due to events occurring
2312 +# during handling of the last event. To do this, we keep events disabled
2313 +# until we've done all processing. HOWEVER, we must enable events before
2314 +# popping the stack frame (can't be done atomically) and so it would still
2315 +# be possible to get enough handler activations to overflow the stack.
2316 +# Although unlikely, bugs of that kind are hard to track down, so we'd
2317 +# like to avoid the possibility.
2318 +# So, on entry to the handler we detect whether we interrupted an
2319 +# existing activation in its critical region -- if so, we pop the current
2320 +# activation and restart the handler using the previous one.
2322 +# The sysexit critical region is slightly different. sysexit
2323 +# atomically removes the entire stack frame. If we interrupt in the
2324 +# critical region we know that the entire frame is present and correct
2325 +# so we can simply throw away the new one.
2326 +ENTRY(hypervisor_callback)
2329 + CFI_ADJUST_CFA_OFFSET 4
2331 + movl EIP(%esp),%eax
2335 + jb critical_region_fixup
2336 + cmpl $sysexit_scrit,%eax
2338 + cmpl $sysexit_ecrit,%eax
2340 + addl $OLDESP,%esp # Remove eflags...ebx from stack frame.
2342 + CFI_ADJUST_CFA_OFFSET 4
2343 + call evtchn_do_upcall
2345 + CFI_ADJUST_CFA_OFFSET -4
2349 +# [How we do the fixup]. We want to merge the current stack frame with the
2350 +# just-interrupted frame. How we do this depends on where in the critical
2351 +# region the interrupted handler was executing, and so how many saved
2352 +# registers are in each frame. We do this quickly using the lookup table
2353 +# 'critical_fixup_table'. For each byte offset in the critical region, it
2354 +# provides the number of bytes which have already been popped from the
2355 +# interrupted stack frame.
2356 +critical_region_fixup:
2357 + movzbl critical_fixup_table-scrit(%eax),%ecx # %ecx = num bytes already popped
2358 + cmpb $0xff,%cl # 0xff => vcpu_info critical region
2361 +15: leal (%esp,%ecx),%esi # %esi points at end of src region
2362 + leal OLDESP(%esp),%edi # %edi points at end of dst region
2363 + shrl $2,%ecx # convert bytes to words
2364 + je 17f # skip loop if nothing to copy
2365 +16: subl $4,%esi # pre-decrementing copy loop
2370 +17: movl %edi,%esp # final %edi is top of merged stack
2373 +.section .rodata,"a"
2374 +critical_fixup_table:
2375 + .byte 0xff,0xff,0xff # testb $0xff,(%esi) = __TEST_PENDING
2376 + .byte 0xff,0xff # jnz 14f
2377 + .byte 0x00 # pop %ebx
2378 + .byte 0x04 # pop %ecx
2379 + .byte 0x08 # pop %edx
2380 + .byte 0x0c # pop %esi
2381 + .byte 0x10 # pop %edi
2382 + .byte 0x14 # pop %ebp
2383 + .byte 0x18 # pop %eax
2384 + .byte 0x1c # pop %ds
2385 + .byte 0x20 # pop %es
2386 + .byte 0x24,0x24,0x24 # add $4,%esp
2388 + .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi)
2389 + .byte 0x00,0x00 # jmp 11b
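
For illustration, the merge performed at labels 15:-17: above can be written
as C (hypothetical standalone sketch; 'popped' is the table value, i.e. the
bytes already gone from the interrupted frame, and OLDESP is the byte size of
a full frame up to the saved user stack pointer):

        static void merge_frames(unsigned int *esp, unsigned int popped,
                                 unsigned int oldesp)
        {
                unsigned int *src = esp + popped / 4;   /* end of src region */
                unsigned int *dst = (unsigned int *)((char *)esp + oldesp);
                unsigned int n = popped / 4;            /* shrl $2,%ecx */

                while (n--)                             /* pre-decrementing copy */
                        *--dst = *--src;
                /* final dst is the merged stack top: 17: movl %edi,%esp */
        }
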
2392 +# Hypervisor uses this for application faults while it executes.
2393 +# We get here for two reasons:
2394 +# 1. Fault while reloading DS, ES, FS or GS
2395 +# 2. Fault while executing IRET
2396 +# Category 1 we fix up by reattempting the load, and zeroing the segment
2397 +# register if the load fails.
2398 +# Category 2 we fix up by jumping to do_iret_error. We cannot use the
2399 +# normal Linux return path in this case because if we use the IRET hypercall
2400 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
2401 +# We distinguish between categories by maintaining a status value in EAX.
2402 +ENTRY(failsafe_callback)
2407 +3: mov 12(%esp),%fs
2408 +4: mov 16(%esp),%gs
2412 + addl $16,%esp # EAX != 0 => Category 2 (Bad IRET)
2414 +5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment)
2418 + jmp ret_from_exception
2419 +.section .fixup,"ax"; \
2420 +6: xorl %eax,%eax; \
2421 + movl %eax,4(%esp); \
2423 +7: xorl %eax,%eax; \
2424 + movl %eax,8(%esp); \
2426 +8: xorl %eax,%eax; \
2427 + movl %eax,12(%esp); \
2429 +9: xorl %eax,%eax; \
2430 + movl %eax,16(%esp); \
2433 +.section __ex_table,"a"; \
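
The status value carried in %eax reduces the failsafe path to the following
pseudo-C (the helper name is illustrative only):

        if (eax != 0)
                do_iret_error();             /* category 2: IRET itself faulted */
        else
                resume_ret_from_exception(); /* category 1: the faulting segment
                                                was zeroed by the fixup, continue */
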
2443 +ENTRY(coprocessor_error)
2446 + CFI_ADJUST_CFA_OFFSET 4
2447 + pushl $do_coprocessor_error
2448 + CFI_ADJUST_CFA_OFFSET 4
2452 +ENTRY(simd_coprocessor_error)
2455 + CFI_ADJUST_CFA_OFFSET 4
2456 + pushl $do_simd_coprocessor_error
2457 + CFI_ADJUST_CFA_OFFSET 4
2461 +ENTRY(device_not_available)
2463 + pushl $-1 # mark this as an int
2464 + CFI_ADJUST_CFA_OFFSET 4
2468 + testl $0x4, %eax # EM (math emulation bit)
2469 + je device_available_emulate
2470 + pushl $0 # temporary storage for ORIG_EIP
2471 + CFI_ADJUST_CFA_OFFSET 4
2474 + CFI_ADJUST_CFA_OFFSET -4
2475 + jmp ret_from_exception
2476 +device_available_emulate:
2479 + call math_state_restore
2480 + jmp ret_from_exception
2485 + * Debug traps and NMI can happen at the one SYSENTER instruction
2486 + * that sets up the real kernel stack. Check here, since we can't
2487 + * allow the wrong stack to be used.
2489 + * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
2490 + * already pushed 3 words if it hits on the sysenter instruction:
2491 + * eflags, cs and eip.
2493 + * We just load the right stack, and push the three (known) values
2494 + * by hand onto the new stack - while updating the return eip past
2495 + * the instruction that would have done it for sysenter.
2497 +#define FIX_STACK(offset, ok, label) \
2498 + cmpw $__KERNEL_CS,4(%esp); \
2501 + movl SYSENTER_stack_esp0+offset(%esp),%esp; \
2503 + pushl $__KERNEL_CS; \
2504 + pushl $sysenter_past_esp
2505 +#endif /* CONFIG_XEN */
2507 +KPROBE_ENTRY(debug)
2510 + cmpl $sysenter_entry,(%esp)
2511 + jne debug_stack_correct
2512 + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
2513 +debug_stack_correct:
2514 +#endif /* !CONFIG_XEN */
2515 + pushl $-1 # mark this as an int
2516 + CFI_ADJUST_CFA_OFFSET 4
2518 + xorl %edx,%edx # error code 0
2519 + movl %esp,%eax # pt_regs pointer
2521 + jmp ret_from_exception
2526 + * NMI is doubly nasty. It can happen _while_ we're handling
2527 + * a debug fault, and the debug fault hasn't yet been able to
2528 + * clear up the stack. So we first check whether we got an
2529 + * NMI on the sysenter entry path, but after that we need to
2530 + * check whether we got an NMI on the debug path where the debug
2531 + * fault happened on the sysenter path.
2536 + CFI_ADJUST_CFA_OFFSET 4
2538 + cmpw $__ESPFIX_SS, %ax
2540 + CFI_ADJUST_CFA_OFFSET -4
2541 + je nmi_16bit_stack
2542 + cmpl $sysenter_entry,(%esp)
2543 + je nmi_stack_fixup
2545 + CFI_ADJUST_CFA_OFFSET 4
2547 + /* Do not access memory above the end of our stack page;
2548 + * it might not exist.
2550 + andl $(THREAD_SIZE-1),%eax
2551 + cmpl $(THREAD_SIZE-20),%eax
2553 + CFI_ADJUST_CFA_OFFSET -4
2554 + jae nmi_stack_correct
2555 + cmpl $sysenter_entry,12(%esp)
2556 + je nmi_debug_stack_check
2559 + CFI_ADJUST_CFA_OFFSET 4
2561 + xorl %edx,%edx # zero error code
2562 + movl %esp,%eax # pt_regs pointer
2564 + jmp restore_nocheck_notrace
2568 + FIX_STACK(12,nmi_stack_correct, 1)
2569 + jmp nmi_stack_correct
2570 +nmi_debug_stack_check:
2571 + cmpw $__KERNEL_CS,16(%esp)
2572 + jne nmi_stack_correct
2573 + cmpl $debug,(%esp)
2574 + jb nmi_stack_correct
2575 + cmpl $debug_esp_fix_insn,(%esp)
2576 + ja nmi_stack_correct
2577 + FIX_STACK(24,nmi_stack_correct, 1)
2578 + jmp nmi_stack_correct
2582 + /* create the pointer to lss back */
2584 + CFI_ADJUST_CFA_OFFSET 4
2586 + CFI_ADJUST_CFA_OFFSET 4
2589 + /* copy the iret frame of 12 bytes */
2592 + CFI_ADJUST_CFA_OFFSET 4
2595 + CFI_ADJUST_CFA_OFFSET 4
2597 + FIXUP_ESPFIX_STACK # %eax == %esp
2598 + CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved
2599 + xorl %edx,%edx # zero error code
2602 + lss 12+4(%esp), %esp # back to 16bit stack
2605 +.section __ex_table,"a"
2613 + CFI_ADJUST_CFA_OFFSET 4
2615 + xorl %edx,%edx # zero error code
2616 + movl %esp,%eax # pt_regs pointer
2618 + orl $NMI_MASK, EFLAGS(%esp)
2625 + pushl $-1 # mark this as an int
2626 + CFI_ADJUST_CFA_OFFSET 4
2628 + xorl %edx,%edx # zero error code
2629 + movl %esp,%eax # pt_regs pointer
2631 + jmp ret_from_exception
2638 + CFI_ADJUST_CFA_OFFSET 4
2639 + pushl $do_overflow
2640 + CFI_ADJUST_CFA_OFFSET 4
2647 + CFI_ADJUST_CFA_OFFSET 4
2649 + CFI_ADJUST_CFA_OFFSET 4
2656 + CFI_ADJUST_CFA_OFFSET 4
2657 + pushl $do_invalid_op
2658 + CFI_ADJUST_CFA_OFFSET 4
2662 +ENTRY(coprocessor_segment_overrun)
2665 + CFI_ADJUST_CFA_OFFSET 4
2666 + pushl $do_coprocessor_segment_overrun
2667 + CFI_ADJUST_CFA_OFFSET 4
2673 + pushl $do_invalid_TSS
2674 + CFI_ADJUST_CFA_OFFSET 4
2678 +ENTRY(segment_not_present)
2680 + pushl $do_segment_not_present
2681 + CFI_ADJUST_CFA_OFFSET 4
2685 +ENTRY(stack_segment)
2687 + pushl $do_stack_segment
2688 + CFI_ADJUST_CFA_OFFSET 4
2692 +KPROBE_ENTRY(general_protection)
2694 + pushl $do_general_protection
2695 + CFI_ADJUST_CFA_OFFSET 4
2700 +ENTRY(alignment_check)
2702 + pushl $do_alignment_check
2703 + CFI_ADJUST_CFA_OFFSET 4
2707 +KPROBE_ENTRY(page_fault)
2709 + pushl $do_page_fault
2710 + CFI_ADJUST_CFA_OFFSET 4
2715 +#ifdef CONFIG_X86_MCE
2716 +ENTRY(machine_check)
2719 + CFI_ADJUST_CFA_OFFSET 4
2720 + pushl machine_check_vector
2721 + CFI_ADJUST_CFA_OFFSET 4
2727 +ENTRY(spurious_interrupt_bug)
2730 + CFI_ADJUST_CFA_OFFSET 4
2731 + pushl $do_spurious_interrupt_bug
2732 + CFI_ADJUST_CFA_OFFSET 4
2735 +#endif /* !CONFIG_XEN */
2737 +#ifdef CONFIG_STACK_UNWIND
2738 +ENTRY(arch_unwind_init_running)
2740 + movl 4(%esp), %edx
2742 + leal 4(%esp), %eax
2743 + movl %ebx, EBX(%edx)
2745 + movl %ebx, ECX(%edx)
2746 + movl %ebx, EDX(%edx)
2747 + movl %esi, ESI(%edx)
2748 + movl %edi, EDI(%edx)
2749 + movl %ebp, EBP(%edx)
2750 + movl %ebx, EAX(%edx)
2751 + movl $__USER_DS, DS(%edx)
2752 + movl $__USER_DS, ES(%edx)
2753 + movl %ebx, ORIG_EAX(%edx)
2754 + movl %ecx, EIP(%edx)
2755 + movl 12(%esp), %ecx
2756 + movl $__KERNEL_CS, CS(%edx)
2757 + movl %ebx, EFLAGS(%edx)
2758 + movl %eax, OLDESP(%edx)
2759 + movl 8(%esp), %eax
2760 + movl %ecx, 8(%esp)
2761 + movl EBX(%edx), %ebx
2762 + movl $__KERNEL_DS, OLDSS(%edx)
2765 +ENDPROC(arch_unwind_init_running)
2768 +ENTRY(fixup_4gb_segment)
2770 + pushl $do_fixup_4gb_segment
2771 + CFI_ADJUST_CFA_OFFSET 4
2775 +.section .rodata,"a"
2776 +#include "syscall_table.S"
2778 +syscall_table_size=(.-sys_call_table)
2779 Index: head-2008-11-25/arch/x86/kernel/fixup.c
2780 ===================================================================
2781 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
2782 +++ head-2008-11-25/arch/x86/kernel/fixup.c 2008-01-28 12:24:18.000000000 +0100
2784 +/******************************************************************************
2787 + * Binary-rewriting of certain IA32 instructions, on notification by Xen.
2788 + * Used to avoid repeated slow emulation of common instructions used by the
2789 + * user-space TLS (Thread-Local Storage) libraries.
2792 + * Issues with the binary rewriting have caused it to be removed. Instead
2793 + * we rely on Xen's emulator to boot the kernel, and then print a banner
2794 + * message recommending that the user disable /lib/tls.
2796 + * Copyright (c) 2004, K A Fraser
2798 + * This program is free software; you can redistribute it and/or modify
2799 + * it under the terms of the GNU General Public License as published by
2800 + * the Free Software Foundation; either version 2 of the License, or
2801 + * (at your option) any later version.
2803 + * This program is distributed in the hope that it will be useful,
2804 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
2805 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2806 + * GNU General Public License for more details.
2808 + * You should have received a copy of the GNU General Public License
2809 + * along with this program; if not, write to the Free Software
2810 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2813 +#include <linux/init.h>
2814 +#include <linux/sched.h>
2815 +#include <linux/slab.h>
2816 +#include <linux/kernel.h>
2817 +#include <linux/delay.h>
2818 +#include <linux/version.h>
2820 +#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
2822 +fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
2824 + static unsigned long printed = 0;
2828 + /* Ignore statically-linked init. */
2829 + if (current->tgid == 1)
2832 + VOID(HYPERVISOR_vm_assist(VMASST_CMD_disable,
2833 + VMASST_TYPE_4gb_segments_notify));
2835 + if (test_and_set_bit(0, &printed))
2838 + sprintf(info, "%s (pid=%d)", current->comm, current->tgid);
2841 + DP("***************************************************************");
2842 + DP("***************************************************************");
2843 + DP("** WARNING: Currently emulating unsupported memory accesses **");
2844 + DP("** in /lib/tls glibc libraries. The emulation is **");
2845 + DP("** slow. To ensure full performance you should **");
2846 + DP("** install a 'xen-friendly' (nosegneg) version of **");
2847 + DP("** the library, or disable tls support by executing **");
2848 + DP("** the following as root: **");
2849 + DP("** mv /lib/tls /lib/tls.disabled **");
2850 + DP("** Offending process: %-38.38s **", info);
2851 + DP("***************************************************************");
2852 + DP("***************************************************************");
2855 + for (i = 5; i > 0; i--) {
2856 + touch_softlockup_watchdog();
2857 + printk("Pausing... %d", i);
2859 + printk("\b\b\b\b\b\b\b\b\b\b\b\b");
2862 + printk("Continuing...\n\n");
2865 +static int __init fixup_init(void)
2867 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
2868 + VMASST_TYPE_4gb_segments_notify));
2871 +__initcall(fixup_init);
2872 Index: head-2008-11-25/arch/x86/kernel/head_32-xen.S
2873 ===================================================================
2874 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
2875 +++ head-2008-11-25/arch/x86/kernel/head_32-xen.S 2007-06-12 13:12:48.000000000 +0200
2880 +#include <linux/elfnote.h>
2881 +#include <linux/threads.h>
2882 +#include <linux/linkage.h>
2883 +#include <asm/segment.h>
2884 +#include <asm/page.h>
2885 +#include <asm/cache.h>
2886 +#include <asm/thread_info.h>
2887 +#include <asm/asm-offsets.h>
2888 +#include <asm/dwarf2.h>
2889 +#include <xen/interface/xen.h>
2890 +#include <xen/interface/elfnote.h>
2893 + * References to members of the new_cpu_data structure.
2896 +#define X86 new_cpu_data+CPUINFO_x86
2897 +#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
2898 +#define X86_MODEL new_cpu_data+CPUINFO_x86_model
2899 +#define X86_MASK new_cpu_data+CPUINFO_x86_mask
2900 +#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
2901 +#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
2902 +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
2903 +#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
2905 +#define VIRT_ENTRY_OFFSET 0x0
2906 +.org VIRT_ENTRY_OFFSET
2908 + movl %esi,xen_start_info
2911 + /* Set up the stack pointer */
2912 + movl $(init_thread_union+THREAD_SIZE),%esp
2914 + /* get vendor info */
2915 + xorl %eax,%eax # call CPUID with 0 -> return vendor ID
2917 + movl %eax,X86_CPUID # save CPUID level
2918 + movl %ebx,X86_VENDOR_ID # lo 4 chars
2919 + movl %edx,X86_VENDOR_ID+4 # next 4 chars
2920 + movl %ecx,X86_VENDOR_ID+8 # last 4 chars
2922 + movl $1,%eax # Use the CPUID instruction to get CPU type
2924 + movb %al,%cl # save reg for future use
2925 + andb $0x0f,%ah # mask processor family
2927 + andb $0xf0,%al # mask model
2929 + movb %al,X86_MODEL
2930 + andb $0x0f,%cl # mask mask revision
2932 + movl %edx,X86_CAPABILITY
2934 + movb $1,X86_HARD_MATH
2936 + xorl %eax,%eax # Clear FS/GS and LDT
2939 + cld # gcc2 wants the direction flag cleared at all times
2941 + pushl %eax # fake return address
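
The family/model/stepping extraction above follows the standard CPUID(1)
signature layout; the same bit arithmetic in C (illustrative sketch):

        static void decode_cpu_signature(unsigned int eax,
                                         unsigned char *family,
                                         unsigned char *model,
                                         unsigned char *stepping)
        {
                *stepping = eax & 0x0f;        /* andb $0x0f,%cl */
                *model    = (eax >> 4) & 0x0f; /* andb $0xf0,%al, then shifted */
                *family   = (eax >> 8) & 0x0f; /* andb $0x0f,%ah */
        }
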
2944 +#define HYPERCALL_PAGE_OFFSET 0x1000
2945 +.org HYPERCALL_PAGE_OFFSET
2946 +ENTRY(hypercall_page)
2952 + * Real beginning of normal "text" segment
2960 +.section ".bss.page_aligned","w"
2961 +ENTRY(empty_zero_page)
2965 + * This starts the data section.
2970 + * The Global Descriptor Table contains 28 quadwords, per-CPU.
2972 + .align L1_CACHE_BYTES
2973 +ENTRY(cpu_gdt_table)
2974 + .quad 0x0000000000000000 /* NULL descriptor */
2975 + .quad 0x0000000000000000 /* 0x0b reserved */
2976 + .quad 0x0000000000000000 /* 0x13 reserved */
2977 + .quad 0x0000000000000000 /* 0x1b reserved */
2978 + .quad 0x0000000000000000 /* 0x20 unused */
2979 + .quad 0x0000000000000000 /* 0x28 unused */
2980 + .quad 0x0000000000000000 /* 0x33 TLS entry 1 */
2981 + .quad 0x0000000000000000 /* 0x3b TLS entry 2 */
2982 + .quad 0x0000000000000000 /* 0x43 TLS entry 3 */
2983 + .quad 0x0000000000000000 /* 0x4b reserved */
2984 + .quad 0x0000000000000000 /* 0x53 reserved */
2985 + .quad 0x0000000000000000 /* 0x5b reserved */
2987 + .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
2988 + .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
2989 + .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
2990 + .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
2992 + .quad 0x0000000000000000 /* 0x80 TSS descriptor */
2993 + .quad 0x0000000000000000 /* 0x88 LDT descriptor */
2996 + * Segments used for calling PnP BIOS have byte granularity.
2997 + * Their code and data segments have fixed 64k limits;
2998 + * the transfer segment sizes are set at run time.
3000 + .quad 0x0000000000000000 /* 0x90 32-bit code */
3001 + .quad 0x0000000000000000 /* 0x98 16-bit code */
3002 + .quad 0x0000000000000000 /* 0xa0 16-bit data */
3003 + .quad 0x0000000000000000 /* 0xa8 16-bit data */
3004 + .quad 0x0000000000000000 /* 0xb0 16-bit data */
3007 + * The APM segments have byte granularity and their bases
3008 + * are set at run time. All have 64k limits.
3010 + .quad 0x0000000000000000 /* 0xb8 APM CS code */
3011 + .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */
3012 + .quad 0x0000000000000000 /* 0xc8 APM DS data */
3014 + .quad 0x0000000000000000 /* 0xd0 - ESPFIX 16-bit SS */
3015 + .quad 0x0000000000000000 /* 0xd8 - unused */
3016 + .quad 0x0000000000000000 /* 0xe0 - unused */
3017 + .quad 0x0000000000000000 /* 0xe8 - unused */
3018 + .quad 0x0000000000000000 /* 0xf0 - unused */
3019 + .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
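
As a worked example of the descriptor encoding in this table, the kernel code
entry 0x00cf9a000000ffff decodes to base 0, limit 0xfffff in 4KiB units (i.e.
4GB) and access byte 0x9a; a standalone sketch of the unpacking:

        #include <stdint.h>
        #include <stdio.h>

        static void decode_gdt_entry(uint64_t d)
        {
                uint32_t base  = ((d >> 16) & 0xffffff) | ((d >> 32) & 0xff000000);
                uint32_t limit = (d & 0xffff) | ((d >> 32) & 0xf0000);
                unsigned g     = (d >> 55) & 1;    /* granularity: 4KiB units */
                unsigned type  = (d >> 40) & 0xff; /* access byte, e.g. 0x9a  */

                printf("base=%#lx limit=%#lx g=%u type=%#x\n",
                       (unsigned long)base, (unsigned long)limit, g, type);
        }
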
3021 +#if CONFIG_XEN_COMPAT <= 0x030002
3023 + * __xen_guest information
3026 + .if (\value) < 0 || (\value) >= 0x10
3027 + utoa (((\value)>>4)&0x0fffffff)
3029 + .if ((\value) & 0xf) < 10
3030 + .byte '0' + ((\value) & 0xf)
3032 + .byte 'A' + ((\value) & 0xf) - 10
3036 +.section __xen_guest
3037 + .ascii "GUEST_OS=linux,GUEST_VER=2.6"
3038 + .ascii ",XEN_VER=xen-3.0"
3039 + .ascii ",VIRT_BASE=0x"
3040 + utoa __PAGE_OFFSET
3041 + .ascii ",ELF_PADDR_OFFSET=0x"
3042 + utoa __PAGE_OFFSET
3043 + .ascii ",VIRT_ENTRY=0x"
3044 + utoa (__PAGE_OFFSET + __PHYSICAL_START + VIRT_ENTRY_OFFSET)
3045 + .ascii ",HYPERCALL_PAGE=0x"
3046 + utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
3047 + .ascii ",FEATURES=writable_page_tables"
3048 + .ascii "|writable_descriptor_tables"
3049 + .ascii "|auto_translated_physmap"
3050 + .ascii "|pae_pgdir_above_4gb"
3051 + .ascii "|supervisor_mode_kernel"
3052 +#ifdef CONFIG_X86_PAE
3053 + .ascii ",PAE=yes[extended-cr3]"
3057 + .ascii ",LOADER=generic"
3059 +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */
3062 + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux")
3063 + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6")
3064 + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0")
3065 + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, __PAGE_OFFSET)
3066 +#if CONFIG_XEN_COMPAT <= 0x030002
3067 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, __PAGE_OFFSET)
3069 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, 0)
3071 + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, startup_32)
3072 + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page)
3073 + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START)
3074 + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
3075 +#ifdef CONFIG_X86_PAE
3076 + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes")
3077 + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad, _PAGE_PRESENT,_PAGE_PRESENT)
3079 + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no")
3080 + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, _PAGE_PRESENT,_PAGE_PRESENT)
3082 + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic")
3083 + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1)
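
Each ELFNOTE above emits a standard ELF note record that the Xen domain
builder parses from the image ('readelf -n' on the built vmlinux lists them).
The framing is the generic ELF one (sketch, not part of the patch):

        struct xen_elfnote_hdr {
                unsigned int namesz;   /* strlen("Xen") + 1 == 4    */
                unsigned int descsz;   /* size of the payload       */
                unsigned int type;     /* e.g. XEN_ELFNOTE_ENTRY    */
                /* "Xen\0" follows, then the 4-byte-aligned payload */
        };
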
3084 Index: head-2008-11-25/arch/x86/kernel/init_task-xen.c
3085 ===================================================================
3086 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
3087 +++ head-2008-11-25/arch/x86/kernel/init_task-xen.c 2007-06-12 13:12:48.000000000 +0200
3089 +#include <linux/mm.h>
3090 +#include <linux/module.h>
3091 +#include <linux/sched.h>
3092 +#include <linux/init.h>
3093 +#include <linux/init_task.h>
3094 +#include <linux/fs.h>
3095 +#include <linux/mqueue.h>
3097 +#include <asm/uaccess.h>
3098 +#include <asm/pgtable.h>
3099 +#include <asm/desc.h>
3101 +static struct fs_struct init_fs = INIT_FS;
3102 +static struct files_struct init_files = INIT_FILES;
3103 +static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
3104 +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
3106 +#define swapper_pg_dir ((pgd_t *)NULL)
3107 +struct mm_struct init_mm = INIT_MM(init_mm);
3108 +#undef swapper_pg_dir
3110 +EXPORT_SYMBOL(init_mm);
3113 + * Initial thread structure.
3115 + * We need to make sure that this is THREAD_SIZE aligned due to the
3116 + * way process stacks are handled. This is done by having a special
3117 + * "init_task" linker map entry..
3119 +union thread_union init_thread_union
3120 + __attribute__((__section__(".data.init_task"))) =
3121 + { INIT_THREAD_INFO(init_task) };
3124 + * Initial task structure.
3126 + * All other task structs will be allocated on slabs in fork.c
3128 +struct task_struct init_task = INIT_TASK(init_task);
3130 +EXPORT_SYMBOL(init_task);
3132 +#ifndef CONFIG_X86_NO_TSS
3134 + * per-CPU TSS segments. Threads are completely 'soft' on Linux,
3135 + * no more per-task TSS's.
3137 +DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
3140 Index: head-2008-11-25/arch/x86/kernel/io_apic_32-xen.c
3141 ===================================================================
3142 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
3143 +++ head-2008-11-25/arch/x86/kernel/io_apic_32-xen.c 2008-11-25 12:22:34.000000000 +0100
3146 + * Intel IO-APIC support for multi-Pentium hosts.
3148 + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
3150 + * Many thanks to Stig Venaas for trying out countless experimental
3151 + * patches and reporting/debugging problems patiently!
3153 + * (c) 1999, Multiple IO-APIC support, developed by
3154 + * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
3155 + * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
3156 + * further tested and cleaned up by Zach Brown <zab@redhat.com>
3157 + * and Ingo Molnar <mingo@redhat.com>
3160 + * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
3161 + * thanks to Eric Gilmore
3162 + * and Rolf G. Tews
3163 + * for testing these extensively
3164 + * Paul Diefenbaugh : Added full ACPI support
3167 +#include <linux/mm.h>
3168 +#include <linux/interrupt.h>
3169 +#include <linux/init.h>
3170 +#include <linux/delay.h>
3171 +#include <linux/sched.h>
3172 +#include <linux/smp_lock.h>
3173 +#include <linux/mc146818rtc.h>
3174 +#include <linux/compiler.h>
3175 +#include <linux/acpi.h>
3176 +#include <linux/module.h>
3177 +#include <linux/sysdev.h>
3179 +#include <asm/io.h>
3180 +#include <asm/smp.h>
3181 +#include <asm/desc.h>
3182 +#include <asm/timer.h>
3183 +#include <asm/i8259.h>
3184 +#include <asm/nmi.h>
3186 +#include <mach_apic.h>
3188 +#include "io_ports.h"
3192 +#include <xen/interface/xen.h>
3193 +#include <xen/interface/physdev.h>
3194 +#include <xen/evtchn.h>
3197 +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
3198 +#define disable_8259A_irq(_irq) ((void)0)
3199 +#define i8259A_irq_pending(_irq) (0)
3201 +unsigned long io_apic_irqs;
3203 +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
3205 + struct physdev_apic apic_op;
3208 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
3209 + apic_op.reg = reg;
3210 + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
3213 + return apic_op.value;
3216 +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
3218 + struct physdev_apic apic_op;
3220 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
3221 + apic_op.reg = reg;
3222 + apic_op.value = value;
3223 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
3226 +#define io_apic_read(a,r) xen_io_apic_read(a,r)
3227 +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
3229 +#endif /* CONFIG_XEN */
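
For contrast, the native access these hypercall wrappers replace goes through
the IO-APIC's two-register window (IOREGSEL at offset 0, IOWIN at offset
0x10); a sketch assuming an ioremap()ed base:

        static inline unsigned int native_io_apic_read(void __iomem *base,
                                                       unsigned int reg)
        {
                writel(reg, base);          /* select register via IOREGSEL */
                return readl(base + 0x10);  /* read it back through IOWIN   */
        }
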
3231 +int (*ioapic_renumber_irq)(int ioapic, int irq);
3232 +atomic_t irq_mis_count;
3234 +/* Where, if anywhere, is the i8259 connected in external int mode */
3235 +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
3237 +static DEFINE_SPINLOCK(ioapic_lock);
3238 +static DEFINE_SPINLOCK(vector_lock);
3240 +int timer_over_8254 __initdata = 1;
3243 + * Is the SiS APIC rmw bug present?
3244 + * -1 = don't know, 0 = no, 1 = yes
3246 +int sis_apic_bug = -1;
3249 + * # of IRQ routing registers
3251 +int nr_ioapic_registers[MAX_IO_APICS];
3253 +int disable_timer_pin_1 __initdata;
3256 + * Rough estimate of how many shared IRQs there are; can
3257 + * be changed anytime.
3259 +#define MAX_PLUS_SHARED_IRQS NR_IRQS
3260 +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
3263 + * This is performance-critical; we want to do it O(1)
3265 + * the indexing order of this array favors 1:1 mappings
3266 + * between pins and IRQs.
3269 +static struct irq_pin_list {
3270 + int apic, pin, next;
3271 +} irq_2_pin[PIN_MAP_SIZE];
3273 +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
3274 +#ifdef CONFIG_PCI_MSI
3275 +#define vector_to_irq(vector) \
3276 + (platform_legacy_irq(vector) ? vector : vector_irq[vector])
3278 +#define vector_to_irq(vector) (vector)
3282 + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
3283 + * shared ISA-space IRQs, so we have to support them. We are super
3284 + * fast in the common case, and fast for shared ISA-space IRQs.
3286 +static void add_pin_to_irq(unsigned int irq, int apic, int pin)
3288 + static int first_free_entry = NR_IRQS;
3289 + struct irq_pin_list *entry = irq_2_pin + irq;
3291 + while (entry->next)
3292 + entry = irq_2_pin + entry->next;
3294 + if (entry->pin != -1) {
3295 + entry->next = first_free_entry;
3296 + entry = irq_2_pin + entry->next;
3297 + if (++first_free_entry >= PIN_MAP_SIZE)
3298 + panic("io_apic.c: whoops");
3300 + entry->apic = apic;
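
The users of irq_2_pin above and below share one traversal pattern: slot irq
is the head of a chain whose overflow entries live at and above NR_IRQS. As a
sketch:

        static void for_each_irq_pin(unsigned int irq,
                                     void (*fn)(int apic, int pin))
        {
                struct irq_pin_list *entry = irq_2_pin + irq;

                for (;;) {
                        if (entry->pin == -1)
                                break;          /* empty head slot */
                        fn(entry->apic, entry->pin);
                        if (!entry->next)
                                break;
                        entry = irq_2_pin + entry->next;
                }
        }
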
3305 +#define clear_IO_APIC() ((void)0)
3308 + * Reroute an IRQ to a different pin.
3310 +static void __init replace_pin_at_irq(unsigned int irq,
3311 + int oldapic, int oldpin,
3312 + int newapic, int newpin)
3314 + struct irq_pin_list *entry = irq_2_pin + irq;
3317 + if (entry->apic == oldapic && entry->pin == oldpin) {
3318 + entry->apic = newapic;
3319 + entry->pin = newpin;
3323 + entry = irq_2_pin + entry->next;
3327 +static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
3329 + struct irq_pin_list *entry = irq_2_pin + irq;
3330 + unsigned int pin, reg;
3336 + reg = io_apic_read(entry->apic, 0x10 + pin*2);
3339 + io_apic_modify(entry->apic, 0x10 + pin*2, reg);
3342 + entry = irq_2_pin + entry->next;
3347 +static void __mask_IO_APIC_irq (unsigned int irq)
3349 + __modify_IO_APIC_irq(irq, 0x00010000, 0);
3353 +static void __unmask_IO_APIC_irq (unsigned int irq)
3355 + __modify_IO_APIC_irq(irq, 0, 0x00010000);
3358 +/* mask = 1, trigger = 0 */
3359 +static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
3361 + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
3364 +/* mask = 0, trigger = 1 */
3365 +static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
3367 + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
3370 +static void mask_IO_APIC_irq (unsigned int irq)
3372 + unsigned long flags;
3374 + spin_lock_irqsave(&ioapic_lock, flags);
3375 + __mask_IO_APIC_irq(irq);
3376 + spin_unlock_irqrestore(&ioapic_lock, flags);
3379 +static void unmask_IO_APIC_irq (unsigned int irq)
3381 + unsigned long flags;
3383 + spin_lock_irqsave(&ioapic_lock, flags);
3384 + __unmask_IO_APIC_irq(irq);
3385 + spin_unlock_irqrestore(&ioapic_lock, flags);
3388 +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
3390 + struct IO_APIC_route_entry entry;
3391 + unsigned long flags;
3393 + /* Check delivery_mode to be sure we're not clearing an SMI pin */
3394 + spin_lock_irqsave(&ioapic_lock, flags);
3395 + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
3396 + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
3397 + spin_unlock_irqrestore(&ioapic_lock, flags);
3398 + if (entry.delivery_mode == dest_SMI)
3402 + * Disable it in the IO-APIC irq-routing table:
3404 + memset(&entry, 0, sizeof(entry));
3406 + spin_lock_irqsave(&ioapic_lock, flags);
3407 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
3408 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
3409 + spin_unlock_irqrestore(&ioapic_lock, flags);
3412 +static void clear_IO_APIC (void)
3416 + for (apic = 0; apic < nr_ioapics; apic++)
3417 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
3418 + clear_IO_APIC_pin(apic, pin);
3422 +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
3424 + unsigned long flags;
3426 + struct irq_pin_list *entry = irq_2_pin + irq;
3427 + unsigned int apicid_value;
3430 + cpus_and(tmp, cpumask, cpu_online_map);
3431 + if (cpus_empty(tmp))
3432 + tmp = TARGET_CPUS;
3434 + cpus_and(cpumask, tmp, CPU_MASK_ALL);
3436 + apicid_value = cpu_mask_to_apicid(cpumask);
3437 + /* Prepare to do the io_apic_write */
3438 + apicid_value = apicid_value << 24;
3439 + spin_lock_irqsave(&ioapic_lock, flags);
3444 + io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
3447 + entry = irq_2_pin + entry->next;
3449 + set_irq_info(irq, cpumask);
3450 + spin_unlock_irqrestore(&ioapic_lock, flags);
3453 +#if defined(CONFIG_IRQBALANCE)
3454 +# include <asm/processor.h> /* kernel_thread() */
3455 +# include <linux/kernel_stat.h> /* kstat */
3456 +# include <linux/slab.h> /* kmalloc() */
3457 +# include <linux/timer.h> /* time_after() */
3459 +#ifdef CONFIG_BALANCED_IRQ_DEBUG
3460 +# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
3461 +# define Dprintk(x...) do { TDprintk(x); } while (0)
3463 +# define TDprintk(x...)
3464 +# define Dprintk(x...)
3467 +#define IRQBALANCE_CHECK_ARCH -999
3468 +#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
3469 +#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
3470 +#define BALANCED_IRQ_MORE_DELTA (HZ/10)
3471 +#define BALANCED_IRQ_LESS_DELTA (HZ)
3473 +static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
3474 +static int physical_balance __read_mostly;
3475 +static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
3477 +static struct irq_cpu_info {
3478 + unsigned long * last_irq;
3479 + unsigned long * irq_delta;
3480 + unsigned long irq;
3481 +} irq_cpu_data[NR_CPUS];
3483 +#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
3484 +#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
3485 +#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
3487 +#define IDLE_ENOUGH(cpu,now) \
3488 + (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
3490 +#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
3492 +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
3494 +static cpumask_t balance_irq_affinity[NR_IRQS] = {
3495 + [0 ... NR_IRQS-1] = CPU_MASK_ALL
3498 +void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
3500 + balance_irq_affinity[irq] = mask;
3503 +static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
3504 + unsigned long now, int direction)
3506 + int search_idle = 1;
3507 + int cpu = curr_cpu;
3512 + if (unlikely(cpu == curr_cpu))
3515 + if (direction == 1) {
3517 + if (cpu >= NR_CPUS)
3524 + } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
3525 + (search_idle && !IDLE_ENOUGH(cpu,now)));
3530 +static inline void balance_irq(int cpu, int irq)
3532 + unsigned long now = jiffies;
3533 + cpumask_t allowed_mask;
3534 + unsigned int new_cpu;
3536 + if (irqbalance_disabled)
3539 + cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
3540 + new_cpu = move(cpu, allowed_mask, now, 1);
3541 + if (cpu != new_cpu) {
3542 + set_pending_irq(irq, cpumask_of_cpu(new_cpu));
3546 +static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
3549 + Dprintk("Rotating IRQs among CPUs.\n");
3550 + for_each_online_cpu(i) {
3551 + for (j = 0; j < NR_IRQS; j++) {
3552 + if (!irq_desc[j].action)
3554 + /* Is it a significant load ? */
3555 + if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
3556 + useful_load_threshold)
3558 + balance_irq(i, j);
3561 + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
3562 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
3566 +static void do_irq_balance(void)
3569 + unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
3570 + unsigned long move_this_load = 0;
3571 + int max_loaded = 0, min_loaded = 0;
3573 + unsigned long useful_load_threshold = balanced_irq_interval + 10;
3575 + int tmp_loaded, first_attempt = 1;
3576 + unsigned long tmp_cpu_irq;
3577 + unsigned long imbalance = 0;
3578 + cpumask_t allowed_mask, target_cpu_mask, tmp;
3580 + for_each_possible_cpu(i) {
3581 + int package_index;
3583 + if (!cpu_online(i))
3585 + package_index = CPU_TO_PACKAGEINDEX(i);
3586 + for (j = 0; j < NR_IRQS; j++) {
3587 + unsigned long value_now, delta;
3588 + /* Is this an active IRQ? */
3589 + if (!irq_desc[j].action)
3591 + if ( package_index == i )
3592 + IRQ_DELTA(package_index,j) = 0;
3593 + /* Determine the total count per processor per IRQ */
3594 + value_now = (unsigned long) kstat_cpu(i).irqs[j];
3596 + /* Determine the activity per processor per IRQ */
3597 + delta = value_now - LAST_CPU_IRQ(i,j);
3599 + /* Update last_cpu_irq[][] for the next time */
3600 + LAST_CPU_IRQ(i,j) = value_now;
3602 + /* Ignore IRQs whose rate is less than the clock */
3603 + if (delta < useful_load_threshold)
3605 + /* update the load for the processor or package total */
3606 + IRQ_DELTA(package_index,j) += delta;
3608 + /* Keep track of the higher numbered sibling as well */
3609 + if (i != package_index)
3610 + CPU_IRQ(i) += delta;
3612 + * We have sibling A and sibling B in the package
3614 + * cpu_irq[A] = load for cpu A + load for cpu B
3615 + * cpu_irq[B] = load for cpu B
3617 + CPU_IRQ(package_index) += delta;
3620 + /* Find the least loaded processor package */
3621 + for_each_online_cpu(i) {
3622 + if (i != CPU_TO_PACKAGEINDEX(i))
3624 + if (min_cpu_irq > CPU_IRQ(i)) {
3625 + min_cpu_irq = CPU_IRQ(i);
3629 + max_cpu_irq = ULONG_MAX;
3632 + /* Look for heaviest loaded processor.
3633 + * We may come back to get the next heaviest loaded processor.
3634 + * Skip processors with trivial loads.
3638 + for_each_online_cpu(i) {
3639 + if (i != CPU_TO_PACKAGEINDEX(i))
3641 + if (max_cpu_irq <= CPU_IRQ(i))
3643 + if (tmp_cpu_irq < CPU_IRQ(i)) {
3644 + tmp_cpu_irq = CPU_IRQ(i);
3649 + if (tmp_loaded == -1) {
3650 + /* In the case of a small number of heavy interrupt sources, we can
3651 + * end up loading some of the cpus too much. We use Ingo's original
3652 + * approach to rotate them around.
3654 + if (!first_attempt && imbalance >= useful_load_threshold) {
3655 + rotate_irqs_among_cpus(useful_load_threshold);
3658 + goto not_worth_the_effort;
3661 + first_attempt = 0; /* heaviest search */
3662 + max_cpu_irq = tmp_cpu_irq; /* load */
3663 + max_loaded = tmp_loaded; /* processor */
3664 + imbalance = (max_cpu_irq - min_cpu_irq) / 2;
3666 + Dprintk("max_loaded cpu = %d\n", max_loaded);
3667 + Dprintk("min_loaded cpu = %d\n", min_loaded);
3668 + Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq);
3669 + Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq);
3670 + Dprintk("load imbalance = %lu\n", imbalance);
3672 + /* if imbalance is less than approx 10% of max load, then
3673 + * we are seeing diminishing returns - quit
3675 + if (imbalance < (max_cpu_irq >> 3)) {
3676 + Dprintk("Imbalance too trivial\n");
3677 + goto not_worth_the_effort;
3681 + /* if we select an IRQ to move that can't go where we want, then
3682 + * see if there is another one to try.
3684 + move_this_load = 0;
3685 + selected_irq = -1;
3686 + for (j = 0; j < NR_IRQS; j++) {
3687 + /* Is this an active IRQ? */
3688 + if (!irq_desc[j].action)
3690 + if (imbalance <= IRQ_DELTA(max_loaded,j))
3692 + /* Try to find the IRQ that is closest to the imbalance
3693 + * without going over.
3695 + if (move_this_load < IRQ_DELTA(max_loaded,j)) {
3696 + move_this_load = IRQ_DELTA(max_loaded,j);
3700 + if (selected_irq == -1) {
3701 + goto tryanothercpu;
3704 + imbalance = move_this_load;
3706 + /* For the physical_balance case, we accumulated both load
3707 + * values in one of the siblings' cpu_irq[],
3708 + * to use the same code for physical and logical processors
3709 + * as much as possible.
3711 + * NOTE: the cpu_irq[] array holds the sum of the load for
3712 + * sibling A and sibling B in the slot for the lowest numbered
3713 + * sibling (A), _AND_ the load for sibling B in the slot for
3714 + * the higher numbered sibling.
3716 + * We seek the least loaded sibling by making the comparison
3719 + load = CPU_IRQ(min_loaded) >> 1;
3720 + for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
3721 + if (load > CPU_IRQ(j)) {
3722 + /* This won't change cpu_sibling_map[min_loaded] */
3723 + load = CPU_IRQ(j);
3728 + cpus_and(allowed_mask,
3730 + balance_irq_affinity[selected_irq]);
3731 + target_cpu_mask = cpumask_of_cpu(min_loaded);
3732 + cpus_and(tmp, target_cpu_mask, allowed_mask);
3734 + if (!cpus_empty(tmp)) {
3736 + Dprintk("irq = %d moved to cpu = %d\n",
3737 + selected_irq, min_loaded);
3738 + /* mark for change destination */
3739 + set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
3741 + /* Since we made a change, come back sooner to
3742 + * check for more variation.
3744 + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
3745 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
3748 + goto tryanotherirq;
3750 +not_worth_the_effort:
3752 + * if we did not find an IRQ to move, then adjust the time interval
3755 + balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
3756 + balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
3757 + Dprintk("IRQ worth rotating not found\n");
3761 +static int balanced_irq(void *unused)
3764 + unsigned long prev_balance_time = jiffies;
3765 + long time_remaining = balanced_irq_interval;
3767 + daemonize("kirqd");
3769 + /* push everything to CPU 0 to give us a starting point. */
3770 + for (i = 0 ; i < NR_IRQS ; i++) {
3771 + irq_desc[i].pending_mask = cpumask_of_cpu(0);
3772 + set_pending_irq(i, cpumask_of_cpu(0));
3776 + time_remaining = schedule_timeout_interruptible(time_remaining);
3778 + if (time_after(jiffies,
3779 + prev_balance_time+balanced_irq_interval)) {
3780 + preempt_disable();
3782 + prev_balance_time = jiffies;
3783 + time_remaining = balanced_irq_interval;
3790 +static int __init balanced_irq_init(void)
3793 + struct cpuinfo_x86 *c;
3796 + cpus_shift_right(tmp, cpu_online_map, 2);
3797 + c = &boot_cpu_data;
3798 + /* When not overwritten by the command line ask subarchitecture. */
3799 + if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
3800 + irqbalance_disabled = NO_BALANCE_IRQ;
3801 + if (irqbalance_disabled)
3804 + /* disable irqbalance completely if there is only one processor online */
3805 + if (num_online_cpus() < 2) {
3806 + irqbalance_disabled = 1;
3810 + * Enable physical balance only if more than 1 physical processor
3813 + if (smp_num_siblings > 1 && !cpus_empty(tmp))
3814 + physical_balance = 1;
3816 + for_each_online_cpu(i) {
3817 + irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
3818 + irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
3819 + if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
3820 + printk(KERN_ERR "balanced_irq_init: out of memory");
3823 + memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
3824 + memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
3827 + printk(KERN_INFO "Starting balanced_irq\n");
3828 + if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0)
3831 + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
3833 + for_each_possible_cpu(i) {
3834 + kfree(irq_cpu_data[i].irq_delta);
3835 + irq_cpu_data[i].irq_delta = NULL;
3836 + kfree(irq_cpu_data[i].last_irq);
3837 + irq_cpu_data[i].last_irq = NULL;
3842 +int __init irqbalance_disable(char *str)
3844 + irqbalance_disabled = 1;
3848 +__setup("noirqbalance", irqbalance_disable);
3850 +late_initcall(balanced_irq_init);
3851 +#endif /* CONFIG_IRQBALANCE */
3852 +#endif /* CONFIG_SMP */
3856 +void fastcall send_IPI_self(int vector)
3864 + apic_wait_icr_idle();
3865 + cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
3867 + * Send the IPI. The write to APIC_ICR fires this off.
3869 + apic_write_around(APIC_ICR, cfg);
3872 +#endif /* !CONFIG_SMP */
3876 + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
3877 + * specific CPU-side IRQs.
3880 +#define MAX_PIRQS 8
3881 +static int pirq_entries [MAX_PIRQS];
3882 +static int pirqs_enabled;
3883 +int skip_ioapic_setup;
3885 +static int __init ioapic_setup(char *str)
3887 + skip_ioapic_setup = 1;
3891 +__setup("noapic", ioapic_setup);
3893 +static int __init ioapic_pirq_setup(char *str)
3896 + int ints[MAX_PIRQS+1];
3898 + get_options(str, ARRAY_SIZE(ints), ints);
3900 + for (i = 0; i < MAX_PIRQS; i++)
3901 + pirq_entries[i] = -1;
3903 + pirqs_enabled = 1;
3904 + apic_printk(APIC_VERBOSE, KERN_INFO
3905 + "PIRQ redirection, working around broken MP-BIOS.\n");
3907 + if (ints[0] < MAX_PIRQS)
3910 + for (i = 0; i < max; i++) {
3911 + apic_printk(APIC_VERBOSE, KERN_DEBUG
3912 + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
3914 + * PIRQs are mapped upside down, usually.
3916 + pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
3921 +__setup("pirq=", ioapic_pirq_setup);
3924 + * Find the IRQ entry number of a certain pin.
3926 +static int find_irq_entry(int apic, int pin, int type)
3930 + for (i = 0; i < mp_irq_entries; i++)
3931 + if (mp_irqs[i].mpc_irqtype == type &&
3932 + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
3933 + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
3934 + mp_irqs[i].mpc_dstirq == pin)
3941 + * Find the pin to which IRQ[irq] (ISA) is connected
3943 +static int __init find_isa_irq_pin(int irq, int type)
3947 + for (i = 0; i < mp_irq_entries; i++) {
3948 + int lbus = mp_irqs[i].mpc_srcbus;
3950 + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
3951 + mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
3952 + mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
3953 + mp_bus_id_to_type[lbus] == MP_BUS_NEC98
3955 + (mp_irqs[i].mpc_irqtype == type) &&
3956 + (mp_irqs[i].mpc_srcbusirq == irq))
3958 + return mp_irqs[i].mpc_dstirq;
3963 +static int __init find_isa_irq_apic(int irq, int type)
3967 + for (i = 0; i < mp_irq_entries; i++) {
3968 + int lbus = mp_irqs[i].mpc_srcbus;
3970 + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
3971 + mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
3972 + mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
3973 + mp_bus_id_to_type[lbus] == MP_BUS_NEC98
3975 + (mp_irqs[i].mpc_irqtype == type) &&
3976 + (mp_irqs[i].mpc_srcbusirq == irq))
3979 + if (i < mp_irq_entries) {
3981 + for(apic = 0; apic < nr_ioapics; apic++) {
3982 + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
3991 + * Find a specific PCI IRQ entry.
3992 + * Not an __init, possibly needed by modules
3994 +static int pin_2_irq(int idx, int apic, int pin);
3996 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
3998 + int apic, i, best_guess = -1;
4000 + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
4001 + "slot:%d, pin:%d.\n", bus, slot, pin);
4002 + if (mp_bus_id_to_pci_bus[bus] == -1) {
4003 + printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
4006 + for (i = 0; i < mp_irq_entries; i++) {
4007 + int lbus = mp_irqs[i].mpc_srcbus;
4009 + for (apic = 0; apic < nr_ioapics; apic++)
4010 + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
4011 + mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
4014 + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
4015 + !mp_irqs[i].mpc_irqtype &&
4017 + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
4018 + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
4020 + if (!(apic || IO_APIC_IRQ(irq)))
4023 + if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
4026 + * Use the first all-but-pin matching entry as a
4027 + * best-guess fuzzy result for broken mptables.
4029 + if (best_guess < 0)
4033 + return best_guess;
4035 +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
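
The srcbusirq tests above rely on the MP-table encoding of a PCI interrupt
source, which packs the device slot and the INTA-INTD pin into one byte;
spelled out as macros (illustrative, not part of the patch):

        #define MP_PCI_SRCBUSIRQ_SLOT(x) (((x) >> 2) & 0x1f) /* device number  */
        #define MP_PCI_SRCBUSIRQ_PIN(x)  ((x) & 0x3)         /* 0=INTA..3=INTD */
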
4038 + * This function currently is only a helper for the i386 smp boot process where
4039 + * we need to reprogram the ioredtbls to cater for the cpus which have come online
4040 + * so the mask in all cases should simply be TARGET_CPUS
4044 +void __init setup_ioapic_dest(void)
4046 + int pin, ioapic, irq, irq_entry;
4048 + if (skip_ioapic_setup == 1)
4051 + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
4052 + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4053 + irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4054 + if (irq_entry == -1)
4056 + irq = pin_2_irq(irq_entry, ioapic, pin);
4057 + set_ioapic_affinity_irq(irq, TARGET_CPUS);
4062 +#endif /* !CONFIG_XEN */
4066 + * EISA Edge/Level control register, ELCR
4068 +static int EISA_ELCR(unsigned int irq)
4071 + unsigned int port = 0x4d0 + (irq >> 3);
4072 + return (inb(port) >> (irq & 7)) & 1;
4074 + apic_printk(APIC_VERBOSE, KERN_INFO
4075 + "Broken MPtable reports ISA irq %d\n", irq);
4079 +/* EISA interrupts are always polarity zero and can be edge or level
4080 + * trigger depending on the ELCR value. If an interrupt is listed as
4081 + * EISA conforming in the MP table, that means its trigger type must
4082 + * be read in from the ELCR */
4084 +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
4085 +#define default_EISA_polarity(idx) (0)
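
Worked example for EISA_ELCR() above: IRQ 9 reads I/O port 0x4d0 + (9 >> 3)
= 0x4d1 and tests bit 9 & 7 = 1; a set bit means level triggered, a clear
bit edge triggered.
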
4087 +/* ISA interrupts are always polarity zero edge triggered,
4088 + * when listed as conforming in the MP table. */
4090 +#define default_ISA_trigger(idx) (0)
4091 +#define default_ISA_polarity(idx) (0)
4093 +/* PCI interrupts are always polarity one level triggered,
4094 + * when listed as conforming in the MP table. */
4096 +#define default_PCI_trigger(idx) (1)
4097 +#define default_PCI_polarity(idx) (1)
4099 +/* MCA interrupts are always polarity zero level triggered,
4100 + * when listed as conforming in the MP table. */
4102 +#define default_MCA_trigger(idx) (1)
4103 +#define default_MCA_polarity(idx) (0)
4105 +/* NEC98 interrupts are always polarity zero edge triggered,
4106 + * when listed as conforming in the MP table. */
4108 +#define default_NEC98_trigger(idx) (0)
4109 +#define default_NEC98_polarity(idx) (0)
4111 +static int __init MPBIOS_polarity(int idx)
4113 + int bus = mp_irqs[idx].mpc_srcbus;
4117 + * Determine IRQ line polarity (high active or low active):
4119 + switch (mp_irqs[idx].mpc_irqflag & 3)
4121 + case 0: /* conforms, ie. bus-type dependent polarity */
4123 + switch (mp_bus_id_to_type[bus])
4125 + case MP_BUS_ISA: /* ISA pin */
4127 + polarity = default_ISA_polarity(idx);
4130 + case MP_BUS_EISA: /* EISA pin */
4132 + polarity = default_EISA_polarity(idx);
4135 + case MP_BUS_PCI: /* PCI pin */
4137 + polarity = default_PCI_polarity(idx);
4140 + case MP_BUS_MCA: /* MCA pin */
4142 + polarity = default_MCA_polarity(idx);
4145 + case MP_BUS_NEC98: /* NEC 98 pin */
4147 + polarity = default_NEC98_polarity(idx);
4152 + printk(KERN_WARNING "broken BIOS!!\n");
4159 + case 1: /* high active */
4164 + case 2: /* reserved */
4166 + printk(KERN_WARNING "broken BIOS!!\n");
4170 + case 3: /* low active */
4175 + default: /* invalid */
4177 + printk(KERN_WARNING "broken BIOS!!\n");
4185 +static int MPBIOS_trigger(int idx)
4187 + int bus = mp_irqs[idx].mpc_srcbus;
4191 + * Determine IRQ trigger mode (edge or level sensitive):
4193 + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
4195 + case 0: /* conforms, ie. bus-type dependent */
4197 + switch (mp_bus_id_to_type[bus])
4199 + case MP_BUS_ISA: /* ISA pin */
4201 + trigger = default_ISA_trigger(idx);
4204 + case MP_BUS_EISA: /* EISA pin */
4206 + trigger = default_EISA_trigger(idx);
4209 + case MP_BUS_PCI: /* PCI pin */
4211 + trigger = default_PCI_trigger(idx);
4214 + case MP_BUS_MCA: /* MCA pin */
4216 + trigger = default_MCA_trigger(idx);
4219 + case MP_BUS_NEC98: /* NEC 98 pin */
4221 + trigger = default_NEC98_trigger(idx);
4226 + printk(KERN_WARNING "broken BIOS!!\n");
4233 + case 1: /* edge */
4238 + case 2: /* reserved */
4240 + printk(KERN_WARNING "broken BIOS!!\n");
4244 + case 3: /* level */
4249 + default: /* invalid */
4251 + printk(KERN_WARNING "broken BIOS!!\n");
4259 +static inline int irq_polarity(int idx)
4261 + return MPBIOS_polarity(idx);
4264 +static inline int irq_trigger(int idx)
4266 + return MPBIOS_trigger(idx);
4269 +static int pin_2_irq(int idx, int apic, int pin)
4272 + int bus = mp_irqs[idx].mpc_srcbus;
4275 + * Debugging check, we are in big trouble if this message pops up!
4277 + if (mp_irqs[idx].mpc_dstirq != pin)
4278 + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
4280 + switch (mp_bus_id_to_type[bus])
4282 + case MP_BUS_ISA: /* ISA pin */
4285 + case MP_BUS_NEC98:
4287 + irq = mp_irqs[idx].mpc_srcbusirq;
4290 + case MP_BUS_PCI: /* PCI pin */
4293 + * PCI IRQs are mapped in order
4297 + irq += nr_ioapic_registers[i++];
4301 + * For MPS mode, so far only needed by ES7000 platform
4303 + if (ioapic_renumber_irq)
4304 + irq = ioapic_renumber_irq(apic, irq);
4310 + printk(KERN_ERR "unknown bus type %d.\n",bus);
4317 + * PCI IRQ command line redirection. Yes, limits are hardcoded.
4319 + if ((pin >= 16) && (pin <= 23)) {
4320 + if (pirq_entries[pin-16] != -1) {
4321 + if (!pirq_entries[pin-16]) {
4322 + apic_printk(APIC_VERBOSE, KERN_DEBUG
4323 + "disabling PIRQ%d\n", pin-16);
4325 + irq = pirq_entries[pin-16];
4326 + apic_printk(APIC_VERBOSE, KERN_DEBUG
4327 + "using PIRQ%d -> IRQ %d\n",
4335 +static inline int IO_APIC_irq_trigger(int irq)
4337 + int apic, idx, pin;
4339 + for (apic = 0; apic < nr_ioapics; apic++) {
4340 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
4341 + idx = find_irq_entry(apic,pin,mp_INT);
4342 + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
4343 + return irq_trigger(idx);
4347 + * nonexistent IRQs are edge default
4352 +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
4353 +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
4355 +int assign_irq_vector(int irq)
4357 + unsigned long flags;
4359 + struct physdev_irq irq_op;
4361 + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
4363 + if (irq < PIRQ_BASE || irq - PIRQ_BASE > NR_PIRQS)
4366 + spin_lock_irqsave(&vector_lock, flags);
4368 + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
4369 + spin_unlock_irqrestore(&vector_lock, flags);
4370 + return IO_APIC_VECTOR(irq);
4374 + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
4375 + spin_unlock_irqrestore(&vector_lock, flags);
4379 + vector = irq_op.vector;
4380 + vector_irq[vector] = irq;
4381 + if (irq != AUTO_ASSIGN)
4382 + IO_APIC_VECTOR(irq) = vector;
4384 + spin_unlock_irqrestore(&vector_lock, flags);
4390 +static struct hw_interrupt_type ioapic_level_type;
4391 +static struct hw_interrupt_type ioapic_edge_type;
4393 +#define IOAPIC_AUTO -1
4394 +#define IOAPIC_EDGE 0
4395 +#define IOAPIC_LEVEL 1
4397 +static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
4401 + idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
4403 + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
4404 + trigger == IOAPIC_LEVEL)
4405 + irq_desc[idx].chip = &ioapic_level_type;
4407 + irq_desc[idx].chip = &ioapic_edge_type;
4408 + set_intr_gate(vector, interrupt[idx]);
4411 +#define ioapic_register_intr(irq, vector, trigger) evtchn_register_pirq(irq)
4414 +static void __init setup_IO_APIC_irqs(void)
4416 + struct IO_APIC_route_entry entry;
4417 + int apic, pin, idx, irq, first_notcon = 1, vector;
4418 + unsigned long flags;
4420 + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
4422 + for (apic = 0; apic < nr_ioapics; apic++) {
4423 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
4426 + * add it to the IO-APIC irq-routing table:
4428 + memset(&entry,0,sizeof(entry));
4430 + entry.delivery_mode = INT_DELIVERY_MODE;
4431 + entry.dest_mode = INT_DEST_MODE;
4432 + entry.mask = 0; /* enable IRQ */
4433 + entry.dest.logical.logical_dest =
4434 + cpu_mask_to_apicid(TARGET_CPUS);
4436 + idx = find_irq_entry(apic,pin,mp_INT);
4438 + if (first_notcon) {
4439 + apic_printk(APIC_VERBOSE, KERN_DEBUG
4440 + " IO-APIC (apicid-pin) %d-%d",
4441 + mp_ioapics[apic].mpc_apicid,
4445 + apic_printk(APIC_VERBOSE, ", %d-%d",
4446 + mp_ioapics[apic].mpc_apicid, pin);
4450 + entry.trigger = irq_trigger(idx);
4451 + entry.polarity = irq_polarity(idx);
4453 + if (irq_trigger(idx)) {
4454 + entry.trigger = 1;
4458 + irq = pin_2_irq(idx, apic, pin);
4460 + * skip adding the timer int on secondary nodes, which causes
4461 + * a small but painful rift in the time-space continuum
4463 + if (multi_timer_check(apic, irq))
4466 + add_pin_to_irq(irq, apic, pin);
4468 + if (/*!apic &&*/ !IO_APIC_IRQ(irq))
4471 + if (IO_APIC_IRQ(irq)) {
4472 + vector = assign_irq_vector(irq);
4473 + entry.vector = vector;
4474 + ioapic_register_intr(irq, vector, IOAPIC_AUTO);
4476 + if (!apic && (irq < 16))
4477 + disable_8259A_irq(irq);
4479 + spin_lock_irqsave(&ioapic_lock, flags);
4480 + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
4481 + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
4482 + set_native_irq_info(irq, TARGET_CPUS);
4483 + spin_unlock_irqrestore(&ioapic_lock, flags);
4487 + if (!first_notcon)
4488 + apic_printk(APIC_VERBOSE, " not connected.\n");
4492 + * Set up the 8259A-master output pin:
4495 +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
4497 + struct IO_APIC_route_entry entry;
4498 + unsigned long flags;
4500 + memset(&entry,0,sizeof(entry));
4502 + disable_8259A_irq(0);
4505 + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
4508 + * We use logical delivery to get the timer IRQ
4509 + * to the first CPU.
4511 + entry.dest_mode = INT_DEST_MODE;
4512 + entry.mask = 0; /* unmask IRQ now */
4513 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
4514 + entry.delivery_mode = INT_DELIVERY_MODE;
4515 + entry.polarity = 0;
4516 + entry.trigger = 0;
4517 + entry.vector = vector;
4520 + * The timer IRQ doesn't have to know that behind the
4521 + * scenes we have an 8259A-master in AEOI mode ...
4523 + irq_desc[0].chip = &ioapic_edge_type;
4526 + * Add it to the IO-APIC irq-routing table:
4528 + spin_lock_irqsave(&ioapic_lock, flags);
4529 + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
4530 + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
4531 + spin_unlock_irqrestore(&ioapic_lock, flags);
4533 + enable_8259A_irq(0);
4536 +static inline void UNEXPECTED_IO_APIC(void)
4540 +void __init print_IO_APIC(void)
4543 + union IO_APIC_reg_00 reg_00;
4544 + union IO_APIC_reg_01 reg_01;
4545 + union IO_APIC_reg_02 reg_02;
4546 + union IO_APIC_reg_03 reg_03;
4547 + unsigned long flags;
4549 + if (apic_verbosity == APIC_QUIET)
4552 + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
4553 + for (i = 0; i < nr_ioapics; i++)
4554 + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
4555 + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
4558 + * We are a bit conservative about what we expect. We have to
4559 + * know about every hardware change ASAP.
4561 + printk(KERN_INFO "testing the IO APIC.......................\n");
4563 + for (apic = 0; apic < nr_ioapics; apic++) {
4565 + spin_lock_irqsave(&ioapic_lock, flags);
4566 + reg_00.raw = io_apic_read(apic, 0);
4567 + reg_01.raw = io_apic_read(apic, 1);
4568 + if (reg_01.bits.version >= 0x10)
4569 + reg_02.raw = io_apic_read(apic, 2);
4570 + if (reg_01.bits.version >= 0x20)
4571 + reg_03.raw = io_apic_read(apic, 3);
4572 + spin_unlock_irqrestore(&ioapic_lock, flags);
4574 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
4575 + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
4576 + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
4577 + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
4578 + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
4579 + if (reg_00.bits.ID >= get_physical_broadcast())
4580 + UNEXPECTED_IO_APIC();
4581 + if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
4582 + UNEXPECTED_IO_APIC();
4584 + printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
4585 + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
4586 + if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
4587 + (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
4588 + (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
4589 + (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
4590 + (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
4591 + (reg_01.bits.entries != 0x2E) &&
4592 + (reg_01.bits.entries != 0x3F)
4594 + UNEXPECTED_IO_APIC();
4596 + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
4597 + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
4598 + if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
4599 + (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
4600 + (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
4601 + (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
4602 + (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
4604 + UNEXPECTED_IO_APIC();
4605 + if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
4606 + UNEXPECTED_IO_APIC();
4609 + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
4610 + * but the value of reg_02 is read as the previous read register
4611 + * value, so ignore it if reg_02 == reg_01.
4613 + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
4614 + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
4615 + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
4616 + if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
4617 + UNEXPECTED_IO_APIC();
4621 + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
4622 + * or reg_03, but the value of reg_0[23] is read as the previous read
4623 + * register value, so ignore it if reg_03 == reg_0[12].
4625 + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
4626 + reg_03.raw != reg_01.raw) {
4627 + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
4628 + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
4629 + if (reg_03.bits.__reserved_1)
4630 + UNEXPECTED_IO_APIC();
4633 + printk(KERN_DEBUG ".... IRQ redirection table:\n");
4635 + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
4636 + " Stat Dest Deli Vect: \n");
4638 + for (i = 0; i <= reg_01.bits.entries; i++) {
4639 + struct IO_APIC_route_entry entry;
4641 + spin_lock_irqsave(&ioapic_lock, flags);
4642 + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
4643 + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
4644 + spin_unlock_irqrestore(&ioapic_lock, flags);
4646 + printk(KERN_DEBUG " %02x %03X %02X ",
4648 + entry.dest.logical.logical_dest,
4649 + entry.dest.physical.physical_dest
4652 + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
4657 + entry.delivery_status,
4659 + entry.delivery_mode,
4664 + if (use_pci_vector())
4665 + printk(KERN_INFO "Using vector-based indexing\n");
4666 + printk(KERN_DEBUG "IRQ to pin mappings:\n");
4667 + for (i = 0; i < NR_IRQS; i++) {
4668 + struct irq_pin_list *entry = irq_2_pin + i;
4669 + if (entry->pin < 0)
4671 + if (use_pci_vector() && !platform_legacy_irq(i))
4672 + printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
4674 + printk(KERN_DEBUG "IRQ%d ", i);
4676 + printk("-> %d:%d", entry->apic, entry->pin);
4679 + entry = irq_2_pin + entry->next;
4684 + printk(KERN_INFO ".................................... done.\n");
4689 +static void print_APIC_bitfield (int base)
4694 + if (apic_verbosity == APIC_QUIET)
4697 + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
4698 + for (i = 0; i < 8; i++) {
4699 + v = apic_read(base + i*0x10);
4700 + for (j = 0; j < 32; j++) {
4710 +void /*__init*/ print_local_APIC(void * dummy)
4712 + unsigned int v, ver, maxlvt;
4714 + if (apic_verbosity == APIC_QUIET)
4717 + printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
4718 + smp_processor_id(), hard_smp_processor_id());
4719 + v = apic_read(APIC_ID);
4720 + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
4721 + v = apic_read(APIC_LVR);
4722 + printk(KERN_INFO "... APIC VERSION: %08x\n", v);
4723 + ver = GET_APIC_VERSION(v);
4724 + maxlvt = get_maxlvt();
4726 + v = apic_read(APIC_TASKPRI);
4727 + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
4729 + if (APIC_INTEGRATED(ver)) { /* !82489DX */
4730 + v = apic_read(APIC_ARBPRI);
4731 + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
4732 + v & APIC_ARBPRI_MASK);
4733 + v = apic_read(APIC_PROCPRI);
4734 + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
4737 + v = apic_read(APIC_EOI);
4738 + printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
4739 + v = apic_read(APIC_RRR);
4740 + printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
4741 + v = apic_read(APIC_LDR);
4742 + printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
4743 + v = apic_read(APIC_DFR);
4744 + printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
4745 + v = apic_read(APIC_SPIV);
4746 + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
4748 + printk(KERN_DEBUG "... APIC ISR field:\n");
4749 + print_APIC_bitfield(APIC_ISR);
4750 + printk(KERN_DEBUG "... APIC TMR field:\n");
4751 + print_APIC_bitfield(APIC_TMR);
4752 + printk(KERN_DEBUG "... APIC IRR field:\n");
4753 + print_APIC_bitfield(APIC_IRR);
4755 + if (APIC_INTEGRATED(ver)) { /* !82489DX */
4756 + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
4757 + apic_write(APIC_ESR, 0);
4758 + v = apic_read(APIC_ESR);
4759 + printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
4762 + v = apic_read(APIC_ICR);
4763 + printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
4764 + v = apic_read(APIC_ICR2);
4765 + printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
4767 + v = apic_read(APIC_LVTT);
4768 + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
4770 + if (maxlvt > 3) { /* PC is LVT#4. */
4771 + v = apic_read(APIC_LVTPC);
4772 + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
4774 + v = apic_read(APIC_LVT0);
4775 + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
4776 + v = apic_read(APIC_LVT1);
4777 + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
4779 + if (maxlvt > 2) { /* ERR is LVT#3. */
4780 + v = apic_read(APIC_LVTERR);
4781 + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
4784 + v = apic_read(APIC_TMICT);
4785 + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
4786 + v = apic_read(APIC_TMCCT);
4787 + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
4788 + v = apic_read(APIC_TDCR);
4789 + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
4793 +void print_all_local_APICs (void)
4795 + on_each_cpu(print_local_APIC, NULL, 1, 1);
4798 +void /*__init*/ print_PIC(void)
4801 + unsigned long flags;
4803 + if (apic_verbosity == APIC_QUIET)
4806 + printk(KERN_DEBUG "\nprinting PIC contents\n");
4808 + spin_lock_irqsave(&i8259A_lock, flags);
4810 + v = inb(0xa1) << 8 | inb(0x21);
4811 + printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
4813 + v = inb(0xa0) << 8 | inb(0x20);
4814 + printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
4818 + v = inb(0xa0) << 8 | inb(0x20);
4822 + spin_unlock_irqrestore(&i8259A_lock, flags);
4824 + printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
4826 + v = inb(0x4d1) << 8 | inb(0x4d0);
4827 + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
4829 +#endif /* !CONFIG_XEN */
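
For reference, print_PIC() follows standard 8259A programming: the IMR is read straight from the data ports, while IRR and ISR are selected by first writing OCW3 (0x0a or 0x0b) to the command ports. A minimal user-space sketch of the same reads, assuming bare-metal x86 Linux with glibc's <sys/io.h> and root privileges (on a Xen guest these ports are generally not directly accessible; pic_peek.c is a hypothetical file name):

    /* pic_peek.c - user-space version of the 8259A reads done by print_PIC().
       Build: gcc -O2 pic_peek.c -o pic_peek; run as root. */
    #include <stdio.h>
    #include <sys/io.h>

    int main(void)
    {
        unsigned int v;

        /* Gain access to the master/slave PIC ports (0x20/0x21, 0xa0/0xa1). */
        if (ioperm(0x20, 2, 1) || ioperm(0xa0, 2, 1)) {
            perror("ioperm");
            return 1;
        }

        v = inb(0xa1) << 8 | inb(0x21);     /* IMR: data ports, no OCW3 needed */
        printf("PIC IMR: %04x\n", v);

        outb(0x0a, 0xa0); outb(0x0a, 0x20); /* OCW3: select IRR */
        v = inb(0xa0) << 8 | inb(0x20);
        printf("PIC IRR: %04x\n", v);

        outb(0x0b, 0xa0); outb(0x0b, 0x20); /* OCW3: select ISR */
        v = inb(0xa0) << 8 | inb(0x20);
        printf("PIC ISR: %04x\n", v);
        return 0;
    }
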
4831 +static void __init enable_IO_APIC(void)
4833 + union IO_APIC_reg_01 reg_01;
4834 + int i8259_apic, i8259_pin;
4836 + unsigned long flags;
4838 + for (i = 0; i < PIN_MAP_SIZE; i++) {
4839 + irq_2_pin[i].pin = -1;
4840 + irq_2_pin[i].next = 0;
4842 + if (!pirqs_enabled)
4843 + for (i = 0; i < MAX_PIRQS; i++)
4844 + pirq_entries[i] = -1;
4847 + * The number of IO-APIC IRQ registers (== #pins):
4849 + for (apic = 0; apic < nr_ioapics; apic++) {
4850 + spin_lock_irqsave(&ioapic_lock, flags);
4851 + reg_01.raw = io_apic_read(apic, 1);
4852 + spin_unlock_irqrestore(&ioapic_lock, flags);
4853 + nr_ioapic_registers[apic] = reg_01.bits.entries+1;
4855 + for(apic = 0; apic < nr_ioapics; apic++) {
4857 + /* See if any of the pins is in ExtINT mode */
4858 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
4859 + struct IO_APIC_route_entry entry;
4860 + spin_lock_irqsave(&ioapic_lock, flags);
4861 + *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
4862 + *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
4863 + spin_unlock_irqrestore(&ioapic_lock, flags);
4866 + /* If the interrupt line is enabled and in ExtINT mode,
4867 + * we have found the pin where the i8259 is connected.
4869 + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
4870 + ioapic_i8259.apic = apic;
4871 + ioapic_i8259.pin = pin;
4877 + /* Look to see if the MP table has reported the ExtINT */
4878 + /* If we could not find the appropriate pin by looking at the ioapic,
4879 + * the i8259 probably is not connected to the ioapic, but give the
4880 + * mptable a chance anyway.
4882 + i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
4883 + i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
4884 + /* Trust the MP table if nothing is setup in the hardware */
4885 + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
4886 + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
4887 + ioapic_i8259.pin = i8259_pin;
4888 + ioapic_i8259.apic = i8259_apic;
4890 + /* Complain if the MP table and the hardware disagree */
4891 + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
4892 + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
4894 + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
4898 + * Do not trust the IO-APIC being empty at bootup
4904 + * Not an __init, needed by the reboot code
4906 +void disable_IO_APIC(void)
4909 + * Clear the IO-APIC before rebooting:
4915 + * If the i8259 is routed through an IOAPIC
4916 + * Put that IOAPIC in virtual wire mode
4917 + * so legacy interrupts can be delivered.
4919 + if (ioapic_i8259.pin != -1) {
4920 + struct IO_APIC_route_entry entry;
4921 + unsigned long flags;
4923 + memset(&entry, 0, sizeof(entry));
4924 + entry.mask = 0; /* Enabled */
4925 + entry.trigger = 0; /* Edge */
4927 + entry.polarity = 0; /* High */
4928 + entry.delivery_status = 0;
4929 + entry.dest_mode = 0; /* Physical */
4930 + entry.delivery_mode = dest_ExtINT; /* ExtInt */
4932 + entry.dest.physical.physical_dest =
4933 + GET_APIC_ID(apic_read(APIC_ID));
4936 + * Add it to the IO-APIC irq-routing table:
4938 + spin_lock_irqsave(&ioapic_lock, flags);
4939 + io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
4940 + *(((int *)&entry)+1));
4941 + io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
4942 + *(((int *)&entry)+0));
4943 + spin_unlock_irqrestore(&ioapic_lock, flags);
4945 + disconnect_bsp_APIC(ioapic_i8259.pin != -1);
4950 + * function to set the IO-APIC physical IDs based on the
4951 + * values stored in the MPC table.
4953 + * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
4956 +#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
4957 +static void __init setup_ioapic_ids_from_mpc(void)
4959 + union IO_APIC_reg_00 reg_00;
4960 + physid_mask_t phys_id_present_map;
4963 + unsigned char old_id;
4964 + unsigned long flags;
4967 + * Don't check I/O APIC IDs for xAPIC systems. They have
4968 + * no meaning without the serial APIC bus.
4970 + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
4971 + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
4974 + * This is broken; anything with a real cpu count has to
4975 + * circumvent this idiocy regardless.
4977 + phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
4980 + * Set the IOAPIC ID to the value stored in the MPC table.
4982 + for (apic = 0; apic < nr_ioapics; apic++) {
4984 + /* Read the register 0 value */
4985 + spin_lock_irqsave(&ioapic_lock, flags);
4986 + reg_00.raw = io_apic_read(apic, 0);
4987 + spin_unlock_irqrestore(&ioapic_lock, flags);
4989 + old_id = mp_ioapics[apic].mpc_apicid;
4991 + if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
4992 + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
4993 + apic, mp_ioapics[apic].mpc_apicid);
4994 + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
4996 + mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
5000 + * Sanity check, is the ID really free? Every APIC in a
5001 + * system must have a unique ID or we get lots of nice
5002 + * 'stuck on smp_invalidate_needed IPI wait' messages.
5004 + if (check_apicid_used(phys_id_present_map,
5005 + mp_ioapics[apic].mpc_apicid)) {
5006 + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
5007 + apic, mp_ioapics[apic].mpc_apicid);
5008 + for (i = 0; i < get_physical_broadcast(); i++)
5009 + if (!physid_isset(i, phys_id_present_map))
5011 + if (i >= get_physical_broadcast())
5012 + panic("Max APIC ID exceeded!\n");
5013 + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
5015 + physid_set(i, phys_id_present_map);
5016 + mp_ioapics[apic].mpc_apicid = i;
5018 + physid_mask_t tmp;
5019 + tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
5020 + apic_printk(APIC_VERBOSE, "Setting %d in the "
5021 + "phys_id_present_map\n",
5022 + mp_ioapics[apic].mpc_apicid);
5023 + physids_or(phys_id_present_map, phys_id_present_map, tmp);
5028 + * We need to adjust the IRQ routing table
5029 + * if the ID changed.
5031 + if (old_id != mp_ioapics[apic].mpc_apicid)
5032 + for (i = 0; i < mp_irq_entries; i++)
5033 + if (mp_irqs[i].mpc_dstapic == old_id)
5034 + mp_irqs[i].mpc_dstapic
5035 + = mp_ioapics[apic].mpc_apicid;
5038 + * Read the right value from the MPC table and
5039 + * write it into the ID register.
5041 + apic_printk(APIC_VERBOSE, KERN_INFO
5042 + "...changing IO-APIC physical APIC ID to %d ...",
5043 + mp_ioapics[apic].mpc_apicid);
5045 + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
5046 + spin_lock_irqsave(&ioapic_lock, flags);
5047 + io_apic_write(apic, 0, reg_00.raw);
5048 + spin_unlock_irqrestore(&ioapic_lock, flags);
5053 + spin_lock_irqsave(&ioapic_lock, flags);
5054 + reg_00.raw = io_apic_read(apic, 0);
5055 + spin_unlock_irqrestore(&ioapic_lock, flags);
5056 + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
5057 + printk("could not set ID!\n");
5059 + apic_printk(APIC_VERBOSE, " ok.\n");
5063 +static void __init setup_ioapic_ids_from_mpc(void) { }
5068 + * There is a nasty bug in some older SMP boards: their mptable lies
5069 + * about the timer IRQ. We do the following to work around the situation:
5071 + * - timer IRQ defaults to IO-APIC IRQ
5072 + * - if this function detects that timer IRQs are defunct, then we fall
5073 + * back to ISA timer IRQs
5075 +static int __init timer_irq_works(void)
5077 + unsigned long t1 = jiffies;
5079 + local_irq_enable();
5080 + /* Let ten ticks pass... */
5081 + mdelay((10 * 1000) / HZ);
5084 + * Expect a few ticks at least, to be sure some possible
5085 + * glue logic does not lock up after the first one or
5086 + * two ticks in a non-ExtINT mode. Also the local APIC
5087 + * might have cached one ExtINT interrupt. Finally, at
5088 + * least one tick may be lost due to delays.
5090 + if (jiffies - t1 > 4)
5097 + * In the SMP+IOAPIC case it might happen that there is an unspecified
5098 + * number of pending IRQ events unhandled. These cases are very rare,
5099 + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
5100 + * better to do it this way as thus we do not have to be aware of
5101 + * 'pending' interrupts in the IRQ path, except at this point.
5104 + * Edge triggered needs to resend any interrupt
5105 + * that was delayed, but this is now handled in the
5106 + * device-independent code.
5110 + * Starting up an edge-triggered IO-APIC interrupt is
5111 + * nasty - we need to make sure that we get the edge.
5112 + * If it is already asserted for some reason, we need to
5113 + * return 1 to indicate that it was pending.
5115 + * This is not complete - we should be able to fake
5116 + * an edge even if it isn't on the 8259A...
5118 +static unsigned int startup_edge_ioapic_irq(unsigned int irq)
5120 + int was_pending = 0;
5121 + unsigned long flags;
5123 + spin_lock_irqsave(&ioapic_lock, flags);
5125 + disable_8259A_irq(irq);
5126 + if (i8259A_irq_pending(irq))
5129 + __unmask_IO_APIC_irq(irq);
5130 + spin_unlock_irqrestore(&ioapic_lock, flags);
5132 + return was_pending;
5136 + * Once we have recorded IRQ_PENDING already, we can mask the
5137 + * interrupt for real. This prevents IRQ storms from unhandled
5140 +static void ack_edge_ioapic_irq(unsigned int irq)
5143 + if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
5144 + == (IRQ_PENDING | IRQ_DISABLED))
5145 + mask_IO_APIC_irq(irq);
5150 + * Level triggered interrupts can just be masked,
5151 + * and shutting down and starting up the interrupt
5152 + * is the same as enabling and disabling them -- except
5153 + * with a startup need to return a "was pending" value.
5155 + * Level triggered interrupts are special because we
5156 + * do not touch any IO-APIC register while handling
5157 + * them. We ack the APIC in the end-IRQ handler, not
5158 + * in the start-IRQ-handler. Protection against reentrance
5159 + * from the same interrupt is still provided, both by the
5160 + * generic IRQ layer and by the fact that an unacked local
5161 + * APIC does not accept IRQs.
5163 +static unsigned int startup_level_ioapic_irq (unsigned int irq)
5165 + unmask_IO_APIC_irq(irq);
5167 + return 0; /* don't check for pending */
5170 +static void end_level_ioapic_irq (unsigned int irq)
5177 + * It appears there is an erratum which affects at least version 0x11
5178 + * of I/O APIC (that's the 82093AA and cores integrated into various
5179 + * chipsets). Under certain conditions a level-triggered interrupt is
5180 + * erroneously delivered as an edge-triggered one, but the respective IRR
5181 + * bit gets set nevertheless. As a result the I/O unit expects an EOI
5182 + * message but it will never arrive and further interrupts are blocked
5183 + * from the source. The exact reason is so far unknown, but the
5184 + * phenomenon was observed when two consecutive interrupt requests
5185 + * from a given source get delivered to the same CPU and the source is
5186 + * temporarily disabled in between.
5188 + * A workaround is to simulate an EOI message manually. We achieve it
5189 + * by setting the trigger mode to edge and then to level when the edge
5190 + * trigger mode gets detected in the TMR of a local APIC for a
5191 + * level-triggered interrupt. We mask the source for the time of the
5192 + * operation to prevent an edge-triggered interrupt escaping meanwhile.
5193 + * The idea is from Manfred Spraul. --macro
5195 + i = IO_APIC_VECTOR(irq);
5197 + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
5201 + if (!(v & (1 << (i & 0x1f)))) {
5202 + atomic_inc(&irq_mis_count);
5203 + spin_lock(&ioapic_lock);
5204 + __mask_and_edge_IO_APIC_irq(irq);
5205 + __unmask_and_level_IO_APIC_irq(irq);
5206 + spin_unlock(&ioapic_lock);
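
The TMR lookup just above packs the 256 vector bits into eight 32-bit registers spaced 0x10 bytes apart, so (i & ~0x1f) >> 1 is the byte offset of vector i's register and i & 0x1f its bit within that register. A small host-side check of that arithmetic (illustrative only, not part of the patch):

    /* tmr_offset.c - verify the APIC bitfield addressing used above:
       register k (k = i / 32) lives at byte offset k * 0x10. */
    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        for (unsigned int i = 0; i < 256; i++) {
            unsigned int off = (i & ~0x1fu) >> 1;  /* (i/32) * 0x10 */
            unsigned int bit = i & 0x1f;
            assert(off == (i / 32) * 0x10);
            assert(bit == i % 32);
        }
        printf("vector->(offset,bit) mapping holds for all 256 vectors\n");
        return 0;
    }
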
5210 +#ifdef CONFIG_PCI_MSI
5211 +static unsigned int startup_edge_ioapic_vector(unsigned int vector)
5213 + int irq = vector_to_irq(vector);
5215 + return startup_edge_ioapic_irq(irq);
5218 +static void ack_edge_ioapic_vector(unsigned int vector)
5220 + int irq = vector_to_irq(vector);
5222 + move_native_irq(vector);
5223 + ack_edge_ioapic_irq(irq);
5226 +static unsigned int startup_level_ioapic_vector (unsigned int vector)
5228 + int irq = vector_to_irq(vector);
5230 + return startup_level_ioapic_irq (irq);
5233 +static void end_level_ioapic_vector (unsigned int vector)
5235 + int irq = vector_to_irq(vector);
5237 + move_native_irq(vector);
5238 + end_level_ioapic_irq(irq);
5241 +static void mask_IO_APIC_vector (unsigned int vector)
5243 + int irq = vector_to_irq(vector);
5245 + mask_IO_APIC_irq(irq);
5248 +static void unmask_IO_APIC_vector (unsigned int vector)
5250 + int irq = vector_to_irq(vector);
5252 + unmask_IO_APIC_irq(irq);
5256 +static void set_ioapic_affinity_vector (unsigned int vector,
5257 + cpumask_t cpu_mask)
5259 + int irq = vector_to_irq(vector);
5261 + set_native_irq_info(vector, cpu_mask);
5262 + set_ioapic_affinity_irq(irq, cpu_mask);
5267 +static int ioapic_retrigger(unsigned int irq)
5269 + send_IPI_self(IO_APIC_VECTOR(irq));
5275 + * Level and edge triggered IO-APIC interrupts need different handling,
5276 + * so we use two separate IRQ descriptors. Edge triggered IRQs can be
5277 + * handled with the level-triggered descriptor, but that one has slightly
5278 + * more overhead. Level-triggered interrupts cannot be handled with the
5279 + * edge-triggered handler, without risking IRQ storms and other ugly
5282 +static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
5283 + .typename = "IO-APIC-edge",
5284 + .startup = startup_edge_ioapic,
5285 + .shutdown = shutdown_edge_ioapic,
5286 + .enable = enable_edge_ioapic,
5287 + .disable = disable_edge_ioapic,
5288 + .ack = ack_edge_ioapic,
5289 + .end = end_edge_ioapic,
5291 + .set_affinity = set_ioapic_affinity,
5293 + .retrigger = ioapic_retrigger,
5296 +static struct hw_interrupt_type ioapic_level_type __read_mostly = {
5297 + .typename = "IO-APIC-level",
5298 + .startup = startup_level_ioapic,
5299 + .shutdown = shutdown_level_ioapic,
5300 + .enable = enable_level_ioapic,
5301 + .disable = disable_level_ioapic,
5302 + .ack = mask_and_ack_level_ioapic,
5303 + .end = end_level_ioapic,
5305 + .set_affinity = set_ioapic_affinity,
5307 + .retrigger = ioapic_retrigger,
5309 +#endif /* !CONFIG_XEN */
5311 +static inline void init_IO_APIC_traps(void)
5316 + * NOTE! The local APIC isn't very good at handling
5317 + * multiple interrupts at the same interrupt level.
5318 + * As the interrupt level is determined by taking the
5319 + * vector number and shifting that right by 4, we
5320 + * want to spread these out a bit so that they don't
5321 + * all fall in the same interrupt level.
5323 + * Also, we've got to be careful not to trash gate
5324 + * 0x80, because int 0x80 is hm, kind of importantish. ;)
5326 + for (irq = 0; irq < NR_IRQS ; irq++) {
5328 + if (use_pci_vector()) {
5329 + if (!platform_legacy_irq(tmp))
5330 + if ((tmp = vector_to_irq(tmp)) == -1)
5333 + if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
5335 + * Hmm.. We don't have an entry for this,
5336 + * so default to an old-fashioned 8259
5337 + * interrupt if we can..
5340 + make_8259A_irq(irq);
5343 + /* Strange. Oh, well.. */
5344 + irq_desc[irq].chip = &no_irq_type;
5351 +static void enable_lapic_irq (unsigned int irq)
5355 + v = apic_read(APIC_LVT0);
5356 + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
5359 +static void disable_lapic_irq (unsigned int irq)
5363 + v = apic_read(APIC_LVT0);
5364 + apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
5367 +static void ack_lapic_irq (unsigned int irq)
5372 +static void end_lapic_irq (unsigned int i) { /* nothing */ }
5374 +static struct hw_interrupt_type lapic_irq_type __read_mostly = {
5375 + .typename = "local-APIC-edge",
5376 + .startup = NULL, /* startup_irq() not used for IRQ0 */
5377 + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
5378 + .enable = enable_lapic_irq,
5379 + .disable = disable_lapic_irq,
5380 + .ack = ack_lapic_irq,
5381 + .end = end_lapic_irq
5384 +static void setup_nmi (void)
5387 + * Dirty trick to enable the NMI watchdog ...
5388 + * We put the 8259A master into AEOI mode and
5389 + * unmask LVT0 as NMI on all local APICs.
5391 + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
5392 + * is from Maciej W. Rozycki - so we do not have to EOI from
5393 + * the NMI handler or the timer interrupt.
5395 + apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
5397 + on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
5399 + apic_printk(APIC_VERBOSE, " done.\n");
5403 + * This looks a bit hackish, but it's about the only way of sending
5404 + * a few INTA cycles to 8259As and any associated glue logic. ICR does
5405 + * not support the ExtINT mode, unfortunately. We need to send these
5406 + * cycles as some i82489DX-based boards have glue logic that keeps the
5407 + * 8259A interrupt line asserted until INTA. --macro
5409 +static inline void unlock_ExtINT_logic(void)
5412 + struct IO_APIC_route_entry entry0, entry1;
5413 + unsigned char save_control, save_freq_select;
5414 + unsigned long flags;
5416 + pin = find_isa_irq_pin(8, mp_INT);
5417 + apic = find_isa_irq_apic(8, mp_INT);
5421 + spin_lock_irqsave(&ioapic_lock, flags);
5422 + *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
5423 + *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
5424 + spin_unlock_irqrestore(&ioapic_lock, flags);
5425 + clear_IO_APIC_pin(apic, pin);
5427 + memset(&entry1, 0, sizeof(entry1));
5429 + entry1.dest_mode = 0; /* physical delivery */
5430 + entry1.mask = 0; /* unmask IRQ now */
5431 + entry1.dest.physical.physical_dest = hard_smp_processor_id();
5432 + entry1.delivery_mode = dest_ExtINT;
5433 + entry1.polarity = entry0.polarity;
5434 + entry1.trigger = 0;
5435 + entry1.vector = 0;
5437 + spin_lock_irqsave(&ioapic_lock, flags);
5438 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
5439 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
5440 + spin_unlock_irqrestore(&ioapic_lock, flags);
5442 + save_control = CMOS_READ(RTC_CONTROL);
5443 + save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
5444 + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
5446 + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
5451 + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
5455 + CMOS_WRITE(save_control, RTC_CONTROL);
5456 + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
5457 + clear_IO_APIC_pin(apic, pin);
5459 + spin_lock_irqsave(&ioapic_lock, flags);
5460 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
5461 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
5462 + spin_unlock_irqrestore(&ioapic_lock, flags);
5465 +int timer_uses_ioapic_pin_0;
5468 + * This code may look a bit paranoid, but it's supposed to cooperate with
5469 + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
5470 + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
5471 + * fanatically on his truly buggy board.
5473 +static inline void check_timer(void)
5475 + int apic1, pin1, apic2, pin2;
5479 + * get/set the timer IRQ vector:
5481 + disable_8259A_irq(0);
5482 + vector = assign_irq_vector(0);
5483 + set_intr_gate(vector, interrupt[0]);
5486 + * Subtle, code in do_timer_interrupt() expects an AEOI
5487 + * mode for the 8259A whenever interrupts are routed
5488 + * through I/O APICs. Also IRQ0 has to be enabled in
5489 + * the 8259A which implies the virtual wire has to be
5490 + * disabled in the local APIC.
5492 + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
5495 + if (timer_over_8254 > 0)
5496 + enable_8259A_irq(0);
5498 + pin1 = find_isa_irq_pin(0, mp_INT);
5499 + apic1 = find_isa_irq_apic(0, mp_INT);
5500 + pin2 = ioapic_i8259.pin;
5501 + apic2 = ioapic_i8259.apic;
5504 + timer_uses_ioapic_pin_0 = 1;
5506 + printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
5507 + vector, apic1, pin1, apic2, pin2);
5511 + * Ok, does IRQ0 through the IOAPIC work?
5513 + unmask_IO_APIC_irq(0);
5514 + if (timer_irq_works()) {
5515 + if (nmi_watchdog == NMI_IO_APIC) {
5516 + disable_8259A_irq(0);
5518 + enable_8259A_irq(0);
5520 + if (disable_timer_pin_1 > 0)
5521 + clear_IO_APIC_pin(0, pin1);
5524 + clear_IO_APIC_pin(apic1, pin1);
5525 + printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
5529 + printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
5531 + printk("\n..... (found pin %d) ...", pin2);
5533 + * legacy devices should be connected to IO APIC #0
5535 + setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
5536 + if (timer_irq_works()) {
5537 + printk("works.\n");
5539 + replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
5541 + add_pin_to_irq(0, apic2, pin2);
5542 + if (nmi_watchdog == NMI_IO_APIC) {
5548 + * Cleanup, just in case ...
5550 + clear_IO_APIC_pin(apic2, pin2);
5552 + printk(" failed.\n");
5554 + if (nmi_watchdog == NMI_IO_APIC) {
5555 + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
5559 + printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
5561 + disable_8259A_irq(0);
5562 + irq_desc[0].chip = &lapic_irq_type;
5563 + apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
5564 + enable_8259A_irq(0);
5566 + if (timer_irq_works()) {
5567 + printk(" works.\n");
5570 + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
5571 + printk(" failed.\n");
5573 + printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
5577 + make_8259A_irq(0);
5578 + apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
5580 + unlock_ExtINT_logic();
5582 + if (timer_irq_works()) {
5583 + printk(" works.\n");
5586 + printk(" failed :(.\n");
5587 + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
5588 + "report. Then try booting with the 'noapic' option");
5591 +int timer_uses_ioapic_pin_0 = 0;
5592 +#define check_timer() ((void)0)
5597 + * IRQs that are handled by the PIC in the MPS IOAPIC case.
5598 + * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
5599 + * Linux doesn't really care, as it's not actually used
5600 + * for any interrupt handling anyway.
5602 +#define PIC_IRQS (1 << PIC_CASCADE_IR)
5604 +void __init setup_IO_APIC(void)
5609 + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
5611 + io_apic_irqs = ~PIC_IRQS;
5613 + printk("ENABLING IO-APIC IRQs\n");
5616 + * Set up IO-APIC IRQ routing.
5619 + setup_ioapic_ids_from_mpc();
5623 + setup_IO_APIC_irqs();
5624 + init_IO_APIC_traps();
5630 +static int __init setup_disable_8254_timer(char *s)
5632 + timer_over_8254 = -1;
5635 +static int __init setup_enable_8254_timer(char *s)
5637 + timer_over_8254 = 2;
5641 +__setup("disable_8254_timer", setup_disable_8254_timer);
5642 +__setup("enable_8254_timer", setup_enable_8254_timer);
5645 + * Called after all the initialization is done. If we didn't find any
5646 + * APIC bugs then we can allow the modify fast path
5649 +static int __init io_apic_bug_finalize(void)
5651 + if(sis_apic_bug == -1)
5653 + if (is_initial_xendomain()) {
5654 + struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
5655 + op.u.platform_quirk.quirk_id = sis_apic_bug ?
5656 + QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL;
5657 + VOID(HYPERVISOR_platform_op(&op));
5662 +late_initcall(io_apic_bug_finalize);
5664 +struct sysfs_ioapic_data {
5665 + struct sys_device dev;
5666 + struct IO_APIC_route_entry entry[0];
5668 +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
5670 +static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
5672 + struct IO_APIC_route_entry *entry;
5673 + struct sysfs_ioapic_data *data;
5674 + unsigned long flags;
5677 + data = container_of(dev, struct sysfs_ioapic_data, dev);
5678 + entry = data->entry;
5679 + spin_lock_irqsave(&ioapic_lock, flags);
5680 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
5681 + *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
5682 + *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
5684 + spin_unlock_irqrestore(&ioapic_lock, flags);
5689 +static int ioapic_resume(struct sys_device *dev)
5691 + struct IO_APIC_route_entry *entry;
5692 + struct sysfs_ioapic_data *data;
5693 + unsigned long flags;
5694 + union IO_APIC_reg_00 reg_00;
5697 + data = container_of(dev, struct sysfs_ioapic_data, dev);
5698 + entry = data->entry;
5700 + spin_lock_irqsave(&ioapic_lock, flags);
5701 + reg_00.raw = io_apic_read(dev->id, 0);
5702 + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
5703 + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
5704 + io_apic_write(dev->id, 0, reg_00.raw);
5706 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
5707 + io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
5708 + io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
5710 + spin_unlock_irqrestore(&ioapic_lock, flags);
5715 +static struct sysdev_class ioapic_sysdev_class = {
5716 + set_kset_name("ioapic"),
5718 + .suspend = ioapic_suspend,
5719 + .resume = ioapic_resume,
5723 +static int __init ioapic_init_sysfs(void)
5725 + struct sys_device * dev;
5726 + int i, size, error = 0;
5728 + error = sysdev_class_register(&ioapic_sysdev_class);
5732 + for (i = 0; i < nr_ioapics; i++ ) {
5733 + size = sizeof(struct sys_device) + nr_ioapic_registers[i]
5734 + * sizeof(struct IO_APIC_route_entry);
5735 + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
5736 + if (!mp_ioapic_data[i]) {
5737 + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
5740 + memset(mp_ioapic_data[i], 0, size);
5741 + dev = &mp_ioapic_data[i]->dev;
5743 + dev->cls = &ioapic_sysdev_class;
5744 + error = sysdev_register(dev);
5746 + kfree(mp_ioapic_data[i]);
5747 + mp_ioapic_data[i] = NULL;
5748 + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
5756 +device_initcall(ioapic_init_sysfs);
5758 +/* --------------------------------------------------------------------------
5759 + ACPI-based IOAPIC Configuration
5760 + -------------------------------------------------------------------------- */
5764 +int __init io_apic_get_unique_id (int ioapic, int apic_id)
5767 + union IO_APIC_reg_00 reg_00;
5768 + static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
5769 + physid_mask_t tmp;
5770 + unsigned long flags;
5774 + * The P4 platform supports up to 256 APIC IDs on two separate APIC
5775 + * buses (one for LAPICs, one for IOAPICs), where predecessors only
5776 + * support up to 16 on one shared APIC bus.
5778 + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
5779 + * advantage of the new APIC bus architecture.
5782 + if (physids_empty(apic_id_map))
5783 + apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
5785 + spin_lock_irqsave(&ioapic_lock, flags);
5786 + reg_00.raw = io_apic_read(ioapic, 0);
5787 + spin_unlock_irqrestore(&ioapic_lock, flags);
5789 + if (apic_id >= get_physical_broadcast()) {
5790 + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
5791 + "%d\n", ioapic, apic_id, reg_00.bits.ID);
5792 + apic_id = reg_00.bits.ID;
5796 + * Every APIC in a system must have a unique ID or we get lots of nice
5797 + * 'stuck on smp_invalidate_needed IPI wait' messages.
5799 + if (check_apicid_used(apic_id_map, apic_id)) {
5801 + for (i = 0; i < get_physical_broadcast(); i++) {
5802 + if (!check_apicid_used(apic_id_map, i))
5806 + if (i == get_physical_broadcast())
5807 + panic("Max apic_id exceeded!\n");
5809 + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
5810 + "trying %d\n", ioapic, apic_id, i);
5815 + tmp = apicid_to_cpu_present(apic_id);
5816 + physids_or(apic_id_map, apic_id_map, tmp);
5818 + if (reg_00.bits.ID != apic_id) {
5819 + reg_00.bits.ID = apic_id;
5821 + spin_lock_irqsave(&ioapic_lock, flags);
5822 + io_apic_write(ioapic, 0, reg_00.raw);
5823 + reg_00.raw = io_apic_read(ioapic, 0);
5824 + spin_unlock_irqrestore(&ioapic_lock, flags);
5826 + /* Sanity check */
5827 + if (reg_00.bits.ID != apic_id) {
5828 + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
5833 + apic_printk(APIC_VERBOSE, KERN_INFO
5834 + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
5835 +#endif /* !CONFIG_XEN */
5841 +int __init io_apic_get_version (int ioapic)
5843 + union IO_APIC_reg_01 reg_01;
5844 + unsigned long flags;
5846 + spin_lock_irqsave(&ioapic_lock, flags);
5847 + reg_01.raw = io_apic_read(ioapic, 1);
5848 + spin_unlock_irqrestore(&ioapic_lock, flags);
5850 + return reg_01.bits.version;
5854 +int __init io_apic_get_redir_entries (int ioapic)
5856 + union IO_APIC_reg_01 reg_01;
5857 + unsigned long flags;
5859 + spin_lock_irqsave(&ioapic_lock, flags);
5860 + reg_01.raw = io_apic_read(ioapic, 1);
5861 + spin_unlock_irqrestore(&ioapic_lock, flags);
5863 + return reg_01.bits.entries;
5867 +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
5869 + struct IO_APIC_route_entry entry;
5870 + unsigned long flags;
5872 + if (!IO_APIC_IRQ(irq)) {
5873 + printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
5879 + * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
5880 + * Note that we mask (disable) IRQs now -- these get enabled when the
5881 + * corresponding device driver registers for this IRQ.
5884 + memset(&entry,0,sizeof(entry));
5886 + entry.delivery_mode = INT_DELIVERY_MODE;
5887 + entry.dest_mode = INT_DEST_MODE;
5888 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
5889 + entry.trigger = edge_level;
5890 + entry.polarity = active_high_low;
5894 + * IRQs < 16 are already in the irq_2_pin[] map
5897 + add_pin_to_irq(irq, ioapic, pin);
5899 + entry.vector = assign_irq_vector(irq);
5901 + apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
5902 + "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
5903 + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
5904 + edge_level, active_high_low);
5906 + ioapic_register_intr(irq, entry.vector, edge_level);
5908 + if (!ioapic && (irq < 16))
5909 + disable_8259A_irq(irq);
5911 + spin_lock_irqsave(&ioapic_lock, flags);
5912 + io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
5913 + io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
5914 + set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
5915 + spin_unlock_irqrestore(&ioapic_lock, flags);
5920 +#endif /* CONFIG_ACPI */
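
A pattern worth noting throughout this file: a 64-bit IO_APIC_route_entry is always programmed as two 32-bit halves, the high word (register 0x11 + 2*pin) before the low word (0x10 + 2*pin) that carries the mask bit, so a pin is never live while the entry is half-written. A stand-alone sketch of that ordering, with an assumed, simplified field layout (the real struct lives in the io_apic headers):

    /* route_entry.c - illustration of the two-halves write order used above. */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Simplified stand-in for struct IO_APIC_route_entry (assumed layout). */
    struct route_entry {
        uint32_t vector    :  8,
                 delivery  :  3,
                 dest_mode :  1,
                 status    :  1,
                 polarity  :  1,
                 irr       :  1,
                 trigger   :  1,
                 mask      :  1,   /* low dword: the mask bit lives here */
                 reserved  : 15;
        uint32_t reserved2 : 24,
                 dest      :  8;   /* high dword: destination */
    };

    static void io_apic_write_stub(unsigned reg, uint32_t val)
    {
        printf("write reg 0x%02x = %08x\n", reg, val);
    }

    int main(void)
    {
        struct route_entry e;
        uint32_t w[2];
        int pin = 3;

        memset(&e, 0, sizeof(e));
        e.vector = 0x31; e.trigger = 1; e.mask = 0; e.dest = 0x01;
        memcpy(w, &e, sizeof(w));

        /* High word first, then the (unmasking) low word - same order as above. */
        io_apic_write_stub(0x11 + 2 * pin, w[1]);
        io_apic_write_stub(0x10 + 2 * pin, w[0]);
        return 0;
    }
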
5921 Index: head-2008-11-25/arch/x86/kernel/ioport_32-xen.c
5922 ===================================================================
5923 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
5924 +++ head-2008-11-25/arch/x86/kernel/ioport_32-xen.c 2008-01-28 12:24:19.000000000 +0100
5927 + * linux/arch/i386/kernel/ioport.c
5929 + * This contains the io-permission bitmap code - written by obz, with changes
5933 +#include <linux/sched.h>
5934 +#include <linux/kernel.h>
5935 +#include <linux/capability.h>
5936 +#include <linux/errno.h>
5937 +#include <linux/types.h>
5938 +#include <linux/ioport.h>
5939 +#include <linux/smp.h>
5940 +#include <linux/smp_lock.h>
5941 +#include <linux/stddef.h>
5942 +#include <linux/slab.h>
5943 +#include <linux/thread_info.h>
5944 +#include <xen/interface/physdev.h>
5946 +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
5947 +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
5949 + unsigned long mask;
5950 + unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
5951 + unsigned int low_index = base & (BITS_PER_LONG-1);
5952 + int length = low_index + extent;
5954 + if (low_index != 0) {
5955 + mask = (~0UL << low_index);
5956 + if (length < BITS_PER_LONG)
5957 + mask &= ~(~0UL << length);
5959 + *bitmap_base++ |= mask;
5961 + *bitmap_base++ &= ~mask;
5962 + length -= BITS_PER_LONG;
5965 + mask = (new_value ? ~0UL : 0UL);
5966 + while (length >= BITS_PER_LONG) {
5967 + *bitmap_base++ = mask;
5968 + length -= BITS_PER_LONG;
5972 + mask = ~(~0UL << length);
5974 + *bitmap_base++ |= mask;
5976 + *bitmap_base++ &= ~mask;
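
set_bitmap() handles a partial leading word, whole middle words, and a partial trailing word; note the inverted sense at the call site below, where sys_ioperm() passes !turn_on because a 0 bit in the I/O bitmap grants access. A hypothetical host-side harness for the same routine (the elided if/else branches are reconstructed; 64-bit longs assumed):

    /* set_bitmap_test.c - exercise the bit-range logic on the host. */
    #include <stdio.h>

    #define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

    static void set_bitmap(unsigned long *bitmap, unsigned int base,
                           unsigned int extent, int new_value)
    {
        unsigned long mask;
        unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
        unsigned int low_index = base & (BITS_PER_LONG - 1);
        int length = low_index + extent;

        if (low_index != 0) {                /* partial leading word */
            mask = ~0UL << low_index;
            if (length < BITS_PER_LONG)
                mask &= ~(~0UL << length);
            if (new_value)
                *bitmap_base++ |= mask;
            else
                *bitmap_base++ &= ~mask;
            length -= BITS_PER_LONG;
        }

        mask = new_value ? ~0UL : 0UL;       /* full middle words */
        while (length >= BITS_PER_LONG) {
            *bitmap_base++ = mask;
            length -= BITS_PER_LONG;
        }

        if (length > 0) {                    /* partial trailing word */
            mask = ~(~0UL << length);
            if (new_value)
                *bitmap_base++ |= mask;
            else
                *bitmap_base++ &= ~mask;
        }
    }

    int main(void)
    {
        unsigned long map[4] = { 0 };

        set_bitmap(map, 60, 10, 1);   /* bits 60..69 span words 0 and 1 */
        printf("%016lx %016lx\n", map[0], map[1]);
        return 0;
    }
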
5982 + * this changes the io permissions bitmap in the current task.
5984 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
5986 + struct thread_struct * t = &current->thread;
5987 + unsigned long *bitmap;
5988 + struct physdev_set_iobitmap set_iobitmap;
5990 + if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
5992 + if (turn_on && !capable(CAP_SYS_RAWIO))
5996 + * If it's the first ioperm() call in this thread's lifetime, set the
5997 + * IO bitmap up. ioperm() is much less timing critical than clone(),
5998 + * which is why we delay this operation until now:
6000 + if (!t->io_bitmap_ptr) {
6001 + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
6005 + memset(bitmap, 0xff, IO_BITMAP_BYTES);
6006 + t->io_bitmap_ptr = bitmap;
6007 + set_thread_flag(TIF_IO_BITMAP);
6009 + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
6010 + set_iobitmap.nr_ports = IO_BITMAP_BITS;
6011 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
6015 + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
6021 + * sys_iopl has to be used when you want to access the IO ports
6022 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
6023 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
6025 + * Here we just change the eflags value on the stack: we allow
6026 + * only the super-user to do it. This depends on the stack-layout
6027 + * on system-call entry - see also fork() and the signal handling
6031 +asmlinkage long sys_iopl(unsigned long unused)
6033 + volatile struct pt_regs * regs = (struct pt_regs *) &unused;
6034 + unsigned int level = regs->ebx;
6035 + struct thread_struct *t = &current->thread;
6036 + unsigned int old = (t->iopl >> 12) & 3;
6040 + /* Trying to gain more privileges? */
6041 + if (level > old) {
6042 + if (!capable(CAP_SYS_RAWIO))
6045 + t->iopl = level << 12;
6046 + set_iopl_mask(t->iopl);
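
These two syscalls are what glibc's ioperm() and iopl() wrappers reach; under Xen the bitmap is handed to the hypervisor via PHYSDEVOP_set_iobitmap rather than loaded into the TSS. Typical consumption from user space (a sketch; 0x378 is the legacy parallel port, root required, lp_strobe.c is a hypothetical file name):

    /* lp_strobe.c - minimal ioperm() consumer, exercising the sys_ioperm path. */
    #include <stdio.h>
    #include <sys/io.h>

    int main(void)
    {
        if (ioperm(0x378, 3, 1)) {     /* data/status/control ports */
            perror("ioperm");
            return 1;
        }
        outb(0x55, 0x378);             /* drive a test pattern on the data lines */
        printf("status = %02x\n", inb(0x379));
        ioperm(0x378, 3, 0);           /* drop access again */
        return 0;
    }
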
6049 Index: head-2008-11-25/arch/x86/kernel/irq_32-xen.c
6050 ===================================================================
6051 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
6052 +++ head-2008-11-25/arch/x86/kernel/irq_32-xen.c 2008-10-29 09:55:56.000000000 +0100
6055 + * linux/arch/i386/kernel/irq.c
6057 + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
6059 + * This file contains the lowest level x86-specific interrupt
6060 + * entry, irq-stacks and irq statistics code. All the remaining
6061 + * irq logic is done by the generic kernel/irq/ code and
6062 + * by the x86-specific irq controller code. (e.g. i8259.c and
6066 +#include <asm/uaccess.h>
6067 +#include <linux/module.h>
6068 +#include <linux/seq_file.h>
6069 +#include <linux/interrupt.h>
6070 +#include <linux/kernel_stat.h>
6071 +#include <linux/notifier.h>
6072 +#include <linux/cpu.h>
6073 +#include <linux/delay.h>
6075 +DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
6076 +EXPORT_PER_CPU_SYMBOL(irq_stat);
6078 +#ifndef CONFIG_X86_LOCAL_APIC
6080 + * 'what should we do if we get a hw irq event on an illegal vector'.
6081 + * Each architecture has to answer this itself.
6083 +void ack_bad_irq(unsigned int irq)
6085 + printk("unexpected IRQ trap at vector %02x\n", irq);
6089 +#ifdef CONFIG_4KSTACKS
6091 + * per-CPU IRQ handling contexts (thread information and stack)
6094 + struct thread_info tinfo;
6095 + u32 stack[THREAD_SIZE/sizeof(u32)];
6098 +static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
6099 +static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
6103 + * do_IRQ handles all normal device IRQs (the special
6104 + * SMP cross-CPU interrupts have their own specific
6107 +fastcall unsigned int do_IRQ(struct pt_regs *regs)
6109 + /* high bit used in ret_from_ code */
6110 + int irq = ~regs->orig_eax;
6111 +#ifdef CONFIG_4KSTACKS
6112 + union irq_ctx *curctx, *irqctx;
6116 + if (unlikely((unsigned)irq >= NR_IRQS)) {
6117 + printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
6118 + __FUNCTION__, irq);
6123 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
6124 + /* Debugging check for stack overflow: is there less than 1KB free? */
6128 + __asm__ __volatile__("andl %%esp,%0" :
6129 + "=r" (esp) : "0" (THREAD_SIZE - 1));
6130 + if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
6131 + printk("do_IRQ: stack overflow: %ld\n",
6132 + esp - sizeof(struct thread_info));
6138 +#ifdef CONFIG_4KSTACKS
6140 + curctx = (union irq_ctx *) current_thread_info();
6141 + irqctx = hardirq_ctx[smp_processor_id()];
6144 + * this is where we switch to the IRQ stack. However, if we are
6145 + * already using the IRQ stack (because we interrupted a hardirq
6146 + * handler) we can't do that and just have to keep using the
6147 + * current stack (which is the irq stack already after all)
6149 + if (curctx != irqctx) {
6150 + int arg1, arg2, ebx;
6152 + /* build the stack frame on the IRQ stack */
6153 + isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
6154 + irqctx->tinfo.task = curctx->tinfo.task;
6155 + irqctx->tinfo.previous_esp = current_stack_pointer;
6158 + * Copy the softirq bits in preempt_count so that the
6159 + * softirq checks work in the hardirq context.
6161 + irqctx->tinfo.preempt_count =
6162 + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
6163 + (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
6166 + " xchgl %%ebx,%%esp \n"
6167 + " call __do_IRQ \n"
6168 + " movl %%ebx,%%esp \n"
6169 + : "=a" (arg1), "=d" (arg2), "=b" (ebx)
6170 + : "0" (irq), "1" (regs), "2" (isp)
6171 + : "memory", "cc", "ecx"
6175 + __do_IRQ(irq, regs);
6182 +#ifdef CONFIG_4KSTACKS
6185 + * These should really be __section__(".bss.page_aligned") as well, but
6186 + * gcc 3.0 and earlier don't handle that correctly.
6188 +static char softirq_stack[NR_CPUS * THREAD_SIZE]
6189 + __attribute__((__aligned__(THREAD_SIZE)));
6191 +static char hardirq_stack[NR_CPUS * THREAD_SIZE]
6192 + __attribute__((__aligned__(THREAD_SIZE)));
6195 + * allocate per-cpu stacks for hardirq and for softirq processing
6197 +void irq_ctx_init(int cpu)
6199 + union irq_ctx *irqctx;
6201 + if (hardirq_ctx[cpu])
6204 + irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
6205 + irqctx->tinfo.task = NULL;
6206 + irqctx->tinfo.exec_domain = NULL;
6207 + irqctx->tinfo.cpu = cpu;
6208 + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
6209 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
6211 + hardirq_ctx[cpu] = irqctx;
6213 + irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
6214 + irqctx->tinfo.task = NULL;
6215 + irqctx->tinfo.exec_domain = NULL;
6216 + irqctx->tinfo.cpu = cpu;
6217 + irqctx->tinfo.preempt_count = 0;
6218 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
6220 + softirq_ctx[cpu] = irqctx;
6222 + printk("CPU %u irqstacks, hard=%p soft=%p\n",
6223 + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
6226 +void irq_ctx_exit(int cpu)
6228 + hardirq_ctx[cpu] = NULL;
6231 +extern asmlinkage void __do_softirq(void);
6233 +asmlinkage void do_softirq(void)
6235 + unsigned long flags;
6236 + struct thread_info *curctx;
6237 + union irq_ctx *irqctx;
6240 + if (in_interrupt())
6243 + local_irq_save(flags);
6245 + if (local_softirq_pending()) {
6246 + curctx = current_thread_info();
6247 + irqctx = softirq_ctx[smp_processor_id()];
6248 + irqctx->tinfo.task = curctx->task;
6249 + irqctx->tinfo.previous_esp = current_stack_pointer;
6251 + /* build the stack frame on the softirq stack */
6252 + isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
6255 + " xchgl %%ebx,%%esp \n"
6256 + " call __do_softirq \n"
6257 + " movl %%ebx,%%esp \n"
6260 + : "memory", "cc", "edx", "ecx", "eax"
6263 + * Shouldn't happen; we returned above if in_interrupt():
6265 + WARN_ON_ONCE(softirq_count());
6268 + local_irq_restore(flags);
6271 +EXPORT_SYMBOL(do_softirq);
6275 + * Interrupt statistics:
6278 +atomic_t irq_err_count;
6281 + * /proc/interrupts printing:
6284 +int show_interrupts(struct seq_file *p, void *v)
6286 + int i = *(loff_t *) v, j;
6287 + struct irqaction * action;
6288 + unsigned long flags;
6291 + seq_printf(p, " ");
6292 + for_each_online_cpu(j)
6293 + seq_printf(p, "CPU%-8d",j);
6294 + seq_putc(p, '\n');
6297 + if (i < NR_IRQS) {
6298 + spin_lock_irqsave(&irq_desc[i].lock, flags);
6299 + action = irq_desc[i].action;
6302 + seq_printf(p, "%3d: ",i);
6304 + seq_printf(p, "%10u ", kstat_irqs(i));
6306 + for_each_online_cpu(j)
6307 + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
6309 + seq_printf(p, " %14s", irq_desc[i].chip->typename);
6310 + seq_printf(p, " %s", action->name);
6312 + for (action=action->next; action; action = action->next)
6313 + seq_printf(p, ", %s", action->name);
6315 + seq_putc(p, '\n');
6317 + spin_unlock_irqrestore(&irq_desc[i].lock, flags);
6318 + } else if (i == NR_IRQS) {
6319 + seq_printf(p, "NMI: ");
6320 + for_each_online_cpu(j)
6321 + seq_printf(p, "%10u ", nmi_count(j));
6322 + seq_putc(p, '\n');
6323 +#ifdef CONFIG_X86_LOCAL_APIC
6324 + seq_printf(p, "LOC: ");
6325 + for_each_online_cpu(j)
6326 + seq_printf(p, "%10u ",
6327 + per_cpu(irq_stat,j).apic_timer_irqs);
6328 + seq_putc(p, '\n');
6330 + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
6331 +#if defined(CONFIG_X86_IO_APIC)
6332 + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
6338 +#ifdef CONFIG_HOTPLUG_CPU
6340 +void fixup_irqs(cpumask_t map)
6343 + static int warned;
6345 + for (irq = 0; irq < NR_IRQS; irq++) {
6350 + cpus_and(mask, irq_desc[irq].affinity, map);
6351 + if (any_online_cpu(mask) == NR_CPUS) {
6352 + /*printk("Breaking affinity for irq %i\n", irq);*/
6355 + if (irq_desc[irq].chip->set_affinity)
6356 + irq_desc[irq].chip->set_affinity(irq, mask);
6357 + else if (irq_desc[irq].action && !(warned++))
6358 + printk("Cannot set affinity for irq %i\n", irq);
6363 + /* Ingo Molnar says: "after the IO-APIC masks have been redirected
6364 + [note the nop - the interrupt-enable boundary on x86 is two
6365 + instructions from sti] - to flush out pending hardirqs and
6366 + IPIs. After this point nothing is supposed to reach this CPU." */
6367 + __asm__ __volatile__("sti; nop; cli");
6370 + /* That doesn't seem sufficient. Give it 1ms. */
6371 + local_irq_enable();
6373 + local_irq_disable();
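
do_IRQ() above switches onto a per-CPU IRQ stack by saving the current stack pointer into the irq_ctx thread_info and exchanging %esp around the __do_IRQ call. The portable analogue of that trick is running a handler on a separate preallocated stack; a conceptual sketch using ucontext (how user space would mimic the idea, not how the kernel does it):

    /* stack_switch.c - run a function on its own preallocated stack,
       conceptually like do_IRQ() switching to hardirq_ctx above. */
    #include <stdio.h>
    #include <ucontext.h>

    static ucontext_t main_ctx, irq_ctx_uc;
    static char irq_stack[16 * 1024];   /* stand-in for the 4K IRQ stack */

    static void handler(void)
    {
        int probe;
        printf("handler running near %p (private stack)\n", (void *)&probe);
        /* returning resumes main_ctx via uc_link */
    }

    int main(void)
    {
        int probe;

        getcontext(&irq_ctx_uc);
        irq_ctx_uc.uc_stack.ss_sp = irq_stack;
        irq_ctx_uc.uc_stack.ss_size = sizeof(irq_stack);
        irq_ctx_uc.uc_link = &main_ctx;           /* where to return */
        makecontext(&irq_ctx_uc, handler, 0);

        printf("main running near %p\n", (void *)&probe);
        swapcontext(&main_ctx, &irq_ctx_uc);      /* the 'xchgl %%ebx,%%esp' analogue */
        printf("back on the main stack\n");
        return 0;
    }
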
6378 Index: head-2008-11-25/arch/x86/kernel/ldt_32-xen.c
6379 ===================================================================
6380 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
6381 +++ head-2008-11-25/arch/x86/kernel/ldt_32-xen.c 2007-06-12 13:12:48.000000000 +0200
6384 + * linux/kernel/ldt.c
6386 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
6387 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
6390 +#include <linux/errno.h>
6391 +#include <linux/sched.h>
6392 +#include <linux/string.h>
6393 +#include <linux/mm.h>
6394 +#include <linux/smp.h>
6395 +#include <linux/smp_lock.h>
6396 +#include <linux/vmalloc.h>
6397 +#include <linux/slab.h>
6399 +#include <asm/uaccess.h>
6400 +#include <asm/system.h>
6401 +#include <asm/ldt.h>
6402 +#include <asm/desc.h>
6403 +#include <asm/mmu_context.h>
6405 +#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
6406 +static void flush_ldt(void *null)
6408 + if (current->active_mm)
6409 + load_LDT(&current->active_mm->context);
6413 +static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
6419 + if (mincount <= pc->size)
6421 + oldsize = pc->size;
6422 + mincount = (mincount+511)&(~511);
6423 + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
6424 + newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
6426 + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
6432 + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
6434 + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
6437 + pc->size = mincount;
6443 + preempt_disable();
6445 + make_pages_readonly(
6447 + (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
6448 + XENFEAT_writable_descriptor_tables);
6451 + mask = cpumask_of_cpu(smp_processor_id());
6452 + if (!cpus_equal(current->mm->cpu_vm_mask, mask))
6453 + smp_call_function(flush_ldt, NULL, 1, 1);
6458 + make_pages_writable(
6460 + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
6461 + XENFEAT_writable_descriptor_tables);
6462 + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
6470 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
6472 + int err = alloc_ldt(new, old->size, 0);
6475 + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
6476 + make_pages_readonly(
6478 + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
6479 + XENFEAT_writable_descriptor_tables);
6484 + * we do not have to muck with descriptors here; that is
6485 + * done in switch_mm() as needed.
6487 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
6489 + struct mm_struct * old_mm;
6492 + init_MUTEX(&mm->context.sem);
6493 + mm->context.size = 0;
6494 + mm->context.has_foreign_mappings = 0;
6495 + old_mm = current->mm;
6496 + if (old_mm && old_mm->context.size > 0) {
6497 + down(&old_mm->context.sem);
6498 + retval = copy_ldt(&mm->context, &old_mm->context);
6499 + up(&old_mm->context.sem);
6505 + * No need to lock the MM as we are the last user
6507 +void destroy_context(struct mm_struct *mm)
6509 + if (mm->context.size) {
6510 + if (mm == current->active_mm)
6512 + make_pages_writable(
6514 + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
6515 + XENFEAT_writable_descriptor_tables);
6516 + if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
6517 + vfree(mm->context.ldt);
6519 + kfree(mm->context.ldt);
6520 + mm->context.size = 0;
6524 +static int read_ldt(void __user * ptr, unsigned long bytecount)
6527 + unsigned long size;
6528 + struct mm_struct * mm = current->mm;
6530 + if (!mm->context.size)
6532 + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
6533 + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
6535 + down(&mm->context.sem);
6536 + size = mm->context.size*LDT_ENTRY_SIZE;
6537 + if (size > bytecount)
6541 + if (copy_to_user(ptr, mm->context.ldt, size))
6543 + up(&mm->context.sem);
6545 + goto error_return;
6546 + if (size != bytecount) {
6547 + /* zero-fill the rest */
6548 + if (clear_user(ptr+size, bytecount-size) != 0) {
6550 + goto error_return;
6558 +static int read_default_ldt(void __user * ptr, unsigned long bytecount)
6561 + unsigned long size;
6565 + address = &default_ldt[0];
6566 + size = 5*sizeof(struct desc_struct);
6567 + if (size > bytecount)
6571 + if (copy_to_user(ptr, address, size))
6577 +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
6579 + struct mm_struct * mm = current->mm;
6580 + __u32 entry_1, entry_2;
6582 + struct user_desc ldt_info;
6585 + if (bytecount != sizeof(ldt_info))
6588 + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
6592 + if (ldt_info.entry_number >= LDT_ENTRIES)
6594 + if (ldt_info.contents == 3) {
6597 + if (ldt_info.seg_not_present == 0)
6601 + down(&mm->context.sem);
6602 + if (ldt_info.entry_number >= mm->context.size) {
6603 + error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
6608 + /* Allow LDTs to be cleared by the user. */
6609 + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
6610 + if (oldmode || LDT_empty(&ldt_info)) {
6617 + entry_1 = LDT_entry_a(&ldt_info);
6618 + entry_2 = LDT_entry_b(&ldt_info);
6620 + entry_2 &= ~(1 << 20);
6622 + /* Install the new entry ... */
6624 + error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
6625 + entry_1, entry_2);
6628 + up(&mm->context.sem);
6633 +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
6635 + int ret = -ENOSYS;
6639 + ret = read_ldt(ptr, bytecount);
6642 + ret = write_ldt(ptr, bytecount, 1);
6645 + ret = read_default_ldt(ptr, bytecount);
6648 + ret = write_ldt(ptr, bytecount, 0);
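
sys_modify_ldt() above multiplexes on func: 0 reads the LDT, 1 writes an entry in the old format, 2 reads the default LDT, and 0x11 writes in the new format. User space reaches it through syscall(); a minimal sketch that reads back the current LDT (ldt_read.c is a hypothetical file name):

    /* ldt_read.c - read the current LDT through sys_modify_ldt (func 0). */
    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #define LDT_ENTRY_SIZE 8

    int main(void)
    {
        unsigned char buf[64 * LDT_ENTRY_SIZE];
        long n;

        memset(buf, 0, sizeof(buf));
        n = syscall(SYS_modify_ldt, 0 /* read */, buf, sizeof(buf));
        if (n < 0) {
            perror("modify_ldt");
            return 1;
        }
        printf("read %ld bytes of LDT (%ld entries)\n", n, n / LDT_ENTRY_SIZE);
        return 0;
    }
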
6653 Index: head-2008-11-25/arch/x86/kernel/microcode-xen.c
6654 ===================================================================
6655 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
6656 +++ head-2008-11-25/arch/x86/kernel/microcode-xen.c 2007-06-12 13:12:48.000000000 +0200
6659 + * Intel CPU Microcode Update Driver for Linux
6661 + * Copyright (C) 2000-2004 Tigran Aivazian
6663 + * This driver allows upgrading the microcode on Intel processors
6664 + * belonging to IA-32 family - PentiumPro, Pentium II,
6665 + * Pentium III, Xeon, Pentium 4, etc.
6667 + * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
6668 + * Order Number 245472 or free download from:
6670 + * http://developer.intel.com/design/pentium4/manuals/245472.htm
6672 + * For more information, go to http://www.urbanmyth.org/microcode
6674 + * This program is free software; you can redistribute it and/or
6675 + * modify it under the terms of the GNU General Public License
6676 + * as published by the Free Software Foundation; either version
6677 + * 2 of the License, or (at your option) any later version.
6680 +//#define DEBUG /* pr_debug */
6681 +#include <linux/capability.h>
6682 +#include <linux/kernel.h>
6683 +#include <linux/init.h>
6684 +#include <linux/sched.h>
6685 +#include <linux/cpumask.h>
6686 +#include <linux/module.h>
6687 +#include <linux/slab.h>
6688 +#include <linux/vmalloc.h>
6689 +#include <linux/miscdevice.h>
6690 +#include <linux/spinlock.h>
6691 +#include <linux/mm.h>
6692 +#include <linux/mutex.h>
6693 +#include <linux/syscalls.h>
6695 +#include <asm/msr.h>
6696 +#include <asm/uaccess.h>
6697 +#include <asm/processor.h>
6699 +MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
6700 +MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
6701 +MODULE_LICENSE("GPL");
6703 +static int verbose;
6704 +module_param(verbose, int, 0644);
6706 +#define MICROCODE_VERSION "1.14a-xen"
6708 +#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
6709 +#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
6710 +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
6712 +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
6713 +static DEFINE_MUTEX(microcode_mutex);
6715 +static int microcode_open (struct inode *unused1, struct file *unused2)
6717 + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
6721 +static int do_microcode_update (const void __user *ubuf, size_t len)
6726 + kbuf = vmalloc(len);
6730 + if (copy_from_user(kbuf, ubuf, len) == 0) {
6731 + struct xen_platform_op op;
6733 + op.cmd = XENPF_microcode_update;
6734 + set_xen_guest_handle(op.u.microcode.data, kbuf);
6735 + op.u.microcode.length = len;
6736 + err = HYPERVISOR_platform_op(&op);
6745 +static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
6749 + if (len < MC_HEADER_SIZE) {
6750 + printk(KERN_ERR "microcode: not enough data\n");
6754 + mutex_lock(&microcode_mutex);
6756 + ret = do_microcode_update(buf, len);
6758 + ret = (ssize_t)len;
6760 + mutex_unlock(&microcode_mutex);
6765 +static struct file_operations microcode_fops = {
6766 + .owner = THIS_MODULE,
6767 + .write = microcode_write,
6768 + .open = microcode_open,
6771 +static struct miscdevice microcode_dev = {
6772 + .minor = MICROCODE_MINOR,
6773 + .name = "microcode",
6774 + .fops = &microcode_fops,
6777 +static int __init microcode_init (void)
6781 + error = misc_register(&microcode_dev);
6784 + "microcode: can't misc_register on minor=%d\n",
6790 + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
6794 +static void __exit microcode_exit (void)
6796 + misc_deregister(&microcode_dev);
6799 +module_init(microcode_init)
6800 +module_exit(microcode_exit)
6801 +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
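A hedged userspace sketch of driving this device (not part of the patch; the image path is a placeholder): open /dev/cpu/microcode, which requires CAP_SYS_RAWIO per microcode_open() above, and hand the whole image to a single write(), which lands in do_microcode_update() and is forwarded via XENPF_microcode_update.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	static char buf[65536];	/* assumes the image fits in one chunk */
	ssize_t n;
	int in, dev;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <microcode-image>\n", argv[0]);
		return 1;
	}
	in = open(argv[1], O_RDONLY);
	dev = open("/dev/cpu/microcode", O_WRONLY);
	if (in < 0 || dev < 0) {
		perror("open");
		return 1;
	}
	n = read(in, buf, sizeof(buf));
	if (n <= 0 || write(dev, buf, n) != n) {
		perror("update");
		return 1;
	}
	close(in);
	close(dev);
	return 0;
}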
6802 Index: head-2008-11-25/arch/x86/kernel/mpparse_32-xen.c
6803 ===================================================================
6804 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
6805 +++ head-2008-11-25/arch/x86/kernel/mpparse_32-xen.c 2007-06-12 13:12:48.000000000 +0200
6808 + * Intel Multiprocessor Specification 1.1 and 1.4
6809 + * compliant MP-table parsing routines.
6811 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
6812 + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6815 + * Erich Boleyn : MP v1.4 and additional changes.
6816 + * Alan Cox : Added EBDA scanning
6817 + * Ingo Molnar : various cleanups and rewrites
6818 + * Maciej W. Rozycki: Bits for default MP configurations
6819 + * Paul Diefenbaugh: Added full ACPI support
6822 +#include <linux/mm.h>
6823 +#include <linux/init.h>
6824 +#include <linux/acpi.h>
6825 +#include <linux/delay.h>
6826 +#include <linux/bootmem.h>
6827 +#include <linux/smp_lock.h>
6828 +#include <linux/kernel_stat.h>
6829 +#include <linux/mc146818rtc.h>
6830 +#include <linux/bitops.h>
6832 +#include <asm/smp.h>
6833 +#include <asm/acpi.h>
6834 +#include <asm/mtrr.h>
6835 +#include <asm/mpspec.h>
6836 +#include <asm/io_apic.h>
6838 +#include <mach_apic.h>
6839 +#include <mach_mpparse.h>
6840 +#include <bios_ebda.h>
6842 +/* Have we found an MP table */
6843 +int smp_found_config;
6844 +unsigned int __initdata maxcpus = NR_CPUS;
6847 + * Various Linux-internal data structures created from the
6850 +int apic_version [MAX_APICS];
6851 +int mp_bus_id_to_type [MAX_MP_BUSSES];
6852 +int mp_bus_id_to_node [MAX_MP_BUSSES];
6853 +int mp_bus_id_to_local [MAX_MP_BUSSES];
6854 +int quad_local_to_mp_bus_id [NR_CPUS/4][4];
6855 +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
6856 +static int mp_current_pci_id;
6858 +/* I/O APIC entries */
6859 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
6861 +/* # of MP IRQ source entries */
6862 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6864 +/* MP IRQ source entries */
6865 +int mp_irq_entries;
6870 +unsigned long mp_lapic_addr;
6872 +unsigned int def_to_bigsmp = 0;
6874 +/* Processor that is doing the boot up */
6875 +unsigned int boot_cpu_physical_apicid = -1U;
6876 +/* Internal processor count */
6877 +static unsigned int __devinitdata num_processors;
6879 +/* Bitmask of physically existing CPUs */
6880 +physid_mask_t phys_cpu_present_map;
6882 +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
6885 + * Intel MP BIOS table parsing routines:
6890 + * Checksum an MP configuration block.
6893 +static int __init mpf_checksum(unsigned char *mp, int len)
6900 + return sum & 0xFF;
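The loop body of mpf_checksum() is elided above, but the rule it implements is the standard MP-spec one: a structure is valid iff all of its bytes sum to zero modulo 256. A standalone illustration (not from the patch; the 16-byte table is made up):

#include <stdio.h>

static int mp_checksum(const unsigned char *mp, int len)
{
	int sum = 0;
	while (len--)
		sum += *mp++;
	return sum & 0xFF;	/* 0 means valid, as the callers test */
}

int main(void)
{
	unsigned char table[16] = { '_', 'M', 'P', '_', 1, 4 };
	int i, sum = 0;

	for (i = 0; i < 15; i++)
		sum += table[i];
	table[15] = (unsigned char)-sum;	/* writer fixes up this byte */
	printf("checksum ok: %d\n", mp_checksum(table, 16) == 0);
	return 0;
}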
6904 + * Have to match translation table entries to main table entries by counter
6905 + * hence the mpc_record variable .... can't see a less disgusting way of
6909 +static int mpc_record;
6910 +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;
6913 +static void __devinit MP_processor_info (struct mpc_config_processor *m)
6916 + physid_mask_t phys_cpu;
6918 + if (!(m->mpc_cpuflag & CPU_ENABLED))
6921 + apicid = mpc_apic_id(m, translation_table[mpc_record]);
6923 + if (m->mpc_featureflag&(1<<0))
6924 + Dprintk(" Floating point unit present.\n");
6925 + if (m->mpc_featureflag&(1<<7))
6926 + Dprintk(" Machine Exception supported.\n");
6927 + if (m->mpc_featureflag&(1<<8))
6928 + Dprintk(" 64 bit compare & exchange supported.\n");
6929 + if (m->mpc_featureflag&(1<<9))
6930 + Dprintk(" Internal APIC present.\n");
6931 + if (m->mpc_featureflag&(1<<11))
6932 + Dprintk(" SEP present.\n");
6933 + if (m->mpc_featureflag&(1<<12))
6934 + Dprintk(" MTRR present.\n");
6935 + if (m->mpc_featureflag&(1<<13))
6936 + Dprintk(" PGE present.\n");
6937 + if (m->mpc_featureflag&(1<<14))
6938 + Dprintk(" MCA present.\n");
6939 + if (m->mpc_featureflag&(1<<15))
6940 + Dprintk(" CMOV present.\n");
6941 + if (m->mpc_featureflag&(1<<16))
6942 + Dprintk(" PAT present.\n");
6943 + if (m->mpc_featureflag&(1<<17))
6944 + Dprintk(" PSE present.\n");
6945 + if (m->mpc_featureflag&(1<<18))
6946 + Dprintk(" PSN present.\n");
6947 + if (m->mpc_featureflag&(1<<19))
6948 + Dprintk(" Cache Line Flush Instruction present.\n");
6950 + if (m->mpc_featureflag&(1<<21))
6951 + Dprintk(" Debug Trace and EMON Store present.\n");
6952 + if (m->mpc_featureflag&(1<<22))
6953 + Dprintk(" ACPI Thermal Throttle Registers present.\n");
6954 + if (m->mpc_featureflag&(1<<23))
6955 + Dprintk(" MMX present.\n");
6956 + if (m->mpc_featureflag&(1<<24))
6957 + Dprintk(" FXSR present.\n");
6958 + if (m->mpc_featureflag&(1<<25))
6959 + Dprintk(" XMM present.\n");
6960 + if (m->mpc_featureflag&(1<<26))
6961 + Dprintk(" Willamette New Instructions present.\n");
6962 + if (m->mpc_featureflag&(1<<27))
6963 + Dprintk(" Self Snoop present.\n");
6964 + if (m->mpc_featureflag&(1<<28))
6965 + Dprintk(" HT present.\n");
6966 + if (m->mpc_featureflag&(1<<29))
6967 + Dprintk(" Thermal Monitor present.\n");
6968 + /* 30, 31 Reserved */
6971 + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
6972 + Dprintk(" Bootup CPU\n");
6973 + boot_cpu_physical_apicid = m->mpc_apicid;
6976 + ver = m->mpc_apicver;
6979 + * Validate version
6982 + printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
6983 + "fixing up to 0x10. (tell your hw vendor)\n",
6987 + apic_version[m->mpc_apicid] = ver;
6989 + phys_cpu = apicid_to_cpu_present(apicid);
6990 + physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
6992 + if (num_processors >= NR_CPUS) {
6993 + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
6994 + " Processor ignored.\n", NR_CPUS);
6998 + if (num_processors >= maxcpus) {
6999 + printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
7000 + " Processor ignored.\n", maxcpus);
7004 + cpu_set(num_processors, cpu_possible_map);
7008 + * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
7009 + * but we need to work out other dependencies like SMP_SUSPEND etc.
7010 + * before this can be done without some confusion.
7011 + * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
7012 + * - Ashok Raj <ashok.raj@intel.com>
7014 + if (num_processors > 8) {
7015 + switch (boot_cpu_data.x86_vendor) {
7016 + case X86_VENDOR_INTEL:
7017 + if (!APIC_XAPIC(ver)) {
7018 + def_to_bigsmp = 0;
7021 + /* If P4 and above fall through */
7022 + case X86_VENDOR_AMD:
7023 + def_to_bigsmp = 1;
7026 + bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
7029 +void __init MP_processor_info (struct mpc_config_processor *m)
7033 +#endif /* CONFIG_XEN */
7035 +static void __init MP_bus_info (struct mpc_config_bus *m)
7039 + memcpy(str, m->mpc_bustype, 6);
7042 + mpc_oem_bus_info(m, str, translation_table[mpc_record]);
7044 + if (m->mpc_busid >= MAX_MP_BUSSES) {
7045 + printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
7046 + " is too large, max. supported is %d\n",
7047 + m->mpc_busid, str, MAX_MP_BUSSES - 1);
7051 + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
7052 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
7053 + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
7054 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
7055 + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
7056 + mpc_oem_pci_bus(m, translation_table[mpc_record]);
7057 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
7058 + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
7059 + mp_current_pci_id++;
7060 + } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
7061 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
7062 + } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
7063 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
7065 + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
7069 +static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
7071 + if (!(m->mpc_flags & MPC_APIC_USABLE))
7074 + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
7075 + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
7076 + if (nr_ioapics >= MAX_IO_APICS) {
7077 + printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
7078 + MAX_IO_APICS, nr_ioapics);
7079 + panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
7081 + if (!m->mpc_apicaddr) {
7082 + printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
7083 + " found in MP table, skipping!\n");
7086 + mp_ioapics[nr_ioapics] = *m;
7090 +static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
7092 + mp_irqs [mp_irq_entries] = *m;
7093 + Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
7094 + " IRQ %02x, APIC ID %x, APIC INT %02x\n",
7095 + m->mpc_irqtype, m->mpc_irqflag & 3,
7096 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
7097 + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
7098 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
7099 + panic("Max # of irq sources exceeded!!\n");
7102 +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
7104 + Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
7105 + " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
7106 + m->mpc_irqtype, m->mpc_irqflag & 3,
7107 + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
7108 + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
7110 + * Well it seems all SMP boards in existence
7111 + * use ExtINT/LVT1 == LINT0 and
7112 + * NMI/LVT2 == LINT1 - the following check
7113 + * will show us if this assumption is false.
7114 + * Until then we do not have to add baggage.
7116 + if ((m->mpc_irqtype == mp_ExtINT) &&
7117 + (m->mpc_destapiclint != 0))
7119 + if ((m->mpc_irqtype == mp_NMI) &&
7120 + (m->mpc_destapiclint != 1))
7124 +#ifdef CONFIG_X86_NUMAQ
7125 +static void __init MP_translation_info (struct mpc_config_translation *m)
7127 + printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
7129 + if (mpc_record >= MAX_MPC_ENTRY)
7130 + printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
7132 + translation_table[mpc_record] = m; /* stash this for later */
7133 + if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
7134 + node_set_online(m->trans_quad);
7138 + * Read/parse the MPC oem tables
7141 +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
7142 + unsigned short oemsize)
7144 + int count = sizeof (*oemtable); /* the header size */
7145 + unsigned char *oemptr = ((unsigned char *)oemtable)+count;
7148 + printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
7149 + if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
7151 + printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
7152 + oemtable->oem_signature[0],
7153 + oemtable->oem_signature[1],
7154 + oemtable->oem_signature[2],
7155 + oemtable->oem_signature[3]);
7158 + if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
7160 + printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
7163 + while (count < oemtable->oem_length) {
7164 + switch (*oemptr) {
7165 + case MP_TRANSLATION:
7167 + struct mpc_config_translation *m=
7168 + (struct mpc_config_translation *)oemptr;
7169 + MP_translation_info(m);
7170 + oemptr += sizeof(*m);
7171 + count += sizeof(*m);
7177 + printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
7184 +static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
7187 + if (strncmp(oem, "IBM NUMA", 8))
7188 + printk("Warning! May not be a NUMA-Q system!\n");
7189 + if (mpc->mpc_oemptr)
7190 + smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
7191 + mpc->mpc_oemsize);
7193 +#endif /* CONFIG_X86_NUMAQ */
7196 + * Read/parse the MPC
7199 +static int __init smp_read_mpc(struct mp_config_table *mpc)
7203 + int count=sizeof(*mpc);
7204 + unsigned char *mpt=((unsigned char *)mpc)+count;
7206 + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
7207 + printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
7208 + *(u32 *)mpc->mpc_signature);
7211 + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
7212 + printk(KERN_ERR "SMP mptable: checksum error!\n");
7215 + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
7216 + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
7220 + if (!mpc->mpc_lapic) {
7221 + printk(KERN_ERR "SMP mptable: null local APIC address!\n");
7224 + memcpy(oem,mpc->mpc_oem,8);
7226 + printk(KERN_INFO "OEM ID: %s ",oem);
7228 + memcpy(str,mpc->mpc_productid,12);
7230 + printk("Product ID: %s ",str);
7232 + mps_oem_check(mpc, oem, str);
7234 + printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
7237 + * Save the local APIC address (it might be non-default) -- but only
7238 + * if we're not using ACPI.
7241 + mp_lapic_addr = mpc->mpc_lapic;
7244 + * Now process the configuration blocks.
7247 + while (count < mpc->mpc_length) {
7249 + case MP_PROCESSOR:
7251 + struct mpc_config_processor *m=
7252 + (struct mpc_config_processor *)mpt;
7253 + /* ACPI may have already provided this data */
7255 + MP_processor_info(m);
7256 + mpt += sizeof(*m);
7257 + count += sizeof(*m);
7262 + struct mpc_config_bus *m=
7263 + (struct mpc_config_bus *)mpt;
7265 + mpt += sizeof(*m);
7266 + count += sizeof(*m);
7271 + struct mpc_config_ioapic *m=
7272 + (struct mpc_config_ioapic *)mpt;
7273 + MP_ioapic_info(m);
7275 + count+=sizeof(*m);
7280 + struct mpc_config_intsrc *m=
7281 + (struct mpc_config_intsrc *)mpt;
7283 + MP_intsrc_info(m);
7285 + count+=sizeof(*m);
7290 + struct mpc_config_lintsrc *m=
7291 + (struct mpc_config_lintsrc *)mpt;
7292 + MP_lintsrc_info(m);
7294 + count+=sizeof(*m);
7299 + count = mpc->mpc_length;
7305 + clustered_apic_check();
7306 + if (!num_processors)
7307 + printk(KERN_ERR "SMP mptable: no processors registered!\n");
7308 + return num_processors;
7311 +static int __init ELCR_trigger(unsigned int irq)
7313 + unsigned int port;
7315 + port = 0x4d0 + (irq >> 3);
7316 + return (inb(port) >> (irq & 7)) & 1;
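ELCR_trigger() above reads the chipset's Edge/Level Control Registers: IRQs 0-7 live in port 0x4d0 and IRQs 8-15 in port 0x4d1, one bit per IRQ, with a set bit meaning level-triggered. A tiny sketch of just the addressing math (userspace illustration, no actual port I/O):

#include <stdio.h>

int main(void)
{
	unsigned int irq;

	for (irq = 0; irq < 16; irq++)
		printf("IRQ%-2u -> port 0x%x, bit %u\n",
		       irq, 0x4d0 + (irq >> 3), irq & 7);
	return 0;
}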
7319 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
7321 + struct mpc_config_intsrc intsrc;
7323 + int ELCR_fallback = 0;
7325 + intsrc.mpc_type = MP_INTSRC;
7326 + intsrc.mpc_irqflag = 0; /* conforming */
7327 + intsrc.mpc_srcbus = 0;
7328 + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
7330 + intsrc.mpc_irqtype = mp_INT;
7333 + * If true, we have an ISA/PCI system with no IRQ entries
7334 + * in the MP table. To prevent the PCI interrupts from being set up
7335 + * incorrectly, we try to use the ELCR. The sanity check to see if
7336 + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
7337 + * never be level sensitive, so we simply see if the ELCR agrees.
7338 + * If it does, we assume it's valid.
7340 + if (mpc_default_type == 5) {
7341 + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
7343 + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
7344 + printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
7346 + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
7347 + ELCR_fallback = 1;
7351 + for (i = 0; i < 16; i++) {
7352 + switch (mpc_default_type) {
7354 + if (i == 0 || i == 13)
7355 + continue; /* IRQ0 & IRQ13 not connected */
7356 + /* fall through */
7359 + continue; /* IRQ2 is never connected */
7362 + if (ELCR_fallback) {
7364 + * If the ELCR indicates a level-sensitive interrupt, we
7365 + * copy that information over to the MP table in the
7366 + * irqflag field (level sensitive, active high polarity).
7368 + if (ELCR_trigger(i))
7369 + intsrc.mpc_irqflag = 13;
7371 + intsrc.mpc_irqflag = 0;
7374 + intsrc.mpc_srcbusirq = i;
7375 + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
7376 + MP_intsrc_info(&intsrc);
7379 + intsrc.mpc_irqtype = mp_ExtINT;
7380 + intsrc.mpc_srcbusirq = 0;
7381 + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
7382 + MP_intsrc_info(&intsrc);
7385 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
7387 + struct mpc_config_processor processor;
7388 + struct mpc_config_bus bus;
7389 + struct mpc_config_ioapic ioapic;
7390 + struct mpc_config_lintsrc lintsrc;
7391 + int linttypes[2] = { mp_ExtINT, mp_NMI };
7395 + * local APIC has default address
7397 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
7400 + * 2 CPUs, numbered 0 & 1.
7402 + processor.mpc_type = MP_PROCESSOR;
7403 + /* Either an integrated APIC or a discrete 82489DX. */
7404 + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
7405 + processor.mpc_cpuflag = CPU_ENABLED;
7406 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
7407 + (boot_cpu_data.x86_model << 4) |
7408 + boot_cpu_data.x86_mask;
7409 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
7410 + processor.mpc_reserved[0] = 0;
7411 + processor.mpc_reserved[1] = 0;
7412 + for (i = 0; i < 2; i++) {
7413 + processor.mpc_apicid = i;
7414 + MP_processor_info(&processor);
7417 + bus.mpc_type = MP_BUS;
7418 + bus.mpc_busid = 0;
7419 + switch (mpc_default_type) {
7422 + printk(KERN_ERR "Unknown standard configuration %d\n",
7423 + mpc_default_type);
7424 + /* fall through */
7427 + memcpy(bus.mpc_bustype, "ISA ", 6);
7432 + memcpy(bus.mpc_bustype, "EISA ", 6);
7436 + memcpy(bus.mpc_bustype, "MCA ", 6);
7438 + MP_bus_info(&bus);
7439 + if (mpc_default_type > 4) {
7440 + bus.mpc_busid = 1;
7441 + memcpy(bus.mpc_bustype, "PCI ", 6);
7442 + MP_bus_info(&bus);
7445 + ioapic.mpc_type = MP_IOAPIC;
7446 + ioapic.mpc_apicid = 2;
7447 + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
7448 + ioapic.mpc_flags = MPC_APIC_USABLE;
7449 + ioapic.mpc_apicaddr = 0xFEC00000;
7450 + MP_ioapic_info(&ioapic);
7453 + * We set up most of the low 16 IO-APIC pins according to MPS rules.
7455 + construct_default_ioirq_mptable(mpc_default_type);
7457 + lintsrc.mpc_type = MP_LINTSRC;
7458 + lintsrc.mpc_irqflag = 0; /* conforming */
7459 + lintsrc.mpc_srcbusid = 0;
7460 + lintsrc.mpc_srcbusirq = 0;
7461 + lintsrc.mpc_destapic = MP_APIC_ALL;
7462 + for (i = 0; i < 2; i++) {
7463 + lintsrc.mpc_irqtype = linttypes[i];
7464 + lintsrc.mpc_destapiclint = i;
7465 + MP_lintsrc_info(&lintsrc);
7469 +static struct intel_mp_floating *mpf_found;
7472 + * Scan the memory blocks for an SMP configuration block.
7474 +void __init get_smp_config (void)
7476 + struct intel_mp_floating *mpf = mpf_found;
7479 + * ACPI supports both logical (e.g. Hyper-Threading) and physical
7480 + * processors, where MPS only supports physical.
7482 + if (acpi_lapic && acpi_ioapic) {
7483 + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
7486 + else if (acpi_lapic)
7487 + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
7489 + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
7490 + if (mpf->mpf_feature2 & (1<<7)) {
7491 + printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
7494 + printk(KERN_INFO " Virtual Wire compatibility mode.\n");
7499 + * Now see if we need to read further.
7501 + if (mpf->mpf_feature1 != 0) {
7503 + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
7504 + construct_default_ISA_mptable(mpf->mpf_feature1);
7506 + } else if (mpf->mpf_physptr) {
7509 + * Read the physical hardware table. Anything here will
7510 + * override the defaults.
7512 + if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
7513 + smp_found_config = 0;
7514 + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
7515 + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
7519 + * If there are no explicit MP IRQ entries, then we are
7520 + * broken. We set up most of the low 16 IO-APIC pins to
7521 + * ISA defaults and hope it will work.
7523 + if (!mp_irq_entries) {
7524 + struct mpc_config_bus bus;
7526 + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
7528 + bus.mpc_type = MP_BUS;
7529 + bus.mpc_busid = 0;
7530 + memcpy(bus.mpc_bustype, "ISA ", 6);
7531 + MP_bus_info(&bus);
7533 + construct_default_ioirq_mptable(0);
7539 + printk(KERN_INFO "Processors: %d\n", num_processors);
7541 + * Only use the first configuration found.
7545 +static int __init smp_scan_config (unsigned long base, unsigned long length)
7547 + unsigned long *bp = isa_bus_to_virt(base);
7548 + struct intel_mp_floating *mpf;
7550 + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
7551 + if (sizeof(*mpf) != 16)
7552 + printk("Error: MPF size\n");
7554 + while (length > 0) {
7555 + mpf = (struct intel_mp_floating *)bp;
7556 + if ((*bp == SMP_MAGIC_IDENT) &&
7557 + (mpf->mpf_length == 1) &&
7558 + !mpf_checksum((unsigned char *)bp, 16) &&
7559 + ((mpf->mpf_specification == 1)
7560 + || (mpf->mpf_specification == 4)) ) {
7562 + smp_found_config = 1;
7564 + printk(KERN_INFO "found SMP MP-table at %08lx\n",
7565 + virt_to_phys(mpf));
7566 + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
7567 + if (mpf->mpf_physptr) {
7569 + * We cannot access the MPC table to compute
7570 + * its size yet, as only a few megabytes from
7571 + * the bottom are mapped now.
7572 + * The PC-9800's MPC table lies at the very end
7573 + * of physical memory, so simply reserving
7574 + * PAGE_SIZE from mpf->mpf_physptr would BUG()
7575 + * in reserve_bootmem.
7577 + unsigned long size = PAGE_SIZE;
7578 + unsigned long end = max_low_pfn * PAGE_SIZE;
7579 + if (mpf->mpf_physptr + size > end)
7580 + size = end - mpf->mpf_physptr;
7581 + reserve_bootmem(mpf->mpf_physptr, size);
7584 + printk(KERN_INFO "found SMP MP-table at %08lx\n",
7585 + ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
7597 +void __init find_smp_config (void)
7600 + unsigned int address;
7604 + * FIXME: Linux assumes you have 640K of base ram..
7605 + * this continues the error...
7607 + * 1) Scan the bottom 1K for a signature
7608 + * 2) Scan the top 1K of base RAM
7609 + * 3) Scan the 64K of bios
7611 + if (smp_scan_config(0x0,0x400) ||
7612 + smp_scan_config(639*0x400,0x400) ||
7613 + smp_scan_config(0xF0000,0x10000))
7616 + * If it is an SMP machine we should know now, unless the
7617 + * configuration is in an EISA/MCA bus machine with an
7618 + * extended bios data area.
7620 + * there is a real-mode segmented pointer pointing to the
7621 + * 4K EBDA area at 0x40E, calculate and scan it here.
7623 + * NOTE! There are Linux loaders that will corrupt the EBDA
7624 + * area, and as such this kind of SMP config may be less
7625 + * trustworthy, simply because the SMP table may have been
7626 + * stomped on during early boot. These loaders are buggy and
7627 + * should be fixed.
7629 + * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
7633 + address = get_bios_ebda();
7635 + smp_scan_config(address, 0x400);
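get_bios_ebda() above resolves the real-mode segment word that the BIOS stores at physical address 0x40E; shifting it left by four gives the physical base of the Extended BIOS Data Area. A worked example of that arithmetic (the segment value is hypothetical):

#include <stdio.h>

int main(void)
{
	unsigned short ebda_seg = 0x9fc0;	/* made-up word read from 0x40E */
	unsigned long ebda_addr = (unsigned long)ebda_seg << 4;

	printf("EBDA at 0x%lx\n", ebda_addr);	/* 0x9fc00, just below 640K */
	return 0;
}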
7641 +/* --------------------------------------------------------------------------
7642 + ACPI-based MP Configuration
7643 + -------------------------------------------------------------------------- */
7647 +void __init mp_register_lapic_address (
7651 + mp_lapic_addr = (unsigned long) address;
7653 + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
7655 + if (boot_cpu_physical_apicid == -1U)
7656 + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
7658 + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
7663 +void __devinit mp_register_lapic (
7667 + struct mpc_config_processor processor;
7670 + if (MAX_APICS - id <= 0) {
7671 + printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
7676 + if (id == boot_cpu_physical_apicid)
7680 + processor.mpc_type = MP_PROCESSOR;
7681 + processor.mpc_apicid = id;
7682 + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
7683 + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
7684 + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
7685 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
7686 + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
7687 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
7688 + processor.mpc_reserved[0] = 0;
7689 + processor.mpc_reserved[1] = 0;
7692 + MP_processor_info(&processor);
7695 +#ifdef CONFIG_X86_IO_APIC
7697 +#define MP_ISA_BUS 0
7698 +#define MP_MAX_IOAPIC_PIN 127
7700 +static struct mp_ioapic_routing {
7704 + u32 pin_programmed[4];
7705 +} mp_ioapic_routing[MAX_IO_APICS];
7708 +static int mp_find_ioapic (
7713 + /* Find the IOAPIC that manages this GSI. */
7714 + for (i = 0; i < nr_ioapics; i++) {
7715 + if ((gsi >= mp_ioapic_routing[i].gsi_base)
7716 + && (gsi <= mp_ioapic_routing[i].gsi_end))
7720 + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
7726 +void __init mp_register_ioapic (
7734 + if (nr_ioapics >= MAX_IO_APICS) {
7735 + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
7736 + "(found %d)\n", MAX_IO_APICS, nr_ioapics);
7737 + panic("Recompile kernel with bigger MAX_IO_APICS!\n");
7740 + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
7741 + " found in MADT table, skipping!\n");
7745 + idx = nr_ioapics++;
7747 + mp_ioapics[idx].mpc_type = MP_IOAPIC;
7748 + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
7749 + mp_ioapics[idx].mpc_apicaddr = address;
7752 + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
7754 + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
7755 + && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
7756 + tmpid = io_apic_get_unique_id(idx, id);
7759 + if (tmpid == -1) {
7763 + mp_ioapics[idx].mpc_apicid = tmpid;
7764 + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
7767 + * Build basic GSI lookup table to facilitate gsi->io_apic lookups
7768 + * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
7770 + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
7771 + mp_ioapic_routing[idx].gsi_base = gsi_base;
7772 + mp_ioapic_routing[idx].gsi_end = gsi_base +
7773 + io_apic_get_redir_entries(idx);
7775 + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
7776 + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
7777 + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
7778 + mp_ioapic_routing[idx].gsi_base,
7779 + mp_ioapic_routing[idx].gsi_end);
7785 +void __init mp_override_legacy_irq (
7791 + struct mpc_config_intsrc intsrc;
7796 + * Convert 'gsi' to 'ioapic.pin'.
7798 + ioapic = mp_find_ioapic(gsi);
7801 + pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
7804 + * TBD: This check is for faulty timer entries, where the override
7805 + * erroneously sets the trigger to level, resulting in a HUGE
7806 + * increase of timer interrupts!
7808 + if ((bus_irq == 0) && (trigger == 3))
7811 + intsrc.mpc_type = MP_INTSRC;
7812 + intsrc.mpc_irqtype = mp_INT;
7813 + intsrc.mpc_irqflag = (trigger << 2) | polarity;
7814 + intsrc.mpc_srcbus = MP_ISA_BUS;
7815 + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
7816 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
7817 + intsrc.mpc_dstirq = pin; /* INTIN# */
7819 + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
7820 + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
7821 + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
7822 + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
7824 + mp_irqs[mp_irq_entries] = intsrc;
7825 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
7826 + panic("Max # of irq sources exceeded!\n");
7831 +void __init mp_config_acpi_legacy_irqs (void)
7833 + struct mpc_config_intsrc intsrc;
7838 + * Fabricate the legacy ISA bus (bus #31).
7840 + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
7841 + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
7844 + * Older generations of ES7000 have no legacy identity mappings
7846 + if (es7000_plat == 1)
7850 + * Locate the IOAPIC that manages the ISA IRQs (0-15).
7852 + ioapic = mp_find_ioapic(0);
7856 + intsrc.mpc_type = MP_INTSRC;
7857 + intsrc.mpc_irqflag = 0; /* Conforming */
7858 + intsrc.mpc_srcbus = MP_ISA_BUS;
7859 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
7862 + * Use the default configuration for IRQs 0-15, unless
7863 + * overridden by (MADT) interrupt source override entries.
7865 + for (i = 0; i < 16; i++) {
7868 + for (idx = 0; idx < mp_irq_entries; idx++) {
7869 + struct mpc_config_intsrc *irq = mp_irqs + idx;
7871 + /* Do we already have a mapping for this ISA IRQ? */
7872 + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
7875 + /* Do we already have a mapping for this IOAPIC pin */
7876 + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
7877 + (irq->mpc_dstirq == i))
7881 + if (idx != mp_irq_entries) {
7882 + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
7883 + continue; /* IRQ already used */
7886 + intsrc.mpc_irqtype = mp_INT;
7887 + intsrc.mpc_srcbusirq = i; /* Identity mapped */
7888 + intsrc.mpc_dstirq = i;
7890 + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
7891 + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
7892 + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
7893 + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
7894 + intsrc.mpc_dstirq);
7896 + mp_irqs[mp_irq_entries] = intsrc;
7897 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
7898 + panic("Max # of irq sources exceeded!\n");
7902 +#define MAX_GSI_NUM 4096
7904 +int mp_register_gsi (u32 gsi, int triggering, int polarity)
7907 + int ioapic_pin = 0;
7909 + static int pci_irq = 16;
7911 + * Mapping between Global System Interrupts, which
7912 + * represent all possible interrupts, and IRQs
7913 + * assigned to actual devices.
7915 + static int gsi_to_irq[MAX_GSI_NUM];
7917 + /* Don't set up the ACPI SCI because it's already set up */
7918 + if (acpi_fadt.sci_int == gsi)
7921 + ioapic = mp_find_ioapic(gsi);
7923 + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
7927 + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
7929 + if (ioapic_renumber_irq)
7930 + gsi = ioapic_renumber_irq(ioapic, gsi);
7933 + * Avoid pin reprogramming. PRTs typically include entries
7934 + * with redundant pin->gsi mappings (but unique PCI devices);
7935 + * we only program the IOAPIC on the first.
7937 + bit = ioapic_pin % 32;
7938 + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
7940 + printk(KERN_ERR "Invalid reference to IOAPIC pin "
7941 + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
7945 + if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
7946 + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
7947 + mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
7948 + return gsi_to_irq[gsi];
7951 + mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
7953 + if (triggering == ACPI_LEVEL_SENSITIVE) {
7955 + * For PCI devices assign IRQs in order, avoiding gaps
7956 + * due to unused I/O APIC pins.
7959 + if (gsi < MAX_GSI_NUM) {
7961 + * Retain the VIA chipset work-around (gsi > 15), but
7962 + * avoid a problem where the 8254 timer (IRQ0) is set up
7963 + * via an override (so it's not on pin 0 of the ioapic),
7964 + * and at the same time, the pin 0 interrupt is a PCI
7965 + * type. The gsi > 15 test could cause these two pins
7966 + * to be shared as IRQ0, and they are not shareable.
7967 + * So test for this condition, and if necessary, avoid
7968 + * the pin collision.
7970 + if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
7973 + * Don't assign IRQ used by ACPI SCI
7975 + if (gsi == acpi_fadt.sci_int)
7977 + gsi_to_irq[irq] = gsi;
7979 + printk(KERN_ERR "GSI %u is too high\n", gsi);
7984 + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
7985 + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
7986 + polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
7990 +#endif /* CONFIG_X86_IO_APIC */
7991 +#endif /* CONFIG_ACPI */
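The pin_programmed bookkeeping in mp_register_gsi() above packs 128 IOAPIC pins (MP_MAX_IOAPIC_PIN + 1) into four 32-bit words; note that the (ioapic_pin < 32) ? 0 : (ioapic_pin / 32) index is just pin / 32. A self-contained sketch of the test-and-set it performs (not from the patch):

#include <stdint.h>
#include <stdio.h>

static uint32_t pin_programmed[4];	/* 4 * 32 = 128 pins */

static int pin_test_and_set(unsigned int pin)
{
	unsigned int idx = pin / 32, bit = pin % 32;

	if (pin_programmed[idx] & (1u << bit))
		return 1;		/* already programmed: skip */
	pin_programmed[idx] |= 1u << bit;
	return 0;
}

int main(void)
{
	/* A second registration of the same pin is refused: prints "0 1". */
	printf("%d %d\n", pin_test_and_set(35), pin_test_and_set(35));
	return 0;
}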
7992 Index: head-2008-11-25/arch/x86/kernel/pci-dma-xen.c
7993 ===================================================================
7994 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
7995 +++ head-2008-11-25/arch/x86/kernel/pci-dma-xen.c 2008-10-29 09:55:56.000000000 +0100
7998 + * Dynamic DMA mapping support.
8000 + * On i386 there is no hardware dynamic DMA address translation,
8001 + * so consistent alloc/free are merely page allocation/freeing.
8002 + * The rest of the dynamic DMA mapping interface is implemented
8006 +#include <linux/types.h>
8007 +#include <linux/mm.h>
8008 +#include <linux/string.h>
8009 +#include <linux/pci.h>
8010 +#include <linux/module.h>
8011 +#include <linux/version.h>
8012 +#include <asm/io.h>
8013 +#include <xen/balloon.h>
8014 +#include <xen/gnttab.h>
8015 +#include <asm/swiotlb.h>
8016 +#include <asm/tlbflush.h>
8017 +#include <asm-i386/mach-xen/asm/swiotlb.h>
8018 +#include <asm-i386/mach-xen/asm/gnttab_dma.h>
8019 +#include <asm/bug.h>
8022 +#include <asm/proto.h>
8024 +int iommu_merge __read_mostly = 0;
8025 +EXPORT_SYMBOL(iommu_merge);
8027 +dma_addr_t bad_dma_address __read_mostly;
8028 +EXPORT_SYMBOL(bad_dma_address);
8030 +/* This tells the BIO block layer to assume merging. Default to off
8031 + because we cannot guarantee merging later. */
8032 +int iommu_bio_merge __read_mostly = 0;
8033 +EXPORT_SYMBOL(iommu_bio_merge);
8035 +int force_iommu __read_mostly= 0;
8037 +__init int iommu_setup(char *p)
8042 +void __init pci_iommu_alloc(void)
8044 +#ifdef CONFIG_SWIOTLB
8045 + pci_swiotlb_init();
8049 +static int __init pci_iommu_init(void)
8055 +/* Must execute after PCI subsystem */
8056 +fs_initcall(pci_iommu_init);
8059 +struct dma_coherent_mem {
8064 + unsigned long *bitmap;
8067 +#define IOMMU_BUG_ON(test) \
8069 + if (unlikely(test)) { \
8070 + printk(KERN_ALERT "Fatal DMA error! " \
8071 + "Please use 'swiotlb=force'\n"); \
8076 +static int check_pages_physically_contiguous(unsigned long pfn,
8077 + unsigned int offset,
8080 + unsigned long next_mfn;
8084 + next_mfn = pfn_to_mfn(pfn);
8085 + nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
8087 + for (i = 1; i < nr_pages; i++) {
8088 + if (pfn_to_mfn(++pfn) != ++next_mfn)
8094 +int range_straddles_page_boundary(paddr_t p, size_t size)
8096 + unsigned long pfn = p >> PAGE_SHIFT;
8097 + unsigned int offset = p & ~PAGE_MASK;
8099 + return ((offset + size > PAGE_SIZE) &&
8100 + !check_pages_physically_contiguous(pfn, offset, size));
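range_straddles_page_boundary() above is the Xen-specific twist on DMA validity: a buffer crossing a pseudo-physical page boundary is only safe when the underlying machine frames are consecutive. A standalone sketch of the same logic with pfn_to_mfn() stubbed as an identity map (so every range comes out contiguous):

#include <stddef.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

static unsigned long pfn_to_mfn(unsigned long pfn) { return pfn; }	/* stub */

static int pages_machine_contiguous(unsigned long pfn, unsigned int offset,
				    size_t length)
{
	unsigned long next_mfn = pfn_to_mfn(pfn);
	unsigned long nr_pages = (offset + length + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long i;

	for (i = 1; i < nr_pages; i++)
		if (pfn_to_mfn(++pfn) != ++next_mfn)
			return 0;
	return 1;
}

int main(void)
{
	unsigned long p = 0x1fff;	/* last byte of page 1 */
	size_t size = 2;		/* spills into page 2 */
	unsigned int offset = p & ~PAGE_MASK;
	int straddles = (offset + size > PAGE_SIZE) &&
			!pages_machine_contiguous(p >> PAGE_SHIFT, offset, size);

	printf("straddles=%d\n", straddles);	/* 0 under the identity stub */
	return 0;
}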
8104 +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
8105 + enum dma_data_direction direction)
8109 + if (direction == DMA_NONE)
8111 + WARN_ON(nents == 0 || sg[0].length == 0);
8114 + rc = swiotlb_map_sg(hwdev, sg, nents, direction);
8116 + for (i = 0; i < nents; i++ ) {
8117 + BUG_ON(!sg[i].page);
8118 + sg[i].dma_address =
8119 + gnttab_dma_map_page(sg[i].page) + sg[i].offset;
8120 + sg[i].dma_length = sg[i].length;
8121 + IOMMU_BUG_ON(address_needs_mapping(
8122 + hwdev, sg[i].dma_address));
8123 + IOMMU_BUG_ON(range_straddles_page_boundary(
8124 + page_to_pseudophys(sg[i].page) + sg[i].offset,
8130 + flush_write_buffers();
8133 +EXPORT_SYMBOL(dma_map_sg);
8136 +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
8137 + enum dma_data_direction direction)
8141 + BUG_ON(direction == DMA_NONE);
8143 + swiotlb_unmap_sg(hwdev, sg, nents, direction);
8145 + for (i = 0; i < nents; i++ )
8146 + gnttab_dma_unmap_page(sg[i].dma_address);
8149 +EXPORT_SYMBOL(dma_unmap_sg);
8151 +#ifdef CONFIG_HIGHMEM
8153 +dma_map_page(struct device *dev, struct page *page, unsigned long offset,
8154 + size_t size, enum dma_data_direction direction)
8156 + dma_addr_t dma_addr;
8158 + BUG_ON(direction == DMA_NONE);
8161 + dma_addr = swiotlb_map_page(
8162 + dev, page, offset, size, direction);
8164 + dma_addr = gnttab_dma_map_page(page) + offset;
8165 + IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr));
8170 +EXPORT_SYMBOL(dma_map_page);
8173 +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
8174 + enum dma_data_direction direction)
8176 + BUG_ON(direction == DMA_NONE);
8178 + swiotlb_unmap_page(dev, dma_address, size, direction);
8180 + gnttab_dma_unmap_page(dma_address);
8182 +EXPORT_SYMBOL(dma_unmap_page);
8183 +#endif /* CONFIG_HIGHMEM */
8186 +dma_mapping_error(dma_addr_t dma_addr)
8189 + return swiotlb_dma_mapping_error(dma_addr);
8192 +EXPORT_SYMBOL(dma_mapping_error);
8195 +dma_supported(struct device *dev, u64 mask)
8198 + return swiotlb_dma_supported(dev, mask);
8200 + * By default we'll BUG when an infeasible DMA is requested, and
8201 + * request swiotlb=force (see IOMMU_BUG_ON).
8205 +EXPORT_SYMBOL(dma_supported);
8207 +void *dma_alloc_coherent(struct device *dev, size_t size,
8208 + dma_addr_t *dma_handle, gfp_t gfp)
8211 + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
8212 + unsigned int order = get_order(size);
8213 + unsigned long vstart;
8216 + /* ignore region specifiers */
8217 + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
8220 + int page = bitmap_find_free_region(mem->bitmap, mem->size,
8223 + *dma_handle = mem->device_base + (page << PAGE_SHIFT);
8224 + ret = mem->virt_base + (page << PAGE_SHIFT);
8225 + memset(ret, 0, size);
8228 + if (mem->flags & DMA_MEMORY_EXCLUSIVE)
8232 + if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
8235 + vstart = __get_free_pages(gfp, order);
8236 + ret = (void *)vstart;
8238 + if (dev != NULL && dev->coherent_dma_mask)
8239 + mask = dev->coherent_dma_mask;
8241 + mask = 0xffffffff;
8243 + if (ret != NULL) {
8244 + if (xen_create_contiguous_region(vstart, order,
8245 + fls64(mask)) != 0) {
8246 + free_pages(vstart, order);
8249 + memset(ret, 0, size);
8250 + *dma_handle = virt_to_bus(ret);
8254 +EXPORT_SYMBOL(dma_alloc_coherent);
8256 +void dma_free_coherent(struct device *dev, size_t size,
8257 + void *vaddr, dma_addr_t dma_handle)
8259 + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
8260 + int order = get_order(size);
8262 + if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
8263 + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
8265 + bitmap_release_region(mem->bitmap, page, order);
8267 + xen_destroy_contiguous_region((unsigned long)vaddr, order);
8268 + free_pages((unsigned long)vaddr, order);
8271 +EXPORT_SYMBOL(dma_free_coherent);
8273 +#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
8274 +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
8275 + dma_addr_t device_addr, size_t size, int flags)
8277 + void __iomem *mem_base;
8278 + int pages = size >> PAGE_SHIFT;
8279 + int bitmap_size = (pages + 31)/32;
8281 + if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
8288 + /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
8290 + mem_base = ioremap(bus_addr, size);
8294 + dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
8295 + if (!dev->dma_mem)
8297 + memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
8298 + dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
8299 + if (!dev->dma_mem->bitmap)
8301 + memset(dev->dma_mem->bitmap, 0, bitmap_size);
8303 + dev->dma_mem->virt_base = mem_base;
8304 + dev->dma_mem->device_base = device_addr;
8305 + dev->dma_mem->size = pages;
8306 + dev->dma_mem->flags = flags;
8308 + if (flags & DMA_MEMORY_MAP)
8309 + return DMA_MEMORY_MAP;
8311 + return DMA_MEMORY_IO;
8314 + kfree(dev->dma_mem->bitmap);
8318 +EXPORT_SYMBOL(dma_declare_coherent_memory);
8320 +void dma_release_declared_memory(struct device *dev)
8322 + struct dma_coherent_mem *mem = dev->dma_mem;
8326 + dev->dma_mem = NULL;
8327 + iounmap(mem->virt_base);
8328 + kfree(mem->bitmap);
8331 +EXPORT_SYMBOL(dma_release_declared_memory);
8333 +void *dma_mark_declared_memory_occupied(struct device *dev,
8334 + dma_addr_t device_addr, size_t size)
8336 + struct dma_coherent_mem *mem = dev->dma_mem;
8337 + int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
8341 + return ERR_PTR(-EINVAL);
8343 + pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
8344 + err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
8346 + return ERR_PTR(err);
8347 + return mem->virt_base + (pos << PAGE_SHIFT);
8349 +EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
8350 +#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
8353 +dma_map_single(struct device *dev, void *ptr, size_t size,
8354 + enum dma_data_direction direction)
8358 + if (direction == DMA_NONE)
8360 + WARN_ON(size == 0);
8363 + dma = swiotlb_map_single(dev, ptr, size, direction);
8365 + dma = gnttab_dma_map_page(virt_to_page(ptr)) +
8366 + offset_in_page(ptr);
8367 + IOMMU_BUG_ON(range_straddles_page_boundary(__pa(ptr), size));
8368 + IOMMU_BUG_ON(address_needs_mapping(dev, dma));
8371 + flush_write_buffers();
8374 +EXPORT_SYMBOL(dma_map_single);
8377 +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
8378 + enum dma_data_direction direction)
8380 + if (direction == DMA_NONE)
8383 + swiotlb_unmap_single(dev, dma_addr, size, direction);
8385 + gnttab_dma_unmap_page(dma_addr);
8387 +EXPORT_SYMBOL(dma_unmap_single);
8390 +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
8391 + enum dma_data_direction direction)
8394 + swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction);
8396 +EXPORT_SYMBOL(dma_sync_single_for_cpu);
8399 +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
8400 + enum dma_data_direction direction)
8403 + swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
8405 +EXPORT_SYMBOL(dma_sync_single_for_device);
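For reference, a hedged driver-side sketch of the coherent-allocation pair implemented above (2.6.18-era API; the pci_dev is assumed to come from the caller's probe routine, and alloc_ring()/free_ring() are hypothetical helpers). With no per-device coherent pool declared, the allocation falls through to __get_free_pages() plus xen_create_contiguous_region():

#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/pci.h>

static void *ring_va;
static dma_addr_t ring_bus;

static int alloc_ring(struct pci_dev *pdev)
{
	/* One zeroed, machine-contiguous page; ring_bus is device-visible. */
	ring_va = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, &ring_bus, GFP_KERNEL);
	return ring_va ? 0 : -ENOMEM;
}

static void free_ring(struct pci_dev *pdev)
{
	if (ring_va)
		dma_free_coherent(&pdev->dev, PAGE_SIZE, ring_va, ring_bus);
}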
8406 Index: head-2008-11-25/arch/x86/kernel/process_32-xen.c
8407 ===================================================================
8408 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
8409 +++ head-2008-11-25/arch/x86/kernel/process_32-xen.c 2008-07-21 11:00:32.000000000 +0200
8412 + * linux/arch/i386/kernel/process.c
8414 + * Copyright (C) 1995 Linus Torvalds
8416 + * Pentium III FXSR, SSE support
8417 + * Gareth Hughes <gareth@valinux.com>, May 2000
8421 + * This file handles the architecture-dependent parts of process handling..
8424 +#include <stdarg.h>
8426 +#include <linux/cpu.h>
8427 +#include <linux/errno.h>
8428 +#include <linux/sched.h>
8429 +#include <linux/fs.h>
8430 +#include <linux/kernel.h>
8431 +#include <linux/mm.h>
8432 +#include <linux/elfcore.h>
8433 +#include <linux/smp.h>
8434 +#include <linux/smp_lock.h>
8435 +#include <linux/stddef.h>
8436 +#include <linux/slab.h>
8437 +#include <linux/vmalloc.h>
8438 +#include <linux/user.h>
8439 +#include <linux/a.out.h>
8440 +#include <linux/interrupt.h>
8441 +#include <linux/utsname.h>
8442 +#include <linux/delay.h>
8443 +#include <linux/reboot.h>
8444 +#include <linux/init.h>
8445 +#include <linux/mc146818rtc.h>
8446 +#include <linux/module.h>
8447 +#include <linux/kallsyms.h>
8448 +#include <linux/ptrace.h>
8449 +#include <linux/random.h>
8451 +#include <asm/uaccess.h>
8452 +#include <asm/pgtable.h>
8453 +#include <asm/system.h>
8454 +#include <asm/io.h>
8455 +#include <asm/ldt.h>
8456 +#include <asm/processor.h>
8457 +#include <asm/i387.h>
8458 +#include <asm/desc.h>
8459 +#include <asm/vm86.h>
8460 +#ifdef CONFIG_MATH_EMULATION
8461 +#include <asm/math_emu.h>
8464 +#include <xen/interface/physdev.h>
8465 +#include <xen/interface/vcpu.h>
8466 +#include <xen/cpu_hotplug.h>
8468 +#include <linux/err.h>
8470 +#include <asm/tlbflush.h>
8471 +#include <asm/cpu.h>
8473 +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
8475 +static int hlt_counter;
8477 +unsigned long boot_option_idle_override = 0;
8478 +EXPORT_SYMBOL(boot_option_idle_override);
8481 + * Return saved PC of a blocked thread.
8483 +unsigned long thread_saved_pc(struct task_struct *tsk)
8485 + return ((unsigned long *)tsk->thread.esp)[3];
8489 + * Power-management idle function, if any.
8491 +void (*pm_idle)(void);
8492 +EXPORT_SYMBOL(pm_idle);
8493 +static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
8495 +void disable_hlt(void)
8500 +EXPORT_SYMBOL(disable_hlt);
8502 +void enable_hlt(void)
8507 +EXPORT_SYMBOL(enable_hlt);
8510 + * On SMP it's slightly faster (but much more power-consuming!)
8511 + * to poll the ->work.need_resched flag instead of waiting for the
8512 + * cross-CPU IPI to arrive. Use this option with caution.
8514 +static void poll_idle (void)
8516 + local_irq_enable();
8523 + : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
8526 +static void xen_idle(void)
8528 + local_irq_disable();
8530 + if (need_resched())
8531 + local_irq_enable();
8533 + current_thread_info()->status &= ~TS_POLLING;
8534 + smp_mb__after_clear_bit();
8536 + current_thread_info()->status |= TS_POLLING;
8539 +#ifdef CONFIG_APM_MODULE
8540 +EXPORT_SYMBOL(default_idle);
8543 +#ifdef CONFIG_HOTPLUG_CPU
8544 +extern cpumask_t cpu_initialized;
8545 +static inline void play_dead(void)
8548 + local_irq_disable();
8549 + cpu_clear(smp_processor_id(), cpu_initialized);
8550 + preempt_enable_no_resched();
8551 + VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
8555 +static inline void play_dead(void)
8559 +#endif /* CONFIG_HOTPLUG_CPU */
8562 + * The idle thread. There's no useful work to be
8563 + * done, so just try to conserve power and have a
8564 + * low exit latency (ie sit in a loop waiting for
8565 + * somebody to say that they'd like to reschedule)
8567 +void cpu_idle(void)
8569 + int cpu = smp_processor_id();
8571 + current_thread_info()->status |= TS_POLLING;
8573 + /* endless idle loop with no priority at all */
8575 + while (!need_resched()) {
8576 + void (*idle)(void);
8578 + if (__get_cpu_var(cpu_idle_state))
8579 + __get_cpu_var(cpu_idle_state) = 0;
8582 + idle = xen_idle; /* no alternatives */
8584 + if (cpu_is_offline(cpu))
8587 + __get_cpu_var(irq_stat).idle_timestamp = jiffies;
8590 + preempt_enable_no_resched();
8592 + preempt_disable();
8596 +void cpu_idle_wait(void)
8598 + unsigned int cpu, this_cpu = get_cpu();
8601 + set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
8605 + for_each_online_cpu(cpu) {
8606 + per_cpu(cpu_idle_state, cpu) = 1;
8607 + cpu_set(cpu, map);
8610 + __get_cpu_var(cpu_idle_state) = 0;
8615 + for_each_online_cpu(cpu) {
8616 + if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
8617 + cpu_clear(cpu, map);
8619 + cpus_and(map, map, cpu_online_map);
8620 + } while (!cpus_empty(map));
8622 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
8624 +void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
8628 +static int __init idle_setup (char *str)
8630 + if (!strncmp(str, "poll", 4)) {
8631 + printk("using polling idle threads.\n");
8632 + pm_idle = poll_idle;
8635 + boot_option_idle_override = 1;
8639 +__setup("idle=", idle_setup);
8641 +void show_regs(struct pt_regs * regs)
8643 + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
8646 + printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
8647 + printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
8648 + print_symbol("EIP is at %s\n", regs->eip);
8650 + if (user_mode_vm(regs))
8651 + printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
8652 + printk(" EFLAGS: %08lx %s (%s %.*s)\n",
8653 + regs->eflags, print_tainted(), system_utsname.release,
8654 + (int)strcspn(system_utsname.version, " "),
8655 + system_utsname.version);
8656 + printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
8657 + regs->eax,regs->ebx,regs->ecx,regs->edx);
8658 + printk("ESI: %08lx EDI: %08lx EBP: %08lx",
8659 + regs->esi, regs->edi, regs->ebp);
8660 + printk(" DS: %04x ES: %04x\n",
8661 + 0xffff & regs->xds,0xffff & regs->xes);
8666 + cr4 = read_cr4_safe();
8667 + printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
8668 + show_trace(NULL, regs, &regs->esp);
8672 + * This gets run with %ebx containing the
8673 + * function to call, and %edx containing
8676 +extern void kernel_thread_helper(void);
8677 +__asm__(".section .text\n"
8679 + "kernel_thread_helper:\n\t"
8680 + "movl %edx,%eax\n\t"
8688 + * Create a kernel thread
8690 +int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
8692 + struct pt_regs regs;
8694 + memset(&regs, 0, sizeof(regs));
8696 + regs.ebx = (unsigned long) fn;
8697 + regs.edx = (unsigned long) arg;
8699 + regs.xds = __USER_DS;
8700 + regs.xes = __USER_DS;
8701 + regs.orig_eax = -1;
8702 + regs.eip = (unsigned long) kernel_thread_helper;
8703 + regs.xcs = GET_KERNEL_CS();
8704 + regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
8706 + /* Ok, create the new process.. */
8707 + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
8709 +EXPORT_SYMBOL(kernel_thread);
8712 + * Free current thread data structures etc..
8714 +void exit_thread(void)
8716 + /* The process may have allocated an io port bitmap... nuke it. */
8717 + if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
8718 + struct task_struct *tsk = current;
8719 + struct thread_struct *t = &tsk->thread;
8720 + struct physdev_set_iobitmap set_iobitmap;
8721 + memset(&set_iobitmap, 0, sizeof(set_iobitmap));
8722 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
8724 + kfree(t->io_bitmap_ptr);
8725 + t->io_bitmap_ptr = NULL;
8726 + clear_thread_flag(TIF_IO_BITMAP);
8730 +void flush_thread(void)
8732 + struct task_struct *tsk = current;
8734 + memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
8735 + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
8736 + clear_tsk_thread_flag(tsk, TIF_DEBUG);
8738 + * Forget coprocessor state..
8741 + clear_used_math();
8744 +void release_thread(struct task_struct *dead_task)
8746 + BUG_ON(dead_task->mm);
8747 + release_vm86_irqs(dead_task);
8751 + * This gets called before we allocate a new thread and copy
8752 + * the current task into it.
8754 +void prepare_to_copy(struct task_struct *tsk)
8759 +int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
8760 + unsigned long unused,
8761 + struct task_struct * p, struct pt_regs * regs)
8763 + struct pt_regs * childregs;
8764 + struct task_struct *tsk;
8767 + childregs = task_pt_regs(p);
8768 + *childregs = *regs;
8769 + childregs->eax = 0;
8770 + childregs->esp = esp;
8772 + p->thread.esp = (unsigned long) childregs;
8773 + p->thread.esp0 = (unsigned long) (childregs+1);
8775 + p->thread.eip = (unsigned long) ret_from_fork;
8777 + savesegment(fs,p->thread.fs);
8778 + savesegment(gs,p->thread.gs);
8781 + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
8782 + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
8783 + if (!p->thread.io_bitmap_ptr) {
8784 + p->thread.io_bitmap_max = 0;
8787 + memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
8789 + set_tsk_thread_flag(p, TIF_IO_BITMAP);
8793 + * Set a new TLS for the child thread?
8795 + if (clone_flags & CLONE_SETTLS) {
8796 + struct desc_struct *desc;
8797 + struct user_desc info;
8801 + if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
8804 + if (LDT_empty(&info))
8807 + idx = info.entry_number;
8808 + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
8811 + desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
8812 + desc->a = LDT_entry_a(&info);
8813 + desc->b = LDT_entry_b(&info);
8816 + p->thread.iopl = current->thread.iopl;
8820 + if (err && p->thread.io_bitmap_ptr) {
8821 + kfree(p->thread.io_bitmap_ptr);
8822 + p->thread.io_bitmap_max = 0;
8828 + * fill in the user structure for a core dump..
8830 +void dump_thread(struct pt_regs * regs, struct user * dump)
8834 +/* changed the size calculations - should hopefully work better. lbt */
8835 + dump->magic = CMAGIC;
8836 + dump->start_code = 0;
8837 + dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
8838 + dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
8839 + dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
8840 + dump->u_dsize -= dump->u_tsize;
8841 + dump->u_ssize = 0;
8842 + for (i = 0; i < 8; i++)
8843 + dump->u_debugreg[i] = current->thread.debugreg[i];
8845 + if (dump->start_stack < TASK_SIZE)
8846 + dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
8848 + dump->regs.ebx = regs->ebx;
8849 + dump->regs.ecx = regs->ecx;
8850 + dump->regs.edx = regs->edx;
8851 + dump->regs.esi = regs->esi;
8852 + dump->regs.edi = regs->edi;
8853 + dump->regs.ebp = regs->ebp;
8854 + dump->regs.eax = regs->eax;
8855 + dump->regs.ds = regs->xds;
8856 + dump->regs.es = regs->xes;
8857 + savesegment(fs,dump->regs.fs);
8858 + savesegment(gs,dump->regs.gs);
8859 + dump->regs.orig_eax = regs->orig_eax;
8860 + dump->regs.eip = regs->eip;
8861 + dump->regs.cs = regs->xcs;
8862 + dump->regs.eflags = regs->eflags;
8863 + dump->regs.esp = regs->esp;
8864 + dump->regs.ss = regs->xss;
8866 + dump->u_fpvalid = dump_fpu (regs, &dump->i387);
8868 +EXPORT_SYMBOL(dump_thread);
8871 + * Capture the user space registers if the task is not running (in user space)
8873 +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
8875 + struct pt_regs ptregs = *task_pt_regs(tsk);
8876 + ptregs.xcs &= 0xffff;
8877 + ptregs.xds &= 0xffff;
8878 + ptregs.xes &= 0xffff;
8879 + ptregs.xss &= 0xffff;
8881 + elf_core_copy_regs(regs, &ptregs);
8886 +static noinline void __switch_to_xtra(struct task_struct *next_p)
8888 + struct thread_struct *next;
8890 + next = &next_p->thread;
8892 + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
8893 + set_debugreg(next->debugreg[0], 0);
8894 + set_debugreg(next->debugreg[1], 1);
8895 + set_debugreg(next->debugreg[2], 2);
8896 + set_debugreg(next->debugreg[3], 3);
8898 + set_debugreg(next->debugreg[6], 6);
8899 + set_debugreg(next->debugreg[7], 7);
8904 + * This function decides whether the context switch from prev to next
8905 + * has to tweak the TSC disable bit in the cr4.
8907 +static inline void disable_tsc(struct task_struct *prev_p,
8908 + struct task_struct *next_p)
8910 + struct thread_info *prev, *next;
8913 + * gcc should eliminate the ->thread_info dereference if
8914 + * has_secure_computing returns 0 at compile time (SECCOMP=n).
8916 + prev = task_thread_info(prev_p);
8917 + next = task_thread_info(next_p);
8919 + if (has_secure_computing(prev) || has_secure_computing(next)) {
8920 + /* slow path here */
8921 + if (has_secure_computing(prev) &&
8922 + !has_secure_computing(next)) {
8923 + write_cr4(read_cr4() & ~X86_CR4_TSD);
8924 + } else if (!has_secure_computing(prev) &&
8925 + has_secure_computing(next))
8926 + write_cr4(read_cr4() | X86_CR4_TSD);
8931 + * switch_to(x,y) should switch tasks from x to y.
8933 + * We fsave/fwait so that an exception goes off at the right time
8934 + * (as a call from the fsave or fwait in effect) rather than to
8935 + * the wrong process. Lazy FP saving no longer makes any sense
8936 + * with modern CPUs, and this simplifies a lot of things (SMP
8937 + * and UP become the same).
8939 + * NOTE! We used to use the x86 hardware context switching. The
8940 + * reason for not using it any more becomes apparent when you
8941 + * try to recover gracefully from saved state that is no longer
8942 + * valid (stale segment register values in particular). With the
8943 + * hardware task-switch, there is no way to fix up bad state in
8944 + * a reasonable manner.
8946 + * The fact that Intel documents the hardware task-switching to
8947 + * be slow is a fairly red herring - this code is not noticeably
8948 + * faster. However, there _is_ some room for improvement here,
8949 + * so the performance issues may eventually be a valid point.
8950 + * More important, however, is the fact that this allows us much
8951 + * more flexibility.
8953 + * The return value (in %eax) will be the "prev" task after
8954 + * the task-switch, and shows up in ret_from_fork in entry.S,
8957 +struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
8959 + struct thread_struct *prev = &prev_p->thread,
8960 + *next = &next_p->thread;
8961 + int cpu = smp_processor_id();
8962 +#ifndef CONFIG_X86_NO_TSS
8963 + struct tss_struct *tss = &per_cpu(init_tss, cpu);
8965 +#if CONFIG_XEN_COMPAT > 0x030002
8966 + struct physdev_set_iopl iopl_op;
8967 + struct physdev_set_iobitmap iobmp_op;
8969 + struct physdev_op _pdo[2], *pdo = _pdo;
8970 +#define iopl_op pdo->u.set_iopl
8971 +#define iobmp_op pdo->u.set_iobitmap
8973 + multicall_entry_t _mcl[8], *mcl = _mcl;
8975 + /* XEN NOTE: FS/GS saved in switch_mm(), not here. */
8978 + * This is basically '__unlazy_fpu', except that we queue a
8979 + * multicall to indicate FPU task switch, rather than
8980 + * synchronously trapping to Xen.
8982 + if (prev_p->thread_info->status & TS_USEDFPU) {
8983 + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
8984 + mcl->op = __HYPERVISOR_fpu_taskswitch;
8988 +#if 0 /* lazy fpu sanity check */
8989 + else BUG_ON(!(read_cr0() & 8));
8994 + * This is load_esp0(tss, next) with a multicall.
8996 + mcl->op = __HYPERVISOR_stack_switch;
8997 + mcl->args[0] = __KERNEL_DS;
8998 + mcl->args[1] = next->esp0;
9002 + * Load the per-thread Thread-Local Storage descriptor.
9003 + * This is load_TLS(next, cpu) with multicalls.
9005 +#define C(i) do { \
9006 + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
9007 + next->tls_array[i].b != prev->tls_array[i].b)) { \
9008 + mcl->op = __HYPERVISOR_update_descriptor; \
9009 + *(u64 *)&mcl->args[0] = virt_to_machine( \
9010 + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
9011 + *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \
9018 + if (unlikely(prev->iopl != next->iopl)) {
9019 + iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
9020 +#if CONFIG_XEN_COMPAT > 0x030002
9021 + mcl->op = __HYPERVISOR_physdev_op;
9022 + mcl->args[0] = PHYSDEVOP_set_iopl;
9023 + mcl->args[1] = (unsigned long)&iopl_op;
9025 + mcl->op = __HYPERVISOR_physdev_op_compat;
9026 + pdo->cmd = PHYSDEVOP_set_iopl;
9027 + mcl->args[0] = (unsigned long)pdo++;
9032 + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
9033 + set_xen_guest_handle(iobmp_op.bitmap,
9034 + (char *)next->io_bitmap_ptr);
9035 + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
9036 +#if CONFIG_XEN_COMPAT > 0x030002
9037 + mcl->op = __HYPERVISOR_physdev_op;
9038 + mcl->args[0] = PHYSDEVOP_set_iobitmap;
9039 + mcl->args[1] = (unsigned long)&iobmp_op;
9041 + mcl->op = __HYPERVISOR_physdev_op_compat;
9042 + pdo->cmd = PHYSDEVOP_set_iobitmap;
9043 + mcl->args[0] = (unsigned long)pdo++;
9048 +#if CONFIG_XEN_COMPAT <= 0x030002
9049 + BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo));
9051 + BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
9052 + if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
9056 + * Restore %fs and %gs if needed.
9058 + * Glibc normally makes %fs be zero, and %gs is one of
9059 + * the TLS segments.
9061 + if (unlikely(next->fs))
9062 + loadsegment(fs, next->fs);
9065 + loadsegment(gs, next->gs);
9068 + * Now maybe handle debug registers
9070 + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
9071 + __switch_to_xtra(next_p);
9073 + disable_tsc(prev_p, next_p);
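The idiom running through __switch_to() above deserves a note: each per-task update (FPU ownership, ring-1 stack, TLS descriptors, IOPL, I/O bitmap) is queued into an on-stack array of multicall entries and then submitted as one hypercall, so a context switch costs a single ring transition instead of up to eight. A minimal sketch of the pattern, assuming the usual multicall_entry_t layout (op plus args[]) and with next_esp0 standing in for a real value:

	multicall_entry_t _mcl[2], *mcl = _mcl;

	mcl->op = __HYPERVISOR_stack_switch;	/* queued, no trap yet */
	mcl->args[0] = __KERNEL_DS;
	mcl->args[1] = next_esp0;		/* hypothetical value */
	mcl++;

	mcl->op = __HYPERVISOR_fpu_taskswitch;	/* sets CR0.TS for us */
	mcl->args[0] = 1;
	mcl++;

	/* one ring transition executes both operations, in order */
	if (HYPERVISOR_multicall(_mcl, mcl - _mcl))
		BUG();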
9078 +asmlinkage int sys_fork(struct pt_regs regs)
9080 + return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
9083 +asmlinkage int sys_clone(struct pt_regs regs)
9085 + unsigned long clone_flags;
9086 + unsigned long newsp;
9087 + int __user *parent_tidptr, *child_tidptr;
9089 + clone_flags = regs.ebx;
9091 + parent_tidptr = (int __user *)regs.edx;
9092 + child_tidptr = (int __user *)regs.edi;
9095 + return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
9099 + * This is trivial, and on the face of it looks like it
9100 + * could equally well be done in user mode.
9102 + * Not so, for quite unobvious reasons - register pressure.
9103 + * In user mode vfork() cannot have a stack frame, and if
9104 + * done by calling the "clone()" system call directly, you
9105 + * do not have enough call-clobbered registers to hold all
9106 + * the information you need.
9108 +asmlinkage int sys_vfork(struct pt_regs regs)
9110 + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
9114 + * sys_execve() executes a new program.
9116 +asmlinkage int sys_execve(struct pt_regs regs)
9121 + filename = getname((char __user *) regs.ebx);
9122 + error = PTR_ERR(filename);
9123 + if (IS_ERR(filename))
9125 + error = do_execve(filename,
9126 + (char __user * __user *) regs.ecx,
9127 + (char __user * __user *) regs.edx,
9130 + task_lock(current);
9131 + current->ptrace &= ~PT_DTRACE;
9132 + task_unlock(current);
9133 + /* Make sure we don't return using sysenter.. */
9134 + set_thread_flag(TIF_IRET);
9136 + putname(filename);
9141 +#define top_esp (THREAD_SIZE - sizeof(unsigned long))
9142 +#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
9144 +unsigned long get_wchan(struct task_struct *p)
9146 + unsigned long ebp, esp, eip;
9147 + unsigned long stack_page;
9149 + if (!p || p == current || p->state == TASK_RUNNING)
9151 + stack_page = (unsigned long)task_stack_page(p);
9152 + esp = p->thread.esp;
9153 + if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
9155 + /* include/asm-i386/system.h:switch_to() pushes ebp last. */
9156 + ebp = *(unsigned long *) esp;
9158 + if (ebp < stack_page || ebp > top_ebp+stack_page)
9160 + eip = *(unsigned long *) (ebp+4);
9161 + if (!in_sched_functions(eip))
9163 + ebp = *(unsigned long *) ebp;
9164 + } while (count++ < 16);
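get_wchan() above leans on the i386 frame-pointer convention; a sketch of the stack layout it assumes:

	/*
	 *  ebp --> [ caller's saved ebp ]   ebp = *(unsigned long *)ebp
	 *          [ return eip         ]   eip = *(unsigned long *)(ebp + 4)
	 *          [ locals / arguments ]
	 *
	 * The loop follows at most 16 saved-ebp links until it finds a
	 * return address outside the scheduler -- the place where the
	 * task is actually blocked.
	 */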
9169 + * sys_alloc_thread_area: get a yet unused TLS descriptor index.
9171 +static int get_free_idx(void)
9173 + struct thread_struct *t = &current->thread;
9176 + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
9177 + if (desc_empty(t->tls_array + idx))
9178 + return idx + GDT_ENTRY_TLS_MIN;
9183 + * Set a given TLS descriptor:
9185 +asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
9187 + struct thread_struct *t = &current->thread;
9188 + struct user_desc info;
9189 + struct desc_struct *desc;
9192 + if (copy_from_user(&info, u_info, sizeof(info)))
9194 + idx = info.entry_number;
9197 + * index -1 means the kernel should try to find and
9198 + * allocate an empty descriptor:
9201 + idx = get_free_idx();
9204 + if (put_user(idx, &u_info->entry_number))
9208 + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
9211 + desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
9214 + * We must not get preempted while modifying the TLS.
9218 + if (LDT_empty(&info)) {
9222 + desc->a = LDT_entry_a(&info);
9223 + desc->b = LDT_entry_b(&info);
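A hypothetical caller, showing the entry_number == -1 convention (tls_block and use_tls_slot are invented names; from userland this is reached via the set_thread_area system call):

	struct user_desc ud = {
		.entry_number   = -1,	/* ask the kernel to pick a free slot */
		.base_addr      = (unsigned long)tls_block,
		.limit          = 0xfffff,
		.seg_32bit      = 1,
		.limit_in_pages = 1,
		.useable        = 1,
	};

	if (set_thread_area(&ud) == 0)
		use_tls_slot(ud.entry_number);	/* index chosen by the kernel */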
9233 + * Get the current Thread-Local Storage area:
9236 +#define GET_BASE(desc) ( \
9237 + (((desc)->a >> 16) & 0x0000ffff) | \
9238 + (((desc)->b << 16) & 0x00ff0000) | \
9239 + ( (desc)->b & 0xff000000) )
9241 +#define GET_LIMIT(desc) ( \
9242 + ((desc)->a & 0x0ffff) | \
9243 + ((desc)->b & 0xf0000) )
9245 +#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
9246 +#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
9247 +#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
9248 +#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
9249 +#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
9250 +#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
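To make the bit-scattering concrete, here is a small self-contained demo of the same base extraction (the descriptor words are an invented bit pattern, not a real GDT entry):

	#include <stdio.h>

	struct desc { unsigned int a, b; };

	#define BASE(d) ((((d)->a >> 16) & 0x0000ffff) | \
			 (((d)->b << 16) & 0x00ff0000) | \
			 ( (d)->b        & 0xff000000))

	int main(void)
	{
		/* a = base[15:0] << 16 | limit[15:0]; b carries base[31:24]
		 * in its top byte and base[23:16] in its bottom byte */
		struct desc d = { .a = 0x5678ffff, .b = 0x12cf9a34 };

		printf("base = %#x\n", BASE(&d));	/* prints base = 0x12345678 */
		return 0;
	}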
9252 +asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
9254 + struct user_desc info;
9255 + struct desc_struct *desc;
9258 + if (get_user(idx, &u_info->entry_number))
9260 + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
9263 + memset(&info, 0, sizeof(info));
9265 + desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
9267 + info.entry_number = idx;
9268 + info.base_addr = GET_BASE(desc);
9269 + info.limit = GET_LIMIT(desc);
9270 + info.seg_32bit = GET_32BIT(desc);
9271 + info.contents = GET_CONTENTS(desc);
9272 + info.read_exec_only = !GET_WRITABLE(desc);
9273 + info.limit_in_pages = GET_LIMIT_PAGES(desc);
9274 + info.seg_not_present = !GET_PRESENT(desc);
9275 + info.useable = GET_USEABLE(desc);
9277 + if (copy_to_user(u_info, &info, sizeof(info)))
9282 +unsigned long arch_align_stack(unsigned long sp)
9284 + if (randomize_va_space)
9285 + sp -= get_random_int() % 8192;
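The effect is to lower the initial user stack pointer by up to 8KB of random slack whenever address-space randomization is enabled, so successive runs of the same binary see different stack addresses; the stock i386 version then re-aligns the result to 16 bytes before returning it.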
9288 Index: head-2008-11-25/arch/x86/kernel/quirks-xen.c
9289 ===================================================================
9290 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
9291 +++ head-2008-11-25/arch/x86/kernel/quirks-xen.c 2008-01-28 12:24:19.000000000 +0100
9294 + * This file contains work-arounds for x86 and x86_64 platform bugs.
9296 +#include <linux/pci.h>
9297 +#include <linux/irq.h>
9299 +#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
9301 +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
9306 + /* BIOS may enable hardware IRQ balancing for
9307 + * E7520/E7320/E7525(revision ID 0x9 and below)
9308 + * based platforms.
9309 + * Disable SW irqbalance/affinity on those platforms.
9311 + pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
9315 + printk(KERN_INFO "Intel E7520/7320/7525 detected.");
9317 + /* enable access to config space*/
9318 + pci_read_config_byte(dev, 0xf4, &config);
9319 + pci_write_config_byte(dev, 0xf4, config|0x2);
9321 + /* read xTPR register */
9322 + raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
9324 + if (!(word & (1 << 13))) {
9325 + struct xen_platform_op op;
9326 + printk(KERN_INFO "Disabling irq balancing and affinity\n");
9327 + op.cmd = XENPF_platform_quirk;
9328 + op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
9329 + WARN_ON(HYPERVISOR_platform_op(&op));
9332 + /* put back the original value for config space*/
9333 + if (!(config & 0x2))
9334 + pci_write_config_byte(dev, 0xf4, config);
9336 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
9337 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
9338 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
9340 Index: head-2008-11-25/arch/x86/kernel/setup_32-xen.c
9341 ===================================================================
9342 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
9343 +++ head-2008-11-25/arch/x86/kernel/setup_32-xen.c 2008-04-22 15:41:51.000000000 +0200
9346 + * linux/arch/i386/kernel/setup.c
9348 + * Copyright (C) 1995 Linus Torvalds
9350 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
9352 + * Memory region support
9353 + * David Parsons <orc@pell.chi.il.us>, July-August 1999
9355 + * Added E820 sanitization routine (removes overlapping memory regions);
9356 + * Brian Moyle <bmoyle@mvista.com>, February 2001
9358 + * Moved CPU detection code to cpu/${cpu}.c
9359 + * Patrick Mochel <mochel@osdl.org>, March 2002
9361 + * Provisions for empty E820 memory regions (reported by certain BIOSes).
9362 + * Alex Achenbach <xela@slit.de>, December 2002.
9367 + * This file handles the architecture-dependent parts of initialization
9370 +#include <linux/sched.h>
9371 +#include <linux/mm.h>
9372 +#include <linux/mmzone.h>
9373 +#include <linux/screen_info.h>
9374 +#include <linux/ioport.h>
9375 +#include <linux/acpi.h>
9376 +#include <linux/apm_bios.h>
9377 +#include <linux/initrd.h>
9378 +#include <linux/bootmem.h>
9379 +#include <linux/seq_file.h>
9380 +#include <linux/platform_device.h>
9381 +#include <linux/console.h>
9382 +#include <linux/mca.h>
9383 +#include <linux/root_dev.h>
9384 +#include <linux/highmem.h>
9385 +#include <linux/module.h>
9386 +#include <linux/efi.h>
9387 +#include <linux/init.h>
9388 +#include <linux/edd.h>
9389 +#include <linux/nodemask.h>
9390 +#include <linux/kernel.h>
9391 +#include <linux/percpu.h>
9392 +#include <linux/notifier.h>
9393 +#include <linux/kexec.h>
9394 +#include <linux/crash_dump.h>
9395 +#include <linux/dmi.h>
9396 +#include <linux/pfn.h>
9398 +#include <video/edid.h>
9400 +#include <asm/apic.h>
9401 +#include <asm/e820.h>
9402 +#include <asm/mpspec.h>
9403 +#include <asm/setup.h>
9404 +#include <asm/arch_hooks.h>
9405 +#include <asm/sections.h>
9406 +#include <asm/io_apic.h>
9407 +#include <asm/ist.h>
9408 +#include <asm/io.h>
9409 +#include <asm/hypervisor.h>
9410 +#include <xen/interface/physdev.h>
9411 +#include <xen/interface/memory.h>
9412 +#include <xen/features.h>
9413 +#include <xen/firmware.h>
9414 +#include <xen/xencons.h>
9415 +#include <setup_arch.h>
9416 +#include <bios_ebda.h>
9419 +#include <xen/interface/kexec.h>
9422 +/* Forward Declaration. */
9423 +void __init find_max_pfn(void);
9425 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
9426 +static struct notifier_block xen_panic_block = {
9427 + xen_panic_event, NULL, 0 /* try to go last */
9430 +extern char hypercall_page[PAGE_SIZE];
9431 +EXPORT_SYMBOL(hypercall_page);
9433 +int disable_pse __devinitdata = 0;
9440 +int efi_enabled = 0;
9441 +EXPORT_SYMBOL(efi_enabled);
9444 +/* cpu data as detected by the assembly code in head.S */
9445 +struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
9446 +/* common cpu data for all cpus */
9447 +struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
9448 +EXPORT_SYMBOL(boot_cpu_data);
9450 +unsigned long mmu_cr4_features;
9453 + int acpi_disabled = 0;
9455 + int acpi_disabled = 1;
9457 +EXPORT_SYMBOL(acpi_disabled);
9460 +int __initdata acpi_force = 0;
9461 +extern acpi_interrupt_flags acpi_sci_flags;
9464 +/* for MCA, but anyone else can use it if they want */
9465 +unsigned int machine_id;
9467 +EXPORT_SYMBOL(machine_id);
9469 +unsigned int machine_submodel_id;
9470 +unsigned int BIOS_revision;
9471 +unsigned int mca_pentium_flag;
9473 +/* For PCI or other memory-mapped resources */
9474 +unsigned long pci_mem_start = 0x10000000;
9476 +EXPORT_SYMBOL(pci_mem_start);
9479 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
9480 +int bootloader_type;
9482 +/* user-defined highmem size */
9483 +static unsigned int highmem_pages = -1;
9488 +struct drive_info_struct { char dummy[32]; } drive_info;
9489 +#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \
9490 + defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
9491 +EXPORT_SYMBOL(drive_info);
9493 +struct screen_info screen_info;
9494 +EXPORT_SYMBOL(screen_info);
9495 +struct apm_info apm_info;
9496 +EXPORT_SYMBOL(apm_info);
9497 +struct sys_desc_table_struct {
9498 + unsigned short length;
9499 + unsigned char table[0];
9501 +struct edid_info edid_info;
9502 +EXPORT_SYMBOL_GPL(edid_info);
9504 +#define copy_edid() (edid_info = EDID_INFO)
9506 +struct ist_info ist_info;
9507 +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
9508 + defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
9509 +EXPORT_SYMBOL(ist_info);
9511 +struct e820map e820;
9513 +struct e820map machine_e820;
9516 +extern void early_cpu_init(void);
9517 +extern void generic_apic_probe(char *);
9518 +extern int root_mountflags;
9520 +unsigned long saved_videomode;
9522 +#define RAMDISK_IMAGE_START_MASK 0x07FF
9523 +#define RAMDISK_PROMPT_FLAG 0x8000
9524 +#define RAMDISK_LOAD_FLAG 0x4000
9526 +static char command_line[COMMAND_LINE_SIZE];
9528 +unsigned char __initdata boot_params[PARAM_SIZE];
9530 +static struct resource data_resource = {
9531 + .name = "Kernel data",
9534 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
9537 +static struct resource code_resource = {
9538 + .name = "Kernel code",
9541 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
9544 +static struct resource system_rom_resource = {
9545 + .name = "System ROM",
9548 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9551 +static struct resource extension_rom_resource = {
9552 + .name = "Extension ROM",
9555 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9558 +static struct resource adapter_rom_resources[] = { {
9559 + .name = "Adapter ROM",
9562 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9564 + .name = "Adapter ROM",
9567 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9569 + .name = "Adapter ROM",
9572 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9574 + .name = "Adapter ROM",
9577 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9579 + .name = "Adapter ROM",
9582 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9584 + .name = "Adapter ROM",
9587 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9590 +#define ADAPTER_ROM_RESOURCES \
9591 + (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
9593 +static struct resource video_rom_resource = {
9594 + .name = "Video ROM",
9597 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9600 +static struct resource video_ram_resource = {
9601 + .name = "Video RAM area",
9604 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
9607 +static struct resource standard_io_resources[] = { {
9611 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9616 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9621 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9626 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9628 + .name = "keyboard",
9631 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9633 + .name = "dma page reg",
9636 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9641 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9646 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9651 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9654 +#define STANDARD_IO_RESOURCES \
9655 + (sizeof standard_io_resources / sizeof standard_io_resources[0])
9657 +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
9659 +static int __init romchecksum(unsigned char *rom, unsigned long length)
9661 + unsigned char *p, sum = 0;
9663 + for (p = rom; p < rom + length; p++)
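The checksum convention used here is the standard PC option-ROM one: every byte of the image must sum to zero modulo 256. A free-standing sketch of the same test:

	static int rom_sums_to_zero(const unsigned char *rom, unsigned long length)
	{
		unsigned char sum = 0;	/* 8-bit accumulator, wraps mod 256 */

		while (length--)
			sum += *rom++;
		return sum == 0;	/* nonzero result => corrupt/absent ROM */
	}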
9668 +static void __init probe_roms(void)
9670 + unsigned long start, length, upper;
9671 + unsigned char *rom;
9675 + /* Nothing to do if not running in dom0. */
9676 + if (!is_initial_xendomain())
9681 + upper = adapter_rom_resources[0].start;
9682 + for (start = video_rom_resource.start; start < upper; start += 2048) {
9683 + rom = isa_bus_to_virt(start);
9684 + if (!romsignature(rom))
9687 + video_rom_resource.start = start;
9689 + /* 0 < length <= 0x7f * 512, historically */
9690 + length = rom[2] * 512;
9692 + /* if checksum okay, trust length byte */
9693 + if (length && romchecksum(rom, length))
9694 + video_rom_resource.end = start + length - 1;
9696 + request_resource(&iomem_resource, &video_rom_resource);
9700 + start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
9701 + if (start < upper)
9705 + request_resource(&iomem_resource, &system_rom_resource);
9706 + upper = system_rom_resource.start;
9708 + /* check for extension rom (ignore length byte!) */
9709 + rom = isa_bus_to_virt(extension_rom_resource.start);
9710 + if (romsignature(rom)) {
9711 + length = extension_rom_resource.end - extension_rom_resource.start + 1;
9712 + if (romchecksum(rom, length)) {
9713 + request_resource(&iomem_resource, &extension_rom_resource);
9714 + upper = extension_rom_resource.start;
9718 + /* check for adapter roms on 2k boundaries */
9719 + for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
9720 + rom = isa_bus_to_virt(start);
9721 + if (!romsignature(rom))
9724 + /* 0 < length <= 0x7f * 512, historically */
9725 + length = rom[2] * 512;
9727 + /* but accept any length that fits if checksum okay */
9728 + if (!length || start + length > upper || !romchecksum(rom, length))
9731 + adapter_rom_resources[i].start = start;
9732 + adapter_rom_resources[i].end = start + length - 1;
9733 + request_resource(&iomem_resource, &adapter_rom_resources[i]);
9735 + start = adapter_rom_resources[i++].end & ~2047UL;
9740 + * Point at the empty zero page to start with. We map the real shared_info
9741 + * page as soon as fixmap is up and running.
9743 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
9744 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
9746 +unsigned long *phys_to_machine_mapping;
9747 +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
9748 +EXPORT_SYMBOL(phys_to_machine_mapping);
9750 +/* Raw start-of-day parameters from the hypervisor. */
9751 +start_info_t *xen_start_info;
9752 +EXPORT_SYMBOL(xen_start_info);
9754 +void __init add_memory_region(unsigned long long start,
9755 + unsigned long long size, int type)
9759 + if (!efi_enabled) {
9762 + if (x == E820MAX) {
9763 + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
9767 + e820.map[x].addr = start;
9768 + e820.map[x].size = size;
9769 + e820.map[x].type = type;
9772 +} /* add_memory_region */
9774 +static void __init limit_regions(unsigned long long size)
9776 + unsigned long long current_addr = 0;
9779 + if (efi_enabled) {
9780 + efi_memory_desc_t *md;
9783 + for (p = memmap.map, i = 0; p < memmap.map_end;
9784 + p += memmap.desc_size, i++) {
9786 + current_addr = md->phys_addr + (md->num_pages << 12);
9787 + if (md->type == EFI_CONVENTIONAL_MEMORY) {
9788 + if (current_addr >= size) {
9790 + (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
9791 + memmap.nr_map = i + 1;
9797 + for (i = 0; i < e820.nr_map; i++) {
9798 + current_addr = e820.map[i].addr + e820.map[i].size;
9799 + if (current_addr < size)
9802 + if (e820.map[i].type != E820_RAM)
9805 + if (e820.map[i].addr >= size) {
9807 + * This region starts past the end of the
9808 + * requested size, skip it completely.
9812 + e820.nr_map = i + 1;
9813 + e820.map[i].size -= current_addr - size;
9818 + if (i==e820.nr_map && current_addr < size) {
9820 + * The e820 map finished before our requested size so
9821 + * extend the final entry to the requested address.
9824 + if (e820.map[i].type == E820_RAM)
9825 + e820.map[i].size -= current_addr - size;
9827 + add_memory_region(current_addr, size - current_addr, E820_RAM);
9832 +#define E820_DEBUG 1
9834 +static void __init print_memory_map(char *who)
9838 + for (i = 0; i < e820.nr_map; i++) {
9839 + printk(" %s: %016Lx - %016Lx ", who,
9841 + e820.map[i].addr + e820.map[i].size);
9842 + switch (e820.map[i].type) {
9843 + case E820_RAM: printk("(usable)\n");
9845 + case E820_RESERVED:
9846 + printk("(reserved)\n");
9849 + printk("(ACPI data)\n");
9852 + printk("(ACPI NVS)\n");
9854 + default: printk("type %lu\n", e820.map[i].type);
9861 + * Sanitize the BIOS e820 map.
9863 + * Some e820 responses include overlapping entries. The following
9864 + * replaces the original e820 map with a new one, removing overlaps.
9867 +struct change_member {
9868 + struct e820entry *pbios; /* pointer to original bios entry */
9869 + unsigned long long addr; /* address for this change point */
9871 +static struct change_member change_point_list[2*E820MAX] __initdata;
9872 +static struct change_member *change_point[2*E820MAX] __initdata;
9873 +static struct e820entry *overlap_list[E820MAX] __initdata;
9874 +static struct e820entry new_bios[E820MAX] __initdata;
9876 +int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
9878 + struct change_member *change_tmp;
9879 + unsigned long current_type, last_type;
9880 + unsigned long long last_addr;
9881 + int chgidx, still_changing;
9882 + int overlap_entries;
9883 + int new_bios_entry;
9884 + int old_nr, new_nr, chg_nr;
9888 + Visually we're performing the following (1,2,3,4 = memory types)...
9890 + Sample memory map (w/overlaps):
9891 + ____22__________________
9892 + ______________________4_
9893 + ____1111________________
9894 + _44_____________________
9895 + 11111111________________
9896 + ____________________33__
9897 + ___________44___________
9898 + __________33333_________
9899 + ______________22________
9900 + ___________________2222_
9901 + _________111111111______
9902 + _____________________11_
9903 + _________________4______
9905 + Sanitized equivalent (no overlap):
9906 + 1_______________________
9907 + _44_____________________
9908 + ___1____________________
9909 + ____22__________________
9910 + ______11________________
9911 + _________1______________
9912 + __________3_____________
9913 + ___________44___________
9914 + _____________33_________
9915 + _______________2________
9916 + ________________1_______
9917 + _________________4______
9918 + ___________________2____
9919 + ____________________33__
9920 + ______________________4_
9923 + /* if there's only one memory region, don't bother */
9927 + old_nr = *pnr_map;
9929 + /* bail out if we find any unreasonable addresses in bios map */
9930 + for (i=0; i<old_nr; i++)
9931 + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
9934 + /* create pointers for initial change-point information (for sorting) */
9935 + for (i=0; i < 2*old_nr; i++)
9936 + change_point[i] = &change_point_list[i];
9938 + /* record all known change-points (starting and ending addresses),
9939 + omitting those that are for empty memory regions */
9941 + for (i=0; i < old_nr; i++) {
9942 + if (biosmap[i].size != 0) {
9943 + change_point[chgidx]->addr = biosmap[i].addr;
9944 + change_point[chgidx++]->pbios = &biosmap[i];
9945 + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
9946 + change_point[chgidx++]->pbios = &biosmap[i];
9949 + chg_nr = chgidx; /* true number of change-points */
9951 + /* sort change-point list by memory addresses (low -> high) */
9952 + still_changing = 1;
9953 + while (still_changing) {
9954 + still_changing = 0;
9955 + for (i=1; i < chg_nr; i++) {
9956 + /* if <current_addr> > <last_addr>, swap */
9957 + /* or, if current=<start_addr> & last=<end_addr>, swap */
9958 + if ((change_point[i]->addr < change_point[i-1]->addr) ||
9959 + ((change_point[i]->addr == change_point[i-1]->addr) &&
9960 + (change_point[i]->addr == change_point[i]->pbios->addr) &&
9961 + (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
9964 + change_tmp = change_point[i];
9965 + change_point[i] = change_point[i-1];
9966 + change_point[i-1] = change_tmp;
9972 + /* create a new bios memory map, removing overlaps */
9973 + overlap_entries=0; /* number of entries in the overlap table */
9974 + new_bios_entry=0; /* index for creating new bios map entries */
9975 + last_type = 0; /* start with undefined memory type */
9976 + last_addr = 0; /* start with 0 as last starting address */
9977 + /* loop through change-points, determining effect on the new bios map */
9978 + for (chgidx=0; chgidx < chg_nr; chgidx++)
9980 + /* keep track of all overlapping bios entries */
9981 + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
9983 + /* add map entry to overlap list (> 1 entry implies an overlap) */
9984 + overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
9988 + /* remove entry from list (order independent, so swap with last) */
9989 + for (i=0; i<overlap_entries; i++)
9991 + if (overlap_list[i] == change_point[chgidx]->pbios)
9992 + overlap_list[i] = overlap_list[overlap_entries-1];
9994 + overlap_entries--;
9996 + /* if there are overlapping entries, decide which "type" to use */
9997 + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
9999 + for (i=0; i<overlap_entries; i++)
10000 + if (overlap_list[i]->type > current_type)
10001 + current_type = overlap_list[i]->type;
10002 + /* continue building up new bios map based on this information */
10003 + if (current_type != last_type) {
10004 + if (last_type != 0) {
10005 + new_bios[new_bios_entry].size =
10006 + change_point[chgidx]->addr - last_addr;
10007 + /* move forward only if the new size was non-zero */
10008 + if (new_bios[new_bios_entry].size != 0)
10009 + if (++new_bios_entry >= E820MAX)
10010 + break; /* no more space left for new bios entries */
10012 + if (current_type != 0) {
10013 + new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
10014 + new_bios[new_bios_entry].type = current_type;
10015 + last_addr=change_point[chgidx]->addr;
10017 + last_type = current_type;
10020 + new_nr = new_bios_entry; /* retain count for new bios entries */
10022 + /* copy new bios mapping into original location */
10023 + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
10024 + *pnr_map = new_nr;
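A hypothetical call, to make the sanitizer's behaviour concrete (entry values invented for illustration):

	struct e820entry map[E820MAX] = {
		{ .addr = 0x00000, .size = 0xa0000, .type = E820_RAM      },
		{ .addr = 0x90000, .size = 0x10000, .type = E820_RESERVED },
	};
	char nr = 2;

	sanitize_e820_map(map, &nr);
	/* result: 0x00000-0x90000 E820_RAM, 0x90000-0xa0000 E820_RESERVED;
	 * in the overlapping stretch the larger type value wins. */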
10030 + * Copy the BIOS e820 map into a safe place.
10032 + * Sanity-check it while we're at it..
10034 + * If we're lucky and live on a modern system, the setup code
10035 + * will have given us a memory map that we can use to properly
10036 + * set up memory. If we aren't, we'll fake a memory map.
10038 + * We check to see that the memory map contains at least 2 elements
10039 + * before we'll use it, because the detection code in setup.S may
10040 + * not be perfect and most every PC known to man has two memory
10041 + * regions: one from 0 to 640k, and one from 1mb up. (The IBM
10042 + * thinkpad 560x, for example, does not cooperate with the memory
10043 + * detection code.)
10045 +int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
10047 +#ifndef CONFIG_XEN
10048 + /* Only one memory region (or negative)? Ignore it */
10052 + BUG_ON(nr_map < 1);
10056 + unsigned long long start = biosmap->addr;
10057 + unsigned long long size = biosmap->size;
10058 + unsigned long long end = start + size;
10059 + unsigned long type = biosmap->type;
10061 + /* Overflow in 64 bits? Ignore the memory map. */
10065 +#ifndef CONFIG_XEN
10067 + * Some BIOSes claim RAM in the 640k - 1M region.
10068 + * Not right. Fix it up.
10070 + if (type == E820_RAM) {
10071 + if (start < 0x100000ULL && end > 0xA0000ULL) {
10072 + if (start < 0xA0000ULL)
10073 + add_memory_region(start, 0xA0000ULL-start, type);
10074 + if (end <= 0x100000ULL)
10076 + start = 0x100000ULL;
10077 + size = end - start;
10081 + add_memory_region(start, size, type);
10082 + } while (biosmap++,--nr_map);
10085 + if (is_initial_xendomain()) {
10086 + struct xen_memory_map memmap;
10088 + memmap.nr_entries = E820MAX;
10089 + set_xen_guest_handle(memmap.buffer, machine_e820.map);
10091 + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
10093 + machine_e820.nr_map = memmap.nr_entries;
10095 + machine_e820 = e820;
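A worked instance of the 640k-1M clipping above, with an invented BIOS entry:

	/* in:  RAM 0x090000 - 0x200000
	 * out: RAM 0x090000 - 0x0a0000   (kept: below the 640k boundary)
	 *      RAM 0x100000 - 0x200000   (kept: above 1MB)
	 * the 0xa0000 - 0x100000 stretch (legacy VGA/BIOS hole) is dropped. */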
10101 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
10103 +#ifdef CONFIG_EDD_MODULE
10104 +EXPORT_SYMBOL(edd);
10106 +#ifndef CONFIG_XEN
10108 + * copy_edd() - Copy the BIOS EDD information
10109 + * from boot_params into a safe place.
10112 +static inline void copy_edd(void)
10114 + memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
10115 + memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
10116 + edd.mbr_signature_nr = EDD_MBR_SIG_NR;
10117 + edd.edd_info_nr = EDD_NR;
10121 +static inline void copy_edd(void)
10126 +static void __init parse_cmdline_early (char ** cmdline_p)
10128 + char c = ' ', *to = command_line, *from = saved_command_line;
10129 + int len = 0, max_cmdline;
10132 + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
10133 + max_cmdline = COMMAND_LINE_SIZE;
10134 + memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
10135 + /* Save unparsed command line copy for /proc/cmdline */
10136 + saved_command_line[max_cmdline-1] = '\0';
10142 + * "mem=nopentium" disables the 4MB page tables.
10143 + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
10144 + * to <mem>, overriding the bios size.
10145 + * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
10146 + * <start> to <start>+<mem>, overriding the bios size.
10148 + * HPA tells me bootloaders need to parse mem=, so no new
10149 + * option should be mem= [also see Documentation/i386/boot.txt]
10151 + if (!memcmp(from, "mem=", 4)) {
10152 + if (to != command_line)
10154 + if (!memcmp(from+4, "nopentium", 9)) {
10156 + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
10159 + /* If the user specifies memory size, we
10160 + * limit the BIOS-provided memory map to
10161 + * that size. exactmap can be used to specify
10162 + * the exact map. mem=number can be used to
10163 + * trim the existing memory map.
10165 + unsigned long long mem_size;
10167 + mem_size = memparse(from+4, &from);
10168 + limit_regions(mem_size);
10173 + else if (!memcmp(from, "memmap=", 7)) {
10174 + if (to != command_line)
10176 + if (!memcmp(from+7, "exactmap", 8)) {
10177 +#ifdef CONFIG_CRASH_DUMP
10178 + /* If we are doing a crash dump, we
10179 + * still need to know the real mem
10180 + * size before original memory map is
10184 + saved_max_pfn = max_pfn;
10190 + /* If the user specifies memory size, we
10191 + * limit the BIOS-provided memory map to
10192 + * that size. exactmap can be used to specify
10193 + * the exact map. mem=number can be used to
10194 + * trim the existing memory map.
10196 + unsigned long long start_at, mem_size;
10198 + mem_size = memparse(from+7, &from);
10199 + if (*from == '@') {
10200 + start_at = memparse(from+1, &from);
10201 + add_memory_region(start_at, mem_size, E820_RAM);
10202 + } else if (*from == '#') {
10203 + start_at = memparse(from+1, &from);
10204 + add_memory_region(start_at, mem_size, E820_ACPI);
10205 + } else if (*from == '$') {
10206 + start_at = memparse(from+1, &from);
10207 + add_memory_region(start_at, mem_size, E820_RESERVED);
10209 + limit_regions(mem_size);
10215 + else if (!memcmp(from, "noexec=", 7))
10216 + noexec_setup(from + 7);
10219 +#ifdef CONFIG_X86_MPPARSE
10221 + * If the BIOS enumerates physical processors before logical,
10222 + * maxcpus=N at enumeration-time can be used to disable HT.
10224 + else if (!memcmp(from, "maxcpus=", 8)) {
10225 + extern unsigned int maxcpus;
10227 + maxcpus = simple_strtoul(from + 8, NULL, 0);
10231 +#ifdef CONFIG_ACPI
10232 + /* "acpi=off" disables both ACPI table parsing and interpreter */
10233 + else if (!memcmp(from, "acpi=off", 8)) {
10237 + /* acpi=force to over-ride black-list */
10238 + else if (!memcmp(from, "acpi=force", 10)) {
10241 + acpi_disabled = 0;
10244 + /* acpi=strict disables out-of-spec workarounds */
10245 + else if (!memcmp(from, "acpi=strict", 11)) {
10249 + /* Limit ACPI just to boot-time to enable HT */
10250 + else if (!memcmp(from, "acpi=ht", 7)) {
10256 + /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
10257 + else if (!memcmp(from, "pci=noacpi", 10)) {
10258 + acpi_disable_pci();
10260 + /* "acpi=noirq" disables ACPI interrupt routing */
10261 + else if (!memcmp(from, "acpi=noirq", 10)) {
10262 + acpi_noirq_set();
10265 + else if (!memcmp(from, "acpi_sci=edge", 13))
10266 + acpi_sci_flags.trigger = 1;
10268 + else if (!memcmp(from, "acpi_sci=level", 14))
10269 + acpi_sci_flags.trigger = 3;
10271 + else if (!memcmp(from, "acpi_sci=high", 13))
10272 + acpi_sci_flags.polarity = 1;
10274 + else if (!memcmp(from, "acpi_sci=low", 12))
10275 + acpi_sci_flags.polarity = 3;
10277 +#ifdef CONFIG_X86_IO_APIC
10278 + else if (!memcmp(from, "acpi_skip_timer_override", 24))
10279 + acpi_skip_timer_override = 1;
10281 + if (!memcmp(from, "disable_timer_pin_1", 19))
10282 + disable_timer_pin_1 = 1;
10283 + if (!memcmp(from, "enable_timer_pin_1", 18))
10284 + disable_timer_pin_1 = -1;
10286 + /* disable IO-APIC */
10287 + else if (!memcmp(from, "noapic", 6))
10288 + disable_ioapic_setup();
10289 +#endif /* CONFIG_X86_IO_APIC */
10290 +#endif /* CONFIG_ACPI */
10292 +#ifdef CONFIG_X86_LOCAL_APIC
10293 + /* enable local APIC */
10294 + else if (!memcmp(from, "lapic", 5))
10297 + /* disable local APIC */
10298 + else if (!memcmp(from, "nolapic", 6))
10300 +#endif /* CONFIG_X86_LOCAL_APIC */
10302 +#ifdef CONFIG_KEXEC
10303 + /* crashkernel=size@addr specifies the location to reserve for
10304 + * a crash kernel. By reserving this memory we guarantee
10305 + * that linux never sets it up as a DMA target.
10306 + * Useful for holding code to do something appropriate
10307 + * after a kernel panic.
10309 + else if (!memcmp(from, "crashkernel=", 12)) {
10310 +#ifndef CONFIG_XEN
10311 + unsigned long size, base;
10312 + size = memparse(from+12, &from);
10313 + if (*from == '@') {
10314 + base = memparse(from+1, &from);
10315 + /* FIXME: Do I want a sanity check
10316 + * to validate the memory range?
10318 + crashk_res.start = base;
10319 + crashk_res.end = base + size - 1;
10322 + printk("Ignoring crashkernel command line, "
10323 + "parameter will be supplied by xen\n");
10327 +#ifdef CONFIG_PROC_VMCORE
10328 + /* elfcorehdr= specifies the location of elf core header
10329 + * stored by the crashed kernel.
10331 + else if (!memcmp(from, "elfcorehdr=", 11))
10332 + elfcorehdr_addr = memparse(from+11, &from);
10336 + * highmem=size forces highmem to be exactly 'size' bytes.
10337 + * This works even on boxes that have no highmem otherwise.
10338 + * This also works to reduce highmem size on bigger boxes.
10340 + else if (!memcmp(from, "highmem=", 8))
10341 + highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
10344 + * vmalloc=size forces the vmalloc area to be exactly 'size'
10345 + * bytes. This can be used to increase (or decrease) the
10346 + * vmalloc area - the default is 128m.
10348 + else if (!memcmp(from, "vmalloc=", 8))
10349 + __VMALLOC_RESERVE = memparse(from+8, &from);
10355 + if (COMMAND_LINE_SIZE <= ++len)
10360 + *cmdline_p = command_line;
10362 + printk(KERN_INFO "user-defined physical RAM map:\n");
10363 + print_memory_map("user");
10368 + * Callback for efi_memory_walk.
10371 +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
10373 + unsigned long *max_pfn = arg, pfn;
10375 + if (start < end) {
10376 + pfn = PFN_UP(end -1);
10377 + if (pfn > *max_pfn)
10384 +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
10386 + memory_present(0, start, end);
10391 + * This function checks if any part of the range <start,end> is mapped
10395 +e820_any_mapped(u64 start, u64 end, unsigned type)
10399 +#ifndef CONFIG_XEN
10400 + for (i = 0; i < e820.nr_map; i++) {
10401 + const struct e820entry *ei = &e820.map[i];
10403 + if (!is_initial_xendomain())
10405 + for (i = 0; i < machine_e820.nr_map; ++i) {
10406 + const struct e820entry *ei = &machine_e820.map[i];
10409 + if (type && ei->type != type)
10411 + if (ei->addr >= end || ei->addr + ei->size <= start)
10417 +EXPORT_SYMBOL_GPL(e820_any_mapped);
10420 + * This function checks if the entire range <start,end> is mapped with type.
10422 + * Note: this function only works correctly if the e820 table is sorted and
10423 + * non-overlapping, which is the case
10426 +e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
10432 +#ifndef CONFIG_XEN
10433 + for (i = 0; i < e820.nr_map; i++) {
10434 + struct e820entry *ei = &e820.map[i];
10436 + if (!is_initial_xendomain())
10438 + for (i = 0; i < machine_e820.nr_map; ++i) {
10439 + const struct e820entry *ei = &machine_e820.map[i];
10441 + if (type && ei->type != type)
10443 + /* does the region (partly) overlap the current region? */
10444 + if (ei->addr >= end || ei->addr + ei->size <= start)
10446 + /* if the region is at the beginning of <start,end> we move
10447 + * start to the end of the region since it's ok until there
10449 + if (ei->addr <= start)
10450 + start = ei->addr + ei->size;
10451 + /* if start is now at or beyond end, we're done, full
10453 + if (start >= end)
10454 + return 1; /* we're done */
10460 + * Find the highest page frame number we have available
10462 +void __init find_max_pfn(void)
10467 + if (efi_enabled) {
10468 + efi_memmap_walk(efi_find_max_pfn, &max_pfn);
10469 + efi_memmap_walk(efi_memory_present_wrapper, NULL);
10473 + for (i = 0; i < e820.nr_map; i++) {
10474 + unsigned long start, end;
10476 + if (e820.map[i].type != E820_RAM)
10478 + start = PFN_UP(e820.map[i].addr);
10479 + end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
10480 + if (start >= end)
10482 + if (end > max_pfn)
10484 + memory_present(0, start, end);
10489 + * Determine low and high memory ranges:
10491 +unsigned long __init find_max_low_pfn(void)
10493 + unsigned long max_low_pfn;
10495 + max_low_pfn = max_pfn;
10496 + if (max_low_pfn > MAXMEM_PFN) {
10497 + if (highmem_pages == -1)
10498 + highmem_pages = max_pfn - MAXMEM_PFN;
10499 + if (highmem_pages + MAXMEM_PFN < max_pfn)
10500 + max_pfn = MAXMEM_PFN + highmem_pages;
10501 + if (highmem_pages + MAXMEM_PFN > max_pfn) {
10502 + printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
10503 + highmem_pages = 0;
10505 + max_low_pfn = MAXMEM_PFN;
10506 +#ifndef CONFIG_HIGHMEM
10507 + /* Maximum memory usable is what is directly addressable */
10508 + printk(KERN_WARNING "Warning only %ldMB will be used.\n",
10510 + if (max_pfn > MAX_NONPAE_PFN)
10511 + printk(KERN_WARNING "Use a PAE enabled kernel.\n");
10513 + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
10514 + max_pfn = MAXMEM_PFN;
10515 +#else /* !CONFIG_HIGHMEM */
10516 +#ifndef CONFIG_X86_PAE
10517 + if (max_pfn > MAX_NONPAE_PFN) {
10518 + max_pfn = MAX_NONPAE_PFN;
10519 + printk(KERN_WARNING "Warning only 4GB will be used.\n");
10520 + printk(KERN_WARNING "Use a PAE enabled kernel.\n");
10522 +#endif /* !CONFIG_X86_PAE */
10523 +#endif /* !CONFIG_HIGHMEM */
10525 + if (highmem_pages == -1)
10526 + highmem_pages = 0;
10527 +#ifdef CONFIG_HIGHMEM
10528 + if (highmem_pages >= max_pfn) {
10529 + printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
10530 + highmem_pages = 0;
10532 + if (highmem_pages) {
10533 + if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
10534 + printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
10535 + highmem_pages = 0;
10537 + max_low_pfn -= highmem_pages;
10540 + if (highmem_pages)
10541 + printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
10544 + return max_low_pfn;
10548 + * Free all available memory for boot time allocation. Used
10549 + * as a callback function by efi_memory_walk()
10553 +free_available_memory(unsigned long start, unsigned long end, void *arg)
10555 + /* check max_low_pfn */
10556 + if (start >= (max_low_pfn << PAGE_SHIFT))
10558 + if (end >= (max_low_pfn << PAGE_SHIFT))
10559 + end = max_low_pfn << PAGE_SHIFT;
10561 + free_bootmem(start, end - start);
10566 + * Register fully available low RAM pages with the bootmem allocator.
10568 +static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
10572 + if (efi_enabled) {
10573 + efi_memmap_walk(free_available_memory, NULL);
10576 + for (i = 0; i < e820.nr_map; i++) {
10577 + unsigned long curr_pfn, last_pfn, size;
10579 + * Reserve usable low memory
10581 + if (e820.map[i].type != E820_RAM)
10584 + * We are rounding up the start address of usable memory:
10586 + curr_pfn = PFN_UP(e820.map[i].addr);
10587 + if (curr_pfn >= max_low_pfn)
10590 + * ... and at the end of the usable range downwards:
10592 + last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
10596 + * Truncate to the number of actual pages currently
10599 + if (last_pfn > xen_start_info->nr_pages)
10600 + last_pfn = xen_start_info->nr_pages;
10603 + if (last_pfn > max_low_pfn)
10604 + last_pfn = max_low_pfn;
10607 + * .. finally, did all the rounding and playing
10608 + * around just make the area go away?
10610 + if (last_pfn <= curr_pfn)
10613 + size = last_pfn - curr_pfn;
10614 + free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
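The PFN_UP/PFN_DOWN pair above trims each entry to whole pages; with invented numbers:

	/* e820 RAM entry: addr = 0x100800, size = 0x1ff800 (end = 0x300000)
	 *   curr_pfn = PFN_UP(0x100800)   = 0x101  (start rounded up)
	 *   last_pfn = PFN_DOWN(0x300000) = 0x300  (end rounded down)
	 * => pages 0x101..0x2ff go to the bootmem allocator; the partial
	 *    page at pfn 0x100 is left out. */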
10618 +#ifndef CONFIG_XEN
10620 + * workaround for Dell systems that neglect to reserve EBDA
10622 +static void __init reserve_ebda_region(void)
10624 + unsigned int addr;
10625 + addr = get_bios_ebda();
10627 + reserve_bootmem(addr, PAGE_SIZE);
10631 +#ifndef CONFIG_NEED_MULTIPLE_NODES
10632 +void __init setup_bootmem_allocator(void);
10633 +static unsigned long __init setup_memory(void)
10636 + * partially used pages are not usable - thus
10637 + * we are rounding upwards:
10639 + min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
10640 + xen_start_info->nr_pt_frames;
10644 + max_low_pfn = find_max_low_pfn();
10646 +#ifdef CONFIG_HIGHMEM
10647 + highstart_pfn = highend_pfn = max_pfn;
10648 + if (max_pfn > max_low_pfn) {
10649 + highstart_pfn = max_low_pfn;
10651 + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
10652 + pages_to_mb(highend_pfn - highstart_pfn));
10654 + printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
10655 + pages_to_mb(max_low_pfn));
10657 + setup_bootmem_allocator();
10659 + return max_low_pfn;
10662 +void __init zone_sizes_init(void)
10664 + unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
10665 + unsigned int max_dma, low;
10667 + max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
10668 + low = max_low_pfn;
10670 + if (low < max_dma)
10671 + zones_size[ZONE_DMA] = low;
10673 + zones_size[ZONE_DMA] = max_dma;
10674 + zones_size[ZONE_NORMAL] = low - max_dma;
10675 +#ifdef CONFIG_HIGHMEM
10676 + zones_size[ZONE_HIGHMEM] = highend_pfn - low;
10679 + free_area_init(zones_size);
10682 +extern unsigned long __init setup_memory(void);
10683 +extern void zone_sizes_init(void);
10684 +#endif /* !CONFIG_NEED_MULTIPLE_NODES */
10686 +void __init setup_bootmem_allocator(void)
10688 + unsigned long bootmap_size;
10690 + * Initialize the boot-time allocator (with low memory only):
10692 + bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
10694 + register_bootmem_low_pages(max_low_pfn);
10697 + * Reserve the bootmem bitmap itself as well. We do this in two
10698 + * steps (first step was init_bootmem()) because this catches
10699 + * the (very unlikely) case of us accidentally initializing the
10700 + * bootmem allocator with an invalid RAM area.
10702 + reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
10703 + bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START));
10705 +#ifndef CONFIG_XEN
10707 + * reserve physical page 0 - it's a special BIOS page on many boxes,
10708 + * enabling clean reboots, SMP operation, laptop functions.
10710 + reserve_bootmem(0, PAGE_SIZE);
10712 + /* reserve EBDA region, it's a 4K region */
10713 + reserve_ebda_region();
10715 + /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
10716 + PCI prefetch into it (errata #56). Usually the page is reserved anyway,
10717 + unless you have no PS/2 mouse plugged in. */
10718 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
10719 + boot_cpu_data.x86 == 6)
10720 + reserve_bootmem(0xa0000 - 4096, 4096);
10724 + * But first pinch a few for the stack/trampoline stuff
10725 + * FIXME: Don't need the extra page at 4K, but need to fix
10726 + * trampoline before removing it. (see the GDT stuff)
10728 + reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
10730 +#ifdef CONFIG_ACPI_SLEEP
10732 + * Reserve low memory region for sleep support.
10734 + acpi_reserve_bootmem();
10736 +#endif /* !CONFIG_XEN */
10738 +#ifdef CONFIG_BLK_DEV_INITRD
10739 + if (xen_start_info->mod_start) {
10740 + if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
10741 + /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/
10742 + initrd_start = INITRD_START + PAGE_OFFSET;
10743 + initrd_end = initrd_start+INITRD_SIZE;
10744 + initrd_below_start_ok = 1;
10747 + printk(KERN_ERR "initrd extends beyond end of memory "
10748 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
10749 + INITRD_START + INITRD_SIZE,
10750 + max_low_pfn << PAGE_SHIFT);
10751 + initrd_start = 0;
10755 +#ifdef CONFIG_KEXEC
10757 + xen_machine_kexec_setup_resources();
10759 + if (crashk_res.start != crashk_res.end)
10760 + reserve_bootmem(crashk_res.start,
10761 + crashk_res.end - crashk_res.start + 1);
10767 + * The node 0 pgdat is initialized before all of these because
10768 + * it's needed for bootmem. node>0 pgdats have their virtual
10769 + * space allocated before the pagetables are in place to access
10770 + * them, so they can't be cleared then.
10772 + * This should all compile down to nothing when NUMA is off.
10774 +void __init remapped_pgdat_init(void)
10778 + for_each_online_node(nid) {
10780 + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
10785 + * Request address space for all standard RAM and ROM resources
10786 + * and also for regions reported as reserved by the e820.
10788 +static void __init
10789 +legacy_init_iomem_resources(struct e820entry *e820, int nr_map,
10790 + struct resource *code_resource,
10791 + struct resource *data_resource)
10797 + for (i = 0; i < nr_map; i++) {
10798 + struct resource *res;
10799 +#ifndef CONFIG_RESOURCES_64BIT
10800 + if (e820[i].addr + e820[i].size > 0x100000000ULL)
10803 + res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
10804 + switch (e820[i].type) {
10805 + case E820_RAM: res->name = "System RAM"; break;
10806 + case E820_ACPI: res->name = "ACPI Tables"; break;
10807 + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
10808 + default: res->name = "reserved";
10810 + res->start = e820[i].addr;
10811 + res->end = res->start + e820[i].size - 1;
10812 + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
10813 + if (request_resource(&iomem_resource, res)) {
10817 + if (e820[i].type == E820_RAM) {
10819 + * We don't know which RAM region contains kernel data,
10820 + * so we try it repeatedly and let the resource manager
10823 +#ifndef CONFIG_XEN
10824 + request_resource(res, code_resource);
10825 + request_resource(res, data_resource);
10827 +#ifdef CONFIG_KEXEC
10828 + if (crashk_res.start != crashk_res.end)
10829 + request_resource(res, &crashk_res);
10831 + xen_machine_kexec_register_resources(res);
9839 + * Locate an unused range of the physical address space below 4G which
10840 + * can be used for PCI mappings.
10842 +static void __init
10843 +e820_setup_gap(struct e820entry *e820, int nr_map)
10845 + unsigned long gapstart, gapsize, round;
10846 + unsigned long long last;
10850 + * Search for the biggest gap in the low 32 bits of the e820
10853 + last = 0x100000000ull;
10854 + gapstart = 0x10000000;
10855 + gapsize = 0x400000;
10857 + while (--i >= 0) {
10858 + unsigned long long start = e820[i].addr;
10859 + unsigned long long end = start + e820[i].size;
10862 + * Since "last" is at most 4GB, we know we'll
10863 + * fit in 32 bits if this condition is true
10865 + if (last > end) {
10866 + unsigned long gap = last - end;
10868 + if (gap > gapsize) {
10873 + if (start < last)
10878 + * See how much we want to round up: start off with
10879 + * rounding to the next 1MB area.
10881 + round = 0x100000;
10882 + while ((gapsize >> 4) > round)
10884 + /* Fun with two's complement */
10885 + pci_mem_start = (gapstart + round) & -round;
10887 + printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
10888 + pci_mem_start, gapstart, gapsize);
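With invented numbers, and assuming the elided loop body doubles round each pass: gapstart = 0x1f400000 and gapsize = 0x08000000 give gapsize >> 4 = 0x800000, so round grows 0x100000 -> 0x200000 -> 0x400000 -> 0x800000 and stops; pci_mem_start = (0x1f400000 + 0x800000) & -0x800000 = 0x1f800000, i.e. the gap start rounded up to an 8MB boundary.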
10892 + * Request address space for all standard resources
10894 + * This is called just before pcibios_init(), which is also a
10895 + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
10897 +static int __init request_standard_resources(void)
10901 + /* Nothing to do if not running in dom0. */
10902 + if (!is_initial_xendomain())
10905 + printk("Setting up standard PCI resources\n");
10907 + legacy_init_iomem_resources(machine_e820.map, machine_e820.nr_map,
10908 + &code_resource, &data_resource);
10911 + efi_initialize_iomem_resources(&code_resource, &data_resource);
10913 + legacy_init_iomem_resources(e820.map, e820.nr_map,
10914 + &code_resource, &data_resource);
10917 + /* EFI systems may still have VGA */
10918 + request_resource(&iomem_resource, &video_ram_resource);
10920 + /* request I/O space for devices used on all i[345]86 PCs */
10921 + for (i = 0; i < STANDARD_IO_RESOURCES; i++)
10922 + request_resource(&ioport_resource, &standard_io_resources[i]);
10926 +subsys_initcall(request_standard_resources);
10928 +static void __init register_memory(void)
10931 + if (is_initial_xendomain())
10932 + e820_setup_gap(machine_e820.map, machine_e820.nr_map);
10935 + e820_setup_gap(e820.map, e820.nr_map);
10939 +static void set_mca_bus(int x)
10944 +static void set_mca_bus(int x) { }
10948 + * Determine if we were loaded by an EFI loader. If so, then we have also been
10949 + * passed the efi memmap, systab, etc., so we should use these data structures
10950 + * for initialization. Note, the efi init code path is determined by the
10951 + * global efi_enabled. This allows the same kernel image to be used on existing
10952 + * systems (with a traditional BIOS) as well as on EFI systems.
10954 +void __init setup_arch(char **cmdline_p)
10956 + int i, j, k, fpp;
10957 + struct physdev_set_iopl set_iopl;
10958 + unsigned long max_low_pfn;
10959 + unsigned long p2m_pages;
10961 + /* Force a quick death if the kernel panics (not domain 0). */
10962 + extern int panic_timeout;
10963 + if (!panic_timeout && !is_initial_xendomain())
10964 + panic_timeout = 1;
10966 + /* Register a call for panic conditions. */
10967 + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
10969 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
10970 + VMASST_TYPE_4gb_segments));
10971 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
10972 + VMASST_TYPE_writable_pagetables));
10974 + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
10975 + pre_setup_arch_hook();
10976 + early_cpu_init();
10978 + prefill_possible_map();
10982 + * FIXME: This isn't an official loader_type right
10983 + * now but does currently work with elilo.
10984 + * If we were configured as an EFI kernel, check to make
10985 + * sure that we were loaded correctly from elilo and that
10986 + * the system table is valid. If not, then initialize normally.
10989 + if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
10993 + /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
10994 + properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
10996 + ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
10997 + drive_info = DRIVE_INFO;
10998 + screen_info = SCREEN_INFO;
11000 + apm_info.bios = APM_BIOS_INFO;
11001 + ist_info = IST_INFO;
11002 + saved_videomode = VIDEO_MODE;
11003 + if( SYS_DESC_TABLE.length != 0 ) {
11004 + set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
11005 + machine_id = SYS_DESC_TABLE.table[0];
11006 + machine_submodel_id = SYS_DESC_TABLE.table[1];
11007 + BIOS_revision = SYS_DESC_TABLE.table[2];
11009 + bootloader_type = LOADER_TYPE;
11011 + if (is_initial_xendomain()) {
11012 + const struct dom0_vga_console_info *info =
11013 + (void *)((char *)xen_start_info +
11014 + xen_start_info->console.dom0.info_off);
11016 + dom0_init_screen_info(info,
11017 + xen_start_info->console.dom0.info_size);
11018 + xen_start_info->console.domU.mfn = 0;
11019 + xen_start_info->console.domU.evtchn = 0;
11021 + screen_info.orig_video_isVGA = 0;
11023 +#ifdef CONFIG_BLK_DEV_RAM
11024 + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
11025 + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
11026 + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
11033 + printk(KERN_INFO "BIOS-provided physical RAM map:\n");
11034 + print_memory_map(machine_specific_memory_setup());
11039 + if (!MOUNT_ROOT_RDONLY)
11040 + root_mountflags &= ~MS_RDONLY;
11041 + init_mm.start_code = (unsigned long) _text;
11042 + init_mm.end_code = (unsigned long) _etext;
11043 + init_mm.end_data = (unsigned long) _edata;
11044 + init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
11045 + xen_start_info->nr_pt_frames) << PAGE_SHIFT;
11047 + code_resource.start = virt_to_phys(_text);
11048 + code_resource.end = virt_to_phys(_etext)-1;
11049 + data_resource.start = virt_to_phys(_etext);
11050 + data_resource.end = virt_to_phys(_edata)-1;
11052 + parse_cmdline_early(cmdline_p);
11054 +#ifdef CONFIG_EARLY_PRINTK
11056 + char *s = strstr(*cmdline_p, "earlyprintk=");
11058 + setup_early_printk(strchr(s, '=') + 1);
11059 + printk("early console enabled\n");
11064 + max_low_pfn = setup_memory();
11067 + * NOTE: before this point _nobody_ is allowed to allocate
11068 + * any memory using the bootmem allocator. Although the
11069 + * allocator is now initialised, only the first 8Mb of the kernel
11070 + * virtual address space has been mapped. All allocations made before
11071 + * paging_init() completes must use the alloc_bootmem_low_pages()
11072 + * variant (which allocates DMA'able memory) and care must be taken
11073 + * not to exceed the 8Mb limit.
11077 + smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
11080 + remapped_pgdat_init();
11082 + zone_sizes_init();
11084 +#ifdef CONFIG_X86_FIND_SMP_CONFIG
11086 + * Find and reserve possible boot-time SMP configuration:
11088 + find_smp_config();
11091 + p2m_pages = max_pfn;
11092 + if (xen_start_info->nr_pages > max_pfn) {
11094 + * the max_pfn was shrunk (probably by mem= or highmem=
11095 + * kernel parameter); shrink reservation with the HV
11097 + struct xen_memory_reservation reservation = {
11098 + .address_bits = 0,
11099 + .extent_order = 0,
11100 + .domid = DOMID_SELF
11102 + unsigned int difference;
11105 + difference = xen_start_info->nr_pages - max_pfn;
11107 + set_xen_guest_handle(reservation.extent_start,
11108 + ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
11109 + reservation.nr_extents = difference;
11110 + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
11112 + BUG_ON (ret != difference);
11114 + else if (max_pfn > xen_start_info->nr_pages)
11115 + p2m_pages = xen_start_info->nr_pages;
11117 + /* Make sure we have a correctly sized P->M table. */
11118 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
11119 + phys_to_machine_mapping = alloc_bootmem_low_pages(
11120 + max_pfn * sizeof(unsigned long));
11121 + memset(phys_to_machine_mapping, ~0,
11122 + max_pfn * sizeof(unsigned long));
11123 + memcpy(phys_to_machine_mapping,
11124 + (unsigned long *)xen_start_info->mfn_list,
11125 + p2m_pages * sizeof(unsigned long));
11127 + __pa(xen_start_info->mfn_list),
11128 + PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
11129 + sizeof(unsigned long))));
11132 + * Initialise the list of the frames that specify the list of
11133 + * frames that make up the p2m table. Used by save/restore
11135 + pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
11137 + fpp = PAGE_SIZE/sizeof(unsigned long);
11138 + for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
11139 + if ((j % fpp) == 0) {
11142 + pfn_to_mfn_frame_list[k] =
11143 + alloc_bootmem_low_pages(PAGE_SIZE);
11144 + pfn_to_mfn_frame_list_list[k] =
11145 + virt_to_mfn(pfn_to_mfn_frame_list[k]);
11148 + pfn_to_mfn_frame_list[k][j] =
11149 + virt_to_mfn(&phys_to_machine_mapping[i]);
11151 + HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
11152 + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
11153 + virt_to_mfn(pfn_to_mfn_frame_list_list);
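The sizing behind the two-level list is easier to see spelled out (an illustration assuming i386 with 4 KiB pages and 4-byte longs; FPP, p2m_frames and list_frames are names introduced here, not from the patch):

	#define FPP (PAGE_SIZE / sizeof(unsigned long))         /* 1024 on i386 */
	unsigned long p2m_frames  = (max_pfn + FPP - 1) / FPP;      /* p2m pages  */
	unsigned long list_frames = (p2m_frames + FPP - 1) / FPP;   /* list pages */
	/* one frame-list page covers FPP * FPP pfns (4 GiB of guest memory);
	 * the single frame_list_list page can index up to FPP such pages */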
11156 + /* Mark all ISA DMA channels in-use - using them wouldn't work. */
11157 + for (i = 0; i < MAX_DMA_CHANNELS; ++i)
11158 + if (i != 4 && request_dma(i, "xen") != 0)
11162 + * NOTE: at this point the bootmem allocator is fully available.
11165 + if (is_initial_xendomain())
11166 + dmi_scan_machine();
11168 +#ifdef CONFIG_X86_GENERICARCH
11169 + generic_apic_probe(*cmdline_p);
11172 + efi_map_memmap();
11174 + set_iopl.iopl = 1;
11175 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
11177 +#ifdef CONFIG_ACPI
11178 + if (!is_initial_xendomain()) {
11179 + printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
11180 + acpi_disabled = 1;
11185 + * Parse the ACPI tables for possible boot-time SMP configuration.
11187 + acpi_boot_table_init();
11190 +#ifdef CONFIG_X86_IO_APIC
11191 + check_acpi_pci(); /* Checks more than just ACPI actually */
11194 +#ifdef CONFIG_ACPI
11195 + acpi_boot_init();
11197 +#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
11198 + if (def_to_bigsmp)
11199 + printk(KERN_WARNING "More than 8 CPUs detected and "
11200 + "CONFIG_X86_PC cannot handle it.\nUse "
11201 + "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
11204 +#ifdef CONFIG_X86_LOCAL_APIC
11205 + if (smp_found_config)
11206 + get_smp_config();
11209 + register_memory();
11211 + if (is_initial_xendomain()) {
11213 +#if defined(CONFIG_VGA_CONSOLE)
11214 + if (!efi_enabled ||
11215 + (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
11216 + conswitchp = &vga_con;
11217 +#elif defined(CONFIG_DUMMY_CONSOLE)
11218 + conswitchp = &dummy_con;
11222 +#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
11223 + conswitchp = &dummy_con;
11230 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
11232 + HYPERVISOR_shutdown(SHUTDOWN_crash);
11233 + /* we're never actually going to get here... */
11234 + return NOTIFY_DONE;
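For context, a hedged sketch of how such a handler is wired into the panic notifier chain (the actual registration lives in elided parts of this file):

	static struct notifier_block xen_panic_block = {
		.notifier_call = xen_panic_event,
		.next = NULL,
		.priority = 0
	};

	/* during early setup: */
	atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);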
11237 +static __init int add_pcspkr(void)
11239 + struct platform_device *pd;
11242 + if (!is_initial_xendomain())
11245 + pd = platform_device_alloc("pcspkr", -1);
11249 + ret = platform_device_add(pd);
11251 + platform_device_put(pd);
11255 +device_initcall(add_pcspkr);
11258 + * Local Variables:
11260 + * c-file-style:"k&r"
11261 + * c-basic-offset:8
11264 Index: head-2008-11-25/arch/x86/kernel/smp_32-xen.c
11265 ===================================================================
11266 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
11267 +++ head-2008-11-25/arch/x86/kernel/smp_32-xen.c 2007-12-10 08:47:31.000000000 +0100
11270 + * Intel SMP support routines.
11272 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
11273 + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
11275 + * This code is released under the GNU General Public License version 2 or
11279 +#include <linux/init.h>
11281 +#include <linux/mm.h>
11282 +#include <linux/delay.h>
11283 +#include <linux/spinlock.h>
11284 +#include <linux/smp_lock.h>
11285 +#include <linux/kernel_stat.h>
11286 +#include <linux/mc146818rtc.h>
11287 +#include <linux/cache.h>
11288 +#include <linux/interrupt.h>
11289 +#include <linux/cpu.h>
11290 +#include <linux/module.h>
11292 +#include <asm/mtrr.h>
11293 +#include <asm/tlbflush.h>
11295 +#include <mach_apic.h>
11297 +#include <xen/evtchn.h>
11300 + * Some notes on x86 processor bugs affecting SMP operation:
11302 + * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
11303 + * The Linux implications for SMP are handled as follows:
11305 + * Pentium III / [Xeon]
11306 + * None of the E1AP-E3AP errata are visible to the user.
11308 + * E1AP. see PII A1AP
11309 + * E2AP. see PII A2AP
11310 + * E3AP. see PII A3AP
11312 + * Pentium II / [Xeon]
11313 + * None of the A1AP-A3AP errata are visible to the user.
11315 + * A1AP. see PPro 1AP
11316 + * A2AP. see PPro 2AP
11317 + * A3AP. see PPro 7AP
11320 + * None of the 1AP-9AP errata are visible to the normal user,
11321 + * except occasional delivery of 'spurious interrupt' as trap #15.
11322 + * This is very rare and a non-problem.
11324 + * 1AP. Linux maps APIC as non-cacheable
11325 + * 2AP. worked around in hardware
11326 + * 3AP. fixed in C0 and above steppings microcode update.
11327 + * Linux does not use excessive STARTUP_IPIs.
11328 + * 4AP. worked around in hardware
11329 + * 5AP. symmetric IO mode (normal Linux operation) not affected.
11330 + * 'noapic' mode has vector 0xf filled out properly.
11331 + * 6AP. 'noapic' mode might be affected - fixed in later steppings
11332 + * 7AP. We do not assume writes to the LVT deasserting IRQs
11333 + * 8AP. We do not enable low power mode (deep sleep) during MP bootup
11334 + * 9AP. We do not use mixed mode
11337 + * There is a marginal case where REP MOVS on 100MHz SMP
11338 + * machines with B stepping processors can fail. XXX should provide
11339 + * an L1cache=Writethrough or L1cache=off option.
11341 + * B stepping CPUs may hang. There are hardware workarounds
11342 + * for this. We warn about it in case your board doesn't have the
11343 + * workarounds. Basically that's so I can tell anyone with a B stepping
11344 + * CPU and SMP problems "tough".
11346 + * Specific items [From Pentium Processor Specification Update]
11348 + * 1AP. Linux doesn't use remote read
11349 + * 2AP. Linux doesn't trust APIC errors
11350 + * 3AP. We work around this
11351 + * 4AP. Linux never generated 3 interrupts of the same priority
11352 + * to cause a lost local interrupt.
11353 + * 5AP. Remote read is never used
11354 + * 6AP. not affected - worked around in hardware
11355 + * 7AP. not affected - worked around in hardware
11356 + * 8AP. worked around in hardware - we get explicit CS errors if not
11357 + * 9AP. only 'noapic' mode affected. Might generate spurious
11358 + * interrupts, we log only the first one and count the
11360 + * 10AP. not affected - worked around in hardware
11361 + * 11AP. Linux reads the APIC between writes to avoid this, as per
11362 + * the documentation. Make sure you preserve this as it affects
11363 + * the C stepping chips too.
11364 + * 12AP. not affected - worked around in hardware
11365 + * 13AP. not affected - worked around in hardware
11366 + * 14AP. we always deassert INIT during bootup
11367 + * 15AP. not affected - worked around in hardware
11368 + * 16AP. not affected - worked around in hardware
11369 + * 17AP. not affected - worked around in hardware
11370 + * 18AP. not affected - worked around in hardware
11371 + * 19AP. not affected - worked around in BIOS
11373 + * If this sounds worrying, believe me these bugs are either ___RARE___
11374 + * or are signal timing bugs worked around in hardware, and there's
11375 + * nothing of note from the C stepping upwards.
11378 +DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
11381 + * the following functions deal with sending IPIs between CPUs.
11383 + * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
11386 +static inline int __prepare_ICR (unsigned int shortcut, int vector)
11388 + unsigned int icr = shortcut | APIC_DEST_LOGICAL;
11390 + switch (vector) {
11392 + icr |= APIC_DM_FIXED | vector;
11395 + icr |= APIC_DM_NMI;
11401 +static inline int __prepare_ICR2 (unsigned int mask)
11403 + return SET_APIC_DEST_FIELD(mask);
11406 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
11408 +static inline void __send_IPI_one(unsigned int cpu, int vector)
11410 + int irq = per_cpu(ipi_to_irq, cpu)[vector];
11412 + notify_remote_via_irq(irq);
11415 +void __send_IPI_shortcut(unsigned int shortcut, int vector)
11419 + switch (shortcut) {
11420 + case APIC_DEST_SELF:
11421 + __send_IPI_one(smp_processor_id(), vector);
11423 + case APIC_DEST_ALLBUT:
11424 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
11425 + if (cpu == smp_processor_id())
11427 + if (cpu_isset(cpu, cpu_online_map)) {
11428 + __send_IPI_one(cpu, vector);
11433 + printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
11439 +void fastcall send_IPI_self(int vector)
11441 + __send_IPI_shortcut(APIC_DEST_SELF, vector);
11445 + * This is only used on smaller machines.
11447 +void send_IPI_mask_bitmask(cpumask_t mask, int vector)
11449 + unsigned long flags;
11450 + unsigned int cpu;
11452 + local_irq_save(flags);
11453 + WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
11455 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
11456 + if (cpu_isset(cpu, mask)) {
11457 + __send_IPI_one(cpu, vector);
11461 + local_irq_restore(flags);
11464 +void send_IPI_mask_sequence(cpumask_t mask, int vector)
11467 + send_IPI_mask_bitmask(mask, vector);
11470 +#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
11474 + * Smarter SMP flushing macros.
11475 + * c/o Linus Torvalds.
11477 + * These mean you can really definitely utterly forget about
11478 + * writing to user space from interrupts. (It's not allowed anyway).
11480 + * Optimizations Manfred Spraul <manfred@colorfullife.com>
11483 +static cpumask_t flush_cpumask;
11484 +static struct mm_struct * flush_mm;
11485 +static unsigned long flush_va;
11486 +static DEFINE_SPINLOCK(tlbstate_lock);
11487 +#define FLUSH_ALL 0xffffffff
11490 + * We cannot call mmdrop() because we are in interrupt context,
11491 + * instead update mm->cpu_vm_mask.
11493 + * We need to reload %cr3 since the page tables may be going
11494 + * away from under us..
11496 +static inline void leave_mm (unsigned long cpu)
11498 + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
11500 + cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
11501 + load_cr3(swapper_pg_dir);
11506 + * The flush IPI assumes that a thread switch happens in this order:
11507 + * [cpu0: the cpu that switches]
11508 + * 1) switch_mm() either 1a) or 1b)
11509 + * 1a) thread switch to a different mm
11510 + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
11511 + * Stop ipi delivery for the old mm. This is not synchronized with
11512 + * the other cpus, but smp_invalidate_interrupt ignores flush ipis
11513 + * for the wrong mm, and in the worst case we perform a superfluous
11515 + * 1a2) set cpu_tlbstate to TLBSTATE_OK
11516 + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
11517 + * was in lazy tlb mode.
11518 + * 1a3) update cpu_tlbstate[].active_mm
11519 + * Now cpu0 accepts tlb flushes for the new mm.
11520 + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
11521 + * Now the other cpus will send tlb flush ipis.
11522 + * 1a4) change cr3.
11523 + * 1b) thread switch without mm change
11524 + * cpu_tlbstate[].active_mm is correct, cpu0 already handles
11526 + * 1b1) set cpu_tlbstate to TLBSTATE_OK
11527 + * 1b2) test_and_set the cpu bit in cpu_vm_mask.
11528 + * Atomically set the bit [other cpus will start sending flush ipis],
11529 + * and test the bit.
11530 + * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
11531 + * 2) switch %%esp, i.e. current
11533 + * The interrupt must handle 2 special cases:
11534 + * - cr3 is changed before %%esp, i.e. it cannot use current->{active_,}mm.
11535 + * - the cpu performs speculative tlb reads, i.e. even if the cpu only
11536 + * runs in kernel space, the cpu could load tlb entries for user space
11539 + * The good news is that cpu_tlbstate is local to each cpu, no
11540 + * write/read ordering problems.
11546 + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
11547 + * 2) Leave the mm if we are in the lazy tlb mode.
11550 +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
11551 + struct pt_regs *regs)
11553 + unsigned long cpu;
11557 + if (!cpu_isset(cpu, flush_cpumask))
11560 + * This was a BUG() but until someone can quote me the
11561 + * line from the intel manual that guarantees an IPI to
11562 + * multiple CPUs is retried _only_ on the erroring CPUs
11563 + * it's staying as a return
11568 + if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
11569 + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
11570 + if (flush_va == FLUSH_ALL)
11571 + local_flush_tlb();
11573 + __flush_tlb_one(flush_va);
11577 + smp_mb__before_clear_bit();
11578 + cpu_clear(cpu, flush_cpumask);
11579 + smp_mb__after_clear_bit();
11581 + put_cpu_no_resched();
11583 + return IRQ_HANDLED;
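The ordering contract documented above is easiest to see in code. A minimal sketch of the 1b path (thread switch without an mm change), written as if inside switch_mm(prev, next, cpu); the real switch_mm() lives in the mmu_context headers, so this is illustrative only:

	per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;        /* 1b1 */
	if (!cpu_test_and_set(cpu, next->cpu_vm_mask))         /* 1b2 */
		/* bit was clear: leave_mm() ran here, our TLB is stale */
		load_cr3(next->pgd);                           /* 1b3 */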
11586 +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
11587 + unsigned long va)
11590 + * A couple of (to be removed) sanity checks:
11592 + * - current CPU must not be in mask
11593 + * - mask must exist :)
11595 + BUG_ON(cpus_empty(cpumask));
11596 + BUG_ON(cpu_isset(smp_processor_id(), cpumask));
11599 + /* If a CPU which we ran on has gone down, OK. */
11600 + cpus_and(cpumask, cpumask, cpu_online_map);
11601 + if (cpus_empty(cpumask))
11605 + * I'm not happy about this global shared spinlock in the
11606 + * MM hot path, but we'll see how contended it is.
11607 + * Temporarily this turns IRQs off, so that lockups are
11608 + * detected by the NMI watchdog.
11610 + spin_lock(&tlbstate_lock);
11614 +#if NR_CPUS <= BITS_PER_LONG
11615 + atomic_set_mask(cpumask, &flush_cpumask);
11619 + unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
11620 + unsigned long *cpu_mask = (unsigned long *)&cpumask;
11621 + for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
11622 + atomic_set_mask(cpu_mask[k], &flush_mask[k]);
11626 + * We have to send the IPI only to
11629 + send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
11631 + while (!cpus_empty(flush_cpumask))
11632 + /* nothing. lockup detection does not belong here */
11637 + spin_unlock(&tlbstate_lock);
11640 +void flush_tlb_current_task(void)
11642 + struct mm_struct *mm = current->mm;
11643 + cpumask_t cpu_mask;
11645 + preempt_disable();
11646 + cpu_mask = mm->cpu_vm_mask;
11647 + cpu_clear(smp_processor_id(), cpu_mask);
11649 + local_flush_tlb();
11650 + if (!cpus_empty(cpu_mask))
11651 + flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
11652 + preempt_enable();
11655 +void flush_tlb_mm (struct mm_struct * mm)
11657 + cpumask_t cpu_mask;
11659 + preempt_disable();
11660 + cpu_mask = mm->cpu_vm_mask;
11661 + cpu_clear(smp_processor_id(), cpu_mask);
11663 + if (current->active_mm == mm) {
11665 + local_flush_tlb();
11667 + leave_mm(smp_processor_id());
11669 + if (!cpus_empty(cpu_mask))
11670 + flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
11672 + preempt_enable();
11675 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
11677 + struct mm_struct *mm = vma->vm_mm;
11678 + cpumask_t cpu_mask;
11680 + preempt_disable();
11681 + cpu_mask = mm->cpu_vm_mask;
11682 + cpu_clear(smp_processor_id(), cpu_mask);
11684 + if (current->active_mm == mm) {
11686 + __flush_tlb_one(va);
11688 + leave_mm(smp_processor_id());
11691 + if (!cpus_empty(cpu_mask))
11692 + flush_tlb_others(cpu_mask, mm, va);
11694 + preempt_enable();
11696 +EXPORT_SYMBOL(flush_tlb_page);
11698 +static void do_flush_tlb_all(void* info)
11700 + unsigned long cpu = smp_processor_id();
11702 + __flush_tlb_all();
11703 + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
11707 +void flush_tlb_all(void)
11709 + on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
11715 + * this function sends a 'reschedule' IPI to another CPU.
11716 + * it goes straight through and wastes no time serializing
11717 + * anything. Worst case is that we lose a reschedule ...
11719 +void smp_send_reschedule(int cpu)
11721 + WARN_ON(cpu_is_offline(cpu));
11722 + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
11726 + * Structure and data for smp_call_function(). This is designed to minimise
11727 + * static memory requirements. It also looks cleaner.
11729 +static DEFINE_SPINLOCK(call_lock);
11731 +struct call_data_struct {
11732 + void (*func) (void *info);
11734 + atomic_t started;
11735 + atomic_t finished;
11739 +void lock_ipi_call_lock(void)
11741 + spin_lock_irq(&call_lock);
11744 +void unlock_ipi_call_lock(void)
11746 + spin_unlock_irq(&call_lock);
11749 +static struct call_data_struct *call_data;
11752 + * smp_call_function(): Run a function on all other CPUs.
11753 + * @func: The function to run. This must be fast and non-blocking.
11754 + * @info: An arbitrary pointer to pass to the function.
11755 + * @nonatomic: currently unused.
11756 + * @wait: If true, wait (atomically) until function has completed on other CPUs.
11758 + * Returns 0 on success, else a negative status code. Does not return until
11759 + * remote CPUs are nearly ready to execute <<func>>, or have already executed it.
11761 + * You must not call this function with disabled interrupts or from a
11762 + * hardware interrupt handler or from a bottom half handler.
11764 +int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
11767 + struct call_data_struct data;
11770 + /* Holding any lock stops cpus from going down. */
11771 + spin_lock(&call_lock);
11772 + cpus = num_online_cpus() - 1;
11774 + spin_unlock(&call_lock);
11778 + /* Can deadlock when called with interrupts disabled */
11779 + WARN_ON(irqs_disabled());
11781 + data.func = func;
11782 + data.info = info;
11783 + atomic_set(&data.started, 0);
11784 + data.wait = wait;
11786 + atomic_set(&data.finished, 0);
11788 + call_data = &data;
11791 + /* Send a message to all other CPUs and wait for them to respond */
11792 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
11794 + /* Wait for response */
11795 + while (atomic_read(&data.started) != cpus)
11799 + while (atomic_read(&data.finished) != cpus)
11801 + spin_unlock(&call_lock);
11805 +EXPORT_SYMBOL(smp_call_function);
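A hypothetical caller, to make the documented contract concrete (probe_one_cpu and probe_count are names invented for this sketch):

	static atomic_t probe_count = ATOMIC_INIT(0);

	static void probe_one_cpu(void *info)
	{
		atomic_inc((atomic_t *)info);  /* runs on each remote CPU */
	}

	/* from process context, interrupts enabled: */
	smp_call_function(probe_one_cpu, &probe_count, 0, 1);

With wait=1 the caller spins until every remote CPU has finished running the function.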
11807 +static void stop_this_cpu (void * dummy)
11810 + * Remove this CPU:
11812 + cpu_clear(smp_processor_id(), cpu_online_map);
11813 + local_irq_disable();
11814 + disable_all_local_evtchn();
11815 + if (cpu_data[smp_processor_id()].hlt_works_ok)
11821 + * this function calls the 'stop' function on all other CPUs in the system.
11824 +void smp_send_stop(void)
11826 + smp_call_function(stop_this_cpu, NULL, 1, 0);
11828 + local_irq_disable();
11829 + disable_all_local_evtchn();
11830 + local_irq_enable();
11834 + * Reschedule call back. Nothing to do,
11835 + * all the work is done automatically when
11836 + * we return from the interrupt.
11838 +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
11839 + struct pt_regs *regs)
11842 + return IRQ_HANDLED;
11845 +#include <linux/kallsyms.h>
11846 +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
11847 + struct pt_regs *regs)
11849 + void (*func) (void *info) = call_data->func;
11850 + void *info = call_data->info;
11851 + int wait = call_data->wait;
11854 + * Notify initiating CPU that I've grabbed the data and am
11855 + * about to execute the function
11858 + atomic_inc(&call_data->started);
11860 + * At this point the info structure may be out of scope unless wait==1
11868 + atomic_inc(&call_data->finished);
11871 + return IRQ_HANDLED;
11874 Index: head-2008-11-25/arch/x86/kernel/time_32-xen.c
11875 ===================================================================
11876 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
11877 +++ head-2008-11-25/arch/x86/kernel/time_32-xen.c 2008-09-01 12:07:31.000000000 +0200
11880 + * linux/arch/i386/kernel/time.c
11882 + * Copyright (C) 1991, 1992, 1995 Linus Torvalds
11884 + * This file contains the PC-specific time handling details:
11885 + * reading the RTC at bootup, etc..
11886 + * 1994-07-02 Alan Modra
11887 + * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
11888 + * 1995-03-26 Markus Kuhn
11889 + * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
11890 + * precision CMOS clock update
11891 + * 1996-05-03 Ingo Molnar
11892 + * fixed time warps in do_[slow|fast]_gettimeoffset()
11893 + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
11894 + * "A Kernel Model for Precision Timekeeping" by Dave Mills
11895 + * 1998-09-05 (Various)
11896 + * More robust do_fast_gettimeoffset() algorithm implemented
11897 + * (works with APM, Cyrix 6x86MX and Centaur C6),
11898 + * monotonic gettimeofday() with fast_get_timeoffset(),
11899 + * drift-proof precision TSC calibration on boot
11900 + * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
11901 + * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
11902 + * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
11903 + * 1998-12-16 Andrea Arcangeli
11904 + * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
11905 + * because it was not accounting for lost_ticks.
11906 + * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
11907 + * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
11908 + * serialize accesses to xtime/lost_ticks).
11911 +#include <linux/errno.h>
11912 +#include <linux/sched.h>
11913 +#include <linux/kernel.h>
11914 +#include <linux/param.h>
11915 +#include <linux/string.h>
11916 +#include <linux/mm.h>
11917 +#include <linux/interrupt.h>
11918 +#include <linux/time.h>
11919 +#include <linux/delay.h>
11920 +#include <linux/init.h>
11921 +#include <linux/smp.h>
11922 +#include <linux/module.h>
11923 +#include <linux/sysdev.h>
11924 +#include <linux/bcd.h>
11925 +#include <linux/efi.h>
11926 +#include <linux/mca.h>
11927 +#include <linux/sysctl.h>
11928 +#include <linux/percpu.h>
11929 +#include <linux/kernel_stat.h>
11930 +#include <linux/posix-timers.h>
11931 +#include <linux/cpufreq.h>
11933 +#include <asm/io.h>
11934 +#include <asm/smp.h>
11935 +#include <asm/irq.h>
11936 +#include <asm/msr.h>
11937 +#include <asm/delay.h>
11938 +#include <asm/mpspec.h>
11939 +#include <asm/uaccess.h>
11940 +#include <asm/processor.h>
11941 +#include <asm/timer.h>
11942 +#include <asm/sections.h>
11944 +#include "mach_time.h"
11946 +#include <linux/timex.h>
11948 +#include <asm/hpet.h>
11950 +#include <asm/arch_hooks.h>
11952 +#include <xen/evtchn.h>
11953 +#include <xen/interface/vcpu.h>
11955 +#if defined (__i386__)
11956 +#include <asm/i8259.h>
11959 +int pit_latch_buggy; /* extern */
11961 +#if defined(__x86_64__)
11962 +unsigned long vxtime_hz = PIT_TICK_RATE;
11963 +struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
11964 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
11965 +unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
11966 +struct timespec __xtime __section_xtime;
11967 +struct timezone __sys_tz __section_sys_tz;
11970 +unsigned int cpu_khz; /* Detected as we calibrate the TSC */
11971 +EXPORT_SYMBOL(cpu_khz);
11973 +extern unsigned long wall_jiffies;
11975 +DEFINE_SPINLOCK(rtc_lock);
11976 +EXPORT_SYMBOL(rtc_lock);
11978 +extern struct init_timer_opts timer_tsc_init;
11979 +extern struct timer_opts timer_tsc;
11980 +#define timer_none timer_tsc
11982 +/* These are periodically updated in shared_info, and then copied here. */
11983 +struct shadow_time_info {
11984 + u64 tsc_timestamp; /* TSC at last update of time vals. */
11985 + u64 system_timestamp; /* Time, in nanosecs, since boot. */
11986 + u32 tsc_to_nsec_mul;
11987 + u32 tsc_to_usec_mul;
11991 +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
11992 +static struct timespec shadow_tv;
11993 +static u32 shadow_tv_version;
11995 +static struct timeval monotonic_tv;
11996 +static spinlock_t monotonic_lock = SPIN_LOCK_UNLOCKED;
11998 +/* Keep track of last time we did processing/updating of jiffies and xtime. */
11999 +static u64 processed_system_time; /* System time (ns) at last processing. */
12000 +static DEFINE_PER_CPU(u64, processed_system_time);
12002 +/* How much CPU time was spent blocked and how much was 'stolen'? */
12003 +static DEFINE_PER_CPU(u64, processed_stolen_time);
12004 +static DEFINE_PER_CPU(u64, processed_blocked_time);
12006 +/* Current runstate of each CPU (updated automatically by the hypervisor). */
12007 +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
12009 +/* Must be signed, as it's compared with s64 quantities which can be -ve. */
12010 +#define NS_PER_TICK (1000000000LL/HZ)
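For reference (illustrative arithmetic, not from the patch): with HZ=100 this works out to 10,000,000 ns per tick; with HZ=250 it is 4,000,000 ns.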
12012 +static void __clock_was_set(void *unused)
12016 +static DECLARE_WORK(clock_was_set_work, __clock_was_set, NULL);
12019 + * GCC 4.3 can turn loops over an induction variable into division. We do
12020 + * not support arbitrary 64-bit division, and so must break the induction.
12022 +#define clobber_induction_variable(v) asm ( "" : "+r" (v) )
12024 +static inline void __normalize_time(time_t *sec, s64 *nsec)
12026 + while (*nsec >= NSEC_PER_SEC) {
12027 + clobber_induction_variable(*nsec);
12028 + (*nsec) -= NSEC_PER_SEC;
12031 + while (*nsec < 0) {
12032 + clobber_induction_variable(*nsec);
12033 + (*nsec) += NSEC_PER_SEC;
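What the barrier prevents, sketched: without it, GCC 4.3 may strength-reduce the loops above into 64-bit division, which on 32-bit requires libgcc helpers the kernel does not provide here (hypothetical compiler output):

	/* the transformed form the compiler would like to emit */
	if (*nsec >= NSEC_PER_SEC) {
		*sec  += *nsec / NSEC_PER_SEC;   /* calls __divdi3 */
		*nsec %= NSEC_PER_SEC;           /* calls __moddi3 */
	}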
12038 +/* Does this guest OS track Xen time, or set its wall clock independently? */
12039 +static int independent_wallclock = 0;
12040 +static int __init __independent_wallclock(char *str)
12042 + independent_wallclock = 1;
12045 +__setup("independent_wallclock", __independent_wallclock);
12047 +/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
12048 +static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
12049 +static int __init __permitted_clock_jitter(char *str)
12051 + permitted_clock_jitter = simple_strtoul(str, NULL, 0);
12054 +__setup("permitted_clock_jitter=", __permitted_clock_jitter);
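Both knobs are plain boot parameters; illustrative guest command-line usage (values are examples only):

	independent_wallclock permitted_clock_jitter=20000000

The first lets a domU manage its wall clock independently of Xen; the second raises the jitter-warning threshold to 20 ms.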
12057 +static void delay_tsc(unsigned long loops)
12059 + unsigned long bclock, now;
12065 + } while ((now - bclock) < loops);
12068 +struct timer_opts timer_tsc = {
12070 + .delay = delay_tsc,
12075 + * Scale a 64-bit delta: shift it, then multiply by a 32-bit fraction,
12076 + * yielding a 64-bit result.
12078 +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
12086 + delta >>= -shift;
12093 + "mov %4,%%eax ; "
12094 + "mov %%edx,%4 ; "
12097 + "add %4,%%eax ; "
12098 + "adc %5,%%edx ; "
12099 + : "=A" (product), "=r" (tmp1), "=r" (tmp2)
12100 + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
12103 + "mul %%rdx ; shrd $32,%%rdx,%%rax"
12104 + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
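A portable restatement of the same computation (a sketch assuming a compiler with 128-bit integer support; the inline asm above exists precisely to avoid that dependency):

	static inline u64 scale_delta_ref(u64 delta, u32 mul_frac, int shift)
	{
		if (shift < 0)
			delta >>= -shift;
		else
			delta <<= shift;
		/* keep the full 96-bit product of delta * mul_frac,
		 * then take bits 32..95 */
		return ((unsigned __int128)delta * mul_frac) >> 32;
	}

get_nsec_offset() below then reduces to scale_delta(now - tsc_timestamp, tsc_to_nsec_mul, tsc_shift).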
12110 +#if 0 /* defined (__i386__) */
12111 +int read_current_timer(unsigned long *timer_val)
12113 + rdtscl(*timer_val);
12118 +void init_cpu_khz(void)
12120 + u64 __cpu_khz = 1000000ULL << 32;
12121 + struct vcpu_time_info *info = &vcpu_info(0)->time;
12122 + do_div(__cpu_khz, info->tsc_to_system_mul);
12123 + if (info->tsc_shift < 0)
12124 + cpu_khz = __cpu_khz << -info->tsc_shift;
12126 + cpu_khz = __cpu_khz >> info->tsc_shift;
12129 +static u64 get_nsec_offset(struct shadow_time_info *shadow)
12133 + delta = now - shadow->tsc_timestamp;
12134 + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
12137 +static unsigned long get_usec_offset(struct shadow_time_info *shadow)
12141 + delta = now - shadow->tsc_timestamp;
12142 + return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
12145 +static void __update_wallclock(time_t sec, long nsec)
12147 + long wtm_nsec, xtime_nsec;
12148 + time_t wtm_sec, xtime_sec;
12149 + u64 tmp, wc_nsec;
12151 + /* Adjust wall-clock time base based on wall_jiffies ticks. */
12152 + wc_nsec = processed_system_time;
12153 + wc_nsec += sec * (u64)NSEC_PER_SEC;
12155 + wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
12157 + /* Split wallclock base into seconds and nanoseconds. */
12159 + xtime_nsec = do_div(tmp, 1000000000);
12160 + xtime_sec = (time_t)tmp;
12162 + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
12163 + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
12165 + set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
12166 + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
12171 +static void update_wallclock(void)
12173 + shared_info_t *s = HYPERVISOR_shared_info;
12176 + shadow_tv_version = s->wc_version;
12178 + shadow_tv.tv_sec = s->wc_sec;
12179 + shadow_tv.tv_nsec = s->wc_nsec;
12181 + } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
12183 + if (!independent_wallclock)
12184 + __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
12188 + * Reads a consistent set of time-base values from Xen, into a shadow data
12191 +static void get_time_values_from_xen(unsigned int cpu)
12193 + struct vcpu_time_info *src;
12194 + struct shadow_time_info *dst;
12195 + unsigned long flags;
12196 + u32 pre_version, post_version;
12198 + src = &vcpu_info(cpu)->time;
12199 + dst = &per_cpu(shadow_time, cpu);
12201 + local_irq_save(flags);
12204 + pre_version = dst->version = src->version;
12206 + dst->tsc_timestamp = src->tsc_timestamp;
12207 + dst->system_timestamp = src->system_time;
12208 + dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
12209 + dst->tsc_shift = src->tsc_shift;
12211 + post_version = src->version;
12212 + } while ((pre_version & 1) | (pre_version ^ post_version));
12214 + dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
12216 + local_irq_restore(flags);
12219 +static inline int time_values_up_to_date(unsigned int cpu)
12221 + struct vcpu_time_info *src;
12222 + struct shadow_time_info *dst;
12224 + src = &vcpu_info(cpu)->time;
12225 + dst = &per_cpu(shadow_time, cpu);
12228 + return (dst->version == src->version);
12232 + * This is a special lock that is owned by the CPU and holds the index
12233 + * register we are working with. It is required for NMI access to the
12234 + * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
12236 +volatile unsigned long cmos_lock = 0;
12237 +EXPORT_SYMBOL(cmos_lock);
12239 +/* Routines for accessing the CMOS RAM/RTC. */
12240 +unsigned char rtc_cmos_read(unsigned char addr)
12242 + unsigned char val;
12243 + lock_cmos_prefix(addr);
12244 + outb_p(addr, RTC_PORT(0));
12245 + val = inb_p(RTC_PORT(1));
12246 + lock_cmos_suffix(addr);
12249 +EXPORT_SYMBOL(rtc_cmos_read);
12251 +void rtc_cmos_write(unsigned char val, unsigned char addr)
12253 + lock_cmos_prefix(addr);
12254 + outb_p(addr, RTC_PORT(0));
12255 + outb_p(val, RTC_PORT(1));
12256 + lock_cmos_suffix(addr);
12258 +EXPORT_SYMBOL(rtc_cmos_write);
12261 + * This version of gettimeofday has microsecond resolution
12262 + * and better than microsecond precision on fast x86 machines with TSC.
12264 +void do_gettimeofday(struct timeval *tv)
12266 + unsigned long seq;
12267 + unsigned long usec, sec;
12268 + unsigned long flags;
12270 + unsigned int cpu;
12271 + struct shadow_time_info *shadow;
12272 + u32 local_time_version;
12275 + shadow = &per_cpu(shadow_time, cpu);
12278 + unsigned long lost;
12280 + local_time_version = shadow->version;
12281 + seq = read_seqbegin(&xtime_lock);
12283 + usec = get_usec_offset(shadow);
12284 + lost = jiffies - wall_jiffies;
12286 + if (unlikely(lost))
12287 + usec += lost * (USEC_PER_SEC / HZ);
12289 + sec = xtime.tv_sec;
12290 + usec += (xtime.tv_nsec / NSEC_PER_USEC);
12292 + nsec = shadow->system_timestamp - processed_system_time;
12293 + __normalize_time(&sec, &nsec);
12294 + usec += (long)nsec / NSEC_PER_USEC;
12296 + if (unlikely(!time_values_up_to_date(cpu))) {
12298 + * We may have blocked for a long time,
12299 + * rendering our calculations invalid
12300 + * (e.g. the time delta may have
12301 + * overflowed). Detect that and recalculate
12302 + * with fresh values.
12304 + get_time_values_from_xen(cpu);
12307 + } while (read_seqretry(&xtime_lock, seq) ||
12308 + (local_time_version != shadow->version));
12312 + while (usec >= USEC_PER_SEC) {
12313 + usec -= USEC_PER_SEC;
12317 + spin_lock_irqsave(&monotonic_lock, flags);
12318 + if ((sec > monotonic_tv.tv_sec) ||
12319 + ((sec == monotonic_tv.tv_sec) && (usec > monotonic_tv.tv_usec)))
12321 + monotonic_tv.tv_sec = sec;
12322 + monotonic_tv.tv_usec = usec;
12324 + sec = monotonic_tv.tv_sec;
12325 + usec = monotonic_tv.tv_usec;
12327 + spin_unlock_irqrestore(&monotonic_lock, flags);
12329 + tv->tv_sec = sec;
12330 + tv->tv_usec = usec;
12333 +EXPORT_SYMBOL(do_gettimeofday);
12335 +int do_settimeofday(struct timespec *tv)
12339 + unsigned int cpu;
12340 + struct shadow_time_info *shadow;
12341 + struct xen_platform_op op;
12343 + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
12347 + shadow = &per_cpu(shadow_time, cpu);
12349 + write_seqlock_irq(&xtime_lock);
12352 + * Ensure we don't get blocked so long that our time delta
12353 + * overflows. If that were to happen then our shadow time values would
12354 + * be stale, so we can retry with fresh ones.
12357 + nsec = tv->tv_nsec - get_nsec_offset(shadow);
12358 + if (time_values_up_to_date(cpu))
12360 + get_time_values_from_xen(cpu);
12362 + sec = tv->tv_sec;
12363 + __normalize_time(&sec, &nsec);
12365 + if (is_initial_xendomain() && !independent_wallclock) {
12366 + op.cmd = XENPF_settime;
12367 + op.u.settime.secs = sec;
12368 + op.u.settime.nsecs = nsec;
12369 + op.u.settime.system_time = shadow->system_timestamp;
12370 + WARN_ON(HYPERVISOR_platform_op(&op));
12371 + update_wallclock();
12372 + } else if (independent_wallclock) {
12373 + nsec -= shadow->system_timestamp;
12374 + __normalize_time(&sec, &nsec);
12375 + __update_wallclock(sec, nsec);
12378 + /* Reset monotonic gettimeofday() timeval. */
12379 + spin_lock(&monotonic_lock);
12380 + monotonic_tv.tv_sec = 0;
12381 + monotonic_tv.tv_usec = 0;
12382 + spin_unlock(&monotonic_lock);
12384 + write_sequnlock_irq(&xtime_lock);
12392 +EXPORT_SYMBOL(do_settimeofday);
12394 +static void sync_xen_wallclock(unsigned long dummy);
12395 +static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
12396 +static void sync_xen_wallclock(unsigned long dummy)
12400 + struct xen_platform_op op;
12402 + if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
12405 + write_seqlock_irq(&xtime_lock);
12407 + sec = xtime.tv_sec;
12408 + nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
12409 + __normalize_time(&sec, &nsec);
12411 + op.cmd = XENPF_settime;
12412 + op.u.settime.secs = sec;
12413 + op.u.settime.nsecs = nsec;
12414 + op.u.settime.system_time = processed_system_time;
12415 + WARN_ON(HYPERVISOR_platform_op(&op));
12417 + update_wallclock();
12419 + write_sequnlock_irq(&xtime_lock);
12421 + /* Once per minute. */
12422 + mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
12425 +static int set_rtc_mmss(unsigned long nowtime)
12428 + unsigned long flags;
12430 + if (independent_wallclock || !is_initial_xendomain())
12433 + /* gets recalled with irq locally disabled */
12434 + /* XXX - does irqsave resolve this? -johnstul */
12435 + spin_lock_irqsave(&rtc_lock, flags);
12437 + retval = efi_set_rtc_mmss(nowtime);
12439 + retval = mach_set_rtc_mmss(nowtime);
12440 + spin_unlock_irqrestore(&rtc_lock, flags);
12445 +/* monotonic_clock(): returns # of nanoseconds passed since time_init()
12446 + * Note: This function is required to return accurate
12447 + * time even in the absence of multiple timer ticks.
12449 +unsigned long long monotonic_clock(void)
12451 + unsigned int cpu = get_cpu();
12452 + struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
12454 + u32 local_time_version;
12457 + local_time_version = shadow->version;
12459 + time = shadow->system_timestamp + get_nsec_offset(shadow);
12460 + if (!time_values_up_to_date(cpu))
12461 + get_time_values_from_xen(cpu);
12463 + } while (local_time_version != shadow->version);
12469 +EXPORT_SYMBOL(monotonic_clock);
12472 +unsigned long long sched_clock(void)
12474 + return monotonic_clock();
12478 +#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
12479 +unsigned long profile_pc(struct pt_regs *regs)
12481 + unsigned long pc = instruction_pointer(regs);
12484 + /* Assume the lock function has either no stack frame or only a single word.
12485 + This checks if the address on the stack looks like a kernel text address.
12486 + There is a small window for false hits, but in that case the tick
12487 + is just accounted to the spinlock function.
12488 + It would be better to write these functions in assembler again
12489 + and check exactly. */
12490 + if (!user_mode_vm(regs) && in_lock_functions(pc)) {
12491 + char *v = *(char **)regs->rsp;
12492 + if ((v >= _stext && v <= _etext) ||
12493 + (v >= _sinittext && v <= _einittext) ||
12494 + (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
12495 + return (unsigned long)v;
12496 + return ((unsigned long *)regs->rsp)[1];
12499 + if (!user_mode_vm(regs) && in_lock_functions(pc))
12500 + return *(unsigned long *)(regs->ebp + 4);
12505 +EXPORT_SYMBOL(profile_pc);
12509 + * This is the same as the above, except we _also_ save the current
12510 + * Time Stamp Counter value at the time of the timer interrupt, so that
12511 + * we later on can estimate the time of day more exactly.
12513 +irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
12515 + s64 delta, delta_cpu, stolen, blocked;
12517 + unsigned int i, cpu = smp_processor_id();
12518 + struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
12519 + struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
12522 + * Here we are in the timer irq handler. We just have irqs locally
12523 + * disabled but we don't know if the timer_bh is running on the other
12524 + * CPU. We need to avoid an SMP race with it. NOTE: we don't need
12525 + * the irq version of write_lock because as just said we have irq
12526 + * locally disabled. -arca
12528 + write_seqlock(&xtime_lock);
12531 + get_time_values_from_xen(cpu);
12533 + /* Obtain a consistent snapshot of elapsed wallclock cycles. */
12534 + delta = delta_cpu =
12535 + shadow->system_timestamp + get_nsec_offset(shadow);
12536 + delta -= processed_system_time;
12537 + delta_cpu -= per_cpu(processed_system_time, cpu);
12540 + * Obtain a consistent snapshot of stolen/blocked cycles. We
12541 + * can use state_entry_time to detect if we get preempted here.
12544 + sched_time = runstate->state_entry_time;
12546 + stolen = runstate->time[RUNSTATE_runnable] +
12547 + runstate->time[RUNSTATE_offline] -
12548 + per_cpu(processed_stolen_time, cpu);
12549 + blocked = runstate->time[RUNSTATE_blocked] -
12550 + per_cpu(processed_blocked_time, cpu);
12552 + } while (sched_time != runstate->state_entry_time);
12553 + } while (!time_values_up_to_date(cpu));
12555 + if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
12556 + unlikely(delta_cpu < -(s64)permitted_clock_jitter))
12557 + && printk_ratelimit()) {
12558 + printk("Timer ISR/%u: Time went backwards: "
12559 + "delta=%lld delta_cpu=%lld shadow=%lld "
12560 + "off=%lld processed=%lld cpu_processed=%lld\n",
12561 + cpu, delta, delta_cpu, shadow->system_timestamp,
12562 + (s64)get_nsec_offset(shadow),
12563 + processed_system_time,
12564 + per_cpu(processed_system_time, cpu));
12565 + for (i = 0; i < num_online_cpus(); i++)
12566 + printk(" %d: %lld\n", i,
12567 + per_cpu(processed_system_time, i));
12570 + /* System-wide jiffy work. */
12571 + while (delta >= NS_PER_TICK) {
12572 + delta -= NS_PER_TICK;
12573 + processed_system_time += NS_PER_TICK;
12577 + if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
12578 + update_wallclock();
12579 + if (keventd_up())
12580 + schedule_work(&clock_was_set_work);
12583 + write_sequnlock(&xtime_lock);
12586 + * Account stolen ticks.
12587 + * HACK: Passing NULL to account_steal_time()
12588 + * ensures that the ticks are accounted as stolen.
12590 + if ((stolen > 0) && (delta_cpu > 0)) {
12591 + delta_cpu -= stolen;
12592 + if (unlikely(delta_cpu < 0))
12593 + stolen += delta_cpu; /* clamp local-time progress */
12594 + do_div(stolen, NS_PER_TICK);
12595 + per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
12596 + per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
12597 + account_steal_time(NULL, (cputime_t)stolen);
12601 + * Account blocked ticks.
12602 + * HACK: Passing idle_task to account_steal_time()
12603 + * ensures that the ticks are accounted as idle/wait.
12605 + if ((blocked > 0) && (delta_cpu > 0)) {
12606 + delta_cpu -= blocked;
12607 + if (unlikely(delta_cpu < 0))
12608 + blocked += delta_cpu; /* clamp local-time progress */
12609 + do_div(blocked, NS_PER_TICK);
12610 + per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
12611 + per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
12612 + account_steal_time(idle_task(cpu), (cputime_t)blocked);
12615 + /* Account user/system ticks. */
12616 + if (delta_cpu > 0) {
12617 + do_div(delta_cpu, NS_PER_TICK);
12618 + per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
12619 + if (user_mode_vm(regs))
12620 + account_user_time(current, (cputime_t)delta_cpu);
12622 + account_system_time(current, HARDIRQ_OFFSET,
12623 + (cputime_t)delta_cpu);
12626 + /* Offlined for more than a few seconds? Avoid lockup warnings. */
12627 + if (stolen > 5*HZ)
12628 + touch_softlockup_watchdog();
12630 + /* Local timer processing (see update_process_times()). */
12631 + run_local_timers();
12632 + if (rcu_pending(cpu))
12633 + rcu_check_callbacks(cpu, user_mode_vm(regs));
12634 + scheduler_tick();
12635 + run_posix_cpu_timers(current);
12636 + profile_tick(CPU_PROFILING, regs);
12638 + return IRQ_HANDLED;
12641 +static void init_missing_ticks_accounting(unsigned int cpu)
12643 + struct vcpu_register_runstate_memory_area area;
12644 + struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
12647 + memset(runstate, 0, sizeof(*runstate));
12649 + area.addr.v = runstate;
12650 + rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
12651 + WARN_ON(rc && rc != -ENOSYS);
12653 + per_cpu(processed_blocked_time, cpu) =
12654 + runstate->time[RUNSTATE_blocked];
12655 + per_cpu(processed_stolen_time, cpu) =
12656 + runstate->time[RUNSTATE_runnable] +
12657 + runstate->time[RUNSTATE_offline];
12660 +/* not static: needed by APM */
12661 +unsigned long get_cmos_time(void)
12663 + unsigned long retval;
12664 + unsigned long flags;
12666 + spin_lock_irqsave(&rtc_lock, flags);
12669 + retval = efi_get_time();
12671 + retval = mach_get_cmos_time();
12673 + spin_unlock_irqrestore(&rtc_lock, flags);
12677 +EXPORT_SYMBOL(get_cmos_time);
12679 +static void sync_cmos_clock(unsigned long dummy);
12681 +static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
12683 +static void sync_cmos_clock(unsigned long dummy)
12685 + struct timeval now, next;
12689 + * If we have an externally synchronized Linux clock, then update
12690 + * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
12691 + * called as close as possible to 500 ms before the new second starts.
12692 + * This code is run on a timer. If the clock is set, that timer
12693 + * may not expire at the correct time. Thus, we adjust...
12695 + if (!ntp_synced())
12697 + * Not synced, exit, do not restart a timer (if one is
12698 + * running, let it run out).
12702 + do_gettimeofday(&now);
12703 + if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
12704 + now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
12705 + fail = set_rtc_mmss(now.tv_sec);
12707 + next.tv_usec = USEC_AFTER - now.tv_usec;
12708 + if (next.tv_usec <= 0)
12709 + next.tv_usec += USEC_PER_SEC;
12712 + next.tv_sec = 659;
12716 + if (next.tv_usec >= USEC_PER_SEC) {
12718 + next.tv_usec -= USEC_PER_SEC;
12720 + mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
12723 +void notify_arch_cmos_timer(void)
12725 + mod_timer(&sync_cmos_timer, jiffies + 1);
12726 + mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
12729 +static int timer_resume(struct sys_device *dev)
12731 + extern void time_resume(void);
12736 +static struct sysdev_class timer_sysclass = {
12737 + .resume = timer_resume,
12738 + set_kset_name("timer"),
12742 +/* XXX this driverfs stuff should probably go elsewhere later -john */
12743 +static struct sys_device device_timer = {
12745 + .cls = &timer_sysclass,
12748 +static int time_init_device(void)
12750 + int error = sysdev_class_register(&timer_sysclass);
12752 + error = sysdev_register(&device_timer);
12756 +device_initcall(time_init_device);
12758 +#ifdef CONFIG_HPET_TIMER
12759 +extern void (*late_time_init)(void);
12760 +/* Duplicate of time_init() below, with hpet_enable part added */
12761 +static void __init hpet_time_init(void)
12763 + xtime.tv_sec = get_cmos_time();
12764 + xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
12765 + set_normalized_timespec(&wall_to_monotonic,
12766 + -xtime.tv_sec, -xtime.tv_nsec);
12768 + if ((hpet_enable() >= 0) && hpet_use_timer) {
12769 + printk("Using HPET for base-timer\n");
12772 + time_init_hook();
12776 +/* Dynamically-mapped IRQ. */
12777 +DEFINE_PER_CPU(int, timer_irq);
12779 +extern void (*late_time_init)(void);
12780 +static void setup_cpu0_timer_irq(void)
12782 + per_cpu(timer_irq, 0) =
12783 + bind_virq_to_irqhandler(
12790 + BUG_ON(per_cpu(timer_irq, 0) < 0);
12793 +static struct vcpu_set_periodic_timer xen_set_periodic_tick = {
12794 + .period_ns = NS_PER_TICK
12797 +void __init time_init(void)
12799 +#ifdef CONFIG_HPET_TIMER
12800 + if (is_hpet_capable()) {
12802 + * HPET initialization needs to do memory-mapped io. So, let
12803 + * us do a late initialization after mem_init().
12805 + late_time_init = hpet_time_init;
12810 + switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, 0,
12811 + &xen_set_periodic_tick)) {
12813 +#if CONFIG_XEN_COMPAT <= 0x030004
12821 + get_time_values_from_xen(0);
12823 + processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
12824 + per_cpu(processed_system_time, 0) = processed_system_time;
12825 + init_missing_ticks_accounting(0);
12827 + update_wallclock();
12830 + printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
12831 + cpu_khz / 1000, cpu_khz % 1000);
12833 +#if defined(__x86_64__)
12834 + vxtime.mode = VXTIME_TSC;
12835 + vxtime.quot = (1000000L << 32) / vxtime_hz;
12836 + vxtime.tsc_quot = (1000L << 32) / cpu_khz;
12838 + rdtscll(vxtime.last_tsc);
12841 + /* Cannot request_irq() until kmem is initialised. */
12842 + late_time_init = setup_cpu0_timer_irq;
12845 +/* Convert jiffies to system time. */
12846 +u64 jiffies_to_st(unsigned long j)
12848 + unsigned long seq;
12853 + seq = read_seqbegin(&xtime_lock);
12854 + delta = j - jiffies;
12856 + /* Triggers in some wrap-around cases, but that's okay:
12857 + * we just end up with a shorter timeout. */
12858 + st = processed_system_time + NS_PER_TICK;
12859 + } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
12860 + /* Very long timeout means there is no pending timer.
12861 + * We indicate this to Xen by passing zero timeout. */
12864 + st = processed_system_time + delta * (u64)NS_PER_TICK;
12866 + } while (read_seqretry(&xtime_lock, seq));
12870 +EXPORT_SYMBOL(jiffies_to_st);
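A worked example (illustrative, assuming HZ=100 so NS_PER_TICK is 10,000,000): a timer due 5 jiffies from now maps to

	st = processed_system_time + 5 * (u64)NS_PER_TICK;   /* now + 50 ms */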
12873 + * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
12874 + * These functions are based on implementations from arch/s390/kernel/time.c
12876 +static void stop_hz_timer(void)
12878 + struct vcpu_set_singleshot_timer singleshot;
12879 + unsigned int cpu = smp_processor_id();
12883 + cpu_set(cpu, nohz_cpu_mask);
12885 + /* See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs */
12886 + /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a */
12887 + /* value of rcp->cur that matches rdp->quiescbatch and allows us to */
12888 + /* stop the hz timer then the cpumasks created for subsequent values */
12889 + /* of cur in rcu_start_batch are guaranteed to pick up the updated */
12890 + /* nohz_cpu_mask and so will not depend on this cpu. */
12894 + /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
12895 + if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
12896 + (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
12897 + cpu_clear(cpu, nohz_cpu_mask);
12901 + singleshot.timeout_abs_ns = jiffies_to_st(j) + NS_PER_TICK/2;
12902 + singleshot.flags = 0;
12903 + rc = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &singleshot);
12904 +#if CONFIG_XEN_COMPAT <= 0x030004
12906 + BUG_ON(rc != -ENOSYS);
12907 + rc = HYPERVISOR_set_timer_op(singleshot.timeout_abs_ns);
12913 +static void start_hz_timer(void)
12915 + cpu_clear(smp_processor_id(), nohz_cpu_mask);
12918 +void raw_safe_halt(void)
12921 + /* Blocking includes an implicit local_irq_enable(). */
12922 + HYPERVISOR_block();
12923 + start_hz_timer();
12925 +EXPORT_SYMBOL(raw_safe_halt);
12929 + if (irqs_disabled())
12930 + VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
12932 +EXPORT_SYMBOL(halt);
12934 +/* No locking required. Interrupts are disabled on all CPUs. */
12935 +void time_resume(void)
12937 + unsigned int cpu;
12941 + for_each_online_cpu(cpu) {
12942 + switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
12943 + &xen_set_periodic_tick)) {
12945 +#if CONFIG_XEN_COMPAT <= 0x030004
12952 + get_time_values_from_xen(cpu);
12953 + per_cpu(processed_system_time, cpu) =
12954 + per_cpu(shadow_time, 0).system_timestamp;
12955 + init_missing_ticks_accounting(cpu);
12958 + processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
12960 + update_wallclock();
12964 +static char timer_name[NR_CPUS][15];
12966 +int __cpuinit local_setup_timer(unsigned int cpu)
12970 + BUG_ON(cpu == 0);
12972 + switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
12973 + &xen_set_periodic_tick)) {
12975 +#if CONFIG_XEN_COMPAT <= 0x030004
12984 + seq = read_seqbegin(&xtime_lock);
12985 + /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
12986 + per_cpu(processed_system_time, cpu) =
12987 + per_cpu(shadow_time, 0).system_timestamp;
12988 + init_missing_ticks_accounting(cpu);
12989 + } while (read_seqretry(&xtime_lock, seq));
12991 + sprintf(timer_name[cpu], "timer%u", cpu);
12992 + irq = bind_virq_to_irqhandler(VIRQ_TIMER,
13000 + per_cpu(timer_irq, cpu) = irq;
13005 +void __cpuexit local_teardown_timer(unsigned int cpu)
13007 + BUG_ON(cpu == 0);
13008 + unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
13012 +#ifdef CONFIG_CPU_FREQ
13013 +static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
13016 + struct cpufreq_freqs *freq = data;
13017 + struct xen_platform_op op;
13019 + if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
13022 + if (val == CPUFREQ_PRECHANGE)
13025 + op.cmd = XENPF_change_freq;
13026 + op.u.change_freq.flags = 0;
13027 + op.u.change_freq.cpu = freq->cpu;
13028 + op.u.change_freq.freq = (u64)freq->new * 1000;
13029 + WARN_ON(HYPERVISOR_platform_op(&op));
13034 +static struct notifier_block time_cpufreq_notifier_block = {
13035 + .notifier_call = time_cpufreq_notifier
13038 +static int __init cpufreq_time_setup(void)
13040 + if (!cpufreq_register_notifier(&time_cpufreq_notifier_block,
13041 + CPUFREQ_TRANSITION_NOTIFIER)) {
13042 + printk(KERN_ERR "failed to set up cpufreq notifier\n");
13048 +core_initcall(cpufreq_time_setup);
13052 + * /proc/sys/xen: This really belongs in another file. It can stay here for
13055 +static ctl_table xen_subtable[] = {
13058 + .procname = "independent_wallclock",
13059 + .data = &independent_wallclock,
13060 + .maxlen = sizeof(independent_wallclock),
13062 + .proc_handler = proc_dointvec
13066 + .procname = "permitted_clock_jitter",
13067 + .data = &permitted_clock_jitter,
13068 + .maxlen = sizeof(permitted_clock_jitter),
13070 + .proc_handler = proc_doulongvec_minmax
13074 +static ctl_table xen_table[] = {
13077 + .procname = "xen",
13079 + .child = xen_subtable},
13082 +static int __init xen_sysctl_init(void)
13084 + (void)register_sysctl_table(xen_table, 0);
13087 +__initcall(xen_sysctl_init);
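Once registered, the knobs surface under /proc/sys/xen; illustrative shell usage:

	echo 1 > /proc/sys/xen/independent_wallclock
	echo 20000000 > /proc/sys/xen/permitted_clock_jitter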
13088 Index: head-2008-11-25/arch/x86/kernel/traps_32-xen.c
13089 ===================================================================
13090 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
13091 +++ head-2008-11-25/arch/x86/kernel/traps_32-xen.c 2008-04-02 12:34:02.000000000 +0200
13094 + * linux/arch/i386/traps.c
13096 + * Copyright (C) 1991, 1992 Linus Torvalds
13098 + * Pentium III FXSR, SSE support
13099 + * Gareth Hughes <gareth@valinux.com>, May 2000
13103 + * 'Traps.c' handles hardware traps and faults after we have saved some
13104 + * state in 'asm.s'.
13106 +#include <linux/sched.h>
13107 +#include <linux/kernel.h>
13108 +#include <linux/string.h>
13109 +#include <linux/errno.h>
13110 +#include <linux/timer.h>
13111 +#include <linux/mm.h>
13112 +#include <linux/init.h>
13113 +#include <linux/delay.h>
13114 +#include <linux/spinlock.h>
13115 +#include <linux/interrupt.h>
13116 +#include <linux/highmem.h>
13117 +#include <linux/kallsyms.h>
13118 +#include <linux/ptrace.h>
13119 +#include <linux/utsname.h>
13120 +#include <linux/kprobes.h>
13121 +#include <linux/kexec.h>
13122 +#include <linux/unwind.h>
13124 +#ifdef CONFIG_EISA
13125 +#include <linux/ioport.h>
13126 +#include <linux/eisa.h>
13130 +#include <linux/mca.h>
13133 +#include <asm/processor.h>
13134 +#include <asm/system.h>
13135 +#include <asm/uaccess.h>
13136 +#include <asm/io.h>
13137 +#include <asm/atomic.h>
13138 +#include <asm/debugreg.h>
13139 +#include <asm/desc.h>
13140 +#include <asm/i387.h>
13141 +#include <asm/nmi.h>
13142 +#include <asm/unwind.h>
13143 +#include <asm/smp.h>
13144 +#include <asm/arch_hooks.h>
13145 +#include <asm/kdebug.h>
13147 +#include <linux/module.h>
13149 +#include "mach_traps.h"
13151 +asmlinkage int system_call(void);
13153 +struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
13154 + { 0, 0 }, { 0, 0 } };
13156 +/* Do we ignore FPU interrupts ? */
13157 +char ignore_fpu_irq = 0;
13159 +#ifndef CONFIG_X86_NO_IDT
13161 + * The IDT has to be page-aligned to simplify the Pentium
13162 + * F0 0F bug workaround. We have a special link segment
13165 +struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
13168 +asmlinkage void divide_error(void);
13169 +asmlinkage void debug(void);
13170 +asmlinkage void nmi(void);
13171 +asmlinkage void int3(void);
13172 +asmlinkage void overflow(void);
13173 +asmlinkage void bounds(void);
13174 +asmlinkage void invalid_op(void);
13175 +asmlinkage void device_not_available(void);
13176 +asmlinkage void coprocessor_segment_overrun(void);
13177 +asmlinkage void invalid_TSS(void);
13178 +asmlinkage void segment_not_present(void);
13179 +asmlinkage void stack_segment(void);
13180 +asmlinkage void general_protection(void);
13181 +asmlinkage void page_fault(void);
13182 +asmlinkage void coprocessor_error(void);
13183 +asmlinkage void simd_coprocessor_error(void);
13184 +asmlinkage void alignment_check(void);
13185 +#ifndef CONFIG_XEN
13186 +asmlinkage void spurious_interrupt_bug(void);
13188 +asmlinkage void fixup_4gb_segment(void);
13190 +asmlinkage void machine_check(void);
13192 +static int kstack_depth_to_print = 24;
13193 +#ifdef CONFIG_STACK_UNWIND
13194 +static int call_trace = 1;
13196 +#define call_trace (-1)
13198 +ATOMIC_NOTIFIER_HEAD(i386die_chain);
13200 +int register_die_notifier(struct notifier_block *nb)
13202 + vmalloc_sync_all();
13203 + return atomic_notifier_chain_register(&i386die_chain, nb);
13205 +EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
13207 +int unregister_die_notifier(struct notifier_block *nb)
13209 + return atomic_notifier_chain_unregister(&i386die_chain, nb);
13211 +EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
13213 +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
13215 + return p > (void *)tinfo &&
13216 + p < (void *)tinfo + THREAD_SIZE - 3;
13220 + * Print one address/symbol entry per line.
13222 +static inline void print_addr_and_symbol(unsigned long addr, char *log_lvl)
13224 + printk(" [<%08lx>] ", addr);
13226 + print_symbol("%s\n", addr);
13229 +static inline unsigned long print_context_stack(struct thread_info *tinfo,
13230 + unsigned long *stack, unsigned long ebp,
13233 + unsigned long addr;
13235 +#ifdef CONFIG_FRAME_POINTER
13236 + while (valid_stack_ptr(tinfo, (void *)ebp)) {
13237 + addr = *(unsigned long *)(ebp + 4);
13238 + print_addr_and_symbol(addr, log_lvl);
13240 + * break out of recursive entries (such as
13241 + * end_of_stack_stop_unwind_function):
13243 + if (ebp == *(unsigned long *)ebp)
13245 + ebp = *(unsigned long *)ebp;
13248 + while (valid_stack_ptr(tinfo, stack)) {
13250 + if (__kernel_text_address(addr))
13251 + print_addr_and_symbol(addr, log_lvl);
13257 +static asmlinkage int
13258 +show_trace_unwind(struct unwind_frame_info *info, void *log_lvl)
13262 + while (unwind(info) == 0 && UNW_PC(info)) {
13264 + print_addr_and_symbol(UNW_PC(info), log_lvl);
13265 + if (arch_unw_user_mode(info))
13271 +static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
13272 + unsigned long *stack, char *log_lvl)
13274 + unsigned long ebp;
13279 + if (call_trace >= 0) {
13281 + struct unwind_frame_info info;
13284 + if (unwind_init_frame_info(&info, task, regs) == 0)
13285 + unw_ret = show_trace_unwind(&info, log_lvl);
13286 + } else if (task == current)
13287 + unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl);
13289 + if (unwind_init_blocked(&info, task) == 0)
13290 + unw_ret = show_trace_unwind(&info, log_lvl);
13292 + if (unw_ret > 0) {
13293 + if (call_trace == 1 && !arch_unw_user_mode(&info)) {
13294 + print_symbol("DWARF2 unwinder stuck at %s\n",
13296 + if (UNW_SP(&info) >= PAGE_OFFSET) {
13297 + printk("Leftover inexact backtrace:\n");
13298 + stack = (void *)UNW_SP(&info);
13300 + printk("Full inexact backtrace again:\n");
13301 + } else if (call_trace >= 1)
13304 + printk("Full inexact backtrace again:\n");
13306 + printk("Inexact backtrace:\n");
13309 + if (task == current) {
13310 + /* Grab ebp right from our regs */
13311 + asm ("movl %%ebp, %0" : "=r" (ebp) : );
13313 + /* ebp is the last reg pushed by switch_to */
13314 + ebp = *(unsigned long *) task->thread.esp;
13318 + struct thread_info *context;
13319 + context = (struct thread_info *)
13320 + ((unsigned long)stack & (~(THREAD_SIZE - 1)));
13321 + ebp = print_context_stack(context, stack, ebp, log_lvl);
13322 + stack = (unsigned long*)context->previous_esp;
13325 + printk("%s =======================\n", log_lvl);
13329 +void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack)
13331 + show_trace_log_lvl(task, regs, stack, "");
13334 +static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
13335 + unsigned long *esp, char *log_lvl)
13337 + unsigned long *stack;
13340 + if (esp == NULL) {
13342 + esp = (unsigned long*)task->thread.esp;
13344 + esp = (unsigned long *)&esp;
13348 + for(i = 0; i < kstack_depth_to_print; i++) {
13349 + if (kstack_end(stack))
13351 + if (i && ((i % 8) == 0))
13352 + printk("\n%s ", log_lvl);
13353 + printk("%08lx ", *stack++);
13355 + printk("\n%sCall Trace:\n", log_lvl);
13356 + show_trace_log_lvl(task, regs, esp, log_lvl);
13359 +void show_stack(struct task_struct *task, unsigned long *esp)
13362 + show_stack_log_lvl(task, NULL, esp, "");
13366 + * The architecture-independent dump_stack generator
13368 +void dump_stack(void)
13370 + unsigned long stack;
13372 + show_trace(current, NULL, &stack);
13375 +EXPORT_SYMBOL(dump_stack);
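+/*
+ * Illustrative use (sketch; the surrounding code is hypothetical):
+ * any context may request a backtrace of itself, e.g.
+ *
+ *	if (unlikely(refs < 0)) {
+ *		printk(KERN_WARNING "bogus refcount %d\n", refs);
+ *		dump_stack();
+ *	}
+ */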
13377 +void show_registers(struct pt_regs *regs)
13380 + int in_kernel = 1;
13381 + unsigned long esp;
13382 + unsigned short ss;
13384 + esp = (unsigned long) (&regs->esp);
13385 + savesegment(ss, ss);
13386 + if (user_mode_vm(regs)) {
13389 + ss = regs->xss & 0xffff;
13392 + printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n"
13393 + "EFLAGS: %08lx (%s %.*s) \n",
13394 + smp_processor_id(), 0xffff & regs->xcs, regs->eip,
13395 + print_tainted(), regs->eflags, system_utsname.release,
13396 + (int)strcspn(system_utsname.version, " "),
13397 + system_utsname.version);
13398 + print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
13399 + printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
13400 + regs->eax, regs->ebx, regs->ecx, regs->edx);
13401 + printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
13402 + regs->esi, regs->edi, regs->ebp, esp);
13403 + printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n",
13404 + regs->xds & 0xffff, regs->xes & 0xffff, ss);
13405 + printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
13406 + TASK_COMM_LEN, current->comm, current->pid,
13407 + current_thread_info(), current, current->thread_info);
13409 + * When in-kernel, we also print out the stack and code at the
13410 + * time of the fault..
13415 + printk("\n" KERN_EMERG "Stack: ");
13416 + show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
13418 + printk(KERN_EMERG "Code: ");
13420 + eip = (u8 __user *)regs->eip - 43;
13421 + for (i = 0; i < 64; i++, eip++) {
13424 + if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
13425 + printk(" Bad EIP value.");
13428 + if (eip == (u8 __user *)regs->eip)
13429 + printk("<%02x> ", c);
13431 + printk("%02x ", c);
13437 +static void handle_BUG(struct pt_regs *regs)
13439 + unsigned long eip = regs->eip;
13440 + unsigned short ud2;
13442 + if (eip < PAGE_OFFSET)
13444 + if (__get_user(ud2, (unsigned short __user *)eip))
13446 + if (ud2 != 0x0b0f)
13449 + printk(KERN_EMERG "------------[ cut here ]------------\n");
13451 +#ifdef CONFIG_DEBUG_BUGVERBOSE
13453 + unsigned short line;
13457 + if (__get_user(line, (unsigned short __user *)(eip + 2)))
13459 + if (__get_user(file, (char * __user *)(eip + 4)) ||
13460 + (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
13461 + file = "<bad filename>";
13463 + printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
13467 + printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n");
13470 +/* This is gone through when something in the kernel
13471 + * has done something bad and is about to be terminated.
13473 +void die(const char * str, struct pt_regs * regs, long err)
13478 + int lock_owner_depth;
13480 + .lock = SPIN_LOCK_UNLOCKED,
13481 + .lock_owner = -1,
13482 + .lock_owner_depth = 0
13484 + static int die_counter;
13485 + unsigned long flags;
13489 + if (die.lock_owner != raw_smp_processor_id()) {
13490 + console_verbose();
13491 + spin_lock_irqsave(&die.lock, flags);
13492 + die.lock_owner = smp_processor_id();
13493 + die.lock_owner_depth = 0;
13494 + bust_spinlocks(1);
13497 + local_save_flags(flags);
13499 + if (++die.lock_owner_depth < 3) {
13501 + unsigned long esp;
13502 + unsigned short ss;
13504 + handle_BUG(regs);
13505 + printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
13506 +#ifdef CONFIG_PREEMPT
13507 + printk(KERN_EMERG "PREEMPT ");
13512 + printk(KERN_EMERG);
13516 +#ifdef CONFIG_DEBUG_PAGEALLOC
13518 + printk(KERN_EMERG);
13519 + printk("DEBUG_PAGEALLOC");
13524 + if (notify_die(DIE_OOPS, str, regs, err,
13525 + current->thread.trap_no, SIGSEGV) !=
13527 + show_registers(regs);
13528 + /* Executive summary in case the oops scrolled away */
13529 + esp = (unsigned long) (&regs->esp);
13530 + savesegment(ss, ss);
13531 + if (user_mode(regs)) {
13533 + ss = regs->xss & 0xffff;
13535 + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
13536 + print_symbol("%s", regs->eip);
13537 + printk(" SS:ESP %04x:%08lx\n", ss, esp);
13542 + printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
13544 + bust_spinlocks(0);
13545 + die.lock_owner = -1;
13546 + spin_unlock_irqrestore(&die.lock, flags);
13551 + if (kexec_should_crash(current))
13552 + crash_kexec(regs);
13554 + if (in_interrupt())
13555 + panic("Fatal exception in interrupt");
13557 + if (panic_on_oops)
13558 + panic("Fatal exception");
13561 + do_exit(SIGSEGV);
13564 +static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
13566 + if (!user_mode_vm(regs))
13567 + die(str, regs, err);
13570 +static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
13571 + struct pt_regs * regs, long error_code,
13574 + struct task_struct *tsk = current;
13575 + tsk->thread.error_code = error_code;
13576 + tsk->thread.trap_no = trapnr;
13578 + if (regs->eflags & VM_MASK) {
13581 + goto trap_signal;
13584 + if (!user_mode(regs))
13585 + goto kernel_trap;
13589 + force_sig_info(signr, info, tsk);
13591 + force_sig(signr, tsk);
13596 + if (!fixup_exception(regs))
13597 + die(str, regs, error_code);
13602 + int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
13603 + if (ret) goto trap_signal;
13608 +#define DO_ERROR(trapnr, signr, str, name) \
13609 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
13611 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
13612 + == NOTIFY_STOP) \
13614 + do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
13617 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
13618 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
13620 + siginfo_t info; \
13621 + info.si_signo = signr; \
13622 + info.si_errno = 0; \
13623 + info.si_code = sicode; \
13624 + info.si_addr = (void __user *)siaddr; \
13625 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
13626 + == NOTIFY_STOP) \
13628 + do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
13631 +#define DO_VM86_ERROR(trapnr, signr, str, name) \
13632 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
13634 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
13635 + == NOTIFY_STOP) \
13637 + do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
13640 +#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
13641 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
13643 + siginfo_t info; \
13644 + info.si_signo = signr; \
13645 + info.si_errno = 0; \
13646 + info.si_code = sicode; \
13647 + info.si_addr = (void __user *)siaddr; \
13648 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
13649 + == NOTIFY_STOP) \
13651 + do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
13654 +DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
13655 +#ifndef CONFIG_KPROBES
13656 +DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
13658 +DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
13659 +DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
13660 +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip)
13661 +DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
13662 +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
13663 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
13664 +DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
13665 +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
13666 +DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
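+/*
+ * For reference, DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
+ * above expands to roughly this handler (reformatted for clarity):
+ *
+ *	fastcall void do_invalid_TSS(struct pt_regs *regs, long error_code)
+ *	{
+ *		if (notify_die(DIE_TRAP, "invalid TSS", regs, error_code,
+ *			       10, SIGSEGV) == NOTIFY_STOP)
+ *			return;
+ *		do_trap(10, SIGSEGV, "invalid TSS", 0, regs, error_code, NULL);
+ *	}
+ */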
13668 +fastcall void __kprobes do_general_protection(struct pt_regs * regs,
13671 + current->thread.error_code = error_code;
13672 + current->thread.trap_no = 13;
13674 + if (regs->eflags & VM_MASK)
13677 + if (!user_mode(regs))
13678 + goto gp_in_kernel;
13680 + current->thread.error_code = error_code;
13681 + current->thread.trap_no = 13;
13682 + force_sig(SIGSEGV, current);
13686 + local_irq_enable();
13687 + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
13691 + if (!fixup_exception(regs)) {
13692 + if (notify_die(DIE_GPF, "general protection fault", regs,
13693 + error_code, 13, SIGSEGV) == NOTIFY_STOP)
13695 + die("general protection fault", regs, error_code);
13699 +static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
13701 + printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying "
13702 + "to continue\n");
13703 + printk(KERN_EMERG "You probably have a hardware problem with your RAM "
13706 + /* Clear and disable the memory parity error line. */
13707 + clear_mem_error(reason);
13710 +static void io_check_error(unsigned char reason, struct pt_regs * regs)
13712 + printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
13713 + show_registers(regs);
13715 + /* Re-enable the IOCK line, wait for a few seconds */
13716 + clear_io_check_error(reason);
13719 +static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
13722 + /* Might actually be able to figure out what the guilty party
13725 + mca_handle_nmi();
13729 + printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
13730 + reason, smp_processor_id());
13731 + printk("Dazed and confused, but trying to continue\n");
13732 + printk("Do you have a strange power saving mode enabled?\n");
13735 +static DEFINE_SPINLOCK(nmi_print_lock);
13737 +void die_nmi (struct pt_regs *regs, const char *msg)
13739 + if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
13743 + spin_lock(&nmi_print_lock);
13745 + * We are in trouble anyway, let's at least try
13746 + * to get a message out.
13748 + bust_spinlocks(1);
13749 + printk(KERN_EMERG "%s", msg);
13750 + printk(" on CPU%d, eip %08lx, registers:\n",
13751 + smp_processor_id(), regs->eip);
13752 + show_registers(regs);
13753 + printk(KERN_EMERG "console shuts up ...\n");
13754 + console_silent();
13755 + spin_unlock(&nmi_print_lock);
13756 + bust_spinlocks(0);
13758 + /* If we are in kernel we are probably nested up pretty bad
13759 + * and might as well get out now while we still can.
13761 + if (!user_mode_vm(regs)) {
13762 + current->thread.trap_no = 2;
13763 + crash_kexec(regs);
13766 + do_exit(SIGSEGV);
13769 +static void default_do_nmi(struct pt_regs * regs)
13771 + unsigned char reason = 0;
13773 + /* Only the BSP gets external NMIs from the system. */
13774 + if (!smp_processor_id())
13775 + reason = get_nmi_reason();
13777 + if (!(reason & 0xc0)) {
13778 + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
13781 +#ifdef CONFIG_X86_LOCAL_APIC
13783 + * Ok, so this is none of the documented NMI sources,
13784 + * so it must be the NMI watchdog.
13786 + if (nmi_watchdog) {
13787 + nmi_watchdog_tick(regs);
13791 + unknown_nmi_error(reason, regs);
13794 + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
13796 + if (reason & 0x80)
13797 + mem_parity_error(reason, regs);
13798 + if (reason & 0x40)
13799 + io_check_error(reason, regs);
13801 + * Reassert NMI in case it became active meanwhile
13802 + * as it's edge-triggered.
13807 +static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
13812 +static nmi_callback_t nmi_callback = dummy_nmi_callback;
13814 +fastcall void do_nmi(struct pt_regs * regs, long error_code)
13820 + cpu = smp_processor_id();
13822 + ++nmi_count(cpu);
13824 + if (!rcu_dereference(nmi_callback)(regs, cpu))
13825 + default_do_nmi(regs);
13830 +void set_nmi_callback(nmi_callback_t callback)
13832 + vmalloc_sync_all();
13833 + rcu_assign_pointer(nmi_callback, callback);
13835 +EXPORT_SYMBOL_GPL(set_nmi_callback);
13837 +void unset_nmi_callback(void)
13839 + nmi_callback = dummy_nmi_callback;
13841 +EXPORT_SYMBOL_GPL(unset_nmi_callback);
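+/*
+ * Illustrative registration (sketch; my_nmi is hypothetical): a
+ * watchdog or profiler hooks NMIs as below. A nonzero return claims
+ * the NMI; returning 0 lets default_do_nmi() run.
+ *
+ *	static int my_nmi(struct pt_regs *regs, int cpu)
+ *	{
+ *		return 0;
+ *	}
+ *
+ *	set_nmi_callback(my_nmi);
+ *	...
+ *	unset_nmi_callback();
+ */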
13843 +#ifdef CONFIG_KPROBES
13844 +fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
13846 + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
13849 + /* This is an interrupt gate, because kprobes wants interrupts
13850 + disabled. Normal trap handlers don't. */
13851 + restore_interrupts(regs);
13852 + do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
13857 + * Our handling of the processor debug registers is non-trivial.
13858 + * We do not clear them on entry and exit from the kernel. Therefore
13859 + * it is possible to get a watchpoint trap here from inside the kernel.
13860 + * However, the code in ./ptrace.c has ensured that the user can
13861 + * only set watchpoints on userspace addresses. Therefore the in-kernel
13862 + * watchpoint trap can only occur in code which is reading/writing
13863 + * from user space. Such code must not hold kernel locks (since it
13864 + * can equally take a page fault), therefore it is safe to call
13865 + * force_sig_info even though that claims and releases locks.
13867 + * Code in ./signal.c ensures that the debug control register
13868 + * is restored before we deliver any signal, and therefore that
13869 + * user code runs with the correct debug control register even though
13870 + * we clear it here.
13872 + * Being careful here means that we don't have to be as careful in a
13873 + * lot of more complicated places (task switching can be a bit lazy
13874 + * about restoring all the debug state, and ptrace doesn't have to
13875 + * find every occurrence of the TF bit that could be saved away even
13878 +fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
13880 + unsigned int condition;
13881 + struct task_struct *tsk = current;
13883 + get_debugreg(condition, 6);
13885 + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
13886 + SIGTRAP) == NOTIFY_STOP)
13888 + /* It's safe to allow irq's after DR6 has been saved */
13889 + if (regs->eflags & X86_EFLAGS_IF)
13890 + local_irq_enable();
13892 + /* Mask out spurious debug traps due to lazy DR7 setting */
13893 + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
13894 + if (!tsk->thread.debugreg[7])
13898 + if (regs->eflags & VM_MASK)
13901 + /* Save debug status register where ptrace can see it */
13902 + tsk->thread.debugreg[6] = condition;
13905 + * Single-stepping through TF: make sure we ignore any events in
13906 + * kernel space (but re-enable TF when returning to user mode).
13908 + if (condition & DR_STEP) {
13910 + * We already checked v86 mode above, so we can
13911 + * check for kernel mode by just checking the CPL
13914 + if (!user_mode(regs))
13915 + goto clear_TF_reenable;
13918 + /* Ok, finally something we can handle */
13919 + send_sigtrap(tsk, regs, error_code);
13921 + /* Disable additional traps. They'll be re-enabled when
13922 + * the signal is delivered.
13925 + set_debugreg(0, 7);
13929 + handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
13932 +clear_TF_reenable:
13933 + set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
13934 + regs->eflags &= ~TF_MASK;
13939 + * Note that we play around with the 'TS' bit in an attempt to get
13940 + * the correct behaviour even in the presence of the asynchronous
13941 + * IRQ13 behaviour
13943 +void math_error(void __user *eip)
13945 + struct task_struct * task;
13947 + unsigned short cwd, swd;
13950 + * Save the info for the exception handler and clear the error.
13953 + save_init_fpu(task);
13954 + task->thread.trap_no = 16;
13955 + task->thread.error_code = 0;
13956 + info.si_signo = SIGFPE;
13957 + info.si_errno = 0;
13958 + info.si_code = __SI_FAULT;
13959 + info.si_addr = eip;
13961 + * (~cwd & swd) will mask out exceptions that are not set to unmasked
13962 + * status. 0x3f is the exception bits in these regs, 0x200 is the
13963 + * C1 reg you need in case of a stack fault, 0x040 is the stack
13964 + * fault bit. We should only be taking one exception at a time,
13965 + * so if this combination doesn't produce any single exception,
13966 + * then we have a bad program that isn't synchronizing its FPU usage
13967 + * and it will suffer the consequences since we won't be able to
13968 + * fully reproduce the context of the exception
13970 + cwd = get_fpu_cwd(task);
13971 + swd = get_fpu_swd(task);
13972 + switch (swd & ~cwd & 0x3f) {
13973 + case 0x000: /* No unmasked exception */
13975 + default: /* Multiple exceptions */
13977 + case 0x001: /* Invalid Op */
13979 + * swd & 0x240 == 0x040: Stack Underflow
13980 + * swd & 0x240 == 0x240: Stack Overflow
13981 + * User must clear the SF bit (0x40) if set
13983 + info.si_code = FPE_FLTINV;
13985 + case 0x002: /* Denormalize */
13986 + case 0x010: /* Underflow */
13987 + info.si_code = FPE_FLTUND;
13989 + case 0x004: /* Zero Divide */
13990 + info.si_code = FPE_FLTDIV;
13992 + case 0x008: /* Overflow */
13993 + info.si_code = FPE_FLTOVF;
13995 + case 0x020: /* Precision */
13996 + info.si_code = FPE_FLTRES;
13999 + force_sig_info(SIGFPE, &info, task);
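+/*
+ * Worked example: cwd = 0x037b leaves only the zero-divide exception
+ * (ZM, bit 0x04) unmasked; a pending divide sets 0x04 in swd, so
+ * swd & ~cwd & 0x3f == 0x004 and the switch above picks FPE_FLTDIV.
+ */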
14002 +fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
14004 + ignore_fpu_irq = 1;
14005 + math_error((void __user *)regs->eip);
14008 +static void simd_math_error(void __user *eip)
14010 + struct task_struct * task;
14012 + unsigned short mxcsr;
14015 + * Save the info for the exception handler and clear the error.
14018 + save_init_fpu(task);
14019 + task->thread.trap_no = 19;
14020 + task->thread.error_code = 0;
14021 + info.si_signo = SIGFPE;
14022 + info.si_errno = 0;
14023 + info.si_code = __SI_FAULT;
14024 + info.si_addr = eip;
14026 + * The SIMD FPU exceptions are handled a little differently, as there
14027 + * is only a single status/control register. Thus, to determine which
14028 + * unmasked exception was caught we must mask the exception mask bits
14029 + * at 0x1f80, and then use these to mask the exception bits at 0x3f.
14031 + mxcsr = get_fpu_mxcsr(task);
14032 + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
14036 + case 0x001: /* Invalid Op */
14037 + info.si_code = FPE_FLTINV;
14039 + case 0x002: /* Denormalize */
14040 + case 0x010: /* Underflow */
14041 + info.si_code = FPE_FLTUND;
14043 + case 0x004: /* Zero Divide */
14044 + info.si_code = FPE_FLTDIV;
14046 + case 0x008: /* Overflow */
14047 + info.si_code = FPE_FLTOVF;
14049 + case 0x020: /* Precision */
14050 + info.si_code = FPE_FLTRES;
14053 + force_sig_info(SIGFPE, &info, task);
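+/*
+ * Worked example: mxcsr = 0x1d84 has ZM (bit 9) clear and ZE (bit 2)
+ * set, so (mxcsr & 0x1f80) >> 7 == 0x3b and ~0x3b & 0x04 == 0x004:
+ * the switch above picks FPE_FLTDIV, as in the x87 case.
+ */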
14056 +fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
14059 + if (cpu_has_xmm) {
14060 + /* Handle SIMD FPU exceptions on PIII+ processors. */
14061 + ignore_fpu_irq = 1;
14062 + simd_math_error((void __user *)regs->eip);
14065 + * Handle strange cache flush from user space exception
14066 + * in all other cases. This is undocumented behaviour.
14068 + if (regs->eflags & VM_MASK) {
14069 + handle_vm86_fault((struct kernel_vm86_regs *)regs,
14073 + current->thread.trap_no = 19;
14074 + current->thread.error_code = error_code;
14075 + die_if_kernel("cache flush denied", regs, error_code);
14076 + force_sig(SIGSEGV, current);
14080 +#ifndef CONFIG_XEN
14081 +fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
14085 + /* No need to warn about this any longer. */
14086 + printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
14090 +fastcall void setup_x86_bogus_stack(unsigned char * stk)
14092 + unsigned long *switch16_ptr, *switch32_ptr;
14093 + struct pt_regs *regs;
14094 + unsigned long stack_top, stack_bot;
14095 + unsigned short iret_frame16_off;
14096 + int cpu = smp_processor_id();
14097 + /* reserve the space on 32bit stack for the magic switch16 pointer */
14098 + memmove(stk, stk + 8, sizeof(struct pt_regs));
14099 + switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
14100 + regs = (struct pt_regs *)stk;
14101 + /* now the switch32 on 16bit stack */
14102 + stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
14103 + stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
14104 + switch32_ptr = (unsigned long *)(stack_top - 8);
14105 + iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
14106 + /* copy iret frame on 16bit stack */
14107 + memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
14108 + /* fill in the switch pointers */
14109 + switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
14110 + switch16_ptr[1] = __ESPFIX_SS;
14111 + switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
14112 + 8 - CPU_16BIT_STACK_SIZE;
14113 + switch32_ptr[1] = __KERNEL_DS;
14116 +fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
14118 + unsigned long *switch32_ptr;
14119 + unsigned char *stack16, *stack32;
14120 + unsigned long stack_top, stack_bot;
14122 + int cpu = smp_processor_id();
14123 + stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
14124 + stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
14125 + switch32_ptr = (unsigned long *)(stack_top - 8);
14126 + /* copy the data from 16bit stack to 32bit stack */
14127 + len = CPU_16BIT_STACK_SIZE - 8 - sp;
14128 + stack16 = (unsigned char *)(stack_bot + sp);
14129 + stack32 = (unsigned char *)
14130 + (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
14131 + memcpy(stack32, stack16, len);
14137 + * 'math_state_restore()' saves the current math information in the
14138 + * old math state array, and gets the new ones from the current task
14140 + * Careful.. There are problems with IBM-designed IRQ13 behaviour.
14141 + * Don't touch unless you *really* know how it works.
14143 + * Must be called with kernel preemption disabled (in this case,
14144 + * local interrupts are disabled at the call-site in entry.S).
14146 +asmlinkage void math_state_restore(struct pt_regs regs)
14148 + struct thread_info *thread = current_thread_info();
14149 + struct task_struct *tsk = thread->task;
14151 + /* NB. 'clts' is done for us by Xen during virtual trap. */
14152 + if (!tsk_used_math(tsk))
14154 + restore_fpu(tsk);
14155 + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
14158 +#ifndef CONFIG_MATH_EMULATION
14160 +asmlinkage void math_emulate(long arg)
14162 + printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
14163 + printk(KERN_EMERG "killing %s.\n",current->comm);
14164 + force_sig(SIGFPE,current);
14168 +#endif /* CONFIG_MATH_EMULATION */
14170 +#ifdef CONFIG_X86_F00F_BUG
14171 +void __init trap_init_f00f_bug(void)
14173 + __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
14176 + * Update the IDT descriptor and reload the IDT so that
14177 + * it uses the read-only mapped virtual address.
14179 + idt_descr.address = fix_to_virt(FIX_F00F_IDT);
14180 + load_idt(&idt_descr);
14186 + * NB. All these are "trap gates" (i.e. events_mask isn't set) except
14187 + * for those that specify <dpl>|4 in the second field.
14189 +static trap_info_t __cpuinitdata trap_table[] = {
14190 + { 0, 0, __KERNEL_CS, (unsigned long)divide_error },
14191 + { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
14192 + { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
14193 + { 4, 3, __KERNEL_CS, (unsigned long)overflow },
14194 + { 5, 0, __KERNEL_CS, (unsigned long)bounds },
14195 + { 6, 0, __KERNEL_CS, (unsigned long)invalid_op },
14196 + { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available },
14197 + { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
14198 + { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS },
14199 + { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present },
14200 + { 12, 0, __KERNEL_CS, (unsigned long)stack_segment },
14201 + { 13, 0, __KERNEL_CS, (unsigned long)general_protection },
14202 + { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault },
14203 + { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment },
14204 + { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error },
14205 + { 17, 0, __KERNEL_CS, (unsigned long)alignment_check },
14206 +#ifdef CONFIG_X86_MCE
14207 + { 18, 0, __KERNEL_CS, (unsigned long)machine_check },
14209 + { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
14210 + { SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)system_call },
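+/*
+ * Reminder on the second field: the low two bits hold the gate's DPL,
+ * and |4 asks Xen to disable event delivery on entry (the equivalent
+ * of an interrupt gate). So { 3, 3|4, ... } admits a userspace int3
+ * (DPL 3) with events masked, while { 4, 3, ... } leaves them enabled.
+ */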
14214 +void __init trap_init(void)
14218 + ret = HYPERVISOR_set_trap_table(trap_table);
14220 + printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
14222 + if (cpu_has_fxsr) {
14224 + * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
14225 + * Generates a compile-time "error: zero width for bit-field" if
14226 + * the alignment is wrong.
14228 + struct fxsrAlignAssert {
14229 + int _:!(offsetof(struct task_struct,
14230 + thread.i387.fxsave) & 15);
14233 + printk(KERN_INFO "Enabling fast FPU save and restore... ");
14234 + set_in_cr4(X86_CR4_OSFXSR);
14235 + printk("done.\n");
14237 + if (cpu_has_xmm) {
14238 + printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
14240 + set_in_cr4(X86_CR4_OSXMMEXCPT);
14241 + printk("done.\n");
14245 + * Should be a barrier for any external CPU state.
14250 +void __cpuinit smp_trap_init(trap_info_t *trap_ctxt)
14252 + const trap_info_t *t = trap_table;
14254 + for (t = trap_table; t->address; t++) {
14255 + trap_ctxt[t->vector].flags = t->flags;
14256 + trap_ctxt[t->vector].cs = t->cs;
14257 + trap_ctxt[t->vector].address = t->address;
14261 +static int __init kstack_setup(char *s)
14263 + kstack_depth_to_print = simple_strtoul(s, NULL, 0);
14266 +__setup("kstack=", kstack_setup);
14268 +#ifdef CONFIG_STACK_UNWIND
14269 +static int __init call_trace_setup(char *s)
14271 + if (strcmp(s, "old") == 0)
14273 + else if (strcmp(s, "both") == 0)
14275 + else if (strcmp(s, "newfallback") == 0)
14277 + else if (strcmp(s, "new") == 0)
14281 +__setup("call_trace=", call_trace_setup);
14283 Index: head-2008-11-25/arch/x86/mach-xen/Makefile
14284 ===================================================================
14285 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
14286 +++ head-2008-11-25/arch/x86/mach-xen/Makefile 2007-06-12 13:12:48.000000000 +0200
14289 +# Makefile for the linux kernel.
14293 Index: head-2008-11-25/arch/x86/mach-xen/setup.c
14294 ===================================================================
14295 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
14296 +++ head-2008-11-25/arch/x86/mach-xen/setup.c 2008-04-02 12:34:02.000000000 +0200
14299 + * Machine specific setup for generic
14302 +#include <linux/mm.h>
14303 +#include <linux/smp.h>
14304 +#include <linux/init.h>
14305 +#include <linux/interrupt.h>
14306 +#include <linux/module.h>
14307 +#include <asm/acpi.h>
14308 +#include <asm/arch_hooks.h>
14309 +#include <asm/e820.h>
14310 +#include <asm/setup.h>
14311 +#include <asm/fixmap.h>
14313 +#include <xen/interface/callback.h>
14314 +#include <xen/interface/memory.h>
14316 +#ifdef CONFIG_HOTPLUG_CPU
14317 +#define DEFAULT_SEND_IPI (1)
14319 +#define DEFAULT_SEND_IPI (0)
14322 +int no_broadcast=DEFAULT_SEND_IPI;
14324 +static __init int no_ipi_broadcast(char *str)
14326 + get_option(&str, &no_broadcast);
14327 + printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
14328 + "IPI Broadcast");
14332 +__setup("no_ipi_broadcast", no_ipi_broadcast);
14334 +static int __init print_ipi_mode(void)
14336 + printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
14341 +late_initcall(print_ipi_mode);
14344 + * machine_specific_memory_setup - Hook for machine specific memory setup.
14347 + * This is included late in kernel/setup.c so that it can make
14348 + * use of all of the static functions.
14351 +char * __init machine_specific_memory_setup(void)
14354 + struct xen_memory_map memmap;
14356 + * This is rather large for a stack variable but this early in
14357 + * the boot process we know we have plenty of slack space.
14359 + struct e820entry map[E820MAX];
14361 + memmap.nr_entries = E820MAX;
14362 + set_xen_guest_handle(memmap.buffer, map);
14364 + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
14365 + if ( rc == -ENOSYS ) {
14366 + memmap.nr_entries = 1;
14367 + map[0].addr = 0ULL;
14368 + map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
14369 + /* 8MB slack (to balance backend allocations). */
14370 + map[0].size += 8ULL << 20;
14371 + map[0].type = E820_RAM;
14376 + sanitize_e820_map(map, (char *)&memmap.nr_entries);
14378 + BUG_ON(copy_e820_map(map, (char *)&memmap.nr_entries) < 0);
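+/*
+ * Worked example for the -ENOSYS fallback above: with nr_pages =
+ * 0x20000 (512 MiB of guest memory, 4 KiB pages), map[0].size becomes
+ * PFN_PHYS(0x20000) + (8ULL << 20) = 0x20000000 + 0x800000 bytes,
+ * i.e. a single 520 MiB E820_RAM entry including the 8 MiB slack.
+ */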
14384 +extern void hypervisor_callback(void);
14385 +extern void failsafe_callback(void);
14386 +extern void nmi(void);
14388 +unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
14389 +EXPORT_SYMBOL(machine_to_phys_mapping);
14390 +unsigned int machine_to_phys_order;
14391 +EXPORT_SYMBOL(machine_to_phys_order);
14393 +void __init pre_setup_arch_hook(void)
14395 + struct xen_machphys_mapping mapping;
14396 + unsigned long machine_to_phys_nr_ents;
14397 + struct xen_platform_parameters pp;
14399 + init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base;
14401 + setup_xen_features();
14403 + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
14404 + set_fixaddr_top(pp.virt_start);
14406 + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
14407 + machine_to_phys_mapping = (unsigned long *)mapping.v_start;
14408 + machine_to_phys_nr_ents = mapping.max_mfn + 1;
14410 + machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
14411 + machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
14413 + if (!xen_feature(XENFEAT_auto_translated_physmap))
14414 + phys_to_machine_mapping =
14415 + (unsigned long *)xen_start_info->mfn_list;
14418 +void __init machine_specific_arch_setup(void)
14421 + static struct callback_register __initdata event = {
14422 + .type = CALLBACKTYPE_event,
14423 + .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
14425 + static struct callback_register __initdata failsafe = {
14426 + .type = CALLBACKTYPE_failsafe,
14427 + .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
14429 + static struct callback_register __initdata nmi_cb = {
14430 + .type = CALLBACKTYPE_nmi,
14431 + .address = { __KERNEL_CS, (unsigned long)nmi },
14434 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
14436 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
14437 +#if CONFIG_XEN_COMPAT <= 0x030002
14438 + if (ret == -ENOSYS)
14439 + ret = HYPERVISOR_set_callbacks(
14440 + event.address.cs, event.address.eip,
14441 + failsafe.address.cs, failsafe.address.eip);
14445 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
14446 +#if CONFIG_XEN_COMPAT <= 0x030002
14447 + if (ret == -ENOSYS) {
14448 + static struct xennmi_callback __initdata cb = {
14449 + .handler_address = (unsigned long)nmi
14452 + HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
14456 Index: head-2008-11-25/arch/x86/lib/scrub.c
14457 ===================================================================
14458 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
14459 +++ head-2008-11-25/arch/x86/lib/scrub.c 2008-02-08 12:30:51.000000000 +0100
14461 +#include <asm/cpufeature.h>
14462 +#include <asm/page.h>
14463 +#include <asm/processor.h>
14465 +void scrub_pages(void *v, unsigned int count)
14467 + if (likely(cpu_has_xmm2)) {
14468 + unsigned long n = count * (PAGE_SIZE / sizeof(long) / 4);
14470 + for (; n--; v += sizeof(long) * 4)
14471 + asm("movnti %1,(%0)\n\t"
14472 + "movnti %1,%c2(%0)\n\t"
14473 + "movnti %1,2*%c2(%0)\n\t"
14474 + "movnti %1,3*%c2(%0)\n\t"
14475 + : : "r" (v), "r" (0L), "i" (sizeof(long))
14477 + asm volatile("sfence" : : : "memory");
14479 + for (; count--; v += PAGE_SIZE)
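+/*
+ * The SSE2 path above scrubs with non-temporal stores (movnti) so that
+ * zeroing freed pages does not evict live cache lines; the trailing
+ * sfence orders the stores before the pages are released. Illustrative
+ * caller (sketch; release_to_hypervisor() is hypothetical):
+ *
+ *	scrub_pages(page_address(page), 1);
+ *	release_to_hypervisor(page);
+ */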
14482 Index: head-2008-11-25/arch/x86/mm/fault_32-xen.c
14483 ===================================================================
14484 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
14485 +++ head-2008-11-25/arch/x86/mm/fault_32-xen.c 2007-12-10 08:47:31.000000000 +0100
14488 + * linux/arch/i386/mm/fault.c
14490 + * Copyright (C) 1995 Linus Torvalds
14493 +#include <linux/signal.h>
14494 +#include <linux/sched.h>
14495 +#include <linux/kernel.h>
14496 +#include <linux/errno.h>
14497 +#include <linux/string.h>
14498 +#include <linux/types.h>
14499 +#include <linux/ptrace.h>
14500 +#include <linux/mman.h>
14501 +#include <linux/mm.h>
14502 +#include <linux/smp.h>
14503 +#include <linux/smp_lock.h>
14504 +#include <linux/interrupt.h>
14505 +#include <linux/init.h>
14506 +#include <linux/tty.h>
14507 +#include <linux/vt_kern.h> /* For unblank_screen() */
14508 +#include <linux/highmem.h>
14509 +#include <linux/module.h>
14510 +#include <linux/kprobes.h>
14512 +#include <asm/system.h>
14513 +#include <asm/uaccess.h>
14514 +#include <asm/desc.h>
14515 +#include <asm/kdebug.h>
14517 +extern void die(const char *,struct pt_regs *,long);
14519 +#ifdef CONFIG_KPROBES
14520 +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
14521 +int register_page_fault_notifier(struct notifier_block *nb)
14523 + vmalloc_sync_all();
14524 + return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
14527 +int unregister_page_fault_notifier(struct notifier_block *nb)
14529 + return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
14532 +static inline int notify_page_fault(enum die_val val, const char *str,
14533 + struct pt_regs *regs, long err, int trap, int sig)
14535 + struct die_args args = {
14542 + return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
14545 +static inline int notify_page_fault(enum die_val val, const char *str,
14546 + struct pt_regs *regs, long err, int trap, int sig)
14548 + return NOTIFY_DONE;
14554 + * Unlock any spinlocks which will prevent us from getting the
14557 +void bust_spinlocks(int yes)
14559 + int loglevel_save = console_loglevel;
14562 + oops_in_progress = 1;
14566 + unblank_screen();
14568 + oops_in_progress = 0;
14570 + * OK, the message is on the console. Now we call printk()
14571 + * without oops_in_progress set so that printk will give klogd
14572 + * a poke. Hold onto your hats...
14574 + console_loglevel = 15; /* NMI oopser may have shut the console up */
14576 + console_loglevel = loglevel_save;
14580 + * Return EIP plus the CS segment base. The segment limit is also
14581 + * adjusted, clamped to the kernel/user address space (whichever is
14582 + * appropriate), and returned in *eip_limit.
14584 + * The segment is checked, because it might have been changed by another
14585 + * task between the original faulting instruction and here.
14587 + * If CS is no longer a valid code segment, or if EIP is beyond the
14588 + * limit, or if it is a kernel address when CS is not a kernel segment,
14589 + * then the returned value will be greater than *eip_limit.
14591 + * This is slow, but is very rarely executed.
14593 +static inline unsigned long get_segment_eip(struct pt_regs *regs,
14594 + unsigned long *eip_limit)
14596 + unsigned long eip = regs->eip;
14597 + unsigned seg = regs->xcs & 0xffff;
14598 + u32 seg_ar, seg_limit, base, *desc;
14600 + /* Unlikely, but must come before segment checks. */
14601 + if (unlikely(regs->eflags & VM_MASK)) {
14603 + *eip_limit = base + 0xffff;
14604 + return base + (eip & 0xffff);
14607 + /* The standard kernel/user address space limit. */
14608 + *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
14610 + /* By far the most common cases. */
14611 + if (likely(seg == __USER_CS || seg == GET_KERNEL_CS()))
14614 + /* Check the segment exists, is within the current LDT/GDT size,
14615 + that kernel/user (ring 0..3) has the appropriate privilege,
14616 + that it's a code segment, and get the limit. */
14617 + __asm__ ("larl %3,%0; lsll %3,%1"
14618 + : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
14619 + if ((~seg_ar & 0x9800) || eip > seg_limit) {
14621 + return 1; /* So that returned eip > *eip_limit. */
14624 + /* Get the GDT/LDT descriptor base.
14625 + When you look for races in this code remember that
14626 + LDT and other horrors are only used in user space. */
14627 + if (seg & (1<<2)) {
14628 + /* Must lock the LDT while reading it. */
14629 + down(&current->mm->context.sem);
14630 + desc = current->mm->context.ldt;
14631 + desc = (void *)desc + (seg & ~7);
14633 + /* Must disable preemption while reading the GDT. */
14634 + desc = (u32 *)get_cpu_gdt_table(get_cpu());
14635 + desc = (void *)desc + (seg & ~7);
14638 + /* Decode the code segment base from the descriptor */
14639 + base = get_desc_base((unsigned long *)desc);
14641 + if (seg & (1<<2)) {
14642 + up(&current->mm->context.sem);
14646 + /* Adjust EIP and segment limit, and clamp at the kernel limit.
14647 + It's legitimate for segments to wrap at 0xffffffff. */
14648 + seg_limit += base;
14649 + if (seg_limit < *eip_limit && seg_limit >= base)
14650 + *eip_limit = seg_limit;
14651 + return eip + base;
14655 + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
14656 + * Check that here and ignore it.
14658 +static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
14660 + unsigned long limit;
14661 + unsigned long instr = get_segment_eip (regs, &limit);
14662 + int scan_more = 1;
14663 + int prefetch = 0;
14666 + for (i = 0; scan_more && i < 15; i++) {
14667 + unsigned char opcode;
14668 + unsigned char instr_hi;
14669 + unsigned char instr_lo;
14671 + if (instr > limit)
14673 + if (__get_user(opcode, (unsigned char __user *) instr))
14676 + instr_hi = opcode & 0xf0;
14677 + instr_lo = opcode & 0x0f;
14680 + switch (instr_hi) {
14683 + /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
14684 + scan_more = ((instr_lo & 7) == 0x6);
14688 + /* 0x64 thru 0x67 are valid prefixes in all modes. */
14689 + scan_more = (instr_lo & 0xC) == 0x4;
14692 + /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
14693 + scan_more = !instr_lo || (instr_lo>>1) == 1;
14696 + /* Prefetch instruction is 0x0F0D or 0x0F18 */
14698 + if (instr > limit)
14700 + if (__get_user(opcode, (unsigned char __user *) instr))
14702 + prefetch = (instr_lo == 0xF) &&
14703 + (opcode == 0x0D || opcode == 0x18);
14713 +static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
14714 + unsigned long error_code)
14716 + if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
14717 + boot_cpu_data.x86 >= 6)) {
14718 + /* Catch an obscure case of prefetch inside an NX page. */
14719 + if (nx_enabled && (error_code & 16))
14721 + return __is_prefetch(regs, addr);
14726 +static noinline void force_sig_info_fault(int si_signo, int si_code,
14727 + unsigned long address, struct task_struct *tsk)
14731 + info.si_signo = si_signo;
14732 + info.si_errno = 0;
14733 + info.si_code = si_code;
14734 + info.si_addr = (void __user *)address;
14735 + force_sig_info(si_signo, &info, tsk);
14738 +fastcall void do_invalid_op(struct pt_regs *, unsigned long);
14740 +#ifdef CONFIG_X86_PAE
14741 +static void dump_fault_path(unsigned long address)
14743 + unsigned long *p, page;
14744 + unsigned long mfn;
14746 + page = read_cr3();
14747 + p = (unsigned long *)__va(page);
14748 + p += (address >> 30) * 2;
14749 + printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
14750 + if (p[0] & _PAGE_PRESENT) {
14751 + mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
14752 + page = mfn_to_pfn(mfn) << PAGE_SHIFT;
14753 + p = (unsigned long *)__va(page);
14754 + address &= 0x3fffffff;
14755 + p += (address >> 21) * 2;
14756 + printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n",
14757 + page, p[1], p[0]);
14758 + mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
14759 +#ifdef CONFIG_HIGHPTE
14760 + if (mfn_to_pfn(mfn) >= highstart_pfn)
14763 + if (p[0] & _PAGE_PRESENT) {
14764 + page = mfn_to_pfn(mfn) << PAGE_SHIFT;
14765 + p = (unsigned long *) __va(page);
14766 + address &= 0x001fffff;
14767 + p += (address >> 12) * 2;
14768 + printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
14769 + page, p[1], p[0]);
14774 +static void dump_fault_path(unsigned long address)
14776 + unsigned long page;
14778 + page = read_cr3();
14779 + page = ((unsigned long *) __va(page))[address >> 22];
14780 + if (oops_may_print())
14781 + printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
14782 + machine_to_phys(page));
14784 + * We must not directly access the pte in the highpte
14785 + * case if the page table is located in highmem.
14786 + * And let's rather not kmap-atomic the pte, just in case
14787 + * it's allocated already.
14789 +#ifdef CONFIG_HIGHPTE
14790 + if ((page >> PAGE_SHIFT) >= highstart_pfn)
14793 + if ((page & 1) && oops_may_print()) {
14794 + page &= PAGE_MASK;
14795 + address &= 0x003ff000;
14796 + page = machine_to_phys(page);
14797 + page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
14798 + printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
14799 + machine_to_phys(page));
14804 +static int spurious_fault(struct pt_regs *regs,
14805 + unsigned long address,
14806 + unsigned long error_code)
14813 + /* Reserved-bit violation or user access to kernel space? */
14814 + if (error_code & 0x0c)
14817 + pgd = init_mm.pgd + pgd_index(address);
14818 + if (!pgd_present(*pgd))
14821 + pud = pud_offset(pgd, address);
14822 + if (!pud_present(*pud))
14825 + pmd = pmd_offset(pud, address);
14826 + if (!pmd_present(*pmd))
14829 + pte = pte_offset_kernel(pmd, address);
14830 + if (!pte_present(*pte))
14832 + if ((error_code & 0x02) && !pte_write(*pte))
14834 +#ifdef CONFIG_X86_PAE
14835 + if ((error_code & 0x10) && (__pte_val(*pte) & _PAGE_NX))
14842 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
14844 + unsigned index = pgd_index(address);
14846 + pud_t *pud, *pud_k;
14847 + pmd_t *pmd, *pmd_k;
14850 + pgd_k = init_mm.pgd + index;
14852 + if (!pgd_present(*pgd_k))
14856 + * set_pgd(pgd, *pgd_k); here would be useless on PAE
14857 + * and redundant with the set_pmd() on non-PAE. As would
14861 + pud = pud_offset(pgd, address);
14862 + pud_k = pud_offset(pgd_k, address);
14863 + if (!pud_present(*pud_k))
14866 + pmd = pmd_offset(pud, address);
14867 + pmd_k = pmd_offset(pud_k, address);
14868 + if (!pmd_present(*pmd_k))
14870 + if (!pmd_present(*pmd))
14871 +#if CONFIG_XEN_COMPAT > 0x030002
14872 + set_pmd(pmd, *pmd_k);
14875 + * When running on older Xen we must launder *pmd_k through
14876 + * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
14878 + set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
14881 + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
14886 + * Handle a fault on the vmalloc or module mapping area
14888 + * This assumes no large pages in there.
14890 +static inline int vmalloc_fault(unsigned long address)
14892 + unsigned long pgd_paddr;
14896 + * Synchronize this task's top level page-table
14897 + * with the 'reference' page table.
14899 + * Do _not_ use "current" here. We might be inside
14900 + * an interrupt in the middle of a task switch..
14902 + pgd_paddr = read_cr3();
14903 + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
14906 + pte_k = pte_offset_kernel(pmd_k, address);
14907 + if (!pte_present(*pte_k))
14913 + * This routine handles page faults. It determines the address,
14914 + * and the problem, and then passes it off to one of the appropriate
14918 + * bit 0 == 0 means no page found, 1 means protection fault
14919 + * bit 1 == 0 means read, 1 means write
14920 + * bit 2 == 0 means kernel, 1 means user-mode
14921 + * bit 3 == 1 means use of reserved bit detected
14922 + * bit 4 == 1 means fault was an instruction fetch
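+ *
+ * Worked example: error_code == 6 (write, user, not-present) is the
+ * common demand-paging fault, while error_code == 5 (read, user,
+ * protection) is a user read of a present but protected page.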
14924 +fastcall void __kprobes do_page_fault(struct pt_regs *regs,
14925 + unsigned long error_code)
14927 + struct task_struct *tsk;
14928 + struct mm_struct *mm;
14929 + struct vm_area_struct * vma;
14930 + unsigned long address;
14931 + int write, si_code;
14933 + /* get the address */
14934 + address = read_cr2();
14936 + /* Set the "privileged fault" bit to something sane. */
14937 + error_code &= ~4;
14938 + error_code |= (regs->xcs & 2) << 1;
14939 + if (regs->eflags & X86_EFLAGS_VM)
14944 + si_code = SEGV_MAPERR;
14947 + * We fault-in kernel-space virtual memory on-demand. The
14948 + * 'reference' page table is init_mm.pgd.
14950 + * NOTE! We MUST NOT take any locks for this case. We may
14951 + * be in an interrupt or a critical region, and should
14952 + * only copy the information from the master page table,
14955 + * This verifies that the fault happens in kernel space
14956 + * (error_code & 4) == 0, and that the fault was not a
14957 + * protection error (error_code & 9) == 0.
14959 + if (unlikely(address >= TASK_SIZE)) {
14961 + /* Faults in hypervisor area can never be patched up. */
14962 + if (address >= hypervisor_virt_start)
14963 + goto bad_area_nosemaphore;
14965 + if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
14967 + /* Can take a spurious fault if mapping changes R/O -> R/W. */
14968 + if (spurious_fault(regs, address, error_code))
14970 + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
14971 + SIGSEGV) == NOTIFY_STOP)
14974 + * Don't take the mm semaphore here. If we fixup a prefetch
14975 + * fault we could otherwise deadlock.
14977 + goto bad_area_nosemaphore;
14980 + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
14981 + SIGSEGV) == NOTIFY_STOP)
14984 + /* It's safe to allow irq's after cr2 has been saved and the vmalloc
14985 + fault has been handled. */
14986 + if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
14987 + local_irq_enable();
14992 + * If we're in an interrupt, have no user context or are running in an
14993 + * atomic region then we must not take the fault..
14995 + if (in_atomic() || !mm)
14996 + goto bad_area_nosemaphore;
14998 + /* When running in the kernel we expect faults to occur only to
14999 + * addresses in user space. All other faults represent errors in the
15000 + * kernel and should generate an OOPS. Unfortunately, in the case of an
15001 + * erroneous fault occurring in a code path which already holds mmap_sem
15002 + * we will deadlock attempting to validate the fault against the
15003 + * address space. Luckily the kernel only validly references user
15004 + * space from well defined areas of code, which are listed in the
15005 + * exceptions table.
15007 + * As the vast majority of faults will be valid we will only perform
15008 + * the source reference check when there is a possibility of a deadlock.
15009 + * Attempt to lock the address space, if we cannot we then validate the
15010 + * source. If this is invalid we can skip the address space check,
15011 + * thus avoiding the deadlock.
15013 + if (!down_read_trylock(&mm->mmap_sem)) {
15014 + if ((error_code & 4) == 0 &&
15015 + !search_exception_tables(regs->eip))
15016 + goto bad_area_nosemaphore;
15017 + down_read(&mm->mmap_sem);
15020 + vma = find_vma(mm, address);
15023 + if (vma->vm_start <= address)
15025 + if (!(vma->vm_flags & VM_GROWSDOWN))
15027 + if (error_code & 4) {
15029 + * Accessing the stack below %esp is always a bug.
15030 + * The large cushion allows instructions like enter
15031 + * and pusha to work. ("enter $65535,$31" pushes
15032 + * 32 pointers and then decrements %esp by 65535.)
15034 + if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
15037 + if (expand_stack(vma, address))
15040 + * Ok, we have a good vm_area for this memory access, so
15041 + * we can handle it..
15044 + si_code = SEGV_ACCERR;
15046 + switch (error_code & 3) {
15047 + default: /* 3: write, present */
15048 +#ifdef TEST_VERIFY_AREA
15049 + if (regs->cs == GET_KERNEL_CS())
15050 + printk("WP fault at %08lx\n", regs->eip);
15052 + /* fall through */
15053 + case 2: /* write, not present */
15054 + if (!(vma->vm_flags & VM_WRITE))
15058 + case 1: /* read, present */
15060 + case 0: /* read, not present */
15061 + if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
15067 + * If for any reason at all we couldn't handle the fault,
15068 + * make sure we exit gracefully rather than endlessly redo
15071 + switch (handle_mm_fault(mm, vma, address, write)) {
15072 + case VM_FAULT_MINOR:
15075 + case VM_FAULT_MAJOR:
15078 + case VM_FAULT_SIGBUS:
15080 + case VM_FAULT_OOM:
15081 + goto out_of_memory;
15087 + * Did it hit the DOS screen memory VA from vm86 mode?
15089 + if (regs->eflags & VM_MASK) {
15090 + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
15092 + tsk->thread.screen_bitmap |= 1 << bit;
15094 + up_read(&mm->mmap_sem);
15098 + * Something tried to access memory that isn't in our memory map..
15099 + * Fix it, but check if it's kernel or user first..
15102 + up_read(&mm->mmap_sem);
15104 +bad_area_nosemaphore:
15105 + /* User mode accesses just cause a SIGSEGV */
15106 + if (error_code & 4) {
15108 + * Valid to do another page fault here because this one came
15109 + * from user space.
15111 + if (is_prefetch(regs, address, error_code))
15114 + tsk->thread.cr2 = address;
15115 + /* Kernel addresses are always protection faults */
15116 + tsk->thread.error_code = error_code | (address >= TASK_SIZE);
15117 + tsk->thread.trap_no = 14;
15118 + force_sig_info_fault(SIGSEGV, si_code, address, tsk);
15122 +#ifdef CONFIG_X86_F00F_BUG
15124 + * Pentium F0 0F C7 C8 bug workaround.
15126 + if (boot_cpu_data.f00f_bug) {
15127 + unsigned long nr;
15129 + nr = (address - idt_descr.address) >> 3;
15132 + do_invalid_op(regs, 0);
15139 + /* Are we prepared to handle this kernel fault? */
15140 + if (fixup_exception(regs))
15144 + * Valid to do another page fault here, because if this fault
15145 + * had been triggered by is_prefetch fixup_exception would have
15148 + if (is_prefetch(regs, address, error_code))
15152 + * Oops. The kernel tried to access some bad page. We'll have to
15153 + * terminate things with extreme prejudice.
15156 + bust_spinlocks(1);
15158 + if (oops_may_print()) {
15159 + #ifdef CONFIG_X86_PAE
15160 + if (error_code & 16) {
15161 + pte_t *pte = lookup_address(address);
15163 + if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
15164 + printk(KERN_CRIT "kernel tried to execute "
15165 + "NX-protected page - exploit attempt? "
15166 + "(uid: %d)\n", current->uid);
15169 + if (address < PAGE_SIZE)
15170 + printk(KERN_ALERT "BUG: unable to handle kernel NULL "
15171 + "pointer dereference");
15173 + printk(KERN_ALERT "BUG: unable to handle kernel paging"
15175 + printk(" at virtual address %08lx\n",address);
15176 + printk(KERN_ALERT " printing eip:\n");
15177 + printk("%08lx\n", regs->eip);
15179 + dump_fault_path(address);
15180 + tsk->thread.cr2 = address;
15181 + tsk->thread.trap_no = 14;
15182 + tsk->thread.error_code = error_code;
15183 + die("Oops", regs, error_code);
15184 + bust_spinlocks(0);
15185 + do_exit(SIGKILL);
15188 + * We ran out of memory, or some other thing happened to us that made
15189 + * us unable to handle the page fault gracefully.
15192 + up_read(&mm->mmap_sem);
15193 + if (tsk->pid == 1) {
15195 + down_read(&mm->mmap_sem);
15198 + printk("VM: killing process %s\n", tsk->comm);
15199 + if (error_code & 4)
15200 + do_exit(SIGKILL);
15204 + up_read(&mm->mmap_sem);
15206 + /* Kernel mode? Handle exceptions or die */
15207 + if (!(error_code & 4))
15210 + /* User space => ok to do another page fault */
15211 + if (is_prefetch(regs, address, error_code))
15214 + tsk->thread.cr2 = address;
15215 + tsk->thread.error_code = error_code;
15216 + tsk->thread.trap_no = 14;
15217 + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
15220 +#if !HAVE_SHARED_KERNEL_PMD
15221 +void vmalloc_sync_all(void)
15224 + * Note that races in the updates of insync and start aren't
15225 + * problematic: insync can only get set bits added, and updates to
15226 + * start are only improving performance (without affecting correctness
15228 + * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
15229 + * This change works just fine with 2-level paging too.
15231 +#define sync_index(a) ((a) >> PMD_SHIFT)
15232 + static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
15233 + static unsigned long start = TASK_SIZE;
15234 + unsigned long address;
15236 + BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
15237 + for (address = start;
15238 + address >= TASK_SIZE && address < hypervisor_virt_start;
15239 + address += 1UL << PMD_SHIFT) {
15240 + if (!test_bit(sync_index(address), insync)) {
15241 + unsigned long flags;
15242 + struct page *page;
15244 + spin_lock_irqsave(&pgd_lock, flags);
15245 + /* XEN: failure path assumes non-empty pgd_list. */
15246 + if (unlikely(!pgd_list)) {
15247 + spin_unlock_irqrestore(&pgd_lock, flags);
15250 + for (page = pgd_list; page; page =
15251 + (struct page *)page->index)
15252 + if (!vmalloc_sync_one(page_address(page),
15254 + BUG_ON(page != pgd_list);
15257 + spin_unlock_irqrestore(&pgd_lock, flags);
15259 + set_bit(sync_index(address), insync);
15261 + if (address == start && test_bit(sync_index(address), insync))
15262 + start = address + (1UL << PMD_SHIFT);
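+/*
+ * This is also why register_die_notifier() and set_nmi_callback()
+ * earlier in this patch call vmalloc_sync_all() first: a handler that
+ * lives in module space must have its mappings propagated into every
+ * pgd before it can be entered from fault or NMI context, where taking
+ * a vmalloc fault itself would be unsafe.
+ */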
15266 Index: head-2008-11-25/arch/x86/mm/highmem_32-xen.c
15267 ===================================================================
15268 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
15269 +++ head-2008-11-25/arch/x86/mm/highmem_32-xen.c 2008-10-29 09:55:56.000000000 +0100
15271 +#include <linux/highmem.h>
15272 +#include <linux/module.h>
15274 +void *kmap(struct page *page)
15277 + if (!PageHighMem(page))
15278 + return page_address(page);
15279 + return kmap_high(page);
15282 +void kunmap(struct page *page)
15284 + if (in_interrupt())
15286 + if (!PageHighMem(page))
15288 + kunmap_high(page);
15292 + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
15293 + * no global lock is needed and because the kmap code must perform a global TLB
15294 + * invalidation when the kmap pool wraps.
15296 + * However when holding an atomic kmap it is not legal to sleep, so atomic
15297 + * kmaps are appropriate for short, tight code paths only.
15299 +static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
15301 + enum fixed_addresses idx;
15302 + unsigned long vaddr;
15304 + /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
15305 + inc_preempt_count();
15306 + if (!PageHighMem(page))
15307 + return page_address(page);
15309 + idx = type + KM_TYPE_NR*smp_processor_id();
15310 + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
15311 +#ifdef CONFIG_DEBUG_HIGHMEM
15312 + if (!pte_none(*(kmap_pte-idx)))
15315 + set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
15317 + return (void*) vaddr;
15320 +void *kmap_atomic(struct page *page, enum km_type type)
15322 + return __kmap_atomic(page, type, kmap_prot);
15325 +/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
15326 +void *kmap_atomic_pte(struct page *page, enum km_type type)
15328 + return __kmap_atomic(page, type,
15329 + test_bit(PG_pinned, &page->flags)
15330 + ? PAGE_KERNEL_RO : kmap_prot);
15333 +void kunmap_atomic(void *kvaddr, enum km_type type)
15335 +#if defined(CONFIG_DEBUG_HIGHMEM) || defined(CONFIG_XEN)
15336 + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
15337 + enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
15339 + if (vaddr < FIXADDR_START) { // FIXME
15340 + dec_preempt_count();
15341 + preempt_check_resched();
15346 +#if defined(CONFIG_DEBUG_HIGHMEM)
15347 + if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
15351 + * force other mappings to Oops if they try to access
15352 + * this pte without first remapping it
15354 + pte_clear(&init_mm, vaddr, kmap_pte-idx);
15355 + __flush_tlb_one(vaddr);
15356 +#elif defined(CONFIG_XEN)
15358 + * We must ensure there are no dangling pagetable references when
15359 + * returning memory to Xen (decrease_reservation).
15360 + * XXX TODO: We could make this faster by only zapping when
15361 + * kmap_flush_unused is called but that is trickier and more invasive.
15363 + pte_clear(&init_mm, vaddr, kmap_pte-idx);
15366 + dec_preempt_count();
15367 + preempt_check_resched();
15370 +/* This is the same as kmap_atomic() but can map memory that doesn't
15371 + * have a struct page associated with it.
15373 +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
15375 + enum fixed_addresses idx;
15376 + unsigned long vaddr;
15378 + inc_preempt_count();
15380 + idx = type + KM_TYPE_NR*smp_processor_id();
15381 + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
15382 + set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
15383 + __flush_tlb_one(vaddr);
15385 + return (void*) vaddr;
15388 +struct page *kmap_atomic_to_page(void *ptr)
15390 + unsigned long idx, vaddr = (unsigned long)ptr;
15393 + if (vaddr < FIXADDR_START)
15394 + return virt_to_page(ptr);
15396 + idx = virt_to_fix(vaddr);
15397 + pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
15398 + return pte_page(*pte);
15401 +void clear_highpage(struct page *page)
15405 + if (likely(xen_feature(XENFEAT_highmem_assist))
15406 + && PageHighMem(page)) {
15407 + struct mmuext_op meo;
15409 + meo.cmd = MMUEXT_CLEAR_PAGE;
15410 + meo.arg1.mfn = pfn_to_mfn(page_to_pfn(page));
15411 + if (HYPERVISOR_mmuext_op(&meo, 1, NULL, DOMID_SELF) == 0)
15415 + kaddr = kmap_atomic(page, KM_USER0);
15416 + clear_page(kaddr);
15417 + kunmap_atomic(kaddr, KM_USER0);
15420 +void copy_highpage(struct page *to, struct page *from)
15422 + void *vfrom, *vto;
15424 + if (likely(xen_feature(XENFEAT_highmem_assist))
15425 + && (PageHighMem(from) || PageHighMem(to))) {
15426 + unsigned long from_pfn = page_to_pfn(from);
15427 + unsigned long to_pfn = page_to_pfn(to);
15428 + struct mmuext_op meo;
15430 + meo.cmd = MMUEXT_COPY_PAGE;
15431 + meo.arg1.mfn = pfn_to_mfn(to_pfn);
15432 + meo.arg2.src_mfn = pfn_to_mfn(from_pfn);
15433 + if (mfn_to_pfn(meo.arg2.src_mfn) == from_pfn
15434 + && mfn_to_pfn(meo.arg1.mfn) == to_pfn
15435 + && HYPERVISOR_mmuext_op(&meo, 1, NULL, DOMID_SELF) == 0)
15439 + vfrom = kmap_atomic(from, KM_USER0);
15440 + vto = kmap_atomic(to, KM_USER1);
15441 + copy_page(vto, vfrom);
15442 + kunmap_atomic(vfrom, KM_USER0);
15443 + kunmap_atomic(vto, KM_USER1);
15446 +EXPORT_SYMBOL(kmap);
15447 +EXPORT_SYMBOL(kunmap);
15448 +EXPORT_SYMBOL(kmap_atomic);
15449 +EXPORT_SYMBOL(kmap_atomic_pte);
15450 +EXPORT_SYMBOL(kunmap_atomic);
15451 +EXPORT_SYMBOL(kmap_atomic_to_page);
15452 +EXPORT_SYMBOL(clear_highpage);
15453 +EXPORT_SYMBOL(copy_highpage);
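For reference, the typical calling pattern for the atomic kmap interfaces
built above — a minimal sketch, assuming the caller already holds a
struct page and a PAGE_SIZE source buffer:

	#include <linux/highmem.h>
	#include <linux/string.h>

	/* Sketch: short, non-sleeping access to a possibly-highmem page */
	static void fill_page(struct page *page, const void *src)
	{
		void *dst = kmap_atomic(page, KM_USER0); /* raises preempt count */
		memcpy(dst, src, PAGE_SIZE);
		kunmap_atomic(dst, KM_USER0);		 /* lowers it again */
	}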
15454 Index: head-2008-11-25/arch/x86/mm/hypervisor.c
15455 ===================================================================
15456 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
15457 +++ head-2008-11-25/arch/x86/mm/hypervisor.c 2008-10-29 09:55:56.000000000 +0100
15459 +/******************************************************************************
15460 + * mm/hypervisor.c
15462 + * Update page tables via the hypervisor.
15464 + * Copyright (c) 2002-2004, K A Fraser
15466 + * This program is free software; you can redistribute it and/or
15467 + * modify it under the terms of the GNU General Public License version 2
15468 + * as published by the Free Software Foundation; or, when distributed
15469 + * separately from the Linux kernel or incorporated into other
15470 + * software packages, subject to the following license:
15472 + * Permission is hereby granted, free of charge, to any person obtaining a copy
15473 + * of this source file (the "Software"), to deal in the Software without
15474 + * restriction, including without limitation the rights to use, copy, modify,
15475 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
15476 + * and to permit persons to whom the Software is furnished to do so, subject to
15477 + * the following conditions:
15479 + * The above copyright notice and this permission notice shall be included in
15480 + * all copies or substantial portions of the Software.
15482 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15483 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15484 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15485 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15486 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
15487 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
15488 + * IN THE SOFTWARE.
15491 +#include <linux/sched.h>
15492 +#include <linux/mm.h>
15493 +#include <linux/vmalloc.h>
15494 +#include <asm/page.h>
15495 +#include <asm/pgtable.h>
15496 +#include <asm/hypervisor.h>
15497 +#include <xen/balloon.h>
15498 +#include <xen/features.h>
15499 +#include <xen/interface/memory.h>
15500 +#include <linux/module.h>
15501 +#include <linux/percpu.h>
15502 +#include <asm/tlbflush.h>
15503 +#include <linux/highmem.h>
15505 +void xen_l1_entry_update(pte_t *ptr, pte_t val)
15508 +#ifdef CONFIG_HIGHPTE
15509 + u.ptr = ((unsigned long)ptr >= (unsigned long)high_memory) ?
15510 + arbitrary_virt_to_machine(ptr) : virt_to_machine(ptr);
15512 + u.ptr = virt_to_machine(ptr);
15514 + u.val = __pte_val(val);
15515 + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
15517 +EXPORT_SYMBOL_GPL(xen_l1_entry_update);
15519 +void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
15522 + u.ptr = virt_to_machine(ptr);
15523 + u.val = __pmd_val(val);
15524 + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
15527 +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
15528 +void xen_l3_entry_update(pud_t *ptr, pud_t val)
15531 + u.ptr = virt_to_machine(ptr);
15532 + u.val = __pud_val(val);
15533 + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
15537 +#ifdef CONFIG_X86_64
15538 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
15541 + u.ptr = virt_to_machine(ptr);
15542 + u.val = __pgd_val(val);
15543 + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
15545 +#endif /* CONFIG_X86_64 */
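All four helpers above share the same shape: fill in an mmu_update_t and
trap into Xen. HYPERVISOR_mmu_update() also takes arrays, so several
entries can be validated in one hypercall — a sketch of that pattern,
with ptep and pfn standing in for caller-supplied values:

	/* Sketch: batch four adjacent PTE writes into a single hypercall */
	mmu_update_t req[4];
	unsigned int i;

	for (i = 0; i < 4; i++) {
		req[i].ptr = virt_to_machine(ptep + i);	/* machine addr of slot */
		req[i].val = __pte_val(pfn_pte(pfn + i, PAGE_KERNEL));
	}
	BUG_ON(HYPERVISOR_mmu_update(req, 4, NULL, DOMID_SELF) < 0);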
15547 +void xen_pt_switch(unsigned long ptr)
15549 + struct mmuext_op op;
15550 + op.cmd = MMUEXT_NEW_BASEPTR;
15551 + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
15552 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15555 +void xen_new_user_pt(unsigned long ptr)
15557 + struct mmuext_op op;
15558 + op.cmd = MMUEXT_NEW_USER_BASEPTR;
15559 + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
15560 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15563 +void xen_tlb_flush(void)
15565 + struct mmuext_op op;
15566 + op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
15567 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15569 +EXPORT_SYMBOL(xen_tlb_flush);
15571 +void xen_invlpg(unsigned long ptr)
15573 + struct mmuext_op op;
15574 + op.cmd = MMUEXT_INVLPG_LOCAL;
15575 + op.arg1.linear_addr = ptr & PAGE_MASK;
15576 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15578 +EXPORT_SYMBOL(xen_invlpg);
15582 +void xen_tlb_flush_all(void)
15584 + struct mmuext_op op;
15585 + op.cmd = MMUEXT_TLB_FLUSH_ALL;
15586 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15589 +void xen_tlb_flush_mask(cpumask_t *mask)
15591 + struct mmuext_op op;
15592 + if ( cpus_empty(*mask) )
15594 + op.cmd = MMUEXT_TLB_FLUSH_MULTI;
15595 + set_xen_guest_handle(op.arg2.vcpumask, mask->bits);
15596 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15599 +void xen_invlpg_all(unsigned long ptr)
15601 + struct mmuext_op op;
15602 + op.cmd = MMUEXT_INVLPG_ALL;
15603 + op.arg1.linear_addr = ptr & PAGE_MASK;
15604 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15607 +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr)
15609 + struct mmuext_op op;
15610 + if ( cpus_empty(*mask) )
15612 + op.cmd = MMUEXT_INVLPG_MULTI;
15613 + op.arg1.linear_addr = ptr & PAGE_MASK;
15614 + set_xen_guest_handle(op.arg2.vcpumask, mask->bits);
15615 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15618 +#endif /* CONFIG_SMP */
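HYPERVISOR_mmuext_op() likewise accepts an array of operations, so
independent flushes can share one hypercall — a sketch, with addr1/addr2
as assumed kernel virtual addresses:

	/* Sketch: two invalidations carried by a single mmuext hypercall */
	struct mmuext_op ops[2];

	ops[0].cmd = MMUEXT_INVLPG_LOCAL;
	ops[0].arg1.linear_addr = addr1 & PAGE_MASK;
	ops[1].cmd = MMUEXT_INVLPG_LOCAL;
	ops[1].arg1.linear_addr = addr2 & PAGE_MASK;
	BUG_ON(HYPERVISOR_mmuext_op(ops, 2, NULL, DOMID_SELF) < 0);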
15620 +void xen_pgd_pin(unsigned long ptr)
15622 + struct mmuext_op op;
15623 +#ifdef CONFIG_X86_64
15624 + op.cmd = MMUEXT_PIN_L4_TABLE;
15625 +#elif defined(CONFIG_X86_PAE)
15626 + op.cmd = MMUEXT_PIN_L3_TABLE;
15628 + op.cmd = MMUEXT_PIN_L2_TABLE;
15630 + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
15631 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15634 +void xen_pgd_unpin(unsigned long ptr)
15636 + struct mmuext_op op;
15637 + op.cmd = MMUEXT_UNPIN_TABLE;
15638 + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
15639 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
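A pinned page directory is validated once and then write-protected by
Xen, making later context switches to it cheap. Usage sketch, assuming mm
is a fully constructed address space (note both calls take a physical
address):

	/* Sketch: pin on creation, unpin before the pages are freed */
	xen_pgd_pin(__pa(mm->pgd));	/* Xen validates + write-protects */
	/* ... address space is live ... */
	xen_pgd_unpin(__pa(mm->pgd));	/* must precede freeing the pgd */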
15642 +void xen_set_ldt(const void *ptr, unsigned int ents)
15644 + struct mmuext_op op;
15645 + op.cmd = MMUEXT_SET_LDT;
15646 + op.arg1.linear_addr = (unsigned long)ptr;
15647 + op.arg2.nr_ents = ents;
15648 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15651 +/* Protected by balloon_lock. */
15652 +#define MAX_CONTIG_ORDER 9 /* 2MB */
15653 +static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
15654 +static unsigned long limited_frames[1<<MAX_CONTIG_ORDER];
15655 +static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER];
15657 +/* Ensure multi-page extents are contiguous in machine memory. */
15658 +int xen_create_contiguous_region(
15659 + unsigned long vstart, unsigned int order, unsigned int address_bits)
15661 + unsigned long *in_frames = discontig_frames, out_frame;
15662 + unsigned long frame, flags;
15665 + struct xen_memory_exchange exchange = {
15667 + .nr_extents = 1UL << order,
15668 + .extent_order = 0,
15669 + .domid = DOMID_SELF
15673 + .extent_order = order,
15674 + .address_bits = address_bits,
15675 + .domid = DOMID_SELF
15680 + * Currently an auto-translated guest will not perform I/O, nor will
15681 + * it require PAE page directories below 4GB. Therefore any calls to
15682 + * this function are redundant and can be ignored.
15684 + if (xen_feature(XENFEAT_auto_translated_physmap))
15687 + if (unlikely(order > MAX_CONTIG_ORDER))
15690 + set_xen_guest_handle(exchange.in.extent_start, in_frames);
15691 + set_xen_guest_handle(exchange.out.extent_start, &out_frame);
15693 + scrub_pages((void *)vstart, 1 << order);
15695 + balloon_lock(flags);
15697 + /* 1. Zap current PTEs, remembering MFNs. */
15698 + for (i = 0; i < (1U<<order); i++) {
15699 + in_frames[i] = pfn_to_mfn((__pa(vstart) >> PAGE_SHIFT) + i);
15700 + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
15702 + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
15703 + INVALID_P2M_ENTRY);
15705 + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
15708 + /* 2. Get a new contiguous memory extent. */
15709 + out_frame = __pa(vstart) >> PAGE_SHIFT;
15710 + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
15711 + success = (exchange.nr_exchanged == (1UL << order));
15712 + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
15713 + BUG_ON(success && (rc != 0));
15714 +#if CONFIG_XEN_COMPAT <= 0x030002
15715 + if (unlikely(rc == -ENOSYS)) {
15716 + /* Compatibility when XENMEM_exchange is unsupported. */
15717 + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
15718 + &exchange.in) != (1UL << order))
15720 + success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
15721 + &exchange.out) == 1);
15723 + /* Couldn't get special memory: fall back to normal. */
15724 + for (i = 0; i < (1U<<order); i++)
15725 + in_frames[i] = (__pa(vstart)>>PAGE_SHIFT) + i;
15726 + if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
15727 + &exchange.in) != (1UL<<order))
15733 + /* 3. Map the new extent in place of old pages. */
15734 + for (i = 0; i < (1U<<order); i++) {
15735 + frame = success ? (out_frame + i) : in_frames[i];
15736 + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
15737 + pfn_pte_ma(frame, PAGE_KERNEL), 0);
15738 + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
15741 + cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
15742 + ? UVMF_TLB_FLUSH|UVMF_ALL
15743 + : UVMF_INVLPG|UVMF_ALL;
15744 + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
15747 + balloon_unlock(flags);
15749 + return success ? 0 : -ENOMEM;
15751 +EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
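Callers typically use this for DMA buffers: allocate normally, then
exchange the backing frames for a machine-contiguous extent below an
addressability limit. A sketch, with the order and bit width as assumed
parameters:

	/* Sketch: make an order-2 (16KB) buffer machine-contiguous below 4GB */
	unsigned long va = __get_free_pages(GFP_KERNEL, 2);

	if (va && xen_create_contiguous_region(va, 2, 32) != 0) {
		free_pages(va, 2);	/* exchange failed: give the pages back */
		va = 0;
	}
	/* teardown: xen_destroy_contiguous_region(va, 2), then free_pages() */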
15753 +void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
15755 + unsigned long *out_frames = discontig_frames, in_frame;
15756 + unsigned long frame, flags;
15759 + struct xen_memory_exchange exchange = {
15762 + .extent_order = order,
15763 + .domid = DOMID_SELF
15766 + .nr_extents = 1UL << order,
15767 + .extent_order = 0,
15768 + .domid = DOMID_SELF
15772 + if (xen_feature(XENFEAT_auto_translated_physmap))
15775 + if (unlikely(order > MAX_CONTIG_ORDER))
15778 + set_xen_guest_handle(exchange.in.extent_start, &in_frame);
15779 + set_xen_guest_handle(exchange.out.extent_start, out_frames);
15781 + scrub_pages((void *)vstart, 1 << order);
15783 + balloon_lock(flags);
15785 + /* 1. Find start MFN of contiguous extent. */
15786 + in_frame = pfn_to_mfn(__pa(vstart) >> PAGE_SHIFT);
15788 + /* 2. Zap current PTEs. */
15789 + for (i = 0; i < (1U<<order); i++) {
15790 + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
15792 + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
15793 + INVALID_P2M_ENTRY);
15794 + out_frames[i] = (__pa(vstart) >> PAGE_SHIFT) + i;
15796 + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
15799 + /* 3. Do the exchange for non-contiguous MFNs. */
15800 + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
15801 + success = (exchange.nr_exchanged == 1);
15802 + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
15803 + BUG_ON(success && (rc != 0));
15804 +#if CONFIG_XEN_COMPAT <= 0x030002
15805 + if (unlikely(rc == -ENOSYS)) {
15806 + /* Compatibility when XENMEM_exchange is unsupported. */
15807 + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
15808 + &exchange.in) != 1)
15810 + if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
15811 + &exchange.out) != (1UL << order))
15817 + /* 4. Map new pages in place of old pages. */
15818 + for (i = 0; i < (1U<<order); i++) {
15819 + frame = success ? out_frames[i] : (in_frame + i);
15820 + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
15821 + pfn_pte_ma(frame, PAGE_KERNEL), 0);
15822 + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
15825 + cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
15826 + ? UVMF_TLB_FLUSH|UVMF_ALL
15827 + : UVMF_INVLPG|UVMF_ALL;
15828 + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
15831 + balloon_unlock(flags);
15833 +EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
15835 +int xen_limit_pages_to_max_mfn(
15836 + struct page *pages, unsigned int order, unsigned int address_bits)
15838 + unsigned long flags, frame;
15839 + unsigned long *in_frames = discontig_frames, *out_frames = limited_frames;
15840 + struct page *page;
15841 + unsigned int i, n, nr_mcl;
15843 + DECLARE_BITMAP(limit_map, 1 << MAX_CONTIG_ORDER);
15845 + struct xen_memory_exchange exchange = {
15847 + .extent_order = 0,
15848 + .domid = DOMID_SELF
15851 + .extent_order = 0,
15852 + .address_bits = address_bits,
15853 + .domid = DOMID_SELF
15857 + if (xen_feature(XENFEAT_auto_translated_physmap))
15860 + if (unlikely(order > MAX_CONTIG_ORDER))
15863 + bitmap_zero(limit_map, 1U << order);
15864 + set_xen_guest_handle(exchange.in.extent_start, in_frames);
15865 + set_xen_guest_handle(exchange.out.extent_start, out_frames);
15867 + /* 0. Scrub the pages. */
15868 + for (i = 0, n = 0; i < 1U<<order ; i++) {
15869 + page = &pages[i];
15870 + if (!(pfn_to_mfn(page_to_pfn(page)) >> (address_bits - PAGE_SHIFT)))
15872 + __set_bit(i, limit_map);
15874 + if (!PageHighMem(page))
15875 + scrub_pages(page_address(page), 1);
15876 +#ifdef CONFIG_XEN_SCRUB_PAGES
15878 + scrub_pages(kmap(page), 1);
15884 + if (bitmap_empty(limit_map, 1U << order))
15888 + kmap_flush_unused();
15890 + balloon_lock(flags);
15892 + /* 1. Zap current PTEs (if any), remembering MFNs. */
15893 + for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
15894 + if (!test_bit(i, limit_map))
15896 + page = &pages[i];
15898 + out_frames[n] = page_to_pfn(page);
15899 + in_frames[n] = pfn_to_mfn(out_frames[n]);
15901 + if (!PageHighMem(page))
15902 + MULTI_update_va_mapping(cr_mcl + nr_mcl++,
15903 + (unsigned long)page_address(page),
15906 + set_phys_to_machine(out_frames[n], INVALID_P2M_ENTRY);
15909 + if (nr_mcl && HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
15912 + /* 2. Get new memory below the required limit. */
15913 + exchange.in.nr_extents = n;
15914 + exchange.out.nr_extents = n;
15915 + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
15916 + success = (exchange.nr_exchanged == n);
15917 + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
15918 + BUG_ON(success && (rc != 0));
15919 +#if CONFIG_XEN_COMPAT <= 0x030002
15920 + if (unlikely(rc == -ENOSYS)) {
15921 + /* Compatibility when XENMEM_exchange is unsupported. */
15922 + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
15923 + &exchange.in) != n)
15925 + if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
15926 + &exchange.out) != n)
15932 + /* 3. Map the new pages in place of old pages. */
15933 + for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
15934 + if (!test_bit(i, limit_map))
15936 + page = &pages[i];
15938 + frame = success ? out_frames[n] : in_frames[n];
15940 + if (!PageHighMem(page))
15941 + MULTI_update_va_mapping(cr_mcl + nr_mcl++,
15942 + (unsigned long)page_address(page),
15943 + pfn_pte_ma(frame, PAGE_KERNEL), 0);
15945 + set_phys_to_machine(page_to_pfn(page), frame);
15949 + cr_mcl[nr_mcl - 1].args[MULTI_UVMFLAGS_INDEX] = order
15950 + ? UVMF_TLB_FLUSH|UVMF_ALL
15951 + : UVMF_INVLPG|UVMF_ALL;
15952 + if (HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
15956 + balloon_unlock(flags);
15958 + return success ? 0 : -ENOMEM;
15960 +EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn);
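Same exchange machinery, but per page and one-directional: frames whose
MFN already sits below the limit are skipped via limit_map. A usage
sketch for a device that can only address 32 bits:

	/* Sketch: ensure one page is backed by an MFN below 4GB */
	struct page *pg = alloc_page(GFP_KERNEL);

	if (pg && xen_limit_pages_to_max_mfn(pg, 0, 32) != 0) {
		__free_page(pg);	/* no exchangeable low memory available */
		pg = NULL;
	}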
15963 +int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
15965 + __u32 *lp = (__u32 *)((char *)ldt + entry * 8);
15966 + maddr_t mach_lp = arbitrary_virt_to_machine(lp);
15967 + return HYPERVISOR_update_descriptor(
15968 + mach_lp, (u64)entry_a | ((u64)entry_b<<32));
15972 +#define MAX_BATCHED_FULL_PTES 32
15974 +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
15975 + unsigned long addr, unsigned long end, pgprot_t newprot)
15977 + int rc = 0, i = 0;
15978 + mmu_update_t u[MAX_BATCHED_FULL_PTES];
15982 + if (!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))
15985 + pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
15987 + if (pte_present(*pte)) {
15988 + u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
15989 + | ((unsigned long)pte & ~PAGE_MASK)
15990 + | MMU_PT_UPDATE_PRESERVE_AD;
15991 + u[i].val = __pte_val(pte_modify(*pte, newprot));
15992 + if (++i == MAX_BATCHED_FULL_PTES) {
15993 + if ((rc = HYPERVISOR_mmu_update(
15994 + &u[0], i, NULL, DOMID_SELF)) != 0)
15999 + } while (pte++, addr += PAGE_SIZE, addr != end);
16001 + rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF);
16002 + pte_unmap_unlock(pte - 1, ptl);
16003 + BUG_ON(rc && rc != -ENOSYS);
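The trick that makes this batched protection change safe is in the
pointer encoding: the low bits of mmu_update_t.ptr select the update
flavour, and MMU_PT_UPDATE_PRESERVE_AD asks Xen to carry the hardware
accessed/dirty bits over into the new entry. A sketch of just that
encoding (pte_maddr is an assumed machine address of the PTE slot):

	/* Sketch: the update opcode rides in the low bits of .ptr */
	u.ptr = pte_maddr | MMU_PT_UPDATE_PRESERVE_AD;	 /* keep A/D bits */
	u.val = __pte_val(pte_modify(old_pte, newprot)); /* new contents */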
16006 Index: head-2008-11-25/arch/x86/mm/init_32-xen.c
16007 ===================================================================
16008 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
16009 +++ head-2008-11-25/arch/x86/mm/init_32-xen.c 2008-10-29 09:55:56.000000000 +0100
16012 + * linux/arch/i386/mm/init.c
16014 + * Copyright (C) 1995 Linus Torvalds
16016 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
16019 +#include <linux/module.h>
16020 +#include <linux/signal.h>
16021 +#include <linux/sched.h>
16022 +#include <linux/kernel.h>
16023 +#include <linux/errno.h>
16024 +#include <linux/string.h>
16025 +#include <linux/types.h>
16026 +#include <linux/ptrace.h>
16027 +#include <linux/mman.h>
16028 +#include <linux/mm.h>
16029 +#include <linux/hugetlb.h>
16030 +#include <linux/swap.h>
16031 +#include <linux/smp.h>
16032 +#include <linux/init.h>
16033 +#include <linux/highmem.h>
16034 +#include <linux/pagemap.h>
16035 +#include <linux/poison.h>
16036 +#include <linux/bootmem.h>
16037 +#include <linux/slab.h>
16038 +#include <linux/proc_fs.h>
16039 +#include <linux/efi.h>
16040 +#include <linux/memory_hotplug.h>
16041 +#include <linux/initrd.h>
16042 +#include <linux/cpumask.h>
16043 +#include <linux/dma-mapping.h>
16044 +#include <linux/scatterlist.h>
16046 +#include <asm/processor.h>
16047 +#include <asm/system.h>
16048 +#include <asm/uaccess.h>
16049 +#include <asm/pgtable.h>
16050 +#include <asm/dma.h>
16051 +#include <asm/fixmap.h>
16052 +#include <asm/e820.h>
16053 +#include <asm/apic.h>
16054 +#include <asm/tlb.h>
16055 +#include <asm/tlbflush.h>
16056 +#include <asm/sections.h>
16057 +#include <asm/hypervisor.h>
16058 +#include <asm/swiotlb.h>
16060 +unsigned int __VMALLOC_RESERVE = 128 << 20;
16062 +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
16063 +unsigned long highstart_pfn, highend_pfn;
16065 +static int noinline do_test_wp_bit(void);
16068 + * Creates a middle page table and puts a pointer to it in the
16069 + * given global directory entry. This only returns the gd entry
16070 + * in non-PAE compilation mode, since the middle layer is folded.
16072 +static pmd_t * __init one_md_table_init(pgd_t *pgd)
16075 + pmd_t *pmd_table;
16077 +#ifdef CONFIG_X86_PAE
16078 + pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
16079 + make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
16080 + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
16081 + pud = pud_offset(pgd, 0);
16082 + if (pmd_table != pmd_offset(pud, 0))
16085 + pud = pud_offset(pgd, 0);
16086 + pmd_table = pmd_offset(pud, 0);
16089 + return pmd_table;
16093 + * Create a page table and place a pointer to it in a middle page
16094 + * directory entry.
16096 +static pte_t * __init one_page_table_init(pmd_t *pmd)
16098 + if (pmd_none(*pmd)) {
16099 + pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
16100 + make_lowmem_page_readonly(page_table,
16101 + XENFEAT_writable_page_tables);
16102 + set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
16103 + if (page_table != pte_offset_kernel(pmd, 0))
16106 + return page_table;
16109 + return pte_offset_kernel(pmd, 0);
16113 + * This function initializes a certain range of kernel virtual memory
16114 + * with new bootmem page tables, wherever page tables are missing in
16115 + * the given range.
16119 + * NOTE: The pagetables are allocated contiguously in physical space
16120 + * so we can cache the place of the first one and move around without
16121 + * checking the pgd every time.
16123 +static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
16128 + int pgd_idx, pmd_idx;
16129 + unsigned long vaddr;
16132 + pgd_idx = pgd_index(vaddr);
16133 + pmd_idx = pmd_index(vaddr);
16134 + pgd = pgd_base + pgd_idx;
16136 + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
16137 + if (pgd_none(*pgd))
16138 + one_md_table_init(pgd);
16139 + pud = pud_offset(pgd, vaddr);
16140 + pmd = pmd_offset(pud, vaddr);
16141 + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
16142 + if (vaddr < hypervisor_virt_start && pmd_none(*pmd))
16143 + one_page_table_init(pmd);
16145 + vaddr += PMD_SIZE;
16151 +static inline int is_kernel_text(unsigned long addr)
16153 + if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
16159 + * This maps the physical memory to kernel virtual address space, a total
16160 + * of max_low_pfn pages, by creating page tables starting from address
16163 +static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
16165 + unsigned long pfn;
16169 + int pgd_idx, pmd_idx, pte_ofs;
16171 + unsigned long max_ram_pfn = xen_start_info->nr_pages;
16172 + if (max_ram_pfn > max_low_pfn)
16173 + max_ram_pfn = max_low_pfn;
16175 + pgd_idx = pgd_index(PAGE_OFFSET);
16176 + pgd = pgd_base + pgd_idx;
16178 + pmd_idx = pmd_index(PAGE_OFFSET);
16179 + pte_ofs = pte_index(PAGE_OFFSET);
16181 + for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
16184 + * Native Linux doesn't have PAE paging enabled yet at this
16185 + * point. When running as a Xen domain we are in PAE
16186 + * mode already, thus we can't simply hook in an empty
16187 + * pmd. That would kill the mappings we are currently
16190 + pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
16192 + pmd = one_md_table_init(pgd);
16194 + if (pfn >= max_low_pfn)
16197 + for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
16198 + unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
16199 + if (address >= hypervisor_virt_start)
16202 + /* Map with big pages if possible, otherwise create normal page tables. */
16203 + if (cpu_has_pse) {
16204 + unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
16206 + if (is_kernel_text(address) || is_kernel_text(address2))
16207 + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
16209 + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
16210 + pfn += PTRS_PER_PTE;
16212 + pte = one_page_table_init(pmd);
16215 + for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
16216 + /* XEN: Only map initial RAM allocation. */
16217 + if ((pfn >= max_ram_pfn) || pte_present(*pte))
16219 + if (is_kernel_text(address))
16220 + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
16222 + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
16231 +#ifndef CONFIG_XEN
16233 +static inline int page_kills_ppro(unsigned long pagenr)
16235 + if (pagenr >= 0x70000 && pagenr <= 0x7003F)
16242 +#define page_kills_ppro(p) 0
16246 +extern int is_available_memory(efi_memory_desc_t *);
16248 +int page_is_ram(unsigned long pagenr)
16251 + unsigned long addr, end;
16253 + if (efi_enabled) {
16254 + efi_memory_desc_t *md;
16257 + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
16259 + if (!is_available_memory(md))
16261 + addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
16262 + end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
16264 + if ((pagenr >= addr) && (pagenr < end))
16270 + for (i = 0; i < e820.nr_map; i++) {
16272 + if (e820.map[i].type != E820_RAM) /* not usable memory */
16275 + * !!!FIXME!!! Some BIOSen report areas as RAM that
16276 + * are not. Notably the 640->1Mb area. We need a sanity
16279 + addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
16280 + end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
16281 + if ((pagenr >= addr) && (pagenr < end))
16287 +#ifdef CONFIG_HIGHMEM
16289 +pgprot_t kmap_prot;
16291 +#define kmap_get_fixmap_pte(vaddr) \
16292 + pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
16294 +static void __init kmap_init(void)
16296 + unsigned long kmap_vstart;
16298 + /* cache the first kmap pte */
16299 + kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
16300 + kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
16302 + kmap_prot = PAGE_KERNEL;
16305 +static void __init permanent_kmaps_init(pgd_t *pgd_base)
16311 + unsigned long vaddr;
16313 + vaddr = PKMAP_BASE;
16314 + page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
16316 + pgd = swapper_pg_dir + pgd_index(vaddr);
16317 + pud = pud_offset(pgd, vaddr);
16318 + pmd = pmd_offset(pud, vaddr);
16319 + pte = pte_offset_kernel(pmd, vaddr);
16320 + pkmap_page_table = pte;
16323 +static void __meminit free_new_highpage(struct page *page, int pfn)
16325 + init_page_count(page);
16326 + if (pfn < xen_start_info->nr_pages)
16327 + __free_page(page);
16328 + totalhigh_pages++;
16331 +void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
16333 + if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
16334 + ClearPageReserved(page);
16335 + free_new_highpage(page, pfn);
16337 + SetPageReserved(page);
16340 +static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
16342 + free_new_highpage(page, pfn);
16343 + totalram_pages++;
16344 +#ifdef CONFIG_FLATMEM
16345 + max_mapnr = max(pfn, max_mapnr);
16352 + * Not currently handling the NUMA case.
16353 + * Assume a single node, and that all memory
16354 + * added dynamically and onlined here
16355 + * is in HIGHMEM.
16357 +void online_page(struct page *page)
16359 + ClearPageReserved(page);
16360 + add_one_highpage_hotplug(page, page_to_pfn(page));
16364 +#ifdef CONFIG_NUMA
16365 +extern void set_highmem_pages_init(int);
16367 +static void __init set_highmem_pages_init(int bad_ppro)
16370 + for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
16371 + add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
16372 + totalram_pages += totalhigh_pages;
16374 +#endif /* CONFIG_FLATMEM */
16377 +#define kmap_init() do { } while (0)
16378 +#define permanent_kmaps_init(pgd_base) do { } while (0)
16379 +#define set_highmem_pages_init(bad_ppro) do { } while (0)
16380 +#endif /* CONFIG_HIGHMEM */
16382 +unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
16383 +EXPORT_SYMBOL(__PAGE_KERNEL);
16384 +unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
16386 +#ifdef CONFIG_NUMA
16387 +extern void __init remap_numa_kva(void);
16389 +#define remap_numa_kva() do {} while (0)
16392 +pgd_t *swapper_pg_dir;
16394 +static void __init pagetable_init (void)
16396 + unsigned long vaddr;
16397 + pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
16399 + /* Enable PSE if available */
16400 + if (cpu_has_pse) {
16401 + set_in_cr4(X86_CR4_PSE);
16404 + /* Enable PGE if available */
16405 + if (cpu_has_pge) {
16406 + set_in_cr4(X86_CR4_PGE);
16407 + __PAGE_KERNEL |= _PAGE_GLOBAL;
16408 + __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
16411 + kernel_physical_mapping_init(pgd_base);
16412 + remap_numa_kva();
16415 + * Fixed mappings, only the page table structure has to be
16416 + * created - mappings will be set by set_fixmap():
16418 + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
16419 + page_table_range_init(vaddr, hypervisor_virt_start, pgd_base);
16421 + permanent_kmaps_init(pgd_base);
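The fixmap arithmetic above relies on slots being allocated top-down: in
mainline, __fix_to_virt(x) is FIXADDR_TOP - (x << PAGE_SHIFT), so the
highest index has the lowest address. A sketch of what the vaddr
computation yields:

	/* Sketch: lowest fixmap address, rounded down to a PMD boundary */
	unsigned long lowest = __fix_to_virt(__end_of_fixed_addresses - 1);
	unsigned long base   = lowest & PMD_MASK; /* where PTE coverage starts */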
16424 +#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP)
16426 + * Swap suspend & friends need this for resume because things like the intel-agp
16427 + * driver might have split up a kernel 4MB mapping.
16429 +char __nosavedata swsusp_pg_dir[PAGE_SIZE]
16430 + __attribute__ ((aligned (PAGE_SIZE)));
16432 +static inline void save_pg_dir(void)
16434 + memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
16437 +static inline void save_pg_dir(void)
16442 +void zap_low_mappings (void)
16449 + * Zap initial low-memory mappings.
16451 + * Note that "pgd_clear()" doesn't do it for
16452 + * us, because pgd_clear() is a no-op on i386.
16454 + for (i = 0; i < USER_PTRS_PER_PGD; i++)
16455 +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
16456 + set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
16458 + set_pgd(swapper_pg_dir+i, __pgd(0));
16463 +static int disable_nx __initdata = 0;
16464 +u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
16465 +EXPORT_SYMBOL(__supported_pte_mask);
16468 + * noexec = on|off
16470 + * Control non executable mappings.
16475 +void __init noexec_setup(const char *str)
16477 + if (!strncmp(str, "on",2) && cpu_has_nx) {
16478 + __supported_pte_mask |= _PAGE_NX;
16480 + } else if (!strncmp(str,"off",3)) {
16482 + __supported_pte_mask &= ~_PAGE_NX;
16486 +int nx_enabled = 0;
16487 +#ifdef CONFIG_X86_PAE
16489 +static void __init set_nx(void)
16491 + unsigned int v[4], l, h;
16493 + if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
16494 + cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
16495 + if ((v[3] & (1 << 20)) && !disable_nx) {
16496 + rdmsr(MSR_EFER, l, h);
16498 + wrmsr(MSR_EFER, l, h);
16500 + __supported_pte_mask |= _PAGE_NX;
16506 + * Enables/disables executability of a given kernel page and
16507 + * returns the previous setting.
16509 +int __init set_kernel_exec(unsigned long vaddr, int enable)
16517 + pte = lookup_address(vaddr);
16520 + if (!pte_exec_kernel(*pte))
16524 + pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
16526 + pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
16527 + __flush_tlb_all();
16535 + * paging_init() sets up the page tables - note that the first 8MB are
16536 + * already mapped by head.S.
16538 + * This routine also unmaps the page at virtual kernel address 0, so
16539 + * that we can trap those pesky NULL-reference errors in the kernel.
16541 +void __init paging_init(void)
16545 +#ifdef CONFIG_X86_PAE
16548 + printk("NX (Execute Disable) protection: active\n");
16551 + pagetable_init();
16553 +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
16555 + * We will bail out later - printk doesn't work right now, so
16556 + * the user would just see a hanging kernel.
16557 + * When running as a Xen domain we are already in PAE mode at
16561 + set_in_cr4(X86_CR4_PAE);
16563 + __flush_tlb_all();
16567 + /* Switch to the real shared_info page, and clear the
16569 + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
16570 + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
16571 + memset(empty_zero_page, 0, sizeof(empty_zero_page));
16573 + /* Setup mapping of lower 1st MB */
16574 + for (i = 0; i < NR_FIX_ISAMAPS; i++)
16575 + if (is_initial_xendomain())
16576 + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
16578 + __set_fixmap(FIX_ISAMAP_BEGIN - i,
16579 + virt_to_machine(empty_zero_page),
16584 + * Test if the WP bit works in supervisor mode. It isn't supported on 386's
16585 + * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
16586 + * used to involve black magic jumps to work around some nasty CPU bugs,
16587 + * but fortunately the switch to using exceptions got rid of all that.
16590 +static void __init test_wp_bit(void)
16592 + printk("Checking if this processor honours the WP bit even in supervisor mode... ");
16594 + /* Any page-aligned address will do, the test is non-destructive */
16595 + __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
16596 + boot_cpu_data.wp_works_ok = do_test_wp_bit();
16597 + clear_fixmap(FIX_WP_TEST);
16599 + if (!boot_cpu_data.wp_works_ok) {
16601 +#ifdef CONFIG_X86_WP_WORKS_OK
16602 + panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
16609 +static void __init set_max_mapnr_init(void)
16611 +#ifdef CONFIG_HIGHMEM
16612 + num_physpages = highend_pfn;
16614 + num_physpages = max_low_pfn;
16616 +#ifdef CONFIG_FLATMEM
16617 + max_mapnr = num_physpages;
16621 +static struct kcore_list kcore_mem, kcore_vmalloc;
16623 +void __init mem_init(void)
16625 + extern int ppro_with_ram_bug(void);
16626 + int codesize, reservedpages, datasize, initsize;
16629 + unsigned long pfn;
16631 +#if defined(CONFIG_SWIOTLB)
16635 +#ifdef CONFIG_FLATMEM
16640 + bad_ppro = ppro_with_ram_bug();
16642 +#ifdef CONFIG_HIGHMEM
16643 + /* check that fixmap and pkmap do not overlap */
16644 + if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
16645 + printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
16646 + printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
16647 + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
16652 + set_max_mapnr_init();
16654 +#ifdef CONFIG_HIGHMEM
16655 + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
16657 + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
16659 + printk("vmalloc area: %lx-%lx, maxmem %lx\n",
16660 +        VMALLOC_START, VMALLOC_END, MAXMEM);
16661 + BUG_ON(VMALLOC_START > VMALLOC_END);
16663 + /* this will put all low memory onto the freelists */
16664 + totalram_pages += free_all_bootmem();
16665 + /* XEN: init and count low-mem pages outside initial allocation. */
16666 + for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) {
16667 + ClearPageReserved(pfn_to_page(pfn));
16668 + init_page_count(pfn_to_page(pfn));
16669 + totalram_pages++;
16672 + reservedpages = 0;
16673 + for (tmp = 0; tmp < max_low_pfn; tmp++)
16675 + * Only count reserved RAM pages
16677 + if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
16680 + set_highmem_pages_init(bad_ppro);
16682 + codesize = (unsigned long) &_etext - (unsigned long) &_text;
16683 + datasize = (unsigned long) &_edata - (unsigned long) &_etext;
16684 + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
16686 + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
16687 + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
16688 + VMALLOC_END-VMALLOC_START);
16690 + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
16691 + (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
16692 + num_physpages << (PAGE_SHIFT-10),
16694 + reservedpages << (PAGE_SHIFT-10),
16697 + (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
16700 +#ifdef CONFIG_X86_PAE
16701 + if (!cpu_has_pae)
16702 + panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
16704 + if (boot_cpu_data.wp_works_ok < 0)
16708 + * Subtle. SMP is doing its boot stuff late (because it has to
16709 + * fork idle threads) - but it also needs low mappings for the
16710 + * protected-mode entry to work. We zap these entries only after
16711 + * the WP-bit has been tested.
16713 +#ifndef CONFIG_SMP
16714 + zap_low_mappings();
16717 + set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags);
16721 + * This is for the non-NUMA, single-node SMP system case.
16722 + * Specifically, in the case of x86, we will always add
16723 + * memory to highmem for now.
16725 +#ifdef CONFIG_MEMORY_HOTPLUG
16726 +#ifndef CONFIG_NEED_MULTIPLE_NODES
16727 +int arch_add_memory(int nid, u64 start, u64 size)
16729 + struct pglist_data *pgdata = &contig_page_data;
16730 + struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
16731 + unsigned long start_pfn = start >> PAGE_SHIFT;
16732 + unsigned long nr_pages = size >> PAGE_SHIFT;
16734 + return __add_pages(zone, start_pfn, nr_pages);
16737 +int remove_memory(u64 start, u64 size)
16744 +kmem_cache_t *pgd_cache;
16745 +kmem_cache_t *pmd_cache;
16747 +void __init pgtable_cache_init(void)
16749 + if (PTRS_PER_PMD > 1) {
16750 + pmd_cache = kmem_cache_create("pmd",
16751 + PTRS_PER_PMD*sizeof(pmd_t),
16752 + PTRS_PER_PMD*sizeof(pmd_t),
16757 + panic("pgtable_cache_init(): cannot create pmd cache");
16759 + pgd_cache = kmem_cache_create("pgd",
16760 +#ifndef CONFIG_XEN
16761 + PTRS_PER_PGD*sizeof(pgd_t),
16762 + PTRS_PER_PGD*sizeof(pgd_t),
16769 + PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
16771 + panic("pgtable_cache_init(): Cannot create pgd cache");
16775 + * This function cannot be __init, since exceptions don't work in that
16776 + * section. Put this after the callers, so that it cannot be inlined.
16778 +static int noinline do_test_wp_bit(void)
16783 + __asm__ __volatile__(
16785 + "1: movb %1,%0 \n"
16788 + ".section __ex_table,\"a\"\n"
16790 + " .long 1b,2b \n"
16792 + :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
16801 +#ifdef CONFIG_DEBUG_RODATA
16803 +void mark_rodata_ro(void)
16805 + unsigned long addr = (unsigned long)__start_rodata;
16807 + for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
16808 + change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
16810 + printk("Write protecting the kernel read-only data: %uk\n",
16811 + (__end_rodata - __start_rodata) >> 10);
16814 + * change_page_attr() requires a global_flush_tlb() call after it.
16815 + * We do this after the printk so that if something went wrong in the
16816 + * change, the printk gets out at least to give a better debug hint
16817 + * of who is the culprit.
16819 + global_flush_tlb();
16823 +void free_init_pages(char *what, unsigned long begin, unsigned long end)
16825 + unsigned long addr;
16827 + for (addr = begin; addr < end; addr += PAGE_SIZE) {
16828 + ClearPageReserved(virt_to_page(addr));
16829 + init_page_count(virt_to_page(addr));
16830 + memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
16832 + totalram_pages++;
16834 + printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
16837 +void free_initmem(void)
16839 + free_init_pages("unused kernel memory",
16840 + (unsigned long)(&__init_begin),
16841 + (unsigned long)(&__init_end));
16844 +#ifdef CONFIG_BLK_DEV_INITRD
16845 +void free_initrd_mem(unsigned long start, unsigned long end)
16847 + free_init_pages("initrd memory", start, end);
16851 Index: head-2008-11-25/arch/x86/mm/ioremap_32-xen.c
16852 ===================================================================
16853 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
16854 +++ head-2008-11-25/arch/x86/mm/ioremap_32-xen.c 2008-04-02 12:34:02.000000000 +0200
16857 + * arch/i386/mm/ioremap.c
16859 + * Re-map IO memory to kernel address space so that we can access it.
16860 + * This is needed for high PCI addresses that aren't mapped in the
16861 + * 640k-1MB IO memory area on PC's
16863 + * (C) Copyright 1995 1996 Linus Torvalds
16866 +#include <linux/vmalloc.h>
16867 +#include <linux/init.h>
16868 +#include <linux/slab.h>
16869 +#include <linux/module.h>
16870 +#include <asm/io.h>
16871 +#include <asm/fixmap.h>
16872 +#include <asm/cacheflush.h>
16873 +#include <asm/tlbflush.h>
16874 +#include <asm/pgtable.h>
16875 +#include <asm/pgalloc.h>
16877 +#define ISA_START_ADDRESS 0x0
16878 +#define ISA_END_ADDRESS 0x100000
16880 +static int direct_remap_area_pte_fn(pte_t *pte,
16881 + struct page *pmd_page,
16882 + unsigned long address,
16885 + mmu_update_t **v = (mmu_update_t **)data;
16887 + BUG_ON(!pte_none(*pte));
16889 + (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
16890 + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
16896 +static int __direct_remap_pfn_range(struct mm_struct *mm,
16897 + unsigned long address,
16898 + unsigned long mfn,
16899 + unsigned long size,
16904 + unsigned long i, start_address;
16905 + mmu_update_t *u, *v, *w;
16907 + u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
16911 + start_address = address;
16913 + flush_cache_all();
16915 + for (i = 0; i < size; i += PAGE_SIZE) {
16916 + if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
16917 + /* Flush a full batch after filling in the PTE ptrs. */
16918 + rc = apply_to_page_range(mm, start_address,
16919 + address - start_address,
16920 + direct_remap_area_pte_fn, &w);
16924 + if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
16927 + start_address = address;
16931 + * Fill in the machine address: PTE ptr is done later by
16932 + * apply_to_page_range().
16934 + v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
16937 + address += PAGE_SIZE;
16942 + /* Final batch. */
16943 + rc = apply_to_page_range(mm, start_address,
16944 + address - start_address,
16945 + direct_remap_area_pte_fn, &w);
16949 + if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
16958 + free_page((unsigned long)u);
16963 +int direct_remap_pfn_range(struct vm_area_struct *vma,
16964 + unsigned long address,
16965 + unsigned long mfn,
16966 + unsigned long size,
16970 + if (xen_feature(XENFEAT_auto_translated_physmap))
16971 + return remap_pfn_range(vma, address, mfn, size, prot);
16973 + if (domid == DOMID_SELF)
16976 + vma->vm_flags |= VM_IO | VM_RESERVED;
16978 + vma->vm_mm->context.has_foreign_mappings = 1;
16980 + return __direct_remap_pfn_range(
16981 + vma->vm_mm, address, mfn, size, prot, domid);
16983 +EXPORT_SYMBOL(direct_remap_pfn_range);
16985 +int direct_kernel_remap_pfn_range(unsigned long address,
16986 + unsigned long mfn,
16987 + unsigned long size,
16991 + return __direct_remap_pfn_range(
16992 + &init_mm, address, mfn, size, prot, domid);
16994 +EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
16996 +static int lookup_pte_fn(
16997 + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
16999 + uint64_t *ptep = (uint64_t *)data;
17001 + *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
17002 + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
17006 +int create_lookup_pte_addr(struct mm_struct *mm,
17007 + unsigned long address,
17010 + return apply_to_page_range(mm, address, PAGE_SIZE,
17011 + lookup_pte_fn, ptep);
17014 +EXPORT_SYMBOL(create_lookup_pte_addr);
17016 +static int noop_fn(
17017 + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
17022 +int touch_pte_range(struct mm_struct *mm,
17023 + unsigned long address,
17024 + unsigned long size)
17026 + return apply_to_page_range(mm, address, size, noop_fn, NULL);
17029 +EXPORT_SYMBOL(touch_pte_range);
17032 + * Does @address reside within a non-highmem page that is local to this virtual
17033 + * machine (i.e., not an I/O page, nor a memory page belonging to another VM).
17034 + * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
17035 + * why this works.
17037 +static inline int is_local_lowmem(unsigned long address)
17039 + extern unsigned long max_low_pfn;
17040 + return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
17044 + * Generic mapping function (not visible outside):
17048 + * Remap an arbitrary physical address space into the kernel virtual
17049 + * address space. Needed when the kernel wants to access high addresses
17052 + * NOTE! We need to allow non-page-aligned mappings too: we will obviously
17053 + * have to convert them into an offset in a page-aligned mapping, but the
17054 + * caller shouldn't need to know that small detail.
17056 +void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
17058 + void __iomem * addr;
17059 + struct vm_struct * area;
17060 + unsigned long offset, last_addr;
17061 + domid_t domid = DOMID_IO;
17063 + /* Don't allow wraparound or zero size */
17064 + last_addr = phys_addr + size - 1;
17065 + if (!size || last_addr < phys_addr)
17069 + * Don't remap the low PCI/ISA area, it's always mapped..
17071 + if (is_initial_xendomain() &&
17072 + phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
17073 + return (void __iomem *) isa_bus_to_virt(phys_addr);
17076 + * Don't allow anybody to remap normal RAM that we're using..
17078 + if (is_local_lowmem(phys_addr)) {
17079 + char *t_addr, *t_end;
17080 + struct page *page;
17082 + t_addr = bus_to_virt(phys_addr);
17083 + t_end = t_addr + (size - 1);
17085 + for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
17086 + if(!PageReserved(page))
17089 + domid = DOMID_SELF;
17093 + * Mappings have to be page-aligned
17095 + offset = phys_addr & ~PAGE_MASK;
17096 + phys_addr &= PAGE_MASK;
17097 + size = PAGE_ALIGN(last_addr+1) - phys_addr;
17100 + * Ok, go for it..
17102 + area = get_vm_area(size, VM_IOREMAP | (flags << 20));
17105 + area->phys_addr = phys_addr;
17106 + addr = (void __iomem *) area->addr;
17107 + flags |= _KERNPG_TABLE;
17108 + if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
17109 + phys_addr>>PAGE_SHIFT,
17110 + size, __pgprot(flags), domid)) {
17111 + vunmap((void __force *) addr);
17114 + return (void __iomem *) (offset + (char __iomem *)addr);
17116 +EXPORT_SYMBOL(__ioremap);
17119 + * ioremap_nocache - map bus memory into CPU space
17120 + * @offset: bus address of the memory
17121 + * @size: size of the resource to map
17123 + * ioremap_nocache performs a platform specific sequence of operations to
17124 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
17125 + * writew/writel functions and the other mmio helpers. The returned
17126 + * address is not guaranteed to be usable directly as a virtual
17129 + * This version of ioremap ensures that the memory is marked uncachable
17130 + * on the CPU as well as honouring existing caching rules from things like
17131 + * the PCI bus. Note that there are other caches and buffers on many
17132 + * busses. In particular driver authors should read up on PCI writes
17134 + * It's useful if some control registers are in such an area and
17135 + * write combining or read caching is not desirable:
17137 + * Must be freed with iounmap.
17140 +void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
17142 + unsigned long last_addr;
17143 + void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
17147 + /* Guaranteed to be > phys_addr, as per __ioremap() */
17148 + last_addr = phys_addr + size - 1;
17150 + if (is_local_lowmem(last_addr)) {
17151 + struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
17152 + unsigned long npages;
17154 + phys_addr &= PAGE_MASK;
17156 + /* This might overflow and become zero.. */
17157 + last_addr = PAGE_ALIGN(last_addr);
17159 + /* .. but that's ok, because modulo-2**n arithmetic will make
17160 + * the page-aligned "last - first" come out right.
17162 + npages = (last_addr - phys_addr) >> PAGE_SHIFT;
17164 + if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
17168 + global_flush_tlb();
17173 +EXPORT_SYMBOL(ioremap_nocache);
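Typical driver-side usage of the uncached variant — a sketch, with
bar_phys standing in for a PCI BAR address obtained elsewhere and 0x10 as
an assumed register offset:

	/* Sketch: map 4KB of device registers uncached, read one, unmap */
	void __iomem *regs = ioremap_nocache(bar_phys, PAGE_SIZE);

	if (regs) {
		u32 status = readl(regs + 0x10);
		iounmap(regs);
	}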
17176 + * iounmap - Free an IO remapping
17177 + * @addr: virtual address from ioremap_*
17179 + * Caller must ensure there is only one unmapping for the same pointer.
17181 +void iounmap(volatile void __iomem *addr)
17183 + struct vm_struct *p, *o;
17185 + if ((void __force *)addr <= high_memory)
17189 + * __ioremap special-cases the PCI/ISA range by not instantiating a
17190 + * vm_area and by simply returning an address into the kernel mapping
17191 + * of ISA space. So handle that here.
17193 + if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
17196 + addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
17198 + /* Use the vm area unlocked, assuming the caller
17199 + ensures there isn't another iounmap for the same address
17200 + in parallel. Reuse of the virtual address is prevented by
17201 + leaving it in the global lists until we're done with it.
17202 + cpa takes care of the direct mappings. */
17203 + read_lock(&vmlist_lock);
17204 + for (p = vmlist; p; p = p->next) {
17205 + if (p->addr == addr)
17208 + read_unlock(&vmlist_lock);
17211 + printk("iounmap: bad address %p\n", addr);
17216 + /* Reset the direct mapping. Can block */
17217 + if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
17218 + /* p->size includes the guard page, but cpa doesn't like that */
17219 + change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
17220 + (p->size - PAGE_SIZE) >> PAGE_SHIFT,
17222 + global_flush_tlb();
17225 + /* Finally remove it */
17226 + o = remove_vm_area((void *)addr);
17227 + BUG_ON(p != o || o == NULL);
17230 +EXPORT_SYMBOL(iounmap);
17232 +void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
17234 + unsigned long offset, last_addr;
17235 + unsigned int nrpages;
17236 + enum fixed_addresses idx;
17238 + /* Don't allow wraparound or zero size */
17239 + last_addr = phys_addr + size - 1;
17240 + if (!size || last_addr < phys_addr)
17244 + * Don't remap the low PCI/ISA area, it's always mapped..
17246 + if (is_initial_xendomain() &&
17247 + phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
17248 + return isa_bus_to_virt(phys_addr);
17251 + * Mappings have to be page-aligned
17253 + offset = phys_addr & ~PAGE_MASK;
17254 + phys_addr &= PAGE_MASK;
17255 + size = PAGE_ALIGN(last_addr) - phys_addr;
17258 + * Mappings have to fit in the FIX_BTMAP area.
17260 + nrpages = size >> PAGE_SHIFT;
17261 + if (nrpages > NR_FIX_BTMAPS)
17265 + * Ok, go for it..
17267 + idx = FIX_BTMAP_BEGIN;
17268 + while (nrpages > 0) {
17269 + set_fixmap(idx, phys_addr);
17270 + phys_addr += PAGE_SIZE;
17274 + return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
17277 +void __init bt_iounmap(void *addr, unsigned long size)
17279 + unsigned long virt_addr;
17280 + unsigned long offset;
17281 + unsigned int nrpages;
17282 + enum fixed_addresses idx;
17284 + virt_addr = (unsigned long)addr;
17285 + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
17287 + if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
17289 + offset = virt_addr & ~PAGE_MASK;
17290 + nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
17292 + idx = FIX_BTMAP_BEGIN;
17293 + while (nrpages > 0) {
17294 + clear_fixmap(idx);
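bt_ioremap()/bt_iounmap() cover the window before the vmalloc-based
ioremap machinery is up: mappings are borrowed from the fixed FIX_BTMAP
slots (at most NR_FIX_BTMAPS pages) and must be handed back promptly. A
sketch, with table_phys as an assumed firmware table address:

	/* Sketch: peek at a firmware table during early boot */
	void *tbl = bt_ioremap(table_phys, PAGE_SIZE);

	if (tbl) {
		/* ... parse the table ... */
		bt_iounmap(tbl, PAGE_SIZE);
	}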
17299 Index: head-2008-11-25/arch/x86/mm/pgtable_32-xen.c
17300 ===================================================================
17301 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
17302 +++ head-2008-11-25/arch/x86/mm/pgtable_32-xen.c 2007-10-09 11:48:25.000000000 +0200
17305 + * linux/arch/i386/mm/pgtable.c
17308 +#include <linux/sched.h>
17309 +#include <linux/kernel.h>
17310 +#include <linux/errno.h>
17311 +#include <linux/mm.h>
17312 +#include <linux/swap.h>
17313 +#include <linux/smp.h>
17314 +#include <linux/highmem.h>
17315 +#include <linux/slab.h>
17316 +#include <linux/pagemap.h>
17317 +#include <linux/spinlock.h>
17318 +#include <linux/module.h>
17320 +#include <asm/system.h>
17321 +#include <asm/pgtable.h>
17322 +#include <asm/pgalloc.h>
17323 +#include <asm/fixmap.h>
17324 +#include <asm/e820.h>
17325 +#include <asm/tlb.h>
17326 +#include <asm/tlbflush.h>
17327 +#include <asm/io.h>
17328 +#include <asm/mmu_context.h>
17330 +#include <xen/features.h>
17331 +#include <asm/hypervisor.h>
17333 +static void pgd_test_and_unpin(pgd_t *pgd);
17335 +void show_mem(void)
17337 + int total = 0, reserved = 0;
17338 + int shared = 0, cached = 0;
17340 + struct page *page;
17341 + pg_data_t *pgdat;
17343 + unsigned long flags;
17345 + printk(KERN_INFO "Mem-info:\n");
17346 + show_free_areas();
17347 + printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
17348 + for_each_online_pgdat(pgdat) {
17349 + pgdat_resize_lock(pgdat, &flags);
17350 + for (i = 0; i < pgdat->node_spanned_pages; ++i) {
17351 + page = pgdat_page_nr(pgdat, i);
17353 + if (PageHighMem(page))
17355 + if (PageReserved(page))
17357 + else if (PageSwapCache(page))
17359 + else if (page_count(page))
17360 + shared += page_count(page) - 1;
17362 + pgdat_resize_unlock(pgdat, &flags);
17364 + printk(KERN_INFO "%d pages of RAM\n", total);
17365 + printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
17366 + printk(KERN_INFO "%d reserved pages\n", reserved);
17367 + printk(KERN_INFO "%d pages shared\n", shared);
17368 + printk(KERN_INFO "%d pages swap cached\n", cached);
17370 + printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
17371 + printk(KERN_INFO "%lu pages writeback\n",
17372 + global_page_state(NR_WRITEBACK));
17373 + printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
17374 + printk(KERN_INFO "%lu pages slab\n", global_page_state(NR_SLAB));
17375 + printk(KERN_INFO "%lu pages pagetables\n",
17376 + global_page_state(NR_PAGETABLE));
17380 + * Associate a large virtual page frame with a given physical page frame
17381 + * and protection flags for that frame. pfn is for the base of the page,
17382 + * vaddr is what the page gets mapped to - both must be properly aligned.
17383 + * The pmd must already be instantiated. Assumes PAE mode.
17385 +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
17391 + if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
17392 + printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
17393 + return; /* BUG(); */
17395 + if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
17396 + printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
17397 + return; /* BUG(); */
17399 + pgd = swapper_pg_dir + pgd_index(vaddr);
17400 + if (pgd_none(*pgd)) {
17401 + printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
17402 + return; /* BUG(); */
17404 + pud = pud_offset(pgd, vaddr);
17405 + pmd = pmd_offset(pud, vaddr);
17406 + set_pmd(pmd, pfn_pmd(pfn, flags));
17408 + * It's enough to flush this one mapping.
17409 + * (PGE mappings get flushed as well)
17411 + __flush_tlb_one(vaddr);
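A minimal usage sketch for set_pmd_pfn() above (not from the patch; the values are hypothetical, and PAGE_KERNEL_LARGE is assumed from the i386 headers):

	/* Map one PMD-sized large page: vaddr must be PMD_SIZE-aligned and
	 * pfn must be PTRS_PER_PTE-aligned, or the warnings above fire. */
	void map_example_large_page(void)
	{
		unsigned long vaddr = 0xffa00000UL;	/* hypothetical, 2 MiB aligned */
		unsigned long pfn = 0x800UL;		/* hypothetical, 512-page aligned */

		set_pmd_pfn(vaddr, pfn, PAGE_KERNEL_LARGE);
	}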
17414 +static int nr_fixmaps = 0;
17415 +unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
17416 +unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE);
17417 +EXPORT_SYMBOL(__FIXADDR_TOP);
17419 +void __init set_fixaddr_top(unsigned long top)
17421 + BUG_ON(nr_fixmaps > 0);
17422 + hypervisor_virt_start = top;
17423 + __FIXADDR_TOP = hypervisor_virt_start - 2 * PAGE_SIZE;
17426 +void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
17428 + unsigned long address = __fix_to_virt(idx);
17431 + if (idx >= __end_of_fixed_addresses) {
17436 + case FIX_WP_TEST:
17438 + pte = pfn_pte(phys >> PAGE_SHIFT, flags);
17441 + pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
17444 + if (HYPERVISOR_update_va_mapping(address, pte,
17445 + UVMF_INVLPG|UVMF_ALL))
17450 +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
17452 + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
17454 + make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
17458 +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
17460 + struct page *pte;
17462 +#ifdef CONFIG_HIGHPTE
17463 + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
17465 + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
17468 + SetPageForeign(pte, pte_free);
17469 + init_page_count(pte);
17474 +void pte_free(struct page *pte)
17476 + unsigned long pfn = page_to_pfn(pte);
17478 + if (!PageHighMem(pte)) {
17479 + unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT);
17481 + if (!pte_write(*virt_to_ptep(va)))
17482 + if (HYPERVISOR_update_va_mapping(
17483 + va, pfn_pte(pfn, PAGE_KERNEL), 0))
17486 + clear_bit(PG_pinned, &pte->flags);
17488 + ClearPageForeign(pte);
17489 + init_page_count(pte);
17491 + __free_page(pte);
17494 +void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
17496 + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
17500 + * List of all pgd's needed for non-PAE so it can invalidate entries
17501 + * in both cached and uncached pgd's; not needed for PAE since the
17502 + * kernel pmd is shared. If PAE were not to share the pmd a similar
17503 + * tactic would be needed. This is essentially codepath-based locking
17504 + * against pageattr.c; it is the unique case in which a valid change
17505 + * of kernel pagetables can't be lazily synchronized by vmalloc faults.
17506 + * vmalloc faults work because attached pagetables are never freed.
17507 + * The locking scheme was chosen on the basis of manfred's
17508 + * recommendations and having no core impact whatsoever.
17511 +DEFINE_SPINLOCK(pgd_lock);
17512 +struct page *pgd_list;
17514 +static inline void pgd_list_add(pgd_t *pgd)
17516 + struct page *page = virt_to_page(pgd);
17517 + page->index = (unsigned long)pgd_list;
17519 + set_page_private(pgd_list, (unsigned long)&page->index);
17521 + set_page_private(page, (unsigned long)&pgd_list);
17524 +static inline void pgd_list_del(pgd_t *pgd)
17526 + struct page *next, **pprev, *page = virt_to_page(pgd);
17527 + next = (struct page *)page->index;
17528 + pprev = (struct page **)page_private(page);
17531 + set_page_private(next, (unsigned long)pprev);
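pgd_list_add()/pgd_list_del() above thread pgd pages into a singly linked list with a back-pointer, the same idiom as the kernel's hlist: page->index acts as the "next" pointer and page_private() points at whatever slot refers to this page, so unlinking is O(1). A self-contained sketch of the idiom with a plain struct (names hypothetical):

	struct node {
		struct node *next;	/* plays the role of page->index */
		struct node **pprev;	/* plays the role of page_private() */
	};

	static struct node *node_list;

	static void node_add(struct node *n)
	{
		n->next = node_list;
		if (node_list)
			node_list->pprev = &n->next;
		node_list = n;
		n->pprev = &node_list;
	}

	static void node_del(struct node *n)
	{
		if (n->next)
			n->next->pprev = n->pprev;
		*n->pprev = n->next;	/* works for head and middle alike */
	}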
17534 +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
17536 + unsigned long flags;
17538 + if (PTRS_PER_PMD > 1) {
17539 + if (HAVE_SHARED_KERNEL_PMD)
17540 + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
17541 + swapper_pg_dir + USER_PTRS_PER_PGD,
17542 + KERNEL_PGD_PTRS);
17544 + spin_lock_irqsave(&pgd_lock, flags);
17545 + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
17546 + swapper_pg_dir + USER_PTRS_PER_PGD,
17547 + KERNEL_PGD_PTRS);
17548 + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
17549 + pgd_list_add(pgd);
17550 + spin_unlock_irqrestore(&pgd_lock, flags);
17554 +/* never called when PTRS_PER_PMD > 1 */
17555 +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
17557 + unsigned long flags; /* can be called from interrupt context */
17559 + spin_lock_irqsave(&pgd_lock, flags);
17560 + pgd_list_del(pgd);
17561 + spin_unlock_irqrestore(&pgd_lock, flags);
17563 + pgd_test_and_unpin(pgd);
17566 +pgd_t *pgd_alloc(struct mm_struct *mm)
17569 + pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
17571 + unsigned long flags;
17573 + pgd_test_and_unpin(pgd);
17575 + if (PTRS_PER_PMD == 1 || !pgd)
17578 + if (HAVE_SHARED_KERNEL_PMD) {
17579 + for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
17580 + pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
17583 + set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
17589 + * We can race save/restore (if we sleep during a GFP_KERNEL memory
17590 + * allocation). We therefore store virtual addresses of pmds as they
17591 + * do not change across save/restore, and poke the machine addresses
17592 + * into the pgdir under the pgd_lock.
17594 + pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
17596 + kmem_cache_free(pgd_cache, pgd);
17600 + /* Allocate pmds, remember virtual addresses. */
17601 + for (i = 0; i < PTRS_PER_PGD; ++i) {
17602 + pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
17607 + spin_lock_irqsave(&pgd_lock, flags);
17609 + /* Protect against save/restore: move below 4GB under pgd_lock. */
17610 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
17611 + int rc = xen_create_contiguous_region(
17612 + (unsigned long)pgd, 0, 32);
17614 + spin_unlock_irqrestore(&pgd_lock, flags);
17619 + /* Copy kernel pmd contents and write-protect the new pmds. */
17620 + for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
17621 + unsigned long v = (unsigned long)i << PGDIR_SHIFT;
17622 + pgd_t *kpgd = pgd_offset_k(v);
17623 + pud_t *kpud = pud_offset(kpgd, v);
17624 + pmd_t *kpmd = pmd_offset(kpud, v);
17625 + memcpy(pmd[i], kpmd, PAGE_SIZE);
17626 + make_lowmem_page_readonly(
17627 + pmd[i], XENFEAT_writable_page_tables);
17630 + /* It is safe to poke machine addresses of pmds under the pgd_lock. */
17631 + for (i = 0; i < PTRS_PER_PGD; i++)
17632 + set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i])));
17634 + /* Ensure this pgd gets picked up and pinned on save/restore. */
17635 + pgd_list_add(pgd);
17637 + spin_unlock_irqrestore(&pgd_lock, flags);
17644 + if (HAVE_SHARED_KERNEL_PMD) {
17645 + for (i--; i >= 0; i--)
17646 + kmem_cache_free(pmd_cache,
17647 + (void *)__va(pgd_val(pgd[i])-1));
17649 + for (i--; i >= 0; i--)
17650 + kmem_cache_free(pmd_cache, pmd[i]);
17653 + kmem_cache_free(pgd_cache, pgd);
17657 +void pgd_free(pgd_t *pgd)
17662 + * After this the pgd should not be pinned for the duration of this
17663 + * function's execution. We should never sleep and thus never race:
17664 + * 1. User pmds will not become write-protected under our feet due
17665 + * to a concurrent mm_pin_all().
17666 + * 2. The machine addresses in PGD entries will not become invalid
17667 + * due to a concurrent save/restore.
17669 + pgd_test_and_unpin(pgd);
17671 + /* in the PAE case user pgd entries are overwritten before usage */
17672 + if (PTRS_PER_PMD > 1) {
17673 + for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
17674 + pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
17675 + kmem_cache_free(pmd_cache, pmd);
17678 + if (!HAVE_SHARED_KERNEL_PMD) {
17679 + unsigned long flags;
17680 + spin_lock_irqsave(&pgd_lock, flags);
17681 + pgd_list_del(pgd);
17682 + spin_unlock_irqrestore(&pgd_lock, flags);
17684 + for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
17685 + pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
17686 + make_lowmem_page_writable(
17687 + pmd, XENFEAT_writable_page_tables);
17688 + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
17689 + kmem_cache_free(pmd_cache, pmd);
17692 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
17693 + xen_destroy_contiguous_region(
17694 + (unsigned long)pgd, 0);
17698 + /* in the non-PAE case, free_pgtables() clears user pgd entries */
17699 + kmem_cache_free(pgd_cache, pgd);
17702 +void make_lowmem_page_readonly(void *va, unsigned int feature)
17707 + if (xen_feature(feature))
17710 + pte = virt_to_ptep(va);
17711 + rc = HYPERVISOR_update_va_mapping(
17712 + (unsigned long)va, pte_wrprotect(*pte), 0);
17716 +void make_lowmem_page_writable(void *va, unsigned int feature)
17721 + if (xen_feature(feature))
17724 + pte = virt_to_ptep(va);
17725 + rc = HYPERVISOR_update_va_mapping(
17726 + (unsigned long)va, pte_mkwrite(*pte), 0);
17730 +void make_page_readonly(void *va, unsigned int feature)
17735 + if (xen_feature(feature))
17738 + pte = virt_to_ptep(va);
17739 + rc = HYPERVISOR_update_va_mapping(
17740 + (unsigned long)va, pte_wrprotect(*pte), 0);
17741 + if (rc) /* fallback? */
17742 + xen_l1_entry_update(pte, pte_wrprotect(*pte));
17743 + if ((unsigned long)va >= (unsigned long)high_memory) {
17744 + unsigned long pfn = pte_pfn(*pte);
17745 +#ifdef CONFIG_HIGHMEM
17746 + if (pfn >= highstart_pfn)
17747 + kmap_flush_unused(); /* flush stale writable kmaps */
17750 + make_lowmem_page_readonly(
17751 + phys_to_virt(pfn << PAGE_SHIFT), feature);
17755 +void make_page_writable(void *va, unsigned int feature)
17760 + if (xen_feature(feature))
17763 + pte = virt_to_ptep(va);
17764 + rc = HYPERVISOR_update_va_mapping(
17765 + (unsigned long)va, pte_mkwrite(*pte), 0);
17766 + if (rc) /* fallback? */
17767 + xen_l1_entry_update(pte, pte_mkwrite(*pte));
17768 + if ((unsigned long)va >= (unsigned long)high_memory) {
17769 + unsigned long pfn = pte_pfn(*pte);
17770 +#ifdef CONFIG_HIGHMEM
17771 + if (pfn < highstart_pfn)
17773 + make_lowmem_page_writable(
17774 + phys_to_virt(pfn << PAGE_SHIFT), feature);
17778 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
17780 + if (xen_feature(feature))
17783 + while (nr-- != 0) {
17784 + make_page_readonly(va, feature);
17785 + va = (void *)((unsigned long)va + PAGE_SIZE);
17789 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
17791 + if (xen_feature(feature))
17794 + while (nr-- != 0) {
17795 + make_page_writable(va, feature);
17796 + va = (void *)((unsigned long)va + PAGE_SIZE);
17800 +static void _pin_lock(struct mm_struct *mm, int lock) {
17802 + spin_lock(&mm->page_table_lock);
17803 +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
17804 + /* While mm->page_table_lock protects us against insertions and
17805 + * removals of higher level page table pages, it doesn't protect
17806 + * against updates of PTEs. Such updates, however, require the
17807 + * pte pages to be in a consistent state (unpinned+writable or
17808 + * pinned+readonly). The pinning and attribute changes, however,
17809 + * cannot be done atomically, which is why such updates must be
17810 + * prevented from happening concurrently.
17811 + * Note that no pte lock can ever be acquired elsewhere nesting
17812 + * with an already acquired one in the same mm, or with the mm's
17813 + * page_table_lock already acquired, as that would break in the
17814 + * non-split case (where all of these actually resolve to the
17815 + * one page_table_lock). Thus acquiring all of them here cannot
17816 + * result in deadlocks, and the order of acquires
17817 + * doesn't matter.
17820 + pgd_t *pgd = mm->pgd;
17823 + for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
17827 + if (pgd_none(*pgd))
17829 + pud = pud_offset(pgd, 0);
17830 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
17834 + if (pud_none(*pud))
17836 + pmd = pmd_offset(pud, 0);
17837 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
17840 + if (pmd_none(*pmd))
17842 + ptl = pte_lockptr(0, pmd);
17846 + spin_unlock(ptl);
17853 + spin_unlock(&mm->page_table_lock);
17855 +#define pin_lock(mm) _pin_lock(mm, 1)
17856 +#define pin_unlock(mm) _pin_lock(mm, 0)
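How the pair is meant to bracket a pin or unpin, as a sketch only (the real mm_pin()/mm_unpin() bodies appear further below):

	static void mm_pin_sketch(struct mm_struct *mm)
	{
		pin_lock(mm);	/* page_table_lock plus every pte lock in the mm */
		/* ... flip the page tables read-only, issue the pin hypercall ... */
		pin_unlock(mm);	/* release the pte locks, then page_table_lock */
	}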
17858 +#define PIN_BATCH 4
17859 +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
17861 +static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
17862 + unsigned int cpu, unsigned seq)
17864 + unsigned long pfn = page_to_pfn(page);
17866 + if (PageHighMem(page)) {
17867 + if (pgprot_val(flags) & _PAGE_RW)
17868 + clear_bit(PG_pinned, &page->flags);
17870 + set_bit(PG_pinned, &page->flags);
17872 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
17873 + (unsigned long)__va(pfn << PAGE_SHIFT),
17874 + pfn_pte(pfn, flags), 0);
17875 + if (unlikely(++seq == PIN_BATCH)) {
17876 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
17877 + PIN_BATCH, NULL)))
17886 +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
17888 + pgd_t *pgd = pgd_base;
17892 + unsigned int cpu, seq;
17894 + if (xen_feature(XENFEAT_auto_translated_physmap))
17899 + for (g = 0, seq = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
17900 + if (pgd_none(*pgd))
17902 + pud = pud_offset(pgd, 0);
17903 + if (PTRS_PER_PUD > 1) /* not folded */
17904 + seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
17905 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
17906 + if (pud_none(*pud))
17908 + pmd = pmd_offset(pud, 0);
17909 + if (PTRS_PER_PMD > 1) /* not folded */
17910 + seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
17911 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
17912 + if (pmd_none(*pmd))
17914 + seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
17919 + if (likely(seq != 0)) {
17920 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
17921 + (unsigned long)pgd_base,
17922 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
17924 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
17927 + } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
17928 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
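The per-CPU pb_mcl buffer lets pgd_walk_set_prot() turn up to PIN_BATCH va-mapping updates into a single hypercall. A condensed sketch of the queue-and-flush idiom, using only names from the code above:

	static unsigned int queue_va_update(multicall_entry_t *mcl, unsigned int seq,
					    unsigned long va, pte_t pte)
	{
		MULTI_update_va_mapping(mcl + seq, va, pte, 0);	/* queue, no trap yet */
		if (++seq == PIN_BATCH) {
			/* batch full: one multicall flushes all queued updates */
			if (HYPERVISOR_multicall_check(mcl, PIN_BATCH, NULL))
				BUG();
			seq = 0;
		}
		return seq;	/* caller threads this through successive calls */
	}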
17935 +static void __pgd_pin(pgd_t *pgd)
17937 + pgd_walk(pgd, PAGE_KERNEL_RO);
17938 + kmap_flush_unused();
17939 + xen_pgd_pin(__pa(pgd));
17940 + set_bit(PG_pinned, &virt_to_page(pgd)->flags);
17943 +static void __pgd_unpin(pgd_t *pgd)
17945 + xen_pgd_unpin(__pa(pgd));
17946 + pgd_walk(pgd, PAGE_KERNEL);
17947 + clear_bit(PG_pinned, &virt_to_page(pgd)->flags);
17950 +static void pgd_test_and_unpin(pgd_t *pgd)
17952 + if (test_bit(PG_pinned, &virt_to_page(pgd)->flags))
17953 + __pgd_unpin(pgd);
17956 +void mm_pin(struct mm_struct *mm)
17958 + if (xen_feature(XENFEAT_writable_page_tables))
17961 + __pgd_pin(mm->pgd);
17965 +void mm_unpin(struct mm_struct *mm)
17967 + if (xen_feature(XENFEAT_writable_page_tables))
17970 + __pgd_unpin(mm->pgd);
17974 +void mm_pin_all(void)
17976 + struct page *page;
17977 + unsigned long flags;
17979 + if (xen_feature(XENFEAT_writable_page_tables))
17983 + * Allow uninterrupted access to the pgd_list. Also protects
17984 + * __pgd_pin() by disabling preemption.
17985 + * All other CPUs must be at a safe point (e.g., in stop_machine
17986 + * or offlined entirely).
17988 + spin_lock_irqsave(&pgd_lock, flags);
17989 + for (page = pgd_list; page; page = (struct page *)page->index) {
17990 + if (!test_bit(PG_pinned, &page->flags))
17991 + __pgd_pin((pgd_t *)page_address(page));
17993 + spin_unlock_irqrestore(&pgd_lock, flags);
17996 +void _arch_dup_mmap(struct mm_struct *mm)
17998 + if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags))
18002 +void _arch_exit_mmap(struct mm_struct *mm)
18004 + struct task_struct *tsk = current;
18009 + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
18010 + * *much* faster this way, as avoiding TLB flushes means bigger wrpt batches.
18012 + if (tsk->active_mm == mm) {
18013 + tsk->active_mm = &init_mm;
18014 + atomic_inc(&init_mm.mm_count);
18016 + switch_mm(mm, &init_mm, tsk);
18018 + atomic_dec(&mm->mm_count);
18019 + BUG_ON(atomic_read(&mm->mm_count) == 0);
18022 + task_unlock(tsk);
18024 + if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) &&
18025 + (atomic_read(&mm->mm_count) == 1) &&
18026 + !mm->context.has_foreign_mappings)
18029 Index: head-2008-11-25/arch/x86/oprofile/xenoprof.c
18030 ===================================================================
18031 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
18032 +++ head-2008-11-25/arch/x86/oprofile/xenoprof.c 2008-01-28 12:24:19.000000000 +0100
18035 + * @file xenoprof.c
18037 + * @remark Copyright 2002 OProfile authors
18038 + * @remark Read the file COPYING
18040 + * @author John Levon <levon@movementarian.org>
18042 + * Modified by Aravind Menon and Jose Renato Santos for Xen
18043 + * These modifications are:
18044 + * Copyright (C) 2005 Hewlett-Packard Co.
18046 + * x86-specific part
18047 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
18048 + * VA Linux Systems Japan K.K.
18051 +#include <linux/init.h>
18052 +#include <linux/oprofile.h>
18053 +#include <linux/sched.h>
18054 +#include <asm/pgtable.h>
18056 +#include <xen/driver_util.h>
18057 +#include <xen/interface/xen.h>
18058 +#include <xen/interface/xenoprof.h>
18059 +#include <xen/xenoprof.h>
18060 +#include "op_counter.h"
18062 +static unsigned int num_events = 0;
18064 +void __init xenoprof_arch_init_counter(struct xenoprof_init *init)
18066 + num_events = init->num_events;
18067 + /* just in case - make sure we do not overflow event list
18068 + (i.e. counter_config list) */
18069 + if (num_events > OP_MAX_COUNTER) {
18070 + num_events = OP_MAX_COUNTER;
18071 + init->num_events = num_events;
18075 +void xenoprof_arch_counter(void)
18078 + struct xenoprof_counter counter;
18080 + for (i=0; i<num_events; i++) {
18082 + counter.count = (uint64_t)counter_config[i].count;
18083 + counter.enabled = (uint32_t)counter_config[i].enabled;
18084 + counter.event = (uint32_t)counter_config[i].event;
18085 + counter.kernel = (uint32_t)counter_config[i].kernel;
18086 + counter.user = (uint32_t)counter_config[i].user;
18087 + counter.unit_mask = (uint64_t)counter_config[i].unit_mask;
18088 + WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_counter,
18093 +void xenoprof_arch_start(void)
18098 +void xenoprof_arch_stop(void)
18103 +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer * sbuf)
18105 + if (sbuf->buffer) {
18106 + vunmap(sbuf->buffer);
18107 + sbuf->buffer = NULL;
18111 +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer * get_buffer,
18112 + struct xenoprof_shared_buffer * sbuf)
18115 + struct vm_struct *area;
18117 + sbuf->buffer = NULL;
18118 + if ( (ret = HYPERVISOR_xenoprof_op(XENOPROF_get_buffer, get_buffer)) )
18121 + npages = (get_buffer->bufsize * get_buffer->nbuf - 1) / PAGE_SIZE + 1;
18123 + area = alloc_vm_area(npages * PAGE_SIZE);
18124 + if (area == NULL)
18127 + if ( (ret = direct_kernel_remap_pfn_range(
18128 + (unsigned long)area->addr,
18129 + get_buffer->buf_gmaddr >> PAGE_SHIFT,
18130 + npages * PAGE_SIZE, __pgprot(_KERNPG_TABLE),
18132 + vunmap(area->addr);
18136 + sbuf->buffer = area->addr;
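The npages expression above is the usual integer round-up, equivalent to DIV_ROUND_UP(bufsize * nbuf, PAGE_SIZE); with hypothetical numbers:

	/* bufsize = 4096, nbuf = 3, PAGE_SIZE = 4096:
	 * (12288 - 1) / 4096 + 1 = 2 + 1 = 3 pages;
	 * one byte more (12289 total) would round up to 4. */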
18140 +int xenoprof_arch_set_passive(struct xenoprof_passive * pdomain,
18141 + struct xenoprof_shared_buffer * sbuf)
18145 + struct vm_struct *area;
18146 + pgprot_t prot = __pgprot(_KERNPG_TABLE);
18148 + sbuf->buffer = NULL;
18149 + ret = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, pdomain);
18153 + npages = (pdomain->bufsize * pdomain->nbuf - 1) / PAGE_SIZE + 1;
18155 + area = alloc_vm_area(npages * PAGE_SIZE);
18156 + if (area == NULL) {
18161 + ret = direct_kernel_remap_pfn_range(
18162 + (unsigned long)area->addr,
18163 + pdomain->buf_gmaddr >> PAGE_SHIFT,
18164 + npages * PAGE_SIZE, prot, DOMID_SELF);
18166 + vunmap(area->addr);
18169 + sbuf->buffer = area->addr;
18175 +struct op_counter_config counter_config[OP_MAX_COUNTER];
18177 +int xenoprof_create_files(struct super_block * sb, struct dentry * root)
18181 + for (i = 0; i < num_events; ++i) {
18182 + struct dentry * dir;
18185 + snprintf(buf, 2, "%d", i);
18186 + dir = oprofilefs_mkdir(sb, root, buf);
18187 + oprofilefs_create_ulong(sb, dir, "enabled",
18188 + &counter_config[i].enabled);
18189 + oprofilefs_create_ulong(sb, dir, "event",
18190 + &counter_config[i].event);
18191 + oprofilefs_create_ulong(sb, dir, "count",
18192 + &counter_config[i].count);
18193 + oprofilefs_create_ulong(sb, dir, "unit_mask",
18194 + &counter_config[i].unit_mask);
18195 + oprofilefs_create_ulong(sb, dir, "kernel",
18196 + &counter_config[i].kernel);
18197 + oprofilefs_create_ulong(sb, dir, "user",
18198 + &counter_config[i].user);
18204 +int __init oprofile_arch_init(struct oprofile_operations * ops)
18206 + return xenoprofile_init(ops);
18209 +void oprofile_arch_exit(void)
18211 + xenoprofile_exit();
18213 Index: head-2008-11-25/arch/x86/pci/irq-xen.c
18214 ===================================================================
18215 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
18216 +++ head-2008-11-25/arch/x86/pci/irq-xen.c 2008-03-06 08:54:32.000000000 +0100
18219 + * Low-Level PCI Support for PC -- Routing of Interrupts
18221 + * (c) 1999--2000 Martin Mares <mj@ucw.cz>
18224 +#include <linux/types.h>
18225 +#include <linux/kernel.h>
18226 +#include <linux/pci.h>
18227 +#include <linux/init.h>
18228 +#include <linux/slab.h>
18229 +#include <linux/interrupt.h>
18230 +#include <linux/dmi.h>
18231 +#include <asm/io.h>
18232 +#include <asm/smp.h>
18233 +#include <asm/io_apic.h>
18234 +#include <linux/irq.h>
18235 +#include <linux/acpi.h>
18239 +#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
18240 +#define PIRQ_VERSION 0x0100
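PIRQ_SIGNATURE simply packs the ASCII string "$PIR" into a little-endian 32-bit word; spelled out:

	/* '$' = 0x24, 'P' = 0x50, 'I' = 0x49, 'R' = 0x52, so
	 * 0x24 | 0x50 << 8 | 0x49 << 16 | 0x52 << 24 = 0x52495024,
	 * which reads back as the bytes "$PIR" on little-endian x86. */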
18242 +static int broken_hp_bios_irq9;
18243 +static int acer_tm360_irqrouting;
18245 +static struct irq_routing_table *pirq_table;
18247 +static int pirq_enable_irq(struct pci_dev *dev);
18250 + * Never use: 0, 1, 2 (timer, keyboard, and cascade)
18251 + * Avoid using: 13, 14 and 15 (FP error and IDE).
18252 + * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse)
18254 +unsigned int pcibios_irq_mask = 0xfff8;
18256 +static int pirq_penalty[16] = {
18257 + 1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
18258 + 0, 0, 0, 0, 1000, 100000, 100000, 100000
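Read the table together with the comment above it: the index is the ISA IRQ number, and pcibios_lookup_irq() below prefers the allowed IRQ with the lowest penalty. In summary:

	/* IRQ 0-2        -> 1000000  timer, keyboard, cascade: never use
	 * IRQ 3,4,6,7,12 -> 1000     classic ISA users: serial, floppy,
	 *                            parallel, mouse
	 * IRQ 13-15      -> 100000   FP error and the two IDE channels
	 * IRQ 5,8-11     -> 0        preferred candidates for PCI routing */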
18261 +struct irq_router {
18263 + u16 vendor, device;
18264 + int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
18265 + int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
18268 +struct irq_router_handler {
18270 + int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
18273 +int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL;
18274 +void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
18277 + * Check the passed address for the PCI IRQ Routing Table signature
18278 + * and perform checksum verification.
18281 +static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
18283 + struct irq_routing_table *rt;
18287 + rt = (struct irq_routing_table *) addr;
18288 + if (rt->signature != PIRQ_SIGNATURE ||
18289 + rt->version != PIRQ_VERSION ||
18291 + rt->size < sizeof(struct irq_routing_table))
18294 + for (i=0; i < rt->size; i++)
18297 + DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
18306 + * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
18309 +static struct irq_routing_table * __init pirq_find_routing_table(void)
18312 + struct irq_routing_table *rt;
18315 + if (!is_initial_xendomain())
18318 + if (pirq_table_addr) {
18319 + rt = pirq_check_routing_table((u8 *) isa_bus_to_virt(pirq_table_addr));
18322 + printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
18324 + for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
18325 + rt = pirq_check_routing_table(addr);
18333 + * If we have an IRQ routing table, use it to search for peer host
18334 + * bridges. It's a gross hack, but since there is no other known
18335 + * way to get a list of buses, we have to do it this way.
18338 +static void __init pirq_peer_trick(void)
18340 + struct irq_routing_table *rt = pirq_table;
18343 + struct irq_info *e;
18345 + memset(busmap, 0, sizeof(busmap));
18346 + for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
18347 + e = &rt->slots[i];
18351 + DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
18352 + for(j=0; j<4; j++)
18353 + DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
18357 + busmap[e->bus] = 1;
18359 + for(i = 1; i < 256; i++) {
18360 + if (!busmap[i] || pci_find_bus(0, i))
18362 + if (pci_scan_bus(i, &pci_root_ops, NULL))
18363 + printk(KERN_INFO "PCI: Discovered primary peer bus %02x [IRQ]\n", i);
18365 + pcibios_last_bus = -1;
18369 + * Code for querying and setting of IRQ routes on various interrupt routers.
18372 +void eisa_set_level_irq(unsigned int irq)
18374 + unsigned char mask = 1 << (irq & 7);
18375 + unsigned int port = 0x4d0 + (irq >> 3);
18376 + unsigned char val;
18377 + static u16 eisa_irq_mask;
18379 + if (irq >= 16 || (1 << irq) & eisa_irq_mask)
18382 + eisa_irq_mask |= (1 << irq);
18383 + printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
18385 + if (!(val & mask)) {
18386 + DBG(KERN_DEBUG " -> edge");
18387 + outb(val | mask, port);
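A worked example of the ELCR arithmetic above:

	/* irq = 11: port = 0x4d0 + (11 >> 3) = 0x4d1,
	 * mask = 1 << (11 & 7) = 0x08 -- setting bit 3 of the second ELCR
	 * register switches IRQ 11 from edge- to level-triggered. */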
18392 + * Common IRQ routing practice: nybbles in config space,
18393 + * offset by some magic constant.
18395 +static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr)
18398 + unsigned reg = offset + (nr >> 1);
18400 + pci_read_config_byte(router, reg, &x);
18401 + return (nr & 1) ? (x >> 4) : (x & 0xf);
18404 +static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
18407 + unsigned reg = offset + (nr >> 1);
18409 + pci_read_config_byte(router, reg, &x);
18410 + x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
18411 + pci_write_config_byte(router, reg, x);
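A worked example of the nybble addressing, using the ALI base offset 0x48 seen below:

	/* nr = 3: reg = 0x48 + (3 >> 1) = 0x49, and nr & 1 selects the high
	 * nibble, so the route sits in bits 7:4 of config byte 0x49;
	 * nr = 2 sits in bits 3:0 of the same byte. */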
18415 + * ALI pirq entries are damn ugly, and completely undocumented.
18416 + * This has been figured out from pirq tables, and it's not a pretty
18417 + * picture.
18419 +static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18421 + static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
18423 + return irqmap[read_config_nybble(router, 0x48, pirq-1)];
18426 +static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18428 + static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
18429 + unsigned int val = irqmap[irq];
18432 + write_config_nybble(router, 0x48, pirq-1, val);
18439 + * The Intel PIIX4 pirq rules are fairly simple: "pirq" is
18440 + * just a pointer to the config space.
18442 +static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18446 + pci_read_config_byte(router, pirq, &x);
18447 + return (x < 16) ? x : 0;
18450 +static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18452 + pci_write_config_byte(router, pirq, irq);
18457 + * The VIA pirq rules are nibble-based, like ALI,
18458 + * but without the ugly irq number munging.
18459 + * However, PIRQD is in the upper instead of lower 4 bits.
18461 +static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18463 + return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq);
18466 +static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18468 + write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq);
18473 + * The VIA pirq rules are nibble-based, like ALI,
18474 + * but without the ugly irq number munging.
18475 + * However, for the 82C586, the nibble map is different.
18477 +static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18479 + static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18480 + return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
18483 +static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18485 + static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18486 + write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
18491 + * ITE 8330G pirq rules are nibble-based
18492 + * FIXME: pirqmap may be { 1, 0, 3, 2 },
18493 + * 2+3 are both mapped to irq 9 on my system
18495 +static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18497 + static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
18498 + return read_config_nybble(router,0x43, pirqmap[pirq-1]);
18501 +static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18503 + static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
18504 + write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
18509 + * OPTI: high four bits are nibble pointer..
18510 + * I wonder what the low bits do?
18512 +static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18514 + return read_config_nybble(router, 0xb8, pirq >> 4);
18517 +static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18519 + write_config_nybble(router, 0xb8, pirq >> 4, irq);
18524 + * Cyrix: nibble offset 0x5C
18525 + * 0x5C bits 7:4 are INTB, bits 3:0 are INTA
18526 + * 0x5D bits 7:4 are INTD, bits 3:0 are INTC
18528 +static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18530 + return read_config_nybble(router, 0x5C, (pirq-1)^1);
18533 +static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18535 + write_config_nybble(router, 0x5C, (pirq-1)^1, irq);
18540 + * PIRQ routing for SiS 85C503 router used in several SiS chipsets.
18541 + * We have to deal with the following issues here:
18542 + * - vendors have different ideas about the meaning of link values
18543 + * - some onboard devices (integrated in the chipset) have special
18544 + * links and are thus routed differently (i.e. not via PCI INTA-INTD)
18545 + * - different revision of the router have a different layout for
18546 + * the routing registers, particularly for the onchip devices
18548 + * For all routing registers the common layout is one byte per
18549 + * routable link, defined as:
18550 + * bit 7 IRQ mapping enabled (0) or disabled (1)
18551 + * bits [6:4] reserved (sometimes used for onchip devices)
18552 + * bits [3:0] IRQ to map to
18553 + * allowed: 3-7, 9-12, 14-15
18554 + * reserved: 0, 1, 2, 8, 13
18556 + * The config-space registers located at 0x41/0x42/0x43/0x44 are
18557 + * always used to route the normal PCI INT A/B/C/D respectively.
18558 + * Apparently there are systems implementing PCI routing table using
18559 + * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
18560 + * We try our best to handle both link mappings.
18562 + * Currently (2003-05-21) it appears most SiS chipsets follow the
18563 + * definition of routing registers from the SiS-5595 southbridge.
18564 + * According to the SiS 5595 datasheets the revision id's of the
18565 + * router (ISA-bridge) should be 0x01 or 0xb0.
18567 + * Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1.
18568 + * Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets.
18569 + * They seem to work with the current routing code. However there is
18570 + * some concern because of the two USB-OHCI HCs (original SiS 5595
18571 + * had only one). YMMV.
18573 + * Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1:
18576 + * bits [6:5] must be written 01
18577 + * bit 4 channel-select primary (0), secondary (1)
18580 + * bit 6 OHCI function disabled (0), enabled (1)
18582 + * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
18584 + * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
18586 + * We support USBIRQ (in addition to INTA-INTD) and keep the
18587 + * IDE, ACPI and DAQ routing untouched as set by the BIOS.
18589 + * Currently the only reported exception is the new SiS 65x chipset
18590 + * which includes the SiS 69x southbridge. Here we have the 85C503
18591 + * router revision 0x04 and there are changes in the register layout
18592 + * mostly related to the different USB HCs with USB 2.0 support.
18594 + * Onchip routing for router rev-id 0x04 (trial-and-error observation)
18596 + * 0x60/0x61/0x62/0x63: 1xEHCI and 3xOHCI (companion) USB-HCs
18597 + * bit 6-4 are probably unused, not like 5595
18600 +#define PIRQ_SIS_IRQ_MASK 0x0f
18601 +#define PIRQ_SIS_IRQ_DISABLE 0x80
18602 +#define PIRQ_SIS_USB_ENABLE 0x40
18604 +static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18610 + if (reg >= 0x01 && reg <= 0x04)
18612 + pci_read_config_byte(router, reg, &x);
18613 + return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK);
18616 +static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18622 + if (reg >= 0x01 && reg <= 0x04)
18624 + pci_read_config_byte(router, reg, &x);
18625 + x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
18626 + x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE;
18627 + pci_write_config_byte(router, reg, x);
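Both pirq_sis_get() and pirq_sis_set() start from the same link normalisation (partially elided above); the idea, as a sketch matching the big comment's description of the two table conventions:

	u8 reg = pirq;
	if (reg >= 0x01 && reg <= 0x04)
		reg += 0x40;	/* 0x01..0x04 style links -> config regs 0x41..0x44 */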
18633 + * VLSI: nibble offset 0x74 - educated guess due to routing table and
18634 + * config space of VLSI 82C534 PCI-bridge/router (1004:0102)
18635 + * Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard
18636 + * devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6
18637 + * for the busbridge to the docking station.
18640 +static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18643 + printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
18646 + return read_config_nybble(router, 0x74, pirq-1);
18649 +static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18652 + printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
18655 + write_config_nybble(router, 0x74, pirq-1, irq);
18660 + * ServerWorks: PCI interrupts mapped to system IRQ lines through Index
18661 + * and Redirect I/O registers (0x0c00 and 0x0c01). The Index register
18662 + * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a. The Redirect
18663 + * register is a straight binary coding of desired PIC IRQ (low nibble).
18665 + * The 'link' value in the PIRQ table is already in the correct format
18666 + * for the Index register. There are some special index values:
18667 + * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1,
18668 + * and 0x03 for SMBus.
18670 +static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18672 + outb_p(pirq, 0xc00);
18673 + return inb(0xc01) & 0xf;
18676 +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18678 + outb_p(pirq, 0xc00);
18679 + outb_p(irq, 0xc01);
18683 +/* Support for AMD756 PCI IRQ Routing
18684 + * Jhon H. Caicedo <jhcaiced@osso.org.co>
18685 + * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced)
18686 + * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced)
18687 + * The AMD756 pirq rules are nibble-based
18688 + * offset 0x56 0-3 PIRQA 4-7 PIRQB
18689 + * offset 0x57 0-3 PIRQC 4-7 PIRQD
18691 +static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18697 + irq = read_config_nybble(router, 0x56, pirq - 1);
18699 + printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
18700 + dev->vendor, dev->device, pirq, irq);
18704 +static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18706 + printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
18707 + dev->vendor, dev->device, pirq, irq);
18710 + write_config_nybble(router, 0x56, pirq - 1, irq);
18715 +#ifdef CONFIG_PCI_BIOS
18717 +static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18719 + struct pci_dev *bridge;
18720 + int pin = pci_get_interrupt_pin(dev, &bridge);
18721 + return pcibios_set_irq_routing(bridge, pin, irq);
18726 +static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18728 + static struct pci_device_id __initdata pirq_440gx[] = {
18729 + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
18730 + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
18734 + /* 440GX has a proprietary PIRQ router -- don't use it */
18735 + if (pci_dev_present(pirq_440gx))
18740 + case PCI_DEVICE_ID_INTEL_82371FB_0:
18741 + case PCI_DEVICE_ID_INTEL_82371SB_0:
18742 + case PCI_DEVICE_ID_INTEL_82371AB_0:
18743 + case PCI_DEVICE_ID_INTEL_82371MX:
18744 + case PCI_DEVICE_ID_INTEL_82443MX_0:
18745 + case PCI_DEVICE_ID_INTEL_82801AA_0:
18746 + case PCI_DEVICE_ID_INTEL_82801AB_0:
18747 + case PCI_DEVICE_ID_INTEL_82801BA_0:
18748 + case PCI_DEVICE_ID_INTEL_82801BA_10:
18749 + case PCI_DEVICE_ID_INTEL_82801CA_0:
18750 + case PCI_DEVICE_ID_INTEL_82801CA_12:
18751 + case PCI_DEVICE_ID_INTEL_82801DB_0:
18752 + case PCI_DEVICE_ID_INTEL_82801E_0:
18753 + case PCI_DEVICE_ID_INTEL_82801EB_0:
18754 + case PCI_DEVICE_ID_INTEL_ESB_1:
18755 + case PCI_DEVICE_ID_INTEL_ICH6_0:
18756 + case PCI_DEVICE_ID_INTEL_ICH6_1:
18757 + case PCI_DEVICE_ID_INTEL_ICH7_0:
18758 + case PCI_DEVICE_ID_INTEL_ICH7_1:
18759 + case PCI_DEVICE_ID_INTEL_ICH7_30:
18760 + case PCI_DEVICE_ID_INTEL_ICH7_31:
18761 + case PCI_DEVICE_ID_INTEL_ESB2_0:
18762 + case PCI_DEVICE_ID_INTEL_ICH8_0:
18763 + case PCI_DEVICE_ID_INTEL_ICH8_1:
18764 + case PCI_DEVICE_ID_INTEL_ICH8_2:
18765 + case PCI_DEVICE_ID_INTEL_ICH8_3:
18766 + case PCI_DEVICE_ID_INTEL_ICH8_4:
18767 + case PCI_DEVICE_ID_INTEL_ICH9_0:
18768 + case PCI_DEVICE_ID_INTEL_ICH9_1:
18769 + case PCI_DEVICE_ID_INTEL_ICH9_2:
18770 + case PCI_DEVICE_ID_INTEL_ICH9_3:
18771 + case PCI_DEVICE_ID_INTEL_ICH9_4:
18772 + case PCI_DEVICE_ID_INTEL_ICH9_5:
18773 + r->name = "PIIX/ICH";
18774 + r->get = pirq_piix_get;
18775 + r->set = pirq_piix_set;
18781 +static __init int via_router_probe(struct irq_router *r,
18782 + struct pci_dev *router, u16 device)
18784 + /* FIXME: We should move some of the quirk fixup stuff here */
18787 + * workarounds for some buggy BIOSes
18789 + if (device == PCI_DEVICE_ID_VIA_82C586_0) {
18790 + switch(router->device) {
18791 + case PCI_DEVICE_ID_VIA_82C686:
18793 + * Asus k7m bios wrongly reports 82C686A
18794 + * as 586-compatible
18796 + device = PCI_DEVICE_ID_VIA_82C686;
18798 + case PCI_DEVICE_ID_VIA_8235:
18800 + * Asus a7v-x bios wrongly reports 8235
18801 + * as 586-compatible
18803 + device = PCI_DEVICE_ID_VIA_8235;
18809 + case PCI_DEVICE_ID_VIA_82C586_0:
18811 + r->get = pirq_via586_get;
18812 + r->set = pirq_via586_set;
18814 + case PCI_DEVICE_ID_VIA_82C596:
18815 + case PCI_DEVICE_ID_VIA_82C686:
18816 + case PCI_DEVICE_ID_VIA_8231:
18817 + case PCI_DEVICE_ID_VIA_8233A:
18818 + case PCI_DEVICE_ID_VIA_8235:
18819 + case PCI_DEVICE_ID_VIA_8237:
18820 + /* FIXME: add new ones for 8233/5 */
18822 + r->get = pirq_via_get;
18823 + r->set = pirq_via_set;
18829 +static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18833 + case PCI_DEVICE_ID_VLSI_82C534:
18834 + r->name = "VLSI 82C534";
18835 + r->get = pirq_vlsi_get;
18836 + r->set = pirq_vlsi_set;
18843 +static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18847 + case PCI_DEVICE_ID_SERVERWORKS_OSB4:
18848 + case PCI_DEVICE_ID_SERVERWORKS_CSB5:
18849 + r->name = "ServerWorks";
18850 + r->get = pirq_serverworks_get;
18851 + r->set = pirq_serverworks_set;
18857 +static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18859 + if (device != PCI_DEVICE_ID_SI_503)
18863 + r->get = pirq_sis_get;
18864 + r->set = pirq_sis_set;
18868 +static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18872 + case PCI_DEVICE_ID_CYRIX_5520:
18873 + r->name = "NatSemi";
18874 + r->get = pirq_cyrix_get;
18875 + r->set = pirq_cyrix_set;
18881 +static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18885 + case PCI_DEVICE_ID_OPTI_82C700:
18886 + r->name = "OPTI";
18887 + r->get = pirq_opti_get;
18888 + r->set = pirq_opti_set;
18894 +static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18898 + case PCI_DEVICE_ID_ITE_IT8330G_0:
18900 + r->get = pirq_ite_get;
18901 + r->set = pirq_ite_set;
18907 +static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18911 + case PCI_DEVICE_ID_AL_M1533:
18912 + case PCI_DEVICE_ID_AL_M1563:
18913 + printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
18915 + r->get = pirq_ali_get;
18916 + r->set = pirq_ali_set;
18922 +static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18926 + case PCI_DEVICE_ID_AMD_VIPER_740B:
18927 + r->name = "AMD756";
18929 + case PCI_DEVICE_ID_AMD_VIPER_7413:
18930 + r->name = "AMD766";
18932 + case PCI_DEVICE_ID_AMD_VIPER_7443:
18933 + r->name = "AMD768";
18938 + r->get = pirq_amd756_get;
18939 + r->set = pirq_amd756_set;
18943 +static __initdata struct irq_router_handler pirq_routers[] = {
18944 + { PCI_VENDOR_ID_INTEL, intel_router_probe },
18945 + { PCI_VENDOR_ID_AL, ali_router_probe },
18946 + { PCI_VENDOR_ID_ITE, ite_router_probe },
18947 + { PCI_VENDOR_ID_VIA, via_router_probe },
18948 + { PCI_VENDOR_ID_OPTI, opti_router_probe },
18949 + { PCI_VENDOR_ID_SI, sis_router_probe },
18950 + { PCI_VENDOR_ID_CYRIX, cyrix_router_probe },
18951 + { PCI_VENDOR_ID_VLSI, vlsi_router_probe },
18952 + { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe },
18953 + { PCI_VENDOR_ID_AMD, amd_router_probe },
18954 + /* Someone with docs needs to add the ATI Radeon IGP */
18957 +static struct irq_router pirq_router;
18958 +static struct pci_dev *pirq_router_dev;
18962 + * FIXME: should we have an option to say "generic for
18966 +static void __init pirq_find_router(struct irq_router *r)
18968 + struct irq_routing_table *rt = pirq_table;
18969 + struct irq_router_handler *h;
18971 +#ifdef CONFIG_PCI_BIOS
18972 + if (!rt->signature) {
18973 + printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n");
18974 + r->set = pirq_bios_set;
18975 + r->name = "BIOS";
18980 + /* Default unless a driver reloads it */
18981 + r->name = "default";
18985 + DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
18986 + rt->rtr_vendor, rt->rtr_device);
18988 + pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn);
18989 + if (!pirq_router_dev) {
18990 + DBG(KERN_DEBUG "PCI: Interrupt router not found at "
18991 + "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
18995 + for( h = pirq_routers; h->vendor; h++) {
18996 + /* First look for a router match */
18997 + if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
18999 + /* Fall back to a device match */
19000 + if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
19003 + printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
19004 + pirq_router.name,
19005 + pirq_router_dev->vendor,
19006 + pirq_router_dev->device,
19007 + pci_name(pirq_router_dev));
19010 +static struct irq_info *pirq_get_info(struct pci_dev *dev)
19012 + struct irq_routing_table *rt = pirq_table;
19013 + int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
19014 + struct irq_info *info;
19016 + for (info = rt->slots; entries--; info++)
19017 + if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
19022 +static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
19025 + struct irq_info *info;
19026 + int i, pirq, newirq;
19029 + struct irq_router *r = &pirq_router;
19030 + struct pci_dev *dev2 = NULL;
19031 + char *msg = NULL;
19033 + /* Find IRQ pin */
19034 + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
19036 + DBG(KERN_DEBUG " -> no interrupt pin\n");
19041 + /* Find IRQ routing entry */
19046 + DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
19047 + info = pirq_get_info(dev);
19049 + DBG(" -> not found in routing table\n" KERN_DEBUG);
19052 + pirq = info->irq[pin].link;
19053 + mask = info->irq[pin].bitmap;
19055 + DBG(" -> not routed\n" KERN_DEBUG);
19058 + DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
19059 + mask &= pcibios_irq_mask;
19061 + /* Work around broken HP Pavilion Notebooks which assign USB to
19062 + IRQ 9 even though it is actually wired to IRQ 11 */
19064 + if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) {
19066 + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11);
19067 + r->set(pirq_router_dev, dev, pirq, 11);
19070 + /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
19071 + if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
19074 + dev->irq = r->get(pirq_router_dev, dev, pirq);
19075 + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
19079 + * Find the best IRQ to assign: use the one
19080 + * reported by the device if possible.
19082 + newirq = dev->irq;
19083 + if (newirq && !((1 << newirq) & mask)) {
19084 + if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
19085 + else printk("\n" KERN_WARNING
19086 + "PCI: IRQ %i for device %s doesn't match PIRQ mask "
19087 + "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
19090 + if (!newirq && assign) {
19091 + for (i = 0; i < 16; i++) {
19092 + if (!(mask & (1 << i)))
19094 + if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
19098 + DBG(" -> newirq=%d", newirq);
19100 + /* Check if it is hardcoded */
19101 + if ((pirq & 0xf0) == 0xf0) {
19102 + irq = pirq & 0xf;
19103 + DBG(" -> hardcoded IRQ %d\n", irq);
19104 + msg = "Hardcoded";
19105 + } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
19106 + ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
19107 + DBG(" -> got IRQ %d\n", irq);
19109 + eisa_set_level_irq(irq);
19110 + } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
19111 + DBG(" -> assigning IRQ %d", newirq);
19112 + if (r->set(pirq_router_dev, dev, pirq, newirq)) {
19113 + eisa_set_level_irq(newirq);
19114 + DBG(" ... OK\n");
19115 + msg = "Assigned";
19121 + DBG(" ... failed\n");
19122 + if (newirq && mask == (1 << newirq)) {
19128 + printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
19130 + /* Update IRQ for all devices with the same pirq value */
19131 + while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
19132 + pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
19136 + info = pirq_get_info(dev2);
19139 + if (info->irq[pin].link == pirq) {
19140 + /* We refuse to override the dev->irq information. Give a warning! */
19141 + if ( dev2->irq && dev2->irq != irq && \
19142 + (!(pci_probe & PCI_USE_PIRQ_MASK) || \
19143 + ((1 << dev2->irq) & mask)) ) {
19144 +#ifndef CONFIG_PCI_MSI
19145 + printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
19146 + pci_name(dev2), dev2->irq, irq);
19151 + pirq_penalty[irq]++;
19153 + printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
19159 +static void __init pcibios_fixup_irqs(void)
19161 + struct pci_dev *dev = NULL;
19164 + DBG(KERN_DEBUG "PCI: IRQ fixup\n");
19165 + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
19167 + * If the BIOS has set an out-of-range IRQ number, just ignore it.
19168 + * Also keep track of which IRQs are already in use.
19170 + if (dev->irq >= 16) {
19171 + DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
19174 + /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
19175 + if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
19176 + pirq_penalty[dev->irq] = 0;
19177 + pirq_penalty[dev->irq]++;
19181 + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
19182 + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
19183 +#ifdef CONFIG_X86_IO_APIC
19185 + * Recalculate IRQ numbers if we use the I/O APIC.
19187 + if (io_apic_assign_pci_irqs)
19192 + pin--; /* interrupt pins are numbered starting from 1 */
19193 + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
19195 + * Busses behind bridges are typically not listed in the MP-table.
19196 + * In this case we have to look up the IRQ based on the parent bus,
19197 + * parent slot, and pin number. The SMP code detects such bridged
19198 + * busses itself so we should get into this branch reliably.
19200 + if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
19201 + struct pci_dev * bridge = dev->bus->self;
19203 + pin = (pin + PCI_SLOT(dev->devfn)) % 4;
19204 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
19205 + PCI_SLOT(bridge->devfn), pin);
19207 + printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
19208 + pci_name(bridge), 'A' + pin, irq);
19211 + if (use_pci_vector() &&
19212 + !platform_legacy_irq(irq))
19213 + irq = IO_APIC_VECTOR(irq);
19215 + printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
19216 + pci_name(dev), 'A' + pin, irq);
19223 + * Still no IRQ? Try to look one up...
19225 + if (pin && !dev->irq)
19226 + pcibios_lookup_irq(dev, 0);
19231 + * Work around broken HP Pavilion Notebooks which assign USB to
19232 + * IRQ 9 even though it is actually wired to IRQ 11
19234 +static int __init fix_broken_hp_bios_irq9(struct dmi_system_id *d)
19236 + if (!broken_hp_bios_irq9) {
19237 + broken_hp_bios_irq9 = 1;
19238 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
19244 + * Work around broken Acer TravelMate 360 Notebooks which assign
19245 + * Cardbus to IRQ 11 even though it is actually wired to IRQ 10
19247 +static int __init fix_acer_tm360_irqrouting(struct dmi_system_id *d)
19249 + if (!acer_tm360_irqrouting) {
19250 + acer_tm360_irqrouting = 1;
19251 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
19256 +static struct dmi_system_id __initdata pciirq_dmi_table[] = {
19258 + .callback = fix_broken_hp_bios_irq9,
19259 + .ident = "HP Pavilion N5400 Series Laptop",
19261 + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
19262 + DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
19263 + DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
19264 + DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
19268 + .callback = fix_acer_tm360_irqrouting,
19269 + .ident = "Acer TravelMate 36x Laptop",
19271 + DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
19272 + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
19278 +static int __init pcibios_irq_init(void)
19280 + DBG(KERN_DEBUG "PCI: IRQ init\n");
19282 + if (pcibios_enable_irq || raw_pci_ops == NULL)
19285 + dmi_check_system(pciirq_dmi_table);
19287 + pirq_table = pirq_find_routing_table();
19289 +#ifdef CONFIG_PCI_BIOS
19290 + if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN))
19291 + pirq_table = pcibios_get_irq_routing_table();
19293 + if (pirq_table) {
19294 + pirq_peer_trick();
19295 + pirq_find_router(&pirq_router);
19296 + if (pirq_table->exclusive_irqs) {
19298 + for (i=0; i<16; i++)
19299 + if (!(pirq_table->exclusive_irqs & (1 << i)))
19300 + pirq_penalty[i] += 100;
19302 + /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
19303 + if (io_apic_assign_pci_irqs)
19304 + pirq_table = NULL;
19307 + pcibios_enable_irq = pirq_enable_irq;
19309 + pcibios_fixup_irqs();
19313 +subsys_initcall(pcibios_irq_init);
19316 +static void pirq_penalize_isa_irq(int irq, int active)
19319 + * If any ISAPnP device reports an IRQ in its list of possible
19320 + * IRQs, we try to avoid assigning it to PCI devices.
19324 + pirq_penalty[irq] += 1000;
19326 + pirq_penalty[irq] += 100;
19330 +void pcibios_penalize_isa_irq(int irq, int active)
19332 +#ifdef CONFIG_ACPI
19334 + acpi_penalize_isa_irq(irq, active);
19337 + pirq_penalize_isa_irq(irq, active);
19340 +static int pirq_enable_irq(struct pci_dev *dev)
19343 + struct pci_dev *temp_dev;
19345 + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
19346 + if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
19349 + pin--; /* interrupt pins are numbered starting from 1 */
19351 + if (io_apic_assign_pci_irqs) {
19354 + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
19356 + * Busses behind bridges are typically not listed in the MP-table.
19357 + * In this case we have to look up the IRQ based on the parent bus,
19358 + * parent slot, and pin number. The SMP code detects such bridged
19359 + * busses itself so we should get into this branch reliably.
19362 + while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
19363 + struct pci_dev * bridge = dev->bus->self;
19365 + pin = (pin + PCI_SLOT(dev->devfn)) % 4;
19366 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
19367 + PCI_SLOT(bridge->devfn), pin);
19369 + printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
19370 + pci_name(bridge), 'A' + pin, irq);
19375 +#ifdef CONFIG_PCI_MSI
19376 + if (!platform_legacy_irq(irq))
19377 + irq = IO_APIC_VECTOR(irq);
19379 + printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
19380 + pci_name(dev), 'A' + pin, irq);
19384 + msg = " Probably buggy MP table.";
19385 + } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
19388 + msg = " Please try using pci=biosirq.";
19390 + /* With IDE legacy devices the IRQ lookup failure is not a problem... */
19391 + if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
19394 + printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
19395 + 'A' + pin, pci_name(dev), msg);
19400 +int pci_vector_resources(int last, int nr_released)
19402 + int count = nr_released;
19405 + int offset = (last % 8);
19407 + while (next < FIRST_SYSTEM_VECTOR) {
19409 +#ifdef CONFIG_X86_64
19410 + if (next == IA32_SYSCALL_VECTOR)
19413 + if (next == SYSCALL_VECTOR)
19417 + if (next >= FIRST_SYSTEM_VECTOR) {
19419 + next = FIRST_DEVICE_VECTOR + offset;
19429 Index: head-2008-11-25/arch/x86/pci/pcifront.c
19430 ===================================================================
19431 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
19432 +++ head-2008-11-25/arch/x86/pci/pcifront.c 2007-06-12 13:12:49.000000000 +0200
19435 + * PCI Frontend Stub - puts some "dummy" functions into the Linux x86 PCI core
19436 + * to support the Xen PCI Frontend's operation
19438 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
19440 +#include <linux/module.h>
19441 +#include <linux/init.h>
19442 +#include <linux/pci.h>
19443 +#include <asm/acpi.h>
19446 +static int pcifront_enable_irq(struct pci_dev *dev)
19449 + pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);
19455 +extern u8 pci_cache_line_size;
19457 +static int __init pcifront_x86_stub_init(void)
19459 + struct cpuinfo_x86 *c = &boot_cpu_data;
19461 + /* Only install our method if we haven't found real hardware already */
19465 + printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
19467 + /* Copied from arch/i386/pci/common.c */
19468 + pci_cache_line_size = 32 >> 2;
19469 + if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
19470 + pci_cache_line_size = 64 >> 2; /* K7 & K8 */
19471 + else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
19472 + pci_cache_line_size = 128 >> 2; /* P4 */
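pci_cache_line_size is kept in 32-bit words, the unit the PCI_CACHE_LINE_SIZE config register expects, hence the ">> 2":

	/* 32-byte lines   -> 32 >> 2  = 8
	 * 64-byte (K7/K8) -> 64 >> 2  = 16
	 * 128-byte (P4)   -> 128 >> 2 = 32 */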
19474 + /* On x86, we need to disable the normal IRQ routing table and
19475 + * just ask the backend
19477 + pcibios_enable_irq = pcifront_enable_irq;
19478 + pcibios_disable_irq = NULL;
19480 +#ifdef CONFIG_ACPI
19481 + /* Keep ACPI out of the picture */
19488 +arch_initcall(pcifront_x86_stub_init);
19489 Index: head-2008-11-25/arch/x86/ia32/ia32entry-xen.S
19490 ===================================================================
19491 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
19492 +++ head-2008-11-25/arch/x86/ia32/ia32entry-xen.S 2008-04-02 12:34:02.000000000 +0200
19495 + * Compatibility mode system call entry point for x86-64.
19497 + * Copyright 2000-2002 Andi Kleen, SuSE Labs.
19500 +#include <asm/dwarf2.h>
19501 +#include <asm/calling.h>
19502 +#include <asm/asm-offsets.h>
19503 +#include <asm/current.h>
19504 +#include <asm/errno.h>
19505 +#include <asm/ia32_unistd.h>
19506 +#include <asm/thread_info.h>
19507 +#include <asm/segment.h>
19508 +#include <asm/vsyscall32.h>
19509 +#include <asm/irqflags.h>
19510 +#include <linux/linkage.h>
19512 +#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
19514 + .macro IA32_ARG_FIXUP noebp=0
19522 + movl %edx,%edx /* zero extension */
19525 + /* clobbers %eax */
19526 + .macro CLEAR_RREGS
19528 + movq %rax,R11(%rsp)
19529 + movq %rax,R10(%rsp)
19530 + movq %rax,R9(%rsp)
19531 + movq %rax,R8(%rsp)
19534 + .macro LOAD_ARGS32 offset
19535 + movl \offset(%rsp),%r11d
19536 + movl \offset+8(%rsp),%r10d
19537 + movl \offset+16(%rsp),%r9d
19538 + movl \offset+24(%rsp),%r8d
19539 + movl \offset+40(%rsp),%ecx
19540 + movl \offset+48(%rsp),%edx
19541 + movl \offset+56(%rsp),%esi
19542 + movl \offset+64(%rsp),%edi
19543 + movl \offset+72(%rsp),%eax
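A reading aid for LOAD_ARGS32, assuming this kernel's pt_regs save-slot order: with \offset pointing at the R11 slot, the loads walk r11 through r8, skip the RAX slot, continue through the argument registers, and finally reload %eax from ORIG_RAX so a syscall number rewritten by the tracer during syscall_trace_enter() takes effect.

	/* \offset+0  -> %r11d    \offset+40 -> %ecx
	 * \offset+8  -> %r10d    \offset+48 -> %edx
	 * \offset+16 -> %r9d     \offset+56 -> %esi
	 * \offset+24 -> %r8d     \offset+64 -> %edi
	 * (RAX slot at +32 skipped)  \offset+72 -> %eax (ORIG_RAX) */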
19546 + .macro CFI_STARTPROC32 simple
19547 + CFI_STARTPROC \simple
19550 + CFI_UNDEFINED r10
19551 + CFI_UNDEFINED r11
19552 + CFI_UNDEFINED r12
19553 + CFI_UNDEFINED r13
19554 + CFI_UNDEFINED r14
19555 + CFI_UNDEFINED r15
19559 + * 32bit SYSENTER instruction entry.
19562 + * %eax System call number.
19568 + * %ebp user stack
19573 + * This is purely a fast path. For anything complicated we use the int 0x80
19574 + * path below. Set up a complete hardware stack frame to share code
19575 + * with the int 0x80 path.
19577 +ENTRY(ia32_sysenter_target)
19578 + CFI_STARTPROC32 simple
19579 + CFI_DEF_CFA rsp,SS+8-RIP+16
19580 + /*CFI_REL_OFFSET ss,SS-RIP+16*/
19581 + CFI_REL_OFFSET rsp,RSP-RIP+16
19582 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
19583 + /*CFI_REL_OFFSET cs,CS-RIP+16*/
19584 + CFI_REL_OFFSET rip,RIP-RIP+16
19585 + CFI_REL_OFFSET r11,8
19586 + CFI_REL_OFFSET rcx,0
19587 + movq 8(%rsp),%r11
19590 + CFI_ADJUST_CFA_OFFSET -8
19592 + movl %ebp,%ebp /* zero extension */
19594 + movl $__USER32_DS,40(%rsp)
19595 + movq %rbp,32(%rsp)
19596 + movl $__USER32_CS,16(%rsp)
19597 + movl $VSYSCALL32_SYSEXIT,8(%rsp)
19601 + /* no need to do an access_ok check here because rbp has been
19602 + 32bit zero extended */
19603 +1: movl (%rbp),%r9d
19604 + .section __ex_table,"a"
19605 + .quad 1b,ia32_badarg
19607 + GET_THREAD_INFO(%r10)
19608 + orl $TS_COMPAT,threadinfo_status(%r10)
19609 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
19610 + jnz sysenter_tracesys
19612 + cmpl $(IA32_NR_syscalls-1),%eax
19615 + call *ia32_sys_call_table(,%rax,8)
19616 + movq %rax,RAX-ARGOFFSET(%rsp)
19617 + jmp int_ret_from_sys_call
19619 +sysenter_tracesys:
19622 + movq $-ENOSYS,RAX(%rsp) /* really needed? */
19623 + movq %rsp,%rdi /* &pt_regs -> arg1 */
19624 + call syscall_trace_enter
19625 + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
19628 + /* no need to do an access_ok check here because rbp has been
19629 + 32bit zero extended */
19630 +1: movl (%rbp),%r9d
19631 + .section __ex_table,"a"
19632 + .quad 1b,ia32_badarg
19634 + jmp sysenter_do_call
19636 +ENDPROC(ia32_sysenter_target)
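
The "1: movl (%rbp),%r9d" / __ex_table pairs above implement a faulting user access with a fixup handler: the sixth syscall argument lives on the 32-bit user stack (no access_ok range check is needed since rbp was zero extended), and a bad pointer must return -EFAULT via ia32_badarg rather than oops. A self-contained C model of the pattern; fetch_arg6 is hypothetical, and the NULL check merely simulates the hardware fault that the exception table would catch:

    #include <stdint.h>
    #include <stdio.h>

    #define EFAULT 14

    /* Model: the faulting load with an __ex_table fixup behaves like a
     * checked fetch that reports -EFAULT instead of trapping.  A NULL
     * user stack pointer here plays the role of the faulting access. */
    static long fetch_arg6(const uint32_t *user_stack, uint32_t *arg6)
    {
        if (!user_stack)
            return -EFAULT;     /* the ia32_badarg path */
        *arg6 = *user_stack;    /* 1: movl (%rbp),%r9d */
        return 0;
    }

    int main(void)
    {
        uint32_t stack[1] = { 42 }, arg6;

        printf("%ld\n", fetch_arg6(stack, &arg6));  /* 0, arg6 == 42 */
        printf("%ld\n", fetch_arg6(NULL, &arg6));   /* -14 */
        return 0;
    }
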
19639 + * 32bit SYSCALL instruction entry.
19642 + * %eax System call number.
19644 + * %ecx return EIP
19648 + * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
19649 + * %esp user stack
19654 + * This is purely a fast path. For anything complicated we use the int 0x80
19655 + * path below. Set up a complete hardware stack frame to share code
19656 + * with the int 0x80 path.
19658 +ENTRY(ia32_cstar_target)
19659 + CFI_STARTPROC32 simple
19660 + CFI_DEF_CFA rsp,SS+8-RIP+16
19661 + /*CFI_REL_OFFSET ss,SS-RIP+16*/
19662 + CFI_REL_OFFSET rsp,RSP-RIP+16
19663 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
19664 + /*CFI_REL_OFFSET cs,CS-RIP+16*/
19665 + CFI_REL_OFFSET rip,RIP-RIP+16
19666 + movl %eax,%eax /* zero extension */
19667 + movl RSP-RIP+16(%rsp),%r8d
19669 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
19670 + movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
19672 + movl $__USER32_CS,CS-ARGOFFSET(%rsp)
19673 + movl $__USER32_DS,SS-ARGOFFSET(%rsp)
19674 + /* no need to do an access_ok check here because r8 has been
19675 + 32bit zero extended */
19676 + /* hardware stack frame is complete now */
19677 +1: movl (%r8),%r9d
19678 + .section __ex_table,"a"
19679 + .quad 1b,ia32_badarg
19681 + GET_THREAD_INFO(%r10)
19682 + orl $TS_COMPAT,threadinfo_status(%r10)
19683 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
19684 + jnz cstar_tracesys
19686 + cmpl $IA32_NR_syscalls-1,%eax
19689 + call *ia32_sys_call_table(,%rax,8)
19690 + movq %rax,RAX-ARGOFFSET(%rsp)
19691 + jmp int_ret_from_sys_call
19696 + movq $-ENOSYS,RAX(%rsp) /* really needed? */
19697 + movq %rsp,%rdi /* &pt_regs -> arg1 */
19698 + call syscall_trace_enter
19699 + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
19701 + movl RSP-ARGOFFSET(%rsp), %r8d
19702 + /* no need to do an access_ok check here because r8 has been
19703 + 32bit zero extended */
19704 +1: movl (%r8),%r9d
19705 + .section __ex_table,"a"
19706 + .quad 1b,ia32_badarg
19708 + jmp cstar_do_call
19709 +END(ia32_cstar_target)
19712 + movq $-EFAULT,%rax
19717 + * Emulated IA32 system calls via int 0x80.
19720 + * %eax System call number.
19726 + * %ebp Arg6 [note: not saved in the stack frame, should not be touched]
19729 + * Uses the same stack frame as the x86-64 version.
19730 + * All registers except %eax must be saved (but ptrace may violate that)
19731 + * Arguments are zero extended. For system calls that want sign extension and
19732 + * take long arguments a wrapper is needed. Most calls can just be called directly.
19734 + * Assumes it is only called from user space and entered with interrupts on.
19737 +ENTRY(ia32_syscall)
19738 + CFI_STARTPROC simple
19739 + CFI_DEF_CFA rsp,SS+8-RIP+16
19740 + /*CFI_REL_OFFSET ss,SS-RIP+16*/
19741 + CFI_REL_OFFSET rsp,RSP-RIP+16
19742 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
19743 + /*CFI_REL_OFFSET cs,CS-RIP+16*/
19744 + CFI_REL_OFFSET rip,RIP-RIP+16
19745 + CFI_REL_OFFSET r11,8
19746 + CFI_REL_OFFSET rcx,0
19747 + movq 8(%rsp),%r11
19750 + CFI_ADJUST_CFA_OFFSET -8
19755 +	/* note the registers are not zero extended to the stack frame.
19756 + this could be a problem. */
19758 + GET_THREAD_INFO(%r10)
19759 + orl $TS_COMPAT,threadinfo_status(%r10)
19760 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
19761 + jnz ia32_tracesys
19763 + cmpl $(IA32_NR_syscalls-1),%eax
19766 + call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
19768 + movq %rax,RAX-ARGOFFSET(%rsp)
19769 + jmp int_ret_from_sys_call
19773 + movq $-ENOSYS,RAX(%rsp) /* really needed? */
19774 + movq %rsp,%rdi /* &pt_regs -> arg1 */
19775 + call syscall_trace_enter
19776 + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
19778 + jmp ia32_do_syscall
19782 + movq $0,ORIG_RAX-ARGOFFSET(%rsp)
19783 + movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
19784 + jmp int_ret_from_sys_call
19787 + movq $-ENOSYS,%rax
19791 + .macro PTREGSCALL label, func, arg
19794 + leaq \func(%rip),%rax
19795 + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
19796 + jmp ia32_ptregs_common
19801 + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
19802 + PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
19803 + PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
19804 + PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
19805 + PTREGSCALL stub32_execve, sys32_execve, %rcx
19806 + PTREGSCALL stub32_fork, sys_fork, %rdi
19807 + PTREGSCALL stub32_clone, sys32_clone, %rdx
19808 + PTREGSCALL stub32_vfork, sys_vfork, %rdi
19809 + PTREGSCALL stub32_iopl, sys_iopl, %rsi
19810 + PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
19812 +ENTRY(ia32_ptregs_common)
19815 + CFI_STARTPROC32 simple
19816 + CFI_DEF_CFA rsp,SS+8-ARGOFFSET
19817 + CFI_REL_OFFSET rax,RAX-ARGOFFSET
19818 + CFI_REL_OFFSET rcx,RCX-ARGOFFSET
19819 + CFI_REL_OFFSET rdx,RDX-ARGOFFSET
19820 + CFI_REL_OFFSET rsi,RSI-ARGOFFSET
19821 + CFI_REL_OFFSET rdi,RDI-ARGOFFSET
19822 + CFI_REL_OFFSET rip,RIP-ARGOFFSET
19823 +/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/
19824 +/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
19825 + CFI_REL_OFFSET rsp,RSP-ARGOFFSET
19826 +/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
19830 + jmp ia32_sysret /* misbalances the return cache */
19832 +END(ia32_ptregs_common)
19834 + .section .rodata,"a"
19836 +ia32_sys_call_table:
19837 + .quad sys_restart_syscall
19839 + .quad stub32_fork
19842 + .quad compat_sys_open /* 5 */
19844 + .quad sys32_waitpid
19847 + .quad sys_unlink /* 10 */
19848 + .quad stub32_execve
19850 + .quad compat_sys_time
19852 + .quad sys_chmod /* 15 */
19853 + .quad sys_lchown16
19854 + .quad quiet_ni_syscall /* old break syscall holder */
19856 + .quad sys32_lseek
19857 + .quad sys_getpid /* 20 */
19858 + .quad compat_sys_mount /* mount */
19859 + .quad sys_oldumount /* old_umount */
19860 + .quad sys_setuid16
19861 + .quad sys_getuid16
19862 + .quad compat_sys_stime /* stime */ /* 25 */
19863 + .quad sys32_ptrace /* ptrace */
19865 + .quad sys_fstat /* (old)fstat */
19867 + .quad compat_sys_utime /* 30 */
19868 + .quad quiet_ni_syscall /* old stty syscall holder */
19869 + .quad quiet_ni_syscall /* old gtty syscall holder */
19872 + .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */
19877 + .quad sys_rmdir /* 40 */
19880 + .quad compat_sys_times
19881 + .quad quiet_ni_syscall /* old prof syscall holder */
19882 + .quad sys_brk /* 45 */
19883 + .quad sys_setgid16
19884 + .quad sys_getgid16
19886 + .quad sys_geteuid16
19887 + .quad sys_getegid16 /* 50 */
19889 + .quad sys_umount /* new_umount */
19890 + .quad quiet_ni_syscall /* old lock syscall holder */
19891 + .quad compat_sys_ioctl
19892 + .quad compat_sys_fcntl64 /* 55 */
19893 + .quad quiet_ni_syscall /* old mpx syscall holder */
19894 + .quad sys_setpgid
19895 + .quad quiet_ni_syscall /* old ulimit syscall holder */
19896 + .quad sys32_olduname
19897 + .quad sys_umask /* 60 */
19899 + .quad sys32_ustat
19901 + .quad sys_getppid
19902 + .quad sys_getpgrp /* 65 */
19904 + .quad sys32_sigaction
19905 + .quad sys_sgetmask
19906 + .quad sys_ssetmask
19907 + .quad sys_setreuid16 /* 70 */
19908 + .quad sys_setregid16
19909 + .quad stub32_sigsuspend
19910 + .quad compat_sys_sigpending
19911 + .quad sys_sethostname
19912 + .quad compat_sys_setrlimit /* 75 */
19913 + .quad compat_sys_old_getrlimit /* old_getrlimit */
19914 + .quad compat_sys_getrusage
19915 + .quad sys32_gettimeofday
19916 + .quad sys32_settimeofday
19917 + .quad sys_getgroups16 /* 80 */
19918 + .quad sys_setgroups16
19919 + .quad sys32_old_select
19920 + .quad sys_symlink
19922 + .quad sys_readlink /* 85 */
19923 +#ifdef CONFIG_IA32_AOUT
19926 + .quad quiet_ni_syscall
19930 + .quad compat_sys_old_readdir
19931 + .quad sys32_mmap /* 90 */
19933 + .quad sys_truncate
19934 + .quad sys_ftruncate
19936 + .quad sys_fchown16 /* 95 */
19937 + .quad sys_getpriority
19938 + .quad sys_setpriority
19939 + .quad quiet_ni_syscall /* old profil syscall holder */
19940 + .quad compat_sys_statfs
19941 + .quad compat_sys_fstatfs /* 100 */
19943 + .quad compat_sys_socketcall
19945 + .quad compat_sys_setitimer
19946 + .quad compat_sys_getitimer /* 105 */
19947 + .quad compat_sys_newstat
19948 + .quad compat_sys_newlstat
19949 + .quad compat_sys_newfstat
19950 + .quad sys32_uname
19951 + .quad stub32_iopl /* 110 */
19952 + .quad sys_vhangup
19953 + .quad quiet_ni_syscall /* old "idle" system call */
19954 + .quad sys32_vm86_warning /* vm86old */
19955 + .quad compat_sys_wait4
19956 + .quad sys_swapoff /* 115 */
19957 + .quad sys32_sysinfo
19960 + .quad stub32_sigreturn
19961 + .quad stub32_clone /* 120 */
19962 + .quad sys_setdomainname
19964 + .quad sys_modify_ldt
19965 + .quad compat_sys_adjtimex
19966 + .quad sys32_mprotect /* 125 */
19967 + .quad compat_sys_sigprocmask
19968 + .quad quiet_ni_syscall /* create_module */
19969 + .quad sys_init_module
19970 + .quad sys_delete_module
19971 + .quad quiet_ni_syscall /* 130 get_kernel_syms */
19972 + .quad sys_quotactl
19973 + .quad sys_getpgid
19975 + .quad quiet_ni_syscall /* bdflush */
19976 + .quad sys_sysfs /* 135 */
19977 + .quad sys_personality
19978 + .quad quiet_ni_syscall /* for afs_syscall */
19979 + .quad sys_setfsuid16
19980 + .quad sys_setfsgid16
19981 + .quad sys_llseek /* 140 */
19982 + .quad compat_sys_getdents
19983 + .quad compat_sys_select
19986 + .quad compat_sys_readv /* 145 */
19987 + .quad compat_sys_writev
19989 + .quad sys_fdatasync
19990 + .quad sys32_sysctl /* sysctl */
19991 + .quad sys_mlock /* 150 */
19992 + .quad sys_munlock
19993 + .quad sys_mlockall
19994 + .quad sys_munlockall
19995 + .quad sys_sched_setparam
19996 + .quad sys_sched_getparam /* 155 */
19997 + .quad sys_sched_setscheduler
19998 + .quad sys_sched_getscheduler
19999 + .quad sys_sched_yield
20000 + .quad sys_sched_get_priority_max
20001 + .quad sys_sched_get_priority_min /* 160 */
20002 + .quad sys_sched_rr_get_interval
20003 + .quad compat_sys_nanosleep
20005 + .quad sys_setresuid16
20006 + .quad sys_getresuid16 /* 165 */
20007 + .quad sys32_vm86_warning /* vm86 */
20008 + .quad quiet_ni_syscall /* query_module */
20010 + .quad compat_sys_nfsservctl
20011 + .quad sys_setresgid16 /* 170 */
20012 + .quad sys_getresgid16
20014 + .quad stub32_rt_sigreturn
20015 + .quad sys32_rt_sigaction
20016 + .quad sys32_rt_sigprocmask /* 175 */
20017 + .quad sys32_rt_sigpending
20018 + .quad compat_sys_rt_sigtimedwait
20019 + .quad sys32_rt_sigqueueinfo
20020 + .quad stub32_rt_sigsuspend
20021 + .quad sys32_pread /* 180 */
20022 + .quad sys32_pwrite
20023 + .quad sys_chown16
20027 + .quad stub32_sigaltstack
20028 + .quad sys32_sendfile
20029 + .quad quiet_ni_syscall /* streams1 */
20030 + .quad quiet_ni_syscall /* streams2 */
20031 + .quad stub32_vfork /* 190 */
20032 + .quad compat_sys_getrlimit
20033 + .quad sys32_mmap2
20034 + .quad sys32_truncate64
20035 + .quad sys32_ftruncate64
20036 + .quad sys32_stat64 /* 195 */
20037 + .quad sys32_lstat64
20038 + .quad sys32_fstat64
20041 + .quad sys_getgid /* 200 */
20042 + .quad sys_geteuid
20043 + .quad sys_getegid
20044 + .quad sys_setreuid
20045 + .quad sys_setregid
20046 + .quad sys_getgroups /* 205 */
20047 + .quad sys_setgroups
20049 + .quad sys_setresuid
20050 + .quad sys_getresuid
20051 + .quad sys_setresgid /* 210 */
20052 + .quad sys_getresgid
20056 + .quad sys_setfsuid /* 215 */
20057 + .quad sys_setfsgid
20058 + .quad sys_pivot_root
20059 + .quad sys_mincore
20060 + .quad sys_madvise
20061 + .quad compat_sys_getdents64 /* 220 getdents64 */
20062 + .quad compat_sys_fcntl64
20063 + .quad quiet_ni_syscall /* tux */
20064 + .quad quiet_ni_syscall /* security */
20066 + .quad sys_readahead /* 225 */
20067 + .quad sys_setxattr
20068 + .quad sys_lsetxattr
20069 + .quad sys_fsetxattr
20070 + .quad sys_getxattr
20071 + .quad sys_lgetxattr /* 230 */
20072 + .quad sys_fgetxattr
20073 + .quad sys_listxattr
20074 + .quad sys_llistxattr
20075 + .quad sys_flistxattr
20076 + .quad sys_removexattr /* 235 */
20077 + .quad sys_lremovexattr
20078 + .quad sys_fremovexattr
20080 + .quad sys_sendfile64
20081 + .quad compat_sys_futex /* 240 */
20082 + .quad compat_sys_sched_setaffinity
20083 + .quad compat_sys_sched_getaffinity
20084 + .quad sys32_set_thread_area
20085 + .quad sys32_get_thread_area
20086 + .quad compat_sys_io_setup /* 245 */
20087 + .quad sys_io_destroy
20088 + .quad compat_sys_io_getevents
20089 + .quad compat_sys_io_submit
20090 + .quad sys_io_cancel
20091 + .quad sys_fadvise64 /* 250 */
20092 + .quad quiet_ni_syscall /* free_huge_pages */
20093 + .quad sys_exit_group
20094 + .quad sys32_lookup_dcookie
20095 + .quad sys_epoll_create
20096 + .quad sys_epoll_ctl /* 255 */
20097 + .quad sys_epoll_wait
20098 + .quad sys_remap_file_pages
20099 + .quad sys_set_tid_address
20100 + .quad compat_sys_timer_create
20101 + .quad compat_sys_timer_settime /* 260 */
20102 + .quad compat_sys_timer_gettime
20103 + .quad sys_timer_getoverrun
20104 + .quad sys_timer_delete
20105 + .quad compat_sys_clock_settime
20106 + .quad compat_sys_clock_gettime /* 265 */
20107 + .quad compat_sys_clock_getres
20108 + .quad compat_sys_clock_nanosleep
20109 + .quad compat_sys_statfs64
20110 + .quad compat_sys_fstatfs64
20111 + .quad sys_tgkill /* 270 */
20112 + .quad compat_sys_utimes
20113 + .quad sys32_fadvise64_64
20114 + .quad quiet_ni_syscall /* sys_vserver */
20116 + .quad compat_sys_get_mempolicy /* 275 */
20117 + .quad sys_set_mempolicy
20118 + .quad compat_sys_mq_open
20119 + .quad sys_mq_unlink
20120 + .quad compat_sys_mq_timedsend
20121 + .quad compat_sys_mq_timedreceive /* 280 */
20122 + .quad compat_sys_mq_notify
20123 + .quad compat_sys_mq_getsetattr
20124 + .quad compat_sys_kexec_load /* reserved for kexec */
20125 + .quad compat_sys_waitid
20126 + .quad quiet_ni_syscall /* 285: sys_altroot */
20127 + .quad sys_add_key
20128 + .quad sys_request_key
20130 + .quad sys_ioprio_set
20131 + .quad sys_ioprio_get /* 290 */
20132 + .quad sys_inotify_init
20133 + .quad sys_inotify_add_watch
20134 + .quad sys_inotify_rm_watch
20135 + .quad sys_migrate_pages
20136 + .quad compat_sys_openat /* 295 */
20137 + .quad sys_mkdirat
20138 + .quad sys_mknodat
20139 + .quad sys_fchownat
20140 + .quad compat_sys_futimesat
20141 + .quad sys32_fstatat /* 300 */
20142 + .quad sys_unlinkat
20143 + .quad sys_renameat
20145 + .quad sys_symlinkat
20146 + .quad sys_readlinkat /* 305 */
20147 + .quad sys_fchmodat
20148 + .quad sys_faccessat
20149 + .quad quiet_ni_syscall /* pselect6 for now */
20150 + .quad quiet_ni_syscall /* ppoll for now */
20151 + .quad sys_unshare /* 310 */
20152 + .quad compat_sys_set_robust_list
20153 + .quad compat_sys_get_robust_list
20155 + .quad sys_sync_file_range
20157 + .quad compat_sys_vmsplice
20158 + .quad compat_sys_move_pages
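
The "call *ia32_sys_call_table(,%rax,8)" dispatch in the entry paths above, together with the "cmpl $(IA32_NR_syscalls-1),%eax" check, is an ordinary bounds-checked indirect call through this array of 8-byte function pointers. A self-contained C model of the dispatch; dispatch, sys_getpid_demo and the one-entry table are illustrative stand-ins for the several hundred .quad entries listed here:

    #include <stdio.h>

    #define ENOSYS 38

    typedef long (*syscall_fn)(long, long, long, long, long, long);

    static long sys_getpid_demo(long a, long b, long c, long d, long e, long f)
    {
        (void)a; (void)b; (void)c; (void)d; (void)e; (void)f;
        return 1234;
    }

    /* One-entry stand-in for ia32_sys_call_table. */
    static const syscall_fn table[] = { sys_getpid_demo };
    #define NR_SYSCALLS (sizeof(table) / sizeof(table[0]))

    static long dispatch(unsigned int nr, const long arg[6])
    {
        if (nr > NR_SYSCALLS - 1)   /* cmpl $(IA32_NR_syscalls-1),%eax */
            return -ENOSYS;
        return table[nr](arg[0], arg[1], arg[2], arg[3], arg[4], arg[5]);
    }

    int main(void)
    {
        long arg[6] = { 0 };

        printf("%ld\n", dispatch(0, arg));   /* 1234 */
        printf("%ld\n", dispatch(99, arg));  /* -38 */
        return 0;
    }
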
20160 Index: head-2008-11-25/arch/x86/kernel/acpi/sleep_64-xen.c
20161 ===================================================================
20162 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
20163 +++ head-2008-11-25/arch/x86/kernel/acpi/sleep_64-xen.c 2008-04-15 09:29:41.000000000 +0200
20166 + * acpi.c - Architecture-Specific Low-Level ACPI Support
20168 + * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
20169 + * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
20170 + * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
20171 + * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
20172 + * Copyright (C) 2003 Pavel Machek, SuSE Labs
20174 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20176 + * This program is free software; you can redistribute it and/or modify
20177 + * it under the terms of the GNU General Public License as published by
20178 + * the Free Software Foundation; either version 2 of the License, or
20179 + * (at your option) any later version.
20181 + * This program is distributed in the hope that it will be useful,
20182 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
20183 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20184 + * GNU General Public License for more details.
20186 + * You should have received a copy of the GNU General Public License
20187 + * along with this program; if not, write to the Free Software
20188 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20190 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20193 +#include <linux/kernel.h>
20194 +#include <linux/init.h>
20195 +#include <linux/types.h>
20196 +#include <linux/stddef.h>
20197 +#include <linux/slab.h>
20198 +#include <linux/pci.h>
20199 +#include <linux/bootmem.h>
20200 +#include <linux/acpi.h>
20201 +#include <linux/cpumask.h>
20203 +#include <asm/mpspec.h>
20204 +#include <asm/io.h>
20205 +#include <asm/apic.h>
20206 +#include <asm/apicdef.h>
20207 +#include <asm/page.h>
20208 +#include <asm/pgtable.h>
20209 +#include <asm/pgalloc.h>
20210 +#include <asm/io_apic.h>
20211 +#include <asm/proto.h>
20212 +#include <asm/tlbflush.h>
20214 +/* --------------------------------------------------------------------------
20215 + Low-Level Sleep Support
20216 + -------------------------------------------------------------------------- */
20218 +#ifdef CONFIG_ACPI_SLEEP
20220 +#ifndef CONFIG_ACPI_PV_SLEEP
20221 +/* address in low memory of the wakeup routine. */
20222 +unsigned long acpi_wakeup_address = 0;
20223 +unsigned long acpi_video_flags;
20224 +extern char wakeup_start, wakeup_end;
20226 +extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
20228 +static pgd_t low_ptr;
20230 +static void init_low_mapping(void)
20232 + pgd_t *slot0 = pgd_offset(current->mm, 0UL);
20233 + low_ptr = *slot0;
20234 + set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET));
20235 + WARN_ON(num_online_cpus() != 1);
20236 + local_flush_tlb();
20241 + * acpi_save_state_mem - save kernel state
20243 + * Create an identity mapped page table and copy the wakeup routine to
20246 +int acpi_save_state_mem(void)
20248 +#ifndef CONFIG_ACPI_PV_SLEEP
20249 + init_low_mapping();
20251 + memcpy((void *)acpi_wakeup_address, &wakeup_start,
20252 + &wakeup_end - &wakeup_start);
20253 + acpi_copy_wakeup_routine(acpi_wakeup_address);
20259 + * acpi_restore_state
20261 +void acpi_restore_state_mem(void)
20263 +#ifndef CONFIG_ACPI_PV_SLEEP
20264 + set_pgd(pgd_offset(current->mm, 0UL), low_ptr);
20265 + local_flush_tlb();
20270 + * acpi_reserve_bootmem - do _very_ early ACPI initialisation
20272 + * We allocate a page in low memory for the wakeup
20273 + * routine for when we come back from a sleep state. The
20274 + * runtime allocator allows specification of <16M pages, but not
20277 +void __init acpi_reserve_bootmem(void)
20279 +#ifndef CONFIG_ACPI_PV_SLEEP
20280 + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
20281 + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE)
20283 + "ACPI: Wakeup code way too big, will crash on attempt to suspend\n");
20287 +#ifndef CONFIG_ACPI_PV_SLEEP
20288 +static int __init acpi_sleep_setup(char *str)
20290 + while ((str != NULL) && (*str != '\0')) {
20291 + if (strncmp(str, "s3_bios", 7) == 0)
20292 + acpi_video_flags = 1;
20293 + if (strncmp(str, "s3_mode", 7) == 0)
20294 + acpi_video_flags |= 2;
20295 + str = strchr(str, ',');
20297 + str += strspn(str, ", \t");
20303 +__setup("acpi_sleep=", acpi_sleep_setup);
20304 +#endif /* CONFIG_ACPI_PV_SLEEP */
20306 +#endif /*CONFIG_ACPI_SLEEP */
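
For reference, the parser above walks a comma-separated list, so booting with acpi_sleep=s3_bios,s3_mode leaves acpi_video_flags == 3 (s3_bios assigns bit 0, s3_mode ORs in bit 1; note the plain assignment means a later s3_bios clobbers an earlier s3_mode). A user-space sketch of the same loop, with the presumably elided NULL guard made explicit:

    #include <stdio.h>
    #include <string.h>

    static unsigned long parse_acpi_sleep(char *str)
    {
        unsigned long flags = 0;

        while (str != NULL && *str != '\0') {
            if (strncmp(str, "s3_bios", 7) == 0)
                flags = 1;
            if (strncmp(str, "s3_mode", 7) == 0)
                flags |= 2;
            str = strchr(str, ',');
            if (str)                        /* skip the delimiters */
                str += strspn(str, ", \t");
        }
        return flags;
    }

    int main(void)
    {
        char arg[] = "s3_bios,s3_mode";

        printf("acpi_video_flags = %lu\n", parse_acpi_sleep(arg)); /* 3 */
        return 0;
    }
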
20308 +void acpi_pci_link_exit(void)
20311 Index: head-2008-11-25/arch/x86/kernel/apic_64-xen.c
20312 ===================================================================
20313 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
20314 +++ head-2008-11-25/arch/x86/kernel/apic_64-xen.c 2007-06-12 13:13:01.000000000 +0200
20317 + * Local APIC handling, local APIC timers
20319 + * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
20322 + * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
20323 + * thanks to Eric Gilmore
20324 + * and Rolf G. Tews
20325 + * for testing these extensively.
20326 + * Maciej W. Rozycki : Various updates and fixes.
20327 + * Mikael Pettersson : Power Management for UP-APIC.
20328 + * Pavel Machek and
20329 + * Mikael Pettersson : PM converted to driver model.
20332 +#include <linux/init.h>
20334 +#include <linux/mm.h>
20335 +#include <linux/delay.h>
20336 +#include <linux/bootmem.h>
20337 +#include <linux/smp_lock.h>
20338 +#include <linux/interrupt.h>
20339 +#include <linux/mc146818rtc.h>
20340 +#include <linux/kernel_stat.h>
20341 +#include <linux/sysdev.h>
20342 +#include <linux/module.h>
20344 +#include <asm/atomic.h>
20345 +#include <asm/smp.h>
20346 +#include <asm/mtrr.h>
20347 +#include <asm/mpspec.h>
20348 +#include <asm/desc.h>
20349 +#include <asm/arch_hooks.h>
20350 +#include <asm/hpet.h>
20351 +#include <asm/idle.h>
20353 +int apic_verbosity;
20356 + * 'what should we do if we get a hw irq event on an illegal vector'.
20357 + * each architecture has to answer this itself.
20359 +void ack_bad_irq(unsigned int irq)
20361 + printk("unexpected IRQ trap at vector %02x\n", irq);
20363 + * Currently unexpected vectors happen only on SMP and APIC.
20364 + * We _must_ ack these because every local APIC has only N
20365 + * irq slots per priority level, and a 'hanging, unacked' IRQ
20366 + * holds up an irq slot - in excessive cases (when multiple
20367 + * unexpected vectors occur) that might lock up the APIC
20369 + * But don't ack when the APIC is disabled. -AK
20371 + if (!disable_apic)
20375 +int setup_profiling_timer(unsigned int multiplier)
20380 +void smp_local_timer_interrupt(struct pt_regs *regs)
20382 + profile_tick(CPU_PROFILING, regs);
20383 +#ifndef CONFIG_XEN
20385 + update_process_times(user_mode(regs));
20389 + * We take the 'long' return path, and there every subsystem
20390 + * grabs the appropriate locks (kernel lock/ irq lock).
20392 + * we might want to decouple profiling from the 'long path',
20393 + * and do the profiling totally in assembly.
20395 + * Currently this isn't too much of an issue (performance-wise);
20396 + * we can take more than 100K local irqs per second on a 100 MHz P5.
20401 + * Local APIC timer interrupt. This is the most natural way for doing
20402 + * local interrupts, but local timer interrupts can be emulated by
20403 + * broadcast interrupts too. [in case the hw doesn't support APIC timers]
20405 + * [ if a single-CPU system runs an SMP kernel then we call the local
20406 + * interrupt as well. Thus we cannot inline the local irq ... ]
20408 +void smp_apic_timer_interrupt(struct pt_regs *regs)
20411 + * the NMI deadlock-detector uses this.
20413 + add_pda(apic_timer_irqs, 1);
20416 + * NOTE! We'd better ACK the irq immediately,
20417 + * because timer handling can be slow.
20421 + * update_process_times() expects us to have done irq_enter().
20422 +	 * Besides, if we don't, timer interrupts ignore the global
20423 + * interrupt lock, which is the WrongThing (tm) to do.
20427 + smp_local_timer_interrupt(regs);
20432 + * This interrupt should _never_ happen with our APIC/SMP architecture
20434 +asmlinkage void smp_spurious_interrupt(void)
20440 + * Check if this really is a spurious interrupt and ACK it
20441 + * if it is a vectored one. Just in case...
20442 + * Spurious interrupts should not be ACKed.
20444 + v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
20445 + if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
20449 + static unsigned long last_warning;
20450 + static unsigned long skipped;
20452 + /* see sw-dev-man vol 3, chapter 7.4.13.5 */
20453 + if (time_before(last_warning+30*HZ,jiffies)) {
20454 + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n",
20455 + smp_processor_id(), skipped);
20456 + last_warning = jiffies;
20466 + * This interrupt should never happen with our APIC/SMP architecture
20469 +asmlinkage void smp_error_interrupt(void)
20471 + unsigned int v, v1;
20475 + /* First tickle the hardware, only then report what went on. -- REW */
20476 + v = apic_read(APIC_ESR);
20477 + apic_write(APIC_ESR, 0);
20478 + v1 = apic_read(APIC_ESR);
20480 + atomic_inc(&irq_err_count);
20482 + /* Here is what the APIC error bits mean:
20484 + 1: Receive CS error
20485 + 2: Send accept error
20486 + 3: Receive accept error
20488 + 5: Send illegal vector
20489 + 6: Received illegal vector
20490 + 7: Illegal register address
20492 + printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
20493 +	        smp_processor_id(), v, v1);
20500 + * This initializes the IO-APIC and APIC hardware if this is
20503 +int __init APIC_init_uniprocessor (void)
20505 +#ifdef CONFIG_X86_IO_APIC
20506 + if (smp_found_config)
20507 + if (!skip_ioapic_setup && nr_ioapics)
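
The ISR probe in smp_spurious_interrupt above relies on the local APIC register layout: the 256-bit in-service bitmap is spread across eight 32-bit registers placed 0x10 apart, so "(vector & ~0x1f) >> 1" turns 32*(vector/32) into the (vector/32)*0x10 register offset and "vector & 0x1f" selects the bit. A small self-contained check, with isr_slot a hypothetical helper and APIC_ISR taken at its conventional 0x100 offset:

    #include <stdio.h>

    #define APIC_ISR 0x100  /* base offset of the in-service registers */

    /* The 256 ISR bits live in eight 32-bit registers at 0x10 strides:
     * vector v sits in register v/32 at bit v%32. */
    static void isr_slot(unsigned vector, unsigned *reg, unsigned *bit)
    {
        *reg = APIC_ISR + ((vector & ~0x1fu) >> 1);
        *bit = vector & 0x1f;
    }

    int main(void)
    {
        unsigned reg, bit;

        isr_slot(0xff, &reg, &bit);  /* a typical spurious vector */
        printf("register offset 0x%x, bit %u\n", reg, bit); /* 0x170, 31 */
        return 0;
    }
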
20513 Index: head-2008-11-25/arch/x86/kernel/e820_64-xen.c
20514 ===================================================================
20515 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
20516 +++ head-2008-11-25/arch/x86/kernel/e820_64-xen.c 2008-04-22 19:56:27.000000000 +0200
20519 + * Handle the memory map.
20520 + * The functions here do the job until bootmem takes over.
20522 + * Getting sanitize_e820_map() in sync with i386 version by applying change:
20523 + * - Provisions for empty E820 memory regions (reported by certain BIOSes).
20524 + * Alex Achenbach <xela@slit.de>, December 2002.
20525 + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
20528 +#include <linux/kernel.h>
20529 +#include <linux/types.h>
20530 +#include <linux/init.h>
20531 +#include <linux/bootmem.h>
20532 +#include <linux/ioport.h>
20533 +#include <linux/string.h>
20534 +#include <linux/kexec.h>
20535 +#include <linux/module.h>
20537 +#include <asm/pgtable.h>
20538 +#include <asm/page.h>
20539 +#include <asm/e820.h>
20540 +#include <asm/proto.h>
20541 +#include <asm/bootsetup.h>
20542 +#include <asm/sections.h>
20543 +#include <xen/interface/memory.h>
20546 + * PFN of last memory page.
20548 +unsigned long end_pfn;
20549 +EXPORT_SYMBOL(end_pfn);
20552 + * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
20553 + * The direct mapping extends to end_pfn_map, so that we can directly access
20554 + * apertures, ACPI and other tables without having to play with fixmaps.
20556 +unsigned long end_pfn_map;
20559 + * Last pfn which the user wants to use.
20561 +unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;
20563 +extern struct resource code_resource, data_resource;
20566 +extern struct e820map machine_e820;
20569 +/* Check for some hardcoded bad areas that early boot is not allowed to touch */
20570 +static inline int bad_addr(unsigned long *addrp, unsigned long size)
20572 + unsigned long addr = *addrp, last = addr + size;
20574 +#ifndef CONFIG_XEN
20575 + /* various gunk below that needed for SMP startup */
20576 + if (addr < 0x8000) {
20581 + /* direct mapping tables of the kernel */
20582 + if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
20583 + *addrp = table_end << PAGE_SHIFT;
20588 +#ifdef CONFIG_BLK_DEV_INITRD
20589 + if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
20590 + addr < INITRD_START+INITRD_SIZE) {
20591 + *addrp = INITRD_START + INITRD_SIZE;
20595 + /* kernel code + 640k memory hole (later should not be needed, but
20596 + be paranoid for now) */
20597 + if (last >= 640*1024 && addr < 1024*1024) {
20598 + *addrp = 1024*1024;
20601 + if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) {
20602 + *addrp = __pa_symbol(&_end);
20606 + if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
20607 + *addrp = ebda_addr + ebda_size;
20611 + /* XXX ramdisk image here? */
20613 + if (last < (table_end<<PAGE_SHIFT)) {
20614 + *addrp = table_end << PAGE_SHIFT;
20622 + * This function checks if any part of the range <start,end> is mapped
20625 +int e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
20629 +#ifndef CONFIG_XEN
20630 + for (i = 0; i < e820.nr_map; i++) {
20631 + struct e820entry *ei = &e820.map[i];
20633 + if (!is_initial_xendomain())
20635 + for (i = 0; i < machine_e820.nr_map; i++) {
20636 + const struct e820entry *ei = &machine_e820.map[i];
20639 + if (type && ei->type != type)
20641 + if (ei->addr >= end || ei->addr + ei->size <= start)
20647 +EXPORT_SYMBOL_GPL(e820_any_mapped);
20650 + * This function checks if the entire range <start,end> is mapped with type.
20652 + * Note: this function only works correctly if the e820 table is sorted and
20653 + * non-overlapping, which is the case
20655 +int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
20659 +#ifndef CONFIG_XEN
20660 + for (i = 0; i < e820.nr_map; i++) {
20661 + struct e820entry *ei = &e820.map[i];
20663 + if (!is_initial_xendomain())
20665 + for (i = 0; i < machine_e820.nr_map; i++) {
20666 + const struct e820entry *ei = &machine_e820.map[i];
20669 + if (type && ei->type != type)
20671 +		/* does the region (or part of it) overlap the current region? */
20672 + if (ei->addr >= end || ei->addr + ei->size <= start)
20675 +		/* if the region covers the beginning of <start,end>, move
20676 +		 * start to the end of the region, since coverage holds up to there
20678 + if (ei->addr <= start)
20679 + start = ei->addr + ei->size;
20680 + /* if start is now at or beyond end, we're done, full coverage */
20681 + if (start >= end)
20682 + return 1; /* we're done */
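
Because the table is sorted and non-overlapping, full coverage can be decided in one pass by sliding start forward, exactly as above. A stripped-down user-space model of that walk; all_mapped and struct range are illustrative stand-ins for e820_all_mapped and struct e820entry, and the type filter is omitted for brevity:

    #include <stdio.h>

    struct range { unsigned long addr, size; };

    static int all_mapped(const struct range *map, int n,
                          unsigned long start, unsigned long end)
    {
        int i;

        for (i = 0; i < n; i++) {
            const struct range *ei = &map[i];

            if (ei->addr >= end || ei->addr + ei->size <= start)
                continue;                     /* disjoint from <start,end> */
            if (ei->addr <= start)
                start = ei->addr + ei->size;  /* covered up to here */
            if (start >= end)
                return 1;                     /* full coverage */
        }
        return 0;
    }

    int main(void)
    {
        struct range map[] = { { 0x0, 0x1000 }, { 0x1000, 0x1000 } };

        printf("%d\n", all_mapped(map, 2, 0x800, 0x1800)); /* 1 */
        printf("%d\n", all_mapped(map, 2, 0x800, 0x3000)); /* 0 */
        return 0;
    }
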
20688 + * Find a free area in a specific range.
20690 +unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
20693 + for (i = 0; i < e820.nr_map; i++) {
20694 + struct e820entry *ei = &e820.map[i];
20695 + unsigned long addr = ei->addr, last;
20696 + if (ei->type != E820_RAM)
20698 + if (addr < start)
20700 + if (addr > ei->addr + ei->size)
20702 + while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
20704 + last = addr + size;
20705 + if (last > ei->addr + ei->size)
20715 + * Free bootmem based on the e820 table for a node.
20717 +void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
20720 + for (i = 0; i < e820.nr_map; i++) {
20721 + struct e820entry *ei = &e820.map[i];
20722 + unsigned long last, addr;
20724 + if (ei->type != E820_RAM ||
20725 + ei->addr+ei->size <= start ||
20729 + addr = round_up(ei->addr, PAGE_SIZE);
20730 + if (addr < start)
20733 + last = round_down(ei->addr + ei->size, PAGE_SIZE);
20737 + if (last > addr && last-addr >= PAGE_SIZE)
20738 + free_bootmem_node(pgdat, addr, last-addr);
20743 + * Find the highest page frame number we have available
20745 +unsigned long __init e820_end_of_ram(void)
20748 + unsigned long end_pfn = 0;
20750 + for (i = 0; i < e820.nr_map; i++) {
20751 + struct e820entry *ei = &e820.map[i];
20752 + unsigned long start, end;
20754 + start = round_up(ei->addr, PAGE_SIZE);
20755 + end = round_down(ei->addr + ei->size, PAGE_SIZE);
20756 + if (start >= end)
20758 + if (ei->type == E820_RAM) {
20759 + if (end > end_pfn<<PAGE_SHIFT)
20760 + end_pfn = end>>PAGE_SHIFT;
20762 + if (end > end_pfn_map<<PAGE_SHIFT)
20763 + end_pfn_map = end>>PAGE_SHIFT;
20767 + if (end_pfn > end_pfn_map)
20768 + end_pfn_map = end_pfn;
20769 + if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
20770 + end_pfn_map = MAXMEM>>PAGE_SHIFT;
20771 + if (end_pfn > end_user_pfn)
20772 + end_pfn = end_user_pfn;
20773 + if (end_pfn > end_pfn_map)
20774 + end_pfn = end_pfn_map;
20780 + * Compute how much memory is missing in a range.
20781 + * Unlike the other functions in this file, the arguments are in page numbers.
20783 +unsigned long __init
20784 +e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
20786 + unsigned long ram = 0;
20787 + unsigned long start = start_pfn << PAGE_SHIFT;
20788 + unsigned long end = end_pfn << PAGE_SHIFT;
20790 + for (i = 0; i < e820.nr_map; i++) {
20791 + struct e820entry *ei = &e820.map[i];
20792 + unsigned long last, addr;
20794 + if (ei->type != E820_RAM ||
20795 + ei->addr+ei->size <= start ||
20799 + addr = round_up(ei->addr, PAGE_SIZE);
20800 + if (addr < start)
20803 + last = round_down(ei->addr + ei->size, PAGE_SIZE);
20808 + ram += last - addr;
20810 + return ((end - start) - ram) >> PAGE_SHIFT;
20814 + * Mark e820 reserved areas as busy for the resource manager.
20816 +void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
20819 + for (i = 0; i < nr_map; i++) {
20820 + struct resource *res;
20821 + res = alloc_bootmem_low(sizeof(struct resource));
20822 + switch (e820[i].type) {
20823 + case E820_RAM: res->name = "System RAM"; break;
20824 + case E820_ACPI: res->name = "ACPI Tables"; break;
20825 + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
20826 + default: res->name = "reserved";
20828 + res->start = e820[i].addr;
20829 + res->end = res->start + e820[i].size - 1;
20830 + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
20831 + request_resource(&iomem_resource, res);
20832 + if (e820[i].type == E820_RAM) {
20834 + * We don't know which RAM region contains kernel data,
20835 + * so we try it repeatedly and let the resource manager
20838 +#ifndef CONFIG_XEN
20839 + request_resource(res, &code_resource);
20840 + request_resource(res, &data_resource);
20842 +#ifdef CONFIG_KEXEC
20843 + if (crashk_res.start != crashk_res.end)
20844 + request_resource(res, &crashk_res);
20846 + xen_machine_kexec_register_resources(res);
20854 + * Add a memory region to the kernel e820 map.
20856 +void __init add_memory_region(unsigned long start, unsigned long size, int type)
20858 + int x = e820.nr_map;
20860 + if (x == E820MAX) {
20861 + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
20865 + e820.map[x].addr = start;
20866 + e820.map[x].size = size;
20867 + e820.map[x].type = type;
20871 +void __init e820_print_map(char *who)
20875 + for (i = 0; i < e820.nr_map; i++) {
20876 + printk(" %s: %016Lx - %016Lx ", who,
20877 + (unsigned long long) e820.map[i].addr,
20878 + (unsigned long long) (e820.map[i].addr + e820.map[i].size));
20879 + switch (e820.map[i].type) {
20880 + case E820_RAM: printk("(usable)\n");
20882 + case E820_RESERVED:
20883 + printk("(reserved)\n");
20886 + printk("(ACPI data)\n");
20889 + printk("(ACPI NVS)\n");
20891 + default: printk("type %u\n", e820.map[i].type);
20898 + * Sanitize the BIOS e820 map.
20900 + * Some e820 responses include overlapping entries. The following
20901 + * replaces the original e820 map with a new one, removing overlaps.
20904 +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
20906 + struct change_member {
20907 + struct e820entry *pbios; /* pointer to original bios entry */
20908 + unsigned long long addr; /* address for this change point */
20910 + static struct change_member change_point_list[2*E820MAX] __initdata;
20911 + static struct change_member *change_point[2*E820MAX] __initdata;
20912 + static struct e820entry *overlap_list[E820MAX] __initdata;
20913 + static struct e820entry new_bios[E820MAX] __initdata;
20914 + struct change_member *change_tmp;
20915 + unsigned long current_type, last_type;
20916 + unsigned long long last_addr;
20917 + int chgidx, still_changing;
20918 + int overlap_entries;
20919 + int new_bios_entry;
20920 + int old_nr, new_nr, chg_nr;
20924 + Visually we're performing the following (1,2,3,4 = memory types)...
20926 + Sample memory map (w/overlaps):
20927 + ____22__________________
20928 + ______________________4_
20929 + ____1111________________
20930 + _44_____________________
20931 + 11111111________________
20932 + ____________________33__
20933 + ___________44___________
20934 + __________33333_________
20935 + ______________22________
20936 + ___________________2222_
20937 + _________111111111______
20938 + _____________________11_
20939 + _________________4______
20941 + Sanitized equivalent (no overlap):
20942 + 1_______________________
20943 + _44_____________________
20944 + ___1____________________
20945 + ____22__________________
20946 + ______11________________
20947 + _________1______________
20948 + __________3_____________
20949 + ___________44___________
20950 + _____________33_________
20951 + _______________2________
20952 + ________________1_______
20953 + _________________4______
20954 + ___________________2____
20955 + ____________________33__
20956 + ______________________4_
20959 + /* if there's only one memory region, don't bother */
20960 + if (*pnr_map < 2)
20963 + old_nr = *pnr_map;
20965 + /* bail out if we find any unreasonable addresses in bios map */
20966 + for (i=0; i<old_nr; i++)
20967 + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
20970 + /* create pointers for initial change-point information (for sorting) */
20971 + for (i=0; i < 2*old_nr; i++)
20972 + change_point[i] = &change_point_list[i];
20974 + /* record all known change-points (starting and ending addresses),
20975 + omitting those that are for empty memory regions */
20977 + for (i=0; i < old_nr; i++) {
20978 + if (biosmap[i].size != 0) {
20979 + change_point[chgidx]->addr = biosmap[i].addr;
20980 + change_point[chgidx++]->pbios = &biosmap[i];
20981 + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
20982 + change_point[chgidx++]->pbios = &biosmap[i];
20987 + /* sort change-point list by memory addresses (low -> high) */
20988 + still_changing = 1;
20989 + while (still_changing) {
20990 + still_changing = 0;
20991 + for (i=1; i < chg_nr; i++) {
20992 + /* if <current_addr> > <last_addr>, swap */
20993 + /* or, if current=<start_addr> & last=<end_addr>, swap */
20994 + if ((change_point[i]->addr < change_point[i-1]->addr) ||
20995 + ((change_point[i]->addr == change_point[i-1]->addr) &&
20996 + (change_point[i]->addr == change_point[i]->pbios->addr) &&
20997 + (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
21000 + change_tmp = change_point[i];
21001 + change_point[i] = change_point[i-1];
21002 + change_point[i-1] = change_tmp;
21003 + still_changing=1;
21008 + /* create a new bios memory map, removing overlaps */
21009 + overlap_entries=0; /* number of entries in the overlap table */
21010 + new_bios_entry=0; /* index for creating new bios map entries */
21011 + last_type = 0; /* start with undefined memory type */
21012 + last_addr = 0; /* start with 0 as last starting address */
21013 +	/* loop through change-points, determining the effect on the new bios map */
21014 + for (chgidx=0; chgidx < chg_nr; chgidx++)
21016 + /* keep track of all overlapping bios entries */
21017 + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
21019 + /* add map entry to overlap list (> 1 entry implies an overlap) */
21020 + overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
21024 + /* remove entry from list (order independent, so swap with last) */
21025 + for (i=0; i<overlap_entries; i++)
21027 + if (overlap_list[i] == change_point[chgidx]->pbios)
21028 + overlap_list[i] = overlap_list[overlap_entries-1];
21030 + overlap_entries--;
21032 + /* if there are overlapping entries, decide which "type" to use */
21033 + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
21034 + current_type = 0;
21035 + for (i=0; i<overlap_entries; i++)
21036 + if (overlap_list[i]->type > current_type)
21037 + current_type = overlap_list[i]->type;
21038 + /* continue building up new bios map based on this information */
21039 + if (current_type != last_type) {
21040 + if (last_type != 0) {
21041 + new_bios[new_bios_entry].size =
21042 + change_point[chgidx]->addr - last_addr;
21043 + /* move forward only if the new size was non-zero */
21044 + if (new_bios[new_bios_entry].size != 0)
21045 + if (++new_bios_entry >= E820MAX)
21046 + break; /* no more space left for new bios entries */
21048 + if (current_type != 0) {
21049 + new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
21050 + new_bios[new_bios_entry].type = current_type;
21051 + last_addr=change_point[chgidx]->addr;
21053 + last_type = current_type;
21056 + new_nr = new_bios_entry; /* retain count for new bios entries */
21058 + /* copy new bios mapping into original location */
21059 + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
21060 + *pnr_map = new_nr;
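
As a concrete run of this pass: given the overlapping input (0 to 640k, type 1) and (512k to 1M, type 2), the change points sort to 0, 512k, 640k, 1M; inside 512k-640k both entries overlap and the larger type wins, so the sanitized output is (0 to 512k, type 1) followed by (512k to 1M, type 2), with the usable region trimmed rather than left ambiguous.
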
21066 + * Copy the BIOS e820 map into a safe place.
21068 + * Sanity-check it while we're at it.
21070 + * If we're lucky and live on a modern system, the setup code
21071 + * will have given us a memory map that we can use to properly
21072 + * set up memory. If we aren't, we'll fake a memory map.
21074 + * We check to see that the memory map contains at least 2 elements
21075 + * before we'll use it, because the detection code in setup.S may
21076 + * not be perfect and almost every PC known to man has two memory
21077 + * regions: one from 0 to 640k, and one from 1mb up. (The IBM
21078 + * thinkpad 560x, for example, does not cooperate with the memory
21079 + * detection code.)
21081 +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
21083 +#ifndef CONFIG_XEN
21084 + /* Only one memory region (or negative)? Ignore it */
21088 + BUG_ON(nr_map < 1);
21092 + unsigned long start = biosmap->addr;
21093 + unsigned long size = biosmap->size;
21094 + unsigned long end = start + size;
21095 + unsigned long type = biosmap->type;
21097 + /* Overflow in 64 bits? Ignore the memory map. */
21101 +#ifndef CONFIG_XEN
21103 + * Some BIOSes claim RAM in the 640k - 1M region.
21104 + * Not right. Fix it up.
21106 + * This should be removed on Hammer which is supposed to not
21107 + * have non e820 covered ISA mappings there, but I had some strange
21108 + * problems so it stays for now. -AK
21110 + if (type == E820_RAM) {
21111 + if (start < 0x100000ULL && end > 0xA0000ULL) {
21112 + if (start < 0xA0000ULL)
21113 + add_memory_region(start, 0xA0000ULL-start, type);
21114 + if (end <= 0x100000ULL)
21116 + start = 0x100000ULL;
21117 + size = end - start;
21122 + add_memory_region(start, size, type);
21123 + } while (biosmap++,--nr_map);
21126 + if (is_initial_xendomain()) {
21127 + struct xen_memory_map memmap;
21129 + memmap.nr_entries = E820MAX;
21130 + set_xen_guest_handle(memmap.buffer, machine_e820.map);
21132 + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
21134 + machine_e820.nr_map = memmap.nr_entries;
21136 + machine_e820 = e820;
21142 +#ifndef CONFIG_XEN
21143 +void __init setup_memory_region(void)
21145 + char *who = "BIOS-e820";
21148 + * Try to copy the BIOS-supplied E820-map.
21150 + * Otherwise fake a memory map; one section from 0k->640k,
21151 + * the next section from 1mb->appropriate_mem_k
21153 + sanitize_e820_map(E820_MAP, &E820_MAP_NR);
21154 + if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
21155 + unsigned long mem_size;
21157 + /* compare results from other methods and take the greater */
21158 + if (ALT_MEM_K < EXT_MEM_K) {
21159 + mem_size = EXT_MEM_K;
21162 + mem_size = ALT_MEM_K;
21163 + who = "BIOS-e801";
21167 + add_memory_region(0, LOWMEMSIZE(), E820_RAM);
21168 + add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
21170 + printk(KERN_INFO "BIOS-provided physical RAM map:\n");
21171 + e820_print_map(who);
21174 +#else /* CONFIG_XEN */
21176 +void __init setup_memory_region(void)
21179 + struct xen_memory_map memmap;
21181 + * This is rather large for a stack variable but this early in
21182 +	 * the boot process we know we have plenty of slack space.
21184 + struct e820entry map[E820MAX];
21186 + memmap.nr_entries = E820MAX;
21187 + set_xen_guest_handle(memmap.buffer, map);
21189 + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
21190 + if ( rc == -ENOSYS ) {
21191 + memmap.nr_entries = 1;
21192 + map[0].addr = 0ULL;
21193 + map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
21194 + /* 8MB slack (to balance backend allocations). */
21195 + map[0].size += 8 << 20;
21196 + map[0].type = E820_RAM;
21201 + sanitize_e820_map(map, (char *)&memmap.nr_entries);
21203 +	BUG_ON(copy_e820_map(map, memmap.nr_entries) < 0);
21205 + printk(KERN_INFO "BIOS-provided physical RAM map:\n");
21206 + e820_print_map("Xen");
21210 +void __init parse_memopt(char *p, char **from)
21213 + unsigned long current_end;
21214 + unsigned long end;
21216 + end_user_pfn = memparse(p, from);
21217 + end_user_pfn >>= PAGE_SHIFT;
21219 + end = end_user_pfn<<PAGE_SHIFT;
21220 + i = e820.nr_map-1;
21221 + current_end = e820.map[i].addr + e820.map[i].size;
21223 + if (current_end < end) {
21225 + * The e820 map ends before our requested size so
21226 + * extend the final entry to the requested address.
21228 + if (e820.map[i].type == E820_RAM)
21229 + e820.map[i].size = end - e820.map[i].addr;
21231 + add_memory_region(current_end, end - current_end, E820_RAM);
21235 +void __init parse_memmapopt(char *p, char **from)
21237 + unsigned long long start_at, mem_size;
21239 + mem_size = memparse(p, from);
21242 + start_at = memparse(p+1, from);
21243 + add_memory_region(start_at, mem_size, E820_RAM);
21244 + } else if (*p == '#') {
21245 + start_at = memparse(p+1, from);
21246 + add_memory_region(start_at, mem_size, E820_ACPI);
21247 + } else if (*p == '$') {
21248 + start_at = memparse(p+1, from);
21249 + add_memory_region(start_at, mem_size, E820_RESERVED);
21251 + end_user_pfn = (mem_size >> PAGE_SHIFT);
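
For reference, the three delimiters parsed above correspond to the usual boot-option forms: memmap=64M@1G adds a RAM region, memmap=4K#0xfffe0000 marks ACPI data, memmap=16M$2G punches a reserved hole, and a bare memmap=512M falls through to the final branch and merely clamps end_user_pfn.
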
21256 +unsigned long pci_mem_start = 0xaeedbabe;
21257 +EXPORT_SYMBOL(pci_mem_start);
21260 + * Search for the biggest gap in the low 32 bits of the e820
21261 + * memory space. We pass this space to PCI to assign MMIO resources
21262 + * for hotplug or unconfigured devices.
21263 + * Hopefully the BIOS left enough space.
21265 +__init void e820_setup_gap(struct e820entry *e820, int nr_map)
21267 + unsigned long gapstart, gapsize, round;
21268 + unsigned long last;
21272 + last = 0x100000000ull;
21273 + gapstart = 0x10000000;
21274 + gapsize = 0x400000;
21276 + while (--i >= 0) {
21277 + unsigned long long start = e820[i].addr;
21278 + unsigned long long end = start + e820[i].size;
21281 + * Since "last" is at most 4GB, we know we'll
21282 + * fit in 32 bits if this condition is true
21284 + if (last > end) {
21285 + unsigned long gap = last - end;
21287 + if (gap > gapsize) {
21293 + if (start < last)
21298 + gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
21299 + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
21300 + KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
21304 + * See how much we want to round up: start off with
21305 + * rounding to the next 1MB area.
21307 + round = 0x100000;
21308 + while ((gapsize >> 4) > round)
21310 + /* Fun with two's complement */
21311 + pci_mem_start = (gapstart + round) & -round;
21313 + printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
21314 + pci_mem_start, gapstart, gapsize);
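
The rounding at the end picks a power-of-two granularity a bit above one sixteenth of the gap and aligns the start up to it; "& -round" works because the two's complement of a power of two is exactly the alignment mask. A self-contained check with illustrative numbers:

    #include <stdio.h>

    int main(void)
    {
        unsigned long gapstart = 0xe0000000UL; /* illustrative gap */
        unsigned long gapsize  = 0x18000000UL; /* 384MB wide */
        unsigned long round = 0x100000;        /* start at 1MB */

        while ((gapsize >> 4) > round)
            round += round;                    /* next power of two */

        /* Fun with two's complement: -round is the alignment mask. */
        printf("pci_mem_start = 0x%lx\n", (gapstart + round) & -round);
        /* round settles at 32MB; prints 0xe2000000 */
        return 0;
    }
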
21316 Index: head-2008-11-25/arch/x86/kernel/early_printk-xen.c
21317 ===================================================================
21318 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
21319 +++ head-2008-11-25/arch/x86/kernel/early_printk-xen.c 2007-06-12 13:13:01.000000000 +0200
21321 +#include <linux/console.h>
21322 +#include <linux/kernel.h>
21323 +#include <linux/init.h>
21324 +#include <linux/string.h>
21325 +#include <linux/screen_info.h>
21326 +#include <asm/io.h>
21327 +#include <asm/processor.h>
21328 +#include <asm/fcntl.h>
21330 +/* Simple VGA output */
21333 +#include <asm/setup.h>
21334 +#define VGABASE (__ISA_IO_base + 0xb8000)
21336 +#include <asm/bootsetup.h>
21337 +#define VGABASE ((void __iomem *)0xffffffff800b8000UL)
21340 +#ifndef CONFIG_XEN
21341 +static int max_ypos = 25, max_xpos = 80;
21342 +static int current_ypos = 25, current_xpos = 0;
21344 +static void early_vga_write(struct console *con, const char *str, unsigned n)
21349 + while ((c = *str++) != '\0' && n-- > 0) {
21350 + if (current_ypos >= max_ypos) {
21351 + /* scroll 1 line up */
21352 + for (k = 1, j = 0; k < max_ypos; k++, j++) {
21353 + for (i = 0; i < max_xpos; i++) {
21354 + writew(readw(VGABASE+2*(max_xpos*k+i)),
21355 + VGABASE + 2*(max_xpos*j + i));
21358 + for (i = 0; i < max_xpos; i++)
21359 + writew(0x720, VGABASE + 2*(max_xpos*j + i));
21360 + current_ypos = max_ypos-1;
21363 + current_xpos = 0;
21365 + } else if (c != '\r') {
21366 + writew(((0x7 << 8) | (unsigned short) c),
21367 + VGABASE + 2*(max_xpos*current_ypos +
21368 + current_xpos++));
21369 + if (current_xpos >= max_xpos) {
21370 + current_xpos = 0;
21377 +static struct console early_vga_console = {
21378 + .name = "earlyvga",
21379 + .write = early_vga_write,
21380 + .flags = CON_PRINTBUFFER,
21384 +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
21386 +static int early_serial_base = 0x3f8; /* ttyS0 */
21388 +#define XMTRDY 0x20
21392 +#define TXR 0 /* Transmit register (WRITE) */
21393 +#define RXR 0 /* Receive register (READ) */
21394 +#define IER 1 /* Interrupt Enable */
21395 +#define IIR 2 /* Interrupt ID */
21396 +#define FCR 2 /* FIFO control */
21397 +#define LCR 3 /* Line control */
21398 +#define MCR 4 /* Modem control */
21399 +#define LSR 5 /* Line Status */
21400 +#define MSR 6 /* Modem Status */
21401 +#define DLL 0 /* Divisor Latch Low */
21402 +#define DLH 1 /* Divisor latch High */
21404 +static int early_serial_putc(unsigned char ch)
21406 + unsigned timeout = 0xffff;
21407 + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
21409 + outb(ch, early_serial_base + TXR);
21410 + return timeout ? 0 : -1;
21413 +static void early_serial_write(struct console *con, const char *s, unsigned n)
21415 + while (*s && n-- > 0) {
21416 + early_serial_putc(*s);
21418 + early_serial_putc('\r');
21423 +#define DEFAULT_BAUD 9600
21425 +static __init void early_serial_init(char *s)
21428 + unsigned divisor;
21429 + unsigned baud = DEFAULT_BAUD;
21437 + if (!strncmp(s,"0x",2)) {
21438 + early_serial_base = simple_strtoul(s, &e, 16);
21440 + static int bases[] = { 0x3f8, 0x2f8 };
21442 + if (!strncmp(s,"ttyS",4))
21444 + port = simple_strtoul(s, &e, 10);
21445 + if (port > 1 || s == e)
21447 + early_serial_base = bases[port];
21449 + s += strcspn(s, ",");
21454 + outb(0x3, early_serial_base + LCR); /* 8n1 */
21455 + outb(0, early_serial_base + IER); /* no interrupt */
21456 + outb(0, early_serial_base + FCR); /* no fifo */
21457 + outb(0x3, early_serial_base + MCR); /* DTR + RTS */
21460 + baud = simple_strtoul(s, &e, 0);
21461 + if (baud == 0 || s == e)
21462 + baud = DEFAULT_BAUD;
21465 + divisor = 115200 / baud;
21466 + c = inb(early_serial_base + LCR);
21467 + outb(c | DLAB, early_serial_base + LCR);
21468 + outb(divisor & 0xff, early_serial_base + DLL);
21469 + outb((divisor >> 8) & 0xff, early_serial_base + DLH);
21470 + outb(c & ~DLAB, early_serial_base + LCR);
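
The divisor programming above follows the standard 16550 convention: with the usual 1.8432MHz/16 = 115200 reference clock, the divisor is 115200/baud, written as DLL (low byte) and DLH (high byte) while DLAB is set in LCR. A quick arithmetic check:

    #include <stdio.h>

    int main(void)
    {
        unsigned baud = 9600;               /* DEFAULT_BAUD */
        unsigned divisor = 115200 / baud;

        printf("divisor = %u (DLL = 0x%02x, DLH = 0x%02x)\n",
               divisor, divisor & 0xff, (divisor >> 8) & 0xff);
        /* 9600 baud -> divisor 12: DLL = 0x0c, DLH = 0x00 */
        return 0;
    }
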
21473 +#else /* CONFIG_XEN */
21476 +early_serial_write(struct console *con, const char *s, unsigned count)
21480 + while (count > 0) {
21481 + n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s);
21489 +static __init void early_serial_init(char *s)
21494 + * No early VGA console on Xen, as we do not have convenient ISA-space
21495 + * mappings. Someone should fix this for domain 0. For now, use fake serial.
21497 +#define early_vga_console early_serial_console
21501 +static struct console early_serial_console = {
21502 + .name = "earlyser",
21503 + .write = early_serial_write,
21504 + .flags = CON_PRINTBUFFER,
21508 +/* Console interface to a host file on AMD's SimNow! */
21510 +static int simnow_fd;
21513 + MAGIC1 = 0xBACCD00A,
21514 + MAGIC2 = 0xCA110000,
21519 +static noinline long simnow(long cmd, long a, long b, long c)
21522 + asm volatile("cpuid" :
21524 + "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
21528 +void __init simnow_init(char *str)
21530 + char *fn = "klog";
21533 + /* error ignored */
21534 + simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
21537 +static void simnow_write(struct console *con, const char *s, unsigned n)
21539 + simnow(XWRITE, simnow_fd, (unsigned long)s, n);
21542 +static struct console simnow_console = {
21543 + .name = "simnow",
21544 + .write = simnow_write,
21545 + .flags = CON_PRINTBUFFER,
21549 +/* Direct interface for emergencies */
21550 +struct console *early_console = &early_vga_console;
21551 +static int early_console_initialized = 0;
21553 +void early_printk(const char *fmt, ...)
21559 + va_start(ap,fmt);
21560 + n = vscnprintf(buf,512,fmt,ap);
21561 + early_console->write(early_console,buf,n);
21565 +static int __initdata keep_early;
21567 +int __init setup_early_printk(char *opt)
21572 + if (early_console_initialized)
21575 + strlcpy(buf,opt,sizeof(buf));
21576 + space = strchr(buf, ' ');
21580 + if (strstr(buf,"keep"))
21583 + if (!strncmp(buf, "serial", 6)) {
21584 + early_serial_init(buf + 6);
21585 + early_console = &early_serial_console;
21586 + } else if (!strncmp(buf, "ttyS", 4)) {
21587 + early_serial_init(buf);
21588 + early_console = &early_serial_console;
21589 + } else if (!strncmp(buf, "vga", 3)
21590 +#ifndef CONFIG_XEN
21591 + && SCREEN_INFO.orig_video_isVGA == 1) {
21592 + max_xpos = SCREEN_INFO.orig_video_cols;
21593 + max_ypos = SCREEN_INFO.orig_video_lines;
21594 + current_ypos = SCREEN_INFO.orig_y;
21596 + || !strncmp(buf, "xen", 3)) {
21598 + early_console = &early_vga_console;
21599 + } else if (!strncmp(buf, "simnow", 6)) {
21600 + simnow_init(buf + 6);
21601 + early_console = &simnow_console;
21604 + early_console_initialized = 1;
21605 + register_console(early_console);
21609 +void __init disable_early_printk(void)
21611 + if (!early_console_initialized || !early_console)
21613 + if (!keep_early) {
21614 + printk("disabling early console\n");
21615 + unregister_console(early_console);
21616 + early_console_initialized = 0;
21618 + printk("keeping early console\n");
21622 +__setup("earlyprintk=", setup_early_printk);
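
Typical invocations accepted by the parser above: earlyprintk=serial,ttyS0,115200 keep (UART at the ttyS0 base, console kept registered after the real consoles come up), earlyprintk=vga on native hardware, earlyprintk=xen for the hypervisor console, and earlyprintk=simnow on AMD's simulator.
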
21623 Index: head-2008-11-25/arch/x86/kernel/entry_64-xen.S
21624 ===================================================================
21625 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
21626 +++ head-2008-11-25/arch/x86/kernel/entry_64-xen.S 2008-10-29 09:55:56.000000000 +0100
21629 + * linux/arch/x86_64/entry.S
21631 + * Copyright (C) 1991, 1992 Linus Torvalds
21632 + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
21633 + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
21637 + * Jun Nakajima <jun.nakajima@intel.com>
21638 + * Asit Mallick <asit.k.mallick@intel.com>
21639 + * Modified for Xen
21643 + * entry.S contains the system-call and fault low-level handling routines.
21645 + * NOTE: This code handles signal-recognition, which happens every time
21646 + * after an interrupt and after each system call.
21648 + * Normal syscalls and interrupts don't save a full stack frame; this is
21649 + * only done for syscall tracing, signals, or fork/exec et al.
21651 + * A note on terminology:
21652 + * - top of stack: Architecture defined interrupt frame from SS to RIP
21653 + * at the top of the kernel process stack.
21654 + * - partial stack frame: partially saved registers up to R11.
21655 + * - full stack frame: Like partial stack frame, but all register saved.
21658 + * - schedule it carefully for the final hardware.
21661 +#define ASSEMBLY 1
21662 +#include <linux/linkage.h>
21663 +#include <asm/segment.h>
21664 +#include <asm/smp.h>
21665 +#include <asm/cache.h>
21666 +#include <asm/errno.h>
21667 +#include <asm/dwarf2.h>
21668 +#include <asm/calling.h>
21669 +#include <asm/asm-offsets.h>
21670 +#include <asm/msr.h>
21671 +#include <asm/unistd.h>
21672 +#include <asm/thread_info.h>
21673 +#include <asm/hw_irq.h>
21674 +#include <asm/page.h>
21675 +#include <asm/irqflags.h>
21676 +#include <asm/errno.h>
21677 +#include <xen/interface/arch-x86_64.h>
21678 +#include <xen/interface/features.h>
21680 +#include "xen_entry.S"
21684 +#ifndef CONFIG_PREEMPT
21685 +#define retint_kernel retint_restore_args
21689 +.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
21690 +#ifdef CONFIG_TRACE_IRQFLAGS
21691 + bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
21698 +NMI_MASK = 0x80000000
21701 + * C code is not supposed to know about undefined top of stack. Every time
21702 + * a C function with a pt_regs argument is called from the SYSCALL-based
21703 + * fast path, FIXUP_TOP_OF_STACK is needed.
21704 + * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
21708 + /* %rsp:at FRAMEEND */
21709 + .macro FIXUP_TOP_OF_STACK tmp
21710 + movq $__USER_CS,CS(%rsp)
21711 + movq $-1,RCX(%rsp)
21714 + .macro RESTORE_TOP_OF_STACK tmp,offset=0
21717 + .macro FAKE_STACK_FRAME child_rip
21718 + /* push in order ss, rsp, eflags, cs, rip */
21720 + pushq %rax /* ss */
21721 + CFI_ADJUST_CFA_OFFSET 8
21722 + /*CFI_REL_OFFSET ss,0*/
21723 + pushq %rax /* rsp */
21724 + CFI_ADJUST_CFA_OFFSET 8
21725 + CFI_REL_OFFSET rsp,0
21726 + pushq $(1<<9) /* eflags - interrupts on */
21727 + CFI_ADJUST_CFA_OFFSET 8
21728 + /*CFI_REL_OFFSET rflags,0*/
21729 + pushq $__KERNEL_CS /* cs */
21730 + CFI_ADJUST_CFA_OFFSET 8
21731 + /*CFI_REL_OFFSET cs,0*/
21732 + pushq \child_rip /* rip */
21733 + CFI_ADJUST_CFA_OFFSET 8
21734 + CFI_REL_OFFSET rip,0
21735 + pushq %rax /* orig rax */
21736 + CFI_ADJUST_CFA_OFFSET 8
21739 + .macro UNFAKE_STACK_FRAME
21741 + CFI_ADJUST_CFA_OFFSET -(6*8)
21744 + .macro CFI_DEFAULT_STACK start=1,adj=0
21746 + CFI_STARTPROC simple
21747 + CFI_DEF_CFA rsp,SS+8 - \adj*ARGOFFSET
21749 + CFI_DEF_CFA_OFFSET SS+8 - \adj*ARGOFFSET
21752 + CFI_REL_OFFSET r15,R15
21753 + CFI_REL_OFFSET r14,R14
21754 + CFI_REL_OFFSET r13,R13
21755 + CFI_REL_OFFSET r12,R12
21756 + CFI_REL_OFFSET rbp,RBP
21757 + CFI_REL_OFFSET rbx,RBX
21759 + CFI_REL_OFFSET r11,R11 - \adj*ARGOFFSET
21760 + CFI_REL_OFFSET r10,R10 - \adj*ARGOFFSET
21761 + CFI_REL_OFFSET r9,R9 - \adj*ARGOFFSET
21762 + CFI_REL_OFFSET r8,R8 - \adj*ARGOFFSET
21763 + CFI_REL_OFFSET rax,RAX - \adj*ARGOFFSET
21764 + CFI_REL_OFFSET rcx,RCX - \adj*ARGOFFSET
21765 + CFI_REL_OFFSET rdx,RDX - \adj*ARGOFFSET
21766 + CFI_REL_OFFSET rsi,RSI - \adj*ARGOFFSET
21767 + CFI_REL_OFFSET rdi,RDI - \adj*ARGOFFSET
21768 + CFI_REL_OFFSET rip,RIP - \adj*ARGOFFSET
21769 + /*CFI_REL_OFFSET cs,CS - \adj*ARGOFFSET*/
21770 + /*CFI_REL_OFFSET rflags,EFLAGS - \adj*ARGOFFSET*/
21771 + CFI_REL_OFFSET rsp,RSP - \adj*ARGOFFSET
21772 + /*CFI_REL_OFFSET ss,SS - \adj*ARGOFFSET*/
21776 + * Must be consistent with the definition in arch-x86/xen-x86_64.h:
21777 + * struct iret_context {
21778 + * u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
21780 + * with rax, r11, and rcx being taken care of in the hypercall stub.
21782 + .macro HYPERVISOR_IRET flag
21783 + testb $3,1*8(%rsp)
21785 + testl $NMI_MASK,2*8(%rsp)
21788 + cmpb $0,(xen_features+XENFEAT_supervisor_mode_kernel)(%rip)
21791 + /* Direct iret to kernel space. Correct CS and SS. */
21796 +2: /* Slow iret via hypervisor. */
21797 + andl $~NMI_MASK, 2*8(%rsp)
21799 + jmp hypercall_page + (__HYPERVISOR_iret * 32)
21803 + * A newly forked process directly context switches into this.
21806 +ENTRY(ret_from_fork)
21807 + CFI_DEFAULT_STACK
21808 + push kernel_eflags(%rip)
21809 + CFI_ADJUST_CFA_OFFSET 8
21810 + popf # reset kernel eflags
21811 + CFI_ADJUST_CFA_OFFSET -8
21812 + call schedule_tail
21813 + GET_THREAD_INFO(%rcx)
21814 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
21818 + testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
21819 + je int_ret_from_sys_call
21820 + testl $_TIF_IA32,threadinfo_flags(%rcx)
21821 + jnz int_ret_from_sys_call
21822 + RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
21823 + jmp ret_from_sys_call
21826 + call syscall_trace_leave
21827 + GET_THREAD_INFO(%rcx)
21830 +END(ret_from_fork)
21833 + * initial frame state for interrupts and exceptions
21835 + .macro _frame ref
21836 + CFI_STARTPROC simple
21837 + CFI_DEF_CFA rsp,SS+8-\ref
21838 + /*CFI_REL_OFFSET ss,SS-\ref*/
21839 + CFI_REL_OFFSET rsp,RSP-\ref
21840 + /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
21841 + /*CFI_REL_OFFSET cs,CS-\ref*/
21842 + CFI_REL_OFFSET rip,RIP-\ref
21846 + * System call entry. Up to 6 arguments in registers are supported.
21848 + * SYSCALL does not save anything on the stack and does not change the
21853 + * Register setup:
21854 + * rax system call number
21856 + * rcx return address for syscall/sysret, C arg3
21859 + * r10 arg3 (--> moved to rcx for C)
21862 + * r11 eflags for syscall/sysret, temporary for C
21863 + * r12-r15,rbp,rbx saved by C code, not touched.
21865 + * Interrupts are enabled on entry.
21866 + * Only called from user space.
21868 + * XXX if we had a free scratch register we could save the RSP into the stack frame
21869 + * and report it properly in ps. Unfortunately we don't have one.
21871 + * When the user can change the frame, always force IRET. That is because
21872 + * it deals with uncanonical addresses better. SYSRET has trouble
21873 + * with them due to bugs in both AMD and Intel CPUs.
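The rcx/r11/r10 notes above follow from the SYSCALL instruction itself: the CPU stashes the return RIP in %rcx and RFLAGS in %r11, so neither can carry an argument. The resulting mapping, for reference:

	/* syscall ABI:  rdi, rsi, rdx, r10, r8, r9  (%rcx, %r11 clobbered) */
	/* C ABI:        rdi, rsi, rdx, rcx, r8, r9                        */
	/* hence the movq %r10,%rcx fixup before call *sys_call_table(,%rax,8) */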
21876 +ENTRY(system_call)
21877 + _frame (RIP-0x10)
21879 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
21880 + GET_THREAD_INFO(%rcx)
21881 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
21882 + CFI_REMEMBER_STATE
21884 + cmpq $__NR_syscall_max,%rax
21887 + call *sys_call_table(,%rax,8) # XXX: rip relative
21888 + movq %rax,RAX-ARGOFFSET(%rsp)
21890 + * Syscall return path ending with SYSRET (fast path)
21891 + * Has incomplete stack frame and undefined top of stack.
21893 + .globl ret_from_sys_call
21894 +ret_from_sys_call:
21895 + movl $_TIF_ALLWORK_MASK,%edi
21896 + /* edi: flagmask */
21898 + GET_THREAD_INFO(%rcx)
21899 + XEN_BLOCK_EVENTS(%rsi)
21901 + movl threadinfo_flags(%rcx),%edx
21903 + CFI_REMEMBER_STATE
21904 + jnz sysret_careful
21906 + * sysretq will re-enable interrupts:
21909 + XEN_UNBLOCK_EVENTS(%rsi)
21910 + RESTORE_ARGS 0,8,0
21911 + HYPERVISOR_IRET VGCF_IN_SYSCALL
21913 + /* Handle reschedules */
21914 + /* edx: work, edi: workmask */
21916 + CFI_RESTORE_STATE
21917 + bt $TIF_NEED_RESCHED,%edx
21918 + jnc sysret_signal
21920 + XEN_UNBLOCK_EVENTS(%rsi)
21922 + CFI_ADJUST_CFA_OFFSET 8
21925 + CFI_ADJUST_CFA_OFFSET -8
21928 + /* Handle a signal */
21932 + XEN_UNBLOCK_EVENTS(%rsi)
21933 + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
21936 + /* Really a signal */
21937 + /* edx: work flags (arg3) */
21938 + leaq do_notify_resume(%rip),%rax
21939 + leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
21940 + xorl %esi,%esi # oldset -> arg2
21941 + call ptregscall_common
21942 +1: movl $_TIF_NEED_RESCHED,%edi
21943 + /* Use IRET because user could have changed frame. This
21944 + works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
21945 + XEN_BLOCK_EVENTS(%rsi)
21947 + jmp int_with_check
21950 + movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
21951 + jmp ret_from_sys_call
21953 + /* Do syscall tracing */
21955 + CFI_RESTORE_STATE
21957 + movq $-ENOSYS,RAX(%rsp)
21958 + FIXUP_TOP_OF_STACK %rdi
21960 + call syscall_trace_enter
21961 + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
21963 + cmpq $__NR_syscall_max,%rax
21965 + movq %r10,%rcx /* fixup for C */
21966 + call *sys_call_table(,%rax,8)
21967 +1: movq %rax,RAX-ARGOFFSET(%rsp)
21968 + /* Use IRET because user could have changed frame */
21969 + jmp int_ret_from_sys_call
21974 + * Syscall return path ending with IRET.
21975 + * Has correct top of stack, but partial stack frame.
21977 +ENTRY(int_ret_from_sys_call)
21978 + CFI_STARTPROC simple
21979 + CFI_DEF_CFA rsp,SS+8-ARGOFFSET
21980 + /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/
21981 + CFI_REL_OFFSET rsp,RSP-ARGOFFSET
21982 + /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
21983 + /*CFI_REL_OFFSET cs,CS-ARGOFFSET*/
21984 + CFI_REL_OFFSET rip,RIP-ARGOFFSET
21985 + CFI_REL_OFFSET rdx,RDX-ARGOFFSET
21986 + CFI_REL_OFFSET rcx,RCX-ARGOFFSET
21987 + CFI_REL_OFFSET rax,RAX-ARGOFFSET
21988 + CFI_REL_OFFSET rdi,RDI-ARGOFFSET
21989 + CFI_REL_OFFSET rsi,RSI-ARGOFFSET
21990 + CFI_REL_OFFSET r8,R8-ARGOFFSET
21991 + CFI_REL_OFFSET r9,R9-ARGOFFSET
21992 + CFI_REL_OFFSET r10,R10-ARGOFFSET
21993 + CFI_REL_OFFSET r11,R11-ARGOFFSET
21994 + XEN_BLOCK_EVENTS(%rsi)
21996 + testb $3,CS-ARGOFFSET(%rsp)
21998 + /* Need to set the proper %ss (not NULL) for ring 3 iretq */
21999 + movl $__KERNEL_DS,SS-ARGOFFSET(%rsp)
22000 + jmp retint_restore_args # return from ring3 kernel
22002 + movl $_TIF_ALLWORK_MASK,%edi
22003 + /* edi: mask to check */
22005 + GET_THREAD_INFO(%rcx)
22006 + movl threadinfo_flags(%rcx),%edx
22009 + andl $~TS_COMPAT,threadinfo_status(%rcx)
22010 + jmp retint_restore_args
22012 + /* Either reschedule or signal or syscall exit tracking needed. */
22013 + /* First do a reschedule test. */
22014 + /* edx: work, edi: workmask */
22016 + bt $TIF_NEED_RESCHED,%edx
22017 + jnc int_very_careful
22020 + XEN_UNBLOCK_EVENTS(%rsi)
22022 + CFI_ADJUST_CFA_OFFSET 8
22025 + CFI_ADJUST_CFA_OFFSET -8
22026 + XEN_BLOCK_EVENTS(%rsi)
22028 + jmp int_with_check
22030 + /* handle signals and tracing -- both require a full stack frame */
22034 + XEN_UNBLOCK_EVENTS(%rsi)
22036 + /* Check for syscall exit trace */
22037 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
22040 + CFI_ADJUST_CFA_OFFSET 8
22041 + leaq 8(%rsp),%rdi # &ptregs -> arg1
22042 + call syscall_trace_leave
22044 + CFI_ADJUST_CFA_OFFSET -8
22045 + andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
22046 + XEN_BLOCK_EVENTS(%rsi)
22048 + jmp int_restore_rest
22051 + testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
22053 + movq %rsp,%rdi # &ptregs -> arg1
22054 + xorl %esi,%esi # oldset -> arg2
22055 + call do_notify_resume
22056 +1: movl $_TIF_NEED_RESCHED,%edi
22059 + XEN_BLOCK_EVENTS(%rsi)
22061 + jmp int_with_check
22063 +END(int_ret_from_sys_call)
22066 + * Certain special system calls that need to save a full stack frame.
22069 + .macro PTREGSCALL label,func,arg
22072 + leaq \func(%rip),%rax
22073 + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
22074 + jmp ptregscall_common
22080 + PTREGSCALL stub_clone, sys_clone, %r8
22081 + PTREGSCALL stub_fork, sys_fork, %rdi
22082 + PTREGSCALL stub_vfork, sys_vfork, %rdi
22083 + PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
22084 + PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
22085 + PTREGSCALL stub_iopl, sys_iopl, %rsi
22087 +ENTRY(ptregscall_common)
22089 + CFI_ADJUST_CFA_OFFSET -8
22090 + CFI_REGISTER rip, r11
22093 + CFI_REGISTER rip, r15
22094 + FIXUP_TOP_OF_STACK %r11
22096 + RESTORE_TOP_OF_STACK %r11
22098 + CFI_REGISTER rip, r11
22101 + CFI_ADJUST_CFA_OFFSET 8
22102 + CFI_REL_OFFSET rip, 0
22105 +END(ptregscall_common)
22107 +ENTRY(stub_execve)
22110 + CFI_ADJUST_CFA_OFFSET -8
22111 + CFI_REGISTER rip, r11
22113 + FIXUP_TOP_OF_STACK %r11
22115 + RESTORE_TOP_OF_STACK %r11
22116 + movq %rax,RAX(%rsp)
22118 + jmp int_ret_from_sys_call
22123 + * sigreturn is special because it needs to restore all registers on return.
22124 + * This cannot be done with SYSRET, so use the IRET return path instead.
22126 +ENTRY(stub_rt_sigreturn)
22129 + CFI_ADJUST_CFA_OFFSET -8
22132 + FIXUP_TOP_OF_STACK %r11
22133 + call sys_rt_sigreturn
22134 + movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
22136 + jmp int_ret_from_sys_call
22138 +END(stub_rt_sigreturn)
22140 +/* initial frame state for interrupts (and exceptions without error code) */
22141 +#define INTR_FRAME _frame (RIP-0x10); \
22142 + CFI_REL_OFFSET rcx,0; \
22143 + CFI_REL_OFFSET r11,8
22145 +/* initial frame state for exceptions with error code (and interrupts with
22146 + vector already pushed) */
22147 +#define XCPT_FRAME _frame (RIP-0x18); \
22148 + CFI_REL_OFFSET rcx,0; \
22149 + CFI_REL_OFFSET r11,8
22152 + * Interrupt exit.
22157 + CFI_DEFAULT_STACK adj=1
22158 + movl threadinfo_flags(%rcx),%edx
22160 + CFI_REMEMBER_STATE
22161 + jnz retint_careful
22162 +retint_restore_args:
22163 + movl EFLAGS-REST_SKIP(%rsp), %eax
22164 + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
22165 + XEN_GET_VCPU_INFO(%rsi)
22166 + andb evtchn_upcall_mask(%rsi),%al
22167 + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
22168 + jnz restore_all_enable_events # != 0 => enable event delivery
22169 + XEN_PUT_VCPU_INFO(%rsi)
22171 + RESTORE_ARGS 0,8,0
22172 + HYPERVISOR_IRET 0
22174 + /* edi: workmask, edx: work */
22176 + CFI_RESTORE_STATE
22177 + bt $TIF_NEED_RESCHED,%edx
22178 + jnc retint_signal
22180 + XEN_UNBLOCK_EVENTS(%rsi)
22183 + CFI_ADJUST_CFA_OFFSET 8
22186 + CFI_ADJUST_CFA_OFFSET -8
22187 + GET_THREAD_INFO(%rcx)
22188 + XEN_BLOCK_EVENTS(%rsi)
22194 + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
22195 + jz retint_restore_args
22197 + XEN_UNBLOCK_EVENTS(%rsi)
22199 + movq $-1,ORIG_RAX(%rsp)
22200 + xorl %esi,%esi # oldset
22201 + movq %rsp,%rdi # &pt_regs
22202 + call do_notify_resume
22204 + XEN_BLOCK_EVENTS(%rsi)
22206 + movl $_TIF_NEED_RESCHED,%edi
22207 + GET_THREAD_INFO(%rcx)
22210 +#ifdef CONFIG_PREEMPT
22211 + /* Returning to kernel space. Check if we need preemption */
22212 + /* rcx: threadinfo. interrupts off. */
22215 + cmpl $0,threadinfo_preempt_count(%rcx)
22216 + jnz retint_restore_args
22217 + bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
22218 + jnc retint_restore_args
22219 + bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
22220 + jnc retint_restore_args
22221 + call preempt_schedule_irq
22222 + jmp retint_kernel /* check again */
22228 +#ifndef CONFIG_XEN
22230 + * APIC interrupts.
22232 + .macro apicinterrupt num,func
22235 + CFI_ADJUST_CFA_OFFSET 8
22241 +ENTRY(thermal_interrupt)
22242 + apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
22243 +END(thermal_interrupt)
22245 +ENTRY(threshold_interrupt)
22246 + apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
22247 +END(threshold_interrupt)
22250 +ENTRY(reschedule_interrupt)
22251 + apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
22252 +END(reschedule_interrupt)
22254 + .macro INVALIDATE_ENTRY num
22255 +ENTRY(invalidate_interrupt\num)
22256 + apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
22257 +END(invalidate_interrupt\num)
22260 + INVALIDATE_ENTRY 0
22261 + INVALIDATE_ENTRY 1
22262 + INVALIDATE_ENTRY 2
22263 + INVALIDATE_ENTRY 3
22264 + INVALIDATE_ENTRY 4
22265 + INVALIDATE_ENTRY 5
22266 + INVALIDATE_ENTRY 6
22267 + INVALIDATE_ENTRY 7
22269 +ENTRY(call_function_interrupt)
22270 + apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
22271 +END(call_function_interrupt)
22274 +#ifdef CONFIG_X86_LOCAL_APIC
22275 +ENTRY(apic_timer_interrupt)
22276 + apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
22277 +END(apic_timer_interrupt)
22279 +ENTRY(error_interrupt)
22280 + apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
22281 +END(error_interrupt)
22283 +ENTRY(spurious_interrupt)
22284 + apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
22285 +END(spurious_interrupt)
22287 +#endif /* !CONFIG_XEN */
22290 + * Exception entry points.
22292 + .macro zeroentry sym
22296 + movq 8(%rsp),%r11
22298 + addq $0x10,%rsp /* skip rcx and r11 */
22299 + CFI_ADJUST_CFA_OFFSET -0x10
22300 + pushq $0 /* push error code/oldrax */
22301 + CFI_ADJUST_CFA_OFFSET 8
22302 + pushq %rax /* push real oldrax to the rdi slot */
22303 + CFI_ADJUST_CFA_OFFSET 8
22304 + CFI_REL_OFFSET rax,0
22305 + leaq \sym(%rip),%rax
22310 + .macro errorentry sym
22314 + movq 8(%rsp),%r11
22316 + addq $0x10,%rsp /* rsp points to the error code */
22317 + CFI_ADJUST_CFA_OFFSET -0x10
22319 + CFI_ADJUST_CFA_OFFSET 8
22320 + CFI_REL_OFFSET rax,0
22321 + leaq \sym(%rip),%rax
22326 +#if 0 /* not XEN */
22327 + /* error code is on the stack already */
22328 + /* handle NMI-like exceptions that can happen everywhere */
22329 + .macro paranoidentry sym, ist=0, irqtrace=1
22331 + movq 8(%rsp),%r11
22332 + addq $0x10,%rsp /* skip rcx and r11 */
22335 +#if 0 /* not XEN */
22337 + movl $MSR_GS_BASE,%ecx
22346 + movq %gs:pda_data_offset, %rbp
22349 + movq ORIG_RAX(%rsp),%rsi
22350 + movq $-1,ORIG_RAX(%rsp)
22352 + subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
22356 + addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
22359 + XEN_BLOCK_EVENTS(%rsi)
22366 + * "Paranoid" exit path from exception stack.
22367 + * Paranoid because this is used by NMIs and cannot take
22368 + * any kernel state for granted.
22369 + * We don't do kernel preemption checks here, because only
22370 + * NMI should be common and it does not enable IRQs and
22371 + * cannot get reschedule ticks.
22373 + * "trace" is 0 for the NMI handler only, because irq-tracing
22374 + * is fundamentally NMI-unsafe. (we cannot change the soft and
22375 + * hard flags at once, atomically)
22377 + .macro paranoidexit trace=1
22378 + /* ebx: no swapgs flag */
22379 +paranoid_exit\trace:
22380 + testl %ebx,%ebx /* swapgs needed? */
22381 + jnz paranoid_restore\trace
22382 + testl $3,CS(%rsp)
22383 + jnz paranoid_userspace\trace
22384 +paranoid_swapgs\trace:
22385 + TRACE_IRQS_IRETQ 0
22387 +paranoid_restore\trace:
22390 +paranoid_userspace\trace:
22391 + GET_THREAD_INFO(%rcx)
22392 + movl threadinfo_flags(%rcx),%ebx
22393 + andl $_TIF_WORK_MASK,%ebx
22394 + jz paranoid_swapgs\trace
22395 + movq %rsp,%rdi /* &pt_regs */
22397 + movq %rax,%rsp /* switch stack for scheduling */
22398 + testl $_TIF_NEED_RESCHED,%ebx
22399 + jnz paranoid_schedule\trace
22400 + movl %ebx,%edx /* arg3: thread flags */
22405 + xorl %esi,%esi /* arg2: oldset */
22406 + movq %rsp,%rdi /* arg1: &pt_regs */
22407 + call do_notify_resume
22412 + jmp paranoid_userspace\trace
22413 +paranoid_schedule\trace:
22423 + jmp paranoid_userspace\trace
22429 + * Exception entry point. This expects an error code/orig_rax on the stack
22430 + * and the exception handler in %rax.
22432 +ENTRY(error_entry)
22434 + CFI_REL_OFFSET rax,0
22435 + /* rdi slot contains rax, oldrax contains error code */
22438 + CFI_ADJUST_CFA_OFFSET (14*8)
22439 + movq %rsi,13*8(%rsp)
22440 + CFI_REL_OFFSET rsi,RSI
22441 + movq 14*8(%rsp),%rsi /* load rax from rdi slot */
22442 + CFI_REGISTER rax,rsi
22443 + movq %rdx,12*8(%rsp)
22444 + CFI_REL_OFFSET rdx,RDX
22445 + movq %rcx,11*8(%rsp)
22446 + CFI_REL_OFFSET rcx,RCX
22447 + movq %rsi,10*8(%rsp) /* store rax */
22448 + CFI_REL_OFFSET rax,RAX
22449 + movq %r8, 9*8(%rsp)
22450 + CFI_REL_OFFSET r8,R8
22451 + movq %r9, 8*8(%rsp)
22452 + CFI_REL_OFFSET r9,R9
22453 + movq %r10,7*8(%rsp)
22454 + CFI_REL_OFFSET r10,R10
22455 + movq %r11,6*8(%rsp)
22456 + CFI_REL_OFFSET r11,R11
22457 + movq %rbx,5*8(%rsp)
22458 + CFI_REL_OFFSET rbx,RBX
22459 + movq %rbp,4*8(%rsp)
22460 + CFI_REL_OFFSET rbp,RBP
22461 + movq %r12,3*8(%rsp)
22462 + CFI_REL_OFFSET r12,R12
22463 + movq %r13,2*8(%rsp)
22464 + CFI_REL_OFFSET r13,R13
22465 + movq %r14,1*8(%rsp)
22466 + CFI_REL_OFFSET r14,R14
22468 + CFI_REL_OFFSET r15,R15
22470 + cmpl $__KERNEL_CS,CS(%rsp)
22471 + CFI_REMEMBER_STATE
22472 + je error_kernelspace
22474 +error_call_handler:
22475 + movq %rdi, RDI(%rsp)
22476 + CFI_REL_OFFSET rdi,RDI
22478 + movq ORIG_RAX(%rsp),%rsi # get error code
22479 + movq $-1,ORIG_RAX(%rsp)
22484 + XEN_BLOCK_EVENTS(%rsi)
22486 + GET_THREAD_INFO(%rcx)
22487 + testb $3,CS-ARGOFFSET(%rsp)
22489 + movl threadinfo_flags(%rcx),%edx
22490 + movl $_TIF_WORK_MASK,%edi
22492 + jnz retint_careful
22494 + * The iret might restore flags:
22497 + jmp retint_restore_args
22501 + * We need to rewrite the logic here because we don't do iretq
22502 + * to return to user mode. It's still possible that we get trap/fault
22503 + * in the kernel (when accessing buffers pointed to by system calls,
22507 + CFI_RESTORE_STATE
22508 +error_kernelspace:
22510 + /* There are two places in the kernel that can potentially fault with
22511 + usergs. Handle them here. The exception handlers after
22512 + iret run with kernel gs again, so don't set the user space flag.
22513 + B stepping K8s sometimes report a truncated RIP for IRET
22514 + exceptions returning to compat mode. Check for these here too. */
22515 + leaq iret_label(%rip),%rbp
22516 + cmpq %rbp,RIP(%rsp)
22518 + movl %ebp,%ebp /* zero extend */
22519 + cmpq %rbp,RIP(%rsp)
22521 + cmpq $gs_change,RIP(%rsp)
22528 +ENTRY(hypervisor_callback)
22529 + zeroentry do_hypervisor_callback
22530 +END(hypervisor_callback)
22533 + * Copied from arch/xen/i386/kernel/entry.S
22535 +# A note on the "critical region" in our callback handler.
22536 +# We want to avoid stacking callback handlers due to events occurring
22537 +# during handling of the last event. To do this, we keep events disabled
22538 +# until we've done all processing. HOWEVER, we must enable events before
22539 +# popping the stack frame (can't be done atomically) and so it would still
22540 +# be possible to get enough handler activations to overflow the stack.
22541 +# Although unlikely, bugs of that kind are hard to track down, so we'd
22542 +# like to avoid the possibility.
22543 +# So, on entry to the handler we detect whether we interrupted an
22544 +# existing activation in its critical region -- if so, we pop the current
22545 +# activation and restart the handler using the previous one.
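In rough pseudo-C, the strategy amounts to the following loop (illustrative names only; the assembly between scrit and ecrit below implements the unmask-and-recheck part):

	for (;;) {
		unmask_events(vcpu);		/* the would-be critical region */
		if (!events_pending(vcpu))
			return_via_iret();	/* leave with events enabled */
		mask_events(vcpu);		/* new event arrived: re-enter */
		evtchn_do_upcall(regs);		/* the handler to drain it */
	}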
22546 +ENTRY(do_hypervisor_callback) # do_hypervisor_callback(struct pt_regs *)
22548 +# Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
22549 +# see the correct pointer to the pt_regs
22550 + movq %rdi, %rsp # we don't return, adjust the stack frame
22552 + CFI_DEFAULT_STACK
22553 +11: incl %gs:pda_irqcount
22555 + CFI_DEF_CFA_REGISTER rbp
22556 + cmovzq %gs:pda_irqstackptr,%rsp
22557 + pushq %rbp # backlink for old unwinder
22558 + call evtchn_do_upcall
22560 + CFI_DEF_CFA_REGISTER rsp
22561 + decl %gs:pda_irqcount
22564 +END(do_hypervisor_callback)
22566 +#ifdef CONFIG_X86_LOCAL_APIC
22568 + zeroentry do_nmi_callback
22569 +ENTRY(do_nmi_callback)
22573 + CFI_DEFAULT_STACK
22575 + orl $NMI_MASK,EFLAGS(%rsp)
22577 + XEN_BLOCK_EVENTS(%rsi)
22579 + GET_THREAD_INFO(%rcx)
22580 + jmp retint_restore_args
22587 +restore_all_enable_events:
22588 + CFI_DEFAULT_STACK adj=1
22590 + XEN_UNBLOCK_EVENTS(%rsi) # %rsi is already set up...
22592 +scrit: /**** START OF CRITICAL REGION ****/
22593 + XEN_TEST_PENDING(%rsi)
22594 + CFI_REMEMBER_STATE
22595 + jnz 14f # process more events if necessary...
22596 + XEN_PUT_VCPU_INFO(%rsi)
22597 + RESTORE_ARGS 0,8,0
22598 + HYPERVISOR_IRET 0
22600 + CFI_RESTORE_STATE
22601 +14: XEN_LOCKED_BLOCK_EVENTS(%rsi)
22602 + XEN_PUT_VCPU_INFO(%rsi)
22604 + movq %rsp,%rdi # set the argument again
22607 +ecrit: /**** END OF CRITICAL REGION ****/
22608 +# At this point, unlike on x86-32, we don't do the fixup, both to simplify
22609 +# the code and because the stack frame is more complex on x86-64.
22610 +# When the kernel is interrupted in the critical section, it simply does
22611 +# IRET, and everything is restored at that point, i.e. execution just
22612 +# resumes at the interrupted instruction with the same context.
22614 +# Hypervisor uses this for application faults while it executes.
22615 +# We get here for two reasons:
22616 +# 1. Fault while reloading DS, ES, FS or GS
22617 +# 2. Fault while executing IRET
22618 +# Category 1 we do not need to fix up as Xen has already reloaded all segment
22619 +# registers that could be reloaded and zeroed the others.
22620 +# Category 2 we fix up by killing the current process. We cannot use the
22621 +# normal Linux return path in this case because if we use the IRET hypercall
22622 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
22623 +# We distinguish between categories by comparing each saved segment register
22624 +# with its current contents: any discrepancy means we are in category 1.
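Schematically (pseudo-C; the assembly below performs this with four cmpw instructions against the selectors Xen saved at 0x10..0x28(%rsp), and the names here are placeholders):

	if (saved_ds != DS || saved_es != ES ||
	    saved_fs != FS || saved_gs != GS)
		retry_iret();		/* category 1: Xen fixed the segments */
	else
		kill_task(SIGSEGV);	/* category 2: the IRET itself faults */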
22625 +ENTRY(failsafe_callback)
22626 + _frame (RIP-0x30)
22627 + CFI_REL_OFFSET rcx, 0
22628 + CFI_REL_OFFSET r11, 8
22630 + cmpw %cx,0x10(%rsp)
22631 + CFI_REMEMBER_STATE
22634 + cmpw %cx,0x18(%rsp)
22637 + cmpw %cx,0x20(%rsp)
22640 + cmpw %cx,0x28(%rsp)
22642 + /* All segments match their saved values => Category 2 (Bad IRET). */
22645 + movq 8(%rsp),%r11
22648 + CFI_ADJUST_CFA_OFFSET -0x30
22649 + movq $11,%rdi /* SIGSEGV */
22651 + CFI_RESTORE_STATE
22652 +1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
22655 + movq 8(%rsp),%r11
22658 + CFI_ADJUST_CFA_OFFSET -0x30
22660 + CFI_ADJUST_CFA_OFFSET 8
22665 + .section __ex_table,"a"
22667 + .quad gs_change,bad_gs
22669 + .section .fixup,"ax"
22670 + /* running with kernelgs */
22672 +/* swapgs */ /* switch back to user gs */
22680 + * Create a kernel thread.
22682 + * C extern interface:
22683 + * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
22685 + * asm input arguments:
22686 + * rdi: fn, rsi: arg, rdx: flags
22688 +ENTRY(kernel_thread)
22690 + FAKE_STACK_FRAME $child_rip
22693 + # rdi: flags, rsi: usp, rdx: will be &pt_regs
22695 + orq kernel_thread_flags(%rip),%rdi
22704 + movq %rax,RAX(%rsp)
22708 + * It isn't worth checking for reschedule here,
22709 + * so internally to the x86_64 port you can rely on kernel_thread()
22710 + * not to reschedule the child before returning, this avoids the need
22711 + * of hacks for example to fork off the per-CPU idle tasks.
22712 + * [Hopefully no generic code relies on the reschedule -AK]
22715 + UNFAKE_STACK_FRAME
22718 +ENDPROC(kernel_thread)
22721 + pushq $0 # fake return address
22724 + * Here we are in the child and the registers are set as they were
22725 + * at kernel_thread() invocation in the parent.
22734 +ENDPROC(child_rip)
22737 + * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
22739 + * C extern interface:
22740 + * extern long execve(char *name, char **argv, char **envp)
22742 + * asm input arguments:
22743 + * rdi: name, rsi: argv, rdx: envp
22745 + * We want to fall back into:
22746 + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
22748 + * do_sys_execve asm fallback arguments:
22749 + * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
22753 + FAKE_STACK_FRAME $0
22756 + movq %rax, RAX(%rsp)
22760 + jmp int_ret_from_sys_call
22762 + UNFAKE_STACK_FRAME
22767 +KPROBE_ENTRY(page_fault)
22768 + errorentry do_page_fault
22772 +ENTRY(coprocessor_error)
22773 + zeroentry do_coprocessor_error
22774 +END(coprocessor_error)
22776 +ENTRY(simd_coprocessor_error)
22777 + zeroentry do_simd_coprocessor_error
22778 +END(simd_coprocessor_error)
22780 +ENTRY(device_not_available)
22781 + zeroentry math_state_restore
22782 +END(device_not_available)
22784 + /* runs on exception stack */
22785 +KPROBE_ENTRY(debug)
22788 + CFI_ADJUST_CFA_OFFSET 8 */
22789 + zeroentry do_debug
22796 + /* runs on exception stack */
22800 + CFI_ADJUST_CFA_OFFSET 8
22801 + paranoidentry do_nmi, 0, 0
22802 +#ifdef CONFIG_TRACE_IRQFLAGS
22805 + jmp paranoid_exit1
22812 +KPROBE_ENTRY(int3)
22815 + CFI_ADJUST_CFA_OFFSET 8 */
22816 + zeroentry do_int3
22817 +/* jmp paranoid_exit1
22823 + zeroentry do_overflow
22827 + zeroentry do_bounds
22831 + zeroentry do_invalid_op
22834 +ENTRY(coprocessor_segment_overrun)
22835 + zeroentry do_coprocessor_segment_overrun
22836 +END(coprocessor_segment_overrun)
22839 + zeroentry do_reserved
22843 + /* runs on exception stack */
22844 +ENTRY(double_fault)
22846 + paranoidentry do_double_fault
22847 + jmp paranoid_exit1
22852 +ENTRY(invalid_TSS)
22853 + errorentry do_invalid_TSS
22856 +ENTRY(segment_not_present)
22857 + errorentry do_segment_not_present
22858 +END(segment_not_present)
22860 + /* runs on exception stack */
22861 +ENTRY(stack_segment)
22863 + paranoidentry do_stack_segment */
22864 + errorentry do_stack_segment
22865 +/* jmp paranoid_exit1
22867 +END(stack_segment)
22869 +KPROBE_ENTRY(general_protection)
22870 + errorentry do_general_protection
22871 +END(general_protection)
22874 +ENTRY(alignment_check)
22875 + errorentry do_alignment_check
22876 +END(alignment_check)
22878 +ENTRY(divide_error)
22879 + zeroentry do_divide_error
22882 +ENTRY(spurious_interrupt_bug)
22883 + zeroentry do_spurious_interrupt_bug
22884 +END(spurious_interrupt_bug)
22886 +#ifdef CONFIG_X86_MCE
22887 + /* runs on exception stack */
22888 +ENTRY(machine_check)
22891 + CFI_ADJUST_CFA_OFFSET 8
22892 + paranoidentry do_machine_check
22893 + jmp paranoid_exit1
22895 +END(machine_check)
22898 +/* Call softirq on interrupt stack. Interrupts are off. */
22899 +ENTRY(call_softirq)
22902 + CFI_ADJUST_CFA_OFFSET 8
22903 + CFI_REL_OFFSET rbp,0
22905 + CFI_DEF_CFA_REGISTER rbp
22906 + incl %gs:pda_irqcount
22907 + cmove %gs:pda_irqstackptr,%rsp
22908 + push %rbp # backlink for old unwinder
22909 + call __do_softirq
22911 + CFI_DEF_CFA_REGISTER rsp
22912 + CFI_ADJUST_CFA_OFFSET -8
22913 + decl %gs:pda_irqcount
22916 +ENDPROC(call_softirq)
22918 +#ifdef CONFIG_STACK_UNWIND
22919 +ENTRY(arch_unwind_init_running)
22921 + movq %r15, R15(%rdi)
22922 + movq %r14, R14(%rdi)
22924 + movq %r13, R13(%rdi)
22925 + movq %r12, R12(%rdi)
22927 + movq %rbp, RBP(%rdi)
22928 + movq %rbx, RBX(%rdi)
22929 + movq (%rsp), %rcx
22930 + movq %rax, R11(%rdi)
22931 + movq %rax, R10(%rdi)
22932 + movq %rax, R9(%rdi)
22933 + movq %rax, R8(%rdi)
22934 + movq %rax, RAX(%rdi)
22935 + movq %rax, RCX(%rdi)
22936 + movq %rax, RDX(%rdi)
22937 + movq %rax, RSI(%rdi)
22938 + movq %rax, RDI(%rdi)
22939 + movq %rax, ORIG_RAX(%rdi)
22940 + movq %rcx, RIP(%rdi)
22941 + leaq 8(%rsp), %rcx
22942 + movq $__KERNEL_CS, CS(%rdi)
22943 + movq %rax, EFLAGS(%rdi)
22944 + movq %rcx, RSP(%rdi)
22945 + movq $__KERNEL_DS, SS(%rdi)
22948 +ENDPROC(arch_unwind_init_running)
22950 Index: head-2008-11-25/arch/x86/kernel/genapic_64-xen.c
22951 ===================================================================
22952 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
22953 +++ head-2008-11-25/arch/x86/kernel/genapic_64-xen.c 2007-06-12 13:13:01.000000000 +0200
22956 + * Copyright 2004 James Cleverdon, IBM.
22957 + * Subject to the GNU Public License, v.2
22959 + * Generic APIC sub-arch probe layer.
22961 + * Hacked for x86-64 by James Cleverdon from i386 architecture code by
22962 + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
22963 + * James Cleverdon.
22965 +#include <linux/threads.h>
22966 +#include <linux/cpumask.h>
22967 +#include <linux/string.h>
22968 +#include <linux/kernel.h>
22969 +#include <linux/ctype.h>
22970 +#include <linux/init.h>
22971 +#include <linux/module.h>
22973 +#include <asm/smp.h>
22974 +#include <asm/ipi.h>
22976 +#if defined(CONFIG_ACPI)
22977 +#include <acpi/acpi_bus.h>
22980 +/* which logical CPU number maps to which CPU (physical APIC ID) */
22981 +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
22982 +EXPORT_SYMBOL(x86_cpu_to_apicid);
22983 +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
22985 +extern struct genapic apic_cluster;
22986 +extern struct genapic apic_flat;
22987 +extern struct genapic apic_physflat;
22989 +#ifndef CONFIG_XEN
22990 +struct genapic *genapic = &apic_flat;
22992 +extern struct genapic apic_xen;
22993 +struct genapic *genapic = &apic_xen;
22998 + * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
23000 +void __init clustered_apic_check(void)
23002 +#ifndef CONFIG_XEN
23004 + u8 clusters, max_cluster;
23006 + u8 cluster_cnt[NUM_APIC_CLUSTERS];
23007 + int max_apic = 0;
23009 +#if defined(CONFIG_ACPI)
23011 + * Some x86_64 machines use physical APIC mode regardless of how many
23012 + * procs/clusters are present (x86_64 ES7000 is an example).
23014 + if (acpi_fadt.revision > FADT2_REVISION_ID)
23015 + if (acpi_fadt.force_apic_physical_destination_mode) {
23016 + genapic = &apic_cluster;
23021 + memset(cluster_cnt, 0, sizeof(cluster_cnt));
23022 + for (i = 0; i < NR_CPUS; i++) {
23023 + id = bios_cpu_apicid[i];
23024 + if (id == BAD_APICID)
23026 + if (id > max_apic)
23028 + cluster_cnt[APIC_CLUSTERID(id)]++;
23031 + /* Don't use clustered mode on AMD platforms. */
23032 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
23033 + genapic = &apic_physflat;
23034 +#ifndef CONFIG_HOTPLUG_CPU
23035 + /* In the CPU hotplug case we cannot use broadcast mode
23036 + because that opens a race when a CPU is removed.
23037 + Stay at physflat mode in this case.
23038 + It is bad to do this unconditionally though. Once
23039 + we have ACPI platform support for CPU hotplug
23040 + we should detect hotplug capability from ACPI tables and
23041 + only do this when really needed. -AK */
23042 + if (max_apic <= 8)
23043 + genapic = &apic_flat;
23051 + for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
23052 + if (cluster_cnt[i] > 0) {
23054 + if (cluster_cnt[i] > max_cluster)
23055 + max_cluster = cluster_cnt[i];
23060 + * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
23061 + * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
23062 + * else physical mode.
23063 + * (We don't use lowest priority delivery + HW APIC IRQ steering, so
23064 + * can ignore the clustered logical case and go straight to physical.)
23066 + if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
23067 +#ifdef CONFIG_HOTPLUG_CPU
23068 + /* Don't use APIC shortcuts in CPU hotplug to avoid races */
23069 + genapic = &apic_physflat;
23071 + genapic = &apic_flat;
23074 + genapic = &apic_cluster;
23078 + /* hardcode to xen apic functions */
23079 + genapic = &apic_xen;
23081 + printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
23084 +/* Same for both flat and clustered. */
23087 +extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
23090 +void send_IPI_self(int vector)
23092 +#ifndef CONFIG_XEN
23093 + __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
23095 + xen_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
23098 Index: head-2008-11-25/arch/x86/kernel/genapic_xen_64.c
23099 ===================================================================
23100 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
23101 +++ head-2008-11-25/arch/x86/kernel/genapic_xen_64.c 2007-06-12 13:13:01.000000000 +0200
23104 + * Copyright 2004 James Cleverdon, IBM.
23105 + * Subject to the GNU Public License, v.2
23107 + * Xen APIC subarch code. Maximum 8 CPUs, logical delivery.
23109 + * Hacked for x86-64 by James Cleverdon from i386 architecture code by
23110 + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
23111 + * James Cleverdon.
23113 + * Hacked to pieces for Xen by Chris Wright.
23115 +#include <linux/threads.h>
23116 +#include <linux/cpumask.h>
23117 +#include <linux/string.h>
23118 +#include <linux/kernel.h>
23119 +#include <linux/ctype.h>
23120 +#include <linux/init.h>
23121 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
23122 +#include <asm/smp.h>
23123 +#include <asm/ipi.h>
23125 +#include <asm/apic.h>
23126 +#include <asm/apicdef.h>
23127 +#include <asm/genapic.h>
23129 +#include <xen/evtchn.h>
23131 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
23133 +static inline void __send_IPI_one(unsigned int cpu, int vector)
23135 + int irq = per_cpu(ipi_to_irq, cpu)[vector];
23137 + notify_remote_via_irq(irq);
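There is no APIC behind this: each (cpu, IPI vector) pair is bound to a Xen event channel at CPU bring-up, and the per-CPU ipi_to_irq[] table records the dynamic IRQ of that channel. A hedged sketch of the binding side (the exact bind_ipi_to_irqhandler() signature is an assumption about the evtchn layer, which lives outside this hunk):

	/* at smpboot time, roughly: */
	irq = bind_ipi_to_irqhandler(vector, cpu, handler,
				     SA_INTERRUPT, "ipi", NULL);
	per_cpu(ipi_to_irq, cpu)[vector] = irq;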
23140 +void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
23144 + switch (shortcut) {
23145 + case APIC_DEST_SELF:
23146 + __send_IPI_one(smp_processor_id(), vector);
23148 + case APIC_DEST_ALLBUT:
23149 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
23150 + if (cpu == smp_processor_id())
23152 + if (cpu_isset(cpu, cpu_online_map)) {
23153 + __send_IPI_one(cpu, vector);
23157 + case APIC_DEST_ALLINC:
23158 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
23159 + if (cpu_isset(cpu, cpu_online_map)) {
23160 + __send_IPI_one(cpu, vector);
23165 + printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
23171 +static cpumask_t xen_target_cpus(void)
23173 + return cpu_online_map;
23177 + * Set up the logical destination ID.
23178 + * Do nothing, not called now.
23180 +static void xen_init_apic_ldr(void)
23182 + Dprintk("%s\n", __FUNCTION__);
23186 +static void xen_send_IPI_allbutself(int vector)
23189 + * If there are no other CPUs in the system then we get an
23190 + * APIC send error if we try to broadcast. Thus we have to
23191 + * avoid sending IPIs in this case.
23193 + Dprintk("%s\n", __FUNCTION__);
23194 + if (num_online_cpus() > 1)
23195 + xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
23198 +static void xen_send_IPI_all(int vector)
23200 + Dprintk("%s\n", __FUNCTION__);
23201 + xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
23204 +static void xen_send_IPI_mask(cpumask_t cpumask, int vector)
23206 + unsigned long mask = cpus_addr(cpumask)[0];
23207 + unsigned int cpu;
23208 + unsigned long flags;
23210 + Dprintk("%s\n", __FUNCTION__);
23211 + local_irq_save(flags);
23212 + WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
23214 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
23215 + if (cpu_isset(cpu, cpumask)) {
23216 + __send_IPI_one(cpu, vector);
23219 + local_irq_restore(flags);
23222 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
23223 +static int xen_apic_id_registered(void)
23225 + /* better be set */
23226 + Dprintk("%s\n", __FUNCTION__);
23227 + return physid_isset(smp_processor_id(), phys_cpu_present_map);
23231 +static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
23233 + Dprintk("%s\n", __FUNCTION__);
23234 + return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
23237 +static unsigned int phys_pkg_id(int index_msb)
23241 + Dprintk("%s\n", __FUNCTION__);
23242 + ebx = cpuid_ebx(1);
23243 + return ((ebx >> 24) & 0xFF) >> index_msb;
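CPUID leaf 1 reports the initial APIC ID in EBX[31:24]; shifting it right by index_msb strips the logical-CPU bits within a package. Worked example: ebx = 0x05000000 gives APIC ID 5, and with index_msb = 1 (two logical CPUs per package) the package ID is 5 >> 1 = 2.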
23246 +struct genapic apic_xen = {
23248 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
23249 + .int_delivery_mode = dest_LowestPrio,
23251 + .int_dest_mode = (APIC_DEST_LOGICAL != 0),
23252 + .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
23253 + .target_cpus = xen_target_cpus,
23254 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
23255 + .apic_id_registered = xen_apic_id_registered,
23257 + .init_apic_ldr = xen_init_apic_ldr,
23258 + .send_IPI_all = xen_send_IPI_all,
23259 + .send_IPI_allbutself = xen_send_IPI_allbutself,
23260 + .send_IPI_mask = xen_send_IPI_mask,
23261 + .cpu_mask_to_apicid = xen_cpu_mask_to_apicid,
23262 + .phys_pkg_id = phys_pkg_id,
23264 Index: head-2008-11-25/arch/x86/kernel/head_64-xen.S
23265 ===================================================================
23266 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
23267 +++ head-2008-11-25/arch/x86/kernel/head_64-xen.S 2007-08-06 15:10:49.000000000 +0200
23270 + * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
23272 + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
23273 + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
23274 + * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
23275 + * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
23277 + * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $
23279 + * Jun Nakajima <jun.nakajima@intel.com>
23280 + * Modified for Xen
23284 +#include <linux/linkage.h>
23285 +#include <linux/threads.h>
23286 +#include <linux/init.h>
23287 +#include <linux/elfnote.h>
23288 +#include <asm/desc.h>
23289 +#include <asm/segment.h>
23290 +#include <asm/page.h>
23291 +#include <asm/msr.h>
23292 +#include <asm/cache.h>
23293 +#include <asm/dwarf2.h>
23294 +#include <xen/interface/elfnote.h>
23296 + .section .bootstrap.text, "ax", @progbits
23298 + .globl startup_64
23300 + movq $(init_thread_union+THREAD_SIZE-8),%rsp
23302 + /* rsi is a pointer to the startup info structure.
23305 + pushq $0 # fake return address
23306 + jmp x86_64_start_kernel
23308 +#ifdef CONFIG_ACPI_SLEEP
23312 + .word gdt_end-cpu_gdt_table-1
23313 + .long cpu_gdt_table-__START_KERNEL_map
23319 +#define NEXT_PAGE(name) \
23320 + $page = $page + 1; \
23321 + .org $page * 0x1000; \
23322 + phys_##name = $page * 0x1000 + __PHYSICAL_START; \
23325 +NEXT_PAGE(init_level4_pgt)
23326 + /* This gets initialized in x86_64_start_kernel */
23328 +NEXT_PAGE(init_level4_user_pgt)
23330 + * We update two pgd entries to make kernel and user pgd consistent
23331 + * at pgd_populate(). It can be used for kernel modules. So we place
23332 + * this page here for those cases to avoid memory corruption.
23333 + * We also use this page to establish the initial mapping for the
23338 +NEXT_PAGE(level3_kernel_pgt)
23342 + * This is used for vsyscall area mapping as we have a different
23343 + * level4 page table for user.
23345 +NEXT_PAGE(level3_user_pgt)
23348 +NEXT_PAGE(level2_kernel_pgt)
23351 +NEXT_PAGE(hypercall_page)
23353 + .rept 0x1000 / 0x20
23354 + .skip 1 /* push %rcx */
23355 + CFI_ADJUST_CFA_OFFSET 8
23356 + CFI_REL_OFFSET rcx,0
23357 + .skip 2 /* push %r11 */
23358 + CFI_ADJUST_CFA_OFFSET 8
23359 + CFI_REL_OFFSET r11,0
23360 + .skip 5 /* mov $#,%eax */
23361 + .skip 2 /* syscall */
23362 + .skip 2 /* pop %r11 */
23363 + CFI_ADJUST_CFA_OFFSET -8
23365 + .skip 1 /* pop %rcx */
23366 + CFI_ADJUST_CFA_OFFSET -8
23368 + .align 0x20,0 /* ret */
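Each iteration above reserves one 32-byte slot; at boot Xen fills every slot with exactly the stub shape the annotations describe (push %rcx; push %r11; mov $nr,%eax; syscall; pop %r11; pop %rcx; ret). Hypercall nr therefore sits at a fixed offset, which is what entry_64-xen.S exploits with its direct jmp hypercall_page + (__HYPERVISOR_iret * 32). As an illustrative helper (not in the tree):

	#define HYPERCALL_ENTRY(nr) ((void *)((char *)hypercall_page + (nr) * 32))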
23375 +/* Just a dummy symbol to allow compilation. Not used in the sleep path. */
23376 +#ifdef CONFIG_ACPI_SLEEP
23378 +ENTRY(wakeup_level4_pgt)
23385 + .globl cpu_gdt_descr
23387 + .word gdt_end-cpu_gdt_table-1
23389 + .quad cpu_gdt_table
23397 +/* We need valid kernel segments for data and code in long mode too.
23398 + * IRET will check the segment types. kkeil 2000/10/28
23399 + * Also, sysret mandates a special GDT layout.
23402 + .section .data.page_aligned, "aw"
23405 +/* The TLS descriptors are currently at a different place compared to i386.
23406 + Hopefully nobody expects them at a fixed place (Wine?) */
23408 +ENTRY(cpu_gdt_table)
23409 + .quad 0x0000000000000000 /* NULL descriptor */
23410 + .quad 0x0 /* unused */
23411 + .quad 0x00af9a000000ffff /* __KERNEL_CS */
23412 + .quad 0x00cf92000000ffff /* __KERNEL_DS */
23413 + .quad 0x00cffa000000ffff /* __USER32_CS */
23414 + .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */
23415 + .quad 0x00affa000000ffff /* __USER_CS */
23416 + .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
23417 + .quad 0,0 /* TSS */
23418 + .quad 0,0 /* LDT */
23419 + .quad 0,0,0 /* three TLS descriptors */
23420 + .quad 0 /* unused */
23422 + /* asm/segment.h:GDT_ENTRIES must match this */
23423 + /* This should be a multiple of the cache line size */
23424 + /* GDTs of other CPUs are now dynamically allocated */
23426 + /* zero the remaining page */
23427 + .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
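As a worked decode of one entry above: __KERNEL_CS = 0x00af9a000000ffff encodes base 0 and limit 0xfffff; the access byte 0x9a means present, DPL 0, code segment, execute/read; and the flags nibble 0xa sets G=1 (4 KiB granularity) and L=1 (64-bit code), i.e. precisely the fixed layout that SYSRET's segment selection relies on.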
23429 + .section .bss.page_aligned, "aw", @nobits
23431 +ENTRY(empty_zero_page)
23434 +#if CONFIG_XEN_COMPAT <= 0x030002
23436 + * __xen_guest information
23439 + .if (\value) < 0 || (\value) >= 0x10
23440 + utoh (((\value)>>4)&0x0fffffffffffffff)
23442 + .if ((\value) & 0xf) < 10
23443 + .byte '0' + ((\value) & 0xf)
23445 + .byte 'A' + ((\value) & 0xf) - 10
23449 +.section __xen_guest
23450 + .ascii "GUEST_OS=linux,GUEST_VER=2.6"
23451 + .ascii ",XEN_VER=xen-3.0"
23452 + .ascii ",VIRT_BASE=0x"
23453 + utoh __START_KERNEL_map
23454 + .ascii ",ELF_PADDR_OFFSET=0x"
23455 + utoh __START_KERNEL_map
23456 + .ascii ",VIRT_ENTRY=0x"
23457 + utoh (__START_KERNEL_map + __PHYSICAL_START)
23458 + .ascii ",HYPERCALL_PAGE=0x"
23459 + utoh (phys_hypercall_page >> PAGE_SHIFT)
23460 + .ascii ",FEATURES=writable_page_tables"
23461 + .ascii "|writable_descriptor_tables"
23462 + .ascii "|auto_translated_physmap"
23463 + .ascii "|supervisor_mode_kernel"
23464 + .ascii ",LOADER=generic"
23466 +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */
23468 + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux")
23469 + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6")
23470 + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0")
23471 + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .quad, __START_KERNEL_map)
23472 +#if CONFIG_XEN_COMPAT <= 0x030002
23473 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad, __START_KERNEL_map)
23475 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad, 0)
23477 + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad, startup_64)
23478 + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad, hypercall_page)
23479 + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad, _PAGE_PRESENT,_PAGE_PRESENT)
23480 + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
23481 + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic")
23482 + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1)
23483 Index: head-2008-11-25/arch/x86/kernel/head64-xen.c
23484 ===================================================================
23485 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
23486 +++ head-2008-11-25/arch/x86/kernel/head64-xen.c 2007-06-12 13:13:01.000000000 +0200
23489 + * linux/arch/x86_64/kernel/head64.c -- prepare to run common code
23491 + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
23493 + * Jun Nakajima <jun.nakajima@intel.com>
23494 + * Modified for Xen.
23497 +#include <linux/init.h>
23498 +#include <linux/linkage.h>
23499 +#include <linux/types.h>
23500 +#include <linux/kernel.h>
23501 +#include <linux/string.h>
23502 +#include <linux/percpu.h>
23503 +#include <linux/module.h>
23505 +#include <asm/processor.h>
23506 +#include <asm/proto.h>
23507 +#include <asm/smp.h>
23508 +#include <asm/bootsetup.h>
23509 +#include <asm/setup.h>
23510 +#include <asm/desc.h>
23511 +#include <asm/pgtable.h>
23512 +#include <asm/sections.h>
23514 +unsigned long start_pfn;
23516 +/* Don't add a printk in there. printk relies on the PDA which is not initialized
23519 +static void __init clear_bss(void)
23521 + memset(__bss_start, 0,
23522 + (unsigned long) __bss_stop - (unsigned long) __bss_start);
23526 +#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
23527 +#define OLD_CL_MAGIC_ADDR 0x90020
23528 +#define OLD_CL_MAGIC 0xA33F
23529 +#define OLD_CL_BASE_ADDR 0x90000
23530 +#define OLD_CL_OFFSET 0x90022
23532 +extern char saved_command_line[];
23534 +static void __init copy_bootdata(char *real_mode_data)
23536 +#ifndef CONFIG_XEN
23538 + char * command_line;
23540 + memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
23541 + new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
23543 + if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
23544 + printk("so old bootloader that it does not support commandline?!\n");
23547 + new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
23548 + printk("old bootloader convention, maybe loadlin?\n");
23550 + command_line = (char *) ((u64)(new_data));
23551 + memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
23555 + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
23556 + max_cmdline = COMMAND_LINE_SIZE;
23557 + memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
23558 + saved_command_line[max_cmdline-1] = '\0';
23560 + printk("Bootdata ok (command line is %s)\n", saved_command_line);
23563 +static void __init setup_boot_cpu_data(void)
23565 + unsigned int dummy, eax;
23567 + /* get vendor info */
23568 + cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
23569 + (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
23570 + (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
23571 + (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
23573 + /* get cpu type */
23574 + cpuid(1, &eax, &dummy, &dummy,
23575 + (unsigned int *) &boot_cpu_data.x86_capability);
23576 + boot_cpu_data.x86 = (eax >> 8) & 0xf;
23577 + boot_cpu_data.x86_model = (eax >> 4) & 0xf;
23578 + boot_cpu_data.x86_mask = eax & 0xf;
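CPUID leaf 1 packs the signature into EAX as stepping in bits 3:0, model in 7:4 and family in 11:8, matching the shifts above. Worked example: eax = 0x00000f4a decodes to family 0xf, model 4, stepping 0xa.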
23581 +#include <xen/interface/memory.h>
23582 +unsigned long *machine_to_phys_mapping;
23583 +EXPORT_SYMBOL(machine_to_phys_mapping);
23584 +unsigned int machine_to_phys_order;
23585 +EXPORT_SYMBOL(machine_to_phys_order);
23587 +void __init x86_64_start_kernel(char * real_mode_data)
23589 + struct xen_machphys_mapping mapping;
23590 + unsigned long machine_to_phys_nr_ents;
23594 + setup_xen_features();
23596 + xen_start_info = (struct start_info *)real_mode_data;
23597 + if (!xen_feature(XENFEAT_auto_translated_physmap))
23598 + phys_to_machine_mapping =
23599 + (unsigned long *)xen_start_info->mfn_list;
23600 + start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
23601 + xen_start_info->nr_pt_frames;
23603 + machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
23604 + machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
23605 + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
23606 + machine_to_phys_mapping = (unsigned long *)mapping.v_start;
23607 + machine_to_phys_nr_ents = mapping.max_mfn + 1;
23609 + while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents)
23610 + machine_to_phys_order++;
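The loop computes ceil(log2(machine_to_phys_nr_ents)). For example, a table of 2^20 entries leaves machine_to_phys_order at 20, and a max_mfn-derived count of 1000000 also yields 20, since 1UL << 19 = 524288 is still smaller.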
23613 + for (i = 0; i < 256; i++)
23614 + set_intr_gate(i, early_idt_handler);
23615 + asm volatile("lidt %0" :: "m" (idt_descr));
23619 + * This must be called really, really early:
23623 + for (i = 0; i < NR_CPUS; i++)
23624 + cpu_pda(i) = &boot_cpu_pda[i];
23627 + copy_bootdata(real_mode_data);
23629 + cpu_set(0, cpu_online_map);
23631 + s = strstr(saved_command_line, "earlyprintk=");
23633 + setup_early_printk(strchr(s, '=') + 1);
23634 +#ifdef CONFIG_NUMA
23635 + s = strstr(saved_command_line, "numa=");
23639 +#ifdef CONFIG_X86_IO_APIC
23640 + if (strstr(saved_command_line, "disableapic"))
23641 + disable_apic = 1;
23643 + /* You need an early console to see that */
23644 + if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
23645 + panic("Kernel too big for kernel mapping\n");
23647 + setup_boot_cpu_data();
23650 Index: head-2008-11-25/arch/x86/kernel/io_apic_64-xen.c
23651 ===================================================================
23652 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
23653 +++ head-2008-11-25/arch/x86/kernel/io_apic_64-xen.c 2008-11-25 12:22:34.000000000 +0100
23656 + * Intel IO-APIC support for multi-Pentium hosts.
23658 + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
23660 + * Many thanks to Stig Venaas for trying out countless experimental
23661 + * patches and reporting/debugging problems patiently!
23663 + * (c) 1999, Multiple IO-APIC support, developed by
23664 + * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
23665 + * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
23666 + * further tested and cleaned up by Zach Brown <zab@redhat.com>
23667 + * and Ingo Molnar <mingo@redhat.com>
23670 + * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
23671 + * thanks to Eric Gilmore
23672 + * and Rolf G. Tews
23673 + * for testing these extensively
23674 + * Paul Diefenbaugh : Added full ACPI support
23677 +#include <linux/mm.h>
23678 +#include <linux/interrupt.h>
23679 +#include <linux/init.h>
23680 +#include <linux/delay.h>
23681 +#include <linux/sched.h>
23682 +#include <linux/smp_lock.h>
23683 +#include <linux/mc146818rtc.h>
23684 +#include <linux/acpi.h>
23685 +#include <linux/sysdev.h>
23686 +#ifdef CONFIG_ACPI
23687 +#include <acpi/acpi_bus.h>
23690 +#include <asm/io.h>
23691 +#include <asm/smp.h>
23692 +#include <asm/desc.h>
23693 +#include <asm/proto.h>
23694 +#include <asm/mach_apic.h>
23695 +#include <asm/acpi.h>
23696 +#include <asm/dma.h>
23697 +#include <asm/nmi.h>
23699 +#define __apicdebuginit __init
23701 +int sis_apic_bug; /* not actually supported, dummy for compile */
23703 +static int no_timer_check;
23705 +int disable_timer_pin_1 __initdata;
23707 +#ifndef CONFIG_XEN
23708 +int timer_over_8254 __initdata = 0;
23710 +/* Where, if anywhere, the i8259 is connected in external int mode */
23711 +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
23714 +static DEFINE_SPINLOCK(ioapic_lock);
23715 +static DEFINE_SPINLOCK(vector_lock);
23718 + * # of IRQ routing registers
23720 +int nr_ioapic_registers[MAX_IO_APICS];
23723 + * Rough estimate of how many shared IRQs there are; can
23724 + * be changed anytime.
23726 +#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
23727 +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
23730 + * This is performance-critical; we want to do it in O(1)
23732 + * the indexing order of this array favors 1:1 mappings
23733 + * between pins and IRQs.
23736 +static struct irq_pin_list {
23737 + short apic, pin, next;
23738 +} irq_2_pin[PIN_MAP_SIZE];
23740 +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
23741 +#ifdef CONFIG_PCI_MSI
23742 +#define vector_to_irq(vector) \
23743 + (platform_legacy_irq(vector) ? vector : vector_irq[vector])
23745 +#define vector_to_irq(vector) (vector)
23750 +#include <xen/interface/xen.h>
23751 +#include <xen/interface/physdev.h>
23752 +#include <xen/evtchn.h>
23755 +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
23756 +#define disable_8259A_irq(_irq) ((void)0)
23757 +#define i8259A_irq_pending(_irq) (0)
23759 +unsigned long io_apic_irqs;
23761 +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
23763 + struct physdev_apic apic_op;
23766 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
23767 + apic_op.reg = reg;
23768 + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
23771 + return apic_op.value;
23774 +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
23776 + struct physdev_apic apic_op;
23778 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
23779 + apic_op.reg = reg;
23780 + apic_op.value = value;
23781 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
23784 +#define io_apic_read(a,r) xen_io_apic_read(a,r)
23785 +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
23787 +#define clear_IO_APIC() ((void)0)
23792 +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
23794 + unsigned long flags;
23795 + unsigned int dest;
23798 + cpus_and(tmp, mask, cpu_online_map);
23799 + if (cpus_empty(tmp))
23800 + tmp = TARGET_CPUS;
23802 + cpus_and(mask, tmp, CPU_MASK_ALL);
23804 + dest = cpu_mask_to_apicid(mask);
23807 + * Only the high 8 bits are valid.
23809 + dest = SET_APIC_LOGICAL_ID(dest);
23811 + spin_lock_irqsave(&ioapic_lock, flags);
23812 + __DO_ACTION(1, = dest, )
23813 + set_irq_info(irq, mask);
23814 + spin_unlock_irqrestore(&ioapic_lock, flags);
23818 +#endif /* !CONFIG_XEN */
23821 + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
23822 + * shared ISA-space IRQs, so we have to support them. We are super
23823 + * fast in the common case, and fast for shared ISA-space IRQs.
23825 +static void add_pin_to_irq(unsigned int irq, int apic, int pin)
23827 + static int first_free_entry = NR_IRQS;
23828 + struct irq_pin_list *entry = irq_2_pin + irq;
23830 + BUG_ON(irq >= NR_IRQS);
23831 + while (entry->next)
23832 + entry = irq_2_pin + entry->next;
23834 + if (entry->pin != -1) {
23835 + entry->next = first_free_entry;
23836 + entry = irq_2_pin + entry->next;
23837 + if (++first_free_entry >= PIN_MAP_SIZE)
23838 + panic("io_apic.c: ran out of irq_2_pin entries!");
23840 + entry->apic = apic;
23841 + entry->pin = pin;
23844 +#ifndef CONFIG_XEN
23845 +#define __DO_ACTION(R, ACTION, FINAL) \
23849 + struct irq_pin_list *entry = irq_2_pin + irq; \
23851 + BUG_ON(irq >= NR_IRQS); \
23853 + unsigned int reg; \
23854 + pin = entry->pin; \
23857 + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
23859 + io_apic_modify(entry->apic, reg); \
23860 + if (!entry->next) \
23862 + entry = irq_2_pin + entry->next; \
23867 +#define DO_ACTION(name,R,ACTION, FINAL) \
23869 + static void name##_IO_APIC_irq (unsigned int irq) \
23870 + __DO_ACTION(R, ACTION, FINAL)
23872 +DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
23874 +DO_ACTION( __unmask, 0, &= 0xfffeffff, )
23877 +static void mask_IO_APIC_irq (unsigned int irq)
23879 + unsigned long flags;
23881 + spin_lock_irqsave(&ioapic_lock, flags);
23882 + __mask_IO_APIC_irq(irq);
23883 + spin_unlock_irqrestore(&ioapic_lock, flags);
23886 +static void unmask_IO_APIC_irq (unsigned int irq)
23888 + unsigned long flags;
23890 + spin_lock_irqsave(&ioapic_lock, flags);
23891 + __unmask_IO_APIC_irq(irq);
23892 + spin_unlock_irqrestore(&ioapic_lock, flags);
23895 +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
23897 + struct IO_APIC_route_entry entry;
23898 + unsigned long flags;
23900 + /* Check delivery_mode to be sure we're not clearing an SMI pin */
23901 + spin_lock_irqsave(&ioapic_lock, flags);
23902 + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
23903 + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
23904 + spin_unlock_irqrestore(&ioapic_lock, flags);
23905 + if (entry.delivery_mode == dest_SMI)
23908 + * Disable it in the IO-APIC irq-routing table:
23910 + memset(&entry, 0, sizeof(entry));
23912 + spin_lock_irqsave(&ioapic_lock, flags);
23913 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
23914 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
23915 + spin_unlock_irqrestore(&ioapic_lock, flags);
23918 +static void clear_IO_APIC (void)
23922 + for (apic = 0; apic < nr_ioapics; apic++)
23923 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
23924 + clear_IO_APIC_pin(apic, pin);
23927 +#endif /* !CONFIG_XEN */
23929 +static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
23932 + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
23933 + * specific CPU-side IRQs.
23936 +#define MAX_PIRQS 8
23937 +static int pirq_entries [MAX_PIRQS];
23938 +static int pirqs_enabled;
23939 +int skip_ioapic_setup;
23942 +/* dummy parsing: see setup.c */
23944 +static int __init disable_ioapic_setup(char *str)
23946 + skip_ioapic_setup = 1;
23950 +static int __init enable_ioapic_setup(char *str)
23952 + ioapic_force = 1;
23953 + skip_ioapic_setup = 0;
23957 +__setup("noapic", disable_ioapic_setup);
23958 +__setup("apic", enable_ioapic_setup);
23960 +#ifndef CONFIG_XEN
23961 +static int __init setup_disable_8254_timer(char *s)
23963 + timer_over_8254 = -1;
23966 +static int __init setup_enable_8254_timer(char *s)
23968 + timer_over_8254 = 2;
23972 +__setup("disable_8254_timer", setup_disable_8254_timer);
23973 +__setup("enable_8254_timer", setup_enable_8254_timer);
23974 +#endif /* !CONFIG_XEN */
23976 +#include <asm/pci-direct.h>
23977 +#include <linux/pci_ids.h>
23978 +#include <linux/pci.h>
23981 +#ifdef CONFIG_ACPI
23983 +static int nvidia_hpet_detected __initdata;
23985 +static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
23987 + nvidia_hpet_detected = 1;
23992 +/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
23993 + off. Check for an Nvidia or VIA PCI bridge and turn it off.
23994 + Use pci direct infrastructure because this runs before the PCI subsystem.
23996 + Can be overwritten with "apic"
23998 + And another hack to disable the IOMMU on VIA chipsets.
24000 + ... and others. Really should move this somewhere else.
24002 + Kludge-O-Rama. */
24003 +void __init check_ioapic(void)
24005 + int num,slot,func;
24006 + /* Poor man's PCI discovery */
24007 + for (num = 0; num < 32; num++) {
24008 + for (slot = 0; slot < 32; slot++) {
24009 + for (func = 0; func < 8; func++) {
24013 + class = read_pci_config(num,slot,func,
24014 + PCI_CLASS_REVISION);
24015 + if (class == 0xffffffff)
24018 + if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
24021 + vendor = read_pci_config(num, slot, func,
24023 + vendor &= 0xffff;
24024 + switch (vendor) {
24025 + case PCI_VENDOR_ID_VIA:
24026 +#ifdef CONFIG_IOMMU
24027 + if ((end_pfn > MAX_DMA32_PFN ||
24029 + !iommu_aperture_allowed) {
24031 + "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n");
24032 + iommu_aperture_disabled = 1;
24036 + case PCI_VENDOR_ID_NVIDIA:
24037 +#ifdef CONFIG_ACPI
24039 + * All timer overrides on Nvidia are
24040 + * wrong unless HPET is enabled.
24042 + nvidia_hpet_detected = 0;
24043 + acpi_table_parse(ACPI_HPET,
24044 + nvidia_hpet_check);
24045 + if (nvidia_hpet_detected == 0) {
24046 + acpi_skip_timer_override = 1;
24047 + printk(KERN_INFO "Nvidia board "
24048 + "detected. Ignoring ACPI "
24049 + "timer override.\n");
24052 + /* RED-PEN skip them on mptables too? */
24054 + case PCI_VENDOR_ID_ATI:
24056 + /* This should be actually default, but
24057 + for 2.6.16 let's do it for ATI only where
24058 + it's really needed. */
24059 +#ifndef CONFIG_XEN
24060 + if (timer_over_8254 == 1) {
24061 + timer_over_8254 = 0;
24063 + "ATI board detected. Disabling timer routing over 8254.\n");
24070 + /* No multi-function device? */
24071 + type = read_pci_config_byte(num,slot,func,
24072 + PCI_HEADER_TYPE);
24073 + if (!(type & 0x80))
24080 +static int __init ioapic_pirq_setup(char *str)
24083 + int ints[MAX_PIRQS+1];
24085 + get_options(str, ARRAY_SIZE(ints), ints);
24087 + for (i = 0; i < MAX_PIRQS; i++)
24088 + pirq_entries[i] = -1;
24090 + pirqs_enabled = 1;
24091 + apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
24093 + if (ints[0] < MAX_PIRQS)
24096 + for (i = 0; i < max; i++) {
24097 + apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
24099 + * PIRQs are mapped upside down, usually.
24101 + pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
24106 +__setup("pirq=", ioapic_pirq_setup);
24109 + * Find the IRQ entry number of a certain pin.
24111 +static int find_irq_entry(int apic, int pin, int type)
24115 + for (i = 0; i < mp_irq_entries; i++)
24116 + if (mp_irqs[i].mpc_irqtype == type &&
24117 + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
24118 + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
24119 + mp_irqs[i].mpc_dstirq == pin)
24125 +#ifndef CONFIG_XEN
24127 + * Find the pin to which IRQ[irq] (ISA) is connected
24129 +static int __init find_isa_irq_pin(int irq, int type)
24133 + for (i = 0; i < mp_irq_entries; i++) {
24134 + int lbus = mp_irqs[i].mpc_srcbus;
24136 + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
24137 + mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
24138 + mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
24139 + (mp_irqs[i].mpc_irqtype == type) &&
24140 + (mp_irqs[i].mpc_srcbusirq == irq))
24142 + return mp_irqs[i].mpc_dstirq;
24147 +static int __init find_isa_irq_apic(int irq, int type)
24151 + for (i = 0; i < mp_irq_entries; i++) {
24152 + int lbus = mp_irqs[i].mpc_srcbus;
24154 + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
24155 + mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
24156 + mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
24157 + (mp_irqs[i].mpc_irqtype == type) &&
24158 + (mp_irqs[i].mpc_srcbusirq == irq))
24161 + if (i < mp_irq_entries) {
24163 + for(apic = 0; apic < nr_ioapics; apic++) {
24164 + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
24174 + * Find a specific PCI IRQ entry.
24175 + * Not an __init, possibly needed by modules
24177 +static int pin_2_irq(int idx, int apic, int pin);
24179 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
24181 + int apic, i, best_guess = -1;
24183 + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
24185 + if (mp_bus_id_to_pci_bus[bus] == -1) {
24186 + apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
24189 + for (i = 0; i < mp_irq_entries; i++) {
24190 + int lbus = mp_irqs[i].mpc_srcbus;
24192 + for (apic = 0; apic < nr_ioapics; apic++)
24193 + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
24194 + mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
24197 + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
24198 + !mp_irqs[i].mpc_irqtype &&
24200 + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
24201 + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
24203 + if (!(apic || IO_APIC_IRQ(irq)))
24206 + if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
24209 + * Use the first all-but-pin matching entry as a
24210 + * best-guess fuzzy result for broken mptables.
24212 + if (best_guess < 0)
24213 + best_guess = irq;
24216 + BUG_ON(best_guess >= NR_IRQS);
24217 + return best_guess;
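/*
 * Editorial example: an mpc_srcbusirq of 0x0d in an MP-table PCI entry
 * encodes slot 0x0d >> 2 = 3 and pin 0x0d & 3 = 1 (INTB#, counting
 * INTA# as 0), which is what the slot/pin comparisons above match
 * against.
 */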
24221 + * EISA Edge/Level control register, ELCR
24223 +static int EISA_ELCR(unsigned int irq)
24226 + unsigned int port = 0x4d0 + (irq >> 3);
24227 + return (inb(port) >> (irq & 7)) & 1;
24229 + apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
24233 +/* EISA interrupts are always polarity zero and can be edge or level
24234 + * trigger depending on the ELCR value. If an interrupt is listed as
24235 + * EISA conforming in the MP table, that means its trigger type must
24236 + * be read in from the ELCR */
24238 +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
24239 +#define default_EISA_polarity(idx) (0)
24241 +/* ISA interrupts are always polarity zero edge triggered,
24242 + * when listed as conforming in the MP table. */
24244 +#define default_ISA_trigger(idx) (0)
24245 +#define default_ISA_polarity(idx) (0)
24247 +/* PCI interrupts are always polarity one level triggered,
24248 + * when listed as conforming in the MP table. */
24250 +#define default_PCI_trigger(idx) (1)
24251 +#define default_PCI_polarity(idx) (1)
24253 +/* MCA interrupts are always polarity zero level triggered,
24254 + * when listed as conforming in the MP table. */
24256 +#define default_MCA_trigger(idx) (1)
24257 +#define default_MCA_polarity(idx) (0)
24259 +static int __init MPBIOS_polarity(int idx)
24261 + int bus = mp_irqs[idx].mpc_srcbus;
24265 + * Determine IRQ line polarity (high active or low active):
24267 + switch (mp_irqs[idx].mpc_irqflag & 3)
24269 + case 0: /* conforms, ie. bus-type dependent polarity */
24271 + switch (mp_bus_id_to_type[bus])
24273 + case MP_BUS_ISA: /* ISA pin */
24275 + polarity = default_ISA_polarity(idx);
24278 + case MP_BUS_EISA: /* EISA pin */
24280 + polarity = default_EISA_polarity(idx);
24283 + case MP_BUS_PCI: /* PCI pin */
24285 + polarity = default_PCI_polarity(idx);
24288 + case MP_BUS_MCA: /* MCA pin */
24290 + polarity = default_MCA_polarity(idx);
24295 + printk(KERN_WARNING "broken BIOS!!\n");
24302 + case 1: /* high active */
24307 + case 2: /* reserved */
24309 + printk(KERN_WARNING "broken BIOS!!\n");
24313 + case 3: /* low active */
24318 + default: /* invalid */
24320 + printk(KERN_WARNING "broken BIOS!!\n");
24328 +static int MPBIOS_trigger(int idx)
24330 + int bus = mp_irqs[idx].mpc_srcbus;
24334 + * Determine IRQ trigger mode (edge or level sensitive):
24336 + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
24338 + case 0: /* conforms, ie. bus-type dependent */
24340 + switch (mp_bus_id_to_type[bus])
24342 + case MP_BUS_ISA: /* ISA pin */
24344 + trigger = default_ISA_trigger(idx);
24347 + case MP_BUS_EISA: /* EISA pin */
24349 + trigger = default_EISA_trigger(idx);
24352 + case MP_BUS_PCI: /* PCI pin */
24354 + trigger = default_PCI_trigger(idx);
24357 + case MP_BUS_MCA: /* MCA pin */
24359 + trigger = default_MCA_trigger(idx);
24364 + printk(KERN_WARNING "broken BIOS!!\n");
24371 + case 1: /* edge */
24376 + case 2: /* reserved */
24378 + printk(KERN_WARNING "broken BIOS!!\n");
24382 + case 3: /* level */
24387 + default: /* invalid */
24389 + printk(KERN_WARNING "broken BIOS!!\n");
24397 +static inline int irq_polarity(int idx)
24399 + return MPBIOS_polarity(idx);
24402 +static inline int irq_trigger(int idx)
24404 + return MPBIOS_trigger(idx);
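/*
 * Editorial sketch: how the two-bit fields of mpc_irqflag decode, per
 * the switches in MPBIOS_polarity()/MPBIOS_trigger() above. An entry
 * flagged 0x0f is active low (3) and level triggered (3).
 */
static inline void mp_irqflag_decode_sketch(unsigned short irqflag)
{
	int polarity = irqflag & 3;		/* 0 conform, 1 high, 2 resv, 3 low */
	int trigger = (irqflag >> 2) & 3;	/* 0 conform, 1 edge, 2 resv, 3 level */

	printk(KERN_DEBUG "polarity=%d trigger=%d\n", polarity, trigger);
}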
24407 +static int next_irq = 16;
24410 + * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ
24411 + * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
24412 + * from ACPI, which can reach 800 in large boxen.
24414 + * Compact the sparse GSI space into a sequential IRQ series and reuse
24415 + * vectors if possible.
24417 +int gsi_irq_sharing(int gsi)
24419 + int i, tries, vector;
24421 + BUG_ON(gsi >= NR_IRQ_VECTORS);
24423 + if (platform_legacy_irq(gsi))
24426 + if (gsi_2_irq[gsi] != 0xFF)
24427 + return (int)gsi_2_irq[gsi];
24431 + vector = assign_irq_vector(gsi);
24434 + * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
24435 + * use of vector and if found, return that IRQ. However, we never want
24436 + * to share legacy IRQs, which usually have a different trigger mode
24439 + for (i = 0; i < NR_IRQS; i++)
24440 + if (IO_APIC_VECTOR(i) == vector)
24442 + if (platform_legacy_irq(i)) {
24443 + if (--tries >= 0) {
24444 + IO_APIC_VECTOR(i) = 0;
24447 + panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
24449 + if (i < NR_IRQS) {
24450 + gsi_2_irq[gsi] = i;
24451 + printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
24457 + BUG_ON(i >= NR_IRQS);
24458 + gsi_2_irq[gsi] = i;
24459 + IO_APIC_VECTOR(i) = vector;
24460 + printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
24465 +static int pin_2_irq(int idx, int apic, int pin)
24468 + int bus = mp_irqs[idx].mpc_srcbus;
24471 + * Debugging check, we are in big trouble if this message pops up!
24473 + if (mp_irqs[idx].mpc_dstirq != pin)
24474 + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
24476 + switch (mp_bus_id_to_type[bus])
24478 + case MP_BUS_ISA: /* ISA pin */
24479 + case MP_BUS_EISA:
24482 + irq = mp_irqs[idx].mpc_srcbusirq;
24485 + case MP_BUS_PCI: /* PCI pin */
24488 + * PCI IRQs are mapped in order
24492 + irq += nr_ioapic_registers[i++];
24494 + irq = gsi_irq_sharing(irq);
24499 + printk(KERN_ERR "unknown bus type %d.\n",bus);
24504 + BUG_ON(irq >= NR_IRQS);
24507 + * PCI IRQ command line redirection. Yes, limits are hardcoded.
24509 + if ((pin >= 16) && (pin <= 23)) {
24510 + if (pirq_entries[pin-16] != -1) {
24511 + if (!pirq_entries[pin-16]) {
24512 + apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
24514 + irq = pirq_entries[pin-16];
24515 + apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
24520 + BUG_ON(irq >= NR_IRQS);
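/*
 * Editorial example: with two I/O APICs of 24 pins each, the
 * "irq += nr_ioapic_registers[i++]" loop above turns apic 1 / pin 3
 * into GSI 24 + 3 = 27 before gsi_irq_sharing() compacts it.
 */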
24524 +static inline int IO_APIC_irq_trigger(int irq)
24526 + int apic, idx, pin;
24528 + for (apic = 0; apic < nr_ioapics; apic++) {
24529 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
24530 + idx = find_irq_entry(apic,pin,mp_INT);
24531 + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
24532 + return irq_trigger(idx);
24536 + * nonexistent IRQs are edge default
24541 +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
24542 +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
24544 +int assign_irq_vector(int irq)
24546 + unsigned long flags;
24548 + struct physdev_irq irq_op;
24550 + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
24552 + if (irq < PIRQ_BASE || irq - PIRQ_BASE > NR_PIRQS)
24555 + spin_lock_irqsave(&vector_lock, flags);
24557 + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
24558 + spin_unlock_irqrestore(&vector_lock, flags);
24559 + return IO_APIC_VECTOR(irq);
24562 + irq_op.irq = irq;
24563 + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
24564 + spin_unlock_irqrestore(&vector_lock, flags);
24568 + vector = irq_op.vector;
24569 + vector_irq[vector] = irq;
24570 + if (irq != AUTO_ASSIGN)
24571 + IO_APIC_VECTOR(irq) = vector;
24573 + spin_unlock_irqrestore(&vector_lock, flags);
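/*
 * Editorial note: unlike the native kernel, the vector is not chosen
 * here -- the PHYSDEVOP_alloc_irq_vector hypercall above delegates the
 * allocation to Xen, and the kernel merely records the result in
 * vector_irq[] and IO_APIC_VECTOR().
 */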
24578 +extern void (*interrupt[NR_IRQS])(void);
24579 +#ifndef CONFIG_XEN
24580 +static struct hw_interrupt_type ioapic_level_type;
24581 +static struct hw_interrupt_type ioapic_edge_type;
24583 +#define IOAPIC_AUTO -1
24584 +#define IOAPIC_EDGE 0
24585 +#define IOAPIC_LEVEL 1
24587 +static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
24591 + idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
24593 + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
24594 + trigger == IOAPIC_LEVEL)
24595 + irq_desc[idx].chip = &ioapic_level_type;
24597 + irq_desc[idx].chip = &ioapic_edge_type;
24598 + set_intr_gate(vector, interrupt[idx]);
24601 +#define ioapic_register_intr(irq, vector, trigger) evtchn_register_pirq(irq)
24602 +#endif /* !CONFIG_XEN */
24604 +static void __init setup_IO_APIC_irqs(void)
24606 + struct IO_APIC_route_entry entry;
24607 + int apic, pin, idx, irq, first_notcon = 1, vector;
24608 + unsigned long flags;
24610 + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
24612 + for (apic = 0; apic < nr_ioapics; apic++) {
24613 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
24616 + * add it to the IO-APIC irq-routing table:
24618 + memset(&entry,0,sizeof(entry));
24620 + entry.delivery_mode = INT_DELIVERY_MODE;
24621 + entry.dest_mode = INT_DEST_MODE;
24622 + entry.mask = 0; /* enable IRQ */
24623 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
24625 + idx = find_irq_entry(apic,pin,mp_INT);
24627 + if (first_notcon) {
24628 + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
24629 + first_notcon = 0;
24631 + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
24635 + entry.trigger = irq_trigger(idx);
24636 + entry.polarity = irq_polarity(idx);
24638 + if (irq_trigger(idx)) {
24639 + entry.trigger = 1;
24641 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
24644 + irq = pin_2_irq(idx, apic, pin);
24645 + add_pin_to_irq(irq, apic, pin);
24647 + if (/* !apic && */ !IO_APIC_IRQ(irq))
24650 + if (IO_APIC_IRQ(irq)) {
24651 + vector = assign_irq_vector(irq);
24652 + entry.vector = vector;
24654 + ioapic_register_intr(irq, vector, IOAPIC_AUTO);
24655 + if (!apic && (irq < 16))
24656 + disable_8259A_irq(irq);
24658 + spin_lock_irqsave(&ioapic_lock, flags);
24659 + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
24660 + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
24661 + set_native_irq_info(irq, TARGET_CPUS);
24662 + spin_unlock_irqrestore(&ioapic_lock, flags);
24666 + if (!first_notcon)
24667 + apic_printk(APIC_VERBOSE," not connected.\n");
24670 +#ifndef CONFIG_XEN
24672 + * Set up the 8259A-master output pin as broadcast to all
24675 +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
24677 + struct IO_APIC_route_entry entry;
24678 + unsigned long flags;
24680 + memset(&entry,0,sizeof(entry));
24682 + disable_8259A_irq(0);
24685 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
24688 + * We use logical delivery to get the timer IRQ
24689 + * to the first CPU.
24691 + entry.dest_mode = INT_DEST_MODE;
24692 + entry.mask = 0; /* unmask IRQ now */
24693 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
24694 + entry.delivery_mode = INT_DELIVERY_MODE;
24695 + entry.polarity = 0;
24696 + entry.trigger = 0;
24697 + entry.vector = vector;
24700 + * The timer IRQ doesn't have to know that behind the
24701 + * scene we have a 8259A-master in AEOI mode ...
24703 + irq_desc[0].chip = &ioapic_edge_type;
24706 + * Add it to the IO-APIC irq-routing table:
24708 + spin_lock_irqsave(&ioapic_lock, flags);
24709 + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
24710 + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
24711 + spin_unlock_irqrestore(&ioapic_lock, flags);
24713 + enable_8259A_irq(0);
24716 +void __init UNEXPECTED_IO_APIC(void)
24720 +void __apicdebuginit print_IO_APIC(void)
24723 + union IO_APIC_reg_00 reg_00;
24724 + union IO_APIC_reg_01 reg_01;
24725 + union IO_APIC_reg_02 reg_02;
24726 + unsigned long flags;
24728 + if (apic_verbosity == APIC_QUIET)
24731 + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
24732 + for (i = 0; i < nr_ioapics; i++)
24733 + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
24734 + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
24737 + * We are a bit conservative about what we expect. We have to
24738 + * know about every hardware change ASAP.
24740 + printk(KERN_INFO "testing the IO APIC.......................\n");
24742 + for (apic = 0; apic < nr_ioapics; apic++) {
24744 + spin_lock_irqsave(&ioapic_lock, flags);
24745 + reg_00.raw = io_apic_read(apic, 0);
24746 + reg_01.raw = io_apic_read(apic, 1);
24747 + if (reg_01.bits.version >= 0x10)
24748 + reg_02.raw = io_apic_read(apic, 2);
24749 + spin_unlock_irqrestore(&ioapic_lock, flags);
24752 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
24753 + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
24754 + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
24755 + if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
24756 + UNEXPECTED_IO_APIC();
24758 + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01);
24759 + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
24760 + if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
24761 + (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
24762 + (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
24763 + (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
24764 + (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
24765 + (reg_01.bits.entries != 0x2E) &&
24766 + (reg_01.bits.entries != 0x3F) &&
24767 + (reg_01.bits.entries != 0x03)
24769 + UNEXPECTED_IO_APIC();
24771 + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
24772 + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
24773 + if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
24774 + (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
24775 + (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
24776 + (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
24777 + (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
24778 + (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
24780 + UNEXPECTED_IO_APIC();
24781 + if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
24782 + UNEXPECTED_IO_APIC();
24784 + if (reg_01.bits.version >= 0x10) {
24785 + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
24786 + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
24787 + if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
24788 + UNEXPECTED_IO_APIC();
24791 + printk(KERN_DEBUG ".... IRQ redirection table:\n");
24793 + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
24794 + " Stat Dest Deli Vect: \n");
24796 + for (i = 0; i <= reg_01.bits.entries; i++) {
24797 + struct IO_APIC_route_entry entry;
24799 + spin_lock_irqsave(&ioapic_lock, flags);
24800 + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
24801 + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
24802 + spin_unlock_irqrestore(&ioapic_lock, flags);
24804 + printk(KERN_DEBUG " %02x %03X %02X ",
24806 + entry.dest.logical.logical_dest,
24807 + entry.dest.physical.physical_dest
24810 + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
24815 + entry.delivery_status,
24817 + entry.delivery_mode,
24822 + if (use_pci_vector())
24823 + printk(KERN_INFO "Using vector-based indexing\n");
24824 + printk(KERN_DEBUG "IRQ to pin mappings:\n");
24825 + for (i = 0; i < NR_IRQS; i++) {
24826 + struct irq_pin_list *entry = irq_2_pin + i;
24827 + if (entry->pin < 0)
24829 + if (use_pci_vector() && !platform_legacy_irq(i))
24830 + printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
24832 + printk(KERN_DEBUG "IRQ%d ", i);
24834 + printk("-> %d:%d", entry->apic, entry->pin);
24835 + if (!entry->next)
24837 + entry = irq_2_pin + entry->next;
24842 + printk(KERN_INFO ".................................... done.\n");
24847 +static __apicdebuginit void print_APIC_bitfield (int base)
24852 + if (apic_verbosity == APIC_QUIET)
24855 + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
24856 + for (i = 0; i < 8; i++) {
24857 + v = apic_read(base + i*0x10);
24858 + for (j = 0; j < 32; j++) {
24868 +void __apicdebuginit print_local_APIC(void * dummy)
24870 + unsigned int v, ver, maxlvt;
24872 + if (apic_verbosity == APIC_QUIET)
24875 + printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
24876 + smp_processor_id(), hard_smp_processor_id());
24877 + v = apic_read(APIC_ID);
24878 + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
24879 + v = apic_read(APIC_LVR);
24880 + printk(KERN_INFO "... APIC VERSION: %08x\n", v);
24881 + ver = GET_APIC_VERSION(v);
24882 + maxlvt = get_maxlvt();
24884 + v = apic_read(APIC_TASKPRI);
24885 + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
24887 + v = apic_read(APIC_ARBPRI);
24888 + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
24889 + v & APIC_ARBPRI_MASK);
24890 + v = apic_read(APIC_PROCPRI);
24891 + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
24893 + v = apic_read(APIC_EOI);
24894 + printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
24895 + v = apic_read(APIC_RRR);
24896 + printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
24897 + v = apic_read(APIC_LDR);
24898 + printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
24899 + v = apic_read(APIC_DFR);
24900 + printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
24901 + v = apic_read(APIC_SPIV);
24902 + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
24904 + printk(KERN_DEBUG "... APIC ISR field:\n");
24905 + print_APIC_bitfield(APIC_ISR);
24906 + printk(KERN_DEBUG "... APIC TMR field:\n");
24907 + print_APIC_bitfield(APIC_TMR);
24908 + printk(KERN_DEBUG "... APIC IRR field:\n");
24909 + print_APIC_bitfield(APIC_IRR);
24911 + v = apic_read(APIC_ESR);
24912 + printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
24914 + v = apic_read(APIC_ICR);
24915 + printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
24916 + v = apic_read(APIC_ICR2);
24917 + printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
24919 + v = apic_read(APIC_LVTT);
24920 + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
24922 + if (maxlvt > 3) { /* PC is LVT#4. */
24923 + v = apic_read(APIC_LVTPC);
24924 + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
24926 + v = apic_read(APIC_LVT0);
24927 + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
24928 + v = apic_read(APIC_LVT1);
24929 + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
24931 + if (maxlvt > 2) { /* ERR is LVT#3. */
24932 + v = apic_read(APIC_LVTERR);
24933 + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
24936 + v = apic_read(APIC_TMICT);
24937 + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
24938 + v = apic_read(APIC_TMCCT);
24939 + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
24940 + v = apic_read(APIC_TDCR);
24941 + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
24945 +void print_all_local_APICs (void)
24947 + on_each_cpu(print_local_APIC, NULL, 1, 1);
24950 +void __apicdebuginit print_PIC(void)
24953 + unsigned long flags;
24955 + if (apic_verbosity == APIC_QUIET)
24958 + printk(KERN_DEBUG "\nprinting PIC contents\n");
24960 + spin_lock_irqsave(&i8259A_lock, flags);
24962 + v = inb(0xa1) << 8 | inb(0x21);
24963 + printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
24965 + v = inb(0xa0) << 8 | inb(0x20);
24966 + printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
24970 + v = inb(0xa0) << 8 | inb(0x20);
24974 + spin_unlock_irqrestore(&i8259A_lock, flags);
24976 + printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
24978 + v = inb(0x4d1) << 8 | inb(0x4d0);
24979 + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
24981 +#endif /* !CONFIG_XEN */
24983 +static void __init enable_IO_APIC(void)
24985 + union IO_APIC_reg_01 reg_01;
24986 +#ifndef CONFIG_XEN
24987 + int i8259_apic, i8259_pin;
24990 + unsigned long flags;
24992 + for (i = 0; i < PIN_MAP_SIZE; i++) {
24993 + irq_2_pin[i].pin = -1;
24994 + irq_2_pin[i].next = 0;
24996 + if (!pirqs_enabled)
24997 + for (i = 0; i < MAX_PIRQS; i++)
24998 + pirq_entries[i] = -1;
25001 + * The number of IO-APIC IRQ registers (== #pins):
25003 + for (apic = 0; apic < nr_ioapics; apic++) {
25004 + spin_lock_irqsave(&ioapic_lock, flags);
25005 + reg_01.raw = io_apic_read(apic, 1);
25006 + spin_unlock_irqrestore(&ioapic_lock, flags);
25007 + nr_ioapic_registers[apic] = reg_01.bits.entries+1;
25009 +#ifndef CONFIG_XEN
25010 + for(apic = 0; apic < nr_ioapics; apic++) {
25012 + /* See if any of the pins is in ExtINT mode */
25013 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
25014 + struct IO_APIC_route_entry entry;
25015 + spin_lock_irqsave(&ioapic_lock, flags);
25016 + *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
25017 + *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
25018 + spin_unlock_irqrestore(&ioapic_lock, flags);
25021 + /* If the interrupt line is enabled and in ExtInt mode
25022 + * I have found the pin where the i8259 is connected.
25024 + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
25025 + ioapic_i8259.apic = apic;
25026 + ioapic_i8259.pin = pin;
25027 + goto found_i8259;
25032 +	/* Look to see if the MP table has reported the ExtINT */
25033 + i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
25034 + i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
25035 + /* Trust the MP table if nothing is setup in the hardware */
25036 + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
25037 + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
25038 + ioapic_i8259.pin = i8259_pin;
25039 + ioapic_i8259.apic = i8259_apic;
25041 + /* Complain if the MP table and the hardware disagree */
25042 + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
25043 + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
25045 + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
25050 + * Do not trust the IO-APIC being empty at bootup
25056 + * Not an __init, needed by the reboot code
25058 +void disable_IO_APIC(void)
25061 + * Clear the IO-APIC before rebooting:
25065 +#ifndef CONFIG_XEN
25067 + * If the i8259 is routed through an IOAPIC
25068 + * Put that IOAPIC in virtual wire mode
25069 + * so legacy interrupts can be delivered.
25071 + if (ioapic_i8259.pin != -1) {
25072 + struct IO_APIC_route_entry entry;
25073 + unsigned long flags;
25075 + memset(&entry, 0, sizeof(entry));
25076 + entry.mask = 0; /* Enabled */
25077 + entry.trigger = 0; /* Edge */
25079 + entry.polarity = 0; /* High */
25080 + entry.delivery_status = 0;
25081 + entry.dest_mode = 0; /* Physical */
25082 + entry.delivery_mode = dest_ExtINT; /* ExtInt */
25083 + entry.vector = 0;
25084 + entry.dest.physical.physical_dest =
25085 + GET_APIC_ID(apic_read(APIC_ID));
25088 + * Add it to the IO-APIC irq-routing table:
25090 + spin_lock_irqsave(&ioapic_lock, flags);
25091 + io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
25092 + *(((int *)&entry)+1));
25093 + io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
25094 + *(((int *)&entry)+0));
25095 + spin_unlock_irqrestore(&ioapic_lock, flags);
25098 + disconnect_bsp_APIC(ioapic_i8259.pin != -1);
25103 + * function to set the IO-APIC physical IDs based on the
25104 + * values stored in the MPC table.
25106 + * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
25109 +#ifndef CONFIG_XEN
25110 +static void __init setup_ioapic_ids_from_mpc (void)
25112 + union IO_APIC_reg_00 reg_00;
25115 + unsigned char old_id;
25116 + unsigned long flags;
25119 + * Set the IOAPIC ID to the value stored in the MPC table.
25121 + for (apic = 0; apic < nr_ioapics; apic++) {
25123 + /* Read the register 0 value */
25124 + spin_lock_irqsave(&ioapic_lock, flags);
25125 + reg_00.raw = io_apic_read(apic, 0);
25126 + spin_unlock_irqrestore(&ioapic_lock, flags);
25128 + old_id = mp_ioapics[apic].mpc_apicid;
25131 + printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
25135 + * We need to adjust the IRQ routing table
25136 + * if the ID changed.
25138 + if (old_id != mp_ioapics[apic].mpc_apicid)
25139 + for (i = 0; i < mp_irq_entries; i++)
25140 + if (mp_irqs[i].mpc_dstapic == old_id)
25141 + mp_irqs[i].mpc_dstapic
25142 + = mp_ioapics[apic].mpc_apicid;
25145 + * Read the right value from the MPC table and
25146 + * write it into the ID register.
25148 + apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
25149 + mp_ioapics[apic].mpc_apicid);
25151 + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
25152 + spin_lock_irqsave(&ioapic_lock, flags);
25153 + io_apic_write(apic, 0, reg_00.raw);
25154 + spin_unlock_irqrestore(&ioapic_lock, flags);
25159 + spin_lock_irqsave(&ioapic_lock, flags);
25160 + reg_00.raw = io_apic_read(apic, 0);
25161 + spin_unlock_irqrestore(&ioapic_lock, flags);
25162 + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
25163 + printk("could not set ID!\n");
25165 + apic_printk(APIC_VERBOSE," ok.\n");
25169 +static void __init setup_ioapic_ids_from_mpc(void) { }
25173 + * There is a nasty bug in some older SMP boards, their mptable lies
25174 + * about the timer IRQ. We do the following to work around the situation:
25176 + * - timer IRQ defaults to IO-APIC IRQ
25177 + * - if this function detects that timer IRQs are defunct, then we fall
25178 + * back to ISA timer IRQs
25180 +#ifndef CONFIG_XEN
25181 +static int __init timer_irq_works(void)
25183 + unsigned long t1 = jiffies;
25185 + local_irq_enable();
25186 + /* Let ten ticks pass... */
25187 + mdelay((10 * 1000) / HZ);
25190 + * Expect a few ticks at least, to be sure some possible
25191 +	 * glue logic does not lock up after the first one or two
25192 +	 * ticks in a non-ExtINT mode. Also the local APIC
25193 + * might have cached one ExtINT interrupt. Finally, at
25194 + * least one tick may be lost due to delays.
25197 + /* jiffies wrap? */
25198 + if (jiffies - t1 > 4)
25204 + * In the SMP+IOAPIC case it might happen that there are an unspecified
25205 + * number of pending IRQ events unhandled. These cases are very rare,
25206 + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
25207 + * better to do it this way as thus we do not have to be aware of
25208 + * 'pending' interrupts in the IRQ path, except at this point.
25211 + * Edge triggered needs to resend any interrupt
25212 + * that was delayed but this is now handled in the device
25213 + * independent code.
25217 + * Starting up an edge-triggered IO-APIC interrupt is
25218 + * nasty - we need to make sure that we get the edge.
25219 + * If it is already asserted for some reason, we need to
25220 + * return 1 to indicate that it was pending.
25222 + * This is not complete - we should be able to fake
25223 + * an edge even if it isn't on the 8259A...
25226 +static unsigned int startup_edge_ioapic_irq(unsigned int irq)
25228 + int was_pending = 0;
25229 + unsigned long flags;
25231 + spin_lock_irqsave(&ioapic_lock, flags);
25233 + disable_8259A_irq(irq);
25234 + if (i8259A_irq_pending(irq))
25237 + __unmask_IO_APIC_irq(irq);
25238 + spin_unlock_irqrestore(&ioapic_lock, flags);
25240 + return was_pending;
25244 + * Once we have recorded IRQ_PENDING already, we can mask the
25245 + * interrupt for real. This prevents IRQ storms from unhandled
25248 +static void ack_edge_ioapic_irq(unsigned int irq)
25251 + if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
25252 + == (IRQ_PENDING | IRQ_DISABLED))
25253 + mask_IO_APIC_irq(irq);
25258 + * Level triggered interrupts can just be masked,
25259 + * and shutting down and starting up the interrupt
25260 + * is the same as enabling and disabling them -- except
25261 + * with a startup need to return a "was pending" value.
25263 + * Level triggered interrupts are special because we
25264 + * do not touch any IO-APIC register while handling
25265 + * them. We ack the APIC in the end-IRQ handler, not
25266 + * in the start-IRQ-handler. Protection against reentrance
25267 + * from the same interrupt is still provided, both by the
25268 + * generic IRQ layer and by the fact that an unacked local
25269 + * APIC does not accept IRQs.
25271 +static unsigned int startup_level_ioapic_irq (unsigned int irq)
25273 + unmask_IO_APIC_irq(irq);
25275 + return 0; /* don't check for pending */
25278 +static void end_level_ioapic_irq (unsigned int irq)
25284 +#ifdef CONFIG_PCI_MSI
25285 +static unsigned int startup_edge_ioapic_vector(unsigned int vector)
25287 + int irq = vector_to_irq(vector);
25289 + return startup_edge_ioapic_irq(irq);
25292 +static void ack_edge_ioapic_vector(unsigned int vector)
25294 + int irq = vector_to_irq(vector);
25296 + move_native_irq(vector);
25297 + ack_edge_ioapic_irq(irq);
25300 +static unsigned int startup_level_ioapic_vector (unsigned int vector)
25302 + int irq = vector_to_irq(vector);
25304 + return startup_level_ioapic_irq (irq);
25307 +static void end_level_ioapic_vector (unsigned int vector)
25309 + int irq = vector_to_irq(vector);
25311 + move_native_irq(vector);
25312 + end_level_ioapic_irq(irq);
25315 +static void mask_IO_APIC_vector (unsigned int vector)
25317 + int irq = vector_to_irq(vector);
25319 + mask_IO_APIC_irq(irq);
25322 +static void unmask_IO_APIC_vector (unsigned int vector)
25324 + int irq = vector_to_irq(vector);
25326 + unmask_IO_APIC_irq(irq);
25330 +static void set_ioapic_affinity_vector (unsigned int vector,
25331 + cpumask_t cpu_mask)
25333 + int irq = vector_to_irq(vector);
25335 + set_native_irq_info(vector, cpu_mask);
25336 + set_ioapic_affinity_irq(irq, cpu_mask);
25338 +#endif // CONFIG_SMP
25339 +#endif // CONFIG_PCI_MSI
25341 +static int ioapic_retrigger(unsigned int irq)
25343 + send_IPI_self(IO_APIC_VECTOR(irq));
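	/* Editorial note: this is the "resend via IPI to the same CPU"
	 * described above -- the IRQ's own vector is re-raised locally. */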
25349 + * Level and edge triggered IO-APIC interrupts need different handling,
25350 + * so we use two separate IRQ descriptors. Edge triggered IRQs can be
25351 + * handled with the level-triggered descriptor, but that one has slightly
25352 + * more overhead. Level-triggered interrupts cannot be handled with the
25353 + * edge-triggered handler, without risking IRQ storms and other ugly
25357 +static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
25358 + .typename = "IO-APIC-edge",
25359 + .startup = startup_edge_ioapic,
25360 + .shutdown = shutdown_edge_ioapic,
25361 + .enable = enable_edge_ioapic,
25362 + .disable = disable_edge_ioapic,
25363 + .ack = ack_edge_ioapic,
25364 + .end = end_edge_ioapic,
25366 + .set_affinity = set_ioapic_affinity,
25368 + .retrigger = ioapic_retrigger,
25371 +static struct hw_interrupt_type ioapic_level_type __read_mostly = {
25372 + .typename = "IO-APIC-level",
25373 + .startup = startup_level_ioapic,
25374 + .shutdown = shutdown_level_ioapic,
25375 + .enable = enable_level_ioapic,
25376 + .disable = disable_level_ioapic,
25377 + .ack = mask_and_ack_level_ioapic,
25378 + .end = end_level_ioapic,
25380 + .set_affinity = set_ioapic_affinity,
25382 + .retrigger = ioapic_retrigger,
25384 +#endif /* !CONFIG_XEN */
25386 +static inline void init_IO_APIC_traps(void)
25391 + * NOTE! The local APIC isn't very good at handling
25392 + * multiple interrupts at the same interrupt level.
25393 + * As the interrupt level is determined by taking the
25394 + * vector number and shifting that right by 4, we
25395 + * want to spread these out a bit so that they don't
25396 + * all fall in the same interrupt level.
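 *
 * (Editorial example: vectors 0x31 and 0x3e both land in level
 * 0x3 = vector >> 4, so packing allocations sequentially would
 * pile them into one level.)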
25398 + * Also, we've got to be careful not to trash gate
25399 + * 0x80, because int 0x80 is hm, kind of importantish. ;)
25401 + for (irq = 0; irq < NR_IRQS ; irq++) {
25403 + if (use_pci_vector()) {
25404 + if (!platform_legacy_irq(tmp))
25405 + if ((tmp = vector_to_irq(tmp)) == -1)
25408 + if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
25410 + * Hmm.. We don't have an entry for this,
25411 + * so default to an old-fashioned 8259
25412 + * interrupt if we can..
25415 + make_8259A_irq(irq);
25416 +#ifndef CONFIG_XEN
25418 + /* Strange. Oh, well.. */
25419 + irq_desc[irq].chip = &no_irq_type;
25425 +#ifndef CONFIG_XEN
25426 +static void enable_lapic_irq (unsigned int irq)
25430 + v = apic_read(APIC_LVT0);
25431 + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
25434 +static void disable_lapic_irq (unsigned int irq)
25438 + v = apic_read(APIC_LVT0);
25439 + apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
25442 +static void ack_lapic_irq (unsigned int irq)
25447 +static void end_lapic_irq (unsigned int i) { /* nothing */ }
25449 +static struct hw_interrupt_type lapic_irq_type __read_mostly = {
25450 + .typename = "local-APIC-edge",
25451 + .startup = NULL, /* startup_irq() not used for IRQ0 */
25452 + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
25453 + .enable = enable_lapic_irq,
25454 + .disable = disable_lapic_irq,
25455 + .ack = ack_lapic_irq,
25456 + .end = end_lapic_irq,
25459 +static void setup_nmi (void)
25462 + * Dirty trick to enable the NMI watchdog ...
25463 + * We put the 8259A master into AEOI mode and
25464 + * unmask on all local APICs LVT0 as NMI.
25466 + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
25467 + * is from Maciej W. Rozycki - so we do not have to EOI from
25468 + * the NMI handler or the timer interrupt.
25470 + printk(KERN_INFO "activating NMI Watchdog ...");
25472 + enable_NMI_through_LVT0(NULL);
25474 + printk(" done.\n");
25478 + * This looks a bit hackish but it's about the only one way of sending
25479 + * a few INTA cycles to 8259As and any associated glue logic. ICR does
25480 + * not support the ExtINT mode, unfortunately. We need to send these
25481 + * cycles as some i82489DX-based boards have glue logic that keeps the
25482 + * 8259A interrupt line asserted until INTA. --macro
25484 +static inline void unlock_ExtINT_logic(void)
25486 + int apic, pin, i;
25487 + struct IO_APIC_route_entry entry0, entry1;
25488 + unsigned char save_control, save_freq_select;
25489 + unsigned long flags;
25491 + pin = find_isa_irq_pin(8, mp_INT);
25492 + apic = find_isa_irq_apic(8, mp_INT);
25496 + spin_lock_irqsave(&ioapic_lock, flags);
25497 + *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
25498 + *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
25499 + spin_unlock_irqrestore(&ioapic_lock, flags);
25500 + clear_IO_APIC_pin(apic, pin);
25502 + memset(&entry1, 0, sizeof(entry1));
25504 + entry1.dest_mode = 0; /* physical delivery */
25505 + entry1.mask = 0; /* unmask IRQ now */
25506 + entry1.dest.physical.physical_dest = hard_smp_processor_id();
25507 + entry1.delivery_mode = dest_ExtINT;
25508 + entry1.polarity = entry0.polarity;
25509 + entry1.trigger = 0;
25510 + entry1.vector = 0;
25512 + spin_lock_irqsave(&ioapic_lock, flags);
25513 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
25514 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
25515 + spin_unlock_irqrestore(&ioapic_lock, flags);
25517 + save_control = CMOS_READ(RTC_CONTROL);
25518 + save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
25519 + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
25520 + RTC_FREQ_SELECT);
25521 + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
25524 + while (i-- > 0) {
25526 + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
25530 + CMOS_WRITE(save_control, RTC_CONTROL);
25531 + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
25532 + clear_IO_APIC_pin(apic, pin);
25534 + spin_lock_irqsave(&ioapic_lock, flags);
25535 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
25536 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
25537 + spin_unlock_irqrestore(&ioapic_lock, flags);
25540 +int timer_uses_ioapic_pin_0;
25543 + * This code may look a bit paranoid, but it's supposed to cooperate with
25544 + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
25545 + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
25546 + * fanatically on his truly buggy board.
25548 + * FIXME: really need to revamp this for modern platforms only.
25550 +static inline void check_timer(void)
25552 + int apic1, pin1, apic2, pin2;
25556 + * get/set the timer IRQ vector:
25558 + disable_8259A_irq(0);
25559 + vector = assign_irq_vector(0);
25560 + set_intr_gate(vector, interrupt[0]);
25563 + * Subtle, code in do_timer_interrupt() expects an AEOI
25564 + * mode for the 8259A whenever interrupts are routed
25565 + * through I/O APICs. Also IRQ0 has to be enabled in
25566 + * the 8259A which implies the virtual wire has to be
25567 + * disabled in the local APIC.
25569 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
25571 + if (timer_over_8254 > 0)
25572 + enable_8259A_irq(0);
25574 + pin1 = find_isa_irq_pin(0, mp_INT);
25575 + apic1 = find_isa_irq_apic(0, mp_INT);
25576 + pin2 = ioapic_i8259.pin;
25577 + apic2 = ioapic_i8259.apic;
25580 + timer_uses_ioapic_pin_0 = 1;
25582 + apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
25583 + vector, apic1, pin1, apic2, pin2);
25585 + if (pin1 != -1) {
25587 + * Ok, does IRQ0 through the IOAPIC work?
25589 + unmask_IO_APIC_irq(0);
25590 + if (!no_timer_check && timer_irq_works()) {
25591 + nmi_watchdog_default();
25592 + if (nmi_watchdog == NMI_IO_APIC) {
25593 + disable_8259A_irq(0);
25595 + enable_8259A_irq(0);
25597 + if (disable_timer_pin_1 > 0)
25598 + clear_IO_APIC_pin(0, pin1);
25601 + clear_IO_APIC_pin(apic1, pin1);
25602 + apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
25603 + "connected to IO-APIC\n");
25606 + apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
25607 + "through the 8259A ... ");
25608 + if (pin2 != -1) {
25609 + apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
25612 + * legacy devices should be connected to IO APIC #0
25614 + setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
25615 + if (timer_irq_works()) {
25616 + apic_printk(APIC_VERBOSE," works.\n");
25617 + nmi_watchdog_default();
25618 + if (nmi_watchdog == NMI_IO_APIC) {
25624 + * Cleanup, just in case ...
25626 + clear_IO_APIC_pin(apic2, pin2);
25628 + apic_printk(APIC_VERBOSE," failed.\n");
25630 + if (nmi_watchdog == NMI_IO_APIC) {
25631 + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
25632 + nmi_watchdog = 0;
25635 + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
25637 + disable_8259A_irq(0);
25638 + irq_desc[0].chip = &lapic_irq_type;
25639 + apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
25640 + enable_8259A_irq(0);
25642 + if (timer_irq_works()) {
25643 + apic_printk(APIC_VERBOSE," works.\n");
25646 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
25647 + apic_printk(APIC_VERBOSE," failed.\n");
25649 + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
25652 + make_8259A_irq(0);
25653 + apic_write(APIC_LVT0, APIC_DM_EXTINT);
25655 + unlock_ExtINT_logic();
25657 + if (timer_irq_works()) {
25658 + apic_printk(APIC_VERBOSE," works.\n");
25661 + apic_printk(APIC_VERBOSE," failed :(.\n");
25662 + panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
25665 +#define check_timer() ((void)0)
25666 +int timer_uses_ioapic_pin_0 = 0;
25667 +#endif /* !CONFIG_XEN */
25669 +static int __init notimercheck(char *s)
25671 + no_timer_check = 1;
25674 +__setup("no_timer_check", notimercheck);
25678 + * IRQ's that are handled by the PIC in the MPS IOAPIC case.
25679 + * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
25680 + * Linux doesn't really care, as it's not actually used
25681 + * for any interrupt handling anyway.
25683 +#define PIC_IRQS (1<<2)
25685 +void __init setup_IO_APIC(void)
25687 + enable_IO_APIC();
25690 + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
25692 + io_apic_irqs = ~PIC_IRQS;
25694 + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
25697 + * Set up the IO-APIC IRQ routing table.
25699 + if (!acpi_ioapic)
25700 + setup_ioapic_ids_from_mpc();
25701 +#ifndef CONFIG_XEN
25703 +#endif /* !CONFIG_XEN */
25704 + setup_IO_APIC_irqs();
25705 + init_IO_APIC_traps();
25707 + if (!acpi_ioapic)
25711 +struct sysfs_ioapic_data {
25712 + struct sys_device dev;
25713 + struct IO_APIC_route_entry entry[0];
25715 +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
25717 +static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
25719 + struct IO_APIC_route_entry *entry;
25720 + struct sysfs_ioapic_data *data;
25721 + unsigned long flags;
25724 + data = container_of(dev, struct sysfs_ioapic_data, dev);
25725 + entry = data->entry;
25726 + spin_lock_irqsave(&ioapic_lock, flags);
25727 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
25728 + *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
25729 + *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
25731 + spin_unlock_irqrestore(&ioapic_lock, flags);
25736 +static int ioapic_resume(struct sys_device *dev)
25738 + struct IO_APIC_route_entry *entry;
25739 + struct sysfs_ioapic_data *data;
25740 + unsigned long flags;
25741 + union IO_APIC_reg_00 reg_00;
25744 + data = container_of(dev, struct sysfs_ioapic_data, dev);
25745 + entry = data->entry;
25747 + spin_lock_irqsave(&ioapic_lock, flags);
25748 + reg_00.raw = io_apic_read(dev->id, 0);
25749 + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
25750 + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
25751 + io_apic_write(dev->id, 0, reg_00.raw);
25753 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
25754 + io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
25755 + io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
25757 + spin_unlock_irqrestore(&ioapic_lock, flags);
25762 +static struct sysdev_class ioapic_sysdev_class = {
25763 + set_kset_name("ioapic"),
25764 +#ifndef CONFIG_XEN
25765 + .suspend = ioapic_suspend,
25766 + .resume = ioapic_resume,
25770 +static int __init ioapic_init_sysfs(void)
25772 + struct sys_device * dev;
25773 + int i, size, error = 0;
25775 + error = sysdev_class_register(&ioapic_sysdev_class);
25779 + for (i = 0; i < nr_ioapics; i++ ) {
25780 + size = sizeof(struct sys_device) + nr_ioapic_registers[i]
25781 + * sizeof(struct IO_APIC_route_entry);
25782 + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
25783 + if (!mp_ioapic_data[i]) {
25784 + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
25787 + memset(mp_ioapic_data[i], 0, size);
25788 + dev = &mp_ioapic_data[i]->dev;
25790 + dev->cls = &ioapic_sysdev_class;
25791 + error = sysdev_register(dev);
25793 + kfree(mp_ioapic_data[i]);
25794 + mp_ioapic_data[i] = NULL;
25795 + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
25803 +device_initcall(ioapic_init_sysfs);
25805 +/* --------------------------------------------------------------------------
25806 + ACPI-based IOAPIC Configuration
25807 + -------------------------------------------------------------------------- */
25809 +#ifdef CONFIG_ACPI
25811 +#define IO_APIC_MAX_ID 0xFE
25813 +int __init io_apic_get_version (int ioapic)
25815 + union IO_APIC_reg_01 reg_01;
25816 + unsigned long flags;
25818 + spin_lock_irqsave(&ioapic_lock, flags);
25819 + reg_01.raw = io_apic_read(ioapic, 1);
25820 + spin_unlock_irqrestore(&ioapic_lock, flags);
25822 + return reg_01.bits.version;
25826 +int __init io_apic_get_redir_entries (int ioapic)
25828 + union IO_APIC_reg_01 reg_01;
25829 + unsigned long flags;
25831 + spin_lock_irqsave(&ioapic_lock, flags);
25832 + reg_01.raw = io_apic_read(ioapic, 1);
25833 + spin_unlock_irqrestore(&ioapic_lock, flags);
25835 + return reg_01.bits.entries;
25839 +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
25841 + struct IO_APIC_route_entry entry;
25842 + unsigned long flags;
25844 + if (!IO_APIC_IRQ(irq)) {
25845 + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
25851 + * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
25852 + * Note that we mask (disable) IRQs now -- these get enabled when the
25853 + * corresponding device driver registers for this IRQ.
25856 + memset(&entry,0,sizeof(entry));
25858 + entry.delivery_mode = INT_DELIVERY_MODE;
25859 + entry.dest_mode = INT_DEST_MODE;
25860 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
25861 + entry.trigger = edge_level;
25862 + entry.polarity = active_high_low;
25863 + entry.mask = 1; /* Disabled (masked) */
25865 + irq = gsi_irq_sharing(irq);
25867 + * IRQs < 16 are already in the irq_2_pin[] map
25870 + add_pin_to_irq(irq, ioapic, pin);
25872 + entry.vector = assign_irq_vector(irq);
25874 + apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
25875 + "IRQ %d Mode:%i Active:%i)\n", ioapic,
25876 + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
25877 + edge_level, active_high_low);
25879 + ioapic_register_intr(irq, entry.vector, edge_level);
25881 + if (!ioapic && (irq < 16))
25882 + disable_8259A_irq(irq);
25884 + spin_lock_irqsave(&ioapic_lock, flags);
25885 + io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
25886 + io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
25887 + set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
25888 + spin_unlock_irqrestore(&ioapic_lock, flags);
25893 +#endif /* CONFIG_ACPI */
25896 +#ifndef CONFIG_XEN
25898 + * This function currently is only a helper for the i386 SMP boot process, where
25899 + * we need to reprogram the ioredtbls to cater for the CPUs which have come online,
25900 + * so the mask in all cases should simply be TARGET_CPUS.
25903 +void __init setup_ioapic_dest(void)
25905 + int pin, ioapic, irq, irq_entry;
25907 + if (skip_ioapic_setup == 1)
25910 + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
25911 + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
25912 + irq_entry = find_irq_entry(ioapic, pin, mp_INT);
25913 + if (irq_entry == -1)
25915 + irq = pin_2_irq(irq_entry, ioapic, pin);
25916 + set_ioapic_affinity_irq(irq, TARGET_CPUS);
25922 +#endif /* !CONFIG_XEN */
25923 Index: head-2008-11-25/arch/x86/kernel/ioport_64-xen.c
25924 ===================================================================
25925 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
25926 +++ head-2008-11-25/arch/x86/kernel/ioport_64-xen.c 2008-01-28 12:24:19.000000000 +0100
25929 + * linux/arch/x86_64/kernel/ioport.c
25931 + * This contains the io-permission bitmap code - written by obz, with changes
25935 +#include <linux/sched.h>
25936 +#include <linux/kernel.h>
25937 +#include <linux/capability.h>
25938 +#include <linux/errno.h>
25939 +#include <linux/types.h>
25940 +#include <linux/ioport.h>
25941 +#include <linux/mm.h>
25942 +#include <linux/smp.h>
25943 +#include <linux/smp_lock.h>
25944 +#include <linux/stddef.h>
25945 +#include <linux/slab.h>
25946 +#include <linux/thread_info.h>
25947 +#include <xen/interface/physdev.h>
25949 +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
25950 +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
25955 + for (i = base; i < base + extent; i++)
25956 + __set_bit(i, bitmap);
25958 + for (i = base; i < base + extent; i++)
25959 + clear_bit(i, bitmap);
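/*
 * Editorial note: in the x86 I/O permission bitmap a clear bit grants
 * access, so sys_ioperm() below passes !turn_on here and a fresh
 * bitmap is memset to 0xff (all ports denied).
 */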
25963 + * this changes the io permissions bitmap in the current task.
25965 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
25967 +	struct thread_struct * t = &current->thread;
25968 + unsigned long *bitmap;
25969 + struct physdev_set_iobitmap set_iobitmap;
25971 + if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
25973 + if (turn_on && !capable(CAP_SYS_RAWIO))
25977 + * If it's the first ioperm() call in this thread's lifetime, set the
25978 + * IO bitmap up. ioperm() is much less timing critical than clone(),
25979 + * this is why we delay this operation until now:
25981 + if (!t->io_bitmap_ptr) {
25982 + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
25986 + memset(bitmap, 0xff, IO_BITMAP_BYTES);
25987 + t->io_bitmap_ptr = bitmap;
25989 + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
25990 + set_iobitmap.nr_ports = IO_BITMAP_BITS;
25991 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
25995 + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
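/*
 * Editorial usage sketch: a userspace driver reaching this syscall via
 * the glibc wrapper -- behavior is unchanged by the Xen port, except
 * that the update is also pushed to the hypervisor through
 * PHYSDEVOP_set_iobitmap:
 *
 *	#include <sys/io.h>
 *	if (ioperm(0x378, 8, 1) == 0)
 *		outb(0xff, 0x378);	-- now permitted without iopl(3)
 */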
26001 + * sys_iopl has to be used when you want to access the IO ports
26002 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
26003 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
26007 +asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs)
26009 + unsigned int old_iopl = current->thread.iopl;
26010 + struct physdev_set_iopl set_iopl;
26012 + if (new_iopl > 3)
26015 + /* Need "raw I/O" privileges for direct port access. */
26016 + if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO))
26019 + /* Change our version of the privilege levels. */
26020 + current->thread.iopl = new_iopl;
26022 + /* Force the change at ring 0. */
26023 + set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl;
26024 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
26028 Index: head-2008-11-25/arch/x86/kernel/irq_64-xen.c
26029 ===================================================================
26030 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
26031 +++ head-2008-11-25/arch/x86/kernel/irq_64-xen.c 2008-10-29 09:55:56.000000000 +0100
26034 + * linux/arch/x86_64/kernel/irq.c
26036 + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
26038 + * This file contains the lowest level x86_64-specific interrupt
26039 + * entry and irq statistics code. All the remaining irq logic is
26040 + * done by the generic kernel/irq/ code and in the
26041 + * x86_64-specific irq controller code. (e.g. i8259.c and
26045 +#include <linux/kernel_stat.h>
26046 +#include <linux/interrupt.h>
26047 +#include <linux/seq_file.h>
26048 +#include <linux/module.h>
26049 +#include <linux/delay.h>
26050 +#include <asm/uaccess.h>
26051 +#include <asm/io_apic.h>
26052 +#include <asm/idle.h>
26054 +atomic_t irq_err_count;
26055 +#ifdef CONFIG_X86_IO_APIC
26056 +#ifdef APIC_MISMATCH_DEBUG
26057 +atomic_t irq_mis_count;
26061 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
26063 + * Probabilistic stack overflow check:
26065 + * Only check the stack in process context, because everything else
26066 + * runs on the big interrupt stacks. Checking reliably is too expensive,
26067 + * so we just check from interrupts.
26069 +static inline void stack_overflow_check(struct pt_regs *regs)
26071 + u64 curbase = (u64) current->thread_info;
26072 + static unsigned long warned = -60*HZ;
26074 + if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
26075 + regs->rsp < curbase + sizeof(struct thread_info) + 128 &&
26076 + time_after(jiffies, warned + 60*HZ)) {
26077 + printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
26078 + current->comm, curbase, regs->rsp);
26079 + show_stack(NULL,NULL);
26080 + warned = jiffies;
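The condition above warns at most once per minute when the stack pointer comes within 128 bytes of the thread_info at the stack's base; the arithmetic as a self-contained sketch (THREAD_SIZE and the thread_info size are illustrative values here, not the kernel's):

        #include <stdio.h>

        #define THREAD_SIZE      8192ul /* illustrative 8K kernel stack */
        #define THREAD_INFO_SIZE 64ul   /* illustrative sizeof(struct thread_info) */

        /* thread_info sits at the bottom of the stack, so this is how much
         * room is left before an overflow starts clobbering it. */
        static unsigned long stack_bytes_left(unsigned long rsp, unsigned long curbase)
        {
                return rsp - (curbase + THREAD_INFO_SIZE);
        }

        int main(void)
        {
                unsigned long base = 0x100000ul;        /* pretend stack base */
                unsigned long rsp  = base + THREAD_INFO_SIZE + 100;

                if (stack_bytes_left(rsp, base) < 128)  /* mirrors the check above */
                        printf("near stack overflow: %lu bytes left\n",
                               stack_bytes_left(rsp, base));
                return 0;
        }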
26086 + * Generic, controller-independent functions:
26089 +int show_interrupts(struct seq_file *p, void *v)
26091 + int i = *(loff_t *) v, j;
26092 + struct irqaction * action;
26093 + unsigned long flags;
26096 + seq_printf(p, " ");
26097 + for_each_online_cpu(j)
26098 + seq_printf(p, "CPU%-8d",j);
26099 + seq_putc(p, '\n');
26102 + if (i < NR_IRQS) {
26103 + spin_lock_irqsave(&irq_desc[i].lock, flags);
26104 + action = irq_desc[i].action;
26107 + seq_printf(p, "%3d: ",i);
26108 +#ifndef CONFIG_SMP
26109 + seq_printf(p, "%10u ", kstat_irqs(i));
26111 + for_each_online_cpu(j)
26112 + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
26114 + seq_printf(p, " %14s", irq_desc[i].chip->typename);
26116 + seq_printf(p, " %s", action->name);
26117 + for (action=action->next; action; action = action->next)
26118 + seq_printf(p, ", %s", action->name);
26119 + seq_putc(p, '\n');
26121 + spin_unlock_irqrestore(&irq_desc[i].lock, flags);
26122 + } else if (i == NR_IRQS) {
26123 + seq_printf(p, "NMI: ");
26124 + for_each_online_cpu(j)
26125 + seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
26126 + seq_putc(p, '\n');
26127 +#ifdef CONFIG_X86_LOCAL_APIC
26128 + seq_printf(p, "LOC: ");
26129 + for_each_online_cpu(j)
26130 + seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
26131 + seq_putc(p, '\n');
26133 + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
26134 +#ifdef CONFIG_X86_IO_APIC
26135 +#ifdef APIC_MISMATCH_DEBUG
26136 + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
26144 + * do_IRQ handles all normal device IRQ's (the special
26145 + * SMP cross-CPU interrupts have their own specific
26148 +asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
26150 + /* high bit used in ret_from_ code */
26151 + unsigned irq = ~regs->orig_rax;
26153 + if (unlikely(irq >= NR_IRQS)) {
26154 + printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
26155 + __FUNCTION__, irq);
26161 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
26162 + stack_overflow_check(regs);
26164 + __do_IRQ(irq, regs);
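The entry code saves the vector complemented in orig_rax, so its high bits distinguish an interrupt frame from a syscall frame (which stores a small non-negative syscall number there); a sketch of the round trip:

        #include <stdio.h>

        int main(void)
        {
                unsigned int irq, vec = 0x31;   /* arbitrary example vector */
                unsigned long orig_rax = ~(unsigned long)vec;   /* entry stub */

                irq = ~orig_rax;                /* what do_IRQ() recovers */
                printf("saved %#lx, recovered IRQ %#x\n", orig_rax, irq);
                return 0;
        }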
26170 +#ifdef CONFIG_HOTPLUG_CPU
26171 +void fixup_irqs(cpumask_t map)
26173 + unsigned int irq;
26174 + static int warned;
26176 + for (irq = 0; irq < NR_IRQS; irq++) {
26181 + cpus_and(mask, irq_desc[irq].affinity, map);
26182 + if (any_online_cpu(mask) == NR_CPUS) {
26183 + /*printk("Breaking affinity for irq %i\n", irq);*/
26186 + if (irq_desc[irq].chip->set_affinity)
26187 + irq_desc[irq].chip->set_affinity(irq, mask);
26188 + else if (irq_desc[irq].action && !(warned++))
26189 + printk("Cannot set affinity for irq %i\n", irq);
26192 + /* That doesn't seem sufficient. Give it 1ms. */
26193 + local_irq_enable();
26195 + local_irq_disable();
26199 +extern void call_softirq(void);
26201 +asmlinkage void do_softirq(void)
26204 + unsigned long flags;
26206 + if (in_interrupt())
26209 + local_irq_save(flags);
26210 + pending = local_softirq_pending();
26211 + /* Switch to interrupt stack */
26214 + WARN_ON_ONCE(softirq_count());
26216 + local_irq_restore(flags);
26218 +EXPORT_SYMBOL(do_softirq);
26220 +#ifndef CONFIG_X86_LOCAL_APIC
26222 + * 'what should we do if we get a hw irq event on an illegal vector'.
26223 + * Each architecture has to answer this itself.
26225 +void ack_bad_irq(unsigned int irq)
26227 + printk("unexpected IRQ trap at vector %02x\n", irq);
26230 Index: head-2008-11-25/arch/x86/kernel/ldt_64-xen.c
26231 ===================================================================
26232 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
26233 +++ head-2008-11-25/arch/x86/kernel/ldt_64-xen.c 2007-06-12 13:13:01.000000000 +0200
26236 + * linux/arch/x86_64/kernel/ldt.c
26238 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
26239 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
26240 + * Copyright (C) 2002 Andi Kleen
26242 + * This handles calls from both 32bit and 64bit mode.
26245 +#include <linux/errno.h>
26246 +#include <linux/sched.h>
26247 +#include <linux/string.h>
26248 +#include <linux/mm.h>
26249 +#include <linux/smp.h>
26250 +#include <linux/smp_lock.h>
26251 +#include <linux/vmalloc.h>
26252 +#include <linux/slab.h>
26254 +#include <asm/uaccess.h>
26255 +#include <asm/system.h>
26256 +#include <asm/ldt.h>
26257 +#include <asm/desc.h>
26258 +#include <asm/proto.h>
26259 +#include <asm/pgalloc.h>
26261 +#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
26262 +static void flush_ldt(void *null)
26264 + if (current->active_mm)
26265 + load_LDT(&current->active_mm->context);
26269 +static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
26273 + unsigned oldsize;
26275 + if (mincount <= (unsigned)pc->size)
26277 + oldsize = pc->size;
26278 + mincount = (mincount+511)&(~511);
26279 + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
26280 + newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
26282 + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
26288 + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
26289 + oldldt = pc->ldt;
26290 + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
26292 + pc->ldt = newldt;
26294 + pc->size = mincount;
26300 + preempt_disable();
26302 + make_pages_readonly(
26304 + (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
26305 + XENFEAT_writable_descriptor_tables);
26308 + mask = cpumask_of_cpu(smp_processor_id());
26309 + if (!cpus_equal(current->mm->cpu_vm_mask, mask))
26310 + smp_call_function(flush_ldt, NULL, 1, 1);
26311 + preempt_enable();
26315 + make_pages_writable(
26317 + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
26318 + XENFEAT_writable_descriptor_tables);
26319 + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
26327 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
26329 + int err = alloc_ldt(new, old->size, 0);
26332 + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
26333 + make_pages_readonly(
26335 + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
26336 + XENFEAT_writable_descriptor_tables);
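alloc_ldt() above rounds every request up to a multiple of 512 entries and switches from kmalloc to vmalloc once the table outgrows a page; the size computation in isolation:

        #include <stdio.h>

        #define LDT_ENTRY_SIZE 8
        #define PAGE_SIZE      4096

        int main(void)
        {
                unsigned mincount = 1;                  /* caller wants 1 entry */

                mincount = (mincount + 511) & ~511;     /* -> 512 entries */
                printf("%u entries, %u bytes, %s\n",
                       mincount, mincount * LDT_ENTRY_SIZE,
                       mincount * LDT_ENTRY_SIZE > PAGE_SIZE ? "vmalloc" : "kmalloc");
                return 0;
        }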
26341 + * we do not have to muck with descriptors here, that is
26342 + * done in switch_mm() as needed.
26344 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
26346 + struct mm_struct * old_mm;
26349 + memset(&mm->context, 0, sizeof(mm->context));
26350 + init_MUTEX(&mm->context.sem);
26351 + old_mm = current->mm;
26352 + if (old_mm && old_mm->context.size > 0) {
26353 + down(&old_mm->context.sem);
26354 + retval = copy_ldt(&mm->context, &old_mm->context);
26355 + up(&old_mm->context.sem);
26357 + if (retval == 0) {
26358 + spin_lock(&mm_unpinned_lock);
26359 + list_add(&mm->context.unpinned, &mm_unpinned);
26360 + spin_unlock(&mm_unpinned_lock);
26367 + * Don't touch the LDT register - we're already in the next thread.
26369 +void destroy_context(struct mm_struct *mm)
26371 + if (mm->context.size) {
26372 + if (mm == current->active_mm)
26374 + make_pages_writable(
26376 + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
26377 + XENFEAT_writable_descriptor_tables);
26378 + if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
26379 + vfree(mm->context.ldt);
26381 + kfree(mm->context.ldt);
26382 + mm->context.size = 0;
26384 + if (!mm->context.pinned) {
26385 + spin_lock(&mm_unpinned_lock);
26386 + list_del(&mm->context.unpinned);
26387 + spin_unlock(&mm_unpinned_lock);
26391 +static int read_ldt(void __user * ptr, unsigned long bytecount)
26394 + unsigned long size;
26395 + struct mm_struct * mm = current->mm;
26397 + if (!mm->context.size)
26399 + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
26400 + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
26402 + down(&mm->context.sem);
26403 + size = mm->context.size*LDT_ENTRY_SIZE;
26404 + if (size > bytecount)
26405 + size = bytecount;
26408 + if (copy_to_user(ptr, mm->context.ldt, size))
26410 + up(&mm->context.sem);
26412 + goto error_return;
26413 + if (size != bytecount) {
26414 + /* zero-fill the rest */
26415 + if (clear_user(ptr+size, bytecount-size) != 0) {
26417 + goto error_return;
26420 + return bytecount;
26425 +static int read_default_ldt(void __user * ptr, unsigned long bytecount)
26427 + /* Arbitrary number */
26428 + /* x86-64 default LDT is all zeros */
26429 + if (bytecount > 128)
26431 + if (clear_user(ptr, bytecount))
26433 + return bytecount;
26436 +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
26438 + struct task_struct *me = current;
26439 + struct mm_struct * mm = me->mm;
26440 + __u32 entry_1, entry_2, *lp;
26441 + unsigned long mach_lp;
26443 + struct user_desc ldt_info;
26447 + if (bytecount != sizeof(ldt_info))
26450 + if (copy_from_user(&ldt_info, ptr, bytecount))
26454 + if (ldt_info.entry_number >= LDT_ENTRIES)
26456 + if (ldt_info.contents == 3) {
26459 + if (ldt_info.seg_not_present == 0)
26463 + down(&mm->context.sem);
26464 + if (ldt_info.entry_number >= (unsigned)mm->context.size) {
26465 + error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
26470 + lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
26471 + mach_lp = arbitrary_virt_to_machine(lp);
26473 + /* Allow LDTs to be cleared by the user. */
26474 + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
26475 + if (oldmode || LDT_empty(&ldt_info)) {
26482 + entry_1 = LDT_entry_a(&ldt_info);
26483 + entry_2 = LDT_entry_b(&ldt_info);
26485 + entry_2 &= ~(1 << 20);
26487 + /* Install the new entry ... */
26489 + error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
26492 + up(&mm->context.sem);
26497 +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
26499 + int ret = -ENOSYS;
26503 + ret = read_ldt(ptr, bytecount);
26506 + ret = write_ldt(ptr, bytecount, 1);
26509 + ret = read_default_ldt(ptr, bytecount);
26512 + ret = write_ldt(ptr, bytecount, 0);
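A minimal userspace sketch (not part of the patch) of the interface served above, installing one flat 32-bit data segment and reading the table back; the func values match the switch cases in sys_modify_ldt():

        #include <stdio.h>
        #include <string.h>
        #include <unistd.h>
        #include <sys/syscall.h>
        #include <asm/ldt.h>            /* struct user_desc, LDT_ENTRY_SIZE */

        int main(void)
        {
                struct user_desc d;
                unsigned char buf[LDT_ENTRY_SIZE * 4];

                memset(&d, 0, sizeof(d));
                d.entry_number   = 0;
                d.base_addr      = 0;
                d.limit          = 0xfffff;
                d.seg_32bit      = 1;
                d.limit_in_pages = 1;

                /* func 0x11 -> write_ldt(..., oldmode=0) above */
                if (syscall(SYS_modify_ldt, 0x11, &d, sizeof(d)) < 0)
                        perror("modify_ldt write");
                /* func 0 -> read_ldt() */
                if (syscall(SYS_modify_ldt, 0, buf, sizeof(buf)) < 0)
                        perror("modify_ldt read");
                return 0;
        }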
26517 Index: head-2008-11-25/arch/x86/kernel/mpparse_64-xen.c
26518 ===================================================================
26519 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
26520 +++ head-2008-11-25/arch/x86/kernel/mpparse_64-xen.c 2007-06-12 13:13:01.000000000 +0200
26523 + * Intel Multiprocessor Specification 1.1 and 1.4
26524 + * compliant MP-table parsing routines.
26526 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
26527 + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
26530 + * Erich Boleyn : MP v1.4 and additional changes.
26531 + * Alan Cox : Added EBDA scanning
26532 + * Ingo Molnar : various cleanups and rewrites
26533 + * Maciej W. Rozycki: Bits for default MP configurations
26534 + * Paul Diefenbaugh: Added full ACPI support
26537 +#include <linux/mm.h>
26538 +#include <linux/init.h>
26539 +#include <linux/delay.h>
26540 +#include <linux/bootmem.h>
26541 +#include <linux/smp_lock.h>
26542 +#include <linux/kernel_stat.h>
26543 +#include <linux/mc146818rtc.h>
26544 +#include <linux/acpi.h>
26545 +#include <linux/module.h>
26547 +#include <asm/smp.h>
26548 +#include <asm/mtrr.h>
26549 +#include <asm/mpspec.h>
26550 +#include <asm/pgalloc.h>
26551 +#include <asm/io_apic.h>
26552 +#include <asm/proto.h>
26553 +#include <asm/acpi.h>
26555 +/* Have we found an MP table */
26556 +int smp_found_config;
26557 +unsigned int __initdata maxcpus = NR_CPUS;
26559 +int acpi_found_madt;
26562 + * Various Linux-internal data structures created from the
26565 +unsigned char apic_version [MAX_APICS];
26566 +unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
26567 +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
26569 +static int mp_current_pci_id = 0;
26570 +/* I/O APIC entries */
26571 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
26573 +/* # of MP IRQ source entries */
26574 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
26576 +/* MP IRQ source entries */
26577 +int mp_irq_entries;
26581 +unsigned long mp_lapic_addr = 0;
26585 +/* Processor that is doing the boot up */
26586 +unsigned int boot_cpu_id = -1U;
26587 +/* Internal processor count */
26588 +unsigned int num_processors __initdata = 0;
26590 +unsigned disabled_cpus __initdata;
26592 +/* Bitmask of physically existing CPUs */
26593 +physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
26595 +/* ACPI MADT entry parsing functions */
26596 +#ifdef CONFIG_ACPI
26597 +extern struct acpi_boot_flags acpi_boot;
26598 +#ifdef CONFIG_X86_LOCAL_APIC
26599 +extern int acpi_parse_lapic (acpi_table_entry_header *header);
26600 +extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
26601 +extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
26602 +#endif /*CONFIG_X86_LOCAL_APIC*/
26603 +#ifdef CONFIG_X86_IO_APIC
26604 +extern int acpi_parse_ioapic (acpi_table_entry_header *header);
26605 +#endif /*CONFIG_X86_IO_APIC*/
26606 +#endif /*CONFIG_ACPI*/
26608 +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
26612 + * Intel MP BIOS table parsing routines:
26616 + * Checksum an MP configuration block.
26619 +static int __init mpf_checksum(unsigned char *mp, int len)
26626 + return sum & 0xFF;
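The MP specification lays tables out so that all their bytes sum to zero mod 256; since the loop body is elided in the hunk above, here is a self-contained reconstruction with a worked example:

        #include <stdio.h>

        static int mpf_checksum(unsigned char *mp, int len)
        {
                int sum = 0;

                while (len--)
                        sum += *mp++;
                return sum & 0xFF;      /* 0 for a valid block */
        }

        int main(void)
        {
                unsigned char blk[16] = { '_', 'M', 'P', '_', 0, 1, 4 };
                int i, sum = 0;

                for (i = 0; i < 15; i++)
                        sum += blk[i];
                blk[15] = -sum;         /* force the 16-byte sum to 0 mod 256 */
                printf("checksum ok: %d\n", mpf_checksum(blk, 16) == 0);
                return 0;
        }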
26629 +#ifndef CONFIG_XEN
26630 +static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
26633 + unsigned char ver;
26634 + cpumask_t tmp_map;
26636 + if (!(m->mpc_cpuflag & CPU_ENABLED)) {
26641 + printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
26643 + (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
26644 + (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
26647 + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
26648 + Dprintk(" Bootup CPU\n");
26649 + boot_cpu_id = m->mpc_apicid;
26651 + if (num_processors >= NR_CPUS) {
26652 + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
26653 + " Processor ignored.\n", NR_CPUS);
26657 + num_processors++;
26658 + cpus_complement(tmp_map, cpu_present_map);
26659 + cpu = first_cpu(tmp_map);
26661 +#if MAX_APICS < 255
26662 + if ((int)m->mpc_apicid > MAX_APICS) {
26663 + printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
26664 + m->mpc_apicid, MAX_APICS);
26668 + ver = m->mpc_apicver;
26670 + physid_set(m->mpc_apicid, phys_cpu_present_map);
26672 + * Validate version
26674 + if (ver == 0x0) {
26675 + printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
26678 + apic_version[m->mpc_apicid] = ver;
26679 + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
26681 + * bios_cpu_apicid is required to have processors listed
26682 + * in same order as logical cpu numbers. Hence the first
26683 + * entry is BSP, and so on.
26687 + bios_cpu_apicid[cpu] = m->mpc_apicid;
26688 + x86_cpu_to_apicid[cpu] = m->mpc_apicid;
26690 + cpu_set(cpu, cpu_possible_map);
26691 + cpu_set(cpu, cpu_present_map);
26694 +static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
26696 + num_processors++;
26698 +#endif /* CONFIG_XEN */
26700 +static void __init MP_bus_info (struct mpc_config_bus *m)
26704 + memcpy(str, m->mpc_bustype, 6);
26706 + Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
26708 + if (strncmp(str, "ISA", 3) == 0) {
26709 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
26710 + } else if (strncmp(str, "EISA", 4) == 0) {
26711 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
26712 + } else if (strncmp(str, "PCI", 3) == 0) {
26713 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
26714 + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
26715 + mp_current_pci_id++;
26716 + } else if (strncmp(str, "MCA", 3) == 0) {
26717 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
26719 + printk(KERN_ERR "Unknown bustype %s\n", str);
26723 +static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
26725 + if (!(m->mpc_flags & MPC_APIC_USABLE))
26728 + printk("I/O APIC #%d Version %d at 0x%X.\n",
26729 + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
26730 + if (nr_ioapics >= MAX_IO_APICS) {
26731 + printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
26732 + MAX_IO_APICS, nr_ioapics);
26733 + panic("Recompile kernel with bigger MAX_IO_APICS!\n");
26735 + if (!m->mpc_apicaddr) {
26736 + printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
26737 + " found in MP table, skipping!\n");
26740 + mp_ioapics[nr_ioapics] = *m;
26744 +static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
26746 + mp_irqs [mp_irq_entries] = *m;
26747 + Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
26748 + " IRQ %02x, APIC ID %x, APIC INT %02x\n",
26749 + m->mpc_irqtype, m->mpc_irqflag & 3,
26750 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
26751 + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
26752 + if (++mp_irq_entries >= MAX_IRQ_SOURCES)
26753 + panic("Max # of irq sources exceeded!!\n");
26756 +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
26758 + Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
26759 + " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
26760 + m->mpc_irqtype, m->mpc_irqflag & 3,
26761 + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
26762 + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
26764 + * Well it seems all SMP boards in existence
26765 + * use ExtINT/LVT1 == LINT0 and
26766 + * NMI/LVT2 == LINT1 - the following check
26767 + * will show us if this assumption is false.
26768 + * Until then we do not have to add baggage.
26770 + if ((m->mpc_irqtype == mp_ExtINT) &&
26771 + (m->mpc_destapiclint != 0))
26773 + if ((m->mpc_irqtype == mp_NMI) &&
26774 + (m->mpc_destapiclint != 1))
26779 + * Read/parse the MPC
26782 +static int __init smp_read_mpc(struct mp_config_table *mpc)
26785 + int count=sizeof(*mpc);
26786 + unsigned char *mpt=((unsigned char *)mpc)+count;
26788 + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
26789 + printk("SMP mptable: bad signature [%c%c%c%c]!\n",
26790 + mpc->mpc_signature[0],
26791 + mpc->mpc_signature[1],
26792 + mpc->mpc_signature[2],
26793 + mpc->mpc_signature[3]);
26796 + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
26797 + printk("SMP mptable: checksum error!\n");
26800 + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
26801 + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
26805 + if (!mpc->mpc_lapic) {
26806 + printk(KERN_ERR "SMP mptable: null local APIC address!\n");
26809 + memcpy(str,mpc->mpc_oem,8);
26811 + printk(KERN_INFO "OEM ID: %s ",str);
26813 + memcpy(str,mpc->mpc_productid,12);
26815 + printk("Product ID: %s ",str);
26817 + printk("APIC at: 0x%X\n",mpc->mpc_lapic);
26819 + /* save the local APIC address, it might be non-default */
26821 + mp_lapic_addr = mpc->mpc_lapic;
26824 + * Now process the configuration blocks.
26826 + while (count < mpc->mpc_length) {
26828 + case MP_PROCESSOR:
26830 + struct mpc_config_processor *m=
26831 + (struct mpc_config_processor *)mpt;
26833 + MP_processor_info(m);
26834 + mpt += sizeof(*m);
26835 + count += sizeof(*m);
26840 + struct mpc_config_bus *m=
26841 + (struct mpc_config_bus *)mpt;
26843 + mpt += sizeof(*m);
26844 + count += sizeof(*m);
26849 + struct mpc_config_ioapic *m=
26850 + (struct mpc_config_ioapic *)mpt;
26851 + MP_ioapic_info(m);
26853 + count+=sizeof(*m);
26858 + struct mpc_config_intsrc *m=
26859 + (struct mpc_config_intsrc *)mpt;
26861 + MP_intsrc_info(m);
26863 + count+=sizeof(*m);
26868 + struct mpc_config_lintsrc *m=
26869 + (struct mpc_config_lintsrc *)mpt;
26870 + MP_lintsrc_info(m);
26872 + count+=sizeof(*m);
26877 + clustered_apic_check();
26878 + if (!num_processors)
26879 + printk(KERN_ERR "SMP mptable: no processors registered!\n");
26880 + return num_processors;
26883 +static int __init ELCR_trigger(unsigned int irq)
26885 + unsigned int port;
26887 + port = 0x4d0 + (irq >> 3);
26888 + return (inb(port) >> (irq & 7)) & 1;
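ELCR_trigger() above reads the chipset's Edge/Level Control Registers, one bit per ISA IRQ split across ports 0x4d0 (IRQ 0-7) and 0x4d1 (IRQ 8-15), a set bit meaning level triggered; the index math on its own:

        #include <stdio.h>

        int main(void)
        {
                unsigned int irq;

                for (irq = 0; irq < 16; irq++)
                        printf("IRQ %2u -> port 0x%x bit %u\n",
                               irq, 0x4d0 + (irq >> 3), irq & 7);
                return 0;
        }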
26891 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
26893 + struct mpc_config_intsrc intsrc;
26895 + int ELCR_fallback = 0;
26897 + intsrc.mpc_type = MP_INTSRC;
26898 + intsrc.mpc_irqflag = 0; /* conforming */
26899 + intsrc.mpc_srcbus = 0;
26900 + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
26902 + intsrc.mpc_irqtype = mp_INT;
26905 + * If true, we have an ISA/PCI system with no IRQ entries
26906 + * in the MP table. To prevent the PCI interrupts from being set up
26907 + * incorrectly, we try to use the ELCR. The sanity check to see if
26908 + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
26909 + * never be level sensitive, so we simply see if the ELCR agrees.
26910 + * If it does, we assume it's valid.
26912 + if (mpc_default_type == 5) {
26913 + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
26915 + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
26916 + printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
26918 + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
26919 + ELCR_fallback = 1;
26923 + for (i = 0; i < 16; i++) {
26924 + switch (mpc_default_type) {
26926 + if (i == 0 || i == 13)
26927 + continue; /* IRQ0 & IRQ13 not connected */
26928 + /* fall through */
26931 + continue; /* IRQ2 is never connected */
26934 + if (ELCR_fallback) {
26936 + * If the ELCR indicates a level-sensitive interrupt, we
26937 + * copy that information over to the MP table in the
26938 + * irqflag field (level sensitive, active high polarity).
26940 + if (ELCR_trigger(i))
26941 + intsrc.mpc_irqflag = 13;
26943 + intsrc.mpc_irqflag = 0;
26946 + intsrc.mpc_srcbusirq = i;
26947 + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
26948 + MP_intsrc_info(&intsrc);
26951 + intsrc.mpc_irqtype = mp_ExtINT;
26952 + intsrc.mpc_srcbusirq = 0;
26953 + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
26954 + MP_intsrc_info(&intsrc);
26957 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
26959 + struct mpc_config_processor processor;
26960 + struct mpc_config_bus bus;
26961 + struct mpc_config_ioapic ioapic;
26962 + struct mpc_config_lintsrc lintsrc;
26963 + int linttypes[2] = { mp_ExtINT, mp_NMI };
26967 + * local APIC has default address
26969 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
26972 + * 2 CPUs, numbered 0 & 1.
26974 + processor.mpc_type = MP_PROCESSOR;
26975 + /* Either an integrated APIC or a discrete 82489DX. */
26976 + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
26977 + processor.mpc_cpuflag = CPU_ENABLED;
26978 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
26979 + (boot_cpu_data.x86_model << 4) |
26980 + boot_cpu_data.x86_mask;
26981 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
26982 + processor.mpc_reserved[0] = 0;
26983 + processor.mpc_reserved[1] = 0;
26984 + for (i = 0; i < 2; i++) {
26985 + processor.mpc_apicid = i;
26986 + MP_processor_info(&processor);
26989 + bus.mpc_type = MP_BUS;
26990 + bus.mpc_busid = 0;
26991 + switch (mpc_default_type) {
26993 + printk(KERN_ERR "???\nUnknown standard configuration %d\n",
26994 + mpc_default_type);
26995 + /* fall through */
26998 + memcpy(bus.mpc_bustype, "ISA ", 6);
27003 + memcpy(bus.mpc_bustype, "EISA ", 6);
27007 + memcpy(bus.mpc_bustype, "MCA ", 6);
27009 + MP_bus_info(&bus);
27010 + if (mpc_default_type > 4) {
27011 + bus.mpc_busid = 1;
27012 + memcpy(bus.mpc_bustype, "PCI ", 6);
27013 + MP_bus_info(&bus);
27016 + ioapic.mpc_type = MP_IOAPIC;
27017 + ioapic.mpc_apicid = 2;
27018 + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
27019 + ioapic.mpc_flags = MPC_APIC_USABLE;
27020 + ioapic.mpc_apicaddr = 0xFEC00000;
27021 + MP_ioapic_info(&ioapic);
27024 + * We set up most of the low 16 IO-APIC pins according to MPS rules.
27026 + construct_default_ioirq_mptable(mpc_default_type);
27028 + lintsrc.mpc_type = MP_LINTSRC;
27029 + lintsrc.mpc_irqflag = 0; /* conforming */
27030 + lintsrc.mpc_srcbusid = 0;
27031 + lintsrc.mpc_srcbusirq = 0;
27032 + lintsrc.mpc_destapic = MP_APIC_ALL;
27033 + for (i = 0; i < 2; i++) {
27034 + lintsrc.mpc_irqtype = linttypes[i];
27035 + lintsrc.mpc_destapiclint = i;
27036 + MP_lintsrc_info(&lintsrc);
27040 +static struct intel_mp_floating *mpf_found;
27043 + * Scan the memory blocks for an SMP configuration block.
27045 +void __init get_smp_config (void)
27047 + struct intel_mp_floating *mpf = mpf_found;
27050 + * ACPI supports both logical (e.g. Hyper-Threading) and physical
27051 + * processors, where MPS only supports physical.
27053 + if (acpi_lapic && acpi_ioapic) {
27054 + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
27057 + else if (acpi_lapic)
27058 + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
27060 + printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
27061 + if (mpf->mpf_feature2 & (1<<7)) {
27062 + printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
27065 + printk(KERN_INFO " Virtual Wire compatibility mode.\n");
27070 + * Now see if we need to read further.
27072 + if (mpf->mpf_feature1 != 0) {
27074 + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
27075 + construct_default_ISA_mptable(mpf->mpf_feature1);
27077 + } else if (mpf->mpf_physptr) {
27080 + * Read the physical hardware table. Anything here will
27081 + * override the defaults.
27083 + if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
27084 + smp_found_config = 0;
27085 + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
27086 + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
27090 + * If there are no explicit MP IRQ entries, then we are
27091 + * broken. We set up most of the low 16 IO-APIC pins to
27092 + * ISA defaults and hope it will work.
27094 + if (!mp_irq_entries) {
27095 + struct mpc_config_bus bus;
27097 + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
27099 + bus.mpc_type = MP_BUS;
27100 + bus.mpc_busid = 0;
27101 + memcpy(bus.mpc_bustype, "ISA ", 6);
27102 + MP_bus_info(&bus);
27104 + construct_default_ioirq_mptable(0);
27110 + printk(KERN_INFO "Processors: %d\n", num_processors);
27112 + * Only use the first configuration found.
27116 +static int __init smp_scan_config (unsigned long base, unsigned long length)
27118 + extern void __bad_mpf_size(void);
27119 + unsigned int *bp = isa_bus_to_virt(base);
27120 + struct intel_mp_floating *mpf;
27122 + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
27123 + if (sizeof(*mpf) != 16)
27124 + __bad_mpf_size();
27126 + while (length > 0) {
27127 + mpf = (struct intel_mp_floating *)bp;
27128 + if ((*bp == SMP_MAGIC_IDENT) &&
27129 + (mpf->mpf_length == 1) &&
27130 + !mpf_checksum((unsigned char *)bp, 16) &&
27131 + ((mpf->mpf_specification == 1)
27132 + || (mpf->mpf_specification == 4)) ) {
27134 + smp_found_config = 1;
27144 +void __init find_intel_smp (void)
27146 + unsigned int address;
27149 + * FIXME: Linux assumes you have 640K of base ram..
27150 + * this continues the error...
27152 + * 1) Scan the bottom 1K for a signature
27153 + * 2) Scan the top 1K of base RAM
27154 + * 3) Scan the 64K of bios
27156 + if (smp_scan_config(0x0,0x400) ||
27157 + smp_scan_config(639*0x400,0x400) ||
27158 + smp_scan_config(0xF0000,0x10000))
27161 + * If it is an SMP machine we should know now, unless the
27162 + * configuration is in an EISA/MCA bus machine with an
27163 + * extended bios data area.
27165 + * there is a real-mode segmented pointer pointing to the
27166 + * 4K EBDA area at 0x40E, calculate and scan it here.
27168 + * NOTE! There are Linux loaders that will corrupt the EBDA
27169 + * area, and as such this kind of SMP config may be less
27170 + * trustworthy, simply because the SMP table may have been
27171 + * stomped on during early boot. These loaders are buggy and
27172 + * should be fixed.
27175 + address = *(unsigned short *)phys_to_virt(0x40E);
27177 + if (smp_scan_config(address, 0x1000))
27180 + /* If we have come this far, we did not find an MP table */
27181 + printk(KERN_INFO "No mptable found.\n");
27185 + * - Intel MP Configuration Table
27187 +void __init find_smp_config (void)
27189 +#ifdef CONFIG_X86_LOCAL_APIC
27190 + find_intel_smp();
27195 +/* --------------------------------------------------------------------------
27196 + ACPI-based MP Configuration
27197 + -------------------------------------------------------------------------- */
27199 +#ifdef CONFIG_ACPI
27201 +void __init mp_register_lapic_address (
27204 +#ifndef CONFIG_XEN
27205 + mp_lapic_addr = (unsigned long) address;
27207 + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
27209 + if (boot_cpu_id == -1U)
27210 + boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
27212 + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
27217 +void __cpuinit mp_register_lapic (
27221 + struct mpc_config_processor processor;
27222 + int boot_cpu = 0;
27224 + if (id >= MAX_APICS) {
27225 + printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
27230 + if (id == boot_cpu_physical_apicid)
27233 +#ifndef CONFIG_XEN
27234 + processor.mpc_type = MP_PROCESSOR;
27235 + processor.mpc_apicid = id;
27236 + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
27237 + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
27238 + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
27239 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
27240 + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
27241 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
27242 + processor.mpc_reserved[0] = 0;
27243 + processor.mpc_reserved[1] = 0;
27246 + MP_processor_info(&processor);
27249 +#ifdef CONFIG_X86_IO_APIC
27251 +#define MP_ISA_BUS 0
27252 +#define MP_MAX_IOAPIC_PIN 127
27254 +static struct mp_ioapic_routing {
27258 + u32 pin_programmed[4];
27259 +} mp_ioapic_routing[MAX_IO_APICS];
27262 +static int mp_find_ioapic (
27267 + /* Find the IOAPIC that manages this GSI. */
27268 + for (i = 0; i < nr_ioapics; i++) {
27269 + if ((gsi >= mp_ioapic_routing[i].gsi_start)
27270 + && (gsi <= mp_ioapic_routing[i].gsi_end))
27274 + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
27280 +void __init mp_register_ioapic (
27287 + if (nr_ioapics >= MAX_IO_APICS) {
27288 + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
27289 + "(found %d)\n", MAX_IO_APICS, nr_ioapics);
27290 + panic("Recompile kernel with bigger MAX_IO_APICS!\n");
27293 + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
27294 + " found in MADT table, skipping!\n");
27298 + idx = nr_ioapics++;
27300 + mp_ioapics[idx].mpc_type = MP_IOAPIC;
27301 + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
27302 + mp_ioapics[idx].mpc_apicaddr = address;
27304 +#ifndef CONFIG_XEN
27305 + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
27307 + mp_ioapics[idx].mpc_apicid = id;
27308 + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
27311 + * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
27312 + * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
27314 + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
27315 + mp_ioapic_routing[idx].gsi_start = gsi_base;
27316 + mp_ioapic_routing[idx].gsi_end = gsi_base +
27317 + io_apic_get_redir_entries(idx);
27319 + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
27320 + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
27321 + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
27322 + mp_ioapic_routing[idx].gsi_start,
27323 + mp_ioapic_routing[idx].gsi_end);
27329 +void __init mp_override_legacy_irq (
27335 + struct mpc_config_intsrc intsrc;
27340 + * Convert 'gsi' to 'ioapic.pin'.
27342 + ioapic = mp_find_ioapic(gsi);
27345 + pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
27348 + * TBD: This check is for faulty timer entries, where the override
27349 + * erroneously sets the trigger to level, resulting in a HUGE
27350 + * increase of timer interrupts!
27352 + if ((bus_irq == 0) && (trigger == 3))
27355 + intsrc.mpc_type = MP_INTSRC;
27356 + intsrc.mpc_irqtype = mp_INT;
27357 + intsrc.mpc_irqflag = (trigger << 2) | polarity;
27358 + intsrc.mpc_srcbus = MP_ISA_BUS;
27359 + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
27360 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
27361 + intsrc.mpc_dstirq = pin; /* INTIN# */
27363 + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
27364 + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
27365 + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
27366 + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
27368 + mp_irqs[mp_irq_entries] = intsrc;
27369 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
27370 + panic("Max # of irq sources exceeded!\n");
27376 +void __init mp_config_acpi_legacy_irqs (void)
27378 + struct mpc_config_intsrc intsrc;
27383 + * Fabricate the legacy ISA bus (bus #31).
27385 + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
27386 + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
27389 + * Locate the IOAPIC that manages the ISA IRQs (0-15).
27391 + ioapic = mp_find_ioapic(0);
27395 + intsrc.mpc_type = MP_INTSRC;
27396 + intsrc.mpc_irqflag = 0; /* Conforming */
27397 + intsrc.mpc_srcbus = MP_ISA_BUS;
27398 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
27401 + * Use the default configuration for the IRQs 0-15. Unless
27402 + * overridden by (MADT) interrupt source override entries.
27404 + for (i = 0; i < 16; i++) {
27407 + for (idx = 0; idx < mp_irq_entries; idx++) {
27408 + struct mpc_config_intsrc *irq = mp_irqs + idx;
27410 + /* Do we already have a mapping for this ISA IRQ? */
27411 + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
27414 + /* Do we already have a mapping for this IOAPIC pin */
27415 + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
27416 + (irq->mpc_dstirq == i))
27420 + if (idx != mp_irq_entries) {
27421 + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
27422 + continue; /* IRQ already used */
27425 + intsrc.mpc_irqtype = mp_INT;
27426 + intsrc.mpc_srcbusirq = i; /* Identity mapped */
27427 + intsrc.mpc_dstirq = i;
27429 + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
27430 + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
27431 + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
27432 + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
27433 + intsrc.mpc_dstirq);
27435 + mp_irqs[mp_irq_entries] = intsrc;
27436 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
27437 + panic("Max # of irq sources exceeded!\n");
27443 +#define MAX_GSI_NUM 4096
27445 +int mp_register_gsi(u32 gsi, int triggering, int polarity)
27448 + int ioapic_pin = 0;
27449 + int idx, bit = 0;
27450 + static int pci_irq = 16;
27452 + * Mapping between Global System Interrupts, which
27453 + * represent all possible interrupts, to the IRQs
27454 + * assigned to actual devices.
27456 + static int gsi_to_irq[MAX_GSI_NUM];
27458 + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
27461 + /* Don't set up the ACPI SCI because it's already set up */
27462 + if (acpi_fadt.sci_int == gsi)
27465 + ioapic = mp_find_ioapic(gsi);
27466 + if (ioapic < 0) {
27467 + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
27471 + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
27474 + * Avoid pin reprogramming. PRTs typically include entries
27475 + * with redundant pin->gsi mappings (but unique PCI devices);
27476 + * we only program the IOAPIC on the first.
27478 + bit = ioapic_pin % 32;
27479 + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
27481 + printk(KERN_ERR "Invalid reference to IOAPIC pin "
27482 + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
27486 + if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
27487 + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
27488 + mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
27489 + return gsi_to_irq[gsi];
27492 + mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
27494 + if (triggering == ACPI_LEVEL_SENSITIVE) {
27496 + * For PCI devices assign IRQs in order, avoiding gaps
27497 + * due to unused I/O APIC pins.
27500 + if (gsi < MAX_GSI_NUM) {
27502 + * Retain the VIA chipset work-around (gsi > 15), but
27503 + * avoid a problem where the 8254 timer (IRQ0) is setup
27504 + * via an override (so it's not on pin 0 of the ioapic),
27505 + * and at the same time, the pin 0 interrupt is a PCI
27506 + * type. The gsi > 15 test could cause these two pins
27507 + * to be shared as IRQ0, and they are not shareable.
27508 + * So test for this condition, and if necessary, avoid
27509 + * the pin collision.
27511 + if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
27514 + * Don't assign IRQ used by ACPI SCI
27516 + if (gsi == acpi_fadt.sci_int)
27518 + gsi_to_irq[irq] = gsi;
27520 + printk(KERN_ERR "GSI %u is too high\n", gsi);
27525 + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
27526 + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
27527 + polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
27531 +#endif /*CONFIG_X86_IO_APIC*/
27532 +#endif /*CONFIG_ACPI*/
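The pin_programmed[] bookkeeping in mp_register_gsi() is a plain bitmap over up to 128 pins per IOAPIC; a sketch of the index/bit split and the test-then-mark step:

        #include <stdio.h>

        static unsigned int pin_programmed[4];  /* 4 x 32 bits = 128 pins */

        /* Returns 1 if the pin was already programmed, else marks it. */
        static int test_and_mark_pin(int pin)
        {
                int idx = pin / 32, bit = pin % 32;

                if (pin_programmed[idx] & (1u << bit))
                        return 1;
                pin_programmed[idx] |= 1u << bit;
                return 0;
        }

        int main(void)
        {
                /* Second call on the same pin is refused: prints "0 1". */
                printf("%d %d\n", test_and_mark_pin(35), test_and_mark_pin(35));
                return 0;
        }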
27533 Index: head-2008-11-25/arch/x86/kernel/process_64-xen.c
27534 ===================================================================
27535 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
27536 +++ head-2008-11-25/arch/x86/kernel/process_64-xen.c 2008-08-07 12:44:36.000000000 +0200
27539 + * linux/arch/x86-64/kernel/process.c
27541 + * Copyright (C) 1995 Linus Torvalds
27543 + * Pentium III FXSR, SSE support
27544 + * Gareth Hughes <gareth@valinux.com>, May 2000
27549 + * CPU hotplug support - ashok.raj@intel.com
27551 + * Jun Nakajima <jun.nakajima@intel.com>
27552 + * Modified for Xen
27556 + * This file handles the architecture-dependent parts of process handling..
27559 +#include <stdarg.h>
27561 +#include <linux/cpu.h>
27562 +#include <linux/errno.h>
27563 +#include <linux/sched.h>
27564 +#include <linux/kernel.h>
27565 +#include <linux/mm.h>
27566 +#include <linux/elfcore.h>
27567 +#include <linux/smp.h>
27568 +#include <linux/slab.h>
27569 +#include <linux/user.h>
27570 +#include <linux/module.h>
27571 +#include <linux/a.out.h>
27572 +#include <linux/interrupt.h>
27573 +#include <linux/delay.h>
27574 +#include <linux/ptrace.h>
27575 +#include <linux/utsname.h>
27576 +#include <linux/random.h>
27577 +#include <linux/notifier.h>
27578 +#include <linux/kprobes.h>
27580 +#include <asm/uaccess.h>
27581 +#include <asm/pgtable.h>
27582 +#include <asm/system.h>
27583 +#include <asm/io.h>
27584 +#include <asm/processor.h>
27585 +#include <asm/i387.h>
27586 +#include <asm/mmu_context.h>
27587 +#include <asm/pda.h>
27588 +#include <asm/prctl.h>
27589 +#include <asm/kdebug.h>
27590 +#include <xen/interface/platform.h>
27591 +#include <xen/interface/physdev.h>
27592 +#include <xen/interface/vcpu.h>
27593 +#include <asm/desc.h>
27594 +#include <asm/proto.h>
27595 +#include <asm/hardirq.h>
27596 +#include <asm/ia32.h>
27597 +#include <asm/idle.h>
27599 +#include <xen/cpu_hotplug.h>
27601 +asmlinkage extern void ret_from_fork(void);
27603 +unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
27605 +unsigned long boot_option_idle_override = 0;
27606 +EXPORT_SYMBOL(boot_option_idle_override);
27609 + * Power management idle function, if any.
27611 +void (*pm_idle)(void);
27612 +EXPORT_SYMBOL(pm_idle);
27613 +static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
27615 +static ATOMIC_NOTIFIER_HEAD(idle_notifier);
27617 +void idle_notifier_register(struct notifier_block *n)
27619 + atomic_notifier_chain_register(&idle_notifier, n);
27621 +EXPORT_SYMBOL_GPL(idle_notifier_register);
27623 +void idle_notifier_unregister(struct notifier_block *n)
27625 + atomic_notifier_chain_unregister(&idle_notifier, n);
27627 +EXPORT_SYMBOL(idle_notifier_unregister);
27629 +enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
27630 +static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
27632 +void enter_idle(void)
27634 + __get_cpu_var(idle_state) = CPU_IDLE;
27635 + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
27638 +static void __exit_idle(void)
27640 + __get_cpu_var(idle_state) = CPU_NOT_IDLE;
27641 + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
27644 +/* Called from interrupts to signify idle end */
27645 +void exit_idle(void)
27647 + if (current->pid | read_pda(irqcount))
27653 + * On SMP it's slightly faster (but much more power-consuming!)
27654 + * to poll the ->need_resched flag instead of waiting for the
27655 + * cross-CPU IPI to arrive. Use this option with caution.
27657 +static void poll_idle (void)
27659 + local_irq_enable();
27667 + "i" (_TIF_NEED_RESCHED),
27668 + "m" (current_thread_info()->flags));
27671 +static void xen_idle(void)
27673 + local_irq_disable();
27675 + if (need_resched())
27676 + local_irq_enable();
27678 + current_thread_info()->status &= ~TS_POLLING;
27679 + smp_mb__after_clear_bit();
27681 + current_thread_info()->status |= TS_POLLING;
27685 +#ifdef CONFIG_HOTPLUG_CPU
27686 +static inline void play_dead(void)
27688 + idle_task_exit();
27689 + local_irq_disable();
27690 + cpu_clear(smp_processor_id(), cpu_initialized);
27691 + preempt_enable_no_resched();
27692 + VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
27696 +static inline void play_dead(void)
27700 +#endif /* CONFIG_HOTPLUG_CPU */
27703 + * The idle thread. There's no useful work to be
27704 + * done, so just try to conserve power and have a
27705 + * low exit latency (ie sit in a loop waiting for
27706 + * somebody to say that they'd like to reschedule)
27708 +void cpu_idle (void)
27710 + current_thread_info()->status |= TS_POLLING;
27711 + /* endless idle loop with no priority at all */
27713 + while (!need_resched()) {
27714 + void (*idle)(void);
27716 + if (__get_cpu_var(cpu_idle_state))
27717 + __get_cpu_var(cpu_idle_state) = 0;
27719 + idle = xen_idle; /* no alternatives */
27720 + if (cpu_is_offline(smp_processor_id()))
27727 + preempt_enable_no_resched();
27729 + preempt_disable();
27733 +void cpu_idle_wait(void)
27735 + unsigned int cpu, this_cpu = get_cpu();
27738 + set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
27742 + for_each_online_cpu(cpu) {
27743 + per_cpu(cpu_idle_state, cpu) = 1;
27744 + cpu_set(cpu, map);
27747 + __get_cpu_var(cpu_idle_state) = 0;
27752 + for_each_online_cpu(cpu) {
27753 + if (cpu_isset(cpu, map) &&
27754 + !per_cpu(cpu_idle_state, cpu))
27755 + cpu_clear(cpu, map);
27757 + cpus_and(map, map, cpu_online_map);
27758 + } while (!cpus_empty(map));
27760 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
27762 +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
27766 +static int __init idle_setup (char *str)
27768 + if (!strncmp(str, "poll", 4)) {
27769 + printk("using polling idle threads.\n");
27770 + pm_idle = poll_idle;
27773 + boot_option_idle_override = 1;
27777 +__setup("idle=", idle_setup);
27779 +/* Prints also some state that isn't saved in the pt_regs */
27780 +void __show_regs(struct pt_regs * regs)
27782 + unsigned long fs, gs, shadowgs;
27783 + unsigned int fsindex,gsindex;
27784 + unsigned int ds,cs,es;
27788 + printk("Pid: %d, comm: %.20s %s %s %.*s\n",
27789 + current->pid, current->comm, print_tainted(),
27790 + system_utsname.release,
27791 + (int)strcspn(system_utsname.version, " "),
27792 + system_utsname.version);
27793 + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
27794 + printk_address(regs->rip);
27795 + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
27797 + printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
27798 + regs->rax, regs->rbx, regs->rcx);
27799 + printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
27800 + regs->rdx, regs->rsi, regs->rdi);
27801 + printk("RBP: %016lx R08: %016lx R09: %016lx\n",
27802 + regs->rbp, regs->r8, regs->r9);
27803 + printk("R10: %016lx R11: %016lx R12: %016lx\n",
27804 + regs->r10, regs->r11, regs->r12);
27805 + printk("R13: %016lx R14: %016lx R15: %016lx\n",
27806 + regs->r13, regs->r14, regs->r15);
27808 + asm("mov %%ds,%0" : "=r" (ds));
27809 + asm("mov %%cs,%0" : "=r" (cs));
27810 + asm("mov %%es,%0" : "=r" (es));
27811 + asm("mov %%fs,%0" : "=r" (fsindex));
27812 + asm("mov %%gs,%0" : "=r" (gsindex));
27814 + rdmsrl(MSR_FS_BASE, fs);
27815 + rdmsrl(MSR_GS_BASE, gs);
27816 + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
27818 + printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
27819 + fs,fsindex,gs,gsindex,shadowgs);
27820 + printk("CS: %04x DS: %04x ES: %04x\n", cs, ds, es);
27824 +void show_regs(struct pt_regs *regs)
27826 + printk("CPU %d:", smp_processor_id());
27827 + __show_regs(regs);
27828 + show_trace(NULL, regs, (void *)(regs + 1));
27832 + * Free current thread data structures etc..
27834 +void exit_thread(void)
27836 + struct task_struct *me = current;
27837 + struct thread_struct *t = &me->thread;
27839 + if (me->thread.io_bitmap_ptr) {
27840 +#ifndef CONFIG_X86_NO_TSS
27841 + struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
27844 + struct physdev_set_iobitmap iobmp_op;
27845 + memset(&iobmp_op, 0, sizeof(iobmp_op));
27848 + kfree(t->io_bitmap_ptr);
27849 + t->io_bitmap_ptr = NULL;
27851 + * Careful, clear this in the TSS too:
27853 +#ifndef CONFIG_X86_NO_TSS
27854 + memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
27858 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
27861 + t->io_bitmap_max = 0;
27865 +void load_gs_index(unsigned gs)
27867 + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
27870 +void flush_thread(void)
27872 + struct task_struct *tsk = current;
27873 + struct thread_info *t = current_thread_info();
27875 + if (t->flags & _TIF_ABI_PENDING) {
27876 + t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
27877 + if (t->flags & _TIF_IA32)
27878 + current_thread_info()->status |= TS_COMPAT;
27881 + tsk->thread.debugreg0 = 0;
27882 + tsk->thread.debugreg1 = 0;
27883 + tsk->thread.debugreg2 = 0;
27884 + tsk->thread.debugreg3 = 0;
27885 + tsk->thread.debugreg6 = 0;
27886 + tsk->thread.debugreg7 = 0;
27887 + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
27889 + * Forget coprocessor state..
27892 + clear_used_math();
27895 +void release_thread(struct task_struct *dead_task)
27897 + if (dead_task->mm) {
27898 + if (dead_task->mm->context.size) {
27899 + printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
27901 + dead_task->mm->context.ldt,
27902 + dead_task->mm->context.size);
27908 +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
27910 + struct user_desc ud = {
27911 + .base_addr = addr,
27912 + .limit = 0xfffff,
27914 + .limit_in_pages = 1,
27917 + struct n_desc_struct *desc = (void *)t->thread.tls_array;
27919 + desc->a = LDT_entry_a(&ud);
27920 + desc->b = LDT_entry_b(&ud);
27923 +static inline u32 read_32bit_tls(struct task_struct *t, int tls)
27925 + struct desc_struct *desc = (void *)t->thread.tls_array;
27927 + return desc->base0 |
27928 + (((u32)desc->base1) << 16) |
27929 + (((u32)desc->base2) << 24);
27933 + * This gets called before we allocate a new thread and copy
27934 + * the current task into it.
27936 +void prepare_to_copy(struct task_struct *tsk)
27941 +int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
27942 + unsigned long unused,
27943 + struct task_struct * p, struct pt_regs * regs)
27946 + struct pt_regs * childregs;
27947 + struct task_struct *me = current;
27949 + childregs = ((struct pt_regs *)
27950 + (THREAD_SIZE + task_stack_page(p))) - 1;
27951 + *childregs = *regs;
27953 + childregs->rax = 0;
27954 + childregs->rsp = rsp;
27956 + childregs->rsp = (unsigned long)childregs;
27958 + p->thread.rsp = (unsigned long) childregs;
27959 + p->thread.rsp0 = (unsigned long) (childregs+1);
27960 + p->thread.userrsp = me->thread.userrsp;
27962 + set_tsk_thread_flag(p, TIF_FORK);
27964 + p->thread.fs = me->thread.fs;
27965 + p->thread.gs = me->thread.gs;
27967 + asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
27968 + asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
27969 + asm("mov %%es,%0" : "=m" (p->thread.es));
27970 + asm("mov %%ds,%0" : "=m" (p->thread.ds));
27972 + if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
27973 + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
27974 + if (!p->thread.io_bitmap_ptr) {
27975 + p->thread.io_bitmap_max = 0;
27978 + memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
27979 + IO_BITMAP_BYTES);
27983 + * Set a new TLS for the child thread?
27985 + if (clone_flags & CLONE_SETTLS) {
27986 +#ifdef CONFIG_IA32_EMULATION
27987 + if (test_thread_flag(TIF_IA32))
27988 + err = ia32_child_tls(p, childregs);
27991 + err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
27995 + p->thread.iopl = current->thread.iopl;
27999 + if (err && p->thread.io_bitmap_ptr) {
28000 + kfree(p->thread.io_bitmap_ptr);
28001 + p->thread.io_bitmap_max = 0;
28006 +static inline void __save_init_fpu( struct task_struct *tsk )
28008 + asm volatile( "rex64 ; fxsave %0 ; fnclex"
28009 + : "=m" (tsk->thread.i387.fxsave));
28010 + tsk->thread_info->status &= ~TS_USEDFPU;
28014 + * switch_to(x,y) should switch tasks from x to y.
28016 + * This could still be optimized:
28017 + * - fold all the options into a flag word and test it with a single test.
28018 + * - could test fs/gs bitsliced
28020 + * Kprobes not supported here. Set the probe on schedule instead.
28022 +__kprobes struct task_struct *
28023 +__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
28025 + struct thread_struct *prev = &prev_p->thread,
28026 + *next = &next_p->thread;
28027 + int cpu = smp_processor_id();
28028 +#ifndef CONFIG_X86_NO_TSS
28029 + struct tss_struct *tss = &per_cpu(init_tss, cpu);
28031 +#if CONFIG_XEN_COMPAT > 0x030002
28032 + struct physdev_set_iopl iopl_op;
28033 + struct physdev_set_iobitmap iobmp_op;
28035 + struct physdev_op _pdo[2], *pdo = _pdo;
28036 +#define iopl_op pdo->u.set_iopl
28037 +#define iobmp_op pdo->u.set_iobitmap
28039 + multicall_entry_t _mcl[8], *mcl = _mcl;
28042 + * This is basically '__unlazy_fpu', except that we queue a
28043 + * multicall to indicate FPU task switch, rather than
28044 + * synchronously trapping to Xen.
28045 + * The AMD workaround requires it to be after DS reload, or
28046 + * after DS has been cleared, which we do in __prepare_arch_switch.
28048 + if (prev_p->thread_info->status & TS_USEDFPU) {
28049 + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
28050 + mcl->op = __HYPERVISOR_fpu_taskswitch;
28051 + mcl->args[0] = 1;
28056 + * Reload esp0, LDT and the page table pointer:
28058 + mcl->op = __HYPERVISOR_stack_switch;
28059 + mcl->args[0] = __KERNEL_DS;
28060 + mcl->args[1] = next->rsp0;
28064 + * Load the per-thread Thread-Local Storage descriptor.
28065 + * This is load_TLS(next, cpu) with multicalls.
28067 +#define C(i) do { \
28068 + if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \
28069 + mcl->op = __HYPERVISOR_update_descriptor; \
28070 + mcl->args[0] = virt_to_machine( \
28071 + &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \
28072 + mcl->args[1] = next->tls_array[i]; \
28076 + C(0); C(1); C(2);
28079 + if (unlikely(prev->iopl != next->iopl)) {
28080 + iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
28081 +#if CONFIG_XEN_COMPAT > 0x030002
28082 + mcl->op = __HYPERVISOR_physdev_op;
28083 + mcl->args[0] = PHYSDEVOP_set_iopl;
28084 + mcl->args[1] = (unsigned long)&iopl_op;
28086 + mcl->op = __HYPERVISOR_physdev_op_compat;
28087 + pdo->cmd = PHYSDEVOP_set_iopl;
28088 + mcl->args[0] = (unsigned long)pdo++;
28093 + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
28094 + set_xen_guest_handle(iobmp_op.bitmap,
28095 + (char *)next->io_bitmap_ptr);
28096 + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
28097 +#if CONFIG_XEN_COMPAT > 0x030002
28098 + mcl->op = __HYPERVISOR_physdev_op;
28099 + mcl->args[0] = PHYSDEVOP_set_iobitmap;
28100 + mcl->args[1] = (unsigned long)&iobmp_op;
28102 + mcl->op = __HYPERVISOR_physdev_op_compat;
28103 + pdo->cmd = PHYSDEVOP_set_iobitmap;
28104 + mcl->args[0] = (unsigned long)pdo++;
28109 +#if CONFIG_XEN_COMPAT <= 0x030002
28110 + BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo));
28112 + BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
28113 + if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
28117 + * Switch DS and ES.
28118 + * This won't pick up thread selector changes, but I guess that is ok.
28120 + if (unlikely(next->es))
28121 + loadsegment(es, next->es);
28123 + if (unlikely(next->ds))
28124 + loadsegment(ds, next->ds);
28127 + * Switch FS and GS.
28129 + if (unlikely(next->fsindex))
28130 + loadsegment(fs, next->fsindex);
28133 + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs));
28135 + if (unlikely(next->gsindex))
28136 + load_gs_index(next->gsindex);
28139 + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs));
28142 + * Switch the PDA context.
28144 + prev->userrsp = read_pda(oldrsp);
28145 + write_pda(oldrsp, next->userrsp);
28146 + write_pda(pcurrent, next_p);
28147 + write_pda(kernelstack,
28148 + task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
28151 + * Now maybe reload the debug registers
28153 + if (unlikely(next->debugreg7)) {
28154 + set_debugreg(next->debugreg0, 0);
28155 + set_debugreg(next->debugreg1, 1);
28156 + set_debugreg(next->debugreg2, 2);
28157 + set_debugreg(next->debugreg3, 3);
28159 + set_debugreg(next->debugreg6, 6);
28160 + set_debugreg(next->debugreg7, 7);
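The context-switch path above queues the FPU switch, stack switch, TLS updates and I/O privilege changes as multicall entries and traps into Xen once; a much-reduced sketch of that queue-then-flush shape (struct and field names simplified here, not Xen's ABI):

        #include <stdio.h>

        /* Simplified stand-in for Xen's multicall_entry_t. */
        struct mc_entry {
                unsigned long op;
                unsigned long args[2];
        };

        static void issue_batch(const struct mc_entry *mc, int n)
        {
                int i;

                /* The real code hands the array to HYPERVISOR_multicall_check();
                 * here we only show what was queued. */
                for (i = 0; i < n; i++)
                        printf("op %lu args %lu %lu\n",
                               mc[i].op, mc[i].args[0], mc[i].args[1]);
        }

        int main(void)
        {
                struct mc_entry batch[8], *mcl = batch;

                mcl->op = 1; mcl->args[0] = 1; mcl->args[1] = 0; mcl++; /* fpu switch */
                mcl->op = 2; mcl->args[0] = 0; mcl->args[1] = 0; mcl++; /* stack switch */
                issue_batch(batch, mcl - batch);        /* one trap instead of two */
                return 0;
        }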
28167 + * sys_execve() executes a new program.
28170 +long sys_execve(char __user *name, char __user * __user *argv,
28171 + char __user * __user *envp, struct pt_regs regs)
28176 + filename = getname(name);
28177 + error = PTR_ERR(filename);
28178 + if (IS_ERR(filename))
28180 + error = do_execve(filename, argv, envp, &regs);
28181 + if (error == 0) {
28182 + task_lock(current);
28183 + current->ptrace &= ~PT_DTRACE;
28184 + task_unlock(current);
28186 + putname(filename);
28190 +void set_personality_64bit(void)
28192 + /* inherit personality from parent */
28194 + /* Make sure to be in 64bit mode */
28195 + clear_thread_flag(TIF_IA32);
28197 + /* TBD: overwrites user setup. Should have two bits.
28198 + But 64bit processes have always behaved this way,
28199 + so it's not too bad. The main problem is just that
28200 + 32bit children are affected again. */
28201 + current->personality &= ~READ_IMPLIES_EXEC;
28204 +asmlinkage long sys_fork(struct pt_regs *regs)
28206 + return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
28210 +sys_clone(unsigned long clone_flags, unsigned long newsp,
28211 + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
28214 + newsp = regs->rsp;
28215 + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
28219 + * This is trivial, and on the face of it looks like it
28220 + * could equally well be done in user mode.
28222 + * Not so, for quite unobvious reasons - register pressure.
28223 + * In user mode vfork() cannot have a stack frame, and if
28224 + * done by calling the "clone()" system call directly, you
28225 + * do not have enough call-clobbered registers to hold all
28226 + * the information you need.
28228 +asmlinkage long sys_vfork(struct pt_regs *regs)
28230 + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
28234 +unsigned long get_wchan(struct task_struct *p)
28236 + unsigned long stack;
28240 + if (!p || p == current || p->state==TASK_RUNNING)
28242 + stack = (unsigned long)task_stack_page(p);
28243 + if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
28245 + fp = *(u64 *)(p->thread.rsp);
28247 + if (fp < (unsigned long)stack ||
28248 + fp > (unsigned long)stack+THREAD_SIZE)
28250 + rip = *(u64 *)(fp+8);
28251 + if (!in_sched_functions(rip))
28254 + } while (count++ < 16);
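get_wchan() walks the sleeping task's saved frame pointers: the return rip sits 8 bytes above each saved rbp, and the saved rbp links to the caller's frame. An illustrative userspace walk of our own stack (needs frame pointers, i.e. -fno-omit-frame-pointer):

        #include <stdio.h>

        __attribute__((noinline)) static void show_callers(void)
        {
                unsigned long *fp = __builtin_frame_address(0);
                int depth;

                for (depth = 0; depth < 2 && fp; depth++) {
                        printf("frame %d: return address %p\n",
                               depth, (void *)fp[1]);   /* rip at fp+8 */
                        fp = (unsigned long *)fp[0];    /* caller's rbp */
                }
        }

        int main(void)
        {
                show_callers();
                return 0;
        }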
28258 +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
28261 + int doit = task == current;
28265 + case ARCH_SET_GS:
28266 + if (addr >= TASK_SIZE_OF(task))
28269 + /* handle small bases via the GDT because that's faster to
28271 + if (addr <= 0xffffffff) {
28272 + set_32bit_tls(task, GS_TLS, addr);
28274 + load_TLS(&task->thread, cpu);
28275 + load_gs_index(GS_TLS_SEL);
28277 + task->thread.gsindex = GS_TLS_SEL;
28278 + task->thread.gs = 0;
28280 + task->thread.gsindex = 0;
28281 + task->thread.gs = addr;
28283 + load_gs_index(0);
28284 + ret = HYPERVISOR_set_segment_base(
28285 + SEGBASE_GS_USER, addr);
28290 + case ARCH_SET_FS:
28291 + /* Not strictly needed for fs, but do it for symmetry
28293 + if (addr >= TASK_SIZE_OF(task))
28296 + /* handle small bases via the GDT because that's faster to
28298 + if (addr <= 0xffffffff) {
28299 + set_32bit_tls(task, FS_TLS, addr);
28301 + load_TLS(&task->thread, cpu);
28302 + asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
28304 + task->thread.fsindex = FS_TLS_SEL;
28305 + task->thread.fs = 0;
28307 + task->thread.fsindex = 0;
28308 + task->thread.fs = addr;
28310 + /* set the selector to 0 to not confuse
28312 + asm volatile("movl %0,%%fs" :: "r" (0));
28313 + ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
28319 + case ARCH_GET_FS: {
28320 + unsigned long base;
28321 + if (task->thread.fsindex == FS_TLS_SEL)
28322 + base = read_32bit_tls(task, FS_TLS);
28324 + rdmsrl(MSR_FS_BASE, base);
28326 + base = task->thread.fs;
28327 + ret = put_user(base, (unsigned long __user *)addr);
28330 + case ARCH_GET_GS: {
28331 + unsigned long base;
28332 + unsigned gsindex;
28333 + if (task->thread.gsindex == GS_TLS_SEL)
28334 + base = read_32bit_tls(task, GS_TLS);
28336 + asm("movl %%gs,%0" : "=r" (gsindex));
28338 + rdmsrl(MSR_KERNEL_GS_BASE, base);
28340 + base = task->thread.gs;
28343 + base = task->thread.gs;
28344 + ret = put_user(base, (unsigned long __user *)addr);
28356 +long sys_arch_prctl(int code, unsigned long addr)
28358 + return do_arch_prctl(current, code, addr);
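For illustration, a minimal userspace sketch (not part of the patch) of exercising the FS/GS base paths above via arch_prctl(2); the ARCH_* codes are the usual asm/prctl.h constants, and GS is used rather than FS because glibc reserves FS for TLS. A base below 4GiB takes the fast GDT/TLS-slot path in do_arch_prctl(), a larger one the MSR (here: hypercall) path.

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <asm/prctl.h>   /* ARCH_SET_GS, ARCH_GET_GS */

    static unsigned long gs_area[64];

    int main(void)
    {
            unsigned long base = 0;

            /* a base below 4GiB (typical for static data in a non-PIE
               binary) is served via the GS_TLS GDT slot above */
            syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)gs_area);
            syscall(SYS_arch_prctl, ARCH_GET_GS, (unsigned long)&base);
            printf("gs base = %#lx\n", base);
            return 0;
    }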
28362 + * Capture the user space registers if the task is not running (in user space)
28364 +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
28366 + struct pt_regs *pp, ptregs;
28368 + pp = task_pt_regs(tsk);
28371 + ptregs.cs &= 0xffff;
28372 + ptregs.ss &= 0xffff;
28374 + elf_core_copy_regs(regs, &ptregs);
28376 + boot_option_idle_override = 1;
28380 +unsigned long arch_align_stack(unsigned long sp)
28382 + if (randomize_va_space)
28383 + sp -= get_random_int() % 8192;
28384 + return sp & ~0xf;
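A worked instance of the alignment arithmetic above, with illustrative values: the stack top slides down by a random amount below 8 KiB and is then rounded to the 16-byte boundary the ABI requires.

    unsigned long sp = 0x7ffffffff000UL;  /* hypothetical stack top */
    sp -= 5000 % 8192;                    /* random downward slide, < 8 KiB */
    sp &= ~0xfUL;                         /* 16-byte ABI alignment */
    /* sp == 0x7fffffffdc70 */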
28386 Index: head-2008-11-25/arch/x86/kernel/setup_64-xen.c
28387 ===================================================================
28388 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
28389 +++ head-2008-11-25/arch/x86/kernel/setup_64-xen.c 2008-04-22 15:41:51.000000000 +0200
28392 + * linux/arch/x86-64/kernel/setup.c
28394 + * Copyright (C) 1995 Linus Torvalds
28396 + * Nov 2001 Dave Jones <davej@suse.de>
28397 + * Forked from i386 setup code.
28401 + * This file handles the architecture-dependent parts of initialization
28404 +#include <linux/errno.h>
28405 +#include <linux/sched.h>
28406 +#include <linux/kernel.h>
28407 +#include <linux/mm.h>
28408 +#include <linux/stddef.h>
28409 +#include <linux/unistd.h>
28410 +#include <linux/ptrace.h>
28411 +#include <linux/slab.h>
28412 +#include <linux/user.h>
28413 +#include <linux/a.out.h>
28414 +#include <linux/screen_info.h>
28415 +#include <linux/ioport.h>
28416 +#include <linux/delay.h>
28417 +#include <linux/init.h>
28418 +#include <linux/initrd.h>
28419 +#include <linux/highmem.h>
28420 +#include <linux/bootmem.h>
28421 +#include <linux/module.h>
28422 +#include <asm/processor.h>
28423 +#include <linux/console.h>
28424 +#include <linux/seq_file.h>
28425 +#include <linux/crash_dump.h>
28426 +#include <linux/root_dev.h>
28427 +#include <linux/pci.h>
28428 +#include <linux/acpi.h>
28429 +#include <linux/kallsyms.h>
28430 +#include <linux/edd.h>
28431 +#include <linux/mmzone.h>
28432 +#include <linux/kexec.h>
28433 +#include <linux/cpufreq.h>
28434 +#include <linux/dmi.h>
28435 +#include <linux/dma-mapping.h>
28436 +#include <linux/ctype.h>
28438 +#include <asm/mtrr.h>
28439 +#include <asm/uaccess.h>
28440 +#include <asm/system.h>
28441 +#include <asm/io.h>
28442 +#include <asm/smp.h>
28443 +#include <asm/msr.h>
28444 +#include <asm/desc.h>
28445 +#include <video/edid.h>
28446 +#include <asm/e820.h>
28447 +#include <asm/dma.h>
28448 +#include <asm/mpspec.h>
28449 +#include <asm/mmu_context.h>
28450 +#include <asm/bootsetup.h>
28451 +#include <asm/proto.h>
28452 +#include <asm/setup.h>
28453 +#include <asm/mach_apic.h>
28454 +#include <asm/numa.h>
28455 +#include <asm/sections.h>
28456 +#include <asm/dmi.h>
28458 +#include <linux/percpu.h>
28459 +#include <xen/interface/physdev.h>
28460 +#include "setup_arch_pre.h"
28461 +#include <asm/hypervisor.h>
28462 +#include <xen/interface/nmi.h>
28463 +#include <xen/features.h>
28464 +#include <xen/firmware.h>
28465 +#include <xen/xencons.h>
28466 +#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
28467 +#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
28468 +#include <asm/mach-xen/setup_arch_post.h>
28469 +#include <xen/interface/memory.h>
28472 +#include <xen/interface/kexec.h>
28475 +extern unsigned long start_pfn;
28476 +extern struct edid_info edid_info;
28478 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
28479 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
28481 +extern char hypercall_page[PAGE_SIZE];
28482 +EXPORT_SYMBOL(hypercall_page);
28484 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
28485 +static struct notifier_block xen_panic_block = {
28486 + xen_panic_event, NULL, 0 /* try to go last */
28489 +unsigned long *phys_to_machine_mapping;
28490 +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
28492 +EXPORT_SYMBOL(phys_to_machine_mapping);
28494 +DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
28495 +DEFINE_PER_CPU(int, nr_multicall_ents);
28497 +/* Raw start-of-day parameters from the hypervisor. */
28498 +start_info_t *xen_start_info;
28499 +EXPORT_SYMBOL(xen_start_info);
28503 + * Machine setup..
28506 +struct cpuinfo_x86 boot_cpu_data __read_mostly;
28507 +EXPORT_SYMBOL(boot_cpu_data);
28509 +unsigned long mmu_cr4_features;
28511 +int acpi_disabled;
28512 +EXPORT_SYMBOL(acpi_disabled);
28513 +#ifdef CONFIG_ACPI
28514 +extern int __initdata acpi_ht;
28515 +extern acpi_interrupt_flags acpi_sci_flags;
28516 +int __initdata acpi_force = 0;
28519 +int acpi_numa __initdata;
28521 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
28522 +int bootloader_type;
28524 +unsigned long saved_video_mode;
28527 + * Early DMI memory
28529 +int dmi_alloc_index;
28530 +char dmi_alloc_data[DMI_MAX_DATA];
28535 +struct screen_info screen_info;
28536 +EXPORT_SYMBOL(screen_info);
28537 +struct sys_desc_table_struct {
28538 + unsigned short length;
28539 + unsigned char table[0];
28542 +struct edid_info edid_info;
28543 +EXPORT_SYMBOL_GPL(edid_info);
28544 +struct e820map e820;
28546 +struct e820map machine_e820;
28549 +extern int root_mountflags;
28551 +char command_line[COMMAND_LINE_SIZE];
28553 +struct resource standard_io_resources[] = {
28554 + { .name = "dma1", .start = 0x00, .end = 0x1f,
28555 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28556 + { .name = "pic1", .start = 0x20, .end = 0x21,
28557 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28558 + { .name = "timer0", .start = 0x40, .end = 0x43,
28559 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28560 + { .name = "timer1", .start = 0x50, .end = 0x53,
28561 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28562 + { .name = "keyboard", .start = 0x60, .end = 0x6f,
28563 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28564 + { .name = "dma page reg", .start = 0x80, .end = 0x8f,
28565 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28566 + { .name = "pic2", .start = 0xa0, .end = 0xa1,
28567 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28568 + { .name = "dma2", .start = 0xc0, .end = 0xdf,
28569 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28570 + { .name = "fpu", .start = 0xf0, .end = 0xff,
28571 + .flags = IORESOURCE_BUSY | IORESOURCE_IO }
28574 +#define STANDARD_IO_RESOURCES \
28575 + (sizeof standard_io_resources / sizeof standard_io_resources[0])
28577 +#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
28579 +struct resource data_resource = {
28580 + .name = "Kernel data",
28583 + .flags = IORESOURCE_RAM,
28585 +struct resource code_resource = {
28586 + .name = "Kernel code",
28589 + .flags = IORESOURCE_RAM,
28592 +#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
28594 +static struct resource system_rom_resource = {
28595 + .name = "System ROM",
28596 + .start = 0xf0000,
28598 + .flags = IORESOURCE_ROM,
28601 +static struct resource extension_rom_resource = {
28602 + .name = "Extension ROM",
28603 + .start = 0xe0000,
28605 + .flags = IORESOURCE_ROM,
28608 +static struct resource adapter_rom_resources[] = {
28609 + { .name = "Adapter ROM", .start = 0xc8000, .end = 0,
28610 + .flags = IORESOURCE_ROM },
28611 + { .name = "Adapter ROM", .start = 0, .end = 0,
28612 + .flags = IORESOURCE_ROM },
28613 + { .name = "Adapter ROM", .start = 0, .end = 0,
28614 + .flags = IORESOURCE_ROM },
28615 + { .name = "Adapter ROM", .start = 0, .end = 0,
28616 + .flags = IORESOURCE_ROM },
28617 + { .name = "Adapter ROM", .start = 0, .end = 0,
28618 + .flags = IORESOURCE_ROM },
28619 + { .name = "Adapter ROM", .start = 0, .end = 0,
28620 + .flags = IORESOURCE_ROM }
28623 +#define ADAPTER_ROM_RESOURCES \
28624 + (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
28626 +static struct resource video_rom_resource = {
28627 + .name = "Video ROM",
28628 + .start = 0xc0000,
28630 + .flags = IORESOURCE_ROM,
28633 +static struct resource video_ram_resource = {
28634 + .name = "Video RAM area",
28635 + .start = 0xa0000,
28637 + .flags = IORESOURCE_RAM,
28640 +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
28642 +static int __init romchecksum(unsigned char *rom, unsigned long length)
28644 + unsigned char *p, sum = 0;
28646 +	for (p = rom; p < rom + length; p++)
28647 +		sum += *p;
28648 +	return sum == 0;
28651 +static void __init probe_roms(void)
28653 + unsigned long start, length, upper;
28654 + unsigned char *rom;
28658 + /* Nothing to do if not running in dom0. */
28659 + if (!is_initial_xendomain())
28664 + upper = adapter_rom_resources[0].start;
28665 + for (start = video_rom_resource.start; start < upper; start += 2048) {
28666 + rom = isa_bus_to_virt(start);
28667 + if (!romsignature(rom))
28670 + video_rom_resource.start = start;
28672 + /* 0 < length <= 0x7f * 512, historically */
28673 + length = rom[2] * 512;
28675 + /* if checksum okay, trust length byte */
28676 + if (length && romchecksum(rom, length))
28677 + video_rom_resource.end = start + length - 1;
28679 + request_resource(&iomem_resource, &video_rom_resource);
28683 + start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
28684 + if (start < upper)
28688 + request_resource(&iomem_resource, &system_rom_resource);
28689 + upper = system_rom_resource.start;
28691 + /* check for extension rom (ignore length byte!) */
28692 + rom = isa_bus_to_virt(extension_rom_resource.start);
28693 + if (romsignature(rom)) {
28694 + length = extension_rom_resource.end - extension_rom_resource.start + 1;
28695 + if (romchecksum(rom, length)) {
28696 + request_resource(&iomem_resource, &extension_rom_resource);
28697 + upper = extension_rom_resource.start;
28701 + /* check for adapter roms on 2k boundaries */
28702 + for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
28703 + rom = isa_bus_to_virt(start);
28704 + if (!romsignature(rom))
28707 + /* 0 < length <= 0x7f * 512, historically */
28708 + length = rom[2] * 512;
28710 + /* but accept any length that fits if checksum okay */
28711 + if (!length || start + length > upper || !romchecksum(rom, length))
28714 + adapter_rom_resources[i].start = start;
28715 + adapter_rom_resources[i].end = start + length - 1;
28716 + request_resource(&iomem_resource, &adapter_rom_resources[i]);
28718 + start = adapter_rom_resources[i++].end & ~2047UL;
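For reference, the scan above keys off the legacy option-ROM header; a rough sketch of the layout it assumes (field names illustrative, per classic BIOS conventions):

    struct option_rom_header {
            unsigned char signature[2];  /* 0x55 0xaa: what romsignature() tests */
            unsigned char size512;       /* image length in 512-byte units */
            /* entry point and vendor-specific data follow */
    } __attribute__((packed));
    /* A well-formed image also carries a checksum byte chosen so that all
       bytes sum to 0 mod 256, which is what romchecksum() verifies. */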
28722 +/* Check for full argument with no trailing characters */
28723 +static int fullarg(char *p, char *arg)
28725 + int l = strlen(arg);
28726 + return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l]));
28729 +static __init void parse_cmdline_early (char ** cmdline_p)
28731 + char c = ' ', *to = command_line, *from = COMMAND_LINE;
28741 + * If the BIOS enumerates physical processors before logical,
28742 + * maxcpus=N at enumeration-time can be used to disable HT.
28744 + else if (!memcmp(from, "maxcpus=", 8)) {
28745 + extern unsigned int maxcpus;
28747 + maxcpus = simple_strtoul(from + 8, NULL, 0);
28750 +#ifdef CONFIG_ACPI
28751 + /* "acpi=off" disables both ACPI table parsing and interpreter init */
28752 + if (fullarg(from,"acpi=off"))
28755 + if (fullarg(from, "acpi=force")) {
28756 + /* add later when we do DMI horrors: */
28758 + acpi_disabled = 0;
28761 + /* acpi=ht just means: do ACPI MADT parsing
28762 + at bootup, but don't enable the full ACPI interpreter */
28763 + if (fullarg(from, "acpi=ht")) {
28768 + else if (fullarg(from, "pci=noacpi"))
28769 + acpi_disable_pci();
28770 + else if (fullarg(from, "acpi=noirq"))
28771 + acpi_noirq_set();
28773 + else if (fullarg(from, "acpi_sci=edge"))
28774 + acpi_sci_flags.trigger = 1;
28775 + else if (fullarg(from, "acpi_sci=level"))
28776 + acpi_sci_flags.trigger = 3;
28777 + else if (fullarg(from, "acpi_sci=high"))
28778 + acpi_sci_flags.polarity = 1;
28779 + else if (fullarg(from, "acpi_sci=low"))
28780 + acpi_sci_flags.polarity = 3;
28782 + /* acpi=strict disables out-of-spec workarounds */
28783 + else if (fullarg(from, "acpi=strict")) {
28786 +#ifdef CONFIG_X86_IO_APIC
28787 + else if (fullarg(from, "acpi_skip_timer_override"))
28788 + acpi_skip_timer_override = 1;
28792 +#ifndef CONFIG_XEN
28793 + if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) {
28794 + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
28795 + disable_apic = 1;
28798 + if (fullarg(from, "noapic"))
28799 + skip_ioapic_setup = 1;
28801 + if (fullarg(from,"apic")) {
28802 + skip_ioapic_setup = 0;
28803 + ioapic_force = 1;
28807 + if (!memcmp(from, "mem=", 4))
28808 + parse_memopt(from+4, &from);
28810 + if (!memcmp(from, "memmap=", 7)) {
28811 +			/* exactmap option is for user-defined memory */
28812 + if (!memcmp(from+7, "exactmap", 8)) {
28813 +#ifdef CONFIG_CRASH_DUMP
28814 + /* If we are doing a crash dump, we
28815 + * still need to know the real mem
28816 + * size before original memory map is
28819 + saved_max_pfn = e820_end_of_ram();
28827 + parse_memmapopt(from+7, &from);
28832 +#ifdef CONFIG_NUMA
28833 + if (!memcmp(from, "numa=", 5))
28834 + numa_setup(from+5);
28837 + if (!memcmp(from,"iommu=",6)) {
28838 + iommu_setup(from+6);
28841 + if (fullarg(from,"oops=panic"))
28842 + panic_on_oops = 1;
28844 + if (!memcmp(from, "noexec=", 7))
28845 + nonx_setup(from + 7);
28847 +#ifdef CONFIG_KEXEC
28848 + /* crashkernel=size@addr specifies the location to reserve for
28849 + * a crash kernel. By reserving this memory we guarantee
28850 +	 * that linux never sets it up as a DMA target.
28851 + * Useful for holding code to do something appropriate
28852 + * after a kernel panic.
28854 + else if (!memcmp(from, "crashkernel=", 12)) {
28855 +#ifndef CONFIG_XEN
28856 + unsigned long size, base;
28857 + size = memparse(from+12, &from);
28858 + if (*from == '@') {
28859 + base = memparse(from+1, &from);
28860 + /* FIXME: Do I want a sanity check
28861 + * to validate the memory range?
28863 + crashk_res.start = base;
28864 + crashk_res.end = base + size - 1;
28867 + printk("Ignoring crashkernel command line, "
28868 + "parameter will be supplied by xen\n");
28873 +#ifdef CONFIG_PROC_VMCORE
28874 + /* elfcorehdr= specifies the location of elf core header
28875 + * stored by the crashed kernel. This option will be passed
28876 + * by kexec loader to the capture kernel.
28878 + else if(!memcmp(from, "elfcorehdr=", 11))
28879 + elfcorehdr_addr = memparse(from+11, &from);
28882 +#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
28883 + else if (!memcmp(from, "additional_cpus=", 16))
28884 + setup_additional_cpus(from+16);
28891 + if (COMMAND_LINE_SIZE <= ++len)
28896 + printk(KERN_INFO "user-defined physical RAM map:\n");
28897 + e820_print_map("user");
28900 + *cmdline_p = command_line;
28903 +#ifndef CONFIG_NUMA
28904 +static void __init
28905 +contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
28907 + unsigned long bootmap_size, bootmap;
28909 + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
28910 + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
28911 + if (bootmap == -1L)
28912 + panic("Cannot find bootmem map of size %ld\n",bootmap_size);
28913 + bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
28915 + e820_bootmem_free(NODE_DATA(0), 0, xen_start_info->nr_pages<<PAGE_SHIFT);
28917 + e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
28919 + reserve_bootmem(bootmap, bootmap_size);
28923 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
28925 +#ifdef CONFIG_EDD_MODULE
28926 +EXPORT_SYMBOL(edd);
28928 +#ifndef CONFIG_XEN
28930 + * copy_edd() - Copy the BIOS EDD information
28931 + * from boot_params into a safe place.
28934 +static inline void copy_edd(void)
28936 + memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
28937 + memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
28938 + edd.mbr_signature_nr = EDD_MBR_SIG_NR;
28939 + edd.edd_info_nr = EDD_NR;
28943 +static inline void copy_edd(void)
28948 +#ifndef CONFIG_XEN
28949 +#define EBDA_ADDR_POINTER 0x40E
28951 +unsigned __initdata ebda_addr;
28952 +unsigned __initdata ebda_size;
28954 +static void discover_ebda(void)
28957 + * there is a real-mode segmented pointer pointing to the
28958 + * 4K EBDA area at 0x40E
28960 +	ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER;
28961 +	ebda_addr <<= 4;
28963 + ebda_size = *(unsigned short *)(unsigned long)ebda_addr;
28965 + /* Round EBDA up to pages */
28966 +	if (ebda_size == 0)
28967 +		ebda_size = 1;
28968 + ebda_size <<= 10;
28969 + ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
28970 + if (ebda_size > 64*1024)
28971 + ebda_size = 64*1024;
28974 +#define discover_ebda() ((void)0)
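Worked numbers for the real-mode pointer arithmetic above (illustrative): if the BDA word at 0x40E reads 0x9fc0,

    unsigned short seg  = 0x9fc0;                   /* word read from 0x40E */
    unsigned long  ebda = (unsigned long)seg << 4;  /* 0x9fc00, just under 640K */
    /* the word at the EBDA's start gives its size in KiB; the <<= 10
       above converts that to bytes */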
28977 +void __init setup_arch(char **cmdline_p)
28980 + /* Register a call for panic conditions. */
28981 + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
28983 + ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
28984 + screen_info = SCREEN_INFO;
28986 + if (is_initial_xendomain()) {
28987 + const struct dom0_vga_console_info *info =
28988 + (void *)((char *)xen_start_info +
28989 + xen_start_info->console.dom0.info_off);
28991 + dom0_init_screen_info(info,
28992 + xen_start_info->console.dom0.info_size);
28993 + xen_start_info->console.domU.mfn = 0;
28994 + xen_start_info->console.domU.evtchn = 0;
28996 + screen_info.orig_video_isVGA = 0;
29000 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
29001 + VMASST_TYPE_writable_pagetables));
29005 + ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
29006 + screen_info = SCREEN_INFO;
29007 + edid_info = EDID_INFO;
29008 +#endif /* !CONFIG_XEN */
29009 + saved_video_mode = SAVED_VIDEO_MODE;
29010 + bootloader_type = LOADER_TYPE;
29012 +#ifdef CONFIG_BLK_DEV_RAM
29013 + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
29014 + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
29015 + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
29017 + setup_memory_region();
29020 + if (!MOUNT_ROOT_RDONLY)
29021 + root_mountflags &= ~MS_RDONLY;
29022 + init_mm.start_code = (unsigned long) &_text;
29023 + init_mm.end_code = (unsigned long) &_etext;
29024 + init_mm.end_data = (unsigned long) &_edata;
29025 + init_mm.brk = (unsigned long) &_end;
29027 + code_resource.start = virt_to_phys(&_text);
29028 + code_resource.end = virt_to_phys(&_etext)-1;
29029 + data_resource.start = virt_to_phys(&_etext);
29030 + data_resource.end = virt_to_phys(&_edata)-1;
29032 + parse_cmdline_early(cmdline_p);
29034 + early_identify_cpu(&boot_cpu_data);
29037 + * partially used pages are not usable - thus
29038 + * we are rounding upwards:
29040 + end_pfn = e820_end_of_ram();
29041 + num_physpages = end_pfn; /* for pfn_valid */
29047 + init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
29049 + if (is_initial_xendomain())
29050 + dmi_scan_machine();
29052 +#ifdef CONFIG_ACPI_NUMA
29054 + * Parse SRAT to discover nodes.
29056 + acpi_numa_init();
29059 +#ifdef CONFIG_NUMA
29060 + numa_initmem_init(0, end_pfn);
29062 + contig_initmem_init(0, end_pfn);
29067 + * Reserve kernel, physmap, start info, initial page tables, and
29068 + * direct mapping.
29070 + reserve_bootmem_generic(__pa_symbol(&_text),
29071 + (table_end << PAGE_SHIFT) - __pa_symbol(&_text));
29073 + /* Reserve direct mapping */
29074 + reserve_bootmem_generic(table_start << PAGE_SHIFT,
29075 + (table_end - table_start) << PAGE_SHIFT);
29077 + /* reserve kernel */
29078 + reserve_bootmem_generic(__pa_symbol(&_text),
29079 + __pa_symbol(&_end) - __pa_symbol(&_text));
29082 + * reserve physical page 0 - it's a special BIOS page on many boxes,
29083 + * enabling clean reboots, SMP operation, laptop functions.
29085 + reserve_bootmem_generic(0, PAGE_SIZE);
29087 + /* reserve ebda region */
29089 + reserve_bootmem_generic(ebda_addr, ebda_size);
29093 + * But first pinch a few for the stack/trampoline stuff
29094 + * FIXME: Don't need the extra page at 4K, but need to fix
29095 + * trampoline before removing it. (see the GDT stuff)
29097 + reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
29099 + /* Reserve SMP trampoline */
29100 + reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
29104 +#ifdef CONFIG_ACPI_SLEEP
29106 + * Reserve low memory region for sleep support.
29108 + acpi_reserve_bootmem();
29111 +#ifdef CONFIG_BLK_DEV_INITRD
29112 + if (xen_start_info->mod_start) {
29113 + if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
29114 + /*reserve_bootmem_generic(INITRD_START, INITRD_SIZE);*/
29115 + initrd_start = INITRD_START + PAGE_OFFSET;
29116 + initrd_end = initrd_start+INITRD_SIZE;
29117 + initrd_below_start_ok = 1;
29119 + printk(KERN_ERR "initrd extends beyond end of memory "
29120 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
29121 + (unsigned long)(INITRD_START + INITRD_SIZE),
29122 + (unsigned long)(end_pfn << PAGE_SHIFT));
29123 + initrd_start = 0;
29127 +#else /* CONFIG_XEN */
29128 +#ifdef CONFIG_BLK_DEV_INITRD
29129 + if (LOADER_TYPE && INITRD_START) {
29130 + if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
29131 + reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
29133 + INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
29134 + initrd_end = initrd_start+INITRD_SIZE;
29137 + printk(KERN_ERR "initrd extends beyond end of memory "
29138 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
29139 + (unsigned long)(INITRD_START + INITRD_SIZE),
29140 + (unsigned long)(end_pfn << PAGE_SHIFT));
29141 + initrd_start = 0;
29145 +#endif /* !CONFIG_XEN */
29146 +#ifdef CONFIG_KEXEC
29148 + xen_machine_kexec_setup_resources();
29150 + if (crashk_res.start != crashk_res.end) {
29151 + reserve_bootmem_generic(crashk_res.start,
29152 + crashk_res.end - crashk_res.start + 1);
29158 +#ifdef CONFIG_X86_LOCAL_APIC
29160 + * Find and reserve possible boot-time SMP configuration:
29162 + find_smp_config();
29166 + int i, j, k, fpp;
29167 + unsigned long p2m_pages;
29169 + p2m_pages = end_pfn;
29170 + if (xen_start_info->nr_pages > end_pfn) {
29172 + * the end_pfn was shrunk (probably by mem= or highmem=
29173 + * kernel parameter); shrink reservation with the HV
29175 + struct xen_memory_reservation reservation = {
29176 + .address_bits = 0,
29177 + .extent_order = 0,
29178 + .domid = DOMID_SELF
29180 + unsigned int difference;
29183 + difference = xen_start_info->nr_pages - end_pfn;
29185 + set_xen_guest_handle(reservation.extent_start,
29186 + ((unsigned long *)xen_start_info->mfn_list) + end_pfn);
29187 + reservation.nr_extents = difference;
29188 + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
29190 + BUG_ON (ret != difference);
29192 + else if (end_pfn > xen_start_info->nr_pages)
29193 + p2m_pages = xen_start_info->nr_pages;
29195 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
29196 + /* Make sure we have a large enough P->M table. */
29197 + phys_to_machine_mapping = alloc_bootmem_pages(
29198 + end_pfn * sizeof(unsigned long));
29199 + memset(phys_to_machine_mapping, ~0,
29200 + end_pfn * sizeof(unsigned long));
29201 + memcpy(phys_to_machine_mapping,
29202 + (unsigned long *)xen_start_info->mfn_list,
29203 + p2m_pages * sizeof(unsigned long));
29205 + __pa(xen_start_info->mfn_list),
29206 + PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
29207 + sizeof(unsigned long))));
29210 + * Initialise the list of the frames that specify the
29211 + * list of frames that make up the p2m table. Used by
29214 + pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
29216 + fpp = PAGE_SIZE/sizeof(unsigned long);
29217 + for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
29218 + if ((j % fpp) == 0) {
29221 + pfn_to_mfn_frame_list[k] =
29222 + alloc_bootmem_pages(PAGE_SIZE);
29223 + pfn_to_mfn_frame_list_list[k] =
29224 + virt_to_mfn(pfn_to_mfn_frame_list[k]);
29227 + pfn_to_mfn_frame_list[k][j] =
29228 + virt_to_mfn(&phys_to_machine_mapping[i]);
29230 + HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
29231 + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
29232 + virt_to_mfn(pfn_to_mfn_frame_list_list);
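Capacity sketch for the two-level structure built above, assuming 4 KiB pages and 8-byte entries (so fpp == 512):

    /*
     * one page of phys_to_machine_mapping   -> 512 pfn->mfn entries
     * one pfn_to_mfn_frame_list page        -> 512 p2m frames -> 1 GiB of pfns
     * pfn_to_mfn_frame_list_list (one page) -> up to 512 such lists
     * so the static pfn_to_mfn_frame_list[512] caps this layout at
     * 512 GiB of guest pseudo-physical memory.
     */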
29235 + /* Mark all ISA DMA channels in-use - using them wouldn't work. */
29236 + for (i = 0; i < MAX_DMA_CHANNELS; ++i)
29237 + if (i != 4 && request_dma(i, "xen") != 0)
29241 + if (!is_initial_xendomain()) {
29242 + acpi_disabled = 1;
29243 +#ifdef CONFIG_ACPI
29249 +#ifndef CONFIG_XEN
29253 + zap_low_mappings(0);
29256 +	 * set this early, so we don't allocate cpu0
29257 +	 * if the MADT list doesn't list the BSP first
29258 + * mpparse.c/MP_processor_info() allocates logical cpu numbers.
29260 + cpu_set(0, cpu_present_map);
29261 +#ifdef CONFIG_ACPI
29263 + * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
29264 + * Call this early for SRAT node setup.
29266 + acpi_boot_table_init();
29269 + * Read APIC and some other early information from ACPI tables.
29271 + acpi_boot_init();
29274 + init_cpu_to_node();
29276 +#ifdef CONFIG_X86_LOCAL_APIC
29278 + * get boot-time SMP configuration:
29280 + if (smp_found_config)
29281 + get_smp_config();
29282 +#ifndef CONFIG_XEN
29283 + init_apic_mappings();
29286 +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
29287 + prefill_possible_map();
29291 + * Request address space for all standard RAM and ROM resources
29292 + * and also for regions reported as reserved by the e820.
29296 + if (is_initial_xendomain())
29297 + e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
29299 + e820_reserve_resources(e820.map, e820.nr_map);
29302 + request_resource(&iomem_resource, &video_ram_resource);
29306 + /* request I/O space for devices used on all i[345]86 PCs */
29307 + for (i = 0; i < STANDARD_IO_RESOURCES; i++)
29308 + request_resource(&ioport_resource, &standard_io_resources[i]);
29312 + if (is_initial_xendomain())
29313 + e820_setup_gap(machine_e820.map, machine_e820.nr_map);
29315 + e820_setup_gap(e820.map, e820.nr_map);
29320 + struct physdev_set_iopl set_iopl;
29322 + set_iopl.iopl = 1;
29323 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
29325 + if (is_initial_xendomain()) {
29327 +#if defined(CONFIG_VGA_CONSOLE)
29328 + conswitchp = &vga_con;
29329 +#elif defined(CONFIG_DUMMY_CONSOLE)
29330 + conswitchp = &dummy_con;
29334 +#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
29335 + conswitchp = &dummy_con;
29339 +#else /* CONFIG_XEN */
29342 +#if defined(CONFIG_VGA_CONSOLE)
29343 + conswitchp = &vga_con;
29344 +#elif defined(CONFIG_DUMMY_CONSOLE)
29345 + conswitchp = &dummy_con;
29349 +#endif /* !CONFIG_XEN */
29354 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
29356 + HYPERVISOR_shutdown(SHUTDOWN_crash);
29357 + /* we're never actually going to get here... */
29358 + return NOTIFY_DONE;
29360 +#endif /* !CONFIG_XEN */
29363 +static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
29367 + if (c->extended_cpuid_level < 0x80000004)
29370 + v = (unsigned int *) c->x86_model_id;
29371 + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
29372 + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
29373 + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
29374 + c->x86_model_id[48] = 0;
29379 +static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
29381 + unsigned int n, dummy, eax, ebx, ecx, edx;
29383 + n = c->extended_cpuid_level;
29385 + if (n >= 0x80000005) {
29386 + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
29387 + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
29388 + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
29389 + c->x86_cache_size=(ecx>>24)+(edx>>24);
29390 + /* On K8 L1 TLB is inclusive, so don't count it */
29391 + c->x86_tlbsize = 0;
29394 + if (n >= 0x80000006) {
29395 + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
29396 + ecx = cpuid_ecx(0x80000006);
29397 + c->x86_cache_size = ecx >> 16;
29398 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
29400 + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
29401 + c->x86_cache_size, ecx & 0xFF);
29404 + if (n >= 0x80000007)
29405 + cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
29406 + if (n >= 0x80000008) {
29407 + cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
29408 + c->x86_virt_bits = (eax >> 8) & 0xff;
29409 + c->x86_phys_bits = eax & 0xff;
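A worked instance of the 0x80000006 decode above, with an illustrative encoding:

    /* ecx == 0x02008140:
       ecx >> 16  == 0x0200 -> 512 KB of L2
       ecx & 0xff == 0x40   -> 64-byte cache lines */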
29413 +#ifdef CONFIG_NUMA
29414 +static int nearby_node(int apicid)
29417 + for (i = apicid - 1; i >= 0; i--) {
29418 + int node = apicid_to_node[i];
29419 + if (node != NUMA_NO_NODE && node_online(node))
29422 + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
29423 + int node = apicid_to_node[i];
29424 + if (node != NUMA_NO_NODE && node_online(node))
29427 + return first_node(node_online_map); /* Shouldn't happen */
29432 + * On an AMD dual-core setup the lower bits of the APIC id distinguish the cores.
29433 + * Assumes number of cores is a power of two.
29435 +static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
29439 +#ifdef CONFIG_NUMA
29440 + int cpu = smp_processor_id();
29442 + unsigned apicid = hard_smp_processor_id();
29444 + unsigned ecx = cpuid_ecx(0x80000008);
29446 + c->x86_max_cores = (ecx & 0xff) + 1;
29448 + /* CPU telling us the core id bits shift? */
29449 + bits = (ecx >> 12) & 0xF;
29451 + /* Otherwise recompute */
29453 + while ((1 << bits) < c->x86_max_cores)
29457 + /* Low order bits define the core id (index of core in socket) */
29458 + c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
29459 + /* Convert the APIC ID into the socket ID */
29460 + c->phys_proc_id = phys_pkg_id(bits);
29462 +#ifdef CONFIG_NUMA
29463 + node = c->phys_proc_id;
29464 + if (apicid_to_node[apicid] != NUMA_NO_NODE)
29465 + node = apicid_to_node[apicid];
29466 + if (!node_online(node)) {
29467 + /* Two possibilities here:
29468 + - The CPU is missing memory and no node was created.
29469 + In that case try picking one from a nearby CPU
29470 + - The APIC IDs differ from the HyperTransport node IDs
29471 + which the K8 northbridge parsing fills in.
29472 + Assume they are all increased by a constant offset,
29473 + but in the same order as the HT nodeids.
29474 + If that doesn't result in a usable node fall back to the
29475 + path for the previous case. */
29476 + int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits);
29477 + if (ht_nodeid >= 0 &&
29478 + apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
29479 + node = apicid_to_node[ht_nodeid];
29480 + /* Pick a nearby node */
29481 + if (!node_online(node))
29482 + node = nearby_node(apicid);
29484 + numa_set_node(cpu, node);
29486 + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
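A worked instance of the core/socket split above (values illustrative):

    /* cpuid_ecx(0x80000008) == 0x00001001 gives
       x86_max_cores == (0x01 + 1) == 2 and a core-id width of
       (ecx >> 12) & 0xf == 1 bit; an APIC id of 5 then splits into
       cpu_core_id == (5 & 1) == 1 and a socket id of 5 >> 1 == 2. */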
29491 +static void __init init_amd(struct cpuinfo_x86 *c)
29496 + unsigned long value;
29499 + * Disable TLB flush filter by setting HWCR.FFDIS on K8
29500 + * bit 6 of msr C001_0015
29502 + * Errata 63 for SH-B3 steppings
29503 + * Errata 122 for all steppings (F+ have it disabled by default)
29505 + if (c->x86 == 15) {
29506 + rdmsrl(MSR_K8_HWCR, value);
29508 + wrmsrl(MSR_K8_HWCR, value);
29512 + /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
29513 + 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
29514 + clear_bit(0*32+31, &c->x86_capability);
29516 + /* On C+ stepping K8 rep microcode works well for copy/memset */
29517 + level = cpuid_eax(1);
29518 + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
29519 + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
29521 + /* Enable workaround for FXSAVE leak */
29523 + set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
29525 + level = get_model_name(c);
29527 + switch (c->x86) {
29529 +		/* Should distinguish models here, but this is only
29530 +		   a fallback anyway. */
29531 + strcpy(c->x86_model_id, "Hammer");
29535 + display_cacheinfo(c);
29537 + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
29538 + if (c->x86_power & (1<<8))
29539 + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
29541 + /* Multi core CPU? */
29542 + if (c->extended_cpuid_level >= 0x80000008)
29543 + amd_detect_cmp(c);
29545 + /* Fix cpuid4 emulation for more */
29546 + num_cache_leaves = 3;
29549 +static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
29552 + u32 eax, ebx, ecx, edx;
29553 + int index_msb, core_bits;
29555 + cpuid(1, &eax, &ebx, &ecx, &edx);
29558 + if (!cpu_has(c, X86_FEATURE_HT))
29560 + if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
29563 + smp_num_siblings = (ebx & 0xff0000) >> 16;
29565 + if (smp_num_siblings == 1) {
29566 + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
29567 + } else if (smp_num_siblings > 1 ) {
29569 + if (smp_num_siblings > NR_CPUS) {
29570 +			printk(KERN_WARNING "CPU: Unsupported number of siblings %d", smp_num_siblings);
29571 + smp_num_siblings = 1;
29575 + index_msb = get_count_order(smp_num_siblings);
29576 + c->phys_proc_id = phys_pkg_id(index_msb);
29578 + smp_num_siblings = smp_num_siblings / c->x86_max_cores;
29580 +		index_msb = get_count_order(smp_num_siblings);
29582 + core_bits = get_count_order(c->x86_max_cores);
29584 + c->cpu_core_id = phys_pkg_id(index_msb) &
29585 + ((1 << core_bits) - 1);
29588 + if ((c->x86_max_cores * smp_num_siblings) > 1) {
29589 + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
29590 + printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id);
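A worked instance of the sibling decode above (values illustrative):

    /* ebx bits 23..16 == 4 logical units per package; with
       x86_max_cores == 2 that is 4 / 2 == 2 threads per core, so
       index_msb == 2 on the first pass and core_bits == 1: APIC id 5
       gives phys_proc_id == 5 >> 2 == 1 and
       cpu_core_id == (5 >> 1) & 1 == 0. */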
29597 + * find out the number of processor cores on the die
29599 +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
29601 + unsigned int eax, t;
29603 + if (c->cpuid_level < 4)
29606 + cpuid_count(4, 0, &eax, &t, &t, &t);
29609 + return ((eax >> 26) + 1);
29614 +static void srat_detect_node(void)
29616 +#ifdef CONFIG_NUMA
29618 + int cpu = smp_processor_id();
29619 + int apicid = hard_smp_processor_id();
29621 + /* Don't do the funky fallback heuristics the AMD version employs
29623 + node = apicid_to_node[apicid];
29624 + if (node == NUMA_NO_NODE)
29625 + node = first_node(node_online_map);
29626 + numa_set_node(cpu, node);
29628 + if (acpi_numa > 0)
29629 + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
29633 +static void __cpuinit init_intel(struct cpuinfo_x86 *c)
29635 + /* Cache sizes */
29638 + init_intel_cacheinfo(c);
29639 + if (c->cpuid_level > 9 ) {
29640 + unsigned eax = cpuid_eax(10);
29641 + /* Check for version and the number of counters */
29642 + if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
29643 + set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
29646 + n = c->extended_cpuid_level;
29647 + if (n >= 0x80000008) {
29648 + unsigned eax = cpuid_eax(0x80000008);
29649 + c->x86_virt_bits = (eax >> 8) & 0xff;
29650 + c->x86_phys_bits = eax & 0xff;
29651 + /* CPUID workaround for Intel 0F34 CPU */
29652 + if (c->x86_vendor == X86_VENDOR_INTEL &&
29653 + c->x86 == 0xF && c->x86_model == 0x3 &&
29654 + c->x86_mask == 0x4)
29655 + c->x86_phys_bits = 36;
29658 + if (c->x86 == 15)
29659 + c->x86_cache_alignment = c->x86_clflush_size * 2;
29660 + if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
29661 + (c->x86 == 0x6 && c->x86_model >= 0x0e))
29662 + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
29663 + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
29664 + c->x86_max_cores = intel_num_cpu_cores(c);
29666 + srat_detect_node();
29669 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
29671 + char *v = c->x86_vendor_id;
29673 + if (!strcmp(v, "AuthenticAMD"))
29674 + c->x86_vendor = X86_VENDOR_AMD;
29675 + else if (!strcmp(v, "GenuineIntel"))
29676 + c->x86_vendor = X86_VENDOR_INTEL;
29678 + c->x86_vendor = X86_VENDOR_UNKNOWN;
29681 +struct cpu_model_info {
29684 + char *model_names[16];
29687 +/* Do some early cpuid on the boot CPU to get the parameters that are
29688 +   needed before check_bugs. Everything advanced is in identify_cpu
29690 +void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
29694 + c->loops_per_jiffy = loops_per_jiffy;
29695 + c->x86_cache_size = -1;
29696 + c->x86_vendor = X86_VENDOR_UNKNOWN;
29697 + c->x86_model = c->x86_mask = 0; /* So far unknown... */
29698 + c->x86_vendor_id[0] = '\0'; /* Unset */
29699 + c->x86_model_id[0] = '\0'; /* Unset */
29700 + c->x86_clflush_size = 64;
29701 + c->x86_cache_alignment = c->x86_clflush_size;
29702 + c->x86_max_cores = 1;
29703 + c->extended_cpuid_level = 0;
29704 + memset(&c->x86_capability, 0, sizeof c->x86_capability);
29706 + /* Get vendor name */
29707 + cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
29708 + (unsigned int *)&c->x86_vendor_id[0],
29709 + (unsigned int *)&c->x86_vendor_id[8],
29710 + (unsigned int *)&c->x86_vendor_id[4]);
29712 + get_cpu_vendor(c);
29714 + /* Initialize the standard set of capabilities */
29715 + /* Note that the vendor-specific code below might override */
29717 + /* Intel-defined flags: level 0x00000001 */
29718 + if (c->cpuid_level >= 0x00000001) {
29720 + cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
29721 + &c->x86_capability[0]);
29722 + c->x86 = (tfms >> 8) & 0xf;
29723 + c->x86_model = (tfms >> 4) & 0xf;
29724 + c->x86_mask = tfms & 0xf;
29725 + if (c->x86 == 0xf)
29726 + c->x86 += (tfms >> 20) & 0xff;
29727 + if (c->x86 >= 0x6)
29728 + c->x86_model += ((tfms >> 16) & 0xF) << 4;
29729 + if (c->x86_capability[0] & (1<<19))
29730 + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
29732 + /* Have CPUID level 0 only - unheard of */
29737 + c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
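A worked instance of the signature decode above: tfms == 0x0001067a (a real Penryn-class signature) gives

    /* x86       == (tfms >> 8) & 0xf  == 6
       x86_model == ((tfms >> 4) & 0xf)
                    + (((tfms >> 16) & 0xf) << 4)  == 0x17   (since x86 >= 6)
       x86_mask  == tfms & 0xf  == 0xa */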
29742 + * This does the hard work of actually picking apart the CPU stuff...
29744 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
29749 + early_identify_cpu(c);
29751 + /* AMD-defined flags: level 0x80000001 */
29752 + xlvl = cpuid_eax(0x80000000);
29753 + c->extended_cpuid_level = xlvl;
29754 + if ((xlvl & 0xffff0000) == 0x80000000) {
29755 + if (xlvl >= 0x80000001) {
29756 + c->x86_capability[1] = cpuid_edx(0x80000001);
29757 + c->x86_capability[6] = cpuid_ecx(0x80000001);
29759 + if (xlvl >= 0x80000004)
29760 + get_model_name(c); /* Default name */
29763 + /* Transmeta-defined flags: level 0x80860001 */
29764 + xlvl = cpuid_eax(0x80860000);
29765 + if ((xlvl & 0xffff0000) == 0x80860000) {
29766 +		/* Don't set x86_cpuid_level here for now, to avoid confusion. */
29767 + if (xlvl >= 0x80860001)
29768 + c->x86_capability[2] = cpuid_edx(0x80860001);
29771 + c->apicid = phys_pkg_id(0);
29774 + * Vendor-specific initialization. In this section we
29775 + * canonicalize the feature flags, meaning if there are
29776 + * features a certain CPU supports which CPUID doesn't
29777 + * tell us, CPUID claiming incorrect flags, or other bugs,
29778 + * we handle them here.
29780 + * At the end of this section, c->x86_capability better
29781 + * indicate the features this CPU genuinely supports!
29783 + switch (c->x86_vendor) {
29784 + case X86_VENDOR_AMD:
29788 + case X86_VENDOR_INTEL:
29792 + case X86_VENDOR_UNKNOWN:
29794 + display_cacheinfo(c);
29798 + select_idle_routine(c);
29802 + * On SMP, boot_cpu_data holds the common feature set between
29803 + * all CPUs; so make sure that we indicate which features are
29804 + * common between the CPUs. The first time this routine gets
29805 + * executed, c == &boot_cpu_data.
29807 + if (c != &boot_cpu_data) {
29808 + /* AND the already accumulated flags with these */
29809 + for (i = 0 ; i < NCAPINTS ; i++)
29810 + boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
29813 +#ifdef CONFIG_X86_MCE
29816 + if (c == &boot_cpu_data)
29820 +#ifdef CONFIG_NUMA
29821 + numa_add_cpu(smp_processor_id());
29826 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
29828 + if (c->x86_model_id[0])
29829 + printk("%s", c->x86_model_id);
29831 + if (c->x86_mask || c->cpuid_level >= 0)
29832 + printk(" stepping %02x\n", c->x86_mask);
29838 + * Get CPU information for use by the procfs.
29841 +static int show_cpuinfo(struct seq_file *m, void *v)
29843 + struct cpuinfo_x86 *c = v;
29846 + * These flag bits must match the definitions in <asm/cpufeature.h>.
29847 + * NULL means this bit is undefined or reserved; either way it doesn't
29848 + * have meaning as far as Linux is concerned. Note that it's important
29849 + * to realize there is a difference between this table and CPUID -- if
29850 + * applications want to get the raw CPUID data, they should access
29851 + * /dev/cpu/<cpu_nr>/cpuid instead.
29853 + static char *x86_cap_flags[] = {
29854 + /* Intel-defined */
29855 + "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
29856 + "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
29857 + "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
29858 + "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL,
29860 + /* AMD-defined */
29861 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29862 + NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
29863 + NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
29864 + NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow",
29866 + /* Transmeta-defined */
29867 + "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
29868 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29869 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29870 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29872 + /* Other (Linux-defined) */
29873 + "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL,
29874 + "constant_tsc", NULL, NULL,
29875 + "up", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29876 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29877 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29879 + /* Intel-defined (#2) */
29880 + "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
29881 + "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
29882 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29883 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29885 + /* VIA/Cyrix/Centaur-defined */
29886 + NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
29887 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29888 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29889 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29891 + /* AMD-defined (#2) */
29892 + "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL,
29893 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29894 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29895 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29897 + static char *x86_power_flags[] = {
29898 + "ts", /* temperature sensor */
29899 + "fid", /* frequency id control */
29900 + "vid", /* voltage id control */
29901 + "ttp", /* thermal trip */
29905 + /* nothing */ /* constant_tsc - moved to flags */
29910 + if (!cpu_online(c-cpu_data))
29914 + seq_printf(m,"processor\t: %u\n"
29915 + "vendor_id\t: %s\n"
29916 + "cpu family\t: %d\n"
29917 + "model\t\t: %d\n"
29918 + "model name\t: %s\n",
29919 + (unsigned)(c-cpu_data),
29920 + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
29922 + (int)c->x86_model,
29923 + c->x86_model_id[0] ? c->x86_model_id : "unknown");
29925 + if (c->x86_mask || c->cpuid_level >= 0)
29926 + seq_printf(m, "stepping\t: %d\n", c->x86_mask);
29928 + seq_printf(m, "stepping\t: unknown\n");
29930 + if (cpu_has(c,X86_FEATURE_TSC)) {
29931 + unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
29934 + seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
29935 + freq / 1000, (freq % 1000));
29939 + if (c->x86_cache_size >= 0)
29940 + seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
29943 + if (smp_num_siblings * c->x86_max_cores > 1) {
29944 + int cpu = c - cpu_data;
29945 + seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
29946 + seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
29947 + seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
29948 + seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
29954 + "fpu_exception\t: yes\n"
29955 + "cpuid level\t: %d\n"
29962 + for ( i = 0 ; i < 32*NCAPINTS ; i++ )
29963 + if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
29964 + seq_printf(m, " %s", x86_cap_flags[i]);
29967 + seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
29968 + c->loops_per_jiffy/(500000/HZ),
29969 + (c->loops_per_jiffy/(5000/HZ)) % 100);
29971 + if (c->x86_tlbsize > 0)
29972 + seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
29973 + seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
29974 + seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
29976 + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
29977 + c->x86_phys_bits, c->x86_virt_bits);
29979 + seq_printf(m, "power management:");
29982 + for (i = 0; i < 32; i++)
29983 + if (c->x86_power & (1 << i)) {
29984 + if (i < ARRAY_SIZE(x86_power_flags) &&
29985 + x86_power_flags[i])
29986 + seq_printf(m, "%s%s",
29987 + x86_power_flags[i][0]?" ":"",
29988 + x86_power_flags[i]);
29990 + seq_printf(m, " [%d]", i);
29994 + seq_printf(m, "\n\n");
29999 +static void *c_start(struct seq_file *m, loff_t *pos)
30001 + return *pos < NR_CPUS ? cpu_data + *pos : NULL;
30004 +static void *c_next(struct seq_file *m, void *v, loff_t *pos)
30006 +	++*pos;
30007 +	return c_start(m, pos);
30010 +static void c_stop(struct seq_file *m, void *v)
30014 +struct seq_operations cpuinfo_op = {
30018 + .show = show_cpuinfo,
30021 +#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE)
30022 +#include <linux/platform_device.h>
30023 +static __init int add_pcspkr(void)
30025 + struct platform_device *pd;
30028 + if (!is_initial_xendomain())
30031 + pd = platform_device_alloc("pcspkr", -1);
30035 + ret = platform_device_add(pd);
30037 + platform_device_put(pd);
30041 +device_initcall(add_pcspkr);
30043 Index: head-2008-11-25/arch/x86/kernel/setup64-xen.c
30044 ===================================================================
30045 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
30046 +++ head-2008-11-25/arch/x86/kernel/setup64-xen.c 2008-01-28 12:24:19.000000000 +0100
30049 + * X86-64 specific CPU setup.
30050 + * Copyright (C) 1995 Linus Torvalds
30051 + * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
30052 + * See setup.c for older changelog.
30054 + * Jun Nakajima <jun.nakajima@intel.com>
30055 + * Modified for Xen
30058 +#include <linux/init.h>
30059 +#include <linux/kernel.h>
30060 +#include <linux/sched.h>
30061 +#include <linux/string.h>
30062 +#include <linux/bootmem.h>
30063 +#include <linux/bitops.h>
30064 +#include <linux/module.h>
30065 +#include <asm/bootsetup.h>
30066 +#include <asm/pda.h>
30067 +#include <asm/pgtable.h>
30068 +#include <asm/processor.h>
30069 +#include <asm/desc.h>
30070 +#include <asm/atomic.h>
30071 +#include <asm/mmu_context.h>
30072 +#include <asm/smp.h>
30073 +#include <asm/i387.h>
30074 +#include <asm/percpu.h>
30075 +#include <asm/proto.h>
30076 +#include <asm/sections.h>
30078 +#include <asm/hypervisor.h>
30081 +char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
30083 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
30085 +struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
30086 +EXPORT_SYMBOL(_cpu_pda);
30087 +struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
30089 +#ifndef CONFIG_X86_NO_IDT
30090 +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
30093 +char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
30095 +unsigned long __supported_pte_mask __read_mostly = ~0UL;
30096 +EXPORT_SYMBOL(__supported_pte_mask);
30097 +static int do_not_nx __cpuinitdata = 0;
30100 +Control non-executable mappings for 64bit processes.
30102 +on	Enable (default)
30105 +int __init nonx_setup(char *str)
30107 + if (!strncmp(str, "on", 2)) {
30108 + __supported_pte_mask |= _PAGE_NX;
30110 + } else if (!strncmp(str, "off", 3)) {
30112 + __supported_pte_mask &= ~_PAGE_NX;
30116 +__setup("noexec=", nonx_setup); /* parsed early actually */
30118 +int force_personality32 = 0;
30120 +/* noexec32=on|off
30121 +Control non-executable heap for 32bit processes.
30122 +To control the stack too use noexec=off
30124 +on PROT_READ does not imply PROT_EXEC for 32bit processes
30125 +off PROT_READ implies PROT_EXEC (default)
30127 +static int __init nonx32_setup(char *str)
30129 + if (!strcmp(str, "on"))
30130 + force_personality32 &= ~READ_IMPLIES_EXEC;
30131 + else if (!strcmp(str, "off"))
30132 + force_personality32 |= READ_IMPLIES_EXEC;
30135 +__setup("noexec32=", nonx32_setup);
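Taken together, the two hooks above mean that booting with, e.g., "noexec=on noexec32=off" enforces NX for 64-bit mappings while restoring the historical 32-bit behaviour:

    /* "noexec=on"    -> __supported_pte_mask |= _PAGE_NX
       "noexec32=off" -> force_personality32 |= READ_IMPLIES_EXEC,
                         i.e. PROT_READ implies PROT_EXEC for 32-bit tasks */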
30138 + * Great future plan:
30139 + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
30140 + * Always point %gs to its beginning
30142 +void __init setup_per_cpu_areas(void)
30145 + unsigned long size;
30147 +#ifdef CONFIG_HOTPLUG_CPU
30148 + prefill_possible_map();
30151 + /* Copy section for each CPU (we discard the original) */
30152 + size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
30153 +#ifdef CONFIG_MODULES
30154 + if (size < PERCPU_ENOUGH_ROOM)
30155 + size = PERCPU_ENOUGH_ROOM;
30158 + for_each_cpu_mask (i, cpu_possible_map) {
30161 + if (!NODE_DATA(cpu_to_node(i))) {
30162 + printk("cpu with no node %d, num_online_nodes %d\n",
30163 + i, num_online_nodes());
30164 + ptr = alloc_bootmem(size);
30166 + ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
30169 + panic("Cannot allocate cpu data for CPU %d\n", i);
30170 + cpu_pda(i)->data_offset = ptr - __per_cpu_start;
30171 + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
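A sketch of how the data_offset recorded above is consumed, modeled on the per_cpu() machinery of this era (an assumption for illustration, not part of the patch):

    /* each CPU reaches its private copy of .data.percpu by adding its
       offset to the variable's link-time address: */
    #define per_cpu(var, cpu) \
            (*RELOC_HIDE(&per_cpu__##var, cpu_pda(cpu)->data_offset))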
30176 +static void switch_pt(void)
30178 + xen_pt_switch(__pa_symbol(init_level4_pgt));
30179 + xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
30182 +static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
30184 + unsigned long frames[16];
30185 + unsigned long va;
30188 + for (va = gdt_descr->address, f = 0;
30189 + va < gdt_descr->address + gdt_descr->size;
30190 + va += PAGE_SIZE, f++) {
30191 + frames[f] = virt_to_mfn(va);
30192 + make_page_readonly(
30193 + (void *)va, XENFEAT_writable_descriptor_tables);
30195 + if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) /
30196 + sizeof (struct desc_struct)))
30200 +static void switch_pt(void)
30202 + asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
30205 +static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
30207 + asm volatile("lgdt %0" :: "m" (*gdt_descr));
30208 + asm volatile("lidt %0" :: "m" (idt_descr));
30212 +void pda_init(int cpu)
30214 + struct x8664_pda *pda = cpu_pda(cpu);
30216 +	/* Set up data that may be needed in __get_free_pages early */
30217 + asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
30218 +#ifndef CONFIG_XEN
30219 + wrmsrl(MSR_GS_BASE, pda);
30221 + if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
30222 + (unsigned long)pda))
30225 + pda->cpunumber = cpu;
30226 + pda->irqcount = -1;
30227 + pda->kernelstack =
30228 + (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
30229 + pda->active_mm = &init_mm;
30230 + pda->mmu_state = 0;
30236 + /* others are initialized in smpboot.c */
30237 + pda->pcurrent = &init_task;
30238 + pda->irqstackptr = boot_cpu_stack;
30240 + pda->irqstackptr = (char *)
30241 + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
30242 + if (!pda->irqstackptr)
30243 + panic("cannot allocate irqstack for cpu %d", cpu);
30248 + pda->irqstackptr += IRQSTACKSIZE-64;
30251 +#ifndef CONFIG_X86_NO_TSS
30252 +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
30253 +__attribute__((section(".bss.page_aligned")));
30256 +/* May not be marked __init: used by software suspend */
30257 +void syscall_init(void)
30259 +#ifndef CONFIG_XEN
30261 + * LSTAR and STAR live in a bit strange symbiosis.
30262 + * They both write to the same internal register. STAR allows setting CS/DS,
30263 + * but only a 32bit target. LSTAR sets the 64bit rip.
30265 + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
30266 + wrmsrl(MSR_LSTAR, system_call);
30268 + /* Flags to clear on syscall */
30269 + wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
30271 +#ifdef CONFIG_IA32_EMULATION
30272 + syscall32_cpu_init ();
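Layout sketch for the MSR_STAR value composed above: bits 47..32 seed CS/SS for syscall and bits 63..48 seed them for sysret (SS being the selector + 8 in each case). With the usual __KERNEL_CS == 0x10 and __USER32_CS == 0x23:

    /* ((u64)0x23 << 48) | ((u64)0x10 << 32) == 0x0023001000000000 */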
30276 +void __cpuinit check_efer(void)
30278 + unsigned long efer;
30280 + rdmsrl(MSR_EFER, efer);
30281 + if (!(efer & EFER_NX) || do_not_nx) {
30282 + __supported_pte_mask &= ~_PAGE_NX;
30286 +unsigned long kernel_eflags;
30289 + * cpu_init() initializes state that is per-CPU. Some data is already
30290 + * initialized (naturally) in the bootstrap process, such as the GDT
30291 + * and IDT. We reload them nevertheless; this function acts as a
30292 + * 'CPU state barrier': nothing should get across.
30293 + * A lot of state is already set up in PDA init.
30295 +void __cpuinit cpu_init (void)
30297 + int cpu = stack_smp_processor_id();
30298 +#ifndef CONFIG_X86_NO_TSS
30299 + struct tss_struct *t = &per_cpu(init_tss, cpu);
30300 + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
30302 + char *estacks = NULL;
30305 + struct task_struct *me;
30307 + /* CPU 0 is initialised in head64.c */
30310 + zap_low_mappings(cpu);
30312 +#ifndef CONFIG_X86_NO_TSS
30314 + estacks = boot_exception_stacks;
30319 + if (cpu_test_and_set(cpu, cpu_initialized))
30320 + panic("CPU#%d already initialized!\n", cpu);
30322 + printk("Initializing CPU#%d\n", cpu);
30324 + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
30327 + * Initialize the per-CPU GDT with the boot GDT,
30328 + * and set up the GDT descriptor:
30330 +#ifndef CONFIG_XEN
30332 + memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
30335 + cpu_gdt_descr[cpu].size = GDT_SIZE;
30336 + cpu_gdt_init(&cpu_gdt_descr[cpu]);
30338 + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
30341 + wrmsrl(MSR_FS_BASE, 0);
30342 + wrmsrl(MSR_KERNEL_GS_BASE, 0);
30347 +#ifndef CONFIG_X86_NO_TSS
30349 + * set up and load the per-CPU TSS
30351 + for (v = 0; v < N_EXCEPTION_STACKS; v++) {
30353 + static const unsigned int order[N_EXCEPTION_STACKS] = {
30354 + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
30355 + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
30358 + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
30360 + panic("Cannot allocate exception stack %ld %d\n",
30364 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
30365 + case DEBUG_STACK:
30366 + cpu_pda(cpu)->debugstack = (unsigned long)estacks;
30367 + estacks += DEBUG_STKSZ;
30371 + estacks += EXCEPTION_STKSZ;
30374 + orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
30377 + t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
30379 + * <= is required because the CPU will access up to
30380 + * 8 bits beyond the end of the IO permission bitmap.
30382 + for (i = 0; i <= IO_BITMAP_LONGS; i++)
30383 + t->io_bitmap[i] = ~0UL;
30386 + atomic_inc(&init_mm.mm_count);
30387 + me->active_mm = &init_mm;
30390 + enter_lazy_tlb(&init_mm, me);
30392 +#ifndef CONFIG_X86_NO_TSS
30393 + set_tss_desc(cpu, t);
30395 +#ifndef CONFIG_XEN
30398 + load_LDT(&init_mm.context);
30401 + * Clear all 6 debug registers:
30404 + set_debugreg(0UL, 0);
30405 + set_debugreg(0UL, 1);
30406 + set_debugreg(0UL, 2);
30407 + set_debugreg(0UL, 3);
30408 + set_debugreg(0UL, 6);
30409 + set_debugreg(0UL, 7);
30413 + raw_local_save_flags(kernel_eflags);
30415 Index: head-2008-11-25/arch/x86/kernel/smp_64-xen.c
30416 ===================================================================
30417 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
30418 +++ head-2008-11-25/arch/x86/kernel/smp_64-xen.c 2008-04-02 12:34:02.000000000 +0200
30421 + * Intel SMP support routines.
30423 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
30424 + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
30425 + * (c) 2002,2003 Andi Kleen, SuSE Labs.
30427 + * This code is released under the GNU General Public License version 2 or
30431 +#include <linux/init.h>
30433 +#include <linux/mm.h>
30434 +#include <linux/delay.h>
30435 +#include <linux/spinlock.h>
30436 +#include <linux/smp_lock.h>
30437 +#include <linux/smp.h>
30438 +#include <linux/kernel_stat.h>
30439 +#include <linux/mc146818rtc.h>
30440 +#include <linux/interrupt.h>
30442 +#include <asm/mtrr.h>
30443 +#include <asm/pgalloc.h>
30444 +#include <asm/tlbflush.h>
30445 +#include <asm/mach_apic.h>
30446 +#include <asm/mmu_context.h>
30447 +#include <asm/proto.h>
30448 +#include <asm/apicdef.h>
30449 +#include <asm/idle.h>
30451 +#include <xen/evtchn.h>
30454 +#ifndef CONFIG_XEN
30456 + * Smarter SMP flushing macros.
30457 + * c/o Linus Torvalds.
30459 + * These mean you can really definitely utterly forget about
30460 + * writing to user space from interrupts. (It's not allowed anyway.)
30462 + * Optimizations Manfred Spraul <manfred@colorfullife.com>
30464 + * More scalable flush, from Andi Kleen
30466 + * To avoid global state use 8 different call vectors.
30467 + * Each CPU uses a specific vector to trigger flushes on other
30468 + * CPUs. Depending on the received vector the target CPUs look into
30469 + * the right per cpu variable for the flush data.
30471 + * With more than 8 CPUs they are hashed to the 8 available
30472 + * vectors. The limited global vector space forces us to this right now.
30473 + * In future when interrupts are split into per CPU domains this could be
30474 + * fixed, at the cost of triggering multiple IPIs in some cases.
30477 +union smp_flush_state {
30479 + cpumask_t flush_cpumask;
30480 + struct mm_struct *flush_mm;
30481 + unsigned long flush_va;
30482 +#define FLUSH_ALL -1ULL
30483 + spinlock_t tlbstate_lock;
30485 + char pad[SMP_CACHE_BYTES];
30486 +} ____cacheline_aligned;
30488 +/* State is put into the per CPU data section, but padded
30489 + to a full cache line because other CPUs can access it and we don't
30490 + want false sharing in the per cpu data segment. */
30491 +static DEFINE_PER_CPU(union smp_flush_state, flush_state);
30494 + * We cannot call mmdrop() because we are in interrupt context,
30495 + * instead update mm->cpu_vm_mask.
30497 +static inline void leave_mm(unsigned long cpu)
30499 + if (read_pda(mmu_state) == TLBSTATE_OK)
30501 + cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
30502 + load_cr3(swapper_pg_dir);
30507 + * The flush IPI assumes that a thread switch happens in this order:
30508 + * [cpu0: the cpu that switches]
30509 + * 1) switch_mm() either 1a) or 1b)
30510 + * 1a) thread switch to a different mm
30511 + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
30512 + * Stop ipi delivery for the old mm. This is not synchronized with
30513 + * the other cpus, but smp_invalidate_interrupt ignores flush ipis
30514 + * for the wrong mm, and in the worst case we perform a superfluous
30516 + * 1a2) set cpu mmu_state to TLBSTATE_OK
30517 + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
30518 + * was in lazy tlb mode.
30519 + * 1a3) update cpu active_mm
30520 + * Now cpu0 accepts tlb flushes for the new mm.
30521 + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
30522 + * Now the other cpus will send tlb flush ipis.
30523 + * 1a5) change cr3.
30524 + * 1b) thread switch without mm change
30525 + * cpu active_mm is correct, cpu0 already handles
30527 + * 1b1) set cpu mmu_state to TLBSTATE_OK
30528 + * 1b2) test_and_set the cpu bit in cpu_vm_mask.
30529 + * Atomically set the bit [other cpus will start sending flush ipis],
30530 + * and test the bit.
30531 + * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
30532 + * 2) switch %%rsp, i.e. current
30534 + * The interrupt must handle 2 special cases:
30535 + * - cr3 is changed before %%rsp, i.e. it cannot use current->{active_,}mm.
30536 + * - the cpu performs speculative tlb reads, i.e. even if the cpu only
30537 + * runs in kernel space, the cpu could load tlb entries for user space
30540 + * The good news is that cpu mmu_state is local to each cpu, no
30541 + * write/read ordering problems.
30547 + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
30548 + * 2) Leave the mm if we are in the lazy tlb mode.
30550 + * Interrupts are disabled.
30553 +asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
30557 + union smp_flush_state *f;
30559 + cpu = smp_processor_id();
30561 + * orig_rax contains the negated interrupt vector.
30562 + * Use that to determine where the sender put the data.
30564 + sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
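+ /*
+ * Worked example: if vector INVALIDATE_TLB_VECTOR_START + 3 fired, then
+ * orig_rax == ~(INVALIDATE_TLB_VECTOR_START + 3), and the line above
+ * recovers sender slot 3.
+ */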
30565 + f = &per_cpu(flush_state, sender);
30567 + if (!cpu_isset(cpu, f->flush_cpumask))
30570 + * This was a BUG() but until someone can quote me the
30571 + * line from the intel manual that guarantees an IPI to
30572 + * multiple CPUs is retried _only_ on the erroring CPUs
30573 + * it's staying as a return
30578 + if (f->flush_mm == read_pda(active_mm)) {
30579 + if (read_pda(mmu_state) == TLBSTATE_OK) {
30580 + if (f->flush_va == FLUSH_ALL)
30581 + local_flush_tlb();
30583 + __flush_tlb_one(f->flush_va);
30589 + cpu_clear(cpu, f->flush_cpumask);
30592 +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
30593 + unsigned long va)
30596 + union smp_flush_state *f;
30598 + /* Caller has disabled preemption */
30599 + sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
30600 + f = &per_cpu(flush_state, sender);
30602 + /* Could avoid this lock when
30603 + num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
30604 + probably not worth checking this for a cache-hot lock. */
30605 + spin_lock(&f->tlbstate_lock);
30607 + f->flush_mm = mm;
30608 + f->flush_va = va;
30609 + cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
30612 + * We have to send the IPI only to
30615 + send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
30617 + while (!cpus_empty(f->flush_cpumask))
30620 + f->flush_mm = NULL;
30622 + spin_unlock(&f->tlbstate_lock);
30625 +int __cpuinit init_smp_flush(void)
30628 + for_each_cpu_mask(i, cpu_possible_map) {
30629 + spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
30634 +core_initcall(init_smp_flush);
30636 +void flush_tlb_current_task(void)
30638 + struct mm_struct *mm = current->mm;
30639 + cpumask_t cpu_mask;
30641 + preempt_disable();
30642 + cpu_mask = mm->cpu_vm_mask;
30643 + cpu_clear(smp_processor_id(), cpu_mask);
30645 + local_flush_tlb();
30646 + if (!cpus_empty(cpu_mask))
30647 + flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
30648 + preempt_enable();
30650 +EXPORT_SYMBOL(flush_tlb_current_task);
30652 +void flush_tlb_mm (struct mm_struct * mm)
30654 + cpumask_t cpu_mask;
30656 + preempt_disable();
30657 + cpu_mask = mm->cpu_vm_mask;
30658 + cpu_clear(smp_processor_id(), cpu_mask);
30660 + if (current->active_mm == mm) {
30662 + local_flush_tlb();
30664 + leave_mm(smp_processor_id());
30666 + if (!cpus_empty(cpu_mask))
30667 + flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
30669 + preempt_enable();
30671 +EXPORT_SYMBOL(flush_tlb_mm);
30673 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
30675 + struct mm_struct *mm = vma->vm_mm;
30676 + cpumask_t cpu_mask;
30678 + preempt_disable();
30679 + cpu_mask = mm->cpu_vm_mask;
30680 + cpu_clear(smp_processor_id(), cpu_mask);
30682 + if (current->active_mm == mm) {
30684 + __flush_tlb_one(va);
30686 + leave_mm(smp_processor_id());
30689 + if (!cpus_empty(cpu_mask))
30690 + flush_tlb_others(cpu_mask, mm, va);
30692 + preempt_enable();
30694 +EXPORT_SYMBOL(flush_tlb_page);
30696 +static void do_flush_tlb_all(void* info)
30698 + unsigned long cpu = smp_processor_id();
30700 + __flush_tlb_all();
30701 + if (read_pda(mmu_state) == TLBSTATE_LAZY)
30705 +void flush_tlb_all(void)
30707 + on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
30712 + * this function sends a 'reschedule' IPI to another CPU.
30713 + * it goes straight through and wastes no time serializing
30714 + * anything. Worst case is that we lose a reschedule ...
30717 +void smp_send_reschedule(int cpu)
30719 + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
30723 + * Structure and data for smp_call_function(). This is designed to minimise
30724 + * static memory requirements. It also looks cleaner.
30726 +static DEFINE_SPINLOCK(call_lock);
30728 +struct call_data_struct {
30729 + void (*func) (void *info);
30731 + atomic_t started;
30732 + atomic_t finished;
30736 +static struct call_data_struct * call_data;
30738 +void lock_ipi_call_lock(void)
30740 + spin_lock_irq(&call_lock);
30743 +void unlock_ipi_call_lock(void)
30745 + spin_unlock_irq(&call_lock);
30749 + * this function sends a 'generic call function' IPI to one other CPU
30752 + * cpu is a standard Linux logical CPU number.
30755 +__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
30756 + int nonatomic, int wait)
30758 + struct call_data_struct data;
30761 + data.func = func;
30762 + data.info = info;
30763 + atomic_set(&data.started, 0);
30764 + data.wait = wait;
30766 + atomic_set(&data.finished, 0);
30768 + call_data = &data;
30770 + /* Send a message to the target CPU and wait for it to respond */
30771 + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
30773 + /* Wait for response */
30774 + while (atomic_read(&data.started) != cpus)
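+ /*
+ * Note: the 'started' wait above is needed even when wait == 0, because
+ * 'data' lives on this stack frame -- the target must have copied
+ * func/info out before we can safely return and reuse the stack.
+ */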
30780 + while (atomic_read(&data.finished) != cpus)
30785 + * smp_call_function_single - Run a function on another CPU
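+ * @cpu: The (logical) CPU on which to run the function.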
30786 + * @func: The function to run. This must be fast and non-blocking.
30787 + * @info: An arbitrary pointer to pass to the function.
30788 + * @nonatomic: Currently unused.
30789 + * @wait: If true, wait until function has completed on other CPUs.
30791 + * Returns 0 on success, else a negative status code.
30793 + * Does not return until the remote CPU is nearly ready to execute <func>,
30794 + * is executing it, or has already executed it.
30797 +int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
30798 + int nonatomic, int wait)
30800 + /* prevent preemption and reschedule on another processor */
30801 + int me = get_cpu();
30807 + spin_lock_bh(&call_lock);
30808 + __smp_call_function_single(cpu, func, info, nonatomic, wait);
30809 + spin_unlock_bh(&call_lock);
30815 + * this function sends a 'generic call function' IPI to all other CPUs
30818 +static void __smp_call_function (void (*func) (void *info), void *info,
30819 + int nonatomic, int wait)
30821 + struct call_data_struct data;
30822 + int cpus = num_online_cpus()-1;
30827 + data.func = func;
30828 + data.info = info;
30829 + atomic_set(&data.started, 0);
30830 + data.wait = wait;
30832 + atomic_set(&data.finished, 0);
30834 + call_data = &data;
30836 + /* Send a message to all other CPUs and wait for them to respond */
30837 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
30839 + /* Wait for response */
30840 + while (atomic_read(&data.started) != cpus)
30846 + while (atomic_read(&data.finished) != cpus)
30851 + * smp_call_function - run a function on all other CPUs.
30852 + * @func: The function to run. This must be fast and non-blocking.
30853 + * @info: An arbitrary pointer to pass to the function.
30854 + * @nonatomic: currently unused.
30855 + * @wait: If true, wait (atomically) until function has completed on other
30858 + * Returns 0 on success, else a negative status code. Does not return until
30859 + * remote CPUs are nearly ready to execute func, are executing it, or have executed it.
30861 + * You must not call this function with disabled interrupts or from a
30862 + * hardware interrupt handler or from a bottom half handler.
30863 + * Actually there are a few legal cases, like panic.
30865 +int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
30868 + spin_lock(&call_lock);
30869 + __smp_call_function(func,info,nonatomic,wait);
30870 + spin_unlock(&call_lock);
30873 +EXPORT_SYMBOL(smp_call_function);
30875 +void smp_stop_cpu(void)
30877 + unsigned long flags;
30879 + * Remove this CPU:
30881 + cpu_clear(smp_processor_id(), cpu_online_map);
30882 + local_irq_save(flags);
30883 + disable_all_local_evtchn();
30884 + local_irq_restore(flags);
30887 +static void smp_really_stop_cpu(void *dummy)
30894 +void smp_send_stop(void)
30897 +#ifndef CONFIG_XEN
30898 + if (reboot_force)
30901 + /* Don't deadlock on the call lock in panic */
30902 + if (!spin_trylock(&call_lock)) {
30903 + /* ignore locking because we have panicked anyway */
30906 + __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
30908 + spin_unlock(&call_lock);
30910 + local_irq_disable();
30911 + disable_all_local_evtchn();
30912 + local_irq_enable();
30916 + * Reschedule call back. Nothing to do,
30917 + * all the work is done automatically when
30918 + * we return from the interrupt.
30920 +#ifndef CONFIG_XEN
30921 +asmlinkage void smp_reschedule_interrupt(void)
30923 +asmlinkage irqreturn_t smp_reschedule_interrupt(void)
30926 +#ifndef CONFIG_XEN
30929 + return IRQ_HANDLED;
30933 +#ifndef CONFIG_XEN
30934 +asmlinkage void smp_call_function_interrupt(void)
30936 +asmlinkage irqreturn_t smp_call_function_interrupt(void)
30939 + void (*func) (void *info) = call_data->func;
30940 + void *info = call_data->info;
30941 + int wait = call_data->wait;
30943 +#ifndef CONFIG_XEN
30947 + * Notify initiating CPU that I've grabbed the data and am
30948 + * about to execute the function
30951 + atomic_inc(&call_data->started);
30953 + * At this point the info structure may be out of scope unless wait==1
30961 + atomic_inc(&call_data->finished);
30964 + return IRQ_HANDLED;
30968 +int safe_smp_processor_id(void)
30971 + return smp_processor_id();
30973 + unsigned apicid, i;
30975 + if (disable_apic)
30978 + apicid = hard_smp_processor_id();
30979 + if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid)
30982 + for (i = 0; i < NR_CPUS; ++i) {
30983 + if (x86_cpu_to_apicid[i] == apicid)
30987 + /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI,
30988 + * or called too early. Either way, we must be CPU 0. */
30989 + if (x86_cpu_to_apicid[0] == BAD_APICID)
30992 + return 0; /* Should not happen */
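
The per-vector flush handshake implemented above is easier to see in
isolation. The following is a minimal user-space model of the same idea
(not part of the patch: the cpumask is narrowed to one 64-bit word, the
spinlock and cache-line padding are omitted, and send_ipi()/local_flush()
are imaginary stand-ins):

	#include <stdatomic.h>
	#include <stdint.h>

	#define NUM_VECTORS 8			/* NUM_INVALIDATE_TLB_VECTORS */

	struct flush_slot {
		_Atomic uint64_t cpumask;	/* one bit per pending target */
		void *mm;			/* address space being flushed */
		unsigned long va;		/* one VA, or an all-ones FLUSH_ALL */
	};

	static struct flush_slot slots[NUM_VECTORS];

	/* Sender: hash onto a vector, publish the request, then spin until
	 * every target has acknowledged by clearing its bit. */
	static void flush_others(unsigned sender_cpu, uint64_t targets,
				 void *mm, unsigned long va)
	{
		struct flush_slot *s = &slots[sender_cpu % NUM_VECTORS];

		s->mm = mm;
		s->va = va;
		atomic_fetch_or(&s->cpumask, targets);
		/* send_ipi(targets, START + sender_cpu % NUM_VECTORS); */
		while (atomic_load(&s->cpumask))
			;			/* wait for all acks */
	}

	/* Receiver (interrupt context): flush locally, then acknowledge by
	 * clearing our bit -- the store the sender is spinning on. */
	static void flush_ack(unsigned cpu, unsigned vector)
	{
		/* local_flush(slots[vector].mm, slots[vector].va); */
		atomic_fetch_and(&slots[vector].cpumask,
				 ~(UINT64_C(1) << cpu));
	}
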
30995 Index: head-2008-11-25/arch/x86/kernel/traps_64-xen.c
30996 ===================================================================
30997 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
30998 +++ head-2008-11-25/arch/x86/kernel/traps_64-xen.c 2008-04-02 12:34:02.000000000 +0200
31001 + * linux/arch/x86-64/traps.c
31003 + * Copyright (C) 1991, 1992 Linus Torvalds
31004 + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
31006 + * Pentium III FXSR, SSE support
31007 + * Gareth Hughes <gareth@valinux.com>, May 2000
31011 + * 'Traps.c' handles hardware traps and faults after we have saved some
31012 + * state in 'entry.S'.
31014 +#include <linux/sched.h>
31015 +#include <linux/kernel.h>
31016 +#include <linux/string.h>
31017 +#include <linux/errno.h>
31018 +#include <linux/ptrace.h>
31019 +#include <linux/timer.h>
31020 +#include <linux/mm.h>
31021 +#include <linux/init.h>
31022 +#include <linux/delay.h>
31023 +#include <linux/spinlock.h>
31024 +#include <linux/interrupt.h>
31025 +#include <linux/module.h>
31026 +#include <linux/moduleparam.h>
31027 +#include <linux/nmi.h>
31028 +#include <linux/kprobes.h>
31029 +#include <linux/kexec.h>
31030 +#include <linux/unwind.h>
31032 +#include <asm/system.h>
31033 +#include <asm/uaccess.h>
31034 +#include <asm/io.h>
31035 +#include <asm/atomic.h>
31036 +#include <asm/debugreg.h>
31037 +#include <asm/desc.h>
31038 +#include <asm/i387.h>
31039 +#include <asm/kdebug.h>
31040 +#include <asm/processor.h>
31041 +#include <asm/unwind.h>
31042 +#include <asm/smp.h>
31043 +#include <asm/pgalloc.h>
31044 +#include <asm/pda.h>
31045 +#include <asm/proto.h>
31046 +#include <asm/nmi.h>
31048 +asmlinkage void divide_error(void);
31049 +asmlinkage void debug(void);
31050 +asmlinkage void nmi(void);
31051 +asmlinkage void int3(void);
31052 +asmlinkage void overflow(void);
31053 +asmlinkage void bounds(void);
31054 +asmlinkage void invalid_op(void);
31055 +asmlinkage void device_not_available(void);
31056 +asmlinkage void double_fault(void);
31057 +asmlinkage void coprocessor_segment_overrun(void);
31058 +asmlinkage void invalid_TSS(void);
31059 +asmlinkage void segment_not_present(void);
31060 +asmlinkage void stack_segment(void);
31061 +asmlinkage void general_protection(void);
31062 +asmlinkage void page_fault(void);
31063 +asmlinkage void coprocessor_error(void);
31064 +asmlinkage void simd_coprocessor_error(void);
31065 +asmlinkage void reserved(void);
31066 +asmlinkage void alignment_check(void);
31067 +asmlinkage void machine_check(void);
31068 +asmlinkage void spurious_interrupt_bug(void);
31070 +ATOMIC_NOTIFIER_HEAD(die_chain);
31071 +EXPORT_SYMBOL(die_chain);
31073 +int register_die_notifier(struct notifier_block *nb)
31075 + vmalloc_sync_all();
31076 + return atomic_notifier_chain_register(&die_chain, nb);
31078 +EXPORT_SYMBOL(register_die_notifier); /* used modularly by kdb */
31080 +int unregister_die_notifier(struct notifier_block *nb)
31082 + return atomic_notifier_chain_unregister(&die_chain, nb);
31084 +EXPORT_SYMBOL(unregister_die_notifier); /* used modularly by kdb */
31086 +static inline void conditional_sti(struct pt_regs *regs)
31088 + if (regs->eflags & X86_EFLAGS_IF)
31089 + local_irq_enable();
31092 +static inline void preempt_conditional_sti(struct pt_regs *regs)
31094 + preempt_disable();
31095 + if (regs->eflags & X86_EFLAGS_IF)
31096 + local_irq_enable();
31099 +static inline void preempt_conditional_cli(struct pt_regs *regs)
31101 + if (regs->eflags & X86_EFLAGS_IF)
31102 + local_irq_disable();
31103 + /* Make sure to not schedule here because we could be running
31104 + on an exception stack. */
31105 + preempt_enable_no_resched();
31108 +static int kstack_depth_to_print = 12;
31109 +#ifdef CONFIG_STACK_UNWIND
31110 +static int call_trace = 1;
31112 +#define call_trace (-1)
31115 +#ifdef CONFIG_KALLSYMS
31116 +# include <linux/kallsyms.h>
31117 +void printk_address(unsigned long address)
31119 + unsigned long offset = 0, symsize;
31120 + const char *symname;
31122 + char *delim = ":";
31123 + char namebuf[128];
31125 + symname = kallsyms_lookup(address, &symsize, &offset,
31126 + &modname, namebuf);
31128 + printk(" [<%016lx>]\n", address);
31132 + modname = delim = "";
31133 + printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
31134 + address, delim, modname, delim, symname, offset, symsize);
31137 +void printk_address(unsigned long address)
31139 + printk(" [<%016lx>]\n", address);
31143 +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
31144 + unsigned *usedp, const char **idp)
31146 +#ifndef CONFIG_X86_NO_TSS
31147 + static char ids[][8] = {
31148 + [DEBUG_STACK - 1] = "#DB",
31149 + [NMI_STACK - 1] = "NMI",
31150 + [DOUBLEFAULT_STACK - 1] = "#DF",
31151 + [STACKFAULT_STACK - 1] = "#SS",
31152 + [MCE_STACK - 1] = "#MC",
31153 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
31154 + [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
31160 + * Iterate over all exception stacks, and figure out whether
31161 + * 'stack' is in one of them:
31163 + for (k = 0; k < N_EXCEPTION_STACKS; k++) {
31164 + unsigned long end;
31167 + * set 'end' to the end of the exception stack.
31171 + * TODO: this block is not needed, I think, because
31172 + * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK]
31175 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
31176 + case DEBUG_STACK:
31177 + end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
31181 + end = per_cpu(orig_ist, cpu).ist[k];
31185 + * Is 'stack' above this exception frame's end?
31186 + * If yes then skip to the next frame.
31188 + if (stack >= end)
31191 + * Is 'stack' above this exception frame's start address?
31192 + * If yes then we found the right frame.
31194 + if (stack >= end - EXCEPTION_STKSZ) {
31196 + * Make sure we only iterate through an exception
31197 + * stack once. If it comes up for the second time
31198 + * then there's something wrong going on - just
31199 + * break out and return NULL:
31201 + if (*usedp & (1U << k))
31203 + *usedp |= 1U << k;
31205 + return (unsigned long *)end;
31208 + * If this is a debug stack, and if it has a larger size than
31209 + * the usual exception stacks, then 'stack' might still
31210 + * be within the lower portion of the debug stack:
31212 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
31213 + if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
31214 + unsigned j = N_EXCEPTION_STACKS - 1;
31217 + * Black magic. A large debug stack is composed of
31218 + * multiple exception stack entries, which we
31219 + * iterate through now. Don't look:
31223 + end -= EXCEPTION_STKSZ;
31224 + ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
31225 + } while (stack < end - EXCEPTION_STKSZ);
31226 + if (*usedp & (1U << j))
31228 + *usedp |= 1U << j;
31230 + return (unsigned long *)end;
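+ /*
+ * e.g. with DEBUG_STKSZ == 2 * EXCEPTION_STKSZ there is exactly one
+ * synthetic "#DB[?]" entry, and the patching above rewrites it to
+ * "#DB[1]" for the lower slice of the debug stack.
+ */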
31238 +static int show_trace_unwind(struct unwind_frame_info *info, void *context)
31242 + while (unwind(info) == 0 && UNW_PC(info)) {
31244 + printk_address(UNW_PC(info));
31245 + if (arch_unw_user_mode(info))
31252 + * x86-64 can have up to three kernel stacks:
31254 + * interrupt stack
31255 + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
31258 +void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack)
31260 + const unsigned cpu = safe_smp_processor_id();
31261 + unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
31262 + unsigned used = 0;
31264 + printk("\nCall Trace:\n");
31269 + if (call_trace >= 0) {
31271 + struct unwind_frame_info info;
31274 + if (unwind_init_frame_info(&info, tsk, regs) == 0)
31275 + unw_ret = show_trace_unwind(&info, NULL);
31276 + } else if (tsk == current)
31277 + unw_ret = unwind_init_running(&info, show_trace_unwind, NULL);
31279 + if (unwind_init_blocked(&info, tsk) == 0)
31280 + unw_ret = show_trace_unwind(&info, NULL);
31282 + if (unw_ret > 0) {
31283 + if (call_trace == 1 && !arch_unw_user_mode(&info)) {
31284 + print_symbol("DWARF2 unwinder stuck at %s\n",
31286 + if ((long)UNW_SP(&info) < 0) {
31287 + printk("Leftover inexact backtrace:\n");
31288 + stack = (unsigned long *)UNW_SP(&info);
31290 + printk("Full inexact backtrace again:\n");
31291 + } else if (call_trace >= 1)
31294 + printk("Full inexact backtrace again:\n");
31296 + printk("Inexact backtrace:\n");
31300 + * Print function call entries within a stack. 'cond' is the
31301 + * "end of stackframe" condition, that the 'stack++'
31302 + * iteration will eventually trigger.
31304 +#define HANDLE_STACK(cond) \
31305 + do while (cond) { \
31306 + unsigned long addr = *stack++; \
31307 + if (kernel_text_address(addr)) { \
31309 + * If the address is either in the text segment of the \
31310 + * kernel, or in the region which contains vmalloc'ed \
31311 + * memory, it *may* be the address of a calling \
31312 + * routine; if so, print it so that someone tracing \
31313 + * down the cause of the crash will be able to figure \
31314 + * out the call path that was taken. \
31316 + printk_address(addr); \
31321 + * Print function call entries in all stacks, starting at the
31322 + * current stack address. If the stacks consist of nested
31327 + unsigned long *estack_end;
31328 + estack_end = in_exception_stack(cpu, (unsigned long)stack,
31331 + if (estack_end) {
31332 + printk(" <%s>", id);
31333 + HANDLE_STACK (stack < estack_end);
31334 + printk(" <EOE>");
31336 + * We link to the next stack via the
31337 + * second-to-last pointer (index -2 to end) in the
31338 + * exception stack:
31340 + stack = (unsigned long *) estack_end[-2];
31343 + if (irqstack_end) {
31344 + unsigned long *irqstack;
31345 + irqstack = irqstack_end -
31346 + (IRQSTACKSIZE - 64) / sizeof(*irqstack);
31348 + if (stack >= irqstack && stack < irqstack_end) {
31349 + printk(" <IRQ>");
31350 + HANDLE_STACK (stack < irqstack_end);
31352 + * We link to the next stack (which would be
31353 + * the process stack normally) via the last
31354 + * pointer (index -1 to end) in the IRQ stack:
31356 + stack = (unsigned long *) (irqstack_end[-1]);
31357 + irqstack_end = NULL;
31358 + printk(" <EOI>");
31366 + * This prints the process stack:
31368 + HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
31369 +#undef HANDLE_STACK
31374 +static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp)
31376 + unsigned long *stack;
31378 + const int cpu = safe_smp_processor_id();
31379 + unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
31380 + unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
31382 + // debugging aid: "show_stack(NULL, NULL);" prints the
31383 + // back trace for this cpu.
31385 + if (rsp == NULL) {
31387 + rsp = (unsigned long *)tsk->thread.rsp;
31389 + rsp = (unsigned long *)&rsp;
31393 + for(i=0; i < kstack_depth_to_print; i++) {
31394 + if (stack >= irqstack && stack <= irqstack_end) {
31395 + if (stack == irqstack_end) {
31396 + stack = (unsigned long *) (irqstack_end[-1]);
31397 + printk(" <EOI> ");
31400 + if (((long) stack & (THREAD_SIZE-1)) == 0)
31403 + if (i && ((i % 4) == 0))
31405 + printk(" %016lx", *stack++);
31406 + touch_nmi_watchdog();
31408 + show_trace(tsk, regs, rsp);
31411 +void show_stack(struct task_struct *tsk, unsigned long * rsp)
31413 + _show_stack(tsk, NULL, rsp);
31417 + * The architecture-independent dump_stack generator
31419 +void dump_stack(void)
31421 + unsigned long dummy;
31422 + show_trace(NULL, NULL, &dummy);
31425 +EXPORT_SYMBOL(dump_stack);
31427 +void show_registers(struct pt_regs *regs)
31430 + int in_kernel = !user_mode(regs);
31431 + unsigned long rsp;
31432 + const int cpu = safe_smp_processor_id();
31433 + struct task_struct *cur = cpu_pda(cpu)->pcurrent;
31437 + printk("CPU %d ", cpu);
31438 + __show_regs(regs);
31439 + printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
31440 + cur->comm, cur->pid, task_thread_info(cur), cur);
31443 + * When in-kernel, we also print out the stack and code at the
31444 + * time of the fault..
31448 + printk("Stack: ");
31449 + _show_stack(NULL, regs, (unsigned long*)rsp);
31451 + printk("\nCode: ");
31452 + if (regs->rip < PAGE_OFFSET)
31455 + for (i=0; i<20; i++) {
31457 + if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
31459 + printk(" Bad RIP value.");
31462 + printk("%02x ", c);
31468 +void handle_BUG(struct pt_regs *regs)
31470 + struct bug_frame f;
31472 + const char *prefix = "";
31474 + if (user_mode(regs))
31476 + if (__copy_from_user(&f, (const void __user *) regs->rip,
31477 + sizeof(struct bug_frame)))
31479 + if (f.filename >= 0 ||
31480 + f.ud2[0] != 0x0f || f.ud2[1] != 0x0b)
31482 + len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1;
31483 + if (len < 0 || len >= PATH_MAX)
31484 + f.filename = (int)(long)"unmapped filename";
31485 + else if (len > 50) {
31486 + f.filename += len - 50;
31489 + printk("----------- [cut here ] --------- [please bite here ] ---------\n");
31490 + printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line);
31494 +void out_of_line_bug(void)
31498 +EXPORT_SYMBOL(out_of_line_bug);
31501 +static DEFINE_SPINLOCK(die_lock);
31502 +static int die_owner = -1;
31503 +static unsigned int die_nest_count;
31505 +unsigned __kprobes long oops_begin(void)
31507 + int cpu = safe_smp_processor_id();
31508 + unsigned long flags;
31510 + /* racy, but better than risking deadlock. */
31511 + local_irq_save(flags);
31512 + if (!spin_trylock(&die_lock)) {
31513 + if (cpu == die_owner)
31514 + /* nested oops. should stop eventually */;
31516 + spin_lock(&die_lock);
31518 + die_nest_count++;
31520 + console_verbose();
31521 + bust_spinlocks(1);
31525 +void __kprobes oops_end(unsigned long flags)
31528 + bust_spinlocks(0);
31529 + die_nest_count--;
31530 + if (die_nest_count)
31531 + /* We still own the lock */
31532 + local_irq_restore(flags);
31534 + /* Nest count reaches zero, release the lock. */
31535 + spin_unlock_irqrestore(&die_lock, flags);
31536 + if (panic_on_oops)
31537 + panic("Fatal exception");
31540 +void __kprobes __die(const char * str, struct pt_regs * regs, long err)
31542 + static int die_counter;
31543 + printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
31544 +#ifdef CONFIG_PREEMPT
31545 + printk("PREEMPT ");
31550 +#ifdef CONFIG_DEBUG_PAGEALLOC
31551 + printk("DEBUG_PAGEALLOC");
31554 + notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
31555 + show_registers(regs);
31556 + /* Executive summary in case the oops scrolled away */
31557 + printk(KERN_ALERT "RIP ");
31558 + printk_address(regs->rip);
31559 + printk(" RSP <%016lx>\n", regs->rsp);
31560 + if (kexec_should_crash(current))
31561 + crash_kexec(regs);
31564 +void die(const char * str, struct pt_regs * regs, long err)
31566 + unsigned long flags = oops_begin();
31568 + handle_BUG(regs);
31569 + __die(str, regs, err);
31571 + do_exit(SIGSEGV);
31574 +#ifdef CONFIG_X86_LOCAL_APIC
31575 +void __kprobes die_nmi(char *str, struct pt_regs *regs)
31577 + unsigned long flags = oops_begin();
31580 + * We are in trouble anyway, let's at least try
31581 + * to get a message out.
31583 + printk(str, safe_smp_processor_id());
31584 + show_registers(regs);
31585 + if (kexec_should_crash(current))
31586 + crash_kexec(regs);
31587 + if (panic_on_timeout || panic_on_oops)
31588 + panic("nmi watchdog");
31589 + printk("console shuts up ...\n");
31592 + local_irq_enable();
31593 + do_exit(SIGSEGV);
31597 +static void __kprobes do_trap(int trapnr, int signr, char *str,
31598 + struct pt_regs * regs, long error_code,
31601 + struct task_struct *tsk = current;
31603 + tsk->thread.error_code = error_code;
31604 + tsk->thread.trap_no = trapnr;
31606 + if (user_mode(regs)) {
31607 + if (exception_trace && unhandled_signal(tsk, signr))
31609 + "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
31610 + tsk->comm, tsk->pid, str,
31611 + regs->rip, regs->rsp, error_code);
31614 + force_sig_info(signr, info, tsk);
31616 + force_sig(signr, tsk);
31621 + /* kernel trap */
31623 + const struct exception_table_entry *fixup;
31624 + fixup = search_exception_tables(regs->rip);
31626 + regs->rip = fixup->fixup;
31628 + die(str, regs, error_code);
31633 +#define DO_ERROR(trapnr, signr, str, name) \
31634 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
31636 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
31637 + == NOTIFY_STOP) \
31639 + conditional_sti(regs); \
31640 + do_trap(trapnr, signr, str, regs, error_code, NULL); \
31643 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
31644 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
31646 + siginfo_t info; \
31647 + info.si_signo = signr; \
31648 + info.si_errno = 0; \
31649 + info.si_code = sicode; \
31650 + info.si_addr = (void __user *)siaddr; \
31651 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
31652 + == NOTIFY_STOP) \
31654 + conditional_sti(regs); \
31655 + do_trap(trapnr, signr, str, regs, error_code, &info); \
31658 +DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
31659 +DO_ERROR( 4, SIGSEGV, "overflow", overflow)
31660 +DO_ERROR( 5, SIGSEGV, "bounds", bounds)
31661 +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
31662 +DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
31663 +DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
31664 +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
31665 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
31666 +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
31667 +DO_ERROR(18, SIGSEGV, "reserved", reserved)
31669 +/* Runs on IST stack */
31670 +asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
31672 + if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
31673 + 12, SIGBUS) == NOTIFY_STOP)
31675 + preempt_conditional_sti(regs);
31676 + do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
31677 + preempt_conditional_cli(regs);
31680 +asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
31682 + static const char str[] = "double fault";
31683 + struct task_struct *tsk = current;
31685 + /* Return not checked because a double fault cannot be ignored */
31686 + notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
31688 + tsk->thread.error_code = error_code;
31689 + tsk->thread.trap_no = 8;
31691 + /* This is always a kernel trap and never fixable (and thus must
31692 + never return). */
31694 + die(str, regs, error_code);
31697 +asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
31700 + struct task_struct *tsk = current;
31702 + conditional_sti(regs);
31704 + tsk->thread.error_code = error_code;
31705 + tsk->thread.trap_no = 13;
31707 + if (user_mode(regs)) {
31708 + if (exception_trace && unhandled_signal(tsk, SIGSEGV))
31710 + "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
31711 + tsk->comm, tsk->pid,
31712 + regs->rip, regs->rsp, error_code);
31714 + force_sig(SIGSEGV, tsk);
31720 + const struct exception_table_entry *fixup;
31721 + fixup = search_exception_tables(regs->rip);
31723 + regs->rip = fixup->fixup;
31726 + if (notify_die(DIE_GPF, "general protection fault", regs,
31727 + error_code, 13, SIGSEGV) == NOTIFY_STOP)
31729 + die("general protection fault", regs, error_code);
31733 +static __kprobes void
31734 +mem_parity_error(unsigned char reason, struct pt_regs * regs)
31736 + printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
31737 + printk("You probably have a hardware problem with your RAM chips\n");
31740 + /* Clear and disable the memory parity error line. */
31741 + reason = (reason & 0xf) | 4;
31742 + outb(reason, 0x61);
31746 +static __kprobes void
31747 +io_check_error(unsigned char reason, struct pt_regs * regs)
31749 + printk("NMI: IOCK error (debug interrupt?)\n");
31750 + show_registers(regs);
31753 + /* Re-enable the IOCK line, wait for a few seconds */
31754 + reason = (reason & 0xf) | 8;
31755 + outb(reason, 0x61);
31758 + outb(reason, 0x61);
31762 +static __kprobes void
31763 +unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
31764 +{
+ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
31765 + printk("Dazed and confused, but trying to continue\n");
31766 + printk("Do you have a strange power saving mode enabled?\n");
31769 +/* Runs on IST stack. This code must keep interrupts off all the time.
31770 + Nested NMIs are prevented by the CPU. */
31771 +asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
31773 + unsigned char reason = 0;
31776 + cpu = smp_processor_id();
31778 + /* Only the BSP gets external NMIs from the system. */
31780 + reason = get_nmi_reason();
31782 + if (!(reason & 0xc0)) {
31783 + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
31786 +#ifdef CONFIG_X86_LOCAL_APIC
31788 + * Ok, so this is none of the documented NMI sources,
31789 + * so it must be the NMI watchdog.
31791 + if (nmi_watchdog > 0) {
31792 + nmi_watchdog_tick(regs,reason);
31796 + unknown_nmi_error(reason, regs);
31799 + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
31802 + /* AK: following checks seem to be broken on modern chipsets. FIXME */
31804 + if (reason & 0x80)
31805 + mem_parity_error(reason, regs);
31806 + if (reason & 0x40)
31807 + io_check_error(reason, regs);
31810 +/* runs on IST stack. */
31811 +asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
31813 + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
31816 + preempt_conditional_sti(regs);
31817 + do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
31818 + preempt_conditional_cli(regs);
31821 +/* Help handler running on IST stack to switch back to user stack
31822 + for scheduling or signal handling. The actual stack switch is done in
31824 +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
31826 + struct pt_regs *regs = eregs;
31827 + /* Already synced */
31828 + if (eregs == (struct pt_regs *)eregs->rsp)
31830 + /* Exception from user space */
31831 + else if (user_mode(eregs))
31832 + regs = task_pt_regs(current);
31833 + /* Exception from kernel and interrupts are enabled. Move to
31834 + kernel process stack. */
31835 + else if (eregs->eflags & X86_EFLAGS_IF)
31836 + regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
31837 + if (eregs != regs)
31842 +/* runs on IST stack. */
31843 +asmlinkage void __kprobes do_debug(struct pt_regs * regs,
31844 + unsigned long error_code)
31846 + unsigned long condition;
31847 + struct task_struct *tsk = current;
31850 + get_debugreg(condition, 6);
31852 + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
31853 + SIGTRAP) == NOTIFY_STOP)
31856 + preempt_conditional_sti(regs);
31858 + /* Mask out spurious debug traps due to lazy DR7 setting */
31859 + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
31860 + if (!tsk->thread.debugreg7) {
31865 + tsk->thread.debugreg6 = condition;
31867 + /* Mask out spurious TF errors due to lazy TF clearing */
31868 + if (condition & DR_STEP) {
31870 + * The TF error should be masked out only if the current
31871 + * process is not traced and if the TRAP flag has been set
31872 + * previously by a tracing process (condition detected by
31873 + * the PT_DTRACE flag); remember that the i386 TRAP flag
31874 + * can be modified by the process itself in user mode,
31875 + * allowing programs to debug themselves without the ptrace()
31878 + if (!user_mode(regs))
31879 + goto clear_TF_reenable;
31881 + * Was the TF flag set by a debugger? If so, clear it now,
31882 + * so that register information is correct.
31884 + if (tsk->ptrace & PT_DTRACE) {
31885 + regs->eflags &= ~TF_MASK;
31886 + tsk->ptrace &= ~PT_DTRACE;
31890 + /* Ok, finally something we can handle */
31891 + tsk->thread.trap_no = 1;
31892 + tsk->thread.error_code = error_code;
31893 + info.si_signo = SIGTRAP;
31894 + info.si_errno = 0;
31895 + info.si_code = TRAP_BRKPT;
31896 + info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
31897 + force_sig_info(SIGTRAP, &info, tsk);
31900 + set_debugreg(0UL, 7);
31901 + preempt_conditional_cli(regs);
31904 +clear_TF_reenable:
31905 + set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
31906 + regs->eflags &= ~TF_MASK;
31907 + preempt_conditional_cli(regs);
31910 +static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
31912 + const struct exception_table_entry *fixup;
31913 + fixup = search_exception_tables(regs->rip);
31915 + regs->rip = fixup->fixup;
31918 + notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
31919 + /* Illegal floating point operation in the kernel */
31920 + current->thread.trap_no = trapnr;
31921 + die(str, regs, 0);
31926 + * Note that we play around with the 'TS' bit in an attempt to get
31927 + * the correct behaviour even in the presence of asynchronous
31928 + * IRQ13 delivery.
31930 +asmlinkage void do_coprocessor_error(struct pt_regs *regs)
31932 + void __user *rip = (void __user *)(regs->rip);
31933 + struct task_struct * task;
31935 + unsigned short cwd, swd;
31937 + conditional_sti(regs);
31938 + if (!user_mode(regs) &&
31939 + kernel_math_error(regs, "kernel x87 math error", 16))
31943 + * Save the info for the exception handler and clear the error.
31946 + save_init_fpu(task);
31947 + task->thread.trap_no = 16;
31948 + task->thread.error_code = 0;
31949 + info.si_signo = SIGFPE;
31950 + info.si_errno = 0;
31951 + info.si_code = __SI_FAULT;
31952 + info.si_addr = rip;
31954 + * (~cwd & swd) will mask out exceptions that are not set to unmasked
31955 + * status. 0x3f is the exception bits in these regs, 0x200 is the
31956 + * C1 reg you need in case of a stack fault, 0x040 is the stack
31957 + * fault bit. We should only be taking one exception at a time,
31958 + * so if this combination doesn't produce any single exception,
31959 + * then we have a bad program that isn't synchronizing its FPU usage
31960 + * and it will suffer the consequences since we won't be able to
31961 + * fully reproduce the context of the exception
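+ * Worked example: an unmasked divide-by-zero sets ZE (bit 2) in swd while
+ * the corresponding cwd mask bit is clear, so (swd & ~cwd & 0x3f) == 0x004
+ * and the switch below selects FPE_FLTDIV.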
31963 + cwd = get_fpu_cwd(task);
31964 + swd = get_fpu_swd(task);
31965 + switch (swd & ~cwd & 0x3f) {
31969 + case 0x001: /* Invalid Op */
31971 + * swd & 0x240 == 0x040: Stack Underflow
31972 + * swd & 0x240 == 0x240: Stack Overflow
31973 + * User must clear the SF bit (0x40) if set
31975 + info.si_code = FPE_FLTINV;
31977 + case 0x002: /* Denormalize */
31978 + case 0x010: /* Underflow */
31979 + info.si_code = FPE_FLTUND;
31981 + case 0x004: /* Zero Divide */
31982 + info.si_code = FPE_FLTDIV;
31984 + case 0x008: /* Overflow */
31985 + info.si_code = FPE_FLTOVF;
31987 + case 0x020: /* Precision */
31988 + info.si_code = FPE_FLTRES;
31991 + force_sig_info(SIGFPE, &info, task);
31994 +asmlinkage void bad_intr(void)
31996 + printk("bad interrupt");
31999 +asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
32001 + void __user *rip = (void __user *)(regs->rip);
32002 + struct task_struct * task;
32004 + unsigned short mxcsr;
32006 + conditional_sti(regs);
32007 + if (!user_mode(regs) &&
32008 + kernel_math_error(regs, "kernel simd math error", 19))
32012 + * Save the info for the exception handler and clear the error.
32015 + save_init_fpu(task);
32016 + task->thread.trap_no = 19;
32017 + task->thread.error_code = 0;
32018 + info.si_signo = SIGFPE;
32019 + info.si_errno = 0;
32020 + info.si_code = __SI_FAULT;
32021 + info.si_addr = rip;
32023 + * The SIMD FPU exceptions are handled a little differently, as there
32024 + * is only a single status/control register. Thus, to determine which
32025 + * unmasked exception was caught we must mask the exception mask bits
32026 + * at 0x1f80, and then use these to mask the exception bits at 0x3f.
32028 + mxcsr = get_fpu_mxcsr(task);
32029 + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
32033 + case 0x001: /* Invalid Op */
32034 + info.si_code = FPE_FLTINV;
32036 + case 0x002: /* Denormalize */
32037 + case 0x010: /* Underflow */
32038 + info.si_code = FPE_FLTUND;
32040 + case 0x004: /* Zero Divide */
32041 + info.si_code = FPE_FLTDIV;
32043 + case 0x008: /* Overflow */
32044 + info.si_code = FPE_FLTOVF;
32046 + case 0x020: /* Precision */
32047 + info.si_code = FPE_FLTRES;
32050 + force_sig_info(SIGFPE, &info, task);
32053 +asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
32058 +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
32063 +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
32068 + * 'math_state_restore()' saves the current math information in the
32069 + * old math state array, and gets the new ones from the current task
32071 + * Careful.. There are problems with IBM-designed IRQ13 behaviour.
32072 + * Don't touch unless you *really* know how it works.
32074 +asmlinkage void math_state_restore(void)
32076 + struct task_struct *me = current;
32077 + /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
32079 + if (!used_math())
32081 + restore_fpu_checking(&me->thread.i387.fxsave);
32082 + task_thread_info(me)->status |= TS_USEDFPU;
32087 + * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
32088 + * specify <dpl>|4 in the second field.
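+ * For instance, the int3 entry below uses 3|4: DPL 3 so user mode may
+ * invoke it, plus the '4' flag that makes Xen mask event delivery on
+ * entry, the paravirtual analogue of an interrupt gate.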
32090 +static trap_info_t __cpuinitdata trap_table[] = {
32091 + { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error },
32092 + { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
32093 + { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
32094 + { 4, 3|4, __KERNEL_CS, (unsigned long)overflow },
32095 + { 5, 0|4, __KERNEL_CS, (unsigned long)bounds },
32096 + { 6, 0|4, __KERNEL_CS, (unsigned long)invalid_op },
32097 + { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available },
32098 + { 9, 0|4, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun},
32099 + { 10, 0|4, __KERNEL_CS, (unsigned long)invalid_TSS },
32100 + { 11, 0|4, __KERNEL_CS, (unsigned long)segment_not_present },
32101 + { 12, 0|4, __KERNEL_CS, (unsigned long)stack_segment },
32102 + { 13, 0|4, __KERNEL_CS, (unsigned long)general_protection },
32103 + { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault },
32104 + { 15, 0|4, __KERNEL_CS, (unsigned long)spurious_interrupt_bug },
32105 + { 16, 0|4, __KERNEL_CS, (unsigned long)coprocessor_error },
32106 + { 17, 0|4, __KERNEL_CS, (unsigned long)alignment_check },
32107 +#ifdef CONFIG_X86_MCE
32108 + { 18, 0|4, __KERNEL_CS, (unsigned long)machine_check },
32110 + { 19, 0|4, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
32111 +#ifdef CONFIG_IA32_EMULATION
32112 + { IA32_SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)ia32_syscall},
32117 +void __init trap_init(void)
32121 + ret = HYPERVISOR_set_trap_table(trap_table);
32123 + printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
32126 + * Should be a barrier for any external CPU state.
32131 +void __cpuinit smp_trap_init(trap_info_t *trap_ctxt)
32133 + const trap_info_t *t = trap_table;
32135 + for (t = trap_table; t->address; t++) {
32136 + trap_ctxt[t->vector].flags = t->flags;
32137 + trap_ctxt[t->vector].cs = t->cs;
32138 + trap_ctxt[t->vector].address = t->address;
32143 +/* Actual parsing is done early in setup.c. */
32144 +static int __init oops_dummy(char *s)
32146 + panic_on_oops = 1;
32149 +__setup("oops=", oops_dummy);
32151 +static int __init kstack_setup(char *s)
32153 + kstack_depth_to_print = simple_strtoul(s,NULL,0);
32156 +__setup("kstack=", kstack_setup);
32158 +#ifdef CONFIG_STACK_UNWIND
32159 +static int __init call_trace_setup(char *s)
32161 + if (strcmp(s, "old") == 0)
32163 + else if (strcmp(s, "both") == 0)
32165 + else if (strcmp(s, "newfallback") == 0)
32167 + else if (strcmp(s, "new") == 0)
32171 +__setup("call_trace=", call_trace_setup);
32173 Index: head-2008-11-25/arch/x86/kernel/vsyscall_64-xen.c
32174 ===================================================================
32175 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
32176 +++ head-2008-11-25/arch/x86/kernel/vsyscall_64-xen.c 2007-06-18 08:38:13.000000000 +0200
32179 + * linux/arch/x86_64/kernel/vsyscall.c
32181 + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
32182 + * Copyright 2003 Andi Kleen, SuSE Labs.
32184 + * Thanks to hpa@transmeta.com for some useful hints.
32185 + * Special thanks to Ingo Molnar for his early experience with
32186 + * a different vsyscall implementation for Linux/IA32 and for the name.
32188 + * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
32189 + * at virtual address -10Mbyte+1024bytes etc... There are at most 4
32190 + * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
32191 + * jumping out of line if necessary. We cannot add more with this
32192 + * mechanism because older kernels won't return -ENOSYS.
32193 + * If we want more than four we need a vDSO.
32195 + * Note: the concept clashes with user mode linux. If you use UML and
32196 + * want per guest time just set the kernel.vsyscall64 sysctl to 0.
32199 +#include <linux/time.h>
32200 +#include <linux/init.h>
32201 +#include <linux/kernel.h>
32202 +#include <linux/timer.h>
32203 +#include <linux/seqlock.h>
32204 +#include <linux/jiffies.h>
32205 +#include <linux/sysctl.h>
32207 +#include <asm/vsyscall.h>
32208 +#include <asm/pgtable.h>
32209 +#include <asm/page.h>
32210 +#include <asm/fixmap.h>
32211 +#include <asm/errno.h>
32212 +#include <asm/io.h>
32214 +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
32216 +int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
32217 +seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
32219 +#include <asm/unistd.h>
32221 +static __always_inline void timeval_normalize(struct timeval * tv)
32225 + __sec = tv->tv_usec / 1000000;
32227 + tv->tv_usec %= 1000000;
32228 + tv->tv_sec += __sec;
32232 +static __always_inline void do_vgettimeofday(struct timeval * tv)
32234 + long sequence, t;
32235 + unsigned long sec, usec;
32238 + sequence = read_seqbegin(&__xtime_lock);
32240 + sec = __xtime.tv_sec;
32241 + usec = (__xtime.tv_nsec / 1000) +
32242 + (__jiffies - __wall_jiffies) * (1000000 / HZ);
32244 + if (__vxtime.mode != VXTIME_HPET) {
32245 + t = get_cycles_sync();
32246 + if (t < __vxtime.last_tsc)
32247 + t = __vxtime.last_tsc;
32248 + usec += ((t - __vxtime.last_tsc) *
32249 + __vxtime.tsc_quot) >> 32;
32250 + /* See comment in x86_64 do_gettimeofday. */
32252 + usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
32253 + __vxtime.last) * __vxtime.quot) >> 32;
32255 + } while (read_seqretry(&__xtime_lock, sequence));
32257 + tv->tv_sec = sec + usec / 1000000;
32258 + tv->tv_usec = usec % 1000000;
32261 +/* RED-PEN may want to re-add seq locking, but then the variable should be write-once. */
32262 +static __always_inline void do_get_tz(struct timezone * tz)
32267 +static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
32270 + asm volatile("vsysc2: syscall"
32272 + : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
32276 +static __always_inline long time_syscall(long *t)
32279 + asm volatile("vsysc1: syscall"
32281 + : "0" (__NR_time),"D" (t) : __syscall_clobber);
32285 +int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
32287 + if (!__sysctl_vsyscall)
32288 + return gettimeofday(tv,tz);
32290 + do_vgettimeofday(tv);
32296 +/* This will break when the xtime seconds get inaccurate, but that is
32298 +time_t __vsyscall(1) vtime(time_t *t)
32300 + if (!__sysctl_vsyscall)
32301 + return time_syscall(t);
32303 + *t = __xtime.tv_sec;
32304 + return __xtime.tv_sec;
32307 +long __vsyscall(2) venosys_0(void)
32312 +long __vsyscall(3) venosys_1(void)
32317 +#ifdef CONFIG_SYSCTL
32319 +#define SYSCALL 0x050f
32320 +#define NOP2 0x9090
32323 + * NOP out syscall in vsyscall page when not needed.
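+ * (0x050f is the two-byte SYSCALL opcode 0F 05 read as a little-endian
+ * u16, and 0x9090 is a pair of one-byte NOPs.)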
32325 +static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
32326 + void __user *buffer, size_t *lenp, loff_t *ppos)
32328 + extern u16 vsysc1, vsysc2;
32329 + u16 *map1, *map2;
32330 + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
32333 + /* gcc has some trouble with __va(__pa()), so just do it this
32335 + map1 = ioremap(__pa_symbol(&vsysc1), 2);
32338 + map2 = ioremap(__pa_symbol(&vsysc2), 2);
32343 + if (!sysctl_vsyscall) {
32356 +static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
32357 + void __user *oldval, size_t __user *oldlenp,
32358 + void __user *newval, size_t newlen,
32364 +static ctl_table kernel_table2[] = {
32365 + { .ctl_name = 99, .procname = "vsyscall64",
32366 + .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
32367 + .strategy = vsyscall_sysctl_nostrat,
32368 + .proc_handler = vsyscall_sysctl_change },
32372 +static ctl_table kernel_root_table2[] = {
32373 + { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
32374 + .child = kernel_table2 },
32380 +static void __init map_vsyscall(void)
32382 + extern char __vsyscall_0;
32383 + unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
32385 + __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
32388 +static int __init vsyscall_init(void)
32390 + BUG_ON(((unsigned long) &vgettimeofday !=
32391 + VSYSCALL_ADDR(__NR_vgettimeofday)));
32392 + BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
32393 + BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
32396 + sysctl_vsyscall = 0; /* disable vgettimeofday() */
32398 +#ifdef CONFIG_SYSCTL
32399 + register_sysctl_table(kernel_root_table2, 0);
32404 +__initcall(vsyscall_init);
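
The do_vgettimeofday() reader above is an instance of the seqlock retry
pattern. A minimal user-space rendition (a sketch only: C11 atomics stand
in for read_seqbegin()/read_seqretry(), and the unsynchronized reads of
the payload fields are tolerated precisely because the sequence check
discards torn values):

	#include <stdatomic.h>

	static _Atomic unsigned seq;	/* even = stable, odd = writer active */
	static unsigned long t_sec, t_usec;

	static void read_time(unsigned long *sec, unsigned long *usec)
	{
		unsigned s;

		do {
			while ((s = atomic_load(&seq)) & 1)
				;		/* writer in progress */
			*sec = t_sec;		/* speculative reads */
			*usec = t_usec;
		} while (atomic_load(&seq) != s);	/* retry if seq moved */
	}

	static void write_time(unsigned long sec, unsigned long usec)
	{
		atomic_fetch_add(&seq, 1);	/* odd: readers will retry */
		t_sec = sec;
		t_usec = usec;
		atomic_fetch_add(&seq, 1);	/* even: reads may settle */
	}
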
32405 Index: head-2008-11-25/arch/x86/kernel/xen_entry_64.S
32406 ===================================================================
32407 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
32408 +++ head-2008-11-25/arch/x86/kernel/xen_entry_64.S 2008-04-02 12:34:02.000000000 +0200
32411 + * Copied from arch/xen/i386/kernel/entry.S
32413 +/* Offsets into shared_info_t. */
32414 +#define evtchn_upcall_pending /* 0 */
32415 +#define evtchn_upcall_mask 1
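+/* The empty definition above makes evtchn_upcall_pending(reg) expand to
+ * just (reg), i.e. byte offset 0 -- matching the commented value. */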
32417 +#define sizeof_vcpu_shift 6
32420 +//#define preempt_disable(reg) incl threadinfo_preempt_count(reg)
32421 +//#define preempt_enable(reg) decl threadinfo_preempt_count(reg)
32422 +#define preempt_disable(reg)
32423 +#define preempt_enable(reg)
32424 +#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \
32425 + movq %gs:pda_cpunumber,reg ; \
32427 + shr $32-sizeof_vcpu_shift,reg ; \
32428 + addq HYPERVISOR_shared_info,reg
32429 +#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \
32430 +#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
32432 +#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
32433 +#define XEN_PUT_VCPU_INFO(reg)
32434 +#define XEN_PUT_VCPU_INFO_fixup
32437 +#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg)
32438 +#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
32439 +#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
32440 + XEN_LOCKED_BLOCK_EVENTS(reg) ; \
32441 + XEN_PUT_VCPU_INFO(reg)
32442 +#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
32443 + XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \
32444 + XEN_PUT_VCPU_INFO(reg)
32445 +#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg)
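
The XEN_GET_VCPU_INFO sequence above is scaled array indexing: each
vcpu_info slot lies 1 << sizeof_vcpu_shift = 64 bytes from the
shared_info base. A C equivalent (structure layout abbreviated and
hypothetical):

	#include <stddef.h>

	#define SIZEOF_VCPU_SHIFT 6	/* 64-byte vcpu_info slots */

	struct vcpu_info { unsigned char pad[1 << SIZEOF_VCPU_SHIFT]; };
	struct shared_info { struct vcpu_info vcpu[32]; };

	static struct vcpu_info *get_vcpu_info(struct shared_info *si,
					       unsigned cpu)
	{
		/* base + (cpu << sizeof_vcpu_shift), as the asm computes */
		return (struct vcpu_info *)
			((char *)si + ((size_t)cpu << SIZEOF_VCPU_SHIFT));
	}
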
32446 Index: head-2008-11-25/arch/x86/mm/fault_64-xen.c
32447 ===================================================================
32448 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
32449 +++ head-2008-11-25/arch/x86/mm/fault_64-xen.c 2007-11-02 17:34:23.000000000 +0100
32452 + * linux/arch/x86-64/mm/fault.c
32454 + * Copyright (C) 1995 Linus Torvalds
32455 + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
32458 +#include <linux/signal.h>
32459 +#include <linux/sched.h>
32460 +#include <linux/kernel.h>
32461 +#include <linux/errno.h>
32462 +#include <linux/string.h>
32463 +#include <linux/types.h>
32464 +#include <linux/ptrace.h>
32465 +#include <linux/mman.h>
32466 +#include <linux/mm.h>
32467 +#include <linux/smp.h>
32468 +#include <linux/smp_lock.h>
32469 +#include <linux/interrupt.h>
32470 +#include <linux/init.h>
32471 +#include <linux/tty.h>
32472 +#include <linux/vt_kern.h> /* For unblank_screen() */
32473 +#include <linux/compiler.h>
32474 +#include <linux/module.h>
32475 +#include <linux/kprobes.h>
32477 +#include <asm/system.h>
32478 +#include <asm/uaccess.h>
32479 +#include <asm/pgalloc.h>
32480 +#include <asm/smp.h>
32481 +#include <asm/tlbflush.h>
32482 +#include <asm/proto.h>
32483 +#include <asm/kdebug.h>
32484 +#include <asm-generic/sections.h>
32486 +/* Page fault error code bits */
32487 +#define PF_PROT (1<<0) /* or no page found */
32488 +#define PF_WRITE (1<<1)
32489 +#define PF_USER (1<<2)
32490 +#define PF_RSVD (1<<3)
32491 +#define PF_INSTR (1<<4)
32493 +#ifdef CONFIG_KPROBES
32494 +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
32496 +/* Hook to register for page fault notifications */
32497 +int register_page_fault_notifier(struct notifier_block *nb)
32499 + vmalloc_sync_all();
32500 + return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
32503 +int unregister_page_fault_notifier(struct notifier_block *nb)
32505 + return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
32508 +static inline int notify_page_fault(enum die_val val, const char *str,
32509 + struct pt_regs *regs, long err, int trap, int sig)
32511 + struct die_args args = {
32518 + return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
32521 +static inline int notify_page_fault(enum die_val val, const char *str,
32522 + struct pt_regs *regs, long err, int trap, int sig)
32524 + return NOTIFY_DONE;
32528 +void bust_spinlocks(int yes)
32530 + int loglevel_save = console_loglevel;
32532 + oops_in_progress = 1;
32535 + unblank_screen();
32537 + oops_in_progress = 0;
32539 + * OK, the message is on the console. Now we call printk()
32540 + * without oops_in_progress set so that printk will give klogd
32541 + * a poke. Hold onto your hats...
32543 + console_loglevel = 15; /* NMI oopser may have shut the console up */
32545 + console_loglevel = loglevel_save;
32549 +/* Sometimes the CPU reports invalid exceptions on prefetch.
32550 + Check for that here and ignore it.
32551 + Opcode checker based on code by Richard Brunner */
32552 +static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
32553 + unsigned long error_code)
32555 + unsigned char *instr;
32556 + int scan_more = 1;
32557 + int prefetch = 0;
32558 + unsigned char *max_instr;
32560 + /* If it was an exec fault, ignore it */
32561 + if (error_code & PF_INSTR)
32564 + instr = (unsigned char *)convert_rip_to_linear(current, regs);
32565 + max_instr = instr + 15;
32567 + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
32570 + while (scan_more && instr < max_instr) {
32571 + unsigned char opcode;
32572 + unsigned char instr_hi;
32573 + unsigned char instr_lo;
32575 + if (__get_user(opcode, instr))
32578 + instr_hi = opcode & 0xf0;
32579 + instr_lo = opcode & 0x0f;
32582 + switch (instr_hi) {
32585 + /* Values 0x26,0x2E,0x36,0x3E are valid x86
32586 + prefixes. In long mode, the CPU will signal
32587 + invalid opcode if some of these prefixes are
32588 + present so we will never get here anyway */
32589 + scan_more = ((instr_lo & 7) == 0x6);
32593 + /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes.
32594 + Need to figure out under what instruction mode the
32595 + instruction was issued ... */
32596 + /* Could check the LDT for lm, but for now it's good
32597 + enough to assume that long mode only uses well known
32598 + segments or kernel. */
32599 + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
32603 + /* 0x64 thru 0x67 are valid prefixes in all modes. */
32604 + scan_more = (instr_lo & 0xC) == 0x4;
32607 + /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
32608 + scan_more = !instr_lo || (instr_lo>>1) == 1;
32611 + /* Prefetch instruction is 0x0F0D or 0x0F18 */
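+ /* (0x0F 0x0D = AMD 3DNow! PREFETCH/PREFETCHW,
+ * 0x0F 0x18 = SSE PREFETCHNTA/T0/T1/T2) */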
32613 + if (__get_user(opcode, instr))
32615 + prefetch = (instr_lo == 0xF) &&
32616 + (opcode == 0x0D || opcode == 0x18);
32626 +static int bad_address(void *p)
32628 + unsigned long dummy;
32629 + return __get_user(dummy, (unsigned long *)p);
32632 +void dump_pagetable(unsigned long address)
32639 + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
32640 + pgd += pgd_index(address);
32641 + if (bad_address(pgd)) goto bad;
32642 + printk("PGD %lx ", pgd_val(*pgd));
32643 + if (!pgd_present(*pgd)) goto ret;
32645 + pud = pud_offset(pgd, address);
32646 + if (bad_address(pud)) goto bad;
32647 + printk("PUD %lx ", pud_val(*pud));
32648 + if (!pud_present(*pud)) goto ret;
32650 + pmd = pmd_offset(pud, address);
32651 + if (bad_address(pmd)) goto bad;
32652 + printk("PMD %lx ", pmd_val(*pmd));
32653 + if (!pmd_present(*pmd)) goto ret;
32655 + pte = pte_offset_kernel(pmd, address);
32656 + if (bad_address(pte)) goto bad;
32657 + printk("PTE %lx", pte_val(*pte));
32665 +static const char errata93_warning[] =
32666 +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
32667 +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
32668 +KERN_ERR "******* Please consider a BIOS update.\n"
32669 +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
32671 +/* Workaround for K8 erratum #93 & buggy BIOS.
32672 + BIOS SMM functions are required to use a specific workaround
32673 + to avoid corruption of the 64bit RIP register on C stepping K8.
32674 + A lot of BIOSes that didn't get tested properly miss this.
32675 + The OS sees this as a page fault with the upper 32bits of RIP cleared.
32676 + Try to work around it here.
32677 + Note we only handle faults in the kernel here. */
32679 +static int is_errata93(struct pt_regs *regs, unsigned long address)
32681 + static int warned;
32682 + if (address != regs->rip)
32684 + if ((address >> 32) != 0)
32686 + address |= 0xffffffffUL << 32;
32687 + if ((address >= (u64)_stext && address <= (u64)_etext) ||
32688 + (address >= MODULES_VADDR && address <= MODULES_END)) {
32690 + printk(errata93_warning);
32693 + regs->rip = address;
32699 +int unhandled_signal(struct task_struct *tsk, int sig)
32701 + if (tsk->pid == 1)
32703 + if (tsk->ptrace & PT_PTRACED)
32705 + return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
32706 + (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
32709 +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
32710 + unsigned long error_code)
32712 + unsigned long flags = oops_begin();
32713 + struct task_struct *tsk;
32715 + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
32716 + current->comm, address);
32717 + dump_pagetable(address);
32719 + tsk->thread.cr2 = address;
32720 + tsk->thread.trap_no = 14;
32721 + tsk->thread.error_code = error_code;
32722 + __die("Bad pagetable", regs, error_code);
32724 + do_exit(SIGKILL);
32727 +/*
32728 + * Handle a fault on the vmalloc area
32729 + *
32730 + * This assumes no large pages in there.
32731 + */
32732 +static int vmalloc_fault(unsigned long address)
32734 + pgd_t *pgd, *pgd_ref;
32735 + pud_t *pud, *pud_ref;
32736 + pmd_t *pmd, *pmd_ref;
32737 + pte_t *pte, *pte_ref;
32739 + /* Copy kernel mappings over when needed. This can also
32740 + happen within a race in page table update. In the latter
32741 + case just flush. */
32743 + /* On Xen the line below does not always work. Needs investigating! */
32744 + /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
32745 + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
32746 + pgd += pgd_index(address);
32747 + pgd_ref = pgd_offset_k(address);
32748 + if (pgd_none(*pgd_ref))
32750 + if (pgd_none(*pgd))
32751 + set_pgd(pgd, *pgd_ref);
32753 + BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
32755 + /* Below here mismatches are bugs because these lower tables
32756 + are shared */
32758 + pud = pud_offset(pgd, address);
32759 + pud_ref = pud_offset(pgd_ref, address);
32760 + if (pud_none(*pud_ref))
32762 + if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
32764 + pmd = pmd_offset(pud, address);
32765 + pmd_ref = pmd_offset(pud_ref, address);
32766 + if (pmd_none(*pmd_ref))
32768 + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
32770 + pte_ref = pte_offset_kernel(pmd_ref, address);
32771 + if (!pte_present(*pte_ref))
32773 + pte = pte_offset_kernel(pmd, address);
32774 + /* Don't use pte_page here, because the mappings can point
32775 + outside mem_map, and the NUMA hash lookup cannot handle
32776 + that. */
32777 + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
32782 +int page_fault_trace = 0;
32783 +int exception_trace = 1;
32786 +#define MEM_VERBOSE 1
32788 +#ifdef MEM_VERBOSE
32789 +#define MEM_LOG(_f, _a...) \
32790 + printk("fault.c:[%d]-> " _f "\n", \
32791 + __LINE__ , ## _a )
32793 +#define MEM_LOG(_f, _a...) ((void)0)
32796 +static int spurious_fault(struct pt_regs *regs,
32797 + unsigned long address,
32798 + unsigned long error_code)
32806 + /* Faults in hypervisor area are never spurious. */
32807 + if ((address >= HYPERVISOR_VIRT_START) &&
32808 + (address < HYPERVISOR_VIRT_END))
32812 + /* Reserved-bit violation or user access to kernel space? */
32813 + if (error_code & (PF_RSVD|PF_USER))
32816 + pgd = init_mm.pgd + pgd_index(address);
32817 + if (!pgd_present(*pgd))
32820 + pud = pud_offset(pgd, address);
32821 + if (!pud_present(*pud))
32824 + pmd = pmd_offset(pud, address);
32825 + if (!pmd_present(*pmd))
32828 + pte = pte_offset_kernel(pmd, address);
32829 + if (!pte_present(*pte))
32831 + if ((error_code & PF_WRITE) && !pte_write(*pte))
32833 + if ((error_code & PF_INSTR) && (__pte_val(*pte) & _PAGE_NX))
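+ /*
+ * The PTE grants everything the faulting access asked for, so the
+ * fault hit a stale TLB entry (e.g. after a R/O -> R/W transition)
+ * and it is safe to simply return and retry the access.
+ */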
32839 +/*
32840 + * This routine handles page faults. It determines the address,
32841 + * and the problem, and then passes it off to one of the appropriate
32842 + * routines.
32843 + */
32844 +asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
32845 + unsigned long error_code)
32847 + struct task_struct *tsk;
32848 + struct mm_struct *mm;
32849 + struct vm_area_struct * vma;
32850 + unsigned long address;
32851 + const struct exception_table_entry *fixup;
32853 + unsigned long flags;
32856 + if (!user_mode(regs))
32857 + error_code &= ~PF_USER; /* means kernel */
32861 + prefetchw(&mm->mmap_sem);
32863 + /* get the address */
32864 + address = current_vcpu_info()->arch.cr2;
32866 + info.si_code = SEGV_MAPERR;
32869 + /*
32870 + * We fault-in kernel-space virtual memory on-demand. The
32871 + * 'reference' page table is init_mm.pgd.
32872 + *
32873 + * NOTE! We MUST NOT take any locks for this case. We may
32874 + * be in an interrupt or a critical region, and should
32875 + * only copy the information from the master page table,
32876 + * nothing more.
32877 + *
32878 + * This verifies that the fault happens in kernel space
32879 + * (error_code & 4) == 0, and that the fault was not a
32880 + * protection error (error_code & 9) == 0.
32881 + */
32882 + if (unlikely(address >= TASK_SIZE64)) {
32884 + * Don't check for the module range here: its PML4
32885 + * is always initialized because it's shared with the main
32886 + * kernel text. Only vmalloc may need PML4 syncups.
32888 + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
32889 + ((address >= VMALLOC_START && address < VMALLOC_END))) {
32890 + if (vmalloc_fault(address) >= 0)
32893 + /* Can take a spurious fault if mapping changes R/O -> R/W. */
32894 + if (spurious_fault(regs, address, error_code))
32896 + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
32897 + SIGSEGV) == NOTIFY_STOP)
32900 + * Don't take the mm semaphore here. If we fixup a prefetch
32901 + * fault we could otherwise deadlock.
32903 + goto bad_area_nosemaphore;
32906 + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
32907 + SIGSEGV) == NOTIFY_STOP)
32910 + if (likely(regs->eflags & X86_EFLAGS_IF))
32911 + local_irq_enable();
32913 + if (unlikely(page_fault_trace))
32914 + printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
32915 + regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
32917 + if (unlikely(error_code & PF_RSVD))
32918 + pgtable_bad(address, regs, error_code);
32921 + * If we're in an interrupt or have no user
32922 + * context, we must not take the fault..
32924 + if (unlikely(in_atomic() || !mm))
32925 + goto bad_area_nosemaphore;
32928 + /* When running in the kernel we expect faults to occur only to
32929 + * addresses in user space. All other faults represent errors in the
32930 + * kernel and should generate an OOPS. Unfortunately, in the case of an
32931 + * erroneous fault occurring in a code path which already holds mmap_sem
32932 + * we will deadlock attempting to validate the fault against the
32933 + * address space. Luckily the kernel only validly references user
32934 + * space from well defined areas of code, which are listed in the
32935 + * exceptions table.
32936 + *
32937 + * As the vast majority of faults will be valid we will only perform
32938 + * the source reference check when there is a possibility of a deadlock.
32939 + * Attempt to lock the address space, if we cannot we then validate the
32940 + * source. If this is invalid we can skip the address space check,
32941 + * thus avoiding the deadlock.
32942 + */
32943 + if (!down_read_trylock(&mm->mmap_sem)) {
32944 + if ((error_code & PF_USER) == 0 &&
32945 + !search_exception_tables(regs->rip))
32946 + goto bad_area_nosemaphore;
32947 + down_read(&mm->mmap_sem);
32950 + vma = find_vma(mm, address);
32953 + if (likely(vma->vm_start <= address))
32955 + if (!(vma->vm_flags & VM_GROWSDOWN))
32957 + if (error_code & 4) {
32958 + /* Allow userspace just enough access below the stack pointer
32959 + * to let the 'enter' instruction work.
32961 + if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
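+ /*
+ * ('enter $65535,$31' pushes 32 pointers and then decrements
+ * %rsp by 65535, hence the 65536 + 32*sizeof(unsigned long)
+ * cushion.)
+ */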
32964 + if (expand_stack(vma, address))
32967 + * Ok, we have a good vm_area for this memory access, so
32968 + * we can handle it..
32971 + info.si_code = SEGV_ACCERR;
32973 + switch (error_code & (PF_PROT|PF_WRITE)) {
32974 + default: /* 3: write, present */
32975 + /* fall through */
32976 + case PF_WRITE: /* write, not present */
32977 + if (!(vma->vm_flags & VM_WRITE))
32981 + case PF_PROT: /* read, present */
32983 + case 0: /* read, not present */
32984 + if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
32989 + * If for any reason at all we couldn't handle the fault,
32990 + * make sure we exit gracefully rather than endlessly redo
32993 + switch (handle_mm_fault(mm, vma, address, write)) {
32994 + case VM_FAULT_MINOR:
32997 + case VM_FAULT_MAJOR:
33000 + case VM_FAULT_SIGBUS:
33003 + goto out_of_memory;
33006 + up_read(&mm->mmap_sem);
33010 + * Something tried to access memory that isn't in our memory map..
33011 + * Fix it, but check if it's kernel or user first..
33014 + up_read(&mm->mmap_sem);
33016 +bad_area_nosemaphore:
33017 + /* User mode accesses just cause a SIGSEGV */
33018 + if (error_code & PF_USER) {
33019 + if (is_prefetch(regs, address, error_code))
33022 + /* Work around K8 erratum #100: K8 in compat mode
33023 + occasionally jumps to illegal addresses >4GB. We
33024 + catch this here in the page fault handler because
33025 + these addresses are not reachable. Just detect this
33026 + case and return. Any code segment in LDT is
33027 + compatibility mode. */
33028 + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
33032 + if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
33034 + "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
33035 + tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
33036 + tsk->comm, tsk->pid, address, regs->rip,
33037 + regs->rsp, error_code);
33040 + tsk->thread.cr2 = address;
33041 + /* Kernel addresses are always protection faults */
33042 + tsk->thread.error_code = error_code | (address >= TASK_SIZE);
33043 + tsk->thread.trap_no = 14;
33044 + info.si_signo = SIGSEGV;
33045 + info.si_errno = 0;
33046 + /* info.si_code has been set above */
33047 + info.si_addr = (void __user *)address;
33048 + force_sig_info(SIGSEGV, &info, tsk);
33054 + /* Are we prepared to handle this kernel fault? */
33055 + fixup = search_exception_tables(regs->rip);
33057 + regs->rip = fixup->fixup;
33062 + * Hall of shame of CPU/BIOS bugs.
33065 + if (is_prefetch(regs, address, error_code))
33068 + if (is_errata93(regs, address))
33072 + * Oops. The kernel tried to access some bad page. We'll have to
33073 + * terminate things with extreme prejudice.
33076 + flags = oops_begin();
33078 + if (address < PAGE_SIZE)
33079 + printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
33081 + printk(KERN_ALERT "Unable to handle kernel paging request");
33082 + printk(" at %016lx RIP: \n" KERN_ALERT,address);
33083 + printk_address(regs->rip);
33084 + dump_pagetable(address);
33085 + tsk->thread.cr2 = address;
33086 + tsk->thread.trap_no = 14;
33087 + tsk->thread.error_code = error_code;
33088 + __die("Oops", regs, error_code);
33089 + /* Executive summary in case the body of the oops scrolled away */
33090 + printk(KERN_EMERG "CR2: %016lx\n", address);
33092 + do_exit(SIGKILL);
33095 + * We ran out of memory, or some other thing happened to us that made
33096 + * us unable to handle the page fault gracefully.
33099 + up_read(&mm->mmap_sem);
33100 + if (current->pid == 1) {
33104 + printk("VM: killing process %s\n", tsk->comm);
33105 + if (error_code & 4)
33106 + do_exit(SIGKILL);
33110 + up_read(&mm->mmap_sem);
33112 + /* Kernel mode? Handle exceptions or die */
33113 + if (!(error_code & PF_USER))
33116 + tsk->thread.cr2 = address;
33117 + tsk->thread.error_code = error_code;
33118 + tsk->thread.trap_no = 14;
33119 + info.si_signo = SIGBUS;
33120 + info.si_errno = 0;
33121 + info.si_code = BUS_ADRERR;
33122 + info.si_addr = (void __user *)address;
33123 + force_sig_info(SIGBUS, &info, tsk);
33127 +DEFINE_SPINLOCK(pgd_lock);
33128 +struct page *pgd_list;
33130 +void vmalloc_sync_all(void)
33132 + /* Note that races in the updates of insync and start aren't
33134 + insync can only get set bits added, and updates to start are only
33135 + improving performance (without affecting correctness if undone). */
33136 + static DECLARE_BITMAP(insync, PTRS_PER_PGD);
33137 + static unsigned long start = VMALLOC_START & PGDIR_MASK;
33138 + unsigned long address;
33140 + for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
33141 + if (!test_bit(pgd_index(address), insync)) {
33142 + const pgd_t *pgd_ref = pgd_offset_k(address);
33143 + struct page *page;
33145 + if (pgd_none(*pgd_ref))
33147 + spin_lock(&pgd_lock);
33148 + for (page = pgd_list; page;
33149 + page = (struct page *)page->index) {
33151 + pgd = (pgd_t *)page_address(page) + pgd_index(address);
33152 + if (pgd_none(*pgd))
33153 + set_pgd(pgd, *pgd_ref);
33155 + BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
33157 + spin_unlock(&pgd_lock);
33158 + set_bit(pgd_index(address), insync);
33160 + if (address == start)
33161 + start = address + PGDIR_SIZE;
33163 + /* Check that there is no need to do the same for the modules area. */
33164 + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
33165 + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
33166 + (__START_KERNEL & PGDIR_MASK)));
33169 +static int __init enable_pagefaulttrace(char *str)
33171 + page_fault_trace = 1;
33174 +__setup("pagefaulttrace", enable_pagefaulttrace);
33175 Index: head-2008-11-25/arch/x86/mm/init_64-xen.c
33176 ===================================================================
33177 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
33178 +++ head-2008-11-25/arch/x86/mm/init_64-xen.c 2008-10-29 09:55:56.000000000 +0100
33181 + * linux/arch/x86_64/mm/init.c
33183 + * Copyright (C) 1995 Linus Torvalds
33184 + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
33185 + * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
33187 + * Jun Nakajima <jun.nakajima@intel.com>
33188 + * Modified for Xen.
33191 +#include <linux/signal.h>
33192 +#include <linux/sched.h>
33193 +#include <linux/kernel.h>
33194 +#include <linux/errno.h>
33195 +#include <linux/string.h>
33196 +#include <linux/types.h>
33197 +#include <linux/ptrace.h>
33198 +#include <linux/mman.h>
33199 +#include <linux/mm.h>
33200 +#include <linux/swap.h>
33201 +#include <linux/smp.h>
33202 +#include <linux/init.h>
33203 +#include <linux/pagemap.h>
33204 +#include <linux/bootmem.h>
33205 +#include <linux/proc_fs.h>
33206 +#include <linux/pci.h>
33207 +#include <linux/poison.h>
33208 +#include <linux/dma-mapping.h>
33209 +#include <linux/module.h>
33210 +#include <linux/memory_hotplug.h>
33212 +#include <asm/processor.h>
33213 +#include <asm/system.h>
33214 +#include <asm/uaccess.h>
33215 +#include <asm/pgtable.h>
33216 +#include <asm/pgalloc.h>
33217 +#include <asm/dma.h>
33218 +#include <asm/fixmap.h>
33219 +#include <asm/e820.h>
33220 +#include <asm/apic.h>
33221 +#include <asm/tlb.h>
33222 +#include <asm/mmu_context.h>
33223 +#include <asm/proto.h>
33224 +#include <asm/smp.h>
33225 +#include <asm/sections.h>
33227 +#include <xen/features.h>
33230 +#define Dprintk(x...)
33233 +struct dma_mapping_ops* dma_ops;
33234 +EXPORT_SYMBOL(dma_ops);
33236 +#if CONFIG_XEN_COMPAT <= 0x030002
33237 +unsigned int __kernel_page_user;
33238 +EXPORT_SYMBOL(__kernel_page_user);
33241 +int after_bootmem;
33243 +static unsigned long dma_reserve __initdata;
33245 +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
33246 +extern unsigned long start_pfn;
33249 + * Use this until direct mapping is established, i.e. before __va() is
33250 + * available in init_memory_mapping().
33253 +#define addr_to_page(addr, page) \
33254 + (addr) &= PHYSICAL_PAGE_MASK; \
33255 + (page) = ((unsigned long *) ((unsigned long) \
33256 + (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
33257 + __START_KERNEL_map)))
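+/*
+ * I.e. take the machine address found in a page-table entry, convert
+ * the MFN back to a PFN, and form the __START_KERNEL_map alias of that
+ * page -- usable before the PAGE_OFFSET direct mapping exists.
+ */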
33259 +static void __meminit early_make_page_readonly(void *va, unsigned int feature)
33261 + unsigned long addr, _va = (unsigned long)va;
33262 + pte_t pte, *ptep;
33263 + unsigned long *page = (unsigned long *) init_level4_pgt;
33265 + BUG_ON(after_bootmem);
33267 + if (xen_feature(feature))
33270 + addr = (unsigned long) page[pgd_index(_va)];
33271 + addr_to_page(addr, page);
33273 + addr = page[pud_index(_va)];
33274 + addr_to_page(addr, page);
33276 + addr = page[pmd_index(_va)];
33277 + addr_to_page(addr, page);
33279 + ptep = (pte_t *) &page[pte_index(_va)];
33281 + pte.pte = ptep->pte & ~_PAGE_RW;
33282 + if (HYPERVISOR_update_va_mapping(_va, pte, 0))
33286 +static void __make_page_readonly(void *va)
33288 + pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
33289 + unsigned long addr = (unsigned long) va;
33291 + pgd = pgd_offset_k(addr);
33292 + pud = pud_offset(pgd, addr);
33293 + pmd = pmd_offset(pud, addr);
33294 + ptep = pte_offset_kernel(pmd, addr);
33296 + pte.pte = ptep->pte & ~_PAGE_RW;
33297 + if (HYPERVISOR_update_va_mapping(addr, pte, 0))
33298 + xen_l1_entry_update(ptep, pte); /* fallback */
33300 + if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
33301 + __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT));
33304 +static void __make_page_writable(void *va)
33306 + pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
33307 + unsigned long addr = (unsigned long) va;
33309 + pgd = pgd_offset_k(addr);
33310 + pud = pud_offset(pgd, addr);
33311 + pmd = pmd_offset(pud, addr);
33312 + ptep = pte_offset_kernel(pmd, addr);
33314 + pte.pte = ptep->pte | _PAGE_RW;
33315 + if (HYPERVISOR_update_va_mapping(addr, pte, 0))
33316 + xen_l1_entry_update(ptep, pte); /* fallback */
33318 + if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
33319 + __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT));
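+ /*
+ * For an address in the vmalloc area the page was just changed via
+ * its vmalloc mapping; the recursion also adjusts the direct-map
+ * alias of that page, since Xen validates every mapping of a
+ * page-table page, not only the one used to modify it.
+ */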
33322 +void make_page_readonly(void *va, unsigned int feature)
33324 + if (!xen_feature(feature))
33325 + __make_page_readonly(va);
33328 +void make_page_writable(void *va, unsigned int feature)
33330 + if (!xen_feature(feature))
33331 + __make_page_writable(va);
33334 +void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
33336 + if (xen_feature(feature))
33339 + while (nr-- != 0) {
33340 + __make_page_readonly(va);
33341 + va = (void*)((unsigned long)va + PAGE_SIZE);
33345 +void make_pages_writable(void *va, unsigned nr, unsigned int feature)
33347 + if (xen_feature(feature))
33350 + while (nr-- != 0) {
33351 + __make_page_writable(va);
33352 + va = (void*)((unsigned long)va + PAGE_SIZE);
33356 +/*
33357 + * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
33358 + * physical space so we can cache the place of the first one and move
33359 + * around without checking the pgd every time.
33360 + */
33362 +void show_mem(void)
33364 + long i, total = 0, reserved = 0;
33365 + long shared = 0, cached = 0;
33366 + pg_data_t *pgdat;
33367 + struct page *page;
33369 + printk(KERN_INFO "Mem-info:\n");
33370 + show_free_areas();
33371 + printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
33373 + for_each_online_pgdat(pgdat) {
33374 + for (i = 0; i < pgdat->node_spanned_pages; ++i) {
33375 + page = pfn_to_page(pgdat->node_start_pfn + i);
33377 + if (PageReserved(page))
33379 + else if (PageSwapCache(page))
33381 + else if (page_count(page))
33382 + shared += page_count(page) - 1;
33385 + printk(KERN_INFO "%lu pages of RAM\n", total);
33386 + printk(KERN_INFO "%lu reserved pages\n",reserved);
33387 + printk(KERN_INFO "%lu pages shared\n",shared);
33388 + printk(KERN_INFO "%lu pages swap cached\n",cached);
33392 +static __init void *spp_getpage(void)
33395 + if (after_bootmem)
33396 + ptr = (void *) get_zeroed_page(GFP_ATOMIC);
33397 + else if (start_pfn < table_end) {
33398 + ptr = __va(start_pfn << PAGE_SHIFT);
33400 + memset(ptr, 0, PAGE_SIZE);
33402 + ptr = alloc_bootmem_pages(PAGE_SIZE);
33403 + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
33404 + panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
33406 + Dprintk("spp_getpage %p\n", ptr);
33410 +#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
33411 +#define pud_offset_u(address) (level3_user_pgt + pud_index(address))
33413 +static __init void set_pte_phys(unsigned long vaddr,
33414 + unsigned long phys, pgprot_t prot, int user_mode)
33419 + pte_t *pte, new_pte;
33421 + Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
33423 + pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
33424 + if (pgd_none(*pgd)) {
33425 + printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
33428 + pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
33429 + if (pud_none(*pud)) {
33430 + pmd = (pmd_t *) spp_getpage();
33431 + make_page_readonly(pmd, XENFEAT_writable_page_tables);
33432 + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
33433 + if (pmd != pmd_offset(pud, 0)) {
33434 + printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
33438 + pmd = pmd_offset(pud, vaddr);
33439 + if (pmd_none(*pmd)) {
33440 + pte = (pte_t *) spp_getpage();
33441 + make_page_readonly(pte, XENFEAT_writable_page_tables);
33442 + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
33443 + if (pte != pte_offset_kernel(pmd, 0)) {
33444 + printk("PAGETABLE BUG #02!\n");
33448 + if (pgprot_val(prot))
33449 + new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
33451 + new_pte = __pte(0);
33453 + pte = pte_offset_kernel(pmd, vaddr);
33454 + if (!pte_none(*pte) && __pte_val(new_pte) &&
33455 + __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
33457 + set_pte(pte, new_pte);
33460 + * It's enough to flush this one mapping.
33461 + * (PGE mappings get flushed as well)
33463 + __flush_tlb_one(vaddr);
33466 +static __init void set_pte_phys_ma(unsigned long vaddr,
33467 + unsigned long phys, pgprot_t prot)
33472 + pte_t *pte, new_pte;
33474 + Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
33476 + pgd = pgd_offset_k(vaddr);
33477 + if (pgd_none(*pgd)) {
33478 + printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
33481 + pud = pud_offset(pgd, vaddr);
33482 + if (pud_none(*pud)) {
33484 + pmd = (pmd_t *) spp_getpage();
33485 + make_page_readonly(pmd, XENFEAT_writable_page_tables);
33486 + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
33487 + if (pmd != pmd_offset(pud, 0)) {
33488 + printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
33492 + pmd = pmd_offset(pud, vaddr);
33493 + if (pmd_none(*pmd)) {
33494 + pte = (pte_t *) spp_getpage();
33495 + make_page_readonly(pte, XENFEAT_writable_page_tables);
33496 + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
33497 + if (pte != pte_offset_kernel(pmd, 0)) {
33498 + printk("PAGETABLE BUG #02!\n");
33502 + new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
33504 + pte = pte_offset_kernel(pmd, vaddr);
33505 + if (!pte_none(*pte) && __pte_val(new_pte) &&
33506 +#ifdef CONFIG_ACPI
33507 + /* __acpi_map_table() fails to properly call clear_fixmap() */
33508 + (vaddr < __fix_to_virt(FIX_ACPI_END) ||
33509 + vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
33511 + __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
33513 + set_pte(pte, new_pte);
33516 + * It's enough to flush this one mapping.
33517 + * (PGE mappings get flushed as well)
33519 + __flush_tlb_one(vaddr);
33522 +/* NOTE: this is meant to be run only at boot */
33524 +__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
33526 + unsigned long address = __fix_to_virt(idx);
33528 + if (idx >= __end_of_fixed_addresses) {
33529 + printk("Invalid __set_fixmap\n");
33533 + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
33534 + set_pte_phys(address, phys, prot, 0);
33535 + set_pte_phys(address, phys, prot, 1);
33538 + set_pte_phys_ma(address, phys, prot);
33543 +unsigned long __initdata table_start, table_end;
33545 +static __meminit void *alloc_static_page(unsigned long *phys)
33547 + unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
33549 + if (after_bootmem) {
33550 + void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
33552 + *phys = __pa(adr);
33556 + *phys = start_pfn << PAGE_SHIFT;
33558 + memset((void *)va, 0, PAGE_SIZE);
33559 + return (void *)va;
33562 +#define PTE_SIZE PAGE_SIZE
33564 +static inline int make_readonly(unsigned long paddr)
33566 + extern char __vsyscall_0;
33567 + int readonly = 0;
33569 + /* Make new page tables read-only. */
33570 + if (!xen_feature(XENFEAT_writable_page_tables)
33571 + && (paddr >= (table_start << PAGE_SHIFT))
33572 + && (paddr < (table_end << PAGE_SHIFT)))
33574 + /* Make old page tables read-only. */
33575 + if (!xen_feature(XENFEAT_writable_page_tables)
33576 + && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
33577 + && (paddr < (start_pfn << PAGE_SHIFT)))
33581 + * No need for writable mapping of kernel image. This also ensures that
33582 + * page and descriptor tables embedded inside don't have writable
33583 + * mappings. Exclude the vsyscall area here, allowing alternative
33584 + * instruction patching to work.
33586 + if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end))
33587 + && !(paddr >= __pa_symbol(&__vsyscall_0)
33588 + && paddr < __pa_symbol(&__vsyscall_0) + PAGE_SIZE))
33594 +#ifndef CONFIG_XEN
33595 +/* Must run before zap_low_mappings */
33596 +__init void *early_ioremap(unsigned long addr, unsigned long size)
33598 + unsigned long map = round_down(addr, LARGE_PAGE_SIZE);
33600 + /* actually usually some more */
33601 + if (size >= LARGE_PAGE_SIZE) {
33602 + printk("SMBIOS area too long %lu\n", size);
33605 + set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
33606 + map += LARGE_PAGE_SIZE;
33607 + set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
33609 + return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
33612 +/* To avoid virtual aliases later */
33613 +__init void early_iounmap(void *addr, unsigned long size)
33615 + if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
33616 + printk("early_iounmap: bad address %p\n", addr);
33617 + set_pmd(temp_mappings[0].pmd, __pmd(0));
33618 + set_pmd(temp_mappings[1].pmd, __pmd(0));
33623 +static void __meminit
33624 +phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
33628 + for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
33629 + unsigned long pte_phys;
33630 + pte_t *pte, *pte_save;
33632 + if (address >= end)
33634 + pte = alloc_static_page(&pte_phys);
33636 + for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
33637 + unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;
33639 + if (address >= (after_bootmem
33641 + : xen_start_info->nr_pages << PAGE_SHIFT))
33643 + else if (make_readonly(address))
33644 + pteval &= ~_PAGE_RW;
33645 + set_pte(pte, __pte(pteval & __supported_pte_mask));
33647 + if (!after_bootmem) {
33648 + early_make_page_readonly(pte_save, XENFEAT_writable_page_tables);
33649 + *pmd = __pmd(pte_phys | _KERNPG_TABLE);
33651 + make_page_readonly(pte_save, XENFEAT_writable_page_tables);
33652 + set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
33657 +static void __meminit
33658 +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
33660 + pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
33662 + if (pmd_none(*pmd)) {
33663 + spin_lock(&init_mm.page_table_lock);
33664 + phys_pmd_init(pmd, address, end);
33665 + spin_unlock(&init_mm.page_table_lock);
33666 + __flush_tlb_all();
33670 +static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
33672 + long i = pud_index(address);
33676 + if (after_bootmem && pud_val(*pud)) {
33677 + phys_pmd_update(pud, address, end);
33681 + for (; i < PTRS_PER_PUD; pud++, i++) {
33682 + unsigned long paddr, pmd_phys;
33685 + paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
33686 + if (paddr >= end)
33689 + pmd = alloc_static_page(&pmd_phys);
33691 + spin_lock(&init_mm.page_table_lock);
33692 + *pud = __pud(pmd_phys | _KERNPG_TABLE);
33693 + phys_pmd_init(pmd, paddr, end);
33694 + spin_unlock(&init_mm.page_table_lock);
33696 + early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
33701 +void __init xen_init_pt(void)
33703 + unsigned long addr, *page;
33705 + /* Find the initial pte page that was built for us. */
33706 + page = (unsigned long *)xen_start_info->pt_base;
33707 + addr = page[pgd_index(__START_KERNEL_map)];
33708 + addr_to_page(addr, page);
33709 + addr = page[pud_index(__START_KERNEL_map)];
33710 + addr_to_page(addr, page);
33712 +#if CONFIG_XEN_COMPAT <= 0x030002
33713 + /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER
33714 + in kernel PTEs. We check that here. */
33715 + if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) {
33716 + unsigned long *pg;
33719 + /* Mess with the initial mapping of page 0. It's not needed. */
33720 + BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map);
33721 + addr = page[pmd_index(__START_KERNEL_map)];
33722 + addr_to_page(addr, pg);
33723 + pte.pte = pg[pte_index(__START_KERNEL_map)];
33724 + BUG_ON(!(pte.pte & _PAGE_PRESENT));
33726 + /* If _PAGE_USER isn't set, we obviously do not need it. */
33727 + if (pte.pte & _PAGE_USER) {
33728 + /* _PAGE_USER is needed, but is it set implicitly? */
33729 + pte.pte &= ~_PAGE_USER;
33730 + if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map,
33732 + !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER))
33733 + /* We need to explicitly specify _PAGE_USER. */
33734 + __kernel_page_user = _PAGE_USER;
33739 + /* Construct mapping of initial pte page in our own directories. */
33740 + init_level4_pgt[pgd_index(__START_KERNEL_map)] =
33741 + __pgd(__pa_symbol(level3_kernel_pgt) | _PAGE_TABLE);
33742 + level3_kernel_pgt[pud_index(__START_KERNEL_map)] =
33743 + __pud(__pa_symbol(level2_kernel_pgt) | _PAGE_TABLE);
33744 + memcpy(level2_kernel_pgt, page, PAGE_SIZE);
33746 + __user_pgd(init_level4_pgt)[pgd_index(VSYSCALL_START)] =
33747 + __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
33749 + early_make_page_readonly(init_level4_pgt,
33750 + XENFEAT_writable_page_tables);
33751 + early_make_page_readonly(__user_pgd(init_level4_pgt),
33752 + XENFEAT_writable_page_tables);
33753 + early_make_page_readonly(level3_kernel_pgt,
33754 + XENFEAT_writable_page_tables);
33755 + early_make_page_readonly(level3_user_pgt,
33756 + XENFEAT_writable_page_tables);
33757 + early_make_page_readonly(level2_kernel_pgt,
33758 + XENFEAT_writable_page_tables);
33760 + if (!xen_feature(XENFEAT_writable_page_tables)) {
33761 + xen_pgd_pin(__pa_symbol(init_level4_pgt));
33762 + xen_pgd_pin(__pa_symbol(__user_pgd(init_level4_pgt)));
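+ /*
+ * Pinning tells Xen to treat these pages as top-level page tables
+ * from now on: the hypervisor validates them once and thereafter
+ * only allows updates through explicit hypercalls.
+ */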
33766 +static void __init extend_init_mapping(unsigned long tables_space)
33768 + unsigned long va = __START_KERNEL_map;
33769 + unsigned long phys, addr, *pte_page;
33771 + pte_t *pte, new_pte;
33772 + unsigned long *page = (unsigned long *)init_level4_pgt;
33774 + addr = page[pgd_index(va)];
33775 + addr_to_page(addr, page);
33776 + addr = page[pud_index(va)];
33777 + addr_to_page(addr, page);
33779 + /* Kill mapping of low 1MB. */
33780 + while (va < (unsigned long)&_text) {
33781 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
33786 + /* Ensure init mappings cover kernel text/data and initial tables. */
33787 + while (va < (__START_KERNEL_map
33788 + + (start_pfn << PAGE_SHIFT)
33789 + + tables_space)) {
33790 + pmd = (pmd_t *)&page[pmd_index(va)];
33791 + if (pmd_none(*pmd)) {
33792 + pte_page = alloc_static_page(&phys);
33793 + early_make_page_readonly(
33794 + pte_page, XENFEAT_writable_page_tables);
33795 + set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
33797 + addr = page[pmd_index(va)];
33798 + addr_to_page(addr, pte_page);
33800 + pte = (pte_t *)&pte_page[pte_index(va)];
33801 + if (pte_none(*pte)) {
33802 + new_pte = pfn_pte(
33803 + (va - __START_KERNEL_map) >> PAGE_SHIFT,
33804 + __pgprot(_KERNPG_TABLE));
33805 + xen_l1_entry_update(pte, new_pte);
33810 + /* Finally, blow away any spurious initial mappings. */
33812 + pmd = (pmd_t *)&page[pmd_index(va)];
33813 + if (pmd_none(*pmd))
33815 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
33821 +static void __init find_early_table_space(unsigned long end)
33823 + unsigned long puds, pmds, ptes, tables;
33825 + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
33826 + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
33827 + ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
33829 + tables = round_up(puds * 8, PAGE_SIZE) +
33830 + round_up(pmds * 8, PAGE_SIZE) +
33831 + round_up(ptes * 8, PAGE_SIZE);
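+ /* One 8-byte entry per potential PUD/PMD/PTE slot, each level
+ * rounded up to whole pages. */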
33833 + extend_init_mapping(tables);
33835 + table_start = start_pfn;
33836 + table_end = table_start + (tables>>PAGE_SHIFT);
33838 + early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
33839 + end, table_start << PAGE_SHIFT,
33840 + (table_start << PAGE_SHIFT) + tables);
33843 +static void xen_finish_init_mapping(void)
33845 + unsigned long i, start, end;
33847 + /* Re-vector virtual addresses pointing into the initial
33848 + mapping to the just-established permanent ones. */
33849 + xen_start_info = __va(__pa(xen_start_info));
33850 + xen_start_info->pt_base = (unsigned long)
33851 + __va(__pa(xen_start_info->pt_base));
33852 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
33853 + phys_to_machine_mapping =
33854 + __va(__pa(xen_start_info->mfn_list));
33855 + xen_start_info->mfn_list = (unsigned long)
33856 + phys_to_machine_mapping;
33858 + if (xen_start_info->mod_start)
33859 + xen_start_info->mod_start = (unsigned long)
33860 + __va(__pa(xen_start_info->mod_start));
33862 + /* Destroy the Xen-created mappings beyond the kernel image as
33863 + * well as the temporary mappings created above. Prevents
33864 + * overlap with modules area (if init mapping is very big).
33866 + start = PAGE_ALIGN((unsigned long)_end);
33867 + end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
33868 + for (; start < end; start += PAGE_SIZE)
33869 + if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
33872 + /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
33873 + table_end = ~0UL;
33876 + * Prefetch pte's for the bt_ioremap() area. It gets used before the
33877 + * boot-time allocator is online, so allocate-on-demand would fail.
33879 + for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
33880 + __set_fixmap(i, 0, __pgprot(0));
33882 + /* Switch to the real shared_info page, and clear the dummy page. */
33883 + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
33884 + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
33885 + memset(empty_zero_page, 0, sizeof(empty_zero_page));
33887 + /* Set up mapping of lowest 1MB of physical memory. */
33888 + for (i = 0; i < NR_FIX_ISAMAPS; i++)
33889 + if (is_initial_xendomain())
33890 + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
33892 + __set_fixmap(FIX_ISAMAP_BEGIN - i,
33893 + virt_to_mfn(empty_zero_page)
33897 + /* Disable the 'start_pfn' allocator. */
33898 + table_end = start_pfn;
33901 +/* Set up the direct mapping of the physical memory at PAGE_OFFSET.
33902 + This runs before bootmem is initialized and gets pages directly from the
33903 + physical memory. To access them they are temporarily mapped. */
33904 +void __meminit init_memory_mapping(unsigned long start, unsigned long end)
33906 + unsigned long next;
33908 + Dprintk("init_memory_mapping\n");
33910 + /*
33911 + * Find space for the kernel direct mapping tables.
33912 + * Later we should allocate these tables in the local node of the memory
33913 + * mapped. Unfortunately this is done currently before the nodes are
33914 + * discovered.
33915 + */
33916 + if (!after_bootmem)
33917 + find_early_table_space(end);
33919 + start = (unsigned long)__va(start);
33920 + end = (unsigned long)__va(end);
33922 + for (; start < end; start = next) {
33923 + unsigned long pud_phys;
33924 + pgd_t *pgd = pgd_offset_k(start);
33927 + if (after_bootmem)
33928 + pud = pud_offset(pgd, start & PGDIR_MASK);
33930 + pud = alloc_static_page(&pud_phys);
33931 + next = start + PGDIR_SIZE;
33934 + phys_pud_init(pud, __pa(start), __pa(next));
33935 + if (!after_bootmem) {
33936 + early_make_page_readonly(pud, XENFEAT_writable_page_tables);
33937 + set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
33941 + if (!after_bootmem) {
33942 + BUG_ON(start_pfn != table_end);
33943 + xen_finish_init_mapping();
33946 + __flush_tlb_all();
33949 +void __cpuinit zap_low_mappings(int cpu)
33951 + /* this is not required for Xen */
33952 +#if 0
33953 + swap_low_mappings();
33954 +#endif
33957 +/* Compute zone sizes for the DMA and DMA32 zones in a node. */
33959 +size_zones(unsigned long *z, unsigned long *h,
33960 + unsigned long start_pfn, unsigned long end_pfn)
33965 + for (i = 0; i < MAX_NR_ZONES; i++)
33968 + if (start_pfn < MAX_DMA_PFN)
33969 + z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
33970 + if (start_pfn < MAX_DMA32_PFN) {
33971 + unsigned long dma32_pfn = MAX_DMA32_PFN;
33972 + if (dma32_pfn > end_pfn)
33973 + dma32_pfn = end_pfn;
33974 + z[ZONE_DMA32] = dma32_pfn - start_pfn;
33976 + z[ZONE_NORMAL] = end_pfn - start_pfn;
33978 + /* Remove lower zones from higher ones. */
33980 + for (i = 0; i < MAX_NR_ZONES; i++) {
33986 + /* Compute holes */
33988 + for (i = 0; i < MAX_NR_ZONES; i++) {
33989 + unsigned long s = w;
33991 + h[i] = e820_hole_size(s, w);
33994 + /* Add the space needed for mem_map to the holes too. */
33995 + for (i = 0; i < MAX_NR_ZONES; i++)
33996 + h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
33998 + /* The 16MB DMA zone has the kernel and other misc mappings.
33999 + Account them too */
34000 + if (h[ZONE_DMA]) {
34001 + h[ZONE_DMA] += dma_reserve;
34002 + if (h[ZONE_DMA] >= z[ZONE_DMA]) {
34003 + printk(KERN_WARNING
34004 + "Kernel too large and filling up ZONE_DMA?\n");
34005 + h[ZONE_DMA] = z[ZONE_DMA];
34010 +#ifndef CONFIG_NUMA
34011 +void __init paging_init(void)
34013 + unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
34015 + memory_present(0, 0, end_pfn);
34017 + size_zones(zones, holes, 0, end_pfn);
34018 + free_area_init_node(0, NODE_DATA(0), zones,
34019 + __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
34021 + init_mm.context.pinned = 1;
34025 +/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
34026 + from the CPU leading to inconsistent cache lines. address and size
34027 + must be aligned to 2MB boundaries.
34028 + Does nothing when the mapping doesn't exist. */
34029 +void __init clear_kernel_mapping(unsigned long address, unsigned long size)
34031 + unsigned long end = address + size;
34033 + BUG_ON(address & ~LARGE_PAGE_MASK);
34034 + BUG_ON(size & ~LARGE_PAGE_MASK);
34036 + for (; address < end; address += LARGE_PAGE_SIZE) {
34037 + pgd_t *pgd = pgd_offset_k(address);
34040 + if (pgd_none(*pgd))
34042 + pud = pud_offset(pgd, address);
34043 + if (pud_none(*pud))
34045 + pmd = pmd_offset(pud, address);
34046 + if (!pmd || pmd_none(*pmd))
34048 + if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) {
34049 + /* Could handle this, but it should not happen currently. */
34051 + "clear_kernel_mapping: mapping has been split. will leak memory\n");
34054 + set_pmd(pmd, __pmd(0));
34056 + __flush_tlb_all();
34060 + * Memory hotplug specific functions
34062 +void online_page(struct page *page)
34064 + ClearPageReserved(page);
34065 + init_page_count(page);
34066 + __free_page(page);
34067 + totalram_pages++;
34071 +#ifdef CONFIG_MEMORY_HOTPLUG
34072 +/*
34073 + * XXX: memory_add_physaddr_to_nid() is meant to find the node id for a
34074 + * physical address given via the sysfs probe interface. If ACPI notifies
34075 + * a hot-add event, the node id can be found by searching the DSDT, but
34076 + * the probe interface carries no node id. So return node 0 for now.
34077 + */
34078 +#ifdef CONFIG_NUMA
34079 +int memory_add_physaddr_to_nid(u64 start)
34085 +/*
34086 + * Memory is always added to the NORMAL zone. This means you will never get
34087 + * additional DMA/DMA32 memory.
34088 + */
34089 +int arch_add_memory(int nid, u64 start, u64 size)
34091 + struct pglist_data *pgdat = NODE_DATA(nid);
34092 + struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
34093 + unsigned long start_pfn = start >> PAGE_SHIFT;
34094 + unsigned long nr_pages = size >> PAGE_SHIFT;
34097 + ret = __add_pages(zone, start_pfn, nr_pages);
34101 + init_memory_mapping(start, (start + size -1));
34105 + printk("%s: Problem encountered in __add_pages!\n", __func__);
34108 +EXPORT_SYMBOL_GPL(arch_add_memory);
34110 +int remove_memory(u64 start, u64 size)
34114 +EXPORT_SYMBOL_GPL(remove_memory);
34116 +#else /* CONFIG_MEMORY_HOTPLUG */
34118 + * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
34119 + * just online the pages.
34121 +int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
34124 + unsigned long pfn;
34125 + unsigned long total = 0, mem = 0;
34126 + for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
34127 + if (pfn_valid(pfn)) {
34128 + online_page(pfn_to_page(pfn));
34135 + z->spanned_pages += total;
34136 + z->present_pages += mem;
34137 + z->zone_pgdat->node_spanned_pages += total;
34138 + z->zone_pgdat->node_present_pages += mem;
34142 +#endif /* CONFIG_MEMORY_HOTPLUG */
34144 +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
34147 +void __init mem_init(void)
34149 + long codesize, reservedpages, datasize, initsize;
34150 + unsigned long pfn;
34152 + pci_iommu_alloc();
34154 + /* How many end-of-memory variables you have, grandma! */
34155 + max_low_pfn = end_pfn;
34156 + max_pfn = end_pfn;
34157 + num_physpages = end_pfn;
34158 + high_memory = (void *) __va(end_pfn * PAGE_SIZE);
34160 + /* clear the zero-page */
34161 + memset(empty_zero_page, 0, PAGE_SIZE);
34163 + reservedpages = 0;
34165 + /* this will put all low memory onto the freelists */
34166 +#ifdef CONFIG_NUMA
34167 + totalram_pages = numa_free_all_bootmem();
34169 + totalram_pages = free_all_bootmem();
34171 + /* XEN: init and count pages outside initial allocation. */
34172 + for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
34173 + ClearPageReserved(pfn_to_page(pfn));
34174 + init_page_count(pfn_to_page(pfn));
34175 + totalram_pages++;
34177 + reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
34179 + after_bootmem = 1;
34181 + codesize = (unsigned long) &_etext - (unsigned long) &_text;
34182 + datasize = (unsigned long) &_edata - (unsigned long) &_etext;
34183 + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
34185 + /* Register memory areas for /proc/kcore */
34186 + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
34187 + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
34188 + VMALLOC_END-VMALLOC_START);
34189 + kclist_add(&kcore_kernel, &_stext, _end - _stext);
34190 + kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
34191 + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
34192 + VSYSCALL_END - VSYSCALL_START);
34194 + printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
34195 + (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
34196 + end_pfn << (PAGE_SHIFT-10),
34198 + reservedpages << (PAGE_SHIFT-10),
34202 +#ifndef CONFIG_XEN
34205 + * Sync boot_level4_pgt mappings with the init_level4_pgt
34206 + * except for the low identity mappings which are already zapped
34207 + * in init_level4_pgt. This sync-up is essential for APs' bringup
34209 + memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
34214 +void free_init_pages(char *what, unsigned long begin, unsigned long end)
34216 + unsigned long addr;
34218 + if (begin >= end)
34221 + printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
34222 + for (addr = begin; addr < end; addr += PAGE_SIZE) {
34223 + ClearPageReserved(virt_to_page(addr));
34224 + init_page_count(virt_to_page(addr));
34225 + memset((void *)(addr & ~(PAGE_SIZE-1)),
34226 + POISON_FREE_INITMEM, PAGE_SIZE);
34227 + if (addr >= __START_KERNEL_map) {
34228 + /* make_readonly() reports all kernel addresses. */
34229 + __make_page_writable(__va(__pa(addr)));
34230 + if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
34231 + pgd_t *pgd = pgd_offset_k(addr);
34232 + pud_t *pud = pud_offset(pgd, addr);
34233 + pmd_t *pmd = pmd_offset(pud, addr);
34234 + pte_t *pte = pte_offset_kernel(pmd, addr);
34236 + xen_l1_entry_update(pte, __pte(0)); /* fallback */
34240 + totalram_pages++;
34244 +void free_initmem(void)
34246 + memset(__initdata_begin, POISON_FREE_INITDATA,
34247 + __initdata_end - __initdata_begin);
34248 + free_init_pages("unused kernel memory",
34249 + (unsigned long)(&__init_begin),
34250 + (unsigned long)(&__init_end));
34253 +#ifdef CONFIG_DEBUG_RODATA
34255 +void mark_rodata_ro(void)
34257 + unsigned long addr = (unsigned long)__start_rodata;
34259 + for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
34260 + change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
34262 + printk ("Write protecting the kernel read-only data: %luk\n",
34263 + (__end_rodata - __start_rodata) >> 10);
34266 + * change_page_attr_addr() requires a global_flush_tlb() call after it.
34267 + * We do this after the printk so that if something went wrong in the
34268 + * change, the printk gets out at least to give a better debug hint
34269 + * of who is the culprit.
34271 + global_flush_tlb();
34275 +#ifdef CONFIG_BLK_DEV_INITRD
34276 +void free_initrd_mem(unsigned long start, unsigned long end)
34278 + free_init_pages("initrd memory", start, end);
34282 +void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
34284 + /* Should check here against the e820 map to avoid double free */
34285 +#ifdef CONFIG_NUMA
34286 + int nid = phys_to_nid(phys);
34287 + reserve_bootmem_node(NODE_DATA(nid), phys, len);
34289 + reserve_bootmem(phys, len);
34291 + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
34292 + dma_reserve += len / PAGE_SIZE;
34295 +int kern_addr_valid(unsigned long addr)
34297 + unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
34303 + if (above != 0 && above != -1UL)
34306 + pgd = pgd_offset_k(addr);
34307 + if (pgd_none(*pgd))
34310 + pud = pud_offset(pgd, addr);
34311 + if (pud_none(*pud))
34314 + pmd = pmd_offset(pud, addr);
34315 + if (pmd_none(*pmd))
34317 + if (pmd_large(*pmd))
34318 + return pfn_valid(pmd_pfn(*pmd));
34320 + pte = pte_offset_kernel(pmd, addr);
34321 + if (pte_none(*pte))
34323 + return pfn_valid(pte_pfn(*pte));
34326 +#ifdef CONFIG_SYSCTL
34327 +#include <linux/sysctl.h>
34329 +extern int exception_trace, page_fault_trace;
34331 +static ctl_table debug_table2[] = {
34332 + { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
34337 +static ctl_table debug_root_table2[] = {
34338 + { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
34339 + .child = debug_table2 },
34343 +static __init int x8664_sysctl_init(void)
34345 + register_sysctl_table(debug_root_table2, 1);
34348 +__initcall(x8664_sysctl_init);
34351 +/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
34352 + covers the 64bit vsyscall page now. 32bit has a real VMA now and does
34353 + not need special handling anymore. */
34355 +static struct vm_area_struct gate_vma = {
34356 + .vm_start = VSYSCALL_START,
34357 + .vm_end = VSYSCALL_END,
34358 + .vm_page_prot = PAGE_READONLY
34361 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
34363 +#ifdef CONFIG_IA32_EMULATION
34364 + if (test_tsk_thread_flag(tsk, TIF_IA32))
34367 + return &gate_vma;
34370 +int in_gate_area(struct task_struct *task, unsigned long addr)
34372 + struct vm_area_struct *vma = get_gate_vma(task);
34375 + return (addr >= vma->vm_start) && (addr < vma->vm_end);
34378 +/* Use this when you have no reliable task/vma, typically from interrupt
34379 + * context. It is less reliable than using the task's vma and may give
34380 + * false positives.
34382 +int in_gate_area_no_task(unsigned long addr)
34384 + return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
34386 Index: head-2008-11-25/arch/x86/mm/pageattr_64-xen.c
34387 ===================================================================
34388 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
34389 +++ head-2008-11-25/arch/x86/mm/pageattr_64-xen.c 2008-07-21 11:00:32.000000000 +0200
34392 + * Copyright 2002 Andi Kleen, SuSE Labs.
34393 + * Thanks to Ben LaHaise for precious feedback.
34396 +#include <linux/mm.h>
34397 +#include <linux/sched.h>
34398 +#include <linux/highmem.h>
34399 +#include <linux/module.h>
34400 +#include <linux/slab.h>
34401 +#include <asm/uaccess.h>
34402 +#include <asm/processor.h>
34403 +#include <asm/tlbflush.h>
34404 +#include <asm/io.h>
34407 +#include <asm/pgalloc.h>
34408 +#include <asm/mmu_context.h>
34410 +LIST_HEAD(mm_unpinned);
34411 +DEFINE_SPINLOCK(mm_unpinned_lock);
34413 +static void _pin_lock(struct mm_struct *mm, int lock) {
34415 + spin_lock(&mm->page_table_lock);
34416 +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
34417 + /* While mm->page_table_lock protects us against insertions and
34418 + * removals of higher level page table pages, it doesn't protect
34419 + * against updates of pte-s. Such updates, however, require the
34420 + * pte pages to be in consistent state (unpinned+writable or
34421 + * pinned+readonly). The pinning and attribute changes, however
34422 + * cannot be done atomically, which is why such updates must be
34423 + * prevented from happening concurrently.
34424 + * Note that no pte lock can ever elsewhere be acquired nesting
34425 + * with an already acquired one in the same mm, or with the mm's
34426 + * page_table_lock already acquired, as that would break in the
34427 + * non-split case (where all these are actually resolving to the
34428 + * one page_table_lock). Thus acquiring all of them here is not
34429 + * going to result in deadlocks, and the order of acquires
34430 + * doesn't matter.
34433 + pgd_t *pgd = mm->pgd;
34436 + for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
34440 + if (pgd_none(*pgd))
34442 + pud = pud_offset(pgd, 0);
34443 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
34447 + if (pud_none(*pud))
34449 + pmd = pmd_offset(pud, 0);
34450 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
34453 + if (pmd_none(*pmd))
34455 + ptl = pte_lockptr(0, pmd);
34459 + spin_unlock(ptl);
34466 + spin_unlock(&mm->page_table_lock);
34468 +#define pin_lock(mm) _pin_lock(mm, 1)
34469 +#define pin_unlock(mm) _pin_lock(mm, 0)
34471 +#define PIN_BATCH 8
34472 +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
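+/*
+ * Per-CPU batch of update_va_mapping multicalls: attribute flips on
+ * page-table pages are queued and issued PIN_BATCH at a time to
+ * amortize the hypercall overhead.
+ */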
34474 +static inline unsigned int mm_walk_set_prot(void *pt, pgprot_t flags,
34475 + unsigned int cpu, unsigned int seq)
34477 + struct page *page = virt_to_page(pt);
34478 + unsigned long pfn = page_to_pfn(page);
34480 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
34481 + (unsigned long)__va(pfn << PAGE_SHIFT),
34482 + pfn_pte(pfn, flags), 0);
34483 + if (unlikely(++seq == PIN_BATCH)) {
34484 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
34485 + PIN_BATCH, NULL)))
34493 +static void mm_walk(struct mm_struct *mm, pgprot_t flags)
34500 + unsigned int cpu, seq;
34501 + multicall_entry_t *mcl;
34507 + * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
34508 + * be the 'current' task's pagetables (e.g., current may be 32-bit,
34509 + * but the pagetables may be for a 64-bit task).
34510 + * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
34511 + * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
34513 + for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
34514 + if (pgd_none(*pgd))
34516 + pud = pud_offset(pgd, 0);
34517 + if (PTRS_PER_PUD > 1) /* not folded */
34518 + seq = mm_walk_set_prot(pud,flags,cpu,seq);
34519 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
34520 + if (pud_none(*pud))
34522 + pmd = pmd_offset(pud, 0);
34523 + if (PTRS_PER_PMD > 1) /* not folded */
34524 + seq = mm_walk_set_prot(pmd,flags,cpu,seq);
34525 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
34526 + if (pmd_none(*pmd))
34528 + pte = pte_offset_kernel(pmd,0);
34529 + seq = mm_walk_set_prot(pte,flags,cpu,seq);
34534 + mcl = per_cpu(pb_mcl, cpu);
34535 + if (unlikely(seq > PIN_BATCH - 2)) {
34536 + if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
34540 + MULTI_update_va_mapping(mcl + seq,
34541 + (unsigned long)__user_pgd(mm->pgd),
34542 + pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, flags),
34544 + MULTI_update_va_mapping(mcl + seq + 1,
34545 + (unsigned long)mm->pgd,
34546 + pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, flags),
34548 + if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
34554 +void mm_pin(struct mm_struct *mm)
34556 + if (xen_feature(XENFEAT_writable_page_tables))
34561 + mm_walk(mm, PAGE_KERNEL_RO);
34562 + xen_pgd_pin(__pa(mm->pgd)); /* kernel */
34563 + xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */
34564 + mm->context.pinned = 1;
34565 + spin_lock(&mm_unpinned_lock);
34566 + list_del(&mm->context.unpinned);
34567 + spin_unlock(&mm_unpinned_lock);
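+/*
+ * Note the ordering: mm_pin() makes all page-table pages read-only
+ * before pinning the pgd pair, and mm_unpin() unpins before making
+ * them writable again -- Xen will not pin a table that is still
+ * mapped writable anywhere, nor allow a writable mapping of a
+ * pinned one.
+ */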
34572 +void mm_unpin(struct mm_struct *mm)
34574 + if (xen_feature(XENFEAT_writable_page_tables))
34579 + xen_pgd_unpin(__pa(mm->pgd));
34580 + xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
34581 + mm_walk(mm, PAGE_KERNEL);
34582 + mm->context.pinned = 0;
34583 + spin_lock(&mm_unpinned_lock);
34584 + list_add(&mm->context.unpinned, &mm_unpinned);
34585 + spin_unlock(&mm_unpinned_lock);
34590 +void mm_pin_all(void)
34592 + if (xen_feature(XENFEAT_writable_page_tables))
34596 + * Allow uninterrupted access to the mm_unpinned list. We don't
34597 + * actually take the mm_unpinned_lock as it is taken inside mm_pin().
34598 + * All other CPUs must be at a safe point (e.g., in stop_machine
34599 + * or offlined entirely).
34601 + preempt_disable();
34602 + while (!list_empty(&mm_unpinned))
34603 + mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
34604 + context.unpinned));
34605 + preempt_enable();
34608 +void _arch_dup_mmap(struct mm_struct *mm)
34610 + if (!mm->context.pinned)
34614 +void _arch_exit_mmap(struct mm_struct *mm)
34616 + struct task_struct *tsk = current;
34621 + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
34622 + * *much* faster this way, as no tlb flushes means bigger wrpt batches.
34624 + if (tsk->active_mm == mm) {
34625 + tsk->active_mm = &init_mm;
34626 + atomic_inc(&init_mm.mm_count);
34628 + switch_mm(mm, &init_mm, tsk);
34630 + atomic_dec(&mm->mm_count);
34631 + BUG_ON(atomic_read(&mm->mm_count) == 0);
34634 + task_unlock(tsk);
34636 + if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) &&
34637 + !mm->context.has_foreign_mappings )
34641 +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
34643 + struct page *pte;
34645 + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
34647 + SetPageForeign(pte, pte_free);
34648 + init_page_count(pte);
34653 +void pte_free(struct page *pte)
34655 + unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
34657 + if (!pte_write(*virt_to_ptep(va)))
34658 + if (HYPERVISOR_update_va_mapping(
34659 + va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0))
34662 + ClearPageForeign(pte);
34663 + init_page_count(pte);
34665 + __free_page(pte);
34667 +#endif /* CONFIG_XEN */
34669 +pte_t *lookup_address(unsigned long address)
34671 + pgd_t *pgd = pgd_offset_k(address);
34675 + if (pgd_none(*pgd))
34677 + pud = pud_offset(pgd, address);
34678 + if (!pud_present(*pud))
34680 + pmd = pmd_offset(pud, address);
34681 + if (!pmd_present(*pmd))
34683 + if (pmd_large(*pmd))
34684 + return (pte_t *)pmd;
34685 + pte = pte_offset_kernel(pmd, address);
34686 + if (pte && !pte_present(*pte))
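A hypothetical probe built on lookup_address(), shown only to illustrate its contract (it returns NULL when the address is unmapped, and a pmd cast to pte_t * for large mappings):

	/* Sketch: report whether a kernel virtual address is mapped large. */
	static int example_is_large_mapping(unsigned long addr)
	{
		pte_t *pte = lookup_address(addr);

		if (!pte)
			return 0;				/* not mapped */
		return (pte_val(*pte) & _PAGE_PSE) != 0;	/* PSE => large page */
	}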
34691 +static struct page *split_large_page(unsigned long address, pgprot_t prot,
34692 + pgprot_t ref_prot)
34695 + unsigned long addr;
34696 + struct page *base = alloc_pages(GFP_KERNEL, 0);
34701 + * page_private is used to track the number of entries in
34702 + * the page table page that have non-standard attributes.
34704 + SetPagePrivate(base);
34705 + page_private(base) = 0;
34707 + address = __pa(address);
34708 + addr = address & LARGE_PAGE_MASK;
34709 + pbase = (pte_t *)page_address(base);
34710 + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
34711 + pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
34712 + addr == address ? prot : ref_prot);
34718 +static void flush_kernel_map(void *address)
34720 + if (0 && address && cpu_has_clflush) {
34721 + /* is this worth it? */
34723 + for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
34724 + asm volatile("clflush (%0)" :: "r" (address + i));
34726 + asm volatile("wbinvd":::"memory");
34728 + __flush_tlb_one(address);
34730 + __flush_tlb_all();
34734 +static inline void flush_map(unsigned long address)
34736 + on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
34739 +static struct page *deferred_pages; /* protected by init_mm.mmap_sem */
34741 +static inline void save_page(struct page *fpage)
34743 + fpage->lru.next = (struct list_head *)deferred_pages;
34744 + deferred_pages = fpage;
34748 + * No more special protections in this 2/4MB area - revert to a
34749 + * large page again.
34751 +static void revert_page(unsigned long address, pgprot_t ref_prot)
34758 + pgd = pgd_offset_k(address);
34759 + BUG_ON(pgd_none(*pgd));
34760 + pud = pud_offset(pgd,address);
34761 + BUG_ON(pud_none(*pud));
34762 + pmd = pmd_offset(pud, address);
34763 + BUG_ON(__pmd_val(*pmd) & _PAGE_PSE);
34764 + pgprot_val(ref_prot) |= _PAGE_PSE;
34765 + large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
34766 + set_pte((pte_t *)pmd, large_pte);
34770 +__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
34771 + pgprot_t ref_prot)
34774 + struct page *kpte_page;
34775 + unsigned kpte_flags;
34776 + pgprot_t ref_prot2;
34777 + kpte = lookup_address(address);
34778 + if (!kpte) return 0;
34779 + kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
34780 + kpte_flags = pte_val(*kpte);
34781 + if (pgprot_val(prot) != pgprot_val(ref_prot)) {
34782 + if ((kpte_flags & _PAGE_PSE) == 0) {
34783 + set_pte(kpte, pfn_pte(pfn, prot));
34786 + * split_large_page will take the reference for this
34787 + * change_page_attr on the split page.
34790 + struct page *split;
34791 + ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
34793 + split = split_large_page(address, prot, ref_prot2);
34796 + set_pte(kpte,mk_pte(split, ref_prot2));
34797 + kpte_page = split;
34799 + page_private(kpte_page)++;
34800 + } else if ((kpte_flags & _PAGE_PSE) == 0) {
34801 + set_pte(kpte, pfn_pte(pfn, ref_prot));
34802 + BUG_ON(page_private(kpte_page) == 0);
34803 + page_private(kpte_page)--;
34807 + /* on x86-64 the direct mapping set at boot is not using 4k pages */
34809 + * ..., but the XEN guest kernels (currently) do:
34810 + * If the pte was reserved, it means it was created at boot
34811 + * time (not via split_large_page) and in turn we must not
34812 + * replace it with a large page.
34814 +#ifndef CONFIG_XEN
34815 + BUG_ON(PageReserved(kpte_page));
34817 + if (PageReserved(kpte_page))
34821 + if (page_private(kpte_page) == 0) {
34822 + save_page(kpte_page);
34823 + revert_page(address, ref_prot);
34829 + * Change the page attributes of a page in the linear mapping.
34831 + * This should be used when a page is mapped with a different caching policy
34832 + * than write-back somewhere - some CPUs do not like it when mappings with
34833 + * different caching policies exist. This changes the page attributes of the
34834 + * kernel linear mapping too.
34836 + * The caller needs to ensure that there are no conflicting mappings elsewhere.
34837 + * This function only deals with the kernel linear map.
34839 + * Caller must call global_flush_tlb() after this.
34841 +int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
34846 + down_write(&init_mm.mmap_sem);
34847 + for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
34848 + unsigned long pfn = __pa(address) >> PAGE_SHIFT;
34850 + err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
34853 + /* Handle the kernel mapping too, which aliases part of the lowmem. */
34855 + if (__pa(address) < KERNEL_TEXT_SIZE) {
34856 + unsigned long addr2;
34857 + pgprot_t prot2 = prot;
34858 + addr2 = __START_KERNEL_map + __pa(address);
34859 + pgprot_val(prot2) &= ~_PAGE_NX;
34860 + err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
34863 + up_write(&init_mm.mmap_sem);
34867 +/* Don't call this for MMIO areas that may not have a mem_map entry */
34868 +int change_page_attr(struct page *page, int numpages, pgprot_t prot)
34870 + unsigned long addr = (unsigned long)page_address(page);
34871 + return change_page_attr_addr(addr, numpages, prot);
34874 +void global_flush_tlb(void)
34876 + struct page *dpage;
34878 + down_read(&init_mm.mmap_sem);
34879 + dpage = xchg(&deferred_pages, NULL);
34880 + up_read(&init_mm.mmap_sem);
34882 + flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0);
34884 + struct page *tmp = dpage;
34885 + dpage = (struct page *)dpage->lru.next;
34886 + ClearPagePrivate(tmp);
34887 + __free_page(tmp);
34891 +EXPORT_SYMBOL(change_page_attr);
34892 +EXPORT_SYMBOL(global_flush_tlb);
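As the comment above notes, attribute changes are deferred until global_flush_tlb(); a hypothetical caller, for illustration only:

	/* Sketch: make one kernel page uncacheable; caller restores it later.
	 * change_page_attr() may sleep (init_mm.mmap_sem), so hold no spinlocks. */
	static int example_set_uncacheable(struct page *page)
	{
		int err = change_page_attr(page, 1, PAGE_KERNEL_NOCACHE);

		if (!err)
			global_flush_tlb();	/* required after any attribute change */
		return err;
	}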
34893 Index: head-2008-11-25/drivers/pci/msi-xen.c
34894 ===================================================================
34895 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
34896 +++ head-2008-11-25/drivers/pci/msi-xen.c 2008-10-13 13:43:45.000000000 +0200
34900 + * Purpose: PCI Message Signaled Interrupt (MSI)
34902 + * Copyright (C) 2003-2004 Intel
34903 + * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)
34906 +#include <linux/mm.h>
34907 +#include <linux/irq.h>
34908 +#include <linux/interrupt.h>
34909 +#include <linux/init.h>
34910 +#include <linux/ioport.h>
34911 +#include <linux/smp_lock.h>
34912 +#include <linux/pci.h>
34913 +#include <linux/proc_fs.h>
34915 +#include <xen/evtchn.h>
34917 +#include <asm/errno.h>
34918 +#include <asm/io.h>
34919 +#include <asm/smp.h>
34924 +static int pci_msi_enable = 1;
34926 +static struct msi_ops *msi_ops;
34928 +int msi_register(struct msi_ops *ops)
34934 +static LIST_HEAD(msi_dev_head);
34935 +DEFINE_SPINLOCK(msi_dev_lock);
34937 +struct msi_dev_list {
34938 + struct pci_dev *dev;
34939 + struct list_head list;
34940 + spinlock_t pirq_list_lock;
34941 + struct list_head pirq_list_head;
34944 +struct msi_pirq_entry {
34945 + struct list_head list;
34950 +static struct msi_dev_list *get_msi_dev_pirq_list(struct pci_dev *dev)
34952 + struct msi_dev_list *msi_dev_list, *ret = NULL;
34953 + unsigned long flags;
34955 + spin_lock_irqsave(&msi_dev_lock, flags);
34957 + list_for_each_entry(msi_dev_list, &msi_dev_head, list)
34958 + if ( msi_dev_list->dev == dev )
34959 + ret = msi_dev_list;
34962 + spin_unlock_irqrestore(&msi_dev_lock, flags);
34966 + /* No msi_dev has been allocated for this device yet. */
34967 + ret = kzalloc(sizeof(struct msi_dev_list), GFP_ATOMIC);
34969 + /* Failed to allocate msi_dev structure */
34971 + spin_unlock_irqrestore(&msi_dev_lock, flags);
34976 + spin_lock_init(&ret->pirq_list_lock);
34977 + INIT_LIST_HEAD(&ret->pirq_list_head);
34978 + list_add_tail(&ret->list, &msi_dev_head);
34979 + spin_unlock_irqrestore(&msi_dev_lock, flags);
34983 +static int attach_pirq_entry(int pirq, int entry_nr,
34984 + struct msi_dev_list *msi_dev_entry)
34986 + struct msi_pirq_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
34987 + unsigned long flags;
34991 + entry->pirq = pirq;
34992 + entry->entry_nr = entry_nr;
34993 + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
34994 + list_add_tail(&entry->list, &msi_dev_entry->pirq_list_head);
34995 + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
34999 +static void detach_pirq_entry(int entry_nr,
35000 + struct msi_dev_list *msi_dev_entry)
35002 + unsigned long flags;
35003 + struct msi_pirq_entry *pirq_entry;
35005 + list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) {
35006 + if (pirq_entry->entry_nr == entry_nr) {
35007 + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
35008 + list_del(&pirq_entry->list);
35009 + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
35010 + kfree(pirq_entry);
35017 + * pciback will provide the device's owner
35019 +static int (*get_owner)(struct pci_dev *dev);
35021 +int register_msi_get_owner(int (*func)(struct pci_dev *dev))
35024 + printk(KERN_WARNING "msi_get_owner already registered\n");
35027 + get_owner = func;
35031 +int unregister_msi_get_owner(int (*func)(struct pci_dev *dev))
35033 + if (get_owner != func)
35035 + get_owner = NULL;
35039 +static int msi_get_dev_owner(struct pci_dev *dev)
35043 + BUG_ON(!is_initial_xendomain());
35044 + if (get_owner && (owner = get_owner(dev)) >= 0) {
35045 + printk(KERN_INFO "owner of dev %x is %x\n",
35046 + dev->devfn, owner);
35050 + return DOMID_SELF;
35053 +static int msi_unmap_pirq(struct pci_dev *dev, int pirq)
35055 + struct physdev_unmap_pirq unmap;
35058 + unmap.domid = msi_get_dev_owner(dev);
35059 + /* See comments in msi_map_pirq_to_vector: the input parameter pirq
35060 + * means an irq number only if the device belongs to dom0 itself.
35062 + unmap.pirq = (unmap.domid != DOMID_SELF)
35063 + ? pirq : evtchn_get_xen_pirq(pirq);
35065 + if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap)))
35066 + printk(KERN_WARNING "unmap irq %x failed\n", pirq);
35071 + if (unmap.domid == DOMID_SELF)
35072 + evtchn_map_pirq(pirq, 0);
35077 +static u64 find_table_base(struct pci_dev *dev, int pos)
35081 + unsigned long flags;
35083 + pci_read_config_dword(dev, msix_table_offset_reg(pos), &reg);
35084 + bar = reg & PCI_MSIX_FLAGS_BIRMASK;
35086 + flags = pci_resource_flags(dev, bar);
35087 + if (flags & (IORESOURCE_DISABLED | IORESOURCE_UNSET | IORESOURCE_BUSY))
35090 + return pci_resource_start(dev, bar);
35094 + * Protected by msi_lock
35096 +static int msi_map_pirq_to_vector(struct pci_dev *dev, int pirq,
35097 + int entry_nr, u64 table_base)
35099 + struct physdev_map_pirq map_irq;
35101 + domid_t domid = DOMID_SELF;
35103 + domid = msi_get_dev_owner(dev);
35105 + map_irq.domid = domid;
35106 + map_irq.type = MAP_PIRQ_TYPE_MSI;
35107 + map_irq.index = -1;
35108 + map_irq.pirq = pirq < 0 ? -1 : evtchn_get_xen_pirq(pirq);
35109 + map_irq.bus = dev->bus->number;
35110 + map_irq.devfn = dev->devfn;
35111 + map_irq.entry_nr = entry_nr;
35112 + map_irq.table_base = table_base;
35114 + if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq)))
35115 + printk(KERN_WARNING "map irq failed\n");
35119 + /* This happens when MSI support is not enabled in Xen. */
35120 + if (rc == 0 && map_irq.pirq < 0)
35123 + BUG_ON(map_irq.pirq <= 0);
35125 + /* If mapping of this particular MSI is on behalf of another domain,
35126 + * we do not need to get an irq in dom0. This also implies:
35127 + * dev->irq in dom0 will be a 'Xen pirq' if this device belongs
35128 + * to another domain, and will be a 'Linux irq' if it belongs to dom0.
35130 + return ((domid != DOMID_SELF) ?
35131 + map_irq.pirq : evtchn_map_pirq(pirq, map_irq.pirq));
35134 +static int msi_map_vector(struct pci_dev *dev, int entry_nr, u64 table_base)
35136 + return msi_map_pirq_to_vector(dev, -1, entry_nr, table_base);
35139 +static int msi_init(void)
35141 + static int status = 0;
35143 + if (pci_msi_quirk) {
35144 + pci_msi_enable = 0;
35145 + printk(KERN_WARNING "PCI: MSI quirk detected. MSI disabled.\n");
35146 + status = -EINVAL;
35152 +void pci_scan_msi_device(struct pci_dev *dev) { }
35154 +void disable_msi_mode(struct pci_dev *dev, int pos, int type)
35158 + pci_read_config_word(dev, msi_control_reg(pos), &control);
35159 + if (type == PCI_CAP_ID_MSI) {
35160 + /* Set enabled bits to single MSI & enable MSI_enable bit */
35161 + msi_disable(control);
35162 + pci_write_config_word(dev, msi_control_reg(pos), control);
35163 + dev->msi_enabled = 0;
35165 + msix_disable(control);
35166 + pci_write_config_word(dev, msi_control_reg(pos), control);
35167 + dev->msix_enabled = 0;
35169 + if (pci_find_capability(dev, PCI_CAP_ID_EXP)) {
35170 + /* PCI Express Endpoint device detected */
35171 + pci_intx(dev, 1); /* enable intx */
35175 +static void enable_msi_mode(struct pci_dev *dev, int pos, int type)
35179 + pci_read_config_word(dev, msi_control_reg(pos), &control);
35180 + if (type == PCI_CAP_ID_MSI) {
35181 + /* Set enabled bits to single MSI & enable MSI_enable bit */
35182 + msi_enable(control, 1);
35183 + pci_write_config_word(dev, msi_control_reg(pos), control);
35184 + dev->msi_enabled = 1;
35186 + msix_enable(control);
35187 + pci_write_config_word(dev, msi_control_reg(pos), control);
35188 + dev->msix_enabled = 1;
35190 + if (pci_find_capability(dev, PCI_CAP_ID_EXP)) {
35191 + /* PCI Express Endpoint device detected */
35192 + pci_intx(dev, 0); /* disable intx */
35197 +int pci_save_msi_state(struct pci_dev *dev)
35201 + pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35202 + if (pos <= 0 || dev->no_msi)
35205 + if (!dev->msi_enabled)
35208 + /* Restore dev->irq to its default pin-assertion vector */
35209 + msi_unmap_pirq(dev, dev->irq);
35210 + /* Disable MSI mode */
35211 + disable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
35212 + /* Set the flags for use of restore */
35213 + dev->msi_enabled = 1;
35217 +void pci_restore_msi_state(struct pci_dev *dev)
35221 + pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35225 + if (!dev->msi_enabled)
35228 + pirq = msi_map_pirq_to_vector(dev, dev->irq, 0, 0);
35231 + enable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
35234 +int pci_save_msix_state(struct pci_dev *dev)
35237 + unsigned long flags;
35238 + struct msi_dev_list *msi_dev_entry;
35239 + struct msi_pirq_entry *pirq_entry, *tmp;
35241 + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35242 + if (pos <= 0 || dev->no_msi)
35245 + /* save the capability */
35246 + if (!dev->msix_enabled)
35249 + msi_dev_entry = get_msi_dev_pirq_list(dev);
35251 + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
35252 + list_for_each_entry_safe(pirq_entry, tmp,
35253 + &msi_dev_entry->pirq_list_head, list)
35254 + msi_unmap_pirq(dev, pirq_entry->pirq);
35255 + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
35257 + disable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
35258 + /* Set the flags for use of restore */
35259 + dev->msix_enabled = 1;
35264 +void pci_restore_msix_state(struct pci_dev *dev)
35267 + unsigned long flags;
35269 + struct msi_dev_list *msi_dev_entry;
35270 + struct msi_pirq_entry *pirq_entry, *tmp;
35272 + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35276 + if (!dev->msix_enabled)
35279 + msi_dev_entry = get_msi_dev_pirq_list(dev);
35280 + table_base = find_table_base(dev, pos);
35284 + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
35285 + list_for_each_entry_safe(pirq_entry, tmp,
35286 + &msi_dev_entry->pirq_list_head, list) {
35287 + int rc = msi_map_pirq_to_vector(dev, pirq_entry->pirq,
35288 + pirq_entry->entry_nr, table_base);
35290 + printk(KERN_WARNING
35291 + "%s: re-mapping irq #%d (pirq%d) failed: %d\n",
35292 + pci_name(dev), pirq_entry->entry_nr,
35293 + pirq_entry->pirq, rc);
35295 + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
35297 + enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
35302 + * msi_capability_init - configure device's MSI capability structure
35303 + * @dev: pointer to the pci_dev data structure of MSI device function
35305 + * Set up the MSI capability structure of the device function with a single
35306 + * MSI vector, regardless of whether the device function is capable of
35307 + * handling multiple messages. A return of zero indicates successful setup
35308 + * of entry zero with the new MSI vector; non-zero otherwise.
35310 +static int msi_capability_init(struct pci_dev *dev)
35315 + pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35316 + pci_read_config_word(dev, msi_control_reg(pos), &control);
35318 + pirq = msi_map_vector(dev, 0, 0);
35323 + /* Set MSI enabled bits */
35324 + enable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
35325 + dev->msi_enabled = 1;
35331 + * msix_capability_init - configure device's MSI-X capability
35332 + * @dev: pointer to the pci_dev data structure of MSI-X device function
35333 + * @entries: pointer to an array of struct msix_entry entries
35334 + * @nvec: number of @entries
35336 + * Set up the MSI-X capability structure of the device function with a
35337 + * single MSI-X vector. A return of zero indicates successful setup of the
35338 + * requested MSI-X entries with allocated vectors; non-zero otherwise.
35340 +static int msix_capability_init(struct pci_dev *dev,
35341 + struct msix_entry *entries, int nvec)
35344 + int pirq, i, j, mapped, pos;
35345 + struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev);
35346 + struct msi_pirq_entry *pirq_entry;
35348 + if (!msi_dev_entry)
35351 + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35352 + table_base = find_table_base(dev, pos);
35356 + /* MSI-X Table Initialization */
35357 + for (i = 0; i < nvec; i++) {
35359 + list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) {
35360 + if (pirq_entry->entry_nr == entries[i].entry) {
35361 + printk(KERN_WARNING "msix entry %d for dev %02x:%02x:%01x was \
35362 + not freed before being acquired again.\n", entries[i].entry,
35363 + dev->bus->number, PCI_SLOT(dev->devfn),
35364 + PCI_FUNC(dev->devfn));
35365 + (entries + i)->vector = pirq_entry->pirq;
35372 + pirq = msi_map_vector(dev, entries[i].entry, table_base);
35375 + attach_pirq_entry(pirq, entries[i].entry, msi_dev_entry);
35376 + (entries + i)->vector = pirq;
35380 + for (j = --i; j >= 0; j--) {
35381 + msi_unmap_pirq(dev, entries[j].vector);
35382 + detach_pirq_entry(entries[j].entry, msi_dev_entry);
35383 + entries[j].vector = 0;
35388 + enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
35389 + dev->msix_enabled = 1;
35395 + * pci_enable_msi - configure device's MSI capability structure
35396 + * @dev: pointer to the pci_dev data structure of MSI device function
35398 + * Set up the MSI capability structure of the device function with
35399 + * a single MSI vector when its driver requests that MSI mode be
35400 + * enabled on the hardware device function. A return of zero
35401 + * indicates successful setup of entry zero with the new MSI
35402 + * vector; non-zero otherwise.
35404 +extern int pci_frontend_enable_msi(struct pci_dev *dev);
35405 +int pci_enable_msi(struct pci_dev* dev)
35407 + struct pci_bus *bus;
35408 + int pos, temp, status = -EINVAL;
35410 + if (!pci_msi_enable || !dev)
35416 + for (bus = dev->bus; bus; bus = bus->parent)
35417 + if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI)
35420 + status = msi_init();
35424 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
35425 + if (!is_initial_xendomain())
35430 + ret = pci_frontend_enable_msi(dev);
35434 + dev->irq = evtchn_map_pirq(-1, dev->irq);
35435 + dev->irq_old = temp;
35443 + pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35447 + /* Check whether the driver already requested MSI-X vectors */
35448 + if (dev->msix_enabled) {
35449 + printk(KERN_INFO "PCI: %s: Can't enable MSI. "
35450 + "Device already has MSI-X vectors assigned\n",
35456 + status = msi_capability_init(dev);
35458 + dev->irq_old = temp;
35465 +extern void pci_frontend_disable_msi(struct pci_dev* dev);
35466 +void pci_disable_msi(struct pci_dev* dev)
35471 + if (!pci_msi_enable)
35476 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
35477 + if (!is_initial_xendomain()) {
35478 + evtchn_map_pirq(dev->irq, 0);
35479 + pci_frontend_disable_msi(dev);
35480 + dev->irq = dev->irq_old;
35485 + pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35490 + /* Restore dev->irq to its default pin-assertion vector */
35491 + dev->irq = dev->irq_old;
35492 + msi_unmap_pirq(dev, pirq);
35494 + /* Disable MSI mode */
35495 + disable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
35499 + * pci_enable_msix - configure device's MSI-X capability structure
35500 + * @dev: pointer to the pci_dev data structure of MSI-X device function
35501 + * @entries: pointer to an array of MSI-X entries
35502 + * @nvec: number of MSI-X vectors requested for allocation by device driver
35504 + * Set up the MSI-X capability structure of the device function with the
35505 + * number of requested vectors when its driver requests that MSI-X mode
35506 + * be enabled on the hardware device function. A return of zero indicates
35507 + * successful configuration of the MSI-X capability structure with newly
35508 + * allocated MSI-X vectors. A return of < 0 indicates a failure, while a
35509 + * return of > 0 indicates that the driver requested more vectors than are
35510 + * available; the driver should then re-send its request with the returned value.
35513 +extern int pci_frontend_enable_msix(struct pci_dev *dev,
35514 + struct msix_entry *entries, int nvec);
35515 +int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec)
35517 + struct pci_bus *bus;
35518 + int status, pos, nr_entries;
35522 + if (!pci_msi_enable || !dev || !entries)
35528 + for (bus = dev->bus; bus; bus = bus->parent)
35529 + if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI)
35532 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
35533 + if (!is_initial_xendomain()) {
35534 + struct msi_dev_list *msi_dev_entry;
35535 + struct msi_pirq_entry *pirq_entry;
35538 + ret = pci_frontend_enable_msix(dev, entries, nvec);
35540 + printk("got %x from pci_frontend_enable_msix\n", ret);
35544 + msi_dev_entry = get_msi_dev_pirq_list(dev);
35545 + for (i = 0; i < nvec; i++) {
35548 + list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) {
35549 + if (pirq_entry->entry_nr == entries[i].entry) {
35550 + irq = pirq_entry->pirq;
35551 + BUG_ON(entries[i].vector != evtchn_get_xen_pirq(irq));
35552 + entries[i].vector = irq;
35559 + irq = evtchn_map_pirq(-1, entries[i].vector);
35560 + attach_pirq_entry(irq, entries[i].entry, msi_dev_entry);
35561 + entries[i].vector = irq;
35567 + status = msi_init();
35571 + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35575 + pci_read_config_word(dev, msi_control_reg(pos), &control);
35576 + nr_entries = multi_msix_capable(control);
35577 + if (nvec > nr_entries)
35580 + /* Check for any invalid entries */
35581 + for (i = 0; i < nvec; i++) {
35582 + if (entries[i].entry >= nr_entries)
35583 + return -EINVAL; /* invalid entry */
35584 + for (j = i + 1; j < nvec; j++) {
35585 + if (entries[i].entry == entries[j].entry)
35586 + return -EINVAL; /* duplicate entry */
35591 + /* Check whether the driver already requested an MSI vector */
35592 + if (dev->msi_enabled) {
35593 + printk(KERN_INFO "PCI: %s: Can't enable MSI-X. "
35594 + "Device already has an MSI vector assigned\n",
35600 + status = msix_capability_init(dev, entries, nvec);
35603 + dev->irq_old = temp;
35610 +extern void pci_frontend_disable_msix(struct pci_dev* dev);
35611 +void pci_disable_msix(struct pci_dev* dev)
35617 + if (!pci_msi_enable)
35622 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
35623 + if (!is_initial_xendomain()) {
35624 + struct msi_dev_list *msi_dev_entry;
35625 + struct msi_pirq_entry *pirq_entry, *tmp;
35627 + pci_frontend_disable_msix(dev);
35629 + msi_dev_entry = get_msi_dev_pirq_list(dev);
35630 + list_for_each_entry_safe(pirq_entry, tmp,
35631 + &msi_dev_entry->pirq_list_head, list) {
35632 + evtchn_map_pirq(pirq_entry->pirq, 0);
35633 + list_del(&pirq_entry->list);
35634 + kfree(pirq_entry);
35637 + dev->irq = dev->irq_old;
35642 + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35646 + pci_read_config_word(dev, msi_control_reg(pos), &control);
35647 + if (!(control & PCI_MSIX_FLAGS_ENABLE))
35650 + msi_remove_pci_irq_vectors(dev);
35652 + /* Disable MSI mode */
35653 + disable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
35657 + * msi_remove_pci_irq_vectors - reclaim MSI(X) vectors to unused state
35658 + * @dev: pointer to the pci_dev data structure of MSI(X) device function
35660 + * Called during hotplug removal of the device function. All
35661 + * previously assigned MSI/MSI-X vectors, if allocated for this
35662 + * device function, are reclaimed to the unused state and may
35663 + * be reused later.
35665 +void msi_remove_pci_irq_vectors(struct pci_dev* dev)
35667 + unsigned long flags;
35668 + struct msi_dev_list *msi_dev_entry;
35669 + struct msi_pirq_entry *pirq_entry, *tmp;
35671 + if (!pci_msi_enable || !dev)
35674 + msi_dev_entry = get_msi_dev_pirq_list(dev);
35676 + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
35677 + if (!list_empty(&msi_dev_entry->pirq_list_head))
35679 + printk(KERN_WARNING "msix pirqs for dev %02x:%02x:%01x were not freed \
35680 + before removal.\n", dev->bus->number, PCI_SLOT(dev->devfn),
35681 + PCI_FUNC(dev->devfn));
35682 + list_for_each_entry_safe(pirq_entry, tmp,
35683 + &msi_dev_entry->pirq_list_head, list) {
35684 + msi_unmap_pirq(dev, pirq_entry->pirq);
35685 + list_del(&pirq_entry->list);
35686 + kfree(pirq_entry);
35689 + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
35690 + dev->irq = dev->irq_old;
35693 +void pci_no_msi(void)
35695 + pci_msi_enable = 0;
35698 +EXPORT_SYMBOL(pci_enable_msi);
35699 +EXPORT_SYMBOL(pci_disable_msi);
35700 +EXPORT_SYMBOL(pci_enable_msix);
35701 +EXPORT_SYMBOL(pci_disable_msix);
35703 +EXPORT_SYMBOL(register_msi_get_owner);
35704 +EXPORT_SYMBOL(unregister_msi_get_owner);
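From a driver's perspective these exports behave like the native Linux MSI API; a hypothetical enable/disable sequence for illustration only (2.6.18-era handler signature assumed):

	/* Sketch: request a single MSI vector, falling back to pin-based INTx. */
	static int example_setup_irq(struct pci_dev *dev,
				     irqreturn_t (*handler)(int, void *, struct pt_regs *))
	{
		int err = pci_enable_msi(dev);

		if (err)
			return err;	/* caller keeps using the pin-based dev->irq */
		err = request_irq(dev->irq, handler, 0, "example", dev);
		if (err)
			pci_disable_msi(dev);
		return err;
	}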
35707 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/agp.h
35708 ===================================================================
35709 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
35710 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/agp.h 2007-06-22 09:08:06.000000000 +0200
35715 +#include <asm/pgtable.h>
35716 +#include <asm/cacheflush.h>
35717 +#include <asm/system.h>
35720 + * Functions to keep the agpgart mappings coherent with the MMU.
35721 + * The GART gives the CPU a physical alias of pages in memory. The alias region is
35722 + * mapped uncacheable. Make sure there are no conflicting mappings
35723 + * with different cacheability attributes for the same page. This avoids
35724 + * data corruption on some CPUs.
35727 +/* Caller's responsibility to call global_flush_tlb() for
35728 + * performance reasons */
35729 +#define map_page_into_agp(page) ( \
35730 + xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \
35731 + ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE))
35732 +#define unmap_page_from_agp(page) ( \
35733 + xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \
35734 + /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
35735 + change_page_attr(page, 1, PAGE_KERNEL))
35736 +#define flush_agp_mappings() global_flush_tlb()
35738 +/* Could use CLFLUSH here if the cpu supports it. But then it would
35739 + need to be called for each cacheline of the whole page so it may not be
35740 + worth it. Would need a page for it. */
35741 +#define flush_agp_cache() wbinvd()
35743 +/* Convert a physical address to an address suitable for the GART. */
35744 +#define phys_to_gart(x) phys_to_machine(x)
35745 +#define gart_to_phys(x) machine_to_phys(x)
35747 +/* GATT allocation. Returns/accepts GATT kernel virtual address. */
35748 +#define alloc_gatt_pages(order) ({ \
35749 + char *_t; dma_addr_t _d; \
35750 + _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL); \
35752 +#define free_gatt_pages(table, order) \
35753 + dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table))
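Per the comment above, callers change attributes and then flush once; a hypothetical sketch of the mapping side:

	/* Sketch: make one page GART-visible using the macros defined above. */
	static int example_agp_map(struct page *page)
	{
		int err = map_page_into_agp(page);	/* contiguous + uncacheable */

		if (!err)
			flush_agp_mappings();		/* i.e. global_flush_tlb() */
		return err;
	}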
35756 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/desc_32.h
35757 ===================================================================
35758 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
35759 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/desc_32.h 2008-01-28 12:24:19.000000000 +0100
35761 +#ifndef __ARCH_DESC_H
35762 +#define __ARCH_DESC_H
35764 +#include <asm/ldt.h>
35765 +#include <asm/segment.h>
35767 +#define CPU_16BIT_STACK_SIZE 1024
35769 +#ifndef __ASSEMBLY__
35771 +#include <linux/preempt.h>
35772 +#include <linux/smp.h>
35774 +#include <asm/mmu.h>
35776 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
35778 +DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
35780 +struct Xgt_desc_struct {
35781 + unsigned short size;
35782 + unsigned long address __attribute__((packed));
35783 + unsigned short pad;
35784 +} __attribute__ ((packed));
35786 +extern struct Xgt_desc_struct idt_descr;
35787 +DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
35790 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
35792 + return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
35795 +#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
35796 +#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
35798 +#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
35799 +#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
35800 +#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
35801 +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
35803 +#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
35804 +#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
35805 +#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
35806 +#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
35809 + * This is the ldt that every process will get unless we need
35810 + * something other than this.
35812 +extern struct desc_struct default_ldt[];
35813 +extern void set_intr_gate(unsigned int irq, void * addr);
35815 +#define _set_tssldt_desc(n,addr,limit,type) \
35816 +__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
35817 + "movw %w1,2(%2)\n\t" \
35818 + "rorl $16,%1\n\t" \
35819 + "movb %b1,4(%2)\n\t" \
35820 + "movb %4,5(%2)\n\t" \
35821 + "movb $0,6(%2)\n\t" \
35822 + "movb %h1,7(%2)\n\t" \
35824 + : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
35826 +#ifndef CONFIG_X86_NO_TSS
35827 +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
35829 + _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
35830 + offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
35833 +#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
35836 +static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
35838 + _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
35841 +#define LDT_entry_a(info) \
35842 + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
35844 +#define LDT_entry_b(info) \
35845 + (((info)->base_addr & 0xff000000) | \
35846 + (((info)->base_addr & 0x00ff0000) >> 16) | \
35847 + ((info)->limit & 0xf0000) | \
35848 + (((info)->read_exec_only ^ 1) << 9) | \
35849 + ((info)->contents << 10) | \
35850 + (((info)->seg_not_present ^ 1) << 15) | \
35851 + ((info)->seg_32bit << 22) | \
35852 + ((info)->limit_in_pages << 23) | \
35853 + ((info)->useable << 20) | \
35856 +#define LDT_empty(info) (\
35857 + (info)->base_addr == 0 && \
35858 + (info)->limit == 0 && \
35859 + (info)->contents == 0 && \
35860 + (info)->read_exec_only == 1 && \
35861 + (info)->seg_32bit == 0 && \
35862 + (info)->limit_in_pages == 0 && \
35863 + (info)->seg_not_present == 1 && \
35864 + (info)->useable == 0 )
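To make the bit layout concrete, a hypothetical helper that packs a struct user_desc (from asm/ldt.h, included above) into the two descriptor words via these macros:

	/* Sketch: produce the raw a/b words exactly as LDT_entry_a()/_b() lay
	 * them out; suitable as input to write_ldt_entry() below. */
	static inline void example_pack_ldt(const struct user_desc *info,
					    __u32 *a, __u32 *b)
	{
		*a = LDT_entry_a(info);	/* base[15:0] << 16 | limit[15:0] */
		*b = LDT_entry_b(info);	/* base[31:24]/[23:16], flags, limit[19:16] */
	}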
35866 +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
35868 +#if TLS_SIZE != 24
35869 +# error update this code.
35872 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
35874 +#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \
35875 + *(u64 *)&t->tls_array[i])) \
35877 + C(0); C(1); C(2);
35881 +static inline void clear_LDT(void)
35883 + int cpu = get_cpu();
35886 + * NB. We load the default_ldt for lcall7/27 handling on demand, as
35887 + * it slows down context switching. No one uses it anyway.
35889 + cpu = cpu; /* XXX avoid compiler warning */
35890 + xen_set_ldt(NULL, 0);
35895 + * load one particular LDT into the current CPU
35897 +static inline void load_LDT_nolock(mm_context_t *pc, int cpu)
35899 + void *segments = pc->ldt;
35900 + int count = pc->size;
35902 + if (likely(!count))
35905 + xen_set_ldt(segments, count);
35908 +static inline void load_LDT(mm_context_t *pc)
35910 + int cpu = get_cpu();
35911 + load_LDT_nolock(pc, cpu);
35915 +static inline unsigned long get_desc_base(unsigned long *desc)
35917 + unsigned long base;
35918 + base = ((desc[0] >> 16) & 0x0000ffff) |
35919 + ((desc[1] << 16) & 0x00ff0000) |
35920 + (desc[1] & 0xff000000);
35924 +#endif /* !__ASSEMBLY__ */
35927 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_32.h
35928 ===================================================================
35929 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
35930 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2008-04-02 12:34:02.000000000 +0200
35932 +#ifndef _ASM_I386_DMA_MAPPING_H
35933 +#define _ASM_I386_DMA_MAPPING_H
35936 + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for documentation.
35940 +#include <linux/mm.h>
35941 +#include <asm/cache.h>
35942 +#include <asm/io.h>
35943 +#include <asm/scatterlist.h>
35944 +#include <asm/swiotlb.h>
35947 +address_needs_mapping(struct device *hwdev, dma_addr_t addr)
35949 + dma_addr_t mask = 0xffffffff;
35950 + /* If the device has a mask, use it, otherwise default to 32 bits */
35951 + if (hwdev && hwdev->dma_mask)
35952 + mask = *hwdev->dma_mask;
35953 + return (addr & ~mask) != 0;
35956 +extern int range_straddles_page_boundary(paddr_t p, size_t size);
35958 +#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
35959 +#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
35961 +void *dma_alloc_coherent(struct device *dev, size_t size,
35962 + dma_addr_t *dma_handle, gfp_t flag);
35964 +void dma_free_coherent(struct device *dev, size_t size,
35965 + void *vaddr, dma_addr_t dma_handle);
35968 +dma_map_single(struct device *dev, void *ptr, size_t size,
35969 + enum dma_data_direction direction);
35972 +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
35973 + enum dma_data_direction direction);
35975 +extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
35976 + int nents, enum dma_data_direction direction);
35977 +extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg,
35978 + int nents, enum dma_data_direction direction);
35980 +#ifdef CONFIG_HIGHMEM
35982 +dma_map_page(struct device *dev, struct page *page, unsigned long offset,
35983 + size_t size, enum dma_data_direction direction);
35986 +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
35987 + enum dma_data_direction direction);
35989 +#define dma_map_page(dev, page, offset, size, dir) \
35990 + dma_map_single(dev, page_address(page) + (offset), (size), (dir))
35991 +#define dma_unmap_page dma_unmap_single
35995 +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
35996 + enum dma_data_direction direction);
35999 +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
36000 + enum dma_data_direction direction);
36002 +static inline void
36003 +dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
36004 + unsigned long offset, size_t size,
36005 + enum dma_data_direction direction)
36007 + dma_sync_single_for_cpu(dev, dma_handle+offset, size, direction);
36010 +static inline void
36011 +dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
36012 + unsigned long offset, size_t size,
36013 + enum dma_data_direction direction)
36015 + dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
36018 +static inline void
36019 +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
36020 + enum dma_data_direction direction)
36023 + swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
36024 + flush_write_buffers();
36027 +static inline void
36028 +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
36029 + enum dma_data_direction direction)
36032 + swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
36033 + flush_write_buffers();
36037 +dma_mapping_error(dma_addr_t dma_addr);
36040 +dma_supported(struct device *dev, u64 mask);
36043 +dma_set_mask(struct device *dev, u64 mask)
36045 + if(!dev->dma_mask || !dma_supported(dev, mask))
36048 + *dev->dma_mask = mask;
36054 +dma_get_cache_alignment(void)
36056 + /* no easy way to get cache size on all x86, so return the
36057 + * maximum possible, to be safe */
36058 + return (1 << INTERNODE_CACHE_SHIFT);
36061 +#define dma_is_consistent(d) (1)
36063 +static inline void
36064 +dma_cache_sync(void *vaddr, size_t size,
36065 + enum dma_data_direction direction)
36067 + flush_write_buffers();
36070 +#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
36072 +dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
36073 + dma_addr_t device_addr, size_t size, int flags);
36076 +dma_release_declared_memory(struct device *dev);
36079 +dma_mark_declared_memory_occupied(struct device *dev,
36080 + dma_addr_t device_addr, size_t size);
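A minimal hypothetical streaming-DMA sequence against this interface, as a driver would issue it ('dev' and 'buf' are assumed to come from the caller):

	/* Sketch: map a kernel buffer for a device read-out, then unmap it. */
	static void example_stream_dma(struct device *dev, void *buf, size_t len)
	{
		dma_addr_t handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

		if (dma_mapping_error(handle))
			return;
		/* ... program the device with 'handle', wait for completion ... */
		dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
	}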
36083 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_32.h
36084 ===================================================================
36085 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
36086 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_32.h 2007-06-12 13:14:02.000000000 +0200
36089 + * fixmap.h: compile-time virtual memory allocation
36091 + * This file is subject to the terms and conditions of the GNU General Public
36092 + * License. See the file "COPYING" in the main directory of this archive
36093 + * for more details.
36095 + * Copyright (C) 1998 Ingo Molnar
36097 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
36100 +#ifndef _ASM_FIXMAP_H
36101 +#define _ASM_FIXMAP_H
36104 +/* used by vmalloc.c, vsyscall.lds.S.
36106 + * Leave one empty page between vmalloc'ed areas and
36107 + * the start of the fixmap.
36109 +extern unsigned long __FIXADDR_TOP;
36111 +#ifndef __ASSEMBLY__
36112 +#include <linux/kernel.h>
36113 +#include <asm/acpi.h>
36114 +#include <asm/apicdef.h>
36115 +#include <asm/page.h>
36116 +#ifdef CONFIG_HIGHMEM
36117 +#include <linux/threads.h>
36118 +#include <asm/kmap_types.h>
36122 + * Here we define all the compile-time 'special' virtual
36123 + * addresses. The point is to have a constant address at
36124 + * compile time, but to set the physical address only
36125 + * in the boot process. We allocate these special addresses
36126 + * from the end of virtual memory (0xfffff000) backwards.
36127 + * Also this lets us do fail-safe vmalloc(), we
36128 + * can guarantee that these special addresses and
36129 + * vmalloc()-ed addresses never overlap.
36131 + * these 'compile-time allocated' memory buffers are
36132 + * fixed-size 4k pages (or larger if used with an increment
36133 + * higher than 1). Use set_fixmap(idx, phys) to associate
36134 + * physical memory with fixmap indices.
36136 + * TLB entries of such buffers will not be flushed across task switches.
36139 +enum fixed_addresses {
36142 +#ifdef CONFIG_X86_LOCAL_APIC
36143 + FIX_APIC_BASE, /* local (CPU) APIC -- required for SMP or not */
36145 +#ifdef CONFIG_X86_IO_APIC
36146 + FIX_IO_APIC_BASE_0,
36147 + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
36149 +#ifdef CONFIG_X86_VISWS_APIC
36150 + FIX_CO_CPU, /* Cobalt timer */
36151 + FIX_CO_APIC, /* Cobalt APIC Redirection Table */
36152 + FIX_LI_PCIA, /* Lithium PCI Bridge A */
36153 + FIX_LI_PCIB, /* Lithium PCI Bridge B */
36155 +#ifdef CONFIG_X86_F00F_BUG
36156 + FIX_F00F_IDT, /* Virtual mapping for IDT */
36158 +#ifdef CONFIG_X86_CYCLONE_TIMER
36159 + FIX_CYCLONE_TIMER, /*cyclone timer register*/
36161 +#ifdef CONFIG_HIGHMEM
36162 + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
36163 + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
36165 +#ifdef CONFIG_ACPI
36167 + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
36169 +#ifdef CONFIG_PCI_MMCONFIG
36173 +#define NR_FIX_ISAMAPS 256
36175 + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
36176 + __end_of_permanent_fixed_addresses,
36177 + /* temporary boot-time mappings, used before ioremap() is functional */
36178 +#define NR_FIX_BTMAPS 16
36179 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
36180 + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
36182 + __end_of_fixed_addresses
36185 +extern void set_fixaddr_top(unsigned long top);
36187 +extern void __set_fixmap(enum fixed_addresses idx,
36188 + maddr_t phys, pgprot_t flags);
36190 +#define set_fixmap(idx, phys) \
36191 + __set_fixmap(idx, phys, PAGE_KERNEL)
36193 + * Some hardware wants to get fixmapped without caching.
36195 +#define set_fixmap_nocache(idx, phys) \
36196 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
36198 +#define clear_fixmap(idx) \
36199 + __set_fixmap(idx, 0, __pgprot(0))
36201 +#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
36203 +#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
36204 +#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
36205 +#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
36206 +#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
36208 +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
36209 +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
36211 +extern void __this_fixmap_does_not_exist(void);
36214 + * 'index to address' translation. If anyone tries to use the idx
36215 + * directly without translation, we catch the bug with a NULL-dereference
36216 + * kernel oops. Illegal ranges of incoming indices are caught too.
36218 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
36221 + * this branch gets completely eliminated after inlining,
36222 + * except when someone tries to use fixaddr indices in an
36223 + * illegal way. (such as mixing up address types or using
36224 + * out-of-range indices).
36226 + * If it doesn't get removed, the linker will complain
36227 + * loudly with a reasonably clear error message..
36229 + if (idx >= __end_of_fixed_addresses)
36230 + __this_fixmap_does_not_exist();
36232 + return __fix_to_virt(idx);
36235 +static inline unsigned long virt_to_fix(const unsigned long vaddr)
36237 + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
36238 + return __virt_to_fix(vaddr);
36241 +#endif /* !__ASSEMBLY__ */
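A hypothetical use of the API above; FIX_EXAMPLE is a made-up index that a real user would first add to enum fixed_addresses before __end_of_permanent_fixed_addresses:

	/* Sketch: map an MMIO page at a compile-time-constant virtual address. */
	static void __iomem *example_map_regs(maddr_t phys)
	{
		set_fixmap_nocache(FIX_EXAMPLE, phys);	/* hypothetical slot */
		return (void __iomem *)fix_to_virt(FIX_EXAMPLE);
	}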
36243 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/gnttab_dma.h
36244 ===================================================================
36245 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
36246 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/gnttab_dma.h 2007-08-06 15:10:49.000000000 +0200
36249 + * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
36250 + * Copyright (c) 2007 Isaku Yamahata <yamahata at valinux co jp>
36251 + * VA Linux Systems Japan K.K.
36253 + * This program is free software; you can redistribute it and/or modify
36254 + * it under the terms of the GNU General Public License as published by
36255 + * the Free Software Foundation; either version 2 of the License, or
36256 + * (at your option) any later version.
36258 + * This program is distributed in the hope that it will be useful,
36259 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
36260 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
36261 + * GNU General Public License for more details.
36263 + * You should have received a copy of the GNU General Public License
36264 + * along with this program; if not, write to the Free Software
36265 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
36268 +#ifndef _ASM_I386_GNTTAB_DMA_H
36269 +#define _ASM_I386_GNTTAB_DMA_H
36271 +static inline int gnttab_dma_local_pfn(struct page *page)
36273 + /* Has it become a local MFN? */
36274 + return pfn_valid(mfn_to_local_pfn(pfn_to_mfn(page_to_pfn(page))));
36277 +static inline maddr_t gnttab_dma_map_page(struct page *page)
36279 + __gnttab_dma_map_page(page);
36280 + return ((maddr_t)pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT);
36283 +static inline void gnttab_dma_unmap_page(maddr_t maddr)
36285 + __gnttab_dma_unmap_page(virt_to_page(bus_to_virt(maddr)));
36288 +#endif /* _ASM_I386_GNTTAB_DMA_H */
36289 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/highmem.h
36290 ===================================================================
36291 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
36292 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/highmem.h 2008-10-29 09:55:56.000000000 +0100
36295 + * highmem.h: virtual kernel memory mappings for high memory
36297 + * Used in CONFIG_HIGHMEM systems for memory pages which
36298 + * are not addressable by direct kernel virtual addresses.
36300 + * Copyright (C) 1999 Gerhard Wichert, Siemens AG
36301 + * Gerhard.Wichert@pdb.siemens.de
36304 + * Redesigned the x86 32-bit VM architecture to deal with
36305 + * up to 16 Terabyte physical memory. With current x86 CPUs
36306 + * we now support up to 64 Gigabytes physical RAM.
36308 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
36311 +#ifndef _ASM_HIGHMEM_H
36312 +#define _ASM_HIGHMEM_H
36316 +#include <linux/interrupt.h>
36317 +#include <linux/threads.h>
36318 +#include <asm/kmap_types.h>
36319 +#include <asm/tlbflush.h>
36321 +/* declarations for highmem.c */
36322 +extern unsigned long highstart_pfn, highend_pfn;
36324 +extern pte_t *kmap_pte;
36325 +extern pgprot_t kmap_prot;
36326 +extern pte_t *pkmap_page_table;
36329 + * Right now we initialize only a single pte table. It can be extended
36330 + * easily; subsequent pte tables have to be allocated in one physical chunk of RAM.
36333 +#ifdef CONFIG_X86_PAE
36334 +#define LAST_PKMAP 512
36336 +#define LAST_PKMAP 1024
+ * Ordering is:
+ *
+ * FIXADDR_TOP
+ * 	fixed_addresses
+ * FIXADDR_START
+ * 	temp fixed addresses
+ * FIXADDR_BOOT_START
+ * 	Persistent kmap area
+ * PKMAP_BASE
36353 +#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
36354 +#define LAST_PKMAP_MASK (LAST_PKMAP-1)
36355 +#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
36356 +#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
36358 +extern void * FASTCALL(kmap_high(struct page *page));
36359 +extern void FASTCALL(kunmap_high(struct page *page));
36361 +void *kmap(struct page *page);
36362 +void kunmap(struct page *page);
36363 +void *kmap_atomic(struct page *page, enum km_type type);
36364 +void *kmap_atomic_pte(struct page *page, enum km_type type);
36365 +void kunmap_atomic(void *kvaddr, enum km_type type);
36366 +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
36367 +struct page *kmap_atomic_to_page(void *ptr);
36369 +#define flush_cache_kmaps() do { } while (0)
36371 +void clear_highpage(struct page *);
36372 +static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
36374 + clear_highpage(page);
36376 +#define __HAVE_ARCH_CLEAR_HIGHPAGE
36377 +#define __HAVE_ARCH_CLEAR_USER_HIGHPAGE
36379 +void copy_highpage(struct page *to, struct page *from);
36380 +static inline void copy_user_highpage(struct page *to, struct page *from,
36381 + unsigned long vaddr)
36383 + copy_highpage(to, from);
36385 +#define __HAVE_ARCH_COPY_HIGHPAGE
36386 +#define __HAVE_ARCH_COPY_USER_HIGHPAGE
36388 +#endif /* __KERNEL__ */
36390 +#endif /* _ASM_HIGHMEM_H */
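Typical (hypothetical) access pattern through the kmap_atomic interface declared above:

	/* Sketch: zero a possibly-highmem page via a temporary atomic mapping.
	 * KM_USER0 comes from asm/kmap_types.h; memset assumes linux/string.h;
	 * no sleeping is allowed between map and unmap. */
	static void example_zero_page(struct page *page)
	{
		void *vaddr = kmap_atomic(page, KM_USER0);

		memset(vaddr, 0, PAGE_SIZE);
		kunmap_atomic(vaddr, KM_USER0);
	}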
36391 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_32.h
36392 ===================================================================
36393 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
36394 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_32.h 2008-11-25 12:22:34.000000000 +0100
36396 +/******************************************************************************
36399 + * Linux-specific hypervisor handling.
36401 + * Copyright (c) 2002-2004, K A Fraser
36403 + * This program is free software; you can redistribute it and/or
36404 + * modify it under the terms of the GNU General Public License version 2
36405 + * as published by the Free Software Foundation; or, when distributed
36406 + * separately from the Linux kernel or incorporated into other
36407 + * software packages, subject to the following license:
36409 + * Permission is hereby granted, free of charge, to any person obtaining a copy
36410 + * of this source file (the "Software"), to deal in the Software without
36411 + * restriction, including without limitation the rights to use, copy, modify,
36412 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
36413 + * and to permit persons to whom the Software is furnished to do so, subject to
36414 + * the following conditions:
36416 + * The above copyright notice and this permission notice shall be included in
36417 + * all copies or substantial portions of the Software.
36419 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
36420 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36421 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36422 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36423 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
36424 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
36425 + * IN THE SOFTWARE.
36428 +#ifndef __HYPERCALL_H__
36429 +#define __HYPERCALL_H__
36431 +#include <linux/string.h> /* memcpy() */
36432 +#include <linux/stringify.h>
36434 +#ifndef __HYPERVISOR_H__
36435 +# error "please don't include this file directly"
36439 +#define HYPERCALL_STR(name) \
36440 + "call hypercall_page + ("__stringify(__HYPERVISOR_##name)" * 32)"
36442 +#define HYPERCALL_STR(name) \
36443 + "mov hypercall_stubs,%%eax; " \
36444 + "add $("__stringify(__HYPERVISOR_##name)" * 32),%%eax; "\
36448 +#define _hypercall0(type, name) \
36452 + HYPERCALL_STR(name) \
36459 +#define _hypercall1(type, name, a1) \
36464 + HYPERCALL_STR(name) \
36465 + : "=a" (__res), "=b" (__ign1) \
36466 + : "1" ((long)(a1)) \
36471 +#define _hypercall2(type, name, a1, a2) \
36474 + long __ign1, __ign2; \
36476 + HYPERCALL_STR(name) \
36477 + : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \
36478 + : "1" ((long)(a1)), "2" ((long)(a2)) \
36483 +#define _hypercall3(type, name, a1, a2, a3) \
36486 + long __ign1, __ign2, __ign3; \
36488 + HYPERCALL_STR(name) \
36489 + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
36491 + : "1" ((long)(a1)), "2" ((long)(a2)), \
36492 + "3" ((long)(a3)) \
36497 +#define _hypercall4(type, name, a1, a2, a3, a4) \
36500 + long __ign1, __ign2, __ign3, __ign4; \
36502 + HYPERCALL_STR(name) \
36503 + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
36504 + "=d" (__ign3), "=S" (__ign4) \
36505 + : "1" ((long)(a1)), "2" ((long)(a2)), \
36506 + "3" ((long)(a3)), "4" ((long)(a4)) \
36511 +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
36514 + long __ign1, __ign2, __ign3, __ign4, __ign5; \
36516 + HYPERCALL_STR(name) \
36517 + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
36518 + "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \
36519 + : "1" ((long)(a1)), "2" ((long)(a2)), \
36520 + "3" ((long)(a3)), "4" ((long)(a4)), \
36521 + "5" ((long)(a5)) \
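The macros above place the hypercall number in %eax and arguments in %ebx..%edi; defining a new wrapper is mechanical. A hypothetical example (example_op is not a real hypercall; a real one needs a matching __HYPERVISOR_example_op number):

	/* Sketch: a two-argument wrapper in the style of those below. */
	static inline int __must_check
	HYPERVISOR_example_op(unsigned int cmd, void *arg)
	{
		return _hypercall2(int, example_op, cmd, arg);
	}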
36526 +static inline int __must_check
36527 +HYPERVISOR_set_trap_table(
36528 + const trap_info_t *table)
36530 + return _hypercall1(int, set_trap_table, table);
36533 +static inline int __must_check
36534 +HYPERVISOR_mmu_update(
36535 + mmu_update_t *req, unsigned int count, unsigned int *success_count,
36538 + return _hypercall4(int, mmu_update, req, count, success_count, domid);
36541 +static inline int __must_check
36542 +HYPERVISOR_mmuext_op(
36543 + struct mmuext_op *op, unsigned int count, unsigned int *success_count,
36546 + return _hypercall4(int, mmuext_op, op, count, success_count, domid);
36549 +static inline int __must_check
36550 +HYPERVISOR_set_gdt(
36551 + unsigned long *frame_list, unsigned int entries)
36553 + return _hypercall2(int, set_gdt, frame_list, entries);
36556 +static inline int __must_check
36557 +HYPERVISOR_stack_switch(
36558 + unsigned long ss, unsigned long esp)
36560 + return _hypercall2(int, stack_switch, ss, esp);
36563 +static inline int __must_check
36564 +HYPERVISOR_set_callbacks(
36565 + unsigned long event_selector, unsigned long event_address,
36566 + unsigned long failsafe_selector, unsigned long failsafe_address)
36568 + return _hypercall4(int, set_callbacks,
36569 + event_selector, event_address,
36570 + failsafe_selector, failsafe_address);
36574 +HYPERVISOR_fpu_taskswitch(
36577 + return _hypercall1(int, fpu_taskswitch, set);
36580 +static inline int __must_check
36581 +HYPERVISOR_sched_op_compat(
36582 + int cmd, unsigned long arg)
36584 + return _hypercall2(int, sched_op_compat, cmd, arg);
36587 +static inline int __must_check
36588 +HYPERVISOR_sched_op(
36589 + int cmd, void *arg)
36591 + return _hypercall2(int, sched_op, cmd, arg);
36594 +static inline long __must_check
36595 +HYPERVISOR_set_timer_op(
36598 + unsigned long timeout_hi = (unsigned long)(timeout>>32);
36599 + unsigned long timeout_lo = (unsigned long)timeout;
36600 + return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
36603 +static inline int __must_check
36604 +HYPERVISOR_platform_op(
36605 + struct xen_platform_op *platform_op)
36607 + platform_op->interface_version = XENPF_INTERFACE_VERSION;
36608 + return _hypercall1(int, platform_op, platform_op);
36611 +static inline int __must_check
36612 +HYPERVISOR_set_debugreg(
36613 + unsigned int reg, unsigned long value)
36615 + return _hypercall2(int, set_debugreg, reg, value);
36618 +static inline unsigned long __must_check
36619 +HYPERVISOR_get_debugreg(
36620 + unsigned int reg)
36622 + return _hypercall1(unsigned long, get_debugreg, reg);
36625 +static inline int __must_check
36626 +HYPERVISOR_update_descriptor(
36627 + u64 ma, u64 desc)
36629 + return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
36632 +static inline int __must_check
36633 +HYPERVISOR_memory_op(
36634 + unsigned int cmd, void *arg)
36636 + return _hypercall2(int, memory_op, cmd, arg);
36639 +static inline int __must_check
36640 +HYPERVISOR_multicall(
36641 + multicall_entry_t *call_list, unsigned int nr_calls)
36643 + return _hypercall2(int, multicall, call_list, nr_calls);
36646 +static inline int __must_check
36647 +HYPERVISOR_update_va_mapping(
36648 + unsigned long va, pte_t new_val, unsigned long flags)
36650 + unsigned long pte_hi = 0;
36651 +#ifdef CONFIG_X86_PAE
36652 + pte_hi = new_val.pte_high;
36654 + return _hypercall4(int, update_va_mapping, va,
36655 + new_val.pte_low, pte_hi, flags);
36658 +static inline int __must_check
36659 +HYPERVISOR_event_channel_op(
36660 + int cmd, void *arg)
36662 + int rc = _hypercall2(int, event_channel_op, cmd, arg);
36664 +#if CONFIG_XEN_COMPAT <= 0x030002
36665 + if (unlikely(rc == -ENOSYS)) {
36666 + struct evtchn_op op;
36668 + memcpy(&op.u, arg, sizeof(op.u));
36669 + rc = _hypercall1(int, event_channel_op_compat, &op);
36670 + memcpy(arg, &op.u, sizeof(op.u));
36677 +static inline int __must_check
36678 +HYPERVISOR_xen_version(
36679 + int cmd, void *arg)
36681 + return _hypercall2(int, xen_version, cmd, arg);
36684 +static inline int __must_check
36685 +HYPERVISOR_console_io(
36686 + int cmd, unsigned int count, char *str)
36688 + return _hypercall3(int, console_io, cmd, count, str);
36691 +static inline int __must_check
36692 +HYPERVISOR_physdev_op(
36693 + int cmd, void *arg)
36695 + int rc = _hypercall2(int, physdev_op, cmd, arg);
36697 +#if CONFIG_XEN_COMPAT <= 0x030002
36698 + if (unlikely(rc == -ENOSYS)) {
36699 + struct physdev_op op;
36701 + memcpy(&op.u, arg, sizeof(op.u));
36702 + rc = _hypercall1(int, physdev_op_compat, &op);
36703 + memcpy(arg, &op.u, sizeof(op.u));
36710 +static inline int __must_check
36711 +HYPERVISOR_grant_table_op(
36712 + unsigned int cmd, void *uop, unsigned int count)
36714 + return _hypercall3(int, grant_table_op, cmd, uop, count);
36717 +static inline int __must_check
36718 +HYPERVISOR_update_va_mapping_otherdomain(
36719 + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
36721 + unsigned long pte_hi = 0;
36722 +#ifdef CONFIG_X86_PAE
36723 + pte_hi = new_val.pte_high;
36725 + return _hypercall5(int, update_va_mapping_otherdomain, va,
36726 + new_val.pte_low, pte_hi, flags, domid);
36729 +static inline int __must_check
36730 +HYPERVISOR_vm_assist(
36731 + unsigned int cmd, unsigned int type)
36733 + return _hypercall2(int, vm_assist, cmd, type);
36736 +static inline int __must_check
36737 +HYPERVISOR_vcpu_op(
36738 + int cmd, unsigned int vcpuid, void *extra_args)
36740 + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
36743 +static inline int __must_check
36744 +HYPERVISOR_suspend(
36745 + unsigned long srec)
36747 + struct sched_shutdown sched_shutdown = {
36748 + .reason = SHUTDOWN_suspend
36751 + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
36752 + &sched_shutdown, srec);
36754 +#if CONFIG_XEN_COMPAT <= 0x030002
36755 + if (rc == -ENOSYS)
36756 + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
36757 + SHUTDOWN_suspend, srec);
36763 +#if CONFIG_XEN_COMPAT <= 0x030002
36764 +static inline int __must_check
36765 +HYPERVISOR_nmi_op(
36766 + unsigned long op, void *arg)
36768 + return _hypercall2(int, nmi_op, op, arg);
36772 +#ifndef CONFIG_XEN
36773 +static inline unsigned long __must_check
36774 +HYPERVISOR_hvm_op(
36775 + int op, void *arg)
36777 + return _hypercall2(unsigned long, hvm_op, op, arg);
36781 +static inline int __must_check
36782 +HYPERVISOR_callback_op(
36783 + int cmd, const void *arg)
36785 + return _hypercall2(int, callback_op, cmd, arg);
36788 +static inline int __must_check
36789 +HYPERVISOR_xenoprof_op(
36790 + int op, void *arg)
36792 + return _hypercall2(int, xenoprof_op, op, arg);
36795 +static inline int __must_check
36796 +HYPERVISOR_kexec_op(
36797 + unsigned long op, void *args)
36799 + return _hypercall2(int, kexec_op, op, args);
36804 +#endif /* __HYPERCALL_H__ */
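
The wrappers above share one compatibility idiom: issue the current hypercall first and, when built with CONFIG_XEN_COMPAT <= 0x030002, retry through the pre-3.0.3 entry point if the hypervisor answers -ENOSYS. A minimal user-space sketch of just that fallback shape (new_op() and compat_op() are made-up stand-ins, not real hypercalls):

/* Sketch of the -ENOSYS fallback used by the event_channel_op, physdev_op
 * and sched_op wrappers above; new_op()/compat_op() are hypothetical stubs. */
#include <stdio.h>
#include <errno.h>

static int new_op(int cmd)    { (void)cmd; return -ENOSYS; } /* "old hypervisor" */
static int compat_op(int cmd) { (void)cmd; return 0; }       /* legacy entry point */

static int do_op(int cmd)
{
        int rc = new_op(cmd);
#if 1 /* models CONFIG_XEN_COMPAT <= 0x030002 */
        if (rc == -ENOSYS)
                rc = compat_op(cmd);
#endif
        return rc;
}

int main(void)
{
        printf("rc = %d\n", do_op(42)); /* prints rc = 0 via the fallback */
        return 0;
}
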
36805 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/hypervisor.h
36806 ===================================================================
36807 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
36808 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/hypervisor.h 2008-02-20 09:32:49.000000000 +0100
36810 +/******************************************************************************
36813 + * Linux-specific hypervisor handling.
36815 + * Copyright (c) 2002-2004, K A Fraser
36817 + * This program is free software; you can redistribute it and/or
36818 + * modify it under the terms of the GNU General Public License version 2
36819 + * as published by the Free Software Foundation; or, when distributed
36820 + * separately from the Linux kernel or incorporated into other
36821 + * software packages, subject to the following license:
36823 + * Permission is hereby granted, free of charge, to any person obtaining a copy
36824 + * of this source file (the "Software"), to deal in the Software without
36825 + * restriction, including without limitation the rights to use, copy, modify,
36826 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
36827 + * and to permit persons to whom the Software is furnished to do so, subject to
36828 + * the following conditions:
36830 + * The above copyright notice and this permission notice shall be included in
36831 + * all copies or substantial portions of the Software.
36833 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
36834 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36835 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36836 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36837 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
36838 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
36839 + * IN THE SOFTWARE.
36842 +#ifndef __HYPERVISOR_H__
36843 +#define __HYPERVISOR_H__
36845 +#include <linux/types.h>
36846 +#include <linux/kernel.h>
36847 +#include <linux/version.h>
36848 +#include <linux/errno.h>
36849 +#include <xen/interface/xen.h>
36850 +#include <xen/interface/platform.h>
36851 +#include <xen/interface/event_channel.h>
36852 +#include <xen/interface/physdev.h>
36853 +#include <xen/interface/sched.h>
36854 +#include <xen/interface/nmi.h>
36855 +#include <asm/ptrace.h>
36856 +#include <asm/page.h>
36857 +#if defined(__i386__)
36858 +# ifdef CONFIG_X86_PAE
36859 +# include <asm-generic/pgtable-nopud.h>
36861 +# include <asm-generic/pgtable-nopmd.h>
36863 +#elif defined(__x86_64__) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
36864 +# include <asm-generic/pgtable-nopud.h>
36867 +extern shared_info_t *HYPERVISOR_shared_info;
36869 +#define vcpu_info(cpu) (HYPERVISOR_shared_info->vcpu_info + (cpu))
36870 +#ifdef CONFIG_SMP
36871 +#define current_vcpu_info() vcpu_info(smp_processor_id())
36872 +#else
36873 +#define current_vcpu_info() vcpu_info(0)
36874 +#endif
36876 +#ifdef CONFIG_X86_32
36877 +extern unsigned long hypervisor_virt_start;
36880 +/* arch/xen/i386/kernel/setup.c */
36881 +extern start_info_t *xen_start_info;
36882 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
36883 +#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
36884 +#else
36885 +#define is_initial_xendomain() 0
36886 +#endif
36888 +/* arch/xen/kernel/evtchn.c */
36889 +/* Force a proper event-channel callback from Xen. */
36890 +void force_evtchn_callback(void);
36892 +/* arch/xen/kernel/process.c */
36893 +void xen_cpu_idle (void);
36895 +/* arch/xen/i386/kernel/hypervisor.c */
36896 +void do_hypervisor_callback(struct pt_regs *regs);
36898 +/* arch/xen/i386/mm/hypervisor.c */
36900 + * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should already
36901 + * be MACHINE addresses.
36904 +void xen_pt_switch(unsigned long ptr);
36905 +void xen_new_user_pt(unsigned long ptr); /* x86_64 only */
36906 +void xen_load_gs(unsigned int selector); /* x86_64 only */
36907 +void xen_tlb_flush(void);
36908 +void xen_invlpg(unsigned long ptr);
36910 +void xen_l1_entry_update(pte_t *ptr, pte_t val);
36911 +void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
36912 +void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */
36913 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */
36914 +void xen_pgd_pin(unsigned long ptr);
36915 +void xen_pgd_unpin(unsigned long ptr);
36917 +void xen_set_ldt(const void *ptr, unsigned int ents);
36920 +#include <linux/cpumask.h>
36921 +void xen_tlb_flush_all(void);
36922 +void xen_invlpg_all(unsigned long ptr);
36923 +void xen_tlb_flush_mask(cpumask_t *mask);
36924 +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr);
36927 +/* Returns zero on success else negative errno. */
36928 +int xen_create_contiguous_region(
36929 + unsigned long vstart, unsigned int order, unsigned int address_bits);
36930 +void xen_destroy_contiguous_region(
36931 + unsigned long vstart, unsigned int order);
36935 +int xen_limit_pages_to_max_mfn(
36936 + struct page *pages, unsigned int order, unsigned int address_bits);
36938 +/* Turn jiffies into Xen system time. */
36939 +u64 jiffies_to_st(unsigned long jiffies);
36941 +#ifdef CONFIG_XEN_SCRUB_PAGES
36942 +void scrub_pages(void *, unsigned int);
36944 +#define scrub_pages(_p,_n) ((void)0)
36947 +#include <xen/hypercall.h>
36949 +#if defined(CONFIG_X86_64)
36950 +#define MULTI_UVMFLAGS_INDEX 2
36951 +#define MULTI_UVMDOMID_INDEX 3
36952 +#else
36953 +#define MULTI_UVMFLAGS_INDEX 3
36954 +#define MULTI_UVMDOMID_INDEX 4
36955 +#endif
36957 +#ifdef CONFIG_XEN
36958 +#define is_running_on_xen() 1
36959 +#else
36960 +extern char *hypercall_stubs;
36961 +#define is_running_on_xen() (!!hypercall_stubs)
36962 +#endif
36964 +static inline void
36965 +HYPERVISOR_yield(
36966 + void)
36968 + int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
36970 +#if CONFIG_XEN_COMPAT <= 0x030002
36971 + if (rc == -ENOSYS)
36972 + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
36978 +static inline void
36979 +HYPERVISOR_block(
36980 + void)
36982 + int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
36984 +#if CONFIG_XEN_COMPAT <= 0x030002
36985 + if (rc == -ENOSYS)
36986 + rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0);
36992 +static inline void /*__noreturn*/
36993 +HYPERVISOR_shutdown(
36994 + unsigned int reason)
36996 + struct sched_shutdown sched_shutdown = {
36997 + .reason = reason
37000 + VOID(HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown));
37001 +#if CONFIG_XEN_COMPAT <= 0x030002
37002 + VOID(HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason));
37004 + /* Don't recurse needlessly. */
37005 + BUG_ON(reason != SHUTDOWN_crash);
37009 +static inline int __must_check
37010 +HYPERVISOR_poll(
37011 + evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
37014 + struct sched_poll sched_poll = {
37015 + .nr_ports = nr_ports,
37016 + .timeout = jiffies_to_st(timeout)
37018 + set_xen_guest_handle(sched_poll.ports, ports);
37020 + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
37021 +#if CONFIG_XEN_COMPAT <= 0x030002
37022 + if (rc == -ENOSYS)
37023 + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
37031 +static inline void
37032 +MULTI_update_va_mapping(
37033 + multicall_entry_t *mcl, unsigned long va,
37034 + pte_t new_val, unsigned long flags)
37036 + mcl->op = __HYPERVISOR_update_va_mapping;
37037 + mcl->args[0] = va;
37038 +#if defined(CONFIG_X86_64)
37039 + mcl->args[1] = new_val.pte;
37040 +#elif defined(CONFIG_X86_PAE)
37041 + mcl->args[1] = new_val.pte_low;
37042 + mcl->args[2] = new_val.pte_high;
37044 + mcl->args[1] = new_val.pte_low;
37045 + mcl->args[2] = 0;
37047 + mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
37050 +static inline void
37051 +MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd,
37052 + void *uop, unsigned int count)
37054 + mcl->op = __HYPERVISOR_grant_table_op;
37055 + mcl->args[0] = cmd;
37056 + mcl->args[1] = (unsigned long)uop;
37057 + mcl->args[2] = count;
37060 +#else /* !defined(CONFIG_XEN) */
37062 +/* Multicalls not supported for HVM guests. */
37063 +#define MULTI_update_va_mapping(a,b,c,d) ((void)0)
37064 +#define MULTI_grant_table_op(a,b,c,d) ((void)0)
37068 +#endif /* __HYPERVISOR_H__ */
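
MULTI_update_va_mapping() above needs MULTI_UVMFLAGS_INDEX because the flags argument lands in a different multicall slot depending on how the PTE travels: 64-bit builds pass it whole in args[1], while 32-bit PAE splits it across args[1] and args[2]. A sketch of the 32-bit packing with local stand-in types (the op number is invented and nothing is really submitted):

/* Stand-in multicall entry; mirrors the packing above, not the Xen ABI. */
#include <stdio.h>
#include <stdint.h>

struct mc_entry { unsigned long op; unsigned long args[6]; };

#define UVMFLAGS_INDEX_32 3 /* pte split across args[1..2], flags in args[3] */

static void pack32(struct mc_entry *mcl, unsigned long va,
                   uint64_t pte, unsigned long flags)
{
        mcl->op = 14;                         /* hypothetical op number */
        mcl->args[0] = va;
        mcl->args[1] = (uint32_t)pte;         /* pte_low  */
        mcl->args[2] = (uint32_t)(pte >> 32); /* pte_high */
        mcl->args[UVMFLAGS_INDEX_32] = flags;
}

int main(void)
{
        struct mc_entry e;
        pack32(&e, 0xc0000000ul, 0x8000000012345067ull, 2 /* flags */);
        printf("args: %#lx %#lx %#lx %#lx\n",
               e.args[0], e.args[1], e.args[2], e.args[3]);
        return 0;
}
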
37069 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/io_32.h
37070 ===================================================================
37071 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
37072 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/io_32.h 2007-08-16 18:07:01.000000000 +0200
37077 +#include <linux/string.h>
37078 +#include <linux/compiler.h>
37081 + * This file contains the definitions for the x86 IO instructions
37082 + * inb/inw/inl/outb/outw/outl and the "string versions" of the same
37083 + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
37084 + * versions of the single-IO instructions (inb_p/inw_p/..).
37086 + * This file is not meant to be obfuscating: it's just complicated
37087 + * to (a) handle it all in a way that makes gcc able to optimize it
37088 + * as well as possible and (b) avoid writing the same thing
37089 + * over and over again with slight variations and possibly making a
37090 + * mistake somewhere.
37094 + * Thanks to James van Artsdalen for a better timing-fix than
37095 + * the two short jumps: using outb's to a nonexistent port seems
37096 + * to guarantee better timings even on fast machines.
37098 + * On the other hand, I'd like to be sure of a non-existent port:
37099 + * I feel a bit unsafe about using 0x80 (should be safe, though)
37105 + * Bit simplified and optimized by Jan Hubicka
37106 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
37108 + * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
37109 + * isa_read[wl] and isa_write[wl] fixed
37110 + * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
37113 +#define IO_SPACE_LIMIT 0xffff
37115 +#define XQUAD_PORTIO_BASE 0xfe400000
37116 +#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
37120 +#include <asm-generic/iomap.h>
37122 +#include <linux/vmalloc.h>
37123 +#include <asm/fixmap.h>
37126 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
37129 +#define xlate_dev_mem_ptr(p) __va(p)
37132 + * Convert a virtual cached pointer to an uncached pointer
37134 +#define xlate_dev_kmem_ptr(p) p
37137 + * virt_to_phys - map virtual addresses to physical
37138 + * @address: address to remap
37140 + * The returned physical address is the physical (CPU) mapping for
37141 + * the memory address given. It is only valid to use this function on
37142 + * addresses directly mapped or allocated via kmalloc.
37144 + * This function does not give bus mappings for DMA transfers. In
37145 + * almost all conceivable cases a device driver should not be using
37149 +static inline unsigned long virt_to_phys(volatile void * address)
37151 + return __pa(address);
37155 + * phys_to_virt - map physical address to virtual
37156 + * @address: address to remap
37158 + * The returned virtual address is a current CPU mapping for
37159 + * the memory address given. It is only valid to use this function on
37160 + * addresses that have a kernel mapping
37162 + * This function does not handle bus mappings for DMA transfers. In
37163 + * almost all conceivable cases a device driver should not be using
37167 +static inline void * phys_to_virt(unsigned long address)
37169 + return __va(address);
37173 + * Change "struct page" to physical address.
37175 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
37176 +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
37177 +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
37179 +#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \
37180 + (unsigned long) bio_offset((bio)))
37181 +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
37182 + (unsigned long) (bv)->bv_offset)
37184 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
37185 + (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
37186 + ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
37187 + bvec_to_pseudophys((vec2))))
37189 +extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
37192 + * ioremap - map bus memory into CPU space
37193 + * @offset: bus address of the memory
37194 + * @size: size of the resource to map
37196 + * ioremap performs a platform specific sequence of operations to
37197 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
37198 + * writew/writel functions and the other mmio helpers. The returned
37199 + * address is not guaranteed to be usable directly as a virtual
37203 +static inline void __iomem * ioremap(unsigned long offset, unsigned long size)
37205 + return __ioremap(offset, size, 0);
37208 +extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size);
37209 +extern void iounmap(volatile void __iomem *addr);
37212 + * bt_ioremap() and bt_iounmap() are for temporary early boot-time
37213 + * mappings, before the real ioremap() is functional.
37214 + * A boot-time mapping is currently limited to at most 16 pages.
37216 +extern void *bt_ioremap(unsigned long offset, unsigned long size);
37217 +extern void bt_iounmap(void *addr, unsigned long size);
37219 +/* Use early IO mappings for DMI because it's initialized early */
37220 +#define dmi_ioremap bt_ioremap
37221 +#define dmi_iounmap bt_iounmap
37222 +#define dmi_alloc alloc_bootmem
37225 + * ISA I/O bus memory addresses are 1:1 with the physical address.
37227 +#define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); })
37228 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
37229 +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
37232 + * However PCI ones are not necessarily 1:1 and therefore these interfaces
37233 + * are forbidden in portable PCI drivers.
37235 + * Allow them on x86 for legacy drivers, though.
37237 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
37238 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
37241 + * readX/writeX() are used to access memory mapped devices. On some
37242 + * architectures the memory mapped IO stuff needs to be accessed
37243 + * differently. On the x86 architecture, we just read/write the
37244 + * memory location directly.
37247 +static inline unsigned char readb(const volatile void __iomem *addr)
37249 + return *(volatile unsigned char __force *) addr;
37251 +static inline unsigned short readw(const volatile void __iomem *addr)
37253 + return *(volatile unsigned short __force *) addr;
37255 +static inline unsigned int readl(const volatile void __iomem *addr)
37257 + return *(volatile unsigned int __force *) addr;
37259 +#define readb_relaxed(addr) readb(addr)
37260 +#define readw_relaxed(addr) readw(addr)
37261 +#define readl_relaxed(addr) readl(addr)
37262 +#define __raw_readb readb
37263 +#define __raw_readw readw
37264 +#define __raw_readl readl
37266 +static inline void writeb(unsigned char b, volatile void __iomem *addr)
37268 + *(volatile unsigned char __force *) addr = b;
37270 +static inline void writew(unsigned short b, volatile void __iomem *addr)
37272 + *(volatile unsigned short __force *) addr = b;
37274 +static inline void writel(unsigned int b, volatile void __iomem *addr)
37276 + *(volatile unsigned int __force *) addr = b;
37278 +#define __raw_writeb writeb
37279 +#define __raw_writew writew
37280 +#define __raw_writel writel
37284 +static inline void memset_io(volatile void __iomem *addr, unsigned char val, int count)
37286 + memset((void __force *) addr, val, count);
37288 +static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
37290 + __memcpy(dst, (void __force *) src, count);
37292 +static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int count)
37294 + __memcpy((void __force *) dst, src, count);
37298 + * ISA space is 'always mapped' on a typical x86 system, no need to
37299 + * explicitly ioremap() it. The fact that the ISA IO space is mapped
37300 + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
37301 + * are physical addresses. The following constant pointer can be
37302 + * used as the IO-area pointer (it can be iounmapped as well, so the
37303 + * analogy with PCI is quite large):
37305 +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
37308 + * Again, i386 does not require mem IO specific function.
37311 +#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(b),(c),(d))
37314 + * check_signature - find BIOS signatures
37315 + * @io_addr: mmio address to check
37316 + * @signature: signature block
37317 + * @length: length of signature
37319 + * Perform a signature comparison with the mmio address io_addr. This
37320 + * address should have been obtained by ioremap.
37321 + * Returns 1 on a match.
37324 +static inline int check_signature(volatile void __iomem * io_addr,
37325 + const unsigned char *signature, int length)
37329 + if (readb(io_addr) != *signature)
37334 + } while (length);
37341 + * Cache management
37343 + * This is needed for two cases
37344 + * 1. Out of order aware processors
37345 + * 2. Accidentally out of order processors (PPro errata #51)
37348 +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
37350 +static inline void flush_write_buffers(void)
37352 + __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
37355 +#define dma_cache_inv(_start,_size) flush_write_buffers()
37356 +#define dma_cache_wback(_start,_size) flush_write_buffers()
37357 +#define dma_cache_wback_inv(_start,_size) flush_write_buffers()
37361 +/* Nothing to do */
37363 +#define dma_cache_inv(_start,_size) do { } while (0)
37364 +#define dma_cache_wback(_start,_size) do { } while (0)
37365 +#define dma_cache_wback_inv(_start,_size) do { } while (0)
37366 +#define flush_write_buffers()
37370 +#endif /* __KERNEL__ */
37372 +#ifdef SLOW_IO_BY_JUMPING
37373 +#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:"
37375 +#define __SLOW_DOWN_IO "outb %%al,$0x80;"
37378 +static inline void slow_down_io(void) {
37379 + __asm__ __volatile__(
37381 +#ifdef REALLY_SLOW_IO
37382 + __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
37387 +#ifdef CONFIG_X86_NUMAQ
37388 +extern void *xquad_portio; /* Where the IO area was mapped */
37389 +#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
37390 +#define __BUILDIO(bwl,bw,type) \
37391 +static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \
37392 + if (xquad_portio) \
37393 + write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \
37395 + out##bwl##_local(value, port); \
37397 +static inline void out##bwl(unsigned type value, int port) { \
37398 + out##bwl##_quad(value, port, 0); \
37400 +static inline unsigned type in##bwl##_quad(int port, int quad) { \
37401 + if (xquad_portio) \
37402 + return read##bwl(XQUAD_PORT_ADDR(port, quad)); \
37404 + return in##bwl##_local(port); \
37406 +static inline unsigned type in##bwl(int port) { \
37407 + return in##bwl##_quad(port, 0); \
37410 +#define __BUILDIO(bwl,bw,type) \
37411 +static inline void out##bwl(unsigned type value, int port) { \
37412 + out##bwl##_local(value, port); \
37414 +static inline unsigned type in##bwl(int port) { \
37415 + return in##bwl##_local(port); \
37420 +#define BUILDIO(bwl,bw,type) \
37421 +static inline void out##bwl##_local(unsigned type value, int port) { \
37422 + __asm__ __volatile__("out" #bwl " %" #bw "0, %w1" : : "a"(value), "Nd"(port)); \
37424 +static inline unsigned type in##bwl##_local(int port) { \
37425 + unsigned type value; \
37426 + __asm__ __volatile__("in" #bwl " %w1, %" #bw "0" : "=a"(value) : "Nd"(port)); \
37429 +static inline void out##bwl##_local_p(unsigned type value, int port) { \
37430 + out##bwl##_local(value, port); \
37431 + slow_down_io(); \
37433 +static inline unsigned type in##bwl##_local_p(int port) { \
37434 + unsigned type value = in##bwl##_local(port); \
37435 + slow_down_io(); \
37438 +__BUILDIO(bwl,bw,type) \
37439 +static inline void out##bwl##_p(unsigned type value, int port) { \
37440 + out##bwl(value, port); \
37441 + slow_down_io(); \
37443 +static inline unsigned type in##bwl##_p(int port) { \
37444 + unsigned type value = in##bwl(port); \
37445 + slow_down_io(); \
37448 +static inline void outs##bwl(int port, const void *addr, unsigned long count) { \
37449 + __asm__ __volatile__("rep; outs" #bwl : "+S"(addr), "+c"(count) : "d"(port)); \
37451 +static inline void ins##bwl(int port, void *addr, unsigned long count) { \
37452 + __asm__ __volatile__("rep; ins" #bwl : "+D"(addr), "+c"(count) : "d"(port)); \
37456 +BUILDIO(w,w,short)
37459 +/* We will be supplying our own /dev/mem implementation */
37460 +#define ARCH_HAS_DEV_MEM
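
The BUILDIO()/__BUILDIO() macros above stamp out the whole inb/outb family by token-pasting the size suffix and operand type into each definition. A reduced model of the same trick that compiles and runs anywhere, with real port I/O replaced by a backing array:

/* Reduced model of the BUILDIO token-pasting idiom; the "ports" are a
 * fake array so this is runnable outside the kernel. */
#include <stdio.h>

static unsigned int fake_ports[65536];

#define BUILDIO(bwl, type)                                   \
static void out##bwl(unsigned type value, int port)          \
{                                                            \
        fake_ports[port] = value;                            \
}                                                            \
static unsigned type in##bwl(int port)                       \
{                                                            \
        return (unsigned type)fake_ports[port];              \
}

BUILDIO(b, char)
BUILDIO(w, short)
BUILDIO(l, int)

int main(void)
{
        outb(0x80, 0x70);
        outl(0xdeadbeef, 0xcf8);
        printf("inb(0x70)=%#x inl(0xcf8)=%#x\n", inb(0x70), inl(0xcf8));
        return 0;
}
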
37463 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_32.h
37464 ===================================================================
37465 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
37466 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_32.h 2007-06-12 13:14:02.000000000 +0200
37469 + * include/asm-i386/irqflags.h
37471 + * IRQ flags handling
37473 + * This file gets included from lowlevel asm headers too, to provide
37474 + * wrapped versions of the local_irq_*() APIs, based on the
37475 + * raw_local_irq_*() functions from the lowlevel headers.
37477 +#ifndef _ASM_IRQFLAGS_H
37478 +#define _ASM_IRQFLAGS_H
37480 +#ifndef __ASSEMBLY__
37483 + * The use of 'barrier' in the following reflects their use as local-lock
37484 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
37485 + * critical operations are executed. All critical operations must complete
37486 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
37487 + * includes these barriers, for example.
37490 +#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
37492 +#define raw_local_save_flags(flags) \
37493 + do { (flags) = __raw_local_save_flags(); } while (0)
37495 +#define raw_local_irq_restore(x) \
37497 + vcpu_info_t *_vcpu; \
37499 + _vcpu = current_vcpu_info(); \
37500 + if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
37501 + barrier(); /* unmask then check (avoid races) */ \
37502 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
37503 + force_evtchn_callback(); \
37507 +#define raw_local_irq_disable() \
37509 + current_vcpu_info()->evtchn_upcall_mask = 1; \
37513 +#define raw_local_irq_enable() \
37515 + vcpu_info_t *_vcpu; \
37517 + _vcpu = current_vcpu_info(); \
37518 + _vcpu->evtchn_upcall_mask = 0; \
37519 + barrier(); /* unmask then check (avoid races) */ \
37520 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
37521 + force_evtchn_callback(); \
37525 + * Used in the idle loop; sti takes one instruction cycle
37526 + * to complete:
37528 +void raw_safe_halt(void);
37531 + * Used when interrupts are already enabled or to
37532 + * shut down the processor:
37536 +static inline int raw_irqs_disabled_flags(unsigned long flags)
37538 + return (flags != 0);
37541 +#define raw_irqs_disabled() \
37543 + unsigned long flags = __raw_local_save_flags(); \
37545 + raw_irqs_disabled_flags(flags); \
37549 + * For spinlocks, etc:
37551 +#define __raw_local_irq_save() \
37553 + unsigned long flags = __raw_local_save_flags(); \
37555 + raw_local_irq_disable(); \
37560 +#define raw_local_irq_save(flags) \
37561 + do { (flags) = __raw_local_irq_save(); } while (0)
37563 +#endif /* __ASSEMBLY__ */
37566 + * Do the CPU's IRQ-state tracing from assembly code. We call a
37567 + * C function, so save all the C-clobbered registers:
37569 +#ifdef CONFIG_TRACE_IRQFLAGS
37571 +# define TRACE_IRQS_ON \
37575 + call trace_hardirqs_on; \
37580 +# define TRACE_IRQS_OFF \
37584 + call trace_hardirqs_off; \
37590 +# define TRACE_IRQS_ON
37591 +# define TRACE_IRQS_OFF
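
The macros above replace the EFLAGS.IF-based cli/sti with the per-vCPU evtchn_upcall_mask byte; the delicate part is the ordering spelled out in the comments: clear the mask first, then look at evtchn_upcall_pending, so an event that arrived while masked is delivered rather than lost. A stand-alone model of that protocol (the vcpu struct and force_callback() are local stand-ins for vcpu_info and the real upcall):

/* Model of the mask-based enable/disable/restore from irqflags_32.h. */
#include <stdio.h>

struct vcpu { unsigned char mask, pending; };
static struct vcpu vcpu; /* mask == 0: "interrupts" enabled */

static void force_callback(struct vcpu *v)
{
        printf("delivering pending event\n");
        v->pending = 0;
}

static unsigned long save_flags(void) { return vcpu.mask; }
static void irq_disable(void)         { vcpu.mask = 1; }

static void irq_restore(unsigned long flags)
{
        vcpu.mask = (unsigned char)flags;
        /* Unmask first, THEN test pending: an event that arrived while
         * masked is picked up here instead of being lost. */
        if (vcpu.mask == 0 && vcpu.pending)
                force_callback(&vcpu);
}

int main(void)
{
        unsigned long flags = save_flags(); /* currently enabled */
        irq_disable();
        vcpu.pending = 1;                   /* event arrives while masked */
        irq_restore(flags);                 /* re-enable: event delivered */
        return 0;
}
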
37595 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_32.h
37596 ===================================================================
37597 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
37598 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_32.h 2008-04-02 12:34:02.000000000 +0200
37600 +#ifndef _I386_MADDR_H
37601 +#define _I386_MADDR_H
37603 +#include <xen/features.h>
37604 +#include <xen/interface/xen.h>
37606 +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
37607 +#define INVALID_P2M_ENTRY (~0UL)
37608 +#define FOREIGN_FRAME_BIT (1UL<<31)
37609 +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
37611 +/* Definitions for machine and pseudophysical addresses. */
37612 +#ifdef CONFIG_X86_PAE
37613 +typedef unsigned long long paddr_t;
37614 +typedef unsigned long long maddr_t;
37616 +typedef unsigned long paddr_t;
37617 +typedef unsigned long maddr_t;
37622 +extern unsigned long *phys_to_machine_mapping;
37623 +extern unsigned long max_mapnr;
37625 +#undef machine_to_phys_mapping
37626 +extern unsigned long *machine_to_phys_mapping;
37627 +extern unsigned int machine_to_phys_order;
37629 +static inline unsigned long pfn_to_mfn(unsigned long pfn)
37631 + if (xen_feature(XENFEAT_auto_translated_physmap))
37633 + BUG_ON(max_mapnr && pfn >= max_mapnr);
37634 + return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
37637 +static inline int phys_to_machine_mapping_valid(unsigned long pfn)
37639 + if (xen_feature(XENFEAT_auto_translated_physmap))
37641 + BUG_ON(max_mapnr && pfn >= max_mapnr);
37642 + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
37645 +static inline unsigned long mfn_to_pfn(unsigned long mfn)
37647 + unsigned long pfn;
37649 + if (xen_feature(XENFEAT_auto_translated_physmap))
37652 + if (unlikely((mfn >> machine_to_phys_order) != 0))
37653 + return max_mapnr;
37655 + /* The array access can fail (e.g., device space beyond end of RAM). */
37657 + "1: movl %1,%0\n"
37659 + ".section .fixup,\"ax\"\n"
37660 + "3: movl %2,%0\n"
37663 + ".section __ex_table,\"a\"\n"
37668 + : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
37674 + * We detect special mappings in one of two ways:
37675 + * 1. If the MFN is an I/O page then Xen will set the m2p entry
37676 + * to be outside our maximum possible pseudophys range.
37677 + * 2. If the MFN belongs to a different domain then we will certainly
37678 + * not have MFN in our p2m table. Conversely, if the page is ours,
37679 + * then we'll have p2m(m2p(MFN))==MFN.
37680 + * If we detect a special mapping then it doesn't have a 'struct page'.
37681 + * We force !pfn_valid() by returning an out-of-range pointer.
37683 + * NB. These checks require that, for any MFN that is not in our reservation,
37684 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
37685 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
37686 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
37688 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
37689 + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
37690 + * require. In all the cases we care about, the FOREIGN_FRAME bit is
37691 + * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
37693 +static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
37695 + unsigned long pfn = mfn_to_pfn(mfn);
37696 + if ((pfn < max_mapnr)
37697 + && !xen_feature(XENFEAT_auto_translated_physmap)
37698 + && (phys_to_machine_mapping[pfn] != mfn))
37699 + return max_mapnr; /* force !pfn_valid() */
37703 +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
37705 + BUG_ON(max_mapnr && pfn >= max_mapnr);
37706 + if (xen_feature(XENFEAT_auto_translated_physmap)) {
37707 + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
37710 + phys_to_machine_mapping[pfn] = mfn;
37713 +static inline maddr_t phys_to_machine(paddr_t phys)
37715 + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
37716 + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
37720 +static inline paddr_t machine_to_phys(maddr_t machine)
37722 + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
37723 + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
37727 +#ifdef CONFIG_X86_PAE
37728 +static inline paddr_t pte_phys_to_machine(paddr_t phys)
37731 + * In PAE mode, the NX bit needs to be dealt with in the value
37732 + * passed to pfn_to_mfn(). On x86_64, we need to mask it off,
37733 + * but for i386 the conversion to ulong for the argument will
37734 + * clip it off.
37736 + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
37737 + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK);
37741 +static inline paddr_t pte_machine_to_phys(maddr_t machine)
37744 + * In PAE mode, the NX bit needs to be dealt with in the value
37745 + * passed to mfn_to_pfn(). On x86_64, we need to mask it off,
37746 + * but for i386 the conversion to ulong for the argument will
37747 + * clip it off.
37749 + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
37750 + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
37755 +#ifdef CONFIG_X86_PAE
37756 +#define __pte_ma(x) ((pte_t) { (x), (maddr_t)(x) >> 32 } )
37757 +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
37761 + pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
37762 + (pgprot_val(pgprot) >> 32);
37763 + pte.pte_high &= (__supported_pte_mask >> 32);
37764 + pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
37765 + __supported_pte_mask;
37769 +#define __pte_ma(x) ((pte_t) { (x) } )
37770 +#define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
37773 +#else /* !CONFIG_XEN */
37775 +#define pfn_to_mfn(pfn) (pfn)
37776 +#define mfn_to_pfn(mfn) (mfn)
37777 +#define mfn_to_local_pfn(mfn) (mfn)
37778 +#define set_phys_to_machine(pfn, mfn) ((void)0)
37779 +#define phys_to_machine_mapping_valid(pfn) (1)
37780 +#define phys_to_machine(phys) ((maddr_t)(phys))
37781 +#define machine_to_phys(mach) ((paddr_t)(mach))
37782 +#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot)
37783 +#define __pte_ma(x) __pte(x)
37785 +#endif /* !CONFIG_XEN */
37787 +/* VIRT <-> MACHINE conversion */
37788 +#define virt_to_machine(v) (phys_to_machine(__pa(v)))
37789 +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
37790 +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
37792 +#endif /* _I386_MADDR_H */
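
phys_to_machine() and machine_to_phys() above are pure frame-number table lookups with the sub-page offset carried through untouched. A toy rendering with tiny made-up p2m/m2p tables, to make the arithmetic concrete:

/* Toy pseudo-physical <-> machine translation in the style of maddr_32.h;
 * the four-entry p2m table and its inverse are invented values. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_MASK  (~((1ul << PAGE_SHIFT) - 1))

static unsigned long p2m[4] = { 7, 42, 3, 9 }; /* pfn -> mfn */
static unsigned long m2p[64];                  /* mfn -> pfn */

static unsigned long phys_to_machine(unsigned long phys)
{
        unsigned long mfn = p2m[phys >> PAGE_SHIFT];
        return (mfn << PAGE_SHIFT) | (phys & ~PAGE_MASK);
}

static unsigned long machine_to_phys(unsigned long machine)
{
        unsigned long pfn = m2p[machine >> PAGE_SHIFT];
        return (pfn << PAGE_SHIFT) | (machine & ~PAGE_MASK);
}

int main(void)
{
        unsigned long pfn, phys, mach;

        for (pfn = 0; pfn < 4; pfn++)
                m2p[p2m[pfn]] = pfn;           /* build the inverse */

        phys = (1ul << PAGE_SHIFT) | 0x123;    /* pfn 1, offset 0x123 */
        mach = phys_to_machine(phys);
        printf("phys %#lx -> mach %#lx -> phys %#lx\n",
               phys, mach, machine_to_phys(mach));
        return 0;
}
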
37793 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_32.h
37794 ===================================================================
37795 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
37796 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_32.h 2007-06-12 13:14:02.000000000 +0200
37798 +#ifndef __I386_SCHED_H
37799 +#define __I386_SCHED_H
37801 +#include <asm/desc.h>
37802 +#include <asm/atomic.h>
37803 +#include <asm/pgalloc.h>
37804 +#include <asm/tlbflush.h>
37807 + * Used for LDT copy/destruction.
37809 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
37810 +void destroy_context(struct mm_struct *mm);
37813 +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
37815 +#if 0 /* XEN: no lazy tlb */
37816 + unsigned cpu = smp_processor_id();
37817 + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
37818 + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
37822 +#define prepare_arch_switch(next) __prepare_arch_switch()
37824 +static inline void __prepare_arch_switch(void)
37827 + * Save away %fs and %gs. No need to save %es and %ds, as those
37828 + * are always kernel segments while inside the kernel. Must
37829 + * happen before reload of cr3/ldt (i.e., not in __switch_to).
37831 + asm volatile ( "mov %%fs,%0 ; mov %%gs,%1"
37832 + : "=m" (current->thread.fs),
37833 + "=m" (current->thread.gs));
37834 + asm volatile ( "movl %0,%%fs ; movl %0,%%gs"
37838 +extern void mm_pin(struct mm_struct *mm);
37839 +extern void mm_unpin(struct mm_struct *mm);
37840 +void mm_pin_all(void);
37842 +static inline void switch_mm(struct mm_struct *prev,
37843 + struct mm_struct *next,
37844 + struct task_struct *tsk)
37846 + int cpu = smp_processor_id();
37847 + struct mmuext_op _op[2], *op = _op;
37849 + if (likely(prev != next)) {
37850 + BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
37851 + !test_bit(PG_pinned, &virt_to_page(next->pgd)->flags));
37853 + /* stop flush ipis for the previous mm */
37854 + cpu_clear(cpu, prev->cpu_vm_mask);
37855 +#if 0 /* XEN: no lazy tlb */
37856 + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
37857 + per_cpu(cpu_tlbstate, cpu).active_mm = next;
37859 + cpu_set(cpu, next->cpu_vm_mask);
37861 + /* Re-load page tables: load_cr3(next->pgd) */
37862 + op->cmd = MMUEXT_NEW_BASEPTR;
37863 + op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
37867 + * load the LDT, if the LDT is different:
37869 + if (unlikely(prev->context.ldt != next->context.ldt)) {
37870 + /* load_LDT_nolock(&next->context, cpu) */
37871 + op->cmd = MMUEXT_SET_LDT;
37872 + op->arg1.linear_addr = (unsigned long)next->context.ldt;
37873 + op->arg2.nr_ents = next->context.size;
37877 + BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
37879 +#if 0 /* XEN: no lazy tlb */
37881 + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
37882 + BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
37884 + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
37885 + /* We were in lazy tlb mode and leave_mm disabled
37886 + * tlb flush IPI delivery. We must reload %cr3.
37888 + load_cr3(next->pgd);
37889 + load_LDT_nolock(&next->context, cpu);
37895 +#define deactivate_mm(tsk, mm) \
37896 + asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
37898 +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
37900 + if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
37902 + switch_mm(prev, next, NULL);
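
switch_mm() above queues up to two mmuext operations (new page-table base, new LDT) in a stack array and hands them to the hypervisor in one go instead of trapping once per operation. The shape of that batching with stand-in types (the op codes and submit() are hypothetical, not the real Xen interface):

/* Shape of the switch_mm() mmuext batching, with invented op numbers. */
#include <stdio.h>

struct mmuext_op { int cmd; unsigned long arg1, arg2; };

enum { NEW_BASEPTR = 0, SET_LDT = 1 }; /* hypothetical numbering */

static int submit(struct mmuext_op *ops, unsigned int count)
{
        (void)ops;
        printf("submitting %u op(s) in one batch\n", count);
        return 0;
}

int main(void)
{
        struct mmuext_op _op[2], *op = _op;
        int ldt_changed = 1;

        op->cmd = NEW_BASEPTR;
        op->arg1 = 0x1234;           /* mfn of the new pgd */
        op++;

        if (ldt_changed) {           /* second op only queued when needed */
                op->cmd = SET_LDT;
                op->arg1 = 0xc0000000ul; /* linear address of the LDT */
                op->arg2 = 16;           /* number of entries */
                op++;
        }

        return submit(_op, (unsigned int)(op - _op));
}
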
37906 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pci_32.h
37907 ===================================================================
37908 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
37909 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pci_32.h 2007-09-14 11:14:51.000000000 +0200
37911 +#ifndef __i386_PCI_H
37912 +#define __i386_PCI_H
37916 +#include <linux/mm.h> /* for struct page */
37918 +/* Can be used to override the logic in pci_scan_bus for skipping
37919 + already-configured bus numbers - to be used for buggy BIOSes
37920 + or architectures with incomplete PCI setup by the loader */
37923 +extern unsigned int pcibios_assign_all_busses(void);
37925 +#define pcibios_assign_all_busses() 0
37928 +#include <asm/hypervisor.h>
37929 +#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain())
37931 +extern unsigned long pci_mem_start;
37932 +#define PCIBIOS_MIN_IO 0x1000
37933 +#define PCIBIOS_MIN_MEM (pci_mem_start)
37935 +#define PCIBIOS_MIN_CARDBUS_IO 0x4000
37937 +void pcibios_config_init(void);
37938 +struct pci_bus * pcibios_scan_root(int bus);
37940 +void pcibios_set_master(struct pci_dev *dev);
37941 +void pcibios_penalize_isa_irq(int irq, int active);
37942 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
37943 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
37945 +/* Dynamic DMA mapping stuff.
37946 + * i386 has everything mapped statically.
37949 +#include <linux/types.h>
37950 +#include <linux/slab.h>
37951 +#include <asm/scatterlist.h>
37952 +#include <linux/string.h>
37953 +#include <asm/io.h>
37957 +#ifdef CONFIG_SWIOTLB
37960 +/* On Xen we use SWIOTLB instead of blk-specific bounce buffers. */
37961 +#define PCI_DMA_BUS_IS_PHYS (0)
37963 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
37964 + dma_addr_t ADDR_NAME;
37965 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
37967 +#define pci_unmap_addr(PTR, ADDR_NAME) \
37968 + ((PTR)->ADDR_NAME)
37969 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
37970 + (((PTR)->ADDR_NAME) = (VAL))
37971 +#define pci_unmap_len(PTR, LEN_NAME) \
37972 + ((PTR)->LEN_NAME)
37973 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
37974 + (((PTR)->LEN_NAME) = (VAL))
37978 +/* The PCI address space does equal the physical memory
37979 + * address space. The networking and block device layers use
37980 + * this boolean for bounce buffer decisions.
37982 +#define PCI_DMA_BUS_IS_PHYS (1)
37984 +/* pci_unmap_{page,single} is a nop so... */
37985 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
37986 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
37987 +#define pci_unmap_addr(PTR, ADDR_NAME) (0)
37988 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
37989 +#define pci_unmap_len(PTR, LEN_NAME) (0)
37990 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
37994 +/* This is always fine. */
37995 +#define pci_dac_dma_supported(pci_dev, mask) (1)
37997 +static inline dma64_addr_t
37998 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
38000 + return ((dma64_addr_t) page_to_phys(page) +
38001 + (dma64_addr_t) offset);
38004 +static inline struct page *
38005 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
38007 + return pfn_to_page(dma_addr >> PAGE_SHIFT);
38010 +static inline unsigned long
38011 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
38013 + return (dma_addr & ~PAGE_MASK);
38016 +static inline void
38017 +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
38021 +static inline void
38022 +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
38024 + flush_write_buffers();
38027 +#define HAVE_PCI_MMAP
38028 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
38029 + enum pci_mmap_state mmap_state, int write_combine);
38032 +static inline void pcibios_add_platform_entries(struct pci_dev *dev)
38037 +static inline void pci_dma_burst_advice(struct pci_dev *pdev,
38038 + enum pci_dma_burst_strategy *strat,
38039 + unsigned long *strategy_parameter)
38041 + *strat = PCI_DMA_BURST_INFINITY;
38042 + *strategy_parameter = ~0UL;
38046 +#endif /* __KERNEL__ */
38048 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
38049 +#include <xen/pcifront.h>
38050 +#endif /* CONFIG_XEN_PCIDEV_FRONTEND */
38052 +/* implement the pci_ DMA API in terms of the generic device dma_ one */
38053 +#include <asm-generic/pci-dma-compat.h>
38055 +/* generic pci stuff */
38056 +#include <asm-generic/pci.h>
38058 +#endif /* __i386_PCI_H */
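
The pci_unmap_* macros above compile to real struct fields and accessors in the SWIOTLB configuration, and to nothing when DMA addressing is 1:1, so drivers can carry unmap cookies at zero cost on the direct-mapped build. The same conditional-field pattern in isolation (NEED_UNMAP stands in for CONFIG_SWIOTLB; flip it to 0 to see the empty variant):

/* The conditional unmap-cookie pattern from pci_32.h, in isolation. */
#include <stdio.h>

#define NEED_UNMAP 1

#if NEED_UNMAP
#define DECLARE_UNMAP_ADDR(name)       unsigned long name;
#define unmap_addr_set(ptr, name, val) ((ptr)->name = (val))
#define unmap_addr(ptr, name)          ((ptr)->name)
#else
#define DECLARE_UNMAP_ADDR(name)       /* no field at all */
#define unmap_addr_set(ptr, name, val) do { } while (0)
#define unmap_addr(ptr, name)          (0)
#endif

struct my_dma_state {
        int len;
        DECLARE_UNMAP_ADDR(dma_handle)
};

int main(void)
{
        struct my_dma_state s = { .len = 512 };

        unmap_addr_set(&s, dma_handle, 0xbeef);
        printf("len=%d handle=%#lx sizeof=%zu\n",
               s.len, (unsigned long)unmap_addr(&s, dma_handle), sizeof s);
        return 0;
}
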
38059 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_32.h
38060 ===================================================================
38061 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
38062 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_32.h 2008-07-21 11:00:33.000000000 +0200
38064 +#ifndef _I386_PGALLOC_H
38065 +#define _I386_PGALLOC_H
38067 +#include <asm/fixmap.h>
38068 +#include <linux/threads.h>
38069 +#include <linux/mm.h> /* for struct page */
38070 +#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
38072 +#define pmd_populate_kernel(mm, pmd, pte) \
38073 + set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
38075 +#define pmd_populate(mm, pmd, pte) \
38077 + unsigned long pfn = page_to_pfn(pte); \
38078 + if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) { \
38079 + if (!PageHighMem(pte)) \
38080 + BUG_ON(HYPERVISOR_update_va_mapping( \
38081 + (unsigned long)__va(pfn << PAGE_SHIFT), \
38082 + pfn_pte(pfn, PAGE_KERNEL_RO), 0)); \
38083 + else if (!test_and_set_bit(PG_pinned, &pte->flags)) \
38084 + kmap_flush_unused(); \
38085 + set_pmd(pmd, \
38086 + __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT))); \
38087 + } else \
38088 + *(pmd) = __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT)); \
38092 + * Allocate and free page tables.
38094 +extern pgd_t *pgd_alloc(struct mm_struct *);
38095 +extern void pgd_free(pgd_t *pgd);
38097 +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
38098 +extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
38100 +static inline void pte_free_kernel(pte_t *pte)
38102 + make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
38103 + free_page((unsigned long)pte);
38106 +extern void pte_free(struct page *pte);
38108 +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
38110 +#ifdef CONFIG_X86_PAE
38112 + * In the PAE case we free the pmds as part of the pgd.
38114 +#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); })
38115 +#define pmd_free(x) do { } while (0)
38116 +#define __pmd_free_tlb(tlb,x) do { } while (0)
38117 +#define pud_populate(mm, pmd, pte) BUG()
38120 +#define check_pgt_cache() do { } while (0)
38122 +#endif /* _I386_PGALLOC_H */
38123 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h
38124 ===================================================================
38125 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
38126 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h 2007-06-12 13:14:02.000000000 +0200
38128 +#ifndef _I386_PGTABLE_3LEVEL_DEFS_H
38129 +#define _I386_PGTABLE_3LEVEL_DEFS_H
38131 +#define HAVE_SHARED_KERNEL_PMD 0
38134 + * PGDIR_SHIFT determines what a top-level page table entry can map
38136 +#define PGDIR_SHIFT 30
38137 +#define PTRS_PER_PGD 4
38140 + * PMD_SHIFT determines the size of the area a middle-level
38141 + * page table can map
38143 +#define PMD_SHIFT 21
38144 +#define PTRS_PER_PMD 512
38147 + * entries per page directory level
38149 +#define PTRS_PER_PTE 512
38151 +#endif /* _I386_PGTABLE_3LEVEL_DEFS_H */
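
With the constants above, a 32-bit virtual address splits into a 2-bit pgd index, a 9-bit pmd index, a 9-bit pte index and a 12-bit page offset (2 + 9 + 9 + 12 = 32). A few lines of arithmetic to check the split on a sample address:

/* Decompose a 32-bit VA using the PAE constants defined above. */
#include <stdio.h>
#include <stdint.h>

#define PGDIR_SHIFT  30
#define PMD_SHIFT    21
#define PAGE_SHIFT   12
#define PTRS_PER_PGD 4
#define PTRS_PER_PMD 512
#define PTRS_PER_PTE 512

int main(void)
{
        uint32_t va = 0xc0123456u; /* arbitrary kernel-range address */

        unsigned pgd = (va >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
        unsigned pmd = (va >> PMD_SHIFT)   & (PTRS_PER_PMD - 1);
        unsigned pte = (va >> PAGE_SHIFT)  & (PTRS_PER_PTE - 1);
        unsigned off =  va                 & ((1u << PAGE_SHIFT) - 1);

        printf("va %#x -> pgd %u, pmd %u, pte %u, offset %#x\n",
               va, pgd, pmd, pte, off);
        return 0;
}
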
38152 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level.h
38153 ===================================================================
38154 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
38155 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level.h 2008-04-02 12:34:02.000000000 +0200
38157 +#ifndef _I386_PGTABLE_3LEVEL_H
38158 +#define _I386_PGTABLE_3LEVEL_H
38160 +#include <asm-generic/pgtable-nopud.h>
38163 + * Intel Physical Address Extension (PAE) Mode - three-level page
38164 + * tables on PPro+ CPUs.
38166 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
38169 +#define pte_ERROR(e) \
38170 + printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", __FILE__, __LINE__, \
38171 + &(e), __pte_val(e), pte_pfn(e))
38172 +#define pmd_ERROR(e) \
38173 + printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
38174 + &(e), __pmd_val(e), (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
38175 +#define pgd_ERROR(e) \
38176 + printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
38177 + &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
38179 +#define pud_none(pud) 0
38180 +#define pud_bad(pud) 0
38181 +#define pud_present(pud) 1
38184 + * Is the pte executable?
38186 +static inline int pte_x(pte_t pte)
38188 + return !(__pte_val(pte) & _PAGE_NX);
38192 + * All present user-pages with !NX bit are user-executable:
38194 +static inline int pte_exec(pte_t pte)
38196 + return pte_user(pte) && pte_x(pte);
38199 + * All present pages with !NX bit are kernel-executable:
38201 +static inline int pte_exec_kernel(pte_t pte)
38203 + return pte_x(pte);
38206 +/* Rules for using set_pte: the pte being assigned *must* be
38207 + * either not present or in a state where the hardware will
38208 + * not attempt to update the pte. In places where this is
38209 + * not possible, use pte_get_and_clear to obtain the old pte
38210 + * value and then use set_pte to update it. -ben
38212 +#define __HAVE_ARCH_SET_PTE_ATOMIC
38214 +static inline void set_pte(pte_t *ptep, pte_t pte)
38216 + ptep->pte_high = pte.pte_high;
38218 + ptep->pte_low = pte.pte_low;
38220 +#define set_pte_atomic(pteptr,pteval) \
38221 + set_64bit((unsigned long long *)(pteptr),__pte_val(pteval))
38223 +#define set_pte_at(_mm,addr,ptep,pteval) do { \
38224 + if (((_mm) != current->mm && (_mm) != &init_mm) || \
38225 + HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
38226 + set_pte((ptep), (pteval)); \
38229 +#define set_pte_at_sync(_mm,addr,ptep,pteval) do { \
38230 + if (((_mm) != current->mm && (_mm) != &init_mm) || \
38231 + HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
38232 + set_pte((ptep), (pteval)); \
38233 + xen_invlpg((addr)); \
38237 +#define set_pmd(pmdptr,pmdval) \
38238 + xen_l2_entry_update((pmdptr), (pmdval))
38239 +#define set_pud(pudptr,pudval) \
38240 + xen_l3_entry_update((pudptr), (pudval))
38243 + * Pentium-II erratum A13: in PAE mode we explicitly have to flush
38244 + * the TLB via cr3 if the top-level pgd is changed...
38245 + * We do not let the generic code free and clear pgd entries due to
38246 + * this erratum.
38248 +static inline void pud_clear (pud_t * pud) { }
38250 +#define pud_page(pud) \
38251 +((struct page *) __va(pud_val(pud) & PAGE_MASK))
38253 +#define pud_page_kernel(pud) \
38254 +((unsigned long) __va(pud_val(pud) & PAGE_MASK))
38257 +/* Find an entry in the second-level page table.. */
38258 +#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
38259 + pmd_index(address))
38261 +static inline int pte_none(pte_t pte)
38263 + return !(pte.pte_low | pte.pte_high);
38267 + * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
38268 + * entry, so clear the bottom half first and enforce ordering with a compiler
38271 +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
38273 + if ((mm != current->mm && mm != &init_mm)
38274 + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
38275 + ptep->pte_low = 0;
38277 + ptep->pte_high = 0;
38281 +#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
38283 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
38285 + pte_t pte = *ptep;
38286 + if (!pte_none(pte)) {
38287 + if ((mm != &init_mm) ||
38288 + HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
38289 + uint64_t val = __pte_val(pte);
38290 + if (__cmpxchg64(ptep, val, 0) != val) {
38291 + /* xchg acts as a barrier before the setting of the high bits */
38292 + pte.pte_low = xchg(&ptep->pte_low, 0);
38293 + pte.pte_high = ptep->pte_high;
38294 + ptep->pte_high = 0;
38301 +#define ptep_clear_flush(vma, addr, ptep) \
38303 + pte_t *__ptep = (ptep); \
38304 + pte_t __res = *__ptep; \
38305 + if (!pte_none(__res) && \
38306 + ((vma)->vm_mm != current->mm || \
38307 + HYPERVISOR_update_va_mapping(addr, __pte(0), \
38308 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
38309 + UVMF_INVLPG|UVMF_MULTI))) { \
38310 + __ptep->pte_low = 0; \
38312 + __ptep->pte_high = 0; \
38313 + flush_tlb_page(vma, addr); \
38318 +static inline int pte_same(pte_t a, pte_t b)
38320 + return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
38323 +#define pte_page(x) pfn_to_page(pte_pfn(x))
38325 +#define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \
38326 + ((_pte).pte_high << (32-PAGE_SHIFT)))
38327 +#define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \
38328 + __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
38329 +#define pte_pfn(_pte) ((_pte).pte_low & _PAGE_IO ? max_mapnr : \
38330 + (_pte).pte_low & _PAGE_PRESENT ? \
38331 + mfn_to_local_pfn(__pte_mfn(_pte)) : \
38334 +extern unsigned long long __supported_pte_mask;
38336 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
38338 + return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
38339 + pgprot_val(pgprot)) & __supported_pte_mask);
38342 +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
38344 + return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
38345 + pgprot_val(pgprot)) & __supported_pte_mask);
38349 + * Bits 0, 6 and 7 are taken in the low part of the pte,
38350 + * put the 32 bits of offset into the high part.
38352 +#define pte_to_pgoff(pte) ((pte).pte_high)
38353 +#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
38354 +#define PTE_FILE_MAX_BITS 32
38356 +/* Encode and de-code a swap entry */
38357 +#define __swp_type(x) (((x).val) & 0x1f)
38358 +#define __swp_offset(x) ((x).val >> 5)
38359 +#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
38360 +#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
38361 +#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val })
38363 +#define __pmd_free_tlb(tlb, x) do { } while (0)
38365 +void vmalloc_sync_all(void);
38367 +#endif /* _I386_PGTABLE_3LEVEL_H */
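
The swap-entry encoding above lives entirely in pte_high: five bits of swap type, the offset shifted up by five, and pte_low left zero so the present bit can never be mistaken for a live mapping. A quick round-trip check of that packing:

/* Round-trip the swap-entry packing from pgtable-3level.h. */
#include <stdio.h>
#include <assert.h>

typedef struct { unsigned long val; } swp_entry_t;

#define __swp_type(x)             (((x).val) & 0x1f)
#define __swp_offset(x)           ((x).val >> 5)
#define __swp_entry(type, offset) ((swp_entry_t){ (type) | (offset) << 5 })

int main(void)
{
        swp_entry_t e = __swp_entry(9ul, 123456ul);

        assert(__swp_type(e) == 9 && __swp_offset(e) == 123456);
        printf("type=%lu offset=%lu raw=%#lx\n",
               __swp_type(e), __swp_offset(e), e.val);
        return 0;
}
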
38368 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_32.h
38369 ===================================================================
38370 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
38371 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_32.h 2008-07-21 11:00:33.000000000 +0200
38373 +#ifndef _I386_PGTABLE_H
38374 +#define _I386_PGTABLE_H
38376 +#include <asm/hypervisor.h>
38379 + * The Linux memory management assumes a three-level page table setup. On
38380 + * the i386, we use that, but "fold" the mid level into the top-level page
38381 + * table, so that we physically have the same two-level page table as the
38382 + * i386 mmu expects.
38384 + * This file contains the functions and defines necessary to modify and use
38385 + * the i386 page table tree.
38387 +#ifndef __ASSEMBLY__
38388 +#include <asm/processor.h>
38389 +#include <asm/fixmap.h>
38390 +#include <linux/threads.h>
38392 +#ifndef _I386_BITOPS_H
38393 +#include <asm/bitops.h>
38396 +#include <linux/slab.h>
38397 +#include <linux/list.h>
38398 +#include <linux/spinlock.h>
38400 +/* Is this pagetable pinned? */
38401 +#define PG_pinned PG_arch_1
38404 +struct vm_area_struct;
38407 + * ZERO_PAGE is a global shared page that is always zero: used
38408 + * for zero-mapped memory areas etc..
38410 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
38411 +extern unsigned long empty_zero_page[1024];
38412 +extern pgd_t *swapper_pg_dir;
38413 +extern kmem_cache_t *pgd_cache;
38414 +extern kmem_cache_t *pmd_cache;
38415 +extern spinlock_t pgd_lock;
38416 +extern struct page *pgd_list;
38418 +void pmd_ctor(void *, kmem_cache_t *, unsigned long);
38419 +void pgd_ctor(void *, kmem_cache_t *, unsigned long);
38420 +void pgd_dtor(void *, kmem_cache_t *, unsigned long);
38421 +void pgtable_cache_init(void);
38422 +void paging_init(void);
38425 + * The Linux x86 paging architecture is 'compile-time dual-mode', it
38426 + * implements both the traditional 2-level x86 page tables and the
38427 + * newer 3-level PAE-mode page tables.
38429 +#ifdef CONFIG_X86_PAE
38430 +# include <asm/pgtable-3level-defs.h>
38431 +# define PMD_SIZE (1UL << PMD_SHIFT)
38432 +# define PMD_MASK (~(PMD_SIZE-1))
38434 +# include <asm/pgtable-2level-defs.h>
38437 +#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
38438 +#define PGDIR_MASK (~(PGDIR_SIZE-1))
38440 +#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
38441 +#define FIRST_USER_ADDRESS 0
38443 +#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
38444 +#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
38446 +#define TWOLEVEL_PGDIR_SHIFT 22
38447 +#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
38448 +#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
38450 +/* Just any arbitrary offset to the start of the vmalloc VM area: the
38451 + * current 8MB value just means that there will be a 8MB "hole" after the
38452 + * physical memory until the kernel virtual memory starts. That means that
38453 + * any out-of-bounds memory accesses will hopefully be caught.
38454 + * The vmalloc() routines leave a hole of 4kB between each vmalloced
38455 + * area for the same reason. ;)
38457 +#define VMALLOC_OFFSET (8*1024*1024)
38458 +#define VMALLOC_START (((unsigned long) high_memory + vmalloc_earlyreserve + \
38459 + 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
38460 +#ifdef CONFIG_HIGHMEM
38461 +# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
38463 +# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
38467 + * _PAGE_PSE set in the page directory entry just means that
38468 + * the page directory entry points directly to a 4MB-aligned block of
38469 + * memory.
38471 +#define _PAGE_BIT_PRESENT 0
38472 +#define _PAGE_BIT_RW 1
38473 +#define _PAGE_BIT_USER 2
38474 +#define _PAGE_BIT_PWT 3
38475 +#define _PAGE_BIT_PCD 4
38476 +#define _PAGE_BIT_ACCESSED 5
38477 +#define _PAGE_BIT_DIRTY 6
38478 +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */
38479 +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
38480 +/*#define _PAGE_BIT_UNUSED1 9*/ /* available for programmer */
38481 +#define _PAGE_BIT_UNUSED2 10
38482 +#define _PAGE_BIT_UNUSED3 11
38483 +#define _PAGE_BIT_NX 63
38485 +#define _PAGE_PRESENT 0x001
38486 +#define _PAGE_RW 0x002
38487 +#define _PAGE_USER 0x004
38488 +#define _PAGE_PWT 0x008
38489 +#define _PAGE_PCD 0x010
38490 +#define _PAGE_ACCESSED 0x020
38491 +#define _PAGE_DIRTY 0x040
38492 +#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. */
38493 +#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */
38494 +/*#define _PAGE_UNUSED1 0x200*/ /* available for programmer */
38495 +#define _PAGE_UNUSED2 0x400
38496 +#define _PAGE_UNUSED3 0x800
38498 +/* If _PAGE_PRESENT is clear, we use these: */
38499 +#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
38500 +#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE;
38501 + pte_present gives true */
38502 +#ifdef CONFIG_X86_PAE
38503 +#define _PAGE_NX (1ULL<<_PAGE_BIT_NX)
38505 +#define _PAGE_NX 0
38508 +/* Mapped page is I/O or foreign and has no associated page struct. */
38509 +#define _PAGE_IO 0x200
38511 +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
38512 +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
38513 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
38515 +#define PAGE_NONE \
38516 + __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
38517 +#define PAGE_SHARED \
38518 + __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
38520 +#define PAGE_SHARED_EXEC \
38521 + __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
38522 +#define PAGE_COPY_NOEXEC \
38523 + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
38524 +#define PAGE_COPY_EXEC \
38525 + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
38526 +#define PAGE_COPY \
38528 +#define PAGE_READONLY \
38529 + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
38530 +#define PAGE_READONLY_EXEC \
38531 + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
38533 +#define _PAGE_KERNEL \
38534 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
38535 +#define _PAGE_KERNEL_EXEC \
38536 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
38538 +extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
38539 +#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
38540 +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
38541 +#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
38542 +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
38544 +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
38545 +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
38546 +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
38547 +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
38548 +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
38549 +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
38552 + * The i386 can't do page protection for execute; it considers execute
38553 + * permission the same as read. Also, write permissions imply read
38554 + * permissions. This is the closest we can get.
38556 +#define __P000 PAGE_NONE
38557 +#define __P001 PAGE_READONLY
38558 +#define __P010 PAGE_COPY
38559 +#define __P011 PAGE_COPY
38560 +#define __P100 PAGE_READONLY_EXEC
38561 +#define __P101 PAGE_READONLY_EXEC
38562 +#define __P110 PAGE_COPY_EXEC
38563 +#define __P111 PAGE_COPY_EXEC
38565 +#define __S000 PAGE_NONE
38566 +#define __S001 PAGE_READONLY
38567 +#define __S010 PAGE_SHARED
38568 +#define __S011 PAGE_SHARED
38569 +#define __S100 PAGE_READONLY_EXEC
38570 +#define __S101 PAGE_READONLY_EXEC
38571 +#define __S110 PAGE_SHARED_EXEC
38572 +#define __S111 PAGE_SHARED_EXEC
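(Aside, a hypothetical userspace model — not the kernel's actual code: these sixteen macros populate mm's protection_map[], indexed by the mmap protection bits, read=1, write=2, exec=4, with +8 selecting the shared (__S) half. Illustrated with the macro names as strings:)

#include <stdio.h>

/* Model of protection_map[]: indices 0..7 are __P000..__P111,
 * indices 8..15 are __S000..__S111. */
static const char *protection_map_model[16] = {
	"PAGE_NONE",          "PAGE_READONLY",      "PAGE_COPY",        "PAGE_COPY",
	"PAGE_READONLY_EXEC", "PAGE_READONLY_EXEC", "PAGE_COPY_EXEC",   "PAGE_COPY_EXEC",
	"PAGE_NONE",          "PAGE_READONLY",      "PAGE_SHARED",      "PAGE_SHARED",
	"PAGE_READONLY_EXEC", "PAGE_READONLY_EXEC", "PAGE_SHARED_EXEC", "PAGE_SHARED_EXEC",
};

int main(void)
{
	printf("private rw-: %s\n", protection_map_model[1 | 2]);     /* PAGE_COPY: COW */
	printf("shared  rw-: %s\n", protection_map_model[8 | 1 | 2]); /* PAGE_SHARED */
	return 0;
}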
38575 + * Define this if things work differently on an i386 and an i486:
38576 + * it will (on an i486) warn about kernel memory accesses that are
38577 + * done without an 'access_ok(VERIFY_WRITE,..)'
38579 +#undef TEST_ACCESS_OK
38581 +/* The boot page tables (all created as a single array) */
38582 +extern unsigned long pg0[];
38584 +#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
38586 +/* To avoid harmful races, pmd_none(x) should check only the lower word when PAE is enabled */
38587 +#define pmd_none(x) (!(unsigned long)__pmd_val(x))
38588 +#if CONFIG_XEN_COMPAT <= 0x030002
38589 +/* pmd_present doesn't just test the _PAGE_PRESENT bit, since writable
38590 + page tables (wr.p.t.) can temporarily clear it. */
38591 +#define pmd_present(x) (__pmd_val(x))
38592 +#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
38594 +#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
38595 +#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
38599 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
38602 + * The following only work if pte_present() is true.
38603 + * Undefined behaviour if not..
38605 +static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
38606 +static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
38607 +static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; }
38608 +static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; }
38609 +static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; }
38610 +static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; }
38613 + * The following only works if pte_present() is not true.
38615 +static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; }
38617 +static inline pte_t pte_rdprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; }
38618 +static inline pte_t pte_exprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; }
38619 +static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
38620 +static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
38621 +static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; }
38622 +static inline pte_t pte_mkread(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; }
38623 +static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; }
38624 +static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; }
38625 +static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
38626 +static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
38627 +static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; }
38629 +#ifdef CONFIG_X86_PAE
38630 +# include <asm/pgtable-3level.h>
38632 +# include <asm/pgtable-2level.h>
38635 +#define ptep_test_and_clear_dirty(vma, addr, ptep) \
38637 + pte_t __pte = *(ptep); \
38638 + int __ret = pte_dirty(__pte); \
38640 + __pte = pte_mkclean(__pte); \
38641 + if ((vma)->vm_mm != current->mm || \
38642 + HYPERVISOR_update_va_mapping(addr, __pte, 0)) \
38643 + (ptep)->pte_low = __pte.pte_low; \
38648 +#define ptep_test_and_clear_young(vma, addr, ptep) \
38650 + pte_t __pte = *(ptep); \
38651 + int __ret = pte_young(__pte); \
38653 + __pte = pte_mkold(__pte); \
38654 + if ((vma)->vm_mm != current->mm || \
38655 + HYPERVISOR_update_va_mapping(addr, __pte, 0)) \
38656 + (ptep)->pte_low = __pte.pte_low; \
38660 +#define ptep_get_and_clear_full(mm, addr, ptep, full) \
38662 + pte_t __res = *(ptep); \
38663 + if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) \
38664 + xen_l1_entry_update(ptep, __pte(0)); \
38666 + *(ptep) = __pte(0); \
38669 + ptep_get_and_clear(mm, addr, ptep))
38671 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
38673 + pte_t pte = *ptep;
38674 + if (pte_write(pte))
38675 + set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
38679 + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
38681 + * dst - pointer to pgd range anywhere on a pgd page
38683 + * count - the number of pgds to copy.
38685 + * dst and src can be on the same page, but the range must not overlap,
38686 + * and must not cross a page boundary.
38688 +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
38690 + memcpy(dst, src, count * sizeof(pgd_t));
38694 + * Macro to mark a page protection value as "uncacheable". On processors which do not support
38695 + * it, this is a no-op.
38697 +#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \
38698 + ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))
38701 + * Conversion functions: convert a page and protection to a page entry,
38702 + * and a page entry and page directory to the page they refer to.
38705 +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
38707 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
38710 + * Since this might change the present bit (which controls whether
38711 + * a pte_t object has undergone p2m translation), we must use
38712 + * pte_val() on the input pte and __pte() for the return value.
38714 + paddr_t pteval = pte_val(pte);
38716 + pteval &= _PAGE_CHG_MASK;
38717 + pteval |= pgprot_val(newprot);
38718 +#ifdef CONFIG_X86_PAE
38719 + pteval &= __supported_pte_mask;
38721 + return __pte(pteval);
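(Aside, a minimal sketch of the mask arithmetic only — it ignores the Xen p2m translation that pte_val()/__pte() perform here, and assumes the non-PAE frame-bit value for PTE_MASK:)

#include <stdio.h>

#define PTE_MASK        0xfffff000u  /* assumption: non-PAE page-frame bits */
#define _PAGE_ACCESSED  0x020u
#define _PAGE_DIRTY     0x040u
#define _PAGE_IO        0x200u
#define _PAGE_CHG_MASK  (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)

/* Keep the frame number plus A/D/IO bits; take everything else from newprot. */
static unsigned int pte_modify_model(unsigned int pteval, unsigned int newprot)
{
	return (pteval & _PAGE_CHG_MASK) | newprot;
}

int main(void)
{
	/* Writable, dirty, accessed pte remapped read-only; A/D survive. */
	printf("0x%08x\n", pte_modify_model(0x12345067u, 0x025u));
	return 0;
}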
38724 +#define pmd_large(pmd) \
38725 +((__pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
38728 + * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
38730 + * this macro returns the index of the entry in the pgd page which would
38731 + * control the given virtual address
38733 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
38734 +#define pgd_index_k(addr) pgd_index(addr)
38737 + * pgd_offset() returns a (pgd_t *)
38738 + * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
38740 +#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
38743 + * a shortcut which implies the use of the kernel's pgd, instead
38746 +#define pgd_offset_k(address) pgd_offset(&init_mm, address)
38749 + * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
38751 + * this macro returns the index of the entry in the pmd page which would
38752 + * control the given virtual address
38754 +#define pmd_index(address) \
38755 + (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
38758 + * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
38760 + * this macro returns the index of the entry in the pte page which would
38761 + * control the given virtual address
38763 +#define pte_index(address) \
38764 + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
38765 +#define pte_offset_kernel(dir, address) \
38766 + ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address))
38768 +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
38770 +#define pmd_page_kernel(pmd) \
38771 + ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
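(Aside, a standalone sketch of how pgd_index()/pte_index() decompose a 32-bit virtual address, assuming the non-PAE constants PGDIR_SHIFT=22, PAGE_SHIFT=12 and 1024 entries per level, which come from pgtable-2level and are not visible in this hunk:)

#include <stdio.h>

#define PGDIR_SHIFT  22
#define PAGE_SHIFT   12
#define PTRS_PER_PGD 1024
#define PTRS_PER_PTE 1024

int main(void)
{
	unsigned long address = 0xc0123456UL;  /* example kernel-space address */
	unsigned long pgd = (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
	unsigned long pte = (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);

	/* prints: pgd_index=768 pte_index=291 offset=0x456 */
	printf("pgd_index=%lu pte_index=%lu offset=0x%lx\n",
	       pgd, pte, address & ((1UL << PAGE_SHIFT) - 1));
	return 0;
}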
38774 + * Helper function that returns the kernel pagetable entry controlling
38775 + * the virtual address 'address'. NULL means no pagetable entry present.
38776 + * NOTE: the return type is pte_t but if the pmd is PSE then we return it
38777 + * as a lone pmd.
38779 +extern pte_t *lookup_address(unsigned long address);
38782 + * Make a given kernel text page executable/non-executable.
38783 + * Returns the previous executability setting of that page (which
38784 + * is used to restore the previous state). Used by the SMP bootup code.
38785 + * NOTE: this is an __init function for security reasons.
38787 +#ifdef CONFIG_X86_PAE
38788 + extern int set_kernel_exec(unsigned long vaddr, int enable);
38790 + static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
38793 +extern void noexec_setup(const char *str);
38795 +#if defined(CONFIG_HIGHPTE)
38796 +#define pte_offset_map(dir, address) \
38797 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \
38798 + pte_index(address))
38799 +#define pte_offset_map_nested(dir, address) \
38800 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + \
38801 + pte_index(address))
38802 +#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
38803 +#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
38805 +#define pte_offset_map(dir, address) \
38806 + ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
38807 +#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
38808 +#define pte_unmap(pte) do { } while (0)
38809 +#define pte_unmap_nested(pte) do { } while (0)
38812 +#define __HAVE_ARCH_PTEP_ESTABLISH
38813 +#define ptep_establish(vma, address, ptep, pteval) \
38815 + if ( likely((vma)->vm_mm == current->mm) ) { \
38816 + BUG_ON(HYPERVISOR_update_va_mapping(address, \
38818 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
38819 + UVMF_INVLPG|UVMF_MULTI)); \
38821 + xen_l1_entry_update(ptep, pteval); \
38822 + flush_tlb_page(vma, address); \
38827 + * The i386 doesn't have any external MMU info: the kernel page
38828 + * tables contain all the necessary information.
38830 + * Also, we only update the dirty/accessed state if we set
38831 + * the dirty bit by hand in the kernel, since the hardware
38832 + * will do the accessed bit for us, and we don't want to
38833 + * race with other CPU's that might be updating the dirty
38834 + * bit at the same time.
38836 +#define update_mmu_cache(vma,address,pte) do { } while (0)
38837 +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
38838 +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
38841 + ptep_establish(vma, address, ptep, entry); \
38844 +#include <xen/features.h>
38845 +void make_lowmem_page_readonly(void *va, unsigned int feature);
38846 +void make_lowmem_page_writable(void *va, unsigned int feature);
38847 +void make_page_readonly(void *va, unsigned int feature);
38848 +void make_page_writable(void *va, unsigned int feature);
38849 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
38850 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
38852 +#define virt_to_ptep(va) \
38854 + pte_t *__ptep = lookup_address((unsigned long)(va)); \
38855 + BUG_ON(!__ptep || !pte_present(*__ptep)); \
38859 +#define arbitrary_virt_to_machine(va) \
38860 + (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
38861 + | ((unsigned long)(va) & (PAGE_SIZE - 1)))
38863 +#endif /* !__ASSEMBLY__ */
38865 +#ifdef CONFIG_FLATMEM
38866 +#define kern_addr_valid(addr) (1)
38867 +#endif /* CONFIG_FLATMEM */
38869 +int direct_remap_pfn_range(struct vm_area_struct *vma,
38870 + unsigned long address,
38871 + unsigned long mfn,
38872 + unsigned long size,
38875 +int direct_kernel_remap_pfn_range(unsigned long address,
38876 + unsigned long mfn,
38877 + unsigned long size,
38880 +int create_lookup_pte_addr(struct mm_struct *mm,
38881 + unsigned long address,
38883 +int touch_pte_range(struct mm_struct *mm,
38884 + unsigned long address,
38885 + unsigned long size);
38887 +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
38888 + unsigned long addr, unsigned long end, pgprot_t newprot);
38890 +#define arch_change_pte_range(mm, pmd, addr, end, newprot) \
38891 + xen_change_pte_range(mm, pmd, addr, end, newprot)
38893 +#define io_remap_pfn_range(vma,from,pfn,size,prot) \
38894 +direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
38896 +#define MK_IOSPACE_PFN(space, pfn) (pfn)
38897 +#define GET_IOSPACE(pfn) 0
38898 +#define GET_PFN(pfn) (pfn)
38900 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
38901 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
38902 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
38903 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
38904 +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
38905 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
38906 +#define __HAVE_ARCH_PTE_SAME
38907 +#include <asm-generic/pgtable.h>
38909 +#endif /* _I386_PGTABLE_H */
38910 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/processor_32.h
38911 ===================================================================
38912 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
38913 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/processor_32.h 2008-01-28 12:24:19.000000000 +0100
38916 + * include/asm-i386/processor.h
38918 + * Copyright (C) 1994 Linus Torvalds
38921 +#ifndef __ASM_I386_PROCESSOR_H
38922 +#define __ASM_I386_PROCESSOR_H
38924 +#include <asm/vm86.h>
38925 +#include <asm/math_emu.h>
38926 +#include <asm/segment.h>
38927 +#include <asm/page.h>
38928 +#include <asm/types.h>
38929 +#include <asm/sigcontext.h>
38930 +#include <asm/cpufeature.h>
38931 +#include <asm/msr.h>
38932 +#include <asm/system.h>
38933 +#include <linux/cache.h>
38934 +#include <linux/threads.h>
38935 +#include <asm/percpu.h>
38936 +#include <linux/cpumask.h>
38937 +#include <xen/interface/physdev.h>
38939 +/* flag for disabling the tsc */
38940 +extern int tsc_disable;
38942 +struct desc_struct {
38943 + unsigned long a,b;
38946 +#define desc_empty(desc) \
38947 + (!((desc)->a | (desc)->b))
38949 +#define desc_equal(desc1, desc2) \
38950 + (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
38952 + * Default implementation of macro that returns current
38953 + * instruction pointer ("program counter").
38955 +#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
38958 + * CPU type and hardware bug flags. Kept separately for each CPU.
38959 + * Members of this structure are referenced in head.S, so think twice
38960 + * before touching them. [mj]
38963 +struct cpuinfo_x86 {
38964 + __u8 x86; /* CPU family */
38965 + __u8 x86_vendor; /* CPU vendor */
38968 + char wp_works_ok; /* It doesn't on 386's */
38969 + char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
38972 + int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
38973 + unsigned long x86_capability[NCAPINTS];
38974 + char x86_vendor_id[16];
38975 + char x86_model_id[64];
38976 + int x86_cache_size; /* in KB - valid for CPUs which support this
38977 + call */
38978 + int x86_cache_alignment; /* In bytes */
38984 + unsigned long loops_per_jiffy;
38986 + cpumask_t llc_shared_map; /* cpus sharing the last level cache */
38988 + unsigned char x86_max_cores; /* cpuid returned max cores value */
38989 + unsigned char apicid;
38991 + unsigned char booted_cores; /* number of cores as seen by OS */
38992 + __u8 phys_proc_id; /* Physical processor id. */
38993 + __u8 cpu_core_id; /* Core id */
38995 +} __attribute__((__aligned__(SMP_CACHE_BYTES)));
38997 +#define X86_VENDOR_INTEL 0
38998 +#define X86_VENDOR_CYRIX 1
38999 +#define X86_VENDOR_AMD 2
39000 +#define X86_VENDOR_UMC 3
39001 +#define X86_VENDOR_NEXGEN 4
39002 +#define X86_VENDOR_CENTAUR 5
39003 +#define X86_VENDOR_RISE 6
39004 +#define X86_VENDOR_TRANSMETA 7
39005 +#define X86_VENDOR_NSC 8
39006 +#define X86_VENDOR_NUM 9
39007 +#define X86_VENDOR_UNKNOWN 0xff
39010 + * capabilities of CPUs
39013 +extern struct cpuinfo_x86 boot_cpu_data;
39014 +extern struct cpuinfo_x86 new_cpu_data;
39015 +#ifndef CONFIG_X86_NO_TSS
39016 +extern struct tss_struct doublefault_tss;
39017 +DECLARE_PER_CPU(struct tss_struct, init_tss);
39021 +extern struct cpuinfo_x86 cpu_data[];
39022 +#define current_cpu_data cpu_data[smp_processor_id()]
39024 +#define cpu_data (&boot_cpu_data)
39025 +#define current_cpu_data boot_cpu_data
39028 +extern int cpu_llc_id[NR_CPUS];
39029 +extern char ignore_fpu_irq;
39031 +extern void identify_cpu(struct cpuinfo_x86 *);
39032 +extern void print_cpu_info(struct cpuinfo_x86 *);
39033 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
39034 +extern unsigned short num_cache_leaves;
39036 +#ifdef CONFIG_X86_HT
39037 +extern void detect_ht(struct cpuinfo_x86 *c);
39039 +static inline void detect_ht(struct cpuinfo_x86 *c) {}
39045 +#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
39046 +#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
39047 +#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
39048 +#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
39049 +#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
39050 +#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
39051 +#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
39052 +#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
39053 +#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
39054 +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
39055 +#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
39056 +#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
39057 +#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
39058 +#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
39059 +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
39060 +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
39061 +#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
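(Aside, a standalone sketch decoding a saved EFLAGS image with a few of the masks above — values copied verbatim, nothing Xen-specific:)

#include <stdio.h>

#define X86_EFLAGS_CF 0x00000001
#define X86_EFLAGS_ZF 0x00000040
#define X86_EFLAGS_IF 0x00000200
#define X86_EFLAGS_ID 0x00200000

static void dump_eflags(unsigned long flags)
{
	printf("CF=%d ZF=%d IF=%d ID=%d\n",
	       !!(flags & X86_EFLAGS_CF), !!(flags & X86_EFLAGS_ZF),
	       !!(flags & X86_EFLAGS_IF), !!(flags & X86_EFLAGS_ID));
}

int main(void)
{
	dump_eflags(0x00200246);  /* a typical user-mode EFLAGS value */
	return 0;
}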
39064 + * Generic CPUID function
39065 + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
39066 + * resulting in stale register contents being returned.
39068 +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
39070 + __asm__(XEN_CPUID
39075 + : "0" (op), "c"(0));
39078 +/* Some CPUID calls want 'count' to be placed in ecx */
39079 +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
39082 + __asm__(XEN_CPUID
39087 + : "0" (op), "c" (count));
39091 + * CPUID functions returning a single datum
39093 +static inline unsigned int cpuid_eax(unsigned int op)
39095 + unsigned int eax;
39097 + __asm__(XEN_CPUID
39100 + : "bx", "cx", "dx");
39103 +static inline unsigned int cpuid_ebx(unsigned int op)
39105 + unsigned int eax, ebx;
39107 + __asm__(XEN_CPUID
39108 + : "=a" (eax), "=b" (ebx)
39113 +static inline unsigned int cpuid_ecx(unsigned int op)
39115 + unsigned int eax, ecx;
39117 + __asm__(XEN_CPUID
39118 + : "=a" (eax), "=c" (ecx)
39123 +static inline unsigned int cpuid_edx(unsigned int op)
39125 + unsigned int eax, edx;
39127 + __asm__(XEN_CPUID
39128 + : "=a" (eax), "=d" (edx)
39134 +#define load_cr3(pgdir) write_cr3(__pa(pgdir))
39137 + * Intel CPU features in CR4
39139 +#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
39140 +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
39141 +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
39142 +#define X86_CR4_DE 0x0008 /* enable debugging extensions */
39143 +#define X86_CR4_PSE 0x0010 /* enable page size extensions */
39144 +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
39145 +#define X86_CR4_MCE 0x0040 /* Machine check enable */
39146 +#define X86_CR4_PGE 0x0080 /* enable global pages */
39147 +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
39148 +#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
39149 +#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
39152 + * Save the cr4 feature set we're using (ie
39153 + * Pentium 4MB enable and PPro Global page
39154 + * enable), so that any CPU's that boot up
39155 + * after us can get the correct flags.
39157 +extern unsigned long mmu_cr4_features;
39159 +static inline void set_in_cr4 (unsigned long mask)
39162 + mmu_cr4_features |= mask;
39163 + cr4 = read_cr4();
39168 +static inline void clear_in_cr4 (unsigned long mask)
39171 + mmu_cr4_features &= ~mask;
39172 + cr4 = read_cr4();
39178 + * NSC/Cyrix CPU configuration register indexes
39181 +#define CX86_PCR0 0x20
39182 +#define CX86_GCR 0xb8
39183 +#define CX86_CCR0 0xc0
39184 +#define CX86_CCR1 0xc1
39185 +#define CX86_CCR2 0xc2
39186 +#define CX86_CCR3 0xc3
39187 +#define CX86_CCR4 0xe8
39188 +#define CX86_CCR5 0xe9
39189 +#define CX86_CCR6 0xea
39190 +#define CX86_CCR7 0xeb
39191 +#define CX86_PCR1 0xf0
39192 +#define CX86_DIR0 0xfe
39193 +#define CX86_DIR1 0xff
39194 +#define CX86_ARR_BASE 0xc4
39195 +#define CX86_RCR_BASE 0xdc
39198 + * NSC/Cyrix CPU indexed register access macros
39201 +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
39203 +#define setCx86(reg, data) do { \
39204 + outb((reg), 0x22); \
39205 + outb((data), 0x23); \
39208 +/* Stop speculative execution */
39209 +static inline void sync_core(void)
39212 + asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
39215 +static inline void __monitor(const void *eax, unsigned long ecx,
39216 + unsigned long edx)
39218 + /* "monitor %eax,%ecx,%edx;" */
39220 + ".byte 0x0f,0x01,0xc8;"
39221 + : :"a" (eax), "c" (ecx), "d"(edx));
39224 +static inline void __mwait(unsigned long eax, unsigned long ecx)
39226 + /* "mwait %eax,%ecx;" */
39228 + ".byte 0x0f,0x01,0xc9;"
39229 + : :"a" (eax), "c" (ecx));
39232 +/* From the system description table in the BIOS. Mostly for MCA use,
39233 + but others may find it useful. */
39234 +extern unsigned int machine_id;
39235 +extern unsigned int machine_submodel_id;
39236 +extern unsigned int BIOS_revision;
39237 +extern unsigned int mca_pentium_flag;
39239 +/* Boot loader type from the setup header */
39240 +extern int bootloader_type;
39243 + * User space process size: 3GB (default).
39245 +#define TASK_SIZE (PAGE_OFFSET)
39247 +/* This decides where the kernel will search for a free chunk of vm
39248 + * space during mmap's.
39250 +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
39252 +#define HAVE_ARCH_PICK_MMAP_LAYOUT
39255 + * Size of io_bitmap.
39257 +#define IO_BITMAP_BITS 65536
39258 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
39259 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
39260 +#ifndef CONFIG_X86_NO_TSS
39261 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
39263 +#define INVALID_IO_BITMAP_OFFSET 0x8000
39264 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
39266 +struct i387_fsave_struct {
39274 + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
39275 + long status; /* software status information */
39278 +struct i387_fxsave_struct {
39279 + unsigned short cwd;
39280 + unsigned short swd;
39281 + unsigned short twd;
39282 + unsigned short fop;
39289 + long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
39290 + long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
39291 + long padding[56];
39292 +} __attribute__ ((aligned (16)));
39294 +struct i387_soft_struct {
39302 + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
39303 + unsigned char ftop, changed, lookahead, no_update, rm, alimit;
39304 + struct info *info;
39305 + unsigned long entry_eip;
39308 +union i387_union {
39309 + struct i387_fsave_struct fsave;
39310 + struct i387_fxsave_struct fxsave;
39311 + struct i387_soft_struct soft;
39315 + unsigned long seg;
39318 +struct thread_struct;
39320 +#ifndef CONFIG_X86_NO_TSS
39321 +struct tss_struct {
39322 + unsigned short back_link,__blh;
39323 + unsigned long esp0;
39324 + unsigned short ss0,__ss0h;
39325 + unsigned long esp1;
39326 + unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
39327 + unsigned long esp2;
39328 + unsigned short ss2,__ss2h;
39329 + unsigned long __cr3;
39330 + unsigned long eip;
39331 + unsigned long eflags;
39332 + unsigned long eax,ecx,edx,ebx;
39333 + unsigned long esp;
39334 + unsigned long ebp;
39335 + unsigned long esi;
39336 + unsigned long edi;
39337 + unsigned short es, __esh;
39338 + unsigned short cs, __csh;
39339 + unsigned short ss, __ssh;
39340 + unsigned short ds, __dsh;
39341 + unsigned short fs, __fsh;
39342 + unsigned short gs, __gsh;
39343 + unsigned short ldt, __ldth;
39344 + unsigned short trace, io_bitmap_base;
39346 + * The extra 1 is there because the CPU will access an
39347 + * additional byte beyond the end of the IO permission
39348 + * bitmap. The extra byte must be all 1 bits, and must
39349 + * be within the limit.
39351 + unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
39353 + * Cache the current maximum and the last task that used the bitmap:
39355 + unsigned long io_bitmap_max;
39356 + struct thread_struct *io_bitmap_owner;
39358 + * pads the TSS to be cacheline-aligned (size is 0x100)
39360 + unsigned long __cacheline_filler[35];
39362 + * .. and then another 0x100 bytes for emergency kernel stack
39364 + unsigned long stack[64];
39365 +} __attribute__((packed));
39368 +#define ARCH_MIN_TASKALIGN 16
39370 +struct thread_struct {
39371 +/* cached TLS descriptors. */
39372 + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
39373 + unsigned long esp0;
39374 + unsigned long sysenter_cs;
39375 + unsigned long eip;
39376 + unsigned long esp;
39377 + unsigned long fs;
39378 + unsigned long gs;
39379 +/* Hardware debugging registers */
39380 + unsigned long debugreg[8]; /* %%db0-7 debug registers */
39382 + unsigned long cr2, trap_no, error_code;
39383 +/* floating point info */
39384 + union i387_union i387;
39385 +/* virtual 86 mode info */
39386 + struct vm86_struct __user * vm86_info;
39387 + unsigned long screen_bitmap;
39388 + unsigned long v86flags, v86mask, saved_esp0;
39389 + unsigned int saved_fs, saved_gs;
39390 +/* IO permissions */
39391 + unsigned long *io_bitmap_ptr;
39392 + unsigned long iopl;
39393 +/* max allowed port in the bitmap, in bytes: */
39394 + unsigned long io_bitmap_max;
39397 +#define INIT_THREAD { \
39398 + .vm86_info = NULL, \
39399 + .sysenter_cs = __KERNEL_CS, \
39400 + .io_bitmap_ptr = NULL, \
39403 +#ifndef CONFIG_X86_NO_TSS
39405 + * Note that the .io_bitmap member must be extra-big. This is because
39406 + * the CPU will access an additional byte beyond the end of the IO
39407 + * permission bitmap. The extra byte must be all 1 bits, and must
39408 + * be within the limit.
39410 +#define INIT_TSS { \
39411 + .esp0 = sizeof(init_stack) + (long)&init_stack, \
39412 + .ss0 = __KERNEL_DS, \
39413 + .ss1 = __KERNEL_CS, \
39414 + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
39415 + .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
39418 +static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
39420 + tss->esp0 = thread->esp0;
39421 + /* This can only happen when SEP is enabled, no need to test "SEP"arately */
39422 + if (unlikely(tss->ss1 != thread->sysenter_cs)) {
39423 + tss->ss1 = thread->sysenter_cs;
39424 + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
39427 +#define load_esp0(tss, thread) \
39428 + __load_esp0(tss, thread)
39430 +#define load_esp0(tss, thread) do { \
39431 + if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
39436 +#define start_thread(regs, new_eip, new_esp) do { \
39437 + __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \
39438 + set_fs(USER_DS); \
39439 + regs->xds = __USER_DS; \
39440 + regs->xes = __USER_DS; \
39441 + regs->xss = __USER_DS; \
39442 + regs->xcs = __USER_CS; \
39443 + regs->eip = new_eip; \
39444 + regs->esp = new_esp; \
39448 + * These special macros can be used to get or set a debugging register
39450 +#define get_debugreg(var, register) \
39451 + (var) = HYPERVISOR_get_debugreg((register))
39452 +#define set_debugreg(value, register) \
39453 + WARN_ON(HYPERVISOR_set_debugreg((register), (value)))
39456 + * Set IOPL bits in EFLAGS from given mask
39458 +static inline void set_iopl_mask(unsigned mask)
39460 + struct physdev_set_iopl set_iopl;
39462 + /* Force the change at ring 0. */
39463 + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
39464 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
39467 +/* Forward declaration, a strange C thing */
39468 +struct task_struct;
39471 +/* Free all resources held by a thread. */
39472 +extern void release_thread(struct task_struct *);
39474 +/* Prepare to copy thread state - unlazy all lazy status */
39475 +extern void prepare_to_copy(struct task_struct *tsk);
39478 + * create a kernel thread without removing it from tasklists
39480 +extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
39482 +extern unsigned long thread_saved_pc(struct task_struct *tsk);
39483 +void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack);
39485 +unsigned long get_wchan(struct task_struct *p);
39487 +#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
39488 +#define KSTK_TOP(info) \
39490 + unsigned long *__ptr = (unsigned long *)(info); \
39491 + (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
39495 + * The below -8 is to reserve 8 bytes on top of the ring0 stack.
39496 + * This is necessary to guarantee that the entire "struct pt_regs"
39497 + * is accessible even if the CPU hasn't stored the SS/ESP registers
39498 + * on the stack (interrupt gate does not save these registers
39499 + * when switching to the same priv ring).
39500 + * Therefore beware: accessing the xss/esp fields of the
39501 + * "struct pt_regs" is possible, but they may contain the
39502 + * completely wrong values.
39504 +#define task_pt_regs(task) \
39506 + struct pt_regs *__regs__; \
39507 + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
39511 +#define KSTK_EIP(task) (task_pt_regs(task)->eip)
39512 +#define KSTK_ESP(task) (task_pt_regs(task)->esp)
39515 +struct microcode_header {
39516 + unsigned int hdrver;
39517 + unsigned int rev;
39518 + unsigned int date;
39519 + unsigned int sig;
39520 + unsigned int cksum;
39521 + unsigned int ldrver;
39523 + unsigned int datasize;
39524 + unsigned int totalsize;
39525 + unsigned int reserved[3];
39528 +struct microcode {
39529 + struct microcode_header hdr;
39530 + unsigned int bits[0];
39533 +typedef struct microcode microcode_t;
39534 +typedef struct microcode_header microcode_header_t;
39536 +/* microcode format is extended from prescott processors */
39537 +struct extended_signature {
39538 + unsigned int sig;
39540 + unsigned int cksum;
39543 +struct extended_sigtable {
39544 + unsigned int count;
39545 + unsigned int cksum;
39546 + unsigned int reserved[3];
39547 + struct extended_signature sigs[0];
39550 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
39551 +static inline void rep_nop(void)
39553 + __asm__ __volatile__("rep;nop": : :"memory");
39556 +#define cpu_relax() rep_nop()
39558 +/* generic versions from gas */
39559 +#define GENERIC_NOP1 ".byte 0x90\n"
39560 +#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
39561 +#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
39562 +#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
39563 +#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
39564 +#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
39565 +#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
39566 +#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
39568 +/* Opteron nops */
39569 +#define K8_NOP1 GENERIC_NOP1
39570 +#define K8_NOP2 ".byte 0x66,0x90\n"
39571 +#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
39572 +#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
39573 +#define K8_NOP5 K8_NOP3 K8_NOP2
39574 +#define K8_NOP6 K8_NOP3 K8_NOP3
39575 +#define K8_NOP7 K8_NOP4 K8_NOP3
39576 +#define K8_NOP8 K8_NOP4 K8_NOP4
39579 +/* uses eax dependencies (arbitrary choice) */
39580 +#define K7_NOP1 GENERIC_NOP1
39581 +#define K7_NOP2 ".byte 0x8b,0xc0\n"
39582 +#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
39583 +#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
39584 +#define K7_NOP5 K7_NOP4 ASM_NOP1
39585 +#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
39586 +#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
39587 +#define K7_NOP8 K7_NOP7 ASM_NOP1
39590 +#define ASM_NOP1 K8_NOP1
39591 +#define ASM_NOP2 K8_NOP2
39592 +#define ASM_NOP3 K8_NOP3
39593 +#define ASM_NOP4 K8_NOP4
39594 +#define ASM_NOP5 K8_NOP5
39595 +#define ASM_NOP6 K8_NOP6
39596 +#define ASM_NOP7 K8_NOP7
39597 +#define ASM_NOP8 K8_NOP8
39598 +#elif defined(CONFIG_MK7)
39599 +#define ASM_NOP1 K7_NOP1
39600 +#define ASM_NOP2 K7_NOP2
39601 +#define ASM_NOP3 K7_NOP3
39602 +#define ASM_NOP4 K7_NOP4
39603 +#define ASM_NOP5 K7_NOP5
39604 +#define ASM_NOP6 K7_NOP6
39605 +#define ASM_NOP7 K7_NOP7
39606 +#define ASM_NOP8 K7_NOP8
39608 +#define ASM_NOP1 GENERIC_NOP1
39609 +#define ASM_NOP2 GENERIC_NOP2
39610 +#define ASM_NOP3 GENERIC_NOP3
39611 +#define ASM_NOP4 GENERIC_NOP4
39612 +#define ASM_NOP5 GENERIC_NOP5
39613 +#define ASM_NOP6 GENERIC_NOP6
39614 +#define ASM_NOP7 GENERIC_NOP7
39615 +#define ASM_NOP8 GENERIC_NOP8
39618 +#define ASM_NOP_MAX 8
39620 +/* Prefetch instructions for Pentium III and AMD Athlon */
39621 +/* It's not worth caring about 3dnow! prefetches for the K6,
39622 + because they are microcoded there and very slow.
39623 + However, we don't currently do prefetches for pre-XP Athlons;
39624 + that should be fixed. */
39625 +#define ARCH_HAS_PREFETCH
39626 +static inline void prefetch(const void *x)
39628 + alternative_input(ASM_NOP4,
39629 + "prefetchnta (%1)",
39634 +#define ARCH_HAS_PREFETCH
39635 +#define ARCH_HAS_PREFETCHW
39636 +#define ARCH_HAS_SPINLOCK_PREFETCH
39638 +/* 3dnow! prefetch to get an exclusive cache line. Useful for
39639 + spinlocks to avoid one state transition in the cache coherency protocol. */
39640 +static inline void prefetchw(const void *x)
39642 + alternative_input(ASM_NOP4,
39643 + "prefetchw (%1)",
39644 + X86_FEATURE_3DNOW,
39647 +#define spin_lock_prefetch(x) prefetchw(x)
39649 +extern void select_idle_routine(const struct cpuinfo_x86 *c);
39651 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
39653 +extern unsigned long boot_option_idle_override;
39654 +extern void enable_sep_cpu(void);
39655 +extern int sysenter_setup(void);
39657 +#endif /* __ASM_I386_PROCESSOR_H */
39658 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/segment_32.h
39659 ===================================================================
39660 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
39661 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/segment_32.h 2007-06-12 13:14:02.000000000 +0200
39663 +#ifndef _ASM_SEGMENT_H
39664 +#define _ASM_SEGMENT_H
39667 + * The layout of the per-CPU GDT under Linux:
39674 + * 4 - unused <==== new cacheline
39677 + * ------- start of TLS (Thread-Local Storage) segments:
39679 + * 6 - TLS segment #1 [ glibc's TLS segment ]
39680 + * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
39681 + * 8 - TLS segment #3
39686 + * ------- start of kernel segments:
39688 + * 12 - kernel code segment <==== new cacheline
39689 + * 13 - kernel data segment
39690 + * 14 - default user CS
39691 + * 15 - default user DS
39694 + * 18 - PNPBIOS support (16->32 gate)
39695 + * 19 - PNPBIOS support
39696 + * 20 - PNPBIOS support
39697 + * 21 - PNPBIOS support
39698 + * 22 - PNPBIOS support
39699 + * 23 - APM BIOS support
39700 + * 24 - APM BIOS support
39701 + * 25 - APM BIOS support
39703 + * 26 - ESPFIX small SS
39708 + * 31 - TSS for double fault handler
39710 +#define GDT_ENTRY_TLS_ENTRIES 3
39711 +#define GDT_ENTRY_TLS_MIN 6
39712 +#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
39714 +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
39716 +#define GDT_ENTRY_DEFAULT_USER_CS 14
39717 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
39719 +#define GDT_ENTRY_DEFAULT_USER_DS 15
39720 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
39722 +#define GDT_ENTRY_KERNEL_BASE 12
39724 +#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
39725 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
39726 +#define GET_KERNEL_CS() (__KERNEL_CS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
39728 +#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
39729 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
39730 +#define GET_KERNEL_DS() (__KERNEL_DS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
39732 +#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
39733 +#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
39735 +#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
39736 +#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
39738 +#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
39739 +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
39741 +#define GDT_ENTRY_DOUBLEFAULT_TSS 31
39744 + * The GDT has 32 entries
39746 +#define GDT_ENTRIES 32
39748 +#define GDT_SIZE (GDT_ENTRIES * 8)
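(Aside: a selector is just the GDT index times 8 (TI=0) ORed with the requested privilege level, which is why GET_KERNEL_CS() above ORs in 1 — a 32-bit Xen PV kernel runs in ring 1 unless XENFEAT_supervisor_mode_kernel is set. A standalone check of the arithmetic:)

#include <stdio.h>

/* selector = index * 8 (TI=0 for GDT) | requested privilege level */
static unsigned selector(unsigned index, unsigned rpl)
{
	return index * 8 + rpl;
}

int main(void)
{
	printf("__USER_CS   = 0x%02x\n", selector(14, 3)); /* 0x73 */
	printf("__USER_DS   = 0x%02x\n", selector(15, 3)); /* 0x7b */
	printf("__KERNEL_CS = 0x%02x\n", selector(12, 0)); /* 0x60 */
	return 0;
}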
39750 +/* Simple and small GDT entries for booting only */
39752 +#define GDT_ENTRY_BOOT_CS 2
39753 +#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
39755 +#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
39756 +#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
39758 +/* The PnP BIOS entries in the GDT */
39759 +#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
39760 +#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
39761 +#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
39762 +#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
39763 +#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
39765 +/* The PnP BIOS selectors */
39766 +#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
39767 +#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
39768 +#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
39769 +#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
39770 +#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
39773 + * The interrupt descriptor table has room for 256 entries, while
39774 + * the size of the global descriptor table depends on the number
39775 + * of tasks we can have.
39777 +#define IDT_ENTRIES 256
39780 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/smp_32.h
39781 ===================================================================
39782 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
39783 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/smp_32.h 2007-06-12 13:14:02.000000000 +0200
39785 +#ifndef __ASM_SMP_H
39786 +#define __ASM_SMP_H
39789 + * We need the APIC definitions automatically as part of 'smp.h'
39791 +#ifndef __ASSEMBLY__
39792 +#include <linux/kernel.h>
39793 +#include <linux/threads.h>
39794 +#include <linux/cpumask.h>
39797 +#ifdef CONFIG_X86_LOCAL_APIC
39798 +#ifndef __ASSEMBLY__
39799 +#include <asm/fixmap.h>
39800 +#include <asm/bitops.h>
39801 +#include <asm/mpspec.h>
39802 +#ifdef CONFIG_X86_IO_APIC
39803 +#include <asm/io_apic.h>
39805 +#include <asm/apic.h>
39809 +#define BAD_APICID 0xFFu
39811 +#ifndef __ASSEMBLY__
39814 + * Private routines/data
39817 +extern void smp_alloc_memory(void);
39818 +extern int pic_mode;
39819 +extern int smp_num_siblings;
39820 +extern cpumask_t cpu_sibling_map[];
39821 +extern cpumask_t cpu_core_map[];
39823 +extern void (*mtrr_hook) (void);
39824 +extern void zap_low_mappings (void);
39825 +extern void lock_ipi_call_lock(void);
39826 +extern void unlock_ipi_call_lock(void);
39828 +#define MAX_APICID 256
39829 +extern u8 x86_cpu_to_apicid[];
39831 +#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
39833 +#ifdef CONFIG_HOTPLUG_CPU
39834 +extern void cpu_exit_clear(void);
39835 +extern void cpu_uninit(void);
39839 + * This function is needed by all SMP systems. It must _always_ be valid
39840 + * from the initial startup. We map APIC_BASE very early in page_setup(),
39841 + * so this is correct in the x86 case.
39843 +#define raw_smp_processor_id() (current_thread_info()->cpu)
39845 +extern cpumask_t cpu_possible_map;
39846 +#define cpu_callin_map cpu_possible_map
39848 +/* We don't mark CPUs online until __cpu_up(), so we need another measure */
39849 +static inline int num_booting_cpus(void)
39851 + return cpus_weight(cpu_possible_map);
39854 +#ifdef CONFIG_X86_LOCAL_APIC
39856 +#ifdef APIC_DEFINITION
39857 +extern int hard_smp_processor_id(void);
39859 +#include <mach_apicdef.h>
39860 +static inline int hard_smp_processor_id(void)
39862 + /* we don't want to mark this access volatile - bad code generation */
39863 + return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
39867 +static __inline int logical_smp_processor_id(void)
39869 + /* we don't want to mark this access volatile - bad code generation */
39870 + return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
39875 +extern int __cpu_disable(void);
39876 +extern void __cpu_die(unsigned int cpu);
39877 +extern void prefill_possible_map(void);
39878 +#endif /* !__ASSEMBLY__ */
39880 +#else /* CONFIG_SMP */
39882 +#define cpu_physical_id(cpu) boot_cpu_physical_apicid
39884 +#define NO_PROC_ID 0xFF /* No processor magic marker */
39888 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/swiotlb_32.h
39889 ===================================================================
39890 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
39891 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/swiotlb_32.h 2007-06-12 13:14:02.000000000 +0200
39893 +#ifndef _ASM_SWIOTLB_H
39894 +#define _ASM_SWIOTLB_H 1
39896 +/* SWIOTLB interface */
39898 +extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, size_t size,
39900 +extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
39901 + size_t size, int dir);
39902 +extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
39903 + dma_addr_t dev_addr,
39904 + size_t size, int dir);
39905 +extern void swiotlb_sync_single_for_device(struct device *hwdev,
39906 + dma_addr_t dev_addr,
39907 + size_t size, int dir);
39908 +extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
39909 + struct scatterlist *sg, int nelems,
39911 +extern void swiotlb_sync_sg_for_device(struct device *hwdev,
39912 + struct scatterlist *sg, int nelems,
39914 +extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
39915 + int nents, int direction);
39916 +extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
39917 + int nents, int direction);
39918 +extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr);
39919 +#ifdef CONFIG_HIGHMEM
39920 +extern dma_addr_t swiotlb_map_page(struct device *hwdev, struct page *page,
39921 + unsigned long offset, size_t size,
39922 + enum dma_data_direction direction);
39923 +extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
39924 + size_t size, enum dma_data_direction direction);
39926 +extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
39927 +extern void swiotlb_init(void);
39929 +#ifdef CONFIG_SWIOTLB
39930 +extern int swiotlb;
39936 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/synch_bitops.h
39937 ===================================================================
39938 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
39939 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/synch_bitops.h 2008-04-02 12:34:02.000000000 +0200
39941 +#ifndef __XEN_SYNCH_BITOPS_H__
39942 +#define __XEN_SYNCH_BITOPS_H__
39945 + * Copyright 1992, Linus Torvalds.
39946 + * Heavily modified to provide guaranteed strong synchronisation
39947 + * when communicating with Xen or other guest OSes running on other CPUs.
39950 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
39951 +#include <xen/platform-compat.h>
39954 +#define ADDR (*(volatile long *) addr)
39956 +static __inline__ void synch_set_bit(int nr, volatile void * addr)
39958 + __asm__ __volatile__ (
39959 + "lock btsl %1,%0"
39960 + : "+m" (ADDR) : "Ir" (nr) : "memory" );
39963 +static __inline__ void synch_clear_bit(int nr, volatile void * addr)
39965 + __asm__ __volatile__ (
39966 + "lock btrl %1,%0"
39967 + : "+m" (ADDR) : "Ir" (nr) : "memory" );
39970 +static __inline__ void synch_change_bit(int nr, volatile void * addr)
39972 + __asm__ __volatile__ (
39973 + "lock btcl %1,%0"
39974 + : "+m" (ADDR) : "Ir" (nr) : "memory" );
39977 +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
39980 + __asm__ __volatile__ (
39981 + "lock btsl %2,%1\n\tsbbl %0,%0"
39982 + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
39986 +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
39989 + __asm__ __volatile__ (
39990 + "lock btrl %2,%1\n\tsbbl %0,%0"
39991 + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
39995 +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
39999 + __asm__ __volatile__ (
40000 + "lock btcl %2,%1\n\tsbbl %0,%0"
40001 + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
40005 +struct __synch_xchg_dummy { unsigned long a[100]; };
40006 +#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x))
40008 +#define synch_cmpxchg(ptr, old, new) \
40009 +((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\
40010 + (unsigned long)(old), \
40011 + (unsigned long)(new), \
40014 +static inline unsigned long __synch_cmpxchg(volatile void *ptr,
40015 + unsigned long old,
40016 + unsigned long new, int size)
40018 + unsigned long prev;
40021 + __asm__ __volatile__("lock; cmpxchgb %b1,%2"
40023 + : "q"(new), "m"(*__synch_xg(ptr)),
40028 + __asm__ __volatile__("lock; cmpxchgw %w1,%2"
40030 + : "r"(new), "m"(*__synch_xg(ptr)),
40034 +#ifdef CONFIG_X86_64
40036 + __asm__ __volatile__("lock; cmpxchgl %k1,%2"
40038 + : "r"(new), "m"(*__synch_xg(ptr)),
40043 + __asm__ __volatile__("lock; cmpxchgq %1,%2"
40045 + : "r"(new), "m"(*__synch_xg(ptr)),
40051 + __asm__ __volatile__("lock; cmpxchgl %1,%2"
40053 + : "r"(new), "m"(*__synch_xg(ptr)),
40062 +#define synch_test_bit test_bit
40064 +#define synch_cmpxchg_subword synch_cmpxchg
40066 +#endif /* __XEN_SYNCH_BITOPS_H__ */
40067 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/system_32.h
40068 ===================================================================
40069 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40070 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/system_32.h 2007-06-12 13:14:02.000000000 +0200
40072 +#ifndef __ASM_SYSTEM_H
40073 +#define __ASM_SYSTEM_H
40075 +#include <linux/kernel.h>
40076 +#include <asm/segment.h>
40077 +#include <asm/cpufeature.h>
40078 +#include <linux/bitops.h> /* for LOCK_PREFIX */
40079 +#include <asm/synch_bitops.h>
40080 +#include <asm/hypervisor.h>
40084 +struct task_struct; /* one of the stranger aspects of C forward declarations.. */
40085 +extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
40088 + * Saving eflags is important. It not only switches IOPL between tasks,
40089 + * it also protects other tasks from NT leaking through sysenter etc.
40091 +#define switch_to(prev,next,last) do { \
40092 + unsigned long esi,edi; \
40093 + asm volatile("pushfl\n\t" /* Save flags */ \
40094 + "pushl %%ebp\n\t" \
40095 + "movl %%esp,%0\n\t" /* save ESP */ \
40096 + "movl %5,%%esp\n\t" /* restore ESP */ \
40097 + "movl $1f,%1\n\t" /* save EIP */ \
40098 + "pushl %6\n\t" /* restore EIP */ \
40099 + "jmp __switch_to\n" \
40101 + "popl %%ebp\n\t" \
40103 + :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \
40104 + "=a" (last),"=S" (esi),"=D" (edi) \
40105 + :"m" (next->thread.esp),"m" (next->thread.eip), \
40106 + "2" (prev), "d" (next)); \
40109 +#define _set_base(addr,base) do { unsigned long __pr; \
40110 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
40111 + "rorl $16,%%edx\n\t" \
40112 + "movb %%dl,%2\n\t" \
40115 + :"m" (*((addr)+2)), \
40116 + "m" (*((addr)+4)), \
40117 + "m" (*((addr)+7)), \
40121 +#define _set_limit(addr,limit) do { unsigned long __lr; \
40122 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
40123 + "rorl $16,%%edx\n\t" \
40124 + "movb %2,%%dh\n\t" \
40125 + "andb $0xf0,%%dh\n\t" \
40126 + "orb %%dh,%%dl\n\t" \
40129 + :"m" (*(addr)), \
40130 + "m" (*((addr)+6)), \
40134 +#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
40135 +#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
40138 + * Load a segment. Fall back on loading the zero
40139 + * segment if something goes wrong..
40141 +#define loadsegment(seg,value) \
40142 + asm volatile("\n" \
40144 + "mov %0,%%" #seg "\n" \
40146 + ".section .fixup,\"ax\"\n" \
40149 + "popl %%" #seg "\n\t" \
40152 + ".section __ex_table,\"a\"\n\t" \
40154 + ".long 1b,3b\n" \
40159 + * Save a segment register away
40161 +#define savesegment(seg, value) \
40162 + asm volatile("mov %%" #seg ",%0":"=rm" (value))
40164 +#define read_cr0() ({ \
40165 + unsigned int __dummy; \
40166 + __asm__ __volatile__( \
40167 + "movl %%cr0,%0\n\t" \
40168 + :"=r" (__dummy)); \
40171 +#define write_cr0(x) \
40172 + __asm__ __volatile__("movl %0,%%cr0": :"r" (x))
40174 +#define read_cr2() (current_vcpu_info()->arch.cr2)
40175 +#define write_cr2(x) \
40176 + __asm__ __volatile__("movl %0,%%cr2": :"r" (x))
40178 +#define read_cr3() ({ \
40179 + unsigned int __dummy; \
40181 + "movl %%cr3,%0\n\t" \
40182 + :"=r" (__dummy)); \
40183 + __dummy = xen_cr3_to_pfn(__dummy); \
40184 + mfn_to_pfn(__dummy) << PAGE_SHIFT; \
40186 +#define write_cr3(x) ({ \
40187 + unsigned int __dummy = pfn_to_mfn((x) >> PAGE_SHIFT); \
40188 + __dummy = xen_pfn_to_cr3(__dummy); \
40189 + __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy)); \
40191 +#define read_cr4() ({ \
40192 + unsigned int __dummy; \
40194 + "movl %%cr4,%0\n\t" \
40195 + :"=r" (__dummy)); \
40198 +#define read_cr4_safe() ({ \
40199 + unsigned int __dummy; \
40200 + /* This could fault if %cr4 does not exist */ \
40201 + __asm__("1: movl %%cr4, %0 \n" \
40203 + ".section __ex_table,\"a\" \n" \
40204 + ".long 1b,2b \n" \
40206 + : "=r" (__dummy): "0" (0)); \
40210 +#define write_cr4(x) \
40211 + __asm__ __volatile__("movl %0,%%cr4": :"r" (x))
40214 + * Clear and set 'TS' bit respectively
40216 +#define clts() (HYPERVISOR_fpu_taskswitch(0))
40217 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
40219 +#endif /* __KERNEL__ */
40221 +#define wbinvd() \
40222 + __asm__ __volatile__ ("wbinvd": : :"memory")
40224 +static inline unsigned long get_limit(unsigned long segment)
40226 + unsigned long __limit;
40227 + __asm__("lsll %1,%0"
40228 + :"=r" (__limit):"r" (segment));
40229 + return __limit+1;
40232 +#define nop() __asm__ __volatile__ ("nop")
40234 +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
40236 +#define tas(ptr) (xchg((ptr),1))
40238 +struct __xchg_dummy { unsigned long a[100]; };
40239 +#define __xg(x) ((struct __xchg_dummy *)(x))
40242 +#ifdef CONFIG_X86_CMPXCHG64
40245 + * The semantics of CMPXCHG8B are a bit strange; this is why
40246 + * there is a loop and the loading of %%eax and %%edx has to
40247 + * be inside. This inlines well in most cases, the cached
40248 + * cost is around ~38 cycles. (in the future we might want
40249 + * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
40250 + * might have an implicit FPU-save as a cost, so it's not
40251 + * clear which path to go.)
40253 + * cmpxchg8b must be used with the lock prefix here to allow
40254 + * the instruction to be executed atomically, see page 3-102
40255 + * of the instruction set reference 24319102.pdf. We need
40256 + * the reader side to see the coherent 64bit value.
40258 +static inline void __set_64bit (unsigned long long * ptr,
40259 + unsigned int low, unsigned int high)
40261 + __asm__ __volatile__ (
40263 + "movl (%0), %%eax\n\t"
40264 + "movl 4(%0), %%edx\n\t"
40265 + "lock cmpxchg8b (%0)\n\t"
40267 + : /* no outputs */
40271 + : "ax","dx","memory");
40274 +static inline void __set_64bit_constant (unsigned long long *ptr,
40275 + unsigned long long value)
40277 + __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL));
40279 +#define ll_low(x) *(((unsigned int*)&(x))+0)
40280 +#define ll_high(x) *(((unsigned int*)&(x))+1)
40282 +static inline void __set_64bit_var (unsigned long long *ptr,
40283 + unsigned long long value)
40285 + __set_64bit(ptr,ll_low(value), ll_high(value));
40288 +#define set_64bit(ptr,value) \
40289 +(__builtin_constant_p(value) ? \
40290 + __set_64bit_constant(ptr, value) : \
40291 + __set_64bit_var(ptr, value) )
40293 +#define _set_64bit(ptr,value) \
40294 +(__builtin_constant_p(value) ? \
40295 + __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \
40296 + __set_64bit(ptr, ll_low(value), ll_high(value)) )
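(Aside, a C model of __set_64bit()'s cmpxchg8b loop: keep re-reading the current value and retry a full 64-bit compare-and-swap until it succeeds, so readers always observe a coherent 64-bit value. Sketch using the GCC builtin; assumes a cmpxchg8b-capable CPU:)

#include <stdint.h>
#include <stdio.h>

static void set_64bit_model(volatile uint64_t *ptr, uint64_t value)
{
	uint64_t old;
	do {
		old = *ptr;  /* reload both halves before each attempt */
	} while (!__sync_bool_compare_and_swap(ptr, old, value));
}

int main(void)
{
	uint64_t v = 0x1122334455667788ULL;
	set_64bit_model(&v, 0xdeadbeefcafef00dULL);
	printf("%llx\n", (unsigned long long)v);
	return 0;
}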
40301 + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
40302 + * Note 2: xchg has side effect, so that attribute volatile is necessary,
40303 + * but generally the primitive is invalid, *ptr is output argument. --ANK
40305 +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
40309 + __asm__ __volatile__("xchgb %b0,%1"
40311 + :"m" (*__xg(ptr)), "0" (x)
40315 + __asm__ __volatile__("xchgw %w0,%1"
40317 + :"m" (*__xg(ptr)), "0" (x)
40321 + __asm__ __volatile__("xchgl %0,%1"
40323 + :"m" (*__xg(ptr)), "0" (x)
40331 + * Atomic compare and exchange. Compare OLD with MEM, if identical,
40332 + * store NEW in MEM. Return the initial value in MEM. Success is
40333 + * indicated by comparing RETURN with OLD.
40336 +#ifdef CONFIG_X86_CMPXCHG
40337 +#define __HAVE_ARCH_CMPXCHG 1
40338 +#define cmpxchg(ptr,o,n)\
40339 + ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
40340 + (unsigned long)(n),sizeof(*(ptr))))
40343 +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
40344 + unsigned long new, int size)
40346 + unsigned long prev;
40349 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
40351 + : "q"(new), "m"(*__xg(ptr)), "0"(old)
40355 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
40357 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
40361 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
40363 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
40370 +#ifndef CONFIG_X86_CMPXCHG
40372 + * Building a kernel capable of running on an 80386. It may be necessary to
40373 + * simulate the cmpxchg on the 80386 CPU. For that purpose we define
40374 + * a function for each of the sizes we support.
40377 +extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
40378 +extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
40379 +extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
40381 +static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
40382 + unsigned long new, int size)
40386 + return cmpxchg_386_u8(ptr, old, new);
40388 + return cmpxchg_386_u16(ptr, old, new);
40390 + return cmpxchg_386_u32(ptr, old, new);
40395 +#define cmpxchg(ptr,o,n) \
40397 + __typeof__(*(ptr)) __ret; \
40398 + if (likely(boot_cpu_data.x86 > 3)) \
40399 + __ret = __cmpxchg((ptr), (unsigned long)(o), \
40400 + (unsigned long)(n), sizeof(*(ptr))); \
40402 + __ret = cmpxchg_386((ptr), (unsigned long)(o), \
40403 + (unsigned long)(n), sizeof(*(ptr))); \
40408 +#ifdef CONFIG_X86_CMPXCHG64
40410 +static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old,
40411 + unsigned long long new)
40413 + unsigned long long prev;
40414 + __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
40416 + : "b"((unsigned long)new),
40417 + "c"((unsigned long)(new >> 32)),
40424 +#define cmpxchg64(ptr,o,n)\
40425 + ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
40426 + (unsigned long long)(n)))
40431 + * Force strict CPU ordering.
40432 + * And yes, this is required on UP too when we're talking
40433 + * to devices.
40435 + * For now, "wmb()" doesn't actually do anything, as all
40436 + * Intel CPU's follow what Intel calls a *Processor Order*,
40437 + * in which all writes are seen in the program order even
40438 + * outside the CPU.
40440 + * I expect future Intel CPU's to have a weaker ordering,
40441 + * but I'd also expect them to finally get their act together
40442 + * and add some real memory barriers if so.
40444 + * Some non intel clones support out of order store. wmb() ceases to be a
40450 + * Actually only lfence would be needed for mb() because all stores done
40451 + * by the kernel should be already ordered. But keep a full barrier for now.
40454 +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
40455 +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
40458 + * read_barrier_depends - Flush all pending reads that subsequent reads
40459 + * depend on.
40461 + * No data-dependent reads from memory-like regions are ever reordered
40462 + * over this barrier. All reads preceding this primitive are guaranteed
40463 + * to access memory (but not necessarily other CPUs' caches) before any
40464 + * reads following this primitive that depend on the data returned by
40465 + * any of the preceding reads. This primitive is much lighter weight than
40466 + * rmb() on most CPUs, and is never heavier weight than is
40467 + * rmb().
40469 + * These ordering constraints are respected by both the local CPU
40470 + * and the compiler.
40472 + * Ordering is not guaranteed by anything other than these primitives,
40473 + * not even by data dependencies. See the documentation for
40474 + * memory_barrier() for examples and URLs to more information.
40476 + * For example, the following code would force ordering (the initial
40477 + * value of "a" is zero, "b" is one, and "p" is "&a"):
40479 + * <programlisting>
40480 + *	CPU 0				CPU 1
40481 + *
40482 + *	b = 2;
40483 + *	memory_barrier();
40484 + *	p = &b;				q = p;
40485 + *					read_barrier_depends();
40486 + *					d = *q;
40487 + * </programlisting>
40489 + * because the read of "*q" depends on the read of "p" and these
40490 + * two reads are separated by a read_barrier_depends(). However,
40491 + * the following code, with the same initial values for "a" and "b":
40493 + * <programlisting>
40494 + *	CPU 0				CPU 1
40495 + *
40496 + *	a = 2;
40497 + *	memory_barrier();
40498 + *	b = 3;				y = b;
40499 + *					read_barrier_depends();
40500 + *					x = a;
40501 + * </programlisting>
40503 + * does not enforce ordering, since there is no data dependency between
40504 + * the read of "a" and the read of "b". Therefore, on some CPUs, such
40505 + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
40506 + * in cases like this where there are no data dependencies.
40507 + **/
40509 +#define read_barrier_depends() do { } while(0)
40511 +#ifdef CONFIG_X86_OOSTORE
40512 +/* Actually there are no OOO store capable CPUs for now that do SSE,
40513 +   but make it already a possibility. */
40514 +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
40516 +#define wmb() __asm__ __volatile__ ("": : :"memory")
40520 +#define smp_mb() mb()
40521 +#define smp_rmb() rmb()
40522 +#define smp_wmb() wmb()
40523 +#define smp_read_barrier_depends() read_barrier_depends()
40524 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
40526 +#define smp_mb() barrier()
40527 +#define smp_rmb() barrier()
40528 +#define smp_wmb() barrier()
40529 +#define smp_read_barrier_depends() do { } while(0)
40530 +#define set_mb(var, value) do { var = value; barrier(); } while (0)
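Usage note (not part of the patch): the smp_*() variants are meant to be used in pairs. A minimal producer/consumer sketch, with illustrative names:

	static int payload;		/* illustrative shared data */
	static int ready;		/* illustrative flag */

	static void producer(void)
	{
		payload = 42;		/* publish the data first */
		smp_wmb();		/* order payload store before flag store */
		ready = 1;
	}

	static void consumer(void)
	{
		if (ready) {
			smp_rmb();	/* order flag read before payload read */
			BUG_ON(payload != 42);
		}
	}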
40533 +#include <linux/irqflags.h>
40536 + * disable hlt during certain critical i/o operations
40538 +#define HAVE_DISABLE_HLT
40539 +void disable_hlt(void);
40540 +void enable_hlt(void);
40542 +extern int es7000_plat;
40543 +void cpu_idle_wait(void);
40546 + * On SMP systems, when the scheduler does migration-cost autodetection,
40547 + * it needs a way to flush as much of the CPU's caches as possible:
40549 +static inline void sched_cacheflush(void)
40550 +{
40551 +	wbinvd();
40552 +}
40554 +extern unsigned long arch_align_stack(unsigned long sp);
40555 +extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
40557 +void default_idle(void);
40560 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_32.h
40561 ===================================================================
40562 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40563 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_32.h 2007-11-26 16:59:25.000000000 +0100
40565 +#ifndef _I386_TLBFLUSH_H
40566 +#define _I386_TLBFLUSH_H
40568 +#include <linux/mm.h>
40569 +#include <asm/processor.h>
40571 +#define __flush_tlb() xen_tlb_flush()
40572 +#define __flush_tlb_global() xen_tlb_flush()
40573 +#define __flush_tlb_all() xen_tlb_flush()
40575 +extern unsigned long pgkern_mask;
40577 +#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
40579 +#define __flush_tlb_single(addr) xen_invlpg(addr)
40581 +#define __flush_tlb_one(addr) __flush_tlb_single(addr)
40586 + * - flush_tlb() flushes the current mm struct TLBs
40587 + * - flush_tlb_all() flushes all processes TLBs
40588 + * - flush_tlb_mm(mm) flushes the specified mm context TLB's
40589 + * - flush_tlb_page(vma, vmaddr) flushes one page
40590 + * - flush_tlb_range(vma, start, end) flushes a range of pages
40591 + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
40592 + * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
40594 + * ..but the i386 has somewhat limited tlb flushing capabilities,
40595 + * and page-granular flushes are available only on i486 and up.
40598 +#ifndef CONFIG_SMP
40600 +#define flush_tlb() __flush_tlb()
40601 +#define flush_tlb_all() __flush_tlb_all()
40602 +#define local_flush_tlb() __flush_tlb()
40604 +static inline void flush_tlb_mm(struct mm_struct *mm)
40605 +{
40606 +	if (mm == current->active_mm)
40607 +		__flush_tlb();
40608 +}
40609 +
40610 +static inline void flush_tlb_page(struct vm_area_struct *vma,
40611 +	unsigned long addr)
40612 +{
40613 +	if (vma->vm_mm == current->active_mm)
40614 +		__flush_tlb_one(addr);
40615 +}
40616 +
40617 +static inline void flush_tlb_range(struct vm_area_struct *vma,
40618 +	unsigned long start, unsigned long end)
40619 +{
40620 +	if (vma->vm_mm == current->active_mm)
40621 +		__flush_tlb();
40622 +}
40626 +#include <asm/smp.h>
40628 +#define local_flush_tlb() \
40631 +#define flush_tlb_all xen_tlb_flush_all
40632 +#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
40633 +#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
40634 +#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
40636 +#define flush_tlb() flush_tlb_current_task()
40638 +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
40639 +{
40640 +	flush_tlb_mm(vma->vm_mm);
40641 +}
40643 +#define TLBSTATE_OK 1
40644 +#define TLBSTATE_LAZY 2
40646 +struct tlb_state
40647 +{
40648 +	struct mm_struct *active_mm;
40649 +	int state;
40650 +	char __cacheline_padding[L1_CACHE_BYTES-8];
40651 +};
40652 +DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
40657 +#define flush_tlb_kernel_range(start, end) flush_tlb_all()
40659 +static inline void flush_tlb_pgtables(struct mm_struct *mm,
40660 +				      unsigned long start, unsigned long end)
40661 +{
40662 +	/* i386 does not keep any page table caches in TLB */
40663 +}
40665 +#endif /* _I386_TLBFLUSH_H */
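Usage note (not part of the patch): after modifying a single user PTE, callers are expected to invalidate just that translation. A sketch, assuming a set_pte_at()-style helper:

	/* Illustrative only: on SMP this becomes xen_invlpg_mask() over
	 * the mm's cpu_vm_mask; on UP it is a local xen_invlpg(). */
	static void update_one_pte(struct vm_area_struct *vma, unsigned long addr,
				   pte_t *ptep, pte_t newpte)
	{
		set_pte_at(vma->vm_mm, addr, ptep, newpte);
		flush_tlb_page(vma, addr);
	}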
40666 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/vga.h
40667 ===================================================================
40668 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40669 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/vga.h 2007-06-12 13:14:02.000000000 +0200
40672 + * Access to VGA videoram
40674 + * (c) 1998 Martin Mares <mj@ucw.cz>
40677 +#ifndef _LINUX_ASM_VGA_H_
40678 +#define _LINUX_ASM_VGA_H_
40681 + * On the PC, we can just recalculate addresses and then
40682 + * access the videoram directly without any black magic.
40685 +#define VGA_MAP_MEM(x,s) (unsigned long)isa_bus_to_virt(x)
40687 +#define vga_readb(x) (*(x))
40688 +#define vga_writeb(x,y) (*(y) = (x))
40691 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/xenoprof.h
40692 ===================================================================
40693 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40694 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/xenoprof.h 2007-06-12 13:14:02.000000000 +0200
40696 +/******************************************************************************
40697 + * asm-i386/mach-xen/asm/xenoprof.h
40699 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
40700 + * VA Linux Systems Japan K.K.
40702 + * This program is free software; you can redistribute it and/or modify
40703 + * it under the terms of the GNU General Public License as published by
40704 + * the Free Software Foundation; either version 2 of the License, or
40705 + * (at your option) any later version.
40707 + * This program is distributed in the hope that it will be useful,
40708 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
40709 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
40710 + * GNU General Public License for more details.
40712 + * You should have received a copy of the GNU General Public License
40713 + * along with this program; if not, write to the Free Software
40714 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
40717 +#ifndef __ASM_XENOPROF_H__
40718 +#define __ASM_XENOPROF_H__
40721 +struct super_block;
40723 +int xenoprof_create_files(struct super_block * sb, struct dentry * root);
40724 +#define HAVE_XENOPROF_CREATE_FILES
40726 +struct xenoprof_init;
40727 +void xenoprof_arch_init_counter(struct xenoprof_init *init);
40728 +void xenoprof_arch_counter(void);
40729 +void xenoprof_arch_start(void);
40730 +void xenoprof_arch_stop(void);
40732 +struct xenoprof_arch_shared_buffer {
40735 +struct xenoprof_shared_buffer;
40736 +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf);
40737 +struct xenoprof_get_buffer;
40738 +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf);
40739 +struct xenoprof_passive;
40740 +int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf);
40742 +#endif /* CONFIG_XEN */
40743 +#endif /* __ASM_XENOPROF_H__ */
40744 Index: head-2008-11-25/include/asm-x86/mach-xen/irq_vectors.h
40745 ===================================================================
40746 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40747 +++ head-2008-11-25/include/asm-x86/mach-xen/irq_vectors.h 2008-09-25 13:55:32.000000000 +0200
40750 + * This file should contain #defines for all of the interrupt vector
40751 + * numbers used by this architecture.
40753 + * In addition, there are some standard defines:
40755 + * FIRST_EXTERNAL_VECTOR:
40756 + * The first free place for external interrupts
40758 + * SYSCALL_VECTOR:
40759 + * The IRQ vector a syscall makes the user to kernel transition
40760 + * under.
40761 + *
40762 + * TIMER_IRQ:
40763 + * The IRQ number the timer interrupt comes in at.
40764 + *
40765 + * NR_IRQS:
40766 + * The total number of interrupt vectors (including all the
40767 + * architecture specific interrupts) needed.
40770 +#ifndef _ASM_IRQ_VECTORS_H
40771 +#define _ASM_IRQ_VECTORS_H
40774 + * IDT vectors usable for external interrupt sources start
40777 +#define FIRST_EXTERNAL_VECTOR 0x20
40779 +#define SYSCALL_VECTOR 0x80
40782 + * Vectors 0x20-0x2f are used for ISA interrupts.
40787 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
40789 + * some of the following vectors are 'rare', they are merged
40790 + * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
40791 + * TLB, reschedule and local APIC vectors are performance-critical.
40793 + * Vectors 0xf0-0xfa are free (reserved for future Linux use).
40795 +#define SPURIOUS_APIC_VECTOR 0xff
40796 +#define ERROR_APIC_VECTOR 0xfe
40797 +#define INVALIDATE_TLB_VECTOR 0xfd
40798 +#define RESCHEDULE_VECTOR 0xfc
40799 +#define CALL_FUNCTION_VECTOR 0xfb
40801 +#define THERMAL_APIC_VECTOR 0xf0
40803 + * Local APIC timer IRQ vector is on a different priority level,
40804 + * to work around the 'lost local interrupt if more than 2 IRQ
40805 + * sources per level' errata.
40807 +#define LOCAL_TIMER_VECTOR 0xef
40810 +#define SPURIOUS_APIC_VECTOR 0xff
40811 +#define ERROR_APIC_VECTOR 0xfe
40814 + * First APIC vector available to drivers: (vectors 0x30-0xee)
40815 + * we start at 0x31 to spread out vectors evenly between priority
40816 + * levels. (0x80 is the syscall vector)
40818 +#define FIRST_DEVICE_VECTOR 0x31
40819 +#define FIRST_SYSTEM_VECTOR 0xef
40822 + * 16 8259A IRQ's, 208 potential APIC interrupt sources.
40823 + * Right now the APIC is mostly only used for SMP.
40824 + * 256 vectors is an architectural limit. (we can have
40825 + * more than 256 devices theoretically, but they will
40826 + * have to use shared interrupts)
40827 + * Since vectors 0x00-0x1f are used/reserved for the CPU,
40828 + * the usable vector space is 0x20-0xff (224 vectors)
40831 +#define RESCHEDULE_VECTOR 0
40832 +#define CALL_FUNCTION_VECTOR 1
40836 + * The maximum number of vectors supported by i386 processors
40837 + * is limited to 256. For processors other than i386, NR_VECTORS
40838 + * should be changed accordingly.
40840 +#define NR_VECTORS 256
40842 +#define FPU_IRQ 13
40844 +#define FIRST_VM86_IRQ 3
40845 +#define LAST_VM86_IRQ 15
40846 +#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
40849 + * The flat IRQ space is divided into two regions:
40850 + * 1. A one-to-one mapping of real physical IRQs. This space is only used
40851 + * if we have physical device-access privilege. This region is at the
40852 + * start of the IRQ space so that existing device drivers do not need
40853 + * to be modified to translate physical IRQ numbers into our IRQ space.
40854 + * 2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
40855 + * are bound using the provided bind/unbind functions.
40858 +#define PIRQ_BASE 0
40859 +#if !defined(MAX_IO_APICS)
40860 +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
40861 +#elif NR_CPUS < MAX_IO_APICS
40862 +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
40864 +# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
40867 +#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
40868 +#define NR_DYNIRQS 256
40870 +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
40871 +#define NR_IRQ_VECTORS NR_IRQS
40873 +#endif /* _ASM_IRQ_VECTORS_H */
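To make the sizing concrete, a worked example (values illustrative: NR_VECTORS = 256 and a hypothetical NR_CPUS = 8 build with MAX_IO_APICS undefined):

	/* NR_PIRQS    = NR_VECTORS + 32 * NR_CPUS = 256 + 32 * 8 = 512
	 * DYNIRQ_BASE = PIRQ_BASE + NR_PIRQS     = 0 + 512       = 512
	 * NR_IRQS     = NR_PIRQS + NR_DYNIRQS    = 512 + 256     = 768
	 * so physical IRQs occupy [0, 512) and dynamically bound
	 * event-channel IRQs occupy [512, 768). */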
40874 Index: head-2008-11-25/include/asm-x86/mach-xen/mach_traps.h
40875 ===================================================================
40876 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40877 +++ head-2008-11-25/include/asm-x86/mach-xen/mach_traps.h 2007-06-12 13:14:02.000000000 +0200
40880 + * include/asm-xen/asm-i386/mach-xen/mach_traps.h
40882 + * Machine specific NMI handling for Xen
40884 +#ifndef _MACH_TRAPS_H
40885 +#define _MACH_TRAPS_H
40887 +#include <linux/bitops.h>
40888 +#include <xen/interface/nmi.h>
40890 +static inline void clear_mem_error(unsigned char reason) {}
40891 +static inline void clear_io_check_error(unsigned char reason) {}
40893 +static inline unsigned char get_nmi_reason(void)
40895 + shared_info_t *s = HYPERVISOR_shared_info;
40896 + unsigned char reason = 0;
40898 +	/* construct a value which looks like it came from
40899 +	 * port 0x61 when there is an NMI */
40900 +
40901 +	if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
40902 +		reason |= 0x40;
40903 +	if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
40904 +		reason |= 0x80;
40905 +
40906 +	return reason;
40907 +}
40909 +static inline void reassert_nmi(void) {}
40911 +#endif /* !_MACH_TRAPS_H */
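The synthesized value deliberately mimics the legacy system control port B (0x61) layout, so a generic NMI path can decode it unchanged. A hedged sketch of such a consumer (the handler name and messages are illustrative):

	static void handle_nmi_reason(unsigned char reason)
	{
		if (reason & 0x80)	/* bit 7: memory parity error */
			printk(KERN_EMERG "NMI: parity error\n");
		if (reason & 0x40)	/* bit 6: I/O channel check */
			printk(KERN_EMERG "NMI: I/O check error\n");
	}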
40912 Index: head-2008-11-25/include/asm-x86/mach-xen/setup_arch.h
40913 ===================================================================
40914 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40915 +++ head-2008-11-25/include/asm-x86/mach-xen/setup_arch.h 2007-06-12 13:14:02.000000000 +0200
40917 +/* Hook to call BIOS initialisation function */
40919 +#define ARCH_SETUP machine_specific_arch_setup();
40921 +void __init machine_specific_arch_setup(void);
40922 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/desc_64.h
40923 ===================================================================
40924 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40925 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/desc_64.h 2008-01-28 12:24:19.000000000 +0100
40927 +/* Written 2000 by Andi Kleen */
40928 +#ifndef __ARCH_DESC_H
40929 +#define __ARCH_DESC_H
40931 +#include <linux/threads.h>
40932 +#include <asm/ldt.h>
40934 +#ifndef __ASSEMBLY__
40936 +#include <linux/string.h>
40937 +#include <linux/smp.h>
40939 +#include <asm/segment.h>
40940 +#include <asm/mmu.h>
40942 +// 8 byte segment descriptor
40943 +struct desc_struct {
40946 + unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
40947 + unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
40948 +} __attribute__((packed));
40950 +struct n_desc_struct {
40951 + unsigned int a,b;
40955 + GATE_INTERRUPT = 0xE,
40961 +struct gate_struct {
40964 + unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
40965 + u16 offset_middle;
40968 +} __attribute__((packed));
40970 +#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF)
40971 +#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF)
40972 +#define PTR_HIGH(x) ((unsigned long)(x) >> 32)
40979 +// LDT or TSS descriptor in the GDT. 16 bytes.
40980 +struct ldttss_desc {
40983 + unsigned base1 : 8, type : 5, dpl : 2, p : 1;
40984 + unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
40987 +} __attribute__((packed));
40990 + unsigned short size;
40991 + unsigned long address;
40992 +} __attribute__((packed)) ;
40994 +extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
40996 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
40998 +#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
40999 +#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
41001 +static inline void clear_LDT(void)
41002 +{
41003 +	int cpu = get_cpu();
41004 +
41005 +	/*
41006 +	 * NB. We load the default_ldt for lcall7/27 handling on demand, as
41007 +	 * it slows down context switching. No one uses it anyway.
41008 +	 */
41009 +	cpu = cpu;		/* XXX avoid compiler warning */
41010 +	xen_set_ldt(NULL, 0);
41011 +	put_cpu();
41012 +}
41013 +
41014 +/*
41015 + * This is the ldt that every process will get unless we need
41016 + * something other than this.
41018 +extern struct desc_struct default_ldt[];
41019 +#ifndef CONFIG_X86_NO_IDT
41020 +extern struct gate_struct idt_table[];
41022 +extern struct desc_ptr cpu_gdt_descr[];
41024 +/* the cpu gdt accessor */
41025 +#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
41027 +static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)
41029 + struct gate_struct s;
41030 + s.offset_low = PTR_LOW(func);
41031 + s.segment = __KERNEL_CS;
41038 + s.offset_middle = PTR_MIDDLE(func);
41039 + s.offset_high = PTR_HIGH(func);
41040 + /* does not need to be atomic because it is only done once at setup time */
41041 + memcpy(adr, &s, 16);
41044 +#ifndef CONFIG_X86_NO_IDT
41045 +static inline void set_intr_gate(int nr, void *func)
41047 + BUG_ON((unsigned)nr > 0xFF);
41048 + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
41051 +static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
41053 + BUG_ON((unsigned)nr > 0xFF);
41054 + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
41057 +static inline void set_system_gate(int nr, void *func)
41059 + BUG_ON((unsigned)nr > 0xFF);
41060 + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
41063 +static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
41065 + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
41069 +static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type,
41072 + struct ldttss_desc d;
41073 + memset(&d,0,sizeof(d));
41074 + d.limit0 = size & 0xFFFF;
41075 + d.base0 = PTR_LOW(tss);
41076 + d.base1 = PTR_MIDDLE(tss) & 0xFF;
41079 + d.limit1 = (size >> 16) & 0xF;
41080 + d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF;
41081 + d.base3 = PTR_HIGH(tss);
41082 + memcpy(ptr, &d, 16);
41085 +#ifndef CONFIG_X86_NO_TSS
41086 +static inline void set_tss_desc(unsigned cpu, void *addr)
41089 + * sizeof(unsigned long) coming from an extra "long" at the end
41090 + * of the iobitmap. See tss_struct definition in processor.h
41092 + * -1? seg base+limit should be pointing to the address of the
41093 + * last valid byte
41095 + set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS],
41096 + (unsigned long)addr, DESC_TSS,
41097 + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
41101 +static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
41103 + set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
41104 + DESC_LDT, size * 8 - 1);
41107 +static inline void set_seg_base(unsigned cpu, int entry, void *base)
41109 + struct desc_struct *d = &cpu_gdt(cpu)[entry];
41110 + u32 addr = (u32)(u64)base;
41111 + BUG_ON((u64)base >> 32);
41112 + d->base0 = addr & 0xffff;
41113 + d->base1 = (addr >> 16) & 0xff;
41114 + d->base2 = (addr >> 24) & 0xff;
41117 +#define LDT_entry_a(info) \
41118 + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
41119 +/* Don't allow setting of the lm bit. It is useless anyway because
41120 + 64bit system calls require __USER_CS. */
41121 +#define LDT_entry_b(info) \
41122 + (((info)->base_addr & 0xff000000) | \
41123 + (((info)->base_addr & 0x00ff0000) >> 16) | \
41124 + ((info)->limit & 0xf0000) | \
41125 + (((info)->read_exec_only ^ 1) << 9) | \
41126 + ((info)->contents << 10) | \
41127 + (((info)->seg_not_present ^ 1) << 15) | \
41128 + ((info)->seg_32bit << 22) | \
41129 + ((info)->limit_in_pages << 23) | \
41130 + ((info)->useable << 20) | \
41131 + /* ((info)->lm << 21) | */ \
41134 +#define LDT_empty(info) (\
41135 + (info)->base_addr == 0 && \
41136 + (info)->limit == 0 && \
41137 + (info)->contents == 0 && \
41138 + (info)->read_exec_only == 1 && \
41139 + (info)->seg_32bit == 0 && \
41140 + (info)->limit_in_pages == 0 && \
41141 + (info)->seg_not_present == 1 && \
41142 + (info)->useable == 0 && \
41145 +#if TLS_SIZE != 24
41146 +# error update this code.
41149 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
41152 + u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
41153 + gdt[0] = t->tls_array[0];
41154 + gdt[1] = t->tls_array[1];
41155 + gdt[2] = t->tls_array[2];
41158 + if (HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), \
41159 + t->tls_array[i])) \
41162 + C(0); C(1); C(2);
41167 + * load one particular LDT into the current CPU
41169 +static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
41171 + void *segments = pc->ldt;
41172 + int count = pc->size;
41174 + if (likely(!count))
41177 + xen_set_ldt(segments, count);
41180 +static inline void load_LDT(mm_context_t *pc)
41182 + int cpu = get_cpu();
41183 + load_LDT_nolock(pc, cpu);
41187 +extern struct desc_ptr idt_descr;
41189 +#endif /* !__ASSEMBLY__ */
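Orientation note (not part of the patch): LDT_entry_a()/LDT_entry_b() pack a struct user_desc (as passed by modify_ldt(2)) into the two 32-bit words of a legacy descriptor. A sketch of combining them into the 64-bit word that HYPERVISOR_update_descriptor() takes; the helper name is hypothetical:

	static inline u64 make_ldt_desc(const struct user_desc *info)
	{
		u32 lo = LDT_entry_a(info);	/* base[15:0] << 16 | limit[15:0] */
		u32 hi = LDT_entry_b(info);	/* base[31:16], flags, limit[19:16] */

		return ((u64)hi << 32) | lo;
	}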
41192 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_64.h
41193 ===================================================================
41194 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
41195 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_64.h 2007-06-12 13:14:13.000000000 +0200
41197 +#ifndef _X8664_DMA_MAPPING_H
41198 +#define _X8664_DMA_MAPPING_H 1
41201 + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
41206 +#include <asm/scatterlist.h>
41207 +#include <asm/io.h>
41208 +#include <asm/swiotlb.h>
41210 +struct dma_mapping_ops {
41211 + int (*mapping_error)(dma_addr_t dma_addr);
41212 + void* (*alloc_coherent)(struct device *dev, size_t size,
41213 + dma_addr_t *dma_handle, gfp_t gfp);
41214 + void (*free_coherent)(struct device *dev, size_t size,
41215 + void *vaddr, dma_addr_t dma_handle);
41216 + dma_addr_t (*map_single)(struct device *hwdev, void *ptr,
41217 + size_t size, int direction);
41218 + /* like map_single, but doesn't check the device mask */
41219 + dma_addr_t (*map_simple)(struct device *hwdev, char *ptr,
41220 + size_t size, int direction);
41221 + void (*unmap_single)(struct device *dev, dma_addr_t addr,
41222 + size_t size, int direction);
41223 + void (*sync_single_for_cpu)(struct device *hwdev,
41224 + dma_addr_t dma_handle, size_t size,
41226 + void (*sync_single_for_device)(struct device *hwdev,
41227 + dma_addr_t dma_handle, size_t size,
41229 + void (*sync_single_range_for_cpu)(struct device *hwdev,
41230 + dma_addr_t dma_handle, unsigned long offset,
41231 + size_t size, int direction);
41232 + void (*sync_single_range_for_device)(struct device *hwdev,
41233 + dma_addr_t dma_handle, unsigned long offset,
41234 + size_t size, int direction);
41235 + void (*sync_sg_for_cpu)(struct device *hwdev,
41236 + struct scatterlist *sg, int nelems,
41238 + void (*sync_sg_for_device)(struct device *hwdev,
41239 + struct scatterlist *sg, int nelems,
41241 + int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
41242 + int nents, int direction);
41243 + void (*unmap_sg)(struct device *hwdev,
41244 + struct scatterlist *sg, int nents,
41246 + int (*dma_supported)(struct device *hwdev, u64 mask);
41250 +extern dma_addr_t bad_dma_address;
41251 +extern struct dma_mapping_ops* dma_ops;
41252 +extern int iommu_merge;
41254 +static inline int valid_dma_direction(int dma_direction)
41256 + return ((dma_direction == DMA_BIDIRECTIONAL) ||
41257 + (dma_direction == DMA_TO_DEVICE) ||
41258 + (dma_direction == DMA_FROM_DEVICE));
41262 +static inline int dma_mapping_error(dma_addr_t dma_addr)
41264 + if (dma_ops->mapping_error)
41265 + return dma_ops->mapping_error(dma_addr);
41267 + return (dma_addr == bad_dma_address);
41270 +extern void *dma_alloc_coherent(struct device *dev, size_t size,
41271 + dma_addr_t *dma_handle, gfp_t gfp);
41272 +extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
41273 + dma_addr_t dma_handle);
41275 +static inline dma_addr_t
41276 +dma_map_single(struct device *hwdev, void *ptr, size_t size,
41279 + BUG_ON(!valid_dma_direction(direction));
41280 + return dma_ops->map_single(hwdev, ptr, size, direction);
41283 +static inline void
41284 +dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
41287 + BUG_ON(!valid_dma_direction(direction));
41288 + dma_ops->unmap_single(dev, addr, size, direction);
41291 +#define dma_map_page(dev,page,offset,size,dir) \
41292 + dma_map_single((dev), page_address(page)+(offset), (size), (dir))
41294 +#define dma_unmap_page dma_unmap_single
41296 +static inline void
41297 +dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
41298 + size_t size, int direction)
41300 + BUG_ON(!valid_dma_direction(direction));
41301 + if (dma_ops->sync_single_for_cpu)
41302 + dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
41304 + flush_write_buffers();
41307 +static inline void
41308 +dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
41309 + size_t size, int direction)
41311 + BUG_ON(!valid_dma_direction(direction));
41312 + if (dma_ops->sync_single_for_device)
41313 + dma_ops->sync_single_for_device(hwdev, dma_handle, size,
41315 + flush_write_buffers();
41318 +static inline void
41319 +dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
41320 + unsigned long offset, size_t size, int direction)
41322 + BUG_ON(!valid_dma_direction(direction));
41323 + if (dma_ops->sync_single_range_for_cpu) {
41324 + dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction);
41327 + flush_write_buffers();
41330 +static inline void
41331 +dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
41332 + unsigned long offset, size_t size, int direction)
41334 + BUG_ON(!valid_dma_direction(direction));
41335 + if (dma_ops->sync_single_range_for_device)
41336 + dma_ops->sync_single_range_for_device(hwdev, dma_handle,
41337 + offset, size, direction);
41339 + flush_write_buffers();
41342 +static inline void
41343 +dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
41344 + int nelems, int direction)
41346 + BUG_ON(!valid_dma_direction(direction));
41347 + if (dma_ops->sync_sg_for_cpu)
41348 + dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
41349 + flush_write_buffers();
41352 +static inline void
41353 +dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
41354 + int nelems, int direction)
41356 + BUG_ON(!valid_dma_direction(direction));
41357 + if (dma_ops->sync_sg_for_device) {
41358 + dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
41361 + flush_write_buffers();
41365 +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction)
41367 + BUG_ON(!valid_dma_direction(direction));
41368 + return dma_ops->map_sg(hwdev, sg, nents, direction);
41371 +static inline void
41372 +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
41375 + BUG_ON(!valid_dma_direction(direction));
41376 + dma_ops->unmap_sg(hwdev, sg, nents, direction);
41379 +extern int dma_supported(struct device *hwdev, u64 mask);
41381 +/* same for gart, swiotlb, and nommu */
41382 +static inline int dma_get_cache_alignment(void)
41384 + return boot_cpu_data.x86_clflush_size;
41387 +#define dma_is_consistent(h) 1
41389 +extern int dma_set_mask(struct device *dev, u64 mask);
41391 +static inline void
41392 +dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir)
41394 + flush_write_buffers();
41397 +extern struct device fallback_dev;
41398 +extern int panic_on_overflow;
41401 +#endif /* _X8664_DMA_MAPPING_H */
41403 +#include <asm-i386/mach-xen/asm/dma-mapping.h>
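Usage note (not part of the patch): drivers call the dma_*() wrappers, never dma_ops directly. A typical mapping round-trip, sketched with illustrative names:

	/* Illustrative only: map a kernel buffer for device-bound DMA. */
	static int send_buffer(struct device *dev, void *buf, size_t len)
	{
		dma_addr_t handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

		if (dma_mapping_error(handle))
			return -EIO;
		/* ... program the device with 'handle', wait for completion ... */
		dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
		return 0;
	}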
41404 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_64.h
41405 ===================================================================
41406 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
41407 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_64.h 2007-06-12 13:14:13.000000000 +0200
41410 + * fixmap.h: compile-time virtual memory allocation
41412 + * This file is subject to the terms and conditions of the GNU General Public
41413 + * License. See the file "COPYING" in the main directory of this archive
41414 + * for more details.
41416 + * Copyright (C) 1998 Ingo Molnar
41419 +#ifndef _ASM_FIXMAP_H
41420 +#define _ASM_FIXMAP_H
41422 +#include <linux/kernel.h>
41423 +#include <asm/apicdef.h>
41424 +#include <asm/page.h>
41425 +#include <asm/vsyscall.h>
41426 +#include <asm/vsyscall32.h>
41427 +#include <asm/acpi.h>
41430 + * Here we define all the compile-time 'special' virtual
41431 + * addresses. The point is to have a constant address at
41432 + * compile time, but to set the physical address only
41433 + * in the boot process.
41435 + * these 'compile-time allocated' memory buffers are
41436 + * fixed-size 4k pages. (or larger if used with an increment
41437 + * higher than 1) use fixmap_set(idx,phys) to associate
41438 + * physical memory with fixmap indices.
41440 + * TLB entries of such buffers will not be flushed across
41444 +enum fixed_addresses {
41445 + VSYSCALL_LAST_PAGE,
41446 + VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
41449 +#ifdef CONFIG_X86_LOCAL_APIC
41450 + FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
41452 +#ifdef CONFIG_X86_IO_APIC
41453 + FIX_IO_APIC_BASE_0,
41454 + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
41456 +#ifdef CONFIG_ACPI
41458 + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
41461 +#define NR_FIX_ISAMAPS 256
41463 + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
41464 + __end_of_permanent_fixed_addresses,
41465 + /* temporary boot-time mappings, used before ioremap() is functional */
41466 +#define NR_FIX_BTMAPS 16
41467 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
41468 + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
41469 + __end_of_fixed_addresses
41472 +extern void __set_fixmap (enum fixed_addresses idx,
41473 + unsigned long phys, pgprot_t flags);
41475 +#define set_fixmap(idx, phys) \
41476 + __set_fixmap(idx, phys, PAGE_KERNEL)
41478 + * Some hardware wants to get fixmapped without caching.
41480 +#define set_fixmap_nocache(idx, phys) \
41481 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
41483 +#define clear_fixmap(idx) \
41484 + __set_fixmap(idx, 0, __pgprot(0))
41486 +#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
41487 +#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
41488 +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
41490 +/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
41491 +#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
41492 +#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
41494 +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
41496 +extern void __this_fixmap_does_not_exist(void);
41499 + * 'index to address' translation. If anyone tries to use the idx
41500 + * directly without translation, we catch the bug with a NULL-dereference
41501 + * kernel oops. Illegal ranges of incoming indices are caught too.
41503 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
41506 + * this branch gets completely eliminated after inlining,
41507 + * except when someone tries to use fixaddr indices in an
41508 + * illegal way. (such as mixing up address types or using
41509 + * out-of-range indices).
41511 + * If it doesn't get removed, the linker will complain
41512 + * loudly with a reasonably clear error message..
41514 + if (idx >= __end_of_fixed_addresses)
41515 + __this_fixmap_does_not_exist();
41517 +	return __fix_to_virt(idx);
41518 +}
41519 +
41520 +#endif
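Usage note (not part of the patch): fix_to_virt() is compile-time only; with a constant index the bounds check is optimized away and the result folds to a constant address, while an out-of-range index turns into a link error. Sketch:

	/* Illustrative only: constant-folded address of the ISA window. */
	static inline void __iomem *isa_window_base(void)
	{
		return (void __iomem *)fix_to_virt(FIX_ISAMAP_BEGIN);
	}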
41521 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_64.h
41522 ===================================================================
41523 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
41524 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_64.h 2008-11-25 12:22:34.000000000 +0100
41526 +/******************************************************************************
41529 + * Linux-specific hypervisor handling.
41531 + * Copyright (c) 2002-2004, K A Fraser
41533 + * 64-bit updates:
41534 + * Benjamin Liu <benjamin.liu@intel.com>
41535 + * Jun Nakajima <jun.nakajima@intel.com>
41537 + * This program is free software; you can redistribute it and/or
41538 + * modify it under the terms of the GNU General Public License version 2
41539 + * as published by the Free Software Foundation; or, when distributed
41540 + * separately from the Linux kernel or incorporated into other
41541 + * software packages, subject to the following license:
41543 + * Permission is hereby granted, free of charge, to any person obtaining a copy
41544 + * of this source file (the "Software"), to deal in the Software without
41545 + * restriction, including without limitation the rights to use, copy, modify,
41546 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
41547 + * and to permit persons to whom the Software is furnished to do so, subject to
41548 + * the following conditions:
41550 + * The above copyright notice and this permission notice shall be included in
41551 + * all copies or substantial portions of the Software.
41553 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41554 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
41555 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41556 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
41557 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
41558 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
41559 + * IN THE SOFTWARE.
41562 +#ifndef __HYPERCALL_H__
41563 +#define __HYPERCALL_H__
41565 +#include <linux/string.h> /* memcpy() */
41566 +#include <linux/stringify.h>
41568 +#ifndef __HYPERVISOR_H__
41569 +# error "please don't include this file directly"
41573 +#define HYPERCALL_STR(name) \
41574 + "call hypercall_page + ("__stringify(__HYPERVISOR_##name)" * 32)"
41576 +#define HYPERCALL_STR(name) \
41577 + "mov $("__stringify(__HYPERVISOR_##name)" * 32),%%eax; "\
41578 + "add hypercall_stubs(%%rip),%%rax; " \
41582 +#define _hypercall0(type, name) \
41586 + HYPERCALL_STR(name) \
41593 +#define _hypercall1(type, name, a1) \
41598 + HYPERCALL_STR(name) \
41599 + : "=a" (__res), "=D" (__ign1) \
41600 + : "1" ((long)(a1)) \
41605 +#define _hypercall2(type, name, a1, a2) \
41608 + long __ign1, __ign2; \
41610 + HYPERCALL_STR(name) \
41611 + : "=a" (__res), "=D" (__ign1), "=S" (__ign2) \
41612 + : "1" ((long)(a1)), "2" ((long)(a2)) \
41617 +#define _hypercall3(type, name, a1, a2, a3) \
41620 + long __ign1, __ign2, __ign3; \
41622 + HYPERCALL_STR(name) \
41623 + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
41625 + : "1" ((long)(a1)), "2" ((long)(a2)), \
41626 + "3" ((long)(a3)) \
41631 +#define _hypercall4(type, name, a1, a2, a3, a4) \
41634 + long __ign1, __ign2, __ign3; \
41635 + register long __arg4 asm("r10") = (long)(a4); \
41637 + HYPERCALL_STR(name) \
41638 + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
41639 + "=d" (__ign3), "+r" (__arg4) \
41640 + : "1" ((long)(a1)), "2" ((long)(a2)), \
41641 + "3" ((long)(a3)) \
41646 +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
41649 + long __ign1, __ign2, __ign3; \
41650 + register long __arg4 asm("r10") = (long)(a4); \
41651 + register long __arg5 asm("r8") = (long)(a5); \
41653 + HYPERCALL_STR(name) \
41654 + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
41655 + "=d" (__ign3), "+r" (__arg4), "+r" (__arg5) \
41656 + : "1" ((long)(a1)), "2" ((long)(a2)), \
41657 + "3" ((long)(a3)) \
41662 +static inline int __must_check
41663 +HYPERVISOR_set_trap_table(
41664 + const trap_info_t *table)
41666 + return _hypercall1(int, set_trap_table, table);
41669 +static inline int __must_check
41670 +HYPERVISOR_mmu_update(
41671 + mmu_update_t *req, unsigned int count, unsigned int *success_count,
41674 + return _hypercall4(int, mmu_update, req, count, success_count, domid);
41677 +static inline int __must_check
41678 +HYPERVISOR_mmuext_op(
41679 + struct mmuext_op *op, unsigned int count, unsigned int *success_count,
41682 + return _hypercall4(int, mmuext_op, op, count, success_count, domid);
41685 +static inline int __must_check
41686 +HYPERVISOR_set_gdt(
41687 + unsigned long *frame_list, unsigned int entries)
41689 + return _hypercall2(int, set_gdt, frame_list, entries);
41692 +static inline int __must_check
41693 +HYPERVISOR_stack_switch(
41694 + unsigned long ss, unsigned long esp)
41696 + return _hypercall2(int, stack_switch, ss, esp);
41699 +static inline int __must_check
41700 +HYPERVISOR_set_callbacks(
41701 + unsigned long event_address, unsigned long failsafe_address,
41702 + unsigned long syscall_address)
41704 + return _hypercall3(int, set_callbacks,
41705 + event_address, failsafe_address, syscall_address);
41709 +HYPERVISOR_fpu_taskswitch(
41712 + return _hypercall1(int, fpu_taskswitch, set);
41715 +static inline int __must_check
41716 +HYPERVISOR_sched_op_compat(
41717 + int cmd, unsigned long arg)
41719 + return _hypercall2(int, sched_op_compat, cmd, arg);
41722 +static inline int __must_check
41723 +HYPERVISOR_sched_op(
41724 + int cmd, void *arg)
41726 + return _hypercall2(int, sched_op, cmd, arg);
41729 +static inline long __must_check
41730 +HYPERVISOR_set_timer_op(
41733 + return _hypercall1(long, set_timer_op, timeout);
41736 +static inline int __must_check
41737 +HYPERVISOR_platform_op(
41738 + struct xen_platform_op *platform_op)
41740 + platform_op->interface_version = XENPF_INTERFACE_VERSION;
41741 + return _hypercall1(int, platform_op, platform_op);
41744 +static inline int __must_check
41745 +HYPERVISOR_set_debugreg(
41746 + unsigned int reg, unsigned long value)
41748 + return _hypercall2(int, set_debugreg, reg, value);
41751 +static inline unsigned long __must_check
41752 +HYPERVISOR_get_debugreg(
41753 + unsigned int reg)
41755 + return _hypercall1(unsigned long, get_debugreg, reg);
41758 +static inline int __must_check
41759 +HYPERVISOR_update_descriptor(
41760 + unsigned long ma, unsigned long word)
41762 + return _hypercall2(int, update_descriptor, ma, word);
41765 +static inline int __must_check
41766 +HYPERVISOR_memory_op(
41767 + unsigned int cmd, void *arg)
41769 + return _hypercall2(int, memory_op, cmd, arg);
41772 +static inline int __must_check
41773 +HYPERVISOR_multicall(
41774 + multicall_entry_t *call_list, unsigned int nr_calls)
41776 + return _hypercall2(int, multicall, call_list, nr_calls);
41779 +static inline int __must_check
41780 +HYPERVISOR_update_va_mapping(
41781 + unsigned long va, pte_t new_val, unsigned long flags)
41783 + return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
41786 +static inline int __must_check
41787 +HYPERVISOR_event_channel_op(
41788 + int cmd, void *arg)
41790 + int rc = _hypercall2(int, event_channel_op, cmd, arg);
41792 +#if CONFIG_XEN_COMPAT <= 0x030002
41793 + if (unlikely(rc == -ENOSYS)) {
41794 + struct evtchn_op op;
41796 + memcpy(&op.u, arg, sizeof(op.u));
41797 + rc = _hypercall1(int, event_channel_op_compat, &op);
41798 + memcpy(arg, &op.u, sizeof(op.u));
41805 +static inline int __must_check
41806 +HYPERVISOR_xen_version(
41807 + int cmd, void *arg)
41809 + return _hypercall2(int, xen_version, cmd, arg);
41812 +static inline int __must_check
41813 +HYPERVISOR_console_io(
41814 + int cmd, unsigned int count, char *str)
41816 + return _hypercall3(int, console_io, cmd, count, str);
41819 +static inline int __must_check
41820 +HYPERVISOR_physdev_op(
41821 + int cmd, void *arg)
41823 + int rc = _hypercall2(int, physdev_op, cmd, arg);
41825 +#if CONFIG_XEN_COMPAT <= 0x030002
41826 + if (unlikely(rc == -ENOSYS)) {
41827 + struct physdev_op op;
41829 + memcpy(&op.u, arg, sizeof(op.u));
41830 + rc = _hypercall1(int, physdev_op_compat, &op);
41831 + memcpy(arg, &op.u, sizeof(op.u));
41838 +static inline int __must_check
41839 +HYPERVISOR_grant_table_op(
41840 + unsigned int cmd, void *uop, unsigned int count)
41842 + return _hypercall3(int, grant_table_op, cmd, uop, count);
41845 +static inline int __must_check
41846 +HYPERVISOR_update_va_mapping_otherdomain(
41847 + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
41849 + return _hypercall4(int, update_va_mapping_otherdomain, va,
41850 + new_val.pte, flags, domid);
41853 +static inline int __must_check
41854 +HYPERVISOR_vm_assist(
41855 + unsigned int cmd, unsigned int type)
41857 + return _hypercall2(int, vm_assist, cmd, type);
41860 +static inline int __must_check
41861 +HYPERVISOR_vcpu_op(
41862 + int cmd, unsigned int vcpuid, void *extra_args)
41864 + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
41867 +static inline int __must_check
41868 +HYPERVISOR_set_segment_base(
41869 + int reg, unsigned long value)
41871 + return _hypercall2(int, set_segment_base, reg, value);
41874 +static inline int __must_check
41875 +HYPERVISOR_suspend(
41876 + unsigned long srec)
41878 + struct sched_shutdown sched_shutdown = {
41879 + .reason = SHUTDOWN_suspend
41882 + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
41883 + &sched_shutdown, srec);
41885 +#if CONFIG_XEN_COMPAT <= 0x030002
41886 + if (rc == -ENOSYS)
41887 + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
41888 + SHUTDOWN_suspend, srec);
41894 +#if CONFIG_XEN_COMPAT <= 0x030002
41896 +HYPERVISOR_nmi_op(
41897 + unsigned long op, void *arg)
41899 + return _hypercall2(int, nmi_op, op, arg);
41903 +#ifndef CONFIG_XEN
41904 +static inline unsigned long __must_check
41905 +HYPERVISOR_hvm_op(
41906 + int op, void *arg)
41908 + return _hypercall2(unsigned long, hvm_op, op, arg);
41912 +static inline int __must_check
41913 +HYPERVISOR_callback_op(
41914 + int cmd, const void *arg)
41916 + return _hypercall2(int, callback_op, cmd, arg);
41919 +static inline int __must_check
41920 +HYPERVISOR_xenoprof_op(
41921 + int op, void *arg)
41923 + return _hypercall2(int, xenoprof_op, op, arg);
41926 +static inline int __must_check
41927 +HYPERVISOR_kexec_op(
41928 + unsigned long op, void *args)
41930 + return _hypercall2(int, kexec_op, op, args);
41933 +#endif /* __HYPERCALL_H__ */
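Usage note (not part of the patch): every wrapper reduces to the same pattern — the hypercall index is selected via HYPERCALL_STR() and arguments travel in %rdi/%rsi/%rdx/%r10/%r8 per the 64-bit Xen ABI. A hedged example of a two-argument call:

	/* Illustrative only: XENVER_version returns (major << 16) | minor. */
	static void report_xen_version(void)
	{
		int ver = HYPERVISOR_xen_version(XENVER_version, NULL);

		printk(KERN_INFO "running on Xen %d.%d\n",
		       ver >> 16, ver & 0xffff);
	}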
41934 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/io_64.h
41935 ===================================================================
41936 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
41937 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/io_64.h 2007-08-16 18:07:01.000000000 +0200
41942 +#include <asm/fixmap.h>
41945 + * This file contains the definitions for the x86 IO instructions
41946 + * inb/inw/inl/outb/outw/outl and the "string versions" of the same
41947 + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
41948 + * versions of the single-IO instructions (inb_p/inw_p/..).
41950 + * This file is not meant to be obfuscating: it's just complicated
41951 + * to (a) handle it all in a way that makes gcc able to optimize it
41952 + * as well as possible and (b) to avoid writing the same thing
41953 + * over and over again with slight variations and possibly making a
41954 + * mistake somewhere.
41958 + * Thanks to James van Artsdalen for a better timing-fix than
41959 + * the two short jumps: using outb's to a nonexistent port seems
41960 + * to guarantee better timings even on fast machines.
41962 + * On the other hand, I'd like to be sure of a non-existent port:
41963 + * I feel a bit unsafe about using 0x80 (should be safe, though)
41969 + * Bit simplified and optimized by Jan Hubicka
41970 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
41972 + * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
41973 + * isa_read[wl] and isa_write[wl] fixed
41974 + * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
41977 +#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
41979 +#ifdef REALLY_SLOW_IO
41980 +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
41982 +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
41986 + * Talk about misusing macros..
41988 +#define __OUT1(s,x) \
41989 +static inline void out##s(unsigned x value, unsigned short port) {
41991 +#define __OUT2(s,s1,s2) \
41992 +__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
41994 +#define __OUT(s,s1,x) \
41995 +__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
41996 +__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \
41998 +#define __IN1(s) \
41999 +static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
42001 +#define __IN2(s,s1,s2) \
42002 +__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
42004 +#define __IN(s,s1,i...) \
42005 +__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
42006 +__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
42008 +#define __INS(s) \
42009 +static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
42010 +{ __asm__ __volatile__ ("rep ; ins" #s \
42011 +: "=D" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
42013 +#define __OUTS(s) \
42014 +static inline void outs##s(unsigned short port, const void * addr, unsigned long count) \
42015 +{ __asm__ __volatile__ ("rep ; outs" #s \
42016 +: "=S" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
42018 +#define RETURN_TYPE unsigned char
42020 +#undef RETURN_TYPE
42021 +#define RETURN_TYPE unsigned short
42023 +#undef RETURN_TYPE
42024 +#define RETURN_TYPE unsigned int
42026 +#undef RETURN_TYPE
42029 +__OUT(w,"w",short)
42040 +#define IO_SPACE_LIMIT 0xffff
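For orientation (not part of the patch), one expansion written out: __OUT(b,"b",char) generates approximately the following pair, with the _p variant appending the port-0x80 delay:

	static inline void outb(unsigned char value, unsigned short port)
	{
		__asm__ __volatile__ ("outb %b0,%w1"
				      : : "a" (value), "Nd" (port));
	}

	static inline void outb_p(unsigned char value, unsigned short port)
	{
		__asm__ __volatile__ ("outb %b0,%w1" __FULL_SLOW_DOWN_IO
				      : : "a" (value), "Nd" (port));
	}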
42042 +#if defined(__KERNEL__) && __x86_64__
42044 +#include <linux/vmalloc.h>
42048 + * Change virtual addresses to physical addresses and vv.
42049 + * These are pretty trivial
42051 +static inline unsigned long virt_to_phys(volatile void * address)
42053 + return __pa(address);
42056 +static inline void * phys_to_virt(unsigned long address)
42058 + return __va(address);
42061 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
42062 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
42066 + * Change "struct page" to physical address.
42068 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
42069 +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
42070 +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
42072 +#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \
42073 + (unsigned long) bio_offset((bio)))
42074 +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
42075 + (unsigned long) (bv)->bv_offset)
42077 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
42078 + (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
42079 + ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
42080 + bvec_to_pseudophys((vec2))))
42082 +#include <asm-generic/iomap.h>
42084 +extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags);
42086 +static inline void __iomem * ioremap (unsigned long offset, unsigned long size)
42088 + return __ioremap(offset, size, 0);
42091 +extern void *bt_ioremap(unsigned long addr, unsigned long size);
42092 +extern void bt_iounmap(void *addr, unsigned long size);
42093 +#define early_ioremap bt_ioremap
42094 +#define early_iounmap bt_iounmap
42097 + * This one maps high address device memory and turns off caching for that area.
42098 + * It's useful if some control registers are in such an area and write combining
42099 + * or read caching is not desirable:
42101 +extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
42102 +extern void iounmap(volatile void __iomem *addr);
42105 + * ISA I/O bus memory addresses are 1:1 with the physical address.
42108 +#define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); })
42109 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
42110 +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
42113 + * However PCI ones are not necessarily 1:1 and therefore these interfaces
42114 + * are forbidden in portable PCI drivers.
42116 + * Allow them on x86 for legacy drivers, though.
42118 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
42119 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
42122 + * readX/writeX() are used to access memory mapped devices. On some
42123 + * architectures the memory mapped IO stuff needs to be accessed
42124 + * differently. On the x86 architecture, we just read/write the
42125 + * memory location directly.
42128 +static inline __u8 __readb(const volatile void __iomem *addr)
42130 + return *(__force volatile __u8 *)addr;
42132 +static inline __u16 __readw(const volatile void __iomem *addr)
42134 + return *(__force volatile __u16 *)addr;
42136 +static __always_inline __u32 __readl(const volatile void __iomem *addr)
42138 + return *(__force volatile __u32 *)addr;
42140 +static inline __u64 __readq(const volatile void __iomem *addr)
42142 + return *(__force volatile __u64 *)addr;
42144 +#define readb(x) __readb(x)
42145 +#define readw(x) __readw(x)
42146 +#define readl(x) __readl(x)
42147 +#define readq(x) __readq(x)
42148 +#define readb_relaxed(a) readb(a)
42149 +#define readw_relaxed(a) readw(a)
42150 +#define readl_relaxed(a) readl(a)
42151 +#define readq_relaxed(a) readq(a)
42152 +#define __raw_readb readb
42153 +#define __raw_readw readw
42154 +#define __raw_readl readl
42155 +#define __raw_readq readq
42159 +static inline void __writel(__u32 b, volatile void __iomem *addr)
42161 + *(__force volatile __u32 *)addr = b;
42163 +static inline void __writeq(__u64 b, volatile void __iomem *addr)
42165 + *(__force volatile __u64 *)addr = b;
42167 +static inline void __writeb(__u8 b, volatile void __iomem *addr)
42169 + *(__force volatile __u8 *)addr = b;
42171 +static inline void __writew(__u16 b, volatile void __iomem *addr)
42173 + *(__force volatile __u16 *)addr = b;
42175 +#define writeq(val,addr) __writeq((val),(addr))
42176 +#define writel(val,addr) __writel((val),(addr))
42177 +#define writew(val,addr) __writew((val),(addr))
42178 +#define writeb(val,addr) __writeb((val),(addr))
42179 +#define __raw_writeb writeb
42180 +#define __raw_writew writew
42181 +#define __raw_writel writel
42182 +#define __raw_writeq writeq
42184 +void __memcpy_fromio(void*,unsigned long,unsigned);
42185 +void __memcpy_toio(unsigned long,const void*,unsigned);
42187 +static inline void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned len)
42189 + __memcpy_fromio(to,(unsigned long)from,len);
42191 +static inline void memcpy_toio(volatile void __iomem *to, const void *from, unsigned len)
42193 + __memcpy_toio((unsigned long)to,from,len);
42196 +void memset_io(volatile void __iomem *a, int b, size_t c);
42199 + * ISA space is 'always mapped' on a typical x86 system, no need to
42200 + * explicitly ioremap() it. The fact that the ISA IO space is mapped
42201 + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
42202 + * are physical addresses. The following constant pointer can be
42203 + * used as the IO-area pointer (it can be iounmapped as well, so the
42204 + * analogy with PCI is quite large):
42206 +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
42209 + * Again, x86-64 does not require mem IO specific function.
42212 +#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void *)(b),(c),(d))
42215 + * check_signature - find BIOS signatures
42216 + * @io_addr: mmio address to check
42217 + * @signature: signature block
42218 + * @length: length of signature
42220 + * Perform a signature comparison with the mmio address io_addr. This
42221 + * address should have been obtained by ioremap.
42222 + * Returns 1 on a match.
42225 +static inline int check_signature(void __iomem *io_addr,
42226 +	const unsigned char *signature, int length)
42227 +{
42228 +	int retval = 0;
42229 +	do {
42230 +		if (readb(io_addr) != *signature)
42231 +			goto out;
42232 +		io_addr++;
42233 +		signature++;
42234 +		length--;
42235 +	} while (length);
42236 +	retval = 1;
42237 +out:
42238 +	return retval;
42239 +}
42241 +/* Nothing to do */
42243 +#define dma_cache_inv(_start,_size) do { } while (0)
42244 +#define dma_cache_wback(_start,_size) do { } while (0)
42245 +#define dma_cache_wback_inv(_start,_size) do { } while (0)
42247 +#define flush_write_buffers()
42249 +extern int iommu_bio_merge;
42250 +#define BIO_VMERGE_BOUNDARY iommu_bio_merge
42253 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
42256 +#define xlate_dev_mem_ptr(p) __va(p)
42259 + * Convert a virtual cached pointer to an uncached pointer
42261 +#define xlate_dev_kmem_ptr(p) p
42263 +#endif /* __KERNEL__ */
42265 +#define ARCH_HAS_DEV_MEM
42268 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_64.h
42269 ===================================================================
42270 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
42271 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_64.h 2007-06-12 13:14:13.000000000 +0200
42274 + * include/asm-x86_64/irqflags.h
42276 + * IRQ flags handling
42278 + * This file gets included from lowlevel asm headers too, to provide
42279 + * wrapped versions of the local_irq_*() APIs, based on the
42280 + * raw_local_irq_*() functions from the lowlevel headers.
42282 +#ifndef _ASM_IRQFLAGS_H
42283 +#define _ASM_IRQFLAGS_H
42285 +#ifndef __ASSEMBLY__
42287 + * Interrupt control:
42291 + * The use of 'barrier' in the following reflects their use as local-lock
42292 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
42293 + * critical operations are executed. All critical operations must complete
42294 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
42295 + * includes these barriers, for example.
42298 +#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
42300 +#define raw_local_save_flags(flags) \
42301 + do { (flags) = __raw_local_save_flags(); } while (0)
42303 +#define raw_local_irq_restore(x) \
42305 + vcpu_info_t *_vcpu; \
42307 + _vcpu = current_vcpu_info(); \
42308 + if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
42309 + barrier(); /* unmask then check (avoid races) */ \
42310 + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
42311 + force_evtchn_callback(); \
42315 +#ifdef CONFIG_X86_VSMP
42318 + * Interrupt control for the VSMP architecture:
42321 +static inline void raw_local_irq_disable(void)
42323 + unsigned long flags = __raw_local_save_flags();
42325 + raw_local_irq_restore((flags & ~(1 << 9)) | (1 << 18));
42328 +static inline void raw_local_irq_enable(void)
42330 + unsigned long flags = __raw_local_save_flags();
42332 + raw_local_irq_restore((flags | (1 << 9)) & ~(1 << 18));
42335 +static inline int raw_irqs_disabled_flags(unsigned long flags)
42337 + return !(flags & (1<<9)) || (flags & (1 << 18));
42340 +#else /* CONFIG_X86_VSMP */
42342 +#define raw_local_irq_disable() \
42344 + current_vcpu_info()->evtchn_upcall_mask = 1; \
42348 +#define raw_local_irq_enable() \
42350 + vcpu_info_t *_vcpu; \
42352 + _vcpu = current_vcpu_info(); \
42353 + _vcpu->evtchn_upcall_mask = 0; \
42354 + barrier(); /* unmask then check (avoid races) */ \
42355 + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
42356 + force_evtchn_callback(); \
42359 +static inline int raw_irqs_disabled_flags(unsigned long flags)
42361 + return (flags != 0);
42367 + * For spinlocks, etc.:
42370 +#define __raw_local_irq_save() \
42372 + unsigned long flags = __raw_local_save_flags(); \
42374 + raw_local_irq_disable(); \
42379 +#define raw_local_irq_save(flags) \
42380 + do { (flags) = __raw_local_irq_save(); } while (0)
42382 +#define raw_irqs_disabled() \
42384 + unsigned long flags = __raw_local_save_flags(); \
42386 + raw_irqs_disabled_flags(flags); \
42390 + * Used in the idle loop; sti takes one instruction cycle
42393 +void raw_safe_halt(void);
42396 + * Used when interrupts are already enabled or to
42397 + * shutdown the processor:
42401 +#else /* __ASSEMBLY__: */
42402 +# ifdef CONFIG_TRACE_IRQFLAGS
42403 +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk
42404 +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk
42406 +# define TRACE_IRQS_ON
42407 +# define TRACE_IRQS_OFF
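Usage note (not part of the patch): the classic save/restore idiom works unchanged on top of the event-channel mask. Sketch of a hypothetical critical section:

	static void touch_shared_state(void)
	{
		unsigned long flags;

		raw_local_irq_save(flags);	/* mask upcalls, capture old mask */
		/* ... no event upcalls are delivered in here ... */
		raw_local_irq_restore(flags);	/* may force_evtchn_callback() */
	}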
42412 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_64.h
42413 ===================================================================
42414 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
42415 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_64.h 2007-06-12 13:14:13.000000000 +0200
42417 +#ifndef _X86_64_MADDR_H
42418 +#define _X86_64_MADDR_H
42420 +#include <xen/features.h>
42421 +#include <xen/interface/xen.h>
42423 +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
42424 +#define INVALID_P2M_ENTRY (~0UL)
42425 +#define FOREIGN_FRAME_BIT (1UL<<63)
42426 +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
42428 +/* Definitions for machine and pseudophysical addresses. */
42429 +typedef unsigned long paddr_t;
42430 +typedef unsigned long maddr_t;
42434 +extern unsigned long *phys_to_machine_mapping;
42436 +#undef machine_to_phys_mapping
42437 +extern unsigned long *machine_to_phys_mapping;
42438 +extern unsigned int machine_to_phys_order;
42440 +static inline unsigned long pfn_to_mfn(unsigned long pfn)
42442 + if (xen_feature(XENFEAT_auto_translated_physmap))
42444 + BUG_ON(end_pfn && pfn >= end_pfn);
42445 + return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
42448 +static inline int phys_to_machine_mapping_valid(unsigned long pfn)
42450 + if (xen_feature(XENFEAT_auto_translated_physmap))
42452 + BUG_ON(end_pfn && pfn >= end_pfn);
42453 + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
42456 +static inline unsigned long mfn_to_pfn(unsigned long mfn)
42458 + unsigned long pfn;
42460 + if (xen_feature(XENFEAT_auto_translated_physmap))
42463 + if (unlikely((mfn >> machine_to_phys_order) != 0))
42466 + /* The array access can fail (e.g., device space beyond end of RAM). */
42468 + "1: movq %1,%0\n"
42470 + ".section .fixup,\"ax\"\n"
42471 + "3: movq %2,%0\n"
42474 + ".section __ex_table,\"a\"\n"
42479 + : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) );
42485 + * We detect special mappings in one of two ways:
42486 + * 1. If the MFN is an I/O page then Xen will set the m2p entry
42487 + * to be outside our maximum possible pseudophys range.
42488 + * 2. If the MFN belongs to a different domain then we will certainly
42489 + * not have MFN in our p2m table. Conversely, if the page is ours,
42490 + * then we'll have p2m(m2p(MFN))==MFN.
42491 + * If we detect a special mapping then it doesn't have a 'struct page'.
42492 + * We force !pfn_valid() by returning an out-of-range pointer.
42494 + * NB. These checks require that, for any MFN that is not in our reservation,
42495 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
42496 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
42497 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
42499 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
42500 + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
42501 + * require. In all the cases we care about, the FOREIGN_FRAME bit is
42502 + * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
42504 +static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
42506 + unsigned long pfn = mfn_to_pfn(mfn);
42507 + if ((pfn < end_pfn)
42508 + && !xen_feature(XENFEAT_auto_translated_physmap)
42509 + && (phys_to_machine_mapping[pfn] != mfn))
42510 + return end_pfn; /* force !pfn_valid() */
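Per the note above, a sketch (illustrative only, assuming a non-autotranslated guest) of how a foreign mapping would be recorded so these checks fire, using set_phys_to_machine() defined just below:

	/* Sketch: record a grant/foreign mapping in the p2m table. The
	 * FOREIGN_FRAME() tag keeps mfn_to_local_pfn() and pte_pfn() from
	 * treating the frame as part of our own reservation. */
	static void note_foreign_mapping(unsigned long pfn, unsigned long mfn)
	{
		set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
		/* pfn_to_mfn(pfn) masks FOREIGN_FRAME_BIT, yielding mfn again */
	}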
42514 +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
42516 + BUG_ON(end_pfn && pfn >= end_pfn);
42517 + if (xen_feature(XENFEAT_auto_translated_physmap)) {
42518 + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
42521 + phys_to_machine_mapping[pfn] = mfn;
42524 +static inline maddr_t phys_to_machine(paddr_t phys)
42526 + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
42527 + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
42531 +static inline paddr_t machine_to_phys(maddr_t machine)
42533 + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
42534 + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
42538 +static inline paddr_t pte_phys_to_machine(paddr_t phys)
42541 + machine = pfn_to_mfn((phys & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
42542 + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK);
42546 +static inline paddr_t pte_machine_to_phys(maddr_t machine)
42549 + phys = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
42550 + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
42554 +#define __pte_ma(x) ((pte_t) { (x) } )
42555 +#define pfn_pte_ma(pfn, prot) __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
42557 +#else /* !CONFIG_XEN */
42559 +#define pfn_to_mfn(pfn) (pfn)
42560 +#define mfn_to_pfn(mfn) (mfn)
42561 +#define mfn_to_local_pfn(mfn) (mfn)
42562 +#define set_phys_to_machine(pfn, mfn) ((void)0)
42563 +#define phys_to_machine_mapping_valid(pfn) (1)
42564 +#define phys_to_machine(phys) ((maddr_t)(phys))
42565 +#define machine_to_phys(mach) ((paddr_t)(mach))
42566 +#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot)
42567 +#define __pte_ma(x) __pte(x)
42569 +#endif /* !CONFIG_XEN */
42571 +/* VIRT <-> MACHINE conversion */
42572 +#define virt_to_machine(v) (phys_to_machine(__pa(v)))
42573 +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
42574 +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
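These compose the helpers above: the frame number goes through the p2m (or m2p) table while the page-offset bits pass through untouched, so phys_to_machine()/machine_to_phys() round-trip for ordinary RAM. A small sketch (illustrative only) for a direct-mapped kernel buffer:

	/* Sketch: machine address of a direct-mapped buffer, e.g. for
	 * handing to the hypervisor. Equivalent expansion:
	 *   (pfn_to_mfn(__pa(v) >> PAGE_SHIFT) << PAGE_SHIFT)
	 *     | (__pa(v) & ~PAGE_MASK)                          */
	static maddr_t buffer_machine_addr(void *v)
	{
		return virt_to_machine(v);
	}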
42576 +#endif /* _X86_64_MADDR_H */
42578 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_64.h
42579 ===================================================================
42580 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
42581 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_64.h 2007-06-12 13:14:13.000000000 +0200
42583 +#ifndef __X86_64_MMU_CONTEXT_H
42584 +#define __X86_64_MMU_CONTEXT_H
42586 +#include <asm/desc.h>
42587 +#include <asm/atomic.h>
42588 +#include <asm/pgalloc.h>
42589 +#include <asm/page.h>
42590 +#include <asm/pda.h>
42591 +#include <asm/pgtable.h>
42592 +#include <asm/tlbflush.h>
42595 + * possibly do the LDT unload here?
42597 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
42598 +void destroy_context(struct mm_struct *mm);
42600 +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
42602 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
42603 + if (read_pda(mmu_state) == TLBSTATE_OK)
42604 + write_pda(mmu_state, TLBSTATE_LAZY);
42608 +#define prepare_arch_switch(next) __prepare_arch_switch()
42610 +static inline void __prepare_arch_switch(void)
42613 + * Save away %es, %ds, %fs and %gs. Must happen before reload
42614 + * of cr3/ldt (i.e., not in __switch_to).
42616 + __asm__ __volatile__ (
42617 + "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
42618 + : "=m" (current->thread.es),
42619 + "=m" (current->thread.ds),
42620 + "=m" (current->thread.fsindex),
42621 + "=m" (current->thread.gsindex) );
42623 + if (current->thread.ds)
42624 + __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
42626 + if (current->thread.es)
42627 + __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
42629 + if (current->thread.fsindex) {
42630 + __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
42631 + current->thread.fs = 0;
42634 + if (current->thread.gsindex) {
42635 + load_gs_index(0);
42636 + current->thread.gs = 0;
42640 +extern void mm_pin(struct mm_struct *mm);
42641 +extern void mm_unpin(struct mm_struct *mm);
42642 +void mm_pin_all(void);
42644 +static inline void load_cr3(pgd_t *pgd)
42646 + asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
42650 +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
42651 + struct task_struct *tsk)
42653 + unsigned cpu = smp_processor_id();
42654 + struct mmuext_op _op[3], *op = _op;
42656 + if (likely(prev != next)) {
42657 + BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
42658 + !next->context.pinned);
42660 + /* stop flush ipis for the previous mm */
42661 + cpu_clear(cpu, prev->cpu_vm_mask);
42662 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
42663 + write_pda(mmu_state, TLBSTATE_OK);
42664 + write_pda(active_mm, next);
42666 + cpu_set(cpu, next->cpu_vm_mask);
42668 + /* load_cr3(next->pgd) */
42669 + op->cmd = MMUEXT_NEW_BASEPTR;
42670 + op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
42673 + /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
42674 + op->cmd = MMUEXT_NEW_USER_BASEPTR;
42675 + op->arg1.mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT);
42678 + if (unlikely(next->context.ldt != prev->context.ldt)) {
42679 + /* load_LDT_nolock(&next->context, cpu) */
42680 + op->cmd = MMUEXT_SET_LDT;
42681 + op->arg1.linear_addr = (unsigned long)next->context.ldt;
42682 + op->arg2.nr_ents = next->context.size;
42686 + BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
42688 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
42690 + write_pda(mmu_state, TLBSTATE_OK);
42691 + if (read_pda(active_mm) != next)
42692 + out_of_line_bug();
42693 + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
42694 + /* We were in lazy tlb mode and leave_mm disabled
42695 + * tlb flush IPI delivery. We must reload CR3
42696 +	 * to make sure we don't use freed page tables.
42698 + load_cr3(next->pgd);
42699 + xen_new_user_pt(__pa(__user_pgd(next->pgd)));
42700 + load_LDT_nolock(&next->context, cpu);
42706 +#define deactivate_mm(tsk,mm) do { \
42707 + load_gs_index(0); \
42708 + asm volatile("movl %0,%%fs"::"r"(0)); \
42711 +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
42713 + if (!next->context.pinned)
42715 + switch_mm(prev, next, NULL);
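switch_mm() above batches up to three MMUEXT operations (new base pointer, new user base pointer, LDT) into a single HYPERVISOR_mmuext_op() hypercall instead of trapping once per operation. The same pattern works for any mmuext commands; a hedged sketch using two TLB-maintenance commands from xen/interface/xen.h (illustrative only, not part of this patch):

	/* Sketch: one hypercall, two batched ops, switch_mm()-style. */
	static void invlpg_then_flush(unsigned long va)
	{
		struct mmuext_op op[2];

		op[0].cmd = MMUEXT_INVLPG_LOCAL;
		op[0].arg1.linear_addr = va & PAGE_MASK;
		op[1].cmd = MMUEXT_TLB_FLUSH_LOCAL;
		BUG_ON(HYPERVISOR_mmuext_op(op, 2, NULL, DOMID_SELF));
	}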
42719 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/page_64.h
42720 ===================================================================
42721 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
42722 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/page_64.h 2008-04-02 12:34:02.000000000 +0200
42724 +#ifndef _X86_64_PAGE_H
42725 +#define _X86_64_PAGE_H
42727 +/* #include <linux/string.h> */
42728 +#ifndef __ASSEMBLY__
42729 +#include <linux/kernel.h>
42730 +#include <linux/types.h>
42731 +#include <asm/bug.h>
42733 +#include <xen/interface/xen.h>
42736 + * Need to repeat this here in order to not include pgtable.h (which in turn
42737 + * depends on definitions made here), but to be able to use the symbolic
42738 + * names below. The preprocessor will warn if the two definitions aren't identical.
42740 +#define _PAGE_PRESENT 0x001
42741 +#define _PAGE_IO 0x200
42743 +/* PAGE_SHIFT determines the page size */
42744 +#define PAGE_SHIFT 12
42745 +#ifdef __ASSEMBLY__
42746 +#define PAGE_SIZE (0x1 << PAGE_SHIFT)
42748 +#define PAGE_SIZE (1UL << PAGE_SHIFT)
42750 +#define PAGE_MASK (~(PAGE_SIZE-1))
42752 +/* See Documentation/x86_64/mm.txt for a description of the memory map. */
42753 +#define __PHYSICAL_MASK_SHIFT 46
42754 +#define __PHYSICAL_MASK ((1UL << __PHYSICAL_MASK_SHIFT) - 1)
42755 +#define __VIRTUAL_MASK_SHIFT 48
42756 +#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
42758 +#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
42760 +#define THREAD_ORDER 1
42761 +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
42762 +#define CURRENT_MASK (~(THREAD_SIZE-1))
42764 +#define EXCEPTION_STACK_ORDER 0
42765 +#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
42767 +#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
42768 +#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
42770 +#define IRQSTACK_ORDER 2
42771 +#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
42773 +#define STACKFAULT_STACK 1
42774 +#define DOUBLEFAULT_STACK 2
42775 +#define NMI_STACK 3
42776 +#define DEBUG_STACK 4
42777 +#define MCE_STACK 5
42778 +#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
42780 +#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
42781 +#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
42783 +#define HPAGE_SHIFT PMD_SHIFT
42784 +#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
42785 +#define HPAGE_MASK (~(HPAGE_SIZE - 1))
42786 +#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
42789 +#ifndef __ASSEMBLY__
42791 +extern unsigned long end_pfn;
42793 +#include <asm/maddr.h>
42795 +void clear_page(void *);
42796 +void copy_page(void *, void *);
42798 +#define clear_user_page(page, vaddr, pg) clear_page(page)
42799 +#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
42801 +#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
42802 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
42805 + * These are used to make use of C type-checking..
42807 +typedef struct { unsigned long pte; } pte_t;
42808 +typedef struct { unsigned long pmd; } pmd_t;
42809 +typedef struct { unsigned long pud; } pud_t;
42810 +typedef struct { unsigned long pgd; } pgd_t;
42811 +#define PTE_MASK PHYSICAL_PAGE_MASK
42813 +typedef struct { unsigned long pgprot; } pgprot_t;
42815 +#define __pte_val(x) ((x).pte)
42816 +#define pte_val(x) ((__pte_val(x) & (_PAGE_PRESENT|_PAGE_IO)) \
42817 + == _PAGE_PRESENT ? \
42818 + pte_machine_to_phys(__pte_val(x)) : \
42821 +#define __pmd_val(x) ((x).pmd)
42822 +static inline unsigned long pmd_val(pmd_t x)
42824 + unsigned long ret = __pmd_val(x);
42825 +#if CONFIG_XEN_COMPAT <= 0x030002
42826 + if (ret) ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
42828 + if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
42833 +#define __pud_val(x) ((x).pud)
42834 +static inline unsigned long pud_val(pud_t x)
42836 + unsigned long ret = __pud_val(x);
42837 + if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
42841 +#define __pgd_val(x) ((x).pgd)
42842 +static inline unsigned long pgd_val(pgd_t x)
42844 + unsigned long ret = __pgd_val(x);
42845 + if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
42849 +#define pgprot_val(x) ((x).pgprot)
42851 +static inline pte_t __pte(unsigned long x)
42853 + if ((x & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
42854 + x = pte_phys_to_machine(x);
42855 + return ((pte_t) { (x) });
42858 +static inline pmd_t __pmd(unsigned long x)
42860 + if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
42861 + return ((pmd_t) { (x) });
42864 +static inline pud_t __pud(unsigned long x)
42866 + if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
42867 + return ((pud_t) { (x) });
42870 +static inline pgd_t __pgd(unsigned long x)
42872 + if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
42873 + return ((pgd_t) { (x) });
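The *_val() accessors and the __pte()/__pmd()/__pud()/__pgd() constructors above are the only points where pseudophysical values are translated to machine values and back; the translation fires only for present entries (and never for _PAGE_IO ptes). A hedged sketch of the resulting invariant, assuming a valid RAM pfn:

	/* Sketch: constructor and accessor are inverses for present,
	 * non-I/O entries: __pte() applies p2m, pte_val() applies m2p. */
	static void pte_roundtrip_check(unsigned long phys_and_flags)
	{
		pte_t pte = __pte(phys_and_flags);	/* holds machine bits */

		if ((phys_and_flags & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
			BUG_ON(pte_val(pte) != phys_and_flags);
	}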
42876 +#define __pgprot(x) ((pgprot_t) { (x) } )
42878 +#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START)
42879 +#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
42880 +#define __START_KERNEL_map 0xffffffff80000000UL
42881 +#define __PAGE_OFFSET 0xffff880000000000UL
42884 +#define __PHYSICAL_START CONFIG_PHYSICAL_START
42885 +#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
42886 +#define __START_KERNEL_map 0xffffffff80000000
42887 +#define __PAGE_OFFSET 0xffff880000000000
42888 +#endif /* !__ASSEMBLY__ */
42890 +#if CONFIG_XEN_COMPAT <= 0x030002
42891 +#undef LOAD_OFFSET
42892 +#define LOAD_OFFSET 0
42895 +/* to align the pointer to the (next) page boundary */
42896 +#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
42898 +#define KERNEL_TEXT_SIZE (40UL*1024*1024)
42899 +#define KERNEL_TEXT_START 0xffffffff80000000UL
42901 +#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
42903 +/* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol.
42904 + Otherwise you risk miscompilation. */
42905 +#define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET)
42906 +/* __pa_symbol should be used for C visible symbols.
42907 + This seems to be the official gcc blessed way to do such arithmetic. */
42908 +#define __pa_symbol(x) \
42909 + ({unsigned long v; \
42910 + asm("" : "=r" (v) : "0" (x)); \
42913 +#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
42914 +#define __boot_va(x) __va(x)
42915 +#define __boot_pa(x) __pa(x)
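The empty asm in __pa_symbol() forces the symbol's address through a register, which keeps gcc from folding the subtraction into symbol arithmetic it may miscompile for kernel-image (__START_KERNEL_map) symbols; that is the "gcc blessed" trick referred to above. Usage sketch (illustrative only; empty_zero_page is an image symbol declared in pgtable.h):

	/* Sketch: __pa_symbol() for link-time symbols, __pa() for
	 * pointers into the direct (PAGE_OFFSET) mapping. */
	static unsigned long zero_page_phys(void)
	{
		return __pa_symbol(empty_zero_page);
	}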
42916 +#ifdef CONFIG_FLATMEM
42917 +#define pfn_valid(pfn) ((pfn) < end_pfn)
42920 +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
42921 +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
42922 +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
42924 +#define VM_DATA_DEFAULT_FLAGS \
42925 + (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
42926 + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
42928 +#define __HAVE_ARCH_GATE_AREA 1
42930 +#include <asm-generic/memory_model.h>
42931 +#include <asm-generic/page.h>
42933 +#endif /* __KERNEL__ */
42935 +#endif /* _X86_64_PAGE_H */
42936 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pci_64.h
42937 ===================================================================
42938 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
42939 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pci_64.h 2007-09-14 11:14:51.000000000 +0200
42941 +#ifndef __x8664_PCI_H
42942 +#define __x8664_PCI_H
42944 +#include <asm/io.h>
42948 +#include <linux/mm.h> /* for struct page */
42950 +/* Can be used to override the logic in pci_scan_bus for skipping
42951 + already-configured bus numbers - to be used for buggy BIOSes
42952 + or architectures with incomplete PCI setup by the loader */
42955 +extern unsigned int pcibios_assign_all_busses(void);
42957 +#define pcibios_assign_all_busses() 0
42960 +#include <asm/hypervisor.h>
42961 +#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain())
42963 +extern unsigned long pci_mem_start;
42964 +#define PCIBIOS_MIN_IO 0x1000
42965 +#define PCIBIOS_MIN_MEM (pci_mem_start)
42967 +#define PCIBIOS_MIN_CARDBUS_IO 0x4000
42969 +void pcibios_config_init(void);
42970 +struct pci_bus * pcibios_scan_root(int bus);
42971 +extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value);
42972 +extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value);
42974 +void pcibios_set_master(struct pci_dev *dev);
42975 +void pcibios_penalize_isa_irq(int irq, int active);
42976 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
42977 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
42979 +#include <linux/types.h>
42980 +#include <linux/slab.h>
42981 +#include <asm/scatterlist.h>
42982 +#include <linux/string.h>
42983 +#include <asm/page.h>
42985 +extern void pci_iommu_alloc(void);
42986 +extern int iommu_setup(char *opt);
42988 +/* The PCI address space does equal the physical memory
42989 + * address space. The networking and block device layers use
42990 + * this boolean for bounce buffer decisions
42992 + * On AMD64 this mostly holds, but we set it to zero if a hardware
42993 + * IOMMU (gart) or a software IOMMU (swiotlb) is available.
42995 +#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
42997 +#if defined(CONFIG_IOMMU) || defined(CONFIG_CALGARY_IOMMU)
43000 + * x86-64 always supports DAC, but sometimes it is useful to force
43001 + * devices through the IOMMU to get automatic sg list merging.
43002 + * Optional right now.
43004 +extern int iommu_sac_force;
43005 +#define pci_dac_dma_supported(pci_dev, mask) (!iommu_sac_force)
43007 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
43008 + dma_addr_t ADDR_NAME;
43009 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
43011 +#define pci_unmap_addr(PTR, ADDR_NAME) \
43012 + ((PTR)->ADDR_NAME)
43013 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
43014 + (((PTR)->ADDR_NAME) = (VAL))
43015 +#define pci_unmap_len(PTR, LEN_NAME) \
43016 + ((PTR)->LEN_NAME)
43017 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
43018 + (((PTR)->LEN_NAME) = (VAL))
43020 +#elif defined(CONFIG_SWIOTLB)
43022 +#define pci_dac_dma_supported(pci_dev, mask) 1
43024 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
43025 + dma_addr_t ADDR_NAME;
43026 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
43028 +#define pci_unmap_addr(PTR, ADDR_NAME) \
43029 + ((PTR)->ADDR_NAME)
43030 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
43031 + (((PTR)->ADDR_NAME) = (VAL))
43032 +#define pci_unmap_len(PTR, LEN_NAME) \
43033 + ((PTR)->LEN_NAME)
43034 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
43035 + (((PTR)->LEN_NAME) = (VAL))
43040 +#define pci_dac_dma_supported(pci_dev, mask) 1
43042 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
43043 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
43044 +#define pci_unmap_addr(PTR, ADDR_NAME) (0)
43045 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
43046 +#define pci_unmap_len(PTR, LEN_NAME) (0)
43047 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
43051 +#include <asm-generic/pci-dma-compat.h>
43053 +static inline dma64_addr_t
43054 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
43056 + return ((dma64_addr_t) page_to_phys(page) +
43057 + (dma64_addr_t) offset);
43060 +static inline struct page *
43061 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
43063 + return virt_to_page(__va(dma_addr));
43066 +static inline unsigned long
43067 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
43069 + return (dma_addr & ~PAGE_MASK);
43072 +static inline void
43073 +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
43077 +static inline void
43078 +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
43080 + flush_write_buffers();
43084 +static inline void pci_dma_burst_advice(struct pci_dev *pdev,
43085 + enum pci_dma_burst_strategy *strat,
43086 + unsigned long *strategy_parameter)
43088 + *strat = PCI_DMA_BURST_INFINITY;
43089 + *strategy_parameter = ~0UL;
43093 +#define HAVE_PCI_MMAP
43094 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
43095 + enum pci_mmap_state mmap_state, int write_combine);
43097 +static inline void pcibios_add_platform_entries(struct pci_dev *dev)
43101 +#endif /* __KERNEL__ */
43103 +/* generic pci stuff */
43105 +#include <asm-generic/pci.h>
43108 +#endif /* __x8664_PCI_H */
43109 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_64.h
43110 ===================================================================
43111 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
43112 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_64.h 2007-06-18 08:38:13.000000000 +0200
43114 +#ifndef _X86_64_PGALLOC_H
43115 +#define _X86_64_PGALLOC_H
43117 +#include <asm/fixmap.h>
43118 +#include <asm/pda.h>
43119 +#include <linux/threads.h>
43120 +#include <linux/mm.h>
43121 +#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
43123 +#include <xen/features.h>
43124 +void make_page_readonly(void *va, unsigned int feature);
43125 +void make_page_writable(void *va, unsigned int feature);
43126 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
43127 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
43129 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
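__user_pgd() relies on pgd_alloc() below handing out an order-1 (two page) block: PTRS_PER_PGD is 512 entries of 8 bytes each, exactly PAGE_SIZE, so pgd + PTRS_PER_PGD lands on the second page, which holds the user copy of the top level. A hedged sanity sketch:

	/* Sketch: the user pgd is page two of the order-1 allocation;
	 * 512 entries * sizeof(pgd_t) == 4096 == PAGE_SIZE. */
	static inline void user_pgd_layout_check(pgd_t *pgd)
	{
		BUG_ON((char *)__user_pgd(pgd) != (char *)pgd + PAGE_SIZE);
	}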
43131 +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
43133 + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
43136 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
43138 + if (unlikely((mm)->context.pinned)) {
43139 + BUG_ON(HYPERVISOR_update_va_mapping(
43140 + (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
43141 + pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
43142 + set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
43144 + *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
43148 +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
43150 + if (unlikely((mm)->context.pinned)) {
43151 + BUG_ON(HYPERVISOR_update_va_mapping(
43152 + (unsigned long)pmd,
43153 + pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
43154 + PAGE_KERNEL_RO), 0));
43155 + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
43157 + *(pud) = __pud(_PAGE_TABLE | __pa(pmd));
43162 + * We need to use the batch mode here, but pgd_populate() won't
43163 + * be called frequently.
43165 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
43167 + if (unlikely((mm)->context.pinned)) {
43168 + BUG_ON(HYPERVISOR_update_va_mapping(
43169 + (unsigned long)pud,
43170 + pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
43171 + PAGE_KERNEL_RO), 0));
43172 + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
43173 + set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
43175 + *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
43176 + *(__user_pgd(pgd)) = *(pgd);
43180 +extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr);
43181 +extern void pte_free(struct page *pte);
43183 +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
43187 + pg = pte_alloc_one(mm, addr);
43188 + return pg ? page_address(pg) : NULL;
43191 +static inline void pmd_free(pmd_t *pmd)
43193 + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
43194 + pte_free(virt_to_page(pmd));
43197 +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
43201 + pg = pte_alloc_one(mm, addr);
43202 + return pg ? page_address(pg) : NULL;
43205 +static inline void pud_free(pud_t *pud)
43207 + BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
43208 + pte_free(virt_to_page(pud));
43211 +static inline void pgd_list_add(pgd_t *pgd)
43213 + struct page *page = virt_to_page(pgd);
43215 + spin_lock(&pgd_lock);
43216 + page->index = (pgoff_t)pgd_list;
43218 + pgd_list->private = (unsigned long)&page->index;
43220 + page->private = (unsigned long)&pgd_list;
43221 + spin_unlock(&pgd_lock);
43224 +static inline void pgd_list_del(pgd_t *pgd)
43226 + struct page *next, **pprev, *page = virt_to_page(pgd);
43228 + spin_lock(&pgd_lock);
43229 + next = (struct page *)page->index;
43230 + pprev = (struct page **)page->private;
43233 + next->private = (unsigned long)pprev;
43234 + spin_unlock(&pgd_lock);
43237 +static inline pgd_t *pgd_alloc(struct mm_struct *mm)
43240 + * We allocate two contiguous pages for kernel and user.
43242 + unsigned boundary;
43243 + pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1);
43246 + pgd_list_add(pgd);
43248 + * Copy kernel pointers in from init.
43249 + * Could keep a freelist or slab cache of those because the kernel
43250 + * part never changes.
43252 + boundary = pgd_index(__PAGE_OFFSET);
43253 + memset(pgd, 0, boundary * sizeof(pgd_t));
43254 + memcpy(pgd + boundary,
43255 + init_level4_pgt + boundary,
43256 + (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
43258 + memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
43260 + * Set level3_user_pgt for vsyscall area
43262 + __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
43263 + __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
43267 +static inline void pgd_free(pgd_t *pgd)
43269 + pte_t *ptep = virt_to_ptep(pgd);
43271 + if (!pte_write(*ptep)) {
43272 + xen_pgd_unpin(__pa(pgd));
43273 + BUG_ON(HYPERVISOR_update_va_mapping(
43274 + (unsigned long)pgd,
43275 + pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
43279 + ptep = virt_to_ptep(__user_pgd(pgd));
43281 + if (!pte_write(*ptep)) {
43282 + xen_pgd_unpin(__pa(__user_pgd(pgd)));
43283 + BUG_ON(HYPERVISOR_update_va_mapping(
43284 + (unsigned long)__user_pgd(pgd),
43285 + pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT,
43290 + pgd_list_del(pgd);
43291 + free_pages((unsigned long)pgd, 1);
43294 +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
43296 + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
43298 + make_page_readonly(pte, XENFEAT_writable_page_tables);
43303 +/* Should really implement gc for free page table pages. This could be
43304 + done with a reference count in struct page. */
43306 +static inline void pte_free_kernel(pte_t *pte)
43308 + BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
43309 + make_page_writable(pte, XENFEAT_writable_page_tables);
43310 + free_page((unsigned long)pte);
43313 +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
43314 +#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
43315 +#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
43317 +#endif /* _X86_64_PGALLOC_H */
43318 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_64.h
43319 ===================================================================
43320 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
43321 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_64.h 2008-07-21 11:00:33.000000000 +0200
43323 +#ifndef _X86_64_PGTABLE_H
43324 +#define _X86_64_PGTABLE_H
43327 + * This file contains the functions and defines necessary to modify and use
43328 + * the x86-64 page table tree.
43330 +#include <asm/processor.h>
43331 +#include <asm/fixmap.h>
43332 +#include <asm/bitops.h>
43333 +#include <linux/threads.h>
43334 +#include <linux/sched.h>
43335 +#include <asm/pda.h>
43337 +#include <asm/hypervisor.h>
43339 +extern pud_t level3_user_pgt[512];
43341 +extern void xen_init_pt(void);
43343 +extern pte_t *lookup_address(unsigned long address);
43345 +#define virt_to_ptep(va) \
43347 + pte_t *__ptep = lookup_address((unsigned long)(va)); \
43348 + BUG_ON(!__ptep || !pte_present(*__ptep)); \
43352 +#define arbitrary_virt_to_machine(va) \
43353 + (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
43354 + | ((unsigned long)(va) & (PAGE_SIZE - 1)))
43357 +extern pud_t level3_kernel_pgt[512];
43358 +extern pud_t level3_physmem_pgt[512];
43359 +extern pud_t level3_ident_pgt[512];
43360 +extern pmd_t level2_kernel_pgt[512];
43361 +extern pgd_t init_level4_pgt[];
43362 +extern pgd_t boot_level4_pgt[];
43363 +extern unsigned long __supported_pte_mask;
43365 +#define swapper_pg_dir init_level4_pgt
43367 +extern int nonx_setup(char *str);
43368 +extern void paging_init(void);
43369 +extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
43371 +extern unsigned long pgkern_mask;
43374 + * ZERO_PAGE is a global shared page that is always zero: used
43375 + * for zero-mapped memory areas etc..
43377 +extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
43378 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
43381 + * PGDIR_SHIFT determines what a top-level page table entry can map
43383 +#define PGDIR_SHIFT 39
43384 +#define PTRS_PER_PGD 512
43389 +#define PUD_SHIFT 30
43390 +#define PTRS_PER_PUD 512
43393 + * PMD_SHIFT determines the size of the area a middle-level
43394 + * page table can map
43396 +#define PMD_SHIFT 21
43397 +#define PTRS_PER_PMD 512
43400 + * entries per page directory level
43402 +#define PTRS_PER_PTE 512
43404 +#define pte_ERROR(e) \
43405 + printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
43406 + &(e), __pte_val(e), pte_pfn(e))
43407 +#define pmd_ERROR(e) \
43408 + printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
43409 + &(e), __pmd_val(e), pmd_pfn(e))
43410 +#define pud_ERROR(e) \
43411 + printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
43412 + &(e), __pud_val(e), (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
43413 +#define pgd_ERROR(e) \
43414 + printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
43415 + &(e), __pgd_val(e), (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
43417 +#define pgd_none(x) (!__pgd_val(x))
43418 +#define pud_none(x) (!__pud_val(x))
43420 +static inline void set_pte(pte_t *dst, pte_t val)
43425 +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
43426 +#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
43427 +#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
43429 +static inline void pud_clear (pud_t * pud)
43431 + set_pud(pud, __pud(0));
43434 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
43436 +static inline void pgd_clear (pgd_t * pgd)
43438 + set_pgd(pgd, __pgd(0));
43439 + set_pgd(__user_pgd(pgd), __pgd(0));
43442 +#define pud_page(pud) \
43443 + ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
43445 +#define pte_same(a, b) ((a).pte == (b).pte)
43447 +#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
43449 +#define PMD_SIZE (1UL << PMD_SHIFT)
43450 +#define PMD_MASK (~(PMD_SIZE-1))
43451 +#define PUD_SIZE (1UL << PUD_SHIFT)
43452 +#define PUD_MASK (~(PUD_SIZE-1))
43453 +#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
43454 +#define PGDIR_MASK (~(PGDIR_SIZE-1))
43456 +#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
43457 +#define FIRST_USER_ADDRESS 0
43459 +#ifndef __ASSEMBLY__
43460 +#define MAXMEM 0x3fffffffffffUL
43461 +#define VMALLOC_START 0xffffc20000000000UL
43462 +#define VMALLOC_END 0xffffe1ffffffffffUL
43463 +#define MODULES_VADDR 0xffffffff88000000UL
43464 +#define MODULES_END 0xfffffffffff00000UL
43465 +#define MODULES_LEN (MODULES_END - MODULES_VADDR)
43467 +#define _PAGE_BIT_PRESENT 0
43468 +#define _PAGE_BIT_RW 1
43469 +#define _PAGE_BIT_USER 2
43470 +#define _PAGE_BIT_PWT 3
43471 +#define _PAGE_BIT_PCD 4
43472 +#define _PAGE_BIT_ACCESSED 5
43473 +#define _PAGE_BIT_DIRTY 6
43474 +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
43475 +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
43476 +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
43478 +#define _PAGE_PRESENT 0x001
43479 +#define _PAGE_RW 0x002
43480 +#define _PAGE_USER 0x004
43481 +#define _PAGE_PWT 0x008
43482 +#define _PAGE_PCD 0x010
43483 +#define _PAGE_ACCESSED 0x020
43484 +#define _PAGE_DIRTY 0x040
43485 +#define _PAGE_PSE 0x080 /* 2MB page */
43486 +#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
43487 +#define _PAGE_GLOBAL 0x100 /* Global TLB entry */
43489 +#define _PAGE_PROTNONE 0x080 /* If not present */
43490 +#define _PAGE_NX (1UL<<_PAGE_BIT_NX)
43492 +/* Mapped page is I/O or foreign and has no associated page struct. */
43493 +#define _PAGE_IO 0x200
43495 +#if CONFIG_XEN_COMPAT <= 0x030002
43496 +extern unsigned int __kernel_page_user;
43498 +#define __kernel_page_user 0
43501 +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
43502 +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
43504 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
43506 +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
43507 +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
43508 +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
43509 +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
43510 +#define PAGE_COPY PAGE_COPY_NOEXEC
43511 +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
43512 +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
43513 +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
43514 +#define __PAGE_KERNEL \
43515 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
43516 +#define __PAGE_KERNEL_EXEC \
43517 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
43518 +#define __PAGE_KERNEL_NOCACHE \
43519 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
43520 +#define __PAGE_KERNEL_RO \
43521 + (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
43522 +#define __PAGE_KERNEL_VSYSCALL \
43523 + (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
43524 +#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
43525 + (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
43526 +#define __PAGE_KERNEL_LARGE \
43527 + (__PAGE_KERNEL | _PAGE_PSE)
43528 +#define __PAGE_KERNEL_LARGE_EXEC \
43529 + (__PAGE_KERNEL_EXEC | _PAGE_PSE)
43532 + * We don't support GLOBAL pages in xenolinux64
43534 +#define MAKE_GLOBAL(x) __pgprot((x))
43536 +#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
43537 +#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
43538 +#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
43539 +#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
43540 +#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
43541 +#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
43542 +#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
43543 +#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
43546 +#define __P000 PAGE_NONE
43547 +#define __P001 PAGE_READONLY
43548 +#define __P010 PAGE_COPY
43549 +#define __P011 PAGE_COPY
43550 +#define __P100 PAGE_READONLY_EXEC
43551 +#define __P101 PAGE_READONLY_EXEC
43552 +#define __P110 PAGE_COPY_EXEC
43553 +#define __P111 PAGE_COPY_EXEC
43555 +#define __S000 PAGE_NONE
43556 +#define __S001 PAGE_READONLY
43557 +#define __S010 PAGE_SHARED
43558 +#define __S011 PAGE_SHARED
43559 +#define __S100 PAGE_READONLY_EXEC
43560 +#define __S101 PAGE_READONLY_EXEC
43561 +#define __S110 PAGE_SHARED_EXEC
43562 +#define __S111 PAGE_SHARED_EXEC
43564 +static inline unsigned long pgd_bad(pgd_t pgd)
43566 + unsigned long val = __pgd_val(pgd);
43567 + val &= ~PTE_MASK;
43568 + val &= ~(_PAGE_USER | _PAGE_DIRTY);
43569 + return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
43572 +static inline unsigned long pud_bad(pud_t pud)
43574 + unsigned long val = __pud_val(pud);
43575 + val &= ~PTE_MASK;
43576 + val &= ~(_PAGE_USER | _PAGE_DIRTY);
43577 + return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
43580 +#define set_pte_at(_mm,addr,ptep,pteval) do { \
43581 + if (((_mm) != current->mm && (_mm) != &init_mm) || \
43582 + HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
43583 + set_pte((ptep), (pteval)); \
43586 +#define pte_none(x) (!(x).pte)
43587 +#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
43588 +#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
43590 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
43592 +#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
43593 +#define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
43594 + __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
43595 +#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? end_pfn : \
43596 + (_pte).pte & _PAGE_PRESENT ? \
43597 + mfn_to_local_pfn(__pte_mfn(_pte)) : \
43600 +#define pte_page(x) pfn_to_page(pte_pfn(x))
43602 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
43604 + unsigned long pte = page_nr << PAGE_SHIFT;
43605 + pte |= pgprot_val(pgprot);
43606 + pte &= __supported_pte_mask;
43607 + return __pte(pte);
43610 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
43612 + pte_t pte = *ptep;
43613 + if (!pte_none(pte)) {
43614 + if ((mm != &init_mm) ||
43615 + HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
43616 + pte = __pte_ma(xchg(&ptep->pte, 0));
43621 +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
43624 + pte_t pte = *ptep;
43625 + if (mm->context.pinned)
43626 + xen_l1_entry_update(ptep, __pte(0));
43628 + *ptep = __pte(0);
43631 + return ptep_get_and_clear(mm, addr, ptep);
43634 +#define ptep_clear_flush(vma, addr, ptep) \
43636 + pte_t *__ptep = (ptep); \
43637 + pte_t __res = *__ptep; \
43638 + if (!pte_none(__res) && \
43639 + ((vma)->vm_mm != current->mm || \
43640 + HYPERVISOR_update_va_mapping(addr, __pte(0), \
43641 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
43642 + UVMF_INVLPG|UVMF_MULTI))) { \
43643 + __ptep->pte = 0; \
43644 + flush_tlb_page(vma, addr); \
43650 + * The following only work if pte_present() is true.
43651 + * Undefined behaviour if not..
43653 +#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
43654 +static inline int pte_user(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
43655 +static inline int pte_read(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
43656 +static inline int pte_exec(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
43657 +static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
43658 +static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
43659 +static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
43660 +static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
43661 +static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
43663 +static inline pte_t pte_rdprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; }
43664 +static inline pte_t pte_exprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; }
43665 +static inline pte_t pte_mkclean(pte_t pte) { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
43666 +static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
43667 +static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; }
43668 +static inline pte_t pte_mkread(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
43669 +static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
43670 +static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
43671 +static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
43672 +static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; }
43673 +static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; }
43675 +#define ptep_test_and_clear_dirty(vma, addr, ptep) \
43677 + pte_t __pte = *(ptep); \
43678 + int __ret = pte_dirty(__pte); \
43680 + set_pte_at((vma)->vm_mm, addr, ptep, pte_mkclean(__pte)); \
43684 +#define ptep_test_and_clear_young(vma, addr, ptep) \
43686 + pte_t __pte = *(ptep); \
43687 + int __ret = pte_young(__pte); \
43689 + set_pte_at((vma)->vm_mm, addr, ptep, pte_mkold(__pte)); \
43693 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
43695 + pte_t pte = *ptep;
43696 + if (pte_write(pte))
43697 + set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
43701 + * Macro to mark a page protection value as "uncacheable".
43703 +#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
43705 +static inline int pmd_large(pmd_t pte) {
43706 + return (__pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE;
43711 + * Conversion functions: convert a page and protection to a page entry,
43712 + * and a page entry and page directory to the page they refer to.
43716 + * Level 4 access.
43717 + * Never use these in the common code.
43719 +#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
43720 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
43721 +#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
43722 +#define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
43723 +#define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
43724 +#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
43726 +/* PUD - Level3 access */
43727 +/* to find an entry in a page-table-directory. */
43728 +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
43729 +#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
43730 +#define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
43732 +/* PMD - Level 2 access */
43733 +#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
43734 +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
43736 +#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
43737 +#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
43738 + pmd_index(address))
43739 +#define pmd_none(x) (!__pmd_val(x))
43740 +#if CONFIG_XEN_COMPAT <= 0x030002
43741 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
43742 + can temporarily clear it. */
43743 +#define pmd_present(x) (__pmd_val(x))
43745 +#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
43747 +#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
43748 +#define pmd_bad(x) ((__pmd_val(x) & ~(PTE_MASK | _PAGE_USER | _PAGE_PRESENT)) \
43749 + != (_KERNPG_TABLE & ~(_PAGE_USER | _PAGE_PRESENT)))
43750 +#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
43751 +#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
43753 +#define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
43754 +#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
43755 +#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
43757 +/* PTE - Level 1 access. */
43759 +/* page, protection -> pte */
43760 +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
43761 +#define mk_pte_huge(entry) (__pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
43763 +/* physical address -> PTE */
43764 +static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot)
43766 + unsigned long pteval;
43767 + pteval = physpage | pgprot_val(pgprot);
43768 + return __pte(pteval);
43771 +/* Change flags of a PTE */
43772 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
43775 + * Since this might change the present bit (which controls whether
43776 + * a pte_t object has undergone p2m translation), we must use
43777 + * pte_val() on the input pte and __pte() for the return value.
43779 + unsigned long pteval = pte_val(pte);
43781 + pteval &= _PAGE_CHG_MASK;
43782 + pteval |= pgprot_val(newprot);
43783 + pteval &= __supported_pte_mask;
43784 + return __pte(pteval);
43787 +#define pte_index(address) \
43788 + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
43789 +#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \
43790 + pte_index(address))
43792 +/* x86-64 always has all page tables mapped. */
43793 +#define pte_offset_map(dir,address) pte_offset_kernel(dir,address)
43794 +#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address)
43795 +#define pte_unmap(pte) /* NOP */
43796 +#define pte_unmap_nested(pte) /* NOP */
43798 +#define update_mmu_cache(vma,address,pte) do { } while (0)
43801 + * Rules for using ptep_establish: the pte MUST be a user pte, and
43802 + * must be a present->present transition.
43804 +#define __HAVE_ARCH_PTEP_ESTABLISH
43805 +#define ptep_establish(vma, address, ptep, pteval) \
43807 + if ( likely((vma)->vm_mm == current->mm) ) { \
43808 + BUG_ON(HYPERVISOR_update_va_mapping(address, \
43810 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
43811 + UVMF_INVLPG|UVMF_MULTI)); \
43813 + xen_l1_entry_update(ptep, pteval); \
43814 + flush_tlb_page(vma, address); \
43818 +/* We only update the dirty/accessed state if we set
43819 + * the dirty bit by hand in the kernel, since the hardware
43820 + * will do the accessed bit for us, and we don't want to
43821 + * race with other CPUs that might be updating the dirty
43822 + * bit at the same time. */
43823 +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
43824 +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
43827 + ptep_establish(vma, address, ptep, entry); \
43830 +/* Encode and de-code a swap entry */
43831 +#define __swp_type(x) (((x).val >> 1) & 0x3f)
43832 +#define __swp_offset(x) ((x).val >> 8)
43833 +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
43834 +#define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
43835 +#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
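The encoding keeps bit 0 (_PAGE_PRESENT) clear so a swapped-out pte never reads as present: the type lives in bits 1-6 and the offset starts at bit 8. A worked sketch (swp_entry_t comes from the generic mm headers):

	/* Sketch: __swp_entry(3, 0x1234).val == (3 << 1) | (0x1234 << 8)
	 *                                    == 0x123406, bit 0 clear. */
	static void swp_roundtrip_check(void)
	{
		swp_entry_t e = __swp_entry(3, 0x1234);

		BUG_ON(__swp_type(e) != 3 || __swp_offset(e) != 0x1234);
	}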
43837 +extern spinlock_t pgd_lock;
43838 +extern struct page *pgd_list;
43839 +void vmalloc_sync_all(void);
43841 +#endif /* !__ASSEMBLY__ */
43843 +extern int kern_addr_valid(unsigned long addr);
43845 +#define DOMID_LOCAL (0xFFFFU)
43847 +struct vm_area_struct;
43849 +int direct_remap_pfn_range(struct vm_area_struct *vma,
43850 + unsigned long address,
43851 + unsigned long mfn,
43852 + unsigned long size,
43856 +int direct_kernel_remap_pfn_range(unsigned long address,
43857 + unsigned long mfn,
43858 + unsigned long size,
43862 +int create_lookup_pte_addr(struct mm_struct *mm,
43863 + unsigned long address,
43866 +int touch_pte_range(struct mm_struct *mm,
43867 + unsigned long address,
43868 + unsigned long size);
43870 +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
43871 + unsigned long addr, unsigned long end, pgprot_t newprot);
43873 +#define arch_change_pte_range(mm, pmd, addr, end, newprot) \
43874 + xen_change_pte_range(mm, pmd, addr, end, newprot)
43876 +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
43877 + direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
43879 +#define MK_IOSPACE_PFN(space, pfn) (pfn)
43880 +#define GET_IOSPACE(pfn) 0
43881 +#define GET_PFN(pfn) (pfn)
43883 +#define HAVE_ARCH_UNMAPPED_AREA
43885 +#define pgtable_cache_init() do { } while (0)
43886 +#define check_pgt_cache() do { } while (0)
43888 +#define PAGE_AGP PAGE_KERNEL_NOCACHE
43889 +#define HAVE_PAGE_AGP 1
43891 +/* fs/proc/kcore.c */
43892 +#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
43893 +#define kc_offset_to_vaddr(o) \
43894 + (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
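kc_vaddr_to_offset() strips the sign-extension bits of a canonical 48-bit address; kc_offset_to_vaddr() restores them when bit 47 (__VIRTUAL_MASK_SHIFT - 1) of the offset is set. A worked sketch of the round trip (illustrative only):

	/* Sketch: 0xffff880000001000 (direct map) -> offset
	 * 0x0000880000001000 (bit 47 set) -> same vaddr again;
	 * offsets with bit 47 clear come back unchanged. */
	static void kcore_roundtrip_check(void)
	{
		unsigned long v = 0xffff880000001000UL;

		BUG_ON(kc_offset_to_vaddr(kc_vaddr_to_offset(v)) != v);
	}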
43896 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
43897 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
43898 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
43899 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
43900 +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
43901 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
43902 +#define __HAVE_ARCH_PTE_SAME
43903 +#include <asm-generic/pgtable.h>
43905 +#endif /* _X86_64_PGTABLE_H */
43906 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/processor_64.h
43907 ===================================================================
43908 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
43909 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/processor_64.h 2008-03-06 08:54:32.000000000 +0100
43912 + * include/asm-x86_64/processor.h
43914 + * Copyright (C) 1994 Linus Torvalds
43917 +#ifndef __ASM_X86_64_PROCESSOR_H
43918 +#define __ASM_X86_64_PROCESSOR_H
43920 +#include <asm/segment.h>
43921 +#include <asm/page.h>
43922 +#include <asm/types.h>
43923 +#include <asm/sigcontext.h>
43924 +#include <asm/cpufeature.h>
43925 +#include <linux/threads.h>
43926 +#include <asm/msr.h>
43927 +#include <asm/current.h>
43928 +#include <asm/system.h>
43929 +#include <asm/mmsegment.h>
43930 +#include <asm/percpu.h>
43931 +#include <linux/personality.h>
43932 +#include <linux/cpumask.h>
43934 +#define TF_MASK 0x00000100
43935 +#define IF_MASK 0x00000200
43936 +#define IOPL_MASK 0x00003000
43937 +#define NT_MASK 0x00004000
43938 +#define VM_MASK 0x00020000
43939 +#define AC_MASK 0x00040000
43940 +#define VIF_MASK 0x00080000 /* virtual interrupt flag */
43941 +#define VIP_MASK 0x00100000 /* virtual interrupt pending */
43942 +#define ID_MASK 0x00200000
43944 +#define desc_empty(desc) \
43945 + (!((desc)->a | (desc)->b))
43947 +#define desc_equal(desc1, desc2) \
43948 + (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
43951 + * Default implementation of macro that returns current
43952 + * instruction pointer ("program counter").
43954 +#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
43957 + * CPU type and hardware bug flags. Kept separately for each CPU.
43960 +struct cpuinfo_x86 {
43961 + __u8 x86; /* CPU family */
43962 + __u8 x86_vendor; /* CPU vendor */
43965 + int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
43966 + __u32 x86_capability[NCAPINTS];
43967 + char x86_vendor_id[16];
43968 + char x86_model_id[64];
43969 + int x86_cache_size; /* in KB */
43970 + int x86_clflush_size;
43971 + int x86_cache_alignment;
43972 +	int x86_tlbsize;	/* number of 4K pages in DTLB/ITLB combined (in pages) */
43973 + __u8 x86_virt_bits, x86_phys_bits;
43974 + __u8 x86_max_cores; /* cpuid returned max cores value */
43976 + __u32 extended_cpuid_level; /* Max extended CPUID function supported */
43977 + unsigned long loops_per_jiffy;
43979 + cpumask_t llc_shared_map; /* cpus sharing the last level cache */
43983 + __u8 booted_cores; /* number of cores as seen by OS */
43984 + __u8 phys_proc_id; /* Physical Processor id. */
43985 + __u8 cpu_core_id; /* Core id. */
43987 +} ____cacheline_aligned;
43989 +#define X86_VENDOR_INTEL 0
43990 +#define X86_VENDOR_CYRIX 1
43991 +#define X86_VENDOR_AMD 2
43992 +#define X86_VENDOR_UMC 3
43993 +#define X86_VENDOR_NEXGEN 4
43994 +#define X86_VENDOR_CENTAUR 5
43995 +#define X86_VENDOR_RISE 6
43996 +#define X86_VENDOR_TRANSMETA 7
43997 +#define X86_VENDOR_NUM 8
43998 +#define X86_VENDOR_UNKNOWN 0xff
44001 +extern struct cpuinfo_x86 cpu_data[];
44002 +#define current_cpu_data cpu_data[smp_processor_id()]
44004 +#define cpu_data (&boot_cpu_data)
44005 +#define current_cpu_data boot_cpu_data
44008 +extern char ignore_irq13;
44010 +extern void identify_cpu(struct cpuinfo_x86 *);
44011 +extern void print_cpu_info(struct cpuinfo_x86 *);
44012 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
44013 +extern unsigned short num_cache_leaves;
44018 +#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
44019 +#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
44020 +#define X86_EFLAGS_AF 0x00000010 /* Auxiliary Carry Flag */
44021 +#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
44022 +#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
44023 +#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
44024 +#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
44025 +#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
44026 +#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
44027 +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
44028 +#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
44029 +#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
44030 +#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
44031 +#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
44032 +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
44033 +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
44034 +#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
44037 + * Intel CPU features in CR4
44039 +#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
44040 +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
44041 +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
44042 +#define X86_CR4_DE 0x0008 /* enable debugging extensions */
44043 +#define X86_CR4_PSE 0x0010 /* enable page size extensions */
44044 +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
44045 +#define X86_CR4_MCE 0x0040 /* Machine check enable */
44046 +#define X86_CR4_PGE 0x0080 /* enable global pages */
44047 +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
44048 +#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
44049 +#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
44052 + * Save the cr4 feature set we're using (ie
44053 + * Pentium 4MB enable and PPro Global page
44054 + * enable), so that any CPU's that boot up
44055 + * after us can get the correct flags.
44057 +extern unsigned long mmu_cr4_features;
44059 +static inline void set_in_cr4 (unsigned long mask)
44061 + mmu_cr4_features |= mask;
44062 + __asm__("movq %%cr4,%%rax\n\t"
44063 + "orq %0,%%rax\n\t"
44064 + "movq %%rax,%%cr4\n"
44069 +static inline void clear_in_cr4 (unsigned long mask)
44071 + mmu_cr4_features &= ~mask;
44072 + __asm__("movq %%cr4,%%rax\n\t"
44073 + "andq %0,%%rax\n\t"
44074 + "movq %%rax,%%cr4\n"
44075 + : : "irg" (~mask)
44081 + * User space process size. 47 bits minus one guard page.
44083 +#define TASK_SIZE64 (0x800000000000UL - 4096)
44085 +/* This decides where the kernel will search for a free chunk of vm
44086 + * space during mmap's.
44088 +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
44090 +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
44091 +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
44093 +#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3)
44096 + * Size of io_bitmap.
44098 +#define IO_BITMAP_BITS 65536
44099 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
44100 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
44101 +#ifndef CONFIG_X86_NO_TSS
44102 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
44104 +#define INVALID_IO_BITMAP_OFFSET 0x8000
44106 +struct i387_fxsave_struct {
44115 + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
44116 + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 128 bytes */
44118 +} __attribute__ ((aligned (16)));
44120 +union i387_union {
44121 + struct i387_fxsave_struct fxsave;
44124 +#ifndef CONFIG_X86_NO_TSS
44125 +struct tss_struct {
44135 + u16 io_bitmap_base;
44137 + * The extra 1 is there because the CPU will access an
44138 + * additional byte beyond the end of the IO permission
44139 + * bitmap. The extra byte must be all 1 bits, and must
44140 + * be within the limit. Thus we have:
44142 + * 128 bytes, the bitmap itself, for ports 0..0x3ff
44143 + * 8 bytes, for an extra "long" of ~0UL
44145 + unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
44146 +} __attribute__((packed)) ____cacheline_aligned;
44148 +DECLARE_PER_CPU(struct tss_struct,init_tss);
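Given the terminator rule described in the comment above, initialization has to leave one extra long of all ones past the bitmap proper. A hedged sketch (the function name is illustrative; memset() from <linux/string.h>):

/* Sketch: deny all ports and install the mandatory terminator long. */
static void sketch_init_io_bitmap(struct tss_struct *tss)
{
        memset(tss->io_bitmap, 0xff, sizeof(tss->io_bitmap));
        /* The last long doubles as the all-ones byte past the bitmap end
         * (already 0xff from the memset; kept explicit as the invariant). */
        tss->io_bitmap[IO_BITMAP_LONGS] = ~0UL;
        tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; /* none active yet */
}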
44152 +extern struct cpuinfo_x86 boot_cpu_data;
44153 +#ifndef CONFIG_X86_NO_TSS
44154 +/* Save the original ist values for checking stack pointers during debugging */
44156 + unsigned long ist[7];
44158 +DECLARE_PER_CPU(struct orig_ist, orig_ist);
44161 +#ifdef CONFIG_X86_VSMP
44162 +#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
44163 +#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
44165 +#define ARCH_MIN_TASKALIGN 16
44166 +#define ARCH_MIN_MMSTRUCT_ALIGN 0
44169 +struct thread_struct {
44170 + unsigned long rsp0;
44171 + unsigned long rsp;
44172 + unsigned long userrsp; /* Copy from PDA */
44173 + unsigned long fs;
44174 + unsigned long gs;
44175 + unsigned short es, ds, fsindex, gsindex;
44176 +/* Hardware debugging registers */
44177 + unsigned long debugreg0;
44178 + unsigned long debugreg1;
44179 + unsigned long debugreg2;
44180 + unsigned long debugreg3;
44181 + unsigned long debugreg6;
44182 + unsigned long debugreg7;
44184 + unsigned long cr2, trap_no, error_code;
44185 +/* floating point info */
44186 + union i387_union i387 __attribute__((aligned(16)));
44187 +/* IO permissions. The bitmap could be moved into the GDT, which would make
44188 + switching faster for a limited number of tasks that use ioperm. -AK */
44190 + unsigned long *io_bitmap_ptr;
44191 + unsigned io_bitmap_max;
44192 +/* cached TLS descriptors. */
44193 + u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
44194 + unsigned int iopl;
44195 +} __attribute__((aligned(16)));
44197 +#define INIT_THREAD { \
44198 + .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
44201 +#ifndef CONFIG_X86_NO_TSS
44202 +#define INIT_TSS { \
44203 + .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
44207 +#define INIT_MMAP \
44208 +{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
44210 +#define start_thread(regs,new_rip,new_rsp) do { \
44211 + asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
44212 + load_gs_index(0); \
44213 + (regs)->rip = (new_rip); \
44214 + (regs)->rsp = (new_rsp); \
44215 + write_pda(oldrsp, (new_rsp)); \
44216 + (regs)->cs = __USER_CS; \
44217 + (regs)->ss = __USER_DS; \
44218 + (regs)->eflags = 0x200; \
44219 + set_fs(USER_DS); \
44222 +#define get_debugreg(var, register) \
44223 + var = HYPERVISOR_get_debugreg(register)
44224 +#define set_debugreg(value, register) do { \
44225 + if (HYPERVISOR_set_debugreg(register, value)) \
44229 +struct task_struct;
44232 +/* Free all resources held by a thread. */
44233 +extern void release_thread(struct task_struct *);
44235 +/* Prepare to copy thread state - unlazy all lazy status */
44236 +extern void prepare_to_copy(struct task_struct *tsk);
44239 + * create a kernel thread without removing it from tasklists
44241 +extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
44244 + * Return saved PC of a blocked thread.
44245 + * What is this good for? It will always be the scheduler or ret_from_fork.
44247 +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
44249 +extern unsigned long get_wchan(struct task_struct *p);
44250 +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
44251 +#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
44252 +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
44255 +struct microcode_header {
44256 + unsigned int hdrver;
44257 + unsigned int rev;
44258 + unsigned int date;
44259 + unsigned int sig;
44260 + unsigned int cksum;
44261 + unsigned int ldrver;
44263 + unsigned int datasize;
44264 + unsigned int totalsize;
44265 + unsigned int reserved[3];
44268 +struct microcode {
44269 + struct microcode_header hdr;
44270 + unsigned int bits[0];
44273 +typedef struct microcode microcode_t;
44274 +typedef struct microcode_header microcode_header_t;
44276 +/* microcode format is extended from Prescott processors */
44277 +struct extended_signature {
44278 + unsigned int sig;
44280 + unsigned int cksum;
44283 +struct extended_sigtable {
44284 + unsigned int count;
44285 + unsigned int cksum;
44286 + unsigned int reserved[3];
44287 + struct extended_signature sigs[0];
44291 +#define ASM_NOP1 K8_NOP1
44292 +#define ASM_NOP2 K8_NOP2
44293 +#define ASM_NOP3 K8_NOP3
44294 +#define ASM_NOP4 K8_NOP4
44295 +#define ASM_NOP5 K8_NOP5
44296 +#define ASM_NOP6 K8_NOP6
44297 +#define ASM_NOP7 K8_NOP7
44298 +#define ASM_NOP8 K8_NOP8
44300 +/* Opteron nops */
44301 +#define K8_NOP1 ".byte 0x90\n"
44302 +#define K8_NOP2 ".byte 0x66,0x90\n"
44303 +#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
44304 +#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
44305 +#define K8_NOP5 K8_NOP3 K8_NOP2
44306 +#define K8_NOP6 K8_NOP3 K8_NOP3
44307 +#define K8_NOP7 K8_NOP4 K8_NOP3
44308 +#define K8_NOP8 K8_NOP4 K8_NOP4
44310 +#define ASM_NOP_MAX 8
44312 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
44313 +static inline void rep_nop(void)
44315 + __asm__ __volatile__("rep;nop": : :"memory");
44318 +/* Stop speculative execution */
44319 +static inline void sync_core(void)
44322 + asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
44325 +#define cpu_has_fpu 1
44327 +#define ARCH_HAS_PREFETCH
44328 +static inline void prefetch(void *x)
44330 + asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
44333 +#define ARCH_HAS_PREFETCHW 1
44334 +static inline void prefetchw(void *x)
44336 + alternative_input("prefetcht0 (%1)",
44337 + "prefetchw (%1)",
44338 + X86_FEATURE_3DNOW,
44342 +#define ARCH_HAS_SPINLOCK_PREFETCH 1
44344 +#define spin_lock_prefetch(x) prefetchw(x)
44346 +#define cpu_relax() rep_nop()
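The REP NOP comment above is about loops like the following sketch (the flag is illustrative):

/* Sketch: poll a flag politely; PAUSE eases pipeline and bus pressure. */
static void sketch_wait_for(volatile int *flag)
{
        while (!*flag)
                cpu_relax();            /* expands to rep_nop() */
}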
44349 + * NSC/Cyrix CPU configuration register indexes
44351 +#define CX86_CCR0 0xc0
44352 +#define CX86_CCR1 0xc1
44353 +#define CX86_CCR2 0xc2
44354 +#define CX86_CCR3 0xc3
44355 +#define CX86_CCR4 0xe8
44356 +#define CX86_CCR5 0xe9
44357 +#define CX86_CCR6 0xea
44358 +#define CX86_CCR7 0xeb
44359 +#define CX86_DIR0 0xfe
44360 +#define CX86_DIR1 0xff
44361 +#define CX86_ARR_BASE 0xc4
44362 +#define CX86_RCR_BASE 0xdc
44365 + * NSC/Cyrix CPU indexed register access macros
44368 +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
44370 +#define setCx86(reg, data) do { \
44371 + outb((reg), 0x22); \
44372 + outb((data), 0x23); \
44375 +static inline void serialize_cpu(void)
44377 + __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
44380 +static inline void __monitor(const void *eax, unsigned long ecx,
44381 + unsigned long edx)
44383 + /* "monitor %eax,%ecx,%edx;" */
44385 + ".byte 0x0f,0x01,0xc8;"
44386 + : :"a" (eax), "c" (ecx), "d"(edx));
44389 +static inline void __mwait(unsigned long eax, unsigned long ecx)
44391 + /* "mwait %eax,%ecx;" */
44393 + ".byte 0x0f,0x01,0xc9;"
44394 + : :"a" (eax), "c" (ecx));
44397 +#define stack_current() \
44399 + struct thread_info *ti; \
44400 + asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
44404 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
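The stack_current() macro above works because thread_info sits at the base of the kernel stack, so ANDing %rsp with CURRENT_MASK yields its address. An equivalent C sketch (assuming CURRENT_MASK is the stack-base mask, i.e. ~(THREAD_SIZE - 1)):

/* Sketch: derive thread_info from the stack pointer, as stack_current() does. */
static struct thread_info *sketch_stack_thread_info(void)
{
        unsigned long rsp;

        asm("movq %%rsp,%0" : "=r" (rsp));
        return (struct thread_info *)(rsp & CURRENT_MASK);
}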
44406 +extern unsigned long boot_option_idle_override;
44407 +/* Boot loader type from the setup header */
44408 +extern int bootloader_type;
44410 +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
44412 +#endif /* __ASM_X86_64_PROCESSOR_H */
44413 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/smp_64.h
44414 ===================================================================
44415 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
44416 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/smp_64.h 2007-06-12 13:14:13.000000000 +0200
44418 +#ifndef __ASM_SMP_H
44419 +#define __ASM_SMP_H
44422 + * We need the APIC definitions automatically as part of 'smp.h'
44424 +#ifndef __ASSEMBLY__
44425 +#include <linux/threads.h>
44426 +#include <linux/cpumask.h>
44427 +#include <linux/bitops.h>
44428 +extern int disable_apic;
44431 +#ifdef CONFIG_X86_LOCAL_APIC
44432 +#ifndef __ASSEMBLY__
44433 +#include <asm/fixmap.h>
44434 +#include <asm/mpspec.h>
44435 +#ifdef CONFIG_X86_IO_APIC
44436 +#include <asm/io_apic.h>
44438 +#include <asm/apic.h>
44439 +#include <asm/thread_info.h>
44446 +#include <asm/pda.h>
44450 +extern cpumask_t cpu_present_mask;
44451 +extern cpumask_t cpu_possible_map;
44452 +extern cpumask_t cpu_online_map;
44453 +extern cpumask_t cpu_initialized;
44456 + * Private routines/data
44459 +extern void smp_alloc_memory(void);
44460 +extern volatile unsigned long smp_invalidate_needed;
44461 +extern int pic_mode;
44462 +extern void lock_ipi_call_lock(void);
44463 +extern void unlock_ipi_call_lock(void);
44464 +extern int smp_num_siblings;
44465 +extern void smp_send_reschedule(int cpu);
44466 +void smp_stop_cpu(void);
44467 +extern int smp_call_function_single(int cpuid, void (*func) (void *info),
44468 + void *info, int retry, int wait);
44470 +extern cpumask_t cpu_sibling_map[NR_CPUS];
44471 +extern cpumask_t cpu_core_map[NR_CPUS];
44472 +extern u8 cpu_llc_id[NR_CPUS];
44474 +#define SMP_TRAMPOLINE_BASE 0x6000
44477 + * On x86 all CPUs are mapped 1:1 to the APIC space.
44478 + * This simplifies scheduling and IPI sending and
44479 + * compresses data structures.
44482 +static inline int num_booting_cpus(void)
44484 + return cpus_weight(cpu_possible_map);
44487 +#define raw_smp_processor_id() read_pda(cpunumber)
44489 +#ifdef CONFIG_X86_LOCAL_APIC
44490 +static inline int hard_smp_processor_id(void)
44492 + /* we don't want to mark this access volatile - bad code generation */
44493 + return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
44497 +extern int safe_smp_processor_id(void);
44498 +extern int __cpu_disable(void);
44499 +extern void __cpu_die(unsigned int cpu);
44500 +extern void prefill_possible_map(void);
44501 +extern unsigned num_processors;
44502 +extern unsigned disabled_cpus;
44504 +#endif /* !ASSEMBLY */
44506 +#define NO_PROC_ID 0xFF /* No processor magic marker */
44512 + * Some low-level functions might want to know about
44513 + * the real APIC ID <-> CPU # mapping.
44515 +extern u8 x86_cpu_to_apicid[NR_CPUS]; /* physical ID */
44516 +extern u8 x86_cpu_to_log_apicid[NR_CPUS];
44517 +extern u8 bios_cpu_apicid[];
44519 +#ifdef CONFIG_X86_LOCAL_APIC
44520 +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
44522 + return cpus_addr(cpumask)[0];
44525 +static inline int cpu_present_to_apicid(int mps_cpu)
44527 + if (mps_cpu < NR_CPUS)
44528 + return (int)bios_cpu_apicid[mps_cpu];
44530 + return BAD_APICID;
44534 +#endif /* !ASSEMBLY */
44536 +#ifndef CONFIG_SMP
44537 +#define stack_smp_processor_id() 0
44538 +#define safe_smp_processor_id() 0
44539 +#define cpu_logical_map(x) (x)
44541 +#include <asm/thread_info.h>
44542 +#define stack_smp_processor_id() \
44544 + struct thread_info *ti; \
44545 + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
44550 +#ifndef __ASSEMBLY__
44551 +#ifdef CONFIG_X86_LOCAL_APIC
44552 +static __inline int logical_smp_processor_id(void)
44554 + /* we don't want to mark this access volatile - bad code generation */
44555 + return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
44561 +#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
44563 +#define cpu_physical_id(cpu) boot_cpu_id
44568 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/system_64.h
44569 ===================================================================
44570 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
44571 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/system_64.h 2007-11-26 16:59:25.000000000 +0100
44573 +#ifndef __ASM_SYSTEM_H
44574 +#define __ASM_SYSTEM_H
44576 +#include <linux/kernel.h>
44577 +#include <asm/segment.h>
44578 +#include <asm/alternative.h>
44580 +#include <asm/synch_bitops.h>
44581 +#include <asm/hypervisor.h>
44582 +#include <xen/interface/arch-x86_64.h>
44586 +#define __STR(x) #x
44587 +#define STR(x) __STR(x)
44589 +#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
44590 +#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
44592 +/* frame pointer must be last for get_wchan */
44593 +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
44594 +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\n\t"
44596 +#define __EXTRA_CLOBBER \
44597 + ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
44599 +#define switch_to(prev,next,last) \
44600 + asm volatile(SAVE_CONTEXT \
44601 + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
44602 + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
44603 + "call __switch_to\n\t" \
44604 + ".globl thread_return\n" \
44605 + "thread_return:\n\t" \
44606 + "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
44607 + "movq %P[thread_info](%%rsi),%%r8\n\t" \
44608 + LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
44609 + "movq %%rax,%%rdi\n\t" \
44610 + "jc ret_from_fork\n\t" \
44611 + RESTORE_CONTEXT \
44613 + : [next] "S" (next), [prev] "D" (prev), \
44614 + [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
44615 + [ti_flags] "i" (offsetof(struct thread_info, flags)),\
44616 + [tif_fork] "i" (TIF_FORK), \
44617 + [thread_info] "i" (offsetof(struct task_struct, thread_info)), \
44618 + [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
44619 + : "memory", "cc" __EXTRA_CLOBBER)
44621 +extern void load_gs_index(unsigned);
44624 + * Load a segment. Fall back on loading the zero
44625 + * segment if something goes wrong.
44627 +#define loadsegment(seg,value) \
44628 + asm volatile("\n" \
44630 + "movl %k0,%%" #seg "\n" \
44632 + ".section .fixup,\"ax\"\n" \
44634 + "movl %1,%%" #seg "\n\t" \
44637 + ".section __ex_table,\"a\"\n\t" \
44639 + ".quad 1b,3b\n" \
44641 + : :"r" (value), "r" (0))
44644 + * Clear and set 'TS' bit respectively
44646 +#define clts() (HYPERVISOR_fpu_taskswitch(0))
44648 +static inline unsigned long read_cr0(void)
44650 + unsigned long cr0;
44651 + asm volatile("movq %%cr0,%0" : "=r" (cr0));
44655 +static inline void write_cr0(unsigned long val)
44657 + asm volatile("movq %0,%%cr0" :: "r" (val));
44660 +#define read_cr3() ({ \
44661 + unsigned long __dummy; \
44662 + asm("movq %%cr3,%0" : "=r" (__dummy)); \
44663 + machine_to_phys(__dummy); \
44666 +static inline unsigned long read_cr4(void)
44668 + unsigned long cr4;
44669 + asm("movq %%cr4,%0" : "=r" (cr4));
44673 +static inline void write_cr4(unsigned long val)
44675 + asm volatile("movq %0,%%cr4" :: "r" (val));
44678 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
44680 +#define wbinvd() \
44681 + __asm__ __volatile__ ("wbinvd": : :"memory");
44684 + * On SMP systems, when the scheduler does migration-cost autodetection,
44685 + * it needs a way to flush as much of the CPU's caches as possible.
44687 +static inline void sched_cacheflush(void)
44692 +#endif /* __KERNEL__ */
44694 +#define nop() __asm__ __volatile__ ("nop")
44696 +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
44698 +#define tas(ptr) (xchg((ptr),1))
44700 +#define __xg(x) ((volatile long *)(x))
44702 +static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
44707 +#define _set_64bit set_64bit
44710 + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
44711 + * Note 2: xchg has a side effect, so the volatile attribute is necessary,
44712 + * though strictly the primitive is invalid since *ptr is an output argument. --ANK
44714 +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
44718 + __asm__ __volatile__("xchgb %b0,%1"
44720 + :"m" (*__xg(ptr)), "0" (x)
44724 + __asm__ __volatile__("xchgw %w0,%1"
44726 + :"m" (*__xg(ptr)), "0" (x)
44730 + __asm__ __volatile__("xchgl %k0,%1"
44732 + :"m" (*__xg(ptr)), "0" (x)
44736 + __asm__ __volatile__("xchgq %0,%1"
44738 + :"m" (*__xg(ptr)), "0" (x)
44746 + * Atomic compare and exchange. Compare OLD with MEM, if identical,
44747 + * store NEW in MEM. Return the initial value in MEM. Success is
44748 + * indicated by comparing RETURN with OLD.
44751 +#define __HAVE_ARCH_CMPXCHG 1
44753 +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
44754 + unsigned long new, int size)
44756 + unsigned long prev;
44759 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
44761 + : "q"(new), "m"(*__xg(ptr)), "0"(old)
44765 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
44767 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
44771 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2"
44773 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
44777 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2"
44779 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
44786 +#define cmpxchg(ptr,o,n)\
44787 + ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
44788 + (unsigned long)(n),sizeof(*(ptr))))
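The canonical use of cmpxchg() is a read-modify-write retry loop; a sketch against the macro above (the counter and limit are illustrative):

/* Sketch: lock-free saturating increment built on cmpxchg(). */
static unsigned long sketch_inc_to_limit(unsigned long *ctr, unsigned long limit)
{
        unsigned long old, new;

        do {
                old = *ctr;
                if (old >= limit)
                        return old;     /* saturated, nothing to do */
                new = old + 1;
                /* cmpxchg returns the prior value; retry if we raced */
        } while (cmpxchg(ctr, old, new) != old);

        return new;
}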
44791 +#define smp_mb() mb()
44792 +#define smp_rmb() rmb()
44793 +#define smp_wmb() wmb()
44794 +#define smp_read_barrier_depends() do {} while(0)
44796 +#define smp_mb() barrier()
44797 +#define smp_rmb() barrier()
44798 +#define smp_wmb() barrier()
44799 +#define smp_read_barrier_depends() do {} while(0)
44804 + * Force strict CPU ordering.
44805 + * And yes, this is required on UP too when we're talking
44808 +#define mb() asm volatile("mfence":::"memory")
44809 +#define rmb() asm volatile("lfence":::"memory")
44811 +#ifdef CONFIG_UNORDERED_IO
44812 +#define wmb() asm volatile("sfence" ::: "memory")
44814 +#define wmb() asm volatile("" ::: "memory")
44816 +#define read_barrier_depends() do {} while(0)
44817 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
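A sketch of the publish/consume pairing these barriers exist for (both functions and the variables are illustrative):

/* Sketch: writer orders data before flag; reader orders flag before data. */
static int sketch_data, sketch_ready;

static void sketch_publish(int v)
{
        sketch_data = v;
        smp_wmb();              /* data store visible before flag store */
        sketch_ready = 1;
}

static int sketch_consume(int *out)
{
        if (!sketch_ready)
                return 0;       /* nothing published yet */
        smp_rmb();              /* flag load ordered before data load */
        *out = sketch_data;
        return 1;
}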
44819 +#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
44821 +#include <linux/irqflags.h>
44823 +void cpu_idle_wait(void);
44825 +extern unsigned long arch_align_stack(unsigned long sp);
44826 +extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
44829 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_64.h
44830 ===================================================================
44831 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
44832 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_64.h 2007-11-26 16:59:25.000000000 +0100
44834 +#ifndef _X8664_TLBFLUSH_H
44835 +#define _X8664_TLBFLUSH_H
44837 +#include <linux/mm.h>
44838 +#include <asm/processor.h>
44840 +#define __flush_tlb() xen_tlb_flush()
44843 + * Global pages have to be flushed a bit differently. Not a real
44844 + * performance problem because this does not happen often.
44846 +#define __flush_tlb_global() xen_tlb_flush()
44849 +extern unsigned long pgkern_mask;
44851 +#define __flush_tlb_all() __flush_tlb_global()
44853 +#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
44859 + * - flush_tlb() flushes the current mm struct TLBs
44860 + * - flush_tlb_all() flushes all processes TLBs
44861 + * - flush_tlb_mm(mm) flushes the specified mm context TLB's
44862 + * - flush_tlb_page(vma, vmaddr) flushes one page
44863 + * - flush_tlb_range(vma, start, end) flushes a range of pages
44864 + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
44865 + * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
44867 + * x86-64 can only flush individual pages or full VMs. For a range flush
44868 + * we always do the full VM. It might be worth checking whether, for a small
44869 + * range, a few INVLPGs in a row are a win.
44872 +#ifndef CONFIG_SMP
44874 +#define flush_tlb() __flush_tlb()
44875 +#define flush_tlb_all() __flush_tlb_all()
44876 +#define local_flush_tlb() __flush_tlb()
44878 +static inline void flush_tlb_mm(struct mm_struct *mm)
44880 + if (mm == current->active_mm)
44884 +static inline void flush_tlb_page(struct vm_area_struct *vma,
44885 + unsigned long addr)
44887 + if (vma->vm_mm == current->active_mm)
44888 + __flush_tlb_one(addr);
44891 +static inline void flush_tlb_range(struct vm_area_struct *vma,
44892 + unsigned long start, unsigned long end)
44894 + if (vma->vm_mm == current->active_mm)
44900 +#include <asm/smp.h>
44902 +#define local_flush_tlb() \
44905 +#define flush_tlb_all xen_tlb_flush_all
44906 +#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
44907 +#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
44908 +#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
44910 +#define flush_tlb() flush_tlb_current_task()
44912 +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
44914 + flush_tlb_mm(vma->vm_mm);
44917 +#define TLBSTATE_OK 1
44918 +#define TLBSTATE_LAZY 2
44920 +/* Roughly an IPI every 20MB with 4k pages for freeing page table
44921 + ranges. Cost is about 42k of memory for each CPU. */
44922 +#define ARCH_FREE_PTE_NR 5350
44926 +#define flush_tlb_kernel_range(start, end) flush_tlb_all()
44928 +static inline void flush_tlb_pgtables(struct mm_struct *mm,
44929 + unsigned long start, unsigned long end)
44931 + /* x86_64 does not keep any page table caches in a software TLB.
44932 + The CPUs do keep them in their hardware TLBs, but those are handled
44933 + by the normal TLB flushing algorithms. */
44936 +#endif /* _X8664_TLBFLUSH_H */
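A sketch of the single-page case from the interface list above (the PTE update itself is elided; the flush call is the point here):

/* Sketch: after rewriting one PTE, drop the stale translation. */
static void sketch_remap_one(struct vm_area_struct *vma, unsigned long addr)
{
        /* ... install the new PTE for addr here ... */
        flush_tlb_page(vma, addr);      /* backed by Xen invlpg operations */
}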
44937 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/xor_64.h
44938 ===================================================================
44939 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
44940 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/xor_64.h 2007-06-12 13:14:13.000000000 +0200
44943 + * x86-64 changes / gcc fixes from Andi Kleen.
44944 + * Copyright 2002 Andi Kleen, SuSE Labs.
44946 + * This hasn't been optimized for the Hammer yet, but there is likely
44947 + * no advantage to be gained from x86-64 here anyway.
44950 +typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
44952 +/* Doesn't use gcc to save the XMM registers, because there is no easy way to
44953 + tell it to do a clts before the register saving. */
44954 +#define XMMS_SAVE do { \
44955 + preempt_disable(); \
44956 + if (!(current_thread_info()->status & TS_USEDFPU)) \
44958 + __asm__ __volatile__ ( \
44959 + "movups %%xmm0,(%1) ;\n\t" \
44960 + "movups %%xmm1,0x10(%1) ;\n\t" \
44961 + "movups %%xmm2,0x20(%1) ;\n\t" \
44962 + "movups %%xmm3,0x30(%1) ;\n\t" \
44964 + : "r" (xmm_save) \
44968 +#define XMMS_RESTORE do { \
44971 + "movups (%1),%%xmm0 ;\n\t" \
44972 + "movups 0x10(%1),%%xmm1 ;\n\t" \
44973 + "movups 0x20(%1),%%xmm2 ;\n\t" \
44974 + "movups 0x30(%1),%%xmm3 ;\n\t" \
44976 + : "r" (cr0), "r" (xmm_save) \
44978 + if (!(current_thread_info()->status & TS_USEDFPU)) \
44980 + preempt_enable(); \
44983 +#define OFFS(x) "16*("#x")"
44984 +#define PF_OFFS(x) "256+16*("#x")"
44985 +#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
44986 +#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
44987 +#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
44988 +#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
44989 +#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
44990 +#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
44991 +#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
44992 +#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
44993 +#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
44994 +#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
44995 +#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
44996 +#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
44997 +#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
45001 +xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
45003 + unsigned int lines = bytes >> 8;
45004 + unsigned long cr0;
45005 + xmm_store_t xmm_save[4];
45011 +#define BLOCK(i) \
45041 + " addq %[inc], %[p1] ;\n"
45042 + " addq %[inc], %[p2] ;\n"
45043 + " decl %[cnt] ; jnz 1b"
45044 + : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
45045 + : [inc] "r" (256UL)
45052 +xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
45053 + unsigned long *p3)
45055 + unsigned int lines = bytes >> 8;
45056 + xmm_store_t xmm_save[4];
45057 + unsigned long cr0;
45061 + __asm__ __volatile__ (
45063 +#define BLOCK(i) \
45099 + " addq %[inc], %[p1] ;\n"
45100 + " addq %[inc], %[p2] ;\n"
45101 + " addq %[inc], %[p3] ;\n"
45102 + " decl %[cnt] ; jnz 1b"
45103 + : [cnt] "+r" (lines),
45104 + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
45105 + : [inc] "r" (256UL)
45111 +xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
45112 + unsigned long *p3, unsigned long *p4)
45114 + unsigned int lines = bytes >> 8;
45115 + xmm_store_t xmm_save[4];
45116 + unsigned long cr0;
45120 + __asm__ __volatile__ (
45122 +#define BLOCK(i) \
45164 + " addq %[inc], %[p1] ;\n"
45165 + " addq %[inc], %[p2] ;\n"
45166 + " addq %[inc], %[p3] ;\n"
45167 + " addq %[inc], %[p4] ;\n"
45168 + " decl %[cnt] ; jnz 1b"
45169 + : [cnt] "+c" (lines),
45170 + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
45171 + : [inc] "r" (256UL)
45178 +xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
45179 + unsigned long *p3, unsigned long *p4, unsigned long *p5)
45181 + unsigned int lines = bytes >> 8;
45182 + xmm_store_t xmm_save[4];
45183 + unsigned long cr0;
45187 + __asm__ __volatile__ (
45189 +#define BLOCK(i) \
45237 + " addq %[inc], %[p1] ;\n"
45238 + " addq %[inc], %[p2] ;\n"
45239 + " addq %[inc], %[p3] ;\n"
45240 + " addq %[inc], %[p4] ;\n"
45241 + " addq %[inc], %[p5] ;\n"
45242 + " decl %[cnt] ; jnz 1b"
45243 + : [cnt] "+c" (lines),
45244 + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
45246 + : [inc] "r" (256UL)
45252 +static struct xor_block_template xor_block_sse = {
45253 + .name = "generic_sse",
45254 + .do_2 = xor_sse_2,
45255 + .do_3 = xor_sse_3,
45256 + .do_4 = xor_sse_4,
45257 + .do_5 = xor_sse_5,
45260 +#undef XOR_TRY_TEMPLATES
45261 +#define XOR_TRY_TEMPLATES \
45263 + xor_speed(&xor_block_sse); \
45266 +/* We force the use of the SSE xor block because it can write around L2.
45267 + We may also be able to load into the L1 cache only, depending on how
45268 + the CPU deals with a load to a line that is being prefetched. */
45269 +#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
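For reference, a sketch of driving one of the routines above directly (the buffer length must be a multiple of the 256-byte inner loop, as in the RAID5 use):

/* Sketch: dst ^= src over one 4 KiB page using the SSE template. */
static void sketch_xor_page(unsigned long *dst, unsigned long *src)
{
        xor_sse_2(4096, dst, src);
}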
45270 Index: head-2008-11-25/include/asm-x86/mach-xen/mach_time.h
45271 ===================================================================
45272 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45273 +++ head-2008-11-25/include/asm-x86/mach-xen/mach_time.h 2007-06-12 13:14:13.000000000 +0200
45276 + * include/asm-i386/mach-default/mach_time.h
45278 + * Machine specific set RTC function for generic.
45279 + * Split out from time.c by Osamu Tomita <tomita@cinet.co.jp>
45281 +#ifndef _MACH_TIME_H
45282 +#define _MACH_TIME_H
45284 +#include <asm-i386/mc146818rtc.h>
45286 +/* timing check window for calling set_rtc_mmss(): 500 ms */
45287 +/* used in arch/i386/time.c::do_timer_interrupt() */
45288 +#define USEC_AFTER 500000
45289 +#define USEC_BEFORE 500000
45292 + * In order to set the CMOS clock precisely, set_rtc_mmss has to be
45293 + * called 500 ms after the second nowtime has started, because when
45294 + * nowtime is written into the registers of the CMOS clock, it will
45295 + * jump to the next second precisely 500 ms later. Check the Motorola
45296 + * MC146818A or Dallas DS12887 data sheet for details.
45298 + * BUG: This routine does not handle hour overflow properly; it just
45299 + * sets the minutes. Usually you'll only notice that after reboot!
45301 +static inline int mach_set_rtc_mmss(unsigned long nowtime)
45304 + int real_seconds, real_minutes, cmos_minutes;
45305 + unsigned char save_control, save_freq_select;
45307 + save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */
45308 + CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
45310 + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */
45311 + CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
45313 + cmos_minutes = CMOS_READ(RTC_MINUTES);
45314 + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
45315 + BCD_TO_BIN(cmos_minutes);
45318 + * since we're only adjusting minutes and seconds,
45319 + * don't interfere with hour overflow. This avoids
45320 + * messing with unknown time zones but requires your
45321 + * RTC not to be off by more than 15 minutes
45323 + real_seconds = nowtime % 60;
45324 + real_minutes = nowtime / 60;
45325 + if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
45326 + real_minutes += 30; /* correct for half hour time zone */
45327 + real_minutes %= 60;
45329 + if (abs(real_minutes - cmos_minutes) < 30) {
45330 + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
45331 + BIN_TO_BCD(real_seconds);
45332 + BIN_TO_BCD(real_minutes);
45334 + CMOS_WRITE(real_seconds,RTC_SECONDS);
45335 + CMOS_WRITE(real_minutes,RTC_MINUTES);
45337 + printk(KERN_WARNING
45338 + "set_rtc_mmss: can't update from %d to %d\n",
45339 + cmos_minutes, real_minutes);
45343 + /* The following flags have to be released exactly in this order,
45344 + * otherwise the DS12887 (popular MC146818A clone with integrated
45345 + * battery and quartz) will not reset the oscillator and will not
45346 + * update precisely 500 ms later. You won't find this mentioned in
45347 + * the Dallas Semiconductor data sheets, but who believes data
45348 + * sheets anyway ... -- Markus Kuhn
45350 + CMOS_WRITE(save_control, RTC_CONTROL);
45351 + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
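A hedged sketch of the caller-side timing check implied by USEC_AFTER/USEC_BEFORE; the half-second window logic is reconstructed from the comment above, so treat it as approximate:

/* Sketch: only write the RTC near the middle of a second. */
static int sketch_sync_cmos_clock(unsigned long sec, unsigned long usec,
                                  unsigned long tick_usec)
{
        /* The clock jumps to the next second 500 ms after being set, so
         * write it within one tick of the middle of the current second. */
        if (usec >= USEC_AFTER - tick_usec / 2 &&
            usec <= USEC_BEFORE + tick_usec / 2)
                return mach_set_rtc_mmss(sec);
        return -1;      /* outside the window; retry on a later tick */
}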
45356 +static inline unsigned long mach_get_cmos_time(void)
45358 + unsigned int year, mon, day, hour, min, sec;
45361 + sec = CMOS_READ(RTC_SECONDS);
45362 + min = CMOS_READ(RTC_MINUTES);
45363 + hour = CMOS_READ(RTC_HOURS);
45364 + day = CMOS_READ(RTC_DAY_OF_MONTH);
45365 + mon = CMOS_READ(RTC_MONTH);
45366 + year = CMOS_READ(RTC_YEAR);
45367 + } while (sec != CMOS_READ(RTC_SECONDS));
45369 + if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
45372 + BCD_TO_BIN(hour);
45375 + BCD_TO_BIN(year);
45382 + return mktime(year, mon, day, hour, min, sec);
45385 +#endif /* !_MACH_TIME_H */
45386 Index: head-2008-11-25/include/asm-x86/mach-xen/mach_timer.h
45387 ===================================================================
45388 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45389 +++ head-2008-11-25/include/asm-x86/mach-xen/mach_timer.h 2007-06-12 13:14:13.000000000 +0200
45392 + * include/asm-i386/mach-default/mach_timer.h
45394 + * Machine specific calibrate_tsc() for generic.
45395 + * Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
45397 +/* ------ Calibrate the TSC -------
45398 + * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
45399 + * Too much 64-bit arithmetic here to do this cleanly in C, and for
45400 + * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
45401 + * output busy loop as low as possible. We avoid reading the CTC registers
45402 + * directly because of the awkward 8-bit access mechanism of the 82C54
45405 +#ifndef _MACH_TIMER_H
45406 +#define _MACH_TIMER_H
45408 +#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */
45409 +#define CALIBRATE_LATCH \
45410 + ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
45412 +static inline void mach_prepare_counter(void)
45414 + /* Set the Gate high, disable speaker */
45415 + outb((inb(0x61) & ~0x02) | 0x01, 0x61);
45418 + * Now let's take care of CTC channel 2
45420 + * Set the Gate high, program CTC channel 2 for mode 0,
45421 + * (interrupt on terminal count mode), binary count,
45422 + * load 5 * LATCH count, (LSB and MSB) to begin countdown.
45424 + * Some devices need a delay here.
45426 + outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */
45427 + outb_p(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */
45428 + outb_p(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */
45431 +static inline void mach_countup(unsigned long *count_p)
45433 + unsigned long count = 0;
45436 + } while ((inb_p(0x61) & 0x20) == 0);
45437 + *count_p = count;
45440 +#endif /* !_MACH_TIMER_H */
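These two helpers are meant to bracket a cycle-counter read; a sketch of the calibration they support (rdtscll() assumed available from <asm/msr.h>):

/* Sketch: measure TSC cycles across the CALIBRATE_TIME_MSEC countdown. */
static unsigned long long sketch_calibrate_tsc(void)
{
        unsigned long long start, end;
        unsigned long count;

        mach_prepare_counter();         /* program CTC channel 2 */
        rdtscll(start);
        mach_countup(&count);           /* spin until the latch expires */
        rdtscll(end);

        return (end - start) / CALIBRATE_TIME_MSEC;    /* cycles per ms */
}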
45441 Index: head-2008-11-25/include/asm-x86/mach-xen/setup_arch_post.h
45442 ===================================================================
45443 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45444 +++ head-2008-11-25/include/asm-x86/mach-xen/setup_arch_post.h 2007-06-12 13:14:13.000000000 +0200
45447 + * machine_specific_* - Hooks for machine specific setup.
45450 + * This is included late in kernel/setup.c so that it can make
45451 + * use of all of the static functions.
45454 +#include <xen/interface/callback.h>
45456 +extern void hypervisor_callback(void);
45457 +extern void failsafe_callback(void);
45458 +extern void nmi(void);
45460 +static void __init machine_specific_arch_setup(void)
45463 + static struct callback_register __initdata event = {
45464 + .type = CALLBACKTYPE_event,
45465 + .address = (unsigned long) hypervisor_callback,
45467 + static struct callback_register __initdata failsafe = {
45468 + .type = CALLBACKTYPE_failsafe,
45469 + .address = (unsigned long)failsafe_callback,
45471 + static struct callback_register __initdata syscall = {
45472 + .type = CALLBACKTYPE_syscall,
45473 + .address = (unsigned long)system_call,
45475 +#ifdef CONFIG_X86_LOCAL_APIC
45476 + static struct callback_register __initdata nmi_cb = {
45477 + .type = CALLBACKTYPE_nmi,
45478 + .address = (unsigned long)nmi,
45482 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
45484 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
45486 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
45487 +#if CONFIG_XEN_COMPAT <= 0x030002
45488 + if (ret == -ENOSYS)
45489 + ret = HYPERVISOR_set_callbacks(
45491 + failsafe.address,
45492 + syscall.address);
45496 +#ifdef CONFIG_X86_LOCAL_APIC
45497 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
45498 +#if CONFIG_XEN_COMPAT <= 0x030002
45499 + if (ret == -ENOSYS) {
45500 + static struct xennmi_callback __initdata cb = {
45501 + .handler_address = (unsigned long)nmi
45504 + HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
45509 Index: head-2008-11-25/include/asm-x86/mach-xen/setup_arch_pre.h
45510 ===================================================================
45511 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45512 +++ head-2008-11-25/include/asm-x86/mach-xen/setup_arch_pre.h 2007-06-12 13:14:13.000000000 +0200
45514 +/* Hook to call BIOS initialisation function */
45516 +#define ARCH_SETUP machine_specific_arch_setup();
45518 +static void __init machine_specific_arch_setup(void);
45519 Index: head-2008-11-25/include/xen/blkif.h
45520 ===================================================================
45521 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45522 +++ head-2008-11-25/include/xen/blkif.h 2008-07-21 11:00:33.000000000 +0200
45525 + * Permission is hereby granted, free of charge, to any person obtaining a copy
45526 + * of this software and associated documentation files (the "Software"), to
45527 + * deal in the Software without restriction, including without limitation the
45528 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
45529 + * sell copies of the Software, and to permit persons to whom the Software is
45530 + * furnished to do so, subject to the following conditions:
45532 + * The above copyright notice and this permission notice shall be included in
45533 + * all copies or substantial portions of the Software.
45535 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45536 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
45537 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45538 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
45539 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
45540 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
45541 + * DEALINGS IN THE SOFTWARE.
45544 +#ifndef __XEN_BLKIF_H__
45545 +#define __XEN_BLKIF_H__
45547 +#include <xen/interface/io/ring.h>
45548 +#include <xen/interface/io/blkif.h>
45549 +#include <xen/interface/io/protocols.h>
45551 +/* Not a real protocol. Used to generate ring structs which contain
45552 + * the elements common to all protocols only. This way we get a
45553 + * compiler-checkable way to use common struct elements, so we can
45554 + * avoid using switch(protocol) in a number of places. */
45555 +struct blkif_common_request {
45558 +struct blkif_common_response {
45562 +/* i386 protocol version */
45563 +#pragma pack(push, 4)
45564 +struct blkif_x86_32_request {
45565 + uint8_t operation; /* BLKIF_OP_??? */
45566 + uint8_t nr_segments; /* number of segments */
45567 + blkif_vdev_t handle; /* only for read/write requests */
45568 + uint64_t id; /* private guest value, echoed in resp */
45569 + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
45570 + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
45572 +struct blkif_x86_32_response {
45573 + uint64_t id; /* copied from request */
45574 + uint8_t operation; /* copied from request */
45575 + int16_t status; /* BLKIF_RSP_??? */
45577 +typedef struct blkif_x86_32_request blkif_x86_32_request_t;
45578 +typedef struct blkif_x86_32_response blkif_x86_32_response_t;
45581 +/* x86_64 protocol version */
45582 +struct blkif_x86_64_request {
45583 + uint8_t operation; /* BLKIF_OP_??? */
45584 + uint8_t nr_segments; /* number of segments */
45585 + blkif_vdev_t handle; /* only for read/write requests */
45586 + uint64_t __attribute__((__aligned__(8))) id;
45587 + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
45588 + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
45590 +struct blkif_x86_64_response {
45591 + uint64_t __attribute__((__aligned__(8))) id;
45592 + uint8_t operation; /* copied from request */
45593 + int16_t status; /* BLKIF_RSP_??? */
45595 +typedef struct blkif_x86_64_request blkif_x86_64_request_t;
45596 +typedef struct blkif_x86_64_response blkif_x86_64_response_t;
45598 +DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response);
45599 +DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response);
45600 +DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response);
45602 +union blkif_back_rings {
45603 + blkif_back_ring_t native;
45604 + blkif_common_back_ring_t common;
45605 + blkif_x86_32_back_ring_t x86_32;
45606 + blkif_x86_64_back_ring_t x86_64;
45608 +typedef union blkif_back_rings blkif_back_rings_t;
45610 +enum blkif_protocol {
45611 + BLKIF_PROTOCOL_NATIVE = 1,
45612 + BLKIF_PROTOCOL_X86_32 = 2,
45613 + BLKIF_PROTOCOL_X86_64 = 3,
45616 +static inline void blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src)
45618 + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
45619 + dst->operation = src->operation;
45620 + dst->nr_segments = src->nr_segments;
45621 + dst->handle = src->handle;
45622 + dst->id = src->id;
45623 + dst->sector_number = src->sector_number;
45625 + if (n > dst->nr_segments)
45626 + n = dst->nr_segments;
45627 + for (i = 0; i < n; i++)
45628 + dst->seg[i] = src->seg[i];
45631 +static inline void blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src)
45633 + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
45634 + dst->operation = src->operation;
45635 + dst->nr_segments = src->nr_segments;
45636 + dst->handle = src->handle;
45637 + dst->id = src->id;
45638 + dst->sector_number = src->sector_number;
45640 + if (n > dst->nr_segments)
45641 + n = dst->nr_segments;
45642 + for (i = 0; i < n; i++)
45643 + dst->seg[i] = src->seg[i];
45646 +#endif /* __XEN_BLKIF_H__ */
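A sketch of how a backend consumes one request via the per-protocol rings and copy helpers above (blk_rings/rc naming is illustrative; RING_GET_REQUEST() comes from ring.h):

/* Sketch: protocol-dispatched copy of one request off the shared ring. */
static void sketch_get_request(blkif_back_rings_t *blk_rings,
                               enum blkif_protocol protocol,
                               blkif_request_t *req, RING_IDX rc)
{
        switch (protocol) {
        case BLKIF_PROTOCOL_NATIVE:
                memcpy(req, RING_GET_REQUEST(&blk_rings->native, rc),
                       sizeof(*req));
                break;
        case BLKIF_PROTOCOL_X86_32:
                blkif_get_x86_32_req(req,
                        RING_GET_REQUEST(&blk_rings->x86_32, rc));
                break;
        case BLKIF_PROTOCOL_X86_64:
                blkif_get_x86_64_req(req,
                        RING_GET_REQUEST(&blk_rings->x86_64, rc));
                break;
        }
}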
45647 Index: head-2008-11-25/include/xen/compat_ioctl.h
45648 ===================================================================
45649 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45650 +++ head-2008-11-25/include/xen/compat_ioctl.h 2007-07-10 09:42:30.000000000 +0200
45653 + * This program is free software; you can redistribute it and/or
45654 + * modify it under the terms of the GNU General Public License as
45655 + * published by the Free Software Foundation; either version 2 of the
45656 + * License, or (at your option) any later version.
45658 + * This program is distributed in the hope that it will be useful,
45659 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
45660 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
45661 + * GNU General Public License for more details.
45663 + * You should have received a copy of the GNU General Public License
45664 + * along with this program; if not, write to the Free Software
45665 + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
45667 + * Copyright IBM Corp. 2007
45669 + * Authors: Jimi Xenidis <jimix@watson.ibm.com>
45670 + * Hollis Blanchard <hollisb@us.ibm.com>
45673 +#ifndef __LINUX_XEN_COMPAT_H__
45674 +#define __LINUX_XEN_COMPAT_H__
45676 +#include <linux/compat.h>
45678 +extern int privcmd_ioctl_32(int fd, unsigned int cmd, unsigned long arg);
45679 +struct privcmd_mmap_32 {
45682 + compat_uptr_t entry;
45685 +struct privcmd_mmapbatch_32 {
45686 + int num; /* number of pages to populate */
45687 + domid_t dom; /* target domain */
45688 + __u64 addr; /* virtual address */
45689 + compat_uptr_t arr; /* array of mfns - top nibble set on err */
45691 +#define IOCTL_PRIVCMD_MMAP_32 \
45692 + _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap_32))
45693 +#define IOCTL_PRIVCMD_MMAPBATCH_32 \
45694 + _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch_32))
45696 +#endif /* __LINUX_XEN_COMPAT_H__ */
45697 Index: head-2008-11-25/include/xen/cpu_hotplug.h
45698 ===================================================================
45699 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45700 +++ head-2008-11-25/include/xen/cpu_hotplug.h 2007-08-16 18:07:01.000000000 +0200
45702 +#ifndef __XEN_CPU_HOTPLUG_H__
45703 +#define __XEN_CPU_HOTPLUG_H__
45705 +#include <linux/kernel.h>
45706 +#include <linux/cpumask.h>
45708 +#if defined(CONFIG_X86) && defined(CONFIG_SMP)
45709 +extern cpumask_t cpu_initialized_map;
45712 +#if defined(CONFIG_HOTPLUG_CPU)
45714 +int cpu_up_check(unsigned int cpu);
45715 +void init_xenbus_allowed_cpumask(void);
45716 +int smp_suspend(void);
45717 +void smp_resume(void);
45719 +void cpu_bringup(void);
45721 +#else /* !defined(CONFIG_HOTPLUG_CPU) */
45723 +#define cpu_up_check(cpu) (0)
45724 +#define init_xenbus_allowed_cpumask() ((void)0)
45726 +static inline int smp_suspend(void)
45728 + if (num_online_cpus() > 1) {
45729 + printk(KERN_WARNING "Can't suspend SMP guests "
45730 + "without CONFIG_HOTPLUG_CPU\n");
45731 + return -EOPNOTSUPP;
45736 +static inline void smp_resume(void)
45740 +#endif /* !defined(CONFIG_HOTPLUG_CPU) */
45742 +#endif /* __XEN_CPU_HOTPLUG_H__ */
45743 Index: head-2008-11-25/include/xen/driver_util.h
45744 ===================================================================
45745 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45746 +++ head-2008-11-25/include/xen/driver_util.h 2007-06-12 13:14:19.000000000 +0200
45749 +#ifndef __ASM_XEN_DRIVER_UTIL_H__
45750 +#define __ASM_XEN_DRIVER_UTIL_H__
45752 +#include <linux/vmalloc.h>
45753 +#include <linux/device.h>
45755 +/* Allocate/destroy a 'vmalloc' VM area. */
45756 +extern struct vm_struct *alloc_vm_area(unsigned long size);
45757 +extern void free_vm_area(struct vm_struct *area);
45759 +extern struct class *get_xen_class(void);
45761 +#endif /* __ASM_XEN_DRIVER_UTIL_H__ */
45762 Index: head-2008-11-25/include/xen/evtchn.h
45763 ===================================================================
45764 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45765 +++ head-2008-11-25/include/xen/evtchn.h 2008-09-15 13:40:15.000000000 +0200
45767 +/******************************************************************************
45770 + * Communication via Xen event channels.
45771 + * Also definitions for the device that demuxes notifications to userspace.
45773 + * Copyright (c) 2004-2005, K A Fraser
45775 + * This program is free software; you can redistribute it and/or
45776 + * modify it under the terms of the GNU General Public License version 2
45777 + * as published by the Free Software Foundation; or, when distributed
45778 + * separately from the Linux kernel or incorporated into other
45779 + * software packages, subject to the following license:
45781 + * Permission is hereby granted, free of charge, to any person obtaining a copy
45782 + * of this source file (the "Software"), to deal in the Software without
45783 + * restriction, including without limitation the rights to use, copy, modify,
45784 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
45785 + * and to permit persons to whom the Software is furnished to do so, subject to
45786 + * the following conditions:
45788 + * The above copyright notice and this permission notice shall be included in
45789 + * all copies or substantial portions of the Software.
45791 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45792 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
45793 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45794 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
45795 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
45796 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
45797 + * IN THE SOFTWARE.
45800 +#ifndef __ASM_EVTCHN_H__
45801 +#define __ASM_EVTCHN_H__
45803 +#include <linux/interrupt.h>
45804 +#include <asm/hypervisor.h>
45805 +#include <asm/ptrace.h>
45806 +#include <asm/synch_bitops.h>
45807 +#include <xen/interface/event_channel.h>
45808 +#include <linux/smp.h>
45811 + * LOW-LEVEL DEFINITIONS
45815 + * Dynamically bind an event source to an IRQ-like callback handler.
45816 + * On some platforms this may not be implemented via the Linux IRQ subsystem.
45817 + * The IRQ argument passed to the callback handler is the same as returned
45818 + * from the bind call. It may not correspond to a Linux IRQ number.
45819 + * Returns IRQ or negative errno.
45821 +int bind_caller_port_to_irqhandler(
45822 + unsigned int caller_port,
45823 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
45824 + unsigned long irqflags,
45825 + const char *devname,
45827 +int bind_listening_port_to_irqhandler(
45828 + unsigned int remote_domain,
45829 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
45830 + unsigned long irqflags,
45831 + const char *devname,
45833 +int bind_interdomain_evtchn_to_irqhandler(
45834 + unsigned int remote_domain,
45835 + unsigned int remote_port,
45836 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
45837 + unsigned long irqflags,
45838 + const char *devname,
45840 +int bind_virq_to_irqhandler(
45841 + unsigned int virq,
45842 + unsigned int cpu,
45843 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
45844 + unsigned long irqflags,
45845 + const char *devname,
45847 +int bind_ipi_to_irqhandler(
45848 + unsigned int ipi,
45849 + unsigned int cpu,
45850 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
45851 + unsigned long irqflags,
45852 + const char *devname,
45856 + * Common unbind function for all event sources. Takes IRQ to unbind from.
45857 + * Automatically closes the underlying event channel (except for bindings
45858 + * made with bind_caller_port_to_irqhandler()).
45860 +void unbind_from_irqhandler(unsigned int irq, void *dev_id);
45862 +void irq_resume(void);
45864 +/* Entry point for notifications into Linux subsystems. */
45865 +asmlinkage void evtchn_do_upcall(struct pt_regs *regs);
45867 +/* Entry point for notifications into the userland character device. */
45868 +void evtchn_device_upcall(int port);
45870 +/* Mark a PIRQ as unavailable for dynamic allocation. */
45871 +void evtchn_register_pirq(int irq);
45872 +/* Map a Xen-supplied PIRQ to a dynamically allocated one. */
45873 +int evtchn_map_pirq(int irq, int xen_pirq);
45874 +/* Look up a Xen-supplied PIRQ for a dynamically allocated one. */
45875 +int evtchn_get_xen_pirq(int irq);
45877 +void mask_evtchn(int port);
45878 +void disable_all_local_evtchn(void);
45879 +void unmask_evtchn(int port);
45882 +void rebind_evtchn_to_cpu(int port, unsigned int cpu);
45884 +#define rebind_evtchn_to_cpu(port, cpu) ((void)0)
45887 +static inline int test_and_set_evtchn_mask(int port)
45889 + shared_info_t *s = HYPERVISOR_shared_info;
45890 + return synch_test_and_set_bit(port, s->evtchn_mask);
45893 +static inline void clear_evtchn(int port)
45895 + shared_info_t *s = HYPERVISOR_shared_info;
45896 + synch_clear_bit(port, s->evtchn_pending);
45899 +static inline void notify_remote_via_evtchn(int port)
45901 + struct evtchn_send send = { .port = port };
45902 + VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send));
45906 + * Use these to access the event channel underlying the IRQ handle returned
45907 + * by bind_*_to_irqhandler().
45909 +void notify_remote_via_irq(int irq);
45910 +int irq_to_evtchn_port(int irq);
45912 +#define PIRQ_SET_MAPPING 0x0
45913 +#define PIRQ_CLEAR_MAPPING 0x1
45914 +#define PIRQ_GET_MAPPING 0x3
45915 +int pirq_mapstatus(int pirq, int action);
45916 +int set_pirq_hw_action(int pirq, int (*action)(int pirq, int action));
45917 +int clear_pirq_hw_action(int pirq);
45919 +#define PIRQ_STARTUP 1
45920 +#define PIRQ_SHUTDOWN 2
45921 +#define PIRQ_ENABLE 3
45922 +#define PIRQ_DISABLE 4
45923 +#define PIRQ_END 5
45924 +#define PIRQ_ACK 6
45926 +#endif /* __ASM_EVTCHN_H__ */
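A hedged sketch of the usual driver pattern for the bind API above (the handler and device names are illustrative; no special irqflags assumed):

/* Sketch: bind an interdomain event channel, kick the peer, unbind later. */
static irqreturn_t sketch_intr(int irq, void *dev_id, struct pt_regs *regs)
{
        /* handle the notification */
        return IRQ_HANDLED;
}

static int sketch_connect(unsigned int remote_domain, unsigned int remote_port,
                          void *dev)
{
        int irq = bind_interdomain_evtchn_to_irqhandler(remote_domain,
                                                        remote_port,
                                                        sketch_intr, 0,
                                                        "sketch-dev", dev);
        if (irq < 0)
                return irq;             /* negative errno, per the comment */
        notify_remote_via_irq(irq);     /* prod the other end once */
        return irq;                     /* unbind_from_irqhandler(irq, dev) later */
}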
45927 Index: head-2008-11-25/include/xen/firmware.h
45928 ===================================================================
45929 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45930 +++ head-2008-11-25/include/xen/firmware.h 2007-07-02 08:16:19.000000000 +0200
45932 +#ifndef __XEN_FIRMWARE_H__
45933 +#define __XEN_FIRMWARE_H__
45935 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
45936 +void copy_edd(void);
45939 +void copy_edid(void);
45941 +#endif /* __XEN_FIRMWARE_H__ */
45942 Index: head-2008-11-25/include/xen/gnttab.h
45943 ===================================================================
45944 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45945 +++ head-2008-11-25/include/xen/gnttab.h 2008-11-04 11:13:10.000000000 +0100
45947 +/******************************************************************************
45950 + * Two sets of functionality:
45951 + * 1. Granting foreign access to our memory reservation.
45952 + * 2. Accessing others' memory reservations via grant references.
45953 + * (i.e., mechanisms for both sender and recipient of grant references)
45955 + * Copyright (c) 2004-2005, K A Fraser
45956 + * Copyright (c) 2005, Christopher Clark
45958 + * This program is free software; you can redistribute it and/or
45959 + * modify it under the terms of the GNU General Public License version 2
45960 + * as published by the Free Software Foundation; or, when distributed
45961 + * separately from the Linux kernel or incorporated into other
45962 + * software packages, subject to the following license:
45964 + * Permission is hereby granted, free of charge, to any person obtaining a copy
45965 + * of this source file (the "Software"), to deal in the Software without
45966 + * restriction, including without limitation the rights to use, copy, modify,
45967 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
45968 + * and to permit persons to whom the Software is furnished to do so, subject to
45969 + * the following conditions:
45971 + * The above copyright notice and this permission notice shall be included in
45972 + * all copies or substantial portions of the Software.
45974 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45975 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
45976 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45977 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
45978 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
45979 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
45980 + * IN THE SOFTWARE.
45983 +#ifndef __ASM_GNTTAB_H__
45984 +#define __ASM_GNTTAB_H__
45986 +#include <asm/hypervisor.h>
45987 +#include <asm/maddr.h> /* maddr_t */
45988 +#include <linux/mm.h>
45989 +#include <xen/interface/grant_table.h>
45990 +#include <xen/features.h>
45992 +struct gnttab_free_callback {
45993 + struct gnttab_free_callback *next;
45994 + void (*fn)(void *);
46000 +int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
46004 + * End access through the given grant reference, iff the grant entry is no
46005 + * longer in use. Return 1 if the grant entry was freed, 0 if it is still in
46008 +int gnttab_end_foreign_access_ref(grant_ref_t ref);
46011 + * Eventually end access through the given grant reference, and once that
46012 + * access has been ended, free the given page too. Access will be ended
46013 + * immediately iff the grant entry is not in use, otherwise it will happen
46014 + * some time later. page may be 0, in which case no freeing will occur.
46016 +void gnttab_end_foreign_access(grant_ref_t ref, unsigned long page);
46018 +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
46020 +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
46021 +unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
46023 +int gnttab_query_foreign_access(grant_ref_t ref);
46026 + * operations on reserved batches of grant references
46028 +int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
46030 +void gnttab_free_grant_reference(grant_ref_t ref);
46032 +void gnttab_free_grant_references(grant_ref_t head);
46034 +int gnttab_empty_grant_references(const grant_ref_t *pprivate_head);
46036 +int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
46038 +void gnttab_release_grant_reference(grant_ref_t *private_head,
46039 + grant_ref_t release);
46041 +void gnttab_request_free_callback(struct gnttab_free_callback *callback,
46042 + void (*fn)(void *), void *arg, u16 count);
46043 +void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
46045 +void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
46046 + unsigned long frame, int flags);
46048 +void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
46049 + unsigned long pfn);
46051 +int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep);
46052 +void __gnttab_dma_map_page(struct page *page);
46053 +static inline void __gnttab_dma_unmap_page(struct page *page)
46057 +void gnttab_reset_grant_page(struct page *page);
46059 +int gnttab_suspend(void);
46060 +int gnttab_resume(void);
46062 +void *arch_gnttab_alloc_shared(unsigned long *frames);
46064 +static inline void
46065 +gnttab_set_map_op(struct gnttab_map_grant_ref *map, maddr_t addr,
46066 + uint32_t flags, grant_ref_t ref, domid_t domid)
46068 + if (flags & GNTMAP_contains_pte)
46069 + map->host_addr = addr;
46070 + else if (xen_feature(XENFEAT_auto_translated_physmap))
46071 + map->host_addr = __pa(addr);
46073 + map->host_addr = addr;
46075 + map->flags = flags;
46077 + map->dom = domid;
46080 +static inline void
46081 +gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, maddr_t addr,
46082 + uint32_t flags, grant_handle_t handle)
46084 + if (flags & GNTMAP_contains_pte)
46085 + unmap->host_addr = addr;
46086 + else if (xen_feature(XENFEAT_auto_translated_physmap))
46087 + unmap->host_addr = __pa(addr);
46089 + unmap->host_addr = addr;
46091 + unmap->handle = handle;
46092 + unmap->dev_bus_addr = 0;
46095 +static inline void
46096 +gnttab_set_replace_op(struct gnttab_unmap_and_replace *unmap, maddr_t addr,
46097 + maddr_t new_addr, grant_handle_t handle)
46099 + if (xen_feature(XENFEAT_auto_translated_physmap)) {
46100 + unmap->host_addr = __pa(addr);
46101 + unmap->new_addr = __pa(new_addr);
46103 + unmap->host_addr = addr;
46104 + unmap->new_addr = new_addr;
46107 + unmap->handle = handle;
46110 +#endif /* __ASM_GNTTAB_H__ */
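A sketch of the grant lifecycle using the declarations above. Hedged: the trailing argument of gnttab_grant_foreign_access() is elided in this listing and assumed here to be the GTF_* access flags; virt_to_mfn() is assumed from the Xen page helpers.

/* Sketch: grant a frame read-only to a peer, then revoke it. */
static int sketch_share_page(domid_t peer, struct page *page)
{
        void *va = page_address(page);
        int ref = gnttab_grant_foreign_access(peer, virt_to_mfn(va),
                                              GTF_readonly);

        if (ref < 0)
                return ref;

        /* ... peer maps the frame through the grant reference ... */

        gnttab_end_foreign_access(ref, (unsigned long)va); /* frees the page
                                                              once access ends */
        return 0;
}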
46111 Index: head-2008-11-25/include/xen/hvm.h
46112 ===================================================================
46113 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46114 +++ head-2008-11-25/include/xen/hvm.h 2007-06-12 13:14:19.000000000 +0200
46116 +/* Simple wrappers around HVM functions */
46117 +#ifndef XEN_HVM_H__
46118 +#define XEN_HVM_H__
46120 +#include <xen/interface/hvm/params.h>
46122 +static inline unsigned long hvm_get_parameter(int idx)
46123 +{
46124 + struct xen_hvm_param xhv;
46125 + int r;
46127 + xhv.domid = DOMID_SELF;
46128 + xhv.index = idx;
46129 + r = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv);
46130 + if (r < 0) {
46131 + printk(KERN_ERR "cannot get hvm parameter %d: %d.\n",
46132 + idx, r);
46133 + return 0;
46134 + }
46135 + return xhv.value;
46136 +}
46138 +#endif /* XEN_HVM_H__ */
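As a usage illustration (an assumption, not taken from the patch): an HVM
guest with PV-on-HVM drivers typically reads the event-channel callback IRQ
this way, with a 0 return signalling that the hypercall failed:

    /* HVM_PARAM_CALLBACK_IRQ comes from xen/interface/hvm/params.h,
       included above; hvm_get_parameter() has already logged any error. */
    unsigned long cb_irq = hvm_get_parameter(HVM_PARAM_CALLBACK_IRQ);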
46139 Index: head-2008-11-25/include/xen/hypercall.h
46140 ===================================================================
46141 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46142 +++ head-2008-11-25/include/xen/hypercall.h 2008-01-28 12:24:19.000000000 +0100
46144 +#ifndef __XEN_HYPERCALL_H__
46145 +#define __XEN_HYPERCALL_H__
46147 +#include <asm/hypercall.h>
46149 +static inline int __must_check
46150 +HYPERVISOR_multicall_check(
46151 + multicall_entry_t *call_list, unsigned int nr_calls,
46152 + const unsigned long *rc_list)
46153 +{
46154 + int rc = HYPERVISOR_multicall(call_list, nr_calls);
46156 + if (unlikely(rc < 0))
46157 + return rc;
46158 + BUG_ON(rc);
46159 + BUG_ON((int)nr_calls < 0);
46161 + for ( ; nr_calls > 0; --nr_calls, ++call_list)
46162 + if (unlikely(call_list->result != (rc_list ? *rc_list++ : 0)))
46163 + return nr_calls;
46165 + return 0;
46166 +}
46168 +/* A construct to ignore the return value of hypercall wrappers in a few
46169 + * exceptional cases (simply casting the function result to void doesn't
46170 + * avoid the compiler warning): */
46171 +#define VOID(expr) ((void)((expr)?:0))
46173 +#endif /* __XEN_HYPERCALL_H__ */
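An illustrative sketch of both helpers (the fpu_taskswitch batch is
hypothetical): HYPERVISOR_multicall_check() returns non-zero if the multicall
itself failed or if any slot's result differs from rc_list (taken as all
zeros when NULL), while VOID() documents a deliberately ignored return value:

    multicall_entry_t mc[2];

    mc[0].op = __HYPERVISOR_fpu_taskswitch; mc[0].args[0] = 1;
    mc[1].op = __HYPERVISOR_fpu_taskswitch; mc[1].args[0] = 0;
    if (HYPERVISOR_multicall_check(mc, 2, NULL))
        BUG();  /* a batched call returned an unexpected result */

    VOID(HYPERVISOR_xen_version(XENVER_version, NULL)); /* result unused */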
46174 Index: head-2008-11-25/include/xen/hypervisor_sysfs.h
46175 ===================================================================
46176 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46177 +++ head-2008-11-25/include/xen/hypervisor_sysfs.h 2007-06-22 09:08:06.000000000 +0200
46179 +/*
46180 + * Copyright (c) 2006 IBM Corporation
46181 + * Authored by: Mike D. Day <ncmike@us.ibm.com>
46182 + *
46183 + * This program is free software; you can redistribute it and/or modify
46184 + * it under the terms of the GNU General Public License version 2 as
46185 + * published by the Free Software Foundation.
46186 + */
46188 +#ifndef _HYP_SYSFS_H_
46189 +#define _HYP_SYSFS_H_
46191 +#include <linux/kobject.h>
46192 +#include <linux/sysfs.h>
46194 +#define HYPERVISOR_ATTR_RO(_name) \
46195 +static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name)
46197 +#define HYPERVISOR_ATTR_RW(_name) \
46198 +static struct hyp_sysfs_attr _name##_attr = \
46199 + __ATTR(_name, 0644, _name##_show, _name##_store)
46201 +struct hyp_sysfs_attr {
46202 + struct attribute attr;
46203 + ssize_t (*show)(struct hyp_sysfs_attr *, char *);
46204 + ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
46205 + void *hyp_attr_data;
46206 +};
46208 +#endif /* _HYP_SYSFS_H_ */
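A hypothetical attribute built on these macros; the show routine's name must
match the macro argument, and registering example_attr.attr against the
hypervisor kobject is left to the calling code:

    static ssize_t example_show(struct hyp_sysfs_attr *attr, char *buffer)
    {
        return sprintf(buffer, "hello\n");
    }
    HYPERVISOR_ATTR_RO(example); /* defines example_attr, wired to example_show */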
46209 Index: head-2008-11-25/include/xen/pcifront.h
46210 ===================================================================
46211 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46212 +++ head-2008-11-25/include/xen/pcifront.h 2007-06-18 08:38:13.000000000 +0200
46214 +/*
46215 + * PCI Frontend - arch-dependent declarations
46216 + *
46217 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
46218 + */
46219 +#ifndef __XEN_ASM_PCIFRONT_H__
46220 +#define __XEN_ASM_PCIFRONT_H__
46222 +#include <linux/spinlock.h>
46224 +#ifdef __KERNEL__
46226 +#ifndef __ia64__
46228 +struct pcifront_device;
46229 +struct pci_bus;
46231 +struct pcifront_sd {
46232 + int domain;
46233 + struct pcifront_device *pdev;
46234 +};
46236 +static inline struct pcifront_device *
46237 +pcifront_get_pdev(struct pcifront_sd *sd)
46238 +{
46239 + return sd->pdev;
46240 +}
46242 +static inline void pcifront_init_sd(struct pcifront_sd *sd,
46243 + unsigned int domain, unsigned int bus,
46244 + struct pcifront_device *pdev)
46245 +{
46246 + sd->domain = domain;
46247 + sd->pdev = pdev;
46248 +}
46250 +#if defined(CONFIG_PCI_DOMAINS)
46251 +static inline int pci_domain_nr(struct pci_bus *bus)
46252 +{
46253 + struct pcifront_sd *sd = bus->sysdata;
46254 + return sd->domain;
46255 +}
46256 +static inline int pci_proc_domain(struct pci_bus *bus)
46257 +{
46258 + return pci_domain_nr(bus);
46259 +}
46260 +#endif /* CONFIG_PCI_DOMAINS */
46262 +static inline void pcifront_setup_root_resources(struct pci_bus *bus,
46263 + struct pcifront_sd *sd)
46264 +{
46265 +}
46267 +#else /* __ia64__ */
46269 +#include <linux/acpi.h>
46270 +#include <asm/pci.h>
46271 +#define pcifront_sd pci_controller
46273 +extern void xen_add_resource(struct pci_controller *, unsigned int,
46274 + unsigned int, struct acpi_resource *);
46275 +extern void xen_pcibios_setup_root_windows(struct pci_bus *,
46276 + struct pci_controller *);
46278 +static inline struct pcifront_device *
46279 +pcifront_get_pdev(struct pcifront_sd *sd)
46280 +{
46281 + return (struct pcifront_device *)sd->platform_data;
46282 +}
46284 +static inline void pcifront_setup_root_resources(struct pci_bus *bus,
46285 + struct pcifront_sd *sd)
46286 +{
46287 + xen_pcibios_setup_root_windows(bus, sd);
46288 +}
46290 +#endif /* __ia64__ */
46292 +extern struct rw_semaphore pci_bus_sem;
46294 +#endif /* __KERNEL__ */
46296 +#endif /* __XEN_ASM_PCIFRONT_H__ */
46297 Index: head-2008-11-25/include/xen/public/evtchn.h
46298 ===================================================================
46299 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46300 +++ head-2008-11-25/include/xen/public/evtchn.h 2007-06-12 13:14:19.000000000 +0200
46302 +/******************************************************************************
46303 + * evtchn.h
46304 + *
46305 + * Interface to /dev/xen/evtchn.
46306 + *
46307 + * Copyright (c) 2003-2005, K A Fraser
46309 + * This program is free software; you can redistribute it and/or
46310 + * modify it under the terms of the GNU General Public License version 2
46311 + * as published by the Free Software Foundation; or, when distributed
46312 + * separately from the Linux kernel or incorporated into other
46313 + * software packages, subject to the following license:
46315 + * Permission is hereby granted, free of charge, to any person obtaining a copy
46316 + * of this source file (the "Software"), to deal in the Software without
46317 + * restriction, including without limitation the rights to use, copy, modify,
46318 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
46319 + * and to permit persons to whom the Software is furnished to do so, subject to
46320 + * the following conditions:
46322 + * The above copyright notice and this permission notice shall be included in
46323 + * all copies or substantial portions of the Software.
46325 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46326 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46327 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
46328 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46329 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
46330 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
46331 + * IN THE SOFTWARE.
46332 + */
46334 +#ifndef __LINUX_PUBLIC_EVTCHN_H__
46335 +#define __LINUX_PUBLIC_EVTCHN_H__
46337 +/*
46338 + * Bind a fresh port to VIRQ @virq.
46339 + * Return allocated port.
46340 + */
46341 +#define IOCTL_EVTCHN_BIND_VIRQ \
46342 + _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
46343 +struct ioctl_evtchn_bind_virq {
46344 + unsigned int virq;
46345 +};
46347 +/*
46348 + * Bind a fresh port to remote <@remote_domain, @remote_port>.
46349 + * Return allocated port.
46350 + */
46351 +#define IOCTL_EVTCHN_BIND_INTERDOMAIN \
46352 + _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
46353 +struct ioctl_evtchn_bind_interdomain {
46354 + unsigned int remote_domain, remote_port;
46355 +};
46357 +/*
46358 + * Allocate a fresh port for binding to @remote_domain.
46359 + * Return allocated port.
46360 + */
46361 +#define IOCTL_EVTCHN_BIND_UNBOUND_PORT \
46362 + _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
46363 +struct ioctl_evtchn_bind_unbound_port {
46364 + unsigned int remote_domain;
46365 +};
46367 +/*
46368 + * Unbind previously allocated @port.
46369 + */
46370 +#define IOCTL_EVTCHN_UNBIND \
46371 + _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
46372 +struct ioctl_evtchn_unbind {
46373 + unsigned int port;
46374 +};
46376 +/*
46377 + * Send an event notification to previously allocated @port.
46378 + */
46379 +#define IOCTL_EVTCHN_NOTIFY \
46380 + _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
46381 +struct ioctl_evtchn_notify {
46382 + unsigned int port;
46383 +};
46385 +/* Clear and reinitialise the event buffer. Clear error condition. */
46386 +#define IOCTL_EVTCHN_RESET \
46387 + _IOC(_IOC_NONE, 'E', 5, 0)
46389 +#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
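A user-space sketch of the intended usage (illustrative only): bind a VIRQ,
read one pending port, and write it back to unmask; the device yields one
32-bit port number per pending event:

    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <xen/public/evtchn.h>

    int wait_one_virq_event(unsigned int virq)
    {
        struct ioctl_evtchn_bind_virq bind = { .virq = virq };
        uint32_t port;
        int fd = open("/dev/xen/evtchn", O_RDWR);

        if (fd < 0)
            return -1;
        if (ioctl(fd, IOCTL_EVTCHN_BIND_VIRQ, &bind) < 0 ||  /* returns port */
            read(fd, &port, sizeof(port)) != sizeof(port)) {
            close(fd);
            return -1;
        }
        write(fd, &port, sizeof(port));      /* unmask the port again */
        close(fd);
        return port;
    }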
46390 Index: head-2008-11-25/include/xen/public/gntdev.h
46391 ===================================================================
46392 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46393 +++ head-2008-11-25/include/xen/public/gntdev.h 2008-04-02 12:34:02.000000000 +0200
46395 +/******************************************************************************
46396 + * gntdev.h
46397 + *
46398 + * Interface to /dev/xen/gntdev.
46399 + *
46400 + * Copyright (c) 2007, D G Murray
46402 + * This program is free software; you can redistribute it and/or
46403 + * modify it under the terms of the GNU General Public License version 2
46404 + * as published by the Free Software Foundation; or, when distributed
46405 + * separately from the Linux kernel or incorporated into other
46406 + * software packages, subject to the following license:
46408 + * Permission is hereby granted, free of charge, to any person obtaining a copy
46409 + * of this source file (the "Software"), to deal in the Software without
46410 + * restriction, including without limitation the rights to use, copy, modify,
46411 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
46412 + * and to permit persons to whom the Software is furnished to do so, subject to
46413 + * the following conditions:
46415 + * The above copyright notice and this permission notice shall be included in
46416 + * all copies or substantial portions of the Software.
46418 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46419 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46420 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
46421 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46422 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
46423 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
46424 + * IN THE SOFTWARE.
46425 + */
46427 +#ifndef __LINUX_PUBLIC_GNTDEV_H__
46428 +#define __LINUX_PUBLIC_GNTDEV_H__
46430 +struct ioctl_gntdev_grant_ref {
46431 + /* The domain ID of the grant to be mapped. */
46432 + uint32_t domid;
46433 + /* The grant reference of the grant to be mapped. */
46434 + uint32_t ref;
46435 +};
46437 +/*
46438 + * Inserts the grant references into the mapping table of an instance
46439 + * of gntdev. N.B. This does not perform the mapping, which is deferred
46440 + * until mmap() is called with @index as the offset.
46441 + */
46442 +#define IOCTL_GNTDEV_MAP_GRANT_REF \
46443 +_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
46444 +struct ioctl_gntdev_map_grant_ref {
46445 + /* IN parameters */
46446 + /* The number of grants to be mapped. */
46447 + uint32_t count;
46448 + uint32_t pad;
46449 + /* OUT parameters */
46450 + /* The offset to be used on a subsequent call to mmap(). */
46451 + uint64_t index;
46452 + /* Variable IN parameter. */
46453 + /* Array of grant references, of size @count. */
46454 + struct ioctl_gntdev_grant_ref refs[1];
46455 +};
46457 +/*
46458 + * Removes the grant references from the mapping table of an instance
46459 + * of gntdev. N.B. munmap() must be called on the relevant virtual address(es)
46460 + * before this ioctl is called, or an error will result.
46461 + */
46462 +#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
46463 +_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
46464 +struct ioctl_gntdev_unmap_grant_ref {
46465 + /* IN parameters */
46466 + /* The offset returned by the corresponding map operation. */
46467 + uint64_t index;
46468 + /* The number of pages to be unmapped. */
46469 + uint32_t count;
46470 + uint32_t pad;
46471 +};
46473 +/*
46474 + * Returns the offset in the driver's address space that corresponds
46475 + * to @vaddr. This can be used to perform a munmap(), followed by an
46476 + * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
46477 + * the caller. The number of pages that were allocated at the same time as
46478 + * @vaddr is returned in @count.
46479 + *
46480 + * N.B. Where more than one page has been mapped into a contiguous range, the
46481 + * supplied @vaddr must correspond to the start of the range; otherwise
46482 + * an error will result. It is only possible to munmap() the entire
46483 + * contiguously-allocated range at once, and not any subrange thereof.
46484 + */
46485 +#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
46486 +_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
46487 +struct ioctl_gntdev_get_offset_for_vaddr {
46488 + /* IN parameters */
46489 + /* The virtual address of the first mapped page in a range. */
46490 + uint64_t vaddr;
46491 + /* OUT parameters */
46492 + /* The offset that was used in the initial mmap() operation. */
46493 + uint64_t offset;
46494 + /* The number of pages mapped in the VM area that begins at @vaddr. */
46495 + uint32_t count;
46496 + uint32_t pad;
46497 +};
46499 +/*
46500 + * Sets the maximum number of grants that may be mapped at once by this
46501 + * gntdev instance.
46502 + *
46503 + * N.B. This must be called before any other ioctl is performed on the device.
46504 + */
46505 +#define IOCTL_GNTDEV_SET_MAX_GRANTS \
46506 +_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
46507 +struct ioctl_gntdev_set_max_grants {
46508 + /* IN parameter */
46509 + /* The maximum number of grants that may be mapped at once. */
46510 + uint32_t count;
46511 +};
46513 +#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
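A matching user-space sketch (illustrative; the granting domain's domid and
grant reference are assumptions supplied by the caller): insert one grant,
then mmap() at the returned @index; as noted above, the mapping itself only
happens at mmap() time:

    #include <fcntl.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <xen/public/gntdev.h>

    void *map_one_grant(uint32_t domid, uint32_t gref)
    {
        struct ioctl_gntdev_map_grant_ref map;
        int fd = open("/dev/xen/gntdev", O_RDWR);

        if (fd < 0)
            return MAP_FAILED;
        map.count = 1;
        map.refs[0].domid = domid;
        map.refs[0].ref = gref;
        if (ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map))
            return MAP_FAILED;
        return mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
                    fd, map.index);       /* caller checks for MAP_FAILED */
    }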
46514 Index: head-2008-11-25/include/xen/public/privcmd.h
46515 ===================================================================
46516 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46517 +++ head-2008-11-25/include/xen/public/privcmd.h 2007-06-12 13:14:19.000000000 +0200
46519 +/******************************************************************************
46520 + * privcmd.h
46521 + *
46522 + * Interface to /proc/xen/privcmd.
46523 + *
46524 + * Copyright (c) 2003-2005, K A Fraser
46526 + * This program is free software; you can redistribute it and/or
46527 + * modify it under the terms of the GNU General Public License version 2
46528 + * as published by the Free Software Foundation; or, when distributed
46529 + * separately from the Linux kernel or incorporated into other
46530 + * software packages, subject to the following license:
46532 + * Permission is hereby granted, free of charge, to any person obtaining a copy
46533 + * of this source file (the "Software"), to deal in the Software without
46534 + * restriction, including without limitation the rights to use, copy, modify,
46535 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
46536 + * and to permit persons to whom the Software is furnished to do so, subject to
46537 + * the following conditions:
46539 + * The above copyright notice and this permission notice shall be included in
46540 + * all copies or substantial portions of the Software.
46542 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46543 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46544 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
46545 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46546 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
46547 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
46548 + * IN THE SOFTWARE.
46549 + */
46551 +#ifndef __LINUX_PUBLIC_PRIVCMD_H__
46552 +#define __LINUX_PUBLIC_PRIVCMD_H__
46554 +#include <linux/types.h>
46560 +typedef struct privcmd_hypercall
46561 +{
46562 + __u64 op;
46563 + __u64 arg[5];
46564 +} privcmd_hypercall_t;
46566 +typedef struct privcmd_mmap_entry {
46567 + __u64 va;
46568 + __u64 mfn;
46569 + __u64 npages;
46570 +} privcmd_mmap_entry_t;
46572 +typedef struct privcmd_mmap {
46573 + int num;
46574 + domid_t dom; /* target domain */
46575 + privcmd_mmap_entry_t __user *entry;
46576 +} privcmd_mmap_t;
46578 +typedef struct privcmd_mmapbatch {
46579 + int num; /* number of pages to populate */
46580 + domid_t dom; /* target domain */
46581 + __u64 addr; /* virtual address */
46582 + xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
46583 +} privcmd_mmapbatch_t;
46585 +/*
46586 + * @cmd: IOCTL_PRIVCMD_HYPERCALL
46587 + * @arg: &privcmd_hypercall_t
46588 + * Return: Value returned from execution of the specified hypercall.
46589 + */
46590 +#define IOCTL_PRIVCMD_HYPERCALL \
46591 + _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t))
46592 +#define IOCTL_PRIVCMD_MMAP \
46593 + _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t))
46594 +#define IOCTL_PRIVCMD_MMAPBATCH \
46595 + _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t))
46597 +#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
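User-space sketch of the hypercall path, as used by the toolstack (the
hypercall number 17 for __HYPERVISOR_xen_version is an assumption drawn from
xen/interface/xen.h; XENVER_version is 0, already zeroed by the initializer):

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <xen/public/privcmd.h>

    int get_xen_version(void)
    {
        privcmd_hypercall_t call = { .op = 17 /* __HYPERVISOR_xen_version */ };
        int fd = open("/proc/xen/privcmd", O_RDWR), ret;

        if (fd < 0)
            return -1;
        ret = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);
        close(fd);
        return ret; /* packed major/minor version, or negative on error */
    }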
46598 Index: head-2008-11-25/include/xen/xen_proc.h
46599 ===================================================================
46600 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46601 +++ head-2008-11-25/include/xen/xen_proc.h 2007-06-12 13:14:19.000000000 +0200
46604 +#ifndef __ASM_XEN_PROC_H__
46605 +#define __ASM_XEN_PROC_H__
46607 +#include <linux/proc_fs.h>
46609 +extern struct proc_dir_entry *create_xen_proc_entry(
46610 + const char *name, mode_t mode);
46611 +extern void remove_xen_proc_entry(
46612 + const char *name);
46614 +#endif /* __ASM_XEN_PROC_H__ */
46615 Index: head-2008-11-25/include/xen/xencons.h
46616 ===================================================================
46617 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46618 +++ head-2008-11-25/include/xen/xencons.h 2007-10-15 09:39:38.000000000 +0200
46620 +#ifndef __ASM_XENCONS_H__
46621 +#define __ASM_XENCONS_H__
46623 +struct dom0_vga_console_info;
46624 +void dom0_init_screen_info(const struct dom0_vga_console_info *, size_t);
46626 +void xencons_force_flush(void);
46627 +void xencons_resume(void);
46629 +/* Interrupt work hooks. Receive data, or kick data out. */
46630 +void xencons_rx(char *buf, unsigned len, struct pt_regs *regs);
46631 +void xencons_tx(void);
46633 +int xencons_ring_init(void);
46634 +int xencons_ring_send(const char *data, unsigned len);
46636 +#endif /* __ASM_XENCONS_H__ */
46637 Index: head-2008-11-25/include/xen/xenoprof.h
46638 ===================================================================
46639 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46640 +++ head-2008-11-25/include/xen/xenoprof.h 2007-06-12 13:14:19.000000000 +0200
46642 +/******************************************************************************
46643 + * xenoprof.h
46644 + *
46645 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
46646 + * VA Linux Systems Japan K.K.
46648 + * This program is free software; you can redistribute it and/or modify
46649 + * it under the terms of the GNU General Public License as published by
46650 + * the Free Software Foundation; either version 2 of the License, or
46651 + * (at your option) any later version.
46653 + * This program is distributed in the hope that it will be useful,
46654 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
46655 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
46656 + * GNU General Public License for more details.
46658 + * You should have received a copy of the GNU General Public License
46659 + * along with this program; if not, write to the Free Software
46660 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
46661 + */
46664 +#ifndef __XEN_XENOPROF_H__
46665 +#define __XEN_XENOPROF_H__
46666 +#ifdef CONFIG_XEN
46668 +#include <asm/xenoprof.h>
46670 +struct oprofile_operations;
46671 +int xenoprofile_init(struct oprofile_operations * ops);
46672 +void xenoprofile_exit(void);
46674 +struct xenoprof_shared_buffer {
46675 + char *buffer;
46676 + struct xenoprof_arch_shared_buffer arch;
46677 +};
46678 +#else
46679 +#define xenoprofile_init(ops) (-ENOSYS)
46680 +#define xenoprofile_exit() do { } while (0)
46682 +#endif /* CONFIG_XEN */
46683 +#endif /* __XEN_XENOPROF_H__ */
46684 Index: head-2008-11-25/lib/swiotlb-xen.c
46685 ===================================================================
46686 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46687 +++ head-2008-11-25/lib/swiotlb-xen.c 2008-09-15 13:40:15.000000000 +0200
46689 +/*
46690 + * Dynamic DMA mapping support.
46692 + * This implementation is a fallback for platforms that do not support
46693 + * I/O TLBs (aka DMA address translation hardware).
46694 + * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
46695 + * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
46696 + * Copyright (C) 2000, 2003 Hewlett-Packard Co
46697 + * David Mosberger-Tang <davidm@hpl.hp.com>
46698 + * Copyright (C) 2005 Keir Fraser <keir@xensource.com>
46699 + */
46701 +#include <linux/cache.h>
46702 +#include <linux/mm.h>
46703 +#include <linux/module.h>
46704 +#include <linux/pci.h>
46705 +#include <linux/spinlock.h>
46706 +#include <linux/string.h>
46707 +#include <linux/types.h>
46708 +#include <linux/ctype.h>
46709 +#include <linux/init.h>
46710 +#include <linux/bootmem.h>
46711 +#include <linux/highmem.h>
46712 +#include <asm/io.h>
46713 +#include <asm/pci.h>
46714 +#include <asm/dma.h>
46715 +#include <asm/uaccess.h>
46716 +#include <xen/gnttab.h>
46717 +#include <xen/interface/memory.h>
46718 +#include <asm-i386/mach-xen/asm/gnttab_dma.h>
46720 +int swiotlb;
46721 +EXPORT_SYMBOL(swiotlb);
46723 +#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
46725 +/*
46726 + * Maximum allowable number of contiguous slabs to map,
46727 + * must be a power of 2. What is the appropriate value?
46728 + * The complexity of {map,unmap}_single is linearly dependent on this value.
46729 + */
46730 +#define IO_TLB_SEGSIZE 128
46732 +/*
46733 + * log of the size of each IO TLB slab. The number of slabs is command line
46734 + * controllable.
46735 + */
46736 +#define IO_TLB_SHIFT 11
46738 +int swiotlb_force;
46740 +static char *iotlb_virt_start;
46741 +static unsigned long iotlb_nslabs;
46743 +/*
46744 + * Used to do a quick range check in swiotlb_unmap_single and
46745 + * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
46746 + * API.
46747 + */
46748 +static unsigned long iotlb_pfn_start, iotlb_pfn_end;
46750 +/* Does the given dma address reside within the swiotlb aperture? */
46751 +static inline int in_swiotlb_aperture(dma_addr_t dev_addr)
46752 +{
46753 + unsigned long pfn = mfn_to_local_pfn(dev_addr >> PAGE_SHIFT);
46754 + return (pfn_valid(pfn)
46755 + && (pfn >= iotlb_pfn_start)
46756 + && (pfn < iotlb_pfn_end));
46757 +}
46759 +/*
46760 + * When the IOMMU overflows we return a fallback buffer. This sets the size.
46761 + */
46762 +static unsigned long io_tlb_overflow = 32*1024;
46764 +void *io_tlb_overflow_buffer;
46766 +/*
46767 + * This is a free list describing the number of free entries available from
46768 + * each index.
46769 + */
46770 +static unsigned int *io_tlb_list;
46771 +static unsigned int io_tlb_index;
46773 +/*
46774 + * We need to save away the original address corresponding to a mapped entry
46775 + * for the sync operations.
46776 + */
46777 +static struct phys_addr {
46778 + struct page *page;
46779 + unsigned int offset;
46780 +} *io_tlb_orig_addr;
46782 +/*
46783 + * Protect the above data structures in the map and unmap calls
46784 + */
46785 +static DEFINE_SPINLOCK(io_tlb_lock);
46787 +static unsigned int dma_bits;
46788 +static unsigned int __initdata max_dma_bits = 32;
46789 +static int __init
46790 +setup_dma_bits(char *str)
46791 +{
46792 + max_dma_bits = simple_strtoul(str, NULL, 0);
46793 + return 1;
46794 +}
46795 +__setup("dma_bits=", setup_dma_bits);
46797 +static int __init
46798 +setup_io_tlb_npages(char *str)
46799 +{
46800 + /* Unlike ia64, the size is the aperture in megabytes, not 'slabs'! */
46801 + if (isdigit(*str)) {
46802 + iotlb_nslabs = simple_strtoul(str, &str, 0) <<
46803 + (20 - IO_TLB_SHIFT);
46804 + iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
46805 + }
46808 + /*
46809 + * NB. 'force' enables the swiotlb, but doesn't force its use for
46810 + * every DMA like it does on native Linux. 'off' forcibly disables
46811 + * use of the swiotlb.
46812 + */
46813 + if (!strcmp(str, "force"))
46814 + swiotlb_force = 1;
46815 + else if (!strcmp(str, "off"))
46816 + swiotlb_force = -1;
46817 + return 1;
46818 +}
46819 +__setup("swiotlb=", setup_io_tlb_npages);
46820 +/* make io_tlb_overflow tunable too? */
46822 +/*
46823 + * Statically reserve bounce buffer space and initialize bounce buffer data
46824 + * structures for the software IO TLB used to implement the PCI DMA API.
46825 + */
46826 +void __init
46827 +swiotlb_init_with_default_size (size_t default_size)
46828 +{
46829 + unsigned long i, bytes;
46830 + int rc;
46832 + if (!iotlb_nslabs) {
46833 + iotlb_nslabs = (default_size >> IO_TLB_SHIFT);
46834 + iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
46835 + }
46837 + bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT);
46839 + /*
46840 + * Get IO TLB memory from the low pages
46841 + */
46842 + iotlb_virt_start = alloc_bootmem_low_pages(bytes);
46843 + if (!iotlb_virt_start)
46844 + panic("Cannot allocate SWIOTLB buffer!\n");
46846 + dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
46847 + for (i = 0; i < iotlb_nslabs; i += IO_TLB_SEGSIZE) {
46848 + do {
46849 + rc = xen_create_contiguous_region(
46850 + (unsigned long)iotlb_virt_start + (i << IO_TLB_SHIFT),
46851 + get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT),
46852 + dma_bits);
46853 + } while (rc && dma_bits++ < max_dma_bits);
46854 + if (rc) {
46855 + if (i == 0)
46856 + panic("No suitable physical memory available for SWIOTLB buffer!\n"
46857 + "Use dom0_mem Xen boot parameter to reserve\n"
46858 + "some DMA memory (e.g., dom0_mem=-128M).\n");
46859 + iotlb_nslabs = i;
46860 + i <<= IO_TLB_SHIFT;
46861 + free_bootmem(__pa(iotlb_virt_start + i), bytes - i);
46862 + bytes = i;
46863 + for (dma_bits = 0; i > 0; i -= IO_TLB_SEGSIZE << IO_TLB_SHIFT) {
46864 + unsigned int bits = fls64(virt_to_bus(iotlb_virt_start + i - 1));
46866 + if (bits > dma_bits)
46867 + dma_bits = bits;
46868 + }
46869 + break;
46870 + }
46871 + }
46873 + /*
46874 + * Allocate and initialize the free list array. This array is used
46875 + * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE.
46876 + */
46877 + io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int));
46878 + for (i = 0; i < iotlb_nslabs; i++)
46879 + io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
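+ /*
+ * Each io_tlb_list[] entry holds the number of contiguous free slots
+ * starting at that index, and a run is never counted across an
+ * IO_TLB_SEGSIZE boundary; map_single() below can therefore test a
+ * single entry to see whether 'nslots' consecutive slots fit there.
+ */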
46880 + io_tlb_index = 0;
46881 + io_tlb_orig_addr = alloc_bootmem(
46882 + iotlb_nslabs * sizeof(*io_tlb_orig_addr));
46884 + /*
46885 + * Get the overflow emergency buffer
46886 + */
46887 + io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
46888 + if (!io_tlb_overflow_buffer)
46889 + panic("Cannot allocate SWIOTLB overflow buffer!\n");
46891 + do {
46892 + rc = xen_create_contiguous_region(
46893 + (unsigned long)io_tlb_overflow_buffer,
46894 + get_order(io_tlb_overflow),
46895 + dma_bits);
46896 + } while (rc && dma_bits++ < max_dma_bits);
46897 + if (rc)
46898 + panic("No suitable physical memory available for SWIOTLB overflow buffer!\n");
46900 + iotlb_pfn_start = __pa(iotlb_virt_start) >> PAGE_SHIFT;
46901 + iotlb_pfn_end = iotlb_pfn_start + (bytes >> PAGE_SHIFT);
46903 + printk(KERN_INFO "Software IO TLB enabled:\n"
46904 + " Aperture: %lu megabytes\n"
46905 + " Kernel range: %p - %p\n"
46906 + " Address size: %u bits\n",
46907 + bytes >> 20,
46908 + iotlb_virt_start, iotlb_virt_start + bytes,
46909 + dma_bits);
46910 +}
46912 +void __init
46913 +swiotlb_init(void)
46914 +{
46915 + long ram_end;
46916 + size_t defsz = 64 * (1 << 20); /* 64MB default size */
46918 + if (swiotlb_force == 1) {
46919 + swiotlb = 1;
46920 + } else if ((swiotlb_force != -1) &&
46921 + is_running_on_xen() &&
46922 + is_initial_xendomain()) {
46923 + /* Domain 0 always has a swiotlb. */
46924 + ram_end = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
46925 + if (ram_end <= 0x7ffff)
46926 + defsz = 2 * (1 << 20); /* 2MB on <2GB systems. */
46927 + swiotlb = 1;
46928 + }
46930 + if (swiotlb)
46931 + swiotlb_init_with_default_size(defsz);
46932 + else
46933 + printk(KERN_INFO "Software IO TLB disabled\n");
46934 +}
46936 +/*
46937 + * We use __copy_to_user_inatomic to transfer to the host buffer because the
46938 + * buffer may be mapped read-only (e.g., in the blkback driver) but lower-level
46939 + * drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an
46940 + * unnecessary copy from the aperture to the host buffer, and a page fault.
46941 + */
46942 +static void
46943 +__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir)
46944 +{
46945 + if (PageHighMem(buffer.page)) {
46946 + size_t len, bytes;
46947 + char *dev, *host, *kmp;
46948 + len = size;
46949 + while (len != 0) {
46950 + unsigned long flags;
46952 + if (((bytes = len) + buffer.offset) > PAGE_SIZE)
46953 + bytes = PAGE_SIZE - buffer.offset;
46954 + local_irq_save(flags); /* protects KM_BOUNCE_READ */
46955 + kmp = kmap_atomic(buffer.page, KM_BOUNCE_READ);
46956 + dev = dma_addr + size - len;
46957 + host = kmp + buffer.offset;
46958 + if (dir == DMA_FROM_DEVICE) {
46959 + if (__copy_to_user_inatomic(host, dev, bytes))
46960 + /* inaccessible */;
46961 + } else
46962 + memcpy(dev, host, bytes);
46963 + kunmap_atomic(kmp, KM_BOUNCE_READ);
46964 + local_irq_restore(flags);
46965 + len -= bytes;
46966 + buffer.page++;
46967 + buffer.offset = 0;
46968 + }
46969 + } else {
46970 + char *host = (char *)phys_to_virt(
46971 + page_to_pseudophys(buffer.page)) + buffer.offset;
46972 + if (dir == DMA_FROM_DEVICE) {
46973 + if (__copy_to_user_inatomic(host, dma_addr, size))
46974 + /* inaccessible */;
46975 + } else if (dir == DMA_TO_DEVICE)
46976 + memcpy(dma_addr, host, size);
46977 + }
46978 +}
46980 +/*
46981 + * Allocates bounce buffer and returns its kernel virtual address.
46982 + */
46983 +static void *
46984 +map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir)
46985 +{
46986 + unsigned long flags;
46987 + char *dma_addr;
46988 + unsigned int nslots, stride, index, wrap;
46989 + struct phys_addr slot_buf;
46990 + int i;
46992 + /*
46993 + * For mappings greater than a page, we limit the stride (and
46994 + * hence alignment) to a page size.
46995 + */
46996 + nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
46997 + if (size > PAGE_SIZE)
46998 + stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
46999 + else
47000 + stride = 1;
47002 + BUG_ON(!nslots);
47004 + /*
47005 + * Find suitable number of IO TLB entries size that will fit this
47006 + * request and allocate a buffer from that IO TLB pool.
47007 + */
47008 + spin_lock_irqsave(&io_tlb_lock, flags);
47009 + {
47010 + wrap = index = ALIGN(io_tlb_index, stride);
47012 + if (index >= iotlb_nslabs)
47013 + wrap = index = 0;
47015 + do {
47016 + /*
47017 + * If we find a slot that indicates we have 'nslots'
47018 + * number of contiguous buffers, we allocate the
47019 + * buffers from that slot and mark the entries as '0'
47020 + * indicating unavailable.
47021 + */
47022 + if (io_tlb_list[index] >= nslots) {
47023 + int count = 0;
47025 + for (i = index; i < (int)(index + nslots); i++)
47026 + io_tlb_list[i] = 0;
47027 + for (i = index - 1;
47028 + (OFFSET(i, IO_TLB_SEGSIZE) !=
47029 + IO_TLB_SEGSIZE -1) && io_tlb_list[i];
47030 + i--)
47031 + io_tlb_list[i] = ++count;
47032 + dma_addr = iotlb_virt_start +
47033 + (index << IO_TLB_SHIFT);
47035 + /*
47036 + * Update the indices to avoid searching in
47037 + * the next round.
47038 + */
47039 + io_tlb_index =
47040 + ((index + nslots) < iotlb_nslabs
47041 + ? (index + nslots) : 0);
47043 + goto found;
47044 + }
47045 + index += stride;
47046 + if (index >= iotlb_nslabs)
47047 + index = 0;
47048 + } while (index != wrap);
47050 + spin_unlock_irqrestore(&io_tlb_lock, flags);
47051 + return NULL;
47052 + }
47053 + found:
47054 + spin_unlock_irqrestore(&io_tlb_lock, flags);
47056 + /*
47057 + * Save away the mapping from the original address to the DMA address.
47058 + * This is needed when we sync the memory. Then we sync the buffer if
47059 + * needed.
47060 + */
47061 + slot_buf = buffer;
47062 + for (i = 0; i < nslots; i++) {
47063 + slot_buf.page += slot_buf.offset >> PAGE_SHIFT;
47064 + slot_buf.offset &= PAGE_SIZE - 1;
47065 + io_tlb_orig_addr[index+i] = slot_buf;
47066 + slot_buf.offset += 1 << IO_TLB_SHIFT;
47067 + }
47068 + if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL))
47069 + __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
47071 + return dma_addr;
47072 +}
47074 +static struct phys_addr dma_addr_to_phys_addr(char *dma_addr)
47075 +{
47076 + int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
47077 + struct phys_addr buffer = io_tlb_orig_addr[index];
47078 + buffer.offset += (long)dma_addr & ((1 << IO_TLB_SHIFT) - 1);
47079 + buffer.page += buffer.offset >> PAGE_SHIFT;
47080 + buffer.offset &= PAGE_SIZE - 1;
47081 + return buffer;
47082 +}
47084 +/*
47085 + * dma_addr is the kernel virtual address of the bounce buffer to unmap.
47086 + */
47087 +static void
47088 +unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
47089 +{
47090 + unsigned long flags;
47091 + int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
47092 + int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
47093 + struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr);
47095 + /*
47096 + * First, sync the memory before unmapping the entry
47097 + */
47098 + if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))
47099 + __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
47101 + /*
47102 + * Return the buffer to the free list by setting the corresponding
47103 + * entries to indicate the number of contiguous entries available.
47104 + * While returning the entries to the free list, we merge the entries
47105 + * with slots below and above the pool being returned.
47106 + */
47107 + spin_lock_irqsave(&io_tlb_lock, flags);
47108 + {
47109 + count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
47110 + io_tlb_list[index + nslots] : 0);
47111 + /*
47112 + * Step 1: return the slots to the free list, merging the
47113 + * slots with succeeding slots
47114 + */
47115 + for (i = index + nslots - 1; i >= index; i--)
47116 + io_tlb_list[i] = ++count;
47117 + /*
47118 + * Step 2: merge the returned slots with the preceding slots,
47119 + * if available (non-zero)
47120 + */
47121 + for (i = index - 1;
47122 + (OFFSET(i, IO_TLB_SEGSIZE) !=
47123 + IO_TLB_SEGSIZE -1) && io_tlb_list[i];
47124 + i--)
47125 + io_tlb_list[i] = ++count;
47126 + }
47127 + spin_unlock_irqrestore(&io_tlb_lock, flags);
47128 +}
47130 +static void
47131 +sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
47132 +{
47133 + struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr);
47134 + BUG_ON((dir != DMA_FROM_DEVICE) && (dir != DMA_TO_DEVICE));
47135 + __sync_single(buffer, dma_addr, size, dir);
47136 +}
47138 +static void
47139 +swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
47140 +{
47141 + /*
47142 + * Ran out of IOMMU space for this operation. This is very bad.
47143 + * Unfortunately the drivers cannot handle this operation properly
47144 + * unless they check for pci_dma_mapping_error (most don't).
47145 + * When the mapping is small enough return a static buffer to limit
47146 + * the damage, or panic when the transfer is too big.
47147 + */
47148 + printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at "
47149 + "device %s\n", (unsigned long)size, dev ? dev->bus_id : "?");
47151 + if (size > io_tlb_overflow && do_panic) {
47152 + if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
47153 + panic("PCI-DMA: Memory would be corrupted\n");
47154 + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
47155 + panic("PCI-DMA: Random memory would be DMAed\n");
47156 + }
47157 +}
47159 +/*
47160 + * Map a single buffer of the indicated size for DMA in streaming mode. The
47161 + * PCI address to use is returned.
47163 + * Once the device is given the dma address, the device owns this memory until
47164 + * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
47165 + */
47166 +dma_addr_t
47167 +swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
47168 +{
47169 + dma_addr_t dev_addr = gnttab_dma_map_page(virt_to_page(ptr)) +
47170 + offset_in_page(ptr);
47171 + void *map;
47172 + struct phys_addr buffer;
47174 + BUG_ON(dir == DMA_NONE);
47176 + /*
47177 + * If the pointer passed in happens to be in the device's DMA window,
47178 + * we can safely return the device addr and not worry about bounce
47179 + * buffering it.
47180 + */
47181 + if (!range_straddles_page_boundary(__pa(ptr), size) &&
47182 + !address_needs_mapping(hwdev, dev_addr))
47183 + return dev_addr;
47185 + /*
47186 + * Oh well, have to allocate and map a bounce buffer.
47187 + */
47188 + gnttab_dma_unmap_page(dev_addr);
47189 + buffer.page = virt_to_page(ptr);
47190 + buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
47191 + map = map_single(hwdev, buffer, size, dir);
47192 + if (!map) {
47193 + swiotlb_full(hwdev, size, dir, 1);
47194 + map = io_tlb_overflow_buffer;
47195 + }
47197 + dev_addr = virt_to_bus(map);
47198 + return dev_addr;
47199 +}
47201 +/*
47202 + * Unmap a single streaming mode DMA translation. The dma_addr and size must
47203 + * match what was provided for in a previous swiotlb_map_single call. All
47204 + * other usages are undefined.
47206 + * After this call, reads by the cpu to the buffer are guaranteed to see
47207 + * whatever the device wrote there.
47208 + */
47209 +void
47210 +swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
47211 + int dir)
47212 +{
47213 + BUG_ON(dir == DMA_NONE);
47214 + if (in_swiotlb_aperture(dev_addr))
47215 + unmap_single(hwdev, bus_to_virt(dev_addr), size, dir);
47216 + else
47217 + gnttab_dma_unmap_page(dev_addr);
47218 +}
47220 +/*
47221 + * Make physical memory consistent for a single streaming mode DMA translation
47222 + * after a transfer.
47224 + * If you perform a swiotlb_map_single() but wish to interrogate the buffer
47225 + * using the cpu, yet do not wish to teardown the PCI dma mapping, you must
47226 + * call this function before doing so. At the next point you give the PCI dma
47227 + * address back to the card, you must first perform a
47228 + * swiotlb_dma_sync_for_device, and then the device again owns the buffer.
47229 + */
47230 +void
47231 +swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
47232 + size_t size, int dir)
47233 +{
47234 + BUG_ON(dir == DMA_NONE);
47235 + if (in_swiotlb_aperture(dev_addr))
47236 + sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
47237 +}
47239 +void
47240 +swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
47241 + size_t size, int dir)
47242 +{
47243 + BUG_ON(dir == DMA_NONE);
47244 + if (in_swiotlb_aperture(dev_addr))
47245 + sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
47246 +}
47248 +/*
47249 + * Map a set of buffers described by scatterlist in streaming mode for DMA.
47250 + * This is the scatter-gather version of the above swiotlb_map_single
47251 + * interface. Here the scatter gather list elements are each tagged with the
47252 + * appropriate dma address and length. They are obtained via
47253 + * sg_dma_{address,length}(SG).
47255 + * NOTE: An implementation may be able to use a smaller number of
47256 + * DMA address/length pairs than there are SG table elements.
47257 + * (for example via virtual mapping capabilities)
47258 + * The routine returns the number of addr/length pairs actually
47259 + * used, at most nents.
47261 + * Device ownership issues as mentioned above for swiotlb_map_single are the
47262 + * same here.
47263 + */
47264 +int
47265 +swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
47266 + int dir)
47267 +{
47268 + struct phys_addr buffer;
47269 + dma_addr_t dev_addr;
47270 + char *map;
47271 + int i;
47273 + BUG_ON(dir == DMA_NONE);
47275 + for (i = 0; i < nelems; i++, sg++) {
47276 + dev_addr = gnttab_dma_map_page(sg->page) + sg->offset;
47278 + if (range_straddles_page_boundary(page_to_pseudophys(sg->page)
47279 + + sg->offset, sg->length)
47280 + || address_needs_mapping(hwdev, dev_addr)) {
47281 + gnttab_dma_unmap_page(dev_addr);
47282 + buffer.page = sg->page;
47283 + buffer.offset = sg->offset;
47284 + map = map_single(hwdev, buffer, sg->length, dir);
47285 + if (!map) {
47286 + /* Don't panic here, we expect map_sg users
47287 + to do proper error handling. */
47288 + swiotlb_full(hwdev, sg->length, dir, 0);
47289 + swiotlb_unmap_sg(hwdev, sg - i, i, dir);
47290 + sg[0].dma_length = 0;
47291 + return 0;
47292 + }
47293 + sg->dma_address = (dma_addr_t)virt_to_bus(map);
47294 + } else
47295 + sg->dma_address = dev_addr;
47296 + sg->dma_length = sg->length;
47297 + }
47299 + return nelems;
47300 +}
47302 + * Unmap a set of streaming mode DMA translations. Again, cpu read rules
47303 + * concerning calls here are the same as for swiotlb_unmap_single() above.
47304 + */
47305 +void
47306 +swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
47307 + int dir)
47308 +{
47309 + int i;
47311 + BUG_ON(dir == DMA_NONE);
47313 + for (i = 0; i < nelems; i++, sg++)
47314 + if (in_swiotlb_aperture(sg->dma_address))
47315 + unmap_single(hwdev,
47316 + (void *)bus_to_virt(sg->dma_address),
47317 + sg->dma_length, dir);
47318 + else
47319 + gnttab_dma_unmap_page(sg->dma_address);
47320 +}
47322 +/*
47323 + * Make physical memory consistent for a set of streaming mode DMA translations
47324 + * after a transfer.
47326 + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
47327 + * and usage.
47328 + */
47329 +void
47330 +swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
47331 + int nelems, int dir)
47332 +{
47333 + int i;
47335 + BUG_ON(dir == DMA_NONE);
47337 + for (i = 0; i < nelems; i++, sg++)
47338 + if (in_swiotlb_aperture(sg->dma_address))
47339 + sync_single(hwdev,
47340 + (void *)bus_to_virt(sg->dma_address),
47341 + sg->dma_length, dir);
47342 +}
47344 +void
47345 +swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
47346 + int nelems, int dir)
47347 +{
47348 + int i;
47350 + BUG_ON(dir == DMA_NONE);
47352 + for (i = 0; i < nelems; i++, sg++)
47353 + if (in_swiotlb_aperture(sg->dma_address))
47354 + sync_single(hwdev,
47355 + (void *)bus_to_virt(sg->dma_address),
47356 + sg->dma_length, dir);
47357 +}
47359 +#ifdef CONFIG_HIGHMEM
47361 +dma_addr_t
47362 +swiotlb_map_page(struct device *hwdev, struct page *page,
47363 + unsigned long offset, size_t size,
47364 + enum dma_data_direction direction)
47365 +{
47366 + struct phys_addr buffer;
47367 + dma_addr_t dev_addr;
47368 + char *map;
47370 + dev_addr = gnttab_dma_map_page(page) + offset;
47371 + if (address_needs_mapping(hwdev, dev_addr)) {
47372 + gnttab_dma_unmap_page(dev_addr);
47373 + buffer.page = page;
47374 + buffer.offset = offset;
47375 + map = map_single(hwdev, buffer, size, direction);
47376 + if (!map) {
47377 + swiotlb_full(hwdev, size, direction, 1);
47378 + map = io_tlb_overflow_buffer;
47379 + }
47380 + dev_addr = (dma_addr_t)virt_to_bus(map);
47381 + }
47383 + return dev_addr;
47384 +}
47386 +void
47387 +swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
47388 + size_t size, enum dma_data_direction direction)
47389 +{
47390 + BUG_ON(direction == DMA_NONE);
47391 + if (in_swiotlb_aperture(dma_address))
47392 + unmap_single(hwdev, bus_to_virt(dma_address), size, direction);
47393 + else
47394 + gnttab_dma_unmap_page(dma_address);
47395 +}
47397 +#endif /* CONFIG_HIGHMEM */
47399 +int
47400 +swiotlb_dma_mapping_error(dma_addr_t dma_addr)
47401 +{
47402 + return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
47403 +}
47405 +/*
47406 + * Return whether the given PCI device DMA address mask can be supported
47407 + * properly. For example, if your device can only drive the low 24-bits
47408 + * during PCI bus mastering, then you would pass 0x00ffffff as the mask to
47409 + * this function.
47410 + */
47411 +int
47412 +swiotlb_dma_supported (struct device *hwdev, u64 mask)
47413 +{
47414 + return (mask >= ((1UL << dma_bits) - 1));
47415 +}
47417 +EXPORT_SYMBOL(swiotlb_init);
47418 +EXPORT_SYMBOL(swiotlb_map_single);
47419 +EXPORT_SYMBOL(swiotlb_unmap_single);
47420 +EXPORT_SYMBOL(swiotlb_map_sg);
47421 +EXPORT_SYMBOL(swiotlb_unmap_sg);
47422 +EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
47423 +EXPORT_SYMBOL(swiotlb_sync_single_for_device);
47424 +EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
47425 +EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
47426 +EXPORT_SYMBOL(swiotlb_dma_mapping_error);
47427 +EXPORT_SYMBOL(swiotlb_dma_supported);
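Taken together, the exports follow the usual streaming-DMA pattern; a
hypothetical driver fragment (with 'dev', 'buf' and 'len' supplied by the
caller) would look like:

    dma_addr_t bus = swiotlb_map_single(dev, buf, len, DMA_FROM_DEVICE);

    if (swiotlb_dma_mapping_error(bus))
        return -EIO;                  /* fell back to the overflow buffer */
    /* ... device DMAs into the (possibly bounced) buffer ... */
    swiotlb_sync_single_for_cpu(dev, bus, len, DMA_FROM_DEVICE);
    swiotlb_unmap_single(dev, bus, len, DMA_FROM_DEVICE);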
47428 Index: head-2008-11-25/scripts/Makefile.xen.awk
47429 ===================================================================
47430 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
47431 +++ head-2008-11-25/scripts/Makefile.xen.awk 2007-08-06 15:10:49.000000000 +0200
47437 +/^[[:space:]]*#/ {
47441 +/^[[:space:]]*$/ {
47448 +/:[[:space:]]*%\.[cS][[:space:]]/ {
47449 + line = gensub(/%.([cS])/, "%-xen.\\1", "g", $0)
47450 + line = gensub(/(single-used-m)/, "xen-\\1", "g", line)