1Subject: xen3 xen-arch
2From: http://xenbits.xensource.com/linux-2.6.18-xen.hg (tip 728:832aac894efd)
3Patch-mainline: obsolete
4Acked-by: jbeulich@novell.com
5
6List of files having Xen derivatives (perhaps created during the merging
7of newer kernel versions), for xen-port-patches.py to pick up (i.e. this
8must be retained here until the XenSource tree has these in the right
9places):
10+++ linux/arch/x86/kernel/acpi/sleep-xen.c
11+++ linux/arch/x86/kernel/cpu/common_64-xen.c
12+++ linux/arch/x86/kernel/e820-xen.c
13+++ linux/arch/x86/kernel/head-xen.c
14+++ linux/arch/x86/kernel/head32-xen.c
15+++ linux/arch/x86/kernel/ioport-xen.c
16+++ linux/arch/x86/kernel/ipi-xen.c
17+++ linux/arch/x86/kernel/ldt-xen.c
18+++ linux/arch/x86/kernel/mpparse-xen.c
19+++ linux/arch/x86/kernel/pci-nommu-xen.c
20+++ linux/arch/x86/kernel/process-xen.c
21+++ linux/arch/x86/kernel/setup-xen.c
22+++ linux/arch/x86/kernel/setup_percpu-xen.c
23+++ linux/arch/x86/kernel/smp-xen.c
24+++ linux/arch/x86/mm/fault-xen.c
25+++ linux/arch/x86/mm/ioremap-xen.c
26+++ linux/arch/x86/mm/pageattr-xen.c
27+++ linux/arch/x86/mm/pat-xen.c
28+++ linux/arch/x86/mm/pgtable-xen.c
29+++ linux/arch/x86/vdso/vdso32-setup-xen.c
30+++ linux/drivers/char/mem-xen.c
31+++ linux/include/asm-x86/mach-xen/asm/desc.h
32+++ linux/include/asm-x86/mach-xen/asm/dma-mapping.h
33+++ linux/include/asm-x86/mach-xen/asm/fixmap.h
34+++ linux/include/asm-x86/mach-xen/asm/io.h
35+++ linux/include/asm-x86/mach-xen/asm/irq_vectors.h
36+++ linux/include/asm-x86/mach-xen/asm/irqflags.h
37+++ linux/include/asm-x86/mach-xen/asm/mmu_context.h
38+++ linux/include/asm-x86/mach-xen/asm/page.h
39+++ linux/include/asm-x86/mach-xen/asm/pci.h
40+++ linux/include/asm-x86/mach-xen/asm/pgalloc.h
41+++ linux/include/asm-x86/mach-xen/asm/pgtable.h
42+++ linux/include/asm-x86/mach-xen/asm/processor.h
43+++ linux/include/asm-x86/mach-xen/asm/segment.h
44+++ linux/include/asm-x86/mach-xen/asm/smp.h
45+++ linux/include/asm-x86/mach-xen/asm/spinlock.h
46+++ linux/include/asm-x86/mach-xen/asm/swiotlb.h
47+++ linux/include/asm-x86/mach-xen/asm/system.h
48+++ linux/include/asm-x86/mach-xen/asm/tlbflush.h
49+++ linux/include/asm-x86/mach-xen/asm/xor.h
50
51List of files folded into their native counterparts (and hence removed
52from this patch for xen-port-patches.py to not needlessly pick them up;
53for reference, prefixed with the version the removal occurred):
542.6.18/include/asm-x86/mach-xen/asm/pgtable-2level.h
552.6.18/include/asm-x86/mach-xen/asm/pgtable-2level-defs.h
562.6.19/include/asm-x86/mach-xen/asm/ptrace.h
572.6.23/arch/x86/kernel/vsyscall-note_32-xen.S
582.6.23/include/asm-x86/mach-xen/asm/ptrace_64.h
592.6.24/arch/x86/kernel/early_printk_32-xen.c
602.6.24/include/asm-x86/mach-xen/asm/arch_hooks_64.h
612.6.24/include/asm-x86/mach-xen/asm/bootsetup_64.h
622.6.24/include/asm-x86/mach-xen/asm/mmu_32.h
632.6.24/include/asm-x86/mach-xen/asm/mmu_64.h
642.6.24/include/asm-x86/mach-xen/asm/nmi_64.h
652.6.24/include/asm-x86/mach-xen/asm/setup.h
662.6.24/include/asm-x86/mach-xen/asm/time_64.h (added in 2.6.20)
672.6.25/arch/x86/ia32/syscall32-xen.c
682.6.25/arch/x86/ia32/syscall32_syscall-xen.S
692.6.25/arch/x86/ia32/vsyscall-int80.S
702.6.25/arch/x86/kernel/acpi/boot-xen.c
712.6.25/include/asm-x86/mach-xen/asm/msr.h
722.6.25/include/asm-x86/mach-xen/asm/page_32.h
732.6.25/include/asm-x86/mach-xen/asm/spinlock_32.h
742.6.25/include/asm-x86/mach-xen/asm/timer.h (added in 2.6.24)
752.6.25/include/asm-x86/mach-xen/asm/timer_64.h
762.6.26/arch/x86/kernel/pci-dma_32-xen.c
772.6.26/arch/x86/kernel/pci-swiotlb_64-xen.c
782.6.26/include/asm-x86/mach-xen/asm/dma-mapping_32.h
792.6.26/include/asm-x86/mach-xen/asm/dma-mapping_64.h
802.6.26/include/asm-x86/mach-xen/asm/nmi.h (added in 2.6.24)
812.6.26/include/asm-x86/mach-xen/asm/scatterlist.h (added in 2.6.24)
822.6.26/include/asm-x86/mach-xen/asm/scatterlist_32.h
832.6.26/include/xen/xencomm.h
842.6.27/arch/x86/kernel/e820_32-xen.c
852.6.27/include/asm-x86/mach-xen/asm/e820.h (added in 2.6.24)
862.6.27/include/asm-x86/mach-xen/asm/e820_64.h
872.6.27/include/asm-x86/mach-xen/asm/hw_irq.h (added in 2.6.24)
882.6.27/include/asm-x86/mach-xen/asm/hw_irq_32.h
892.6.27/include/asm-x86/mach-xen/asm/hw_irq_64.h
902.6.27/include/asm-x86/mach-xen/asm/irq.h (added in 2.6.24)
912.6.27/include/asm-x86/mach-xen/asm/irq_64.h
92
93Index: head-2008-11-25/arch/x86/kernel/acpi/processor_extcntl_xen.c
94===================================================================
95--- /dev/null 1970-01-01 00:00:00.000000000 +0000
96+++ head-2008-11-25/arch/x86/kernel/acpi/processor_extcntl_xen.c 2008-10-01 15:43:24.000000000 +0200
97@@ -0,0 +1,209 @@
98+/*
99+ * processor_extcntl_xen.c - interface to notify Xen
100+ *
101+ * Copyright (C) 2008, Intel corporation
102+ *
103+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
104+ *
105+ * This program is free software; you can redistribute it and/or modify
106+ * it under the terms of the GNU General Public License as published by
107+ * the Free Software Foundation; either version 2 of the License, or (at
108+ * your option) any later version.
109+ *
110+ * This program is distributed in the hope that it will be useful, but
111+ * WITHOUT ANY WARRANTY; without even the implied warranty of
112+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
113+ * General Public License for more details.
114+ *
115+ * You should have received a copy of the GNU General Public License along
116+ * with this program; if not, write to the Free Software Foundation, Inc.,
117+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
118+ *
119+ */
120+
121+#include <linux/kernel.h>
122+#include <linux/init.h>
123+#include <linux/types.h>
124+#include <linux/acpi.h>
125+#include <linux/pm.h>
126+#include <linux/cpu.h>
127+
128+#include <linux/cpufreq.h>
129+#include <acpi/processor.h>
130+#include <asm/hypercall.h>
131+
132+static int xen_cx_notifier(struct acpi_processor *pr, int action)
133+{
134+ int ret, count = 0, i;
135+ xen_platform_op_t op = {
136+ .cmd = XENPF_set_processor_pminfo,
137+ .interface_version = XENPF_INTERFACE_VERSION,
138+ .u.set_pminfo.id = pr->acpi_id,
139+ .u.set_pminfo.type = XEN_PM_CX,
140+ };
141+ struct xen_processor_cx *data, *buf;
142+ struct acpi_processor_cx *cx;
143+
144+ if (action == PROCESSOR_PM_CHANGE)
145+ return -EINVAL;
146+
147+ /* Convert to Xen defined structure and hypercall */
148+ buf = kzalloc(pr->power.count * sizeof(struct xen_processor_cx),
149+ GFP_KERNEL);
150+ if (!buf)
151+ return -ENOMEM;
152+
153+ data = buf;
154+ for (i = 1; i <= pr->power.count; i++) {
155+ cx = &pr->power.states[i];
156+ /* Skip invalid cstate entry */
157+ if (!cx->valid)
158+ continue;
159+
160+ data->type = cx->type;
161+ data->latency = cx->latency;
162+ data->power = cx->power;
163+ data->reg.space_id = cx->reg.space_id;
164+ data->reg.bit_width = cx->reg.bit_width;
165+ data->reg.bit_offset = cx->reg.bit_offset;
166+ data->reg.access_size = cx->reg.reserved;
167+ data->reg.address = cx->reg.address;
168+
169+ /* Get dependency relationships */
170+ if (cx->csd_count) {
171+ printk("Wow! _CSD is found. Not support for now!\n");
172+ kfree(buf);
173+ return -EINVAL;
174+ } else {
175+ data->dpcnt = 0;
176+ set_xen_guest_handle(data->dp, NULL);
177+ }
178+
179+ data++;
180+ count++;
181+ }
182+
183+ if (!count) {
184+ printk("No available Cx info for cpu %d\n", pr->acpi_id);
185+ kfree(buf);
186+ return -EINVAL;
187+ }
188+
189+ op.u.set_pminfo.power.count = count;
190+ op.u.set_pminfo.power.flags.bm_control = pr->flags.bm_control;
191+ op.u.set_pminfo.power.flags.bm_check = pr->flags.bm_check;
192+ op.u.set_pminfo.power.flags.has_cst = pr->flags.has_cst;
193+ op.u.set_pminfo.power.flags.power_setup_done = pr->flags.power_setup_done;
194+
195+ set_xen_guest_handle(op.u.set_pminfo.power.states, buf);
196+ ret = HYPERVISOR_platform_op(&op);
197+ kfree(buf);
198+ return ret;
199+}
200+
201+static int xen_px_notifier(struct acpi_processor *pr, int action)
202+{
203+ int ret = -EINVAL;
204+ xen_platform_op_t op = {
205+ .cmd = XENPF_set_processor_pminfo,
206+ .interface_version = XENPF_INTERFACE_VERSION,
207+ .u.set_pminfo.id = pr->acpi_id,
208+ .u.set_pminfo.type = XEN_PM_PX,
209+ };
210+ struct xen_processor_performance *perf;
211+ struct xen_processor_px *states = NULL;
212+ struct acpi_processor_performance *px;
213+ struct acpi_psd_package *pdomain;
214+
215+ if (!pr)
216+ return -EINVAL;
217+
218+ perf = &op.u.set_pminfo.perf;
219+ px = pr->performance;
220+
221+ switch(action) {
222+ case PROCESSOR_PM_CHANGE:
223+ /* ppc dynamic handle */
224+ perf->flags = XEN_PX_PPC;
225+ perf->platform_limit = pr->performance_platform_limit;
226+
227+ ret = HYPERVISOR_platform_op(&op);
228+ break;
229+
230+ case PROCESSOR_PM_INIT:
231+ /* px normal init */
232+ perf->flags = XEN_PX_PPC |
233+ XEN_PX_PCT |
234+ XEN_PX_PSS |
235+ XEN_PX_PSD;
236+
237+ /* ppc */
238+ perf->platform_limit = pr->performance_platform_limit;
239+
240+ /* pct */
241+ xen_convert_pct_reg(&perf->control_register, &px->control_register);
242+ xen_convert_pct_reg(&perf->status_register, &px->status_register);
243+
244+ /* pss */
245+ perf->state_count = px->state_count;
246+ states = kzalloc(px->state_count*sizeof(xen_processor_px_t),GFP_KERNEL);
247+ if (!states)
248+ return -ENOMEM;
249+ xen_convert_pss_states(states, px->states, px->state_count);
250+ set_xen_guest_handle(perf->states, states);
251+
252+ /* psd */
253+ pdomain = &px->domain_info;
254+ xen_convert_psd_pack(&perf->domain_info, pdomain);
255+ if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL)
256+ perf->shared_type = CPUFREQ_SHARED_TYPE_ALL;
257+ else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY)
258+ perf->shared_type = CPUFREQ_SHARED_TYPE_ANY;
259+ else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL)
260+ perf->shared_type = CPUFREQ_SHARED_TYPE_HW;
261+ else {
262+ ret = -ENODEV;
263+ kfree(states);
264+ break;
265+ }
266+
267+ ret = HYPERVISOR_platform_op(&op);
268+ kfree(states);
269+ break;
270+
271+ default:
272+ break;
273+ }
274+
275+ return ret;
276+}
277+
278+static int xen_tx_notifier(struct acpi_processor *pr, int action)
279+{
280+ return -EINVAL;
281+}
282+static int xen_hotplug_notifier(struct acpi_processor *pr, int event)
283+{
284+ return -EINVAL;
285+}
286+
287+static struct processor_extcntl_ops xen_extcntl_ops = {
288+ .hotplug = xen_hotplug_notifier,
289+};
290+
291+void arch_acpi_processor_init_extcntl(const struct processor_extcntl_ops **ops)
292+{
293+ unsigned int pmbits = (xen_start_info->flags & SIF_PM_MASK) >> 8;
294+
295+ if (!pmbits)
296+ return;
297+ if (pmbits & XEN_PROCESSOR_PM_CX)
298+ xen_extcntl_ops.pm_ops[PM_TYPE_IDLE] = xen_cx_notifier;
299+ if (pmbits & XEN_PROCESSOR_PM_PX)
300+ xen_extcntl_ops.pm_ops[PM_TYPE_PERF] = xen_px_notifier;
301+ if (pmbits & XEN_PROCESSOR_PM_TX)
302+ xen_extcntl_ops.pm_ops[PM_TYPE_THR] = xen_tx_notifier;
303+
304+ *ops = &xen_extcntl_ops;
305+}
306+EXPORT_SYMBOL(arch_acpi_processor_init_extcntl);
307Index: head-2008-11-25/arch/x86/kernel/acpi/sleep_32-xen.c
308===================================================================
309--- /dev/null 1970-01-01 00:00:00.000000000 +0000
310+++ head-2008-11-25/arch/x86/kernel/acpi/sleep_32-xen.c 2008-04-15 09:29:41.000000000 +0200
311@@ -0,0 +1,113 @@
312+/*
313+ * sleep.c - x86-specific ACPI sleep support.
314+ *
315+ * Copyright (C) 2001-2003 Patrick Mochel
316+ * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
317+ */
318+
319+#include <linux/acpi.h>
320+#include <linux/bootmem.h>
321+#include <linux/dmi.h>
322+#include <linux/cpumask.h>
323+
324+#include <asm/smp.h>
325+
326+#ifndef CONFIG_ACPI_PV_SLEEP
327+/* address in low memory of the wakeup routine. */
328+unsigned long acpi_wakeup_address = 0;
329+unsigned long acpi_video_flags;
330+extern char wakeup_start, wakeup_end;
331+
332+extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
333+#endif
334+
335+/**
336+ * acpi_save_state_mem - save kernel state
337+ *
338+ * Create an identity mapped page table and copy the wakeup routine to
339+ * low memory.
340+ */
341+int acpi_save_state_mem(void)
342+{
343+#ifndef CONFIG_ACPI_PV_SLEEP
344+ if (!acpi_wakeup_address)
345+ return 1;
346+ memcpy((void *)acpi_wakeup_address, &wakeup_start,
347+ &wakeup_end - &wakeup_start);
348+ acpi_copy_wakeup_routine(acpi_wakeup_address);
349+#endif
350+ return 0;
351+}
352+
353+/*
354+ * acpi_restore_state - undo effects of acpi_save_state_mem
355+ */
356+void acpi_restore_state_mem(void)
357+{
358+}
359+
360+/**
361+ * acpi_reserve_bootmem - do _very_ early ACPI initialisation
362+ *
363+ * We allocate a page from the first 1MB of memory for the wakeup
364+ * routine for when we come back from a sleep state. The
365+ * runtime allocator allows specification of <16MB pages, but not
366+ * <1MB pages.
367+ */
368+void __init acpi_reserve_bootmem(void)
369+{
370+#ifndef CONFIG_ACPI_PV_SLEEP
371+ if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
372+ printk(KERN_ERR
373+ "ACPI: Wakeup code way too big, S3 disabled.\n");
374+ return;
375+ }
376+
377+ acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
378+ if (!acpi_wakeup_address)
379+ printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
380+#endif
381+}
382+
383+#ifndef CONFIG_ACPI_PV_SLEEP
384+static int __init acpi_sleep_setup(char *str)
385+{
386+ while ((str != NULL) && (*str != '\0')) {
387+ if (strncmp(str, "s3_bios", 7) == 0)
388+ acpi_video_flags = 1;
389+ if (strncmp(str, "s3_mode", 7) == 0)
390+ acpi_video_flags |= 2;
391+ str = strchr(str, ',');
392+ if (str != NULL)
393+ str += strspn(str, ", \t");
394+ }
395+ return 1;
396+}
397+
398+__setup("acpi_sleep=", acpi_sleep_setup);
399+
400+static __init int reset_videomode_after_s3(struct dmi_system_id *d)
401+{
402+ acpi_video_flags |= 2;
403+ return 0;
404+}
405+
406+static __initdata struct dmi_system_id acpisleep_dmi_table[] = {
407+ { /* Reset video mode after returning from ACPI S3 sleep */
408+ .callback = reset_videomode_after_s3,
409+ .ident = "Toshiba Satellite 4030cdt",
410+ .matches = {
411+ DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
412+ },
413+ },
414+ {}
415+};
416+
417+static int __init acpisleep_dmi_init(void)
418+{
419+ dmi_check_system(acpisleep_dmi_table);
420+ return 0;
421+}
422+
423+core_initcall(acpisleep_dmi_init);
424+#endif /* CONFIG_ACPI_PV_SLEEP */
425Index: head-2008-11-25/arch/x86/kernel/apic_32-xen.c
426===================================================================
427--- /dev/null 1970-01-01 00:00:00.000000000 +0000
428+++ head-2008-11-25/arch/x86/kernel/apic_32-xen.c 2007-06-12 13:12:48.000000000 +0200
429@@ -0,0 +1,155 @@
430+/*
431+ * Local APIC handling, local APIC timers
432+ *
433+ * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
434+ *
435+ * Fixes
436+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
437+ * thanks to Eric Gilmore
438+ * and Rolf G. Tews
439+ * for testing these extensively.
440+ * Maciej W. Rozycki : Various updates and fixes.
441+ * Mikael Pettersson : Power Management for UP-APIC.
442+ * Pavel Machek and
443+ * Mikael Pettersson : PM converted to driver model.
444+ */
445+
446+#include <linux/init.h>
447+
448+#include <linux/mm.h>
449+#include <linux/delay.h>
450+#include <linux/bootmem.h>
451+#include <linux/smp_lock.h>
452+#include <linux/interrupt.h>
453+#include <linux/mc146818rtc.h>
454+#include <linux/kernel_stat.h>
455+#include <linux/sysdev.h>
456+#include <linux/cpu.h>
457+#include <linux/module.h>
458+
459+#include <asm/atomic.h>
460+#include <asm/smp.h>
461+#include <asm/mtrr.h>
462+#include <asm/mpspec.h>
463+#include <asm/desc.h>
464+#include <asm/arch_hooks.h>
465+#include <asm/hpet.h>
466+#include <asm/i8253.h>
467+#include <asm/nmi.h>
468+
469+#include <mach_apic.h>
470+#include <mach_apicdef.h>
471+#include <mach_ipi.h>
472+
473+#include "io_ports.h"
474+
475+#ifndef CONFIG_XEN
476+/*
477+ * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
478+ * IPIs in place of local APIC timers
479+ */
480+static cpumask_t timer_bcast_ipi;
481+#endif
482+
483+/*
484+ * Knob to control our willingness to enable the local APIC.
485+ */
486+int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
487+
488+/*
489+ * Debug level
490+ */
491+int apic_verbosity;
492+
493+#ifndef CONFIG_XEN
494+static int modern_apic(void)
495+{
496+ unsigned int lvr, version;
497+ /* AMD systems use old APIC versions, so check the CPU */
498+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
499+ boot_cpu_data.x86 >= 0xf)
500+ return 1;
501+ lvr = apic_read(APIC_LVR);
502+ version = GET_APIC_VERSION(lvr);
503+ return version >= 0x14;
504+}
505+#endif /* !CONFIG_XEN */
506+
507+/*
508+ * 'what should we do if we get a hw irq event on an illegal vector'.
509+ * each architecture has to answer this themselves.
510+ */
511+void ack_bad_irq(unsigned int irq)
512+{
513+ printk("unexpected IRQ trap at vector %02x\n", irq);
514+ /*
515+ * Currently unexpected vectors happen only on SMP and APIC.
516+ * We _must_ ack these because every local APIC has only N
517+ * irq slots per priority level, and a 'hanging, unacked' IRQ
518+ * holds up an irq slot - in excessive cases (when multiple
519+ * unexpected vectors occur) that might lock up the APIC
520+ * completely.
521+ * But only ack when the APIC is enabled -AK
522+ */
523+ if (cpu_has_apic)
524+ ack_APIC_irq();
525+}
526+
527+int get_physical_broadcast(void)
528+{
529+ return 0xff;
530+}
531+
532+#ifndef CONFIG_XEN
533+#ifndef CONFIG_SMP
534+static void up_apic_timer_interrupt_call(struct pt_regs *regs)
535+{
536+ int cpu = smp_processor_id();
537+
538+ /*
539+ * the NMI deadlock-detector uses this.
540+ */
541+ per_cpu(irq_stat, cpu).apic_timer_irqs++;
542+
543+ smp_local_timer_interrupt(regs);
544+}
545+#endif
546+
547+void smp_send_timer_broadcast_ipi(struct pt_regs *regs)
548+{
549+ cpumask_t mask;
550+
551+ cpus_and(mask, cpu_online_map, timer_bcast_ipi);
552+ if (!cpus_empty(mask)) {
553+#ifdef CONFIG_SMP
554+ send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
555+#else
556+ /*
557+ * We can directly call the apic timer interrupt handler
558+ * in UP case. Minus all irq related functions
559+ */
560+ up_apic_timer_interrupt_call(regs);
561+#endif
562+ }
563+}
564+#endif
565+
566+int setup_profiling_timer(unsigned int multiplier)
567+{
568+ return -EINVAL;
569+}
570+
571+/*
572+ * This initializes the IO-APIC and APIC hardware if this is
573+ * a UP kernel.
574+ */
575+int __init APIC_init_uniprocessor (void)
576+{
577+#ifdef CONFIG_X86_IO_APIC
578+ if (smp_found_config)
579+ if (!skip_ioapic_setup && nr_ioapics)
580+ setup_IO_APIC();
581+#endif
582+
583+ return 0;
584+}
585Index: head-2008-11-25/arch/x86/kernel/cpu/common-xen.c
586===================================================================
587--- /dev/null 1970-01-01 00:00:00.000000000 +0000
588+++ head-2008-11-25/arch/x86/kernel/cpu/common-xen.c 2007-12-10 08:47:31.000000000 +0100
589@@ -0,0 +1,743 @@
590+#include <linux/init.h>
591+#include <linux/string.h>
592+#include <linux/delay.h>
593+#include <linux/smp.h>
594+#include <linux/module.h>
595+#include <linux/percpu.h>
596+#include <linux/bootmem.h>
597+#include <asm/semaphore.h>
598+#include <asm/processor.h>
599+#include <asm/i387.h>
600+#include <asm/msr.h>
601+#include <asm/io.h>
602+#include <asm/mmu_context.h>
603+#include <asm/mtrr.h>
604+#include <asm/mce.h>
605+#ifdef CONFIG_X86_LOCAL_APIC
606+#include <asm/mpspec.h>
607+#include <asm/apic.h>
608+#include <mach_apic.h>
609+#else
610+#ifdef CONFIG_XEN
611+#define phys_pkg_id(a,b) a
612+#endif
613+#endif
614+#include <asm/hypervisor.h>
615+
616+#include "cpu.h"
617+
618+DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
619+EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
620+
621+#ifndef CONFIG_XEN
622+DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
623+EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
624+#endif
625+
626+static int cachesize_override __cpuinitdata = -1;
627+static int disable_x86_fxsr __cpuinitdata;
628+static int disable_x86_serial_nr __cpuinitdata = 1;
629+static int disable_x86_sep __cpuinitdata;
630+
631+struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
632+
633+extern int disable_pse;
634+
635+static void default_init(struct cpuinfo_x86 * c)
636+{
637+ /* Not much we can do here... */
638+ /* Check if at least it has cpuid */
639+ if (c->cpuid_level == -1) {
640+ /* No cpuid. It must be an ancient CPU */
641+ if (c->x86 == 4)
642+ strcpy(c->x86_model_id, "486");
643+ else if (c->x86 == 3)
644+ strcpy(c->x86_model_id, "386");
645+ }
646+}
647+
648+static struct cpu_dev default_cpu = {
649+ .c_init = default_init,
650+ .c_vendor = "Unknown",
651+};
652+static struct cpu_dev * this_cpu = &default_cpu;
653+
654+static int __init cachesize_setup(char *str)
655+{
656+ get_option (&str, &cachesize_override);
657+ return 1;
658+}
659+__setup("cachesize=", cachesize_setup);
660+
661+int __cpuinit get_model_name(struct cpuinfo_x86 *c)
662+{
663+ unsigned int *v;
664+ char *p, *q;
665+
666+ if (cpuid_eax(0x80000000) < 0x80000004)
667+ return 0;
668+
669+ v = (unsigned int *) c->x86_model_id;
670+ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
671+ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
672+ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
673+ c->x86_model_id[48] = 0;
674+
675+ /* Intel chips right-justify this string for some dumb reason;
676+ undo that brain damage */
677+ p = q = &c->x86_model_id[0];
678+ while ( *p == ' ' )
679+ p++;
680+ if ( p != q ) {
681+ while ( *p )
682+ *q++ = *p++;
683+ while ( q <= &c->x86_model_id[48] )
684+ *q++ = '\0'; /* Zero-pad the rest */
685+ }
686+
687+ return 1;
688+}
689+
690+
691+void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
692+{
693+ unsigned int n, dummy, ecx, edx, l2size;
694+
695+ n = cpuid_eax(0x80000000);
696+
697+ if (n >= 0x80000005) {
698+ cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
699+ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
700+ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
701+ c->x86_cache_size=(ecx>>24)+(edx>>24);
702+ }
703+
704+ if (n < 0x80000006) /* Some chips just has a large L1. */
705+ return;
706+
707+ ecx = cpuid_ecx(0x80000006);
708+ l2size = ecx >> 16;
709+
710+ /* do processor-specific cache resizing */
711+ if (this_cpu->c_size_cache)
712+ l2size = this_cpu->c_size_cache(c,l2size);
713+
714+ /* Allow user to override all this if necessary. */
715+ if (cachesize_override != -1)
716+ l2size = cachesize_override;
717+
718+ if ( l2size == 0 )
719+ return; /* Again, no L2 cache is possible */
720+
721+ c->x86_cache_size = l2size;
722+
723+ printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
724+ l2size, ecx & 0xFF);
725+}
726+
727+/* Naming convention should be: <Name> [(<Codename>)] */
728+/* This table only is used unless init_<vendor>() below doesn't set it; */
729+/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
730+
731+/* Look up CPU names by table lookup. */
732+static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
733+{
734+ struct cpu_model_info *info;
735+
736+ if ( c->x86_model >= 16 )
737+ return NULL; /* Range check */
738+
739+ if (!this_cpu)
740+ return NULL;
741+
742+ info = this_cpu->c_models;
743+
744+ while (info && info->family) {
745+ if (info->family == c->x86)
746+ return info->model_names[c->x86_model];
747+ info++;
748+ }
749+ return NULL; /* Not found */
750+}
751+
752+
753+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
754+{
755+ char *v = c->x86_vendor_id;
756+ int i;
757+ static int printed;
758+
759+ for (i = 0; i < X86_VENDOR_NUM; i++) {
760+ if (cpu_devs[i]) {
761+ if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
762+ (cpu_devs[i]->c_ident[1] &&
763+ !strcmp(v,cpu_devs[i]->c_ident[1]))) {
764+ c->x86_vendor = i;
765+ if (!early)
766+ this_cpu = cpu_devs[i];
767+ return;
768+ }
769+ }
770+ }
771+ if (!printed) {
772+ printed++;
773+ printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
774+ printk(KERN_ERR "CPU: Your system may be unstable.\n");
775+ }
776+ c->x86_vendor = X86_VENDOR_UNKNOWN;
777+ this_cpu = &default_cpu;
778+}
779+
780+
781+static int __init x86_fxsr_setup(char * s)
782+{
783+ disable_x86_fxsr = 1;
784+ return 1;
785+}
786+__setup("nofxsr", x86_fxsr_setup);
787+
788+
789+static int __init x86_sep_setup(char * s)
790+{
791+ disable_x86_sep = 1;
792+ return 1;
793+}
794+__setup("nosep", x86_sep_setup);
795+
796+
797+/* Standard macro to see if a specific flag is changeable */
798+static inline int flag_is_changeable_p(u32 flag)
799+{
800+ u32 f1, f2;
801+
802+ asm("pushfl\n\t"
803+ "pushfl\n\t"
804+ "popl %0\n\t"
805+ "movl %0,%1\n\t"
806+ "xorl %2,%0\n\t"
807+ "pushl %0\n\t"
808+ "popfl\n\t"
809+ "pushfl\n\t"
810+ "popl %0\n\t"
811+ "popfl\n\t"
812+ : "=&r" (f1), "=&r" (f2)
813+ : "ir" (flag));
814+
815+ return ((f1^f2) & flag) != 0;
816+}
817+
818+
819+/* Probe for the CPUID instruction */
820+static int __cpuinit have_cpuid_p(void)
821+{
822+ return flag_is_changeable_p(X86_EFLAGS_ID);
823+}
824+
825+/* Do minimum CPU detection early.
826+ Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
827+ The others are not touched to avoid unwanted side effects.
828+
829+ WARNING: this function is only called on the BP. Don't add code here
830+ that is supposed to run on all CPUs. */
831+static void __init early_cpu_detect(void)
832+{
833+ struct cpuinfo_x86 *c = &boot_cpu_data;
834+
835+ c->x86_cache_alignment = 32;
836+
837+ if (!have_cpuid_p())
838+ return;
839+
840+ /* Get vendor name */
841+ cpuid(0x00000000, &c->cpuid_level,
842+ (int *)&c->x86_vendor_id[0],
843+ (int *)&c->x86_vendor_id[8],
844+ (int *)&c->x86_vendor_id[4]);
845+
846+ get_cpu_vendor(c, 1);
847+
848+ c->x86 = 4;
849+ if (c->cpuid_level >= 0x00000001) {
850+ u32 junk, tfms, cap0, misc;
851+ cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
852+ c->x86 = (tfms >> 8) & 15;
853+ c->x86_model = (tfms >> 4) & 15;
854+ if (c->x86 == 0xf)
855+ c->x86 += (tfms >> 20) & 0xff;
856+ if (c->x86 >= 0x6)
857+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
858+ c->x86_mask = tfms & 15;
859+ if (cap0 & (1<<19))
860+ c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
861+ }
862+}
863+
864+void __cpuinit generic_identify(struct cpuinfo_x86 * c)
865+{
866+ u32 tfms, xlvl;
867+ int ebx;
868+
869+ if (have_cpuid_p()) {
870+ /* Get vendor name */
871+ cpuid(0x00000000, &c->cpuid_level,
872+ (int *)&c->x86_vendor_id[0],
873+ (int *)&c->x86_vendor_id[8],
874+ (int *)&c->x86_vendor_id[4]);
875+
876+ get_cpu_vendor(c, 0);
877+ /* Initialize the standard set of capabilities */
878+ /* Note that the vendor-specific code below might override */
879+
880+ /* Intel-defined flags: level 0x00000001 */
881+ if ( c->cpuid_level >= 0x00000001 ) {
882+ u32 capability, excap;
883+ cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
884+ c->x86_capability[0] = capability;
885+ c->x86_capability[4] = excap;
886+ c->x86 = (tfms >> 8) & 15;
887+ c->x86_model = (tfms >> 4) & 15;
888+ if (c->x86 == 0xf)
889+ c->x86 += (tfms >> 20) & 0xff;
890+ if (c->x86 >= 0x6)
891+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
892+ c->x86_mask = tfms & 15;
893+#ifdef CONFIG_X86_HT
894+ c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
895+#else
896+ c->apicid = (ebx >> 24) & 0xFF;
897+#endif
898+ } else {
899+ /* Have CPUID level 0 only - unheard of */
900+ c->x86 = 4;
901+ }
902+
903+ /* AMD-defined flags: level 0x80000001 */
904+ xlvl = cpuid_eax(0x80000000);
905+ if ( (xlvl & 0xffff0000) == 0x80000000 ) {
906+ if ( xlvl >= 0x80000001 ) {
907+ c->x86_capability[1] = cpuid_edx(0x80000001);
908+ c->x86_capability[6] = cpuid_ecx(0x80000001);
909+ }
910+ if ( xlvl >= 0x80000004 )
911+ get_model_name(c); /* Default name */
912+ }
913+ }
914+
915+ early_intel_workaround(c);
916+
917+#ifdef CONFIG_X86_HT
918+ c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
919+#endif
920+}
921+
922+static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
923+{
924+ if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
925+ /* Disable processor serial number */
926+ unsigned long lo,hi;
927+ rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
928+ lo |= 0x200000;
929+ wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
930+ printk(KERN_NOTICE "CPU serial number disabled.\n");
931+ clear_bit(X86_FEATURE_PN, c->x86_capability);
932+
933+ /* Disabling the serial number may affect the cpuid level */
934+ c->cpuid_level = cpuid_eax(0);
935+ }
936+}
937+
938+static int __init x86_serial_nr_setup(char *s)
939+{
940+ disable_x86_serial_nr = 0;
941+ return 1;
942+}
943+__setup("serialnumber", x86_serial_nr_setup);
944+
945+
946+
947+/*
948+ * This does the hard work of actually picking apart the CPU stuff...
949+ */
950+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
951+{
952+ int i;
953+
954+ c->loops_per_jiffy = loops_per_jiffy;
955+ c->x86_cache_size = -1;
956+ c->x86_vendor = X86_VENDOR_UNKNOWN;
957+ c->cpuid_level = -1; /* CPUID not detected */
958+ c->x86_model = c->x86_mask = 0; /* So far unknown... */
959+ c->x86_vendor_id[0] = '\0'; /* Unset */
960+ c->x86_model_id[0] = '\0'; /* Unset */
961+ c->x86_max_cores = 1;
962+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
963+
964+ if (!have_cpuid_p()) {
965+ /* First of all, decide if this is a 486 or higher */
966+ /* It's a 486 if we can modify the AC flag */
967+ if ( flag_is_changeable_p(X86_EFLAGS_AC) )
968+ c->x86 = 4;
969+ else
970+ c->x86 = 3;
971+ }
972+
973+ generic_identify(c);
974+
975+ printk(KERN_DEBUG "CPU: After generic identify, caps:");
976+ for (i = 0; i < NCAPINTS; i++)
977+ printk(" %08lx", c->x86_capability[i]);
978+ printk("\n");
979+
980+ if (this_cpu->c_identify) {
981+ this_cpu->c_identify(c);
982+
983+ printk(KERN_DEBUG "CPU: After vendor identify, caps:");
984+ for (i = 0; i < NCAPINTS; i++)
985+ printk(" %08lx", c->x86_capability[i]);
986+ printk("\n");
987+ }
988+
989+ /*
990+ * Vendor-specific initialization. In this section we
991+ * canonicalize the feature flags, meaning if there are
992+ * features a certain CPU supports which CPUID doesn't
993+ * tell us, CPUID claiming incorrect flags, or other bugs,
994+ * we handle them here.
995+ *
996+ * At the end of this section, c->x86_capability better
997+ * indicate the features this CPU genuinely supports!
998+ */
999+ if (this_cpu->c_init)
1000+ this_cpu->c_init(c);
1001+
1002+ /* Disable the PN if appropriate */
1003+ squash_the_stupid_serial_number(c);
1004+
1005+ /*
1006+ * The vendor-specific functions might have changed features. Now
1007+ * we do "generic changes."
1008+ */
1009+
1010+ /* TSC disabled? */
1011+ if ( tsc_disable )
1012+ clear_bit(X86_FEATURE_TSC, c->x86_capability);
1013+
1014+ /* FXSR disabled? */
1015+ if (disable_x86_fxsr) {
1016+ clear_bit(X86_FEATURE_FXSR, c->x86_capability);
1017+ clear_bit(X86_FEATURE_XMM, c->x86_capability);
1018+ }
1019+
1020+ /* SEP disabled? */
1021+ if (disable_x86_sep)
1022+ clear_bit(X86_FEATURE_SEP, c->x86_capability);
1023+
1024+ if (disable_pse)
1025+ clear_bit(X86_FEATURE_PSE, c->x86_capability);
1026+
1027+ /* If the model name is still unset, do table lookup. */
1028+ if ( !c->x86_model_id[0] ) {
1029+ char *p;
1030+ p = table_lookup_model(c);
1031+ if ( p )
1032+ strcpy(c->x86_model_id, p);
1033+ else
1034+ /* Last resort... */
1035+ sprintf(c->x86_model_id, "%02x/%02x",
1036+ c->x86, c->x86_model);
1037+ }
1038+
1039+ /* Now the feature flags better reflect actual CPU features! */
1040+
1041+ printk(KERN_DEBUG "CPU: After all inits, caps:");
1042+ for (i = 0; i < NCAPINTS; i++)
1043+ printk(" %08lx", c->x86_capability[i]);
1044+ printk("\n");
1045+
1046+ /*
1047+ * On SMP, boot_cpu_data holds the common feature set between
1048+ * all CPUs; so make sure that we indicate which features are
1049+ * common between the CPUs. The first time this routine gets
1050+ * executed, c == &boot_cpu_data.
1051+ */
1052+ if ( c != &boot_cpu_data ) {
1053+ /* AND the already accumulated flags with these */
1054+ for ( i = 0 ; i < NCAPINTS ; i++ )
1055+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
1056+ }
1057+
1058+ /* Init Machine Check Exception if available. */
1059+ mcheck_init(c);
1060+
1061+ if (c == &boot_cpu_data)
1062+ sysenter_setup();
1063+ enable_sep_cpu();
1064+
1065+ if (c == &boot_cpu_data)
1066+ mtrr_bp_init();
1067+ else
1068+ mtrr_ap_init();
1069+}
1070+
1071+#ifdef CONFIG_X86_HT
1072+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
1073+{
1074+ u32 eax, ebx, ecx, edx;
1075+ int index_msb, core_bits;
1076+
1077+ cpuid(1, &eax, &ebx, &ecx, &edx);
1078+
1079+ if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
1080+ return;
1081+
1082+ smp_num_siblings = (ebx & 0xff0000) >> 16;
1083+
1084+ if (smp_num_siblings == 1) {
1085+ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
1086+ } else if (smp_num_siblings > 1 ) {
1087+
1088+ if (smp_num_siblings > NR_CPUS) {
1089+ printk(KERN_WARNING "CPU: Unsupported number of the "
1090+ "siblings %d", smp_num_siblings);
1091+ smp_num_siblings = 1;
1092+ return;
1093+ }
1094+
1095+ index_msb = get_count_order(smp_num_siblings);
1096+ c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
1097+
1098+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
1099+ c->phys_proc_id);
1100+
1101+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
1102+
1103+ index_msb = get_count_order(smp_num_siblings) ;
1104+
1105+ core_bits = get_count_order(c->x86_max_cores);
1106+
1107+ c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
1108+ ((1 << core_bits) - 1);
1109+
1110+ if (c->x86_max_cores > 1)
1111+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
1112+ c->cpu_core_id);
1113+ }
1114+}
1115+#endif
1116+
1117+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1118+{
1119+ char *vendor = NULL;
1120+
1121+ if (c->x86_vendor < X86_VENDOR_NUM)
1122+ vendor = this_cpu->c_vendor;
1123+ else if (c->cpuid_level >= 0)
1124+ vendor = c->x86_vendor_id;
1125+
1126+ if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
1127+ printk("%s ", vendor);
1128+
1129+ if (!c->x86_model_id[0])
1130+ printk("%d86", c->x86);
1131+ else
1132+ printk("%s", c->x86_model_id);
1133+
1134+ if (c->x86_mask || c->cpuid_level >= 0)
1135+ printk(" stepping %02x\n", c->x86_mask);
1136+ else
1137+ printk("\n");
1138+}
1139+
1140+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
1141+
1142+/* This is hacky. :)
1143+ * We're emulating future behavior.
1144+ * In the future, the cpu-specific init functions will be called implicitly
1145+ * via the magic of initcalls.
1146+ * They will insert themselves into the cpu_devs structure.
1147+ * Then, when cpu_init() is called, we can just iterate over that array.
1148+ */
1149+
1150+extern int intel_cpu_init(void);
1151+extern int cyrix_init_cpu(void);
1152+extern int nsc_init_cpu(void);
1153+extern int amd_init_cpu(void);
1154+extern int centaur_init_cpu(void);
1155+extern int transmeta_init_cpu(void);
1156+extern int rise_init_cpu(void);
1157+extern int nexgen_init_cpu(void);
1158+extern int umc_init_cpu(void);
1159+
1160+void __init early_cpu_init(void)
1161+{
1162+ intel_cpu_init();
1163+ cyrix_init_cpu();
1164+ nsc_init_cpu();
1165+ amd_init_cpu();
1166+ centaur_init_cpu();
1167+ transmeta_init_cpu();
1168+ rise_init_cpu();
1169+ nexgen_init_cpu();
1170+ umc_init_cpu();
1171+ early_cpu_detect();
1172+
1173+#ifdef CONFIG_DEBUG_PAGEALLOC
1174+ /* pse is not compatible with on-the-fly unmapping,
1175+ * disable it even if the cpus claim to support it.
1176+ */
1177+ clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
1178+ disable_pse = 1;
1179+#endif
1180+}
1181+
1182+static void __cpuinit cpu_gdt_init(const struct Xgt_desc_struct *gdt_descr)
1183+{
1184+ unsigned long frames[16];
1185+ unsigned long va;
1186+ int f;
1187+
1188+ for (va = gdt_descr->address, f = 0;
1189+ va < gdt_descr->address + gdt_descr->size;
1190+ va += PAGE_SIZE, f++) {
1191+ frames[f] = virt_to_mfn(va);
1192+ make_lowmem_page_readonly(
1193+ (void *)va, XENFEAT_writable_descriptor_tables);
1194+ }
1195+ if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) / 8))
1196+ BUG();
1197+}
1198+
1199+/*
1200+ * cpu_init() initializes state that is per-CPU. Some data is already
1201+ * initialized (naturally) in the bootstrap process, such as the GDT
1202+ * and IDT. We reload them nevertheless, this function acts as a
1203+ * 'CPU state barrier', nothing should get across.
1204+ */
1205+void __cpuinit cpu_init(void)
1206+{
1207+ int cpu = smp_processor_id();
1208+#ifndef CONFIG_X86_NO_TSS
1209+ struct tss_struct * t = &per_cpu(init_tss, cpu);
1210+#endif
1211+ struct thread_struct *thread = &current->thread;
1212+ struct desc_struct *gdt;
1213+ struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
1214+
1215+ if (cpu_test_and_set(cpu, cpu_initialized)) {
1216+ printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
1217+ for (;;) local_irq_enable();
1218+ }
1219+ printk(KERN_INFO "Initializing CPU#%d\n", cpu);
1220+
1221+ if (cpu_has_vme || cpu_has_de)
1222+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1223+ if (tsc_disable && cpu_has_tsc) {
1224+ printk(KERN_NOTICE "Disabling TSC...\n");
1225+ /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
1226+ clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
1227+ set_in_cr4(X86_CR4_TSD);
1228+ }
1229+
1230+#ifndef CONFIG_XEN
1231+ /* The CPU hotplug case */
1232+ if (cpu_gdt_descr->address) {
1233+ gdt = (struct desc_struct *)cpu_gdt_descr->address;
1234+ memset(gdt, 0, PAGE_SIZE);
1235+ goto old_gdt;
1236+ }
1237+ /*
1238+ * This is a horrible hack to allocate the GDT. The problem
1239+ * is that cpu_init() is called really early for the boot CPU
1240+ * (and hence needs bootmem) but much later for the secondary
1241+ * CPUs, when bootmem will have gone away
1242+ */
1243+ if (NODE_DATA(0)->bdata->node_bootmem_map) {
1244+ gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
1245+ /* alloc_bootmem_pages panics on failure, so no check */
1246+ memset(gdt, 0, PAGE_SIZE);
1247+ } else {
1248+ gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
1249+ if (unlikely(!gdt)) {
1250+ printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
1251+ for (;;)
1252+ local_irq_enable();
1253+ }
1254+ }
1255+old_gdt:
1256+ /*
1257+ * Initialize the per-CPU GDT with the boot GDT,
1258+ * and set up the GDT descriptor:
1259+ */
1260+ memcpy(gdt, cpu_gdt_table, GDT_SIZE);
1261+
1262+ /* Set up GDT entry for 16bit stack */
1263+ *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
1264+ ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
1265+ ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
1266+ (CPU_16BIT_STACK_SIZE - 1);
1267+
1268+ cpu_gdt_descr->size = GDT_SIZE - 1;
1269+ cpu_gdt_descr->address = (unsigned long)gdt;
1270+#else
1271+ if (cpu == 0 && cpu_gdt_descr->address == 0) {
1272+ gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
1273+ /* alloc_bootmem_pages panics on failure, so no check */
1274+ memset(gdt, 0, PAGE_SIZE);
1275+
1276+ memcpy(gdt, cpu_gdt_table, GDT_SIZE);
1277+
1278+ cpu_gdt_descr->size = GDT_SIZE;
1279+ cpu_gdt_descr->address = (unsigned long)gdt;
1280+ }
1281+#endif
1282+
1283+ cpu_gdt_init(cpu_gdt_descr);
1284+
1285+ /*
1286+ * Set up and load the per-CPU TSS and LDT
1287+ */
1288+ atomic_inc(&init_mm.mm_count);
1289+ current->active_mm = &init_mm;
1290+ if (current->mm)
1291+ BUG();
1292+ enter_lazy_tlb(&init_mm, current);
1293+
1294+ load_esp0(t, thread);
1295+
1296+ load_LDT(&init_mm.context);
1297+
1298+#ifdef CONFIG_DOUBLEFAULT
1299+ /* Set up doublefault TSS pointer in the GDT */
1300+ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
1301+#endif
1302+
1303+ /* Clear %fs and %gs. */
1304+ asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
1305+
1306+ /* Clear all 6 debug registers: */
1307+ set_debugreg(0, 0);
1308+ set_debugreg(0, 1);
1309+ set_debugreg(0, 2);
1310+ set_debugreg(0, 3);
1311+ set_debugreg(0, 6);
1312+ set_debugreg(0, 7);
1313+
1314+ /*
1315+ * Force FPU initialization:
1316+ */
1317+ current_thread_info()->status = 0;
1318+ clear_used_math();
1319+ mxcsr_feature_mask_init();
1320+}
1321+
1322+#ifdef CONFIG_HOTPLUG_CPU
1323+void __cpuinit cpu_uninit(void)
1324+{
1325+ int cpu = raw_smp_processor_id();
1326+ cpu_clear(cpu, cpu_initialized);
1327+
1328+ /* lazy TLB state */
1329+ per_cpu(cpu_tlbstate, cpu).state = 0;
1330+ per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
1331+}
1332+#endif
1333Index: head-2008-11-25/arch/x86/kernel/cpu/mtrr/main-xen.c
1334===================================================================
1335--- /dev/null 1970-01-01 00:00:00.000000000 +0000
1336+++ head-2008-11-25/arch/x86/kernel/cpu/mtrr/main-xen.c 2008-01-28 12:24:18.000000000 +0100
1337@@ -0,0 +1,198 @@
1338+#include <linux/init.h>
1339+#include <linux/proc_fs.h>
1340+#include <linux/ctype.h>
1341+#include <linux/module.h>
1342+#include <linux/seq_file.h>
1343+#include <asm/uaccess.h>
1344+#include <linux/mutex.h>
1345+
1346+#include <asm/mtrr.h>
1347+#include "mtrr.h"
1348+
1349+static DEFINE_MUTEX(mtrr_mutex);
1350+
1351+void generic_get_mtrr(unsigned int reg, unsigned long *base,
1352+ unsigned int *size, mtrr_type * type)
1353+{
1354+ struct xen_platform_op op;
1355+
1356+ op.cmd = XENPF_read_memtype;
1357+ op.u.read_memtype.reg = reg;
1358+ if (unlikely(HYPERVISOR_platform_op(&op)))
1359+ memset(&op.u.read_memtype, 0, sizeof(op.u.read_memtype));
1360+
1361+ *size = op.u.read_memtype.nr_mfns;
1362+ *base = op.u.read_memtype.mfn;
1363+ *type = op.u.read_memtype.type;
1364+}
1365+
1366+struct mtrr_ops generic_mtrr_ops = {
1367+ .use_intel_if = 1,
1368+ .get = generic_get_mtrr,
1369+};
1370+
1371+struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
1372+unsigned int num_var_ranges;
1373+unsigned int *usage_table;
1374+
1375+static void __init set_num_var_ranges(void)
1376+{
1377+ struct xen_platform_op op;
1378+
1379+ for (num_var_ranges = 0; ; num_var_ranges++) {
1380+ op.cmd = XENPF_read_memtype;
1381+ op.u.read_memtype.reg = num_var_ranges;
1382+ if (HYPERVISOR_platform_op(&op) != 0)
1383+ break;
1384+ }
1385+}
1386+
1387+static void __init init_table(void)
1388+{
1389+ int i, max;
1390+
1391+ max = num_var_ranges;
1392+ if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
1393+ == NULL) {
1394+ printk(KERN_ERR "mtrr: could not allocate\n");
1395+ return;
1396+ }
1397+ for (i = 0; i < max; i++)
1398+ usage_table[i] = 0;
1399+}
1400+
1401+int mtrr_add_page(unsigned long base, unsigned long size,
1402+ unsigned int type, char increment)
1403+{
1404+ int error;
1405+ struct xen_platform_op op;
1406+
1407+ mutex_lock(&mtrr_mutex);
1408+
1409+ op.cmd = XENPF_add_memtype;
1410+ op.u.add_memtype.mfn = base;
1411+ op.u.add_memtype.nr_mfns = size;
1412+ op.u.add_memtype.type = type;
1413+ error = HYPERVISOR_platform_op(&op);
1414+ if (error) {
1415+ mutex_unlock(&mtrr_mutex);
1416+ BUG_ON(error > 0);
1417+ return error;
1418+ }
1419+
1420+ if (increment)
1421+ ++usage_table[op.u.add_memtype.reg];
1422+
1423+ mutex_unlock(&mtrr_mutex);
1424+
1425+ return op.u.add_memtype.reg;
1426+}
1427+
1428+static int mtrr_check(unsigned long base, unsigned long size)
1429+{
1430+ if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
1431+ printk(KERN_WARNING
1432+ "mtrr: size and base must be multiples of 4 kiB\n");
1433+ printk(KERN_DEBUG
1434+ "mtrr: size: 0x%lx base: 0x%lx\n", size, base);
1435+ dump_stack();
1436+ return -1;
1437+ }
1438+ return 0;
1439+}
1440+
1441+int
1442+mtrr_add(unsigned long base, unsigned long size, unsigned int type,
1443+ char increment)
1444+{
1445+ if (mtrr_check(base, size))
1446+ return -EINVAL;
1447+ return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
1448+ increment);
1449+}
1450+
1451+int mtrr_del_page(int reg, unsigned long base, unsigned long size)
1452+{
1453+ unsigned i;
1454+ mtrr_type ltype;
1455+ unsigned long lbase;
1456+ unsigned int lsize;
1457+ int error = -EINVAL;
1458+ struct xen_platform_op op;
1459+
1460+ mutex_lock(&mtrr_mutex);
1461+
1462+ if (reg < 0) {
1463+ /* Search for existing MTRR */
1464+ for (i = 0; i < num_var_ranges; ++i) {
1465+ mtrr_if->get(i, &lbase, &lsize, &ltype);
1466+ if (lbase == base && lsize == size) {
1467+ reg = i;
1468+ break;
1469+ }
1470+ }
1471+ if (reg < 0) {
1472+ printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
1473+ size);
1474+ goto out;
1475+ }
1476+ }
1477+ if (usage_table[reg] < 1) {
1478+ printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
1479+ goto out;
1480+ }
1481+ if (--usage_table[reg] < 1) {
1482+ op.cmd = XENPF_del_memtype;
1483+ op.u.del_memtype.handle = 0;
1484+ op.u.del_memtype.reg = reg;
1485+ error = HYPERVISOR_platform_op(&op);
1486+ if (error) {
1487+ BUG_ON(error > 0);
1488+ goto out;
1489+ }
1490+ }
1491+ error = reg;
1492+ out:
1493+ mutex_unlock(&mtrr_mutex);
1494+ return error;
1495+}
1496+
1497+int
1498+mtrr_del(int reg, unsigned long base, unsigned long size)
1499+{
1500+ if (mtrr_check(base, size))
1501+ return -EINVAL;
1502+ return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
1503+}
1504+
1505+EXPORT_SYMBOL(mtrr_add);
1506+EXPORT_SYMBOL(mtrr_del);
1507+
1508+void __init mtrr_bp_init(void)
1509+{
1510+}
1511+
1512+void mtrr_ap_init(void)
1513+{
1514+}
1515+
1516+static int __init mtrr_init(void)
1517+{
1518+ struct cpuinfo_x86 *c = &boot_cpu_data;
1519+
1520+ if (!is_initial_xendomain())
1521+ return -ENODEV;
1522+
1523+ if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
1524+ (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
1525+ (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
1526+ (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
1527+ return -ENODEV;
1528+
1529+ set_num_var_ranges();
1530+ init_table();
1531+
1532+ return 0;
1533+}
1534+
1535+subsys_initcall(mtrr_init);
1536Index: head-2008-11-25/arch/x86/kernel/entry_32-xen.S
1537===================================================================
1538--- /dev/null 1970-01-01 00:00:00.000000000 +0000
1539+++ head-2008-11-25/arch/x86/kernel/entry_32-xen.S 2007-12-10 08:47:31.000000000 +0100
1540@@ -0,0 +1,1238 @@
1541+/*
1542+ * linux/arch/i386/entry.S
1543+ *
1544+ * Copyright (C) 1991, 1992 Linus Torvalds
1545+ */
1546+
1547+/*
1548+ * entry.S contains the system-call and fault low-level handling routines.
1549+ * This also contains the timer-interrupt handler, as well as all interrupts
1550+ * and faults that can result in a task-switch.
1551+ *
1552+ * NOTE: This code handles signal-recognition, which happens every time
1553+ * after a timer-interrupt and after each system call.
1554+ *
1555+ * I changed all the .align's to 4 (16 byte alignment), as that's faster
1556+ * on a 486.
1557+ *
1558+ * Stack layout in 'ret_from_system_call':
1559+ * ptrace needs to have all regs on the stack.
1560+ * if the order here is changed, it needs to be
1561+ * updated in fork.c:copy_process, signal.c:do_signal,
1562+ * ptrace.c and ptrace.h
1563+ *
1564+ * 0(%esp) - %ebx
1565+ * 4(%esp) - %ecx
1566+ * 8(%esp) - %edx
1567+ * C(%esp) - %esi
1568+ * 10(%esp) - %edi
1569+ * 14(%esp) - %ebp
1570+ * 18(%esp) - %eax
1571+ * 1C(%esp) - %ds
1572+ * 20(%esp) - %es
1573+ * 24(%esp) - orig_eax
1574+ * 28(%esp) - %eip
1575+ * 2C(%esp) - %cs
1576+ * 30(%esp) - %eflags
1577+ * 34(%esp) - %oldesp
1578+ * 38(%esp) - %oldss
1579+ *
1580+ * "current" is in register %ebx during any slow entries.
1581+ */
1582+
1583+#include <linux/linkage.h>
1584+#include <asm/thread_info.h>
1585+#include <asm/irqflags.h>
1586+#include <asm/errno.h>
1587+#include <asm/segment.h>
1588+#include <asm/smp.h>
1589+#include <asm/page.h>
1590+#include <asm/desc.h>
1591+#include <asm/dwarf2.h>
1592+#include "irq_vectors.h"
1593+#include <xen/interface/xen.h>
1594+
1595+#define nr_syscalls ((syscall_table_size)/4)
1596+
1597+EBX = 0x00
1598+ECX = 0x04
1599+EDX = 0x08
1600+ESI = 0x0C
1601+EDI = 0x10
1602+EBP = 0x14
1603+EAX = 0x18
1604+DS = 0x1C
1605+ES = 0x20
1606+ORIG_EAX = 0x24
1607+EIP = 0x28
1608+CS = 0x2C
1609+EFLAGS = 0x30
1610+OLDESP = 0x34
1611+OLDSS = 0x38
1612+
1613+CF_MASK = 0x00000001
1614+TF_MASK = 0x00000100
1615+IF_MASK = 0x00000200
1616+DF_MASK = 0x00000400
1617+NT_MASK = 0x00004000
1618+VM_MASK = 0x00020000
1619+/* Pseudo-eflags. */
1620+NMI_MASK = 0x80000000
1621+
1622+#ifndef CONFIG_XEN
1623+#define DISABLE_INTERRUPTS cli
1624+#define ENABLE_INTERRUPTS sti
1625+#else
1626+/* Offsets into shared_info_t. */
1627+#define evtchn_upcall_pending /* 0 */
1628+#define evtchn_upcall_mask 1
1629+
1630+#define sizeof_vcpu_shift 6
1631+
1632+#ifdef CONFIG_SMP
1633+#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
1634+ shl $sizeof_vcpu_shift,%esi ; \
1635+ addl HYPERVISOR_shared_info,%esi
1636+#else
1637+#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
1638+#endif
1639+
1640+#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
1641+#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
1642+#define DISABLE_INTERRUPTS GET_VCPU_INFO ; \
1643+ __DISABLE_INTERRUPTS
1644+#define ENABLE_INTERRUPTS GET_VCPU_INFO ; \
1645+ __ENABLE_INTERRUPTS
1646+#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
1647+#endif
1648+
1649+#ifdef CONFIG_PREEMPT
1650+#define preempt_stop cli; TRACE_IRQS_OFF
1651+#else
1652+#define preempt_stop
1653+#define resume_kernel restore_nocheck
1654+#endif
1655+
1656+.macro TRACE_IRQS_IRET
1657+#ifdef CONFIG_TRACE_IRQFLAGS
1658+ testl $IF_MASK,EFLAGS(%esp) # interrupts off?
1659+ jz 1f
1660+ TRACE_IRQS_ON
1661+1:
1662+#endif
1663+.endm
1664+
1665+#ifdef CONFIG_VM86
1666+#define resume_userspace_sig check_userspace
1667+#else
1668+#define resume_userspace_sig resume_userspace
1669+#endif
1670+
1671+#define SAVE_ALL \
1672+ cld; \
1673+ pushl %es; \
1674+ CFI_ADJUST_CFA_OFFSET 4;\
1675+ /*CFI_REL_OFFSET es, 0;*/\
1676+ pushl %ds; \
1677+ CFI_ADJUST_CFA_OFFSET 4;\
1678+ /*CFI_REL_OFFSET ds, 0;*/\
1679+ pushl %eax; \
1680+ CFI_ADJUST_CFA_OFFSET 4;\
1681+ CFI_REL_OFFSET eax, 0;\
1682+ pushl %ebp; \
1683+ CFI_ADJUST_CFA_OFFSET 4;\
1684+ CFI_REL_OFFSET ebp, 0;\
1685+ pushl %edi; \
1686+ CFI_ADJUST_CFA_OFFSET 4;\
1687+ CFI_REL_OFFSET edi, 0;\
1688+ pushl %esi; \
1689+ CFI_ADJUST_CFA_OFFSET 4;\
1690+ CFI_REL_OFFSET esi, 0;\
1691+ pushl %edx; \
1692+ CFI_ADJUST_CFA_OFFSET 4;\
1693+ CFI_REL_OFFSET edx, 0;\
1694+ pushl %ecx; \
1695+ CFI_ADJUST_CFA_OFFSET 4;\
1696+ CFI_REL_OFFSET ecx, 0;\
1697+ pushl %ebx; \
1698+ CFI_ADJUST_CFA_OFFSET 4;\
1699+ CFI_REL_OFFSET ebx, 0;\
1700+ movl $(__USER_DS), %edx; \
1701+ movl %edx, %ds; \
1702+ movl %edx, %es;
1703+
1704+#define RESTORE_INT_REGS \
1705+ popl %ebx; \
1706+ CFI_ADJUST_CFA_OFFSET -4;\
1707+ CFI_RESTORE ebx;\
1708+ popl %ecx; \
1709+ CFI_ADJUST_CFA_OFFSET -4;\
1710+ CFI_RESTORE ecx;\
1711+ popl %edx; \
1712+ CFI_ADJUST_CFA_OFFSET -4;\
1713+ CFI_RESTORE edx;\
1714+ popl %esi; \
1715+ CFI_ADJUST_CFA_OFFSET -4;\
1716+ CFI_RESTORE esi;\
1717+ popl %edi; \
1718+ CFI_ADJUST_CFA_OFFSET -4;\
1719+ CFI_RESTORE edi;\
1720+ popl %ebp; \
1721+ CFI_ADJUST_CFA_OFFSET -4;\
1722+ CFI_RESTORE ebp;\
1723+ popl %eax; \
1724+ CFI_ADJUST_CFA_OFFSET -4;\
1725+ CFI_RESTORE eax
1726+
1727+#define RESTORE_REGS \
1728+ RESTORE_INT_REGS; \
1729+1: popl %ds; \
1730+ CFI_ADJUST_CFA_OFFSET -4;\
1731+ /*CFI_RESTORE ds;*/\
1732+2: popl %es; \
1733+ CFI_ADJUST_CFA_OFFSET -4;\
1734+ /*CFI_RESTORE es;*/\
1735+.section .fixup,"ax"; \
1736+3: movl $0,(%esp); \
1737+ jmp 1b; \
1738+4: movl $0,(%esp); \
1739+ jmp 2b; \
1740+.previous; \
1741+.section __ex_table,"a";\
1742+ .align 4; \
1743+ .long 1b,3b; \
1744+ .long 2b,4b; \
1745+.previous
1746+
1747+#define RING0_INT_FRAME \
1748+ CFI_STARTPROC simple;\
1749+ CFI_DEF_CFA esp, 3*4;\
1750+ /*CFI_OFFSET cs, -2*4;*/\
1751+ CFI_OFFSET eip, -3*4
1752+
1753+#define RING0_EC_FRAME \
1754+ CFI_STARTPROC simple;\
1755+ CFI_DEF_CFA esp, 4*4;\
1756+ /*CFI_OFFSET cs, -2*4;*/\
1757+ CFI_OFFSET eip, -3*4
1758+
1759+#define RING0_PTREGS_FRAME \
1760+ CFI_STARTPROC simple;\
1761+ CFI_DEF_CFA esp, OLDESP-EBX;\
1762+ /*CFI_OFFSET cs, CS-OLDESP;*/\
1763+ CFI_OFFSET eip, EIP-OLDESP;\
1764+ /*CFI_OFFSET es, ES-OLDESP;*/\
1765+ /*CFI_OFFSET ds, DS-OLDESP;*/\
1766+ CFI_OFFSET eax, EAX-OLDESP;\
1767+ CFI_OFFSET ebp, EBP-OLDESP;\
1768+ CFI_OFFSET edi, EDI-OLDESP;\
1769+ CFI_OFFSET esi, ESI-OLDESP;\
1770+ CFI_OFFSET edx, EDX-OLDESP;\
1771+ CFI_OFFSET ecx, ECX-OLDESP;\
1772+ CFI_OFFSET ebx, EBX-OLDESP
1773+
1774+ENTRY(ret_from_fork)
1775+ CFI_STARTPROC
1776+ pushl %eax
1777+ CFI_ADJUST_CFA_OFFSET 4
1778+ call schedule_tail
1779+ GET_THREAD_INFO(%ebp)
1780+ popl %eax
1781+ CFI_ADJUST_CFA_OFFSET -4
1782+ pushl $0x0202 # Reset kernel eflags
1783+ CFI_ADJUST_CFA_OFFSET 4
1784+ popfl
1785+ CFI_ADJUST_CFA_OFFSET -4
1786+ jmp syscall_exit
1787+ CFI_ENDPROC
1788+
1789+/*
1790+ * Return to user mode is not as complex as all this looks,
1791+ * but we want the default path for a system call return to
1792+ * go as quickly as possible which is why some of this is
1793+ * less clear than it otherwise should be.
1794+ */
1795+
1796+ # userspace resumption stub bypassing syscall exit tracing
1797+ ALIGN
1798+ RING0_PTREGS_FRAME
1799+ret_from_exception:
1800+ preempt_stop
1801+ret_from_intr:
1802+ GET_THREAD_INFO(%ebp)
1803+check_userspace:
1804+ movl EFLAGS(%esp), %eax # mix EFLAGS and CS
1805+ movb CS(%esp), %al
1806+ testl $(VM_MASK | 2), %eax
1807+ jz resume_kernel
1808+ENTRY(resume_userspace)
1809+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
1810+ # setting need_resched or sigpending
1811+ # between sampling and the iret
1812+ movl TI_flags(%ebp), %ecx
1813+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
1814+ # int/exception return?
1815+ jne work_pending
1816+ jmp restore_all
1817+
1818+#ifdef CONFIG_PREEMPT
1819+ENTRY(resume_kernel)
1820+ cli
1821+ cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
1822+ jnz restore_nocheck
1823+need_resched:
1824+ movl TI_flags(%ebp), %ecx # need_resched set ?
1825+ testb $_TIF_NEED_RESCHED, %cl
1826+ jz restore_all
1827+ testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ?
1828+ jz restore_all
1829+ call preempt_schedule_irq
1830+ jmp need_resched
1831+#endif
1832+ CFI_ENDPROC
1833+
1834+/* SYSENTER_RETURN points to after the "sysenter" instruction in
1835+ the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
1836+
1837+ # sysenter call handler stub
1838+ENTRY(sysenter_entry)
1839+ CFI_STARTPROC simple
1840+ CFI_DEF_CFA esp, 0
1841+ CFI_REGISTER esp, ebp
1842+ movl SYSENTER_stack_esp0(%esp),%esp
1843+sysenter_past_esp:
1844+ /*
1845+ * No need to follow this irqs on/off section: the syscall
1846+ * disabled irqs and here we enable it straight after entry:
1847+ */
1848+ sti
1849+ pushl $(__USER_DS)
1850+ CFI_ADJUST_CFA_OFFSET 4
1851+ /*CFI_REL_OFFSET ss, 0*/
1852+ pushl %ebp
1853+ CFI_ADJUST_CFA_OFFSET 4
1854+ CFI_REL_OFFSET esp, 0
1855+ pushfl
1856+ CFI_ADJUST_CFA_OFFSET 4
1857+ pushl $(__USER_CS)
1858+ CFI_ADJUST_CFA_OFFSET 4
1859+ /*CFI_REL_OFFSET cs, 0*/
1860+ /*
1861+ * Push current_thread_info()->sysenter_return to the stack.
1862+ * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
1863+ * pushed above; +8 corresponds to copy_thread's esp0 setting.
1864+ */
1865+ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
1866+ CFI_ADJUST_CFA_OFFSET 4
1867+ CFI_REL_OFFSET eip, 0
1868+
1869+/*
1870+ * Load the potential sixth argument from user stack.
1871+ * Careful about security.
1872+ */
1873+ cmpl $__PAGE_OFFSET-3,%ebp
1874+ jae syscall_fault
1875+1: movl (%ebp),%ebp
1876+.section __ex_table,"a"
1877+ .align 4
1878+ .long 1b,syscall_fault
1879+.previous
1880+
1881+ pushl %eax
1882+ CFI_ADJUST_CFA_OFFSET 4
1883+ SAVE_ALL
1884+ GET_THREAD_INFO(%ebp)
1885+
1886+ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
1887+ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
1888+ jnz syscall_trace_entry
1889+ cmpl $(nr_syscalls), %eax
1890+ jae syscall_badsys
1891+ call *sys_call_table(,%eax,4)
1892+ movl %eax,EAX(%esp)
1893+ DISABLE_INTERRUPTS
1894+ TRACE_IRQS_OFF
1895+ movl TI_flags(%ebp), %ecx
1896+ testw $_TIF_ALLWORK_MASK, %cx
1897+ jne syscall_exit_work
1898+/* if something modifies registers it must also disable sysexit */
1899+ movl EIP(%esp), %edx
1900+ movl OLDESP(%esp), %ecx
1901+ xorl %ebp,%ebp
1902+#ifdef CONFIG_XEN
1903+ TRACE_IRQS_ON
1904+ __ENABLE_INTERRUPTS
1905+sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/
1906+ __TEST_PENDING
1907+ jnz 14f # process more events if necessary...
1908+ movl ESI(%esp), %esi
1909+ sysexit
1910+14: __DISABLE_INTERRUPTS
1911+ TRACE_IRQS_OFF
1912+sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/
1913+ push %esp
1914+ call evtchn_do_upcall
1915+ add $4,%esp
1916+ jmp ret_from_intr
1917+#else
1918+ TRACE_IRQS_ON
1919+ sti
1920+ sysexit
1921+#endif /* !CONFIG_XEN */
1922+ CFI_ENDPROC
1923+
1924+ # pv sysenter call handler stub
1925+ENTRY(sysenter_entry_pv)
1926+ RING0_INT_FRAME
1927+ movl $__USER_DS,16(%esp)
1928+ movl %ebp,12(%esp)
1929+ movl $__USER_CS,4(%esp)
1930+ addl $4,%esp
1931+ /* +5*4 is SS:ESP,EFLAGS,CS:EIP. +8 is esp0 setting. */
1932+ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
1933+/*
1934+ * Load the potential sixth argument from user stack.
1935+ * Careful about security.
1936+ */
1937+ cmpl $__PAGE_OFFSET-3,%ebp
1938+ jae syscall_fault
1939+1: movl (%ebp),%ebp
1940+.section __ex_table,"a"
1941+ .align 4
1942+ .long 1b,syscall_fault
1943+.previous
1944+ /* fall through */
1945+ CFI_ENDPROC
1946+ENDPROC(sysenter_entry_pv)
1947+
1948+ # system call handler stub
1949+ENTRY(system_call)
1950+ RING0_INT_FRAME # can't unwind into user space anyway
1951+ pushl %eax # save orig_eax
1952+ CFI_ADJUST_CFA_OFFSET 4
1953+ SAVE_ALL
1954+ GET_THREAD_INFO(%ebp)
1955+ testl $TF_MASK,EFLAGS(%esp)
1956+ jz no_singlestep
1957+ orl $_TIF_SINGLESTEP,TI_flags(%ebp)
1958+no_singlestep:
1959+ # system call tracing in operation / emulation
1960+ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
1961+ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
1962+ jnz syscall_trace_entry
1963+ cmpl $(nr_syscalls), %eax
1964+ jae syscall_badsys
1965+syscall_call:
1966+ call *sys_call_table(,%eax,4)
1967+ movl %eax,EAX(%esp) # store the return value
1968+syscall_exit:
1969+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
1970+ # setting need_resched or sigpending
1971+ # between sampling and the iret
1972+ TRACE_IRQS_OFF
1973+ movl TI_flags(%ebp), %ecx
1974+ testw $_TIF_ALLWORK_MASK, %cx # current->work
1975+ jne syscall_exit_work
1976+
1977+restore_all:
1978+#ifndef CONFIG_XEN
1979+ movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
1980+ # Warning: OLDSS(%esp) contains the wrong/random values if we
1981+ # are returning to the kernel.
1982+ # See comments in process.c:copy_thread() for details.
1983+ movb OLDSS(%esp), %ah
1984+ movb CS(%esp), %al
1985+ andl $(VM_MASK | (4 << 8) | 3), %eax
1986+ cmpl $((4 << 8) | 3), %eax
1987+ CFI_REMEMBER_STATE
1988+ je ldt_ss # returning to user-space with LDT SS
1989+restore_nocheck:
1990+#else
1991+restore_nocheck:
1992+ movl EFLAGS(%esp), %eax
1993+ testl $(VM_MASK|NMI_MASK), %eax
1994+ CFI_REMEMBER_STATE
1995+ jnz hypervisor_iret
1996+ shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
1997+ GET_VCPU_INFO
1998+ andb evtchn_upcall_mask(%esi),%al
1999+ andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
2000+ CFI_REMEMBER_STATE
2001+ jnz restore_all_enable_events # != 0 => enable event delivery
2002+#endif
2003+ TRACE_IRQS_IRET
2004+restore_nocheck_notrace:
2005+ RESTORE_REGS
2006+ addl $4, %esp
2007+ CFI_ADJUST_CFA_OFFSET -4
2008+1: iret
2009+.section .fixup,"ax"
2010+iret_exc:
2011+#ifndef CONFIG_XEN
2012+ TRACE_IRQS_ON
2013+ sti
2014+#endif
2015+ pushl $0 # no error code
2016+ pushl $do_iret_error
2017+ jmp error_code
2018+.previous
2019+.section __ex_table,"a"
2020+ .align 4
2021+ .long 1b,iret_exc
2022+.previous
2023+
2024+ CFI_RESTORE_STATE
2025+#ifndef CONFIG_XEN
2026+ldt_ss:
2027+ larl OLDSS(%esp), %eax
2028+ jnz restore_nocheck
2029+ testl $0x00400000, %eax # returning to 32bit stack?
2030+ jnz restore_nocheck # all right, normal return
2031+ /* If returning to userspace with 16bit stack,
2032+ * try to fix the higher word of ESP, as the CPU
2033+ * won't restore it.
2034+ * This is an "official" bug of all the x86-compatible
2035+ * CPUs, which we can try to work around to make
2036+ * dosemu and wine happy. */
2037+ subl $8, %esp # reserve space for switch16 pointer
2038+ CFI_ADJUST_CFA_OFFSET 8
2039+ cli
2040+ TRACE_IRQS_OFF
2041+ movl %esp, %eax
2042+ /* Set up the 16bit stack frame with switch32 pointer on top,
2043+ * and a switch16 pointer on top of the current frame. */
2044+ call setup_x86_bogus_stack
2045+ CFI_ADJUST_CFA_OFFSET -8 # frame has moved
2046+ TRACE_IRQS_IRET
2047+ RESTORE_REGS
2048+ lss 20+4(%esp), %esp # switch to 16bit stack
2049+1: iret
2050+.section __ex_table,"a"
2051+ .align 4
2052+ .long 1b,iret_exc
2053+.previous
2054+#else
2055+ ALIGN
2056+restore_all_enable_events:
2057+ TRACE_IRQS_ON
2058+ __ENABLE_INTERRUPTS
2059+scrit: /**** START OF CRITICAL REGION ****/
2060+ __TEST_PENDING
2061+ jnz 14f # process more events if necessary...
2062+ RESTORE_REGS
2063+ addl $4, %esp
2064+ CFI_ADJUST_CFA_OFFSET -4
2065+1: iret
2066+.section __ex_table,"a"
2067+ .align 4
2068+ .long 1b,iret_exc
2069+.previous
2070+14: __DISABLE_INTERRUPTS
2071+ TRACE_IRQS_OFF
2072+ jmp 11f
2073+ecrit: /**** END OF CRITICAL REGION ****/
2074+
2075+ CFI_RESTORE_STATE
2076+hypervisor_iret:
2077+ andl $~NMI_MASK, EFLAGS(%esp)
2078+ RESTORE_REGS
2079+ addl $4, %esp
2080+ CFI_ADJUST_CFA_OFFSET -4
2081+ jmp hypercall_page + (__HYPERVISOR_iret * 32)
2082+#endif
2083+ CFI_ENDPROC
2084+
2085+ # perform work that needs to be done immediately before resumption
2086+ ALIGN
2087+ RING0_PTREGS_FRAME # can't unwind into user space anyway
2088+work_pending:
2089+ testb $_TIF_NEED_RESCHED, %cl
2090+ jz work_notifysig
2091+work_resched:
2092+ call schedule
2093+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
2094+ # setting need_resched or sigpending
2095+ # between sampling and the iret
2096+ TRACE_IRQS_OFF
2097+ movl TI_flags(%ebp), %ecx
2098+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
2099+ # than syscall tracing?
2100+ jz restore_all
2101+ testb $_TIF_NEED_RESCHED, %cl
2102+ jnz work_resched
2103+
2104+work_notifysig: # deal with pending signals and
2105+ # notify-resume requests
2106+ testl $VM_MASK, EFLAGS(%esp)
2107+ movl %esp, %eax
2108+ jne work_notifysig_v86 # returning to kernel-space or
2109+ # vm86-space
2110+ xorl %edx, %edx
2111+ call do_notify_resume
2112+ jmp resume_userspace_sig
2113+
2114+ ALIGN
2115+work_notifysig_v86:
2116+#ifdef CONFIG_VM86
2117+ pushl %ecx # save ti_flags for do_notify_resume
2118+ CFI_ADJUST_CFA_OFFSET 4
2119+ call save_v86_state # %eax contains pt_regs pointer
2120+ popl %ecx
2121+ CFI_ADJUST_CFA_OFFSET -4
2122+ movl %eax, %esp
2123+ xorl %edx, %edx
2124+ call do_notify_resume
2125+ jmp resume_userspace_sig
2126+#endif
2127+
2128+ # perform syscall exit tracing
2129+ ALIGN
2130+syscall_trace_entry:
2131+ movl $-ENOSYS,EAX(%esp)
2132+ movl %esp, %eax
2133+ xorl %edx,%edx
2134+ call do_syscall_trace
2135+ cmpl $0, %eax
2136+ jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
2137+ # so must skip actual syscall
2138+ movl ORIG_EAX(%esp), %eax
2139+ cmpl $(nr_syscalls), %eax
2140+ jnae syscall_call
2141+ jmp syscall_exit
2142+
2143+ # perform syscall exit tracing
2144+ ALIGN
2145+syscall_exit_work:
2146+ testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
2147+ jz work_pending
2148+ TRACE_IRQS_ON
2149+ ENABLE_INTERRUPTS # could let do_syscall_trace() call
2150+ # schedule() instead
2151+ movl %esp, %eax
2152+ movl $1, %edx
2153+ call do_syscall_trace
2154+ jmp resume_userspace
2155+ CFI_ENDPROC
2156+
2157+ RING0_INT_FRAME # can't unwind into user space anyway
2158+syscall_fault:
2159+ pushl %eax # save orig_eax
2160+ CFI_ADJUST_CFA_OFFSET 4
2161+ SAVE_ALL
2162+ GET_THREAD_INFO(%ebp)
2163+ movl $-EFAULT,EAX(%esp)
2164+ jmp resume_userspace
2165+
2166+syscall_badsys:
2167+ movl $-ENOSYS,EAX(%esp)
2168+ jmp resume_userspace
2169+ CFI_ENDPROC
2170+
2171+#ifndef CONFIG_XEN
2172+#define FIXUP_ESPFIX_STACK \
2173+ movl %esp, %eax; \
2174+ /* switch to 32bit stack using the pointer on top of 16bit stack */ \
2175+ lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
2176+ /* copy data from 16bit stack to 32bit stack */ \
2177+ call fixup_x86_bogus_stack; \
2178+ /* put ESP to the proper location */ \
2179+ movl %eax, %esp;
2180+#define UNWIND_ESPFIX_STACK \
2181+ pushl %eax; \
2182+ CFI_ADJUST_CFA_OFFSET 4; \
2183+ movl %ss, %eax; \
2184+ /* see if on 16bit stack */ \
2185+ cmpw $__ESPFIX_SS, %ax; \
2186+ je 28f; \
2187+27: popl %eax; \
2188+ CFI_ADJUST_CFA_OFFSET -4; \
2189+.section .fixup,"ax"; \
2190+28: movl $__KERNEL_DS, %eax; \
2191+ movl %eax, %ds; \
2192+ movl %eax, %es; \
2193+ /* switch to 32bit stack */ \
2194+ FIXUP_ESPFIX_STACK; \
2195+ jmp 27b; \
2196+.previous
2197+
2198+/*
2199+ * Build the entry stubs and pointer table with
2200+ * some assembler magic.
2201+ */
2202+.data
2203+ENTRY(interrupt)
2204+.text
2205+
2206+vector=0
2207+ENTRY(irq_entries_start)
2208+ RING0_INT_FRAME
2209+.rept NR_IRQS
2210+ ALIGN
2211+ .if vector
2212+ CFI_ADJUST_CFA_OFFSET -4
2213+ .endif
2214+1: pushl $~(vector)
2215+ CFI_ADJUST_CFA_OFFSET 4
2216+ jmp common_interrupt
2217+.data
2218+ .long 1b
2219+.text
2220+vector=vector+1
2221+.endr
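+/*
+ * The .rept block above emits NR_IRQS tiny stubs, each pushing the
+ * bitwise-negated vector number and jumping to common_interrupt, while
+ * the .data/.text switching collects one pointer per stub into the
+ * interrupt[] table declared above.  The negation keeps these frames
+ * distinguishable from syscall frames, whose orig_eax holds a
+ * non-negative syscall number; do_IRQ undoes it.
+ */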
2222+
2223+/*
2224+ * the CPU automatically disables interrupts when executing an IRQ vector,
2225+ * so IRQ-flags tracing has to follow that:
2226+ */
2227+ ALIGN
2228+common_interrupt:
2229+ SAVE_ALL
2230+ TRACE_IRQS_OFF
2231+ movl %esp,%eax
2232+ call do_IRQ
2233+ jmp ret_from_intr
2234+ CFI_ENDPROC
2235+
2236+#define BUILD_INTERRUPT(name, nr) \
2237+ENTRY(name) \
2238+ RING0_INT_FRAME; \
2239+ pushl $~(nr); \
2240+ CFI_ADJUST_CFA_OFFSET 4; \
2241+ SAVE_ALL; \
2242+ TRACE_IRQS_OFF \
2243+ movl %esp,%eax; \
2244+ call smp_/**/name; \
2245+ jmp ret_from_intr; \
2246+ CFI_ENDPROC
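+/*
+ * Illustrative expansion (one of the users pulled in from entry_arch.h
+ * on native SMP builds): BUILD_INTERRUPT(reschedule_interrupt,
+ * RESCHEDULE_VECTOR) creates an entry point that pushes the negated
+ * vector, saves all registers and calls smp_reschedule_interrupt with
+ * the register frame pointer in %eax.
+ */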
2247+
2248+/* The include is where all of the SMP etc. interrupts come from */
2249+#include "entry_arch.h"
2250+#else
2251+#define UNWIND_ESPFIX_STACK
2252+#endif
2253+
2254+ENTRY(divide_error)
2255+ RING0_INT_FRAME
2256+ pushl $0 # no error code
2257+ CFI_ADJUST_CFA_OFFSET 4
2258+ pushl $do_divide_error
2259+ CFI_ADJUST_CFA_OFFSET 4
2260+ ALIGN
2261+error_code:
2262+ pushl %ds
2263+ CFI_ADJUST_CFA_OFFSET 4
2264+ /*CFI_REL_OFFSET ds, 0*/
2265+ pushl %eax
2266+ CFI_ADJUST_CFA_OFFSET 4
2267+ CFI_REL_OFFSET eax, 0
2268+ xorl %eax, %eax
2269+ pushl %ebp
2270+ CFI_ADJUST_CFA_OFFSET 4
2271+ CFI_REL_OFFSET ebp, 0
2272+ pushl %edi
2273+ CFI_ADJUST_CFA_OFFSET 4
2274+ CFI_REL_OFFSET edi, 0
2275+ pushl %esi
2276+ CFI_ADJUST_CFA_OFFSET 4
2277+ CFI_REL_OFFSET esi, 0
2278+ pushl %edx
2279+ CFI_ADJUST_CFA_OFFSET 4
2280+ CFI_REL_OFFSET edx, 0
2281+ decl %eax # eax = -1
2282+ pushl %ecx
2283+ CFI_ADJUST_CFA_OFFSET 4
2284+ CFI_REL_OFFSET ecx, 0
2285+ pushl %ebx
2286+ CFI_ADJUST_CFA_OFFSET 4
2287+ CFI_REL_OFFSET ebx, 0
2288+ cld
2289+ pushl %es
2290+ CFI_ADJUST_CFA_OFFSET 4
2291+ /*CFI_REL_OFFSET es, 0*/
2292+ UNWIND_ESPFIX_STACK
2293+ popl %ecx
2294+ CFI_ADJUST_CFA_OFFSET -4
2295+ /*CFI_REGISTER es, ecx*/
2296+ movl ES(%esp), %edi # get the function address
2297+ movl ORIG_EAX(%esp), %edx # get the error code
2298+ movl %eax, ORIG_EAX(%esp)
2299+ movl %ecx, ES(%esp)
2300+ /*CFI_REL_OFFSET es, ES*/
2301+ movl $(__USER_DS), %ecx
2302+ movl %ecx, %ds
2303+ movl %ecx, %es
2304+ movl %esp,%eax # pt_regs pointer
2305+ call *%edi
2306+ jmp ret_from_exception
2307+ CFI_ENDPROC
2308+
2309+#ifdef CONFIG_XEN
2310+# A note on the "critical region" in our callback handler.
2311+# We want to avoid stacking callback handlers due to events occurring
2312+# during handling of the last event. To do this, we keep events disabled
2313+# until we've done all processing. HOWEVER, we must enable events before
2314+# popping the stack frame (can't be done atomically) and so it would still
2315+# be possible to get enough handler activations to overflow the stack.
2316+# Although unlikely, bugs of that kind are hard to track down, so we'd
2317+# like to avoid the possibility.
2318+# So, on entry to the handler we detect whether we interrupted an
2319+# existing activation in its critical region -- if so, we pop the current
2320+# activation and restart the handler using the previous one.
2321+#
2322+# The sysexit critical region is slightly different. sysexit
2323+# atomically removes the entire stack frame. If we interrupt in the
2324+# critical region we know that the entire frame is present and correct
2325+# so we can simply throw away the new one.
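+# Concretely, the EIP checks at the top of the handler classify the
+# interrupted context: an EIP inside [scrit, ecrit) means an interrupted
+# RESTORE_REGS/iret sequence whose frame must be merged (see
+# critical_region_fixup below), an EIP inside the sysexit critical region
+# means the partially-unwound frame can simply be dropped (the addl
+# $OLDESP), and any other EIP is handled as a plain upcall at label 11.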
2326+ENTRY(hypervisor_callback)
2327+ RING0_INT_FRAME
2328+ pushl %eax
2329+ CFI_ADJUST_CFA_OFFSET 4
2330+ SAVE_ALL
2331+ movl EIP(%esp),%eax
2332+ cmpl $scrit,%eax
2333+ jb 11f
2334+ cmpl $ecrit,%eax
2335+ jb critical_region_fixup
2336+ cmpl $sysexit_scrit,%eax
2337+ jb 11f
2338+ cmpl $sysexit_ecrit,%eax
2339+ ja 11f
2340+ addl $OLDESP,%esp # Remove eflags...ebx from stack frame.
2341+11: push %esp
2342+ CFI_ADJUST_CFA_OFFSET 4
2343+ call evtchn_do_upcall
2344+ add $4,%esp
2345+ CFI_ADJUST_CFA_OFFSET -4
2346+ jmp ret_from_intr
2347+ CFI_ENDPROC
2348+
2349+# [How we do the fixup]. We want to merge the current stack frame with the
2350+# just-interrupted frame. How we do this depends on where in the critical
2351+# region the interrupted handler was executing, and so how many saved
2352+# registers are in each frame. We do this quickly using the lookup table
2353+# 'critical_fixup_table'. For each byte offset in the critical region, it
2354+# provides the number of bytes which have already been popped from the
2355+# interrupted stack frame.
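+# Worked example: if the upcall hit at the "pop %edx" step of the critical
+# region, the table entry is 0x08 -- %ebx and %ecx had already been popped
+# (and immediately re-saved at the bottom of the new frame by SAVE_ALL).
+# The copy loop below moves those two words to just below the surviving
+# part of the interrupted frame, points %esp at the rebuilt full frame,
+# abandons the rest of the new frame, and restarts event processing at 11:.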
2356+critical_region_fixup:
2357+ movzbl critical_fixup_table-scrit(%eax),%ecx # %eax contains num bytes popped
2358+ cmpb $0xff,%cl # 0xff => vcpu_info critical region
2359+ jne 15f
2360+ xorl %ecx,%ecx
2361+15: leal (%esp,%ecx),%esi # %esi points at end of src region
2362+ leal OLDESP(%esp),%edi # %edi points at end of dst region
2363+ shrl $2,%ecx # convert bytes to words
2364+ je 17f # skip loop if nothing to copy
2365+16: subl $4,%esi # pre-decrementing copy loop
2366+ subl $4,%edi
2367+ movl (%esi),%eax
2368+ movl %eax,(%edi)
2369+ loop 16b
2370+17: movl %edi,%esp # final %edi is top of merged stack
2371+ jmp 11b
2372+
2373+.section .rodata,"a"
2374+critical_fixup_table:
2375+ .byte 0xff,0xff,0xff # testb $0xff,(%esi) = __TEST_PENDING
2376+ .byte 0xff,0xff # jnz 14f
2377+ .byte 0x00 # pop %ebx
2378+ .byte 0x04 # pop %ecx
2379+ .byte 0x08 # pop %edx
2380+ .byte 0x0c # pop %esi
2381+ .byte 0x10 # pop %edi
2382+ .byte 0x14 # pop %ebp
2383+ .byte 0x18 # pop %eax
2384+ .byte 0x1c # pop %ds
2385+ .byte 0x20 # pop %es
2386+ .byte 0x24,0x24,0x24 # add $4,%esp
2387+ .byte 0x28 # iret
2388+ .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi)
2389+ .byte 0x00,0x00 # jmp 11b
2390+.previous
2391+
2392+# Hypervisor uses this for application faults while it executes.
2393+# We get here for two reasons:
2394+# 1. Fault while reloading DS, ES, FS or GS
2395+# 2. Fault while executing IRET
2396+# Category 1 we fix up by reattempting the load, and zeroing the segment
2397+# register if the load fails.
2398+# Category 2 we fix up by jumping to do_iret_error. We cannot use the
2399+# normal Linux return path in this case because if we use the IRET hypercall
2400+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
2401+# We distinguish between categories by maintaining a status value in EAX.
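+# For example, if the "mov 4(%esp),%ds" below faults, its fixup clears
+# %eax and the saved %ds value and retries; the null selector always
+# loads, so execution continues with %eax == 0 (Category 1).  If all four
+# segment loads succeed, %eax is still 1, so the original fault must have
+# been the IRET itself (Category 2) and we take the iret_exc path.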
2402+ENTRY(failsafe_callback)
2403+ pushl %eax
2404+ movl $1,%eax
2405+1: mov 4(%esp),%ds
2406+2: mov 8(%esp),%es
2407+3: mov 12(%esp),%fs
2408+4: mov 16(%esp),%gs
2409+ testl %eax,%eax
2410+ popl %eax
2411+ jz 5f
2412+ addl $16,%esp # EAX != 0 => Category 2 (Bad IRET)
2413+ jmp iret_exc
2414+5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment)
2415+ RING0_INT_FRAME
2416+ pushl $0
2417+ SAVE_ALL
2418+ jmp ret_from_exception
2419+.section .fixup,"ax"; \
2420+6: xorl %eax,%eax; \
2421+ movl %eax,4(%esp); \
2422+ jmp 1b; \
2423+7: xorl %eax,%eax; \
2424+ movl %eax,8(%esp); \
2425+ jmp 2b; \
2426+8: xorl %eax,%eax; \
2427+ movl %eax,12(%esp); \
2428+ jmp 3b; \
2429+9: xorl %eax,%eax; \
2430+ movl %eax,16(%esp); \
2431+ jmp 4b; \
2432+.previous; \
2433+.section __ex_table,"a"; \
2434+ .align 4; \
2435+ .long 1b,6b; \
2436+ .long 2b,7b; \
2437+ .long 3b,8b; \
2438+ .long 4b,9b; \
2439+.previous
2440+#endif
2441+ CFI_ENDPROC
2442+
2443+ENTRY(coprocessor_error)
2444+ RING0_INT_FRAME
2445+ pushl $0
2446+ CFI_ADJUST_CFA_OFFSET 4
2447+ pushl $do_coprocessor_error
2448+ CFI_ADJUST_CFA_OFFSET 4
2449+ jmp error_code
2450+ CFI_ENDPROC
2451+
2452+ENTRY(simd_coprocessor_error)
2453+ RING0_INT_FRAME
2454+ pushl $0
2455+ CFI_ADJUST_CFA_OFFSET 4
2456+ pushl $do_simd_coprocessor_error
2457+ CFI_ADJUST_CFA_OFFSET 4
2458+ jmp error_code
2459+ CFI_ENDPROC
2460+
2461+ENTRY(device_not_available)
2462+ RING0_INT_FRAME
2463+ pushl $-1 # mark this as an int
2464+ CFI_ADJUST_CFA_OFFSET 4
2465+ SAVE_ALL
2466+#ifndef CONFIG_XEN
2467+ movl %cr0, %eax
2468+ testl $0x4, %eax # EM (math emulation bit)
2469+ je device_available_emulate
2470+ pushl $0 # temporary storage for ORIG_EIP
2471+ CFI_ADJUST_CFA_OFFSET 4
2472+ call math_emulate
2473+ addl $4, %esp
2474+ CFI_ADJUST_CFA_OFFSET -4
2475+ jmp ret_from_exception
2476+device_available_emulate:
2477+#endif
2478+ preempt_stop
2479+ call math_state_restore
2480+ jmp ret_from_exception
2481+ CFI_ENDPROC
2482+
2483+#ifndef CONFIG_XEN
2484+/*
2485+ * Debug traps and NMI can happen at the one SYSENTER instruction
2486+ * that sets up the real kernel stack. Check here, since we can't
2487+ * allow the wrong stack to be used.
2488+ *
2489+ * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
2490+ * already pushed 3 words if it hits on the sysenter instruction:
2491+ * eflags, cs and eip.
2492+ *
2493+ * We just load the right stack, and push the three (known) values
2494+ * by hand onto the new stack - while updating the return eip past
2495+ * the instruction that would have done it for sysenter.
2496+ */
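+/*
+ * The offset argument to FIX_STACK() compensates for the words already
+ * pushed on the SYSENTER stack when the trap hit: 12 for a single 3-word
+ * eflags/cs/eip frame (debug or NMI directly on the sysenter instruction),
+ * 24 when an NMI lands on the debug path and two such frames are stacked
+ * (see nmi_debug_stack_check below).
+ */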
2497+#define FIX_STACK(offset, ok, label) \
2498+ cmpw $__KERNEL_CS,4(%esp); \
2499+ jne ok; \
2500+label: \
2501+ movl SYSENTER_stack_esp0+offset(%esp),%esp; \
2502+ pushfl; \
2503+ pushl $__KERNEL_CS; \
2504+ pushl $sysenter_past_esp
2505+#endif /* CONFIG_XEN */
2506+
2507+KPROBE_ENTRY(debug)
2508+ RING0_INT_FRAME
2509+#ifndef CONFIG_XEN
2510+ cmpl $sysenter_entry,(%esp)
2511+ jne debug_stack_correct
2512+ FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
2513+debug_stack_correct:
2514+#endif /* !CONFIG_XEN */
2515+ pushl $-1 # mark this as an int
2516+ CFI_ADJUST_CFA_OFFSET 4
2517+ SAVE_ALL
2518+ xorl %edx,%edx # error code 0
2519+ movl %esp,%eax # pt_regs pointer
2520+ call do_debug
2521+ jmp ret_from_exception
2522+ CFI_ENDPROC
2523+ .previous .text
2524+#ifndef CONFIG_XEN
2525+/*
2526+ * NMI is doubly nasty. It can happen _while_ we're handling
2527+ * a debug fault, and the debug fault hasn't yet been able to
2528+ * clear up the stack. So we first check whether we got an
2529+ * NMI on the sysenter entry path, but after that we need to
2530+ * check whether we got an NMI on the debug path where the debug
2531+ * fault happened on the sysenter path.
2532+ */
2533+ENTRY(nmi)
2534+ RING0_INT_FRAME
2535+ pushl %eax
2536+ CFI_ADJUST_CFA_OFFSET 4
2537+ movl %ss, %eax
2538+ cmpw $__ESPFIX_SS, %ax
2539+ popl %eax
2540+ CFI_ADJUST_CFA_OFFSET -4
2541+ je nmi_16bit_stack
2542+ cmpl $sysenter_entry,(%esp)
2543+ je nmi_stack_fixup
2544+ pushl %eax
2545+ CFI_ADJUST_CFA_OFFSET 4
2546+ movl %esp,%eax
2547+ /* Do not access memory above the end of our stack page,
2548+ * it might not exist.
2549+ */
2550+ andl $(THREAD_SIZE-1),%eax
2551+ cmpl $(THREAD_SIZE-20),%eax
2552+ popl %eax
2553+ CFI_ADJUST_CFA_OFFSET -4
2554+ jae nmi_stack_correct
2555+ cmpl $sysenter_entry,12(%esp)
2556+ je nmi_debug_stack_check
2557+nmi_stack_correct:
2558+ pushl %eax
2559+ CFI_ADJUST_CFA_OFFSET 4
2560+ SAVE_ALL
2561+ xorl %edx,%edx # zero error code
2562+ movl %esp,%eax # pt_regs pointer
2563+ call do_nmi
2564+ jmp restore_nocheck_notrace
2565+ CFI_ENDPROC
2566+
2567+nmi_stack_fixup:
2568+ FIX_STACK(12,nmi_stack_correct, 1)
2569+ jmp nmi_stack_correct
2570+nmi_debug_stack_check:
2571+ cmpw $__KERNEL_CS,16(%esp)
2572+ jne nmi_stack_correct
2573+ cmpl $debug,(%esp)
2574+ jb nmi_stack_correct
2575+ cmpl $debug_esp_fix_insn,(%esp)
2576+ ja nmi_stack_correct
2577+ FIX_STACK(24,nmi_stack_correct, 1)
2578+ jmp nmi_stack_correct
2579+
2580+nmi_16bit_stack:
2581+ RING0_INT_FRAME
2582+ /* create the pointer to lss back */
2583+ pushl %ss
2584+ CFI_ADJUST_CFA_OFFSET 4
2585+ pushl %esp
2586+ CFI_ADJUST_CFA_OFFSET 4
2587+ movzwl %sp, %esp
2588+ addw $4, (%esp)
2589+ /* copy the iret frame of 12 bytes */
2590+ .rept 3
2591+ pushl 16(%esp)
2592+ CFI_ADJUST_CFA_OFFSET 4
2593+ .endr
2594+ pushl %eax
2595+ CFI_ADJUST_CFA_OFFSET 4
2596+ SAVE_ALL
2597+ FIXUP_ESPFIX_STACK # %eax == %esp
2598+ CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved
2599+ xorl %edx,%edx # zero error code
2600+ call do_nmi
2601+ RESTORE_REGS
2602+ lss 12+4(%esp), %esp # back to 16bit stack
2603+1: iret
2604+ CFI_ENDPROC
2605+.section __ex_table,"a"
2606+ .align 4
2607+ .long 1b,iret_exc
2608+.previous
2609+#else
2610+ENTRY(nmi)
2611+ RING0_INT_FRAME
2612+ pushl %eax
2613+ CFI_ADJUST_CFA_OFFSET 4
2614+ SAVE_ALL
2615+ xorl %edx,%edx # zero error code
2616+ movl %esp,%eax # pt_regs pointer
2617+ call do_nmi
2618+ orl $NMI_MASK, EFLAGS(%esp)
2619+ jmp restore_all
2620+ CFI_ENDPROC
2621+#endif
2622+
2623+KPROBE_ENTRY(int3)
2624+ RING0_INT_FRAME
2625+ pushl $-1 # mark this as an int
2626+ CFI_ADJUST_CFA_OFFSET 4
2627+ SAVE_ALL
2628+ xorl %edx,%edx # zero error code
2629+ movl %esp,%eax # pt_regs pointer
2630+ call do_int3
2631+ jmp ret_from_exception
2632+ CFI_ENDPROC
2633+ .previous .text
2634+
2635+ENTRY(overflow)
2636+ RING0_INT_FRAME
2637+ pushl $0
2638+ CFI_ADJUST_CFA_OFFSET 4
2639+ pushl $do_overflow
2640+ CFI_ADJUST_CFA_OFFSET 4
2641+ jmp error_code
2642+ CFI_ENDPROC
2643+
2644+ENTRY(bounds)
2645+ RING0_INT_FRAME
2646+ pushl $0
2647+ CFI_ADJUST_CFA_OFFSET 4
2648+ pushl $do_bounds
2649+ CFI_ADJUST_CFA_OFFSET 4
2650+ jmp error_code
2651+ CFI_ENDPROC
2652+
2653+ENTRY(invalid_op)
2654+ RING0_INT_FRAME
2655+ pushl $0
2656+ CFI_ADJUST_CFA_OFFSET 4
2657+ pushl $do_invalid_op
2658+ CFI_ADJUST_CFA_OFFSET 4
2659+ jmp error_code
2660+ CFI_ENDPROC
2661+
2662+ENTRY(coprocessor_segment_overrun)
2663+ RING0_INT_FRAME
2664+ pushl $0
2665+ CFI_ADJUST_CFA_OFFSET 4
2666+ pushl $do_coprocessor_segment_overrun
2667+ CFI_ADJUST_CFA_OFFSET 4
2668+ jmp error_code
2669+ CFI_ENDPROC
2670+
2671+ENTRY(invalid_TSS)
2672+ RING0_EC_FRAME
2673+ pushl $do_invalid_TSS
2674+ CFI_ADJUST_CFA_OFFSET 4
2675+ jmp error_code
2676+ CFI_ENDPROC
2677+
2678+ENTRY(segment_not_present)
2679+ RING0_EC_FRAME
2680+ pushl $do_segment_not_present
2681+ CFI_ADJUST_CFA_OFFSET 4
2682+ jmp error_code
2683+ CFI_ENDPROC
2684+
2685+ENTRY(stack_segment)
2686+ RING0_EC_FRAME
2687+ pushl $do_stack_segment
2688+ CFI_ADJUST_CFA_OFFSET 4
2689+ jmp error_code
2690+ CFI_ENDPROC
2691+
2692+KPROBE_ENTRY(general_protection)
2693+ RING0_EC_FRAME
2694+ pushl $do_general_protection
2695+ CFI_ADJUST_CFA_OFFSET 4
2696+ jmp error_code
2697+ CFI_ENDPROC
2698+ .previous .text
2699+
2700+ENTRY(alignment_check)
2701+ RING0_EC_FRAME
2702+ pushl $do_alignment_check
2703+ CFI_ADJUST_CFA_OFFSET 4
2704+ jmp error_code
2705+ CFI_ENDPROC
2706+
2707+KPROBE_ENTRY(page_fault)
2708+ RING0_EC_FRAME
2709+ pushl $do_page_fault
2710+ CFI_ADJUST_CFA_OFFSET 4
2711+ jmp error_code
2712+ CFI_ENDPROC
2713+ .previous .text
2714+
2715+#ifdef CONFIG_X86_MCE
2716+ENTRY(machine_check)
2717+ RING0_INT_FRAME
2718+ pushl $0
2719+ CFI_ADJUST_CFA_OFFSET 4
2720+ pushl machine_check_vector
2721+ CFI_ADJUST_CFA_OFFSET 4
2722+ jmp error_code
2723+ CFI_ENDPROC
2724+#endif
2725+
2726+#ifndef CONFIG_XEN
2727+ENTRY(spurious_interrupt_bug)
2728+ RING0_INT_FRAME
2729+ pushl $0
2730+ CFI_ADJUST_CFA_OFFSET 4
2731+ pushl $do_spurious_interrupt_bug
2732+ CFI_ADJUST_CFA_OFFSET 4
2733+ jmp error_code
2734+ CFI_ENDPROC
2735+#endif /* !CONFIG_XEN */
2736+
2737+#ifdef CONFIG_STACK_UNWIND
2738+ENTRY(arch_unwind_init_running)
2739+ CFI_STARTPROC
2740+ movl 4(%esp), %edx
2741+ movl (%esp), %ecx
2742+ leal 4(%esp), %eax
2743+ movl %ebx, EBX(%edx)
2744+ xorl %ebx, %ebx
2745+ movl %ebx, ECX(%edx)
2746+ movl %ebx, EDX(%edx)
2747+ movl %esi, ESI(%edx)
2748+ movl %edi, EDI(%edx)
2749+ movl %ebp, EBP(%edx)
2750+ movl %ebx, EAX(%edx)
2751+ movl $__USER_DS, DS(%edx)
2752+ movl $__USER_DS, ES(%edx)
2753+ movl %ebx, ORIG_EAX(%edx)
2754+ movl %ecx, EIP(%edx)
2755+ movl 12(%esp), %ecx
2756+ movl $__KERNEL_CS, CS(%edx)
2757+ movl %ebx, EFLAGS(%edx)
2758+ movl %eax, OLDESP(%edx)
2759+ movl 8(%esp), %eax
2760+ movl %ecx, 8(%esp)
2761+ movl EBX(%edx), %ebx
2762+ movl $__KERNEL_DS, OLDSS(%edx)
2763+ jmpl *%eax
2764+ CFI_ENDPROC
2765+ENDPROC(arch_unwind_init_running)
2766+#endif
2767+
2768+ENTRY(fixup_4gb_segment)
2769+ RING0_EC_FRAME
2770+ pushl $do_fixup_4gb_segment
2771+ CFI_ADJUST_CFA_OFFSET 4
2772+ jmp error_code
2773+ CFI_ENDPROC
2774+
2775+.section .rodata,"a"
2776+#include "syscall_table.S"
2777+
2778+syscall_table_size=(.-sys_call_table)
2779Index: head-2008-11-25/arch/x86/kernel/fixup.c
2780===================================================================
2781--- /dev/null 1970-01-01 00:00:00.000000000 +0000
2782+++ head-2008-11-25/arch/x86/kernel/fixup.c 2008-01-28 12:24:18.000000000 +0100
2783@@ -0,0 +1,88 @@
2784+/******************************************************************************
2785+ * fixup.c
2786+ *
2787+ * Binary-rewriting of certain IA32 instructions, on notification by Xen.
2788+ * Used to avoid repeated slow emulation of common instructions used by the
2789+ * user-space TLS (Thread-Local Storage) libraries.
2790+ *
2791+ * **** NOTE ****
2792+ * Issues with the binary rewriting have caused it to be removed. Instead
2793+ * we rely on Xen's emulator to boot the kernel, and then print a banner
2794+ * message recommending that the user disable /lib/tls.
2795+ *
2796+ * Copyright (c) 2004, K A Fraser
2797+ *
2798+ * This program is free software; you can redistribute it and/or modify
2799+ * it under the terms of the GNU General Public License as published by
2800+ * the Free Software Foundation; either version 2 of the License, or
2801+ * (at your option) any later version.
2802+ *
2803+ * This program is distributed in the hope that it will be useful,
2804+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
2805+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2806+ * GNU General Public License for more details.
2807+ *
2808+ * You should have received a copy of the GNU General Public License
2809+ * along with this program; if not, write to the Free Software
2810+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2811+ */
2812+
2813+#include <linux/init.h>
2814+#include <linux/sched.h>
2815+#include <linux/slab.h>
2816+#include <linux/kernel.h>
2817+#include <linux/delay.h>
2818+#include <linux/version.h>
2819+
2820+#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
2821+
2822+fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
2823+{
2824+ static unsigned long printed = 0;
2825+ char info[100];
2826+ int i;
2827+
2828+ /* Ignore statically-linked init. */
2829+ if (current->tgid == 1)
2830+ return;
2831+
2832+ VOID(HYPERVISOR_vm_assist(VMASST_CMD_disable,
2833+ VMASST_TYPE_4gb_segments_notify));
2834+
2835+ if (test_and_set_bit(0, &printed))
2836+ return;
2837+
2838+ sprintf(info, "%s (pid=%d)", current->comm, current->tgid);
2839+
2840+ DP("");
2841+ DP("***************************************************************");
2842+ DP("***************************************************************");
2843+ DP("** WARNING: Currently emulating unsupported memory accesses **");
2844+ DP("** in /lib/tls glibc libraries. The emulation is **");
2845+ DP("** slow. To ensure full performance you should **");
2846+ DP("** install a 'xen-friendly' (nosegneg) version of **");
2847+ DP("** the library, or disable tls support by executing **");
2848+ DP("** the following as root: **");
2849+ DP("** mv /lib/tls /lib/tls.disabled **");
2850+ DP("** Offending process: %-38.38s **", info);
2851+ DP("***************************************************************");
2852+ DP("***************************************************************");
2853+ DP("");
2854+
2855+ for (i = 5; i > 0; i--) {
2856+ touch_softlockup_watchdog();
2857+ printk("Pausing... %d", i);
2858+ mdelay(1000);
2859+ printk("\b\b\b\b\b\b\b\b\b\b\b\b");
2860+ }
2861+
2862+ printk("Continuing...\n\n");
2863+}
2864+
2865+static int __init fixup_init(void)
2866+{
2867+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
2868+ VMASST_TYPE_4gb_segments_notify));
2869+ return 0;
2870+}
2871+__initcall(fixup_init);
2872Index: head-2008-11-25/arch/x86/kernel/head_32-xen.S
2873===================================================================
2874--- /dev/null 1970-01-01 00:00:00.000000000 +0000
2875+++ head-2008-11-25/arch/x86/kernel/head_32-xen.S 2007-06-12 13:12:48.000000000 +0200
2876@@ -0,0 +1,207 @@
2877+
2878+
2879+.text
2880+#include <linux/elfnote.h>
2881+#include <linux/threads.h>
2882+#include <linux/linkage.h>
2883+#include <asm/segment.h>
2884+#include <asm/page.h>
2885+#include <asm/cache.h>
2886+#include <asm/thread_info.h>
2887+#include <asm/asm-offsets.h>
2888+#include <asm/dwarf2.h>
2889+#include <xen/interface/xen.h>
2890+#include <xen/interface/elfnote.h>
2891+
2892+/*
2893+ * References to members of the new_cpu_data structure.
2894+ */
2895+
2896+#define X86 new_cpu_data+CPUINFO_x86
2897+#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
2898+#define X86_MODEL new_cpu_data+CPUINFO_x86_model
2899+#define X86_MASK new_cpu_data+CPUINFO_x86_mask
2900+#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
2901+#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
2902+#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
2903+#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
2904+
2905+#define VIRT_ENTRY_OFFSET 0x0
2906+.org VIRT_ENTRY_OFFSET
2907+ENTRY(startup_32)
2908+ movl %esi,xen_start_info
2909+ cld
2910+
2911+ /* Set up the stack pointer */
2912+ movl $(init_thread_union+THREAD_SIZE),%esp
2913+
2914+ /* get vendor info */
2915+ xorl %eax,%eax # call CPUID with 0 -> return vendor ID
2916+ XEN_CPUID
2917+ movl %eax,X86_CPUID # save CPUID level
2918+ movl %ebx,X86_VENDOR_ID # lo 4 chars
2919+ movl %edx,X86_VENDOR_ID+4 # next 4 chars
2920+ movl %ecx,X86_VENDOR_ID+8 # last 4 chars
2921+
2922+ movl $1,%eax # Use the CPUID instruction to get CPU type
2923+ XEN_CPUID
2924+ movb %al,%cl # save reg for future use
2925+ andb $0x0f,%ah # mask processor family
2926+ movb %ah,X86
2927+ andb $0xf0,%al # mask model
2928+ shrb $4,%al
2929+ movb %al,X86_MODEL
2930+ andb $0x0f,%cl # mask mask revision
2931+ movb %cl,X86_MASK
2932+ movl %edx,X86_CAPABILITY
2933+
2934+ movb $1,X86_HARD_MATH
2935+
2936+ xorl %eax,%eax # Clear FS/GS and LDT
2937+ movl %eax,%fs
2938+ movl %eax,%gs
2939+ cld # gcc2 wants the direction flag cleared at all times
2940+
2941+ pushl %eax # fake return address
2942+ jmp start_kernel
2943+
2944+#define HYPERCALL_PAGE_OFFSET 0x1000
2945+.org HYPERCALL_PAGE_OFFSET
2946+ENTRY(hypercall_page)
2947+ CFI_STARTPROC
2948+.skip 0x1000
2949+ CFI_ENDPROC
2950+
2951+/*
2952+ * Real beginning of normal "text" segment
2953+ */
2954+ENTRY(stext)
2955+ENTRY(_stext)
2956+
2957+/*
2958+ * BSS section
2959+ */
2960+.section ".bss.page_aligned","w"
2961+ENTRY(empty_zero_page)
2962+ .fill 4096,1,0
2963+
2964+/*
2965+ * This starts the data section.
2966+ */
2967+.data
2968+
2969+/*
2970+ * The Global Descriptor Table contains 32 quadwords, per-CPU.
2971+ */
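+/*
+ * The hex values in the comments below are selector values, not indices:
+ * e.g. 0x73 (the user 4GB code selector) decodes as GDT index 14
+ * (0x73 >> 3), table indicator 0 and RPL 3, and since each entry is 8
+ * bytes it sits at byte offset 0x70 in this table.
+ */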
2972+ .align L1_CACHE_BYTES
2973+ENTRY(cpu_gdt_table)
2974+ .quad 0x0000000000000000 /* NULL descriptor */
2975+ .quad 0x0000000000000000 /* 0x0b reserved */
2976+ .quad 0x0000000000000000 /* 0x13 reserved */
2977+ .quad 0x0000000000000000 /* 0x1b reserved */
2978+ .quad 0x0000000000000000 /* 0x20 unused */
2979+ .quad 0x0000000000000000 /* 0x28 unused */
2980+ .quad 0x0000000000000000 /* 0x33 TLS entry 1 */
2981+ .quad 0x0000000000000000 /* 0x3b TLS entry 2 */
2982+ .quad 0x0000000000000000 /* 0x43 TLS entry 3 */
2983+ .quad 0x0000000000000000 /* 0x4b reserved */
2984+ .quad 0x0000000000000000 /* 0x53 reserved */
2985+ .quad 0x0000000000000000 /* 0x5b reserved */
2986+
2987+ .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
2988+ .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
2989+ .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
2990+ .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
2991+
2992+ .quad 0x0000000000000000 /* 0x80 TSS descriptor */
2993+ .quad 0x0000000000000000 /* 0x88 LDT descriptor */
2994+
2995+ /*
2996+ * Segments used for calling PnP BIOS have byte granularity.
2997+ * The code and data segments have fixed 64k limits;
2998+ * the transfer segment sizes are set at run time.
2999+ */
3000+ .quad 0x0000000000000000 /* 0x90 32-bit code */
3001+ .quad 0x0000000000000000 /* 0x98 16-bit code */
3002+ .quad 0x0000000000000000 /* 0xa0 16-bit data */
3003+ .quad 0x0000000000000000 /* 0xa8 16-bit data */
3004+ .quad 0x0000000000000000 /* 0xb0 16-bit data */
3005+
3006+ /*
3007+ * The APM segments have byte granularity and their bases
3008+ * are set at run time. All have 64k limits.
3009+ */
3010+ .quad 0x0000000000000000 /* 0xb8 APM CS code */
3011+ .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */
3012+ .quad 0x0000000000000000 /* 0xc8 APM DS data */
3013+
3014+ .quad 0x0000000000000000 /* 0xd0 - ESPFIX 16-bit SS */
3015+ .quad 0x0000000000000000 /* 0xd8 - unused */
3016+ .quad 0x0000000000000000 /* 0xe0 - unused */
3017+ .quad 0x0000000000000000 /* 0xe8 - unused */
3018+ .quad 0x0000000000000000 /* 0xf0 - unused */
3019+ .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
3020+
3021+#if CONFIG_XEN_COMPAT <= 0x030002
3022+/*
3023+ * __xen_guest information
3024+ */
3025+.macro utoa value
3026+ .if (\value) < 0 || (\value) >= 0x10
3027+ utoa (((\value)>>4)&0x0fffffff)
3028+ .endif
3029+ .if ((\value) & 0xf) < 10
3030+ .byte '0' + ((\value) & 0xf)
3031+ .else
3032+ .byte 'A' + ((\value) & 0xf) - 10
3033+ .endif
3034+.endm
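+/*
+ * Example: with the usual 3G/1G split (__PAGE_OFFSET == 0xC0000000,
+ * assumed here purely for illustration), "utoa __PAGE_OFFSET" recurses to
+ * the most significant nibble first and emits the ASCII bytes
+ * 'C','0','0','0','0','0','0','0', so the VIRT_BASE= field below reads
+ * "VIRT_BASE=0xC0000000".
+ */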
3035+
3036+.section __xen_guest
3037+ .ascii "GUEST_OS=linux,GUEST_VER=2.6"
3038+ .ascii ",XEN_VER=xen-3.0"
3039+ .ascii ",VIRT_BASE=0x"
3040+ utoa __PAGE_OFFSET
3041+ .ascii ",ELF_PADDR_OFFSET=0x"
3042+ utoa __PAGE_OFFSET
3043+ .ascii ",VIRT_ENTRY=0x"
3044+ utoa (__PAGE_OFFSET + __PHYSICAL_START + VIRT_ENTRY_OFFSET)
3045+ .ascii ",HYPERCALL_PAGE=0x"
3046+ utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
3047+ .ascii ",FEATURES=writable_page_tables"
3048+ .ascii "|writable_descriptor_tables"
3049+ .ascii "|auto_translated_physmap"
3050+ .ascii "|pae_pgdir_above_4gb"
3051+ .ascii "|supervisor_mode_kernel"
3052+#ifdef CONFIG_X86_PAE
3053+ .ascii ",PAE=yes[extended-cr3]"
3054+#else
3055+ .ascii ",PAE=no"
3056+#endif
3057+ .ascii ",LOADER=generic"
3058+ .byte 0
3059+#endif /* CONFIG_XEN_COMPAT <= 0x030002 */
3060+
3061+
3062+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux")
3063+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6")
3064+ ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0")
3065+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, __PAGE_OFFSET)
3066+#if CONFIG_XEN_COMPAT <= 0x030002
3067+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, __PAGE_OFFSET)
3068+#else
3069+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, 0)
3070+#endif
3071+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, startup_32)
3072+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page)
3073+ ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START)
3074+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
3075+#ifdef CONFIG_X86_PAE
3076+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes")
3077+ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad, _PAGE_PRESENT,_PAGE_PRESENT)
3078+#else
3079+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no")
3080+ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, _PAGE_PRESENT,_PAGE_PRESENT)
3081+#endif
3082+ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic")
3083+ ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1)
3084Index: head-2008-11-25/arch/x86/kernel/init_task-xen.c
3085===================================================================
3086--- /dev/null 1970-01-01 00:00:00.000000000 +0000
3087+++ head-2008-11-25/arch/x86/kernel/init_task-xen.c 2007-06-12 13:12:48.000000000 +0200
3088@@ -0,0 +1,51 @@
3089+#include <linux/mm.h>
3090+#include <linux/module.h>
3091+#include <linux/sched.h>
3092+#include <linux/init.h>
3093+#include <linux/init_task.h>
3094+#include <linux/fs.h>
3095+#include <linux/mqueue.h>
3096+
3097+#include <asm/uaccess.h>
3098+#include <asm/pgtable.h>
3099+#include <asm/desc.h>
3100+
3101+static struct fs_struct init_fs = INIT_FS;
3102+static struct files_struct init_files = INIT_FILES;
3103+static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
3104+static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
3105+
3106+#define swapper_pg_dir ((pgd_t *)NULL)
3107+struct mm_struct init_mm = INIT_MM(init_mm);
3108+#undef swapper_pg_dir
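+/*
+ * The #define/#undef around INIT_MM() is presumably needed because, in
+ * this Xen port, swapper_pg_dir is set up at run time rather than being a
+ * link-time array that could appear in a static initializer; init_mm.pgd
+ * therefore starts out NULL here and gets its real value during early
+ * paging setup.
+ */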
3109+
3110+EXPORT_SYMBOL(init_mm);
3111+
3112+/*
3113+ * Initial thread structure.
3114+ *
3115+ * We need to make sure that this is THREAD_SIZE aligned due to the
3116+ * way process stacks are handled. This is done by having a special
3117+ * "init_task" linker map entry..
3118+ */
3119+union thread_union init_thread_union
3120+ __attribute__((__section__(".data.init_task"))) =
3121+ { INIT_THREAD_INFO(init_task) };
3122+
3123+/*
3124+ * Initial task structure.
3125+ *
3126+ * All other task structs will be allocated on slabs in fork.c
3127+ */
3128+struct task_struct init_task = INIT_TASK(init_task);
3129+
3130+EXPORT_SYMBOL(init_task);
3131+
3132+#ifndef CONFIG_X86_NO_TSS
3133+/*
3134+ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
3135+ * no more per-task TSS's.
3136+ */
3137+DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
3138+#endif
3139+
3140Index: head-2008-11-25/arch/x86/kernel/io_apic_32-xen.c
3141===================================================================
3142--- /dev/null 1970-01-01 00:00:00.000000000 +0000
3143+++ head-2008-11-25/arch/x86/kernel/io_apic_32-xen.c 2008-11-25 12:22:34.000000000 +0100
3144@@ -0,0 +1,2776 @@
3145+/*
3146+ * Intel IO-APIC support for multi-Pentium hosts.
3147+ *
3148+ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
3149+ *
3150+ * Many thanks to Stig Venaas for trying out countless experimental
3151+ * patches and reporting/debugging problems patiently!
3152+ *
3153+ * (c) 1999, Multiple IO-APIC support, developed by
3154+ * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
3155+ * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
3156+ * further tested and cleaned up by Zach Brown <zab@redhat.com>
3157+ * and Ingo Molnar <mingo@redhat.com>
3158+ *
3159+ * Fixes
3160+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
3161+ * thanks to Eric Gilmore
3162+ * and Rolf G. Tews
3163+ * for testing these extensively
3164+ * Paul Diefenbaugh : Added full ACPI support
3165+ */
3166+
3167+#include <linux/mm.h>
3168+#include <linux/interrupt.h>
3169+#include <linux/init.h>
3170+#include <linux/delay.h>
3171+#include <linux/sched.h>
3172+#include <linux/smp_lock.h>
3173+#include <linux/mc146818rtc.h>
3174+#include <linux/compiler.h>
3175+#include <linux/acpi.h>
3176+#include <linux/module.h>
3177+#include <linux/sysdev.h>
3178+
3179+#include <asm/io.h>
3180+#include <asm/smp.h>
3181+#include <asm/desc.h>
3182+#include <asm/timer.h>
3183+#include <asm/i8259.h>
3184+#include <asm/nmi.h>
3185+
3186+#include <mach_apic.h>
3187+
3188+#include "io_ports.h"
3189+
3190+#ifdef CONFIG_XEN
3191+
3192+#include <xen/interface/xen.h>
3193+#include <xen/interface/physdev.h>
3194+#include <xen/evtchn.h>
3195+
3196+/* Fake i8259 */
3197+#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
3198+#define disable_8259A_irq(_irq) ((void)0)
3199+#define i8259A_irq_pending(_irq) (0)
3200+
3201+unsigned long io_apic_irqs;
3202+
3203+static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
3204+{
3205+ struct physdev_apic apic_op;
3206+ int ret;
3207+
3208+ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
3209+ apic_op.reg = reg;
3210+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
3211+ if (ret)
3212+ return ret;
3213+ return apic_op.value;
3214+}
3215+
3216+static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
3217+{
3218+ struct physdev_apic apic_op;
3219+
3220+ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
3221+ apic_op.reg = reg;
3222+ apic_op.value = value;
3223+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
3224+}
3225+
3226+#define io_apic_read(a,r) xen_io_apic_read(a,r)
3227+#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
3228+
3229+#endif /* CONFIG_XEN */
3230+
3231+int (*ioapic_renumber_irq)(int ioapic, int irq);
3232+atomic_t irq_mis_count;
3233+
3234+/* Where, if anywhere, the i8259 is connected in external int mode */
3235+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
3236+
3237+static DEFINE_SPINLOCK(ioapic_lock);
3238+static DEFINE_SPINLOCK(vector_lock);
3239+
3240+int timer_over_8254 __initdata = 1;
3241+
3242+/*
3243+ * Is the SiS APIC rmw bug present ?
3244+ * -1 = don't know, 0 = no, 1 = yes
3245+ */
3246+int sis_apic_bug = -1;
3247+
3248+/*
3249+ * # of IRQ routing registers
3250+ */
3251+int nr_ioapic_registers[MAX_IO_APICS];
3252+
3253+int disable_timer_pin_1 __initdata;
3254+
3255+/*
3256+ * Rough estimation of how many shared IRQs there are, can
3257+ * be changed anytime.
3258+ * Rough estimate of how many shared IRQs there are; this can
3259+ * be changed at any time.
3260+#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
3261+
3262+/*
3263+ * This is performance-critical, we want to do it O(1)
3264+ *
3265+ * the indexing order of this array favors 1:1 mappings
3266+ * between pins and IRQs.
3267+ */
3268+
3269+static struct irq_pin_list {
3270+ int apic, pin, next;
3271+} irq_2_pin[PIN_MAP_SIZE];
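+/*
+ * Layout: slot i (i < NR_IRQS) is the head entry for IRQ i; extra pins of
+ * a shared IRQ are chained through ->next into the spare slots at indices
+ * >= NR_IRQS, handed out by add_pin_to_irq() below.  Illustrative values:
+ * if IRQ 5 arrives on pin 3 of IO-APIC 0 and on pin 7 of IO-APIC 1, then
+ * irq_2_pin[5] = {0, 3, n} and irq_2_pin[n] = {1, 7, 0} for some
+ * n >= NR_IRQS.
+ */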
3272+
3273+int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
3274+#ifdef CONFIG_PCI_MSI
3275+#define vector_to_irq(vector) \
3276+ (platform_legacy_irq(vector) ? vector : vector_irq[vector])
3277+#else
3278+#define vector_to_irq(vector) (vector)
3279+#endif
3280+
3281+/*
3282+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
3283+ * shared ISA-space IRQs, so we have to support them. We are super
3284+ * fast in the common case, and fast for shared ISA-space IRQs.
3285+ */
3286+static void add_pin_to_irq(unsigned int irq, int apic, int pin)
3287+{
3288+ static int first_free_entry = NR_IRQS;
3289+ struct irq_pin_list *entry = irq_2_pin + irq;
3290+
3291+ while (entry->next)
3292+ entry = irq_2_pin + entry->next;
3293+
3294+ if (entry->pin != -1) {
3295+ entry->next = first_free_entry;
3296+ entry = irq_2_pin + entry->next;
3297+ if (++first_free_entry >= PIN_MAP_SIZE)
3298+ panic("io_apic.c: whoops");
3299+ }
3300+ entry->apic = apic;
3301+ entry->pin = pin;
3302+}
3303+
3304+#ifdef CONFIG_XEN
3305+#define clear_IO_APIC() ((void)0)
3306+#else
3307+/*
3308+ * Reroute an IRQ to a different pin.
3309+ */
3310+static void __init replace_pin_at_irq(unsigned int irq,
3311+ int oldapic, int oldpin,
3312+ int newapic, int newpin)
3313+{
3314+ struct irq_pin_list *entry = irq_2_pin + irq;
3315+
3316+ while (1) {
3317+ if (entry->apic == oldapic && entry->pin == oldpin) {
3318+ entry->apic = newapic;
3319+ entry->pin = newpin;
3320+ }
3321+ if (!entry->next)
3322+ break;
3323+ entry = irq_2_pin + entry->next;
3324+ }
3325+}
3326+
3327+static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
3328+{
3329+ struct irq_pin_list *entry = irq_2_pin + irq;
3330+ unsigned int pin, reg;
3331+
3332+ for (;;) {
3333+ pin = entry->pin;
3334+ if (pin == -1)
3335+ break;
3336+ reg = io_apic_read(entry->apic, 0x10 + pin*2);
3337+ reg &= ~disable;
3338+ reg |= enable;
3339+ io_apic_modify(entry->apic, 0x10 + pin*2, reg);
3340+ if (!entry->next)
3341+ break;
3342+ entry = irq_2_pin + entry->next;
3343+ }
3344+}
3345+
3346+/* mask = 1 */
3347+static void __mask_IO_APIC_irq (unsigned int irq)
3348+{
3349+ __modify_IO_APIC_irq(irq, 0x00010000, 0);
3350+}
3351+
3352+/* mask = 0 */
3353+static void __unmask_IO_APIC_irq (unsigned int irq)
3354+{
3355+ __modify_IO_APIC_irq(irq, 0, 0x00010000);
3356+}
3357+
3358+/* mask = 1, trigger = 0 */
3359+static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
3360+{
3361+ __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
3362+}
3363+
3364+/* mask = 0, trigger = 1 */
3365+static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
3366+{
3367+ __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
3368+}
3369+
3370+static void mask_IO_APIC_irq (unsigned int irq)
3371+{
3372+ unsigned long flags;
3373+
3374+ spin_lock_irqsave(&ioapic_lock, flags);
3375+ __mask_IO_APIC_irq(irq);
3376+ spin_unlock_irqrestore(&ioapic_lock, flags);
3377+}
3378+
3379+static void unmask_IO_APIC_irq (unsigned int irq)
3380+{
3381+ unsigned long flags;
3382+
3383+ spin_lock_irqsave(&ioapic_lock, flags);
3384+ __unmask_IO_APIC_irq(irq);
3385+ spin_unlock_irqrestore(&ioapic_lock, flags);
3386+}
3387+
3388+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
3389+{
3390+ struct IO_APIC_route_entry entry;
3391+ unsigned long flags;
3392+
3393+ /* Check delivery_mode to be sure we're not clearing an SMI pin */
3394+ spin_lock_irqsave(&ioapic_lock, flags);
3395+ *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
3396+ *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
3397+ spin_unlock_irqrestore(&ioapic_lock, flags);
3398+ if (entry.delivery_mode == dest_SMI)
3399+ return;
3400+
3401+ /*
3402+ * Disable it in the IO-APIC irq-routing table:
3403+ */
3404+ memset(&entry, 0, sizeof(entry));
3405+ entry.mask = 1;
3406+ spin_lock_irqsave(&ioapic_lock, flags);
3407+ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
3408+ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
3409+ spin_unlock_irqrestore(&ioapic_lock, flags);
3410+}
3411+
3412+static void clear_IO_APIC (void)
3413+{
3414+ int apic, pin;
3415+
3416+ for (apic = 0; apic < nr_ioapics; apic++)
3417+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
3418+ clear_IO_APIC_pin(apic, pin);
3419+}
3420+
3421+#ifdef CONFIG_SMP
3422+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
3423+{
3424+ unsigned long flags;
3425+ int pin;
3426+ struct irq_pin_list *entry = irq_2_pin + irq;
3427+ unsigned int apicid_value;
3428+ cpumask_t tmp;
3429+
3430+ cpus_and(tmp, cpumask, cpu_online_map);
3431+ if (cpus_empty(tmp))
3432+ tmp = TARGET_CPUS;
3433+
3434+ cpus_and(cpumask, tmp, CPU_MASK_ALL);
3435+
3436+ apicid_value = cpu_mask_to_apicid(cpumask);
3437+ /* Prepare to do the io_apic_write */
3438+ apicid_value = apicid_value << 24;
3439+ spin_lock_irqsave(&ioapic_lock, flags);
3440+ for (;;) {
3441+ pin = entry->pin;
3442+ if (pin == -1)
3443+ break;
3444+ io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
3445+ if (!entry->next)
3446+ break;
3447+ entry = irq_2_pin + entry->next;
3448+ }
3449+ set_irq_info(irq, cpumask);
3450+ spin_unlock_irqrestore(&ioapic_lock, flags);
3451+}
3452+
3453+#if defined(CONFIG_IRQBALANCE)
3454+# include <asm/processor.h> /* kernel_thread() */
3455+# include <linux/kernel_stat.h> /* kstat */
3456+# include <linux/slab.h> /* kmalloc() */
3457+# include <linux/timer.h> /* time_after() */
3458+
3459+#ifdef CONFIG_BALANCED_IRQ_DEBUG
3460+# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
3461+# define Dprintk(x...) do { TDprintk(x); } while (0)
3462+# else
3463+# define TDprintk(x...)
3464+# define Dprintk(x...)
3465+# endif
3466+
3467+#define IRQBALANCE_CHECK_ARCH -999
3468+#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
3469+#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
3470+#define BALANCED_IRQ_MORE_DELTA (HZ/10)
3471+#define BALANCED_IRQ_LESS_DELTA (HZ)
3472+
3473+static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
3474+static int physical_balance __read_mostly;
3475+static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
3476+
3477+static struct irq_cpu_info {
3478+ unsigned long * last_irq;
3479+ unsigned long * irq_delta;
3480+ unsigned long irq;
3481+} irq_cpu_data[NR_CPUS];
3482+
3483+#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
3484+#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
3485+#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
3486+
3487+#define IDLE_ENOUGH(cpu,now) \
3488+ (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
3489+
3490+#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
3491+
3492+#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
3493+
3494+static cpumask_t balance_irq_affinity[NR_IRQS] = {
3495+ [0 ... NR_IRQS-1] = CPU_MASK_ALL
3496+};
3497+
3498+void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
3499+{
3500+ balance_irq_affinity[irq] = mask;
3501+}
3502+
3503+static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
3504+ unsigned long now, int direction)
3505+{
3506+ int search_idle = 1;
3507+ int cpu = curr_cpu;
3508+
3509+ goto inside;
3510+
3511+ do {
3512+ if (unlikely(cpu == curr_cpu))
3513+ search_idle = 0;
3514+inside:
3515+ if (direction == 1) {
3516+ cpu++;
3517+ if (cpu >= NR_CPUS)
3518+ cpu = 0;
3519+ } else {
3520+ cpu--;
3521+ if (cpu == -1)
3522+ cpu = NR_CPUS-1;
3523+ }
3524+ } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
3525+ (search_idle && !IDLE_ENOUGH(cpu,now)));
3526+
3527+ return cpu;
3528+}
3529+
3530+static inline void balance_irq(int cpu, int irq)
3531+{
3532+ unsigned long now = jiffies;
3533+ cpumask_t allowed_mask;
3534+ unsigned int new_cpu;
3535+
3536+ if (irqbalance_disabled)
3537+ return;
3538+
3539+ cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
3540+ new_cpu = move(cpu, allowed_mask, now, 1);
3541+ if (cpu != new_cpu) {
3542+ set_pending_irq(irq, cpumask_of_cpu(new_cpu));
3543+ }
3544+}
3545+
3546+static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
3547+{
3548+ int i, j;
3549+ Dprintk("Rotating IRQs among CPUs.\n");
3550+ for_each_online_cpu(i) {
3551+ for (j = 0; j < NR_IRQS; j++) {
3552+ if (!irq_desc[j].action)
3553+ continue;
3554+ /* Is it a significant load ? */
3555+ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
3556+ useful_load_threshold)
3557+ continue;
3558+ balance_irq(i, j);
3559+ }
3560+ }
3561+ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
3562+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
3563+ return;
3564+}
3565+
3566+static void do_irq_balance(void)
3567+{
3568+ int i, j;
3569+ unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
3570+ unsigned long move_this_load = 0;
3571+ int max_loaded = 0, min_loaded = 0;
3572+ int load;
3573+ unsigned long useful_load_threshold = balanced_irq_interval + 10;
3574+ int selected_irq;
3575+ int tmp_loaded, first_attempt = 1;
3576+ unsigned long tmp_cpu_irq;
3577+ unsigned long imbalance = 0;
3578+ cpumask_t allowed_mask, target_cpu_mask, tmp;
3579+
3580+ for_each_possible_cpu(i) {
3581+ int package_index;
3582+ CPU_IRQ(i) = 0;
3583+ if (!cpu_online(i))
3584+ continue;
3585+ package_index = CPU_TO_PACKAGEINDEX(i);
3586+ for (j = 0; j < NR_IRQS; j++) {
3587+ unsigned long value_now, delta;
3588+ /* Is this an active IRQ? */
3589+ if (!irq_desc[j].action)
3590+ continue;
3591+ if ( package_index == i )
3592+ IRQ_DELTA(package_index,j) = 0;
3593+ /* Determine the total count per processor per IRQ */
3594+ value_now = (unsigned long) kstat_cpu(i).irqs[j];
3595+
3596+ /* Determine the activity per processor per IRQ */
3597+ delta = value_now - LAST_CPU_IRQ(i,j);
3598+
3599+ /* Update last_cpu_irq[][] for the next time */
3600+ LAST_CPU_IRQ(i,j) = value_now;
3601+
3602+ /* Ignore IRQs whose rate is less than the clock */
3603+ if (delta < useful_load_threshold)
3604+ continue;
3605+ /* update the load for the processor or package total */
3606+ IRQ_DELTA(package_index,j) += delta;
3607+
3608+ /* Keep track of the higher numbered sibling as well */
3609+ if (i != package_index)
3610+ CPU_IRQ(i) += delta;
3611+ /*
3612+ * We have sibling A and sibling B in the package
3613+ *
3614+ * cpu_irq[A] = load for cpu A + load for cpu B
3615+ * cpu_irq[B] = load for cpu B
3616+ */
3617+ CPU_IRQ(package_index) += delta;
3618+ }
3619+ }
3620+ /* Find the least loaded processor package */
3621+ for_each_online_cpu(i) {
3622+ if (i != CPU_TO_PACKAGEINDEX(i))
3623+ continue;
3624+ if (min_cpu_irq > CPU_IRQ(i)) {
3625+ min_cpu_irq = CPU_IRQ(i);
3626+ min_loaded = i;
3627+ }
3628+ }
3629+ max_cpu_irq = ULONG_MAX;
3630+
3631+tryanothercpu:
3632+ /* Look for heaviest loaded processor.
3633+ * We may come back to get the next heaviest loaded processor.
3634+ * Skip processors with trivial loads.
3635+ */
3636+ tmp_cpu_irq = 0;
3637+ tmp_loaded = -1;
3638+ for_each_online_cpu(i) {
3639+ if (i != CPU_TO_PACKAGEINDEX(i))
3640+ continue;
3641+ if (max_cpu_irq <= CPU_IRQ(i))
3642+ continue;
3643+ if (tmp_cpu_irq < CPU_IRQ(i)) {
3644+ tmp_cpu_irq = CPU_IRQ(i);
3645+ tmp_loaded = i;
3646+ }
3647+ }
3648+
3649+ if (tmp_loaded == -1) {
3650+ /* With a small number of heavy interrupt sources, some of the
3651+ * cpus can end up loaded too much. We use Ingo's original
3652+ * approach and rotate the IRQs among them.
3653+ */
3654+ if (!first_attempt && imbalance >= useful_load_threshold) {
3655+ rotate_irqs_among_cpus(useful_load_threshold);
3656+ return;
3657+ }
3658+ goto not_worth_the_effort;
3659+ }
3660+
3661+ first_attempt = 0; /* heaviest search */
3662+ max_cpu_irq = tmp_cpu_irq; /* load */
3663+ max_loaded = tmp_loaded; /* processor */
3664+ imbalance = (max_cpu_irq - min_cpu_irq) / 2;
3665+
3666+ Dprintk("max_loaded cpu = %d\n", max_loaded);
3667+ Dprintk("min_loaded cpu = %d\n", min_loaded);
3668+ Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq);
3669+ Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq);
3670+ Dprintk("load imbalance = %lu\n", imbalance);
3671+
3672+ /* if the imbalance is less than about an eighth of the max load
3673+ * (max_cpu_irq >> 3), we are into diminishing returns - quit
3674+ */
3675+ if (imbalance < (max_cpu_irq >> 3)) {
3676+ Dprintk("Imbalance too trivial\n");
3677+ goto not_worth_the_effort;
3678+ }
3679+
3680+tryanotherirq:
3681+ /* if we select an IRQ to move that can't go where we want, then
3682+ * see if there is another one to try.
3683+ */
3684+ move_this_load = 0;
3685+ selected_irq = -1;
3686+ for (j = 0; j < NR_IRQS; j++) {
3687+ /* Is this an active IRQ? */
3688+ if (!irq_desc[j].action)
3689+ continue;
3690+ if (imbalance <= IRQ_DELTA(max_loaded,j))
3691+ continue;
3692+ /* Try to find the IRQ that is closest to the imbalance
3693+ * without going over.
3694+ */
3695+ if (move_this_load < IRQ_DELTA(max_loaded,j)) {
3696+ move_this_load = IRQ_DELTA(max_loaded,j);
3697+ selected_irq = j;
3698+ }
3699+ }
3700+ if (selected_irq == -1) {
3701+ goto tryanothercpu;
3702+ }
3703+
3704+ imbalance = move_this_load;
3705+
3706+ /* For the physical_balance case, we accumulated both load
3707+ * values in the one of the siblings cpu_irq[],
3708+ * to use the same code for physical and logical processors
3709+ * as much as possible.
3710+ *
3711+ * NOTE: the cpu_irq[] array holds the sum of the load for
3712+ * sibling A and sibling B in the slot for the lowest numbered
3713+ * sibling (A), _AND_ the load for sibling B in the slot for
3714+ * the higher numbered sibling.
3715+ *
3716+ * We seek the least loaded sibling by making the comparison
3717+ * (A+B)/2 vs B
3718+ */
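+ /*
+ * Numeric example (made-up loads): if the package slot of the lower
+ * numbered sibling holds A+B = 1000 while the other sibling's own
+ * slot holds B = 300, the loop below compares (A+B)/2 = 500 with 300
+ * and picks the latter, i.e. the sibling carrying less than half of
+ * the package load.
+ */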
3719+ load = CPU_IRQ(min_loaded) >> 1;
3720+ for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
3721+ if (load > CPU_IRQ(j)) {
3722+ /* This won't change cpu_sibling_map[min_loaded] */
3723+ load = CPU_IRQ(j);
3724+ min_loaded = j;
3725+ }
3726+ }
3727+
3728+ cpus_and(allowed_mask,
3729+ cpu_online_map,
3730+ balance_irq_affinity[selected_irq]);
3731+ target_cpu_mask = cpumask_of_cpu(min_loaded);
3732+ cpus_and(tmp, target_cpu_mask, allowed_mask);
3733+
3734+ if (!cpus_empty(tmp)) {
3735+
3736+ Dprintk("irq = %d moved to cpu = %d\n",
3737+ selected_irq, min_loaded);
3738+ /* mark for change destination */
3739+ set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
3740+
3741+ /* Since we made a change, come back sooner to
3742+ * check for more variation.
3743+ */
3744+ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
3745+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
3746+ return;
3747+ }
3748+ goto tryanotherirq;
3749+
3750+not_worth_the_effort:
3751+ /*
3752+ * if we did not find an IRQ to move, then adjust the time interval
3753+ * upward
3754+ */
3755+ balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
3756+ balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
3757+ Dprintk("IRQ worth rotating not found\n");
3758+ return;
3759+}
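
The (A+B)/2 vs B comparison documented inside do_irq_balance() above is easy to misread; the following stand-alone user-space sketch walks through the same selection loop with invented CPU numbers and load values (sibling A = CPU 2 carrying 400 units, sibling B = CPU 3 carrying 600, so A's slot stores the 1000-unit sum):

/* Illustration only: mirrors the sibling-selection loop in do_irq_balance(),
 * all numbers invented. */
#include <stdio.h>

int main(void)
{
	unsigned long cpu_irq[4] = { 0, 0, 1000 /* A+B */, 600 /* B alone */ };
	int siblings[2] = { 2, 3 };             /* hypothetical HT pair */
	int min_loaded = 2;                     /* start at sibling A */
	unsigned long load = cpu_irq[min_loaded] >> 1;  /* (A+B)/2 = 500 */

	for (int k = 0; k < 2; k++) {
		int j = siblings[k];
		if (load > cpu_irq[j]) {        /* B wins only if B < (A+B)/2 */
			load = cpu_irq[j];
			min_loaded = j;
		}
	}
	printf("least loaded sibling: CPU %d\n", min_loaded);  /* CPU 2 */
	return 0;
}

Since B's 600 exceeds the 500-unit average, A keeps min_loaded, which matches A's lower individual load of 400.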
3760+
3761+static int balanced_irq(void *unused)
3762+{
3763+ int i;
3764+ unsigned long prev_balance_time = jiffies;
3765+ long time_remaining = balanced_irq_interval;
3766+
3767+ daemonize("kirqd");
3768+
3769+ /* push everything to CPU 0 to give us a starting point. */
3770+ for (i = 0 ; i < NR_IRQS ; i++) {
3771+ irq_desc[i].pending_mask = cpumask_of_cpu(0);
3772+ set_pending_irq(i, cpumask_of_cpu(0));
3773+ }
3774+
3775+ for ( ; ; ) {
3776+ time_remaining = schedule_timeout_interruptible(time_remaining);
3777+ try_to_freeze();
3778+ if (time_after(jiffies,
3779+ prev_balance_time+balanced_irq_interval)) {
3780+ preempt_disable();
3781+ do_irq_balance();
3782+ prev_balance_time = jiffies;
3783+ time_remaining = balanced_irq_interval;
3784+ preempt_enable();
3785+ }
3786+ }
3787+ return 0;
3788+}
3789+
3790+static int __init balanced_irq_init(void)
3791+{
3792+ int i;
3793+ struct cpuinfo_x86 *c;
3794+ cpumask_t tmp;
3795+
3796+ cpus_shift_right(tmp, cpu_online_map, 2);
3797+ c = &boot_cpu_data;
3798+	/* When not overridden on the command line, ask the subarchitecture. */
3799+ if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
3800+ irqbalance_disabled = NO_BALANCE_IRQ;
3801+ if (irqbalance_disabled)
3802+ return 0;
3803+
3804+ /* disable irqbalance completely if there is only one processor online */
3805+ if (num_online_cpus() < 2) {
3806+ irqbalance_disabled = 1;
3807+ return 0;
3808+ }
3809+ /*
3810+ * Enable physical balance only if more than 1 physical processor
3811+ * is present
3812+ */
3813+ if (smp_num_siblings > 1 && !cpus_empty(tmp))
3814+ physical_balance = 1;
3815+
3816+ for_each_online_cpu(i) {
3817+ irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
3818+ irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
3819+ if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
3820+ printk(KERN_ERR "balanced_irq_init: out of memory");
3821+ goto failed;
3822+ }
3823+ memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
3824+ memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
3825+ }
3826+
3827+ printk(KERN_INFO "Starting balanced_irq\n");
3828+ if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0)
3829+ return 0;
3830+ else
3831+ printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
3832+failed:
3833+ for_each_possible_cpu(i) {
3834+ kfree(irq_cpu_data[i].irq_delta);
3835+ irq_cpu_data[i].irq_delta = NULL;
3836+ kfree(irq_cpu_data[i].last_irq);
3837+ irq_cpu_data[i].last_irq = NULL;
3838+ }
3839+ return 0;
3840+}
3841+
3842+int __init irqbalance_disable(char *str)
3843+{
3844+ irqbalance_disabled = 1;
3845+ return 1;
3846+}
3847+
3848+__setup("noirqbalance", irqbalance_disable);
3849+
3850+late_initcall(balanced_irq_init);
3851+#endif /* CONFIG_IRQBALANCE */
3852+#endif /* CONFIG_SMP */
3853+#endif
3854+
3855+#ifndef CONFIG_SMP
3856+void fastcall send_IPI_self(int vector)
3857+{
3858+#ifndef CONFIG_XEN
3859+ unsigned int cfg;
3860+
3861+ /*
3862+ * Wait for idle.
3863+ */
3864+ apic_wait_icr_idle();
3865+ cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
3866+ /*
3867+ * Send the IPI. The write to APIC_ICR fires this off.
3868+ */
3869+ apic_write_around(APIC_ICR, cfg);
3870+#endif
3871+}
3872+#endif /* !CONFIG_SMP */
3873+
3874+
3875+/*
3876+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
3877+ * specific CPU-side IRQs.
3878+ */
3879+
3880+#define MAX_PIRQS 8
3881+static int pirq_entries [MAX_PIRQS];
3882+static int pirqs_enabled;
3883+int skip_ioapic_setup;
3884+
3885+static int __init ioapic_setup(char *str)
3886+{
3887+ skip_ioapic_setup = 1;
3888+ return 1;
3889+}
3890+
3891+__setup("noapic", ioapic_setup);
3892+
3893+static int __init ioapic_pirq_setup(char *str)
3894+{
3895+ int i, max;
3896+ int ints[MAX_PIRQS+1];
3897+
3898+ get_options(str, ARRAY_SIZE(ints), ints);
3899+
3900+ for (i = 0; i < MAX_PIRQS; i++)
3901+ pirq_entries[i] = -1;
3902+
3903+ pirqs_enabled = 1;
3904+ apic_printk(APIC_VERBOSE, KERN_INFO
3905+ "PIRQ redirection, working around broken MP-BIOS.\n");
3906+ max = MAX_PIRQS;
3907+ if (ints[0] < MAX_PIRQS)
3908+ max = ints[0];
3909+
3910+ for (i = 0; i < max; i++) {
3911+ apic_printk(APIC_VERBOSE, KERN_DEBUG
3912+ "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
3913+ /*
3914+ * PIRQs are mapped upside down, usually.
3915+ */
3916+ pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
3917+ }
3918+ return 1;
3919+}
3920+
3921+__setup("pirq=", ioapic_pirq_setup);
3922+
3923+/*
3924+ * Find the IRQ entry number of a certain pin.
3925+ */
3926+static int find_irq_entry(int apic, int pin, int type)
3927+{
3928+ int i;
3929+
3930+ for (i = 0; i < mp_irq_entries; i++)
3931+ if (mp_irqs[i].mpc_irqtype == type &&
3932+ (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
3933+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
3934+ mp_irqs[i].mpc_dstirq == pin)
3935+ return i;
3936+
3937+ return -1;
3938+}
3939+
3940+/*
3941+ * Find the pin to which IRQ[irq] (ISA) is connected
3942+ */
3943+static int __init find_isa_irq_pin(int irq, int type)
3944+{
3945+ int i;
3946+
3947+ for (i = 0; i < mp_irq_entries; i++) {
3948+ int lbus = mp_irqs[i].mpc_srcbus;
3949+
3950+ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
3951+ mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
3952+ mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
3953+ mp_bus_id_to_type[lbus] == MP_BUS_NEC98
3954+ ) &&
3955+ (mp_irqs[i].mpc_irqtype == type) &&
3956+ (mp_irqs[i].mpc_srcbusirq == irq))
3957+
3958+ return mp_irqs[i].mpc_dstirq;
3959+ }
3960+ return -1;
3961+}
3962+
3963+static int __init find_isa_irq_apic(int irq, int type)
3964+{
3965+ int i;
3966+
3967+ for (i = 0; i < mp_irq_entries; i++) {
3968+ int lbus = mp_irqs[i].mpc_srcbus;
3969+
3970+ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
3971+ mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
3972+ mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
3973+ mp_bus_id_to_type[lbus] == MP_BUS_NEC98
3974+ ) &&
3975+ (mp_irqs[i].mpc_irqtype == type) &&
3976+ (mp_irqs[i].mpc_srcbusirq == irq))
3977+ break;
3978+ }
3979+ if (i < mp_irq_entries) {
3980+ int apic;
3981+ for(apic = 0; apic < nr_ioapics; apic++) {
3982+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
3983+ return apic;
3984+ }
3985+ }
3986+
3987+ return -1;
3988+}
3989+
3990+/*
3991+ * Find a specific PCI IRQ entry.
3992+ * Not an __init, possibly needed by modules
3993+ */
3994+static int pin_2_irq(int idx, int apic, int pin);
3995+
3996+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
3997+{
3998+ int apic, i, best_guess = -1;
3999+
4000+ apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
4001+ "slot:%d, pin:%d.\n", bus, slot, pin);
4002+ if (mp_bus_id_to_pci_bus[bus] == -1) {
4003+ printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
4004+ return -1;
4005+ }
4006+ for (i = 0; i < mp_irq_entries; i++) {
4007+ int lbus = mp_irqs[i].mpc_srcbus;
4008+
4009+ for (apic = 0; apic < nr_ioapics; apic++)
4010+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
4011+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
4012+ break;
4013+
4014+ if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
4015+ !mp_irqs[i].mpc_irqtype &&
4016+ (bus == lbus) &&
4017+ (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
4018+ int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
4019+
4020+ if (!(apic || IO_APIC_IRQ(irq)))
4021+ continue;
4022+
4023+ if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
4024+ return irq;
4025+ /*
4026+ * Use the first all-but-pin matching entry as a
4027+ * best-guess fuzzy result for broken mptables.
4028+ */
4029+ if (best_guess < 0)
4030+ best_guess = irq;
4031+ }
4032+ }
4033+ return best_guess;
4034+}
4035+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
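
The slot/pin match above depends on how the MP table encodes mpc_srcbusirq for PCI entries: bits 2-6 hold the device (slot) number and bits 0-1 the interrupt pin, which is exactly what the >> 2 & 0x1f and & 3 expressions extract. A stand-alone sketch of that decoding with a made-up value:

/* Illustration only: decodes a hypothetical MP-table PCI srcbusirq byte. */
#include <stdio.h>

int main(void)
{
	unsigned int srcbusirq = 0x4a;                 /* invented example value */
	unsigned int slot = (srcbusirq >> 2) & 0x1f;   /* PCI device number: 18 */
	unsigned int pin  = srcbusirq & 3;             /* 0=INTA ... 3=INTD: 2 */

	printf("slot %u, pin INT%c#\n", slot, 'A' + pin);   /* "slot 18, pin INTC#" */
	return 0;
}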
4036+
4037+/*
4038+ * This function currently is only a helper for the i386 SMP boot process, where
4039+ * we need to reprogram the ioredtbls to cater for the CPUs which have come online,
4040+ * so the mask in all cases should simply be TARGET_CPUS.
4041+ */
4042+#ifdef CONFIG_SMP
4043+#ifndef CONFIG_XEN
4044+void __init setup_ioapic_dest(void)
4045+{
4046+ int pin, ioapic, irq, irq_entry;
4047+
4048+ if (skip_ioapic_setup == 1)
4049+ return;
4050+
4051+ for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
4052+ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4053+ irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4054+ if (irq_entry == -1)
4055+ continue;
4056+ irq = pin_2_irq(irq_entry, ioapic, pin);
4057+ set_ioapic_affinity_irq(irq, TARGET_CPUS);
4058+ }
4059+
4060+ }
4061+}
4062+#endif /* !CONFIG_XEN */
4063+#endif
4064+
4065+/*
4066+ * EISA Edge/Level control register, ELCR
4067+ */
4068+static int EISA_ELCR(unsigned int irq)
4069+{
4070+ if (irq < 16) {
4071+ unsigned int port = 0x4d0 + (irq >> 3);
4072+ return (inb(port) >> (irq & 7)) & 1;
4073+ }
4074+ apic_printk(APIC_VERBOSE, KERN_INFO
4075+ "Broken MPtable reports ISA irq %d\n", irq);
4076+ return 0;
4077+}
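
EISA_ELCR() reduces the lookup to two steps: IRQs 0-7 are described by port 0x4d0 and IRQs 8-15 by port 0x4d1, with the trigger bit sitting at position (irq & 7). A stand-alone sketch of the same arithmetic, with the port read stubbed out so it can run in user space:

/* Illustration only: the inb() is faked and the IRQ number is invented. */
#include <stdio.h>

static unsigned char fake_inb(unsigned int port)
{
	return port == 0x4d1 ? 0x04 : 0x00;     /* pretend bit 2 of ELCR2 is set */
}

int main(void)
{
	unsigned int irq = 10;                       /* hypothetical ISA IRQ */
	unsigned int port = 0x4d0 + (irq >> 3);      /* 0x4d1 for IRQ 8-15 */
	int level = (fake_inb(port) >> (irq & 7)) & 1;   /* bit 2 -> 1 */

	printf("IRQ %u: port 0x%x, bit %u, level-triggered=%d\n",
	       irq, port, irq & 7, level);
	return 0;
}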
4078+
4079+/* EISA interrupts are always polarity zero and can be edge or level
4080+ * trigger depending on the ELCR value. If an interrupt is listed as
4081+ * EISA conforming in the MP table, that means its trigger type must
4082+ * be read in from the ELCR */
4083+
4084+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
4085+#define default_EISA_polarity(idx) (0)
4086+
4087+/* ISA interrupts are always polarity zero edge triggered,
4088+ * when listed as conforming in the MP table. */
4089+
4090+#define default_ISA_trigger(idx) (0)
4091+#define default_ISA_polarity(idx) (0)
4092+
4093+/* PCI interrupts are always polarity one level triggered,
4094+ * when listed as conforming in the MP table. */
4095+
4096+#define default_PCI_trigger(idx) (1)
4097+#define default_PCI_polarity(idx) (1)
4098+
4099+/* MCA interrupts are always polarity zero level triggered,
4100+ * when listed as conforming in the MP table. */
4101+
4102+#define default_MCA_trigger(idx) (1)
4103+#define default_MCA_polarity(idx) (0)
4104+
4105+/* NEC98 interrupts are always polarity zero edge triggered,
4106+ * when listed as conforming in the MP table. */
4107+
4108+#define default_NEC98_trigger(idx) (0)
4109+#define default_NEC98_polarity(idx) (0)
4110+
4111+static int __init MPBIOS_polarity(int idx)
4112+{
4113+ int bus = mp_irqs[idx].mpc_srcbus;
4114+ int polarity;
4115+
4116+ /*
4117+ * Determine IRQ line polarity (high active or low active):
4118+ */
4119+ switch (mp_irqs[idx].mpc_irqflag & 3)
4120+ {
4121+ case 0: /* conforms, ie. bus-type dependent polarity */
4122+ {
4123+ switch (mp_bus_id_to_type[bus])
4124+ {
4125+ case MP_BUS_ISA: /* ISA pin */
4126+ {
4127+ polarity = default_ISA_polarity(idx);
4128+ break;
4129+ }
4130+ case MP_BUS_EISA: /* EISA pin */
4131+ {
4132+ polarity = default_EISA_polarity(idx);
4133+ break;
4134+ }
4135+ case MP_BUS_PCI: /* PCI pin */
4136+ {
4137+ polarity = default_PCI_polarity(idx);
4138+ break;
4139+ }
4140+ case MP_BUS_MCA: /* MCA pin */
4141+ {
4142+ polarity = default_MCA_polarity(idx);
4143+ break;
4144+ }
4145+ case MP_BUS_NEC98: /* NEC 98 pin */
4146+ {
4147+ polarity = default_NEC98_polarity(idx);
4148+ break;
4149+ }
4150+ default:
4151+ {
4152+ printk(KERN_WARNING "broken BIOS!!\n");
4153+ polarity = 1;
4154+ break;
4155+ }
4156+ }
4157+ break;
4158+ }
4159+ case 1: /* high active */
4160+ {
4161+ polarity = 0;
4162+ break;
4163+ }
4164+ case 2: /* reserved */
4165+ {
4166+ printk(KERN_WARNING "broken BIOS!!\n");
4167+ polarity = 1;
4168+ break;
4169+ }
4170+ case 3: /* low active */
4171+ {
4172+ polarity = 1;
4173+ break;
4174+ }
4175+ default: /* invalid */
4176+ {
4177+ printk(KERN_WARNING "broken BIOS!!\n");
4178+ polarity = 1;
4179+ break;
4180+ }
4181+ }
4182+ return polarity;
4183+}
4184+
4185+static int MPBIOS_trigger(int idx)
4186+{
4187+ int bus = mp_irqs[idx].mpc_srcbus;
4188+ int trigger;
4189+
4190+ /*
4191+ * Determine IRQ trigger mode (edge or level sensitive):
4192+ */
4193+ switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
4194+ {
4195+ case 0: /* conforms, ie. bus-type dependent */
4196+ {
4197+ switch (mp_bus_id_to_type[bus])
4198+ {
4199+ case MP_BUS_ISA: /* ISA pin */
4200+ {
4201+ trigger = default_ISA_trigger(idx);
4202+ break;
4203+ }
4204+ case MP_BUS_EISA: /* EISA pin */
4205+ {
4206+ trigger = default_EISA_trigger(idx);
4207+ break;
4208+ }
4209+ case MP_BUS_PCI: /* PCI pin */
4210+ {
4211+ trigger = default_PCI_trigger(idx);
4212+ break;
4213+ }
4214+ case MP_BUS_MCA: /* MCA pin */
4215+ {
4216+ trigger = default_MCA_trigger(idx);
4217+ break;
4218+ }
4219+ case MP_BUS_NEC98: /* NEC 98 pin */
4220+ {
4221+ trigger = default_NEC98_trigger(idx);
4222+ break;
4223+ }
4224+ default:
4225+ {
4226+ printk(KERN_WARNING "broken BIOS!!\n");
4227+ trigger = 1;
4228+ break;
4229+ }
4230+ }
4231+ break;
4232+ }
4233+ case 1: /* edge */
4234+ {
4235+ trigger = 0;
4236+ break;
4237+ }
4238+ case 2: /* reserved */
4239+ {
4240+ printk(KERN_WARNING "broken BIOS!!\n");
4241+ trigger = 1;
4242+ break;
4243+ }
4244+ case 3: /* level */
4245+ {
4246+ trigger = 1;
4247+ break;
4248+ }
4249+ default: /* invalid */
4250+ {
4251+ printk(KERN_WARNING "broken BIOS!!\n");
4252+ trigger = 0;
4253+ break;
4254+ }
4255+ }
4256+ return trigger;
4257+}
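
Both MPBIOS_polarity() and MPBIOS_trigger() switch on two-bit fields of mpc_irqflag: bits 0-1 select the polarity and bits 2-3 the trigger mode, with 0 meaning "conforms to the bus". A stand-alone decoder sketch with an invented flag value:

/* Illustration only: the irqflag value is made up. */
#include <stdio.h>

int main(void)
{
	unsigned int irqflag = 0x0d;            /* 1101b: level, active high */
	unsigned int po = irqflag & 3;          /* 0=conforms 1=high 2=reserved 3=low */
	unsigned int el = (irqflag >> 2) & 3;   /* 0=conforms 1=edge 2=reserved 3=level */

	printf("polarity field %u (%s), trigger field %u (%s)\n",
	       po, po == 1 ? "active high" : po == 3 ? "active low" : "conforms/reserved",
	       el, el == 1 ? "edge" : el == 3 ? "level" : "conforms/reserved");
	return 0;
}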
4258+
4259+static inline int irq_polarity(int idx)
4260+{
4261+ return MPBIOS_polarity(idx);
4262+}
4263+
4264+static inline int irq_trigger(int idx)
4265+{
4266+ return MPBIOS_trigger(idx);
4267+}
4268+
4269+static int pin_2_irq(int idx, int apic, int pin)
4270+{
4271+ int irq, i;
4272+ int bus = mp_irqs[idx].mpc_srcbus;
4273+
4274+ /*
4275+ * Debugging check, we are in big trouble if this message pops up!
4276+ */
4277+ if (mp_irqs[idx].mpc_dstirq != pin)
4278+ printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
4279+
4280+ switch (mp_bus_id_to_type[bus])
4281+ {
4282+ case MP_BUS_ISA: /* ISA pin */
4283+ case MP_BUS_EISA:
4284+ case MP_BUS_MCA:
4285+ case MP_BUS_NEC98:
4286+ {
4287+ irq = mp_irqs[idx].mpc_srcbusirq;
4288+ break;
4289+ }
4290+ case MP_BUS_PCI: /* PCI pin */
4291+ {
4292+ /*
4293+ * PCI IRQs are mapped in order
4294+ */
4295+ i = irq = 0;
4296+ while (i < apic)
4297+ irq += nr_ioapic_registers[i++];
4298+ irq += pin;
4299+
4300+ /*
4301+ * For MPS mode, so far only needed by ES7000 platform
4302+ */
4303+ if (ioapic_renumber_irq)
4304+ irq = ioapic_renumber_irq(apic, irq);
4305+
4306+ break;
4307+ }
4308+ default:
4309+ {
4310+ printk(KERN_ERR "unknown bus type %d.\n",bus);
4311+ irq = 0;
4312+ break;
4313+ }
4314+ }
4315+
4316+ /*
4317+ * PCI IRQ command line redirection. Yes, limits are hardcoded.
4318+ */
4319+ if ((pin >= 16) && (pin <= 23)) {
4320+ if (pirq_entries[pin-16] != -1) {
4321+ if (!pirq_entries[pin-16]) {
4322+ apic_printk(APIC_VERBOSE, KERN_DEBUG
4323+ "disabling PIRQ%d\n", pin-16);
4324+ } else {
4325+ irq = pirq_entries[pin-16];
4326+ apic_printk(APIC_VERBOSE, KERN_DEBUG
4327+ "using PIRQ%d -> IRQ %d\n",
4328+ pin-16, irq);
4329+ }
4330+ }
4331+ }
4332+ return irq;
4333+}
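
For PCI pins, pin_2_irq() simply strings the IO-APICs' pin spaces together: the resulting IRQ number is the pin index plus the pin counts of all lower-numbered IO-APICs (ignoring the ES7000 renumbering hook). A stand-alone sketch with invented pin counts:

/* Illustration only: pin counts and indices are made up. */
#include <stdio.h>

int main(void)
{
	int nr_ioapic_registers[2] = { 24, 16 };   /* hypothetical IO-APIC sizes */
	int apic = 1, pin = 3;                     /* pin 3 of the second IO-APIC */
	int i = 0, irq = 0;

	while (i < apic)
		irq += nr_ioapic_registers[i++];   /* skip the first APIC's 24 pins */
	irq += pin;

	printf("apic %d pin %d -> irq %d\n", apic, pin, irq);   /* irq 27 */
	return 0;
}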
4334+
4335+static inline int IO_APIC_irq_trigger(int irq)
4336+{
4337+ int apic, idx, pin;
4338+
4339+ for (apic = 0; apic < nr_ioapics; apic++) {
4340+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
4341+ idx = find_irq_entry(apic,pin,mp_INT);
4342+ if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
4343+ return irq_trigger(idx);
4344+ }
4345+ }
4346+ /*
4347+ * nonexistent IRQs are edge default
4348+ */
4349+ return 0;
4350+}
4351+
4352+/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
4353+u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
4354+
4355+int assign_irq_vector(int irq)
4356+{
4357+ unsigned long flags;
4358+ int vector;
4359+ struct physdev_irq irq_op;
4360+
4361+ BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
4362+
4363+ if (irq < PIRQ_BASE || irq - PIRQ_BASE > NR_PIRQS)
4364+ return -EINVAL;
4365+
4366+ spin_lock_irqsave(&vector_lock, flags);
4367+
4368+ if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
4369+ spin_unlock_irqrestore(&vector_lock, flags);
4370+ return IO_APIC_VECTOR(irq);
4371+ }
4372+
4373+ irq_op.irq = irq;
4374+ if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
4375+ spin_unlock_irqrestore(&vector_lock, flags);
4376+ return -ENOSPC;
4377+ }
4378+
4379+ vector = irq_op.vector;
4380+ vector_irq[vector] = irq;
4381+ if (irq != AUTO_ASSIGN)
4382+ IO_APIC_VECTOR(irq) = vector;
4383+
4384+ spin_unlock_irqrestore(&vector_lock, flags);
4385+
4386+ return vector;
4387+}
4388+
4389+#ifndef CONFIG_XEN
4390+static struct hw_interrupt_type ioapic_level_type;
4391+static struct hw_interrupt_type ioapic_edge_type;
4392+
4393+#define IOAPIC_AUTO -1
4394+#define IOAPIC_EDGE 0
4395+#define IOAPIC_LEVEL 1
4396+
4397+static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
4398+{
4399+ unsigned idx;
4400+
4401+ idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
4402+
4403+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
4404+ trigger == IOAPIC_LEVEL)
4405+ irq_desc[idx].chip = &ioapic_level_type;
4406+ else
4407+ irq_desc[idx].chip = &ioapic_edge_type;
4408+ set_intr_gate(vector, interrupt[idx]);
4409+}
4410+#else
4411+#define ioapic_register_intr(irq, vector, trigger) evtchn_register_pirq(irq)
4412+#endif
4413+
4414+static void __init setup_IO_APIC_irqs(void)
4415+{
4416+ struct IO_APIC_route_entry entry;
4417+ int apic, pin, idx, irq, first_notcon = 1, vector;
4418+ unsigned long flags;
4419+
4420+ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
4421+
4422+ for (apic = 0; apic < nr_ioapics; apic++) {
4423+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
4424+
4425+ /*
4426+ * add it to the IO-APIC irq-routing table:
4427+ */
4428+ memset(&entry,0,sizeof(entry));
4429+
4430+ entry.delivery_mode = INT_DELIVERY_MODE;
4431+ entry.dest_mode = INT_DEST_MODE;
4432+ entry.mask = 0; /* enable IRQ */
4433+ entry.dest.logical.logical_dest =
4434+ cpu_mask_to_apicid(TARGET_CPUS);
4435+
4436+ idx = find_irq_entry(apic,pin,mp_INT);
4437+ if (idx == -1) {
4438+ if (first_notcon) {
4439+ apic_printk(APIC_VERBOSE, KERN_DEBUG
4440+ " IO-APIC (apicid-pin) %d-%d",
4441+ mp_ioapics[apic].mpc_apicid,
4442+ pin);
4443+ first_notcon = 0;
4444+ } else
4445+ apic_printk(APIC_VERBOSE, ", %d-%d",
4446+ mp_ioapics[apic].mpc_apicid, pin);
4447+ continue;
4448+ }
4449+
4450+ entry.trigger = irq_trigger(idx);
4451+ entry.polarity = irq_polarity(idx);
4452+
4453+ if (irq_trigger(idx)) {
4454+ entry.trigger = 1;
4455+ entry.mask = 1;
4456+ }
4457+
4458+ irq = pin_2_irq(idx, apic, pin);
4459+ /*
4460+ * skip adding the timer int on secondary nodes, which causes
4461+ * a small but painful rift in the time-space continuum
4462+ */
4463+ if (multi_timer_check(apic, irq))
4464+ continue;
4465+ else
4466+ add_pin_to_irq(irq, apic, pin);
4467+
4468+ if (/*!apic &&*/ !IO_APIC_IRQ(irq))
4469+ continue;
4470+
4471+ if (IO_APIC_IRQ(irq)) {
4472+ vector = assign_irq_vector(irq);
4473+ entry.vector = vector;
4474+ ioapic_register_intr(irq, vector, IOAPIC_AUTO);
4475+
4476+ if (!apic && (irq < 16))
4477+ disable_8259A_irq(irq);
4478+ }
4479+ spin_lock_irqsave(&ioapic_lock, flags);
4480+ io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
4481+ io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
4482+ set_native_irq_info(irq, TARGET_CPUS);
4483+ spin_unlock_irqrestore(&ioapic_lock, flags);
4484+ }
4485+ }
4486+
4487+ if (!first_notcon)
4488+ apic_printk(APIC_VERBOSE, " not connected.\n");
4489+}
4490+
4491+/*
4492+ * Set up the 8259A-master output pin:
4493+ */
4494+#ifndef CONFIG_XEN
4495+static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
4496+{
4497+ struct IO_APIC_route_entry entry;
4498+ unsigned long flags;
4499+
4500+ memset(&entry,0,sizeof(entry));
4501+
4502+ disable_8259A_irq(0);
4503+
4504+ /* mask LVT0 */
4505+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
4506+
4507+ /*
4508+ * We use logical delivery to get the timer IRQ
4509+ * to the first CPU.
4510+ */
4511+ entry.dest_mode = INT_DEST_MODE;
4512+ entry.mask = 0; /* unmask IRQ now */
4513+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
4514+ entry.delivery_mode = INT_DELIVERY_MODE;
4515+ entry.polarity = 0;
4516+ entry.trigger = 0;
4517+ entry.vector = vector;
4518+
4519+ /*
4520+ * The timer IRQ doesn't have to know that behind the
4521+	 * scenes we have an 8259A-master in AEOI mode ...
4522+ */
4523+ irq_desc[0].chip = &ioapic_edge_type;
4524+
4525+ /*
4526+ * Add it to the IO-APIC irq-routing table:
4527+ */
4528+ spin_lock_irqsave(&ioapic_lock, flags);
4529+ io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
4530+ io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
4531+ spin_unlock_irqrestore(&ioapic_lock, flags);
4532+
4533+ enable_8259A_irq(0);
4534+}
4535+
4536+static inline void UNEXPECTED_IO_APIC(void)
4537+{
4538+}
4539+
4540+void __init print_IO_APIC(void)
4541+{
4542+ int apic, i;
4543+ union IO_APIC_reg_00 reg_00;
4544+ union IO_APIC_reg_01 reg_01;
4545+ union IO_APIC_reg_02 reg_02;
4546+ union IO_APIC_reg_03 reg_03;
4547+ unsigned long flags;
4548+
4549+ if (apic_verbosity == APIC_QUIET)
4550+ return;
4551+
4552+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
4553+ for (i = 0; i < nr_ioapics; i++)
4554+ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
4555+ mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
4556+
4557+ /*
4558+ * We are a bit conservative about what we expect. We have to
4559+ * know about every hardware change ASAP.
4560+ */
4561+ printk(KERN_INFO "testing the IO APIC.......................\n");
4562+
4563+ for (apic = 0; apic < nr_ioapics; apic++) {
4564+
4565+ spin_lock_irqsave(&ioapic_lock, flags);
4566+ reg_00.raw = io_apic_read(apic, 0);
4567+ reg_01.raw = io_apic_read(apic, 1);
4568+ if (reg_01.bits.version >= 0x10)
4569+ reg_02.raw = io_apic_read(apic, 2);
4570+ if (reg_01.bits.version >= 0x20)
4571+ reg_03.raw = io_apic_read(apic, 3);
4572+ spin_unlock_irqrestore(&ioapic_lock, flags);
4573+
4574+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
4575+ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
4576+ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
4577+ printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
4578+ printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
4579+ if (reg_00.bits.ID >= get_physical_broadcast())
4580+ UNEXPECTED_IO_APIC();
4581+ if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
4582+ UNEXPECTED_IO_APIC();
4583+
4584+ printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
4585+ printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
4586+ if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
4587+ (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
4588+ (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
4589+ (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
4590+ (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
4591+ (reg_01.bits.entries != 0x2E) &&
4592+ (reg_01.bits.entries != 0x3F)
4593+ )
4594+ UNEXPECTED_IO_APIC();
4595+
4596+ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
4597+ printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
4598+ if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
4599+ (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
4600+ (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
4601+ (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
4602+ (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
4603+ )
4604+ UNEXPECTED_IO_APIC();
4605+ if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
4606+ UNEXPECTED_IO_APIC();
4607+
4608+ /*
4609+ * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
4610+ * but the value of reg_02 is read as the previous read register
4611+ * value, so ignore it if reg_02 == reg_01.
4612+ */
4613+ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
4614+ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
4615+ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
4616+ if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
4617+ UNEXPECTED_IO_APIC();
4618+ }
4619+
4620+ /*
4621+ * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
4622+ * or reg_03, but the value of reg_0[23] is read as the previous read
4623+ * register value, so ignore it if reg_03 == reg_0[12].
4624+ */
4625+ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
4626+ reg_03.raw != reg_01.raw) {
4627+ printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
4628+ printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
4629+ if (reg_03.bits.__reserved_1)
4630+ UNEXPECTED_IO_APIC();
4631+ }
4632+
4633+ printk(KERN_DEBUG ".... IRQ redirection table:\n");
4634+
4635+ printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
4636+ " Stat Dest Deli Vect: \n");
4637+
4638+ for (i = 0; i <= reg_01.bits.entries; i++) {
4639+ struct IO_APIC_route_entry entry;
4640+
4641+ spin_lock_irqsave(&ioapic_lock, flags);
4642+ *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
4643+ *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
4644+ spin_unlock_irqrestore(&ioapic_lock, flags);
4645+
4646+ printk(KERN_DEBUG " %02x %03X %02X ",
4647+ i,
4648+ entry.dest.logical.logical_dest,
4649+ entry.dest.physical.physical_dest
4650+ );
4651+
4652+ printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
4653+ entry.mask,
4654+ entry.trigger,
4655+ entry.irr,
4656+ entry.polarity,
4657+ entry.delivery_status,
4658+ entry.dest_mode,
4659+ entry.delivery_mode,
4660+ entry.vector
4661+ );
4662+ }
4663+ }
4664+ if (use_pci_vector())
4665+ printk(KERN_INFO "Using vector-based indexing\n");
4666+ printk(KERN_DEBUG "IRQ to pin mappings:\n");
4667+ for (i = 0; i < NR_IRQS; i++) {
4668+ struct irq_pin_list *entry = irq_2_pin + i;
4669+ if (entry->pin < 0)
4670+ continue;
4671+ if (use_pci_vector() && !platform_legacy_irq(i))
4672+ printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
4673+ else
4674+ printk(KERN_DEBUG "IRQ%d ", i);
4675+ for (;;) {
4676+ printk("-> %d:%d", entry->apic, entry->pin);
4677+ if (!entry->next)
4678+ break;
4679+ entry = irq_2_pin + entry->next;
4680+ }
4681+ printk("\n");
4682+ }
4683+
4684+ printk(KERN_INFO ".................................... done.\n");
4685+
4686+ return;
4687+}
4688+
4689+static void print_APIC_bitfield (int base)
4690+{
4691+ unsigned int v;
4692+ int i, j;
4693+
4694+ if (apic_verbosity == APIC_QUIET)
4695+ return;
4696+
4697+ printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
4698+ for (i = 0; i < 8; i++) {
4699+ v = apic_read(base + i*0x10);
4700+ for (j = 0; j < 32; j++) {
4701+ if (v & (1<<j))
4702+ printk("1");
4703+ else
4704+ printk("0");
4705+ }
4706+ printk("\n");
4707+ }
4708+}
4709+
4710+void /*__init*/ print_local_APIC(void * dummy)
4711+{
4712+ unsigned int v, ver, maxlvt;
4713+
4714+ if (apic_verbosity == APIC_QUIET)
4715+ return;
4716+
4717+ printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
4718+ smp_processor_id(), hard_smp_processor_id());
4719+ v = apic_read(APIC_ID);
4720+ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
4721+ v = apic_read(APIC_LVR);
4722+ printk(KERN_INFO "... APIC VERSION: %08x\n", v);
4723+ ver = GET_APIC_VERSION(v);
4724+ maxlvt = get_maxlvt();
4725+
4726+ v = apic_read(APIC_TASKPRI);
4727+ printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
4728+
4729+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
4730+ v = apic_read(APIC_ARBPRI);
4731+ printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
4732+ v & APIC_ARBPRI_MASK);
4733+ v = apic_read(APIC_PROCPRI);
4734+ printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
4735+ }
4736+
4737+ v = apic_read(APIC_EOI);
4738+ printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
4739+ v = apic_read(APIC_RRR);
4740+ printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
4741+ v = apic_read(APIC_LDR);
4742+ printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
4743+ v = apic_read(APIC_DFR);
4744+ printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
4745+ v = apic_read(APIC_SPIV);
4746+ printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
4747+
4748+ printk(KERN_DEBUG "... APIC ISR field:\n");
4749+ print_APIC_bitfield(APIC_ISR);
4750+ printk(KERN_DEBUG "... APIC TMR field:\n");
4751+ print_APIC_bitfield(APIC_TMR);
4752+ printk(KERN_DEBUG "... APIC IRR field:\n");
4753+ print_APIC_bitfield(APIC_IRR);
4754+
4755+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
4756+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
4757+ apic_write(APIC_ESR, 0);
4758+ v = apic_read(APIC_ESR);
4759+ printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
4760+ }
4761+
4762+ v = apic_read(APIC_ICR);
4763+ printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
4764+ v = apic_read(APIC_ICR2);
4765+ printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
4766+
4767+ v = apic_read(APIC_LVTT);
4768+ printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
4769+
4770+ if (maxlvt > 3) { /* PC is LVT#4. */
4771+ v = apic_read(APIC_LVTPC);
4772+ printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
4773+ }
4774+ v = apic_read(APIC_LVT0);
4775+ printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
4776+ v = apic_read(APIC_LVT1);
4777+ printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
4778+
4779+ if (maxlvt > 2) { /* ERR is LVT#3. */
4780+ v = apic_read(APIC_LVTERR);
4781+ printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
4782+ }
4783+
4784+ v = apic_read(APIC_TMICT);
4785+ printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
4786+ v = apic_read(APIC_TMCCT);
4787+ printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
4788+ v = apic_read(APIC_TDCR);
4789+ printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
4790+ printk("\n");
4791+}
4792+
4793+void print_all_local_APICs (void)
4794+{
4795+ on_each_cpu(print_local_APIC, NULL, 1, 1);
4796+}
4797+
4798+void /*__init*/ print_PIC(void)
4799+{
4800+ unsigned int v;
4801+ unsigned long flags;
4802+
4803+ if (apic_verbosity == APIC_QUIET)
4804+ return;
4805+
4806+ printk(KERN_DEBUG "\nprinting PIC contents\n");
4807+
4808+ spin_lock_irqsave(&i8259A_lock, flags);
4809+
4810+ v = inb(0xa1) << 8 | inb(0x21);
4811+ printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
4812+
4813+ v = inb(0xa0) << 8 | inb(0x20);
4814+ printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
4815+
4816+ outb(0x0b,0xa0);
4817+ outb(0x0b,0x20);
4818+ v = inb(0xa0) << 8 | inb(0x20);
4819+ outb(0x0a,0xa0);
4820+ outb(0x0a,0x20);
4821+
4822+ spin_unlock_irqrestore(&i8259A_lock, flags);
4823+
4824+ printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
4825+
4826+ v = inb(0x4d1) << 8 | inb(0x4d0);
4827+ printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
4828+}
4829+#endif /* !CONFIG_XEN */
4830+
4831+static void __init enable_IO_APIC(void)
4832+{
4833+ union IO_APIC_reg_01 reg_01;
4834+ int i8259_apic, i8259_pin;
4835+ int i, apic;
4836+ unsigned long flags;
4837+
4838+ for (i = 0; i < PIN_MAP_SIZE; i++) {
4839+ irq_2_pin[i].pin = -1;
4840+ irq_2_pin[i].next = 0;
4841+ }
4842+ if (!pirqs_enabled)
4843+ for (i = 0; i < MAX_PIRQS; i++)
4844+ pirq_entries[i] = -1;
4845+
4846+ /*
4847+ * The number of IO-APIC IRQ registers (== #pins):
4848+ */
4849+ for (apic = 0; apic < nr_ioapics; apic++) {
4850+ spin_lock_irqsave(&ioapic_lock, flags);
4851+ reg_01.raw = io_apic_read(apic, 1);
4852+ spin_unlock_irqrestore(&ioapic_lock, flags);
4853+ nr_ioapic_registers[apic] = reg_01.bits.entries+1;
4854+ }
4855+ for(apic = 0; apic < nr_ioapics; apic++) {
4856+ int pin;
4857+ /* See if any of the pins is in ExtINT mode */
4858+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
4859+ struct IO_APIC_route_entry entry;
4860+ spin_lock_irqsave(&ioapic_lock, flags);
4861+ *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
4862+ *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
4863+ spin_unlock_irqrestore(&ioapic_lock, flags);
4864+
4865+
4866+ /* If the interrupt line is enabled and in ExtInt mode
4867+ * I have found the pin where the i8259 is connected.
4868+ */
4869+ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
4870+ ioapic_i8259.apic = apic;
4871+ ioapic_i8259.pin = pin;
4872+ goto found_i8259;
4873+ }
4874+ }
4875+ }
4876+ found_i8259:
4877+	/* Look to see if the MP table has reported the ExtINT */
4878+	/* If we could not find the appropriate pin by looking at the ioapic,
4879+	 * the i8259 probably is not connected to the ioapic, but give the
4880+ * mptable a chance anyway.
4881+ */
4882+ i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
4883+ i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
4884+ /* Trust the MP table if nothing is setup in the hardware */
4885+ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
4886+ printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
4887+ ioapic_i8259.pin = i8259_pin;
4888+ ioapic_i8259.apic = i8259_apic;
4889+ }
4890+ /* Complain if the MP table and the hardware disagree */
4891+ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
4892+ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
4893+ {
4894+ printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
4895+ }
4896+
4897+ /*
4898+ * Do not trust the IO-APIC being empty at bootup
4899+ */
4900+ clear_IO_APIC();
4901+}
4902+
4903+/*
4904+ * Not an __init, needed by the reboot code
4905+ */
4906+void disable_IO_APIC(void)
4907+{
4908+ /*
4909+ * Clear the IO-APIC before rebooting:
4910+ */
4911+ clear_IO_APIC();
4912+
4913+#ifndef CONFIG_XEN
4914+ /*
4915+	 * If the i8259 is routed through an IOAPIC,
4916+	 * put that IOAPIC in virtual wire mode
4917+ * so legacy interrupts can be delivered.
4918+ */
4919+ if (ioapic_i8259.pin != -1) {
4920+ struct IO_APIC_route_entry entry;
4921+ unsigned long flags;
4922+
4923+ memset(&entry, 0, sizeof(entry));
4924+ entry.mask = 0; /* Enabled */
4925+ entry.trigger = 0; /* Edge */
4926+ entry.irr = 0;
4927+ entry.polarity = 0; /* High */
4928+ entry.delivery_status = 0;
4929+ entry.dest_mode = 0; /* Physical */
4930+ entry.delivery_mode = dest_ExtINT; /* ExtInt */
4931+ entry.vector = 0;
4932+ entry.dest.physical.physical_dest =
4933+ GET_APIC_ID(apic_read(APIC_ID));
4934+
4935+ /*
4936+ * Add it to the IO-APIC irq-routing table:
4937+ */
4938+ spin_lock_irqsave(&ioapic_lock, flags);
4939+ io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
4940+ *(((int *)&entry)+1));
4941+ io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
4942+ *(((int *)&entry)+0));
4943+ spin_unlock_irqrestore(&ioapic_lock, flags);
4944+ }
4945+ disconnect_bsp_APIC(ioapic_i8259.pin != -1);
4946+#endif
4947+}
4948+
4949+/*
4950+ * function to set the IO-APIC physical IDs based on the
4951+ * values stored in the MPC table.
4952+ *
4953+ * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
4954+ */
4955+
4956+#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
4957+static void __init setup_ioapic_ids_from_mpc(void)
4958+{
4959+ union IO_APIC_reg_00 reg_00;
4960+ physid_mask_t phys_id_present_map;
4961+ int apic;
4962+ int i;
4963+ unsigned char old_id;
4964+ unsigned long flags;
4965+
4966+ /*
4967+ * Don't check I/O APIC IDs for xAPIC systems. They have
4968+ * no meaning without the serial APIC bus.
4969+ */
4970+ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
4971+ || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
4972+ return;
4973+ /*
4974+ * This is broken; anything with a real cpu count has to
4975+ * circumvent this idiocy regardless.
4976+ */
4977+ phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
4978+
4979+ /*
4980+ * Set the IOAPIC ID to the value stored in the MPC table.
4981+ */
4982+ for (apic = 0; apic < nr_ioapics; apic++) {
4983+
4984+ /* Read the register 0 value */
4985+ spin_lock_irqsave(&ioapic_lock, flags);
4986+ reg_00.raw = io_apic_read(apic, 0);
4987+ spin_unlock_irqrestore(&ioapic_lock, flags);
4988+
4989+ old_id = mp_ioapics[apic].mpc_apicid;
4990+
4991+ if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
4992+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
4993+ apic, mp_ioapics[apic].mpc_apicid);
4994+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
4995+ reg_00.bits.ID);
4996+ mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
4997+ }
4998+
4999+ /*
5000+ * Sanity check, is the ID really free? Every APIC in a
5001+ * system must have a unique ID or we get lots of nice
5002+ * 'stuck on smp_invalidate_needed IPI wait' messages.
5003+ */
5004+ if (check_apicid_used(phys_id_present_map,
5005+ mp_ioapics[apic].mpc_apicid)) {
5006+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
5007+ apic, mp_ioapics[apic].mpc_apicid);
5008+ for (i = 0; i < get_physical_broadcast(); i++)
5009+ if (!physid_isset(i, phys_id_present_map))
5010+ break;
5011+ if (i >= get_physical_broadcast())
5012+ panic("Max APIC ID exceeded!\n");
5013+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
5014+ i);
5015+ physid_set(i, phys_id_present_map);
5016+ mp_ioapics[apic].mpc_apicid = i;
5017+ } else {
5018+ physid_mask_t tmp;
5019+ tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
5020+ apic_printk(APIC_VERBOSE, "Setting %d in the "
5021+ "phys_id_present_map\n",
5022+ mp_ioapics[apic].mpc_apicid);
5023+ physids_or(phys_id_present_map, phys_id_present_map, tmp);
5024+ }
5025+
5026+
5027+ /*
5028+ * We need to adjust the IRQ routing table
5029+ * if the ID changed.
5030+ */
5031+ if (old_id != mp_ioapics[apic].mpc_apicid)
5032+ for (i = 0; i < mp_irq_entries; i++)
5033+ if (mp_irqs[i].mpc_dstapic == old_id)
5034+ mp_irqs[i].mpc_dstapic
5035+ = mp_ioapics[apic].mpc_apicid;
5036+
5037+ /*
5038+ * Read the right value from the MPC table and
5039+ * write it into the ID register.
5040+ */
5041+ apic_printk(APIC_VERBOSE, KERN_INFO
5042+ "...changing IO-APIC physical APIC ID to %d ...",
5043+ mp_ioapics[apic].mpc_apicid);
5044+
5045+ reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
5046+ spin_lock_irqsave(&ioapic_lock, flags);
5047+ io_apic_write(apic, 0, reg_00.raw);
5048+ spin_unlock_irqrestore(&ioapic_lock, flags);
5049+
5050+ /*
5051+ * Sanity check
5052+ */
5053+ spin_lock_irqsave(&ioapic_lock, flags);
5054+ reg_00.raw = io_apic_read(apic, 0);
5055+ spin_unlock_irqrestore(&ioapic_lock, flags);
5056+ if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
5057+ printk("could not set ID!\n");
5058+ else
5059+ apic_printk(APIC_VERBOSE, " ok.\n");
5060+ }
5061+}
5062+#else
5063+static void __init setup_ioapic_ids_from_mpc(void) { }
5064+#endif
5065+
5066+#ifndef CONFIG_XEN
5067+/*
5068+ * There is a nasty bug in some older SMP boards, their mptable lies
5069+ * about the timer IRQ. We do the following to work around the situation:
5070+ *
5071+ * - timer IRQ defaults to IO-APIC IRQ
5072+ * - if this function detects that timer IRQs are defunct, then we fall
5073+ * back to ISA timer IRQs
5074+ */
5075+static int __init timer_irq_works(void)
5076+{
5077+ unsigned long t1 = jiffies;
5078+
5079+ local_irq_enable();
5080+ /* Let ten ticks pass... */
5081+ mdelay((10 * 1000) / HZ);
5082+
5083+ /*
5084+ * Expect a few ticks at least, to be sure some possible
5085+	 * glue logic does not lock up after the first one or two
5086+ * ticks in a non-ExtINT mode. Also the local APIC
5087+ * might have cached one ExtINT interrupt. Finally, at
5088+ * least one tick may be lost due to delays.
5089+ */
5090+ if (jiffies - t1 > 4)
5091+ return 1;
5092+
5093+ return 0;
5094+}
5095+
5096+/*
5097+ * In the SMP+IOAPIC case it might happen that there are an unspecified
5098+ * number of pending IRQ events left unhandled. These cases are very rare,
5099+ * so we 'resend' these IRQs via IPIs to the same CPU. It's much
5100+ * better to do it this way because we then do not have to be aware of
5101+ * 'pending' interrupts in the IRQ path, except at this point.
5102+ */
5103+/*
5104+ * Edge-triggered interrupts need to resend any interrupt
5105+ * that was delayed, but this is now handled in the device-
5106+ * independent code.
5107+ */
5108+
5109+/*
5110+ * Starting up an edge-triggered IO-APIC interrupt is
5111+ * nasty - we need to make sure that we get the edge.
5112+ * If it is already asserted for some reason, we need to
5113+ * return 1 to indicate that it was pending.
5114+ *
5115+ * This is not complete - we should be able to fake
5116+ * an edge even if it isn't on the 8259A...
5117+ */
5118+static unsigned int startup_edge_ioapic_irq(unsigned int irq)
5119+{
5120+ int was_pending = 0;
5121+ unsigned long flags;
5122+
5123+ spin_lock_irqsave(&ioapic_lock, flags);
5124+ if (irq < 16) {
5125+ disable_8259A_irq(irq);
5126+ if (i8259A_irq_pending(irq))
5127+ was_pending = 1;
5128+ }
5129+ __unmask_IO_APIC_irq(irq);
5130+ spin_unlock_irqrestore(&ioapic_lock, flags);
5131+
5132+ return was_pending;
5133+}
5134+
5135+/*
5136+ * Once we have recorded IRQ_PENDING, we can mask the
5137+ * interrupt for real. This prevents IRQ storms from unhandled
5138+ * devices.
5139+ */
5140+static void ack_edge_ioapic_irq(unsigned int irq)
5141+{
5142+ move_irq(irq);
5143+ if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
5144+ == (IRQ_PENDING | IRQ_DISABLED))
5145+ mask_IO_APIC_irq(irq);
5146+ ack_APIC_irq();
5147+}
5148+
5149+/*
5150+ * Level triggered interrupts can just be masked,
5151+ * and shutting down and starting up the interrupt
5152+ * is the same as enabling and disabling them -- except
5153+ * that startup needs to return a "was pending" value.
5154+ *
5155+ * Level triggered interrupts are special because we
5156+ * do not touch any IO-APIC register while handling
5157+ * them. We ack the APIC in the end-IRQ handler, not
5158+ * in the start-IRQ-handler. Protection against reentrance
5159+ * from the same interrupt is still provided, both by the
5160+ * generic IRQ layer and by the fact that an unacked local
5161+ * APIC does not accept IRQs.
5162+ */
5163+static unsigned int startup_level_ioapic_irq (unsigned int irq)
5164+{
5165+ unmask_IO_APIC_irq(irq);
5166+
5167+ return 0; /* don't check for pending */
5168+}
5169+
5170+static void end_level_ioapic_irq (unsigned int irq)
5171+{
5172+ unsigned long v;
5173+ int i;
5174+
5175+ move_irq(irq);
5176+/*
5177+ * It appears there is an erratum which affects at least version 0x11
5178+ * of I/O APIC (that's the 82093AA and cores integrated into various
5179+ * chipsets). Under certain conditions a level-triggered interrupt is
5180+ * erroneously delivered as edge-triggered one but the respective IRR
5181+ * bit gets set nevertheless. As a result the I/O unit expects an EOI
5182+ * message but it will never arrive and further interrupts are blocked
5183+ * from the source. The exact reason is so far unknown, but the
5184+ * phenomenon was observed when two consecutive interrupt requests
5185+ * from a given source get delivered to the same CPU and the source is
5186+ * temporarily disabled in between.
5187+ *
5188+ * A workaround is to simulate an EOI message manually. We achieve it
5189+ * by setting the trigger mode to edge and then to level when the edge
5190+ * trigger mode gets detected in the TMR of a local APIC for a
5191+ * level-triggered interrupt. We mask the source for the time of the
5192+ * operation to prevent an edge-triggered interrupt escaping meanwhile.
5193+ * The idea is from Manfred Spraul. --macro
5194+ */
5195+ i = IO_APIC_VECTOR(irq);
5196+
5197+ v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
5198+
5199+ ack_APIC_irq();
5200+
5201+ if (!(v & (1 << (i & 0x1f)))) {
5202+ atomic_inc(&irq_mis_count);
5203+ spin_lock(&ioapic_lock);
5204+ __mask_and_edge_IO_APIC_irq(irq);
5205+ __unmask_and_level_IO_APIC_irq(irq);
5206+ spin_unlock(&ioapic_lock);
5207+ }
5208+}
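
The APIC_TMR + ((i & ~0x1f) >> 1) expression above encodes how the 256 TMR bits are spread across eight 32-bit registers spaced 0x10 bytes apart: vector i lives at byte offset (i / 32) * 0x10, bit (i & 0x1f). A stand-alone sketch of that index arithmetic with an invented vector:

/* Illustration only: the vector number is made up. */
#include <stdio.h>

int main(void)
{
	unsigned int i = 0x31;                   /* hypothetical vector (49) */
	unsigned int offset = (i & ~0x1f) >> 1;  /* (49/32) * 0x10 = 0x10 */
	unsigned int bit = i & 0x1f;             /* 49 % 32 = 17 */

	printf("vector 0x%02x -> TMR register offset 0x%02x, bit %u\n",
	       i, offset, bit);
	return 0;
}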
5209+
5210+#ifdef CONFIG_PCI_MSI
5211+static unsigned int startup_edge_ioapic_vector(unsigned int vector)
5212+{
5213+ int irq = vector_to_irq(vector);
5214+
5215+ return startup_edge_ioapic_irq(irq);
5216+}
5217+
5218+static void ack_edge_ioapic_vector(unsigned int vector)
5219+{
5220+ int irq = vector_to_irq(vector);
5221+
5222+ move_native_irq(vector);
5223+ ack_edge_ioapic_irq(irq);
5224+}
5225+
5226+static unsigned int startup_level_ioapic_vector (unsigned int vector)
5227+{
5228+ int irq = vector_to_irq(vector);
5229+
5230+ return startup_level_ioapic_irq (irq);
5231+}
5232+
5233+static void end_level_ioapic_vector (unsigned int vector)
5234+{
5235+ int irq = vector_to_irq(vector);
5236+
5237+ move_native_irq(vector);
5238+ end_level_ioapic_irq(irq);
5239+}
5240+
5241+static void mask_IO_APIC_vector (unsigned int vector)
5242+{
5243+ int irq = vector_to_irq(vector);
5244+
5245+ mask_IO_APIC_irq(irq);
5246+}
5247+
5248+static void unmask_IO_APIC_vector (unsigned int vector)
5249+{
5250+ int irq = vector_to_irq(vector);
5251+
5252+ unmask_IO_APIC_irq(irq);
5253+}
5254+
5255+#ifdef CONFIG_SMP
5256+static void set_ioapic_affinity_vector (unsigned int vector,
5257+ cpumask_t cpu_mask)
5258+{
5259+ int irq = vector_to_irq(vector);
5260+
5261+ set_native_irq_info(vector, cpu_mask);
5262+ set_ioapic_affinity_irq(irq, cpu_mask);
5263+}
5264+#endif
5265+#endif
5266+
5267+static int ioapic_retrigger(unsigned int irq)
5268+{
5269+ send_IPI_self(IO_APIC_VECTOR(irq));
5270+
5271+ return 1;
5272+}
5273+
5274+/*
5275+ * Level and edge triggered IO-APIC interrupts need different handling,
5276+ * so we use two separate IRQ descriptors. Edge triggered IRQs can be
5277+ * handled with the level-triggered descriptor, but that one has slightly
5278+ * more overhead. Level-triggered interrupts cannot be handled with the
5279+ * edge-triggered handler, without risking IRQ storms and other ugly
5280+ * races.
5281+ */
5282+static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
5283+ .typename = "IO-APIC-edge",
5284+ .startup = startup_edge_ioapic,
5285+ .shutdown = shutdown_edge_ioapic,
5286+ .enable = enable_edge_ioapic,
5287+ .disable = disable_edge_ioapic,
5288+ .ack = ack_edge_ioapic,
5289+ .end = end_edge_ioapic,
5290+#ifdef CONFIG_SMP
5291+ .set_affinity = set_ioapic_affinity,
5292+#endif
5293+ .retrigger = ioapic_retrigger,
5294+};
5295+
5296+static struct hw_interrupt_type ioapic_level_type __read_mostly = {
5297+ .typename = "IO-APIC-level",
5298+ .startup = startup_level_ioapic,
5299+ .shutdown = shutdown_level_ioapic,
5300+ .enable = enable_level_ioapic,
5301+ .disable = disable_level_ioapic,
5302+ .ack = mask_and_ack_level_ioapic,
5303+ .end = end_level_ioapic,
5304+#ifdef CONFIG_SMP
5305+ .set_affinity = set_ioapic_affinity,
5306+#endif
5307+ .retrigger = ioapic_retrigger,
5308+};
5309+#endif /* !CONFIG_XEN */
5310+
5311+static inline void init_IO_APIC_traps(void)
5312+{
5313+ int irq;
5314+
5315+ /*
5316+ * NOTE! The local APIC isn't very good at handling
5317+ * multiple interrupts at the same interrupt level.
5318+ * As the interrupt level is determined by taking the
5319+ * vector number and shifting that right by 4, we
5320+ * want to spread these out a bit so that they don't
5321+ * all fall in the same interrupt level.
5322+ *
5323+ * Also, we've got to be careful not to trash gate
5324+ * 0x80, because int 0x80 is hm, kind of importantish. ;)
5325+ */
5326+ for (irq = 0; irq < NR_IRQS ; irq++) {
5327+ int tmp = irq;
5328+ if (use_pci_vector()) {
5329+ if (!platform_legacy_irq(tmp))
5330+ if ((tmp = vector_to_irq(tmp)) == -1)
5331+ continue;
5332+ }
5333+ if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
5334+ /*
5335+ * Hmm.. We don't have an entry for this,
5336+ * so default to an old-fashioned 8259
5337+ * interrupt if we can..
5338+ */
5339+ if (irq < 16)
5340+ make_8259A_irq(irq);
5341+#ifndef CONFIG_XEN
5342+ else
5343+ /* Strange. Oh, well.. */
5344+ irq_desc[irq].chip = &no_irq_type;
5345+#endif
5346+ }
5347+ }
5348+}
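
The comment in init_IO_APIC_traps() notes that the local APIC derives an interrupt's priority level from its vector by shifting right by 4, which is why vectors are spread out rather than packed densely. A stand-alone sketch of that mapping with invented vectors:

/* Illustration only: vector values are made up. */
#include <stdio.h>

int main(void)
{
	unsigned int vectors[] = { 0x31, 0x39, 0x41, 0x80 };
	unsigned int k;

	for (k = 0; k < sizeof(vectors) / sizeof(vectors[0]); k++)
		printf("vector 0x%02x -> level %u\n", vectors[k], vectors[k] >> 4);
	/* 0x31 and 0x39 share level 3; 0x80 (the int 0x80 gate) sits in level 8 */
	return 0;
}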
5349+
5350+#ifndef CONFIG_XEN
5351+static void enable_lapic_irq (unsigned int irq)
5352+{
5353+ unsigned long v;
5354+
5355+ v = apic_read(APIC_LVT0);
5356+ apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
5357+}
5358+
5359+static void disable_lapic_irq (unsigned int irq)
5360+{
5361+ unsigned long v;
5362+
5363+ v = apic_read(APIC_LVT0);
5364+ apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
5365+}
5366+
5367+static void ack_lapic_irq (unsigned int irq)
5368+{
5369+ ack_APIC_irq();
5370+}
5371+
5372+static void end_lapic_irq (unsigned int i) { /* nothing */ }
5373+
5374+static struct hw_interrupt_type lapic_irq_type __read_mostly = {
5375+ .typename = "local-APIC-edge",
5376+ .startup = NULL, /* startup_irq() not used for IRQ0 */
5377+ .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
5378+ .enable = enable_lapic_irq,
5379+ .disable = disable_lapic_irq,
5380+ .ack = ack_lapic_irq,
5381+ .end = end_lapic_irq
5382+};
5383+
5384+static void setup_nmi (void)
5385+{
5386+ /*
5387+ * Dirty trick to enable the NMI watchdog ...
5388+ * We put the 8259A master into AEOI mode and
5389+	 * unmask LVT0 as NMI on all local APICs.
5390+ *
5391+ * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
5392+ * is from Maciej W. Rozycki - so we do not have to EOI from
5393+ * the NMI handler or the timer interrupt.
5394+ */
5395+ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
5396+
5397+ on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
5398+
5399+ apic_printk(APIC_VERBOSE, " done.\n");
5400+}
5401+
5402+/*
5403+ * This looks a bit hackish but it's about the only way of sending
5404+ * a few INTA cycles to 8259As and any associated glue logic. ICR does
5405+ * not support the ExtINT mode, unfortunately. We need to send these
5406+ * cycles as some i82489DX-based boards have glue logic that keeps the
5407+ * 8259A interrupt line asserted until INTA. --macro
5408+ */
5409+static inline void unlock_ExtINT_logic(void)
5410+{
5411+ int apic, pin, i;
5412+ struct IO_APIC_route_entry entry0, entry1;
5413+ unsigned char save_control, save_freq_select;
5414+ unsigned long flags;
5415+
5416+ pin = find_isa_irq_pin(8, mp_INT);
5417+ apic = find_isa_irq_apic(8, mp_INT);
5418+ if (pin == -1)
5419+ return;
5420+
5421+ spin_lock_irqsave(&ioapic_lock, flags);
5422+ *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
5423+ *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
5424+ spin_unlock_irqrestore(&ioapic_lock, flags);
5425+ clear_IO_APIC_pin(apic, pin);
5426+
5427+ memset(&entry1, 0, sizeof(entry1));
5428+
5429+ entry1.dest_mode = 0; /* physical delivery */
5430+ entry1.mask = 0; /* unmask IRQ now */
5431+ entry1.dest.physical.physical_dest = hard_smp_processor_id();
5432+ entry1.delivery_mode = dest_ExtINT;
5433+ entry1.polarity = entry0.polarity;
5434+ entry1.trigger = 0;
5435+ entry1.vector = 0;
5436+
5437+ spin_lock_irqsave(&ioapic_lock, flags);
5438+ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
5439+ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
5440+ spin_unlock_irqrestore(&ioapic_lock, flags);
5441+
5442+ save_control = CMOS_READ(RTC_CONTROL);
5443+ save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
5444+ CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
5445+ RTC_FREQ_SELECT);
5446+ CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
5447+
5448+ i = 100;
5449+ while (i-- > 0) {
5450+ mdelay(10);
5451+ if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
5452+ i -= 10;
5453+ }
5454+
5455+ CMOS_WRITE(save_control, RTC_CONTROL);
5456+ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
5457+ clear_IO_APIC_pin(apic, pin);
5458+
5459+ spin_lock_irqsave(&ioapic_lock, flags);
5460+ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
5461+ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
5462+ spin_unlock_irqrestore(&ioapic_lock, flags);
5463+}
5464+
5465+int timer_uses_ioapic_pin_0;
5466+
5467+/*
5468+ * This code may look a bit paranoid, but it's supposed to cooperate with
5469+ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
5470+ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
5471+ * fanatically on his truly buggy board.
5472+ */
5473+static inline void check_timer(void)
5474+{
5475+ int apic1, pin1, apic2, pin2;
5476+ int vector;
5477+
5478+ /*
5479+ * get/set the timer IRQ vector:
5480+ */
5481+ disable_8259A_irq(0);
5482+ vector = assign_irq_vector(0);
5483+ set_intr_gate(vector, interrupt[0]);
5484+
5485+ /*
5486+ * Subtle, code in do_timer_interrupt() expects an AEOI
5487+ * mode for the 8259A whenever interrupts are routed
5488+ * through I/O APICs. Also IRQ0 has to be enabled in
5489+ * the 8259A which implies the virtual wire has to be
5490+ * disabled in the local APIC.
5491+ */
5492+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
5493+ init_8259A(1);
5494+ timer_ack = 1;
5495+ if (timer_over_8254 > 0)
5496+ enable_8259A_irq(0);
5497+
5498+ pin1 = find_isa_irq_pin(0, mp_INT);
5499+ apic1 = find_isa_irq_apic(0, mp_INT);
5500+ pin2 = ioapic_i8259.pin;
5501+ apic2 = ioapic_i8259.apic;
5502+
5503+ if (pin1 == 0)
5504+ timer_uses_ioapic_pin_0 = 1;
5505+
5506+ printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
5507+ vector, apic1, pin1, apic2, pin2);
5508+
5509+ if (pin1 != -1) {
5510+ /*
5511+ * Ok, does IRQ0 through the IOAPIC work?
5512+ */
5513+ unmask_IO_APIC_irq(0);
5514+ if (timer_irq_works()) {
5515+ if (nmi_watchdog == NMI_IO_APIC) {
5516+ disable_8259A_irq(0);
5517+ setup_nmi();
5518+ enable_8259A_irq(0);
5519+ }
5520+ if (disable_timer_pin_1 > 0)
5521+ clear_IO_APIC_pin(0, pin1);
5522+ return;
5523+ }
5524+ clear_IO_APIC_pin(apic1, pin1);
5525+ printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
5526+ "IO-APIC\n");
5527+ }
5528+
5529+ printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
5530+ if (pin2 != -1) {
5531+ printk("\n..... (found pin %d) ...", pin2);
5532+ /*
5533+ * legacy devices should be connected to IO APIC #0
5534+ */
5535+ setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
5536+ if (timer_irq_works()) {
5537+ printk("works.\n");
5538+ if (pin1 != -1)
5539+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
5540+ else
5541+ add_pin_to_irq(0, apic2, pin2);
5542+ if (nmi_watchdog == NMI_IO_APIC) {
5543+ setup_nmi();
5544+ }
5545+ return;
5546+ }
5547+ /*
5548+ * Cleanup, just in case ...
5549+ */
5550+ clear_IO_APIC_pin(apic2, pin2);
5551+ }
5552+ printk(" failed.\n");
5553+
5554+ if (nmi_watchdog == NMI_IO_APIC) {
5555+ printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
5556+ nmi_watchdog = 0;
5557+ }
5558+
5559+ printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
5560+
5561+ disable_8259A_irq(0);
5562+ irq_desc[0].chip = &lapic_irq_type;
5563+ apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
5564+ enable_8259A_irq(0);
5565+
5566+ if (timer_irq_works()) {
5567+ printk(" works.\n");
5568+ return;
5569+ }
5570+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
5571+ printk(" failed.\n");
5572+
5573+ printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
5574+
5575+ timer_ack = 0;
5576+ init_8259A(0);
5577+ make_8259A_irq(0);
5578+ apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
5579+
5580+ unlock_ExtINT_logic();
5581+
5582+ if (timer_irq_works()) {
5583+ printk(" works.\n");
5584+ return;
5585+ }
5586+ printk(" failed :(.\n");
5587+ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
5588+ "report. Then try booting with the 'noapic' option");
5589+}
5590+#else
5591+int timer_uses_ioapic_pin_0 = 0;
5592+#define check_timer() ((void)0)
5593+#endif
5594+
5595+/*
5596+ *
5597+ * IRQs that are handled by the PIC in the MPS IOAPIC case.
5598+ * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
5599+ * Linux doesn't really care, as it's not actually used
5600+ * for any interrupt handling anyway.
5601+ */
5602+#define PIC_IRQS (1 << PIC_CASCADE_IR)
5603+
5604+void __init setup_IO_APIC(void)
5605+{
5606+ enable_IO_APIC();
5607+
5608+ if (acpi_ioapic)
5609+ io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
5610+ else
5611+ io_apic_irqs = ~PIC_IRQS;
5612+
5613+ printk("ENABLING IO-APIC IRQs\n");
5614+
5615+ /*
5616+ * Set up IO-APIC IRQ routing.
5617+ */
5618+ if (!acpi_ioapic)
5619+ setup_ioapic_ids_from_mpc();
5620+#ifndef CONFIG_XEN
5621+ sync_Arb_IDs();
5622+#endif
5623+ setup_IO_APIC_irqs();
5624+ init_IO_APIC_traps();
5625+ check_timer();
5626+ if (!acpi_ioapic)
5627+ print_IO_APIC();
5628+}
5629+
5630+static int __init setup_disable_8254_timer(char *s)
5631+{
5632+ timer_over_8254 = -1;
5633+ return 1;
5634+}
5635+static int __init setup_enable_8254_timer(char *s)
5636+{
5637+ timer_over_8254 = 2;
5638+ return 1;
5639+}
5640+
5641+__setup("disable_8254_timer", setup_disable_8254_timer);
5642+__setup("enable_8254_timer", setup_enable_8254_timer);
5643+
5644+/*
5645+ * Called after all the initialization is done. If we didn't find any
5646+ * APIC bugs, then we can allow the modify fast path.
5647+ */
5648+
5649+static int __init io_apic_bug_finalize(void)
5650+{
5651+ if(sis_apic_bug == -1)
5652+ sis_apic_bug = 0;
5653+ if (is_initial_xendomain()) {
5654+ struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
5655+ op.u.platform_quirk.quirk_id = sis_apic_bug ?
5656+ QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL;
5657+ VOID(HYPERVISOR_platform_op(&op));
5658+ }
5659+ return 0;
5660+}
5661+
5662+late_initcall(io_apic_bug_finalize);
5663+
5664+struct sysfs_ioapic_data {
5665+ struct sys_device dev;
5666+ struct IO_APIC_route_entry entry[0];
5667+};
5668+static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
5669+
5670+static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
5671+{
5672+ struct IO_APIC_route_entry *entry;
5673+ struct sysfs_ioapic_data *data;
5674+ unsigned long flags;
5675+ int i;
5676+
5677+ data = container_of(dev, struct sysfs_ioapic_data, dev);
5678+ entry = data->entry;
5679+ spin_lock_irqsave(&ioapic_lock, flags);
5680+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
5681+ *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
5682+ *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
5683+ }
5684+ spin_unlock_irqrestore(&ioapic_lock, flags);
5685+
5686+ return 0;
5687+}
5688+
5689+static int ioapic_resume(struct sys_device *dev)
5690+{
5691+ struct IO_APIC_route_entry *entry;
5692+ struct sysfs_ioapic_data *data;
5693+ unsigned long flags;
5694+ union IO_APIC_reg_00 reg_00;
5695+ int i;
5696+
5697+ data = container_of(dev, struct sysfs_ioapic_data, dev);
5698+ entry = data->entry;
5699+
5700+ spin_lock_irqsave(&ioapic_lock, flags);
5701+ reg_00.raw = io_apic_read(dev->id, 0);
5702+ if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
5703+ reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
5704+ io_apic_write(dev->id, 0, reg_00.raw);
5705+ }
5706+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
5707+ io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
5708+ io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
5709+ }
5710+ spin_unlock_irqrestore(&ioapic_lock, flags);
5711+
5712+ return 0;
5713+}
5714+
5715+static struct sysdev_class ioapic_sysdev_class = {
5716+ set_kset_name("ioapic"),
5717+#ifndef CONFIG_XEN
5718+ .suspend = ioapic_suspend,
5719+ .resume = ioapic_resume,
5720+#endif
5721+};
5722+
5723+static int __init ioapic_init_sysfs(void)
5724+{
5725+ struct sys_device * dev;
5726+ int i, size, error = 0;
5727+
5728+ error = sysdev_class_register(&ioapic_sysdev_class);
5729+ if (error)
5730+ return error;
5731+
5732+ for (i = 0; i < nr_ioapics; i++ ) {
5733+ size = sizeof(struct sys_device) + nr_ioapic_registers[i]
5734+ * sizeof(struct IO_APIC_route_entry);
5735+ mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
5736+ if (!mp_ioapic_data[i]) {
5737+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
5738+ continue;
5739+ }
5740+ memset(mp_ioapic_data[i], 0, size);
5741+ dev = &mp_ioapic_data[i]->dev;
5742+ dev->id = i;
5743+ dev->cls = &ioapic_sysdev_class;
5744+ error = sysdev_register(dev);
5745+ if (error) {
5746+ kfree(mp_ioapic_data[i]);
5747+ mp_ioapic_data[i] = NULL;
5748+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
5749+ continue;
5750+ }
5751+ }
5752+
5753+ return 0;
5754+}
5755+
5756+device_initcall(ioapic_init_sysfs);
5757+
5758+/* --------------------------------------------------------------------------
5759+ ACPI-based IOAPIC Configuration
5760+ -------------------------------------------------------------------------- */
5761+
5762+#ifdef CONFIG_ACPI
5763+
5764+int __init io_apic_get_unique_id (int ioapic, int apic_id)
5765+{
5766+#ifndef CONFIG_XEN
5767+ union IO_APIC_reg_00 reg_00;
5768+ static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
5769+ physid_mask_t tmp;
5770+ unsigned long flags;
5771+ int i = 0;
5772+
5773+ /*
5774+ * The P4 platform supports up to 256 APIC IDs on two separate APIC
5775+ * buses (one for LAPICs, one for IOAPICs), where predecessors only
5776+ * supports up to 16 on one shared APIC bus.
5777+	 * support up to 16 on one shared APIC bus.
5778+ * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
5779+ * advantage of new APIC bus architecture.
5780+ */
5781+
5782+ if (physids_empty(apic_id_map))
5783+ apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
5784+
5785+ spin_lock_irqsave(&ioapic_lock, flags);
5786+ reg_00.raw = io_apic_read(ioapic, 0);
5787+ spin_unlock_irqrestore(&ioapic_lock, flags);
5788+
5789+ if (apic_id >= get_physical_broadcast()) {
5790+ printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
5791+ "%d\n", ioapic, apic_id, reg_00.bits.ID);
5792+ apic_id = reg_00.bits.ID;
5793+ }
5794+
5795+ /*
5796+ * Every APIC in a system must have a unique ID or we get lots of nice
5797+ * 'stuck on smp_invalidate_needed IPI wait' messages.
5798+ */
5799+ if (check_apicid_used(apic_id_map, apic_id)) {
5800+
5801+ for (i = 0; i < get_physical_broadcast(); i++) {
5802+ if (!check_apicid_used(apic_id_map, i))
5803+ break;
5804+ }
5805+
5806+ if (i == get_physical_broadcast())
5807+ panic("Max apic_id exceeded!\n");
5808+
5809+ printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
5810+ "trying %d\n", ioapic, apic_id, i);
5811+
5812+ apic_id = i;
5813+ }
5814+
5815+ tmp = apicid_to_cpu_present(apic_id);
5816+ physids_or(apic_id_map, apic_id_map, tmp);
5817+
5818+ if (reg_00.bits.ID != apic_id) {
5819+ reg_00.bits.ID = apic_id;
5820+
5821+ spin_lock_irqsave(&ioapic_lock, flags);
5822+ io_apic_write(ioapic, 0, reg_00.raw);
5823+ reg_00.raw = io_apic_read(ioapic, 0);
5824+ spin_unlock_irqrestore(&ioapic_lock, flags);
5825+
5826+ /* Sanity check */
5827+ if (reg_00.bits.ID != apic_id) {
5828+ printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
5829+ return -1;
5830+ }
5831+ }
5832+
5833+ apic_printk(APIC_VERBOSE, KERN_INFO
5834+ "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
5835+#endif /* !CONFIG_XEN */
5836+
5837+ return apic_id;
5838+}
5839+
5840+
5841+int __init io_apic_get_version (int ioapic)
5842+{
5843+ union IO_APIC_reg_01 reg_01;
5844+ unsigned long flags;
5845+
5846+ spin_lock_irqsave(&ioapic_lock, flags);
5847+ reg_01.raw = io_apic_read(ioapic, 1);
5848+ spin_unlock_irqrestore(&ioapic_lock, flags);
5849+
5850+ return reg_01.bits.version;
5851+}
5852+
5853+
5854+int __init io_apic_get_redir_entries (int ioapic)
5855+{
5856+ union IO_APIC_reg_01 reg_01;
5857+ unsigned long flags;
5858+
5859+ spin_lock_irqsave(&ioapic_lock, flags);
5860+ reg_01.raw = io_apic_read(ioapic, 1);
5861+ spin_unlock_irqrestore(&ioapic_lock, flags);
5862+
5863+ return reg_01.bits.entries;
5864+}
5865+
5866+
5867+int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
5868+{
5869+ struct IO_APIC_route_entry entry;
5870+ unsigned long flags;
5871+
5872+ if (!IO_APIC_IRQ(irq)) {
5873+ printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
5874+ ioapic);
5875+ return -EINVAL;
5876+ }
5877+
5878+ /*
5879+ * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
5880+ * Note that we mask (disable) IRQs now -- these get enabled when the
5881+ * corresponding device driver registers for this IRQ.
5882+ */
5883+
5884+ memset(&entry,0,sizeof(entry));
5885+
5886+ entry.delivery_mode = INT_DELIVERY_MODE;
5887+ entry.dest_mode = INT_DEST_MODE;
5888+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
5889+ entry.trigger = edge_level;
5890+ entry.polarity = active_high_low;
5891+ entry.mask = 1;
5892+
5893+ /*
5894+ * IRQs < 16 are already in the irq_2_pin[] map
5895+ */
5896+ if (irq >= 16)
5897+ add_pin_to_irq(irq, ioapic, pin);
5898+
5899+ entry.vector = assign_irq_vector(irq);
5900+
5901+ apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
5902+ "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
5903+ mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
5904+ edge_level, active_high_low);
5905+
5906+ ioapic_register_intr(irq, entry.vector, edge_level);
5907+
5908+ if (!ioapic && (irq < 16))
5909+ disable_8259A_irq(irq);
5910+
5911+ spin_lock_irqsave(&ioapic_lock, flags);
5912+ io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
5913+ io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
5914+ set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
5915+ spin_unlock_irqrestore(&ioapic_lock, flags);
5916+
5917+ return 0;
5918+}
5919+
5920+#endif /* CONFIG_ACPI */
5921Index: head-2008-11-25/arch/x86/kernel/ioport_32-xen.c
5922===================================================================
5923--- /dev/null 1970-01-01 00:00:00.000000000 +0000
5924+++ head-2008-11-25/arch/x86/kernel/ioport_32-xen.c 2008-01-28 12:24:19.000000000 +0100
5925@@ -0,0 +1,123 @@
5926+/*
5927+ * linux/arch/i386/kernel/ioport.c
5928+ *
5929+ * This contains the io-permission bitmap code - written by obz, with changes
5930+ * by Linus.
5931+ */
5932+
5933+#include <linux/sched.h>
5934+#include <linux/kernel.h>
5935+#include <linux/capability.h>
5936+#include <linux/errno.h>
5937+#include <linux/types.h>
5938+#include <linux/ioport.h>
5939+#include <linux/smp.h>
5940+#include <linux/smp_lock.h>
5941+#include <linux/stddef.h>
5942+#include <linux/slab.h>
5943+#include <linux/thread_info.h>
5944+#include <xen/interface/physdev.h>
5945+
5946+/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
5947+static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
5948+{
5949+ unsigned long mask;
5950+ unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
5951+ unsigned int low_index = base & (BITS_PER_LONG-1);
5952+ int length = low_index + extent;
5953+
5954+ if (low_index != 0) {
5955+ mask = (~0UL << low_index);
5956+ if (length < BITS_PER_LONG)
5957+ mask &= ~(~0UL << length);
5958+ if (new_value)
5959+ *bitmap_base++ |= mask;
5960+ else
5961+ *bitmap_base++ &= ~mask;
5962+ length -= BITS_PER_LONG;
5963+ }
5964+
5965+ mask = (new_value ? ~0UL : 0UL);
5966+ while (length >= BITS_PER_LONG) {
5967+ *bitmap_base++ = mask;
5968+ length -= BITS_PER_LONG;
5969+ }
5970+
5971+ if (length > 0) {
5972+ mask = ~(~0UL << length);
5973+ if (new_value)
5974+ *bitmap_base++ |= mask;
5975+ else
5976+ *bitmap_base++ &= ~mask;
5977+ }
5978+}
5979+
5980+
5981+/*
5982+ * this changes the io permissions bitmap in the current task.
5983+ */
5984+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
5985+{
5986+ struct thread_struct * t = &current->thread;
5987+ unsigned long *bitmap;
5988+ struct physdev_set_iobitmap set_iobitmap;
5989+
5990+ if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
5991+ return -EINVAL;
5992+ if (turn_on && !capable(CAP_SYS_RAWIO))
5993+ return -EPERM;
5994+
5995+ /*
5996+ * If it's the first ioperm() call in this thread's lifetime, set the
5997+ * IO bitmap up. ioperm() is much less timing critical than clone(),
5998+ * this is why we delay this operation until now:
5999+ */
6000+ if (!t->io_bitmap_ptr) {
6001+ bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
6002+ if (!bitmap)
6003+ return -ENOMEM;
6004+
6005+ memset(bitmap, 0xff, IO_BITMAP_BYTES);
6006+ t->io_bitmap_ptr = bitmap;
6007+ set_thread_flag(TIF_IO_BITMAP);
6008+
6009+ set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
6010+ set_iobitmap.nr_ports = IO_BITMAP_BITS;
6011+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
6012+ &set_iobitmap));
6013+ }
6014+
6015+ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
6016+
6017+ return 0;
6018+}
6019+
6020+/*
6021+ * sys_iopl has to be used when you want to access the IO ports
6022+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
6023+ * you'd need 8kB of bitmaps/process, which is a bit excessive.
6024+ *
6025+ * Here we just change the eflags value on the stack: we allow
6026+ * only the super-user to do it. This depends on the stack-layout
6027+ * on system-call entry - see also fork() and the signal handling
6028+ * code.
6029+ */
6030+
6031+asmlinkage long sys_iopl(unsigned long unused)
6032+{
6033+ volatile struct pt_regs * regs = (struct pt_regs *) &unused;
6034+ unsigned int level = regs->ebx;
6035+ struct thread_struct *t = &current->thread;
6036+ unsigned int old = (t->iopl >> 12) & 3;
6037+
6038+ if (level > 3)
6039+ return -EINVAL;
6040+ /* Trying to gain more privileges? */
6041+ if (level > old) {
6042+ if (!capable(CAP_SYS_RAWIO))
6043+ return -EPERM;
6044+ }
6045+ t->iopl = level << 12;
6046+ set_iopl_mask(t->iopl);
6047+ return 0;
6048+}
6049Index: head-2008-11-25/arch/x86/kernel/irq_32-xen.c
6050===================================================================
6051--- /dev/null 1970-01-01 00:00:00.000000000 +0000
6052+++ head-2008-11-25/arch/x86/kernel/irq_32-xen.c 2008-10-29 09:55:56.000000000 +0100
6053@@ -0,0 +1,324 @@
6054+/*
6055+ * linux/arch/i386/kernel/irq.c
6056+ *
6057+ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
6058+ *
6059+ * This file contains the lowest level x86-specific interrupt
6060+ * entry, irq-stacks and irq statistics code. All the remaining
6061+ * irq logic is done by the generic kernel/irq/ code and
6062+ * by the x86-specific irq controller code. (e.g. i8259.c and
6063+ * io_apic.c.)
6064+ */
6065+
6066+#include <asm/uaccess.h>
6067+#include <linux/module.h>
6068+#include <linux/seq_file.h>
6069+#include <linux/interrupt.h>
6070+#include <linux/kernel_stat.h>
6071+#include <linux/notifier.h>
6072+#include <linux/cpu.h>
6073+#include <linux/delay.h>
6074+
6075+DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
6076+EXPORT_PER_CPU_SYMBOL(irq_stat);
6077+
6078+#ifndef CONFIG_X86_LOCAL_APIC
6079+/*
6080+ * 'what should we do if we get a hw irq event on an illegal vector'.
6081+ * each architecture has to answer this themselves.
6082+ */
6083+void ack_bad_irq(unsigned int irq)
6084+{
6085+ printk("unexpected IRQ trap at vector %02x\n", irq);
6086+}
6087+#endif
6088+
6089+#ifdef CONFIG_4KSTACKS
6090+/*
6091+ * per-CPU IRQ handling contexts (thread information and stack)
6092+ */
6093+union irq_ctx {
6094+ struct thread_info tinfo;
6095+ u32 stack[THREAD_SIZE/sizeof(u32)];
6096+};
6097+
6098+static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
6099+static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
6100+#endif
6101+
6102+/*
6103+ * do_IRQ handles all normal device IRQ's (the special
6104+ * SMP cross-CPU interrupts have their own specific
6105+ * handlers).
6106+ */
6107+fastcall unsigned int do_IRQ(struct pt_regs *regs)
6108+{
6109+ /* high bit used in ret_from_ code */
6110+ int irq = ~regs->orig_eax;
6111+#ifdef CONFIG_4KSTACKS
6112+ union irq_ctx *curctx, *irqctx;
6113+ u32 *isp;
6114+#endif
6115+
6116+ if (unlikely((unsigned)irq >= NR_IRQS)) {
6117+ printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
6118+ __FUNCTION__, irq);
6119+ BUG();
6120+ }
6121+
6122+ /*irq_enter();*/
6123+#ifdef CONFIG_DEBUG_STACKOVERFLOW
6124+ /* Debugging check for stack overflow: is there less than 1KB free? */
6125+ {
6126+ long esp;
6127+
6128+ __asm__ __volatile__("andl %%esp,%0" :
6129+ "=r" (esp) : "0" (THREAD_SIZE - 1));
6130+ if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
6131+ printk("do_IRQ: stack overflow: %ld\n",
6132+ esp - sizeof(struct thread_info));
6133+ dump_stack();
6134+ }
6135+ }
6136+#endif
6137+
6138+#ifdef CONFIG_4KSTACKS
6139+
6140+ curctx = (union irq_ctx *) current_thread_info();
6141+ irqctx = hardirq_ctx[smp_processor_id()];
6142+
6143+ /*
6144+ * this is where we switch to the IRQ stack. However, if we are
6145+ * already using the IRQ stack (because we interrupted a hardirq
6146+ * handler) we can't do that and just have to keep using the
6147+ * current stack (which is the irq stack already after all)
6148+ */
6149+ if (curctx != irqctx) {
6150+ int arg1, arg2, ebx;
6151+
6152+ /* build the stack frame on the IRQ stack */
6153+ isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
6154+ irqctx->tinfo.task = curctx->tinfo.task;
6155+ irqctx->tinfo.previous_esp = current_stack_pointer;
6156+
6157+ /*
6158+ * Copy the softirq bits in preempt_count so that the
6159+ * softirq checks work in the hardirq context.
6160+ */
6161+ irqctx->tinfo.preempt_count =
6162+ (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
6163+ (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
6164+
6165+ asm volatile(
6166+ " xchgl %%ebx,%%esp \n"
6167+ " call __do_IRQ \n"
6168+ " movl %%ebx,%%esp \n"
6169+ : "=a" (arg1), "=d" (arg2), "=b" (ebx)
6170+ : "0" (irq), "1" (regs), "2" (isp)
6171+ : "memory", "cc", "ecx"
6172+ );
6173+ } else
6174+#endif
6175+ __do_IRQ(irq, regs);
6176+
6177+ /*irq_exit();*/
6178+
6179+ return 1;
6180+}
6181+
6182+#ifdef CONFIG_4KSTACKS
6183+
6184+/*
6185+ * These should really be __section__(".bss.page_aligned") as well, but
6186+ * gcc 3.0 and earlier don't handle that correctly.
6187+ */
6188+static char softirq_stack[NR_CPUS * THREAD_SIZE]
6189+ __attribute__((__aligned__(THREAD_SIZE)));
6190+
6191+static char hardirq_stack[NR_CPUS * THREAD_SIZE]
6192+ __attribute__((__aligned__(THREAD_SIZE)));
6193+
6194+/*
6195+ * allocate per-cpu stacks for hardirq and for softirq processing
6196+ */
6197+void irq_ctx_init(int cpu)
6198+{
6199+ union irq_ctx *irqctx;
6200+
6201+ if (hardirq_ctx[cpu])
6202+ return;
6203+
6204+ irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
6205+ irqctx->tinfo.task = NULL;
6206+ irqctx->tinfo.exec_domain = NULL;
6207+ irqctx->tinfo.cpu = cpu;
6208+ irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
6209+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
6210+
6211+ hardirq_ctx[cpu] = irqctx;
6212+
6213+ irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
6214+ irqctx->tinfo.task = NULL;
6215+ irqctx->tinfo.exec_domain = NULL;
6216+ irqctx->tinfo.cpu = cpu;
6217+ irqctx->tinfo.preempt_count = 0;
6218+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
6219+
6220+ softirq_ctx[cpu] = irqctx;
6221+
6222+ printk("CPU %u irqstacks, hard=%p soft=%p\n",
6223+ cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
6224+}
6225+
6226+void irq_ctx_exit(int cpu)
6227+{
6228+ hardirq_ctx[cpu] = NULL;
6229+}
6230+
6231+extern asmlinkage void __do_softirq(void);
6232+
6233+asmlinkage void do_softirq(void)
6234+{
6235+ unsigned long flags;
6236+ struct thread_info *curctx;
6237+ union irq_ctx *irqctx;
6238+ u32 *isp;
6239+
6240+ if (in_interrupt())
6241+ return;
6242+
6243+ local_irq_save(flags);
6244+
6245+ if (local_softirq_pending()) {
6246+ curctx = current_thread_info();
6247+ irqctx = softirq_ctx[smp_processor_id()];
6248+ irqctx->tinfo.task = curctx->task;
6249+ irqctx->tinfo.previous_esp = current_stack_pointer;
6250+
6251+ /* build the stack frame on the softirq stack */
6252+ isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
6253+
6254+ asm volatile(
6255+ " xchgl %%ebx,%%esp \n"
6256+ " call __do_softirq \n"
6257+ " movl %%ebx,%%esp \n"
6258+ : "=b"(isp)
6259+ : "0"(isp)
6260+ : "memory", "cc", "edx", "ecx", "eax"
6261+ );
6262+ /*
6263+		 * Shouldn't happen, we returned above if in_interrupt():
6264+ */
6265+ WARN_ON_ONCE(softirq_count());
6266+ }
6267+
6268+ local_irq_restore(flags);
6269+}
6270+
6271+EXPORT_SYMBOL(do_softirq);
6272+#endif
6273+
6274+/*
6275+ * Interrupt statistics:
6276+ */
6277+
6278+atomic_t irq_err_count;
6279+
6280+/*
6281+ * /proc/interrupts printing:
6282+ */
6283+
6284+int show_interrupts(struct seq_file *p, void *v)
6285+{
6286+ int i = *(loff_t *) v, j;
6287+ struct irqaction * action;
6288+ unsigned long flags;
6289+
6290+ if (i == 0) {
6291+ seq_printf(p, " ");
6292+ for_each_online_cpu(j)
6293+ seq_printf(p, "CPU%-8d",j);
6294+ seq_putc(p, '\n');
6295+ }
6296+
6297+ if (i < NR_IRQS) {
6298+ spin_lock_irqsave(&irq_desc[i].lock, flags);
6299+ action = irq_desc[i].action;
6300+ if (!action)
6301+ goto skip;
6302+ seq_printf(p, "%3d: ",i);
6303+#ifndef CONFIG_SMP
6304+ seq_printf(p, "%10u ", kstat_irqs(i));
6305+#else
6306+ for_each_online_cpu(j)
6307+ seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
6308+#endif
6309+ seq_printf(p, " %14s", irq_desc[i].chip->typename);
6310+ seq_printf(p, " %s", action->name);
6311+
6312+ for (action=action->next; action; action = action->next)
6313+ seq_printf(p, ", %s", action->name);
6314+
6315+ seq_putc(p, '\n');
6316+skip:
6317+ spin_unlock_irqrestore(&irq_desc[i].lock, flags);
6318+ } else if (i == NR_IRQS) {
6319+ seq_printf(p, "NMI: ");
6320+ for_each_online_cpu(j)
6321+ seq_printf(p, "%10u ", nmi_count(j));
6322+ seq_putc(p, '\n');
6323+#ifdef CONFIG_X86_LOCAL_APIC
6324+ seq_printf(p, "LOC: ");
6325+ for_each_online_cpu(j)
6326+ seq_printf(p, "%10u ",
6327+ per_cpu(irq_stat,j).apic_timer_irqs);
6328+ seq_putc(p, '\n');
6329+#endif
6330+ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
6331+#if defined(CONFIG_X86_IO_APIC)
6332+ seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
6333+#endif
6334+ }
6335+ return 0;
6336+}
6337+
6338+#ifdef CONFIG_HOTPLUG_CPU
6339+
6340+void fixup_irqs(cpumask_t map)
6341+{
6342+ unsigned int irq;
6343+ static int warned;
6344+
6345+ for (irq = 0; irq < NR_IRQS; irq++) {
6346+ cpumask_t mask;
6347+ if (irq == 2)
6348+ continue;
6349+
6350+ cpus_and(mask, irq_desc[irq].affinity, map);
6351+ if (any_online_cpu(mask) == NR_CPUS) {
6352+ /*printk("Breaking affinity for irq %i\n", irq);*/
6353+ mask = map;
6354+ }
6355+ if (irq_desc[irq].chip->set_affinity)
6356+ irq_desc[irq].chip->set_affinity(irq, mask);
6357+ else if (irq_desc[irq].action && !(warned++))
6358+ printk("Cannot set affinity for irq %i\n", irq);
6359+ }
6360+
6361+#if 0
6362+ barrier();
6363+ /* Ingo Molnar says: "after the IO-APIC masks have been redirected
6364+ [note the nop - the interrupt-enable boundary on x86 is two
6365+ instructions from sti] - to flush out pending hardirqs and
6366+ IPIs. After this point nothing is supposed to reach this CPU." */
6367+ __asm__ __volatile__("sti; nop; cli");
6368+ barrier();
6369+#else
6370+ /* That doesn't seem sufficient. Give it 1ms. */
6371+ local_irq_enable();
6372+ mdelay(1);
6373+ local_irq_disable();
6374+#endif
6375+}
6376+#endif
6377+
6378Index: head-2008-11-25/arch/x86/kernel/ldt_32-xen.c
6379===================================================================
6380--- /dev/null 1970-01-01 00:00:00.000000000 +0000
6381+++ head-2008-11-25/arch/x86/kernel/ldt_32-xen.c 2007-06-12 13:12:48.000000000 +0200
6382@@ -0,0 +1,270 @@
6383+/*
6384+ * linux/kernel/ldt.c
6385+ *
6386+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
6387+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
6388+ */
6389+
6390+#include <linux/errno.h>
6391+#include <linux/sched.h>
6392+#include <linux/string.h>
6393+#include <linux/mm.h>
6394+#include <linux/smp.h>
6395+#include <linux/smp_lock.h>
6396+#include <linux/vmalloc.h>
6397+#include <linux/slab.h>
6398+
6399+#include <asm/uaccess.h>
6400+#include <asm/system.h>
6401+#include <asm/ldt.h>
6402+#include <asm/desc.h>
6403+#include <asm/mmu_context.h>
6404+
6405+#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
6406+static void flush_ldt(void *null)
6407+{
6408+ if (current->active_mm)
6409+ load_LDT(&current->active_mm->context);
6410+}
6411+#endif
6412+
6413+static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
6414+{
6415+ void *oldldt;
6416+ void *newldt;
6417+ int oldsize;
6418+
6419+ if (mincount <= pc->size)
6420+ return 0;
6421+ oldsize = pc->size;
6422+ mincount = (mincount+511)&(~511);
6423+ if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
6424+ newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
6425+ else
6426+ newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
6427+
6428+ if (!newldt)
6429+ return -ENOMEM;
6430+
6431+ if (oldsize)
6432+ memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
6433+ oldldt = pc->ldt;
6434+ memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
6435+ pc->ldt = newldt;
6436+ wmb();
6437+ pc->size = mincount;
6438+ wmb();
6439+
6440+ if (reload) {
6441+#ifdef CONFIG_SMP
6442+ cpumask_t mask;
6443+ preempt_disable();
6444+#endif
6445+ make_pages_readonly(
6446+ pc->ldt,
6447+ (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
6448+ XENFEAT_writable_descriptor_tables);
6449+ load_LDT(pc);
6450+#ifdef CONFIG_SMP
6451+ mask = cpumask_of_cpu(smp_processor_id());
6452+ if (!cpus_equal(current->mm->cpu_vm_mask, mask))
6453+ smp_call_function(flush_ldt, NULL, 1, 1);
6454+ preempt_enable();
6455+#endif
6456+ }
6457+ if (oldsize) {
6458+ make_pages_writable(
6459+ oldldt,
6460+ (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
6461+ XENFEAT_writable_descriptor_tables);
6462+ if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
6463+ vfree(oldldt);
6464+ else
6465+ kfree(oldldt);
6466+ }
6467+ return 0;
6468+}
6469+
6470+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
6471+{
6472+ int err = alloc_ldt(new, old->size, 0);
6473+ if (err < 0)
6474+ return err;
6475+ memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
6476+ make_pages_readonly(
6477+ new->ldt,
6478+ (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
6479+ XENFEAT_writable_descriptor_tables);
6480+ return 0;
6481+}
6482+
6483+/*
6484+ * we do not have to muck with descriptors here, that is
6485+ * done in switch_mm() as needed.
6486+ */
6487+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
6488+{
6489+ struct mm_struct * old_mm;
6490+ int retval = 0;
6491+
6492+ init_MUTEX(&mm->context.sem);
6493+ mm->context.size = 0;
6494+ mm->context.has_foreign_mappings = 0;
6495+ old_mm = current->mm;
6496+ if (old_mm && old_mm->context.size > 0) {
6497+ down(&old_mm->context.sem);
6498+ retval = copy_ldt(&mm->context, &old_mm->context);
6499+ up(&old_mm->context.sem);
6500+ }
6501+ return retval;
6502+}
6503+
6504+/*
6505+ * No need to lock the MM as we are the last user
6506+ */
6507+void destroy_context(struct mm_struct *mm)
6508+{
6509+ if (mm->context.size) {
6510+ if (mm == current->active_mm)
6511+ clear_LDT();
6512+ make_pages_writable(
6513+ mm->context.ldt,
6514+ (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
6515+ XENFEAT_writable_descriptor_tables);
6516+ if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
6517+ vfree(mm->context.ldt);
6518+ else
6519+ kfree(mm->context.ldt);
6520+ mm->context.size = 0;
6521+ }
6522+}
6523+
6524+static int read_ldt(void __user * ptr, unsigned long bytecount)
6525+{
6526+ int err;
6527+ unsigned long size;
6528+ struct mm_struct * mm = current->mm;
6529+
6530+ if (!mm->context.size)
6531+ return 0;
6532+ if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
6533+ bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
6534+
6535+ down(&mm->context.sem);
6536+ size = mm->context.size*LDT_ENTRY_SIZE;
6537+ if (size > bytecount)
6538+ size = bytecount;
6539+
6540+ err = 0;
6541+ if (copy_to_user(ptr, mm->context.ldt, size))
6542+ err = -EFAULT;
6543+ up(&mm->context.sem);
6544+ if (err < 0)
6545+ goto error_return;
6546+ if (size != bytecount) {
6547+ /* zero-fill the rest */
6548+ if (clear_user(ptr+size, bytecount-size) != 0) {
6549+ err = -EFAULT;
6550+ goto error_return;
6551+ }
6552+ }
6553+ return bytecount;
6554+error_return:
6555+ return err;
6556+}
6557+
6558+static int read_default_ldt(void __user * ptr, unsigned long bytecount)
6559+{
6560+ int err;
6561+ unsigned long size;
6562+ void *address;
6563+
6564+ err = 0;
6565+ address = &default_ldt[0];
6566+ size = 5*sizeof(struct desc_struct);
6567+ if (size > bytecount)
6568+ size = bytecount;
6569+
6570+ err = size;
6571+ if (copy_to_user(ptr, address, size))
6572+ err = -EFAULT;
6573+
6574+ return err;
6575+}
6576+
6577+static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
6578+{
6579+ struct mm_struct * mm = current->mm;
6580+ __u32 entry_1, entry_2;
6581+ int error;
6582+ struct user_desc ldt_info;
6583+
6584+ error = -EINVAL;
6585+ if (bytecount != sizeof(ldt_info))
6586+ goto out;
6587+ error = -EFAULT;
6588+ if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
6589+ goto out;
6590+
6591+ error = -EINVAL;
6592+ if (ldt_info.entry_number >= LDT_ENTRIES)
6593+ goto out;
6594+ if (ldt_info.contents == 3) {
6595+ if (oldmode)
6596+ goto out;
6597+ if (ldt_info.seg_not_present == 0)
6598+ goto out;
6599+ }
6600+
6601+ down(&mm->context.sem);
6602+ if (ldt_info.entry_number >= mm->context.size) {
6603+ error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
6604+ if (error < 0)
6605+ goto out_unlock;
6606+ }
6607+
6608+ /* Allow LDTs to be cleared by the user. */
6609+ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
6610+ if (oldmode || LDT_empty(&ldt_info)) {
6611+ entry_1 = 0;
6612+ entry_2 = 0;
6613+ goto install;
6614+ }
6615+ }
6616+
6617+ entry_1 = LDT_entry_a(&ldt_info);
6618+ entry_2 = LDT_entry_b(&ldt_info);
6619+ if (oldmode)
6620+ entry_2 &= ~(1 << 20);
6621+
6622+ /* Install the new entry ... */
6623+install:
6624+ error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
6625+ entry_1, entry_2);
6626+
6627+out_unlock:
6628+ up(&mm->context.sem);
6629+out:
6630+ return error;
6631+}
6632+
6633+asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
6634+{
6635+ int ret = -ENOSYS;
6636+
6637+ switch (func) {
6638+ case 0:
6639+ ret = read_ldt(ptr, bytecount);
6640+ break;
6641+ case 1:
6642+ ret = write_ldt(ptr, bytecount, 1);
6643+ break;
6644+ case 2:
6645+ ret = read_default_ldt(ptr, bytecount);
6646+ break;
6647+ case 0x11:
6648+ ret = write_ldt(ptr, bytecount, 0);
6649+ break;
6650+ }
6651+ return ret;
6652+}
6653Index: head-2008-11-25/arch/x86/kernel/microcode-xen.c
6654===================================================================
6655--- /dev/null 1970-01-01 00:00:00.000000000 +0000
6656+++ head-2008-11-25/arch/x86/kernel/microcode-xen.c 2007-06-12 13:12:48.000000000 +0200
6657@@ -0,0 +1,144 @@
6658+/*
6659+ * Intel CPU Microcode Update Driver for Linux
6660+ *
6661+ * Copyright (C) 2000-2004 Tigran Aivazian
6662+ *
6663+ * This driver allows upgrading the microcode on Intel processors
6664+ * belonging to IA-32 family - PentiumPro, Pentium II,
6665+ * Pentium III, Xeon, Pentium 4, etc.
6666+ *
6667+ * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
6668+ * Order Number 245472 or free download from:
6669+ *
6670+ * http://developer.intel.com/design/pentium4/manuals/245472.htm
6671+ *
6672+ * For more information, go to http://www.urbanmyth.org/microcode
6673+ *
6674+ * This program is free software; you can redistribute it and/or
6675+ * modify it under the terms of the GNU General Public License
6676+ * as published by the Free Software Foundation; either version
6677+ * 2 of the License, or (at your option) any later version.
6678+ */
6679+
6680+//#define DEBUG /* pr_debug */
6681+#include <linux/capability.h>
6682+#include <linux/kernel.h>
6683+#include <linux/init.h>
6684+#include <linux/sched.h>
6685+#include <linux/cpumask.h>
6686+#include <linux/module.h>
6687+#include <linux/slab.h>
6688+#include <linux/vmalloc.h>
6689+#include <linux/miscdevice.h>
6690+#include <linux/spinlock.h>
6691+#include <linux/mm.h>
6692+#include <linux/mutex.h>
6693+#include <linux/syscalls.h>
6694+
6695+#include <asm/msr.h>
6696+#include <asm/uaccess.h>
6697+#include <asm/processor.h>
6698+
6699+MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
6700+MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
6701+MODULE_LICENSE("GPL");
6702+
6703+static int verbose;
6704+module_param(verbose, int, 0644);
6705+
6706+#define MICROCODE_VERSION "1.14a-xen"
6707+
6708+#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
6709+#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
6710+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
6711+
6712+/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
6713+static DEFINE_MUTEX(microcode_mutex);
6714+
6715+static int microcode_open (struct inode *unused1, struct file *unused2)
6716+{
6717+ return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
6718+}
6719+
6720+
6721+static int do_microcode_update (const void __user *ubuf, size_t len)
6722+{
6723+ int err;
6724+ void *kbuf;
6725+
6726+ kbuf = vmalloc(len);
6727+ if (!kbuf)
6728+ return -ENOMEM;
6729+
6730+ if (copy_from_user(kbuf, ubuf, len) == 0) {
6731+ struct xen_platform_op op;
6732+
6733+ op.cmd = XENPF_microcode_update;
6734+ set_xen_guest_handle(op.u.microcode.data, kbuf);
6735+ op.u.microcode.length = len;
6736+ err = HYPERVISOR_platform_op(&op);
6737+ } else
6738+ err = -EFAULT;
6739+
6740+ vfree(kbuf);
6741+
6742+ return err;
6743+}
6744+
6745+static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
6746+{
6747+ ssize_t ret;
6748+
6749+ if (len < MC_HEADER_SIZE) {
6750+ printk(KERN_ERR "microcode: not enough data\n");
6751+ return -EINVAL;
6752+ }
6753+
6754+ mutex_lock(&microcode_mutex);
6755+
6756+ ret = do_microcode_update(buf, len);
6757+ if (!ret)
6758+ ret = (ssize_t)len;
6759+
6760+ mutex_unlock(&microcode_mutex);
6761+
6762+ return ret;
6763+}
6764+
6765+static struct file_operations microcode_fops = {
6766+ .owner = THIS_MODULE,
6767+ .write = microcode_write,
6768+ .open = microcode_open,
6769+};
6770+
6771+static struct miscdevice microcode_dev = {
6772+ .minor = MICROCODE_MINOR,
6773+ .name = "microcode",
6774+ .fops = &microcode_fops,
6775+};
6776+
6777+static int __init microcode_init (void)
6778+{
6779+ int error;
6780+
6781+ error = misc_register(&microcode_dev);
6782+ if (error) {
6783+ printk(KERN_ERR
6784+ "microcode: can't misc_register on minor=%d\n",
6785+ MICROCODE_MINOR);
6786+ return error;
6787+ }
6788+
6789+ printk(KERN_INFO
6790+ "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
6791+ return 0;
6792+}
6793+
6794+static void __exit microcode_exit (void)
6795+{
6796+ misc_deregister(&microcode_dev);
6797+}
6798+
6799+module_init(microcode_init)
6800+module_exit(microcode_exit)
6801+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
6802Index: head-2008-11-25/arch/x86/kernel/mpparse_32-xen.c
6803===================================================================
6804--- /dev/null 1970-01-01 00:00:00.000000000 +0000
6805+++ head-2008-11-25/arch/x86/kernel/mpparse_32-xen.c 2007-06-12 13:12:48.000000000 +0200
6806@@ -0,0 +1,1185 @@
6807+/*
6808+ * Intel Multiprocessor Specification 1.1 and 1.4
6809+ * compliant MP-table parsing routines.
6810+ *
6811+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
6812+ * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6813+ *
6814+ * Fixes
6815+ * Erich Boleyn : MP v1.4 and additional changes.
6816+ * Alan Cox : Added EBDA scanning
6817+ * Ingo Molnar : various cleanups and rewrites
6818+ * Maciej W. Rozycki: Bits for default MP configurations
6819+ * Paul Diefenbaugh: Added full ACPI support
6820+ */
6821+
6822+#include <linux/mm.h>
6823+#include <linux/init.h>
6824+#include <linux/acpi.h>
6825+#include <linux/delay.h>
6826+#include <linux/bootmem.h>
6827+#include <linux/smp_lock.h>
6828+#include <linux/kernel_stat.h>
6829+#include <linux/mc146818rtc.h>
6830+#include <linux/bitops.h>
6831+
6832+#include <asm/smp.h>
6833+#include <asm/acpi.h>
6834+#include <asm/mtrr.h>
6835+#include <asm/mpspec.h>
6836+#include <asm/io_apic.h>
6837+
6838+#include <mach_apic.h>
6839+#include <mach_mpparse.h>
6840+#include <bios_ebda.h>
6841+
6842+/* Have we found an MP table */
6843+int smp_found_config;
6844+unsigned int __initdata maxcpus = NR_CPUS;
6845+
6846+/*
6847+ * Various Linux-internal data structures created from the
6848+ * MP-table.
6849+ */
6850+int apic_version [MAX_APICS];
6851+int mp_bus_id_to_type [MAX_MP_BUSSES];
6852+int mp_bus_id_to_node [MAX_MP_BUSSES];
6853+int mp_bus_id_to_local [MAX_MP_BUSSES];
6854+int quad_local_to_mp_bus_id [NR_CPUS/4][4];
6855+int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
6856+static int mp_current_pci_id;
6857+
6858+/* I/O APIC entries */
6859+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
6860+
6861+/* # of MP IRQ source entries */
6862+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6863+
6864+/* MP IRQ source entries */
6865+int mp_irq_entries;
6866+
6867+int nr_ioapics;
6868+
6869+int pic_mode;
6870+unsigned long mp_lapic_addr;
6871+
6872+unsigned int def_to_bigsmp = 0;
6873+
6874+/* Processor that is doing the boot up */
6875+unsigned int boot_cpu_physical_apicid = -1U;
6876+/* Internal processor count */
6877+static unsigned int __devinitdata num_processors;
6878+
6879+/* Bitmask of physically existing CPUs */
6880+physid_mask_t phys_cpu_present_map;
6881+
6882+u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
6883+
6884+/*
6885+ * Intel MP BIOS table parsing routines:
6886+ */
6887+
6888+
6889+/*
6890+ * Checksum an MP configuration block.
6891+ */
6892+
6893+static int __init mpf_checksum(unsigned char *mp, int len)
6894+{
6895+ int sum = 0;
6896+
6897+ while (len--)
6898+ sum += *mp++;
6899+
6900+ return sum & 0xFF;
6901+}
6902+
6903+/*
6904+ * Have to match translation table entries to main table entries by counter
6905+ * hence the mpc_record variable .... can't see a less disgusting way of
6906+ * doing this ....
6907+ */
6908+
6909+static int mpc_record;
6910+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;
6911+
6912+#ifndef CONFIG_XEN
6913+static void __devinit MP_processor_info (struct mpc_config_processor *m)
6914+{
6915+ int ver, apicid;
6916+ physid_mask_t phys_cpu;
6917+
6918+ if (!(m->mpc_cpuflag & CPU_ENABLED))
6919+ return;
6920+
6921+ apicid = mpc_apic_id(m, translation_table[mpc_record]);
6922+
6923+ if (m->mpc_featureflag&(1<<0))
6924+ Dprintk(" Floating point unit present.\n");
6925+ if (m->mpc_featureflag&(1<<7))
6926+ Dprintk(" Machine Exception supported.\n");
6927+ if (m->mpc_featureflag&(1<<8))
6928+ Dprintk(" 64 bit compare & exchange supported.\n");
6929+ if (m->mpc_featureflag&(1<<9))
6930+ Dprintk(" Internal APIC present.\n");
6931+ if (m->mpc_featureflag&(1<<11))
6932+ Dprintk(" SEP present.\n");
6933+ if (m->mpc_featureflag&(1<<12))
6934+ Dprintk(" MTRR present.\n");
6935+ if (m->mpc_featureflag&(1<<13))
6936+ Dprintk(" PGE present.\n");
6937+ if (m->mpc_featureflag&(1<<14))
6938+ Dprintk(" MCA present.\n");
6939+ if (m->mpc_featureflag&(1<<15))
6940+ Dprintk(" CMOV present.\n");
6941+ if (m->mpc_featureflag&(1<<16))
6942+ Dprintk(" PAT present.\n");
6943+ if (m->mpc_featureflag&(1<<17))
6944+ Dprintk(" PSE present.\n");
6945+ if (m->mpc_featureflag&(1<<18))
6946+ Dprintk(" PSN present.\n");
6947+ if (m->mpc_featureflag&(1<<19))
6948+ Dprintk(" Cache Line Flush Instruction present.\n");
6949+ /* 20 Reserved */
6950+ if (m->mpc_featureflag&(1<<21))
6951+ Dprintk(" Debug Trace and EMON Store present.\n");
6952+ if (m->mpc_featureflag&(1<<22))
6953+ Dprintk(" ACPI Thermal Throttle Registers present.\n");
6954+ if (m->mpc_featureflag&(1<<23))
6955+ Dprintk(" MMX present.\n");
6956+ if (m->mpc_featureflag&(1<<24))
6957+ Dprintk(" FXSR present.\n");
6958+ if (m->mpc_featureflag&(1<<25))
6959+ Dprintk(" XMM present.\n");
6960+ if (m->mpc_featureflag&(1<<26))
6961+ Dprintk(" Willamette New Instructions present.\n");
6962+ if (m->mpc_featureflag&(1<<27))
6963+ Dprintk(" Self Snoop present.\n");
6964+ if (m->mpc_featureflag&(1<<28))
6965+ Dprintk(" HT present.\n");
6966+ if (m->mpc_featureflag&(1<<29))
6967+ Dprintk(" Thermal Monitor present.\n");
6968+ /* 30, 31 Reserved */
6969+
6970+
6971+ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
6972+ Dprintk(" Bootup CPU\n");
6973+ boot_cpu_physical_apicid = m->mpc_apicid;
6974+ }
6975+
6976+ ver = m->mpc_apicver;
6977+
6978+ /*
6979+ * Validate version
6980+ */
6981+ if (ver == 0x0) {
6982+ printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
6983+ "fixing up to 0x10. (tell your hw vendor)\n",
6984+ m->mpc_apicid);
6985+ ver = 0x10;
6986+ }
6987+ apic_version[m->mpc_apicid] = ver;
6988+
6989+ phys_cpu = apicid_to_cpu_present(apicid);
6990+ physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
6991+
6992+ if (num_processors >= NR_CPUS) {
6993+ printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
6994+ " Processor ignored.\n", NR_CPUS);
6995+ return;
6996+ }
6997+
6998+ if (num_processors >= maxcpus) {
6999+ printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
7000+ " Processor ignored.\n", maxcpus);
7001+ return;
7002+ }
7003+
7004+ cpu_set(num_processors, cpu_possible_map);
7005+ num_processors++;
7006+
7007+ /*
7008+ * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
7009+ * but we need to work other dependencies like SMP_SUSPEND etc
7010+ * before this can be done without some confusion.
7011+ * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
7012+ * - Ashok Raj <ashok.raj@intel.com>
7013+ */
7014+ if (num_processors > 8) {
7015+ switch (boot_cpu_data.x86_vendor) {
7016+ case X86_VENDOR_INTEL:
7017+ if (!APIC_XAPIC(ver)) {
7018+ def_to_bigsmp = 0;
7019+ break;
7020+ }
7021+ /* If P4 and above fall through */
7022+ case X86_VENDOR_AMD:
7023+ def_to_bigsmp = 1;
7024+ }
7025+ }
7026+ bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
7027+}
7028+#else
7029+void __init MP_processor_info (struct mpc_config_processor *m)
7030+{
7031+ num_processors++;
7032+}
7033+#endif /* CONFIG_XEN */
7034+
7035+static void __init MP_bus_info (struct mpc_config_bus *m)
7036+{
7037+ char str[7];
7038+
7039+ memcpy(str, m->mpc_bustype, 6);
7040+ str[6] = 0;
7041+
7042+ mpc_oem_bus_info(m, str, translation_table[mpc_record]);
7043+
7044+ if (m->mpc_busid >= MAX_MP_BUSSES) {
7045+ printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
7046+ " is too large, max. supported is %d\n",
7047+ m->mpc_busid, str, MAX_MP_BUSSES - 1);
7048+ return;
7049+ }
7050+
7051+ if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
7052+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
7053+ } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
7054+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
7055+ } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
7056+ mpc_oem_pci_bus(m, translation_table[mpc_record]);
7057+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
7058+ mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
7059+ mp_current_pci_id++;
7060+ } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
7061+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
7062+ } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
7063+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
7064+ } else {
7065+ printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
7066+ }
7067+}
7068+
7069+static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
7070+{
7071+ if (!(m->mpc_flags & MPC_APIC_USABLE))
7072+ return;
7073+
7074+ printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
7075+ m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
7076+ if (nr_ioapics >= MAX_IO_APICS) {
7077+ printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
7078+ MAX_IO_APICS, nr_ioapics);
7079+ panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
7080+ }
7081+ if (!m->mpc_apicaddr) {
7082+ printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
7083+ " found in MP table, skipping!\n");
7084+ return;
7085+ }
7086+ mp_ioapics[nr_ioapics] = *m;
7087+ nr_ioapics++;
7088+}
7089+
7090+static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
7091+{
7092+ mp_irqs [mp_irq_entries] = *m;
7093+ Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
7094+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
7095+ m->mpc_irqtype, m->mpc_irqflag & 3,
7096+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
7097+ m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
7098+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
7099+ panic("Max # of irq sources exceeded!!\n");
7100+}
7101+
7102+static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
7103+{
7104+ Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
7105+ " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
7106+ m->mpc_irqtype, m->mpc_irqflag & 3,
7107+ (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
7108+ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
7109+ /*
7110+ * Well it seems all SMP boards in existence
7111+ * use ExtINT/LVT1 == LINT0 and
7112+ * NMI/LVT2 == LINT1 - the following check
7113+	 * will show us if this assumption is false.
7114+ * Until then we do not have to add baggage.
7115+ */
7116+ if ((m->mpc_irqtype == mp_ExtINT) &&
7117+ (m->mpc_destapiclint != 0))
7118+ BUG();
7119+ if ((m->mpc_irqtype == mp_NMI) &&
7120+ (m->mpc_destapiclint != 1))
7121+ BUG();
7122+}
7123+
7124+#ifdef CONFIG_X86_NUMAQ
7125+static void __init MP_translation_info (struct mpc_config_translation *m)
7126+{
7127+ printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
7128+
7129+ if (mpc_record >= MAX_MPC_ENTRY)
7130+ printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
7131+ else
7132+ translation_table[mpc_record] = m; /* stash this for later */
7133+ if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
7134+ node_set_online(m->trans_quad);
7135+}
7136+
7137+/*
7138+ * Read/parse the MPC oem tables
7139+ */
7140+
7141+static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
7142+ unsigned short oemsize)
7143+{
7144+ int count = sizeof (*oemtable); /* the header size */
7145+ unsigned char *oemptr = ((unsigned char *)oemtable)+count;
7146+
7147+ mpc_record = 0;
7148+ printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
7149+ if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
7150+ {
7151+ printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
7152+ oemtable->oem_signature[0],
7153+ oemtable->oem_signature[1],
7154+ oemtable->oem_signature[2],
7155+ oemtable->oem_signature[3]);
7156+ return;
7157+ }
7158+ if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
7159+ {
7160+ printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
7161+ return;
7162+ }
7163+ while (count < oemtable->oem_length) {
7164+ switch (*oemptr) {
7165+ case MP_TRANSLATION:
7166+ {
7167+ struct mpc_config_translation *m=
7168+ (struct mpc_config_translation *)oemptr;
7169+ MP_translation_info(m);
7170+ oemptr += sizeof(*m);
7171+ count += sizeof(*m);
7172+ ++mpc_record;
7173+ break;
7174+ }
7175+ default:
7176+ {
7177+ printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
7178+ return;
7179+ }
7180+ }
7181+ }
7182+}
7183+
7184+static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
7185+ char *productid)
7186+{
7187+ if (strncmp(oem, "IBM NUMA", 8))
7188+ printk("Warning! May not be a NUMA-Q system!\n");
7189+ if (mpc->mpc_oemptr)
7190+ smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
7191+ mpc->mpc_oemsize);
7192+}
7193+#endif /* CONFIG_X86_NUMAQ */
7194+
7195+/*
7196+ * Read/parse the MPC
7197+ */
7198+
7199+static int __init smp_read_mpc(struct mp_config_table *mpc)
7200+{
7201+ char str[16];
7202+ char oem[10];
7203+ int count=sizeof(*mpc);
7204+ unsigned char *mpt=((unsigned char *)mpc)+count;
7205+
7206+ if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
7207+ printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
7208+ *(u32 *)mpc->mpc_signature);
7209+ return 0;
7210+ }
7211+ if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
7212+ printk(KERN_ERR "SMP mptable: checksum error!\n");
7213+ return 0;
7214+ }
7215+ if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
7216+ printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
7217+ mpc->mpc_spec);
7218+ return 0;
7219+ }
7220+ if (!mpc->mpc_lapic) {
7221+ printk(KERN_ERR "SMP mptable: null local APIC address!\n");
7222+ return 0;
7223+ }
7224+ memcpy(oem,mpc->mpc_oem,8);
7225+ oem[8]=0;
7226+ printk(KERN_INFO "OEM ID: %s ",oem);
7227+
7228+ memcpy(str,mpc->mpc_productid,12);
7229+ str[12]=0;
7230+ printk("Product ID: %s ",str);
7231+
7232+ mps_oem_check(mpc, oem, str);
7233+
7234+ printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
7235+
7236+ /*
7237+ * Save the local APIC address (it might be non-default) -- but only
7238+ * if we're not using ACPI.
7239+ */
7240+ if (!acpi_lapic)
7241+ mp_lapic_addr = mpc->mpc_lapic;
7242+
7243+ /*
7244+ * Now process the configuration blocks.
7245+ */
7246+ mpc_record = 0;
7247+ while (count < mpc->mpc_length) {
7248+ switch(*mpt) {
7249+ case MP_PROCESSOR:
7250+ {
7251+ struct mpc_config_processor *m=
7252+ (struct mpc_config_processor *)mpt;
7253+ /* ACPI may have already provided this data */
7254+ if (!acpi_lapic)
7255+ MP_processor_info(m);
7256+ mpt += sizeof(*m);
7257+ count += sizeof(*m);
7258+ break;
7259+ }
7260+ case MP_BUS:
7261+ {
7262+ struct mpc_config_bus *m=
7263+ (struct mpc_config_bus *)mpt;
7264+ MP_bus_info(m);
7265+ mpt += sizeof(*m);
7266+ count += sizeof(*m);
7267+ break;
7268+ }
7269+ case MP_IOAPIC:
7270+ {
7271+ struct mpc_config_ioapic *m=
7272+ (struct mpc_config_ioapic *)mpt;
7273+ MP_ioapic_info(m);
7274+ mpt+=sizeof(*m);
7275+ count+=sizeof(*m);
7276+ break;
7277+ }
7278+ case MP_INTSRC:
7279+ {
7280+ struct mpc_config_intsrc *m=
7281+ (struct mpc_config_intsrc *)mpt;
7282+
7283+ MP_intsrc_info(m);
7284+ mpt+=sizeof(*m);
7285+ count+=sizeof(*m);
7286+ break;
7287+ }
7288+ case MP_LINTSRC:
7289+ {
7290+ struct mpc_config_lintsrc *m=
7291+ (struct mpc_config_lintsrc *)mpt;
7292+ MP_lintsrc_info(m);
7293+ mpt+=sizeof(*m);
7294+ count+=sizeof(*m);
7295+ break;
7296+ }
7297+ default:
7298+ {
7299+ count = mpc->mpc_length;
7300+ break;
7301+ }
7302+ }
7303+ ++mpc_record;
7304+ }
7305+ clustered_apic_check();
7306+ if (!num_processors)
7307+ printk(KERN_ERR "SMP mptable: no processors registered!\n");
7308+ return num_processors;
7309+}
7310+
7311+static int __init ELCR_trigger(unsigned int irq)
7312+{
7313+ unsigned int port;
7314+
7315+ port = 0x4d0 + (irq >> 3);
7316+ return (inb(port) >> (irq & 7)) & 1;
7317+}
7318+
7319+static void __init construct_default_ioirq_mptable(int mpc_default_type)
7320+{
7321+ struct mpc_config_intsrc intsrc;
7322+ int i;
7323+ int ELCR_fallback = 0;
7324+
7325+ intsrc.mpc_type = MP_INTSRC;
7326+ intsrc.mpc_irqflag = 0; /* conforming */
7327+ intsrc.mpc_srcbus = 0;
7328+ intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
7329+
7330+ intsrc.mpc_irqtype = mp_INT;
7331+
7332+ /*
7333+ * If true, we have an ISA/PCI system with no IRQ entries
7334+ * in the MP table. To prevent the PCI interrupts from being set up
7335+ * incorrectly, we try to use the ELCR. The sanity check to see if
7336+ * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
7337+ * never be level sensitive, so we simply see if the ELCR agrees.
7338+ * If it does, we assume it's valid.
7339+ */
7340+ if (mpc_default_type == 5) {
7341+ printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
7342+
7343+ if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
7344+ printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
7345+ else {
7346+ printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
7347+ ELCR_fallback = 1;
7348+ }
7349+ }
7350+
7351+ for (i = 0; i < 16; i++) {
7352+ switch (mpc_default_type) {
7353+ case 2:
7354+ if (i == 0 || i == 13)
7355+ continue; /* IRQ0 & IRQ13 not connected */
7356+ /* fall through */
7357+ default:
7358+ if (i == 2)
7359+ continue; /* IRQ2 is never connected */
7360+ }
7361+
7362+ if (ELCR_fallback) {
7363+ /*
7364+ * If the ELCR indicates a level-sensitive interrupt, we
7365+ * copy that information over to the MP table in the
7366+ * irqflag field (level sensitive, active high polarity).
7367+ */
7368+ if (ELCR_trigger(i))
7369+ intsrc.mpc_irqflag = 13;
7370+ else
7371+ intsrc.mpc_irqflag = 0;
7372+ }
7373+
7374+ intsrc.mpc_srcbusirq = i;
7375+ intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
7376+ MP_intsrc_info(&intsrc);
7377+ }
7378+
7379+ intsrc.mpc_irqtype = mp_ExtINT;
7380+ intsrc.mpc_srcbusirq = 0;
7381+ intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
7382+ MP_intsrc_info(&intsrc);
7383+}
7384+
7385+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
7386+{
7387+ struct mpc_config_processor processor;
7388+ struct mpc_config_bus bus;
7389+ struct mpc_config_ioapic ioapic;
7390+ struct mpc_config_lintsrc lintsrc;
7391+ int linttypes[2] = { mp_ExtINT, mp_NMI };
7392+ int i;
7393+
7394+ /*
7395+ * local APIC has default address
7396+ */
7397+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
7398+
7399+ /*
7400+ * 2 CPUs, numbered 0 & 1.
7401+ */
7402+ processor.mpc_type = MP_PROCESSOR;
7403+ /* Either an integrated APIC or a discrete 82489DX. */
7404+ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
7405+ processor.mpc_cpuflag = CPU_ENABLED;
7406+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
7407+ (boot_cpu_data.x86_model << 4) |
7408+ boot_cpu_data.x86_mask;
7409+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
7410+ processor.mpc_reserved[0] = 0;
7411+ processor.mpc_reserved[1] = 0;
7412+ for (i = 0; i < 2; i++) {
7413+ processor.mpc_apicid = i;
7414+ MP_processor_info(&processor);
7415+ }
7416+
7417+ bus.mpc_type = MP_BUS;
7418+ bus.mpc_busid = 0;
7419+ switch (mpc_default_type) {
7420+ default:
7421+ printk("???\n");
7422+ printk(KERN_ERR "Unknown standard configuration %d\n",
7423+ mpc_default_type);
7424+ /* fall through */
7425+ case 1:
7426+ case 5:
7427+ memcpy(bus.mpc_bustype, "ISA ", 6);
7428+ break;
7429+ case 2:
7430+ case 6:
7431+ case 3:
7432+ memcpy(bus.mpc_bustype, "EISA ", 6);
7433+ break;
7434+ case 4:
7435+ case 7:
7436+ memcpy(bus.mpc_bustype, "MCA ", 6);
7437+ }
7438+ MP_bus_info(&bus);
7439+ if (mpc_default_type > 4) {
7440+ bus.mpc_busid = 1;
7441+ memcpy(bus.mpc_bustype, "PCI ", 6);
7442+ MP_bus_info(&bus);
7443+ }
7444+
7445+ ioapic.mpc_type = MP_IOAPIC;
7446+ ioapic.mpc_apicid = 2;
7447+ ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
7448+ ioapic.mpc_flags = MPC_APIC_USABLE;
7449+ ioapic.mpc_apicaddr = 0xFEC00000;
7450+ MP_ioapic_info(&ioapic);
7451+
7452+ /*
7453+ * We set up most of the low 16 IO-APIC pins according to MPS rules.
7454+ */
7455+ construct_default_ioirq_mptable(mpc_default_type);
7456+
7457+ lintsrc.mpc_type = MP_LINTSRC;
7458+ lintsrc.mpc_irqflag = 0; /* conforming */
7459+ lintsrc.mpc_srcbusid = 0;
7460+ lintsrc.mpc_srcbusirq = 0;
7461+ lintsrc.mpc_destapic = MP_APIC_ALL;
7462+ for (i = 0; i < 2; i++) {
7463+ lintsrc.mpc_irqtype = linttypes[i];
7464+ lintsrc.mpc_destapiclint = i;
7465+ MP_lintsrc_info(&lintsrc);
7466+ }
7467+}
7468+
7469+static struct intel_mp_floating *mpf_found;
7470+
7471+/*
7472+ * Scan the memory blocks for an SMP configuration block.
7473+ */
7474+void __init get_smp_config (void)
7475+{
7476+ struct intel_mp_floating *mpf = mpf_found;
7477+
7478+ /*
7479+ * ACPI supports both logical (e.g. Hyper-Threading) and physical
7480+ * processors, where MPS only supports physical.
7481+ */
7482+ if (acpi_lapic && acpi_ioapic) {
7483+ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
7484+ return;
7485+ }
7486+ else if (acpi_lapic)
7487+ printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
7488+
7489+ printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
7490+ if (mpf->mpf_feature2 & (1<<7)) {
7491+ printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
7492+ pic_mode = 1;
7493+ } else {
7494+ printk(KERN_INFO " Virtual Wire compatibility mode.\n");
7495+ pic_mode = 0;
7496+ }
7497+
7498+ /*
7499+ * Now see if we need to read further.
7500+ */
7501+ if (mpf->mpf_feature1 != 0) {
7502+
7503+ printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
7504+ construct_default_ISA_mptable(mpf->mpf_feature1);
7505+
7506+ } else if (mpf->mpf_physptr) {
7507+
7508+ /*
7509+ * Read the physical hardware table. Anything here will
7510+ * override the defaults.
7511+ */
7512+ if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
7513+ smp_found_config = 0;
7514+ printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
7515+ printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
7516+ return;
7517+ }
7518+ /*
7519+ * If there are no explicit MP IRQ entries, then we are
7520+ * broken. We set up most of the low 16 IO-APIC pins to
7521+ * ISA defaults and hope it will work.
7522+ */
7523+ if (!mp_irq_entries) {
7524+ struct mpc_config_bus bus;
7525+
7526+ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
7527+
7528+ bus.mpc_type = MP_BUS;
7529+ bus.mpc_busid = 0;
7530+ memcpy(bus.mpc_bustype, "ISA ", 6);
7531+ MP_bus_info(&bus);
7532+
7533+ construct_default_ioirq_mptable(0);
7534+ }
7535+
7536+ } else
7537+ BUG();
7538+
7539+ printk(KERN_INFO "Processors: %d\n", num_processors);
7540+ /*
7541+ * Only use the first configuration found.
7542+ */
7543+}
7544+
7545+static int __init smp_scan_config (unsigned long base, unsigned long length)
7546+{
7547+ unsigned long *bp = isa_bus_to_virt(base);
7548+ struct intel_mp_floating *mpf;
7549+
7550+ Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
7551+ if (sizeof(*mpf) != 16)
7552+ printk("Error: MPF size\n");
7553+
7554+ while (length > 0) {
7555+ mpf = (struct intel_mp_floating *)bp;
7556+ if ((*bp == SMP_MAGIC_IDENT) &&
7557+ (mpf->mpf_length == 1) &&
7558+ !mpf_checksum((unsigned char *)bp, 16) &&
7559+ ((mpf->mpf_specification == 1)
7560+ || (mpf->mpf_specification == 4)) ) {
7561+
7562+ smp_found_config = 1;
7563+#ifndef CONFIG_XEN
7564+ printk(KERN_INFO "found SMP MP-table at %08lx\n",
7565+ virt_to_phys(mpf));
7566+ reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
7567+ if (mpf->mpf_physptr) {
7568+ /*
7569+ * We cannot access the MPC table to compute
7570+ * its size yet, as only a few megabytes from
7571+ * the bottom are mapped now.
7572+ * The PC-9800's MPC table is placed at the very
7573+ * end of physical memory, so simply reserving
7574+ * PAGE_SIZE from mpf->mpf_physptr would trigger
7575+ * a BUG() in reserve_bootmem.
7576+ */
7577+ unsigned long size = PAGE_SIZE;
7578+ unsigned long end = max_low_pfn * PAGE_SIZE;
7579+ if (mpf->mpf_physptr + size > end)
7580+ size = end - mpf->mpf_physptr;
7581+ reserve_bootmem(mpf->mpf_physptr, size);
7582+ }
7583+#else
7584+ printk(KERN_INFO "found SMP MP-table at %08lx\n",
7585+ ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
7586+#endif
7587+
7588+ mpf_found = mpf;
7589+ return 1;
7590+ }
7591+ bp += 4;
7592+ length -= 16;
7593+ }
7594+ return 0;
7595+}
7596+
7597+void __init find_smp_config (void)
7598+{
7599+#ifndef CONFIG_XEN
7600+ unsigned int address;
7601+#endif
7602+
7603+ /*
7604+ * FIXME: Linux assumes you have 640K of base ram..
7605+ * this continues the error...
7606+ *
7607+ * 1) Scan the bottom 1K for a signature
7608+ * 2) Scan the top 1K of base RAM
7609+ * 3) Scan the 64K of bios
7610+ */
7611+ if (smp_scan_config(0x0,0x400) ||
7612+ smp_scan_config(639*0x400,0x400) ||
7613+ smp_scan_config(0xF0000,0x10000))
7614+ return;
7615+ /*
7616+ * If it is an SMP machine we should know now, unless the
7617+ * configuration is in an EISA/MCA bus machine with an
7618+ * extended bios data area.
7619+ *
7620+ * there is a real-mode segmented pointer pointing to the
7621+ * 4K EBDA area at 0x40E, calculate and scan it here.
7622+ *
7623+ * NOTE! There are Linux loaders that will corrupt the EBDA
7624+ * area, and as such this kind of SMP config may be less
7625+ * trustworthy, simply because the SMP table may have been
7626+ * stomped on during early boot. These loaders are buggy and
7627+ * should be fixed.
7628+ *
7629+ * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
7630+ */
7631+
7632+#ifndef CONFIG_XEN
7633+ address = get_bios_ebda();
7634+ if (address)
7635+ smp_scan_config(address, 0x400);
7636+#endif
7637+}
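
For reference, get_bios_ebda() above resolves the real-mode segmented pointer stored at 0x40E into the physical base of the EBDA that smp_scan_config() is then pointed at. A minimal sketch of that calculation, assuming the BIOS data area can be reached through isa_bus_to_virt() as elsewhere in this file (the function name is illustrative, not part of the patch):

static unsigned long example_ebda_phys(void)
{
	/* The word at 0x40E in the BIOS data area holds the EBDA segment. */
	unsigned short segment = *(unsigned short *)isa_bus_to_virt(0x40E);

	/* Real-mode segment to physical address: shift left by four. */
	return (unsigned long)segment << 4;
}
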
7638+
7639+int es7000_plat;
7640+
7641+/* --------------------------------------------------------------------------
7642+ ACPI-based MP Configuration
7643+ -------------------------------------------------------------------------- */
7644+
7645+#ifdef CONFIG_ACPI
7646+
7647+void __init mp_register_lapic_address (
7648+ u64 address)
7649+{
7650+#ifndef CONFIG_XEN
7651+ mp_lapic_addr = (unsigned long) address;
7652+
7653+ set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
7654+
7655+ if (boot_cpu_physical_apicid == -1U)
7656+ boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
7657+
7658+ Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
7659+#endif
7660+}
7661+
7662+
7663+void __devinit mp_register_lapic (
7664+ u8 id,
7665+ u8 enabled)
7666+{
7667+ struct mpc_config_processor processor;
7668+ int boot_cpu = 0;
7669+
7670+ if (MAX_APICS - id <= 0) {
7671+ printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
7672+ id, MAX_APICS);
7673+ return;
7674+ }
7675+
7676+ if (id == boot_cpu_physical_apicid)
7677+ boot_cpu = 1;
7678+
7679+#ifndef CONFIG_XEN
7680+ processor.mpc_type = MP_PROCESSOR;
7681+ processor.mpc_apicid = id;
7682+ processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
7683+ processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
7684+ processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
7685+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
7686+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
7687+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
7688+ processor.mpc_reserved[0] = 0;
7689+ processor.mpc_reserved[1] = 0;
7690+#endif
7691+
7692+ MP_processor_info(&processor);
7693+}
7694+
7695+#ifdef CONFIG_X86_IO_APIC
7696+
7697+#define MP_ISA_BUS 0
7698+#define MP_MAX_IOAPIC_PIN 127
7699+
7700+static struct mp_ioapic_routing {
7701+ int apic_id;
7702+ int gsi_base;
7703+ int gsi_end;
7704+ u32 pin_programmed[4];
7705+} mp_ioapic_routing[MAX_IO_APICS];
7706+
7707+
7708+static int mp_find_ioapic (
7709+ int gsi)
7710+{
7711+ int i = 0;
7712+
7713+ /* Find the IOAPIC that manages this GSI. */
7714+ for (i = 0; i < nr_ioapics; i++) {
7715+ if ((gsi >= mp_ioapic_routing[i].gsi_base)
7716+ && (gsi <= mp_ioapic_routing[i].gsi_end))
7717+ return i;
7718+ }
7719+
7720+ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
7721+
7722+ return -1;
7723+}
7724+
7725+
7726+void __init mp_register_ioapic (
7727+ u8 id,
7728+ u32 address,
7729+ u32 gsi_base)
7730+{
7731+ int idx = 0;
7732+ int tmpid;
7733+
7734+ if (nr_ioapics >= MAX_IO_APICS) {
7735+ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
7736+ "(found %d)\n", MAX_IO_APICS, nr_ioapics);
7737+ panic("Recompile kernel with bigger MAX_IO_APICS!\n");
7738+ }
7739+ if (!address) {
7740+ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
7741+ " found in MADT table, skipping!\n");
7742+ return;
7743+ }
7744+
7745+ idx = nr_ioapics++;
7746+
7747+ mp_ioapics[idx].mpc_type = MP_IOAPIC;
7748+ mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
7749+ mp_ioapics[idx].mpc_apicaddr = address;
7750+
7751+#ifndef CONFIG_XEN
7752+ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
7753+#endif
7754+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
7755+ && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
7756+ tmpid = io_apic_get_unique_id(idx, id);
7757+ else
7758+ tmpid = id;
7759+ if (tmpid == -1) {
7760+ nr_ioapics--;
7761+ return;
7762+ }
7763+ mp_ioapics[idx].mpc_apicid = tmpid;
7764+ mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
7765+
7766+ /*
7767+ * Build basic GSI lookup table to facilitate gsi->io_apic lookups
7768+ * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
7769+ */
7770+ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
7771+ mp_ioapic_routing[idx].gsi_base = gsi_base;
7772+ mp_ioapic_routing[idx].gsi_end = gsi_base +
7773+ io_apic_get_redir_entries(idx);
7774+
7775+ printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
7776+ "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
7777+ mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
7778+ mp_ioapic_routing[idx].gsi_base,
7779+ mp_ioapic_routing[idx].gsi_end);
7780+
7781+ return;
7782+}
7783+
7784+
7785+void __init mp_override_legacy_irq (
7786+ u8 bus_irq,
7787+ u8 polarity,
7788+ u8 trigger,
7789+ u32 gsi)
7790+{
7791+ struct mpc_config_intsrc intsrc;
7792+ int ioapic = -1;
7793+ int pin = -1;
7794+
7795+ /*
7796+ * Convert 'gsi' to 'ioapic.pin'.
7797+ */
7798+ ioapic = mp_find_ioapic(gsi);
7799+ if (ioapic < 0)
7800+ return;
7801+ pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
7802+
7803+ /*
7804+ * TBD: This check is for faulty timer entries, where the override
7805+ * erroneously sets the trigger to level, resulting in a HUGE
7806+ * increase of timer interrupts!
7807+ */
7808+ if ((bus_irq == 0) && (trigger == 3))
7809+ trigger = 1;
7810+
7811+ intsrc.mpc_type = MP_INTSRC;
7812+ intsrc.mpc_irqtype = mp_INT;
7813+ intsrc.mpc_irqflag = (trigger << 2) | polarity;
7814+ intsrc.mpc_srcbus = MP_ISA_BUS;
7815+ intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
7816+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
7817+ intsrc.mpc_dstirq = pin; /* INTIN# */
7818+
7819+ Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
7820+ intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
7821+ (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
7822+ intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
7823+
7824+ mp_irqs[mp_irq_entries] = intsrc;
7825+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
7826+ panic("Max # of irq sources exceeded!\n");
7827+
7828+ return;
7829+}
7830+
7831+void __init mp_config_acpi_legacy_irqs (void)
7832+{
7833+ struct mpc_config_intsrc intsrc;
7834+ int i = 0;
7835+ int ioapic = -1;
7836+
7837+ /*
7838+ * Fabricate the legacy ISA bus (bus #31).
7839+ */
7840+ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
7841+ Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
7842+
7843+ /*
7844+ * Older generations of ES7000 have no legacy identity mappings
7845+ */
7846+ if (es7000_plat == 1)
7847+ return;
7848+
7849+ /*
7850+ * Locate the IOAPIC that manages the ISA IRQs (0-15).
7851+ */
7852+ ioapic = mp_find_ioapic(0);
7853+ if (ioapic < 0)
7854+ return;
7855+
7856+ intsrc.mpc_type = MP_INTSRC;
7857+ intsrc.mpc_irqflag = 0; /* Conforming */
7858+ intsrc.mpc_srcbus = MP_ISA_BUS;
7859+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
7860+
7861+ /*
7862+ * Use the default configuration for IRQs 0-15, unless
7863+ * overridden by (MADT) interrupt source override entries.
7864+ */
7865+ for (i = 0; i < 16; i++) {
7866+ int idx;
7867+
7868+ for (idx = 0; idx < mp_irq_entries; idx++) {
7869+ struct mpc_config_intsrc *irq = mp_irqs + idx;
7870+
7871+ /* Do we already have a mapping for this ISA IRQ? */
7872+ if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
7873+ break;
7874+
7875+ /* Do we already have a mapping for this IOAPIC pin */
7876+ if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
7877+ (irq->mpc_dstirq == i))
7878+ break;
7879+ }
7880+
7881+ if (idx != mp_irq_entries) {
7882+ printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
7883+ continue; /* IRQ already used */
7884+ }
7885+
7886+ intsrc.mpc_irqtype = mp_INT;
7887+ intsrc.mpc_srcbusirq = i; /* Identity mapped */
7888+ intsrc.mpc_dstirq = i;
7889+
7890+ Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
7891+ "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
7892+ (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
7893+ intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
7894+ intsrc.mpc_dstirq);
7895+
7896+ mp_irqs[mp_irq_entries] = intsrc;
7897+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
7898+ panic("Max # of irq sources exceeded!\n");
7899+ }
7900+}
7901+
7902+#define MAX_GSI_NUM 4096
7903+
7904+int mp_register_gsi (u32 gsi, int triggering, int polarity)
7905+{
7906+ int ioapic = -1;
7907+ int ioapic_pin = 0;
7908+ int idx, bit = 0;
7909+ static int pci_irq = 16;
7910+ /*
7911+ * Mapping between Global System Interrupts, which
7912+ * represent all possible interrupts, and IRQs
7913+ * assigned to actual devices.
7914+ */
7915+ static int gsi_to_irq[MAX_GSI_NUM];
7916+
7917+ /* Don't set up the ACPI SCI because it's already set up */
7918+ if (acpi_fadt.sci_int == gsi)
7919+ return gsi;
7920+
7921+ ioapic = mp_find_ioapic(gsi);
7922+ if (ioapic < 0) {
7923+ printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
7924+ return gsi;
7925+ }
7926+
7927+ ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
7928+
7929+ if (ioapic_renumber_irq)
7930+ gsi = ioapic_renumber_irq(ioapic, gsi);
7931+
7932+ /*
7933+ * Avoid pin reprogramming. PRTs typically include entries
7934+ * with redundant pin->gsi mappings (but unique PCI devices);
7935+ * we only program the IOAPIC on the first.
7936+ */
7937+ bit = ioapic_pin % 32;
7938+ idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
7939+ if (idx > 3) {
7940+ printk(KERN_ERR "Invalid reference to IOAPIC pin "
7941+ "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
7942+ ioapic_pin);
7943+ return gsi;
7944+ }
7945+ if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
7946+ Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
7947+ mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
7948+ return gsi_to_irq[gsi];
7949+ }
7950+
7951+ mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
7952+
7953+ if (triggering == ACPI_LEVEL_SENSITIVE) {
7954+ /*
7955+ * For PCI devices assign IRQs in order, avoiding gaps
7956+ * due to unused I/O APIC pins.
7957+ */
7958+ int irq = gsi;
7959+ if (gsi < MAX_GSI_NUM) {
7960+ /*
7961+ * Retain the VIA chipset work-around (gsi > 15), but
7962+ * avoid a problem where the 8254 timer (IRQ0) is setup
7963+ * via an override (so it's not on pin 0 of the ioapic),
7964+ * and at the same time, the pin 0 interrupt is a PCI
7965+ * type. The gsi > 15 test could cause these two pins
7966+ * to be shared as IRQ0, and they are not shareable.
7967+ * So test for this condition, and if necessary, avoid
7968+ * the pin collision.
7969+ */
7970+ if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
7971+ gsi = pci_irq++;
7972+ /*
7973+ * Don't assign IRQ used by ACPI SCI
7974+ */
7975+ if (gsi == acpi_fadt.sci_int)
7976+ gsi = pci_irq++;
7977+ gsi_to_irq[irq] = gsi;
7978+ } else {
7979+ printk(KERN_ERR "GSI %u is too high\n", gsi);
7980+ return gsi;
7981+ }
7982+ }
7983+
7984+ io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
7985+ triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
7986+ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
7987+ return gsi;
7988+}
7989+
7990+#endif /* CONFIG_X86_IO_APIC */
7991+#endif /* CONFIG_ACPI */
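
mp_register_gsi() above avoids reprogramming IOAPIC pins by recording each programmed pin in a per-IOAPIC pin_programmed bitmap of four 32-bit words. A standalone sketch of that bookkeeping under the same layout assumption (the helper name is illustrative, not part of the patch):

/* Returns nonzero if the pin had already been programmed. */
static int example_test_and_mark_pin(u32 pin_programmed[4], int ioapic_pin)
{
	int idx = ioapic_pin / 32;	/* which 32-bit word */
	int bit = ioapic_pin % 32;	/* which bit within it */
	int was_set = (pin_programmed[idx] >> bit) & 1;

	pin_programmed[idx] |= 1u << bit;
	return was_set;
}
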
7992Index: head-2008-11-25/arch/x86/kernel/pci-dma-xen.c
7993===================================================================
7994--- /dev/null 1970-01-01 00:00:00.000000000 +0000
7995+++ head-2008-11-25/arch/x86/kernel/pci-dma-xen.c 2008-10-29 09:55:56.000000000 +0100
7996@@ -0,0 +1,409 @@
7997+/*
7998+ * Dynamic DMA mapping support.
7999+ *
8000+ * On i386 there is no hardware dynamic DMA address translation,
8001+ * so consistent alloc/free are merely page allocation/freeing.
8002+ * The rest of the dynamic DMA mapping interface is implemented
8003+ * in asm/pci.h.
8004+ */
8005+
8006+#include <linux/types.h>
8007+#include <linux/mm.h>
8008+#include <linux/string.h>
8009+#include <linux/pci.h>
8010+#include <linux/module.h>
8011+#include <linux/version.h>
8012+#include <asm/io.h>
8013+#include <xen/balloon.h>
8014+#include <xen/gnttab.h>
8015+#include <asm/swiotlb.h>
8016+#include <asm/tlbflush.h>
8017+#include <asm-i386/mach-xen/asm/swiotlb.h>
8018+#include <asm-i386/mach-xen/asm/gnttab_dma.h>
8019+#include <asm/bug.h>
8020+
8021+#ifdef __x86_64__
8022+#include <asm/proto.h>
8023+
8024+int iommu_merge __read_mostly = 0;
8025+EXPORT_SYMBOL(iommu_merge);
8026+
8027+dma_addr_t bad_dma_address __read_mostly;
8028+EXPORT_SYMBOL(bad_dma_address);
8029+
8030+/* This tells the BIO block layer to assume merging. Default to off
8031+ because we cannot guarantee merging later. */
8032+int iommu_bio_merge __read_mostly = 0;
8033+EXPORT_SYMBOL(iommu_bio_merge);
8034+
8035+int force_iommu __read_mostly= 0;
8036+
8037+__init int iommu_setup(char *p)
8038+{
8039+ return 1;
8040+}
8041+
8042+void __init pci_iommu_alloc(void)
8043+{
8044+#ifdef CONFIG_SWIOTLB
8045+ pci_swiotlb_init();
8046+#endif
8047+}
8048+
8049+static int __init pci_iommu_init(void)
8050+{
8051+ no_iommu_init();
8052+ return 0;
8053+}
8054+
8055+/* Must execute after PCI subsystem */
8056+fs_initcall(pci_iommu_init);
8057+#endif
8058+
8059+struct dma_coherent_mem {
8060+ void *virt_base;
8061+ u32 device_base;
8062+ int size;
8063+ int flags;
8064+ unsigned long *bitmap;
8065+};
8066+
8067+#define IOMMU_BUG_ON(test) \
8068+do { \
8069+ if (unlikely(test)) { \
8070+ printk(KERN_ALERT "Fatal DMA error! " \
8071+ "Please use 'swiotlb=force'\n"); \
8072+ BUG(); \
8073+ } \
8074+} while (0)
8075+
8076+static int check_pages_physically_contiguous(unsigned long pfn,
8077+ unsigned int offset,
8078+ size_t length)
8079+{
8080+ unsigned long next_mfn;
8081+ int i;
8082+ int nr_pages;
8083+
8084+ next_mfn = pfn_to_mfn(pfn);
8085+ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
8086+
8087+ for (i = 1; i < nr_pages; i++) {
8088+ if (pfn_to_mfn(++pfn) != ++next_mfn)
8089+ return 0;
8090+ }
8091+ return 1;
8092+}
8093+
8094+int range_straddles_page_boundary(paddr_t p, size_t size)
8095+{
8096+ unsigned long pfn = p >> PAGE_SHIFT;
8097+ unsigned int offset = p & ~PAGE_MASK;
8098+
8099+ return ((offset + size > PAGE_SIZE) &&
8100+ !check_pages_physically_contiguous(pfn, offset, size));
8101+}
8102+
8103+int
8104+dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
8105+ enum dma_data_direction direction)
8106+{
8107+ int i, rc;
8108+
8109+ if (direction == DMA_NONE)
8110+ BUG();
8111+ WARN_ON(nents == 0 || sg[0].length == 0);
8112+
8113+ if (swiotlb) {
8114+ rc = swiotlb_map_sg(hwdev, sg, nents, direction);
8115+ } else {
8116+ for (i = 0; i < nents; i++ ) {
8117+ BUG_ON(!sg[i].page);
8118+ sg[i].dma_address =
8119+ gnttab_dma_map_page(sg[i].page) + sg[i].offset;
8120+ sg[i].dma_length = sg[i].length;
8121+ IOMMU_BUG_ON(address_needs_mapping(
8122+ hwdev, sg[i].dma_address));
8123+ IOMMU_BUG_ON(range_straddles_page_boundary(
8124+ page_to_pseudophys(sg[i].page) + sg[i].offset,
8125+ sg[i].length));
8126+ }
8127+ rc = nents;
8128+ }
8129+
8130+ flush_write_buffers();
8131+ return rc;
8132+}
8133+EXPORT_SYMBOL(dma_map_sg);
8134+
8135+void
8136+dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
8137+ enum dma_data_direction direction)
8138+{
8139+ int i;
8140+
8141+ BUG_ON(direction == DMA_NONE);
8142+ if (swiotlb)
8143+ swiotlb_unmap_sg(hwdev, sg, nents, direction);
8144+ else {
8145+ for (i = 0; i < nents; i++ )
8146+ gnttab_dma_unmap_page(sg[i].dma_address);
8147+ }
8148+}
8149+EXPORT_SYMBOL(dma_unmap_sg);
8150+
8151+#ifdef CONFIG_HIGHMEM
8152+dma_addr_t
8153+dma_map_page(struct device *dev, struct page *page, unsigned long offset,
8154+ size_t size, enum dma_data_direction direction)
8155+{
8156+ dma_addr_t dma_addr;
8157+
8158+ BUG_ON(direction == DMA_NONE);
8159+
8160+ if (swiotlb) {
8161+ dma_addr = swiotlb_map_page(
8162+ dev, page, offset, size, direction);
8163+ } else {
8164+ dma_addr = gnttab_dma_map_page(page) + offset;
8165+ IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr));
8166+ }
8167+
8168+ return dma_addr;
8169+}
8170+EXPORT_SYMBOL(dma_map_page);
8171+
8172+void
8173+dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
8174+ enum dma_data_direction direction)
8175+{
8176+ BUG_ON(direction == DMA_NONE);
8177+ if (swiotlb)
8178+ swiotlb_unmap_page(dev, dma_address, size, direction);
8179+ else
8180+ gnttab_dma_unmap_page(dma_address);
8181+}
8182+EXPORT_SYMBOL(dma_unmap_page);
8183+#endif /* CONFIG_HIGHMEM */
8184+
8185+int
8186+dma_mapping_error(dma_addr_t dma_addr)
8187+{
8188+ if (swiotlb)
8189+ return swiotlb_dma_mapping_error(dma_addr);
8190+ return 0;
8191+}
8192+EXPORT_SYMBOL(dma_mapping_error);
8193+
8194+int
8195+dma_supported(struct device *dev, u64 mask)
8196+{
8197+ if (swiotlb)
8198+ return swiotlb_dma_supported(dev, mask);
8199+ /*
8200+ * By default we'll BUG when an infeasible DMA is requested, and
8201+ * request swiotlb=force (see IOMMU_BUG_ON).
8202+ */
8203+ return 1;
8204+}
8205+EXPORT_SYMBOL(dma_supported);
8206+
8207+void *dma_alloc_coherent(struct device *dev, size_t size,
8208+ dma_addr_t *dma_handle, gfp_t gfp)
8209+{
8210+ void *ret;
8211+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
8212+ unsigned int order = get_order(size);
8213+ unsigned long vstart;
8214+ u64 mask;
8215+
8216+ /* ignore region specifiers */
8217+ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
8218+
8219+ if (mem) {
8220+ int page = bitmap_find_free_region(mem->bitmap, mem->size,
8221+ order);
8222+ if (page >= 0) {
8223+ *dma_handle = mem->device_base + (page << PAGE_SHIFT);
8224+ ret = mem->virt_base + (page << PAGE_SHIFT);
8225+ memset(ret, 0, size);
8226+ return ret;
8227+ }
8228+ if (mem->flags & DMA_MEMORY_EXCLUSIVE)
8229+ return NULL;
8230+ }
8231+
8232+ if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
8233+ gfp |= GFP_DMA;
8234+
8235+ vstart = __get_free_pages(gfp, order);
8236+ ret = (void *)vstart;
8237+
8238+ if (dev != NULL && dev->coherent_dma_mask)
8239+ mask = dev->coherent_dma_mask;
8240+ else
8241+ mask = 0xffffffff;
8242+
8243+ if (ret != NULL) {
8244+ if (xen_create_contiguous_region(vstart, order,
8245+ fls64(mask)) != 0) {
8246+ free_pages(vstart, order);
8247+ return NULL;
8248+ }
8249+ memset(ret, 0, size);
8250+ *dma_handle = virt_to_bus(ret);
8251+ }
8252+ return ret;
8253+}
8254+EXPORT_SYMBOL(dma_alloc_coherent);
8255+
8256+void dma_free_coherent(struct device *dev, size_t size,
8257+ void *vaddr, dma_addr_t dma_handle)
8258+{
8259+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
8260+ int order = get_order(size);
8261+
8262+ if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
8263+ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
8264+
8265+ bitmap_release_region(mem->bitmap, page, order);
8266+ } else {
8267+ xen_destroy_contiguous_region((unsigned long)vaddr, order);
8268+ free_pages((unsigned long)vaddr, order);
8269+ }
8270+}
8271+EXPORT_SYMBOL(dma_free_coherent);
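
A short usage sketch for the coherent allocator above, the way a hypothetical driver might call it (the function names and the PAGE_SIZE-sized ring are placeholders): the returned buffer is machine-contiguous thanks to xen_create_contiguous_region() and must be released with dma_free_coherent().

static int example_alloc_ring(struct device *dev, void **ring, dma_addr_t *bus)
{
	*ring = dma_alloc_coherent(dev, PAGE_SIZE, bus, GFP_KERNEL);
	if (*ring == NULL)
		return -ENOMEM;
	/* *bus is machine-contiguous and can be handed to the device. */
	return 0;
}

static void example_free_ring(struct device *dev, void *ring, dma_addr_t bus)
{
	dma_free_coherent(dev, PAGE_SIZE, ring, bus);
}
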
8272+
8273+#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
8274+int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
8275+ dma_addr_t device_addr, size_t size, int flags)
8276+{
8277+ void __iomem *mem_base;
8278+ int pages = size >> PAGE_SHIFT;
8279+ int bitmap_size = (pages + 31)/32;
8280+
8281+ if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
8282+ goto out;
8283+ if (!size)
8284+ goto out;
8285+ if (dev->dma_mem)
8286+ goto out;
8287+
8288+ /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
8289+
8290+ mem_base = ioremap(bus_addr, size);
8291+ if (!mem_base)
8292+ goto out;
8293+
8294+ dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
8295+ if (!dev->dma_mem)
8296+ goto out;
8297+ memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
8298+ dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
8299+ if (!dev->dma_mem->bitmap)
8300+ goto free1_out;
8301+ memset(dev->dma_mem->bitmap, 0, bitmap_size);
8302+
8303+ dev->dma_mem->virt_base = mem_base;
8304+ dev->dma_mem->device_base = device_addr;
8305+ dev->dma_mem->size = pages;
8306+ dev->dma_mem->flags = flags;
8307+
8308+ if (flags & DMA_MEMORY_MAP)
8309+ return DMA_MEMORY_MAP;
8310+
8311+ return DMA_MEMORY_IO;
8312+
8313+ free1_out:
8314+ kfree(dev->dma_mem->bitmap);
8315+ out:
8316+ return 0;
8317+}
8318+EXPORT_SYMBOL(dma_declare_coherent_memory);
8319+
8320+void dma_release_declared_memory(struct device *dev)
8321+{
8322+ struct dma_coherent_mem *mem = dev->dma_mem;
8323+
8324+ if(!mem)
8325+ return;
8326+ dev->dma_mem = NULL;
8327+ iounmap(mem->virt_base);
8328+ kfree(mem->bitmap);
8329+ kfree(mem);
8330+}
8331+EXPORT_SYMBOL(dma_release_declared_memory);
8332+
8333+void *dma_mark_declared_memory_occupied(struct device *dev,
8334+ dma_addr_t device_addr, size_t size)
8335+{
8336+ struct dma_coherent_mem *mem = dev->dma_mem;
8337+ int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
8338+ int pos, err;
8339+
8340+ if (!mem)
8341+ return ERR_PTR(-EINVAL);
8342+
8343+ pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
8344+ err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
8345+ if (err != 0)
8346+ return ERR_PTR(err);
8347+ return mem->virt_base + (pos << PAGE_SHIFT);
8348+}
8349+EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
8350+#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
8351+
8352+dma_addr_t
8353+dma_map_single(struct device *dev, void *ptr, size_t size,
8354+ enum dma_data_direction direction)
8355+{
8356+ dma_addr_t dma;
8357+
8358+ if (direction == DMA_NONE)
8359+ BUG();
8360+ WARN_ON(size == 0);
8361+
8362+ if (swiotlb) {
8363+ dma = swiotlb_map_single(dev, ptr, size, direction);
8364+ } else {
8365+ dma = gnttab_dma_map_page(virt_to_page(ptr)) +
8366+ offset_in_page(ptr);
8367+ IOMMU_BUG_ON(range_straddles_page_boundary(__pa(ptr), size));
8368+ IOMMU_BUG_ON(address_needs_mapping(dev, dma));
8369+ }
8370+
8371+ flush_write_buffers();
8372+ return dma;
8373+}
8374+EXPORT_SYMBOL(dma_map_single);
8375+
8376+void
8377+dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
8378+ enum dma_data_direction direction)
8379+{
8380+ if (direction == DMA_NONE)
8381+ BUG();
8382+ if (swiotlb)
8383+ swiotlb_unmap_single(dev, dma_addr, size, direction);
8384+ else
8385+ gnttab_dma_unmap_page(dma_addr);
8386+}
8387+EXPORT_SYMBOL(dma_unmap_single);
8388+
8389+void
8390+dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
8391+ enum dma_data_direction direction)
8392+{
8393+ if (swiotlb)
8394+ swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction);
8395+}
8396+EXPORT_SYMBOL(dma_sync_single_for_cpu);
8397+
8398+void
8399+dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
8400+ enum dma_data_direction direction)
8401+{
8402+ if (swiotlb)
8403+ swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
8404+}
8405+EXPORT_SYMBOL(dma_sync_single_for_device);
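
A hedged usage sketch for the streaming-mapping entry points defined in this file, the way a hypothetical driver might use them; note that dma_mapping_error() takes only the handle, matching the signature above (the function name and error code are placeholders):

static int example_receive(struct device *dev, void *buf, size_t len)
{
	dma_addr_t handle = dma_map_single(dev, buf, len, DMA_FROM_DEVICE);

	if (dma_mapping_error(handle))
		return -EIO;

	/* ... program the device to write up to len bytes at 'handle' ... */

	dma_unmap_single(dev, handle, len, DMA_FROM_DEVICE);
	/* buf now holds whatever the device wrote. */
	return 0;
}
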
8406Index: head-2008-11-25/arch/x86/kernel/process_32-xen.c
8407===================================================================
8408--- /dev/null 1970-01-01 00:00:00.000000000 +0000
8409+++ head-2008-11-25/arch/x86/kernel/process_32-xen.c 2008-07-21 11:00:32.000000000 +0200
8410@@ -0,0 +1,877 @@
8411+/*
8412+ * linux/arch/i386/kernel/process.c
8413+ *
8414+ * Copyright (C) 1995 Linus Torvalds
8415+ *
8416+ * Pentium III FXSR, SSE support
8417+ * Gareth Hughes <gareth@valinux.com>, May 2000
8418+ */
8419+
8420+/*
8421+ * This file handles the architecture-dependent parts of process handling..
8422+ */
8423+
8424+#include <stdarg.h>
8425+
8426+#include <linux/cpu.h>
8427+#include <linux/errno.h>
8428+#include <linux/sched.h>
8429+#include <linux/fs.h>
8430+#include <linux/kernel.h>
8431+#include <linux/mm.h>
8432+#include <linux/elfcore.h>
8433+#include <linux/smp.h>
8434+#include <linux/smp_lock.h>
8435+#include <linux/stddef.h>
8436+#include <linux/slab.h>
8437+#include <linux/vmalloc.h>
8438+#include <linux/user.h>
8439+#include <linux/a.out.h>
8440+#include <linux/interrupt.h>
8441+#include <linux/utsname.h>
8442+#include <linux/delay.h>
8443+#include <linux/reboot.h>
8444+#include <linux/init.h>
8445+#include <linux/mc146818rtc.h>
8446+#include <linux/module.h>
8447+#include <linux/kallsyms.h>
8448+#include <linux/ptrace.h>
8449+#include <linux/random.h>
8450+
8451+#include <asm/uaccess.h>
8452+#include <asm/pgtable.h>
8453+#include <asm/system.h>
8454+#include <asm/io.h>
8455+#include <asm/ldt.h>
8456+#include <asm/processor.h>
8457+#include <asm/i387.h>
8458+#include <asm/desc.h>
8459+#include <asm/vm86.h>
8460+#ifdef CONFIG_MATH_EMULATION
8461+#include <asm/math_emu.h>
8462+#endif
8463+
8464+#include <xen/interface/physdev.h>
8465+#include <xen/interface/vcpu.h>
8466+#include <xen/cpu_hotplug.h>
8467+
8468+#include <linux/err.h>
8469+
8470+#include <asm/tlbflush.h>
8471+#include <asm/cpu.h>
8472+
8473+asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
8474+
8475+static int hlt_counter;
8476+
8477+unsigned long boot_option_idle_override = 0;
8478+EXPORT_SYMBOL(boot_option_idle_override);
8479+
8480+/*
8481+ * Return saved PC of a blocked thread.
8482+ */
8483+unsigned long thread_saved_pc(struct task_struct *tsk)
8484+{
8485+ return ((unsigned long *)tsk->thread.esp)[3];
8486+}
8487+
8488+/*
8489+ * Power management idle function, if any.
8490+ */
8491+void (*pm_idle)(void);
8492+EXPORT_SYMBOL(pm_idle);
8493+static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
8494+
8495+void disable_hlt(void)
8496+{
8497+ hlt_counter++;
8498+}
8499+
8500+EXPORT_SYMBOL(disable_hlt);
8501+
8502+void enable_hlt(void)
8503+{
8504+ hlt_counter--;
8505+}
8506+
8507+EXPORT_SYMBOL(enable_hlt);
8508+
8509+/*
8510+ * On SMP it's slightly faster (but much more power-consuming!)
8511+ * to poll the ->work.need_resched flag instead of waiting for the
8512+ * cross-CPU IPI to arrive. Use this option with caution.
8513+ */
8514+static void poll_idle (void)
8515+{
8516+ local_irq_enable();
8517+
8518+ asm volatile(
8519+ "2:"
8520+ "testl %0, %1;"
8521+ "rep; nop;"
8522+ "je 2b;"
8523+ : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
8524+}
8525+
8526+static void xen_idle(void)
8527+{
8528+ local_irq_disable();
8529+
8530+ if (need_resched())
8531+ local_irq_enable();
8532+ else {
8533+ current_thread_info()->status &= ~TS_POLLING;
8534+ smp_mb__after_clear_bit();
8535+ safe_halt();
8536+ current_thread_info()->status |= TS_POLLING;
8537+ }
8538+}
8539+#ifdef CONFIG_APM_MODULE
8540+EXPORT_SYMBOL(default_idle);
8541+#endif
8542+
8543+#ifdef CONFIG_HOTPLUG_CPU
8544+extern cpumask_t cpu_initialized;
8545+static inline void play_dead(void)
8546+{
8547+ idle_task_exit();
8548+ local_irq_disable();
8549+ cpu_clear(smp_processor_id(), cpu_initialized);
8550+ preempt_enable_no_resched();
8551+ VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
8552+ cpu_bringup();
8553+}
8554+#else
8555+static inline void play_dead(void)
8556+{
8557+ BUG();
8558+}
8559+#endif /* CONFIG_HOTPLUG_CPU */
8560+
8561+/*
8562+ * The idle thread. There's no useful work to be
8563+ * done, so just try to conserve power and have a
8564+ * low exit latency (ie sit in a loop waiting for
8565+ * somebody to say that they'd like to reschedule)
8566+ */
8567+void cpu_idle(void)
8568+{
8569+ int cpu = smp_processor_id();
8570+
8571+ current_thread_info()->status |= TS_POLLING;
8572+
8573+ /* endless idle loop with no priority at all */
8574+ while (1) {
8575+ while (!need_resched()) {
8576+ void (*idle)(void);
8577+
8578+ if (__get_cpu_var(cpu_idle_state))
8579+ __get_cpu_var(cpu_idle_state) = 0;
8580+
8581+ rmb();
8582+ idle = xen_idle; /* no alternatives */
8583+
8584+ if (cpu_is_offline(cpu))
8585+ play_dead();
8586+
8587+ __get_cpu_var(irq_stat).idle_timestamp = jiffies;
8588+ idle();
8589+ }
8590+ preempt_enable_no_resched();
8591+ schedule();
8592+ preempt_disable();
8593+ }
8594+}
8595+
8596+void cpu_idle_wait(void)
8597+{
8598+ unsigned int cpu, this_cpu = get_cpu();
8599+ cpumask_t map;
8600+
8601+ set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
8602+ put_cpu();
8603+
8604+ cpus_clear(map);
8605+ for_each_online_cpu(cpu) {
8606+ per_cpu(cpu_idle_state, cpu) = 1;
8607+ cpu_set(cpu, map);
8608+ }
8609+
8610+ __get_cpu_var(cpu_idle_state) = 0;
8611+
8612+ wmb();
8613+ do {
8614+ ssleep(1);
8615+ for_each_online_cpu(cpu) {
8616+ if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
8617+ cpu_clear(cpu, map);
8618+ }
8619+ cpus_and(map, map, cpu_online_map);
8620+ } while (!cpus_empty(map));
8621+}
8622+EXPORT_SYMBOL_GPL(cpu_idle_wait);
8623+
8624+void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
8625+{
8626+}
8627+
8628+static int __init idle_setup (char *str)
8629+{
8630+ if (!strncmp(str, "poll", 4)) {
8631+ printk("using polling idle threads.\n");
8632+ pm_idle = poll_idle;
8633+ }
8634+
8635+ boot_option_idle_override = 1;
8636+ return 1;
8637+}
8638+
8639+__setup("idle=", idle_setup);
8640+
8641+void show_regs(struct pt_regs * regs)
8642+{
8643+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
8644+
8645+ printk("\n");
8646+ printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
8647+ printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
8648+ print_symbol("EIP is at %s\n", regs->eip);
8649+
8650+ if (user_mode_vm(regs))
8651+ printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
8652+ printk(" EFLAGS: %08lx %s (%s %.*s)\n",
8653+ regs->eflags, print_tainted(), system_utsname.release,
8654+ (int)strcspn(system_utsname.version, " "),
8655+ system_utsname.version);
8656+ printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
8657+ regs->eax,regs->ebx,regs->ecx,regs->edx);
8658+ printk("ESI: %08lx EDI: %08lx EBP: %08lx",
8659+ regs->esi, regs->edi, regs->ebp);
8660+ printk(" DS: %04x ES: %04x\n",
8661+ 0xffff & regs->xds,0xffff & regs->xes);
8662+
8663+ cr0 = read_cr0();
8664+ cr2 = read_cr2();
8665+ cr3 = read_cr3();
8666+ cr4 = read_cr4_safe();
8667+ printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
8668+ show_trace(NULL, regs, &regs->esp);
8669+}
8670+
8671+/*
8672+ * This gets run with %ebx containing the
8673+ * function to call, and %edx containing
8674+ * the "args".
8675+ */
8676+extern void kernel_thread_helper(void);
8677+__asm__(".section .text\n"
8678+ ".align 4\n"
8679+ "kernel_thread_helper:\n\t"
8680+ "movl %edx,%eax\n\t"
8681+ "pushl %edx\n\t"
8682+ "call *%ebx\n\t"
8683+ "pushl %eax\n\t"
8684+ "call do_exit\n"
8685+ ".previous");
8686+
8687+/*
8688+ * Create a kernel thread
8689+ */
8690+int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
8691+{
8692+ struct pt_regs regs;
8693+
8694+ memset(&regs, 0, sizeof(regs));
8695+
8696+ regs.ebx = (unsigned long) fn;
8697+ regs.edx = (unsigned long) arg;
8698+
8699+ regs.xds = __USER_DS;
8700+ regs.xes = __USER_DS;
8701+ regs.orig_eax = -1;
8702+ regs.eip = (unsigned long) kernel_thread_helper;
8703+ regs.xcs = GET_KERNEL_CS();
8704+ regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
8705+
8706+ /* Ok, create the new process.. */
8707+ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
8708+}
8709+EXPORT_SYMBOL(kernel_thread);
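
A minimal usage sketch for kernel_thread() as defined above; the worker function and the clone flags shown are illustrative, not taken from the patch:

static int example_worker(void *arg)
{
	/* ... do the work; do_exit() runs when this returns ... */
	return 0;
}

static void example_start_worker(void)
{
	kernel_thread(example_worker, NULL, CLONE_FS | CLONE_FILES);
}
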
8710+
8711+/*
8712+ * Free current thread data structures etc..
8713+ */
8714+void exit_thread(void)
8715+{
8716+ /* The process may have allocated an io port bitmap... nuke it. */
8717+ if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
8718+ struct task_struct *tsk = current;
8719+ struct thread_struct *t = &tsk->thread;
8720+ struct physdev_set_iobitmap set_iobitmap;
8721+ memset(&set_iobitmap, 0, sizeof(set_iobitmap));
8722+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
8723+ &set_iobitmap));
8724+ kfree(t->io_bitmap_ptr);
8725+ t->io_bitmap_ptr = NULL;
8726+ clear_thread_flag(TIF_IO_BITMAP);
8727+ }
8728+}
8729+
8730+void flush_thread(void)
8731+{
8732+ struct task_struct *tsk = current;
8733+
8734+ memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
8735+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
8736+ clear_tsk_thread_flag(tsk, TIF_DEBUG);
8737+ /*
8738+ * Forget coprocessor state..
8739+ */
8740+ clear_fpu(tsk);
8741+ clear_used_math();
8742+}
8743+
8744+void release_thread(struct task_struct *dead_task)
8745+{
8746+ BUG_ON(dead_task->mm);
8747+ release_vm86_irqs(dead_task);
8748+}
8749+
8750+/*
8751+ * This gets called before we allocate a new thread and copy
8752+ * the current task into it.
8753+ */
8754+void prepare_to_copy(struct task_struct *tsk)
8755+{
8756+ unlazy_fpu(tsk);
8757+}
8758+
8759+int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
8760+ unsigned long unused,
8761+ struct task_struct * p, struct pt_regs * regs)
8762+{
8763+ struct pt_regs * childregs;
8764+ struct task_struct *tsk;
8765+ int err;
8766+
8767+ childregs = task_pt_regs(p);
8768+ *childregs = *regs;
8769+ childregs->eax = 0;
8770+ childregs->esp = esp;
8771+
8772+ p->thread.esp = (unsigned long) childregs;
8773+ p->thread.esp0 = (unsigned long) (childregs+1);
8774+
8775+ p->thread.eip = (unsigned long) ret_from_fork;
8776+
8777+ savesegment(fs,p->thread.fs);
8778+ savesegment(gs,p->thread.gs);
8779+
8780+ tsk = current;
8781+ if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
8782+ p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
8783+ if (!p->thread.io_bitmap_ptr) {
8784+ p->thread.io_bitmap_max = 0;
8785+ return -ENOMEM;
8786+ }
8787+ memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
8788+ IO_BITMAP_BYTES);
8789+ set_tsk_thread_flag(p, TIF_IO_BITMAP);
8790+ }
8791+
8792+ /*
8793+ * Set a new TLS for the child thread?
8794+ */
8795+ if (clone_flags & CLONE_SETTLS) {
8796+ struct desc_struct *desc;
8797+ struct user_desc info;
8798+ int idx;
8799+
8800+ err = -EFAULT;
8801+ if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
8802+ goto out;
8803+ err = -EINVAL;
8804+ if (LDT_empty(&info))
8805+ goto out;
8806+
8807+ idx = info.entry_number;
8808+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
8809+ goto out;
8810+
8811+ desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
8812+ desc->a = LDT_entry_a(&info);
8813+ desc->b = LDT_entry_b(&info);
8814+ }
8815+
8816+ p->thread.iopl = current->thread.iopl;
8817+
8818+ err = 0;
8819+ out:
8820+ if (err && p->thread.io_bitmap_ptr) {
8821+ kfree(p->thread.io_bitmap_ptr);
8822+ p->thread.io_bitmap_max = 0;
8823+ }
8824+ return err;
8825+}
8826+
8827+/*
8828+ * fill in the user structure for a core dump..
8829+ */
8830+void dump_thread(struct pt_regs * regs, struct user * dump)
8831+{
8832+ int i;
8833+
8834+/* changed the size calculations - should hopefully work better. lbt */
8835+ dump->magic = CMAGIC;
8836+ dump->start_code = 0;
8837+ dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
8838+ dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
8839+ dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
8840+ dump->u_dsize -= dump->u_tsize;
8841+ dump->u_ssize = 0;
8842+ for (i = 0; i < 8; i++)
8843+ dump->u_debugreg[i] = current->thread.debugreg[i];
8844+
8845+ if (dump->start_stack < TASK_SIZE)
8846+ dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
8847+
8848+ dump->regs.ebx = regs->ebx;
8849+ dump->regs.ecx = regs->ecx;
8850+ dump->regs.edx = regs->edx;
8851+ dump->regs.esi = regs->esi;
8852+ dump->regs.edi = regs->edi;
8853+ dump->regs.ebp = regs->ebp;
8854+ dump->regs.eax = regs->eax;
8855+ dump->regs.ds = regs->xds;
8856+ dump->regs.es = regs->xes;
8857+ savesegment(fs,dump->regs.fs);
8858+ savesegment(gs,dump->regs.gs);
8859+ dump->regs.orig_eax = regs->orig_eax;
8860+ dump->regs.eip = regs->eip;
8861+ dump->regs.cs = regs->xcs;
8862+ dump->regs.eflags = regs->eflags;
8863+ dump->regs.esp = regs->esp;
8864+ dump->regs.ss = regs->xss;
8865+
8866+ dump->u_fpvalid = dump_fpu (regs, &dump->i387);
8867+}
8868+EXPORT_SYMBOL(dump_thread);
8869+
8870+/*
8871+ * Capture the user space registers if the task is not running (in user space)
8872+ */
8873+int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
8874+{
8875+ struct pt_regs ptregs = *task_pt_regs(tsk);
8876+ ptregs.xcs &= 0xffff;
8877+ ptregs.xds &= 0xffff;
8878+ ptregs.xes &= 0xffff;
8879+ ptregs.xss &= 0xffff;
8880+
8881+ elf_core_copy_regs(regs, &ptregs);
8882+
8883+ return 1;
8884+}
8885+
8886+static noinline void __switch_to_xtra(struct task_struct *next_p)
8887+{
8888+ struct thread_struct *next;
8889+
8890+ next = &next_p->thread;
8891+
8892+ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
8893+ set_debugreg(next->debugreg[0], 0);
8894+ set_debugreg(next->debugreg[1], 1);
8895+ set_debugreg(next->debugreg[2], 2);
8896+ set_debugreg(next->debugreg[3], 3);
8897+ /* no 4 and 5 */
8898+ set_debugreg(next->debugreg[6], 6);
8899+ set_debugreg(next->debugreg[7], 7);
8900+ }
8901+}
8902+
8903+/*
8904+ * This function selects if the context switch from prev to next
8905+ * has to tweak the TSC disable bit in the cr4.
8906+ */
8907+static inline void disable_tsc(struct task_struct *prev_p,
8908+ struct task_struct *next_p)
8909+{
8910+ struct thread_info *prev, *next;
8911+
8912+ /*
8913+ * gcc should eliminate the ->thread_info dereference if
8914+ * has_secure_computing returns 0 at compile time (SECCOMP=n).
8915+ */
8916+ prev = task_thread_info(prev_p);
8917+ next = task_thread_info(next_p);
8918+
8919+ if (has_secure_computing(prev) || has_secure_computing(next)) {
8920+ /* slow path here */
8921+ if (has_secure_computing(prev) &&
8922+ !has_secure_computing(next)) {
8923+ write_cr4(read_cr4() & ~X86_CR4_TSD);
8924+ } else if (!has_secure_computing(prev) &&
8925+ has_secure_computing(next))
8926+ write_cr4(read_cr4() | X86_CR4_TSD);
8927+ }
8928+}
8929+
8930+/*
8931+ * switch_to(x,y) should switch tasks from x to y.
8932+ *
8933+ * We fsave/fwait so that an exception goes off at the right time
8934+ * (as a call from the fsave or fwait in effect) rather than to
8935+ * the wrong process. Lazy FP saving no longer makes any sense
8936+ * with modern CPU's, and this simplifies a lot of things (SMP
8937+ * and UP become the same).
8938+ *
8939+ * NOTE! We used to use the x86 hardware context switching. The
8940+ * reason for not using it any more becomes apparent when you
8941+ * try to recover gracefully from saved state that is no longer
8942+ * valid (stale segment register values in particular). With the
8943+ * hardware task-switch, there is no way to fix up bad state in
8944+ * a reasonable manner.
8945+ *
8946+ * The fact that Intel documents the hardware task-switching to
8947+ * be slow is a fairly red herring - this code is not noticeably
8948+ * faster. However, there _is_ some room for improvement here,
8949+ * so the performance issues may eventually be a valid point.
8950+ * More important, however, is the fact that this allows us much
8951+ * more flexibility.
8952+ *
8953+ * The return value (in %eax) will be the "prev" task after
8954+ * the task-switch, and shows up in ret_from_fork in entry.S,
8955+ * for example.
8956+ */
8957+struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
8958+{
8959+ struct thread_struct *prev = &prev_p->thread,
8960+ *next = &next_p->thread;
8961+ int cpu = smp_processor_id();
8962+#ifndef CONFIG_X86_NO_TSS
8963+ struct tss_struct *tss = &per_cpu(init_tss, cpu);
8964+#endif
8965+#if CONFIG_XEN_COMPAT > 0x030002
8966+ struct physdev_set_iopl iopl_op;
8967+ struct physdev_set_iobitmap iobmp_op;
8968+#else
8969+ struct physdev_op _pdo[2], *pdo = _pdo;
8970+#define iopl_op pdo->u.set_iopl
8971+#define iobmp_op pdo->u.set_iobitmap
8972+#endif
8973+ multicall_entry_t _mcl[8], *mcl = _mcl;
8974+
8975+ /* XEN NOTE: FS/GS saved in switch_mm(), not here. */
8976+
8977+ /*
8978+ * This is basically '__unlazy_fpu', except that we queue a
8979+ * multicall to indicate FPU task switch, rather than
8980+ * synchronously trapping to Xen.
8981+ */
8982+ if (prev_p->thread_info->status & TS_USEDFPU) {
8983+ __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
8984+ mcl->op = __HYPERVISOR_fpu_taskswitch;
8985+ mcl->args[0] = 1;
8986+ mcl++;
8987+ }
8988+#if 0 /* lazy fpu sanity check */
8989+ else BUG_ON(!(read_cr0() & 8));
8990+#endif
8991+
8992+ /*
8993+ * Reload esp0.
8994+ * This is load_esp0(tss, next) with a multicall.
8995+ */
8996+ mcl->op = __HYPERVISOR_stack_switch;
8997+ mcl->args[0] = __KERNEL_DS;
8998+ mcl->args[1] = next->esp0;
8999+ mcl++;
9000+
9001+ /*
9002+ * Load the per-thread Thread-Local Storage descriptor.
9003+ * This is load_TLS(next, cpu) with multicalls.
9004+ */
9005+#define C(i) do { \
9006+ if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
9007+ next->tls_array[i].b != prev->tls_array[i].b)) { \
9008+ mcl->op = __HYPERVISOR_update_descriptor; \
9009+ *(u64 *)&mcl->args[0] = virt_to_machine( \
9010+ &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
9011+ *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \
9012+ mcl++; \
9013+ } \
9014+} while (0)
9015+ C(0); C(1); C(2);
9016+#undef C
9017+
9018+ if (unlikely(prev->iopl != next->iopl)) {
9019+ iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
9020+#if CONFIG_XEN_COMPAT > 0x030002
9021+ mcl->op = __HYPERVISOR_physdev_op;
9022+ mcl->args[0] = PHYSDEVOP_set_iopl;
9023+ mcl->args[1] = (unsigned long)&iopl_op;
9024+#else
9025+ mcl->op = __HYPERVISOR_physdev_op_compat;
9026+ pdo->cmd = PHYSDEVOP_set_iopl;
9027+ mcl->args[0] = (unsigned long)pdo++;
9028+#endif
9029+ mcl++;
9030+ }
9031+
9032+ if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
9033+ set_xen_guest_handle(iobmp_op.bitmap,
9034+ (char *)next->io_bitmap_ptr);
9035+ iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
9036+#if CONFIG_XEN_COMPAT > 0x030002
9037+ mcl->op = __HYPERVISOR_physdev_op;
9038+ mcl->args[0] = PHYSDEVOP_set_iobitmap;
9039+ mcl->args[1] = (unsigned long)&iobmp_op;
9040+#else
9041+ mcl->op = __HYPERVISOR_physdev_op_compat;
9042+ pdo->cmd = PHYSDEVOP_set_iobitmap;
9043+ mcl->args[0] = (unsigned long)pdo++;
9044+#endif
9045+ mcl++;
9046+ }
9047+
9048+#if CONFIG_XEN_COMPAT <= 0x030002
9049+ BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo));
9050+#endif
9051+ BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
9052+ if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
9053+ BUG();
9054+
9055+ /*
9056+ * Restore %fs and %gs if needed.
9057+ *
9058+ * Glibc normally makes %fs be zero, and %gs is one of
9059+ * the TLS segments.
9060+ */
9061+ if (unlikely(next->fs))
9062+ loadsegment(fs, next->fs);
9063+
9064+ if (next->gs)
9065+ loadsegment(gs, next->gs);
9066+
9067+ /*
9068+ * Now maybe handle debug registers
9069+ */
9070+ if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
9071+ __switch_to_xtra(next_p);
9072+
9073+ disable_tsc(prev_p, next_p);
9074+
9075+ return prev_p;
9076+}
9077+
9078+asmlinkage int sys_fork(struct pt_regs regs)
9079+{
9080+ return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
9081+}
9082+
9083+asmlinkage int sys_clone(struct pt_regs regs)
9084+{
9085+ unsigned long clone_flags;
9086+ unsigned long newsp;
9087+ int __user *parent_tidptr, *child_tidptr;
9088+
9089+ clone_flags = regs.ebx;
9090+ newsp = regs.ecx;
9091+ parent_tidptr = (int __user *)regs.edx;
9092+ child_tidptr = (int __user *)regs.edi;
9093+ if (!newsp)
9094+ newsp = regs.esp;
9095+ return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
9096+}
9097+
9098+/*
9099+ * This is trivial, and on the face of it looks like it
9100+ * could equally well be done in user mode.
9101+ *
9102+ * Not so, for quite unobvious reasons - register pressure.
9103+ * In user mode vfork() cannot have a stack frame, and if
9104+ * done by calling the "clone()" system call directly, you
9105+ * do not have enough call-clobbered registers to hold all
9106+ * the information you need.
9107+ */
9108+asmlinkage int sys_vfork(struct pt_regs regs)
9109+{
9110+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
9111+}
9112+
9113+/*
9114+ * sys_execve() executes a new program.
9115+ */
9116+asmlinkage int sys_execve(struct pt_regs regs)
9117+{
9118+ int error;
9119+ char * filename;
9120+
9121+ filename = getname((char __user *) regs.ebx);
9122+ error = PTR_ERR(filename);
9123+ if (IS_ERR(filename))
9124+ goto out;
9125+ error = do_execve(filename,
9126+ (char __user * __user *) regs.ecx,
9127+ (char __user * __user *) regs.edx,
9128+ &regs);
9129+ if (error == 0) {
9130+ task_lock(current);
9131+ current->ptrace &= ~PT_DTRACE;
9132+ task_unlock(current);
9133+ /* Make sure we don't return using sysenter.. */
9134+ set_thread_flag(TIF_IRET);
9135+ }
9136+ putname(filename);
9137+out:
9138+ return error;
9139+}
9140+
9141+#define top_esp (THREAD_SIZE - sizeof(unsigned long))
9142+#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
9143+
9144+unsigned long get_wchan(struct task_struct *p)
9145+{
9146+ unsigned long ebp, esp, eip;
9147+ unsigned long stack_page;
9148+ int count = 0;
9149+ if (!p || p == current || p->state == TASK_RUNNING)
9150+ return 0;
9151+ stack_page = (unsigned long)task_stack_page(p);
9152+ esp = p->thread.esp;
9153+ if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
9154+ return 0;
9155+ /* include/asm-i386/system.h:switch_to() pushes ebp last. */
9156+ ebp = *(unsigned long *) esp;
9157+ do {
9158+ if (ebp < stack_page || ebp > top_ebp+stack_page)
9159+ return 0;
9160+ eip = *(unsigned long *) (ebp+4);
9161+ if (!in_sched_functions(eip))
9162+ return eip;
9163+ ebp = *(unsigned long *) ebp;
9164+ } while (count++ < 16);
9165+ return 0;
9166+}
9167+
9168+/*
9169+ * sys_alloc_thread_area: get a yet unused TLS descriptor index.
9170+ */
9171+static int get_free_idx(void)
9172+{
9173+ struct thread_struct *t = &current->thread;
9174+ int idx;
9175+
9176+ for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
9177+ if (desc_empty(t->tls_array + idx))
9178+ return idx + GDT_ENTRY_TLS_MIN;
9179+ return -ESRCH;
9180+}
9181+
9182+/*
9183+ * Set a given TLS descriptor:
9184+ */
9185+asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
9186+{
9187+ struct thread_struct *t = &current->thread;
9188+ struct user_desc info;
9189+ struct desc_struct *desc;
9190+ int cpu, idx;
9191+
9192+ if (copy_from_user(&info, u_info, sizeof(info)))
9193+ return -EFAULT;
9194+ idx = info.entry_number;
9195+
9196+ /*
9197+ * index -1 means the kernel should try to find and
9198+ * allocate an empty descriptor:
9199+ */
9200+ if (idx == -1) {
9201+ idx = get_free_idx();
9202+ if (idx < 0)
9203+ return idx;
9204+ if (put_user(idx, &u_info->entry_number))
9205+ return -EFAULT;
9206+ }
9207+
9208+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
9209+ return -EINVAL;
9210+
9211+ desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
9212+
9213+ /*
9214+ * We must not get preempted while modifying the TLS.
9215+ */
9216+ cpu = get_cpu();
9217+
9218+ if (LDT_empty(&info)) {
9219+ desc->a = 0;
9220+ desc->b = 0;
9221+ } else {
9222+ desc->a = LDT_entry_a(&info);
9223+ desc->b = LDT_entry_b(&info);
9224+ }
9225+ load_TLS(t, cpu);
9226+
9227+ put_cpu();
9228+
9229+ return 0;
9230+}
9231+
9232+/*
9233+ * Get the current Thread-Local Storage area:
9234+ */
9235+
9236+#define GET_BASE(desc) ( \
9237+ (((desc)->a >> 16) & 0x0000ffff) | \
9238+ (((desc)->b << 16) & 0x00ff0000) | \
9239+ ( (desc)->b & 0xff000000) )
9240+
9241+#define GET_LIMIT(desc) ( \
9242+ ((desc)->a & 0x0ffff) | \
9243+ ((desc)->b & 0xf0000) )
9244+
9245+#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
9246+#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
9247+#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
9248+#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
9249+#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
9250+#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
9251+
9252+asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
9253+{
9254+ struct user_desc info;
9255+ struct desc_struct *desc;
9256+ int idx;
9257+
9258+ if (get_user(idx, &u_info->entry_number))
9259+ return -EFAULT;
9260+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
9261+ return -EINVAL;
9262+
9263+ memset(&info, 0, sizeof(info));
9264+
9265+ desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
9266+
9267+ info.entry_number = idx;
9268+ info.base_addr = GET_BASE(desc);
9269+ info.limit = GET_LIMIT(desc);
9270+ info.seg_32bit = GET_32BIT(desc);
9271+ info.contents = GET_CONTENTS(desc);
9272+ info.read_exec_only = !GET_WRITABLE(desc);
9273+ info.limit_in_pages = GET_LIMIT_PAGES(desc);
9274+ info.seg_not_present = !GET_PRESENT(desc);
9275+ info.useable = GET_USEABLE(desc);
9276+
9277+ if (copy_to_user(u_info, &info, sizeof(info)))
9278+ return -EFAULT;
9279+ return 0;
9280+}
9281+
9282+unsigned long arch_align_stack(unsigned long sp)
9283+{
9284+ if (randomize_va_space)
9285+ sp -= get_random_int() % 8192;
9286+ return sp & ~0xf;
9287+}
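
The __switch_to() implementation above batches its hypercalls (FPU task switch, stack switch, TLS updates, I/O bitmap changes) into one multicall rather than trapping to Xen for each. A minimal sketch of the same pattern, assuming the interfaces used in that function (the two-entry batch and the function name are illustrative):

static void example_batched_switch(unsigned long next_esp0)
{
	multicall_entry_t batch[2], *mcl = batch;

	/* Queue a kernel stack switch... */
	mcl->op = __HYPERVISOR_stack_switch;
	mcl->args[0] = __KERNEL_DS;
	mcl->args[1] = next_esp0;
	mcl++;

	/* ...and an FPU task-switch notification... */
	mcl->op = __HYPERVISOR_fpu_taskswitch;
	mcl->args[0] = 1;
	mcl++;

	/* ...then issue both with a single trap into Xen. */
	if (HYPERVISOR_multicall_check(batch, mcl - batch, NULL))
		BUG();
}
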
9288Index: head-2008-11-25/arch/x86/kernel/quirks-xen.c
9289===================================================================
9290--- /dev/null 1970-01-01 00:00:00.000000000 +0000
9291+++ head-2008-11-25/arch/x86/kernel/quirks-xen.c 2008-01-28 12:24:19.000000000 +0100
9292@@ -0,0 +1,47 @@
9293+/*
9294+ * This file contains work-arounds for x86 and x86_64 platform bugs.
9295+ */
9296+#include <linux/pci.h>
9297+#include <linux/irq.h>
9298+
9299+#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
9300+
9301+static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
9302+{
9303+ u8 config, rev;
9304+ u32 word;
9305+
9306+ /* BIOS may enable hardware IRQ balancing for
9307+ * E7520/E7320/E7525 (revision ID 0x9 and below)
9308+ * based platforms.
9309+ * Disable SW irqbalance/affinity on those platforms.
9310+ */
9311+ pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
9312+ if (rev > 0x9)
9313+ return;
9314+
9315+ printk(KERN_INFO "Intel E7520/7320/7525 detected.");
9316+
9317+ /* enable access to config space*/
9318+ pci_read_config_byte(dev, 0xf4, &config);
9319+ pci_write_config_byte(dev, 0xf4, config|0x2);
9320+
9321+ /* read xTPR register */
9322+ raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
9323+
9324+ if (!(word & (1 << 13))) {
9325+ struct xen_platform_op op;
9326+ printk(KERN_INFO "Disabling irq balancing and affinity\n");
9327+ op.cmd = XENPF_platform_quirk;
9328+ op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
9329+ WARN_ON(HYPERVISOR_platform_op(&op));
9330+ }
9331+
9332+ /* put back the original value for config space*/
9333+ if (!(config & 0x2))
9334+ pci_write_config_byte(dev, 0xf4, config);
9335+}
9336+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
9337+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
9338+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
9339+#endif
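
The DECLARE_PCI_FIXUP_FINAL() lines above attach quirk_intel_irqbalance() to the three E75xx MCH device IDs so the hook runs after PCI probing. A hedged sketch of the same declaration pattern for some other device; the vendor/device IDs and hook below are placeholders, not a real quirk:

static void __devinit quirk_example(struct pci_dev *dev)
{
	printk(KERN_INFO "example quirk for %s\n", pci_name(dev));
}
DECLARE_PCI_FIXUP_FINAL(0x1234, 0x5678, quirk_example);
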
9340Index: head-2008-11-25/arch/x86/kernel/setup_32-xen.c
9341===================================================================
9342--- /dev/null 1970-01-01 00:00:00.000000000 +0000
9343+++ head-2008-11-25/arch/x86/kernel/setup_32-xen.c 2008-04-22 15:41:51.000000000 +0200
9344@@ -0,0 +1,1919 @@
9345+/*
9346+ * linux/arch/i386/kernel/setup.c
9347+ *
9348+ * Copyright (C) 1995 Linus Torvalds
9349+ *
9350+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
9351+ *
9352+ * Memory region support
9353+ * David Parsons <orc@pell.chi.il.us>, July-August 1999
9354+ *
9355+ * Added E820 sanitization routine (removes overlapping memory regions);
9356+ * Brian Moyle <bmoyle@mvista.com>, February 2001
9357+ *
9358+ * Moved CPU detection code to cpu/${cpu}.c
9359+ * Patrick Mochel <mochel@osdl.org>, March 2002
9360+ *
9361+ * Provisions for empty E820 memory regions (reported by certain BIOSes).
9362+ * Alex Achenbach <xela@slit.de>, December 2002.
9363+ *
9364+ */
9365+
9366+/*
9367+ * This file handles the architecture-dependent parts of initialization
9368+ */
9369+
9370+#include <linux/sched.h>
9371+#include <linux/mm.h>
9372+#include <linux/mmzone.h>
9373+#include <linux/screen_info.h>
9374+#include <linux/ioport.h>
9375+#include <linux/acpi.h>
9376+#include <linux/apm_bios.h>
9377+#include <linux/initrd.h>
9378+#include <linux/bootmem.h>
9379+#include <linux/seq_file.h>
9380+#include <linux/platform_device.h>
9381+#include <linux/console.h>
9382+#include <linux/mca.h>
9383+#include <linux/root_dev.h>
9384+#include <linux/highmem.h>
9385+#include <linux/module.h>
9386+#include <linux/efi.h>
9387+#include <linux/init.h>
9388+#include <linux/edd.h>
9389+#include <linux/nodemask.h>
9390+#include <linux/kernel.h>
9391+#include <linux/percpu.h>
9392+#include <linux/notifier.h>
9393+#include <linux/kexec.h>
9394+#include <linux/crash_dump.h>
9395+#include <linux/dmi.h>
9396+#include <linux/pfn.h>
9397+
9398+#include <video/edid.h>
9399+
9400+#include <asm/apic.h>
9401+#include <asm/e820.h>
9402+#include <asm/mpspec.h>
9403+#include <asm/setup.h>
9404+#include <asm/arch_hooks.h>
9405+#include <asm/sections.h>
9406+#include <asm/io_apic.h>
9407+#include <asm/ist.h>
9408+#include <asm/io.h>
9409+#include <asm/hypervisor.h>
9410+#include <xen/interface/physdev.h>
9411+#include <xen/interface/memory.h>
9412+#include <xen/features.h>
9413+#include <xen/firmware.h>
9414+#include <xen/xencons.h>
9415+#include <setup_arch.h>
9416+#include <bios_ebda.h>
9417+
9418+#ifdef CONFIG_XEN
9419+#include <xen/interface/kexec.h>
9420+#endif
9421+
9422+/* Forward Declaration. */
9423+void __init find_max_pfn(void);
9424+
9425+static int xen_panic_event(struct notifier_block *, unsigned long, void *);
9426+static struct notifier_block xen_panic_block = {
9427+ xen_panic_event, NULL, 0 /* try to go last */
9428+};
9429+
9430+extern char hypercall_page[PAGE_SIZE];
9431+EXPORT_SYMBOL(hypercall_page);
9432+
9433+int disable_pse __devinitdata = 0;
9434+
9435+/*
9436+ * Machine setup..
9437+ */
9438+
9439+#ifdef CONFIG_EFI
9440+int efi_enabled = 0;
9441+EXPORT_SYMBOL(efi_enabled);
9442+#endif
9443+
9444+/* cpu data as detected by the assembly code in head.S */
9445+struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
9446+/* common cpu data for all cpus */
9447+struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
9448+EXPORT_SYMBOL(boot_cpu_data);
9449+
9450+unsigned long mmu_cr4_features;
9451+
9452+#ifdef CONFIG_ACPI
9453+ int acpi_disabled = 0;
9454+#else
9455+ int acpi_disabled = 1;
9456+#endif
9457+EXPORT_SYMBOL(acpi_disabled);
9458+
9459+#ifdef CONFIG_ACPI
9460+int __initdata acpi_force = 0;
9461+extern acpi_interrupt_flags acpi_sci_flags;
9462+#endif
9463+
9464+/* for MCA, but anyone else can use it if they want */
9465+unsigned int machine_id;
9466+#ifdef CONFIG_MCA
9467+EXPORT_SYMBOL(machine_id);
9468+#endif
9469+unsigned int machine_submodel_id;
9470+unsigned int BIOS_revision;
9471+unsigned int mca_pentium_flag;
9472+
9473+/* For PCI or other memory-mapped resources */
9474+unsigned long pci_mem_start = 0x10000000;
9475+#ifdef CONFIG_PCI
9476+EXPORT_SYMBOL(pci_mem_start);
9477+#endif
9478+
9479+/* Boot loader ID as an integer, for the benefit of proc_dointvec */
9480+int bootloader_type;
9481+
9482+/* user-defined highmem size */
9483+static unsigned int highmem_pages = -1;
9484+
9485+/*
9486+ * Setup options
9487+ */
9488+struct drive_info_struct { char dummy[32]; } drive_info;
9489+#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \
9490+ defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
9491+EXPORT_SYMBOL(drive_info);
9492+#endif
9493+struct screen_info screen_info;
9494+EXPORT_SYMBOL(screen_info);
9495+struct apm_info apm_info;
9496+EXPORT_SYMBOL(apm_info);
9497+struct sys_desc_table_struct {
9498+ unsigned short length;
9499+ unsigned char table[0];
9500+};
9501+struct edid_info edid_info;
9502+EXPORT_SYMBOL_GPL(edid_info);
9503+#ifndef CONFIG_XEN
9504+#define copy_edid() (edid_info = EDID_INFO)
9505+#endif
9506+struct ist_info ist_info;
9507+#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
9508+ defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
9509+EXPORT_SYMBOL(ist_info);
9510+#endif
9511+struct e820map e820;
9512+#ifdef CONFIG_XEN
9513+struct e820map machine_e820;
9514+#endif
9515+
9516+extern void early_cpu_init(void);
9517+extern void generic_apic_probe(char *);
9518+extern int root_mountflags;
9519+
9520+unsigned long saved_videomode;
9521+
9522+#define RAMDISK_IMAGE_START_MASK 0x07FF
9523+#define RAMDISK_PROMPT_FLAG 0x8000
9524+#define RAMDISK_LOAD_FLAG 0x4000
9525+
9526+static char command_line[COMMAND_LINE_SIZE];
9527+
9528+unsigned char __initdata boot_params[PARAM_SIZE];
9529+
9530+static struct resource data_resource = {
9531+ .name = "Kernel data",
9532+ .start = 0,
9533+ .end = 0,
9534+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
9535+};
9536+
9537+static struct resource code_resource = {
9538+ .name = "Kernel code",
9539+ .start = 0,
9540+ .end = 0,
9541+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
9542+};
9543+
9544+static struct resource system_rom_resource = {
9545+ .name = "System ROM",
9546+ .start = 0xf0000,
9547+ .end = 0xfffff,
9548+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9549+};
9550+
9551+static struct resource extension_rom_resource = {
9552+ .name = "Extension ROM",
9553+ .start = 0xe0000,
9554+ .end = 0xeffff,
9555+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9556+};
9557+
9558+static struct resource adapter_rom_resources[] = { {
9559+ .name = "Adapter ROM",
9560+ .start = 0xc8000,
9561+ .end = 0,
9562+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9563+}, {
9564+ .name = "Adapter ROM",
9565+ .start = 0,
9566+ .end = 0,
9567+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9568+}, {
9569+ .name = "Adapter ROM",
9570+ .start = 0,
9571+ .end = 0,
9572+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9573+}, {
9574+ .name = "Adapter ROM",
9575+ .start = 0,
9576+ .end = 0,
9577+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9578+}, {
9579+ .name = "Adapter ROM",
9580+ .start = 0,
9581+ .end = 0,
9582+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9583+}, {
9584+ .name = "Adapter ROM",
9585+ .start = 0,
9586+ .end = 0,
9587+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9588+} };
9589+
9590+#define ADAPTER_ROM_RESOURCES \
9591+ (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
9592+
9593+static struct resource video_rom_resource = {
9594+ .name = "Video ROM",
9595+ .start = 0xc0000,
9596+ .end = 0xc7fff,
9597+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9598+};
9599+
9600+static struct resource video_ram_resource = {
9601+ .name = "Video RAM area",
9602+ .start = 0xa0000,
9603+ .end = 0xbffff,
9604+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
9605+};
9606+
9607+static struct resource standard_io_resources[] = { {
9608+ .name = "dma1",
9609+ .start = 0x0000,
9610+ .end = 0x001f,
9611+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
9612+}, {
9613+ .name = "pic1",
9614+ .start = 0x0020,
9615+ .end = 0x0021,
9616+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
9617+}, {
9618+ .name = "timer0",
9619+ .start = 0x0040,
9620+ .end = 0x0043,
9621+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
9622+}, {
9623+ .name = "timer1",
9624+ .start = 0x0050,
9625+ .end = 0x0053,
9626+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
9627+}, {
9628+ .name = "keyboard",
9629+ .start = 0x0060,
9630+ .end = 0x006f,
9631+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
9632+}, {
9633+ .name = "dma page reg",
9634+ .start = 0x0080,
9635+ .end = 0x008f,
9636+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
9637+}, {
9638+ .name = "pic2",
9639+ .start = 0x00a0,
9640+ .end = 0x00a1,
9641+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
9642+}, {
9643+ .name = "dma2",
9644+ .start = 0x00c0,
9645+ .end = 0x00df,
9646+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
9647+}, {
9648+ .name = "fpu",
9649+ .start = 0x00f0,
9650+ .end = 0x00ff,
9651+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
9652+} };
9653+
9654+#define STANDARD_IO_RESOURCES \
9655+ (sizeof standard_io_resources / sizeof standard_io_resources[0])
9656+
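+/* Legacy option ROMs start with the signature bytes 0x55, 0xAA; read as a little-endian short this is 0xaa55. */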
9657+#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
9658+
9659+static int __init romchecksum(unsigned char *rom, unsigned long length)
9660+{
9661+ unsigned char *p, sum = 0;
9662+
9663+ for (p = rom; p < rom + length; p++)
9664+ sum += *p;
9665+ return sum == 0;
9666+}
9667+
9668+static void __init probe_roms(void)
9669+{
9670+ unsigned long start, length, upper;
9671+ unsigned char *rom;
9672+ int i;
9673+
9674+#ifdef CONFIG_XEN
9675+ /* Nothing to do if not running in dom0. */
9676+ if (!is_initial_xendomain())
9677+ return;
9678+#endif
9679+
9680+ /* video rom */
9681+ upper = adapter_rom_resources[0].start;
9682+ for (start = video_rom_resource.start; start < upper; start += 2048) {
9683+ rom = isa_bus_to_virt(start);
9684+ if (!romsignature(rom))
9685+ continue;
9686+
9687+ video_rom_resource.start = start;
9688+
9689+ /* 0 < length <= 0x7f * 512, historically */
9690+ length = rom[2] * 512;
9691+
9692+ /* if checksum okay, trust length byte */
9693+ if (length && romchecksum(rom, length))
9694+ video_rom_resource.end = start + length - 1;
9695+
9696+ request_resource(&iomem_resource, &video_rom_resource);
9697+ break;
9698+ }
9699+
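+	/* round up to the next 2 KiB boundary before continuing the scan */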
9700+ start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
9701+ if (start < upper)
9702+ start = upper;
9703+
9704+ /* system rom */
9705+ request_resource(&iomem_resource, &system_rom_resource);
9706+ upper = system_rom_resource.start;
9707+
9708+ /* check for extension rom (ignore length byte!) */
9709+ rom = isa_bus_to_virt(extension_rom_resource.start);
9710+ if (romsignature(rom)) {
9711+ length = extension_rom_resource.end - extension_rom_resource.start + 1;
9712+ if (romchecksum(rom, length)) {
9713+ request_resource(&iomem_resource, &extension_rom_resource);
9714+ upper = extension_rom_resource.start;
9715+ }
9716+ }
9717+
9718+ /* check for adapter roms on 2k boundaries */
9719+ for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
9720+ rom = isa_bus_to_virt(start);
9721+ if (!romsignature(rom))
9722+ continue;
9723+
9724+ /* 0 < length <= 0x7f * 512, historically */
9725+ length = rom[2] * 512;
9726+
9727+ /* but accept any length that fits if checksum okay */
9728+ if (!length || start + length > upper || !romchecksum(rom, length))
9729+ continue;
9730+
9731+ adapter_rom_resources[i].start = start;
9732+ adapter_rom_resources[i].end = start + length - 1;
9733+ request_resource(&iomem_resource, &adapter_rom_resources[i]);
9734+
9735+ start = adapter_rom_resources[i++].end & ~2047UL;
9736+ }
9737+}
9738+
9739+/*
9740+ * Point at the empty zero page to start with. We map the real shared_info
9741+ * page as soon as fixmap is up and running.
9742+ */
9743+shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
9744+EXPORT_SYMBOL(HYPERVISOR_shared_info);
9745+
9746+unsigned long *phys_to_machine_mapping;
9747+unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
9748+EXPORT_SYMBOL(phys_to_machine_mapping);
9749+
9750+/* Raw start-of-day parameters from the hypervisor. */
9751+start_info_t *xen_start_info;
9752+EXPORT_SYMBOL(xen_start_info);
9753+
9754+void __init add_memory_region(unsigned long long start,
9755+ unsigned long long size, int type)
9756+{
9757+ int x;
9758+
9759+ if (!efi_enabled) {
9760+ x = e820.nr_map;
9761+
9762+ if (x == E820MAX) {
9763+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
9764+ return;
9765+ }
9766+
9767+ e820.map[x].addr = start;
9768+ e820.map[x].size = size;
9769+ e820.map[x].type = type;
9770+ e820.nr_map++;
9771+ }
9772+} /* add_memory_region */
9773+
9774+static void __init limit_regions(unsigned long long size)
9775+{
9776+ unsigned long long current_addr = 0;
9777+ int i;
9778+
9779+ if (efi_enabled) {
9780+ efi_memory_desc_t *md;
9781+ void *p;
9782+
9783+ for (p = memmap.map, i = 0; p < memmap.map_end;
9784+ p += memmap.desc_size, i++) {
9785+ md = p;
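+			/* EFI descriptors count 4 KiB pages; num_pages << 12 converts to bytes */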
9786+ current_addr = md->phys_addr + (md->num_pages << 12);
9787+ if (md->type == EFI_CONVENTIONAL_MEMORY) {
9788+ if (current_addr >= size) {
9789+ md->num_pages -=
9790+ (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
9791+ memmap.nr_map = i + 1;
9792+ return;
9793+ }
9794+ }
9795+ }
9796+ }
9797+ for (i = 0; i < e820.nr_map; i++) {
9798+ current_addr = e820.map[i].addr + e820.map[i].size;
9799+ if (current_addr < size)
9800+ continue;
9801+
9802+ if (e820.map[i].type != E820_RAM)
9803+ continue;
9804+
9805+ if (e820.map[i].addr >= size) {
9806+ /*
9807+ * This region starts past the end of the
9808+ * requested size, skip it completely.
9809+ */
9810+ e820.nr_map = i;
9811+ } else {
9812+ e820.nr_map = i + 1;
9813+ e820.map[i].size -= current_addr - size;
9814+ }
9815+ return;
9816+ }
9817+#ifdef CONFIG_XEN
9818+ if (i==e820.nr_map && current_addr < size) {
9819+ /*
9820+ * The e820 map finished before our requested size so
9821+ * extend the final entry to the requested address.
9822+ */
9823+ --i;
9824+ if (e820.map[i].type == E820_RAM)
9825+ e820.map[i].size -= current_addr - size;
9826+ else
9827+ add_memory_region(current_addr, size - current_addr, E820_RAM);
9828+ }
9829+#endif
9830+}
9831+
9832+#define E820_DEBUG 1
9833+
9834+static void __init print_memory_map(char *who)
9835+{
9836+ int i;
9837+
9838+ for (i = 0; i < e820.nr_map; i++) {
9839+ printk(" %s: %016Lx - %016Lx ", who,
9840+ e820.map[i].addr,
9841+ e820.map[i].addr + e820.map[i].size);
9842+ switch (e820.map[i].type) {
9843+ case E820_RAM: printk("(usable)\n");
9844+ break;
9845+ case E820_RESERVED:
9846+ printk("(reserved)\n");
9847+ break;
9848+ case E820_ACPI:
9849+ printk("(ACPI data)\n");
9850+ break;
9851+ case E820_NVS:
9852+ printk("(ACPI NVS)\n");
9853+ break;
9854+ default: printk("type %lu\n", e820.map[i].type);
9855+ break;
9856+ }
9857+ }
9858+}
9859+
9860+/*
9861+ * Sanitize the BIOS e820 map.
9862+ *
9863+ * Some e820 responses include overlapping entries. The following
9864+ * replaces the original e820 map with a new one, removing overlaps.
9865+ *
9866+ */
9867+struct change_member {
9868+ struct e820entry *pbios; /* pointer to original bios entry */
9869+ unsigned long long addr; /* address for this change point */
9870+};
9871+static struct change_member change_point_list[2*E820MAX] __initdata;
9872+static struct change_member *change_point[2*E820MAX] __initdata;
9873+static struct e820entry *overlap_list[E820MAX] __initdata;
9874+static struct e820entry new_bios[E820MAX] __initdata;
9875+
9876+int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
9877+{
9878+ struct change_member *change_tmp;
9879+ unsigned long current_type, last_type;
9880+ unsigned long long last_addr;
9881+ int chgidx, still_changing;
9882+ int overlap_entries;
9883+ int new_bios_entry;
9884+ int old_nr, new_nr, chg_nr;
9885+ int i;
9886+
9887+ /*
9888+ Visually we're performing the following (1,2,3,4 = memory types)...
9889+
9890+ Sample memory map (w/overlaps):
9891+ ____22__________________
9892+ ______________________4_
9893+ ____1111________________
9894+ _44_____________________
9895+ 11111111________________
9896+ ____________________33__
9897+ ___________44___________
9898+ __________33333_________
9899+ ______________22________
9900+ ___________________2222_
9901+ _________111111111______
9902+ _____________________11_
9903+ _________________4______
9904+
9905+ Sanitized equivalent (no overlap):
9906+ 1_______________________
9907+ _44_____________________
9908+ ___1____________________
9909+ ____22__________________
9910+ ______11________________
9911+ _________1______________
9912+ __________3_____________
9913+ ___________44___________
9914+ _____________33_________
9915+ _______________2________
9916+ ________________1_______
9917+ _________________4______
9918+ ___________________2____
9919+ ____________________33__
9920+ ______________________4_
9921+ */
9922+
9923+ /* if there's only one memory region, don't bother */
9924+ if (*pnr_map < 2)
9925+ return -1;
9926+
9927+ old_nr = *pnr_map;
9928+
9929+ /* bail out if we find any unreasonable addresses in bios map */
9930+ for (i=0; i<old_nr; i++)
9931+ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
9932+ return -1;
9933+
9934+ /* create pointers for initial change-point information (for sorting) */
9935+ for (i=0; i < 2*old_nr; i++)
9936+ change_point[i] = &change_point_list[i];
9937+
9938+ /* record all known change-points (starting and ending addresses),
9939+ omitting those that are for empty memory regions */
9940+ chgidx = 0;
9941+ for (i=0; i < old_nr; i++) {
9942+ if (biosmap[i].size != 0) {
9943+ change_point[chgidx]->addr = biosmap[i].addr;
9944+ change_point[chgidx++]->pbios = &biosmap[i];
9945+ change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
9946+ change_point[chgidx++]->pbios = &biosmap[i];
9947+ }
9948+ }
9949+ chg_nr = chgidx; /* true number of change-points */
9950+
9951+ /* sort change-point list by memory addresses (low -> high) */
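+	/* simple bubble sort: keep sweeping until a pass makes no swaps */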
9952+ still_changing = 1;
9953+ while (still_changing) {
9954+ still_changing = 0;
9955+ for (i=1; i < chg_nr; i++) {
9956+ /* if <current_addr> > <last_addr>, swap */
9957+ /* or, if current=<start_addr> & last=<end_addr>, swap */
9958+ if ((change_point[i]->addr < change_point[i-1]->addr) ||
9959+ ((change_point[i]->addr == change_point[i-1]->addr) &&
9960+ (change_point[i]->addr == change_point[i]->pbios->addr) &&
9961+ (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
9962+ )
9963+ {
9964+ change_tmp = change_point[i];
9965+ change_point[i] = change_point[i-1];
9966+ change_point[i-1] = change_tmp;
9967+ still_changing=1;
9968+ }
9969+ }
9970+ }
9971+
9972+ /* create a new bios memory map, removing overlaps */
9973+ overlap_entries=0; /* number of entries in the overlap table */
9974+ new_bios_entry=0; /* index for creating new bios map entries */
9975+ last_type = 0; /* start with undefined memory type */
9976+ last_addr = 0; /* start with 0 as last starting address */
9977+	/* loop through change-points, determining the effect on the new bios map */
9978+ for (chgidx=0; chgidx < chg_nr; chgidx++)
9979+ {
9980+ /* keep track of all overlapping bios entries */
9981+ if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
9982+ {
9983+ /* add map entry to overlap list (> 1 entry implies an overlap) */
9984+ overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
9985+ }
9986+ else
9987+ {
9988+ /* remove entry from list (order independent, so swap with last) */
9989+ for (i=0; i<overlap_entries; i++)
9990+ {
9991+ if (overlap_list[i] == change_point[chgidx]->pbios)
9992+ overlap_list[i] = overlap_list[overlap_entries-1];
9993+ }
9994+ overlap_entries--;
9995+ }
9996+ /* if there are overlapping entries, decide which "type" to use */
9997+ /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
9998+ current_type = 0;
9999+ for (i=0; i<overlap_entries; i++)
10000+ if (overlap_list[i]->type > current_type)
10001+ current_type = overlap_list[i]->type;
10002+ /* continue building up new bios map based on this information */
10003+ if (current_type != last_type) {
10004+ if (last_type != 0) {
10005+ new_bios[new_bios_entry].size =
10006+ change_point[chgidx]->addr - last_addr;
10007+ /* move forward only if the new size was non-zero */
10008+ if (new_bios[new_bios_entry].size != 0)
10009+ if (++new_bios_entry >= E820MAX)
10010+ break; /* no more space left for new bios entries */
10011+ }
10012+ if (current_type != 0) {
10013+ new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
10014+ new_bios[new_bios_entry].type = current_type;
10015+ last_addr=change_point[chgidx]->addr;
10016+ }
10017+ last_type = current_type;
10018+ }
10019+ }
10020+ new_nr = new_bios_entry; /* retain count for new bios entries */
10021+
10022+ /* copy new bios mapping into original location */
10023+ memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
10024+ *pnr_map = new_nr;
10025+
10026+ return 0;
10027+}
10028+
10029+/*
10030+ * Copy the BIOS e820 map into a safe place.
10031+ *
10032+ * Sanity-check it while we're at it..
10033+ *
10034+ * If we're lucky and live on a modern system, the setup code
10035+ * will have given us a memory map that we can use to properly
10036+ * set up memory. If we aren't, we'll fake a memory map.
10037+ *
10038+ * We check to see that the memory map contains at least 2 elements
10039+ * before we'll use it, because the detection code in setup.S may
10040+ * not be perfect and most every PC known to man has two memory
10041+ * regions: one from 0 to 640k, and one from 1mb up. (The IBM
10042+ * thinkpad 560x, for example, does not cooperate with the memory
10043+ * detection code.)
10044+ */
10045+int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
10046+{
10047+#ifndef CONFIG_XEN
10048+ /* Only one memory region (or negative)? Ignore it */
10049+ if (nr_map < 2)
10050+ return -1;
10051+#else
10052+ BUG_ON(nr_map < 1);
10053+#endif
10054+
10055+ do {
10056+ unsigned long long start = biosmap->addr;
10057+ unsigned long long size = biosmap->size;
10058+ unsigned long long end = start + size;
10059+ unsigned long type = biosmap->type;
10060+
10061+ /* Overflow in 64 bits? Ignore the memory map. */
10062+ if (start > end)
10063+ return -1;
10064+
10065+#ifndef CONFIG_XEN
10066+ /*
10067+ * Some BIOSes claim RAM in the 640k - 1M region.
10068+ * Not right. Fix it up.
10069+ */
10070+ if (type == E820_RAM) {
10071+ if (start < 0x100000ULL && end > 0xA0000ULL) {
10072+ if (start < 0xA0000ULL)
10073+ add_memory_region(start, 0xA0000ULL-start, type);
10074+ if (end <= 0x100000ULL)
10075+ continue;
10076+ start = 0x100000ULL;
10077+ size = end - start;
10078+ }
10079+ }
10080+#endif
10081+ add_memory_region(start, size, type);
10082+ } while (biosmap++,--nr_map);
10083+
10084+#ifdef CONFIG_XEN
10085+ if (is_initial_xendomain()) {
10086+ struct xen_memory_map memmap;
10087+
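+		/* dom0: fetch the machine memory map from the hypervisor; other domains reuse their own e820 below */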
10088+ memmap.nr_entries = E820MAX;
10089+ set_xen_guest_handle(memmap.buffer, machine_e820.map);
10090+
10091+ if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
10092+ BUG();
10093+ machine_e820.nr_map = memmap.nr_entries;
10094+ } else
10095+ machine_e820 = e820;
10096+#endif
10097+
10098+ return 0;
10099+}
10100+
10101+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
10102+struct edd edd;
10103+#ifdef CONFIG_EDD_MODULE
10104+EXPORT_SYMBOL(edd);
10105+#endif
10106+#ifndef CONFIG_XEN
10107+/**
10108+ * copy_edd() - Copy the BIOS EDD information
10109+ * from boot_params into a safe place.
10110+ *
10111+ */
10112+static inline void copy_edd(void)
10113+{
10114+ memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
10115+ memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
10116+ edd.mbr_signature_nr = EDD_MBR_SIG_NR;
10117+ edd.edd_info_nr = EDD_NR;
10118+}
10119+#endif
10120+#else
10121+static inline void copy_edd(void)
10122+{
10123+}
10124+#endif
10125+
10126+static void __init parse_cmdline_early (char ** cmdline_p)
10127+{
10128+ char c = ' ', *to = command_line, *from = saved_command_line;
10129+ int len = 0, max_cmdline;
10130+ int userdef = 0;
10131+
10132+ if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
10133+ max_cmdline = COMMAND_LINE_SIZE;
10134+ memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
10135+ /* Save unparsed command line copy for /proc/cmdline */
10136+ saved_command_line[max_cmdline-1] = '\0';
10137+
10138+ for (;;) {
10139+ if (c != ' ')
10140+ goto next_char;
10141+ /*
10142+ * "mem=nopentium" disables the 4MB page tables.
10143+ * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
10144+ * to <mem>, overriding the bios size.
10145+ * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
10146+ * <start> to <start>+<mem>, overriding the bios size.
10147+ *
10148+ * HPA tells me bootloaders need to parse mem=, so no new
10149+ * option should be mem= [also see Documentation/i386/boot.txt]
10150+ */
10151+ if (!memcmp(from, "mem=", 4)) {
10152+ if (to != command_line)
10153+ to--;
10154+ if (!memcmp(from+4, "nopentium", 9)) {
10155+ from += 9+4;
10156+ clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
10157+ disable_pse = 1;
10158+ } else {
10159+ /* If the user specifies memory size, we
10160+ * limit the BIOS-provided memory map to
10161+ * that size. exactmap can be used to specify
10162+ * the exact map. mem=number can be used to
10163+ * trim the existing memory map.
10164+ */
10165+ unsigned long long mem_size;
10166+
10167+ mem_size = memparse(from+4, &from);
10168+ limit_regions(mem_size);
10169+ userdef=1;
10170+ }
10171+ }
10172+
10173+ else if (!memcmp(from, "memmap=", 7)) {
10174+ if (to != command_line)
10175+ to--;
10176+ if (!memcmp(from+7, "exactmap", 8)) {
10177+#ifdef CONFIG_CRASH_DUMP
10178+ /* If we are doing a crash dump, we
10179+ * still need to know the real mem
10180+ * size before original memory map is
10181+ * reset.
10182+ */
10183+ find_max_pfn();
10184+ saved_max_pfn = max_pfn;
10185+#endif
10186+ from += 8+7;
10187+ e820.nr_map = 0;
10188+ userdef = 1;
10189+ } else {
10190+ /* If the user specifies memory size, we
10191+ * limit the BIOS-provided memory map to
10192+ * that size. exactmap can be used to specify
10193+ * the exact map. mem=number can be used to
10194+ * trim the existing memory map.
10195+ */
10196+ unsigned long long start_at, mem_size;
10197+
10198+ mem_size = memparse(from+7, &from);
10199+ if (*from == '@') {
10200+ start_at = memparse(from+1, &from);
10201+ add_memory_region(start_at, mem_size, E820_RAM);
10202+ } else if (*from == '#') {
10203+ start_at = memparse(from+1, &from);
10204+ add_memory_region(start_at, mem_size, E820_ACPI);
10205+ } else if (*from == '$') {
10206+ start_at = memparse(from+1, &from);
10207+ add_memory_region(start_at, mem_size, E820_RESERVED);
10208+ } else {
10209+ limit_regions(mem_size);
10210+ userdef=1;
10211+ }
10212+ }
10213+ }
10214+
10215+ else if (!memcmp(from, "noexec=", 7))
10216+ noexec_setup(from + 7);
10217+
10218+
10219+#ifdef CONFIG_X86_MPPARSE
10220+ /*
10221+ * If the BIOS enumerates physical processors before logical,
10222+ * maxcpus=N at enumeration-time can be used to disable HT.
10223+ */
10224+ else if (!memcmp(from, "maxcpus=", 8)) {
10225+ extern unsigned int maxcpus;
10226+
10227+ maxcpus = simple_strtoul(from + 8, NULL, 0);
10228+ }
10229+#endif
10230+
10231+#ifdef CONFIG_ACPI
10232+ /* "acpi=off" disables both ACPI table parsing and interpreter */
10233+ else if (!memcmp(from, "acpi=off", 8)) {
10234+ disable_acpi();
10235+ }
10236+
10237+ /* acpi=force to over-ride black-list */
10238+ else if (!memcmp(from, "acpi=force", 10)) {
10239+ acpi_force = 1;
10240+ acpi_ht = 1;
10241+ acpi_disabled = 0;
10242+ }
10243+
10244+ /* acpi=strict disables out-of-spec workarounds */
10245+ else if (!memcmp(from, "acpi=strict", 11)) {
10246+ acpi_strict = 1;
10247+ }
10248+
10249+ /* Limit ACPI just to boot-time to enable HT */
10250+ else if (!memcmp(from, "acpi=ht", 7)) {
10251+ if (!acpi_force)
10252+ disable_acpi();
10253+ acpi_ht = 1;
10254+ }
10255+
10256+ /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
10257+ else if (!memcmp(from, "pci=noacpi", 10)) {
10258+ acpi_disable_pci();
10259+ }
10260+ /* "acpi=noirq" disables ACPI interrupt routing */
10261+ else if (!memcmp(from, "acpi=noirq", 10)) {
10262+ acpi_noirq_set();
10263+ }
10264+
10265+ else if (!memcmp(from, "acpi_sci=edge", 13))
10266+ acpi_sci_flags.trigger = 1;
10267+
10268+ else if (!memcmp(from, "acpi_sci=level", 14))
10269+ acpi_sci_flags.trigger = 3;
10270+
10271+ else if (!memcmp(from, "acpi_sci=high", 13))
10272+ acpi_sci_flags.polarity = 1;
10273+
10274+ else if (!memcmp(from, "acpi_sci=low", 12))
10275+ acpi_sci_flags.polarity = 3;
10276+
10277+#ifdef CONFIG_X86_IO_APIC
10278+ else if (!memcmp(from, "acpi_skip_timer_override", 24))
10279+ acpi_skip_timer_override = 1;
10280+
10281+ if (!memcmp(from, "disable_timer_pin_1", 19))
10282+ disable_timer_pin_1 = 1;
10283+ if (!memcmp(from, "enable_timer_pin_1", 18))
10284+ disable_timer_pin_1 = -1;
10285+
10286+ /* disable IO-APIC */
10287+ else if (!memcmp(from, "noapic", 6))
10288+ disable_ioapic_setup();
10289+#endif /* CONFIG_X86_IO_APIC */
10290+#endif /* CONFIG_ACPI */
10291+
10292+#ifdef CONFIG_X86_LOCAL_APIC
10293+ /* enable local APIC */
10294+ else if (!memcmp(from, "lapic", 5))
10295+ lapic_enable();
10296+
10297+ /* disable local APIC */
10298+ else if (!memcmp(from, "nolapic", 6))
10299+ lapic_disable();
10300+#endif /* CONFIG_X86_LOCAL_APIC */
10301+
10302+#ifdef CONFIG_KEXEC
10303+ /* crashkernel=size@addr specifies the location to reserve for
10304+ * a crash kernel. By reserving this memory we guarantee
10305+	 * that Linux never sets it up as a DMA target.
10306+ * Useful for holding code to do something appropriate
10307+ * after a kernel panic.
10308+ */
10309+ else if (!memcmp(from, "crashkernel=", 12)) {
10310+#ifndef CONFIG_XEN
10311+ unsigned long size, base;
10312+ size = memparse(from+12, &from);
10313+ if (*from == '@') {
10314+ base = memparse(from+1, &from);
10315+ /* FIXME: Do I want a sanity check
10316+ * to validate the memory range?
10317+ */
10318+ crashk_res.start = base;
10319+ crashk_res.end = base + size - 1;
10320+ }
10321+#else
10322+ printk("Ignoring crashkernel command line, "
10323+ "parameter will be supplied by xen\n");
10324+#endif
10325+ }
10326+#endif
10327+#ifdef CONFIG_PROC_VMCORE
10328+ /* elfcorehdr= specifies the location of elf core header
10329+ * stored by the crashed kernel.
10330+ */
10331+ else if (!memcmp(from, "elfcorehdr=", 11))
10332+ elfcorehdr_addr = memparse(from+11, &from);
10333+#endif
10334+
10335+ /*
10336+ * highmem=size forces highmem to be exactly 'size' bytes.
10337+ * This works even on boxes that have no highmem otherwise.
10338+ * This also works to reduce highmem size on bigger boxes.
10339+ */
10340+ else if (!memcmp(from, "highmem=", 8))
10341+ highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
10342+
10343+ /*
10344+ * vmalloc=size forces the vmalloc area to be exactly 'size'
10345+ * bytes. This can be used to increase (or decrease) the
10346+ * vmalloc area - the default is 128m.
10347+ */
10348+ else if (!memcmp(from, "vmalloc=", 8))
10349+ __VMALLOC_RESERVE = memparse(from+8, &from);
10350+
10351+ next_char:
10352+ c = *(from++);
10353+ if (!c)
10354+ break;
10355+ if (COMMAND_LINE_SIZE <= ++len)
10356+ break;
10357+ *(to++) = c;
10358+ }
10359+ *to = '\0';
10360+ *cmdline_p = command_line;
10361+ if (userdef) {
10362+ printk(KERN_INFO "user-defined physical RAM map:\n");
10363+ print_memory_map("user");
10364+ }
10365+}
10366+
10367+/*
10368+ * Callback for efi_memory_walk.
10369+ */
10370+static int __init
10371+efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
10372+{
10373+ unsigned long *max_pfn = arg, pfn;
10374+
10375+ if (start < end) {
10376+ pfn = PFN_UP(end -1);
10377+ if (pfn > *max_pfn)
10378+ *max_pfn = pfn;
10379+ }
10380+ return 0;
10381+}
10382+
10383+static int __init
10384+efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
10385+{
10386+ memory_present(0, start, end);
10387+ return 0;
10388+}
10389+
10390+/*
10391+ * This function checks if any part of the range <start,end> is mapped
10392+ * with type.
10393+ */
10394+int
10395+e820_any_mapped(u64 start, u64 end, unsigned type)
10396+{
10397+ int i;
10398+
10399+#ifndef CONFIG_XEN
10400+ for (i = 0; i < e820.nr_map; i++) {
10401+ const struct e820entry *ei = &e820.map[i];
10402+#else
10403+ if (!is_initial_xendomain())
10404+ return 0;
10405+ for (i = 0; i < machine_e820.nr_map; ++i) {
10406+ const struct e820entry *ei = &machine_e820.map[i];
10407+#endif
10408+
10409+ if (type && ei->type != type)
10410+ continue;
10411+ if (ei->addr >= end || ei->addr + ei->size <= start)
10412+ continue;
10413+ return 1;
10414+ }
10415+ return 0;
10416+}
10417+EXPORT_SYMBOL_GPL(e820_any_mapped);
10418+
10419+ /*
10420+ * This function checks if the entire range <start,end> is mapped with type.
10421+ *
10422+ * Note: this function only works correctly if the e820 table is sorted and
10423+ * non-overlapping, which is the case
10424+ */
10425+int __init
10426+e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
10427+{
10428+ u64 start = s;
10429+ u64 end = e;
10430+ int i;
10431+
10432+#ifndef CONFIG_XEN
10433+ for (i = 0; i < e820.nr_map; i++) {
10434+ struct e820entry *ei = &e820.map[i];
10435+#else
10436+ if (!is_initial_xendomain())
10437+ return 0;
10438+ for (i = 0; i < machine_e820.nr_map; ++i) {
10439+ const struct e820entry *ei = &machine_e820.map[i];
10440+#endif
10441+ if (type && ei->type != type)
10442+ continue;
10443+		/* does the region (or part of it) overlap the current region? */
10444+ if (ei->addr >= end || ei->addr + ei->size <= start)
10445+ continue;
10446+ /* if the region is at the beginning of <start,end> we move
10447+ * start to the end of the region since it's ok until there
10448+ */
10449+ if (ei->addr <= start)
10450+ start = ei->addr + ei->size;
10451+ /* if start is now at or beyond end, we're done, full
10452+ * coverage */
10453+ if (start >= end)
10454+ return 1; /* we're done */
10455+ }
10456+ return 0;
10457+}
10458+
10459+/*
10460+ * Find the highest page frame number we have available
10461+ */
10462+void __init find_max_pfn(void)
10463+{
10464+ int i;
10465+
10466+ max_pfn = 0;
10467+ if (efi_enabled) {
10468+ efi_memmap_walk(efi_find_max_pfn, &max_pfn);
10469+ efi_memmap_walk(efi_memory_present_wrapper, NULL);
10470+ return;
10471+ }
10472+
10473+ for (i = 0; i < e820.nr_map; i++) {
10474+ unsigned long start, end;
10475+ /* RAM? */
10476+ if (e820.map[i].type != E820_RAM)
10477+ continue;
10478+ start = PFN_UP(e820.map[i].addr);
10479+ end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
10480+ if (start >= end)
10481+ continue;
10482+ if (end > max_pfn)
10483+ max_pfn = end;
10484+ memory_present(0, start, end);
10485+ }
10486+}
10487+
10488+/*
10489+ * Determine low and high memory ranges:
10490+ */
10491+unsigned long __init find_max_low_pfn(void)
10492+{
10493+ unsigned long max_low_pfn;
10494+
10495+ max_low_pfn = max_pfn;
10496+ if (max_low_pfn > MAXMEM_PFN) {
10497+ if (highmem_pages == -1)
10498+ highmem_pages = max_pfn - MAXMEM_PFN;
10499+ if (highmem_pages + MAXMEM_PFN < max_pfn)
10500+ max_pfn = MAXMEM_PFN + highmem_pages;
10501+ if (highmem_pages + MAXMEM_PFN > max_pfn) {
10502+ printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
10503+ highmem_pages = 0;
10504+ }
10505+ max_low_pfn = MAXMEM_PFN;
10506+#ifndef CONFIG_HIGHMEM
10507+ /* Maximum memory usable is what is directly addressable */
10508+ printk(KERN_WARNING "Warning only %ldMB will be used.\n",
10509+ MAXMEM>>20);
10510+ if (max_pfn > MAX_NONPAE_PFN)
10511+ printk(KERN_WARNING "Use a PAE enabled kernel.\n");
10512+ else
10513+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
10514+ max_pfn = MAXMEM_PFN;
10515+#else /* !CONFIG_HIGHMEM */
10516+#ifndef CONFIG_X86_PAE
10517+ if (max_pfn > MAX_NONPAE_PFN) {
10518+ max_pfn = MAX_NONPAE_PFN;
10519+ printk(KERN_WARNING "Warning only 4GB will be used.\n");
10520+ printk(KERN_WARNING "Use a PAE enabled kernel.\n");
10521+ }
10522+#endif /* !CONFIG_X86_PAE */
10523+#endif /* !CONFIG_HIGHMEM */
10524+ } else {
10525+ if (highmem_pages == -1)
10526+ highmem_pages = 0;
10527+#ifdef CONFIG_HIGHMEM
10528+ if (highmem_pages >= max_pfn) {
10529+ printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
10530+ highmem_pages = 0;
10531+ }
10532+ if (highmem_pages) {
10533+ if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
10534+ printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
10535+ highmem_pages = 0;
10536+ }
10537+ max_low_pfn -= highmem_pages;
10538+ }
10539+#else
10540+ if (highmem_pages)
10541+ printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
10542+#endif
10543+ }
10544+ return max_low_pfn;
10545+}
10546+
10547+/*
10548+ * Free all available memory for boot time allocation. Used
10549+ * as a callback function by efi_memory_walk()
10550+ */
10551+
10552+static int __init
10553+free_available_memory(unsigned long start, unsigned long end, void *arg)
10554+{
10555+ /* check max_low_pfn */
10556+ if (start >= (max_low_pfn << PAGE_SHIFT))
10557+ return 0;
10558+ if (end >= (max_low_pfn << PAGE_SHIFT))
10559+ end = max_low_pfn << PAGE_SHIFT;
10560+ if (start < end)
10561+ free_bootmem(start, end - start);
10562+
10563+ return 0;
10564+}
10565+/*
10566+ * Register fully available low RAM pages with the bootmem allocator.
10567+ */
10568+static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
10569+{
10570+ int i;
10571+
10572+ if (efi_enabled) {
10573+ efi_memmap_walk(free_available_memory, NULL);
10574+ return;
10575+ }
10576+ for (i = 0; i < e820.nr_map; i++) {
10577+ unsigned long curr_pfn, last_pfn, size;
10578+ /*
10579+ * Reserve usable low memory
10580+ */
10581+ if (e820.map[i].type != E820_RAM)
10582+ continue;
10583+ /*
10584+ * We are rounding up the start address of usable memory:
10585+ */
10586+ curr_pfn = PFN_UP(e820.map[i].addr);
10587+ if (curr_pfn >= max_low_pfn)
10588+ continue;
10589+ /*
10590+ * ... and at the end of the usable range downwards:
10591+ */
10592+ last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
10593+
10594+#ifdef CONFIG_XEN
10595+ /*
10596+ * Truncate to the number of actual pages currently
10597+ * present.
10598+ */
10599+ if (last_pfn > xen_start_info->nr_pages)
10600+ last_pfn = xen_start_info->nr_pages;
10601+#endif
10602+
10603+ if (last_pfn > max_low_pfn)
10604+ last_pfn = max_low_pfn;
10605+
10606+ /*
10607+ * .. finally, did all the rounding and playing
10608+ * around just make the area go away?
10609+ */
10610+ if (last_pfn <= curr_pfn)
10611+ continue;
10612+
10613+ size = last_pfn - curr_pfn;
10614+ free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
10615+ }
10616+}
10617+
10618+#ifndef CONFIG_XEN
10619+/*
10620+ * workaround for Dell systems that neglect to reserve EBDA
10621+ */
10622+static void __init reserve_ebda_region(void)
10623+{
10624+ unsigned int addr;
10625+ addr = get_bios_ebda();
10626+ if (addr)
10627+ reserve_bootmem(addr, PAGE_SIZE);
10628+}
10629+#endif
10630+
10631+#ifndef CONFIG_NEED_MULTIPLE_NODES
10632+void __init setup_bootmem_allocator(void);
10633+static unsigned long __init setup_memory(void)
10634+{
10635+ /*
10636+ * partially used pages are not usable - thus
10637+ * we are rounding upwards:
10638+ */
10639+ min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
10640+ xen_start_info->nr_pt_frames;
10641+
10642+ find_max_pfn();
10643+
10644+ max_low_pfn = find_max_low_pfn();
10645+
10646+#ifdef CONFIG_HIGHMEM
10647+ highstart_pfn = highend_pfn = max_pfn;
10648+ if (max_pfn > max_low_pfn) {
10649+ highstart_pfn = max_low_pfn;
10650+ }
10651+ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
10652+ pages_to_mb(highend_pfn - highstart_pfn));
10653+#endif
10654+ printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
10655+ pages_to_mb(max_low_pfn));
10656+
10657+ setup_bootmem_allocator();
10658+
10659+ return max_low_pfn;
10660+}
10661+
10662+void __init zone_sizes_init(void)
10663+{
10664+ unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
10665+ unsigned int max_dma, low;
10666+
10667+ max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
10668+ low = max_low_pfn;
10669+
10670+ if (low < max_dma)
10671+ zones_size[ZONE_DMA] = low;
10672+ else {
10673+ zones_size[ZONE_DMA] = max_dma;
10674+ zones_size[ZONE_NORMAL] = low - max_dma;
10675+#ifdef CONFIG_HIGHMEM
10676+ zones_size[ZONE_HIGHMEM] = highend_pfn - low;
10677+#endif
10678+ }
10679+ free_area_init(zones_size);
10680+}
10681+#else
10682+extern unsigned long __init setup_memory(void);
10683+extern void zone_sizes_init(void);
10684+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
10685+
10686+void __init setup_bootmem_allocator(void)
10687+{
10688+ unsigned long bootmap_size;
10689+ /*
10690+ * Initialize the boot-time allocator (with low memory only):
10691+ */
10692+ bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
10693+
10694+ register_bootmem_low_pages(max_low_pfn);
10695+
10696+ /*
10697+ * Reserve the bootmem bitmap itself as well. We do this in two
10698+ * steps (first step was init_bootmem()) because this catches
10699+ * the (very unlikely) case of us accidentally initializing the
10700+ * bootmem allocator with an invalid RAM area.
10701+ */
10702+ reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
10703+ bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START));
10704+
10705+#ifndef CONFIG_XEN
10706+ /*
10707+ * reserve physical page 0 - it's a special BIOS page on many boxes,
10708+ * enabling clean reboots, SMP operation, laptop functions.
10709+ */
10710+ reserve_bootmem(0, PAGE_SIZE);
10711+
10712+ /* reserve EBDA region, it's a 4K region */
10713+ reserve_ebda_region();
10714+
10715+ /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
10716+ PCI prefetch into it (errata #56). Usually the page is reserved anyways,
10717+ unless you have no PS/2 mouse plugged in. */
10718+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
10719+ boot_cpu_data.x86 == 6)
10720+ reserve_bootmem(0xa0000 - 4096, 4096);
10721+
10722+#ifdef CONFIG_SMP
10723+ /*
10724+ * But first pinch a few for the stack/trampoline stuff
10725+ * FIXME: Don't need the extra page at 4K, but need to fix
10726+ * trampoline before removing it. (see the GDT stuff)
10727+ */
10728+ reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
10729+#endif
10730+#ifdef CONFIG_ACPI_SLEEP
10731+ /*
10732+ * Reserve low memory region for sleep support.
10733+ */
10734+ acpi_reserve_bootmem();
10735+#endif
10736+#endif /* !CONFIG_XEN */
10737+
10738+#ifdef CONFIG_BLK_DEV_INITRD
10739+ if (xen_start_info->mod_start) {
10740+ if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
10741+ /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/
10742+ initrd_start = INITRD_START + PAGE_OFFSET;
10743+ initrd_end = initrd_start+INITRD_SIZE;
10744+ initrd_below_start_ok = 1;
10745+ }
10746+ else {
10747+ printk(KERN_ERR "initrd extends beyond end of memory "
10748+ "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
10749+ INITRD_START + INITRD_SIZE,
10750+ max_low_pfn << PAGE_SHIFT);
10751+ initrd_start = 0;
10752+ }
10753+ }
10754+#endif
10755+#ifdef CONFIG_KEXEC
10756+#ifdef CONFIG_XEN
10757+ xen_machine_kexec_setup_resources();
10758+#else
10759+ if (crashk_res.start != crashk_res.end)
10760+ reserve_bootmem(crashk_res.start,
10761+ crashk_res.end - crashk_res.start + 1);
10762+#endif
10763+#endif
10764+}
10765+
10766+/*
10767+ * The node 0 pgdat is initialized before all of these because
10768+ * it's needed for bootmem. node>0 pgdats have their virtual
10769+ * space allocated before the pagetables are in place to access
10770+ * them, so they can't be cleared then.
10771+ *
10772+ * This should all compile down to nothing when NUMA is off.
10773+ */
10774+void __init remapped_pgdat_init(void)
10775+{
10776+ int nid;
10777+
10778+ for_each_online_node(nid) {
10779+ if (nid != 0)
10780+ memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
10781+ }
10782+}
10783+
10784+/*
10785+ * Request address space for all standard RAM and ROM resources
10786+ * and also for regions reported as reserved by the e820.
10787+ */
10788+static void __init
10789+legacy_init_iomem_resources(struct e820entry *e820, int nr_map,
10790+ struct resource *code_resource,
10791+ struct resource *data_resource)
10792+{
10793+ int i;
10794+
10795+ probe_roms();
10796+
10797+ for (i = 0; i < nr_map; i++) {
10798+ struct resource *res;
10799+#ifndef CONFIG_RESOURCES_64BIT
10800+ if (e820[i].addr + e820[i].size > 0x100000000ULL)
10801+ continue;
10802+#endif
10803+ res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
10804+ switch (e820[i].type) {
10805+ case E820_RAM: res->name = "System RAM"; break;
10806+ case E820_ACPI: res->name = "ACPI Tables"; break;
10807+ case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
10808+ default: res->name = "reserved";
10809+ }
10810+ res->start = e820[i].addr;
10811+ res->end = res->start + e820[i].size - 1;
10812+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
10813+ if (request_resource(&iomem_resource, res)) {
10814+ kfree(res);
10815+ continue;
10816+ }
10817+ if (e820[i].type == E820_RAM) {
10818+ /*
10819+ * We don't know which RAM region contains kernel data,
10820+ * so we try it repeatedly and let the resource manager
10821+ * test it.
10822+ */
10823+#ifndef CONFIG_XEN
10824+ request_resource(res, code_resource);
10825+ request_resource(res, data_resource);
10826+#endif
10827+#ifdef CONFIG_KEXEC
10828+ if (crashk_res.start != crashk_res.end)
10829+ request_resource(res, &crashk_res);
10830+#ifdef CONFIG_XEN
10831+ xen_machine_kexec_register_resources(res);
10832+#endif
10833+#endif
10834+ }
10835+ }
10836+}
10837+
10838+/*
10839+ * Locate an unused range of the physical address space below 4G which
10840+ * can be used for PCI mappings.
10841+ */
10842+static void __init
10843+e820_setup_gap(struct e820entry *e820, int nr_map)
10844+{
10845+ unsigned long gapstart, gapsize, round;
10846+ unsigned long long last;
10847+ int i;
10848+
10849+ /*
10850+	 * Search for the biggest gap in the low 32 bits of the e820
10851+ * memory space.
10852+ */
10853+ last = 0x100000000ull;
10854+ gapstart = 0x10000000;
10855+ gapsize = 0x400000;
10856+ i = nr_map;
10857+ while (--i >= 0) {
10858+ unsigned long long start = e820[i].addr;
10859+ unsigned long long end = start + e820[i].size;
10860+
10861+ /*
10862+ * Since "last" is at most 4GB, we know we'll
10863+ * fit in 32 bits if this condition is true
10864+ */
10865+ if (last > end) {
10866+ unsigned long gap = last - end;
10867+
10868+ if (gap > gapsize) {
10869+ gapsize = gap;
10870+ gapstart = end;
10871+ }
10872+ }
10873+ if (start < last)
10874+ last = start;
10875+ }
10876+
10877+ /*
10878+ * See how much we want to round up: start off with
10879+ * rounding to the next 1MB area.
10880+ */
10881+ round = 0x100000;
10882+ while ((gapsize >> 4) > round)
10883+ round += round;
10884+ /* Fun with two's complement */
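+	/* round is a power of two, so ANDing with -round aligns the sum down to a multiple of round */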
10885+ pci_mem_start = (gapstart + round) & -round;
10886+
10887+ printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
10888+ pci_mem_start, gapstart, gapsize);
10889+}
10890+
10891+/*
10892+ * Request address space for all standard resources
10893+ *
10894+ * This is called just before pcibios_init(), which is also a
10895+ * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
10896+ */
10897+static int __init request_standard_resources(void)
10898+{
10899+ int i;
10900+
10901+ /* Nothing to do if not running in dom0. */
10902+ if (!is_initial_xendomain())
10903+ return 0;
10904+
10905+ printk("Setting up standard PCI resources\n");
10906+#ifdef CONFIG_XEN
10907+ legacy_init_iomem_resources(machine_e820.map, machine_e820.nr_map,
10908+ &code_resource, &data_resource);
10909+#else
10910+ if (efi_enabled)
10911+ efi_initialize_iomem_resources(&code_resource, &data_resource);
10912+ else
10913+ legacy_init_iomem_resources(e820.map, e820.nr_map,
10914+ &code_resource, &data_resource);
10915+#endif
10916+
10917+ /* EFI systems may still have VGA */
10918+ request_resource(&iomem_resource, &video_ram_resource);
10919+
10920+ /* request I/O space for devices used on all i[345]86 PCs */
10921+ for (i = 0; i < STANDARD_IO_RESOURCES; i++)
10922+ request_resource(&ioport_resource, &standard_io_resources[i]);
10923+ return 0;
10924+}
10925+
10926+subsys_initcall(request_standard_resources);
10927+
10928+static void __init register_memory(void)
10929+{
10930+#ifdef CONFIG_XEN
10931+ if (is_initial_xendomain())
10932+ e820_setup_gap(machine_e820.map, machine_e820.nr_map);
10933+ else
10934+#endif
10935+ e820_setup_gap(e820.map, e820.nr_map);
10936+}
10937+
10938+#ifdef CONFIG_MCA
10939+static void set_mca_bus(int x)
10940+{
10941+ MCA_bus = x;
10942+}
10943+#else
10944+static void set_mca_bus(int x) { }
10945+#endif
10946+
10947+/*
10948+ * Determine if we were loaded by an EFI loader. If so, then we have also been
10949+ * passed the efi memmap, systab, etc., so we should use these data structures
10950+ * for initialization. Note, the efi init code path is determined by the
10951+ * global efi_enabled. This allows the same kernel image to be used on existing
10952+ * systems (with a traditional BIOS) as well as on EFI systems.
10953+ */
10954+void __init setup_arch(char **cmdline_p)
10955+{
10956+ int i, j, k, fpp;
10957+ struct physdev_set_iopl set_iopl;
10958+ unsigned long max_low_pfn;
10959+ unsigned long p2m_pages;
10960+
10961+ /* Force a quick death if the kernel panics (not domain 0). */
10962+ extern int panic_timeout;
10963+ if (!panic_timeout && !is_initial_xendomain())
10964+ panic_timeout = 1;
10965+
10966+ /* Register a call for panic conditions. */
10967+ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
10968+
10969+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
10970+ VMASST_TYPE_4gb_segments));
10971+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
10972+ VMASST_TYPE_writable_pagetables));
10973+
10974+ memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
10975+ pre_setup_arch_hook();
10976+ early_cpu_init();
10977+#ifdef CONFIG_SMP
10978+ prefill_possible_map();
10979+#endif
10980+
10981+ /*
10982+ * FIXME: This isn't an official loader_type right
10983+ * now but does currently work with elilo.
10984+ * If we were configured as an EFI kernel, check to make
10985+ * sure that we were loaded correctly from elilo and that
10986+ * the system table is valid. If not, then initialize normally.
10987+ */
10988+#ifdef CONFIG_EFI
10989+ if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
10990+ efi_enabled = 1;
10991+#endif
10992+
10993+ /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
10994+ properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
10995+ */
10996+ ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
10997+ drive_info = DRIVE_INFO;
10998+ screen_info = SCREEN_INFO;
10999+ copy_edid();
11000+ apm_info.bios = APM_BIOS_INFO;
11001+ ist_info = IST_INFO;
11002+ saved_videomode = VIDEO_MODE;
11003+ if( SYS_DESC_TABLE.length != 0 ) {
11004+ set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
11005+ machine_id = SYS_DESC_TABLE.table[0];
11006+ machine_submodel_id = SYS_DESC_TABLE.table[1];
11007+ BIOS_revision = SYS_DESC_TABLE.table[2];
11008+ }
11009+ bootloader_type = LOADER_TYPE;
11010+
11011+ if (is_initial_xendomain()) {
11012+ const struct dom0_vga_console_info *info =
11013+ (void *)((char *)xen_start_info +
11014+ xen_start_info->console.dom0.info_off);
11015+
11016+ dom0_init_screen_info(info,
11017+ xen_start_info->console.dom0.info_size);
11018+ xen_start_info->console.domU.mfn = 0;
11019+ xen_start_info->console.domU.evtchn = 0;
11020+ } else
11021+ screen_info.orig_video_isVGA = 0;
11022+
11023+#ifdef CONFIG_BLK_DEV_RAM
11024+ rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
11025+ rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
11026+ rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
11027+#endif
11028+
11029+ ARCH_SETUP
11030+ if (efi_enabled)
11031+ efi_init();
11032+ else {
11033+ printk(KERN_INFO "BIOS-provided physical RAM map:\n");
11034+ print_memory_map(machine_specific_memory_setup());
11035+ }
11036+
11037+ copy_edd();
11038+
11039+ if (!MOUNT_ROOT_RDONLY)
11040+ root_mountflags &= ~MS_RDONLY;
11041+ init_mm.start_code = (unsigned long) _text;
11042+ init_mm.end_code = (unsigned long) _etext;
11043+ init_mm.end_data = (unsigned long) _edata;
11044+ init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
11045+ xen_start_info->nr_pt_frames) << PAGE_SHIFT;
11046+
11047+ code_resource.start = virt_to_phys(_text);
11048+ code_resource.end = virt_to_phys(_etext)-1;
11049+ data_resource.start = virt_to_phys(_etext);
11050+ data_resource.end = virt_to_phys(_edata)-1;
11051+
11052+ parse_cmdline_early(cmdline_p);
11053+
11054+#ifdef CONFIG_EARLY_PRINTK
11055+ {
11056+ char *s = strstr(*cmdline_p, "earlyprintk=");
11057+ if (s) {
11058+ setup_early_printk(strchr(s, '=') + 1);
11059+ printk("early console enabled\n");
11060+ }
11061+ }
11062+#endif
11063+
11064+ max_low_pfn = setup_memory();
11065+
11066+ /*
11067+ * NOTE: before this point _nobody_ is allowed to allocate
11068+ * any memory using the bootmem allocator. Although the
11069+	 * allocator is now initialised, only the first 8MB of the kernel
11070+ * virtual address space has been mapped. All allocations before
11071+ * paging_init() has completed must use the alloc_bootmem_low_pages()
11072+ * variant (which allocates DMA'able memory) and care must be taken
11073+	 * not to exceed the 8MB limit.
11074+ */
11075+
11076+#ifdef CONFIG_SMP
11077+ smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
11078+#endif
11079+ paging_init();
11080+ remapped_pgdat_init();
11081+ sparse_init();
11082+ zone_sizes_init();
11083+
11084+#ifdef CONFIG_X86_FIND_SMP_CONFIG
11085+ /*
11086+ * Find and reserve possible boot-time SMP configuration:
11087+ */
11088+ find_smp_config();
11089+#endif
11090+
11091+ p2m_pages = max_pfn;
11092+ if (xen_start_info->nr_pages > max_pfn) {
11093+ /*
11094+ * the max_pfn was shrunk (probably by mem= or highmem=
11095+ * kernel parameter); shrink reservation with the HV
11096+ */
11097+ struct xen_memory_reservation reservation = {
11098+ .address_bits = 0,
11099+ .extent_order = 0,
11100+ .domid = DOMID_SELF
11101+ };
11102+ unsigned int difference;
11103+ int ret;
11104+
11105+ difference = xen_start_info->nr_pages - max_pfn;
11106+
11107+ set_xen_guest_handle(reservation.extent_start,
11108+ ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
11109+ reservation.nr_extents = difference;
11110+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
11111+ &reservation);
11112+ BUG_ON (ret != difference);
11113+ }
11114+ else if (max_pfn > xen_start_info->nr_pages)
11115+ p2m_pages = xen_start_info->nr_pages;
11116+
11117+ /* Make sure we have a correctly sized P->M table. */
11118+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
11119+ phys_to_machine_mapping = alloc_bootmem_low_pages(
11120+ max_pfn * sizeof(unsigned long));
11121+ memset(phys_to_machine_mapping, ~0,
11122+ max_pfn * sizeof(unsigned long));
11123+ memcpy(phys_to_machine_mapping,
11124+ (unsigned long *)xen_start_info->mfn_list,
11125+ p2m_pages * sizeof(unsigned long));
11126+ free_bootmem(
11127+ __pa(xen_start_info->mfn_list),
11128+ PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
11129+ sizeof(unsigned long))));
11130+
11131+ /*
11132+ * Initialise the list of the frames that specify the list of
11133+ * frames that make up the p2m table. Used by save/restore
11134+ */
11135+ pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
11136+
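+		/* fpp: number of unsigned long entries that fit in one page */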
11137+ fpp = PAGE_SIZE/sizeof(unsigned long);
11138+ for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
11139+ if ((j % fpp) == 0) {
11140+ k++;
11141+ BUG_ON(k>=16);
11142+ pfn_to_mfn_frame_list[k] =
11143+ alloc_bootmem_low_pages(PAGE_SIZE);
11144+ pfn_to_mfn_frame_list_list[k] =
11145+ virt_to_mfn(pfn_to_mfn_frame_list[k]);
11146+ j=0;
11147+ }
11148+ pfn_to_mfn_frame_list[k][j] =
11149+ virt_to_mfn(&phys_to_machine_mapping[i]);
11150+ }
11151+ HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
11152+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
11153+ virt_to_mfn(pfn_to_mfn_frame_list_list);
11154+ }
11155+
11156+ /* Mark all ISA DMA channels in-use - using them wouldn't work. */
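+	/* channel 4 is the cascade between the two DMA controllers and cannot be requested */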
11157+ for (i = 0; i < MAX_DMA_CHANNELS; ++i)
11158+ if (i != 4 && request_dma(i, "xen") != 0)
11159+ BUG();
11160+
11161+ /*
11162+ * NOTE: at this point the bootmem allocator is fully available.
11163+ */
11164+
11165+ if (is_initial_xendomain())
11166+ dmi_scan_machine();
11167+
11168+#ifdef CONFIG_X86_GENERICARCH
11169+ generic_apic_probe(*cmdline_p);
11170+#endif
11171+ if (efi_enabled)
11172+ efi_map_memmap();
11173+
11174+ set_iopl.iopl = 1;
11175+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
11176+
11177+#ifdef CONFIG_ACPI
11178+ if (!is_initial_xendomain()) {
11179+ printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
11180+ acpi_disabled = 1;
11181+ acpi_ht = 0;
11182+ }
11183+
11184+ /*
11185+ * Parse the ACPI tables for possible boot-time SMP configuration.
11186+ */
11187+ acpi_boot_table_init();
11188+#endif
11189+
11190+#ifdef CONFIG_X86_IO_APIC
11191+ check_acpi_pci(); /* Checks more than just ACPI actually */
11192+#endif
11193+
11194+#ifdef CONFIG_ACPI
11195+ acpi_boot_init();
11196+
11197+#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
11198+ if (def_to_bigsmp)
11199+ printk(KERN_WARNING "More than 8 CPUs detected and "
11200+ "CONFIG_X86_PC cannot handle it.\nUse "
11201+ "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
11202+#endif
11203+#endif
11204+#ifdef CONFIG_X86_LOCAL_APIC
11205+ if (smp_found_config)
11206+ get_smp_config();
11207+#endif
11208+
11209+ register_memory();
11210+
11211+ if (is_initial_xendomain()) {
11212+#ifdef CONFIG_VT
11213+#if defined(CONFIG_VGA_CONSOLE)
11214+ if (!efi_enabled ||
11215+ (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
11216+ conswitchp = &vga_con;
11217+#elif defined(CONFIG_DUMMY_CONSOLE)
11218+ conswitchp = &dummy_con;
11219+#endif
11220+#endif
11221+ } else {
11222+#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
11223+ conswitchp = &dummy_con;
11224+#endif
11225+ }
11226+ tsc_init();
11227+}
11228+
11229+static int
11230+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
11231+{
11232+ HYPERVISOR_shutdown(SHUTDOWN_crash);
11233+ /* we're never actually going to get here... */
11234+ return NOTIFY_DONE;
11235+}
11236+
11237+static __init int add_pcspkr(void)
11238+{
11239+ struct platform_device *pd;
11240+ int ret;
11241+
11242+ if (!is_initial_xendomain())
11243+ return 0;
11244+
11245+ pd = platform_device_alloc("pcspkr", -1);
11246+ if (!pd)
11247+ return -ENOMEM;
11248+
11249+ ret = platform_device_add(pd);
11250+ if (ret)
11251+ platform_device_put(pd);
11252+
11253+ return ret;
11254+}
11255+device_initcall(add_pcspkr);
11256+
11257+/*
11258+ * Local Variables:
11259+ * mode:c
11260+ * c-file-style:"k&r"
11261+ * c-basic-offset:8
11262+ * End:
11263+ */
11264Index: head-2008-11-25/arch/x86/kernel/smp_32-xen.c
11265===================================================================
11266--- /dev/null 1970-01-01 00:00:00.000000000 +0000
11267+++ head-2008-11-25/arch/x86/kernel/smp_32-xen.c 2007-12-10 08:47:31.000000000 +0100
11268@@ -0,0 +1,605 @@
11269+/*
11270+ * Intel SMP support routines.
11271+ *
11272+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
11273+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
11274+ *
11275+ * This code is released under the GNU General Public License version 2 or
11276+ * later.
11277+ */
11278+
11279+#include <linux/init.h>
11280+
11281+#include <linux/mm.h>
11282+#include <linux/delay.h>
11283+#include <linux/spinlock.h>
11284+#include <linux/smp_lock.h>
11285+#include <linux/kernel_stat.h>
11286+#include <linux/mc146818rtc.h>
11287+#include <linux/cache.h>
11288+#include <linux/interrupt.h>
11289+#include <linux/cpu.h>
11290+#include <linux/module.h>
11291+
11292+#include <asm/mtrr.h>
11293+#include <asm/tlbflush.h>
11294+#if 0
11295+#include <mach_apic.h>
11296+#endif
11297+#include <xen/evtchn.h>
11298+
11299+/*
11300+ * Some notes on x86 processor bugs affecting SMP operation:
11301+ *
11302+ * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
11303+ * The Linux implications for SMP are handled as follows:
11304+ *
11305+ * Pentium III / [Xeon]
11306+ * None of the E1AP-E3AP errata are visible to the user.
11307+ *
11308+ * E1AP. see PII A1AP
11309+ * E2AP. see PII A2AP
11310+ * E3AP. see PII A3AP
11311+ *
11312+ * Pentium II / [Xeon]
11313+ * None of the A1AP-A3AP errata are visible to the user.
11314+ *
11315+ * A1AP. see PPro 1AP
11316+ * A2AP. see PPro 2AP
11317+ * A3AP. see PPro 7AP
11318+ *
11319+ * Pentium Pro
11320+ * None of 1AP-9AP errata are visible to the normal user,
11321+ * except occasional delivery of 'spurious interrupt' as trap #15.
11322+ * This is very rare and a non-problem.
11323+ *
11324+ * 1AP. Linux maps APIC as non-cacheable
11325+ * 2AP. worked around in hardware
11326+ * 3AP. fixed in C0 and above steppings microcode update.
11327+ * Linux does not use excessive STARTUP_IPIs.
11328+ * 4AP. worked around in hardware
11329+ * 5AP. symmetric IO mode (normal Linux operation) not affected.
11330+ * 'noapic' mode has vector 0xf filled out properly.
11331+ * 6AP. 'noapic' mode might be affected - fixed in later steppings
11332+ * 7AP.	We do not assume writes to the LVT deasserting IRQs
11333+ * 8AP. We do not enable low power mode (deep sleep) during MP bootup
11334+ * 9AP. We do not use mixed mode
11335+ *
11336+ * Pentium
11337+ * There is a marginal case where REP MOVS on 100MHz SMP
11338+ * machines with B stepping processors can fail. XXX should provide
11339+ * an L1cache=Writethrough or L1cache=off option.
11340+ *
11341+ * B stepping CPUs may hang. There are hardware work arounds
11342+ * for this. We warn about it in case your board doesn't have the work
11343+ * arounds. Basically that's so I can tell anyone with a B stepping
11344+ * CPU and SMP problems "tough".
11345+ *
11346+ * Specific items [From Pentium Processor Specification Update]
11347+ *
11348+ * 1AP. Linux doesn't use remote read
11349+ * 2AP. Linux doesn't trust APIC errors
11350+ * 3AP. We work around this
11351+ * 4AP. Linux never generated 3 interrupts of the same priority
11352+ * to cause a lost local interrupt.
11353+ * 5AP. Remote read is never used
11354+ * 6AP. not affected - worked around in hardware
11355+ * 7AP. not affected - worked around in hardware
11356+ * 8AP. worked around in hardware - we get explicit CS errors if not
11357+ * 9AP. only 'noapic' mode affected. Might generate spurious
11358+ * interrupts, we log only the first one and count the
11359+ * rest silently.
11360+ * 10AP. not affected - worked around in hardware
11361+ * 11AP. Linux reads the APIC between writes to avoid this, as per
11362+ * the documentation. Make sure you preserve this as it affects
11363+ * the C stepping chips too.
11364+ * 12AP. not affected - worked around in hardware
11365+ * 13AP. not affected - worked around in hardware
11366+ * 14AP. we always deassert INIT during bootup
11367+ * 15AP. not affected - worked around in hardware
11368+ * 16AP. not affected - worked around in hardware
11369+ * 17AP. not affected - worked around in hardware
11370+ * 18AP. not affected - worked around in hardware
11371+ * 19AP. not affected - worked around in BIOS
11372+ *
11373+ * If this sounds worrying, believe me these bugs are either ___RARE___,
11374+ * or are signal timing bugs worked around in hardware, and there's
11375+ * almost nothing of note with C stepping upwards.
11376+ */
11377+
11378+DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
11379+
11380+/*
11381+ * the following functions deal with sending IPIs between CPUs.
11382+ *
11383+ * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
11384+ */
11385+
11386+static inline int __prepare_ICR (unsigned int shortcut, int vector)
11387+{
11388+ unsigned int icr = shortcut | APIC_DEST_LOGICAL;
11389+
11390+ switch (vector) {
11391+ default:
11392+ icr |= APIC_DM_FIXED | vector;
11393+ break;
11394+ case NMI_VECTOR:
11395+ icr |= APIC_DM_NMI;
11396+ break;
11397+ }
11398+ return icr;
11399+}
11400+
11401+static inline int __prepare_ICR2 (unsigned int mask)
11402+{
11403+ return SET_APIC_DEST_FIELD(mask);
11404+}
11405+
11406+DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
11407+
11408+static inline void __send_IPI_one(unsigned int cpu, int vector)
11409+{
11410+ int irq = per_cpu(ipi_to_irq, cpu)[vector];
11411+ BUG_ON(irq < 0);
11412+ notify_remote_via_irq(irq);
11413+}
11414+
11415+void __send_IPI_shortcut(unsigned int shortcut, int vector)
11416+{
11417+ int cpu;
11418+
11419+ switch (shortcut) {
11420+ case APIC_DEST_SELF:
11421+ __send_IPI_one(smp_processor_id(), vector);
11422+ break;
11423+ case APIC_DEST_ALLBUT:
11424+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
11425+ if (cpu == smp_processor_id())
11426+ continue;
11427+ if (cpu_isset(cpu, cpu_online_map)) {
11428+ __send_IPI_one(cpu, vector);
11429+ }
11430+ }
11431+ break;
11432+ default:
11433+ printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
11434+ vector);
11435+ break;
11436+ }
11437+}
11438+
11439+void fastcall send_IPI_self(int vector)
11440+{
11441+ __send_IPI_shortcut(APIC_DEST_SELF, vector);
11442+}
11443+
11444+/*
11445+ * This is only used on smaller machines.
11446+ */
11447+void send_IPI_mask_bitmask(cpumask_t mask, int vector)
11448+{
11449+ unsigned long flags;
11450+ unsigned int cpu;
11451+
11452+ local_irq_save(flags);
11453+ WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
11454+
11455+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
11456+ if (cpu_isset(cpu, mask)) {
11457+ __send_IPI_one(cpu, vector);
11458+ }
11459+ }
11460+
11461+ local_irq_restore(flags);
11462+}
11463+
11464+void send_IPI_mask_sequence(cpumask_t mask, int vector)
11465+{
11466+
11467+ send_IPI_mask_bitmask(mask, vector);
11468+}
11469+
11470+#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
11471+
11472+#if 0 /* XEN */
11473+/*
11474+ * Smarter SMP flushing macros.
11475+ * c/o Linus Torvalds.
11476+ *
11477+ * These mean you can really definitely utterly forget about
11478+ * writing to user space from interrupts. (It's not allowed anyway).
11479+ *
11480+ * Optimizations Manfred Spraul <manfred@colorfullife.com>
11481+ */
11482+
11483+static cpumask_t flush_cpumask;
11484+static struct mm_struct * flush_mm;
11485+static unsigned long flush_va;
11486+static DEFINE_SPINLOCK(tlbstate_lock);
11487+#define FLUSH_ALL 0xffffffff
11488+
11489+/*
11490+ * We cannot call mmdrop() because we are in interrupt context,
11491+ * instead update mm->cpu_vm_mask.
11492+ *
11493+ * We need to reload %cr3 since the page tables may be going
11494+ * away from under us..
11495+ */
11496+static inline void leave_mm (unsigned long cpu)
11497+{
11498+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
11499+ BUG();
11500+ cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
11501+ load_cr3(swapper_pg_dir);
11502+}
11503+
11504+/*
11505+ *
11506+ * The flush IPI assumes that a thread switch happens in this order:
11507+ * [cpu0: the cpu that switches]
11508+ * 1) switch_mm() either 1a) or 1b)
11509+ * 1a) thread switch to a different mm
11510+ * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
11511+ * Stop ipi delivery for the old mm. This is not synchronized with
11512+ * the other cpus, but smp_invalidate_interrupt ignores flush ipis
11513+ * for the wrong mm, and in the worst case we perform a superfluous
11514+ * tlb flush.
11515+ * 1a2) set cpu_tlbstate to TLBSTATE_OK
11516+ * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
11517+ * was in lazy tlb mode.
11518+ * 1a3) update cpu_tlbstate[].active_mm
11519+ * Now cpu0 accepts tlb flushes for the new mm.
11520+ * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
11521+ * Now the other cpus will send tlb flush ipis.
11522+ * 1a5) change cr3.
11523+ * 1b) thread switch without mm change
11524+ * cpu_tlbstate[].active_mm is correct, cpu0 already handles
11525+ * flush ipis.
11526+ * 1b1) set cpu_tlbstate to TLBSTATE_OK
11527+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
11528+ * Atomically set the bit [other cpus will start sending flush ipis],
11529+ * and test the bit.
11530+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
11531+ * 2) switch %%esp, ie current
11532+ *
11533+ * The interrupt must handle 2 special cases:
11534+ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
11535+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
11536+ * runs in kernel space, the cpu could load tlb entries for user space
11537+ * pages.
11538+ *
11539+ * The good news is that cpu_tlbstate is local to each cpu, no
11540+ * write/read ordering problems.
11541+ */
11542+
11543+/*
11544+ * TLB flush IPI:
11545+ *
11546+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
11547+ * 2) Leave the mm if we are in the lazy tlb mode.
11548+ */
11549+
11550+irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
11551+ struct pt_regs *regs)
11552+{
11553+ unsigned long cpu;
11554+
11555+ cpu = get_cpu();
11556+
11557+ if (!cpu_isset(cpu, flush_cpumask))
11558+ goto out;
11559+ /*
11560+ * This was a BUG() but until someone can quote me the
11561+ * line from the intel manual that guarantees an IPI to
11562+ * multiple CPUs is retried _only_ on the erroring CPUs
11563+ * it's staying as a return
11564+ *
11565+ * BUG();
11566+ */
11567+
11568+ if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
11569+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
11570+ if (flush_va == FLUSH_ALL)
11571+ local_flush_tlb();
11572+ else
11573+ __flush_tlb_one(flush_va);
11574+ } else
11575+ leave_mm(cpu);
11576+ }
11577+ smp_mb__before_clear_bit();
11578+ cpu_clear(cpu, flush_cpumask);
11579+ smp_mb__after_clear_bit();
11580+out:
11581+ put_cpu_no_resched();
11582+
11583+ return IRQ_HANDLED;
11584+}
11585+
11586+static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
11587+ unsigned long va)
11588+{
11589+ /*
11590+ * A couple of (to be removed) sanity checks:
11591+ *
11592+ * - current CPU must not be in mask
11593+ * - mask must exist :)
11594+ */
11595+ BUG_ON(cpus_empty(cpumask));
11596+ BUG_ON(cpu_isset(smp_processor_id(), cpumask));
11597+ BUG_ON(!mm);
11598+
11599+ /* If a CPU which we ran on has gone down, OK. */
11600+ cpus_and(cpumask, cpumask, cpu_online_map);
11601+ if (cpus_empty(cpumask))
11602+ return;
11603+
11604+ /*
11605+ * I'm not happy about this global shared spinlock in the
11606+ * MM hot path, but we'll see how contended it is.
11607+ * Temporarily this turns IRQs off, so that lockups are
11608+ * detected by the NMI watchdog.
11609+ */
11610+ spin_lock(&tlbstate_lock);
11611+
11612+ flush_mm = mm;
11613+ flush_va = va;
11614+#if NR_CPUS <= BITS_PER_LONG
11615+ atomic_set_mask(cpumask, &flush_cpumask);
11616+#else
11617+ {
11618+ int k;
11619+ unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
11620+ unsigned long *cpu_mask = (unsigned long *)&cpumask;
11621+ for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
11622+ atomic_set_mask(cpu_mask[k], &flush_mask[k]);
11623+ }
11624+#endif
11625+ /*
11626+ * We have to send the IPI only to
11627+ * CPUs affected.
11628+ */
11629+ send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
11630+
11631+ while (!cpus_empty(flush_cpumask))
11632+ /* nothing. lockup detection does not belong here */
11633+ mb();
11634+
11635+ flush_mm = NULL;
11636+ flush_va = 0;
11637+ spin_unlock(&tlbstate_lock);
11638+}
11639+
11640+void flush_tlb_current_task(void)
11641+{
11642+ struct mm_struct *mm = current->mm;
11643+ cpumask_t cpu_mask;
11644+
11645+ preempt_disable();
11646+ cpu_mask = mm->cpu_vm_mask;
11647+ cpu_clear(smp_processor_id(), cpu_mask);
11648+
11649+ local_flush_tlb();
11650+ if (!cpus_empty(cpu_mask))
11651+ flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
11652+ preempt_enable();
11653+}
11654+
11655+void flush_tlb_mm (struct mm_struct * mm)
11656+{
11657+ cpumask_t cpu_mask;
11658+
11659+ preempt_disable();
11660+ cpu_mask = mm->cpu_vm_mask;
11661+ cpu_clear(smp_processor_id(), cpu_mask);
11662+
11663+ if (current->active_mm == mm) {
11664+ if (current->mm)
11665+ local_flush_tlb();
11666+ else
11667+ leave_mm(smp_processor_id());
11668+ }
11669+ if (!cpus_empty(cpu_mask))
11670+ flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
11671+
11672+ preempt_enable();
11673+}
11674+
11675+void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
11676+{
11677+ struct mm_struct *mm = vma->vm_mm;
11678+ cpumask_t cpu_mask;
11679+
11680+ preempt_disable();
11681+ cpu_mask = mm->cpu_vm_mask;
11682+ cpu_clear(smp_processor_id(), cpu_mask);
11683+
11684+ if (current->active_mm == mm) {
11685+ if(current->mm)
11686+ __flush_tlb_one(va);
11687+ else
11688+ leave_mm(smp_processor_id());
11689+ }
11690+
11691+ if (!cpus_empty(cpu_mask))
11692+ flush_tlb_others(cpu_mask, mm, va);
11693+
11694+ preempt_enable();
11695+}
11696+EXPORT_SYMBOL(flush_tlb_page);
11697+
11698+static void do_flush_tlb_all(void* info)
11699+{
11700+ unsigned long cpu = smp_processor_id();
11701+
11702+ __flush_tlb_all();
11703+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
11704+ leave_mm(cpu);
11705+}
11706+
11707+void flush_tlb_all(void)
11708+{
11709+ on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
11710+}
11711+
11712+#endif /* XEN */
11713+
11714+/*
11715+ * this function sends a 'reschedule' IPI to another CPU.
11716+ * it goes straight through and wastes no time serializing
11717+ * anything. Worst case is that we lose a reschedule ...
11718+ */
11719+void smp_send_reschedule(int cpu)
11720+{
11721+ WARN_ON(cpu_is_offline(cpu));
11722+ send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
11723+}
11724+
11725+/*
11726+ * Structure and data for smp_call_function(). This is designed to minimise
11727+ * static memory requirements. It also looks cleaner.
11728+ */
11729+static DEFINE_SPINLOCK(call_lock);
11730+
11731+struct call_data_struct {
11732+ void (*func) (void *info);
11733+ void *info;
11734+ atomic_t started;
11735+ atomic_t finished;
11736+ int wait;
11737+};
11738+
11739+void lock_ipi_call_lock(void)
11740+{
11741+ spin_lock_irq(&call_lock);
11742+}
11743+
11744+void unlock_ipi_call_lock(void)
11745+{
11746+ spin_unlock_irq(&call_lock);
11747+}
11748+
11749+static struct call_data_struct *call_data;
11750+
11751+/**
11752+ * smp_call_function(): Run a function on all other CPUs.
11753+ * @func: The function to run. This must be fast and non-blocking.
11754+ * @info: An arbitrary pointer to pass to the function.
11755+ * @nonatomic: currently unused.
11756+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
11757+ *
11758+ * Returns 0 on success, else a negative status code. Does not return until
11759+ * remote CPUs are nearly ready to execute <<func>> or have already executed it.
11760+ *
11761+ * You must not call this function with disabled interrupts or from a
11762+ * hardware interrupt handler or from a bottom half handler.
11763+ */
11764+int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
11765+ int wait)
11766+{
11767+ struct call_data_struct data;
11768+ int cpus;
11769+
11770+ /* Holding any lock stops cpus from going down. */
11771+ spin_lock(&call_lock);
11772+ cpus = num_online_cpus() - 1;
11773+ if (!cpus) {
11774+ spin_unlock(&call_lock);
11775+ return 0;
11776+ }
11777+
11778+ /* Can deadlock when called with interrupts disabled */
11779+ WARN_ON(irqs_disabled());
11780+
11781+ data.func = func;
11782+ data.info = info;
11783+ atomic_set(&data.started, 0);
11784+ data.wait = wait;
11785+ if (wait)
11786+ atomic_set(&data.finished, 0);
11787+
11788+ call_data = &data;
11789+ mb();
11790+
11791+ /* Send a message to all other CPUs and wait for them to respond */
11792+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
11793+
11794+ /* Wait for response */
11795+ while (atomic_read(&data.started) != cpus)
11796+ cpu_relax();
11797+
11798+ if (wait)
11799+ while (atomic_read(&data.finished) != cpus)
11800+ cpu_relax();
11801+ spin_unlock(&call_lock);
11802+
11803+ return 0;
11804+}
11805+EXPORT_SYMBOL(smp_call_function);
11806+
11807+static void stop_this_cpu (void * dummy)
11808+{
11809+ /*
11810+ * Remove this CPU:
11811+ */
11812+ cpu_clear(smp_processor_id(), cpu_online_map);
11813+ local_irq_disable();
11814+ disable_all_local_evtchn();
11815+ if (cpu_data[smp_processor_id()].hlt_works_ok)
11816+ for(;;) halt();
11817+ for (;;);
11818+}
11819+
11820+/*
11821+ * this function calls the 'stop' function on all other CPUs in the system.
11822+ */
11823+
11824+void smp_send_stop(void)
11825+{
11826+ smp_call_function(stop_this_cpu, NULL, 1, 0);
11827+
11828+ local_irq_disable();
11829+ disable_all_local_evtchn();
11830+ local_irq_enable();
11831+}
11832+
11833+/*
11834+ * Reschedule call back. Nothing to do,
11835+ * all the work is done automatically when
11836+ * we return from the interrupt.
11837+ */
11838+irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
11839+ struct pt_regs *regs)
11840+{
11841+
11842+ return IRQ_HANDLED;
11843+}
11844+
11845+#include <linux/kallsyms.h>
11846+irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
11847+ struct pt_regs *regs)
11848+{
11849+ void (*func) (void *info) = call_data->func;
11850+ void *info = call_data->info;
11851+ int wait = call_data->wait;
11852+
11853+ /*
11854+ * Notify initiating CPU that I've grabbed the data and am
11855+ * about to execute the function
11856+ */
11857+ mb();
11858+ atomic_inc(&call_data->started);
11859+ /*
11860+ * At this point the info structure may be out of scope unless wait==1
11861+ */
11862+ irq_enter();
11863+ (*func)(info);
11864+ irq_exit();
11865+
11866+ if (wait) {
11867+ mb();
11868+ atomic_inc(&call_data->finished);
11869+ }
11870+
11871+ return IRQ_HANDLED;
11872+}
11873+
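(Editorial aside, not part of the patch hunk above: smp_32-xen.c replaces native APIC IPIs with Xen event-channel notifications; __send_IPI_one() looks up the per-CPU irq bound to an IPI vector and kicks it via notify_remote_via_irq(). The stand-alone C sketch below models only that dispatch logic. The table sizes, irq numbers and the stub notify function are made up for illustration and are not kernel or Xen APIs.)

/*
 * User-space model of the event-channel IPI dispatch above.  Only the
 * control flow mirrors __send_IPI_one()/__send_IPI_shortcut(); all data
 * here is fabricated for the example.
 */
#include <stdio.h>

#define NR_CPUS  4
#define NR_IPIS  2
#define SELF_CPU 0

enum { RESCHEDULE_VECTOR, CALL_FUNCTION_VECTOR };

static int ipi_to_irq[NR_CPUS][NR_IPIS];          /* stands in for the per-CPU table */
static const int cpu_online[NR_CPUS] = { 1, 1, 1, 0 };

/* Stub: the real notify_remote_via_irq() signals Xen over an event channel. */
static void notify_remote_via_irq(int irq)
{
	printf("notify irq %d\n", irq);
}

static void send_IPI_one(int cpu, int vector)
{
	notify_remote_via_irq(ipi_to_irq[cpu][vector]);
}

/* Mirrors the APIC_DEST_ALLBUT branch of __send_IPI_shortcut(). */
static void send_IPI_allbutself(int vector)
{
	for (int cpu = 0; cpu < NR_CPUS; ++cpu)
		if (cpu != SELF_CPU && cpu_online[cpu])
			send_IPI_one(cpu, vector);
}

int main(void)
{
	/* Pretend the event channels were bound at CPU bring-up. */
	for (int cpu = 0; cpu < NR_CPUS; ++cpu)
		for (int vec = 0; vec < NR_IPIS; ++vec)
			ipi_to_irq[cpu][vec] = 100 + cpu * NR_IPIS + vec;

	send_IPI_allbutself(CALL_FUNCTION_VECTOR);
	return 0;
}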
11874Index: head-2008-11-25/arch/x86/kernel/time_32-xen.c
11875===================================================================
11876--- /dev/null 1970-01-01 00:00:00.000000000 +0000
11877+++ head-2008-11-25/arch/x86/kernel/time_32-xen.c 2008-09-01 12:07:31.000000000 +0200
11878@@ -0,0 +1,1209 @@
11879+/*
11880+ * linux/arch/i386/kernel/time.c
11881+ *
11882+ * Copyright (C) 1991, 1992, 1995 Linus Torvalds
11883+ *
11884+ * This file contains the PC-specific time handling details:
11885+ * reading the RTC at bootup, etc..
11886+ * 1994-07-02 Alan Modra
11887+ * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
11888+ * 1995-03-26 Markus Kuhn
11889+ * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
11890+ * precision CMOS clock update
11891+ * 1996-05-03 Ingo Molnar
11892+ * fixed time warps in do_[slow|fast]_gettimeoffset()
11893+ * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
11894+ * "A Kernel Model for Precision Timekeeping" by Dave Mills
11895+ * 1998-09-05 (Various)
11896+ * More robust do_fast_gettimeoffset() algorithm implemented
11897+ * (works with APM, Cyrix 6x86MX and Centaur C6),
11898+ * monotonic gettimeofday() with fast_get_timeoffset(),
11899+ * drift-proof precision TSC calibration on boot
11900+ * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
11901+ * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
11902+ * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
11903+ * 1998-12-16 Andrea Arcangeli
11904+ * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
11905+ * because was not accounting lost_ticks.
11906+ * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
11907+ * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
11908+ * serialize accesses to xtime/lost_ticks).
11909+ */
11910+
11911+#include <linux/errno.h>
11912+#include <linux/sched.h>
11913+#include <linux/kernel.h>
11914+#include <linux/param.h>
11915+#include <linux/string.h>
11916+#include <linux/mm.h>
11917+#include <linux/interrupt.h>
11918+#include <linux/time.h>
11919+#include <linux/delay.h>
11920+#include <linux/init.h>
11921+#include <linux/smp.h>
11922+#include <linux/module.h>
11923+#include <linux/sysdev.h>
11924+#include <linux/bcd.h>
11925+#include <linux/efi.h>
11926+#include <linux/mca.h>
11927+#include <linux/sysctl.h>
11928+#include <linux/percpu.h>
11929+#include <linux/kernel_stat.h>
11930+#include <linux/posix-timers.h>
11931+#include <linux/cpufreq.h>
11932+
11933+#include <asm/io.h>
11934+#include <asm/smp.h>
11935+#include <asm/irq.h>
11936+#include <asm/msr.h>
11937+#include <asm/delay.h>
11938+#include <asm/mpspec.h>
11939+#include <asm/uaccess.h>
11940+#include <asm/processor.h>
11941+#include <asm/timer.h>
11942+#include <asm/sections.h>
11943+
11944+#include "mach_time.h"
11945+
11946+#include <linux/timex.h>
11947+
11948+#include <asm/hpet.h>
11949+
11950+#include <asm/arch_hooks.h>
11951+
11952+#include <xen/evtchn.h>
11953+#include <xen/interface/vcpu.h>
11954+
11955+#if defined (__i386__)
11956+#include <asm/i8259.h>
11957+#endif
11958+
11959+int pit_latch_buggy; /* extern */
11960+
11961+#if defined(__x86_64__)
11962+unsigned long vxtime_hz = PIT_TICK_RATE;
11963+struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
11964+volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
11965+unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
11966+struct timespec __xtime __section_xtime;
11967+struct timezone __sys_tz __section_sys_tz;
11968+#endif
11969+
11970+unsigned int cpu_khz; /* Detected as we calibrate the TSC */
11971+EXPORT_SYMBOL(cpu_khz);
11972+
11973+extern unsigned long wall_jiffies;
11974+
11975+DEFINE_SPINLOCK(rtc_lock);
11976+EXPORT_SYMBOL(rtc_lock);
11977+
11978+extern struct init_timer_opts timer_tsc_init;
11979+extern struct timer_opts timer_tsc;
11980+#define timer_none timer_tsc
11981+
11982+/* These are periodically updated in shared_info, and then copied here. */
11983+struct shadow_time_info {
11984+ u64 tsc_timestamp; /* TSC at last update of time vals. */
11985+ u64 system_timestamp; /* Time, in nanosecs, since boot. */
11986+ u32 tsc_to_nsec_mul;
11987+ u32 tsc_to_usec_mul;
11988+ int tsc_shift;
11989+ u32 version;
11990+};
11991+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
11992+static struct timespec shadow_tv;
11993+static u32 shadow_tv_version;
11994+
11995+static struct timeval monotonic_tv;
11996+static spinlock_t monotonic_lock = SPIN_LOCK_UNLOCKED;
11997+
11998+/* Keep track of last time we did processing/updating of jiffies and xtime. */
11999+static u64 processed_system_time; /* System time (ns) at last processing. */
12000+static DEFINE_PER_CPU(u64, processed_system_time);
12001+
12002+/* How much CPU time was spent blocked and how much was 'stolen'? */
12003+static DEFINE_PER_CPU(u64, processed_stolen_time);
12004+static DEFINE_PER_CPU(u64, processed_blocked_time);
12005+
12006+/* Current runstate of each CPU (updated automatically by the hypervisor). */
12007+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
12008+
12009+/* Must be signed, as it's compared with s64 quantities which can be -ve. */
12010+#define NS_PER_TICK (1000000000LL/HZ)
12011+
12012+static void __clock_was_set(void *unused)
12013+{
12014+ clock_was_set();
12015+}
12016+static DECLARE_WORK(clock_was_set_work, __clock_was_set, NULL);
12017+
12018+/*
12019+ * GCC 4.3 can turn loops over an induction variable into division. We do
12020+ * not support arbitrary 64-bit division, and so must break the induction.
12021+ */
12022+#define clobber_induction_variable(v) asm ( "" : "+r" (v) )
12023+
12024+static inline void __normalize_time(time_t *sec, s64 *nsec)
12025+{
12026+ while (*nsec >= NSEC_PER_SEC) {
12027+ clobber_induction_variable(*nsec);
12028+ (*nsec) -= NSEC_PER_SEC;
12029+ (*sec)++;
12030+ }
12031+ while (*nsec < 0) {
12032+ clobber_induction_variable(*nsec);
12033+ (*nsec) += NSEC_PER_SEC;
12034+ (*sec)--;
12035+ }
12036+}
12037+
12038+/* Does this guest OS track Xen time, or set its wall clock independently? */
12039+static int independent_wallclock = 0;
12040+static int __init __independent_wallclock(char *str)
12041+{
12042+ independent_wallclock = 1;
12043+ return 1;
12044+}
12045+__setup("independent_wallclock", __independent_wallclock);
12046+
12047+/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
12048+static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
12049+static int __init __permitted_clock_jitter(char *str)
12050+{
12051+ permitted_clock_jitter = simple_strtoul(str, NULL, 0);
12052+ return 1;
12053+}
12054+__setup("permitted_clock_jitter=", __permitted_clock_jitter);
12055+
12056+#if 0
12057+static void delay_tsc(unsigned long loops)
12058+{
12059+ unsigned long bclock, now;
12060+
12061+ rdtscl(bclock);
12062+ do {
12063+ rep_nop();
12064+ rdtscl(now);
12065+ } while ((now - bclock) < loops);
12066+}
12067+
12068+struct timer_opts timer_tsc = {
12069+ .name = "tsc",
12070+ .delay = delay_tsc,
12071+};
12072+#endif
12073+
12074+/*
12075+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
12076+ * yielding a 64-bit result.
12077+ */
12078+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
12079+{
12080+ u64 product;
12081+#ifdef __i386__
12082+ u32 tmp1, tmp2;
12083+#endif
12084+
12085+ if (shift < 0)
12086+ delta >>= -shift;
12087+ else
12088+ delta <<= shift;
12089+
12090+#ifdef __i386__
12091+ __asm__ (
12092+ "mul %5 ; "
12093+ "mov %4,%%eax ; "
12094+ "mov %%edx,%4 ; "
12095+ "mul %5 ; "
12096+ "xor %5,%5 ; "
12097+ "add %4,%%eax ; "
12098+ "adc %5,%%edx ; "
12099+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
12100+ : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
12101+#else
12102+ __asm__ (
12103+ "mul %%rdx ; shrd $32,%%rdx,%%rax"
12104+ : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
12105+#endif
12106+
12107+ return product;
12108+}
12109+
12110+#if 0 /* defined (__i386__) */
12111+int read_current_timer(unsigned long *timer_val)
12112+{
12113+ rdtscl(*timer_val);
12114+ return 0;
12115+}
12116+#endif
12117+
12118+void init_cpu_khz(void)
12119+{
12120+ u64 __cpu_khz = 1000000ULL << 32;
12121+ struct vcpu_time_info *info = &vcpu_info(0)->time;
12122+ do_div(__cpu_khz, info->tsc_to_system_mul);
12123+ if (info->tsc_shift < 0)
12124+ cpu_khz = __cpu_khz << -info->tsc_shift;
12125+ else
12126+ cpu_khz = __cpu_khz >> info->tsc_shift;
12127+}
12128+
12129+static u64 get_nsec_offset(struct shadow_time_info *shadow)
12130+{
12131+ u64 now, delta;
12132+ rdtscll(now);
12133+ delta = now - shadow->tsc_timestamp;
12134+ return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
12135+}
12136+
12137+static unsigned long get_usec_offset(struct shadow_time_info *shadow)
12138+{
12139+ u64 now, delta;
12140+ rdtscll(now);
12141+ delta = now - shadow->tsc_timestamp;
12142+ return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
12143+}
12144+
12145+static void __update_wallclock(time_t sec, long nsec)
12146+{
12147+ long wtm_nsec, xtime_nsec;
12148+ time_t wtm_sec, xtime_sec;
12149+ u64 tmp, wc_nsec;
12150+
12151+ /* Adjust wall-clock time base based on wall_jiffies ticks. */
12152+ wc_nsec = processed_system_time;
12153+ wc_nsec += sec * (u64)NSEC_PER_SEC;
12154+ wc_nsec += nsec;
12155+ wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
12156+
12157+ /* Split wallclock base into seconds and nanoseconds. */
12158+ tmp = wc_nsec;
12159+ xtime_nsec = do_div(tmp, 1000000000);
12160+ xtime_sec = (time_t)tmp;
12161+
12162+ wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
12163+ wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
12164+
12165+ set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
12166+ set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
12167+
12168+ ntp_clear();
12169+}
12170+
12171+static void update_wallclock(void)
12172+{
12173+ shared_info_t *s = HYPERVISOR_shared_info;
12174+
12175+ do {
12176+ shadow_tv_version = s->wc_version;
12177+ rmb();
12178+ shadow_tv.tv_sec = s->wc_sec;
12179+ shadow_tv.tv_nsec = s->wc_nsec;
12180+ rmb();
12181+ } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
12182+
12183+ if (!independent_wallclock)
12184+ __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
12185+}
12186+
12187+/*
12188+ * Reads a consistent set of time-base values from Xen, into a shadow data
12189+ * area.
12190+ */
12191+static void get_time_values_from_xen(unsigned int cpu)
12192+{
12193+ struct vcpu_time_info *src;
12194+ struct shadow_time_info *dst;
12195+ unsigned long flags;
12196+ u32 pre_version, post_version;
12197+
12198+ src = &vcpu_info(cpu)->time;
12199+ dst = &per_cpu(shadow_time, cpu);
12200+
12201+ local_irq_save(flags);
12202+
12203+ do {
12204+ pre_version = dst->version = src->version;
12205+ rmb();
12206+ dst->tsc_timestamp = src->tsc_timestamp;
12207+ dst->system_timestamp = src->system_time;
12208+ dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
12209+ dst->tsc_shift = src->tsc_shift;
12210+ rmb();
12211+ post_version = src->version;
12212+ } while ((pre_version & 1) | (pre_version ^ post_version));
12213+
12214+ dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
12215+
12216+ local_irq_restore(flags);
12217+}
12218+
12219+static inline int time_values_up_to_date(unsigned int cpu)
12220+{
12221+ struct vcpu_time_info *src;
12222+ struct shadow_time_info *dst;
12223+
12224+ src = &vcpu_info(cpu)->time;
12225+ dst = &per_cpu(shadow_time, cpu);
12226+
12227+ rmb();
12228+ return (dst->version == src->version);
12229+}
12230+
12231+/*
12232+ * This is a special lock that is owned by the CPU and holds the index
12233+ * register we are working with. It is required for NMI access to the
12234+ * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
12235+ */
12236+volatile unsigned long cmos_lock = 0;
12237+EXPORT_SYMBOL(cmos_lock);
12238+
12239+/* Routines for accessing the CMOS RAM/RTC. */
12240+unsigned char rtc_cmos_read(unsigned char addr)
12241+{
12242+ unsigned char val;
12243+ lock_cmos_prefix(addr);
12244+ outb_p(addr, RTC_PORT(0));
12245+ val = inb_p(RTC_PORT(1));
12246+ lock_cmos_suffix(addr);
12247+ return val;
12248+}
12249+EXPORT_SYMBOL(rtc_cmos_read);
12250+
12251+void rtc_cmos_write(unsigned char val, unsigned char addr)
12252+{
12253+ lock_cmos_prefix(addr);
12254+ outb_p(addr, RTC_PORT(0));
12255+ outb_p(val, RTC_PORT(1));
12256+ lock_cmos_suffix(addr);
12257+}
12258+EXPORT_SYMBOL(rtc_cmos_write);
12259+
12260+/*
12261+ * This version of gettimeofday has microsecond resolution
12262+ * and better than microsecond precision on fast x86 machines with TSC.
12263+ */
12264+void do_gettimeofday(struct timeval *tv)
12265+{
12266+ unsigned long seq;
12267+ unsigned long usec, sec;
12268+ unsigned long flags;
12269+ s64 nsec;
12270+ unsigned int cpu;
12271+ struct shadow_time_info *shadow;
12272+ u32 local_time_version;
12273+
12274+ cpu = get_cpu();
12275+ shadow = &per_cpu(shadow_time, cpu);
12276+
12277+ do {
12278+ unsigned long lost;
12279+
12280+ local_time_version = shadow->version;
12281+ seq = read_seqbegin(&xtime_lock);
12282+
12283+ usec = get_usec_offset(shadow);
12284+ lost = jiffies - wall_jiffies;
12285+
12286+ if (unlikely(lost))
12287+ usec += lost * (USEC_PER_SEC / HZ);
12288+
12289+ sec = xtime.tv_sec;
12290+ usec += (xtime.tv_nsec / NSEC_PER_USEC);
12291+
12292+ nsec = shadow->system_timestamp - processed_system_time;
12293+ __normalize_time(&sec, &nsec);
12294+ usec += (long)nsec / NSEC_PER_USEC;
12295+
12296+ if (unlikely(!time_values_up_to_date(cpu))) {
12297+ /*
12298+ * We may have blocked for a long time,
12299+ * rendering our calculations invalid
12300+ * (e.g. the time delta may have
12301+ * overflowed). Detect that and recalculate
12302+ * with fresh values.
12303+ */
12304+ get_time_values_from_xen(cpu);
12305+ continue;
12306+ }
12307+ } while (read_seqretry(&xtime_lock, seq) ||
12308+ (local_time_version != shadow->version));
12309+
12310+ put_cpu();
12311+
12312+ while (usec >= USEC_PER_SEC) {
12313+ usec -= USEC_PER_SEC;
12314+ sec++;
12315+ }
12316+
12317+ spin_lock_irqsave(&monotonic_lock, flags);
12318+ if ((sec > monotonic_tv.tv_sec) ||
12319+ ((sec == monotonic_tv.tv_sec) && (usec > monotonic_tv.tv_usec)))
12320+ {
12321+ monotonic_tv.tv_sec = sec;
12322+ monotonic_tv.tv_usec = usec;
12323+ } else {
12324+ sec = monotonic_tv.tv_sec;
12325+ usec = monotonic_tv.tv_usec;
12326+ }
12327+ spin_unlock_irqrestore(&monotonic_lock, flags);
12328+
12329+ tv->tv_sec = sec;
12330+ tv->tv_usec = usec;
12331+}
12332+
12333+EXPORT_SYMBOL(do_gettimeofday);
12334+
12335+int do_settimeofday(struct timespec *tv)
12336+{
12337+ time_t sec;
12338+ s64 nsec;
12339+ unsigned int cpu;
12340+ struct shadow_time_info *shadow;
12341+ struct xen_platform_op op;
12342+
12343+ if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
12344+ return -EINVAL;
12345+
12346+ cpu = get_cpu();
12347+ shadow = &per_cpu(shadow_time, cpu);
12348+
12349+ write_seqlock_irq(&xtime_lock);
12350+
12351+ /*
12352+	 * Ensure we don't get blocked for so long that our time delta
12353+	 * overflows. If that were to happen then our shadow time values would
12354+	 * be stale, so we retry with fresh ones.
12355+ */
12356+ for (;;) {
12357+ nsec = tv->tv_nsec - get_nsec_offset(shadow);
12358+ if (time_values_up_to_date(cpu))
12359+ break;
12360+ get_time_values_from_xen(cpu);
12361+ }
12362+ sec = tv->tv_sec;
12363+ __normalize_time(&sec, &nsec);
12364+
12365+ if (is_initial_xendomain() && !independent_wallclock) {
12366+ op.cmd = XENPF_settime;
12367+ op.u.settime.secs = sec;
12368+ op.u.settime.nsecs = nsec;
12369+ op.u.settime.system_time = shadow->system_timestamp;
12370+ WARN_ON(HYPERVISOR_platform_op(&op));
12371+ update_wallclock();
12372+ } else if (independent_wallclock) {
12373+ nsec -= shadow->system_timestamp;
12374+ __normalize_time(&sec, &nsec);
12375+ __update_wallclock(sec, nsec);
12376+ }
12377+
12378+ /* Reset monotonic gettimeofday() timeval. */
12379+ spin_lock(&monotonic_lock);
12380+ monotonic_tv.tv_sec = 0;
12381+ monotonic_tv.tv_usec = 0;
12382+ spin_unlock(&monotonic_lock);
12383+
12384+ write_sequnlock_irq(&xtime_lock);
12385+
12386+ put_cpu();
12387+
12388+ clock_was_set();
12389+ return 0;
12390+}
12391+
12392+EXPORT_SYMBOL(do_settimeofday);
12393+
12394+static void sync_xen_wallclock(unsigned long dummy);
12395+static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
12396+static void sync_xen_wallclock(unsigned long dummy)
12397+{
12398+ time_t sec;
12399+ s64 nsec;
12400+ struct xen_platform_op op;
12401+
12402+ if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
12403+ return;
12404+
12405+ write_seqlock_irq(&xtime_lock);
12406+
12407+ sec = xtime.tv_sec;
12408+ nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
12409+ __normalize_time(&sec, &nsec);
12410+
12411+ op.cmd = XENPF_settime;
12412+ op.u.settime.secs = sec;
12413+ op.u.settime.nsecs = nsec;
12414+ op.u.settime.system_time = processed_system_time;
12415+ WARN_ON(HYPERVISOR_platform_op(&op));
12416+
12417+ update_wallclock();
12418+
12419+ write_sequnlock_irq(&xtime_lock);
12420+
12421+ /* Once per minute. */
12422+ mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
12423+}
12424+
12425+static int set_rtc_mmss(unsigned long nowtime)
12426+{
12427+ int retval;
12428+ unsigned long flags;
12429+
12430+ if (independent_wallclock || !is_initial_xendomain())
12431+ return 0;
12432+
12433+ /* gets recalled with irq locally disabled */
12434+ /* XXX - does irqsave resolve this? -johnstul */
12435+ spin_lock_irqsave(&rtc_lock, flags);
12436+ if (efi_enabled)
12437+ retval = efi_set_rtc_mmss(nowtime);
12438+ else
12439+ retval = mach_set_rtc_mmss(nowtime);
12440+ spin_unlock_irqrestore(&rtc_lock, flags);
12441+
12442+ return retval;
12443+}
12444+
12445+/* monotonic_clock(): returns # of nanoseconds passed since time_init()
12446+ * Note: This function is required to return accurate
12447+ * time even in the absence of multiple timer ticks.
12448+ */
12449+unsigned long long monotonic_clock(void)
12450+{
12451+ unsigned int cpu = get_cpu();
12452+ struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
12453+ u64 time;
12454+ u32 local_time_version;
12455+
12456+ do {
12457+ local_time_version = shadow->version;
12458+ barrier();
12459+ time = shadow->system_timestamp + get_nsec_offset(shadow);
12460+ if (!time_values_up_to_date(cpu))
12461+ get_time_values_from_xen(cpu);
12462+ barrier();
12463+ } while (local_time_version != shadow->version);
12464+
12465+ put_cpu();
12466+
12467+ return time;
12468+}
12469+EXPORT_SYMBOL(monotonic_clock);
12470+
12471+#ifdef __x86_64__
12472+unsigned long long sched_clock(void)
12473+{
12474+ return monotonic_clock();
12475+}
12476+#endif
12477+
12478+#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
12479+unsigned long profile_pc(struct pt_regs *regs)
12480+{
12481+ unsigned long pc = instruction_pointer(regs);
12482+
12483+#ifdef __x86_64__
12484+ /* Assume the lock function has either no stack frame or only a single word.
12485+ This checks if the address on the stack looks like a kernel text address.
12486+ There is a small window for false hits, but in that case the tick
12487+ is just accounted to the spinlock function.
12488+ Better would be to write these functions in assembler again
12489+ and check exactly. */
12490+ if (!user_mode_vm(regs) && in_lock_functions(pc)) {
12491+ char *v = *(char **)regs->rsp;
12492+ if ((v >= _stext && v <= _etext) ||
12493+ (v >= _sinittext && v <= _einittext) ||
12494+ (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
12495+ return (unsigned long)v;
12496+ return ((unsigned long *)regs->rsp)[1];
12497+ }
12498+#else
12499+ if (!user_mode_vm(regs) && in_lock_functions(pc))
12500+ return *(unsigned long *)(regs->ebp + 4);
12501+#endif
12502+
12503+ return pc;
12504+}
12505+EXPORT_SYMBOL(profile_pc);
12506+#endif
12507+
12508+/*
12509+ * This is the same as the above, except we _also_ save the current
12510+ * Time Stamp Counter value at the time of the timer interrupt, so that
12511+ * we later on can estimate the time of day more exactly.
12512+ */
12513+irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
12514+{
12515+ s64 delta, delta_cpu, stolen, blocked;
12516+ u64 sched_time;
12517+ unsigned int i, cpu = smp_processor_id();
12518+ struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
12519+ struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
12520+
12521+ /*
12522+ * Here we are in the timer irq handler. We just have irqs locally
12523+ * disabled but we don't know if the timer_bh is running on the other
12524+	 * CPU. We need to avoid an SMP race with it. NOTE: we don't need
12525+ * the irq version of write_lock because as just said we have irq
12526+ * locally disabled. -arca
12527+ */
12528+ write_seqlock(&xtime_lock);
12529+
12530+ do {
12531+ get_time_values_from_xen(cpu);
12532+
12533+ /* Obtain a consistent snapshot of elapsed wallclock cycles. */
12534+ delta = delta_cpu =
12535+ shadow->system_timestamp + get_nsec_offset(shadow);
12536+ delta -= processed_system_time;
12537+ delta_cpu -= per_cpu(processed_system_time, cpu);
12538+
12539+ /*
12540+ * Obtain a consistent snapshot of stolen/blocked cycles. We
12541+ * can use state_entry_time to detect if we get preempted here.
12542+ */
12543+ do {
12544+ sched_time = runstate->state_entry_time;
12545+ barrier();
12546+ stolen = runstate->time[RUNSTATE_runnable] +
12547+ runstate->time[RUNSTATE_offline] -
12548+ per_cpu(processed_stolen_time, cpu);
12549+ blocked = runstate->time[RUNSTATE_blocked] -
12550+ per_cpu(processed_blocked_time, cpu);
12551+ barrier();
12552+ } while (sched_time != runstate->state_entry_time);
12553+ } while (!time_values_up_to_date(cpu));
12554+
12555+ if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
12556+ unlikely(delta_cpu < -(s64)permitted_clock_jitter))
12557+ && printk_ratelimit()) {
12558+ printk("Timer ISR/%u: Time went backwards: "
12559+ "delta=%lld delta_cpu=%lld shadow=%lld "
12560+ "off=%lld processed=%lld cpu_processed=%lld\n",
12561+ cpu, delta, delta_cpu, shadow->system_timestamp,
12562+ (s64)get_nsec_offset(shadow),
12563+ processed_system_time,
12564+ per_cpu(processed_system_time, cpu));
12565+ for (i = 0; i < num_online_cpus(); i++)
12566+ printk(" %d: %lld\n", i,
12567+ per_cpu(processed_system_time, i));
12568+ }
12569+
12570+ /* System-wide jiffy work. */
12571+ while (delta >= NS_PER_TICK) {
12572+ delta -= NS_PER_TICK;
12573+ processed_system_time += NS_PER_TICK;
12574+ do_timer(regs);
12575+ }
12576+
12577+ if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
12578+ update_wallclock();
12579+ if (keventd_up())
12580+ schedule_work(&clock_was_set_work);
12581+ }
12582+
12583+ write_sequnlock(&xtime_lock);
12584+
12585+ /*
12586+ * Account stolen ticks.
12587+ * HACK: Passing NULL to account_steal_time()
12588+ * ensures that the ticks are accounted as stolen.
12589+ */
12590+ if ((stolen > 0) && (delta_cpu > 0)) {
12591+ delta_cpu -= stolen;
12592+ if (unlikely(delta_cpu < 0))
12593+ stolen += delta_cpu; /* clamp local-time progress */
12594+ do_div(stolen, NS_PER_TICK);
12595+ per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
12596+ per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
12597+ account_steal_time(NULL, (cputime_t)stolen);
12598+ }
12599+
12600+ /*
12601+ * Account blocked ticks.
12602+ * HACK: Passing idle_task to account_steal_time()
12603+ * ensures that the ticks are accounted as idle/wait.
12604+ */
12605+ if ((blocked > 0) && (delta_cpu > 0)) {
12606+ delta_cpu -= blocked;
12607+ if (unlikely(delta_cpu < 0))
12608+ blocked += delta_cpu; /* clamp local-time progress */
12609+ do_div(blocked, NS_PER_TICK);
12610+ per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
12611+ per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
12612+ account_steal_time(idle_task(cpu), (cputime_t)blocked);
12613+ }
12614+
12615+ /* Account user/system ticks. */
12616+ if (delta_cpu > 0) {
12617+ do_div(delta_cpu, NS_PER_TICK);
12618+ per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
12619+ if (user_mode_vm(regs))
12620+ account_user_time(current, (cputime_t)delta_cpu);
12621+ else
12622+ account_system_time(current, HARDIRQ_OFFSET,
12623+ (cputime_t)delta_cpu);
12624+ }
12625+
12626+ /* Offlined for more than a few seconds? Avoid lockup warnings. */
12627+ if (stolen > 5*HZ)
12628+ touch_softlockup_watchdog();
12629+
12630+ /* Local timer processing (see update_process_times()). */
12631+ run_local_timers();
12632+ if (rcu_pending(cpu))
12633+ rcu_check_callbacks(cpu, user_mode_vm(regs));
12634+ scheduler_tick();
12635+ run_posix_cpu_timers(current);
12636+ profile_tick(CPU_PROFILING, regs);
12637+
12638+ return IRQ_HANDLED;
12639+}
12640+
12641+static void init_missing_ticks_accounting(unsigned int cpu)
12642+{
12643+ struct vcpu_register_runstate_memory_area area;
12644+ struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
12645+ int rc;
12646+
12647+ memset(runstate, 0, sizeof(*runstate));
12648+
12649+ area.addr.v = runstate;
12650+ rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
12651+ WARN_ON(rc && rc != -ENOSYS);
12652+
12653+ per_cpu(processed_blocked_time, cpu) =
12654+ runstate->time[RUNSTATE_blocked];
12655+ per_cpu(processed_stolen_time, cpu) =
12656+ runstate->time[RUNSTATE_runnable] +
12657+ runstate->time[RUNSTATE_offline];
12658+}
12659+
12660+/* not static: needed by APM */
12661+unsigned long get_cmos_time(void)
12662+{
12663+ unsigned long retval;
12664+ unsigned long flags;
12665+
12666+ spin_lock_irqsave(&rtc_lock, flags);
12667+
12668+ if (efi_enabled)
12669+ retval = efi_get_time();
12670+ else
12671+ retval = mach_get_cmos_time();
12672+
12673+ spin_unlock_irqrestore(&rtc_lock, flags);
12674+
12675+ return retval;
12676+}
12677+EXPORT_SYMBOL(get_cmos_time);
12678+
12679+static void sync_cmos_clock(unsigned long dummy);
12680+
12681+static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
12682+
12683+static void sync_cmos_clock(unsigned long dummy)
12684+{
12685+ struct timeval now, next;
12686+ int fail = 1;
12687+
12688+ /*
12689+ * If we have an externally synchronized Linux clock, then update
12690+ * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
12691+ * called as close as possible to 500 ms before the new second starts.
12692+ * This code is run on a timer. If the clock is set, that timer
12693+ * may not expire at the correct time. Thus, we adjust...
12694+ */
12695+ if (!ntp_synced())
12696+ /*
12697+ * Not synced, exit, do not restart a timer (if one is
12698+ * running, let it run out).
12699+ */
12700+ return;
12701+
12702+ do_gettimeofday(&now);
12703+ if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
12704+ now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
12705+ fail = set_rtc_mmss(now.tv_sec);
12706+
12707+ next.tv_usec = USEC_AFTER - now.tv_usec;
12708+ if (next.tv_usec <= 0)
12709+ next.tv_usec += USEC_PER_SEC;
12710+
12711+ if (!fail)
12712+ next.tv_sec = 659;
12713+ else
12714+ next.tv_sec = 0;
12715+
12716+ if (next.tv_usec >= USEC_PER_SEC) {
12717+ next.tv_sec++;
12718+ next.tv_usec -= USEC_PER_SEC;
12719+ }
12720+ mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
12721+}
12722+
12723+void notify_arch_cmos_timer(void)
12724+{
12725+ mod_timer(&sync_cmos_timer, jiffies + 1);
12726+ mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
12727+}
12728+
12729+static int timer_resume(struct sys_device *dev)
12730+{
12731+ extern void time_resume(void);
12732+ time_resume();
12733+ return 0;
12734+}
12735+
12736+static struct sysdev_class timer_sysclass = {
12737+ .resume = timer_resume,
12738+ set_kset_name("timer"),
12739+};
12740+
12741+
12742+/* XXX this driverfs stuff should probably go elsewhere later -john */
12743+static struct sys_device device_timer = {
12744+ .id = 0,
12745+ .cls = &timer_sysclass,
12746+};
12747+
12748+static int time_init_device(void)
12749+{
12750+ int error = sysdev_class_register(&timer_sysclass);
12751+ if (!error)
12752+ error = sysdev_register(&device_timer);
12753+ return error;
12754+}
12755+
12756+device_initcall(time_init_device);
12757+
12758+#ifdef CONFIG_HPET_TIMER
12759+extern void (*late_time_init)(void);
12760+/* Duplicate of time_init() below, with hpet_enable part added */
12761+static void __init hpet_time_init(void)
12762+{
12763+ xtime.tv_sec = get_cmos_time();
12764+ xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
12765+ set_normalized_timespec(&wall_to_monotonic,
12766+ -xtime.tv_sec, -xtime.tv_nsec);
12767+
12768+ if ((hpet_enable() >= 0) && hpet_use_timer) {
12769+ printk("Using HPET for base-timer\n");
12770+ }
12771+
12772+ time_init_hook();
12773+}
12774+#endif
12775+
12776+/* Dynamically-mapped IRQ. */
12777+DEFINE_PER_CPU(int, timer_irq);
12778+
12779+extern void (*late_time_init)(void);
12780+static void setup_cpu0_timer_irq(void)
12781+{
12782+ per_cpu(timer_irq, 0) =
12783+ bind_virq_to_irqhandler(
12784+ VIRQ_TIMER,
12785+ 0,
12786+ timer_interrupt,
12787+ SA_INTERRUPT,
12788+ "timer0",
12789+ NULL);
12790+ BUG_ON(per_cpu(timer_irq, 0) < 0);
12791+}
12792+
12793+static struct vcpu_set_periodic_timer xen_set_periodic_tick = {
12794+ .period_ns = NS_PER_TICK
12795+};
12796+
12797+void __init time_init(void)
12798+{
12799+#ifdef CONFIG_HPET_TIMER
12800+ if (is_hpet_capable()) {
12801+ /*
12802+ * HPET initialization needs to do memory-mapped io. So, let
12803+ * us do a late initialization after mem_init().
12804+ */
12805+ late_time_init = hpet_time_init;
12806+ return;
12807+ }
12808+#endif
12809+
12810+ switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, 0,
12811+ &xen_set_periodic_tick)) {
12812+ case 0:
12813+#if CONFIG_XEN_COMPAT <= 0x030004
12814+ case -ENOSYS:
12815+#endif
12816+ break;
12817+ default:
12818+ BUG();
12819+ }
12820+
12821+ get_time_values_from_xen(0);
12822+
12823+ processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
12824+ per_cpu(processed_system_time, 0) = processed_system_time;
12825+ init_missing_ticks_accounting(0);
12826+
12827+ update_wallclock();
12828+
12829+ init_cpu_khz();
12830+ printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
12831+ cpu_khz / 1000, cpu_khz % 1000);
12832+
12833+#if defined(__x86_64__)
12834+ vxtime.mode = VXTIME_TSC;
12835+ vxtime.quot = (1000000L << 32) / vxtime_hz;
12836+ vxtime.tsc_quot = (1000L << 32) / cpu_khz;
12837+ sync_core();
12838+ rdtscll(vxtime.last_tsc);
12839+#endif
12840+
12841+ /* Cannot request_irq() until kmem is initialised. */
12842+ late_time_init = setup_cpu0_timer_irq;
12843+}
12844+
12845+/* Convert jiffies to system time. */
12846+u64 jiffies_to_st(unsigned long j)
12847+{
12848+ unsigned long seq;
12849+ long delta;
12850+ u64 st;
12851+
12852+ do {
12853+ seq = read_seqbegin(&xtime_lock);
12854+ delta = j - jiffies;
12855+ if (delta < 1) {
12856+ /* Triggers in some wrap-around cases, but that's okay:
12857+ * we just end up with a shorter timeout. */
12858+ st = processed_system_time + NS_PER_TICK;
12859+ } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
12860+ /* Very long timeout means there is no pending timer.
12861+ * We indicate this to Xen by passing zero timeout. */
12862+ st = 0;
12863+ } else {
12864+ st = processed_system_time + delta * (u64)NS_PER_TICK;
12865+ }
12866+ } while (read_seqretry(&xtime_lock, seq));
12867+
12868+ return st;
12869+}
12870+EXPORT_SYMBOL(jiffies_to_st);
12871+
12872+/*
12873+ * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
12874+ * These functions are based on implementations from arch/s390/kernel/time.c
12875+ */
12876+static void stop_hz_timer(void)
12877+{
12878+ struct vcpu_set_singleshot_timer singleshot;
12879+ unsigned int cpu = smp_processor_id();
12880+ unsigned long j;
12881+ int rc;
12882+
12883+ cpu_set(cpu, nohz_cpu_mask);
12884+
12885+ /* See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs */
12886+ /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a */
12887+ /* value of rcp->cur that matches rdp->quiescbatch and allows us to */
12888+ /* stop the hz timer then the cpumasks created for subsequent values */
12889+ /* of cur in rcu_start_batch are guaranteed to pick up the updated */
12890+ /* nohz_cpu_mask and so will not depend on this cpu. */
12891+
12892+ smp_mb();
12893+
12894+ /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
12895+ if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
12896+ (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
12897+ cpu_clear(cpu, nohz_cpu_mask);
12898+ j = jiffies + 1;
12899+ }
12900+
12901+ singleshot.timeout_abs_ns = jiffies_to_st(j) + NS_PER_TICK/2;
12902+ singleshot.flags = 0;
12903+ rc = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &singleshot);
12904+#if CONFIG_XEN_COMPAT <= 0x030004
12905+ if (rc) {
12906+ BUG_ON(rc != -ENOSYS);
12907+ rc = HYPERVISOR_set_timer_op(singleshot.timeout_abs_ns);
12908+ }
12909+#endif
12910+ BUG_ON(rc);
12911+}
12912+
12913+static void start_hz_timer(void)
12914+{
12915+ cpu_clear(smp_processor_id(), nohz_cpu_mask);
12916+}
12917+
12918+void raw_safe_halt(void)
12919+{
12920+ stop_hz_timer();
12921+ /* Blocking includes an implicit local_irq_enable(). */
12922+ HYPERVISOR_block();
12923+ start_hz_timer();
12924+}
12925+EXPORT_SYMBOL(raw_safe_halt);
12926+
12927+void halt(void)
12928+{
12929+ if (irqs_disabled())
12930+ VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
12931+}
12932+EXPORT_SYMBOL(halt);
12933+
12934+/* No locking required. Interrupts are disabled on all CPUs. */
12935+void time_resume(void)
12936+{
12937+ unsigned int cpu;
12938+
12939+ init_cpu_khz();
12940+
12941+ for_each_online_cpu(cpu) {
12942+ switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
12943+ &xen_set_periodic_tick)) {
12944+ case 0:
12945+#if CONFIG_XEN_COMPAT <= 0x030004
12946+ case -ENOSYS:
12947+#endif
12948+ break;
12949+ default:
12950+ BUG();
12951+ }
12952+ get_time_values_from_xen(cpu);
12953+ per_cpu(processed_system_time, cpu) =
12954+ per_cpu(shadow_time, 0).system_timestamp;
12955+ init_missing_ticks_accounting(cpu);
12956+ }
12957+
12958+ processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
12959+
12960+ update_wallclock();
12961+}
12962+
12963+#ifdef CONFIG_SMP
12964+static char timer_name[NR_CPUS][15];
12965+
12966+int __cpuinit local_setup_timer(unsigned int cpu)
12967+{
12968+ int seq, irq;
12969+
12970+ BUG_ON(cpu == 0);
12971+
12972+ switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
12973+ &xen_set_periodic_tick)) {
12974+ case 0:
12975+#if CONFIG_XEN_COMPAT <= 0x030004
12976+ case -ENOSYS:
12977+#endif
12978+ break;
12979+ default:
12980+ BUG();
12981+ }
12982+
12983+ do {
12984+ seq = read_seqbegin(&xtime_lock);
12985+ /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
12986+ per_cpu(processed_system_time, cpu) =
12987+ per_cpu(shadow_time, 0).system_timestamp;
12988+ init_missing_ticks_accounting(cpu);
12989+ } while (read_seqretry(&xtime_lock, seq));
12990+
12991+ sprintf(timer_name[cpu], "timer%u", cpu);
12992+ irq = bind_virq_to_irqhandler(VIRQ_TIMER,
12993+ cpu,
12994+ timer_interrupt,
12995+ SA_INTERRUPT,
12996+ timer_name[cpu],
12997+ NULL);
12998+ if (irq < 0)
12999+ return irq;
13000+ per_cpu(timer_irq, cpu) = irq;
13001+
13002+ return 0;
13003+}
13004+
13005+void __cpuexit local_teardown_timer(unsigned int cpu)
13006+{
13007+ BUG_ON(cpu == 0);
13008+ unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
13009+}
13010+#endif
13011+
13012+#ifdef CONFIG_CPU_FREQ
13013+static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
13014+ void *data)
13015+{
13016+ struct cpufreq_freqs *freq = data;
13017+ struct xen_platform_op op;
13018+
13019+ if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
13020+ return 0;
13021+
13022+ if (val == CPUFREQ_PRECHANGE)
13023+ return 0;
13024+
13025+ op.cmd = XENPF_change_freq;
13026+ op.u.change_freq.flags = 0;
13027+ op.u.change_freq.cpu = freq->cpu;
13028+ op.u.change_freq.freq = (u64)freq->new * 1000;
13029+ WARN_ON(HYPERVISOR_platform_op(&op));
13030+
13031+ return 0;
13032+}
13033+
13034+static struct notifier_block time_cpufreq_notifier_block = {
13035+ .notifier_call = time_cpufreq_notifier
13036+};
13037+
13038+static int __init cpufreq_time_setup(void)
13039+{
13040+	if (cpufreq_register_notifier(&time_cpufreq_notifier_block,
13041+ CPUFREQ_TRANSITION_NOTIFIER)) {
13042+ printk(KERN_ERR "failed to set up cpufreq notifier\n");
13043+ return -ENODEV;
13044+ }
13045+ return 0;
13046+}
13047+
13048+core_initcall(cpufreq_time_setup);
13049+#endif
13050+
13051+/*
13052+ * /proc/sys/xen: This really belongs in another file. It can stay here for
13053+ * now however.
13054+ */
13055+static ctl_table xen_subtable[] = {
13056+ {
13057+ .ctl_name = 1,
13058+ .procname = "independent_wallclock",
13059+ .data = &independent_wallclock,
13060+ .maxlen = sizeof(independent_wallclock),
13061+ .mode = 0644,
13062+ .proc_handler = proc_dointvec
13063+ },
13064+ {
13065+ .ctl_name = 2,
13066+ .procname = "permitted_clock_jitter",
13067+ .data = &permitted_clock_jitter,
13068+ .maxlen = sizeof(permitted_clock_jitter),
13069+ .mode = 0644,
13070+ .proc_handler = proc_doulongvec_minmax
13071+ },
13072+ { 0 }
13073+};
13074+static ctl_table xen_table[] = {
13075+ {
13076+ .ctl_name = 123,
13077+ .procname = "xen",
13078+ .mode = 0555,
13079+ .child = xen_subtable},
13080+ { 0 }
13081+};
13082+static int __init xen_sysctl_init(void)
13083+{
13084+ (void)register_sysctl_table(xen_table, 0);
13085+ return 0;
13086+}
13087+__initcall(xen_sysctl_init);
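(Editorial aside, not part of the patch hunk above: time_32-xen.c converts TSC deltas into nanoseconds with a 32.32 fixed-point multiplier supplied by Xen; see scale_delta() and get_nsec_offset(). The sketch below redoes that arithmetic as a stand-alone C program, using a compiler-provided 128-bit integer in place of the patch's inline assembly; the multiplier and tick count are assumed example values for a roughly 2.4 GHz TSC, not values read from a real vcpu_time_info.)

/*
 * ns = ((tsc_delta << tsc_shift) * tsc_to_system_mul) >> 32
 * Same math as scale_delta() above, written portably with __int128.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

	/* 64x32 multiply, keeping bits 32..95 of the 96-bit product. */
	return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}

int main(void)
{
	/* ~0.4167 ns per tick at 2.4 GHz, i.e. mul_frac ~= 0.4167 * 2^32. */
	uint32_t tsc_to_system_mul = 1789569707u;
	int      tsc_shift         = 0;
	uint64_t tsc_delta         = 2400000000ull;  /* about one second of ticks */

	printf("%llu ns\n", (unsigned long long)
	       scale_delta(tsc_delta, tsc_to_system_mul, tsc_shift));
	return 0;
}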
13088Index: head-2008-11-25/arch/x86/kernel/traps_32-xen.c
13089===================================================================
13090--- /dev/null 1970-01-01 00:00:00.000000000 +0000
13091+++ head-2008-11-25/arch/x86/kernel/traps_32-xen.c 2008-04-02 12:34:02.000000000 +0200
13092@@ -0,0 +1,1190 @@
13093+/*
13094+ * linux/arch/i386/traps.c
13095+ *
13096+ * Copyright (C) 1991, 1992 Linus Torvalds
13097+ *
13098+ * Pentium III FXSR, SSE support
13099+ * Gareth Hughes <gareth@valinux.com>, May 2000
13100+ */
13101+
13102+/*
13103+ * 'Traps.c' handles hardware traps and faults after we have saved some
13104+ * state in 'asm.s'.
13105+ */
13106+#include <linux/sched.h>
13107+#include <linux/kernel.h>
13108+#include <linux/string.h>
13109+#include <linux/errno.h>
13110+#include <linux/timer.h>
13111+#include <linux/mm.h>
13112+#include <linux/init.h>
13113+#include <linux/delay.h>
13114+#include <linux/spinlock.h>
13115+#include <linux/interrupt.h>
13116+#include <linux/highmem.h>
13117+#include <linux/kallsyms.h>
13118+#include <linux/ptrace.h>
13119+#include <linux/utsname.h>
13120+#include <linux/kprobes.h>
13121+#include <linux/kexec.h>
13122+#include <linux/unwind.h>
13123+
13124+#ifdef CONFIG_EISA
13125+#include <linux/ioport.h>
13126+#include <linux/eisa.h>
13127+#endif
13128+
13129+#ifdef CONFIG_MCA
13130+#include <linux/mca.h>
13131+#endif
13132+
13133+#include <asm/processor.h>
13134+#include <asm/system.h>
13135+#include <asm/uaccess.h>
13136+#include <asm/io.h>
13137+#include <asm/atomic.h>
13138+#include <asm/debugreg.h>
13139+#include <asm/desc.h>
13140+#include <asm/i387.h>
13141+#include <asm/nmi.h>
13142+#include <asm/unwind.h>
13143+#include <asm/smp.h>
13144+#include <asm/arch_hooks.h>
13145+#include <asm/kdebug.h>
13146+
13147+#include <linux/module.h>
13148+
13149+#include "mach_traps.h"
13150+
13151+asmlinkage int system_call(void);
13152+
13153+struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
13154+ { 0, 0 }, { 0, 0 } };
13155+
13156+/* Do we ignore FPU interrupts ? */
13157+char ignore_fpu_irq = 0;
13158+
13159+#ifndef CONFIG_X86_NO_IDT
13160+/*
13161+ * The IDT has to be page-aligned to simplify the Pentium
13162+ * F0 0F bug workaround.. We have a special link segment
13163+ * for this.
13164+ */
13165+struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
13166+#endif
13167+
13168+asmlinkage void divide_error(void);
13169+asmlinkage void debug(void);
13170+asmlinkage void nmi(void);
13171+asmlinkage void int3(void);
13172+asmlinkage void overflow(void);
13173+asmlinkage void bounds(void);
13174+asmlinkage void invalid_op(void);
13175+asmlinkage void device_not_available(void);
13176+asmlinkage void coprocessor_segment_overrun(void);
13177+asmlinkage void invalid_TSS(void);
13178+asmlinkage void segment_not_present(void);
13179+asmlinkage void stack_segment(void);
13180+asmlinkage void general_protection(void);
13181+asmlinkage void page_fault(void);
13182+asmlinkage void coprocessor_error(void);
13183+asmlinkage void simd_coprocessor_error(void);
13184+asmlinkage void alignment_check(void);
13185+#ifndef CONFIG_XEN
13186+asmlinkage void spurious_interrupt_bug(void);
13187+#else
13188+asmlinkage void fixup_4gb_segment(void);
13189+#endif
13190+asmlinkage void machine_check(void);
13191+
13192+static int kstack_depth_to_print = 24;
13193+#ifdef CONFIG_STACK_UNWIND
13194+static int call_trace = 1;
13195+#else
13196+#define call_trace (-1)
13197+#endif
13198+ATOMIC_NOTIFIER_HEAD(i386die_chain);
13199+
13200+int register_die_notifier(struct notifier_block *nb)
13201+{
13202+ vmalloc_sync_all();
13203+ return atomic_notifier_chain_register(&i386die_chain, nb);
13204+}
13205+EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
13206+
13207+int unregister_die_notifier(struct notifier_block *nb)
13208+{
13209+ return atomic_notifier_chain_unregister(&i386die_chain, nb);
13210+}
13211+EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
13212+
13213+static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
13214+{
13215+ return p > (void *)tinfo &&
13216+ p < (void *)tinfo + THREAD_SIZE - 3;
13217+}
13218+
13219+/*
13220+ * Print one address/symbol entry per line.
13221+ */
13222+static inline void print_addr_and_symbol(unsigned long addr, char *log_lvl)
13223+{
13224+ printk(" [<%08lx>] ", addr);
13225+
13226+ print_symbol("%s\n", addr);
13227+}
13228+
13229+static inline unsigned long print_context_stack(struct thread_info *tinfo,
13230+ unsigned long *stack, unsigned long ebp,
13231+ char *log_lvl)
13232+{
13233+ unsigned long addr;
13234+
13235+#ifdef CONFIG_FRAME_POINTER
13236+ while (valid_stack_ptr(tinfo, (void *)ebp)) {
13237+ addr = *(unsigned long *)(ebp + 4);
13238+ print_addr_and_symbol(addr, log_lvl);
13239+ /*
13240+ * break out of recursive entries (such as
13241+ * end_of_stack_stop_unwind_function):
13242+ */
13243+ if (ebp == *(unsigned long *)ebp)
13244+ break;
13245+ ebp = *(unsigned long *)ebp;
13246+ }
13247+#else
13248+ while (valid_stack_ptr(tinfo, stack)) {
13249+ addr = *stack++;
13250+ if (__kernel_text_address(addr))
13251+ print_addr_and_symbol(addr, log_lvl);
13252+ }
13253+#endif
13254+ return ebp;
13255+}
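The CONFIG_FRAME_POINTER branch of print_context_stack() above follows the saved-%ebp chain, where each frame keeps the caller's %ebp at [ebp] and the return address at [ebp+4]. A self-contained userspace sketch of the same walk over a hand-built fake stack (the struct layout and the addresses are invented for illustration):

        #include <stdio.h>
        #include <stdint.h>

        /* Illustration only: mimic the CONFIG_FRAME_POINTER walk above over a
         * fake stack; real frames are laid out by the compiler, not by hand. */
        struct frame { uintptr_t saved_ebp; uintptr_t ret_addr; };

        static void walk(uintptr_t ebp, uintptr_t lo, uintptr_t hi)
        {
                while (ebp >= lo && ebp <= hi - sizeof(struct frame)) {
                        struct frame *f = (struct frame *)ebp;
                        printf(" [<%08lx>]\n", (unsigned long)f->ret_addr);
                        if (ebp == f->saved_ebp)        /* recursive/terminal frame */
                                break;
                        ebp = f->saved_ebp;
                }
        }

        int main(void)
        {
                struct frame stack[3];

                stack[0] = (struct frame){ (uintptr_t)&stack[1], 0xc0100010UL };
                stack[1] = (struct frame){ (uintptr_t)&stack[2], 0xc0100020UL };
                stack[2] = (struct frame){ (uintptr_t)&stack[2], 0xc0100030UL };
                walk((uintptr_t)&stack[0], (uintptr_t)stack,
                     (uintptr_t)stack + sizeof(stack));
                return 0;
        }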
13256+
13257+static asmlinkage int
13258+show_trace_unwind(struct unwind_frame_info *info, void *log_lvl)
13259+{
13260+ int n = 0;
13261+
13262+ while (unwind(info) == 0 && UNW_PC(info)) {
13263+ n++;
13264+ print_addr_and_symbol(UNW_PC(info), log_lvl);
13265+ if (arch_unw_user_mode(info))
13266+ break;
13267+ }
13268+ return n;
13269+}
13270+
13271+static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
13272+ unsigned long *stack, char *log_lvl)
13273+{
13274+ unsigned long ebp;
13275+
13276+ if (!task)
13277+ task = current;
13278+
13279+ if (call_trace >= 0) {
13280+ int unw_ret = 0;
13281+ struct unwind_frame_info info;
13282+
13283+ if (regs) {
13284+ if (unwind_init_frame_info(&info, task, regs) == 0)
13285+ unw_ret = show_trace_unwind(&info, log_lvl);
13286+ } else if (task == current)
13287+ unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl);
13288+ else {
13289+ if (unwind_init_blocked(&info, task) == 0)
13290+ unw_ret = show_trace_unwind(&info, log_lvl);
13291+ }
13292+ if (unw_ret > 0) {
13293+ if (call_trace == 1 && !arch_unw_user_mode(&info)) {
13294+ print_symbol("DWARF2 unwinder stuck at %s\n",
13295+ UNW_PC(&info));
13296+ if (UNW_SP(&info) >= PAGE_OFFSET) {
13297+ printk("Leftover inexact backtrace:\n");
13298+ stack = (void *)UNW_SP(&info);
13299+ } else
13300+ printk("Full inexact backtrace again:\n");
13301+ } else if (call_trace >= 1)
13302+ return;
13303+ else
13304+ printk("Full inexact backtrace again:\n");
13305+ } else
13306+ printk("Inexact backtrace:\n");
13307+ }
13308+
13309+ if (task == current) {
13310+ /* Grab ebp right from our regs */
13311+ asm ("movl %%ebp, %0" : "=r" (ebp) : );
13312+ } else {
13313+ /* ebp is the last reg pushed by switch_to */
13314+ ebp = *(unsigned long *) task->thread.esp;
13315+ }
13316+
13317+ while (1) {
13318+ struct thread_info *context;
13319+ context = (struct thread_info *)
13320+ ((unsigned long)stack & (~(THREAD_SIZE - 1)));
13321+ ebp = print_context_stack(context, stack, ebp, log_lvl);
13322+ stack = (unsigned long*)context->previous_esp;
13323+ if (!stack)
13324+ break;
13325+ printk("%s =======================\n", log_lvl);
13326+ }
13327+}
13328+
13329+void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack)
13330+{
13331+ show_trace_log_lvl(task, regs, stack, "");
13332+}
13333+
13334+static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
13335+ unsigned long *esp, char *log_lvl)
13336+{
13337+ unsigned long *stack;
13338+ int i;
13339+
13340+ if (esp == NULL) {
13341+ if (task)
13342+ esp = (unsigned long*)task->thread.esp;
13343+ else
13344+ esp = (unsigned long *)&esp;
13345+ }
13346+
13347+ stack = esp;
13348+ for(i = 0; i < kstack_depth_to_print; i++) {
13349+ if (kstack_end(stack))
13350+ break;
13351+ if (i && ((i % 8) == 0))
13352+ printk("\n%s ", log_lvl);
13353+ printk("%08lx ", *stack++);
13354+ }
13355+ printk("\n%sCall Trace:\n", log_lvl);
13356+ show_trace_log_lvl(task, regs, esp, log_lvl);
13357+}
13358+
13359+void show_stack(struct task_struct *task, unsigned long *esp)
13360+{
13361+ printk(" ");
13362+ show_stack_log_lvl(task, NULL, esp, "");
13363+}
13364+
13365+/*
13366+ * The architecture-independent dump_stack generator
13367+ */
13368+void dump_stack(void)
13369+{
13370+ unsigned long stack;
13371+
13372+ show_trace(current, NULL, &stack);
13373+}
13374+
13375+EXPORT_SYMBOL(dump_stack);
13376+
13377+void show_registers(struct pt_regs *regs)
13378+{
13379+ int i;
13380+ int in_kernel = 1;
13381+ unsigned long esp;
13382+ unsigned short ss;
13383+
13384+ esp = (unsigned long) (&regs->esp);
13385+ savesegment(ss, ss);
13386+ if (user_mode_vm(regs)) {
13387+ in_kernel = 0;
13388+ esp = regs->esp;
13389+ ss = regs->xss & 0xffff;
13390+ }
13391+ print_modules();
13392+ printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n"
13393+ "EFLAGS: %08lx (%s %.*s) \n",
13394+ smp_processor_id(), 0xffff & regs->xcs, regs->eip,
13395+ print_tainted(), regs->eflags, system_utsname.release,
13396+ (int)strcspn(system_utsname.version, " "),
13397+ system_utsname.version);
13398+ print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
13399+ printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
13400+ regs->eax, regs->ebx, regs->ecx, regs->edx);
13401+ printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
13402+ regs->esi, regs->edi, regs->ebp, esp);
13403+ printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n",
13404+ regs->xds & 0xffff, regs->xes & 0xffff, ss);
13405+ printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
13406+ TASK_COMM_LEN, current->comm, current->pid,
13407+ current_thread_info(), current, current->thread_info);
13408+ /*
13409+ * When in-kernel, we also print out the stack and code at the
13410+ * time of the fault..
13411+ */
13412+ if (in_kernel) {
13413+ u8 __user *eip;
13414+
13415+ printk("\n" KERN_EMERG "Stack: ");
13416+ show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
13417+
13418+ printk(KERN_EMERG "Code: ");
13419+
13420+ eip = (u8 __user *)regs->eip - 43;
13421+ for (i = 0; i < 64; i++, eip++) {
13422+ unsigned char c;
13423+
13424+ if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
13425+ printk(" Bad EIP value.");
13426+ break;
13427+ }
13428+ if (eip == (u8 __user *)regs->eip)
13429+ printk("<%02x> ", c);
13430+ else
13431+ printk("%02x ", c);
13432+ }
13433+ }
13434+ printk("\n");
13435+}
13436+
13437+static void handle_BUG(struct pt_regs *regs)
13438+{
13439+ unsigned long eip = regs->eip;
13440+ unsigned short ud2;
13441+
13442+ if (eip < PAGE_OFFSET)
13443+ return;
13444+ if (__get_user(ud2, (unsigned short __user *)eip))
13445+ return;
13446+ if (ud2 != 0x0b0f)
13447+ return;
13448+
13449+ printk(KERN_EMERG "------------[ cut here ]------------\n");
13450+
13451+#ifdef CONFIG_DEBUG_BUGVERBOSE
13452+ do {
13453+ unsigned short line;
13454+ char *file;
13455+ char c;
13456+
13457+ if (__get_user(line, (unsigned short __user *)(eip + 2)))
13458+ break;
13459+ if (__get_user(file, (char * __user *)(eip + 4)) ||
13460+ (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
13461+ file = "<bad filename>";
13462+
13463+ printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
13464+ return;
13465+ } while (0);
13466+#endif
13467+ printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n");
13468+}
13469+
13470+/* This is gone through when something in the kernel
13471+ * has done something bad and is about to be terminated.
13472+ */
13473+void die(const char * str, struct pt_regs * regs, long err)
13474+{
13475+ static struct {
13476+ spinlock_t lock;
13477+ u32 lock_owner;
13478+ int lock_owner_depth;
13479+ } die = {
13480+ .lock = SPIN_LOCK_UNLOCKED,
13481+ .lock_owner = -1,
13482+ .lock_owner_depth = 0
13483+ };
13484+ static int die_counter;
13485+ unsigned long flags;
13486+
13487+ oops_enter();
13488+
13489+ if (die.lock_owner != raw_smp_processor_id()) {
13490+ console_verbose();
13491+ spin_lock_irqsave(&die.lock, flags);
13492+ die.lock_owner = smp_processor_id();
13493+ die.lock_owner_depth = 0;
13494+ bust_spinlocks(1);
13495+ }
13496+ else
13497+ local_save_flags(flags);
13498+
13499+ if (++die.lock_owner_depth < 3) {
13500+ int nl = 0;
13501+ unsigned long esp;
13502+ unsigned short ss;
13503+
13504+ handle_BUG(regs);
13505+ printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
13506+#ifdef CONFIG_PREEMPT
13507+ printk(KERN_EMERG "PREEMPT ");
13508+ nl = 1;
13509+#endif
13510+#ifdef CONFIG_SMP
13511+ if (!nl)
13512+ printk(KERN_EMERG);
13513+ printk("SMP ");
13514+ nl = 1;
13515+#endif
13516+#ifdef CONFIG_DEBUG_PAGEALLOC
13517+ if (!nl)
13518+ printk(KERN_EMERG);
13519+ printk("DEBUG_PAGEALLOC");
13520+ nl = 1;
13521+#endif
13522+ if (nl)
13523+ printk("\n");
13524+ if (notify_die(DIE_OOPS, str, regs, err,
13525+ current->thread.trap_no, SIGSEGV) !=
13526+ NOTIFY_STOP) {
13527+ show_registers(regs);
13528+ /* Executive summary in case the oops scrolled away */
13529+ esp = (unsigned long) (&regs->esp);
13530+ savesegment(ss, ss);
13531+ if (user_mode(regs)) {
13532+ esp = regs->esp;
13533+ ss = regs->xss & 0xffff;
13534+ }
13535+ printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
13536+ print_symbol("%s", regs->eip);
13537+ printk(" SS:ESP %04x:%08lx\n", ss, esp);
13538+ }
13539+ else
13540+ regs = NULL;
13541+ } else
13542+ printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
13543+
13544+ bust_spinlocks(0);
13545+ die.lock_owner = -1;
13546+ spin_unlock_irqrestore(&die.lock, flags);
13547+
13548+ if (!regs)
13549+ return;
13550+
13551+ if (kexec_should_crash(current))
13552+ crash_kexec(regs);
13553+
13554+ if (in_interrupt())
13555+ panic("Fatal exception in interrupt");
13556+
13557+ if (panic_on_oops)
13558+ panic("Fatal exception");
13559+
13560+ oops_exit();
13561+ do_exit(SIGSEGV);
13562+}
13563+
13564+static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
13565+{
13566+ if (!user_mode_vm(regs))
13567+ die(str, regs, err);
13568+}
13569+
13570+static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
13571+ struct pt_regs * regs, long error_code,
13572+ siginfo_t *info)
13573+{
13574+ struct task_struct *tsk = current;
13575+ tsk->thread.error_code = error_code;
13576+ tsk->thread.trap_no = trapnr;
13577+
13578+ if (regs->eflags & VM_MASK) {
13579+ if (vm86)
13580+ goto vm86_trap;
13581+ goto trap_signal;
13582+ }
13583+
13584+ if (!user_mode(regs))
13585+ goto kernel_trap;
13586+
13587+ trap_signal: {
13588+ if (info)
13589+ force_sig_info(signr, info, tsk);
13590+ else
13591+ force_sig(signr, tsk);
13592+ return;
13593+ }
13594+
13595+ kernel_trap: {
13596+ if (!fixup_exception(regs))
13597+ die(str, regs, error_code);
13598+ return;
13599+ }
13600+
13601+ vm86_trap: {
13602+ int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
13603+ if (ret) goto trap_signal;
13604+ return;
13605+ }
13606+}
13607+
13608+#define DO_ERROR(trapnr, signr, str, name) \
13609+fastcall void do_##name(struct pt_regs * regs, long error_code) \
13610+{ \
13611+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
13612+ == NOTIFY_STOP) \
13613+ return; \
13614+ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
13615+}
13616+
13617+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
13618+fastcall void do_##name(struct pt_regs * regs, long error_code) \
13619+{ \
13620+ siginfo_t info; \
13621+ info.si_signo = signr; \
13622+ info.si_errno = 0; \
13623+ info.si_code = sicode; \
13624+ info.si_addr = (void __user *)siaddr; \
13625+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
13626+ == NOTIFY_STOP) \
13627+ return; \
13628+ do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
13629+}
13630+
13631+#define DO_VM86_ERROR(trapnr, signr, str, name) \
13632+fastcall void do_##name(struct pt_regs * regs, long error_code) \
13633+{ \
13634+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
13635+ == NOTIFY_STOP) \
13636+ return; \
13637+ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
13638+}
13639+
13640+#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
13641+fastcall void do_##name(struct pt_regs * regs, long error_code) \
13642+{ \
13643+ siginfo_t info; \
13644+ info.si_signo = signr; \
13645+ info.si_errno = 0; \
13646+ info.si_code = sicode; \
13647+ info.si_addr = (void __user *)siaddr; \
13648+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
13649+ == NOTIFY_STOP) \
13650+ return; \
13651+ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
13652+}
13653+
13654+DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
13655+#ifndef CONFIG_KPROBES
13656+DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
13657+#endif
13658+DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
13659+DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
13660+DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip)
13661+DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
13662+DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
13663+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
13664+DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
13665+DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
13666+DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
13667+
13668+fastcall void __kprobes do_general_protection(struct pt_regs * regs,
13669+ long error_code)
13670+{
13671+ current->thread.error_code = error_code;
13672+ current->thread.trap_no = 13;
13673+
13674+ if (regs->eflags & VM_MASK)
13675+ goto gp_in_vm86;
13676+
13677+ if (!user_mode(regs))
13678+ goto gp_in_kernel;
13679+
13680+ current->thread.error_code = error_code;
13681+ current->thread.trap_no = 13;
13682+ force_sig(SIGSEGV, current);
13683+ return;
13684+
13685+gp_in_vm86:
13686+ local_irq_enable();
13687+ handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
13688+ return;
13689+
13690+gp_in_kernel:
13691+ if (!fixup_exception(regs)) {
13692+ if (notify_die(DIE_GPF, "general protection fault", regs,
13693+ error_code, 13, SIGSEGV) == NOTIFY_STOP)
13694+ return;
13695+ die("general protection fault", regs, error_code);
13696+ }
13697+}
13698+
13699+static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
13700+{
13701+ printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying "
13702+ "to continue\n");
13703+ printk(KERN_EMERG "You probably have a hardware problem with your RAM "
13704+ "chips\n");
13705+
13706+ /* Clear and disable the memory parity error line. */
13707+ clear_mem_error(reason);
13708+}
13709+
13710+static void io_check_error(unsigned char reason, struct pt_regs * regs)
13711+{
13712+ printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
13713+ show_registers(regs);
13714+
13715+ /* Re-enable the IOCK line, wait for a few seconds */
13716+ clear_io_check_error(reason);
13717+}
13718+
13719+static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
13720+{
13721+#ifdef CONFIG_MCA
13722+ /* Might actually be able to figure out what the guilty party
13723+ * is. */
13724+ if( MCA_bus ) {
13725+ mca_handle_nmi();
13726+ return;
13727+ }
13728+#endif
13729+ printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
13730+ reason, smp_processor_id());
13731+ printk("Dazed and confused, but trying to continue\n");
13732+ printk("Do you have a strange power saving mode enabled?\n");
13733+}
13734+
13735+static DEFINE_SPINLOCK(nmi_print_lock);
13736+
13737+void die_nmi (struct pt_regs *regs, const char *msg)
13738+{
13739+ if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
13740+ NOTIFY_STOP)
13741+ return;
13742+
13743+ spin_lock(&nmi_print_lock);
13744+ /*
13745+ * We are in trouble anyway, let's at least try
13746+ * to get a message out.
13747+ */
13748+ bust_spinlocks(1);
13749+ printk(KERN_EMERG "%s", msg);
13750+ printk(" on CPU%d, eip %08lx, registers:\n",
13751+ smp_processor_id(), regs->eip);
13752+ show_registers(regs);
13753+ printk(KERN_EMERG "console shuts up ...\n");
13754+ console_silent();
13755+ spin_unlock(&nmi_print_lock);
13756+ bust_spinlocks(0);
13757+
13758+ /* If we are in the kernel we are probably nested up pretty badly
13759+ * and might as well get out now while we still can.
13760+ */
13761+ if (!user_mode_vm(regs)) {
13762+ current->thread.trap_no = 2;
13763+ crash_kexec(regs);
13764+ }
13765+
13766+ do_exit(SIGSEGV);
13767+}
13768+
13769+static void default_do_nmi(struct pt_regs * regs)
13770+{
13771+ unsigned char reason = 0;
13772+
13773+ /* Only the BSP gets external NMIs from the system. */
13774+ if (!smp_processor_id())
13775+ reason = get_nmi_reason();
13776+
13777+ if (!(reason & 0xc0)) {
13778+ if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
13779+ == NOTIFY_STOP)
13780+ return;
13781+#ifdef CONFIG_X86_LOCAL_APIC
13782+ /*
13783+ * Ok, so this is none of the documented NMI sources,
13784+ * so it must be the NMI watchdog.
13785+ */
13786+ if (nmi_watchdog) {
13787+ nmi_watchdog_tick(regs);
13788+ return;
13789+ }
13790+#endif
13791+ unknown_nmi_error(reason, regs);
13792+ return;
13793+ }
13794+ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
13795+ return;
13796+ if (reason & 0x80)
13797+ mem_parity_error(reason, regs);
13798+ if (reason & 0x40)
13799+ io_check_error(reason, regs);
13800+ /*
13801+ * Reassert NMI in case it became active meanwhile
13802+ * as it's edge-triggered.
13803+ */
13804+ reassert_nmi();
13805+}
13806+
13807+static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
13808+{
13809+ return 0;
13810+}
13811+
13812+static nmi_callback_t nmi_callback = dummy_nmi_callback;
13813+
13814+fastcall void do_nmi(struct pt_regs * regs, long error_code)
13815+{
13816+ int cpu;
13817+
13818+ nmi_enter();
13819+
13820+ cpu = smp_processor_id();
13821+
13822+ ++nmi_count(cpu);
13823+
13824+ if (!rcu_dereference(nmi_callback)(regs, cpu))
13825+ default_do_nmi(regs);
13826+
13827+ nmi_exit();
13828+}
13829+
13830+void set_nmi_callback(nmi_callback_t callback)
13831+{
13832+ vmalloc_sync_all();
13833+ rcu_assign_pointer(nmi_callback, callback);
13834+}
13835+EXPORT_SYMBOL_GPL(set_nmi_callback);
13836+
13837+void unset_nmi_callback(void)
13838+{
13839+ nmi_callback = dummy_nmi_callback;
13840+}
13841+EXPORT_SYMBOL_GPL(unset_nmi_callback);
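The set_nmi_callback()/unset_nmi_callback() pair exported above installs a single hook that runs ahead of default_do_nmi(). A hypothetical out-of-tree module against this 2.6.18-era interface could look like the sketch below; the callback signature mirrors dummy_nmi_callback() and returning 0 lets the default handling proceed:

        /* Sketch of a module using the hook exported above; builds only against
         * a kernel of this vintage, where nmi_callback_t is declared in <asm/nmi.h>. */
        #include <linux/module.h>
        #include <linux/kernel.h>
        #include <linux/init.h>
        #include <asm/nmi.h>

        static int demo_nmi(struct pt_regs *regs, int cpu)
        {
                printk(KERN_INFO "NMI seen on CPU%d\n", cpu);
                return 0;       /* 0: let default_do_nmi() run as well */
        }

        static int __init nmi_demo_init(void)
        {
                set_nmi_callback(demo_nmi);
                return 0;
        }

        static void __exit nmi_demo_exit(void)
        {
                unset_nmi_callback();
        }

        module_init(nmi_demo_init);
        module_exit(nmi_demo_exit);
        MODULE_LICENSE("GPL");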
13842+
13843+#ifdef CONFIG_KPROBES
13844+fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
13845+{
13846+ if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
13847+ == NOTIFY_STOP)
13848+ return;
13849+ /* This is an interrupt gate, because kprobes wants interrupts
13850+ disabled. Normal trap handlers don't. */
13851+ restore_interrupts(regs);
13852+ do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
13853+}
13854+#endif
13855+
13856+/*
13857+ * Our handling of the processor debug registers is non-trivial.
13858+ * We do not clear them on entry and exit from the kernel. Therefore
13859+ * it is possible to get a watchpoint trap here from inside the kernel.
13860+ * However, the code in ./ptrace.c has ensured that the user can
13861+ * only set watchpoints on userspace addresses. Therefore the in-kernel
13862+ * watchpoint trap can only occur in code which is reading/writing
13863+ * from user space. Such code must not hold kernel locks (since it
13864+ * can equally take a page fault), therefore it is safe to call
13865+ * force_sig_info even though that claims and releases locks.
13866+ *
13867+ * Code in ./signal.c ensures that the debug control register
13868+ * is restored before we deliver any signal, and therefore that
13869+ * user code runs with the correct debug control register even though
13870+ * we clear it here.
13871+ *
13872+ * Being careful here means that we don't have to be as careful in a
13873+ * lot of more complicated places (task switching can be a bit lazy
13874+ * about restoring all the debug state, and ptrace doesn't have to
13875+ * find every occurrence of the TF bit that could be saved away even
13876+ * by user code)
13877+ */
13878+fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
13879+{
13880+ unsigned int condition;
13881+ struct task_struct *tsk = current;
13882+
13883+ get_debugreg(condition, 6);
13884+
13885+ if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
13886+ SIGTRAP) == NOTIFY_STOP)
13887+ return;
13888+ /* It's safe to allow irq's after DR6 has been saved */
13889+ if (regs->eflags & X86_EFLAGS_IF)
13890+ local_irq_enable();
13891+
13892+ /* Mask out spurious debug traps due to lazy DR7 setting */
13893+ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
13894+ if (!tsk->thread.debugreg[7])
13895+ goto clear_dr7;
13896+ }
13897+
13898+ if (regs->eflags & VM_MASK)
13899+ goto debug_vm86;
13900+
13901+ /* Save debug status register where ptrace can see it */
13902+ tsk->thread.debugreg[6] = condition;
13903+
13904+ /*
13905+ * Single-stepping through TF: make sure we ignore any events in
13906+ * kernel space (but re-enable TF when returning to user mode).
13907+ */
13908+ if (condition & DR_STEP) {
13909+ /*
13910+ * We already checked v86 mode above, so we can
13911+ * check for kernel mode by just checking the CPL
13912+ * of CS.
13913+ */
13914+ if (!user_mode(regs))
13915+ goto clear_TF_reenable;
13916+ }
13917+
13918+ /* Ok, finally something we can handle */
13919+ send_sigtrap(tsk, regs, error_code);
13920+
13921+ /* Disable additional traps. They'll be re-enabled when
13922+ * the signal is delivered.
13923+ */
13924+clear_dr7:
13925+ set_debugreg(0, 7);
13926+ return;
13927+
13928+debug_vm86:
13929+ handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
13930+ return;
13931+
13932+clear_TF_reenable:
13933+ set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
13934+ regs->eflags &= ~TF_MASK;
13935+ return;
13936+}
13937+
13938+/*
13939+ * Note that we play around with the 'TS' bit in an attempt to get
13940+ * the correct behaviour even in the presence of the asynchronous
13941+ * IRQ13 behaviour
13942+ */
13943+void math_error(void __user *eip)
13944+{
13945+ struct task_struct * task;
13946+ siginfo_t info;
13947+ unsigned short cwd, swd;
13948+
13949+ /*
13950+ * Save the info for the exception handler and clear the error.
13951+ */
13952+ task = current;
13953+ save_init_fpu(task);
13954+ task->thread.trap_no = 16;
13955+ task->thread.error_code = 0;
13956+ info.si_signo = SIGFPE;
13957+ info.si_errno = 0;
13958+ info.si_code = __SI_FAULT;
13959+ info.si_addr = eip;
13960+ /*
13961+ * (~cwd & swd) will mask out exceptions that are not set to unmasked
13962+ * status. 0x3f is the exception bits in these regs, 0x200 is the
13963+ * C1 reg you need in case of a stack fault, 0x040 is the stack
13964+ * fault bit. We should only be taking one exception at a time,
13965+ * so if this combination doesn't produce any single exception,
13966+ * then we have a bad program that isn't synchronizing its FPU usage
13967+ * and it will suffer the consequences since we won't be able to
13968+ * fully reproduce the context of the exception
13969+ */
13970+ cwd = get_fpu_cwd(task);
13971+ swd = get_fpu_swd(task);
13972+ switch (swd & ~cwd & 0x3f) {
13973+ case 0x000: /* No unmasked exception */
13974+ return;
13975+ default: /* Multiple exceptions */
13976+ break;
13977+ case 0x001: /* Invalid Op */
13978+ /*
13979+ * swd & 0x240 == 0x040: Stack Underflow
13980+ * swd & 0x240 == 0x240: Stack Overflow
13981+ * User must clear the SF bit (0x40) if set
13982+ */
13983+ info.si_code = FPE_FLTINV;
13984+ break;
13985+ case 0x002: /* Denormalize */
13986+ case 0x010: /* Underflow */
13987+ info.si_code = FPE_FLTUND;
13988+ break;
13989+ case 0x004: /* Zero Divide */
13990+ info.si_code = FPE_FLTDIV;
13991+ break;
13992+ case 0x008: /* Overflow */
13993+ info.si_code = FPE_FLTOVF;
13994+ break;
13995+ case 0x020: /* Precision */
13996+ info.si_code = FPE_FLTRES;
13997+ break;
13998+ }
13999+ force_sig_info(SIGFPE, &info, task);
14000+}
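The comment in math_error() explains the masking: a status-word exception bit only counts if the matching control-word mask bit is clear, hence (swd & ~cwd & 0x3f). A standalone worked example with invented register values:

        #include <stdio.h>

        /* Worked example of the masking in math_error() above; the register
         * values are invented: cwd 0x037b is the x87 default 0x037f with the
         * zero-divide mask bit (bit 2) cleared, swd 0x0004 has the zero-divide
         * flag raised. */
        int main(void)
        {
                unsigned short cwd = 0x037b;
                unsigned short swd = 0x0004;

                switch (swd & ~cwd & 0x3f) {
                case 0x004:
                        printf("unmasked zero divide -> FPE_FLTDIV\n");
                        break;
                case 0x000:
                        printf("no unmasked exception\n");
                        break;
                default:
                        printf("other/multiple: 0x%02x\n", swd & ~cwd & 0x3f);
                        break;
                }
                return 0;
        }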
14001+
14002+fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
14003+{
14004+ ignore_fpu_irq = 1;
14005+ math_error((void __user *)regs->eip);
14006+}
14007+
14008+static void simd_math_error(void __user *eip)
14009+{
14010+ struct task_struct * task;
14011+ siginfo_t info;
14012+ unsigned short mxcsr;
14013+
14014+ /*
14015+ * Save the info for the exception handler and clear the error.
14016+ */
14017+ task = current;
14018+ save_init_fpu(task);
14019+ task->thread.trap_no = 19;
14020+ task->thread.error_code = 0;
14021+ info.si_signo = SIGFPE;
14022+ info.si_errno = 0;
14023+ info.si_code = __SI_FAULT;
14024+ info.si_addr = eip;
14025+ /*
14026+ * The SIMD FPU exceptions are handled a little differently, as there
14027+ * is only a single status/control register. Thus, to determine which
14028+ * unmasked exception was caught we must mask the exception mask bits
14029+ * at 0x1f80, and then use these to mask the exception bits at 0x3f.
14030+ */
14031+ mxcsr = get_fpu_mxcsr(task);
14032+ switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
14033+ case 0x000:
14034+ default:
14035+ break;
14036+ case 0x001: /* Invalid Op */
14037+ info.si_code = FPE_FLTINV;
14038+ break;
14039+ case 0x002: /* Denormalize */
14040+ case 0x010: /* Underflow */
14041+ info.si_code = FPE_FLTUND;
14042+ break;
14043+ case 0x004: /* Zero Divide */
14044+ info.si_code = FPE_FLTDIV;
14045+ break;
14046+ case 0x008: /* Overflow */
14047+ info.si_code = FPE_FLTOVF;
14048+ break;
14049+ case 0x020: /* Precision */
14050+ info.si_code = FPE_FLTRES;
14051+ break;
14052+ }
14053+ force_sig_info(SIGFPE, &info, task);
14054+}
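simd_math_error() does the analogous computation on MXCSR, where the mask bits sit at bits 7-12 and the flag bits at 0-5, so shifting the mask field down by 7 lines the two up. A worked example with an invented MXCSR value:

        #include <stdio.h>

        /* Worked example of the MXCSR masking in simd_math_error() above;
         * 0x1b88 is invented: the default 0x1f80 with the overflow mask bit
         * (bit 10) cleared and the overflow flag (bit 3) set. */
        int main(void)
        {
                unsigned int mxcsr = 0x1b88;
                unsigned int pending = ~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f);

                printf("unmasked SIMD exceptions: 0x%02x\n", pending); /* 0x08 -> FPE_FLTOVF */
                return 0;
        }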
14055+
14056+fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
14057+ long error_code)
14058+{
14059+ if (cpu_has_xmm) {
14060+ /* Handle SIMD FPU exceptions on PIII+ processors. */
14061+ ignore_fpu_irq = 1;
14062+ simd_math_error((void __user *)regs->eip);
14063+ } else {
14064+ /*
14065+ * Handle strange cache flush from user space exception
14066+ * in all other cases. This is undocumented behaviour.
14067+ */
14068+ if (regs->eflags & VM_MASK) {
14069+ handle_vm86_fault((struct kernel_vm86_regs *)regs,
14070+ error_code);
14071+ return;
14072+ }
14073+ current->thread.trap_no = 19;
14074+ current->thread.error_code = error_code;
14075+ die_if_kernel("cache flush denied", regs, error_code);
14076+ force_sig(SIGSEGV, current);
14077+ }
14078+}
14079+
14080+#ifndef CONFIG_XEN
14081+fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
14082+ long error_code)
14083+{
14084+#if 0
14085+ /* No need to warn about this any longer. */
14086+ printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
14087+#endif
14088+}
14089+
14090+fastcall void setup_x86_bogus_stack(unsigned char * stk)
14091+{
14092+ unsigned long *switch16_ptr, *switch32_ptr;
14093+ struct pt_regs *regs;
14094+ unsigned long stack_top, stack_bot;
14095+ unsigned short iret_frame16_off;
14096+ int cpu = smp_processor_id();
14097+ /* reserve the space on 32bit stack for the magic switch16 pointer */
14098+ memmove(stk, stk + 8, sizeof(struct pt_regs));
14099+ switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
14100+ regs = (struct pt_regs *)stk;
14101+ /* now the switch32 on 16bit stack */
14102+ stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
14103+ stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
14104+ switch32_ptr = (unsigned long *)(stack_top - 8);
14105+ iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
14106+ /* copy iret frame on 16bit stack */
14107+ memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
14108+ /* fill in the switch pointers */
14109+ switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
14110+ switch16_ptr[1] = __ESPFIX_SS;
14111+ switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
14112+ 8 - CPU_16BIT_STACK_SIZE;
14113+ switch32_ptr[1] = __KERNEL_DS;
14114+}
14115+
14116+fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
14117+{
14118+ unsigned long *switch32_ptr;
14119+ unsigned char *stack16, *stack32;
14120+ unsigned long stack_top, stack_bot;
14121+ int len;
14122+ int cpu = smp_processor_id();
14123+ stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
14124+ stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
14125+ switch32_ptr = (unsigned long *)(stack_top - 8);
14126+ /* copy the data from 16bit stack to 32bit stack */
14127+ len = CPU_16BIT_STACK_SIZE - 8 - sp;
14128+ stack16 = (unsigned char *)(stack_bot + sp);
14129+ stack32 = (unsigned char *)
14130+ (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
14131+ memcpy(stack32, stack16, len);
14132+ return stack32;
14133+}
14134+#endif
14135+
14136+/*
14137+ * 'math_state_restore()' saves the current math information in the
14138+ * old math state array, and gets the new ones from the current task
14139+ *
14140+ * Careful.. There are problems with IBM-designed IRQ13 behaviour.
14141+ * Don't touch unless you *really* know how it works.
14142+ *
14143+ * Must be called with kernel preemption disabled (in this case,
14144+ * local interrupts are disabled at the call-site in entry.S).
14145+ */
14146+asmlinkage void math_state_restore(struct pt_regs regs)
14147+{
14148+ struct thread_info *thread = current_thread_info();
14149+ struct task_struct *tsk = thread->task;
14150+
14151+ /* NB. 'clts' is done for us by Xen during virtual trap. */
14152+ if (!tsk_used_math(tsk))
14153+ init_fpu(tsk);
14154+ restore_fpu(tsk);
14155+ thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
14156+}
14157+
14158+#ifndef CONFIG_MATH_EMULATION
14159+
14160+asmlinkage void math_emulate(long arg)
14161+{
14162+ printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
14163+ printk(KERN_EMERG "killing %s.\n",current->comm);
14164+ force_sig(SIGFPE,current);
14165+ schedule();
14166+}
14167+
14168+#endif /* CONFIG_MATH_EMULATION */
14169+
14170+#ifdef CONFIG_X86_F00F_BUG
14171+void __init trap_init_f00f_bug(void)
14172+{
14173+ __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
14174+
14175+ /*
14176+ * Update the IDT descriptor and reload the IDT so that
14177+ * it uses the read-only mapped virtual address.
14178+ */
14179+ idt_descr.address = fix_to_virt(FIX_F00F_IDT);
14180+ load_idt(&idt_descr);
14181+}
14182+#endif
14183+
14184+
14185+/*
14186+ * NB. All these are "trap gates" (i.e. events_mask isn't set) except
14187+ * for those that specify <dpl>|4 in the second field.
14188+ */
14189+static trap_info_t __cpuinitdata trap_table[] = {
14190+ { 0, 0, __KERNEL_CS, (unsigned long)divide_error },
14191+ { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
14192+ { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
14193+ { 4, 3, __KERNEL_CS, (unsigned long)overflow },
14194+ { 5, 0, __KERNEL_CS, (unsigned long)bounds },
14195+ { 6, 0, __KERNEL_CS, (unsigned long)invalid_op },
14196+ { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available },
14197+ { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
14198+ { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS },
14199+ { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present },
14200+ { 12, 0, __KERNEL_CS, (unsigned long)stack_segment },
14201+ { 13, 0, __KERNEL_CS, (unsigned long)general_protection },
14202+ { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault },
14203+ { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment },
14204+ { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error },
14205+ { 17, 0, __KERNEL_CS, (unsigned long)alignment_check },
14206+#ifdef CONFIG_X86_MCE
14207+ { 18, 0, __KERNEL_CS, (unsigned long)machine_check },
14208+#endif
14209+ { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
14210+ { SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)system_call },
14211+ { 0, 0, 0, 0 }
14212+};
14213+
14214+void __init trap_init(void)
14215+{
14216+ int ret;
14217+
14218+ ret = HYPERVISOR_set_trap_table(trap_table);
14219+ if (ret)
14220+ printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
14221+
14222+ if (cpu_has_fxsr) {
14223+ /*
14224+ * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
14225+ * Generates a compile-time "error: zero width for bit-field" if
14226+ * the alignment is wrong.
14227+ */
14228+ struct fxsrAlignAssert {
14229+ int _:!(offsetof(struct task_struct,
14230+ thread.i387.fxsave) & 15);
14231+ };
14232+
14233+ printk(KERN_INFO "Enabling fast FPU save and restore... ");
14234+ set_in_cr4(X86_CR4_OSFXSR);
14235+ printk("done.\n");
14236+ }
14237+ if (cpu_has_xmm) {
14238+ printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
14239+ "support... ");
14240+ set_in_cr4(X86_CR4_OSXMMEXCPT);
14241+ printk("done.\n");
14242+ }
14243+
14244+ /*
14245+ * Should be a barrier for any external CPU state.
14246+ */
14247+ cpu_init();
14248+}
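The fxsrAlignAssert struct inside trap_init() is a compile-time assertion: when the tested expression is false, the named bit-field gets width 0 and the build aborts with "zero width for bit-field". The same trick in standalone form (struct demo and the 8-byte check are invented for illustration):

        #include <stddef.h>
        #include <stdio.h>

        /* Standalone illustration of the zero-width bit-field assertion used in
         * trap_init() above; struct demo is made up, and the check passes
         * because member b sits at offset 8. Make the condition false and
         * compilation stops with "zero width for bit-field". */
        struct demo {
                double a;
                double b;
        };

        struct demo_align_assert {
                int _:!(offsetof(struct demo, b) & 7);
        };

        int main(void)
        {
                printf("offsetof(struct demo, b) = %zu\n", offsetof(struct demo, b));
                return 0;
        }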
14249+
14250+void __cpuinit smp_trap_init(trap_info_t *trap_ctxt)
14251+{
14252+ const trap_info_t *t = trap_table;
14253+
14254+ for (t = trap_table; t->address; t++) {
14255+ trap_ctxt[t->vector].flags = t->flags;
14256+ trap_ctxt[t->vector].cs = t->cs;
14257+ trap_ctxt[t->vector].address = t->address;
14258+ }
14259+}
14260+
14261+static int __init kstack_setup(char *s)
14262+{
14263+ kstack_depth_to_print = simple_strtoul(s, NULL, 0);
14264+ return 1;
14265+}
14266+__setup("kstack=", kstack_setup);
14267+
14268+#ifdef CONFIG_STACK_UNWIND
14269+static int __init call_trace_setup(char *s)
14270+{
14271+ if (strcmp(s, "old") == 0)
14272+ call_trace = -1;
14273+ else if (strcmp(s, "both") == 0)
14274+ call_trace = 0;
14275+ else if (strcmp(s, "newfallback") == 0)
14276+ call_trace = 1;
14277+ else if (strcmp(s, "new") == 2)
14278+ call_trace = 2;
14279+ return 1;
14280+}
14281+__setup("call_trace=", call_trace_setup);
14282+#endif
14283Index: head-2008-11-25/arch/x86/mach-xen/Makefile
14284===================================================================
14285--- /dev/null 1970-01-01 00:00:00.000000000 +0000
14286+++ head-2008-11-25/arch/x86/mach-xen/Makefile 2007-06-12 13:12:48.000000000 +0200
14287@@ -0,0 +1,5 @@
14288+#
14289+# Makefile for the linux kernel.
14290+#
14291+
14292+obj-y := setup.o
14293Index: head-2008-11-25/arch/x86/mach-xen/setup.c
14294===================================================================
14295--- /dev/null 1970-01-01 00:00:00.000000000 +0000
14296+++ head-2008-11-25/arch/x86/mach-xen/setup.c 2008-04-02 12:34:02.000000000 +0200
14297@@ -0,0 +1,158 @@
14298+/*
14299+ * Machine specific setup for generic
14300+ */
14301+
14302+#include <linux/mm.h>
14303+#include <linux/smp.h>
14304+#include <linux/init.h>
14305+#include <linux/interrupt.h>
14306+#include <linux/module.h>
14307+#include <asm/acpi.h>
14308+#include <asm/arch_hooks.h>
14309+#include <asm/e820.h>
14310+#include <asm/setup.h>
14311+#include <asm/fixmap.h>
14312+
14313+#include <xen/interface/callback.h>
14314+#include <xen/interface/memory.h>
14315+
14316+#ifdef CONFIG_HOTPLUG_CPU
14317+#define DEFAULT_SEND_IPI (1)
14318+#else
14319+#define DEFAULT_SEND_IPI (0)
14320+#endif
14321+
14322+int no_broadcast=DEFAULT_SEND_IPI;
14323+
14324+static __init int no_ipi_broadcast(char *str)
14325+{
14326+ get_option(&str, &no_broadcast);
14327+ printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
14328+ "IPI Broadcast");
14329+ return 1;
14330+}
14331+
14332+__setup("no_ipi_broadcast", no_ipi_broadcast);
14333+
14334+static int __init print_ipi_mode(void)
14335+{
14336+ printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
14337+ "Shortcut");
14338+ return 0;
14339+}
14340+
14341+late_initcall(print_ipi_mode);
14342+
14343+/**
14344+ * machine_specific_memory_setup - Hook for machine specific memory setup.
14345+ *
14346+ * Description:
14347+ * This is included late in kernel/setup.c so that it can make
14348+ * use of all of the static functions.
14349+ **/
14350+
14351+char * __init machine_specific_memory_setup(void)
14352+{
14353+ int rc;
14354+ struct xen_memory_map memmap;
14355+ /*
14356+ * This is rather large for a stack variable but this early in
14357+ * the boot process we know we have plenty of slack space.
14358+ */
14359+ struct e820entry map[E820MAX];
14360+
14361+ memmap.nr_entries = E820MAX;
14362+ set_xen_guest_handle(memmap.buffer, map);
14363+
14364+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
14365+ if ( rc == -ENOSYS ) {
14366+ memmap.nr_entries = 1;
14367+ map[0].addr = 0ULL;
14368+ map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
14369+ /* 8MB slack (to balance backend allocations). */
14370+ map[0].size += 8ULL << 20;
14371+ map[0].type = E820_RAM;
14372+ rc = 0;
14373+ }
14374+ BUG_ON(rc);
14375+
14376+ sanitize_e820_map(map, (char *)&memmap.nr_entries);
14377+
14378+ BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
14379+
14380+ return "Xen";
14381+}
14382+
14383+
14384+extern void hypervisor_callback(void);
14385+extern void failsafe_callback(void);
14386+extern void nmi(void);
14387+
14388+unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
14389+EXPORT_SYMBOL(machine_to_phys_mapping);
14390+unsigned int machine_to_phys_order;
14391+EXPORT_SYMBOL(machine_to_phys_order);
14392+
14393+void __init pre_setup_arch_hook(void)
14394+{
14395+ struct xen_machphys_mapping mapping;
14396+ unsigned long machine_to_phys_nr_ents;
14397+ struct xen_platform_parameters pp;
14398+
14399+ init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base;
14400+
14401+ setup_xen_features();
14402+
14403+ if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
14404+ set_fixaddr_top(pp.virt_start);
14405+
14406+ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
14407+ machine_to_phys_mapping = (unsigned long *)mapping.v_start;
14408+ machine_to_phys_nr_ents = mapping.max_mfn + 1;
14409+ } else
14410+ machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
14411+ machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
14412+
14413+ if (!xen_feature(XENFEAT_auto_translated_physmap))
14414+ phys_to_machine_mapping =
14415+ (unsigned long *)xen_start_info->mfn_list;
14416+}
14417+
14418+void __init machine_specific_arch_setup(void)
14419+{
14420+ int ret;
14421+ static struct callback_register __initdata event = {
14422+ .type = CALLBACKTYPE_event,
14423+ .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
14424+ };
14425+ static struct callback_register __initdata failsafe = {
14426+ .type = CALLBACKTYPE_failsafe,
14427+ .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
14428+ };
14429+ static struct callback_register __initdata nmi_cb = {
14430+ .type = CALLBACKTYPE_nmi,
14431+ .address = { __KERNEL_CS, (unsigned long)nmi },
14432+ };
14433+
14434+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
14435+ if (ret == 0)
14436+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
14437+#if CONFIG_XEN_COMPAT <= 0x030002
14438+ if (ret == -ENOSYS)
14439+ ret = HYPERVISOR_set_callbacks(
14440+ event.address.cs, event.address.eip,
14441+ failsafe.address.cs, failsafe.address.eip);
14442+#endif
14443+ BUG_ON(ret);
14444+
14445+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
14446+#if CONFIG_XEN_COMPAT <= 0x030002
14447+ if (ret == -ENOSYS) {
14448+ static struct xennmi_callback __initdata cb = {
14449+ .handler_address = (unsigned long)nmi
14450+ };
14451+
14452+ HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
14453+ }
14454+#endif
14455+}
14456Index: head-2008-11-25/arch/x86/lib/scrub.c
14457===================================================================
14458--- /dev/null 1970-01-01 00:00:00.000000000 +0000
14459+++ head-2008-11-25/arch/x86/lib/scrub.c 2008-02-08 12:30:51.000000000 +0100
14460@@ -0,0 +1,21 @@
14461+#include <asm/cpufeature.h>
14462+#include <asm/page.h>
14463+#include <asm/processor.h>
14464+
14465+void scrub_pages(void *v, unsigned int count)
14466+{
14467+ if (likely(cpu_has_xmm2)) {
14468+ unsigned long n = count * (PAGE_SIZE / sizeof(long) / 4);
14469+
14470+ for (; n--; v += sizeof(long) * 4)
14471+ asm("movnti %1,(%0)\n\t"
14472+ "movnti %1,%c2(%0)\n\t"
14473+ "movnti %1,2*%c2(%0)\n\t"
14474+ "movnti %1,3*%c2(%0)\n\t"
14475+ : : "r" (v), "r" (0L), "i" (sizeof(long))
14476+ : "memory");
14477+ asm volatile("sfence" : : : "memory");
14478+ } else
14479+ for (; count--; v += PAGE_SIZE)
14480+ clear_page(v);
14481+}
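scrub_pages() zeroes freed pages with non-temporal movnti stores so the scrubbing does not displace useful cache lines, then issues sfence. A hypothetical userspace analogue using the SSE2 intrinsics (buffer size is invented; build with SSE2 enabled):

        #include <emmintrin.h>
        #include <stddef.h>
        #include <stdio.h>

        /* Userspace sketch of the non-temporal zeroing idea in scrub_pages()
         * above: stream zeros past the cache, then sfence to order the stores.
         * The 4096-byte buffer stands in for one page. */
        #define SCRUB_BYTES 4096

        static __m128i buf[SCRUB_BYTES / sizeof(__m128i)] __attribute__((aligned(16)));

        int main(void)
        {
                __m128i zero = _mm_setzero_si128();
                size_t i;

                for (i = 0; i < SCRUB_BYTES / sizeof(__m128i); i++)
                        _mm_stream_si128(&buf[i], zero);        /* non-temporal store */
                _mm_sfence();                                   /* order the NT stores */

                printf("scrubbed %u bytes\n", (unsigned)SCRUB_BYTES);
                return 0;
        }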
14482Index: head-2008-11-25/arch/x86/mm/fault_32-xen.c
14483===================================================================
14484--- /dev/null 1970-01-01 00:00:00.000000000 +0000
14485+++ head-2008-11-25/arch/x86/mm/fault_32-xen.c 2007-12-10 08:47:31.000000000 +0100
14486@@ -0,0 +1,779 @@
14487+/*
14488+ * linux/arch/i386/mm/fault.c
14489+ *
14490+ * Copyright (C) 1995 Linus Torvalds
14491+ */
14492+
14493+#include <linux/signal.h>
14494+#include <linux/sched.h>
14495+#include <linux/kernel.h>
14496+#include <linux/errno.h>
14497+#include <linux/string.h>
14498+#include <linux/types.h>
14499+#include <linux/ptrace.h>
14500+#include <linux/mman.h>
14501+#include <linux/mm.h>
14502+#include <linux/smp.h>
14503+#include <linux/smp_lock.h>
14504+#include <linux/interrupt.h>
14505+#include <linux/init.h>
14506+#include <linux/tty.h>
14507+#include <linux/vt_kern.h> /* For unblank_screen() */
14508+#include <linux/highmem.h>
14509+#include <linux/module.h>
14510+#include <linux/kprobes.h>
14511+
14512+#include <asm/system.h>
14513+#include <asm/uaccess.h>
14514+#include <asm/desc.h>
14515+#include <asm/kdebug.h>
14516+
14517+extern void die(const char *,struct pt_regs *,long);
14518+
14519+#ifdef CONFIG_KPROBES
14520+ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
14521+int register_page_fault_notifier(struct notifier_block *nb)
14522+{
14523+ vmalloc_sync_all();
14524+ return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
14525+}
14526+
14527+int unregister_page_fault_notifier(struct notifier_block *nb)
14528+{
14529+ return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
14530+}
14531+
14532+static inline int notify_page_fault(enum die_val val, const char *str,
14533+ struct pt_regs *regs, long err, int trap, int sig)
14534+{
14535+ struct die_args args = {
14536+ .regs = regs,
14537+ .str = str,
14538+ .err = err,
14539+ .trapnr = trap,
14540+ .signr = sig
14541+ };
14542+ return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
14543+}
14544+#else
14545+static inline int notify_page_fault(enum die_val val, const char *str,
14546+ struct pt_regs *regs, long err, int trap, int sig)
14547+{
14548+ return NOTIFY_DONE;
14549+}
14550+#endif
14551+
14552+
14553+/*
14554+ * Unlock any spinlocks which will prevent us from getting the
14555+ * message out
14556+ */
14557+void bust_spinlocks(int yes)
14558+{
14559+ int loglevel_save = console_loglevel;
14560+
14561+ if (yes) {
14562+ oops_in_progress = 1;
14563+ return;
14564+ }
14565+#ifdef CONFIG_VT
14566+ unblank_screen();
14567+#endif
14568+ oops_in_progress = 0;
14569+ /*
14570+ * OK, the message is on the console. Now we call printk()
14571+ * without oops_in_progress set so that printk will give klogd
14572+ * a poke. Hold onto your hats...
14573+ */
14574+ console_loglevel = 15; /* NMI oopser may have shut the console up */
14575+ printk(" ");
14576+ console_loglevel = loglevel_save;
14577+}
14578+
14579+/*
14580+ * Return EIP plus the CS segment base. The segment limit is also
14581+ * adjusted, clamped to the kernel/user address space (whichever is
14582+ * appropriate), and returned in *eip_limit.
14583+ *
14584+ * The segment is checked, because it might have been changed by another
14585+ * task between the original faulting instruction and here.
14586+ *
14587+ * If CS is no longer a valid code segment, or if EIP is beyond the
14588+ * limit, or if it is a kernel address when CS is not a kernel segment,
14589+ * then the returned value will be greater than *eip_limit.
14590+ *
14591+ * This is slow, but is very rarely executed.
14592+ */
14593+static inline unsigned long get_segment_eip(struct pt_regs *regs,
14594+ unsigned long *eip_limit)
14595+{
14596+ unsigned long eip = regs->eip;
14597+ unsigned seg = regs->xcs & 0xffff;
14598+ u32 seg_ar, seg_limit, base, *desc;
14599+
14600+ /* Unlikely, but must come before segment checks. */
14601+ if (unlikely(regs->eflags & VM_MASK)) {
14602+ base = seg << 4;
14603+ *eip_limit = base + 0xffff;
14604+ return base + (eip & 0xffff);
14605+ }
14606+
14607+ /* The standard kernel/user address space limit. */
14608+ *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
14609+
14610+ /* By far the most common cases. */
14611+ if (likely(seg == __USER_CS || seg == GET_KERNEL_CS()))
14612+ return eip;
14613+
14614+ /* Check the segment exists, is within the current LDT/GDT size,
14615+ that kernel/user (ring 0..3) has the appropriate privilege,
14616+ that it's a code segment, and get the limit. */
14617+ __asm__ ("larl %3,%0; lsll %3,%1"
14618+ : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
14619+ if ((~seg_ar & 0x9800) || eip > seg_limit) {
14620+ *eip_limit = 0;
14621+ return 1; /* So that returned eip > *eip_limit. */
14622+ }
14623+
14624+ /* Get the GDT/LDT descriptor base.
14625+ When you look for races in this code remember that
14626+ LDT and other horrors are only used in user space. */
14627+ if (seg & (1<<2)) {
14628+ /* Must lock the LDT while reading it. */
14629+ down(&current->mm->context.sem);
14630+ desc = current->mm->context.ldt;
14631+ desc = (void *)desc + (seg & ~7);
14632+ } else {
14633+ /* Must disable preemption while reading the GDT. */
14634+ desc = (u32 *)get_cpu_gdt_table(get_cpu());
14635+ desc = (void *)desc + (seg & ~7);
14636+ }
14637+
14638+ /* Decode the code segment base from the descriptor */
14639+ base = get_desc_base((unsigned long *)desc);
14640+
14641+ if (seg & (1<<2)) {
14642+ up(&current->mm->context.sem);
14643+ } else
14644+ put_cpu();
14645+
14646+ /* Adjust EIP and segment limit, and clamp at the kernel limit.
14647+ It's legitimate for segments to wrap at 0xffffffff. */
14648+ seg_limit += base;
14649+ if (seg_limit < *eip_limit && seg_limit >= base)
14650+ *eip_limit = seg_limit;
14651+ return eip + base;
14652+}
14653+
14654+/*
14655+ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
14656+ * Check that here and ignore it.
14657+ */
14658+static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
14659+{
14660+ unsigned long limit;
14661+ unsigned long instr = get_segment_eip (regs, &limit);
14662+ int scan_more = 1;
14663+ int prefetch = 0;
14664+ int i;
14665+
14666+ for (i = 0; scan_more && i < 15; i++) {
14667+ unsigned char opcode;
14668+ unsigned char instr_hi;
14669+ unsigned char instr_lo;
14670+
14671+ if (instr > limit)
14672+ break;
14673+ if (__get_user(opcode, (unsigned char __user *) instr))
14674+ break;
14675+
14676+ instr_hi = opcode & 0xf0;
14677+ instr_lo = opcode & 0x0f;
14678+ instr++;
14679+
14680+ switch (instr_hi) {
14681+ case 0x20:
14682+ case 0x30:
14683+ /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
14684+ scan_more = ((instr_lo & 7) == 0x6);
14685+ break;
14686+
14687+ case 0x60:
14688+ /* 0x64 thru 0x67 are valid prefixes in all modes. */
14689+ scan_more = (instr_lo & 0xC) == 0x4;
14690+ break;
14691+ case 0xF0:
14692+ /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
14693+ scan_more = !instr_lo || (instr_lo>>1) == 1;
14694+ break;
14695+ case 0x00:
14696+ /* Prefetch instruction is 0x0F0D or 0x0F18 */
14697+ scan_more = 0;
14698+ if (instr > limit)
14699+ break;
14700+ if (__get_user(opcode, (unsigned char __user *) instr))
14701+ break;
14702+ prefetch = (instr_lo == 0xF) &&
14703+ (opcode == 0x0D || opcode == 0x18);
14704+ break;
14705+ default:
14706+ scan_more = 0;
14707+ break;
14708+ }
14709+ }
14710+ return prefetch;
14711+}
14712+
14713+static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
14714+ unsigned long error_code)
14715+{
14716+ if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
14717+ boot_cpu_data.x86 >= 6)) {
14718+ /* Catch an obscure case of prefetch inside an NX page. */
14719+ if (nx_enabled && (error_code & 16))
14720+ return 0;
14721+ return __is_prefetch(regs, addr);
14722+ }
14723+ return 0;
14724+}
14725+
14726+static noinline void force_sig_info_fault(int si_signo, int si_code,
14727+ unsigned long address, struct task_struct *tsk)
14728+{
14729+ siginfo_t info;
14730+
14731+ info.si_signo = si_signo;
14732+ info.si_errno = 0;
14733+ info.si_code = si_code;
14734+ info.si_addr = (void __user *)address;
14735+ force_sig_info(si_signo, &info, tsk);
14736+}
14737+
14738+fastcall void do_invalid_op(struct pt_regs *, unsigned long);
14739+
14740+#ifdef CONFIG_X86_PAE
14741+static void dump_fault_path(unsigned long address)
14742+{
14743+ unsigned long *p, page;
14744+ unsigned long mfn;
14745+
14746+ page = read_cr3();
14747+ p = (unsigned long *)__va(page);
14748+ p += (address >> 30) * 2;
14749+ printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
14750+ if (p[0] & _PAGE_PRESENT) {
14751+ mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
14752+ page = mfn_to_pfn(mfn) << PAGE_SHIFT;
14753+ p = (unsigned long *)__va(page);
14754+ address &= 0x3fffffff;
14755+ p += (address >> 21) * 2;
14756+ printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n",
14757+ page, p[1], p[0]);
14758+ mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
14759+#ifdef CONFIG_HIGHPTE
14760+ if (mfn_to_pfn(mfn) >= highstart_pfn)
14761+ return;
14762+#endif
14763+ if (p[0] & _PAGE_PRESENT) {
14764+ page = mfn_to_pfn(mfn) << PAGE_SHIFT;
14765+ p = (unsigned long *) __va(page);
14766+ address &= 0x001fffff;
14767+ p += (address >> 12) * 2;
14768+ printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
14769+ page, p[1], p[0]);
14770+ }
14771+ }
14772+}
14773+#else
14774+static void dump_fault_path(unsigned long address)
14775+{
14776+ unsigned long page;
14777+
14778+ page = read_cr3();
14779+ page = ((unsigned long *) __va(page))[address >> 22];
14780+ if (oops_may_print())
14781+ printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
14782+ machine_to_phys(page));
14783+ /*
14784+ * We must not directly access the pte in the highpte
14785+ * case if the page table is located in highmem.
14786+ * And let's rather not kmap-atomic the pte, just in case
14787+ * it's allocated already.
14788+ */
14789+#ifdef CONFIG_HIGHPTE
14790+ if ((page >> PAGE_SHIFT) >= highstart_pfn)
14791+ return;
14792+#endif
14793+ if ((page & 1) && oops_may_print()) {
14794+ page &= PAGE_MASK;
14795+ address &= 0x003ff000;
14796+ page = machine_to_phys(page);
14797+ page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
14798+ printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
14799+ machine_to_phys(page));
14800+ }
14801+}
14802+#endif
14803+
14804+static int spurious_fault(struct pt_regs *regs,
14805+ unsigned long address,
14806+ unsigned long error_code)
14807+{
14808+ pgd_t *pgd;
14809+ pud_t *pud;
14810+ pmd_t *pmd;
14811+ pte_t *pte;
14812+
14813+ /* Reserved-bit violation or user access to kernel space? */
14814+ if (error_code & 0x0c)
14815+ return 0;
14816+
14817+ pgd = init_mm.pgd + pgd_index(address);
14818+ if (!pgd_present(*pgd))
14819+ return 0;
14820+
14821+ pud = pud_offset(pgd, address);
14822+ if (!pud_present(*pud))
14823+ return 0;
14824+
14825+ pmd = pmd_offset(pud, address);
14826+ if (!pmd_present(*pmd))
14827+ return 0;
14828+
14829+ pte = pte_offset_kernel(pmd, address);
14830+ if (!pte_present(*pte))
14831+ return 0;
14832+ if ((error_code & 0x02) && !pte_write(*pte))
14833+ return 0;
14834+#ifdef CONFIG_X86_PAE
14835+ if ((error_code & 0x10) && (__pte_val(*pte) & _PAGE_NX))
14836+ return 0;
14837+#endif
14838+
14839+ return 1;
14840+}
14841+
14842+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
14843+{
14844+ unsigned index = pgd_index(address);
14845+ pgd_t *pgd_k;
14846+ pud_t *pud, *pud_k;
14847+ pmd_t *pmd, *pmd_k;
14848+
14849+ pgd += index;
14850+ pgd_k = init_mm.pgd + index;
14851+
14852+ if (!pgd_present(*pgd_k))
14853+ return NULL;
14854+
14855+ /*
14856+ * set_pgd(pgd, *pgd_k); here would be useless on PAE
14857+ * and redundant with the set_pmd() on non-PAE. As would
14858+ * set_pud.
14859+ */
14860+
14861+ pud = pud_offset(pgd, address);
14862+ pud_k = pud_offset(pgd_k, address);
14863+ if (!pud_present(*pud_k))
14864+ return NULL;
14865+
14866+ pmd = pmd_offset(pud, address);
14867+ pmd_k = pmd_offset(pud_k, address);
14868+ if (!pmd_present(*pmd_k))
14869+ return NULL;
14870+ if (!pmd_present(*pmd))
14871+#if CONFIG_XEN_COMPAT > 0x030002
14872+ set_pmd(pmd, *pmd_k);
14873+#else
14874+ /*
14875+ * When running on older Xen we must launder *pmd_k through
14876+ * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
14877+ */
14878+ set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
14879+#endif
14880+ else
14881+ BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
14882+ return pmd_k;
14883+}
14884+
14885+/*
14886+ * Handle a fault on the vmalloc or module mapping area
14887+ *
14888+ * This assumes no large pages in there.
14889+ */
14890+static inline int vmalloc_fault(unsigned long address)
14891+{
14892+ unsigned long pgd_paddr;
14893+ pmd_t *pmd_k;
14894+ pte_t *pte_k;
14895+ /*
14896+ * Synchronize this task's top level page-table
14897+ * with the 'reference' page table.
14898+ *
14899+ * Do _not_ use "current" here. We might be inside
14900+ * an interrupt in the middle of a task switch..
14901+ */
14902+ pgd_paddr = read_cr3();
14903+ pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
14904+ if (!pmd_k)
14905+ return -1;
14906+ pte_k = pte_offset_kernel(pmd_k, address);
14907+ if (!pte_present(*pte_k))
14908+ return -1;
14909+ return 0;
14910+}
14911+
14912+/*
14913+ * This routine handles page faults. It determines the address,
14914+ * and the problem, and then passes it off to one of the appropriate
14915+ * routines.
14916+ *
14917+ * error_code:
14918+ * bit 0 == 0 means no page found, 1 means protection fault
14919+ * bit 1 == 0 means read, 1 means write
14920+ * bit 2 == 0 means kernel, 1 means user-mode
14921+ * bit 3 == 1 means use of reserved bit detected
14922+ * bit 4 == 1 means fault was an instruction fetch
14923+ */
14924+fastcall void __kprobes do_page_fault(struct pt_regs *regs,
14925+ unsigned long error_code)
14926+{
14927+ struct task_struct *tsk;
14928+ struct mm_struct *mm;
14929+ struct vm_area_struct * vma;
14930+ unsigned long address;
14931+ int write, si_code;
14932+
14933+ /* get the address */
14934+ address = read_cr2();
14935+
14936+ /* Set the "privileged fault" bit to something sane. */
14937+ error_code &= ~4;
14938+ error_code |= (regs->xcs & 2) << 1;
14939+ if (regs->eflags & X86_EFLAGS_VM)
14940+ error_code |= 4;
14941+
14942+ tsk = current;
14943+
14944+ si_code = SEGV_MAPERR;
14945+
14946+ /*
14947+ * We fault-in kernel-space virtual memory on-demand. The
14948+ * 'reference' page table is init_mm.pgd.
14949+ *
14950+ * NOTE! We MUST NOT take any locks for this case. We may
14951+ * be in an interrupt or a critical region, and should
14952+ * only copy the information from the master page table,
14953+ * nothing more.
14954+ *
14955+ * This verifies that the fault happens in kernel space
14956+ * (error_code & 4) == 0, and that the fault was not a
14957+ * protection error (error_code & 9) == 0.
14958+ */
14959+ if (unlikely(address >= TASK_SIZE)) {
14960+#ifdef CONFIG_XEN
14961+ /* Faults in hypervisor area can never be patched up. */
14962+ if (address >= hypervisor_virt_start)
14963+ goto bad_area_nosemaphore;
14964+#endif
14965+ if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
14966+ return;
14967+ /* Can take a spurious fault if mapping changes R/O -> R/W. */
14968+ if (spurious_fault(regs, address, error_code))
14969+ return;
14970+ if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
14971+ SIGSEGV) == NOTIFY_STOP)
14972+ return;
14973+ /*
14974+ * Don't take the mm semaphore here. If we fixup a prefetch
14975+ * fault we could otherwise deadlock.
14976+ */
14977+ goto bad_area_nosemaphore;
14978+ }
14979+
14980+ if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
14981+ SIGSEGV) == NOTIFY_STOP)
14982+ return;
14983+
14984+ /* It's safe to allow irq's after cr2 has been saved and the vmalloc
14985+ fault has been handled. */
14986+ if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
14987+ local_irq_enable();
14988+
14989+ mm = tsk->mm;
14990+
14991+ /*
14992+ * If we're in an interrupt, have no user context or are running in an
14993+ * atomic region then we must not take the fault..
14994+ */
14995+ if (in_atomic() || !mm)
14996+ goto bad_area_nosemaphore;
14997+
14998+ /* When running in the kernel we expect faults to occur only to
14999+ * addresses in user space. All other faults represent errors in the
15000+	 * kernel and should generate an OOPS. Unfortunately, in the case of an
15001+ * erroneous fault occurring in a code path which already holds mmap_sem
15002+ * we will deadlock attempting to validate the fault against the
15003+ * address space. Luckily the kernel only validly references user
15004+ * space from well defined areas of code, which are listed in the
15005+ * exceptions table.
15006+ *
15007+ * As the vast majority of faults will be valid we will only perform
15008+	 * the source reference check when there is a possibility of a deadlock.
15009+ * Attempt to lock the address space, if we cannot we then validate the
15010+ * source. If this is invalid we can skip the address space check,
15011+ * thus avoiding the deadlock.
15012+ */
15013+ if (!down_read_trylock(&mm->mmap_sem)) {
15014+ if ((error_code & 4) == 0 &&
15015+ !search_exception_tables(regs->eip))
15016+ goto bad_area_nosemaphore;
15017+ down_read(&mm->mmap_sem);
15018+ }
15019+
15020+ vma = find_vma(mm, address);
15021+ if (!vma)
15022+ goto bad_area;
15023+ if (vma->vm_start <= address)
15024+ goto good_area;
15025+ if (!(vma->vm_flags & VM_GROWSDOWN))
15026+ goto bad_area;
15027+ if (error_code & 4) {
15028+ /*
15029+ * Accessing the stack below %esp is always a bug.
15030+ * The large cushion allows instructions like enter
15031+ * and pusha to work. ("enter $65535,$31" pushes
15032+ * 32 pointers and then decrements %esp by 65535.)
15033+ */
15034+ if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
15035+ goto bad_area;
15036+ }
15037+ if (expand_stack(vma, address))
15038+ goto bad_area;
15039+/*
15040+ * Ok, we have a good vm_area for this memory access, so
15041+ * we can handle it..
15042+ */
15043+good_area:
15044+ si_code = SEGV_ACCERR;
15045+ write = 0;
15046+ switch (error_code & 3) {
15047+ default: /* 3: write, present */
15048+#ifdef TEST_VERIFY_AREA
15049+ if (regs->cs == GET_KERNEL_CS())
15050+ printk("WP fault at %08lx\n", regs->eip);
15051+#endif
15052+ /* fall through */
15053+ case 2: /* write, not present */
15054+ if (!(vma->vm_flags & VM_WRITE))
15055+ goto bad_area;
15056+ write++;
15057+ break;
15058+ case 1: /* read, present */
15059+ goto bad_area;
15060+ case 0: /* read, not present */
15061+ if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
15062+ goto bad_area;
15063+ }
15064+
15065+ survive:
15066+ /*
15067+ * If for any reason at all we couldn't handle the fault,
15068+ * make sure we exit gracefully rather than endlessly redo
15069+ * the fault.
15070+ */
15071+ switch (handle_mm_fault(mm, vma, address, write)) {
15072+ case VM_FAULT_MINOR:
15073+ tsk->min_flt++;
15074+ break;
15075+ case VM_FAULT_MAJOR:
15076+ tsk->maj_flt++;
15077+ break;
15078+ case VM_FAULT_SIGBUS:
15079+ goto do_sigbus;
15080+ case VM_FAULT_OOM:
15081+ goto out_of_memory;
15082+ default:
15083+ BUG();
15084+ }
15085+
15086+ /*
15087+ * Did it hit the DOS screen memory VA from vm86 mode?
15088+ */
15089+ if (regs->eflags & VM_MASK) {
15090+ unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
15091+ if (bit < 32)
15092+ tsk->thread.screen_bitmap |= 1 << bit;
15093+ }
15094+ up_read(&mm->mmap_sem);
15095+ return;
15096+
15097+/*
15098+ * Something tried to access memory that isn't in our memory map..
15099+ * Fix it, but check if it's kernel or user first..
15100+ */
15101+bad_area:
15102+ up_read(&mm->mmap_sem);
15103+
15104+bad_area_nosemaphore:
15105+ /* User mode accesses just cause a SIGSEGV */
15106+ if (error_code & 4) {
15107+ /*
15108+ * Valid to do another page fault here because this one came
15109+ * from user space.
15110+ */
15111+ if (is_prefetch(regs, address, error_code))
15112+ return;
15113+
15114+ tsk->thread.cr2 = address;
15115+ /* Kernel addresses are always protection faults */
15116+ tsk->thread.error_code = error_code | (address >= TASK_SIZE);
15117+ tsk->thread.trap_no = 14;
15118+ force_sig_info_fault(SIGSEGV, si_code, address, tsk);
15119+ return;
15120+ }
15121+
15122+#ifdef CONFIG_X86_F00F_BUG
15123+ /*
15124+ * Pentium F0 0F C7 C8 bug workaround.
15125+ */
15126+ if (boot_cpu_data.f00f_bug) {
15127+ unsigned long nr;
15128+
15129+ nr = (address - idt_descr.address) >> 3;
15130+
15131+ if (nr == 6) {
15132+ do_invalid_op(regs, 0);
15133+ return;
15134+ }
15135+ }
15136+#endif
15137+
15138+no_context:
15139+ /* Are we prepared to handle this kernel fault? */
15140+ if (fixup_exception(regs))
15141+ return;
15142+
15143+ /*
15144+ * Valid to do another page fault here, because if this fault
15145+ * had been triggered by is_prefetch fixup_exception would have
15146+ * handled it.
15147+ */
15148+ if (is_prefetch(regs, address, error_code))
15149+ return;
15150+
15151+/*
15152+ * Oops. The kernel tried to access some bad page. We'll have to
15153+ * terminate things with extreme prejudice.
15154+ */
15155+
15156+ bust_spinlocks(1);
15157+
15158+ if (oops_may_print()) {
15159+ #ifdef CONFIG_X86_PAE
15160+ if (error_code & 16) {
15161+ pte_t *pte = lookup_address(address);
15162+
15163+ if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
15164+ printk(KERN_CRIT "kernel tried to execute "
15165+ "NX-protected page - exploit attempt? "
15166+ "(uid: %d)\n", current->uid);
15167+ }
15168+ #endif
15169+ if (address < PAGE_SIZE)
15170+ printk(KERN_ALERT "BUG: unable to handle kernel NULL "
15171+ "pointer dereference");
15172+ else
15173+ printk(KERN_ALERT "BUG: unable to handle kernel paging"
15174+ " request");
15175+ printk(" at virtual address %08lx\n",address);
15176+ printk(KERN_ALERT " printing eip:\n");
15177+ printk("%08lx\n", regs->eip);
15178+ }
15179+ dump_fault_path(address);
15180+ tsk->thread.cr2 = address;
15181+ tsk->thread.trap_no = 14;
15182+ tsk->thread.error_code = error_code;
15183+ die("Oops", regs, error_code);
15184+ bust_spinlocks(0);
15185+ do_exit(SIGKILL);
15186+
15187+/*
15188+ * We ran out of memory, or some other thing happened to us that made
15189+ * us unable to handle the page fault gracefully.
15190+ */
15191+out_of_memory:
15192+ up_read(&mm->mmap_sem);
15193+ if (tsk->pid == 1) {
15194+ yield();
15195+ down_read(&mm->mmap_sem);
15196+ goto survive;
15197+ }
15198+ printk("VM: killing process %s\n", tsk->comm);
15199+ if (error_code & 4)
15200+ do_exit(SIGKILL);
15201+ goto no_context;
15202+
15203+do_sigbus:
15204+ up_read(&mm->mmap_sem);
15205+
15206+ /* Kernel mode? Handle exceptions or die */
15207+ if (!(error_code & 4))
15208+ goto no_context;
15209+
15210+ /* User space => ok to do another page fault */
15211+ if (is_prefetch(regs, address, error_code))
15212+ return;
15213+
15214+ tsk->thread.cr2 = address;
15215+ tsk->thread.error_code = error_code;
15216+ tsk->thread.trap_no = 14;
15217+ force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
15218+}
15219+
15220+#if !HAVE_SHARED_KERNEL_PMD
15221+void vmalloc_sync_all(void)
15222+{
15223+ /*
15224+ * Note that races in the updates of insync and start aren't
15225+ * problematic: insync can only get set bits added, and updates to
15226+ * start are only improving performance (without affecting correctness
15227+ * if undone).
15228+ * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
15229+ * This change works just fine with 2-level paging too.
15230+ */
15231+#define sync_index(a) ((a) >> PMD_SHIFT)
15232+ static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
15233+ static unsigned long start = TASK_SIZE;
15234+ unsigned long address;
15235+
15236+ BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
15237+ for (address = start;
15238+ address >= TASK_SIZE && address < hypervisor_virt_start;
15239+ address += 1UL << PMD_SHIFT) {
15240+ if (!test_bit(sync_index(address), insync)) {
15241+ unsigned long flags;
15242+ struct page *page;
15243+
15244+ spin_lock_irqsave(&pgd_lock, flags);
15245+ /* XEN: failure path assumes non-empty pgd_list. */
15246+ if (unlikely(!pgd_list)) {
15247+ spin_unlock_irqrestore(&pgd_lock, flags);
15248+ return;
15249+ }
15250+ for (page = pgd_list; page; page =
15251+ (struct page *)page->index)
15252+ if (!vmalloc_sync_one(page_address(page),
15253+ address)) {
15254+ BUG_ON(page != pgd_list);
15255+ break;
15256+ }
15257+ spin_unlock_irqrestore(&pgd_lock, flags);
15258+ if (!page)
15259+ set_bit(sync_index(address), insync);
15260+ }
15261+ if (address == start && test_bit(sync_index(address), insync))
15262+ start = address + (1UL << PMD_SHIFT);
15263+ }
15264+}
15265+#endif
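
The do_page_fault() comment above documents the i386 page-fault error_code bit layout that spurious_fault() and the vmalloc/user checks rely on. The standalone C sketch below only restates that bit layout as a userspace decoder; decode_fault_error() and the sample values are invented for illustration and are not part of the patch.

#include <stdio.h>

/* Userspace illustration of the error_code bits documented in
 * do_page_fault() above; only the bit meanings are taken from that
 * comment, the helper itself is hypothetical. */
static void decode_fault_error(unsigned long error_code)
{
	printf("error_code %#04lx: %s, %s, %s%s%s\n", error_code,
	       (error_code & 0x01) ? "protection fault" : "no page found",
	       (error_code & 0x02) ? "write" : "read",
	       (error_code & 0x04) ? "user mode" : "kernel mode",
	       (error_code & 0x08) ? ", reserved bit set" : "",
	       (error_code & 0x10) ? ", instruction fetch" : "");
}

int main(void)
{
	decode_fault_error(0x02);	/* kernel write, page not present */
	decode_fault_error(0x07);	/* user write hitting a protection fault */
	return 0;
}
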
15266Index: head-2008-11-25/arch/x86/mm/highmem_32-xen.c
15267===================================================================
15268--- /dev/null 1970-01-01 00:00:00.000000000 +0000
15269+++ head-2008-11-25/arch/x86/mm/highmem_32-xen.c 2008-10-29 09:55:56.000000000 +0100
15270@@ -0,0 +1,183 @@
15271+#include <linux/highmem.h>
15272+#include <linux/module.h>
15273+
15274+void *kmap(struct page *page)
15275+{
15276+ might_sleep();
15277+ if (!PageHighMem(page))
15278+ return page_address(page);
15279+ return kmap_high(page);
15280+}
15281+
15282+void kunmap(struct page *page)
15283+{
15284+ if (in_interrupt())
15285+ BUG();
15286+ if (!PageHighMem(page))
15287+ return;
15288+ kunmap_high(page);
15289+}
15290+
15291+/*
15292+ * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
15293+ * no global lock is needed and because the kmap code must perform a global TLB
15294+ * invalidation when the kmap pool wraps.
15295+ *
15296+ * However, when holding an atomic kmap it is not legal to sleep, so atomic
15297+ * kmaps are appropriate for short, tight code paths only.
15298+ */
15299+static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
15300+{
15301+ enum fixed_addresses idx;
15302+ unsigned long vaddr;
15303+
15304+ /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
15305+ inc_preempt_count();
15306+ if (!PageHighMem(page))
15307+ return page_address(page);
15308+
15309+ idx = type + KM_TYPE_NR*smp_processor_id();
15310+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
15311+#ifdef CONFIG_DEBUG_HIGHMEM
15312+ if (!pte_none(*(kmap_pte-idx)))
15313+ BUG();
15314+#endif
15315+ set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
15316+
15317+ return (void*) vaddr;
15318+}
15319+
15320+void *kmap_atomic(struct page *page, enum km_type type)
15321+{
15322+ return __kmap_atomic(page, type, kmap_prot);
15323+}
15324+
15325+/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
15326+void *kmap_atomic_pte(struct page *page, enum km_type type)
15327+{
15328+ return __kmap_atomic(page, type,
15329+ test_bit(PG_pinned, &page->flags)
15330+ ? PAGE_KERNEL_RO : kmap_prot);
15331+}
15332+
15333+void kunmap_atomic(void *kvaddr, enum km_type type)
15334+{
15335+#if defined(CONFIG_DEBUG_HIGHMEM) || defined(CONFIG_XEN)
15336+ unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
15337+ enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
15338+
15339+ if (vaddr < FIXADDR_START) { // FIXME
15340+ dec_preempt_count();
15341+ preempt_check_resched();
15342+ return;
15343+ }
15344+#endif
15345+
15346+#if defined(CONFIG_DEBUG_HIGHMEM)
15347+ if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
15348+ BUG();
15349+
15350+ /*
15351+	 * force other mappings to Oops if they try to access
15352+	 * this pte without first remapping it
15353+ */
15354+ pte_clear(&init_mm, vaddr, kmap_pte-idx);
15355+ __flush_tlb_one(vaddr);
15356+#elif defined(CONFIG_XEN)
15357+ /*
15358+ * We must ensure there are no dangling pagetable references when
15359+ * returning memory to Xen (decrease_reservation).
15360+ * XXX TODO: We could make this faster by only zapping when
15361+ * kmap_flush_unused is called but that is trickier and more invasive.
15362+ */
15363+ pte_clear(&init_mm, vaddr, kmap_pte-idx);
15364+#endif
15365+
15366+ dec_preempt_count();
15367+ preempt_check_resched();
15368+}
15369+
15370+/* This is the same as kmap_atomic() but can map memory that doesn't
15371+ * have a struct page associated with it.
15372+ */
15373+void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
15374+{
15375+ enum fixed_addresses idx;
15376+ unsigned long vaddr;
15377+
15378+ inc_preempt_count();
15379+
15380+ idx = type + KM_TYPE_NR*smp_processor_id();
15381+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
15382+ set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
15383+ __flush_tlb_one(vaddr);
15384+
15385+ return (void*) vaddr;
15386+}
15387+
15388+struct page *kmap_atomic_to_page(void *ptr)
15389+{
15390+ unsigned long idx, vaddr = (unsigned long)ptr;
15391+ pte_t *pte;
15392+
15393+ if (vaddr < FIXADDR_START)
15394+ return virt_to_page(ptr);
15395+
15396+ idx = virt_to_fix(vaddr);
15397+ pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
15398+ return pte_page(*pte);
15399+}
15400+
15401+void clear_highpage(struct page *page)
15402+{
15403+ void *kaddr;
15404+
15405+ if (likely(xen_feature(XENFEAT_highmem_assist))
15406+ && PageHighMem(page)) {
15407+ struct mmuext_op meo;
15408+
15409+ meo.cmd = MMUEXT_CLEAR_PAGE;
15410+ meo.arg1.mfn = pfn_to_mfn(page_to_pfn(page));
15411+ if (HYPERVISOR_mmuext_op(&meo, 1, NULL, DOMID_SELF) == 0)
15412+ return;
15413+ }
15414+
15415+ kaddr = kmap_atomic(page, KM_USER0);
15416+ clear_page(kaddr);
15417+ kunmap_atomic(kaddr, KM_USER0);
15418+}
15419+
15420+void copy_highpage(struct page *to, struct page *from)
15421+{
15422+ void *vfrom, *vto;
15423+
15424+ if (likely(xen_feature(XENFEAT_highmem_assist))
15425+ && (PageHighMem(from) || PageHighMem(to))) {
15426+ unsigned long from_pfn = page_to_pfn(from);
15427+ unsigned long to_pfn = page_to_pfn(to);
15428+ struct mmuext_op meo;
15429+
15430+ meo.cmd = MMUEXT_COPY_PAGE;
15431+ meo.arg1.mfn = pfn_to_mfn(to_pfn);
15432+ meo.arg2.src_mfn = pfn_to_mfn(from_pfn);
15433+ if (mfn_to_pfn(meo.arg2.src_mfn) == from_pfn
15434+ && mfn_to_pfn(meo.arg1.mfn) == to_pfn
15435+ && HYPERVISOR_mmuext_op(&meo, 1, NULL, DOMID_SELF) == 0)
15436+ return;
15437+ }
15438+
15439+ vfrom = kmap_atomic(from, KM_USER0);
15440+ vto = kmap_atomic(to, KM_USER1);
15441+ copy_page(vto, vfrom);
15442+ kunmap_atomic(vfrom, KM_USER0);
15443+ kunmap_atomic(vto, KM_USER1);
15444+}
15445+
15446+EXPORT_SYMBOL(kmap);
15447+EXPORT_SYMBOL(kunmap);
15448+EXPORT_SYMBOL(kmap_atomic);
15449+EXPORT_SYMBOL(kmap_atomic_pte);
15450+EXPORT_SYMBOL(kunmap_atomic);
15451+EXPORT_SYMBOL(kmap_atomic_to_page);
15452+EXPORT_SYMBOL(clear_highpage);
15453+EXPORT_SYMBOL(copy_highpage);
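
In __kmap_atomic() above, each CPU owns a private block of fixmap slots (idx = type + KM_TYPE_NR * smp_processor_id()), which is why atomic kmaps need no global lock. A minimal userspace sketch of that slot arithmetic follows; KM_TYPE_NR and NR_CPUS are assumed example values and fixmap_slot() is a made-up name, so this illustrates the indexing scheme rather than reproducing kernel code.

#include <stdio.h>

#define KM_TYPE_NR	16	/* assumed value, for illustration only */
#define NR_CPUS		4	/* assumed value, for illustration only */

/* Mirrors idx = type + KM_TYPE_NR * smp_processor_id() in __kmap_atomic():
 * every CPU gets its own contiguous block of slots, so concurrent atomic
 * kmaps on different CPUs can never collide. */
static unsigned int fixmap_slot(unsigned int type, unsigned int cpu)
{
	return type + KM_TYPE_NR * cpu;
}

int main(void)
{
	unsigned int cpu, type;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		for (type = 0; type < 2; type++)
			printf("cpu %u, km type %u -> slot %u\n",
			       cpu, type, fixmap_slot(type, cpu));
	return 0;
}
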
15454Index: head-2008-11-25/arch/x86/mm/hypervisor.c
15455===================================================================
15456--- /dev/null 1970-01-01 00:00:00.000000000 +0000
15457+++ head-2008-11-25/arch/x86/mm/hypervisor.c 2008-10-29 09:55:56.000000000 +0100
15458@@ -0,0 +1,547 @@
15459+/******************************************************************************
15460+ * mm/hypervisor.c
15461+ *
15462+ * Update page tables via the hypervisor.
15463+ *
15464+ * Copyright (c) 2002-2004, K A Fraser
15465+ *
15466+ * This program is free software; you can redistribute it and/or
15467+ * modify it under the terms of the GNU General Public License version 2
15468+ * as published by the Free Software Foundation; or, when distributed
15469+ * separately from the Linux kernel or incorporated into other
15470+ * software packages, subject to the following license:
15471+ *
15472+ * Permission is hereby granted, free of charge, to any person obtaining a copy
15473+ * of this source file (the "Software"), to deal in the Software without
15474+ * restriction, including without limitation the rights to use, copy, modify,
15475+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
15476+ * and to permit persons to whom the Software is furnished to do so, subject to
15477+ * the following conditions:
15478+ *
15479+ * The above copyright notice and this permission notice shall be included in
15480+ * all copies or substantial portions of the Software.
15481+ *
15482+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15483+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15484+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15485+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15486+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
15487+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
15488+ * IN THE SOFTWARE.
15489+ */
15490+
15491+#include <linux/sched.h>
15492+#include <linux/mm.h>
15493+#include <linux/vmalloc.h>
15494+#include <asm/page.h>
15495+#include <asm/pgtable.h>
15496+#include <asm/hypervisor.h>
15497+#include <xen/balloon.h>
15498+#include <xen/features.h>
15499+#include <xen/interface/memory.h>
15500+#include <linux/module.h>
15501+#include <linux/percpu.h>
15502+#include <asm/tlbflush.h>
15503+#include <linux/highmem.h>
15504+
15505+void xen_l1_entry_update(pte_t *ptr, pte_t val)
15506+{
15507+ mmu_update_t u;
15508+#ifdef CONFIG_HIGHPTE
15509+ u.ptr = ((unsigned long)ptr >= (unsigned long)high_memory) ?
15510+ arbitrary_virt_to_machine(ptr) : virt_to_machine(ptr);
15511+#else
15512+ u.ptr = virt_to_machine(ptr);
15513+#endif
15514+ u.val = __pte_val(val);
15515+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
15516+}
15517+EXPORT_SYMBOL_GPL(xen_l1_entry_update);
15518+
15519+void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
15520+{
15521+ mmu_update_t u;
15522+ u.ptr = virt_to_machine(ptr);
15523+ u.val = __pmd_val(val);
15524+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
15525+}
15526+
15527+#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
15528+void xen_l3_entry_update(pud_t *ptr, pud_t val)
15529+{
15530+ mmu_update_t u;
15531+ u.ptr = virt_to_machine(ptr);
15532+ u.val = __pud_val(val);
15533+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
15534+}
15535+#endif
15536+
15537+#ifdef CONFIG_X86_64
15538+void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
15539+{
15540+ mmu_update_t u;
15541+ u.ptr = virt_to_machine(ptr);
15542+ u.val = __pgd_val(val);
15543+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
15544+}
15545+#endif /* CONFIG_X86_64 */
15546+
15547+void xen_pt_switch(unsigned long ptr)
15548+{
15549+ struct mmuext_op op;
15550+ op.cmd = MMUEXT_NEW_BASEPTR;
15551+ op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
15552+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15553+}
15554+
15555+void xen_new_user_pt(unsigned long ptr)
15556+{
15557+ struct mmuext_op op;
15558+ op.cmd = MMUEXT_NEW_USER_BASEPTR;
15559+ op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
15560+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15561+}
15562+
15563+void xen_tlb_flush(void)
15564+{
15565+ struct mmuext_op op;
15566+ op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
15567+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15568+}
15569+EXPORT_SYMBOL(xen_tlb_flush);
15570+
15571+void xen_invlpg(unsigned long ptr)
15572+{
15573+ struct mmuext_op op;
15574+ op.cmd = MMUEXT_INVLPG_LOCAL;
15575+ op.arg1.linear_addr = ptr & PAGE_MASK;
15576+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15577+}
15578+EXPORT_SYMBOL(xen_invlpg);
15579+
15580+#ifdef CONFIG_SMP
15581+
15582+void xen_tlb_flush_all(void)
15583+{
15584+ struct mmuext_op op;
15585+ op.cmd = MMUEXT_TLB_FLUSH_ALL;
15586+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15587+}
15588+
15589+void xen_tlb_flush_mask(cpumask_t *mask)
15590+{
15591+ struct mmuext_op op;
15592+ if ( cpus_empty(*mask) )
15593+ return;
15594+ op.cmd = MMUEXT_TLB_FLUSH_MULTI;
15595+ set_xen_guest_handle(op.arg2.vcpumask, mask->bits);
15596+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15597+}
15598+
15599+void xen_invlpg_all(unsigned long ptr)
15600+{
15601+ struct mmuext_op op;
15602+ op.cmd = MMUEXT_INVLPG_ALL;
15603+ op.arg1.linear_addr = ptr & PAGE_MASK;
15604+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15605+}
15606+
15607+void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr)
15608+{
15609+ struct mmuext_op op;
15610+ if ( cpus_empty(*mask) )
15611+ return;
15612+ op.cmd = MMUEXT_INVLPG_MULTI;
15613+ op.arg1.linear_addr = ptr & PAGE_MASK;
15614+ set_xen_guest_handle(op.arg2.vcpumask, mask->bits);
15615+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15616+}
15617+
15618+#endif /* CONFIG_SMP */
15619+
15620+void xen_pgd_pin(unsigned long ptr)
15621+{
15622+ struct mmuext_op op;
15623+#ifdef CONFIG_X86_64
15624+ op.cmd = MMUEXT_PIN_L4_TABLE;
15625+#elif defined(CONFIG_X86_PAE)
15626+ op.cmd = MMUEXT_PIN_L3_TABLE;
15627+#else
15628+ op.cmd = MMUEXT_PIN_L2_TABLE;
15629+#endif
15630+ op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
15631+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15632+}
15633+
15634+void xen_pgd_unpin(unsigned long ptr)
15635+{
15636+ struct mmuext_op op;
15637+ op.cmd = MMUEXT_UNPIN_TABLE;
15638+ op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
15639+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15640+}
15641+
15642+void xen_set_ldt(const void *ptr, unsigned int ents)
15643+{
15644+ struct mmuext_op op;
15645+ op.cmd = MMUEXT_SET_LDT;
15646+ op.arg1.linear_addr = (unsigned long)ptr;
15647+ op.arg2.nr_ents = ents;
15648+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15649+}
15650+
15651+/* Protected by balloon_lock. */
15652+#define MAX_CONTIG_ORDER 9 /* 2MB */
15653+static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
15654+static unsigned long limited_frames[1<<MAX_CONTIG_ORDER];
15655+static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER];
15656+
15657+/* Ensure multi-page extents are contiguous in machine memory. */
15658+int xen_create_contiguous_region(
15659+ unsigned long vstart, unsigned int order, unsigned int address_bits)
15660+{
15661+ unsigned long *in_frames = discontig_frames, out_frame;
15662+ unsigned long frame, flags;
15663+ unsigned int i;
15664+ int rc, success;
15665+ struct xen_memory_exchange exchange = {
15666+ .in = {
15667+ .nr_extents = 1UL << order,
15668+ .extent_order = 0,
15669+ .domid = DOMID_SELF
15670+ },
15671+ .out = {
15672+ .nr_extents = 1,
15673+ .extent_order = order,
15674+ .address_bits = address_bits,
15675+ .domid = DOMID_SELF
15676+ }
15677+ };
15678+
15679+ /*
15680+ * Currently an auto-translated guest will not perform I/O, nor will
15681+ * it require PAE page directories below 4GB. Therefore any calls to
15682+ * this function are redundant and can be ignored.
15683+ */
15684+ if (xen_feature(XENFEAT_auto_translated_physmap))
15685+ return 0;
15686+
15687+ if (unlikely(order > MAX_CONTIG_ORDER))
15688+ return -ENOMEM;
15689+
15690+ set_xen_guest_handle(exchange.in.extent_start, in_frames);
15691+ set_xen_guest_handle(exchange.out.extent_start, &out_frame);
15692+
15693+ scrub_pages((void *)vstart, 1 << order);
15694+
15695+ balloon_lock(flags);
15696+
15697+ /* 1. Zap current PTEs, remembering MFNs. */
15698+ for (i = 0; i < (1U<<order); i++) {
15699+ in_frames[i] = pfn_to_mfn((__pa(vstart) >> PAGE_SHIFT) + i);
15700+ MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
15701+ __pte_ma(0), 0);
15702+ set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
15703+ INVALID_P2M_ENTRY);
15704+ }
15705+ if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
15706+ BUG();
15707+
15708+ /* 2. Get a new contiguous memory extent. */
15709+ out_frame = __pa(vstart) >> PAGE_SHIFT;
15710+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
15711+ success = (exchange.nr_exchanged == (1UL << order));
15712+ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
15713+ BUG_ON(success && (rc != 0));
15714+#if CONFIG_XEN_COMPAT <= 0x030002
15715+ if (unlikely(rc == -ENOSYS)) {
15716+ /* Compatibility when XENMEM_exchange is unsupported. */
15717+ if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
15718+ &exchange.in) != (1UL << order))
15719+ BUG();
15720+ success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
15721+ &exchange.out) == 1);
15722+ if (!success) {
15723+ /* Couldn't get special memory: fall back to normal. */
15724+ for (i = 0; i < (1U<<order); i++)
15725+ in_frames[i] = (__pa(vstart)>>PAGE_SHIFT) + i;
15726+ if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
15727+ &exchange.in) != (1UL<<order))
15728+ BUG();
15729+ }
15730+ }
15731+#endif
15732+
15733+ /* 3. Map the new extent in place of old pages. */
15734+ for (i = 0; i < (1U<<order); i++) {
15735+ frame = success ? (out_frame + i) : in_frames[i];
15736+ MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
15737+ pfn_pte_ma(frame, PAGE_KERNEL), 0);
15738+ set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
15739+ }
15740+
15741+ cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
15742+ ? UVMF_TLB_FLUSH|UVMF_ALL
15743+ : UVMF_INVLPG|UVMF_ALL;
15744+ if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
15745+ BUG();
15746+
15747+ balloon_unlock(flags);
15748+
15749+ return success ? 0 : -ENOMEM;
15750+}
15751+EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
15752+
15753+void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
15754+{
15755+ unsigned long *out_frames = discontig_frames, in_frame;
15756+ unsigned long frame, flags;
15757+ unsigned int i;
15758+ int rc, success;
15759+ struct xen_memory_exchange exchange = {
15760+ .in = {
15761+ .nr_extents = 1,
15762+ .extent_order = order,
15763+ .domid = DOMID_SELF
15764+ },
15765+ .out = {
15766+ .nr_extents = 1UL << order,
15767+ .extent_order = 0,
15768+ .domid = DOMID_SELF
15769+ }
15770+ };
15771+
15772+ if (xen_feature(XENFEAT_auto_translated_physmap))
15773+ return;
15774+
15775+ if (unlikely(order > MAX_CONTIG_ORDER))
15776+ return;
15777+
15778+ set_xen_guest_handle(exchange.in.extent_start, &in_frame);
15779+ set_xen_guest_handle(exchange.out.extent_start, out_frames);
15780+
15781+ scrub_pages((void *)vstart, 1 << order);
15782+
15783+ balloon_lock(flags);
15784+
15785+ /* 1. Find start MFN of contiguous extent. */
15786+ in_frame = pfn_to_mfn(__pa(vstart) >> PAGE_SHIFT);
15787+
15788+ /* 2. Zap current PTEs. */
15789+ for (i = 0; i < (1U<<order); i++) {
15790+ MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
15791+ __pte_ma(0), 0);
15792+ set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
15793+ INVALID_P2M_ENTRY);
15794+ out_frames[i] = (__pa(vstart) >> PAGE_SHIFT) + i;
15795+ }
15796+ if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
15797+ BUG();
15798+
15799+ /* 3. Do the exchange for non-contiguous MFNs. */
15800+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
15801+ success = (exchange.nr_exchanged == 1);
15802+ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
15803+ BUG_ON(success && (rc != 0));
15804+#if CONFIG_XEN_COMPAT <= 0x030002
15805+ if (unlikely(rc == -ENOSYS)) {
15806+ /* Compatibility when XENMEM_exchange is unsupported. */
15807+ if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
15808+ &exchange.in) != 1)
15809+ BUG();
15810+ if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
15811+ &exchange.out) != (1UL << order))
15812+ BUG();
15813+ success = 1;
15814+ }
15815+#endif
15816+
15817+ /* 4. Map new pages in place of old pages. */
15818+ for (i = 0; i < (1U<<order); i++) {
15819+ frame = success ? out_frames[i] : (in_frame + i);
15820+ MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
15821+ pfn_pte_ma(frame, PAGE_KERNEL), 0);
15822+ set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
15823+ }
15824+
15825+ cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
15826+ ? UVMF_TLB_FLUSH|UVMF_ALL
15827+ : UVMF_INVLPG|UVMF_ALL;
15828+ if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
15829+ BUG();
15830+
15831+ balloon_unlock(flags);
15832+}
15833+EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
15834+
15835+int xen_limit_pages_to_max_mfn(
15836+ struct page *pages, unsigned int order, unsigned int address_bits)
15837+{
15838+ unsigned long flags, frame;
15839+ unsigned long *in_frames = discontig_frames, *out_frames = limited_frames;
15840+ struct page *page;
15841+ unsigned int i, n, nr_mcl;
15842+ int rc, success;
15843+ DECLARE_BITMAP(limit_map, 1 << MAX_CONTIG_ORDER);
15844+
15845+ struct xen_memory_exchange exchange = {
15846+ .in = {
15847+ .extent_order = 0,
15848+ .domid = DOMID_SELF
15849+ },
15850+ .out = {
15851+ .extent_order = 0,
15852+ .address_bits = address_bits,
15853+ .domid = DOMID_SELF
15854+ }
15855+ };
15856+
15857+ if (xen_feature(XENFEAT_auto_translated_physmap))
15858+ return 0;
15859+
15860+ if (unlikely(order > MAX_CONTIG_ORDER))
15861+ return -ENOMEM;
15862+
15863+ bitmap_zero(limit_map, 1U << order);
15864+ set_xen_guest_handle(exchange.in.extent_start, in_frames);
15865+ set_xen_guest_handle(exchange.out.extent_start, out_frames);
15866+
15867+ /* 0. Scrub the pages. */
15868+ for (i = 0, n = 0; i < 1U<<order ; i++) {
15869+ page = &pages[i];
15870+ if (!(pfn_to_mfn(page_to_pfn(page)) >> (address_bits - PAGE_SHIFT)))
15871+ continue;
15872+ __set_bit(i, limit_map);
15873+
15874+ if (!PageHighMem(page))
15875+ scrub_pages(page_address(page), 1);
15876+#ifdef CONFIG_XEN_SCRUB_PAGES
15877+ else {
15878+ scrub_pages(kmap(page), 1);
15879+ kunmap(page);
15880+ ++n;
15881+ }
15882+#endif
15883+ }
15884+ if (bitmap_empty(limit_map, 1U << order))
15885+ return 0;
15886+
15887+ if (n)
15888+ kmap_flush_unused();
15889+
15890+ balloon_lock(flags);
15891+
15892+ /* 1. Zap current PTEs (if any), remembering MFNs. */
15893+ for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
15894+ if(!test_bit(i, limit_map))
15895+ continue;
15896+ page = &pages[i];
15897+
15898+ out_frames[n] = page_to_pfn(page);
15899+ in_frames[n] = pfn_to_mfn(out_frames[n]);
15900+
15901+ if (!PageHighMem(page))
15902+ MULTI_update_va_mapping(cr_mcl + nr_mcl++,
15903+ (unsigned long)page_address(page),
15904+ __pte_ma(0), 0);
15905+
15906+ set_phys_to_machine(out_frames[n], INVALID_P2M_ENTRY);
15907+ ++n;
15908+ }
15909+ if (nr_mcl && HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
15910+ BUG();
15911+
15912+ /* 2. Get new memory below the required limit. */
15913+ exchange.in.nr_extents = n;
15914+ exchange.out.nr_extents = n;
15915+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
15916+ success = (exchange.nr_exchanged == n);
15917+ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
15918+ BUG_ON(success && (rc != 0));
15919+#if CONFIG_XEN_COMPAT <= 0x030002
15920+ if (unlikely(rc == -ENOSYS)) {
15921+ /* Compatibility when XENMEM_exchange is unsupported. */
15922+ if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
15923+ &exchange.in) != n)
15924+ BUG();
15925+ if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
15926+ &exchange.out) != n)
15927+ BUG();
15928+ success = 1;
15929+ }
15930+#endif
15931+
15932+ /* 3. Map the new pages in place of old pages. */
15933+ for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
15934+ if(!test_bit(i, limit_map))
15935+ continue;
15936+ page = &pages[i];
15937+
15938+ frame = success ? out_frames[n] : in_frames[n];
15939+
15940+ if (!PageHighMem(page))
15941+ MULTI_update_va_mapping(cr_mcl + nr_mcl++,
15942+ (unsigned long)page_address(page),
15943+ pfn_pte_ma(frame, PAGE_KERNEL), 0);
15944+
15945+ set_phys_to_machine(page_to_pfn(page), frame);
15946+ ++n;
15947+ }
15948+ if (nr_mcl) {
15949+ cr_mcl[nr_mcl - 1].args[MULTI_UVMFLAGS_INDEX] = order
15950+ ? UVMF_TLB_FLUSH|UVMF_ALL
15951+ : UVMF_INVLPG|UVMF_ALL;
15952+ if (HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
15953+ BUG();
15954+ }
15955+
15956+ balloon_unlock(flags);
15957+
15958+ return success ? 0 : -ENOMEM;
15959+}
15960+EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn);
15961+
15962+#ifdef __i386__
15963+int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
15964+{
15965+ __u32 *lp = (__u32 *)((char *)ldt + entry * 8);
15966+ maddr_t mach_lp = arbitrary_virt_to_machine(lp);
15967+ return HYPERVISOR_update_descriptor(
15968+ mach_lp, (u64)entry_a | ((u64)entry_b<<32));
15969+}
15970+#endif
15971+
15972+#define MAX_BATCHED_FULL_PTES 32
15973+
15974+int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
15975+ unsigned long addr, unsigned long end, pgprot_t newprot)
15976+{
15977+ int rc = 0, i = 0;
15978+ mmu_update_t u[MAX_BATCHED_FULL_PTES];
15979+ pte_t *pte;
15980+ spinlock_t *ptl;
15981+
15982+ if (!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))
15983+ return 0;
15984+
15985+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
15986+ do {
15987+ if (pte_present(*pte)) {
15988+ u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
15989+ | ((unsigned long)pte & ~PAGE_MASK)
15990+ | MMU_PT_UPDATE_PRESERVE_AD;
15991+ u[i].val = __pte_val(pte_modify(*pte, newprot));
15992+ if (++i == MAX_BATCHED_FULL_PTES) {
15993+ if ((rc = HYPERVISOR_mmu_update(
15994+ &u[0], i, NULL, DOMID_SELF)) != 0)
15995+ break;
15996+ i = 0;
15997+ }
15998+ }
15999+ } while (pte++, addr += PAGE_SIZE, addr != end);
16000+ if (i)
16001+ rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF);
16002+ pte_unmap_unlock(pte - 1, ptl);
16003+ BUG_ON(rc && rc != -ENOSYS);
16004+ return !rc;
16005+}
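
xen_change_pte_range() above batches up to MAX_BATCHED_FULL_PTES mmu_update entries and issues one HYPERVISOR_mmu_update() per full batch, plus one more for any remainder. The sketch below shows only that batch-and-flush pattern in plain userspace C; flush_batch() and struct update are stand-ins invented for the example, not hypervisor interfaces.

#include <stdio.h>

#define MAX_BATCH 32	/* plays the role of MAX_BATCHED_FULL_PTES above */

struct update {
	unsigned long ptr;
	unsigned long val;
};

/* Stand-in for the hypercall: just reports how many entries were queued. */
static int flush_batch(const struct update *u, int n)
{
	(void)u;
	printf("flushing %d queued updates in one call\n", n);
	return 0;
}

int main(void)
{
	struct update u[MAX_BATCH];
	int i = 0, n, rc = 0;

	/* Queue 70 pretend PTE updates; flush whenever the buffer fills,
	 * and once more at the end for any remainder. */
	for (n = 0; n < 70; n++) {
		u[i].ptr = n;
		u[i].val = n;
		if (++i == MAX_BATCH) {
			rc = flush_batch(u, i);
			if (rc)
				return 1;
			i = 0;
		}
	}
	if (i)
		rc = flush_batch(u, i);
	return rc ? 1 : 0;
}
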
16006Index: head-2008-11-25/arch/x86/mm/init_32-xen.c
16007===================================================================
16008--- /dev/null 1970-01-01 00:00:00.000000000 +0000
16009+++ head-2008-11-25/arch/x86/mm/init_32-xen.c 2008-10-29 09:55:56.000000000 +0100
16010@@ -0,0 +1,840 @@
16011+/*
16012+ * linux/arch/i386/mm/init.c
16013+ *
16014+ * Copyright (C) 1995 Linus Torvalds
16015+ *
16016+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
16017+ */
16018+
16019+#include <linux/module.h>
16020+#include <linux/signal.h>
16021+#include <linux/sched.h>
16022+#include <linux/kernel.h>
16023+#include <linux/errno.h>
16024+#include <linux/string.h>
16025+#include <linux/types.h>
16026+#include <linux/ptrace.h>
16027+#include <linux/mman.h>
16028+#include <linux/mm.h>
16029+#include <linux/hugetlb.h>
16030+#include <linux/swap.h>
16031+#include <linux/smp.h>
16032+#include <linux/init.h>
16033+#include <linux/highmem.h>
16034+#include <linux/pagemap.h>
16035+#include <linux/poison.h>
16036+#include <linux/bootmem.h>
16037+#include <linux/slab.h>
16038+#include <linux/proc_fs.h>
16039+#include <linux/efi.h>
16040+#include <linux/memory_hotplug.h>
16041+#include <linux/initrd.h>
16042+#include <linux/cpumask.h>
16043+#include <linux/dma-mapping.h>
16044+#include <linux/scatterlist.h>
16045+
16046+#include <asm/processor.h>
16047+#include <asm/system.h>
16048+#include <asm/uaccess.h>
16049+#include <asm/pgtable.h>
16050+#include <asm/dma.h>
16051+#include <asm/fixmap.h>
16052+#include <asm/e820.h>
16053+#include <asm/apic.h>
16054+#include <asm/tlb.h>
16055+#include <asm/tlbflush.h>
16056+#include <asm/sections.h>
16057+#include <asm/hypervisor.h>
16058+#include <asm/swiotlb.h>
16059+
16060+unsigned int __VMALLOC_RESERVE = 128 << 20;
16061+
16062+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
16063+unsigned long highstart_pfn, highend_pfn;
16064+
16065+static int noinline do_test_wp_bit(void);
16066+
16067+/*
16068+ * Creates a middle page table and puts a pointer to it in the
16069+ * given global directory entry. This only returns the gd entry
16070+ * in non-PAE compilation mode, since the middle layer is folded.
16071+ */
16072+static pmd_t * __init one_md_table_init(pgd_t *pgd)
16073+{
16074+ pud_t *pud;
16075+ pmd_t *pmd_table;
16076+
16077+#ifdef CONFIG_X86_PAE
16078+ pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
16079+ make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
16080+ set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
16081+ pud = pud_offset(pgd, 0);
16082+ if (pmd_table != pmd_offset(pud, 0))
16083+ BUG();
16084+#else
16085+ pud = pud_offset(pgd, 0);
16086+ pmd_table = pmd_offset(pud, 0);
16087+#endif
16088+
16089+ return pmd_table;
16090+}
16091+
16092+/*
16093+ * Create a page table and place a pointer to it in a middle page
16094+ * directory entry.
16095+ */
16096+static pte_t * __init one_page_table_init(pmd_t *pmd)
16097+{
16098+ if (pmd_none(*pmd)) {
16099+ pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
16100+ make_lowmem_page_readonly(page_table,
16101+ XENFEAT_writable_page_tables);
16102+ set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
16103+ if (page_table != pte_offset_kernel(pmd, 0))
16104+ BUG();
16105+
16106+ return page_table;
16107+ }
16108+
16109+ return pte_offset_kernel(pmd, 0);
16110+}
16111+
16112+/*
16113+ * This function initializes a certain range of kernel virtual memory
16114+ * with new bootmem page tables, everywhere page tables are missing in
16115+ * the given range.
16116+ */
16117+
16118+/*
16119+ * NOTE: The pagetables are allocated contiguously in physical space
16120+ * so we can cache the place of the first one and move around without
16121+ * checking the pgd every time.
16122+ */
16123+static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
16124+{
16125+ pgd_t *pgd;
16126+ pud_t *pud;
16127+ pmd_t *pmd;
16128+ int pgd_idx, pmd_idx;
16129+ unsigned long vaddr;
16130+
16131+ vaddr = start;
16132+ pgd_idx = pgd_index(vaddr);
16133+ pmd_idx = pmd_index(vaddr);
16134+ pgd = pgd_base + pgd_idx;
16135+
16136+ for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
16137+ if (pgd_none(*pgd))
16138+ one_md_table_init(pgd);
16139+ pud = pud_offset(pgd, vaddr);
16140+ pmd = pmd_offset(pud, vaddr);
16141+ for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
16142+ if (vaddr < hypervisor_virt_start && pmd_none(*pmd))
16143+ one_page_table_init(pmd);
16144+
16145+ vaddr += PMD_SIZE;
16146+ }
16147+ pmd_idx = 0;
16148+ }
16149+}
16150+
16151+static inline int is_kernel_text(unsigned long addr)
16152+{
16153+ if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
16154+ return 1;
16155+ return 0;
16156+}
16157+
16158+/*
16159+ * This maps the physical memory to kernel virtual address space, a total
16160+ * of max_low_pfn pages, by creating page tables starting from address
16161+ * PAGE_OFFSET.
16162+ */
16163+static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
16164+{
16165+ unsigned long pfn;
16166+ pgd_t *pgd;
16167+ pmd_t *pmd;
16168+ pte_t *pte;
16169+ int pgd_idx, pmd_idx, pte_ofs;
16170+
16171+ unsigned long max_ram_pfn = xen_start_info->nr_pages;
16172+ if (max_ram_pfn > max_low_pfn)
16173+ max_ram_pfn = max_low_pfn;
16174+
16175+ pgd_idx = pgd_index(PAGE_OFFSET);
16176+ pgd = pgd_base + pgd_idx;
16177+ pfn = 0;
16178+ pmd_idx = pmd_index(PAGE_OFFSET);
16179+ pte_ofs = pte_index(PAGE_OFFSET);
16180+
16181+ for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
16182+#ifdef CONFIG_XEN
16183+ /*
16184+		 * Native Linux does not have PAE paging enabled yet at this
16185+		 * point. When running as a Xen domain we are already in PAE
16186+		 * mode, thus we can't simply hook an empty
16187+		 * pmd. That would kill the mappings we are currently
16188+		 * using ...
16189+ */
16190+ pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
16191+#else
16192+ pmd = one_md_table_init(pgd);
16193+#endif
16194+ if (pfn >= max_low_pfn)
16195+ continue;
16196+ pmd += pmd_idx;
16197+ for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
16198+ unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
16199+ if (address >= hypervisor_virt_start)
16200+ continue;
16201+
16202+ /* Map with big pages if possible, otherwise create normal page tables. */
16203+ if (cpu_has_pse) {
16204+ unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
16205+
16206+ if (is_kernel_text(address) || is_kernel_text(address2))
16207+ set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
16208+ else
16209+ set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
16210+ pfn += PTRS_PER_PTE;
16211+ } else {
16212+ pte = one_page_table_init(pmd);
16213+
16214+ pte += pte_ofs;
16215+ for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
16216+ /* XEN: Only map initial RAM allocation. */
16217+ if ((pfn >= max_ram_pfn) || pte_present(*pte))
16218+ continue;
16219+ if (is_kernel_text(address))
16220+ set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
16221+ else
16222+ set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
16223+ }
16224+ pte_ofs = 0;
16225+ }
16226+ }
16227+ pmd_idx = 0;
16228+ }
16229+}
16230+
16231+#ifndef CONFIG_XEN
16232+
16233+static inline int page_kills_ppro(unsigned long pagenr)
16234+{
16235+ if (pagenr >= 0x70000 && pagenr <= 0x7003F)
16236+ return 1;
16237+ return 0;
16238+}
16239+
16240+#else
16241+
16242+#define page_kills_ppro(p) 0
16243+
16244+#endif
16245+
16246+extern int is_available_memory(efi_memory_desc_t *);
16247+
16248+int page_is_ram(unsigned long pagenr)
16249+{
16250+ int i;
16251+ unsigned long addr, end;
16252+
16253+ if (efi_enabled) {
16254+ efi_memory_desc_t *md;
16255+ void *p;
16256+
16257+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
16258+ md = p;
16259+ if (!is_available_memory(md))
16260+ continue;
16261+ addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
16262+ end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
16263+
16264+ if ((pagenr >= addr) && (pagenr < end))
16265+ return 1;
16266+ }
16267+ return 0;
16268+ }
16269+
16270+ for (i = 0; i < e820.nr_map; i++) {
16271+
16272+ if (e820.map[i].type != E820_RAM) /* not usable memory */
16273+ continue;
16274+ /*
16275+ * !!!FIXME!!! Some BIOSen report areas as RAM that
16276+ * are not. Notably the 640->1Mb area. We need a sanity
16277+ * check here.
16278+ */
16279+ addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
16280+ end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
16281+ if ((pagenr >= addr) && (pagenr < end))
16282+ return 1;
16283+ }
16284+ return 0;
16285+}
16286+
16287+#ifdef CONFIG_HIGHMEM
16288+pte_t *kmap_pte;
16289+pgprot_t kmap_prot;
16290+
16291+#define kmap_get_fixmap_pte(vaddr) \
16292+ pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
16293+
16294+static void __init kmap_init(void)
16295+{
16296+ unsigned long kmap_vstart;
16297+
16298+ /* cache the first kmap pte */
16299+ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
16300+ kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
16301+
16302+ kmap_prot = PAGE_KERNEL;
16303+}
16304+
16305+static void __init permanent_kmaps_init(pgd_t *pgd_base)
16306+{
16307+ pgd_t *pgd;
16308+ pud_t *pud;
16309+ pmd_t *pmd;
16310+ pte_t *pte;
16311+ unsigned long vaddr;
16312+
16313+ vaddr = PKMAP_BASE;
16314+ page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
16315+
16316+ pgd = swapper_pg_dir + pgd_index(vaddr);
16317+ pud = pud_offset(pgd, vaddr);
16318+ pmd = pmd_offset(pud, vaddr);
16319+ pte = pte_offset_kernel(pmd, vaddr);
16320+ pkmap_page_table = pte;
16321+}
16322+
16323+static void __meminit free_new_highpage(struct page *page, int pfn)
16324+{
16325+ init_page_count(page);
16326+ if (pfn < xen_start_info->nr_pages)
16327+ __free_page(page);
16328+ totalhigh_pages++;
16329+}
16330+
16331+void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
16332+{
16333+ if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
16334+ ClearPageReserved(page);
16335+ free_new_highpage(page, pfn);
16336+ } else
16337+ SetPageReserved(page);
16338+}
16339+
16340+static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
16341+{
16342+ free_new_highpage(page, pfn);
16343+ totalram_pages++;
16344+#ifdef CONFIG_FLATMEM
16345+ max_mapnr = max(pfn, max_mapnr);
16346+#endif
16347+ num_physpages++;
16348+ return 0;
16349+}
16350+
16351+/*
16352+ * Not currently handling the NUMA case.
16353+ * Assuming single node and all memory that
16354+ * has been added dynamically that would be
16355+ * onlined here is in HIGHMEM
16356+ */
16357+void online_page(struct page *page)
16358+{
16359+ ClearPageReserved(page);
16360+ add_one_highpage_hotplug(page, page_to_pfn(page));
16361+}
16362+
16363+
16364+#ifdef CONFIG_NUMA
16365+extern void set_highmem_pages_init(int);
16366+#else
16367+static void __init set_highmem_pages_init(int bad_ppro)
16368+{
16369+ int pfn;
16370+ for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
16371+ add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
16372+ totalram_pages += totalhigh_pages;
16373+}
16374+#endif /* CONFIG_NUMA */
16375+
16376+#else
16377+#define kmap_init() do { } while (0)
16378+#define permanent_kmaps_init(pgd_base) do { } while (0)
16379+#define set_highmem_pages_init(bad_ppro) do { } while (0)
16380+#endif /* CONFIG_HIGHMEM */
16381+
16382+unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
16383+EXPORT_SYMBOL(__PAGE_KERNEL);
16384+unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
16385+
16386+#ifdef CONFIG_NUMA
16387+extern void __init remap_numa_kva(void);
16388+#else
16389+#define remap_numa_kva() do {} while (0)
16390+#endif
16391+
16392+pgd_t *swapper_pg_dir;
16393+
16394+static void __init pagetable_init (void)
16395+{
16396+ unsigned long vaddr;
16397+ pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
16398+
16399+ /* Enable PSE if available */
16400+ if (cpu_has_pse) {
16401+ set_in_cr4(X86_CR4_PSE);
16402+ }
16403+
16404+ /* Enable PGE if available */
16405+ if (cpu_has_pge) {
16406+ set_in_cr4(X86_CR4_PGE);
16407+ __PAGE_KERNEL |= _PAGE_GLOBAL;
16408+ __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
16409+ }
16410+
16411+ kernel_physical_mapping_init(pgd_base);
16412+ remap_numa_kva();
16413+
16414+ /*
16415+ * Fixed mappings, only the page table structure has to be
16416+ * created - mappings will be set by set_fixmap():
16417+ */
16418+ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
16419+ page_table_range_init(vaddr, hypervisor_virt_start, pgd_base);
16420+
16421+ permanent_kmaps_init(pgd_base);
16422+}
16423+
16424+#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP)
16425+/*
16426+ * Swap suspend & friends need this for resume because things like the intel-agp
16427+ * driver might have split up a kernel 4MB mapping.
16428+ */
16429+char __nosavedata swsusp_pg_dir[PAGE_SIZE]
16430+ __attribute__ ((aligned (PAGE_SIZE)));
16431+
16432+static inline void save_pg_dir(void)
16433+{
16434+ memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
16435+}
16436+#else
16437+static inline void save_pg_dir(void)
16438+{
16439+}
16440+#endif
16441+
16442+void zap_low_mappings (void)
16443+{
16444+ int i;
16445+
16446+ save_pg_dir();
16447+
16448+ /*
16449+ * Zap initial low-memory mappings.
16450+ *
16451+ * Note that "pgd_clear()" doesn't do it for
16452+ * us, because pgd_clear() is a no-op on i386.
16453+ */
16454+ for (i = 0; i < USER_PTRS_PER_PGD; i++)
16455+#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
16456+ set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
16457+#else
16458+ set_pgd(swapper_pg_dir+i, __pgd(0));
16459+#endif
16460+ flush_tlb_all();
16461+}
16462+
16463+static int disable_nx __initdata = 0;
16464+u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
16465+EXPORT_SYMBOL(__supported_pte_mask);
16466+
16467+/*
16468+ * noexec = on|off
16469+ *
16470+ * Control non executable mappings.
16471+ *
16472+ * on Enable
16473+ * off Disable
16474+ */
16475+void __init noexec_setup(const char *str)
16476+{
16477+ if (!strncmp(str, "on",2) && cpu_has_nx) {
16478+ __supported_pte_mask |= _PAGE_NX;
16479+ disable_nx = 0;
16480+ } else if (!strncmp(str,"off",3)) {
16481+ disable_nx = 1;
16482+ __supported_pte_mask &= ~_PAGE_NX;
16483+ }
16484+}
16485+
16486+int nx_enabled = 0;
16487+#ifdef CONFIG_X86_PAE
16488+
16489+static void __init set_nx(void)
16490+{
16491+ unsigned int v[4], l, h;
16492+
16493+ if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
16494+ cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
16495+ if ((v[3] & (1 << 20)) && !disable_nx) {
16496+ rdmsr(MSR_EFER, l, h);
16497+ l |= EFER_NX;
16498+ wrmsr(MSR_EFER, l, h);
16499+ nx_enabled = 1;
16500+ __supported_pte_mask |= _PAGE_NX;
16501+ }
16502+ }
16503+}
16504+
16505+/*
16506+ * Enables/disables executability of a given kernel page and
16507+ * returns the previous setting.
16508+ */
16509+int __init set_kernel_exec(unsigned long vaddr, int enable)
16510+{
16511+ pte_t *pte;
16512+ int ret = 1;
16513+
16514+ if (!nx_enabled)
16515+ goto out;
16516+
16517+ pte = lookup_address(vaddr);
16518+ BUG_ON(!pte);
16519+
16520+ if (!pte_exec_kernel(*pte))
16521+ ret = 0;
16522+
16523+ if (enable)
16524+ pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
16525+ else
16526+ pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
16527+ __flush_tlb_all();
16528+out:
16529+ return ret;
16530+}
16531+
16532+#endif
16533+
16534+/*
16535+ * paging_init() sets up the page tables - note that the first 8MB are
16536+ * already mapped by head.S.
16537+ *
16538+ * This routine also unmaps the page at virtual kernel address 0, so
16539+ * that we can trap those pesky NULL-reference errors in the kernel.
16540+ */
16541+void __init paging_init(void)
16542+{
16543+ int i;
16544+
16545+#ifdef CONFIG_X86_PAE
16546+ set_nx();
16547+ if (nx_enabled)
16548+ printk("NX (Execute Disable) protection: active\n");
16549+#endif
16550+
16551+ pagetable_init();
16552+
16553+#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
16554+ /*
16555+ * We will bail out later - printk doesn't work right now so
16556+ * the user would just see a hanging kernel.
16557+	 * When running as a Xen domain we are already in PAE mode at
16558+ * this point.
16559+ */
16560+ if (cpu_has_pae)
16561+ set_in_cr4(X86_CR4_PAE);
16562+#endif
16563+ __flush_tlb_all();
16564+
16565+ kmap_init();
16566+
16567+ /* Switch to the real shared_info page, and clear the
16568+ * dummy page. */
16569+ set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
16570+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
16571+ memset(empty_zero_page, 0, sizeof(empty_zero_page));
16572+
16573+ /* Setup mapping of lower 1st MB */
16574+ for (i = 0; i < NR_FIX_ISAMAPS; i++)
16575+ if (is_initial_xendomain())
16576+ set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
16577+ else
16578+ __set_fixmap(FIX_ISAMAP_BEGIN - i,
16579+ virt_to_machine(empty_zero_page),
16580+ PAGE_KERNEL_RO);
16581+}
16582+
16583+/*
16584+ * Test if the WP bit works in supervisor mode. It isn't supported on 386's
16585+ * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
16586+ * used to involve black magic jumps to work around some nasty CPU bugs,
16587+ * but fortunately the switch to using exceptions got rid of all that.
16588+ */
16589+
16590+static void __init test_wp_bit(void)
16591+{
16592+ printk("Checking if this processor honours the WP bit even in supervisor mode... ");
16593+
16594+ /* Any page-aligned address will do, the test is non-destructive */
16595+ __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
16596+ boot_cpu_data.wp_works_ok = do_test_wp_bit();
16597+ clear_fixmap(FIX_WP_TEST);
16598+
16599+ if (!boot_cpu_data.wp_works_ok) {
16600+ printk("No.\n");
16601+#ifdef CONFIG_X86_WP_WORKS_OK
16602+		panic("This kernel doesn't support CPUs with broken WP. Recompile it for a 386!");
16603+#endif
16604+ } else {
16605+ printk("Ok.\n");
16606+ }
16607+}
16608+
16609+static void __init set_max_mapnr_init(void)
16610+{
16611+#ifdef CONFIG_HIGHMEM
16612+ num_physpages = highend_pfn;
16613+#else
16614+ num_physpages = max_low_pfn;
16615+#endif
16616+#ifdef CONFIG_FLATMEM
16617+ max_mapnr = num_physpages;
16618+#endif
16619+}
16620+
16621+static struct kcore_list kcore_mem, kcore_vmalloc;
16622+
16623+void __init mem_init(void)
16624+{
16625+ extern int ppro_with_ram_bug(void);
16626+ int codesize, reservedpages, datasize, initsize;
16627+ int tmp;
16628+ int bad_ppro;
16629+ unsigned long pfn;
16630+
16631+#if defined(CONFIG_SWIOTLB)
16632+ swiotlb_init();
16633+#endif
16634+
16635+#ifdef CONFIG_FLATMEM
16636+ if (!mem_map)
16637+ BUG();
16638+#endif
16639+
16640+ bad_ppro = ppro_with_ram_bug();
16641+
16642+#ifdef CONFIG_HIGHMEM
16643+ /* check that fixmap and pkmap do not overlap */
16644+ if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
16645+ printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
16646+ printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
16647+ PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
16648+ BUG();
16649+ }
16650+#endif
16651+
16652+ set_max_mapnr_init();
16653+
16654+#ifdef CONFIG_HIGHMEM
16655+ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
16656+#else
16657+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
16658+#endif
16659+ printk("vmalloc area: %lx-%lx, maxmem %lx\n",
16660+ VMALLOC_START,VMALLOC_END,MAXMEM);
16661+ BUG_ON(VMALLOC_START > VMALLOC_END);
16662+
16663+ /* this will put all low memory onto the freelists */
16664+ totalram_pages += free_all_bootmem();
16665+ /* XEN: init and count low-mem pages outside initial allocation. */
16666+ for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) {
16667+ ClearPageReserved(pfn_to_page(pfn));
16668+ init_page_count(pfn_to_page(pfn));
16669+ totalram_pages++;
16670+ }
16671+
16672+ reservedpages = 0;
16673+ for (tmp = 0; tmp < max_low_pfn; tmp++)
16674+ /*
16675+ * Only count reserved RAM pages
16676+ */
16677+ if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
16678+ reservedpages++;
16679+
16680+ set_highmem_pages_init(bad_ppro);
16681+
16682+ codesize = (unsigned long) &_etext - (unsigned long) &_text;
16683+ datasize = (unsigned long) &_edata - (unsigned long) &_etext;
16684+ initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
16685+
16686+ kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
16687+ kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
16688+ VMALLOC_END-VMALLOC_START);
16689+
16690+ printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
16691+ (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
16692+ num_physpages << (PAGE_SHIFT-10),
16693+ codesize >> 10,
16694+ reservedpages << (PAGE_SHIFT-10),
16695+ datasize >> 10,
16696+ initsize >> 10,
16697+ (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
16698+ );
16699+
16700+#ifdef CONFIG_X86_PAE
16701+ if (!cpu_has_pae)
16702+ panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
16703+#endif
16704+ if (boot_cpu_data.wp_works_ok < 0)
16705+ test_wp_bit();
16706+
16707+ /*
16708+ * Subtle. SMP is doing its boot stuff late (because it has to
16709+ * fork idle threads) - but it also needs low mappings for the
16710+ * protected-mode entry to work. We zap these entries only after
16711+ * the WP-bit has been tested.
16712+ */
16713+#ifndef CONFIG_SMP
16714+ zap_low_mappings();
16715+#endif
16716+
16717+ set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags);
16718+}
16719+
16720+/*
16721+ * this is for the non-NUMA, single node SMP system case.
16722+ * Specifically, in the case of x86, we will always add
16723+ * memory to the highmem for now.
16724+ */
16725+#ifdef CONFIG_MEMORY_HOTPLUG
16726+#ifndef CONFIG_NEED_MULTIPLE_NODES
16727+int arch_add_memory(int nid, u64 start, u64 size)
16728+{
16729+ struct pglist_data *pgdata = &contig_page_data;
16730+ struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
16731+ unsigned long start_pfn = start >> PAGE_SHIFT;
16732+ unsigned long nr_pages = size >> PAGE_SHIFT;
16733+
16734+ return __add_pages(zone, start_pfn, nr_pages);
16735+}
16736+
16737+int remove_memory(u64 start, u64 size)
16738+{
16739+ return -EINVAL;
16740+}
16741+#endif
16742+#endif
16743+
16744+kmem_cache_t *pgd_cache;
16745+kmem_cache_t *pmd_cache;
16746+
16747+void __init pgtable_cache_init(void)
16748+{
16749+ if (PTRS_PER_PMD > 1) {
16750+ pmd_cache = kmem_cache_create("pmd",
16751+ PTRS_PER_PMD*sizeof(pmd_t),
16752+ PTRS_PER_PMD*sizeof(pmd_t),
16753+ 0,
16754+ pmd_ctor,
16755+ NULL);
16756+ if (!pmd_cache)
16757+ panic("pgtable_cache_init(): cannot create pmd cache");
16758+ }
16759+ pgd_cache = kmem_cache_create("pgd",
16760+#ifndef CONFIG_XEN
16761+ PTRS_PER_PGD*sizeof(pgd_t),
16762+ PTRS_PER_PGD*sizeof(pgd_t),
16763+#else
16764+ PAGE_SIZE,
16765+ PAGE_SIZE,
16766+#endif
16767+ 0,
16768+ pgd_ctor,
16769+ PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
16770+ if (!pgd_cache)
16771+ panic("pgtable_cache_init(): Cannot create pgd cache");
16772+}
16773+
16774+/*
16775+ * This function cannot be __init, since exceptions don't work in that
16776+ * section. Put this after the callers, so that it cannot be inlined.
16777+ */
16778+static int noinline do_test_wp_bit(void)
16779+{
16780+ char tmp_reg;
16781+ int flag;
16782+
16783+ __asm__ __volatile__(
16784+ " movb %0,%1 \n"
16785+ "1: movb %1,%0 \n"
16786+ " xorl %2,%2 \n"
16787+ "2: \n"
16788+ ".section __ex_table,\"a\"\n"
16789+ " .align 4 \n"
16790+ " .long 1b,2b \n"
16791+ ".previous \n"
16792+ :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
16793+ "=q" (tmp_reg),
16794+ "=r" (flag)
16795+ :"2" (1)
16796+ :"memory");
16797+
16798+ return flag;
16799+}
16800+
16801+#ifdef CONFIG_DEBUG_RODATA
16802+
16803+void mark_rodata_ro(void)
16804+{
16805+ unsigned long addr = (unsigned long)__start_rodata;
16806+
16807+ for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
16808+ change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
16809+
16810+ printk("Write protecting the kernel read-only data: %uk\n",
16811+ (__end_rodata - __start_rodata) >> 10);
16812+
16813+ /*
16814+ * change_page_attr() requires a global_flush_tlb() call after it.
16815+ * We do this after the printk so that if something went wrong in the
16816+ * change, the printk gets out at least to give a better debug hint
16817+ * of who is the culprit.
16818+ */
16819+ global_flush_tlb();
16820+}
16821+#endif
16822+
16823+void free_init_pages(char *what, unsigned long begin, unsigned long end)
16824+{
16825+ unsigned long addr;
16826+
16827+ for (addr = begin; addr < end; addr += PAGE_SIZE) {
16828+ ClearPageReserved(virt_to_page(addr));
16829+ init_page_count(virt_to_page(addr));
16830+ memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
16831+ free_page(addr);
16832+ totalram_pages++;
16833+ }
16834+ printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
16835+}
16836+
16837+void free_initmem(void)
16838+{
16839+ free_init_pages("unused kernel memory",
16840+ (unsigned long)(&__init_begin),
16841+ (unsigned long)(&__init_end));
16842+}
16843+
16844+#ifdef CONFIG_BLK_DEV_INITRD
16845+void free_initrd_mem(unsigned long start, unsigned long end)
16846+{
16847+ free_init_pages("initrd memory", start, end);
16848+}
16849+#endif
16850+
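A side note on the unit arithmetic in mem_init() above: the "Memory: %luk/%luk available ..." printk converts page counts to kilobytes by shifting left by (PAGE_SHIFT - 10). A minimal standalone sketch of that conversion, assuming 4 KiB pages (PAGE_SHIFT = 12) as on i386; the page count is illustrative only:

/* Sketch (not part of the patch): pages -> KiB as done in mem_init(). */
#include <stdio.h>

#define PAGE_SHIFT 12   /* assumption: 4 KiB pages, as on i386 */

int main(void)
{
        unsigned long nr_pages = 130048;                /* e.g. ~508 MiB worth of pages */
        unsigned long kib = nr_pages << (PAGE_SHIFT - 10);  /* shift by 2 == multiply by 4 */

        printf("%lu pages = %luk\n", nr_pages, kib);    /* 130048 pages = 520192k */
        return 0;
}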
16851Index: head-2008-11-25/arch/x86/mm/ioremap_32-xen.c
16852===================================================================
16853--- /dev/null 1970-01-01 00:00:00.000000000 +0000
16854+++ head-2008-11-25/arch/x86/mm/ioremap_32-xen.c 2008-04-02 12:34:02.000000000 +0200
16855@@ -0,0 +1,443 @@
16856+/*
16857+ * arch/i386/mm/ioremap.c
16858+ *
16859+ * Re-map IO memory to kernel address space so that we can access it.
16860+ * This is needed for high PCI addresses that aren't mapped in the
16861+ * 640k-1MB IO memory area on PC's
16862+ *
16863+ * (C) Copyright 1995 1996 Linus Torvalds
16864+ */
16865+
16866+#include <linux/vmalloc.h>
16867+#include <linux/init.h>
16868+#include <linux/slab.h>
16869+#include <linux/module.h>
16870+#include <asm/io.h>
16871+#include <asm/fixmap.h>
16872+#include <asm/cacheflush.h>
16873+#include <asm/tlbflush.h>
16874+#include <asm/pgtable.h>
16875+#include <asm/pgalloc.h>
16876+
16877+#define ISA_START_ADDRESS 0x0
16878+#define ISA_END_ADDRESS 0x100000
16879+
16880+static int direct_remap_area_pte_fn(pte_t *pte,
16881+ struct page *pmd_page,
16882+ unsigned long address,
16883+ void *data)
16884+{
16885+ mmu_update_t **v = (mmu_update_t **)data;
16886+
16887+ BUG_ON(!pte_none(*pte));
16888+
16889+ (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
16890+ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
16891+ (*v)++;
16892+
16893+ return 0;
16894+}
16895+
16896+static int __direct_remap_pfn_range(struct mm_struct *mm,
16897+ unsigned long address,
16898+ unsigned long mfn,
16899+ unsigned long size,
16900+ pgprot_t prot,
16901+ domid_t domid)
16902+{
16903+ int rc;
16904+ unsigned long i, start_address;
16905+ mmu_update_t *u, *v, *w;
16906+
16907+ u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
16908+ if (u == NULL)
16909+ return -ENOMEM;
16910+
16911+ start_address = address;
16912+
16913+ flush_cache_all();
16914+
16915+ for (i = 0; i < size; i += PAGE_SIZE) {
16916+ if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
16917+ /* Flush a full batch after filling in the PTE ptrs. */
16918+ rc = apply_to_page_range(mm, start_address,
16919+ address - start_address,
16920+ direct_remap_area_pte_fn, &w);
16921+ if (rc)
16922+ goto out;
16923+ rc = -EFAULT;
16924+ if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
16925+ goto out;
16926+ v = w = u;
16927+ start_address = address;
16928+ }
16929+
16930+ /*
16931+ * Fill in the machine address: PTE ptr is done later by
16932+ * apply_to_page_range().
16933+ */
16934+ v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
16935+
16936+ mfn++;
16937+ address += PAGE_SIZE;
16938+ v++;
16939+ }
16940+
16941+ if (v != u) {
16942+ /* Final batch. */
16943+ rc = apply_to_page_range(mm, start_address,
16944+ address - start_address,
16945+ direct_remap_area_pte_fn, &w);
16946+ if (rc)
16947+ goto out;
16948+ rc = -EFAULT;
16949+ if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
16950+ goto out;
16951+ }
16952+
16953+ rc = 0;
16954+
16955+ out:
16956+ flush_tlb_all();
16957+
16958+ free_page((unsigned long)u);
16959+
16960+ return rc;
16961+}
16962+
16963+int direct_remap_pfn_range(struct vm_area_struct *vma,
16964+ unsigned long address,
16965+ unsigned long mfn,
16966+ unsigned long size,
16967+ pgprot_t prot,
16968+ domid_t domid)
16969+{
16970+ if (xen_feature(XENFEAT_auto_translated_physmap))
16971+ return remap_pfn_range(vma, address, mfn, size, prot);
16972+
16973+ if (domid == DOMID_SELF)
16974+ return -EINVAL;
16975+
16976+ vma->vm_flags |= VM_IO | VM_RESERVED;
16977+
16978+ vma->vm_mm->context.has_foreign_mappings = 1;
16979+
16980+ return __direct_remap_pfn_range(
16981+ vma->vm_mm, address, mfn, size, prot, domid);
16982+}
16983+EXPORT_SYMBOL(direct_remap_pfn_range);
16984+
16985+int direct_kernel_remap_pfn_range(unsigned long address,
16986+ unsigned long mfn,
16987+ unsigned long size,
16988+ pgprot_t prot,
16989+ domid_t domid)
16990+{
16991+ return __direct_remap_pfn_range(
16992+ &init_mm, address, mfn, size, prot, domid);
16993+}
16994+EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
16995+
16996+static int lookup_pte_fn(
16997+ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
16998+{
16999+ uint64_t *ptep = (uint64_t *)data;
17000+ if (ptep)
17001+ *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
17002+ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
17003+ return 0;
17004+}
17005+
17006+int create_lookup_pte_addr(struct mm_struct *mm,
17007+ unsigned long address,
17008+ uint64_t *ptep)
17009+{
17010+ return apply_to_page_range(mm, address, PAGE_SIZE,
17011+ lookup_pte_fn, ptep);
17012+}
17013+
17014+EXPORT_SYMBOL(create_lookup_pte_addr);
17015+
17016+static int noop_fn(
17017+ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
17018+{
17019+ return 0;
17020+}
17021+
17022+int touch_pte_range(struct mm_struct *mm,
17023+ unsigned long address,
17024+ unsigned long size)
17025+{
17026+ return apply_to_page_range(mm, address, size, noop_fn, NULL);
17027+}
17028+
17029+EXPORT_SYMBOL(touch_pte_range);
17030+
17031+/*
17032+ * Does @address reside within a non-highmem page that is local to this virtual
17033+ * machine (i.e., not an I/O page, nor a memory page belonging to another VM).
17034+ * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
17035+ * why this works.
17036+ */
17037+static inline int is_local_lowmem(unsigned long address)
17038+{
17039+ extern unsigned long max_low_pfn;
17040+ return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
17041+}
17042+
17043+/*
17044+ * Generic mapping function (not visible outside):
17045+ */
17046+
17047+/*
17048+ * Remap an arbitrary physical address space into the kernel virtual
17049+ * address space. Needed when the kernel wants to access high addresses
17050+ * directly.
17051+ *
17052+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
17053+ * have to convert them into an offset in a page-aligned mapping, but the
17054+ * caller shouldn't need to know that small detail.
17055+ */
17056+void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
17057+{
17058+ void __iomem * addr;
17059+ struct vm_struct * area;
17060+ unsigned long offset, last_addr;
17061+ domid_t domid = DOMID_IO;
17062+
17063+ /* Don't allow wraparound or zero size */
17064+ last_addr = phys_addr + size - 1;
17065+ if (!size || last_addr < phys_addr)
17066+ return NULL;
17067+
17068+ /*
17069+ * Don't remap the low PCI/ISA area, it's always mapped..
17070+ */
17071+ if (is_initial_xendomain() &&
17072+ phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
17073+ return (void __iomem *) isa_bus_to_virt(phys_addr);
17074+
17075+ /*
17076+ * Don't allow anybody to remap normal RAM that we're using..
17077+ */
17078+ if (is_local_lowmem(phys_addr)) {
17079+ char *t_addr, *t_end;
17080+ struct page *page;
17081+
17082+ t_addr = bus_to_virt(phys_addr);
17083+ t_end = t_addr + (size - 1);
17084+
17085+ for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
17086+ if(!PageReserved(page))
17087+ return NULL;
17088+
17089+ domid = DOMID_SELF;
17090+ }
17091+
17092+ /*
17093+ * Mappings have to be page-aligned
17094+ */
17095+ offset = phys_addr & ~PAGE_MASK;
17096+ phys_addr &= PAGE_MASK;
17097+ size = PAGE_ALIGN(last_addr+1) - phys_addr;
17098+
17099+ /*
17100+ * Ok, go for it..
17101+ */
17102+ area = get_vm_area(size, VM_IOREMAP | (flags << 20));
17103+ if (!area)
17104+ return NULL;
17105+ area->phys_addr = phys_addr;
17106+ addr = (void __iomem *) area->addr;
17107+ flags |= _KERNPG_TABLE;
17108+ if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
17109+ phys_addr>>PAGE_SHIFT,
17110+ size, __pgprot(flags), domid)) {
17111+ vunmap((void __force *) addr);
17112+ return NULL;
17113+ }
17114+ return (void __iomem *) (offset + (char __iomem *)addr);
17115+}
17116+EXPORT_SYMBOL(__ioremap);
17117+
17118+/**
17119+ * ioremap_nocache - map bus memory into CPU space
17120+ * @offset: bus address of the memory
17121+ * @size: size of the resource to map
17122+ *
17123+ * ioremap_nocache performs a platform specific sequence of operations to
17124+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
17125+ * writew/writel functions and the other mmio helpers. The returned
17126+ * address is not guaranteed to be usable directly as a virtual
17127+ * address.
17128+ *
17129+ * This version of ioremap ensures that the memory is marked uncachable
17130+ * on the CPU as well as honouring existing caching rules from things like
17131+ * the PCI bus. Note that there are other caches and buffers on many
17132+ * busses. In particular driver authors should read up on PCI writes
17133+ *
17134+ * It's useful if some control registers are in such an area and
17135+ * write combining or read caching is not desirable:
17136+ *
17137+ * Must be freed with iounmap.
17138+ */
17139+
17140+void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
17141+{
17142+ unsigned long last_addr;
17143+ void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
17144+ if (!p)
17145+ return p;
17146+
17147+ /* Guaranteed to be > phys_addr, as per __ioremap() */
17148+ last_addr = phys_addr + size - 1;
17149+
17150+ if (is_local_lowmem(last_addr)) {
17151+ struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
17152+ unsigned long npages;
17153+
17154+ phys_addr &= PAGE_MASK;
17155+
17156+ /* This might overflow and become zero.. */
17157+ last_addr = PAGE_ALIGN(last_addr);
17158+
17159+ /* .. but that's ok, because modulo-2**n arithmetic will make
17160+ * the page-aligned "last - first" come out right.
17161+ */
17162+ npages = (last_addr - phys_addr) >> PAGE_SHIFT;
17163+
17164+ if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
17165+ iounmap(p);
17166+ p = NULL;
17167+ }
17168+ global_flush_tlb();
17169+ }
17170+
17171+ return p;
17172+}
17173+EXPORT_SYMBOL(ioremap_nocache);
17174+
17175+/**
17176+ * iounmap - Free an IO remapping
17177+ * @addr: virtual address from ioremap_*
17178+ *
17179+ * Caller must ensure there is only one unmapping for the same pointer.
17180+ */
17181+void iounmap(volatile void __iomem *addr)
17182+{
17183+ struct vm_struct *p, *o;
17184+
17185+ if ((void __force *)addr <= high_memory)
17186+ return;
17187+
17188+ /*
17189+ * __ioremap special-cases the PCI/ISA range by not instantiating a
17190+ * vm_area and by simply returning an address into the kernel mapping
17191+ * of ISA space. So handle that here.
17192+ */
17193+ if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
17194+ return;
17195+
17196+ addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
17197+
17198+ /* Use the vm area unlocked, assuming the caller
17199+ ensures there isn't another iounmap for the same address
17200+ in parallel. Reuse of the virtual address is prevented by
17201+ leaving it in the global lists until we're done with it.
17202+ cpa takes care of the direct mappings. */
17203+ read_lock(&vmlist_lock);
17204+ for (p = vmlist; p; p = p->next) {
17205+ if (p->addr == addr)
17206+ break;
17207+ }
17208+ read_unlock(&vmlist_lock);
17209+
17210+ if (!p) {
17211+ printk("iounmap: bad address %p\n", addr);
17212+ dump_stack();
17213+ return;
17214+ }
17215+
17216+ /* Reset the direct mapping. Can block */
17217+ if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
17218+ /* p->size includes the guard page, but cpa doesn't like that */
17219+ change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
17220+ (p->size - PAGE_SIZE) >> PAGE_SHIFT,
17221+ PAGE_KERNEL);
17222+ global_flush_tlb();
17223+ }
17224+
17225+ /* Finally remove it */
17226+ o = remove_vm_area((void *)addr);
17227+ BUG_ON(p != o || o == NULL);
17228+ kfree(p);
17229+}
17230+EXPORT_SYMBOL(iounmap);
17231+
17232+void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
17233+{
17234+ unsigned long offset, last_addr;
17235+ unsigned int nrpages;
17236+ enum fixed_addresses idx;
17237+
17238+ /* Don't allow wraparound or zero size */
17239+ last_addr = phys_addr + size - 1;
17240+ if (!size || last_addr < phys_addr)
17241+ return NULL;
17242+
17243+ /*
17244+ * Don't remap the low PCI/ISA area, it's always mapped..
17245+ */
17246+ if (is_initial_xendomain() &&
17247+ phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
17248+ return isa_bus_to_virt(phys_addr);
17249+
17250+ /*
17251+ * Mappings have to be page-aligned
17252+ */
17253+ offset = phys_addr & ~PAGE_MASK;
17254+ phys_addr &= PAGE_MASK;
17255+ size = PAGE_ALIGN(last_addr) - phys_addr;
17256+
17257+ /*
17258+ * Mappings have to fit in the FIX_BTMAP area.
17259+ */
17260+ nrpages = size >> PAGE_SHIFT;
17261+ if (nrpages > NR_FIX_BTMAPS)
17262+ return NULL;
17263+
17264+ /*
17265+ * Ok, go for it..
17266+ */
17267+ idx = FIX_BTMAP_BEGIN;
17268+ while (nrpages > 0) {
17269+ set_fixmap(idx, phys_addr);
17270+ phys_addr += PAGE_SIZE;
17271+ --idx;
17272+ --nrpages;
17273+ }
17274+ return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
17275+}
17276+
17277+void __init bt_iounmap(void *addr, unsigned long size)
17278+{
17279+ unsigned long virt_addr;
17280+ unsigned long offset;
17281+ unsigned int nrpages;
17282+ enum fixed_addresses idx;
17283+
17284+ virt_addr = (unsigned long)addr;
17285+ if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
17286+ return;
17287+ if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
17288+ return;
17289+ offset = virt_addr & ~PAGE_MASK;
17290+ nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
17291+
17292+ idx = FIX_BTMAP_BEGIN;
17293+ while (nrpages > 0) {
17294+ clear_fixmap(idx);
17295+ --idx;
17296+ --nrpages;
17297+ }
17298+}
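For orientation, a minimal usage sketch of the ioremap_nocache()/iounmap() pair implemented above, from the point of view of a hypothetical driver; the MMIO base address, window size and register offset are made-up placeholders, not values taken from this patch:

/* Hypothetical driver snippet: map a device's MMIO window uncached,
 * poke one register, then release the mapping again. */
#include <linux/errno.h>
#include <asm/io.h>

#define EXAMPLE_MMIO_BASE  0xfebf0000UL    /* assumption: example bus address */
#define EXAMPLE_MMIO_SIZE  0x1000UL        /* assumption: one page of registers */

static int example_probe(void)
{
        void __iomem *regs = ioremap_nocache(EXAMPLE_MMIO_BASE, EXAMPLE_MMIO_SIZE);

        if (!regs)
                return -ENOMEM;

        writel(0x1, regs + 0x10);   /* hypothetical control register */
        iounmap(regs);              /* caller must ensure a single unmap */
        return 0;
}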
17299Index: head-2008-11-25/arch/x86/mm/pgtable_32-xen.c
17300===================================================================
17301--- /dev/null 1970-01-01 00:00:00.000000000 +0000
17302+++ head-2008-11-25/arch/x86/mm/pgtable_32-xen.c 2007-10-09 11:48:25.000000000 +0200
17303@@ -0,0 +1,725 @@
17304+/*
17305+ * linux/arch/i386/mm/pgtable.c
17306+ */
17307+
17308+#include <linux/sched.h>
17309+#include <linux/kernel.h>
17310+#include <linux/errno.h>
17311+#include <linux/mm.h>
17312+#include <linux/swap.h>
17313+#include <linux/smp.h>
17314+#include <linux/highmem.h>
17315+#include <linux/slab.h>
17316+#include <linux/pagemap.h>
17317+#include <linux/spinlock.h>
17318+#include <linux/module.h>
17319+
17320+#include <asm/system.h>
17321+#include <asm/pgtable.h>
17322+#include <asm/pgalloc.h>
17323+#include <asm/fixmap.h>
17324+#include <asm/e820.h>
17325+#include <asm/tlb.h>
17326+#include <asm/tlbflush.h>
17327+#include <asm/io.h>
17328+#include <asm/mmu_context.h>
17329+
17330+#include <xen/features.h>
17331+#include <asm/hypervisor.h>
17332+
17333+static void pgd_test_and_unpin(pgd_t *pgd);
17334+
17335+void show_mem(void)
17336+{
17337+ int total = 0, reserved = 0;
17338+ int shared = 0, cached = 0;
17339+ int highmem = 0;
17340+ struct page *page;
17341+ pg_data_t *pgdat;
17342+ unsigned long i;
17343+ unsigned long flags;
17344+
17345+ printk(KERN_INFO "Mem-info:\n");
17346+ show_free_areas();
17347+ printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
17348+ for_each_online_pgdat(pgdat) {
17349+ pgdat_resize_lock(pgdat, &flags);
17350+ for (i = 0; i < pgdat->node_spanned_pages; ++i) {
17351+ page = pgdat_page_nr(pgdat, i);
17352+ total++;
17353+ if (PageHighMem(page))
17354+ highmem++;
17355+ if (PageReserved(page))
17356+ reserved++;
17357+ else if (PageSwapCache(page))
17358+ cached++;
17359+ else if (page_count(page))
17360+ shared += page_count(page) - 1;
17361+ }
17362+ pgdat_resize_unlock(pgdat, &flags);
17363+ }
17364+ printk(KERN_INFO "%d pages of RAM\n", total);
17365+ printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
17366+ printk(KERN_INFO "%d reserved pages\n", reserved);
17367+ printk(KERN_INFO "%d pages shared\n", shared);
17368+ printk(KERN_INFO "%d pages swap cached\n", cached);
17369+
17370+ printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
17371+ printk(KERN_INFO "%lu pages writeback\n",
17372+ global_page_state(NR_WRITEBACK));
17373+ printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
17374+ printk(KERN_INFO "%lu pages slab\n", global_page_state(NR_SLAB));
17375+ printk(KERN_INFO "%lu pages pagetables\n",
17376+ global_page_state(NR_PAGETABLE));
17377+}
17378+
17379+/*
17380+ * Associate a large virtual page frame with a given physical page frame
17381+ * and protection flags for that frame. pfn is for the base of the page,
17382+ * vaddr is what the page gets mapped to - both must be properly aligned.
17383+ * The pmd must already be instantiated. Assumes PAE mode.
17384+ */
17385+void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
17386+{
17387+ pgd_t *pgd;
17388+ pud_t *pud;
17389+ pmd_t *pmd;
17390+
17391+ if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
17392+ printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
17393+ return; /* BUG(); */
17394+ }
17395+ if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
17396+ printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
17397+ return; /* BUG(); */
17398+ }
17399+ pgd = swapper_pg_dir + pgd_index(vaddr);
17400+ if (pgd_none(*pgd)) {
17401+ printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
17402+ return; /* BUG(); */
17403+ }
17404+ pud = pud_offset(pgd, vaddr);
17405+ pmd = pmd_offset(pud, vaddr);
17406+ set_pmd(pmd, pfn_pmd(pfn, flags));
17407+ /*
17408+ * It's enough to flush this one mapping.
17409+ * (PGE mappings get flushed as well)
17410+ */
17411+ __flush_tlb_one(vaddr);
17412+}
17413+
17414+static int nr_fixmaps = 0;
17415+unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
17416+unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE);
17417+EXPORT_SYMBOL(__FIXADDR_TOP);
17418+
17419+void __init set_fixaddr_top(unsigned long top)
17420+{
17421+ BUG_ON(nr_fixmaps > 0);
17422+ hypervisor_virt_start = top;
17423+ __FIXADDR_TOP = hypervisor_virt_start - 2 * PAGE_SIZE;
17424+}
17425+
17426+void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
17427+{
17428+ unsigned long address = __fix_to_virt(idx);
17429+ pte_t pte;
17430+
17431+ if (idx >= __end_of_fixed_addresses) {
17432+ BUG();
17433+ return;
17434+ }
17435+ switch (idx) {
17436+ case FIX_WP_TEST:
17437+ case FIX_VDSO:
17438+ pte = pfn_pte(phys >> PAGE_SHIFT, flags);
17439+ break;
17440+ default:
17441+ pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
17442+ break;
17443+ }
17444+ if (HYPERVISOR_update_va_mapping(address, pte,
17445+ UVMF_INVLPG|UVMF_ALL))
17446+ BUG();
17447+ nr_fixmaps++;
17448+}
17449+
17450+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
17451+{
17452+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
17453+ if (pte)
17454+ make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
17455+ return pte;
17456+}
17457+
17458+struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
17459+{
17460+ struct page *pte;
17461+
17462+#ifdef CONFIG_HIGHPTE
17463+ pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
17464+#else
17465+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
17466+#endif
17467+ if (pte) {
17468+ SetPageForeign(pte, pte_free);
17469+ init_page_count(pte);
17470+ }
17471+ return pte;
17472+}
17473+
17474+void pte_free(struct page *pte)
17475+{
17476+ unsigned long pfn = page_to_pfn(pte);
17477+
17478+ if (!PageHighMem(pte)) {
17479+ unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT);
17480+
17481+ if (!pte_write(*virt_to_ptep(va)))
17482+ if (HYPERVISOR_update_va_mapping(
17483+ va, pfn_pte(pfn, PAGE_KERNEL), 0))
17484+ BUG();
17485+ } else
17486+ clear_bit(PG_pinned, &pte->flags);
17487+
17488+ ClearPageForeign(pte);
17489+ init_page_count(pte);
17490+
17491+ __free_page(pte);
17492+}
17493+
17494+void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
17495+{
17496+ memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
17497+}
17498+
17499+/*
17500+ * List of all pgd's needed for non-PAE so it can invalidate entries
17501+ * in both cached and uncached pgd's; not needed for PAE since the
17502+ * kernel pmd is shared. If PAE were not to share the pmd a similar
17503+ * tactic would be needed. This is essentially codepath-based locking
17504+ * against pageattr.c; it is the unique case in which a valid change
17505+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
17506+ * vmalloc faults work because attached pagetables are never freed.
17507+ * The locking scheme was chosen on the basis of manfred's
17508+ * recommendations and having no core impact whatsoever.
17509+ * -- wli
17510+ */
17511+DEFINE_SPINLOCK(pgd_lock);
17512+struct page *pgd_list;
17513+
17514+static inline void pgd_list_add(pgd_t *pgd)
17515+{
17516+ struct page *page = virt_to_page(pgd);
17517+ page->index = (unsigned long)pgd_list;
17518+ if (pgd_list)
17519+ set_page_private(pgd_list, (unsigned long)&page->index);
17520+ pgd_list = page;
17521+ set_page_private(page, (unsigned long)&pgd_list);
17522+}
17523+
17524+static inline void pgd_list_del(pgd_t *pgd)
17525+{
17526+ struct page *next, **pprev, *page = virt_to_page(pgd);
17527+ next = (struct page *)page->index;
17528+ pprev = (struct page **)page_private(page);
17529+ *pprev = next;
17530+ if (next)
17531+ set_page_private(next, (unsigned long)pprev);
17532+}
17533+
17534+void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
17535+{
17536+ unsigned long flags;
17537+
17538+ if (PTRS_PER_PMD > 1) {
17539+ if (HAVE_SHARED_KERNEL_PMD)
17540+ clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
17541+ swapper_pg_dir + USER_PTRS_PER_PGD,
17542+ KERNEL_PGD_PTRS);
17543+ } else {
17544+ spin_lock_irqsave(&pgd_lock, flags);
17545+ clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
17546+ swapper_pg_dir + USER_PTRS_PER_PGD,
17547+ KERNEL_PGD_PTRS);
17548+ memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
17549+ pgd_list_add(pgd);
17550+ spin_unlock_irqrestore(&pgd_lock, flags);
17551+ }
17552+}
17553+
17554+/* never called when PTRS_PER_PMD > 1 */
17555+void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
17556+{
17557+ unsigned long flags; /* can be called from interrupt context */
17558+
17559+ spin_lock_irqsave(&pgd_lock, flags);
17560+ pgd_list_del(pgd);
17561+ spin_unlock_irqrestore(&pgd_lock, flags);
17562+
17563+ pgd_test_and_unpin(pgd);
17564+}
17565+
17566+pgd_t *pgd_alloc(struct mm_struct *mm)
17567+{
17568+ int i;
17569+ pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
17570+ pmd_t **pmd;
17571+ unsigned long flags;
17572+
17573+ pgd_test_and_unpin(pgd);
17574+
17575+ if (PTRS_PER_PMD == 1 || !pgd)
17576+ return pgd;
17577+
17578+ if (HAVE_SHARED_KERNEL_PMD) {
17579+ for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
17580+ pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
17581+ if (!pmd)
17582+ goto out_oom;
17583+ set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
17584+ }
17585+ return pgd;
17586+ }
17587+
17588+ /*
17589+ * We can race save/restore (if we sleep during a GFP_KERNEL memory
17590+ * allocation). We therefore store virtual addresses of pmds as they
17591+ * do not change across save/restore, and poke the machine addresses
17592+ * into the pgdir under the pgd_lock.
17593+ */
17594+ pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
17595+ if (!pmd) {
17596+ kmem_cache_free(pgd_cache, pgd);
17597+ return NULL;
17598+ }
17599+
17600+ /* Allocate pmds, remember virtual addresses. */
17601+ for (i = 0; i < PTRS_PER_PGD; ++i) {
17602+ pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
17603+ if (!pmd[i])
17604+ goto out_oom;
17605+ }
17606+
17607+ spin_lock_irqsave(&pgd_lock, flags);
17608+
17609+ /* Protect against save/restore: move below 4GB under pgd_lock. */
17610+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
17611+ int rc = xen_create_contiguous_region(
17612+ (unsigned long)pgd, 0, 32);
17613+ if (rc) {
17614+ spin_unlock_irqrestore(&pgd_lock, flags);
17615+ goto out_oom;
17616+ }
17617+ }
17618+
17619+ /* Copy kernel pmd contents and write-protect the new pmds. */
17620+ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
17621+ unsigned long v = (unsigned long)i << PGDIR_SHIFT;
17622+ pgd_t *kpgd = pgd_offset_k(v);
17623+ pud_t *kpud = pud_offset(kpgd, v);
17624+ pmd_t *kpmd = pmd_offset(kpud, v);
17625+ memcpy(pmd[i], kpmd, PAGE_SIZE);
17626+ make_lowmem_page_readonly(
17627+ pmd[i], XENFEAT_writable_page_tables);
17628+ }
17629+
17630+ /* It is safe to poke machine addresses of pmds under the pmd_lock. */
17631+ for (i = 0; i < PTRS_PER_PGD; i++)
17632+ set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i])));
17633+
17634+ /* Ensure this pgd gets picked up and pinned on save/restore. */
17635+ pgd_list_add(pgd);
17636+
17637+ spin_unlock_irqrestore(&pgd_lock, flags);
17638+
17639+ kfree(pmd);
17640+
17641+ return pgd;
17642+
17643+out_oom:
17644+ if (HAVE_SHARED_KERNEL_PMD) {
17645+ for (i--; i >= 0; i--)
17646+ kmem_cache_free(pmd_cache,
17647+ (void *)__va(pgd_val(pgd[i])-1));
17648+ } else {
17649+ for (i--; i >= 0; i--)
17650+ kmem_cache_free(pmd_cache, pmd[i]);
17651+ kfree(pmd);
17652+ }
17653+ kmem_cache_free(pgd_cache, pgd);
17654+ return NULL;
17655+}
17656+
17657+void pgd_free(pgd_t *pgd)
17658+{
17659+ int i;
17660+
17661+ /*
17662+ * After this the pgd should not be pinned for the duration of this
17663+ * function's execution. We should never sleep and thus never race:
17664+ * 1. User pmds will not become write-protected under our feet due
17665+ * to a concurrent mm_pin_all().
17666+ * 2. The machine addresses in PGD entries will not become invalid
17667+ * due to a concurrent save/restore.
17668+ */
17669+ pgd_test_and_unpin(pgd);
17670+
17671+ /* in the PAE case user pgd entries are overwritten before usage */
17672+ if (PTRS_PER_PMD > 1) {
17673+ for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
17674+ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
17675+ kmem_cache_free(pmd_cache, pmd);
17676+ }
17677+
17678+ if (!HAVE_SHARED_KERNEL_PMD) {
17679+ unsigned long flags;
17680+ spin_lock_irqsave(&pgd_lock, flags);
17681+ pgd_list_del(pgd);
17682+ spin_unlock_irqrestore(&pgd_lock, flags);
17683+
17684+ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
17685+ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
17686+ make_lowmem_page_writable(
17687+ pmd, XENFEAT_writable_page_tables);
17688+ memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
17689+ kmem_cache_free(pmd_cache, pmd);
17690+ }
17691+
17692+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
17693+ xen_destroy_contiguous_region(
17694+ (unsigned long)pgd, 0);
17695+ }
17696+ }
17697+
17698+ /* in the non-PAE case, free_pgtables() clears user pgd entries */
17699+ kmem_cache_free(pgd_cache, pgd);
17700+}
17701+
17702+void make_lowmem_page_readonly(void *va, unsigned int feature)
17703+{
17704+ pte_t *pte;
17705+ int rc;
17706+
17707+ if (xen_feature(feature))
17708+ return;
17709+
17710+ pte = virt_to_ptep(va);
17711+ rc = HYPERVISOR_update_va_mapping(
17712+ (unsigned long)va, pte_wrprotect(*pte), 0);
17713+ BUG_ON(rc);
17714+}
17715+
17716+void make_lowmem_page_writable(void *va, unsigned int feature)
17717+{
17718+ pte_t *pte;
17719+ int rc;
17720+
17721+ if (xen_feature(feature))
17722+ return;
17723+
17724+ pte = virt_to_ptep(va);
17725+ rc = HYPERVISOR_update_va_mapping(
17726+ (unsigned long)va, pte_mkwrite(*pte), 0);
17727+ BUG_ON(rc);
17728+}
17729+
17730+void make_page_readonly(void *va, unsigned int feature)
17731+{
17732+ pte_t *pte;
17733+ int rc;
17734+
17735+ if (xen_feature(feature))
17736+ return;
17737+
17738+ pte = virt_to_ptep(va);
17739+ rc = HYPERVISOR_update_va_mapping(
17740+ (unsigned long)va, pte_wrprotect(*pte), 0);
17741+ if (rc) /* fallback? */
17742+ xen_l1_entry_update(pte, pte_wrprotect(*pte));
17743+ if ((unsigned long)va >= (unsigned long)high_memory) {
17744+ unsigned long pfn = pte_pfn(*pte);
17745+#ifdef CONFIG_HIGHMEM
17746+ if (pfn >= highstart_pfn)
17747+ kmap_flush_unused(); /* flush stale writable kmaps */
17748+ else
17749+#endif
17750+ make_lowmem_page_readonly(
17751+ phys_to_virt(pfn << PAGE_SHIFT), feature);
17752+ }
17753+}
17754+
17755+void make_page_writable(void *va, unsigned int feature)
17756+{
17757+ pte_t *pte;
17758+ int rc;
17759+
17760+ if (xen_feature(feature))
17761+ return;
17762+
17763+ pte = virt_to_ptep(va);
17764+ rc = HYPERVISOR_update_va_mapping(
17765+ (unsigned long)va, pte_mkwrite(*pte), 0);
17766+ if (rc) /* fallback? */
17767+ xen_l1_entry_update(pte, pte_mkwrite(*pte));
17768+ if ((unsigned long)va >= (unsigned long)high_memory) {
17769+ unsigned long pfn = pte_pfn(*pte);
17770+#ifdef CONFIG_HIGHMEM
17771+ if (pfn < highstart_pfn)
17772+#endif
17773+ make_lowmem_page_writable(
17774+ phys_to_virt(pfn << PAGE_SHIFT), feature);
17775+ }
17776+}
17777+
17778+void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
17779+{
17780+ if (xen_feature(feature))
17781+ return;
17782+
17783+ while (nr-- != 0) {
17784+ make_page_readonly(va, feature);
17785+ va = (void *)((unsigned long)va + PAGE_SIZE);
17786+ }
17787+}
17788+
17789+void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
17790+{
17791+ if (xen_feature(feature))
17792+ return;
17793+
17794+ while (nr-- != 0) {
17795+ make_page_writable(va, feature);
17796+ va = (void *)((unsigned long)va + PAGE_SIZE);
17797+ }
17798+}
17799+
17800+static void _pin_lock(struct mm_struct *mm, int lock) {
17801+ if (lock)
17802+ spin_lock(&mm->page_table_lock);
17803+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
17804+ /* While mm->page_table_lock protects us against insertions and
17805+ * removals of higher level page table pages, it doesn't protect
17806+ * against updates of pte-s. Such updates, however, require the
17807+ * pte pages to be in consistent state (unpinned+writable or
17808+ * pinned+readonly). The pinning and attribute changes, however
17809+ * cannot be done atomically, which is why such updates must be
17810+ * prevented from happening concurrently.
17811+ * Note that no pte lock can ever elsewhere be acquired nesting
17812+ * with an already acquired one in the same mm, or with the mm's
17813+ * page_table_lock already acquired, as that would break in the
17814+ * non-split case (where all these are actually resolving to the
17815+ * one page_table_lock). Thus acquiring all of them here is not
17816+ * going to result in deadlocks, and the order of acquires
17817+ * doesn't matter.
17818+ */
17819+ {
17820+ pgd_t *pgd = mm->pgd;
17821+ unsigned g;
17822+
17823+ for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
17824+ pud_t *pud;
17825+ unsigned u;
17826+
17827+ if (pgd_none(*pgd))
17828+ continue;
17829+ pud = pud_offset(pgd, 0);
17830+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
17831+ pmd_t *pmd;
17832+ unsigned m;
17833+
17834+ if (pud_none(*pud))
17835+ continue;
17836+ pmd = pmd_offset(pud, 0);
17837+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
17838+ spinlock_t *ptl;
17839+
17840+ if (pmd_none(*pmd))
17841+ continue;
17842+ ptl = pte_lockptr(0, pmd);
17843+ if (lock)
17844+ spin_lock(ptl);
17845+ else
17846+ spin_unlock(ptl);
17847+ }
17848+ }
17849+ }
17850+ }
17851+#endif
17852+ if (!lock)
17853+ spin_unlock(&mm->page_table_lock);
17854+}
17855+#define pin_lock(mm) _pin_lock(mm, 1)
17856+#define pin_unlock(mm) _pin_lock(mm, 0)
17857+
17858+#define PIN_BATCH 4
17859+static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
17860+
17861+static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
17862+ unsigned int cpu, unsigned seq)
17863+{
17864+ unsigned long pfn = page_to_pfn(page);
17865+
17866+ if (PageHighMem(page)) {
17867+ if (pgprot_val(flags) & _PAGE_RW)
17868+ clear_bit(PG_pinned, &page->flags);
17869+ else
17870+ set_bit(PG_pinned, &page->flags);
17871+ } else {
17872+ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
17873+ (unsigned long)__va(pfn << PAGE_SHIFT),
17874+ pfn_pte(pfn, flags), 0);
17875+ if (unlikely(++seq == PIN_BATCH)) {
17876+ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
17877+ PIN_BATCH, NULL)))
17878+ BUG();
17879+ seq = 0;
17880+ }
17881+ }
17882+
17883+ return seq;
17884+}
17885+
17886+static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
17887+{
17888+ pgd_t *pgd = pgd_base;
17889+ pud_t *pud;
17890+ pmd_t *pmd;
17891+ int g, u, m;
17892+ unsigned int cpu, seq;
17893+
17894+ if (xen_feature(XENFEAT_auto_translated_physmap))
17895+ return;
17896+
17897+ cpu = get_cpu();
17898+
17899+ for (g = 0, seq = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
17900+ if (pgd_none(*pgd))
17901+ continue;
17902+ pud = pud_offset(pgd, 0);
17903+ if (PTRS_PER_PUD > 1) /* not folded */
17904+ seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
17905+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
17906+ if (pud_none(*pud))
17907+ continue;
17908+ pmd = pmd_offset(pud, 0);
17909+ if (PTRS_PER_PMD > 1) /* not folded */
17910+ seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
17911+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
17912+ if (pmd_none(*pmd))
17913+ continue;
17914+ seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
17915+ }
17916+ }
17917+ }
17918+
17919+ if (likely(seq != 0)) {
17920+ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
17921+ (unsigned long)pgd_base,
17922+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
17923+ UVMF_TLB_FLUSH);
17924+ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
17925+ seq + 1, NULL)))
17926+ BUG();
17927+ } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
17928+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
17929+ UVMF_TLB_FLUSH))
17930+ BUG();
17931+
17932+ put_cpu();
17933+}
17934+
17935+static void __pgd_pin(pgd_t *pgd)
17936+{
17937+ pgd_walk(pgd, PAGE_KERNEL_RO);
17938+ kmap_flush_unused();
17939+ xen_pgd_pin(__pa(pgd));
17940+ set_bit(PG_pinned, &virt_to_page(pgd)->flags);
17941+}
17942+
17943+static void __pgd_unpin(pgd_t *pgd)
17944+{
17945+ xen_pgd_unpin(__pa(pgd));
17946+ pgd_walk(pgd, PAGE_KERNEL);
17947+ clear_bit(PG_pinned, &virt_to_page(pgd)->flags);
17948+}
17949+
17950+static void pgd_test_and_unpin(pgd_t *pgd)
17951+{
17952+ if (test_bit(PG_pinned, &virt_to_page(pgd)->flags))
17953+ __pgd_unpin(pgd);
17954+}
17955+
17956+void mm_pin(struct mm_struct *mm)
17957+{
17958+ if (xen_feature(XENFEAT_writable_page_tables))
17959+ return;
17960+ pin_lock(mm);
17961+ __pgd_pin(mm->pgd);
17962+ pin_unlock(mm);
17963+}
17964+
17965+void mm_unpin(struct mm_struct *mm)
17966+{
17967+ if (xen_feature(XENFEAT_writable_page_tables))
17968+ return;
17969+ pin_lock(mm);
17970+ __pgd_unpin(mm->pgd);
17971+ pin_unlock(mm);
17972+}
17973+
17974+void mm_pin_all(void)
17975+{
17976+ struct page *page;
17977+ unsigned long flags;
17978+
17979+ if (xen_feature(XENFEAT_writable_page_tables))
17980+ return;
17981+
17982+ /*
17983+ * Allow uninterrupted access to the pgd_list. Also protects
17984+ * __pgd_pin() by disabling preemption.
17985+ * All other CPUs must be at a safe point (e.g., in stop_machine
17986+ * or offlined entirely).
17987+ */
17988+ spin_lock_irqsave(&pgd_lock, flags);
17989+ for (page = pgd_list; page; page = (struct page *)page->index) {
17990+ if (!test_bit(PG_pinned, &page->flags))
17991+ __pgd_pin((pgd_t *)page_address(page));
17992+ }
17993+ spin_unlock_irqrestore(&pgd_lock, flags);
17994+}
17995+
17996+void _arch_dup_mmap(struct mm_struct *mm)
17997+{
17998+ if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags))
17999+ mm_pin(mm);
18000+}
18001+
18002+void _arch_exit_mmap(struct mm_struct *mm)
18003+{
18004+ struct task_struct *tsk = current;
18005+
18006+ task_lock(tsk);
18007+
18008+ /*
18009+ * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
18010+ * *much* faster this way, as no tlb flushes means bigger wrpt batches.
18011+ */
18012+ if (tsk->active_mm == mm) {
18013+ tsk->active_mm = &init_mm;
18014+ atomic_inc(&init_mm.mm_count);
18015+
18016+ switch_mm(mm, &init_mm, tsk);
18017+
18018+ atomic_dec(&mm->mm_count);
18019+ BUG_ON(atomic_read(&mm->mm_count) == 0);
18020+ }
18021+
18022+ task_unlock(tsk);
18023+
18024+ if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) &&
18025+ (atomic_read(&mm->mm_count) == 1) &&
18026+ !mm->context.has_foreign_mappings)
18027+ mm_unpin(mm);
18028+}
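The pgd_list_add()/pgd_list_del() helpers above reuse page->index as a "next" pointer and page_private() as a pointer back to the previous element's "next" field, so removal is O(1) and never walks the list. A standalone sketch of that linking pattern, with illustrative names only:

/* Sketch of the next/pprev intrusive-list pattern used by the pgd list. */
#include <stdio.h>
#include <stddef.h>

struct node {
        struct node *next;
        struct node **pprev;    /* points at whatever points at us */
};

static struct node *list_head;

static void node_add(struct node *n)
{
        n->next = list_head;
        if (list_head)
                list_head->pprev = &n->next;
        list_head = n;
        n->pprev = &list_head;
}

static void node_del(struct node *n)
{
        *n->pprev = n->next;            /* unlink without traversal */
        if (n->next)
                n->next->pprev = n->pprev;
}

int main(void)
{
        struct node a = { 0 }, b = { 0 };

        node_add(&a);
        node_add(&b);
        node_del(&a);                   /* O(1), no walk from list_head */
        printf("head is b: %d\n", list_head == &b);     /* prints 1 */
        return 0;
}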
18029Index: head-2008-11-25/arch/x86/oprofile/xenoprof.c
18030===================================================================
18031--- /dev/null 1970-01-01 00:00:00.000000000 +0000
18032+++ head-2008-11-25/arch/x86/oprofile/xenoprof.c 2008-01-28 12:24:19.000000000 +0100
18033@@ -0,0 +1,179 @@
18034+/**
18035+ * @file xenoprof.c
18036+ *
18037+ * @remark Copyright 2002 OProfile authors
18038+ * @remark Read the file COPYING
18039+ *
18040+ * @author John Levon <levon@movementarian.org>
18041+ *
18042+ * Modified by Aravind Menon and Jose Renato Santos for Xen
18043+ * These modifications are:
18044+ * Copyright (C) 2005 Hewlett-Packard Co.
18045+ *
18046+ * x86-specific part
18047+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
18048+ * VA Linux Systems Japan K.K.
18049+ */
18050+
18051+#include <linux/init.h>
18052+#include <linux/oprofile.h>
18053+#include <linux/sched.h>
18054+#include <asm/pgtable.h>
18055+
18056+#include <xen/driver_util.h>
18057+#include <xen/interface/xen.h>
18058+#include <xen/interface/xenoprof.h>
18059+#include <xen/xenoprof.h>
18060+#include "op_counter.h"
18061+
18062+static unsigned int num_events = 0;
18063+
18064+void __init xenoprof_arch_init_counter(struct xenoprof_init *init)
18065+{
18066+ num_events = init->num_events;
18067+ /* just in case - make sure we do not overflow event list
18068+ (i.e. counter_config list) */
18069+ if (num_events > OP_MAX_COUNTER) {
18070+ num_events = OP_MAX_COUNTER;
18071+ init->num_events = num_events;
18072+ }
18073+}
18074+
18075+void xenoprof_arch_counter(void)
18076+{
18077+ int i;
18078+ struct xenoprof_counter counter;
18079+
18080+ for (i=0; i<num_events; i++) {
18081+ counter.ind = i;
18082+ counter.count = (uint64_t)counter_config[i].count;
18083+ counter.enabled = (uint32_t)counter_config[i].enabled;
18084+ counter.event = (uint32_t)counter_config[i].event;
18085+ counter.kernel = (uint32_t)counter_config[i].kernel;
18086+ counter.user = (uint32_t)counter_config[i].user;
18087+ counter.unit_mask = (uint64_t)counter_config[i].unit_mask;
18088+ WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_counter,
18089+ &counter));
18090+ }
18091+}
18092+
18093+void xenoprof_arch_start(void)
18094+{
18095+ /* nothing */
18096+}
18097+
18098+void xenoprof_arch_stop(void)
18099+{
18100+ /* nothing */
18101+}
18102+
18103+void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer * sbuf)
18104+{
18105+ if (sbuf->buffer) {
18106+ vunmap(sbuf->buffer);
18107+ sbuf->buffer = NULL;
18108+ }
18109+}
18110+
18111+int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer * get_buffer,
18112+ struct xenoprof_shared_buffer * sbuf)
18113+{
18114+ int npages, ret;
18115+ struct vm_struct *area;
18116+
18117+ sbuf->buffer = NULL;
18118+ if ( (ret = HYPERVISOR_xenoprof_op(XENOPROF_get_buffer, get_buffer)) )
18119+ return ret;
18120+
18121+ npages = (get_buffer->bufsize * get_buffer->nbuf - 1) / PAGE_SIZE + 1;
18122+
18123+ area = alloc_vm_area(npages * PAGE_SIZE);
18124+ if (area == NULL)
18125+ return -ENOMEM;
18126+
18127+ if ( (ret = direct_kernel_remap_pfn_range(
18128+ (unsigned long)area->addr,
18129+ get_buffer->buf_gmaddr >> PAGE_SHIFT,
18130+ npages * PAGE_SIZE, __pgprot(_KERNPG_TABLE),
18131+ DOMID_SELF)) ) {
18132+ vunmap(area->addr);
18133+ return ret;
18134+ }
18135+
18136+ sbuf->buffer = area->addr;
18137+ return ret;
18138+}
18139+
18140+int xenoprof_arch_set_passive(struct xenoprof_passive * pdomain,
18141+ struct xenoprof_shared_buffer * sbuf)
18142+{
18143+ int ret;
18144+ int npages;
18145+ struct vm_struct *area;
18146+ pgprot_t prot = __pgprot(_KERNPG_TABLE);
18147+
18148+ sbuf->buffer = NULL;
18149+ ret = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, pdomain);
18150+ if (ret)
18151+ goto out;
18152+
18153+ npages = (pdomain->bufsize * pdomain->nbuf - 1) / PAGE_SIZE + 1;
18154+
18155+ area = alloc_vm_area(npages * PAGE_SIZE);
18156+ if (area == NULL) {
18157+ ret = -ENOMEM;
18158+ goto out;
18159+ }
18160+
18161+ ret = direct_kernel_remap_pfn_range(
18162+ (unsigned long)area->addr,
18163+ pdomain->buf_gmaddr >> PAGE_SHIFT,
18164+ npages * PAGE_SIZE, prot, DOMID_SELF);
18165+ if (ret) {
18166+ vunmap(area->addr);
18167+ goto out;
18168+ }
18169+ sbuf->buffer = area->addr;
18170+
18171+out:
18172+ return ret;
18173+}
18174+
18175+struct op_counter_config counter_config[OP_MAX_COUNTER];
18176+
18177+int xenoprof_create_files(struct super_block * sb, struct dentry * root)
18178+{
18179+ unsigned int i;
18180+
18181+ for (i = 0; i < num_events; ++i) {
18182+ struct dentry * dir;
18183+ char buf[2];
18184+
18185+ snprintf(buf, 2, "%d", i);
18186+ dir = oprofilefs_mkdir(sb, root, buf);
18187+ oprofilefs_create_ulong(sb, dir, "enabled",
18188+ &counter_config[i].enabled);
18189+ oprofilefs_create_ulong(sb, dir, "event",
18190+ &counter_config[i].event);
18191+ oprofilefs_create_ulong(sb, dir, "count",
18192+ &counter_config[i].count);
18193+ oprofilefs_create_ulong(sb, dir, "unit_mask",
18194+ &counter_config[i].unit_mask);
18195+ oprofilefs_create_ulong(sb, dir, "kernel",
18196+ &counter_config[i].kernel);
18197+ oprofilefs_create_ulong(sb, dir, "user",
18198+ &counter_config[i].user);
18199+ }
18200+
18201+ return 0;
18202+}
18203+
18204+int __init oprofile_arch_init(struct oprofile_operations * ops)
18205+{
18206+ return xenoprofile_init(ops);
18207+}
18208+
18209+void oprofile_arch_exit(void)
18210+{
18211+ xenoprofile_exit();
18212+}
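The buffer sizing in xenoprof_arch_map_shared_buffer() and xenoprof_arch_set_passive() above rounds a byte count up to whole pages with (bytes - 1) / PAGE_SIZE + 1, i.e. a ceiling division. A tiny sketch of that arithmetic, assuming 4 KiB pages; the values are illustrative:

/* Sketch: smallest number of whole pages covering `bytes` (bytes > 0). */
#include <stdio.h>

#define PAGE_SIZE 4096UL        /* assumption: 4 KiB pages */

static unsigned long pages_needed(unsigned long bytes)
{
        return (bytes - 1) / PAGE_SIZE + 1;
}

int main(void)
{
        printf("%lu\n", pages_needed(4096));    /* 1 */
        printf("%lu\n", pages_needed(4097));    /* 2 */
        return 0;
}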
18213Index: head-2008-11-25/arch/x86/pci/irq-xen.c
18214===================================================================
18215--- /dev/null 1970-01-01 00:00:00.000000000 +0000
18216+++ head-2008-11-25/arch/x86/pci/irq-xen.c 2008-03-06 08:54:32.000000000 +0100
18217@@ -0,0 +1,1211 @@
18218+/*
18219+ * Low-Level PCI Support for PC -- Routing of Interrupts
18220+ *
18221+ * (c) 1999--2000 Martin Mares <mj@ucw.cz>
18222+ */
18223+
18224+#include <linux/types.h>
18225+#include <linux/kernel.h>
18226+#include <linux/pci.h>
18227+#include <linux/init.h>
18228+#include <linux/slab.h>
18229+#include <linux/interrupt.h>
18230+#include <linux/dmi.h>
18231+#include <asm/io.h>
18232+#include <asm/smp.h>
18233+#include <asm/io_apic.h>
18234+#include <linux/irq.h>
18235+#include <linux/acpi.h>
18236+
18237+#include "pci.h"
18238+
18239+#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
18240+#define PIRQ_VERSION 0x0100
18241+
18242+static int broken_hp_bios_irq9;
18243+static int acer_tm360_irqrouting;
18244+
18245+static struct irq_routing_table *pirq_table;
18246+
18247+static int pirq_enable_irq(struct pci_dev *dev);
18248+
18249+/*
18250+ * Never use: 0, 1, 2 (timer, keyboard, and cascade)
18251+ * Avoid using: 13, 14 and 15 (FP error and IDE).
18252+ * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse)
18253+ */
18254+unsigned int pcibios_irq_mask = 0xfff8;
18255+
18256+static int pirq_penalty[16] = {
18257+ 1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
18258+ 0, 0, 0, 0, 1000, 100000, 100000, 100000
18259+};
18260+
18261+struct irq_router {
18262+ char *name;
18263+ u16 vendor, device;
18264+ int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
18265+ int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
18266+};
18267+
18268+struct irq_router_handler {
18269+ u16 vendor;
18270+ int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
18271+};
18272+
18273+int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL;
18274+void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
18275+
18276+/*
18277+ * Check passed address for the PCI IRQ Routing Table signature
18278+ * and perform checksum verification.
18279+ */
18280+
18281+static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
18282+{
18283+ struct irq_routing_table *rt;
18284+ int i;
18285+ u8 sum;
18286+
18287+ rt = (struct irq_routing_table *) addr;
18288+ if (rt->signature != PIRQ_SIGNATURE ||
18289+ rt->version != PIRQ_VERSION ||
18290+ rt->size % 16 ||
18291+ rt->size < sizeof(struct irq_routing_table))
18292+ return NULL;
18293+ sum = 0;
18294+ for (i=0; i < rt->size; i++)
18295+ sum += addr[i];
18296+ if (!sum) {
18297+ DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
18298+ return rt;
18299+ }
18300+ return NULL;
18301+}
18302+
18303+
18304+
18305+/*
18306+ * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
18307+ */
18308+
18309+static struct irq_routing_table * __init pirq_find_routing_table(void)
18310+{
18311+ u8 *addr;
18312+ struct irq_routing_table *rt;
18313+
18314+#ifdef CONFIG_XEN
18315+ if (!is_initial_xendomain())
18316+ return NULL;
18317+#endif
18318+ if (pirq_table_addr) {
18319+ rt = pirq_check_routing_table((u8 *) isa_bus_to_virt(pirq_table_addr));
18320+ if (rt)
18321+ return rt;
18322+ printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
18323+ }
18324+ for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
18325+ rt = pirq_check_routing_table(addr);
18326+ if (rt)
18327+ return rt;
18328+ }
18329+ return NULL;
18330+}
18331+
18332+/*
18333+ * If we have a IRQ routing table, use it to search for peer host
18334+ * bridges. It's a gross hack, but since there are no other known
18335+ * ways how to get a list of buses, we have to go this way.
18336+ */
18337+
18338+static void __init pirq_peer_trick(void)
18339+{
18340+ struct irq_routing_table *rt = pirq_table;
18341+ u8 busmap[256];
18342+ int i;
18343+ struct irq_info *e;
18344+
18345+ memset(busmap, 0, sizeof(busmap));
18346+ for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
18347+ e = &rt->slots[i];
18348+#ifdef DEBUG
18349+ {
18350+ int j;
18351+ DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
18352+ for(j=0; j<4; j++)
18353+ DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
18354+ DBG("\n");
18355+ }
18356+#endif
18357+ busmap[e->bus] = 1;
18358+ }
18359+ for(i = 1; i < 256; i++) {
18360+ if (!busmap[i] || pci_find_bus(0, i))
18361+ continue;
18362+ if (pci_scan_bus(i, &pci_root_ops, NULL))
18363+ printk(KERN_INFO "PCI: Discovered primary peer bus %02x [IRQ]\n", i);
18364+ }
18365+ pcibios_last_bus = -1;
18366+}
18367+
18368+/*
18369+ * Code for querying and setting of IRQ routes on various interrupt routers.
18370+ */
18371+
18372+void eisa_set_level_irq(unsigned int irq)
18373+{
18374+ unsigned char mask = 1 << (irq & 7);
18375+ unsigned int port = 0x4d0 + (irq >> 3);
18376+ unsigned char val;
18377+ static u16 eisa_irq_mask;
18378+
18379+ if (irq >= 16 || (1 << irq) & eisa_irq_mask)
18380+ return;
18381+
18382+ eisa_irq_mask |= (1 << irq);
18383+ printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
18384+ val = inb(port);
18385+ if (!(val & mask)) {
18386+ DBG(KERN_DEBUG " -> edge");
18387+ outb(val | mask, port);
18388+ }
18389+}
18390+
18391+/*
18392+ * Common IRQ routing practice: nybbles in config space,
18393+ * offset by some magic constant.
18394+ */
18395+static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr)
18396+{
18397+ u8 x;
18398+ unsigned reg = offset + (nr >> 1);
18399+
18400+ pci_read_config_byte(router, reg, &x);
18401+ return (nr & 1) ? (x >> 4) : (x & 0xf);
18402+}
18403+
18404+static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
18405+{
18406+ u8 x;
18407+ unsigned reg = offset + (nr >> 1);
18408+
18409+ pci_read_config_byte(router, reg, &x);
18410+ x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
18411+ pci_write_config_byte(router, reg, x);
18412+}
18413+
18414+/*
18415+ * ALI pirq entries are damn ugly, and completely undocumented.
18416+ * This has been figured out from pirq tables, and it's not a pretty
18417+ * picture.
18418+ */
18419+static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18420+{
18421+ static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
18422+
18423+ return irqmap[read_config_nybble(router, 0x48, pirq-1)];
18424+}
18425+
18426+static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18427+{
18428+ static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
18429+ unsigned int val = irqmap[irq];
18430+
18431+ if (val) {
18432+ write_config_nybble(router, 0x48, pirq-1, val);
18433+ return 1;
18434+ }
18435+ return 0;
18436+}
18437+
18438+/*
18439+ * The Intel PIIX4 pirq rules are fairly simple: "pirq" is
18440+ * just a pointer to the config space.
18441+ */
18442+static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18443+{
18444+ u8 x;
18445+
18446+ pci_read_config_byte(router, pirq, &x);
18447+ return (x < 16) ? x : 0;
18448+}
18449+
18450+static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18451+{
18452+ pci_write_config_byte(router, pirq, irq);
18453+ return 1;
18454+}
18455+
18456+/*
18457+ * The VIA pirq rules are nibble-based, like ALI,
18458+ * but without the ugly irq number munging.
18459+ * However, PIRQD is in the upper instead of lower 4 bits.
18460+ */
18461+static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18462+{
18463+ return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq);
18464+}
18465+
18466+static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18467+{
18468+ write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq);
18469+ return 1;
18470+}
18471+
18472+/*
18473+ * The VIA pirq rules are nibble-based, like ALI,
18474+ * but without the ugly irq number munging.
18475+ * However, for 82C586, nibble map is different .
18476+ */
18477+static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18478+{
18479+ static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18480+ return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
18481+}
18482+
18483+static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18484+{
18485+ static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18486+ write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
18487+ return 1;
18488+}
18489+
18490+/*
18491+ * ITE 8330G pirq rules are nibble-based
18492+ * FIXME: pirqmap may be { 1, 0, 3, 2 },
18493+ * 2+3 are both mapped to irq 9 on my system
18494+ */
18495+static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18496+{
18497+ static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
18498+ return read_config_nybble(router,0x43, pirqmap[pirq-1]);
18499+}
18500+
18501+static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18502+{
18503+ static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
18504+ write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
18505+ return 1;
18506+}
18507+
18508+/*
18509+ * OPTI: high four bits are nibble pointer..
18510+ * I wonder what the low bits do?
18511+ */
18512+static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18513+{
18514+ return read_config_nybble(router, 0xb8, pirq >> 4);
18515+}
18516+
18517+static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18518+{
18519+ write_config_nybble(router, 0xb8, pirq >> 4, irq);
18520+ return 1;
18521+}
18522+
18523+/*
18524+ * Cyrix: nibble offset 0x5C
18525+ * 0x5C bits 7:4 is INTB bits 3:0 is INTA
18526+ * 0x5D bits 7:4 is INTD bits 3:0 is INTC
18527+ */
18528+static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18529+{
18530+ return read_config_nybble(router, 0x5C, (pirq-1)^1);
18531+}
18532+
18533+static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18534+{
18535+ write_config_nybble(router, 0x5C, (pirq-1)^1, irq);
18536+ return 1;
18537+}
18538+
18539+/*
18540+ * PIRQ routing for SiS 85C503 router used in several SiS chipsets.
18541+ * We have to deal with the following issues here:
18542+ * - vendors have different ideas about the meaning of link values
18543+ * - some onboard devices (integrated in the chipset) have special
18544+ * links and are thus routed differently (i.e. not via PCI INTA-INTD)
18545+ * - different revisions of the router have different layouts for
18546+ * the routing registers, particularly for the onchip devices
18547+ *
18548+ * For all routing registers the common layout is one byte
18549+ * per routable link, defined as:
18550+ * bit 7 IRQ mapping enabled (0) or disabled (1)
18551+ * bits [6:4] reserved (sometimes used for onchip devices)
18552+ * bits [3:0] IRQ to map to
18553+ * allowed: 3-7, 9-12, 14-15
18554+ * reserved: 0, 1, 2, 8, 13
18555+ *
18556+ * The config-space registers located at 0x41/0x42/0x43/0x44 are
18557+ * always used to route the normal PCI INT A/B/C/D respectively.
18558+ * Apparently there are systems implementing PCI routing table using
18559+ * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
18560+ * We try our best to handle both link mappings.
18561+ *
18562+ * Currently (2003-05-21) it appears most SiS chipsets follow the
18563+ * definition of routing registers from the SiS-5595 southbridge.
18564+ * According to the SiS 5595 datasheets the revision id's of the
18565+ * router (ISA-bridge) should be 0x01 or 0xb0.
18566+ *
18567+ * Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1.
18568+ * Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets.
18569+ * They seem to work with the current routing code. However there is
18570+ * some concern because of the two USB-OHCI HCs (original SiS 5595
18571+ * had only one). YMMV.
18572+ *
18573+ * Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1:
18574+ *
18575+ * 0x61: IDEIRQ:
18576+ * bits [6:5] must be written 01
18577+ * bit 4 channel-select primary (0), secondary (1)
18578+ *
18579+ * 0x62: USBIRQ:
18580+ * bit 6 OHCI function disabled (0), enabled (1)
18581+ *
18582+ * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
18583+ *
18584+ * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
18585+ *
18586+ * We support USBIRQ (in addition to INTA-INTD) and keep the
18587+ * IDE, ACPI and DAQ routing untouched as set by the BIOS.
18588+ *
18589+ * Currently the only reported exception is the new SiS 65x chipset
18590+ * which includes the SiS 69x southbridge. Here we have the 85C503
18591+ * router revision 0x04 and there are changes in the register layout
18592+ * mostly related to the different USB HCs with USB 2.0 support.
18593+ *
18594+ * Onchip routing for router rev-id 0x04 (trial-and-error observation)
18595+ *
18596+ * 0x60/0x61/0x62/0x63: 1xEHCI and 3xOHCI (companion) USB-HCs
18597+ * bits 6-4 are probably unused, unlike the 5595
18598+ */
18599+
18600+#define PIRQ_SIS_IRQ_MASK 0x0f
18601+#define PIRQ_SIS_IRQ_DISABLE 0x80
18602+#define PIRQ_SIS_USB_ENABLE 0x40
18603+
18604+static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18605+{
18606+ u8 x;
18607+ int reg;
18608+
18609+ reg = pirq;
18610+ if (reg >= 0x01 && reg <= 0x04)
18611+ reg += 0x40;
18612+ pci_read_config_byte(router, reg, &x);
18613+ return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK);
18614+}
18615+
18616+static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18617+{
18618+ u8 x;
18619+ int reg;
18620+
18621+ reg = pirq;
18622+ if (reg >= 0x01 && reg <= 0x04)
18623+ reg += 0x40;
18624+ pci_read_config_byte(router, reg, &x);
18625+ x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
18626+ x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE;
18627+ pci_write_config_byte(router, reg, x);
18628+ return 1;
18629+}
18630+
18631+
18632+/*
18633+ * VLSI: nibble offset 0x74 - educated guess due to routing table and
18634+ * config space of VLSI 82C534 PCI-bridge/router (1004:0102)
18635+ * Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard
18636+ * devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6
18637+ * for the busbridge to the docking station.
18638+ */
18639+
18640+static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18641+{
18642+ if (pirq > 8) {
18643+ printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
18644+ return 0;
18645+ }
18646+ return read_config_nybble(router, 0x74, pirq-1);
18647+}
18648+
18649+static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18650+{
18651+ if (pirq > 8) {
18652+ printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
18653+ return 0;
18654+ }
18655+ write_config_nybble(router, 0x74, pirq-1, irq);
18656+ return 1;
18657+}
18658+
18659+/*
18660+ * ServerWorks: PCI interrupts mapped to system IRQ lines through Index
18661+ * and Redirect I/O registers (0x0c00 and 0x0c01). The Index register
18662+ * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a. The Redirect
18663+ * register is a straight binary coding of desired PIC IRQ (low nibble).
18664+ *
18665+ * The 'link' value in the PIRQ table is already in the correct format
18666+ * for the Index register. There are some special index values:
18667+ * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1,
18668+ * and 0x03 for SMBus.
18669+ */
18670+static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18671+{
18672+ outb_p(pirq, 0xc00);
18673+ return inb(0xc01) & 0xf;
18674+}
18675+
18676+static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18677+{
18678+ outb_p(pirq, 0xc00);
18679+ outb_p(irq, 0xc01);
18680+ return 1;
18681+}
18682+
18683+/* Support for AMD756 PCI IRQ Routing
18684+ * Jhon H. Caicedo <jhcaiced@osso.org.co>
18685+ * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced)
18686+ * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced)
18687+ * The AMD756 pirq rules are nibble-based
18688+ * offset 0x56 0-3 PIRQA 4-7 PIRQB
18689+ * offset 0x57 0-3 PIRQC 4-7 PIRQD
18690+ */
18691+static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18692+{
18693+ u8 irq;
18694+ irq = 0;
18695+ if (pirq <= 4)
18696+ {
18697+ irq = read_config_nybble(router, 0x56, pirq - 1);
18698+ }
18699+ printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
18700+ dev->vendor, dev->device, pirq, irq);
18701+ return irq;
18702+}
18703+
18704+static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18705+{
18706+ printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
18707+ dev->vendor, dev->device, pirq, irq);
18708+ if (pirq <= 4)
18709+ {
18710+ write_config_nybble(router, 0x56, pirq - 1, irq);
18711+ }
18712+ return 1;
18713+}
18714+
18715+#ifdef CONFIG_PCI_BIOS
18716+
18717+static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18718+{
18719+ struct pci_dev *bridge;
18720+ int pin = pci_get_interrupt_pin(dev, &bridge);
18721+ return pcibios_set_irq_routing(bridge, pin, irq);
18722+}
18723+
18724+#endif
18725+
18726+static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18727+{
18728+ static struct pci_device_id __initdata pirq_440gx[] = {
18729+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
18730+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
18731+ { },
18732+ };
18733+
18734+ /* 440GX has a proprietary PIRQ router -- don't use it */
18735+ if (pci_dev_present(pirq_440gx))
18736+ return 0;
18737+
18738+ switch(device)
18739+ {
18740+ case PCI_DEVICE_ID_INTEL_82371FB_0:
18741+ case PCI_DEVICE_ID_INTEL_82371SB_0:
18742+ case PCI_DEVICE_ID_INTEL_82371AB_0:
18743+ case PCI_DEVICE_ID_INTEL_82371MX:
18744+ case PCI_DEVICE_ID_INTEL_82443MX_0:
18745+ case PCI_DEVICE_ID_INTEL_82801AA_0:
18746+ case PCI_DEVICE_ID_INTEL_82801AB_0:
18747+ case PCI_DEVICE_ID_INTEL_82801BA_0:
18748+ case PCI_DEVICE_ID_INTEL_82801BA_10:
18749+ case PCI_DEVICE_ID_INTEL_82801CA_0:
18750+ case PCI_DEVICE_ID_INTEL_82801CA_12:
18751+ case PCI_DEVICE_ID_INTEL_82801DB_0:
18752+ case PCI_DEVICE_ID_INTEL_82801E_0:
18753+ case PCI_DEVICE_ID_INTEL_82801EB_0:
18754+ case PCI_DEVICE_ID_INTEL_ESB_1:
18755+ case PCI_DEVICE_ID_INTEL_ICH6_0:
18756+ case PCI_DEVICE_ID_INTEL_ICH6_1:
18757+ case PCI_DEVICE_ID_INTEL_ICH7_0:
18758+ case PCI_DEVICE_ID_INTEL_ICH7_1:
18759+ case PCI_DEVICE_ID_INTEL_ICH7_30:
18760+ case PCI_DEVICE_ID_INTEL_ICH7_31:
18761+ case PCI_DEVICE_ID_INTEL_ESB2_0:
18762+ case PCI_DEVICE_ID_INTEL_ICH8_0:
18763+ case PCI_DEVICE_ID_INTEL_ICH8_1:
18764+ case PCI_DEVICE_ID_INTEL_ICH8_2:
18765+ case PCI_DEVICE_ID_INTEL_ICH8_3:
18766+ case PCI_DEVICE_ID_INTEL_ICH8_4:
18767+ case PCI_DEVICE_ID_INTEL_ICH9_0:
18768+ case PCI_DEVICE_ID_INTEL_ICH9_1:
18769+ case PCI_DEVICE_ID_INTEL_ICH9_2:
18770+ case PCI_DEVICE_ID_INTEL_ICH9_3:
18771+ case PCI_DEVICE_ID_INTEL_ICH9_4:
18772+ case PCI_DEVICE_ID_INTEL_ICH9_5:
18773+ r->name = "PIIX/ICH";
18774+ r->get = pirq_piix_get;
18775+ r->set = pirq_piix_set;
18776+ return 1;
18777+ }
18778+ return 0;
18779+}
18780+
18781+static __init int via_router_probe(struct irq_router *r,
18782+ struct pci_dev *router, u16 device)
18783+{
18784+ /* FIXME: We should move some of the quirk fixup stuff here */
18785+
18786+ /*
18787+ * work arounds for some buggy BIOSes
18788+ */
18789+ if (device == PCI_DEVICE_ID_VIA_82C586_0) {
18790+ switch(router->device) {
18791+ case PCI_DEVICE_ID_VIA_82C686:
18792+ /*
18793+ * Asus k7m bios wrongly reports 82C686A
18794+ * as 586-compatible
18795+ */
18796+ device = PCI_DEVICE_ID_VIA_82C686;
18797+ break;
18798+ case PCI_DEVICE_ID_VIA_8235:
18799+ /**
18800+ * Asus a7v-x bios wrongly reports 8235
18801+ * as 586-compatible
18802+ */
18803+ device = PCI_DEVICE_ID_VIA_8235;
18804+ break;
18805+ }
18806+ }
18807+
18808+ switch(device) {
18809+ case PCI_DEVICE_ID_VIA_82C586_0:
18810+ r->name = "VIA";
18811+ r->get = pirq_via586_get;
18812+ r->set = pirq_via586_set;
18813+ return 1;
18814+ case PCI_DEVICE_ID_VIA_82C596:
18815+ case PCI_DEVICE_ID_VIA_82C686:
18816+ case PCI_DEVICE_ID_VIA_8231:
18817+ case PCI_DEVICE_ID_VIA_8233A:
18818+ case PCI_DEVICE_ID_VIA_8235:
18819+ case PCI_DEVICE_ID_VIA_8237:
18820+ /* FIXME: add new ones for 8233/5 */
18821+ r->name = "VIA";
18822+ r->get = pirq_via_get;
18823+ r->set = pirq_via_set;
18824+ return 1;
18825+ }
18826+ return 0;
18827+}
18828+
18829+static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18830+{
18831+ switch(device)
18832+ {
18833+ case PCI_DEVICE_ID_VLSI_82C534:
18834+ r->name = "VLSI 82C534";
18835+ r->get = pirq_vlsi_get;
18836+ r->set = pirq_vlsi_set;
18837+ return 1;
18838+ }
18839+ return 0;
18840+}
18841+
18842+
18843+static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18844+{
18845+ switch(device)
18846+ {
18847+ case PCI_DEVICE_ID_SERVERWORKS_OSB4:
18848+ case PCI_DEVICE_ID_SERVERWORKS_CSB5:
18849+ r->name = "ServerWorks";
18850+ r->get = pirq_serverworks_get;
18851+ r->set = pirq_serverworks_set;
18852+ return 1;
18853+ }
18854+ return 0;
18855+}
18856+
18857+static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18858+{
18859+ if (device != PCI_DEVICE_ID_SI_503)
18860+ return 0;
18861+
18862+ r->name = "SIS";
18863+ r->get = pirq_sis_get;
18864+ r->set = pirq_sis_set;
18865+ return 1;
18866+}
18867+
18868+static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18869+{
18870+ switch(device)
18871+ {
18872+ case PCI_DEVICE_ID_CYRIX_5520:
18873+ r->name = "NatSemi";
18874+ r->get = pirq_cyrix_get;
18875+ r->set = pirq_cyrix_set;
18876+ return 1;
18877+ }
18878+ return 0;
18879+}
18880+
18881+static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18882+{
18883+ switch(device)
18884+ {
18885+ case PCI_DEVICE_ID_OPTI_82C700:
18886+ r->name = "OPTI";
18887+ r->get = pirq_opti_get;
18888+ r->set = pirq_opti_set;
18889+ return 1;
18890+ }
18891+ return 0;
18892+}
18893+
18894+static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18895+{
18896+ switch(device)
18897+ {
18898+ case PCI_DEVICE_ID_ITE_IT8330G_0:
18899+ r->name = "ITE";
18900+ r->get = pirq_ite_get;
18901+ r->set = pirq_ite_set;
18902+ return 1;
18903+ }
18904+ return 0;
18905+}
18906+
18907+static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18908+{
18909+ switch(device)
18910+ {
18911+ case PCI_DEVICE_ID_AL_M1533:
18912+ case PCI_DEVICE_ID_AL_M1563:
18913+ printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
18914+ r->name = "ALI";
18915+ r->get = pirq_ali_get;
18916+ r->set = pirq_ali_set;
18917+ return 1;
18918+ }
18919+ return 0;
18920+}
18921+
18922+static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18923+{
18924+ switch(device)
18925+ {
18926+ case PCI_DEVICE_ID_AMD_VIPER_740B:
18927+ r->name = "AMD756";
18928+ break;
18929+ case PCI_DEVICE_ID_AMD_VIPER_7413:
18930+ r->name = "AMD766";
18931+ break;
18932+ case PCI_DEVICE_ID_AMD_VIPER_7443:
18933+ r->name = "AMD768";
18934+ break;
18935+ default:
18936+ return 0;
18937+ }
18938+ r->get = pirq_amd756_get;
18939+ r->set = pirq_amd756_set;
18940+ return 1;
18941+}
18942+
18943+static __initdata struct irq_router_handler pirq_routers[] = {
18944+ { PCI_VENDOR_ID_INTEL, intel_router_probe },
18945+ { PCI_VENDOR_ID_AL, ali_router_probe },
18946+ { PCI_VENDOR_ID_ITE, ite_router_probe },
18947+ { PCI_VENDOR_ID_VIA, via_router_probe },
18948+ { PCI_VENDOR_ID_OPTI, opti_router_probe },
18949+ { PCI_VENDOR_ID_SI, sis_router_probe },
18950+ { PCI_VENDOR_ID_CYRIX, cyrix_router_probe },
18951+ { PCI_VENDOR_ID_VLSI, vlsi_router_probe },
18952+ { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe },
18953+ { PCI_VENDOR_ID_AMD, amd_router_probe },
18954+ /* Someone with docs needs to add the ATI Radeon IGP */
18955+ { 0, NULL }
18956+};
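The table above is the extension point the "ATI Radeon IGP" comment refers to: a vendor-specific probe fills in r->name/get/set and returns 1 on a match. A purely hypothetical sketch of such an addition (ati_router_probe(), the placeholder device ID and the NULL handlers are illustrative only, not something this patch supplies):

static __init int ati_router_probe(struct irq_router *r,
				   struct pci_dev *router, u16 device)
{
	switch (device) {
	case 0xffff:		/* placeholder -- a real IGP device ID is needed */
		r->name = "ATI";
		r->get = NULL;	/* real register accessors would go here */
		r->set = NULL;
		return 1;
	}
	return 0;
}

/* ...plus one entry in pirq_routers[] ahead of the { 0, NULL } terminator:
 *	{ PCI_VENDOR_ID_ATI, ati_router_probe },
 */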
18957+static struct irq_router pirq_router;
18958+static struct pci_dev *pirq_router_dev;
18959+
18960+
18961+/*
18962+ * FIXME: should we have an option to say "generic for
18963+ * chipset" ?
18964+ */
18965+
18966+static void __init pirq_find_router(struct irq_router *r)
18967+{
18968+ struct irq_routing_table *rt = pirq_table;
18969+ struct irq_router_handler *h;
18970+
18971+#ifdef CONFIG_PCI_BIOS
18972+ if (!rt->signature) {
18973+ printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n");
18974+ r->set = pirq_bios_set;
18975+ r->name = "BIOS";
18976+ return;
18977+ }
18978+#endif
18979+
18980+ /* Default unless a driver reloads it */
18981+ r->name = "default";
18982+ r->get = NULL;
18983+ r->set = NULL;
18984+
18985+ DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
18986+ rt->rtr_vendor, rt->rtr_device);
18987+
18988+ pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn);
18989+ if (!pirq_router_dev) {
18990+ DBG(KERN_DEBUG "PCI: Interrupt router not found at "
18991+ "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
18992+ return;
18993+ }
18994+
18995+ for( h = pirq_routers; h->vendor; h++) {
18996+ /* First look for a router match */
18997+ if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
18998+ break;
18999+ /* Fall back to a device match */
19000+ if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
19001+ break;
19002+ }
19003+ printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
19004+ pirq_router.name,
19005+ pirq_router_dev->vendor,
19006+ pirq_router_dev->device,
19007+ pci_name(pirq_router_dev));
19008+}
19009+
19010+static struct irq_info *pirq_get_info(struct pci_dev *dev)
19011+{
19012+ struct irq_routing_table *rt = pirq_table;
19013+ int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
19014+ struct irq_info *info;
19015+
19016+ for (info = rt->slots; entries--; info++)
19017+ if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
19018+ return info;
19019+ return NULL;
19020+}
19021+
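The entry count computed in pirq_get_info() is plain $PIR arithmetic, assuming the conventional 32-byte table header and 16-byte slot entries; a worked example with made-up numbers:

/*
 * Illustrative only: a routing table reporting rt->size == 0x90 (144 bytes)
 * describes (144 - 32) / 16 == 7 slot entries, i.e.
 * (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info).
 */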
19022+static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
19023+{
19024+ u8 pin;
19025+ struct irq_info *info;
19026+ int i, pirq, newirq;
19027+ int irq = 0;
19028+ u32 mask;
19029+ struct irq_router *r = &pirq_router;
19030+ struct pci_dev *dev2 = NULL;
19031+ char *msg = NULL;
19032+
19033+ /* Find IRQ pin */
19034+ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
19035+ if (!pin) {
19036+ DBG(KERN_DEBUG " -> no interrupt pin\n");
19037+ return 0;
19038+ }
19039+ pin = pin - 1;
19040+
19041+ /* Find IRQ routing entry */
19042+
19043+ if (!pirq_table)
19044+ return 0;
19045+
19046+ DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
19047+ info = pirq_get_info(dev);
19048+ if (!info) {
19049+ DBG(" -> not found in routing table\n" KERN_DEBUG);
19050+ return 0;
19051+ }
19052+ pirq = info->irq[pin].link;
19053+ mask = info->irq[pin].bitmap;
19054+ if (!pirq) {
19055+ DBG(" -> not routed\n" KERN_DEBUG);
19056+ return 0;
19057+ }
19058+ DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
19059+ mask &= pcibios_irq_mask;
19060+
19061+ /* Work around broken HP Pavilion Notebooks which assign USB to
19062+ IRQ 9 even though it is actually wired to IRQ 11 */
19063+
19064+ if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) {
19065+ dev->irq = 11;
19066+ pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11);
19067+ r->set(pirq_router_dev, dev, pirq, 11);
19068+ }
19069+
19070+ /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
19071+ if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
19072+ pirq = 0x68;
19073+ mask = 0x400;
19074+ dev->irq = r->get(pirq_router_dev, dev, pirq);
19075+ pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
19076+ }
19077+
19078+ /*
19079+ * Find the best IRQ to assign: use the one
19080+ * reported by the device if possible.
19081+ */
19082+ newirq = dev->irq;
19083+ if (newirq && !((1 << newirq) & mask)) {
19084+ if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
19085+ else printk("\n" KERN_WARNING
19086+ "PCI: IRQ %i for device %s doesn't match PIRQ mask "
19087+ "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
19088+ pci_name(dev));
19089+ }
19090+ if (!newirq && assign) {
19091+ for (i = 0; i < 16; i++) {
19092+ if (!(mask & (1 << i)))
19093+ continue;
19094+ if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
19095+ newirq = i;
19096+ }
19097+ }
19098+ DBG(" -> newirq=%d", newirq);
19099+
19100+ /* Check if it is hardcoded */
19101+ if ((pirq & 0xf0) == 0xf0) {
19102+ irq = pirq & 0xf;
19103+ DBG(" -> hardcoded IRQ %d\n", irq);
19104+ msg = "Hardcoded";
19105+ } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
19106+ ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
19107+ DBG(" -> got IRQ %d\n", irq);
19108+ msg = "Found";
19109+ eisa_set_level_irq(irq);
19110+ } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
19111+ DBG(" -> assigning IRQ %d", newirq);
19112+ if (r->set(pirq_router_dev, dev, pirq, newirq)) {
19113+ eisa_set_level_irq(newirq);
19114+ DBG(" ... OK\n");
19115+ msg = "Assigned";
19116+ irq = newirq;
19117+ }
19118+ }
19119+
19120+ if (!irq) {
19121+ DBG(" ... failed\n");
19122+ if (newirq && mask == (1 << newirq)) {
19123+ msg = "Guessed";
19124+ irq = newirq;
19125+ } else
19126+ return 0;
19127+ }
19128+ printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
19129+
19130+ /* Update IRQ for all devices with the same pirq value */
19131+ while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
19132+ pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
19133+ if (!pin)
19134+ continue;
19135+ pin--;
19136+ info = pirq_get_info(dev2);
19137+ if (!info)
19138+ continue;
19139+ if (info->irq[pin].link == pirq) {
19140+ /* We refuse to override the dev->irq information. Give a warning! */
19141+ if ( dev2->irq && dev2->irq != irq && \
19142+ (!(pci_probe & PCI_USE_PIRQ_MASK) || \
19143+ ((1 << dev2->irq) & mask)) ) {
19144+#ifndef CONFIG_PCI_MSI
19145+ printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
19146+ pci_name(dev2), dev2->irq, irq);
19147+#endif
19148+ continue;
19149+ }
19150+ dev2->irq = irq;
19151+ pirq_penalty[irq]++;
19152+ if (dev != dev2)
19153+ printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
19154+ }
19155+ }
19156+ return 1;
19157+}
19158+
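pcibios_lookup_irq() above folds together three link-value conventions: 0 means the pin is not routed, 0xF0|n means the IRQ is hardcoded to n, and anything else is a chipset-specific link handed to r->get(). A condensed, illustrative-only sketch of that decode (example_decode_link() is not part of the patch):

static int example_decode_link(struct irq_router *r, struct pci_dev *router_dev,
			       struct pci_dev *dev, u8 link)
{
	if (!link)				/* pin not routed at all */
		return 0;
	if ((link & 0xf0) == 0xf0)		/* hardcoded: low nibble is the IRQ */
		return link & 0x0f;
	if (r->get)				/* otherwise ask the chipset router */
		return r->get(router_dev, dev, link);
	return 0;
}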
19159+static void __init pcibios_fixup_irqs(void)
19160+{
19161+ struct pci_dev *dev = NULL;
19162+ u8 pin;
19163+
19164+ DBG(KERN_DEBUG "PCI: IRQ fixup\n");
19165+ while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
19166+ /*
19167+ * If the BIOS has set an out-of-range IRQ number, just ignore it.
19168+ * Also keep track of which IRQs are already in use.
19169+ */
19170+ if (dev->irq >= 16) {
19171+ DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
19172+ dev->irq = 0;
19173+ }
19174+ /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
19175+ if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
19176+ pirq_penalty[dev->irq] = 0;
19177+ pirq_penalty[dev->irq]++;
19178+ }
19179+
19180+ dev = NULL;
19181+ while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
19182+ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
19183+#ifdef CONFIG_X86_IO_APIC
19184+ /*
19185+ * Recalculate IRQ numbers if we use the I/O APIC.
19186+ */
19187+ if (io_apic_assign_pci_irqs)
19188+ {
19189+ int irq;
19190+
19191+ if (pin) {
19192+ pin--; /* interrupt pins are numbered starting from 1 */
19193+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
19194+ /*
19195+ * Busses behind bridges are typically not listed in the MP-table.
19196+ * In this case we have to look up the IRQ based on the parent bus,
19197+ * parent slot, and pin number. The SMP code detects such bridged
19198+ * busses itself so we should get into this branch reliably.
19199+ */
19200+ if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
19201+ struct pci_dev * bridge = dev->bus->self;
19202+
19203+ pin = (pin + PCI_SLOT(dev->devfn)) % 4;
19204+ irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
19205+ PCI_SLOT(bridge->devfn), pin);
19206+ if (irq >= 0)
19207+ printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
19208+ pci_name(bridge), 'A' + pin, irq);
19209+ }
19210+ if (irq >= 0) {
19211+ if (use_pci_vector() &&
19212+ !platform_legacy_irq(irq))
19213+ irq = IO_APIC_VECTOR(irq);
19214+
19215+ printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
19216+ pci_name(dev), 'A' + pin, irq);
19217+ dev->irq = irq;
19218+ }
19219+ }
19220+ }
19221+#endif
19222+ /*
19223+ * Still no IRQ? Try to look one up...
19224+ */
19225+ if (pin && !dev->irq)
19226+ pcibios_lookup_irq(dev, 0);
19227+ }
19228+}
19229+
19230+/*
19231+ * Work around broken HP Pavilion Notebooks which assign USB to
19232+ * IRQ 9 even though it is actually wired to IRQ 11
19233+ */
19234+static int __init fix_broken_hp_bios_irq9(struct dmi_system_id *d)
19235+{
19236+ if (!broken_hp_bios_irq9) {
19237+ broken_hp_bios_irq9 = 1;
19238+ printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
19239+ }
19240+ return 0;
19241+}
19242+
19243+/*
19244+ * Work around broken Acer TravelMate 360 Notebooks which assign
19245+ * Cardbus to IRQ 11 even though it is actually wired to IRQ 10
19246+ */
19247+static int __init fix_acer_tm360_irqrouting(struct dmi_system_id *d)
19248+{
19249+ if (!acer_tm360_irqrouting) {
19250+ acer_tm360_irqrouting = 1;
19251+ printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
19252+ }
19253+ return 0;
19254+}
19255+
19256+static struct dmi_system_id __initdata pciirq_dmi_table[] = {
19257+ {
19258+ .callback = fix_broken_hp_bios_irq9,
19259+ .ident = "HP Pavilion N5400 Series Laptop",
19260+ .matches = {
19261+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
19262+ DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
19263+ DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
19264+ DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
19265+ },
19266+ },
19267+ {
19268+ .callback = fix_acer_tm360_irqrouting,
19269+ .ident = "Acer TravelMate 36x Laptop",
19270+ .matches = {
19271+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
19272+ DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
19273+ },
19274+ },
19275+ { }
19276+};
19277+
19278+static int __init pcibios_irq_init(void)
19279+{
19280+ DBG(KERN_DEBUG "PCI: IRQ init\n");
19281+
19282+ if (pcibios_enable_irq || raw_pci_ops == NULL)
19283+ return 0;
19284+
19285+ dmi_check_system(pciirq_dmi_table);
19286+
19287+ pirq_table = pirq_find_routing_table();
19288+
19289+#ifdef CONFIG_PCI_BIOS
19290+ if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN))
19291+ pirq_table = pcibios_get_irq_routing_table();
19292+#endif
19293+ if (pirq_table) {
19294+ pirq_peer_trick();
19295+ pirq_find_router(&pirq_router);
19296+ if (pirq_table->exclusive_irqs) {
19297+ int i;
19298+ for (i=0; i<16; i++)
19299+ if (!(pirq_table->exclusive_irqs & (1 << i)))
19300+ pirq_penalty[i] += 100;
19301+ }
19302+ /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
19303+ if (io_apic_assign_pci_irqs)
19304+ pirq_table = NULL;
19305+ }
19306+
19307+ pcibios_enable_irq = pirq_enable_irq;
19308+
19309+ pcibios_fixup_irqs();
19310+ return 0;
19311+}
19312+
19313+subsys_initcall(pcibios_irq_init);
19314+
19315+
19316+static void pirq_penalize_isa_irq(int irq, int active)
19317+{
19318+ /*
19319+ * If any ISAPnP device reports an IRQ in its list of possible
19320+ * IRQs, we try to avoid assigning it to PCI devices.
19321+ */
19322+ if (irq < 16) {
19323+ if (active)
19324+ pirq_penalty[irq] += 1000;
19325+ else
19326+ pirq_penalty[irq] += 100;
19327+ }
19328+}
19329+
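The penalty numbers used here are only ever compared against each other, so a concrete (invented) scenario shows how they steer the search in pcibios_lookup_irq():

/*
 * Illustrative scenario: IRQ 7 is actively used by an ISAPnP card (+1000),
 * IRQ 5 is merely listed as possible by one (+100), and IRQ 10 already
 * serves a routed PCI device (+1).  If the PIRQ bitmap allows 5, 7 and 10,
 * the lookup settles on IRQ 10 -- the lowest accumulated penalty.
 */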
19330+void pcibios_penalize_isa_irq(int irq, int active)
19331+{
19332+#ifdef CONFIG_ACPI
19333+ if (!acpi_noirq)
19334+ acpi_penalize_isa_irq(irq, active);
19335+ else
19336+#endif
19337+ pirq_penalize_isa_irq(irq, active);
19338+}
19339+
19340+static int pirq_enable_irq(struct pci_dev *dev)
19341+{
19342+ u8 pin;
19343+ struct pci_dev *temp_dev;
19344+
19345+ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
19346+ if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
19347+ char *msg = "";
19348+
19349+ pin--; /* interrupt pins are numbered starting from 1 */
19350+
19351+ if (io_apic_assign_pci_irqs) {
19352+ int irq;
19353+
19354+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
19355+ /*
19356+ * Busses behind bridges are typically not listed in the MP-table.
19357+ * In this case we have to look up the IRQ based on the parent bus,
19358+ * parent slot, and pin number. The SMP code detects such bridged
19359+ * busses itself so we should get into this branch reliably.
19360+ */
19361+ temp_dev = dev;
19362+ while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
19363+ struct pci_dev * bridge = dev->bus->self;
19364+
19365+ pin = (pin + PCI_SLOT(dev->devfn)) % 4;
19366+ irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
19367+ PCI_SLOT(bridge->devfn), pin);
19368+ if (irq >= 0)
19369+ printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
19370+ pci_name(bridge), 'A' + pin, irq);
19371+ dev = bridge;
19372+ }
19373+ dev = temp_dev;
19374+ if (irq >= 0) {
19375+#ifdef CONFIG_PCI_MSI
19376+ if (!platform_legacy_irq(irq))
19377+ irq = IO_APIC_VECTOR(irq);
19378+#endif
19379+ printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
19380+ pci_name(dev), 'A' + pin, irq);
19381+ dev->irq = irq;
19382+ return 0;
19383+ } else
19384+ msg = " Probably buggy MP table.";
19385+ } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
19386+ msg = "";
19387+ else
19388+ msg = " Please try using pci=biosirq.";
19389+
19390+ /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
19391+ if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
19392+ return 0;
19393+
19394+ printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
19395+ 'A' + pin, pci_name(dev), msg);
19396+ }
19397+ return 0;
19398+}
19399+
19400+int pci_vector_resources(int last, int nr_released)
19401+{
19402+ int count = nr_released;
19403+
19404+ int next = last;
19405+ int offset = (last % 8);
19406+
19407+ while (next < FIRST_SYSTEM_VECTOR) {
19408+ next += 8;
19409+#ifdef CONFIG_X86_64
19410+ if (next == IA32_SYSCALL_VECTOR)
19411+ continue;
19412+#else
19413+ if (next == SYSCALL_VECTOR)
19414+ continue;
19415+#endif
19416+ count++;
19417+ if (next >= FIRST_SYSTEM_VECTOR) {
19418+ if (offset%8) {
19419+ next = FIRST_DEVICE_VECTOR + offset;
19420+ offset++;
19421+ continue;
19422+ }
19423+ count--;
19424+ }
19425+ }
19426+
19427+ return count;
19428+}
19429Index: head-2008-11-25/arch/x86/pci/pcifront.c
19430===================================================================
19431--- /dev/null 1970-01-01 00:00:00.000000000 +0000
19432+++ head-2008-11-25/arch/x86/pci/pcifront.c 2007-06-12 13:12:49.000000000 +0200
19433@@ -0,0 +1,55 @@
19434+/*
19435+ * PCI Frontend Stub - puts some "dummy" functions into the Linux x86 PCI core
19436+ * to support the Xen PCI Frontend's operation
19437+ *
19438+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
19439+ */
19440+#include <linux/module.h>
19441+#include <linux/init.h>
19442+#include <linux/pci.h>
19443+#include <asm/acpi.h>
19444+#include "pci.h"
19445+
19446+static int pcifront_enable_irq(struct pci_dev *dev)
19447+{
19448+ u8 irq;
19449+ pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);
19450+ dev->irq = irq;
19451+
19452+ return 0;
19453+}
19454+
19455+extern u8 pci_cache_line_size;
19456+
19457+static int __init pcifront_x86_stub_init(void)
19458+{
19459+ struct cpuinfo_x86 *c = &boot_cpu_data;
19460+
19461+ /* Only install our method if we haven't found real hardware already */
19462+ if (raw_pci_ops)
19463+ return 0;
19464+
19465+ printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
19466+
19467+ /* Copied from arch/i386/pci/common.c */
19468+ pci_cache_line_size = 32 >> 2;
19469+ if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
19470+ pci_cache_line_size = 64 >> 2; /* K7 & K8 */
19471+ else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
19472+ pci_cache_line_size = 128 >> 2; /* P4 */
19473+
19474+ /* On x86, we need to disable the normal IRQ routing table and
19475+ * just ask the backend
19476+ */
19477+ pcibios_enable_irq = pcifront_enable_irq;
19478+ pcibios_disable_irq = NULL;
19479+
19480+#ifdef CONFIG_ACPI
19481+ /* Keep ACPI out of the picture */
19482+ acpi_noirq = 1;
19483+#endif
19484+
19485+ return 0;
19486+}
19487+
19488+arch_initcall(pcifront_x86_stub_init);
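The >> 2 in pcifront_x86_stub_init()'s cache-line setup reflects PCI_CACHE_LINE_SIZE being programmed in units of 32-bit words; the worked values for the three branches:

/*
 * 32-byte line  -> 32 >> 2  == 8   (default)
 * 64-byte line  -> 64 >> 2  == 16  (K7/K8)
 * 128-byte line -> 128 >> 2 == 32  (P4)
 */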
19489Index: head-2008-11-25/arch/x86/ia32/ia32entry-xen.S
19490===================================================================
19491--- /dev/null 1970-01-01 00:00:00.000000000 +0000
19492+++ head-2008-11-25/arch/x86/ia32/ia32entry-xen.S 2008-04-02 12:34:02.000000000 +0200
19493@@ -0,0 +1,666 @@
19494+/*
19495+ * Compatibility mode system call entry point for x86-64.
19496+ *
19497+ * Copyright 2000-2002 Andi Kleen, SuSE Labs.
19498+ */
19499+
19500+#include <asm/dwarf2.h>
19501+#include <asm/calling.h>
19502+#include <asm/asm-offsets.h>
19503+#include <asm/current.h>
19504+#include <asm/errno.h>
19505+#include <asm/ia32_unistd.h>
19506+#include <asm/thread_info.h>
19507+#include <asm/segment.h>
19508+#include <asm/vsyscall32.h>
19509+#include <asm/irqflags.h>
19510+#include <linux/linkage.h>
19511+
19512+#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
19513+
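IA32_NR_syscalls leans on the layout of ia32_sys_call_table at the end of this file: the table is a run of .quad (8-byte) pointers terminated by the ia32_syscall_end label, so the byte span divided by 8 is the entry count, matching the *ia32_sys_call_table(,%rax,8) scaling used at the call sites. By way of example:

/* A table of 318 eight-byte entries spans 318 * 8 == 2544 bytes, so the
 * macro evaluates to 318; %rax is then scaled by the same factor of 8. */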
19514+ .macro IA32_ARG_FIXUP noebp=0
19515+ movl %edi,%r8d
19516+ .if \noebp
19517+ .else
19518+ movl %ebp,%r9d
19519+ .endif
19520+ xchg %ecx,%esi
19521+ movl %ebx,%edi
19522+ movl %edx,%edx /* zero extension */
19523+ .endm
19524+
19525+ /* clobbers %eax */
19526+ .macro CLEAR_RREGS
19527+ xorl %eax,%eax
19528+ movq %rax,R11(%rsp)
19529+ movq %rax,R10(%rsp)
19530+ movq %rax,R9(%rsp)
19531+ movq %rax,R8(%rsp)
19532+ .endm
19533+
19534+ .macro LOAD_ARGS32 offset
19535+ movl \offset(%rsp),%r11d
19536+ movl \offset+8(%rsp),%r10d
19537+ movl \offset+16(%rsp),%r9d
19538+ movl \offset+24(%rsp),%r8d
19539+ movl \offset+40(%rsp),%ecx
19540+ movl \offset+48(%rsp),%edx
19541+ movl \offset+56(%rsp),%esi
19542+ movl \offset+64(%rsp),%edi
19543+ movl \offset+72(%rsp),%eax
19544+ .endm
19545+
19546+ .macro CFI_STARTPROC32 simple
19547+ CFI_STARTPROC \simple
19548+ CFI_UNDEFINED r8
19549+ CFI_UNDEFINED r9
19550+ CFI_UNDEFINED r10
19551+ CFI_UNDEFINED r11
19552+ CFI_UNDEFINED r12
19553+ CFI_UNDEFINED r13
19554+ CFI_UNDEFINED r14
19555+ CFI_UNDEFINED r15
19556+ .endm
19557+
19558+/*
19559+ * 32bit SYSENTER instruction entry.
19560+ *
19561+ * Arguments:
19562+ * %eax System call number.
19563+ * %ebx Arg1
19564+ * %ecx Arg2
19565+ * %edx Arg3
19566+ * %esi Arg4
19567+ * %edi Arg5
19568+ * %ebp user stack
19569+ * 0(%ebp) Arg6
19570+ *
19571+ * Interrupts on.
19572+ *
19573+ * This is purely a fast path. For anything complicated we use the int 0x80
19574+ * path below. Set up a complete hardware stack frame to share code
19575+ * with the int 0x80 path.
19576+ */
19577+ENTRY(ia32_sysenter_target)
19578+ CFI_STARTPROC32 simple
19579+ CFI_DEF_CFA rsp,SS+8-RIP+16
19580+ /*CFI_REL_OFFSET ss,SS-RIP+16*/
19581+ CFI_REL_OFFSET rsp,RSP-RIP+16
19582+ /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
19583+ /*CFI_REL_OFFSET cs,CS-RIP+16*/
19584+ CFI_REL_OFFSET rip,RIP-RIP+16
19585+ CFI_REL_OFFSET r11,8
19586+ CFI_REL_OFFSET rcx,0
19587+ movq 8(%rsp),%r11
19588+ CFI_RESTORE r11
19589+ popq %rcx
19590+ CFI_ADJUST_CFA_OFFSET -8
19591+ CFI_RESTORE rcx
19592+ movl %ebp,%ebp /* zero extension */
19593+ movl %eax,%eax
19594+ movl $__USER32_DS,40(%rsp)
19595+ movq %rbp,32(%rsp)
19596+ movl $__USER32_CS,16(%rsp)
19597+ movl $VSYSCALL32_SYSEXIT,8(%rsp)
19598+ movq %rax,(%rsp)
19599+ cld
19600+ SAVE_ARGS 0,0,0
19601+ /* no need to do an access_ok check here because rbp has been
19602+ 32bit zero extended */
19603+1: movl (%rbp),%r9d
19604+ .section __ex_table,"a"
19605+ .quad 1b,ia32_badarg
19606+ .previous
19607+ GET_THREAD_INFO(%r10)
19608+ orl $TS_COMPAT,threadinfo_status(%r10)
19609+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
19610+ jnz sysenter_tracesys
19611+sysenter_do_call:
19612+ cmpl $(IA32_NR_syscalls-1),%eax
19613+ ja ia32_badsys
19614+ IA32_ARG_FIXUP 1
19615+ call *ia32_sys_call_table(,%rax,8)
19616+ movq %rax,RAX-ARGOFFSET(%rsp)
19617+ jmp int_ret_from_sys_call
19618+
19619+sysenter_tracesys:
19620+ SAVE_REST
19621+ CLEAR_RREGS
19622+ movq $-ENOSYS,RAX(%rsp) /* really needed? */
19623+ movq %rsp,%rdi /* &pt_regs -> arg1 */
19624+ call syscall_trace_enter
19625+ LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
19626+ RESTORE_REST
19627+ movl %ebp, %ebp
19628+ /* no need to do an access_ok check here because rbp has been
19629+ 32bit zero extended */
19630+1: movl (%rbp),%r9d
19631+ .section __ex_table,"a"
19632+ .quad 1b,ia32_badarg
19633+ .previous
19634+ jmp sysenter_do_call
19635+ CFI_ENDPROC
19636+ENDPROC(ia32_sysenter_target)
19637+
19638+/*
19639+ * 32bit SYSCALL instruction entry.
19640+ *
19641+ * Arguments:
19642+ * %eax System call number.
19643+ * %ebx Arg1
19644+ * %ecx return EIP
19645+ * %edx Arg3
19646+ * %esi Arg4
19647+ * %edi Arg5
19648+ * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
19649+ * %esp user stack
19650+ * 0(%esp) Arg6
19651+ *
19652+ * Interrupts on.
19653+ *
19654+ * This is purely a fast path. For anything complicated we use the int 0x80
19655+ * path below. Set up a complete hardware stack frame to share code
19656+ * with the int 0x80 path.
19657+ */
19658+ENTRY(ia32_cstar_target)
19659+ CFI_STARTPROC32 simple
19660+ CFI_DEF_CFA rsp,SS+8-RIP+16
19661+ /*CFI_REL_OFFSET ss,SS-RIP+16*/
19662+ CFI_REL_OFFSET rsp,RSP-RIP+16
19663+ /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
19664+ /*CFI_REL_OFFSET cs,CS-RIP+16*/
19665+ CFI_REL_OFFSET rip,RIP-RIP+16
19666+ movl %eax,%eax /* zero extension */
19667+ movl RSP-RIP+16(%rsp),%r8d
19668+ SAVE_ARGS -8,1,1
19669+ movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
19670+ movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
19671+ movl %ebp,%ecx
19672+ movl $__USER32_CS,CS-ARGOFFSET(%rsp)
19673+ movl $__USER32_DS,SS-ARGOFFSET(%rsp)
19674+ /* no need to do an access_ok check here because r8 has been
19675+ 32bit zero extended */
19676+ /* hardware stack frame is complete now */
19677+1: movl (%r8),%r9d
19678+ .section __ex_table,"a"
19679+ .quad 1b,ia32_badarg
19680+ .previous
19681+ GET_THREAD_INFO(%r10)
19682+ orl $TS_COMPAT,threadinfo_status(%r10)
19683+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
19684+ jnz cstar_tracesys
19685+cstar_do_call:
19686+ cmpl $IA32_NR_syscalls-1,%eax
19687+ ja ia32_badsys
19688+ IA32_ARG_FIXUP 1
19689+ call *ia32_sys_call_table(,%rax,8)
19690+ movq %rax,RAX-ARGOFFSET(%rsp)
19691+ jmp int_ret_from_sys_call
19692+
19693+cstar_tracesys:
19694+ SAVE_REST
19695+ CLEAR_RREGS
19696+ movq $-ENOSYS,RAX(%rsp) /* really needed? */
19697+ movq %rsp,%rdi /* &pt_regs -> arg1 */
19698+ call syscall_trace_enter
19699+ LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
19700+ RESTORE_REST
19701+ movl RSP-ARGOFFSET(%rsp), %r8d
19702+ /* no need to do an access_ok check here because r8 has been
19703+ 32bit zero extended */
19704+1: movl (%r8),%r9d
19705+ .section __ex_table,"a"
19706+ .quad 1b,ia32_badarg
19707+ .previous
19708+ jmp cstar_do_call
19709+END(ia32_cstar_target)
19710+
19711+ia32_badarg:
19712+ movq $-EFAULT,%rax
19713+ jmp ia32_sysret
19714+ CFI_ENDPROC
19715+
19716+/*
19717+ * Emulated IA32 system calls via int 0x80.
19718+ *
19719+ * Arguments:
19720+ * %eax System call number.
19721+ * %ebx Arg1
19722+ * %ecx Arg2
19723+ * %edx Arg3
19724+ * %esi Arg4
19725+ * %edi Arg5
19726+ * %ebp Arg6 [note: not saved in the stack frame, should not be touched]
19727+ *
19728+ * Notes:
19729+ * Uses the same stack frame as the x86-64 version.
19730+ * All registers except %eax must be saved (but ptrace may violate that)
19731+ * Arguments are zero extended. For system calls that want sign extension and
19732+ * take long arguments a wrapper is needed. Most calls can just be called
19733+ * directly.
19734+ * Assumes it is only called from user space and entered with interrupts on.
19735+ */
19736+
19737+ENTRY(ia32_syscall)
19738+ CFI_STARTPROC simple
19739+ CFI_DEF_CFA rsp,SS+8-RIP+16
19740+ /*CFI_REL_OFFSET ss,SS-RIP+16*/
19741+ CFI_REL_OFFSET rsp,RSP-RIP+16
19742+ /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
19743+ /*CFI_REL_OFFSET cs,CS-RIP+16*/
19744+ CFI_REL_OFFSET rip,RIP-RIP+16
19745+ CFI_REL_OFFSET r11,8
19746+ CFI_REL_OFFSET rcx,0
19747+ movq 8(%rsp),%r11
19748+ CFI_RESTORE r11
19749+ popq %rcx
19750+ CFI_ADJUST_CFA_OFFSET -8
19751+ CFI_RESTORE rcx
19752+ movl %eax,%eax
19753+ movq %rax,(%rsp)
19754+ cld
19755+ /* note the registers are not zero extended to the sf.
19756+ this could be a problem. */
19757+ SAVE_ARGS 0,0,1
19758+ GET_THREAD_INFO(%r10)
19759+ orl $TS_COMPAT,threadinfo_status(%r10)
19760+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
19761+ jnz ia32_tracesys
19762+ia32_do_syscall:
19763+ cmpl $(IA32_NR_syscalls-1),%eax
19764+ ja ia32_badsys
19765+ IA32_ARG_FIXUP
19766+ call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
19767+ia32_sysret:
19768+ movq %rax,RAX-ARGOFFSET(%rsp)
19769+ jmp int_ret_from_sys_call
19770+
19771+ia32_tracesys:
19772+ SAVE_REST
19773+ movq $-ENOSYS,RAX(%rsp) /* really needed? */
19774+ movq %rsp,%rdi /* &pt_regs -> arg1 */
19775+ call syscall_trace_enter
19776+ LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
19777+ RESTORE_REST
19778+ jmp ia32_do_syscall
19779+END(ia32_syscall)
19780+
19781+ia32_badsys:
19782+ movq $0,ORIG_RAX-ARGOFFSET(%rsp)
19783+ movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
19784+ jmp int_ret_from_sys_call
19785+
19786+quiet_ni_syscall:
19787+ movq $-ENOSYS,%rax
19788+ ret
19789+ CFI_ENDPROC
19790+
19791+ .macro PTREGSCALL label, func, arg
19792+ .globl \label
19793+\label:
19794+ leaq \func(%rip),%rax
19795+ leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
19796+ jmp ia32_ptregs_common
19797+ .endm
19798+
19799+ CFI_STARTPROC32
19800+
19801+ PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
19802+ PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
19803+ PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
19804+ PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
19805+ PTREGSCALL stub32_execve, sys32_execve, %rcx
19806+ PTREGSCALL stub32_fork, sys_fork, %rdi
19807+ PTREGSCALL stub32_clone, sys32_clone, %rdx
19808+ PTREGSCALL stub32_vfork, sys_vfork, %rdi
19809+ PTREGSCALL stub32_iopl, sys_iopl, %rsi
19810+ PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
19811+
19812+ENTRY(ia32_ptregs_common)
19813+ popq %r11
19814+ CFI_ENDPROC
19815+ CFI_STARTPROC32 simple
19816+ CFI_DEF_CFA rsp,SS+8-ARGOFFSET
19817+ CFI_REL_OFFSET rax,RAX-ARGOFFSET
19818+ CFI_REL_OFFSET rcx,RCX-ARGOFFSET
19819+ CFI_REL_OFFSET rdx,RDX-ARGOFFSET
19820+ CFI_REL_OFFSET rsi,RSI-ARGOFFSET
19821+ CFI_REL_OFFSET rdi,RDI-ARGOFFSET
19822+ CFI_REL_OFFSET rip,RIP-ARGOFFSET
19823+/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/
19824+/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
19825+ CFI_REL_OFFSET rsp,RSP-ARGOFFSET
19826+/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
19827+ SAVE_REST
19828+ call *%rax
19829+ RESTORE_REST
19830+ jmp ia32_sysret /* misbalances the return cache */
19831+ CFI_ENDPROC
19832+END(ia32_ptregs_common)
19833+
19834+ .section .rodata,"a"
19835+ .align 8
19836+ia32_sys_call_table:
19837+ .quad sys_restart_syscall
19838+ .quad sys_exit
19839+ .quad stub32_fork
19840+ .quad sys_read
19841+ .quad sys_write
19842+ .quad compat_sys_open /* 5 */
19843+ .quad sys_close
19844+ .quad sys32_waitpid
19845+ .quad sys_creat
19846+ .quad sys_link
19847+ .quad sys_unlink /* 10 */
19848+ .quad stub32_execve
19849+ .quad sys_chdir
19850+ .quad compat_sys_time
19851+ .quad sys_mknod
19852+ .quad sys_chmod /* 15 */
19853+ .quad sys_lchown16
19854+ .quad quiet_ni_syscall /* old break syscall holder */
19855+ .quad sys_stat
19856+ .quad sys32_lseek
19857+ .quad sys_getpid /* 20 */
19858+ .quad compat_sys_mount /* mount */
19859+ .quad sys_oldumount /* old_umount */
19860+ .quad sys_setuid16
19861+ .quad sys_getuid16
19862+ .quad compat_sys_stime /* stime */ /* 25 */
19863+ .quad sys32_ptrace /* ptrace */
19864+ .quad sys_alarm
19865+ .quad sys_fstat /* (old)fstat */
19866+ .quad sys_pause
19867+ .quad compat_sys_utime /* 30 */
19868+ .quad quiet_ni_syscall /* old stty syscall holder */
19869+ .quad quiet_ni_syscall /* old gtty syscall holder */
19870+ .quad sys_access
19871+ .quad sys_nice
19872+ .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */
19873+ .quad sys_sync
19874+ .quad sys32_kill
19875+ .quad sys_rename
19876+ .quad sys_mkdir
19877+ .quad sys_rmdir /* 40 */
19878+ .quad sys_dup
19879+ .quad sys32_pipe
19880+ .quad compat_sys_times
19881+ .quad quiet_ni_syscall /* old prof syscall holder */
19882+ .quad sys_brk /* 45 */
19883+ .quad sys_setgid16
19884+ .quad sys_getgid16
19885+ .quad sys_signal
19886+ .quad sys_geteuid16
19887+ .quad sys_getegid16 /* 50 */
19888+ .quad sys_acct
19889+ .quad sys_umount /* new_umount */
19890+ .quad quiet_ni_syscall /* old lock syscall holder */
19891+ .quad compat_sys_ioctl
19892+ .quad compat_sys_fcntl64 /* 55 */
19893+ .quad quiet_ni_syscall /* old mpx syscall holder */
19894+ .quad sys_setpgid
19895+ .quad quiet_ni_syscall /* old ulimit syscall holder */
19896+ .quad sys32_olduname
19897+ .quad sys_umask /* 60 */
19898+ .quad sys_chroot
19899+ .quad sys32_ustat
19900+ .quad sys_dup2
19901+ .quad sys_getppid
19902+ .quad sys_getpgrp /* 65 */
19903+ .quad sys_setsid
19904+ .quad sys32_sigaction
19905+ .quad sys_sgetmask
19906+ .quad sys_ssetmask
19907+ .quad sys_setreuid16 /* 70 */
19908+ .quad sys_setregid16
19909+ .quad stub32_sigsuspend
19910+ .quad compat_sys_sigpending
19911+ .quad sys_sethostname
19912+ .quad compat_sys_setrlimit /* 75 */
19913+ .quad compat_sys_old_getrlimit /* old_getrlimit */
19914+ .quad compat_sys_getrusage
19915+ .quad sys32_gettimeofday
19916+ .quad sys32_settimeofday
19917+ .quad sys_getgroups16 /* 80 */
19918+ .quad sys_setgroups16
19919+ .quad sys32_old_select
19920+ .quad sys_symlink
19921+ .quad sys_lstat
19922+ .quad sys_readlink /* 85 */
19923+#ifdef CONFIG_IA32_AOUT
19924+ .quad sys_uselib
19925+#else
19926+ .quad quiet_ni_syscall
19927+#endif
19928+ .quad sys_swapon
19929+ .quad sys_reboot
19930+ .quad compat_sys_old_readdir
19931+ .quad sys32_mmap /* 90 */
19932+ .quad sys_munmap
19933+ .quad sys_truncate
19934+ .quad sys_ftruncate
19935+ .quad sys_fchmod
19936+ .quad sys_fchown16 /* 95 */
19937+ .quad sys_getpriority
19938+ .quad sys_setpriority
19939+ .quad quiet_ni_syscall /* old profil syscall holder */
19940+ .quad compat_sys_statfs
19941+ .quad compat_sys_fstatfs /* 100 */
19942+ .quad sys_ioperm
19943+ .quad compat_sys_socketcall
19944+ .quad sys_syslog
19945+ .quad compat_sys_setitimer
19946+ .quad compat_sys_getitimer /* 105 */
19947+ .quad compat_sys_newstat
19948+ .quad compat_sys_newlstat
19949+ .quad compat_sys_newfstat
19950+ .quad sys32_uname
19951+ .quad stub32_iopl /* 110 */
19952+ .quad sys_vhangup
19953+ .quad quiet_ni_syscall /* old "idle" system call */
19954+ .quad sys32_vm86_warning /* vm86old */
19955+ .quad compat_sys_wait4
19956+ .quad sys_swapoff /* 115 */
19957+ .quad sys32_sysinfo
19958+ .quad sys32_ipc
19959+ .quad sys_fsync
19960+ .quad stub32_sigreturn
19961+ .quad stub32_clone /* 120 */
19962+ .quad sys_setdomainname
19963+ .quad sys_uname
19964+ .quad sys_modify_ldt
19965+ .quad compat_sys_adjtimex
19966+ .quad sys32_mprotect /* 125 */
19967+ .quad compat_sys_sigprocmask
19968+ .quad quiet_ni_syscall /* create_module */
19969+ .quad sys_init_module
19970+ .quad sys_delete_module
19971+ .quad quiet_ni_syscall /* 130 get_kernel_syms */
19972+ .quad sys_quotactl
19973+ .quad sys_getpgid
19974+ .quad sys_fchdir
19975+ .quad quiet_ni_syscall /* bdflush */
19976+ .quad sys_sysfs /* 135 */
19977+ .quad sys_personality
19978+ .quad quiet_ni_syscall /* for afs_syscall */
19979+ .quad sys_setfsuid16
19980+ .quad sys_setfsgid16
19981+ .quad sys_llseek /* 140 */
19982+ .quad compat_sys_getdents
19983+ .quad compat_sys_select
19984+ .quad sys_flock
19985+ .quad sys_msync
19986+ .quad compat_sys_readv /* 145 */
19987+ .quad compat_sys_writev
19988+ .quad sys_getsid
19989+ .quad sys_fdatasync
19990+ .quad sys32_sysctl /* sysctl */
19991+ .quad sys_mlock /* 150 */
19992+ .quad sys_munlock
19993+ .quad sys_mlockall
19994+ .quad sys_munlockall
19995+ .quad sys_sched_setparam
19996+ .quad sys_sched_getparam /* 155 */
19997+ .quad sys_sched_setscheduler
19998+ .quad sys_sched_getscheduler
19999+ .quad sys_sched_yield
20000+ .quad sys_sched_get_priority_max
20001+ .quad sys_sched_get_priority_min /* 160 */
20002+ .quad sys_sched_rr_get_interval
20003+ .quad compat_sys_nanosleep
20004+ .quad sys_mremap
20005+ .quad sys_setresuid16
20006+ .quad sys_getresuid16 /* 165 */
20007+ .quad sys32_vm86_warning /* vm86 */
20008+ .quad quiet_ni_syscall /* query_module */
20009+ .quad sys_poll
20010+ .quad compat_sys_nfsservctl
20011+ .quad sys_setresgid16 /* 170 */
20012+ .quad sys_getresgid16
20013+ .quad sys_prctl
20014+ .quad stub32_rt_sigreturn
20015+ .quad sys32_rt_sigaction
20016+ .quad sys32_rt_sigprocmask /* 175 */
20017+ .quad sys32_rt_sigpending
20018+ .quad compat_sys_rt_sigtimedwait
20019+ .quad sys32_rt_sigqueueinfo
20020+ .quad stub32_rt_sigsuspend
20021+ .quad sys32_pread /* 180 */
20022+ .quad sys32_pwrite
20023+ .quad sys_chown16
20024+ .quad sys_getcwd
20025+ .quad sys_capget
20026+ .quad sys_capset
20027+ .quad stub32_sigaltstack
20028+ .quad sys32_sendfile
20029+ .quad quiet_ni_syscall /* streams1 */
20030+ .quad quiet_ni_syscall /* streams2 */
20031+ .quad stub32_vfork /* 190 */
20032+ .quad compat_sys_getrlimit
20033+ .quad sys32_mmap2
20034+ .quad sys32_truncate64
20035+ .quad sys32_ftruncate64
20036+ .quad sys32_stat64 /* 195 */
20037+ .quad sys32_lstat64
20038+ .quad sys32_fstat64
20039+ .quad sys_lchown
20040+ .quad sys_getuid
20041+ .quad sys_getgid /* 200 */
20042+ .quad sys_geteuid
20043+ .quad sys_getegid
20044+ .quad sys_setreuid
20045+ .quad sys_setregid
20046+ .quad sys_getgroups /* 205 */
20047+ .quad sys_setgroups
20048+ .quad sys_fchown
20049+ .quad sys_setresuid
20050+ .quad sys_getresuid
20051+ .quad sys_setresgid /* 210 */
20052+ .quad sys_getresgid
20053+ .quad sys_chown
20054+ .quad sys_setuid
20055+ .quad sys_setgid
20056+ .quad sys_setfsuid /* 215 */
20057+ .quad sys_setfsgid
20058+ .quad sys_pivot_root
20059+ .quad sys_mincore
20060+ .quad sys_madvise
20061+ .quad compat_sys_getdents64 /* 220 getdents64 */
20062+ .quad compat_sys_fcntl64
20063+ .quad quiet_ni_syscall /* tux */
20064+ .quad quiet_ni_syscall /* security */
20065+ .quad sys_gettid
20066+ .quad sys_readahead /* 225 */
20067+ .quad sys_setxattr
20068+ .quad sys_lsetxattr
20069+ .quad sys_fsetxattr
20070+ .quad sys_getxattr
20071+ .quad sys_lgetxattr /* 230 */
20072+ .quad sys_fgetxattr
20073+ .quad sys_listxattr
20074+ .quad sys_llistxattr
20075+ .quad sys_flistxattr
20076+ .quad sys_removexattr /* 235 */
20077+ .quad sys_lremovexattr
20078+ .quad sys_fremovexattr
20079+ .quad sys_tkill
20080+ .quad sys_sendfile64
20081+ .quad compat_sys_futex /* 240 */
20082+ .quad compat_sys_sched_setaffinity
20083+ .quad compat_sys_sched_getaffinity
20084+ .quad sys32_set_thread_area
20085+ .quad sys32_get_thread_area
20086+ .quad compat_sys_io_setup /* 245 */
20087+ .quad sys_io_destroy
20088+ .quad compat_sys_io_getevents
20089+ .quad compat_sys_io_submit
20090+ .quad sys_io_cancel
20091+ .quad sys_fadvise64 /* 250 */
20092+ .quad quiet_ni_syscall /* free_huge_pages */
20093+ .quad sys_exit_group
20094+ .quad sys32_lookup_dcookie
20095+ .quad sys_epoll_create
20096+ .quad sys_epoll_ctl /* 255 */
20097+ .quad sys_epoll_wait
20098+ .quad sys_remap_file_pages
20099+ .quad sys_set_tid_address
20100+ .quad compat_sys_timer_create
20101+ .quad compat_sys_timer_settime /* 260 */
20102+ .quad compat_sys_timer_gettime
20103+ .quad sys_timer_getoverrun
20104+ .quad sys_timer_delete
20105+ .quad compat_sys_clock_settime
20106+ .quad compat_sys_clock_gettime /* 265 */
20107+ .quad compat_sys_clock_getres
20108+ .quad compat_sys_clock_nanosleep
20109+ .quad compat_sys_statfs64
20110+ .quad compat_sys_fstatfs64
20111+ .quad sys_tgkill /* 270 */
20112+ .quad compat_sys_utimes
20113+ .quad sys32_fadvise64_64
20114+ .quad quiet_ni_syscall /* sys_vserver */
20115+ .quad sys_mbind
20116+ .quad compat_sys_get_mempolicy /* 275 */
20117+ .quad sys_set_mempolicy
20118+ .quad compat_sys_mq_open
20119+ .quad sys_mq_unlink
20120+ .quad compat_sys_mq_timedsend
20121+ .quad compat_sys_mq_timedreceive /* 280 */
20122+ .quad compat_sys_mq_notify
20123+ .quad compat_sys_mq_getsetattr
20124+ .quad compat_sys_kexec_load /* reserved for kexec */
20125+ .quad compat_sys_waitid
20126+ .quad quiet_ni_syscall /* 285: sys_altroot */
20127+ .quad sys_add_key
20128+ .quad sys_request_key
20129+ .quad sys_keyctl
20130+ .quad sys_ioprio_set
20131+ .quad sys_ioprio_get /* 290 */
20132+ .quad sys_inotify_init
20133+ .quad sys_inotify_add_watch
20134+ .quad sys_inotify_rm_watch
20135+ .quad sys_migrate_pages
20136+ .quad compat_sys_openat /* 295 */
20137+ .quad sys_mkdirat
20138+ .quad sys_mknodat
20139+ .quad sys_fchownat
20140+ .quad compat_sys_futimesat
20141+ .quad sys32_fstatat /* 300 */
20142+ .quad sys_unlinkat
20143+ .quad sys_renameat
20144+ .quad sys_linkat
20145+ .quad sys_symlinkat
20146+ .quad sys_readlinkat /* 305 */
20147+ .quad sys_fchmodat
20148+ .quad sys_faccessat
20149+ .quad quiet_ni_syscall /* pselect6 for now */
20150+ .quad quiet_ni_syscall /* ppoll for now */
20151+ .quad sys_unshare /* 310 */
20152+ .quad compat_sys_set_robust_list
20153+ .quad compat_sys_get_robust_list
20154+ .quad sys_splice
20155+ .quad sys_sync_file_range
20156+ .quad sys_tee
20157+ .quad compat_sys_vmsplice
20158+ .quad compat_sys_move_pages
20159+ia32_syscall_end:
20160Index: head-2008-11-25/arch/x86/kernel/acpi/sleep_64-xen.c
20161===================================================================
20162--- /dev/null 1970-01-01 00:00:00.000000000 +0000
20163+++ head-2008-11-25/arch/x86/kernel/acpi/sleep_64-xen.c 2008-04-15 09:29:41.000000000 +0200
20164@@ -0,0 +1,146 @@
20165+/*
20166+ * acpi.c - Architecture-Specific Low-Level ACPI Support
20167+ *
20168+ * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
20169+ * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
20170+ * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
20171+ * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
20172+ * Copyright (C) 2003 Pavel Machek, SuSE Labs
20173+ *
20174+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20175+ *
20176+ * This program is free software; you can redistribute it and/or modify
20177+ * it under the terms of the GNU General Public License as published by
20178+ * the Free Software Foundation; either version 2 of the License, or
20179+ * (at your option) any later version.
20180+ *
20181+ * This program is distributed in the hope that it will be useful,
20182+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
20183+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20184+ * GNU General Public License for more details.
20185+ *
20186+ * You should have received a copy of the GNU General Public License
20187+ * along with this program; if not, write to the Free Software
20188+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20189+ *
20190+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20191+ */
20192+
20193+#include <linux/kernel.h>
20194+#include <linux/init.h>
20195+#include <linux/types.h>
20196+#include <linux/stddef.h>
20197+#include <linux/slab.h>
20198+#include <linux/pci.h>
20199+#include <linux/bootmem.h>
20200+#include <linux/acpi.h>
20201+#include <linux/cpumask.h>
20202+
20203+#include <asm/mpspec.h>
20204+#include <asm/io.h>
20205+#include <asm/apic.h>
20206+#include <asm/apicdef.h>
20207+#include <asm/page.h>
20208+#include <asm/pgtable.h>
20209+#include <asm/pgalloc.h>
20210+#include <asm/io_apic.h>
20211+#include <asm/proto.h>
20212+#include <asm/tlbflush.h>
20213+
20214+/* --------------------------------------------------------------------------
20215+ Low-Level Sleep Support
20216+ -------------------------------------------------------------------------- */
20217+
20218+#ifdef CONFIG_ACPI_SLEEP
20219+
20220+#ifndef CONFIG_ACPI_PV_SLEEP
20221+/* address in low memory of the wakeup routine. */
20222+unsigned long acpi_wakeup_address = 0;
20223+unsigned long acpi_video_flags;
20224+extern char wakeup_start, wakeup_end;
20225+
20226+extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
20227+
20228+static pgd_t low_ptr;
20229+
20230+static void init_low_mapping(void)
20231+{
20232+ pgd_t *slot0 = pgd_offset(current->mm, 0UL);
20233+ low_ptr = *slot0;
20234+ set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET));
20235+ WARN_ON(num_online_cpus() != 1);
20236+ local_flush_tlb();
20237+}
20238+#endif
20239+
20240+/**
20241+ * acpi_save_state_mem - save kernel state
20242+ *
20243+ * Create an identity mapped page table and copy the wakeup routine to
20244+ * low memory.
20245+ */
20246+int acpi_save_state_mem(void)
20247+{
20248+#ifndef CONFIG_ACPI_PV_SLEEP
20249+ init_low_mapping();
20250+
20251+ memcpy((void *)acpi_wakeup_address, &wakeup_start,
20252+ &wakeup_end - &wakeup_start);
20253+ acpi_copy_wakeup_routine(acpi_wakeup_address);
20254+#endif
20255+ return 0;
20256+}
20257+
20258+/*
20259+ * acpi_restore_state
20260+ */
20261+void acpi_restore_state_mem(void)
20262+{
20263+#ifndef CONFIG_ACPI_PV_SLEEP
20264+ set_pgd(pgd_offset(current->mm, 0UL), low_ptr);
20265+ local_flush_tlb();
20266+#endif
20267+}
20268+
20269+/**
20270+ * acpi_reserve_bootmem - do _very_ early ACPI initialisation
20271+ *
20272+ * We allocate a page in low memory for the wakeup
20273+ * routine for when we come back from a sleep state. The
20274+ * runtime allocator allows specification of <16M pages, but not
20275+ * <1M pages.
20276+ */
20277+void __init acpi_reserve_bootmem(void)
20278+{
20279+#ifndef CONFIG_ACPI_PV_SLEEP
20280+ acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
20281+ if ((&wakeup_end - &wakeup_start) > PAGE_SIZE)
20282+ printk(KERN_CRIT
20283+ "ACPI: Wakeup code way too big, will crash on attempt to suspend\n");
20284+#endif
20285+}
20286+
20287+#ifndef CONFIG_ACPI_PV_SLEEP
20288+static int __init acpi_sleep_setup(char *str)
20289+{
20290+ while ((str != NULL) && (*str != '\0')) {
20291+ if (strncmp(str, "s3_bios", 7) == 0)
20292+ acpi_video_flags = 1;
20293+ if (strncmp(str, "s3_mode", 7) == 0)
20294+ acpi_video_flags |= 2;
20295+ str = strchr(str, ',');
20296+ if (str != NULL)
20297+ str += strspn(str, ", \t");
20298+ }
20299+
20300+ return 1;
20301+}
20302+
20303+__setup("acpi_sleep=", acpi_sleep_setup);
20304+#endif /* CONFIG_ACPI_PV_SLEEP */
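The option string accepted by acpi_sleep_setup() is a comma-separated list; booting with acpi_sleep=s3_bios,s3_mode, for example, ends up setting both video flags (value 3). A minimal user-space sketch of the same parsing loop, not part of the patch (the flag values 1 and 2 mirror the assignments above):

/* Illustrative sketch of the acpi_sleep= parsing loop; not part of the patch. */
#include <stdio.h>
#include <string.h>

static unsigned long parse_acpi_sleep(const char *str)
{
	unsigned long flags = 0;

	while (str != NULL && *str != '\0') {
		if (strncmp(str, "s3_bios", 7) == 0)
			flags |= 1;	/* the handler above assigns 1 here */
		if (strncmp(str, "s3_mode", 7) == 0)
			flags |= 2;
		str = strchr(str, ',');
		if (str != NULL)
			str += strspn(str, ", \t");
	}
	return flags;
}

int main(void)
{
	printf("%lu\n", parse_acpi_sleep("s3_bios,s3_mode"));	/* prints 3 */
	printf("%lu\n", parse_acpi_sleep("s3_mode"));		/* prints 2 */
	return 0;
}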
20305+
20306+#endif /*CONFIG_ACPI_SLEEP */
20307+
20308+void acpi_pci_link_exit(void)
20309+{
20310+}
20311Index: head-2008-11-25/arch/x86/kernel/apic_64-xen.c
20312===================================================================
20313--- /dev/null 1970-01-01 00:00:00.000000000 +0000
20314+++ head-2008-11-25/arch/x86/kernel/apic_64-xen.c 2007-06-12 13:13:01.000000000 +0200
20315@@ -0,0 +1,197 @@
20316+/*
20317+ * Local APIC handling, local APIC timers
20318+ *
20319+ * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
20320+ *
20321+ * Fixes
20322+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
20323+ * thanks to Eric Gilmore
20324+ * and Rolf G. Tews
20325+ * for testing these extensively.
20326+ * Maciej W. Rozycki : Various updates and fixes.
20327+ * Mikael Pettersson : Power Management for UP-APIC.
20328+ * Pavel Machek and
20329+ * Mikael Pettersson : PM converted to driver model.
20330+ */
20331+
20332+#include <linux/init.h>
20333+
20334+#include <linux/mm.h>
20335+#include <linux/delay.h>
20336+#include <linux/bootmem.h>
20337+#include <linux/smp_lock.h>
20338+#include <linux/interrupt.h>
20339+#include <linux/mc146818rtc.h>
20340+#include <linux/kernel_stat.h>
20341+#include <linux/sysdev.h>
20342+#include <linux/module.h>
20343+
20344+#include <asm/atomic.h>
20345+#include <asm/smp.h>
20346+#include <asm/mtrr.h>
20347+#include <asm/mpspec.h>
20348+#include <asm/desc.h>
20349+#include <asm/arch_hooks.h>
20350+#include <asm/hpet.h>
20351+#include <asm/idle.h>
20352+
20353+int apic_verbosity;
20354+
20355+/*
20356+ * 'what should we do if we get a hw irq event on an illegal vector'.
20357+ * Each architecture has to answer this for itself.
20358+ */
20359+void ack_bad_irq(unsigned int irq)
20360+{
20361+ printk("unexpected IRQ trap at vector %02x\n", irq);
20362+ /*
20363+ * Currently unexpected vectors happen only on SMP and APIC.
20364+ * We _must_ ack these because every local APIC has only N
20365+ * irq slots per priority level, and a 'hanging, unacked' IRQ
20366+ * holds up an irq slot - in excessive cases (when multiple
20367+ * unexpected vectors occur) that might lock up the APIC
20368+ * completely.
20369+ * But don't ack when the APIC is disabled. -AK
20370+ */
20371+ if (!disable_apic)
20372+ ack_APIC_irq();
20373+}
20374+
20375+int setup_profiling_timer(unsigned int multiplier)
20376+{
20377+ return -EINVAL;
20378+}
20379+
20380+void smp_local_timer_interrupt(struct pt_regs *regs)
20381+{
20382+ profile_tick(CPU_PROFILING, regs);
20383+#ifndef CONFIG_XEN
20384+#ifdef CONFIG_SMP
20385+ update_process_times(user_mode(regs));
20386+#endif
20387+#endif
20388+ /*
20389+ * We take the 'long' return path, and there every subsystem
20390+ * grabs the appropriate locks (kernel lock / irq lock).
20391+ *
20392+ * We might want to decouple profiling from the 'long path',
20393+ * and do the profiling totally in assembly.
20394+ *
20395+ * Currently this isn't too much of an issue (performance wise),
20396+ * we can take more than 100K local irqs per second on a 100 MHz P5.
20397+ */
20398+}
20399+
20400+/*
20401+ * Local APIC timer interrupt. This is the most natural way of doing
20402+ * local interrupts, but local timer interrupts can be emulated by
20403+ * broadcast interrupts too. [in case the hw doesn't support APIC timers]
20404+ *
20405+ * [ if a single-CPU system runs an SMP kernel then we call the local
20406+ * interrupt as well. Thus we cannot inline the local irq ... ]
20407+ */
20408+void smp_apic_timer_interrupt(struct pt_regs *regs)
20409+{
20410+ /*
20411+ * the NMI deadlock-detector uses this.
20412+ */
20413+ add_pda(apic_timer_irqs, 1);
20414+
20415+ /*
20416+ * NOTE! We'd better ACK the irq immediately,
20417+ * because timer handling can be slow.
20418+ */
20419+ ack_APIC_irq();
20420+ /*
20421+ * update_process_times() expects us to have done irq_enter().
20422+ * Besides, if we don't, timer interrupts ignore the global
20423+ * interrupt lock, which is the WrongThing (tm) to do.
20424+ */
20425+ exit_idle();
20426+ irq_enter();
20427+ smp_local_timer_interrupt(regs);
20428+ irq_exit();
20429+}
20430+
20431+/*
20432+ * This interrupt should _never_ happen with our APIC/SMP architecture
20433+ */
20434+asmlinkage void smp_spurious_interrupt(void)
20435+{
20436+ unsigned int v;
20437+ exit_idle();
20438+ irq_enter();
20439+ /*
20440+ * Check if this really is a spurious interrupt and ACK it
20441+ * if it is a vectored one. Just in case...
20442+ * Spurious interrupts should not be ACKed.
20443+ */
20444+ v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
20445+ if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
20446+ ack_APIC_irq();
20447+
20448+#if 0
20449+ static unsigned long last_warning;
20450+ static unsigned long skipped;
20451+
20452+ /* see sw-dev-man vol 3, chapter 7.4.13.5 */
20453+ if (time_before(last_warning+30*HZ,jiffies)) {
20454+ printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n",
20455+ smp_processor_id(), skipped);
20456+ last_warning = jiffies;
20457+ skipped = 0;
20458+ } else {
20459+ skipped++;
20460+ }
20461+#endif
20462+ irq_exit();
20463+}
20464+
20465+/*
20466+ * This interrupt should never happen with our APIC/SMP architecture
20467+ */
20468+
20469+asmlinkage void smp_error_interrupt(void)
20470+{
20471+ unsigned int v, v1;
20472+
20473+ exit_idle();
20474+ irq_enter();
20475+ /* First tickle the hardware, only then report what went on. -- REW */
20476+ v = apic_read(APIC_ESR);
20477+ apic_write(APIC_ESR, 0);
20478+ v1 = apic_read(APIC_ESR);
20479+ ack_APIC_irq();
20480+ atomic_inc(&irq_err_count);
20481+
20482+ /* Here is what the APIC error bits mean:
20483+ 0: Send CS error
20484+ 1: Receive CS error
20485+ 2: Send accept error
20486+ 3: Receive accept error
20487+ 4: Reserved
20488+ 5: Send illegal vector
20489+ 6: Received illegal vector
20490+ 7: Illegal register address
20491+ */
20492+ printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
20493+ smp_processor_id(), v , v1);
20494+ irq_exit();
20495+}
20496+
20497+int disable_apic;
20498+
20499+/*
20500+ * This initializes the IO-APIC and APIC hardware if this is
20501+ * a UP kernel.
20502+ */
20503+int __init APIC_init_uniprocessor (void)
20504+{
20505+#ifdef CONFIG_X86_IO_APIC
20506+ if (smp_found_config)
20507+ if (!skip_ioapic_setup && nr_ioapics)
20508+ setup_IO_APIC();
20509+#endif
20510+
20511+ return 1;
20512+}
20513Index: head-2008-11-25/arch/x86/kernel/e820_64-xen.c
20514===================================================================
20515--- /dev/null 1970-01-01 00:00:00.000000000 +0000
20516+++ head-2008-11-25/arch/x86/kernel/e820_64-xen.c 2008-04-22 19:56:27.000000000 +0200
20517@@ -0,0 +1,798 @@
20518+/*
20519+ * Handle the memory map.
20520+ * The functions here do the job until bootmem takes over.
20521+ *
20522+ * Getting sanitize_e820_map() in sync with i386 version by applying change:
20523+ * - Provisions for empty E820 memory regions (reported by certain BIOSes).
20524+ * Alex Achenbach <xela@slit.de>, December 2002.
20525+ * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
20526+ *
20527+ */
20528+#include <linux/kernel.h>
20529+#include <linux/types.h>
20530+#include <linux/init.h>
20531+#include <linux/bootmem.h>
20532+#include <linux/ioport.h>
20533+#include <linux/string.h>
20534+#include <linux/kexec.h>
20535+#include <linux/module.h>
20536+
20537+#include <asm/pgtable.h>
20538+#include <asm/page.h>
20539+#include <asm/e820.h>
20540+#include <asm/proto.h>
20541+#include <asm/bootsetup.h>
20542+#include <asm/sections.h>
20543+#include <xen/interface/memory.h>
20544+
20545+/*
20546+ * PFN of last memory page.
20547+ */
20548+unsigned long end_pfn;
20549+EXPORT_SYMBOL(end_pfn);
20550+
20551+/*
20552+ * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
20553+ * The direct mapping extends to end_pfn_map, so that we can directly access
20554+ * apertures, ACPI and other tables without having to play with fixmaps.
20555+ */
20556+unsigned long end_pfn_map;
20557+
20558+/*
20559+ * Last pfn which the user wants to use.
20560+ */
20561+unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;
20562+
20563+extern struct resource code_resource, data_resource;
20564+
20565+#ifdef CONFIG_XEN
20566+extern struct e820map machine_e820;
20567+#endif
20568+
20569+/* Check for some hardcoded bad areas that early boot is not allowed to touch */
20570+static inline int bad_addr(unsigned long *addrp, unsigned long size)
20571+{
20572+ unsigned long addr = *addrp, last = addr + size;
20573+
20574+#ifndef CONFIG_XEN
20575+ /* various gunk below that needed for SMP startup */
20576+ if (addr < 0x8000) {
20577+ *addrp = 0x8000;
20578+ return 1;
20579+ }
20580+
20581+ /* direct mapping tables of the kernel */
20582+ if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
20583+ *addrp = table_end << PAGE_SHIFT;
20584+ return 1;
20585+ }
20586+
20587+ /* initrd */
20588+#ifdef CONFIG_BLK_DEV_INITRD
20589+ if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
20590+ addr < INITRD_START+INITRD_SIZE) {
20591+ *addrp = INITRD_START + INITRD_SIZE;
20592+ return 1;
20593+ }
20594+#endif
20595+ /* kernel code + 640k memory hole (later should not be needed, but
20596+ be paranoid for now) */
20597+ if (last >= 640*1024 && addr < 1024*1024) {
20598+ *addrp = 1024*1024;
20599+ return 1;
20600+ }
20601+ if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) {
20602+ *addrp = __pa_symbol(&_end);
20603+ return 1;
20604+ }
20605+
20606+ if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
20607+ *addrp = ebda_addr + ebda_size;
20608+ return 1;
20609+ }
20610+
20611+ /* XXX ramdisk image here? */
20612+#else
20613+ if (last < (table_end<<PAGE_SHIFT)) {
20614+ *addrp = table_end << PAGE_SHIFT;
20615+ return 1;
20616+ }
20617+#endif
20618+ return 0;
20619+}
20620+
20621+/*
20622+ * This function checks if any part of the range <start,end> is mapped
20623+ * with type.
20624+ */
20625+int e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
20626+{
20627+ int i;
20628+
20629+#ifndef CONFIG_XEN
20630+ for (i = 0; i < e820.nr_map; i++) {
20631+ struct e820entry *ei = &e820.map[i];
20632+#else
20633+ if (!is_initial_xendomain())
20634+ return 0;
20635+ for (i = 0; i < machine_e820.nr_map; i++) {
20636+ const struct e820entry *ei = &machine_e820.map[i];
20637+#endif
20638+
20639+ if (type && ei->type != type)
20640+ continue;
20641+ if (ei->addr >= end || ei->addr + ei->size <= start)
20642+ continue;
20643+ return 1;
20644+ }
20645+ return 0;
20646+}
20647+EXPORT_SYMBOL_GPL(e820_any_mapped);
20648+
20649+/*
20650+ * This function checks if the entire range <start,end> is mapped with type.
20651+ *
20652+ * Note: this function only works correctly if the e820 table is sorted and
20653+ * non-overlapping, which is the case once sanitize_e820_map() has run.
20654+ */
20655+int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
20656+{
20657+ int i;
20658+
20659+#ifndef CONFIG_XEN
20660+ for (i = 0; i < e820.nr_map; i++) {
20661+ struct e820entry *ei = &e820.map[i];
20662+#else
20663+ if (!is_initial_xendomain())
20664+ return 0;
20665+ for (i = 0; i < machine_e820.nr_map; i++) {
20666+ const struct e820entry *ei = &machine_e820.map[i];
20667+#endif
20668+
20669+ if (type && ei->type != type)
20670+ continue;
20671+	/* does the region overlap (at least in part) with the current entry? */
20672+ if (ei->addr >= end || ei->addr + ei->size <= start)
20673+ continue;
20674+
20675+	/* if the entry covers the beginning of <start,end>, move start
20676+	 * to the end of the entry, since the range is covered up to there
20677+ */
20678+ if (ei->addr <= start)
20679+ start = ei->addr + ei->size;
20680+ /* if start is now at or beyond end, we're done, full coverage */
20681+ if (start >= end)
20682+ return 1; /* we're done */
20683+ }
20684+ return 0;
20685+}
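The walk above relies on the table being sorted and non-overlapping: every matching entry that begins at or before start pushes start to the entry's end, and the range counts as fully mapped once start reaches end. A stand-alone sketch of that coverage check, not part of the patch (the struct range type is made up for the example):

/* Stand-alone sketch of the e820_all_mapped() walk; not part of the patch. */
#include <stdio.h>

struct range { unsigned long addr, size; };

/* Returns 1 if [start, end) is fully covered by the sorted,
 * non-overlapping ranges in map[]. */
static int all_mapped(const struct range *map, int n,
		      unsigned long start, unsigned long end)
{
	int i;

	for (i = 0; i < n; i++) {
		if (map[i].addr >= end || map[i].addr + map[i].size <= start)
			continue;		/* no overlap with [start, end) */
		if (map[i].addr <= start)
			start = map[i].addr + map[i].size;
		if (start >= end)
			return 1;
	}
	return 0;
}

int main(void)
{
	const struct range map[] = { { 0x0, 0xa0000 }, { 0x100000, 0x100000 } };

	printf("%d\n", all_mapped(map, 2, 0x0, 0x90000));	/* 1: inside first entry */
	printf("%d\n", all_mapped(map, 2, 0x90000, 0x110000));	/* 0: hole at 640k-1M */
	return 0;
}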
20686+
20687+/*
20688+ * Find a free area in a specific range.
20689+ */
20690+unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
20691+{
20692+ int i;
20693+ for (i = 0; i < e820.nr_map; i++) {
20694+ struct e820entry *ei = &e820.map[i];
20695+ unsigned long addr = ei->addr, last;
20696+ if (ei->type != E820_RAM)
20697+ continue;
20698+ if (addr < start)
20699+ addr = start;
20700+ if (addr > ei->addr + ei->size)
20701+ continue;
20702+ while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
20703+ ;
20704+ last = addr + size;
20705+ if (last > ei->addr + ei->size)
20706+ continue;
20707+ if (last > end)
20708+ continue;
20709+ return addr;
20710+ }
20711+ return -1UL;
20712+}
20713+
20714+/*
20715+ * Free bootmem based on the e820 table for a node.
20716+ */
20717+void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
20718+{
20719+ int i;
20720+ for (i = 0; i < e820.nr_map; i++) {
20721+ struct e820entry *ei = &e820.map[i];
20722+ unsigned long last, addr;
20723+
20724+ if (ei->type != E820_RAM ||
20725+ ei->addr+ei->size <= start ||
20726+ ei->addr >= end)
20727+ continue;
20728+
20729+ addr = round_up(ei->addr, PAGE_SIZE);
20730+ if (addr < start)
20731+ addr = start;
20732+
20733+ last = round_down(ei->addr + ei->size, PAGE_SIZE);
20734+ if (last >= end)
20735+ last = end;
20736+
20737+ if (last > addr && last-addr >= PAGE_SIZE)
20738+ free_bootmem_node(pgdat, addr, last-addr);
20739+ }
20740+}
20741+
20742+/*
20743+ * Find the highest page frame number we have available
20744+ */
20745+unsigned long __init e820_end_of_ram(void)
20746+{
20747+ int i;
20748+ unsigned long end_pfn = 0;
20749+
20750+ for (i = 0; i < e820.nr_map; i++) {
20751+ struct e820entry *ei = &e820.map[i];
20752+ unsigned long start, end;
20753+
20754+ start = round_up(ei->addr, PAGE_SIZE);
20755+ end = round_down(ei->addr + ei->size, PAGE_SIZE);
20756+ if (start >= end)
20757+ continue;
20758+ if (ei->type == E820_RAM) {
20759+ if (end > end_pfn<<PAGE_SHIFT)
20760+ end_pfn = end>>PAGE_SHIFT;
20761+ } else {
20762+ if (end > end_pfn_map<<PAGE_SHIFT)
20763+ end_pfn_map = end>>PAGE_SHIFT;
20764+ }
20765+ }
20766+
20767+ if (end_pfn > end_pfn_map)
20768+ end_pfn_map = end_pfn;
20769+ if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
20770+ end_pfn_map = MAXMEM>>PAGE_SHIFT;
20771+ if (end_pfn > end_user_pfn)
20772+ end_pfn = end_user_pfn;
20773+ if (end_pfn > end_pfn_map)
20774+ end_pfn = end_pfn_map;
20775+
20776+ return end_pfn;
20777+}
20778+
20779+/*
20780+ * Compute how much memory is missing in a range.
20781+ * Unlike the other functions in this file the arguments are in page numbers.
20782+ */
20783+unsigned long __init
20784+e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
20785+{
20786+ unsigned long ram = 0;
20787+ unsigned long start = start_pfn << PAGE_SHIFT;
20788+ unsigned long end = end_pfn << PAGE_SHIFT;
20789+ int i;
20790+ for (i = 0; i < e820.nr_map; i++) {
20791+ struct e820entry *ei = &e820.map[i];
20792+ unsigned long last, addr;
20793+
20794+ if (ei->type != E820_RAM ||
20795+ ei->addr+ei->size <= start ||
20796+ ei->addr >= end)
20797+ continue;
20798+
20799+ addr = round_up(ei->addr, PAGE_SIZE);
20800+ if (addr < start)
20801+ addr = start;
20802+
20803+ last = round_down(ei->addr + ei->size, PAGE_SIZE);
20804+ if (last >= end)
20805+ last = end;
20806+
20807+ if (last > addr)
20808+ ram += last - addr;
20809+ }
20810+ return ((end - start) - ram) >> PAGE_SHIFT;
20811+}
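In other words, the hole is (end - start) minus the RAM bytes that intersect [start, end), converted back to pages. A toy computation illustrating the accounting, not part of the patch (the page-boundary rounding of individual entries is omitted here because the example entries are already aligned):

/* Sketch of the e820_hole_size() accounting; not part of the patch. */
#include <stdio.h>

#define PAGE_SHIFT 12

struct range { unsigned long addr, size; };

static unsigned long hole_pages(const struct range *ram, int n,
				unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long start = start_pfn << PAGE_SHIFT;
	unsigned long end = end_pfn << PAGE_SHIFT;
	unsigned long covered = 0;
	int i;

	for (i = 0; i < n; i++) {
		unsigned long a = ram[i].addr, l = ram[i].addr + ram[i].size;

		if (l <= start || a >= end)
			continue;		/* no intersection */
		if (a < start)
			a = start;
		if (l > end)
			l = end;
		covered += l - a;
	}
	return ((end - start) - covered) >> PAGE_SHIFT;
}

int main(void)
{
	/* RAM at 0-640k and 1M-2M: the 640k-1M hole is 96 pages. */
	const struct range ram[] = { { 0, 0xa0000 }, { 0x100000, 0x100000 } };

	printf("%lu\n", hole_pages(ram, 2, 0, 0x200000 >> PAGE_SHIFT));
	return 0;
}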
20812+
20813+/*
20814+ * Mark e820 reserved areas as busy for the resource manager.
20815+ */
20816+void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
20817+{
20818+ int i;
20819+ for (i = 0; i < nr_map; i++) {
20820+ struct resource *res;
20821+ res = alloc_bootmem_low(sizeof(struct resource));
20822+ switch (e820[i].type) {
20823+ case E820_RAM: res->name = "System RAM"; break;
20824+ case E820_ACPI: res->name = "ACPI Tables"; break;
20825+ case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
20826+ default: res->name = "reserved";
20827+ }
20828+ res->start = e820[i].addr;
20829+ res->end = res->start + e820[i].size - 1;
20830+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
20831+ request_resource(&iomem_resource, res);
20832+ if (e820[i].type == E820_RAM) {
20833+ /*
20834+ * We don't know which RAM region contains kernel data,
20835+ * so we try it repeatedly and let the resource manager
20836+ * test it.
20837+ */
20838+#ifndef CONFIG_XEN
20839+ request_resource(res, &code_resource);
20840+ request_resource(res, &data_resource);
20841+#endif
20842+#ifdef CONFIG_KEXEC
20843+ if (crashk_res.start != crashk_res.end)
20844+ request_resource(res, &crashk_res);
20845+#ifdef CONFIG_XEN
20846+ xen_machine_kexec_register_resources(res);
20847+#endif
20848+#endif
20849+ }
20850+ }
20851+}
20852+
20853+/*
20854+ * Add a memory region to the kernel e820 map.
20855+ */
20856+void __init add_memory_region(unsigned long start, unsigned long size, int type)
20857+{
20858+ int x = e820.nr_map;
20859+
20860+ if (x == E820MAX) {
20861+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
20862+ return;
20863+ }
20864+
20865+ e820.map[x].addr = start;
20866+ e820.map[x].size = size;
20867+ e820.map[x].type = type;
20868+ e820.nr_map++;
20869+}
20870+
20871+void __init e820_print_map(char *who)
20872+{
20873+ int i;
20874+
20875+ for (i = 0; i < e820.nr_map; i++) {
20876+ printk(" %s: %016Lx - %016Lx ", who,
20877+ (unsigned long long) e820.map[i].addr,
20878+ (unsigned long long) (e820.map[i].addr + e820.map[i].size));
20879+ switch (e820.map[i].type) {
20880+ case E820_RAM: printk("(usable)\n");
20881+ break;
20882+ case E820_RESERVED:
20883+ printk("(reserved)\n");
20884+ break;
20885+ case E820_ACPI:
20886+ printk("(ACPI data)\n");
20887+ break;
20888+ case E820_NVS:
20889+ printk("(ACPI NVS)\n");
20890+ break;
20891+ default: printk("type %u\n", e820.map[i].type);
20892+ break;
20893+ }
20894+ }
20895+}
20896+
20897+/*
20898+ * Sanitize the BIOS e820 map.
20899+ *
20900+ * Some e820 responses include overlapping entries. The following
20901+ * replaces the original e820 map with a new one, removing overlaps.
20902+ *
20903+ */
20904+static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
20905+{
20906+ struct change_member {
20907+ struct e820entry *pbios; /* pointer to original bios entry */
20908+ unsigned long long addr; /* address for this change point */
20909+ };
20910+ static struct change_member change_point_list[2*E820MAX] __initdata;
20911+ static struct change_member *change_point[2*E820MAX] __initdata;
20912+ static struct e820entry *overlap_list[E820MAX] __initdata;
20913+ static struct e820entry new_bios[E820MAX] __initdata;
20914+ struct change_member *change_tmp;
20915+ unsigned long current_type, last_type;
20916+ unsigned long long last_addr;
20917+ int chgidx, still_changing;
20918+ int overlap_entries;
20919+ int new_bios_entry;
20920+ int old_nr, new_nr, chg_nr;
20921+ int i;
20922+
20923+ /*
20924+ Visually we're performing the following (1,2,3,4 = memory types)...
20925+
20926+ Sample memory map (w/overlaps):
20927+ ____22__________________
20928+ ______________________4_
20929+ ____1111________________
20930+ _44_____________________
20931+ 11111111________________
20932+ ____________________33__
20933+ ___________44___________
20934+ __________33333_________
20935+ ______________22________
20936+ ___________________2222_
20937+ _________111111111______
20938+ _____________________11_
20939+ _________________4______
20940+
20941+ Sanitized equivalent (no overlap):
20942+ 1_______________________
20943+ _44_____________________
20944+ ___1____________________
20945+ ____22__________________
20946+ ______11________________
20947+ _________1______________
20948+ __________3_____________
20949+ ___________44___________
20950+ _____________33_________
20951+ _______________2________
20952+ ________________1_______
20953+ _________________4______
20954+ ___________________2____
20955+ ____________________33__
20956+ ______________________4_
20957+ */
20958+
20959+ /* if there's only one memory region, don't bother */
20960+ if (*pnr_map < 2)
20961+ return -1;
20962+
20963+ old_nr = *pnr_map;
20964+
20965+ /* bail out if we find any unreasonable addresses in bios map */
20966+ for (i=0; i<old_nr; i++)
20967+ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
20968+ return -1;
20969+
20970+ /* create pointers for initial change-point information (for sorting) */
20971+ for (i=0; i < 2*old_nr; i++)
20972+ change_point[i] = &change_point_list[i];
20973+
20974+ /* record all known change-points (starting and ending addresses),
20975+ omitting those that are for empty memory regions */
20976+ chgidx = 0;
20977+ for (i=0; i < old_nr; i++) {
20978+ if (biosmap[i].size != 0) {
20979+ change_point[chgidx]->addr = biosmap[i].addr;
20980+ change_point[chgidx++]->pbios = &biosmap[i];
20981+ change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
20982+ change_point[chgidx++]->pbios = &biosmap[i];
20983+ }
20984+ }
20985+ chg_nr = chgidx;
20986+
20987+ /* sort change-point list by memory addresses (low -> high) */
20988+ still_changing = 1;
20989+ while (still_changing) {
20990+ still_changing = 0;
20991+ for (i=1; i < chg_nr; i++) {
20992+ /* if <current_addr> > <last_addr>, swap */
20993+ /* or, if current=<start_addr> & last=<end_addr>, swap */
20994+ if ((change_point[i]->addr < change_point[i-1]->addr) ||
20995+ ((change_point[i]->addr == change_point[i-1]->addr) &&
20996+ (change_point[i]->addr == change_point[i]->pbios->addr) &&
20997+ (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
20998+ )
20999+ {
21000+ change_tmp = change_point[i];
21001+ change_point[i] = change_point[i-1];
21002+ change_point[i-1] = change_tmp;
21003+ still_changing=1;
21004+ }
21005+ }
21006+ }
21007+
21008+ /* create a new bios memory map, removing overlaps */
21009+ overlap_entries=0; /* number of entries in the overlap table */
21010+ new_bios_entry=0; /* index for creating new bios map entries */
21011+ last_type = 0; /* start with undefined memory type */
21012+ last_addr = 0; /* start with 0 as last starting address */
21013+	/* loop through change-points, determining their effect on the new bios map */
21014+ for (chgidx=0; chgidx < chg_nr; chgidx++)
21015+ {
21016+ /* keep track of all overlapping bios entries */
21017+ if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
21018+ {
21019+ /* add map entry to overlap list (> 1 entry implies an overlap) */
21020+ overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
21021+ }
21022+ else
21023+ {
21024+ /* remove entry from list (order independent, so swap with last) */
21025+ for (i=0; i<overlap_entries; i++)
21026+ {
21027+ if (overlap_list[i] == change_point[chgidx]->pbios)
21028+ overlap_list[i] = overlap_list[overlap_entries-1];
21029+ }
21030+ overlap_entries--;
21031+ }
21032+ /* if there are overlapping entries, decide which "type" to use */
21033+ /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
21034+ current_type = 0;
21035+ for (i=0; i<overlap_entries; i++)
21036+ if (overlap_list[i]->type > current_type)
21037+ current_type = overlap_list[i]->type;
21038+ /* continue building up new bios map based on this information */
21039+ if (current_type != last_type) {
21040+ if (last_type != 0) {
21041+ new_bios[new_bios_entry].size =
21042+ change_point[chgidx]->addr - last_addr;
21043+ /* move forward only if the new size was non-zero */
21044+ if (new_bios[new_bios_entry].size != 0)
21045+ if (++new_bios_entry >= E820MAX)
21046+ break; /* no more space left for new bios entries */
21047+ }
21048+ if (current_type != 0) {
21049+ new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
21050+ new_bios[new_bios_entry].type = current_type;
21051+ last_addr=change_point[chgidx]->addr;
21052+ }
21053+ last_type = current_type;
21054+ }
21055+ }
21056+ new_nr = new_bios_entry; /* retain count for new bios entries */
21057+
21058+ /* copy new bios mapping into original location */
21059+ memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
21060+ *pnr_map = new_nr;
21061+
21062+ return 0;
21063+}
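Conceptually the change-point pass sweeps all entry boundaries in address order and, for each span between two boundaries, keeps the highest type among the entries covering it (higher type values win, so reserved beats usable), merging adjacent spans of equal type. A compact user-space sweep that reproduces the same result for a toy overlapping map; it is illustrative only, not the kernel's in-place algorithm, and it skips the equal-type merging:

/* Simplified sweep illustrating what sanitize_e820_map() computes;
 * not part of the patch. */
#include <stdio.h>
#include <stdlib.h>

struct ent { unsigned long long addr, size; unsigned type; };

static int cmp_ull(const void *a, const void *b)
{
	unsigned long long x = *(const unsigned long long *)a;
	unsigned long long y = *(const unsigned long long *)b;
	return (x > y) - (x < y);
}

int main(void)
{
	/* 0-1M usable (type 1) overlapped by 640k-1M reserved (type 2). */
	const struct ent map[] = {
		{ 0x000000, 0x100000, 1 },
		{ 0x0a0000, 0x060000, 2 },
	};
	const int n = 2;
	unsigned long long pts[2 * 2];
	int i, j;

	for (i = 0; i < n; i++) {
		pts[2 * i] = map[i].addr;
		pts[2 * i + 1] = map[i].addr + map[i].size;
	}
	qsort(pts, 2 * n, sizeof(pts[0]), cmp_ull);

	/* For each span between consecutive boundaries, the winning type is
	 * the highest type of any entry covering that span. */
	for (i = 0; i + 1 < 2 * n; i++) {
		unsigned type = 0;

		if (pts[i] == pts[i + 1])
			continue;
		for (j = 0; j < n; j++)
			if (map[j].addr <= pts[i] &&
			    map[j].addr + map[j].size >= pts[i + 1] &&
			    map[j].type > type)
				type = map[j].type;
		if (type)
			printf("%#09llx - %#09llx type %u\n",
			       pts[i], pts[i + 1], type);
	}
	return 0;
}

On this input the sweep prints a type-1 span for 0-640k and a type-2 span for 640k-1M, which is exactly how the kernel routine resolves the overlap.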
21064+
21065+/*
21066+ * Copy the BIOS e820 map into a safe place.
21067+ *
21068+ * Sanity-check it while we're at it..
21069+ *
21070+ * If we're lucky and live on a modern system, the setup code
21071+ * will have given us a memory map that we can use to properly
21072+ * set up memory. If we aren't, we'll fake a memory map.
21073+ *
21074+ * We check to see that the memory map contains at least 2 elements
21075+ * before we'll use it, because the detection code in setup.S may
21076+ * not be perfect and most every PC known to man has two memory
21077+ * regions: one from 0 to 640k, and one from 1mb up. (The IBM
21078+ * thinkpad 560x, for example, does not cooperate with the memory
21079+ * detection code.)
21080+ */
21081+static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
21082+{
21083+#ifndef CONFIG_XEN
21084+ /* Only one memory region (or negative)? Ignore it */
21085+ if (nr_map < 2)
21086+ return -1;
21087+#else
21088+ BUG_ON(nr_map < 1);
21089+#endif
21090+
21091+ do {
21092+ unsigned long start = biosmap->addr;
21093+ unsigned long size = biosmap->size;
21094+ unsigned long end = start + size;
21095+ unsigned long type = biosmap->type;
21096+
21097+ /* Overflow in 64 bits? Ignore the memory map. */
21098+ if (start > end)
21099+ return -1;
21100+
21101+#ifndef CONFIG_XEN
21102+ /*
21103+ * Some BIOSes claim RAM in the 640k - 1M region.
21104+ * Not right. Fix it up.
21105+ *
21106+	 * This should be removed on Hammer, which is supposed not to
21107+	 * have non-e820-covered ISA mappings there, but I had some strange
21108+ * problems so it stays for now. -AK
21109+ */
21110+ if (type == E820_RAM) {
21111+ if (start < 0x100000ULL && end > 0xA0000ULL) {
21112+ if (start < 0xA0000ULL)
21113+ add_memory_region(start, 0xA0000ULL-start, type);
21114+ if (end <= 0x100000ULL)
21115+ continue;
21116+ start = 0x100000ULL;
21117+ size = end - start;
21118+ }
21119+ }
21120+#endif
21121+
21122+ add_memory_region(start, size, type);
21123+ } while (biosmap++,--nr_map);
21124+
21125+#ifdef CONFIG_XEN
21126+ if (is_initial_xendomain()) {
21127+ struct xen_memory_map memmap;
21128+
21129+ memmap.nr_entries = E820MAX;
21130+ set_xen_guest_handle(memmap.buffer, machine_e820.map);
21131+
21132+ if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
21133+ BUG();
21134+ machine_e820.nr_map = memmap.nr_entries;
21135+ } else
21136+ machine_e820 = e820;
21137+#endif
21138+
21139+ return 0;
21140+}
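The non-Xen branch above splits any RAM entry that crosses the 640k-1M legacy hole into a piece below 640k and a piece starting at 1M. A small sketch of just that clipping, not part of the patch (add_region() is a stand-in for add_memory_region(), and the example numbers are made up):

/* Sketch of the 640k-1M clipping done in copy_e820_map(); not part of the patch. */
#include <stdio.h>

static void add_region(unsigned long long start, unsigned long long size,
		       const char *what)
{
	printf("add %#llx + %#llx (%s)\n", start, size, what);
}

static void copy_ram_entry(unsigned long long start, unsigned long long size)
{
	unsigned long long end = start + size;

	/* BIOSes sometimes report RAM in the 640k-1M legacy hole; clip it. */
	if (start < 0x100000ULL && end > 0xa0000ULL) {
		if (start < 0xa0000ULL)
			add_region(start, 0xa0000ULL - start, "RAM below 640k");
		if (end <= 0x100000ULL)
			return;			/* nothing left above 1M */
		start = 0x100000ULL;
		size = end - start;
	}
	add_region(start, size, "RAM");
}

int main(void)
{
	copy_ram_entry(0x90000, 0x170000);	/* 576k..2M spans the hole */
	return 0;
}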
21141+
21142+#ifndef CONFIG_XEN
21143+void __init setup_memory_region(void)
21144+{
21145+ char *who = "BIOS-e820";
21146+
21147+ /*
21148+ * Try to copy the BIOS-supplied E820-map.
21149+ *
21150+ * Otherwise fake a memory map; one section from 0k->640k,
21151+ * the next section from 1mb->appropriate_mem_k
21152+ */
21153+ sanitize_e820_map(E820_MAP, &E820_MAP_NR);
21154+ if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
21155+ unsigned long mem_size;
21156+
21157+ /* compare results from other methods and take the greater */
21158+ if (ALT_MEM_K < EXT_MEM_K) {
21159+ mem_size = EXT_MEM_K;
21160+ who = "BIOS-88";
21161+ } else {
21162+ mem_size = ALT_MEM_K;
21163+ who = "BIOS-e801";
21164+ }
21165+
21166+ e820.nr_map = 0;
21167+ add_memory_region(0, LOWMEMSIZE(), E820_RAM);
21168+ add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
21169+ }
21170+ printk(KERN_INFO "BIOS-provided physical RAM map:\n");
21171+ e820_print_map(who);
21172+}
21173+
21174+#else /* CONFIG_XEN */
21175+
21176+void __init setup_memory_region(void)
21177+{
21178+ int rc;
21179+ struct xen_memory_map memmap;
21180+ /*
21181+ * This is rather large for a stack variable but this early in
21182+	 * the boot process we know we have plenty of slack space.
21183+ */
21184+ struct e820entry map[E820MAX];
21185+
21186+ memmap.nr_entries = E820MAX;
21187+ set_xen_guest_handle(memmap.buffer, map);
21188+
21189+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
21190+ if ( rc == -ENOSYS ) {
21191+ memmap.nr_entries = 1;
21192+ map[0].addr = 0ULL;
21193+ map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
21194+ /* 8MB slack (to balance backend allocations). */
21195+ map[0].size += 8 << 20;
21196+ map[0].type = E820_RAM;
21197+ rc = 0;
21198+ }
21199+ BUG_ON(rc);
21200+
21201+ sanitize_e820_map(map, (char *)&memmap.nr_entries);
21202+
21203+ BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
21204+
21205+ printk(KERN_INFO "BIOS-provided physical RAM map:\n");
21206+ e820_print_map("Xen");
21207+}
21208+#endif
21209+
21210+void __init parse_memopt(char *p, char **from)
21211+{
21212+ int i;
21213+ unsigned long current_end;
21214+ unsigned long end;
21215+
21216+ end_user_pfn = memparse(p, from);
21217+ end_user_pfn >>= PAGE_SHIFT;
21218+
21219+ end = end_user_pfn<<PAGE_SHIFT;
21220+ i = e820.nr_map-1;
21221+ current_end = e820.map[i].addr + e820.map[i].size;
21222+
21223+ if (current_end < end) {
21224+ /*
21225+ * The e820 map ends before our requested size so
21226+ * extend the final entry to the requested address.
21227+ */
21228+ if (e820.map[i].type == E820_RAM)
21229+ e820.map[i].size = end - e820.map[i].addr;
21230+ else
21231+ add_memory_region(current_end, end - current_end, E820_RAM);
21232+ }
21233+}
21234+
21235+void __init parse_memmapopt(char *p, char **from)
21236+{
21237+ unsigned long long start_at, mem_size;
21238+
21239+ mem_size = memparse(p, from);
21240+ p = *from;
21241+ if (*p == '@') {
21242+ start_at = memparse(p+1, from);
21243+ add_memory_region(start_at, mem_size, E820_RAM);
21244+ } else if (*p == '#') {
21245+ start_at = memparse(p+1, from);
21246+ add_memory_region(start_at, mem_size, E820_ACPI);
21247+ } else if (*p == '$') {
21248+ start_at = memparse(p+1, from);
21249+ add_memory_region(start_at, mem_size, E820_RESERVED);
21250+ } else {
21251+ end_user_pfn = (mem_size >> PAGE_SHIFT);
21252+ }
21253+ p = *from;
21254+}
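So memmap=size@addr adds usable RAM, size#addr adds ACPI data, size$addr adds a reserved region, and a bare size only caps end_user_pfn, while mem= above is the plain cap. A stand-alone sketch of that dispatch, not part of the patch; parse_size() is only a rough stand-in for the kernel's memparse() and handles just the K/M/G suffixes:

/* Sketch of the memmap= separator handling; not part of the patch. */
#include <stdio.h>
#include <stdlib.h>

static unsigned long long parse_size(const char *s, const char **retptr)
{
	char *e;
	unsigned long long v = strtoull(s, &e, 0);

	switch (*e) {
	case 'G': case 'g': v <<= 10;	/* fall through */
	case 'M': case 'm': v <<= 10;	/* fall through */
	case 'K': case 'k': v <<= 10; e++;
	}
	*retptr = e;
	return v;
}

static void parse_memmap(const char *p)
{
	const char *rest;
	unsigned long long size = parse_size(p, &rest), at;

	switch (*rest) {
	case '@':
		at = parse_size(rest + 1, &rest);
		printf("RAM      %#llx bytes at %#llx\n", size, at);
		break;
	case '#':
		at = parse_size(rest + 1, &rest);
		printf("ACPI     %#llx bytes at %#llx\n", size, at);
		break;
	case '$':
		at = parse_size(rest + 1, &rest);
		printf("reserved %#llx bytes at %#llx\n", size, at);
		break;
	default:
		printf("cap usable memory at %#llx bytes\n", size);
	}
}

int main(void)
{
	parse_memmap("64M@0x100000000");
	parse_memmap("512M");
	return 0;
}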
21255+
21256+unsigned long pci_mem_start = 0xaeedbabe;
21257+EXPORT_SYMBOL(pci_mem_start);
21258+
21259+/*
21260+ * Search for the biggest gap in the low 32 bits of the e820
21261+ * memory space. We pass this space to PCI so it can assign MMIO
21262+ * resources to hotplug or unconfigured devices.
21263+ * Hopefully the BIOS left enough space for them.
21264+ */
21265+__init void e820_setup_gap(struct e820entry *e820, int nr_map)
21266+{
21267+ unsigned long gapstart, gapsize, round;
21268+ unsigned long last;
21269+ int i;
21270+ int found = 0;
21271+
21272+ last = 0x100000000ull;
21273+ gapstart = 0x10000000;
21274+ gapsize = 0x400000;
21275+ i = nr_map;
21276+ while (--i >= 0) {
21277+ unsigned long long start = e820[i].addr;
21278+ unsigned long long end = start + e820[i].size;
21279+
21280+ /*
21281+ * Since "last" is at most 4GB, we know we'll
21282+ * fit in 32 bits if this condition is true
21283+ */
21284+ if (last > end) {
21285+ unsigned long gap = last - end;
21286+
21287+ if (gap > gapsize) {
21288+ gapsize = gap;
21289+ gapstart = end;
21290+ found = 1;
21291+ }
21292+ }
21293+ if (start < last)
21294+ last = start;
21295+ }
21296+
21297+ if (!found) {
21298+ gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
21299+ printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
21300+ KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
21301+ }
21302+
21303+ /*
21304+ * See how much we want to round up: start off with
21305+ * rounding to the next 1MB area.
21306+ */
21307+ round = 0x100000;
21308+ while ((gapsize >> 4) > round)
21309+ round += round;
21310+ /* Fun with two's complement */
21311+ pci_mem_start = (gapstart + round) & -round;
21312+
21313+ printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
21314+ pci_mem_start, gapstart, gapsize);
21315+}
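The rounding step starts at 1MB and doubles until it exceeds gapsize/16, then aligns gapstart up to that power of two with (gapstart + round) & -round. A short worked example of the arithmetic, not part of the patch (the gap values are made up):

/* Sketch of the pci_mem_start rounding in e820_setup_gap(); illustrative only. */
#include <stdio.h>

int main(void)
{
	unsigned long gapstart = 0xc0000000;	/* example: gap begins at 3GB */
	unsigned long gapsize  = 0x40000000;	/* example: 1GB of gap        */
	unsigned long round = 0x100000;		/* start at 1MB alignment     */

	while ((gapsize >> 4) > round)		/* grow until round > gapsize/16 */
		round += round;
	/* -round is the two's-complement alignment mask, as in the code above */
	printf("round = %#lx, pci_mem_start = %#lx\n",
	       round, (gapstart + round) & -round);
	return 0;
}

For these numbers round settles at 64MB and pci_mem_start comes out as 0xc4000000.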
21316Index: head-2008-11-25/arch/x86/kernel/early_printk-xen.c
21317===================================================================
21318--- /dev/null 1970-01-01 00:00:00.000000000 +0000
21319+++ head-2008-11-25/arch/x86/kernel/early_printk-xen.c 2007-06-12 13:13:01.000000000 +0200
21320@@ -0,0 +1,302 @@
21321+#include <linux/console.h>
21322+#include <linux/kernel.h>
21323+#include <linux/init.h>
21324+#include <linux/string.h>
21325+#include <linux/screen_info.h>
21326+#include <asm/io.h>
21327+#include <asm/processor.h>
21328+#include <asm/fcntl.h>
21329+
21330+/* Simple VGA output */
21331+
21332+#ifdef __i386__
21333+#include <asm/setup.h>
21334+#define VGABASE (__ISA_IO_base + 0xb8000)
21335+#else
21336+#include <asm/bootsetup.h>
21337+#define VGABASE ((void __iomem *)0xffffffff800b8000UL)
21338+#endif
21339+
21340+#ifndef CONFIG_XEN
21341+static int max_ypos = 25, max_xpos = 80;
21342+static int current_ypos = 25, current_xpos = 0;
21343+
21344+static void early_vga_write(struct console *con, const char *str, unsigned n)
21345+{
21346+ char c;
21347+ int i, k, j;
21348+
21349+ while ((c = *str++) != '\0' && n-- > 0) {
21350+ if (current_ypos >= max_ypos) {
21351+ /* scroll 1 line up */
21352+ for (k = 1, j = 0; k < max_ypos; k++, j++) {
21353+ for (i = 0; i < max_xpos; i++) {
21354+ writew(readw(VGABASE+2*(max_xpos*k+i)),
21355+ VGABASE + 2*(max_xpos*j + i));
21356+ }
21357+ }
21358+ for (i = 0; i < max_xpos; i++)
21359+ writew(0x720, VGABASE + 2*(max_xpos*j + i));
21360+ current_ypos = max_ypos-1;
21361+ }
21362+ if (c == '\n') {
21363+ current_xpos = 0;
21364+ current_ypos++;
21365+ } else if (c != '\r') {
21366+ writew(((0x7 << 8) | (unsigned short) c),
21367+ VGABASE + 2*(max_xpos*current_ypos +
21368+ current_xpos++));
21369+ if (current_xpos >= max_xpos) {
21370+ current_xpos = 0;
21371+ current_ypos++;
21372+ }
21373+ }
21374+ }
21375+}
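The scroll above copies every text row one row up through the VGA text buffer and blanks the bottom row (0x720 is a space with the default attribute). The same shuffle on a plain character grid, as an illustrative sketch rather than the MMIO version:

/* Sketch of the one-line scroll done by early_vga_write(); operates on a
 * plain array instead of the VGA text buffer. Not part of the patch. */
#include <stdio.h>
#include <string.h>

#define ROWS 4
#define COLS 8

int main(void)
{
	char screen[ROWS][COLS + 1] = { "line 0 ", "line 1 ", "line 2 ", "line 3 " };
	int k;

	/* copy row k into row k-1, then blank the last row */
	for (k = 1; k < ROWS; k++)
		memcpy(screen[k - 1], screen[k], COLS);
	memset(screen[ROWS - 1], ' ', COLS);

	for (k = 0; k < ROWS; k++)
		printf("[%s]\n", screen[k]);
	return 0;
}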
21376+
21377+static struct console early_vga_console = {
21378+ .name = "earlyvga",
21379+ .write = early_vga_write,
21380+ .flags = CON_PRINTBUFFER,
21381+ .index = -1,
21382+};
21383+
21384+/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
21385+
21386+static int early_serial_base = 0x3f8; /* ttyS0 */
21387+
21388+#define XMTRDY 0x20
21389+
21390+#define DLAB 0x80
21391+
21392+#define TXR 0 /* Transmit register (WRITE) */
21393+#define RXR 0 /* Receive register (READ) */
21394+#define IER 1 /* Interrupt Enable */
21395+#define IIR 2 /* Interrupt ID */
21396+#define FCR 2 /* FIFO control */
21397+#define LCR 3 /* Line control */
21398+#define MCR 4 /* Modem control */
21399+#define LSR 5 /* Line Status */
21400+#define MSR 6 /* Modem Status */
21401+#define DLL 0 /* Divisor Latch Low */
21402+#define DLH 1 /* Divisor latch High */
21403+
21404+static int early_serial_putc(unsigned char ch)
21405+{
21406+ unsigned timeout = 0xffff;
21407+ while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
21408+ cpu_relax();
21409+ outb(ch, early_serial_base + TXR);
21410+ return timeout ? 0 : -1;
21411+}
21412+
21413+static void early_serial_write(struct console *con, const char *s, unsigned n)
21414+{
21415+ while (*s && n-- > 0) {
21416+ early_serial_putc(*s);
21417+ if (*s == '\n')
21418+ early_serial_putc('\r');
21419+ s++;
21420+ }
21421+}
21422+
21423+#define DEFAULT_BAUD 9600
21424+
21425+static __init void early_serial_init(char *s)
21426+{
21427+ unsigned char c;
21428+ unsigned divisor;
21429+ unsigned baud = DEFAULT_BAUD;
21430+ char *e;
21431+
21432+ if (*s == ',')
21433+ ++s;
21434+
21435+ if (*s) {
21436+ unsigned port;
21437+ if (!strncmp(s,"0x",2)) {
21438+ early_serial_base = simple_strtoul(s, &e, 16);
21439+ } else {
21440+ static int bases[] = { 0x3f8, 0x2f8 };
21441+
21442+ if (!strncmp(s,"ttyS",4))
21443+ s += 4;
21444+ port = simple_strtoul(s, &e, 10);
21445+ if (port > 1 || s == e)
21446+ port = 0;
21447+ early_serial_base = bases[port];
21448+ }
21449+ s += strcspn(s, ",");
21450+ if (*s == ',')
21451+ s++;
21452+ }
21453+
21454+ outb(0x3, early_serial_base + LCR); /* 8n1 */
21455+ outb(0, early_serial_base + IER); /* no interrupt */
21456+ outb(0, early_serial_base + FCR); /* no fifo */
21457+ outb(0x3, early_serial_base + MCR); /* DTR + RTS */
21458+
21459+ if (*s) {
21460+ baud = simple_strtoul(s, &e, 0);
21461+ if (baud == 0 || s == e)
21462+ baud = DEFAULT_BAUD;
21463+ }
21464+
21465+ divisor = 115200 / baud;
21466+ c = inb(early_serial_base + LCR);
21467+ outb(c | DLAB, early_serial_base + LCR);
21468+ outb(divisor & 0xff, early_serial_base + DLL);
21469+ outb((divisor >> 8) & 0xff, early_serial_base + DLH);
21470+ outb(c & ~DLAB, early_serial_base + LCR);
21471+}
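The UART is set to 8n1 and the baud rate is programmed as a divisor of 115200 split across DLL/DLH while DLAB is set; 9600 baud, for instance, gives divisor 12. A tiny sketch of that split, not part of the patch:

/* Sketch of the divisor computation used by early_serial_init() above. */
#include <stdio.h>

int main(void)
{
	unsigned baud = 9600;
	unsigned divisor = 115200 / baud;

	printf("divisor=%u  DLL=0x%02x  DLH=0x%02x\n",
	       divisor, divisor & 0xff, (divisor >> 8) & 0xff);
	return 0;
}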
21472+
21473+#else /* CONFIG_XEN */
21474+
21475+static void
21476+early_serial_write(struct console *con, const char *s, unsigned count)
21477+{
21478+ int n;
21479+
21480+ while (count > 0) {
21481+ n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s);
21482+ if (n <= 0)
21483+ break;
21484+ count -= n;
21485+ s += n;
21486+ }
21487+}
21488+
21489+static __init void early_serial_init(char *s)
21490+{
21491+}
21492+
21493+/*
21494+ * No early VGA console on Xen, as we do not have convenient ISA-space
21495+ * mappings. Someone should fix this for domain 0. For now, use fake serial.
21496+ */
21497+#define early_vga_console early_serial_console
21498+
21499+#endif
21500+
21501+static struct console early_serial_console = {
21502+ .name = "earlyser",
21503+ .write = early_serial_write,
21504+ .flags = CON_PRINTBUFFER,
21505+ .index = -1,
21506+};
21507+
21508+/* Console interface to a host file on AMD's SimNow! */
21509+
21510+static int simnow_fd;
21511+
21512+enum {
21513+ MAGIC1 = 0xBACCD00A,
21514+ MAGIC2 = 0xCA110000,
21515+ XOPEN = 5,
21516+ XWRITE = 4,
21517+};
21518+
21519+static noinline long simnow(long cmd, long a, long b, long c)
21520+{
21521+ long ret;
21522+ asm volatile("cpuid" :
21523+ "=a" (ret) :
21524+ "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
21525+ return ret;
21526+}
21527+
21528+void __init simnow_init(char *str)
21529+{
21530+ char *fn = "klog";
21531+ if (*str == '=')
21532+ fn = ++str;
21533+ /* error ignored */
21534+ simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
21535+}
21536+
21537+static void simnow_write(struct console *con, const char *s, unsigned n)
21538+{
21539+ simnow(XWRITE, simnow_fd, (unsigned long)s, n);
21540+}
21541+
21542+static struct console simnow_console = {
21543+ .name = "simnow",
21544+ .write = simnow_write,
21545+ .flags = CON_PRINTBUFFER,
21546+ .index = -1,
21547+};
21548+
21549+/* Direct interface for emergencies */
21550+struct console *early_console = &early_vga_console;
21551+static int early_console_initialized = 0;
21552+
21553+void early_printk(const char *fmt, ...)
21554+{
21555+ char buf[512];
21556+ int n;
21557+ va_list ap;
21558+
21559+ va_start(ap,fmt);
21560+ n = vscnprintf(buf,512,fmt,ap);
21561+ early_console->write(early_console,buf,n);
21562+ va_end(ap);
21563+}
21564+
21565+static int __initdata keep_early;
21566+
21567+int __init setup_early_printk(char *opt)
21568+{
21569+ char *space;
21570+ char buf[256];
21571+
21572+ if (early_console_initialized)
21573+ return 1;
21574+
21575+ strlcpy(buf,opt,sizeof(buf));
21576+ space = strchr(buf, ' ');
21577+ if (space)
21578+ *space = 0;
21579+
21580+ if (strstr(buf,"keep"))
21581+ keep_early = 1;
21582+
21583+ if (!strncmp(buf, "serial", 6)) {
21584+ early_serial_init(buf + 6);
21585+ early_console = &early_serial_console;
21586+ } else if (!strncmp(buf, "ttyS", 4)) {
21587+ early_serial_init(buf);
21588+ early_console = &early_serial_console;
21589+ } else if (!strncmp(buf, "vga", 3)
21590+#ifndef CONFIG_XEN
21591+ && SCREEN_INFO.orig_video_isVGA == 1) {
21592+ max_xpos = SCREEN_INFO.orig_video_cols;
21593+ max_ypos = SCREEN_INFO.orig_video_lines;
21594+ current_ypos = SCREEN_INFO.orig_y;
21595+#else
21596+ || !strncmp(buf, "xen", 3)) {
21597+#endif
21598+ early_console = &early_vga_console;
21599+ } else if (!strncmp(buf, "simnow", 6)) {
21600+ simnow_init(buf + 6);
21601+ early_console = &simnow_console;
21602+ keep_early = 1;
21603+ }
21604+ early_console_initialized = 1;
21605+ register_console(early_console);
21606+ return 0;
21607+}
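The dispatch above keys on the option prefix, so command lines such as earlyprintk=serial,ttyS0,9600, earlyprintk=vga or, under Xen, earlyprintk=xen pick the console, and a 'keep' token in the option keeps the console registered instead of letting disable_early_printk() remove it. A minimal sketch of the same prefix matching, not part of the patch:

/* Sketch of the prefix dispatch in setup_early_printk(); illustrative only. */
#include <stdio.h>
#include <string.h>

static const char *pick_console(const char *buf)
{
	if (!strncmp(buf, "serial", 6) || !strncmp(buf, "ttyS", 4))
		return "early serial console";
	if (!strncmp(buf, "vga", 3) || !strncmp(buf, "xen", 3))
		return "early VGA console (fake serial on Xen)";
	if (!strncmp(buf, "simnow", 6))
		return "SimNow host-file console";
	return "default console";
}

int main(void)
{
	printf("%s\n", pick_console("serial,ttyS0,9600"));
	printf("%s\n", pick_console("xen"));
	return 0;
}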
21608+
21609+void __init disable_early_printk(void)
21610+{
21611+ if (!early_console_initialized || !early_console)
21612+ return;
21613+ if (!keep_early) {
21614+ printk("disabling early console\n");
21615+ unregister_console(early_console);
21616+ early_console_initialized = 0;
21617+ } else {
21618+ printk("keeping early console\n");
21619+ }
21620+}
21621+
21622+__setup("earlyprintk=", setup_early_printk);
21623Index: head-2008-11-25/arch/x86/kernel/entry_64-xen.S
21624===================================================================
21625--- /dev/null 1970-01-01 00:00:00.000000000 +0000
21626+++ head-2008-11-25/arch/x86/kernel/entry_64-xen.S 2008-10-29 09:55:56.000000000 +0100
21627@@ -0,0 +1,1322 @@
21628+/*
21629+ * linux/arch/x86_64/entry.S
21630+ *
21631+ * Copyright (C) 1991, 1992 Linus Torvalds
21632+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
21633+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
21634+ *
21635+ * $Id$
21636+ *
21637+ * Jun Nakajima <jun.nakajima@intel.com>
21638+ * Asit Mallick <asit.k.mallick@intel.com>
21639+ * Modified for Xen
21640+ */
21641+
21642+/*
21643+ * entry.S contains the system-call and fault low-level handling routines.
21644+ *
21645+ * NOTE: This code handles signal-recognition, which happens every time
21646+ * after an interrupt and after each system call.
21647+ *
21648+ * Normal syscalls and interrupts don't save a full stack frame; this is
21649+ * only done for syscall tracing, signals or fork/exec et al.
21650+ *
21651+ * A note on terminology:
21652+ * - top of stack: Architecture defined interrupt frame from SS to RIP
21653+ * at the top of the kernel process stack.
21654+ * - partial stack frame: partially saved registers up to R11.
21655+ * - full stack frame: Like partial stack frame, but all register saved.
21656+ *
21657+ * TODO:
21658+ * - schedule it carefully for the final hardware.
21659+ */
21660+
21661+#define ASSEMBLY 1
21662+#include <linux/linkage.h>
21663+#include <asm/segment.h>
21664+#include <asm/smp.h>
21665+#include <asm/cache.h>
21666+#include <asm/errno.h>
21667+#include <asm/dwarf2.h>
21668+#include <asm/calling.h>
21669+#include <asm/asm-offsets.h>
21670+#include <asm/msr.h>
21671+#include <asm/unistd.h>
21672+#include <asm/thread_info.h>
21673+#include <asm/hw_irq.h>
21674+#include <asm/page.h>
21675+#include <asm/irqflags.h>
21676+#include <asm/errno.h>
21677+#include <xen/interface/arch-x86_64.h>
21678+#include <xen/interface/features.h>
21679+
21680+#include "xen_entry.S"
21681+
21682+ .code64
21683+
21684+#ifndef CONFIG_PREEMPT
21685+#define retint_kernel retint_restore_args
21686+#endif
21687+
21688+
21689+.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
21690+#ifdef CONFIG_TRACE_IRQFLAGS
21691+ bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
21692+ jnc 1f
21693+ TRACE_IRQS_ON
21694+1:
21695+#endif
21696+.endm
21697+
21698+NMI_MASK = 0x80000000
21699+
21700+/*
21701+ * C code is not supposed to know about undefined top of stack. Every time
21702+ * a C function with a pt_regs argument is called from the SYSCALL-based
21703+ * fast path, FIXUP_TOP_OF_STACK is needed.
21704+ * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
21705+ * manipulation.
21706+ */
21707+
21708+ /* %rsp:at FRAMEEND */
21709+ .macro FIXUP_TOP_OF_STACK tmp
21710+ movq $__USER_CS,CS(%rsp)
21711+ movq $-1,RCX(%rsp)
21712+ .endm
21713+
21714+ .macro RESTORE_TOP_OF_STACK tmp,offset=0
21715+ .endm
21716+
21717+ .macro FAKE_STACK_FRAME child_rip
21718+ /* push in order ss, rsp, eflags, cs, rip */
21719+ xorl %eax, %eax
21720+ pushq %rax /* ss */
21721+ CFI_ADJUST_CFA_OFFSET 8
21722+ /*CFI_REL_OFFSET ss,0*/
21723+ pushq %rax /* rsp */
21724+ CFI_ADJUST_CFA_OFFSET 8
21725+ CFI_REL_OFFSET rsp,0
21726+ pushq $(1<<9) /* eflags - interrupts on */
21727+ CFI_ADJUST_CFA_OFFSET 8
21728+ /*CFI_REL_OFFSET rflags,0*/
21729+ pushq $__KERNEL_CS /* cs */
21730+ CFI_ADJUST_CFA_OFFSET 8
21731+ /*CFI_REL_OFFSET cs,0*/
21732+ pushq \child_rip /* rip */
21733+ CFI_ADJUST_CFA_OFFSET 8
21734+ CFI_REL_OFFSET rip,0
21735+ pushq %rax /* orig rax */
21736+ CFI_ADJUST_CFA_OFFSET 8
21737+ .endm
21738+
21739+ .macro UNFAKE_STACK_FRAME
21740+ addq $8*6, %rsp
21741+ CFI_ADJUST_CFA_OFFSET -(6*8)
21742+ .endm
21743+
21744+ .macro CFI_DEFAULT_STACK start=1,adj=0
21745+ .if \start
21746+ CFI_STARTPROC simple
21747+ CFI_DEF_CFA rsp,SS+8 - \adj*ARGOFFSET
21748+ .else
21749+ CFI_DEF_CFA_OFFSET SS+8 - \adj*ARGOFFSET
21750+ .endif
21751+ .if \adj == 0
21752+ CFI_REL_OFFSET r15,R15
21753+ CFI_REL_OFFSET r14,R14
21754+ CFI_REL_OFFSET r13,R13
21755+ CFI_REL_OFFSET r12,R12
21756+ CFI_REL_OFFSET rbp,RBP
21757+ CFI_REL_OFFSET rbx,RBX
21758+ .endif
21759+ CFI_REL_OFFSET r11,R11 - \adj*ARGOFFSET
21760+ CFI_REL_OFFSET r10,R10 - \adj*ARGOFFSET
21761+ CFI_REL_OFFSET r9,R9 - \adj*ARGOFFSET
21762+ CFI_REL_OFFSET r8,R8 - \adj*ARGOFFSET
21763+ CFI_REL_OFFSET rax,RAX - \adj*ARGOFFSET
21764+ CFI_REL_OFFSET rcx,RCX - \adj*ARGOFFSET
21765+ CFI_REL_OFFSET rdx,RDX - \adj*ARGOFFSET
21766+ CFI_REL_OFFSET rsi,RSI - \adj*ARGOFFSET
21767+ CFI_REL_OFFSET rdi,RDI - \adj*ARGOFFSET
21768+ CFI_REL_OFFSET rip,RIP - \adj*ARGOFFSET
21769+ /*CFI_REL_OFFSET cs,CS - \adj*ARGOFFSET*/
21770+ /*CFI_REL_OFFSET rflags,EFLAGS - \adj*ARGOFFSET*/
21771+ CFI_REL_OFFSET rsp,RSP - \adj*ARGOFFSET
21772+ /*CFI_REL_OFFSET ss,SS - \adj*ARGOFFSET*/
21773+ .endm
21774+
21775+ /*
21776+ * Must be consistent with the definition in arch-x86/xen-x86_64.h:
21777+ * struct iret_context {
21778+ * u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
21779+ * };
21780+ * with rax, r11, and rcx being taken care of in the hypercall stub.
21781+ */
21782+ .macro HYPERVISOR_IRET flag
21783+ testb $3,1*8(%rsp)
21784+ jnz 2f
21785+ testl $NMI_MASK,2*8(%rsp)
21786+ jnz 2f
21787+
21788+ cmpb $0,(xen_features+XENFEAT_supervisor_mode_kernel)(%rip)
21789+ jne 1f
21790+
21791+ /* Direct iret to kernel space. Correct CS and SS. */
21792+ orl $3,1*8(%rsp)
21793+ orl $3,4*8(%rsp)
21794+1: iretq
21795+
21796+2: /* Slow iret via hypervisor. */
21797+ andl $~NMI_MASK, 2*8(%rsp)
21798+ pushq $\flag
21799+ jmp hypercall_page + (__HYPERVISOR_iret * 32)
21800+ .endm
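By the time this macro runs, the hypercall stub has already taken care of rax, r11 and rcx, so the frame at %rsp starts with rip; that is why 1*8(%rsp) is the saved CS (whose low two bits are the RPL, hence the testb $3), 2*8(%rsp) is the saved flags slot that is also tested for NMI_MASK, and 4*8(%rsp) is SS. A small offset check against that layout, as an illustrative sketch assuming 8-byte fields:

/* Sketch: stack-slot offsets used by the HYPERVISOR_IRET macro above, for the
 * frame remaining once rax/r11/rcx have been handled. Not part of the patch. */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct iret_frame {		/* what is left on the stack at %rsp */
	uint64_t rip, cs, rflags, rsp, ss;
};

int main(void)
{
	printf("cs     at %zu(%%rsp)\n", offsetof(struct iret_frame, cs));	/* 8  */
	printf("rflags at %zu(%%rsp)\n", offsetof(struct iret_frame, rflags));	/* 16 */
	printf("ss     at %zu(%%rsp)\n", offsetof(struct iret_frame, ss));	/* 32 */
	return 0;
}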
21801+
21802+/*
21803+ * A newly forked process directly context switches into this.
21804+ */
21805+/* rdi: prev */
21806+ENTRY(ret_from_fork)
21807+ CFI_DEFAULT_STACK
21808+ push kernel_eflags(%rip)
21809+ CFI_ADJUST_CFA_OFFSET 4
21810+ popf # reset kernel eflags
21811+ CFI_ADJUST_CFA_OFFSET -4
21812+ call schedule_tail
21813+ GET_THREAD_INFO(%rcx)
21814+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
21815+ jnz rff_trace
21816+rff_action:
21817+ RESTORE_REST
21818+ testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
21819+ je int_ret_from_sys_call
21820+ testl $_TIF_IA32,threadinfo_flags(%rcx)
21821+ jnz int_ret_from_sys_call
21822+ RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
21823+ jmp ret_from_sys_call
21824+rff_trace:
21825+ movq %rsp,%rdi
21826+ call syscall_trace_leave
21827+ GET_THREAD_INFO(%rcx)
21828+ jmp rff_action
21829+ CFI_ENDPROC
21830+END(ret_from_fork)
21831+
21832+/*
21833+ * initial frame state for interrupts and exceptions
21834+ */
21835+ .macro _frame ref
21836+ CFI_STARTPROC simple
21837+ CFI_DEF_CFA rsp,SS+8-\ref
21838+ /*CFI_REL_OFFSET ss,SS-\ref*/
21839+ CFI_REL_OFFSET rsp,RSP-\ref
21840+ /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
21841+ /*CFI_REL_OFFSET cs,CS-\ref*/
21842+ CFI_REL_OFFSET rip,RIP-\ref
21843+ .endm
21844+
21845+/*
21846+ * System call entry. Up to 6 arguments in registers are supported.
21847+ *
21848+ * SYSCALL does not save anything on the stack and does not change the
21849+ * stack pointer.
21850+ */
21851+
21852+/*
21853+ * Register setup:
21854+ * rax system call number
21855+ * rdi arg0
21856+ * rcx return address for syscall/sysret, C arg3
21857+ * rsi arg1
21858+ * rdx arg2
21859+ * r10 arg3 (--> moved to rcx for C)
21860+ * r8 arg4
21861+ * r9 arg5
21862+ * r11 eflags for syscall/sysret, temporary for C
21863+ * r12-r15,rbp,rbx saved by C code, not touched.
21864+ *
21865+ * Interrupts are enabled on entry.
21866+ * Only called from user space.
21867+ *
21868+ * XXX if we had a free scratch register we could save the RSP into the stack frame
21869+ * and report it properly in ps. Unfortunately we don't have one.
21870+ *
21871+ * When the user can change the frame, always force IRET. That is because
21872+ * it deals with uncanonical addresses better. SYSRET has trouble
21873+ * with them due to bugs in both AMD and Intel CPUs.
21874+ */
21875+
21876+ENTRY(system_call)
21877+ _frame (RIP-0x10)
21878+ SAVE_ARGS -8,0
21879+ movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
21880+ GET_THREAD_INFO(%rcx)
21881+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
21882+ CFI_REMEMBER_STATE
21883+ jnz tracesys
21884+ cmpq $__NR_syscall_max,%rax
21885+ ja badsys
21886+ movq %r10,%rcx
21887+ call *sys_call_table(,%rax,8) # XXX: rip relative
21888+ movq %rax,RAX-ARGOFFSET(%rsp)
21889+/*
21890+ * Syscall return path ending with SYSRET (fast path)
21891+ * Has incomplete stack frame and undefined top of stack.
21892+ */
21893+ .globl ret_from_sys_call
21894+ret_from_sys_call:
21895+ movl $_TIF_ALLWORK_MASK,%edi
21896+ /* edi: flagmask */
21897+sysret_check:
21898+ GET_THREAD_INFO(%rcx)
21899+ XEN_BLOCK_EVENTS(%rsi)
21900+ TRACE_IRQS_OFF
21901+ movl threadinfo_flags(%rcx),%edx
21902+ andl %edi,%edx
21903+ CFI_REMEMBER_STATE
21904+ jnz sysret_careful
21905+ /*
21906+ * sysretq will re-enable interrupts:
21907+ */
21908+ TRACE_IRQS_ON
21909+ XEN_UNBLOCK_EVENTS(%rsi)
21910+ RESTORE_ARGS 0,8,0
21911+ HYPERVISOR_IRET VGCF_IN_SYSCALL
21912+
21913+ /* Handle reschedules */
21914+ /* edx: work, edi: workmask */
21915+sysret_careful:
21916+ CFI_RESTORE_STATE
21917+ bt $TIF_NEED_RESCHED,%edx
21918+ jnc sysret_signal
21919+ TRACE_IRQS_ON
21920+ XEN_UNBLOCK_EVENTS(%rsi)
21921+ pushq %rdi
21922+ CFI_ADJUST_CFA_OFFSET 8
21923+ call schedule
21924+ popq %rdi
21925+ CFI_ADJUST_CFA_OFFSET -8
21926+ jmp sysret_check
21927+
21928+ /* Handle a signal */
21929+sysret_signal:
21930+ TRACE_IRQS_ON
21931+/* sti */
21932+ XEN_UNBLOCK_EVENTS(%rsi)
21933+ testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
21934+ jz 1f
21935+
21936+ /* Really a signal */
21937+ /* edx: work flags (arg3) */
21938+ leaq do_notify_resume(%rip),%rax
21939+ leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
21940+ xorl %esi,%esi # oldset -> arg2
21941+ call ptregscall_common
21942+1: movl $_TIF_NEED_RESCHED,%edi
21943+ /* Use IRET because user could have changed frame. This
21944+ works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
21945+ XEN_BLOCK_EVENTS(%rsi)
21946+ TRACE_IRQS_OFF
21947+ jmp int_with_check
21948+
21949+badsys:
21950+ movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
21951+ jmp ret_from_sys_call
21952+
21953+ /* Do syscall tracing */
21954+tracesys:
21955+ CFI_RESTORE_STATE
21956+ SAVE_REST
21957+ movq $-ENOSYS,RAX(%rsp)
21958+ FIXUP_TOP_OF_STACK %rdi
21959+ movq %rsp,%rdi
21960+ call syscall_trace_enter
21961+ LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
21962+ RESTORE_REST
21963+ cmpq $__NR_syscall_max,%rax
21964+ ja 1f
21965+ movq %r10,%rcx /* fixup for C */
21966+ call *sys_call_table(,%rax,8)
21967+1: movq %rax,RAX-ARGOFFSET(%rsp)
21968+ /* Use IRET because user could have changed frame */
21969+ jmp int_ret_from_sys_call
21970+ CFI_ENDPROC
21971+END(system_call)
21972+
21973+/*
21974+ * Syscall return path ending with IRET.
21975+ * Has correct top of stack, but partial stack frame.
21976+ */
21977+ENTRY(int_ret_from_sys_call)
21978+ CFI_STARTPROC simple
21979+ CFI_DEF_CFA rsp,SS+8-ARGOFFSET
21980+ /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/
21981+ CFI_REL_OFFSET rsp,RSP-ARGOFFSET
21982+ /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
21983+ /*CFI_REL_OFFSET cs,CS-ARGOFFSET*/
21984+ CFI_REL_OFFSET rip,RIP-ARGOFFSET
21985+ CFI_REL_OFFSET rdx,RDX-ARGOFFSET
21986+ CFI_REL_OFFSET rcx,RCX-ARGOFFSET
21987+ CFI_REL_OFFSET rax,RAX-ARGOFFSET
21988+ CFI_REL_OFFSET rdi,RDI-ARGOFFSET
21989+ CFI_REL_OFFSET rsi,RSI-ARGOFFSET
21990+ CFI_REL_OFFSET r8,R8-ARGOFFSET
21991+ CFI_REL_OFFSET r9,R9-ARGOFFSET
21992+ CFI_REL_OFFSET r10,R10-ARGOFFSET
21993+ CFI_REL_OFFSET r11,R11-ARGOFFSET
21994+ XEN_BLOCK_EVENTS(%rsi)
21995+ TRACE_IRQS_OFF
21996+ testb $3,CS-ARGOFFSET(%rsp)
21997+ jnz 1f
21998+ /* Need to set the proper %ss (not NULL) for ring 3 iretq */
21999+ movl $__KERNEL_DS,SS-ARGOFFSET(%rsp)
22000+	jmp retint_restore_args	# return from ring3 kernel
22001+1:
22002+ movl $_TIF_ALLWORK_MASK,%edi
22003+ /* edi: mask to check */
22004+int_with_check:
22005+ GET_THREAD_INFO(%rcx)
22006+ movl threadinfo_flags(%rcx),%edx
22007+ andl %edi,%edx
22008+ jnz int_careful
22009+ andl $~TS_COMPAT,threadinfo_status(%rcx)
22010+ jmp retint_restore_args
22011+
22012+ /* Either reschedule or signal or syscall exit tracking needed. */
22013+ /* First do a reschedule test. */
22014+ /* edx: work, edi: workmask */
22015+int_careful:
22016+ bt $TIF_NEED_RESCHED,%edx
22017+ jnc int_very_careful
22018+ TRACE_IRQS_ON
22019+/* sti */
22020+ XEN_UNBLOCK_EVENTS(%rsi)
22021+ pushq %rdi
22022+ CFI_ADJUST_CFA_OFFSET 8
22023+ call schedule
22024+ popq %rdi
22025+ CFI_ADJUST_CFA_OFFSET -8
22026+ XEN_BLOCK_EVENTS(%rsi)
22027+ TRACE_IRQS_OFF
22028+ jmp int_with_check
22029+
22030+ /* handle signals and tracing -- both require a full stack frame */
22031+int_very_careful:
22032+ TRACE_IRQS_ON
22033+/* sti */
22034+ XEN_UNBLOCK_EVENTS(%rsi)
22035+ SAVE_REST
22036+ /* Check for syscall exit trace */
22037+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
22038+ jz int_signal
22039+ pushq %rdi
22040+ CFI_ADJUST_CFA_OFFSET 8
22041+ leaq 8(%rsp),%rdi # &ptregs -> arg1
22042+ call syscall_trace_leave
22043+ popq %rdi
22044+ CFI_ADJUST_CFA_OFFSET -8
22045+ andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
22046+ XEN_BLOCK_EVENTS(%rsi)
22047+ TRACE_IRQS_OFF
22048+ jmp int_restore_rest
22049+
22050+int_signal:
22051+ testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
22052+ jz 1f
22053+ movq %rsp,%rdi # &ptregs -> arg1
22054+ xorl %esi,%esi # oldset -> arg2
22055+ call do_notify_resume
22056+1: movl $_TIF_NEED_RESCHED,%edi
22057+int_restore_rest:
22058+ RESTORE_REST
22059+ XEN_BLOCK_EVENTS(%rsi)
22060+ TRACE_IRQS_OFF
22061+ jmp int_with_check
22062+ CFI_ENDPROC
22063+END(int_ret_from_sys_call)
22064+
22065+/*
22066+ * Certain special system calls that need to save a full stack frame.
22067+ */
22068+
22069+ .macro PTREGSCALL label,func,arg
22070+ .globl \label
22071+\label:
22072+ leaq \func(%rip),%rax
22073+ leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
22074+ jmp ptregscall_common
22075+END(\label)
22076+ .endm
22077+
22078+ CFI_STARTPROC
22079+
22080+ PTREGSCALL stub_clone, sys_clone, %r8
22081+ PTREGSCALL stub_fork, sys_fork, %rdi
22082+ PTREGSCALL stub_vfork, sys_vfork, %rdi
22083+ PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
22084+ PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
22085+ PTREGSCALL stub_iopl, sys_iopl, %rsi
22086+
22087+ENTRY(ptregscall_common)
22088+ popq %r11
22089+ CFI_ADJUST_CFA_OFFSET -8
22090+ CFI_REGISTER rip, r11
22091+ SAVE_REST
22092+ movq %r11, %r15
22093+ CFI_REGISTER rip, r15
22094+ FIXUP_TOP_OF_STACK %r11
22095+ call *%rax
22096+ RESTORE_TOP_OF_STACK %r11
22097+ movq %r15, %r11
22098+ CFI_REGISTER rip, r11
22099+ RESTORE_REST
22100+ pushq %r11
22101+ CFI_ADJUST_CFA_OFFSET 8
22102+ CFI_REL_OFFSET rip, 0
22103+ ret
22104+ CFI_ENDPROC
22105+END(ptregscall_common)
22106+
22107+ENTRY(stub_execve)
22108+ CFI_STARTPROC
22109+ popq %r11
22110+ CFI_ADJUST_CFA_OFFSET -8
22111+ CFI_REGISTER rip, r11
22112+ SAVE_REST
22113+ FIXUP_TOP_OF_STACK %r11
22114+ call sys_execve
22115+ RESTORE_TOP_OF_STACK %r11
22116+ movq %rax,RAX(%rsp)
22117+ RESTORE_REST
22118+ jmp int_ret_from_sys_call
22119+ CFI_ENDPROC
22120+END(stub_execve)
22121+
22122+/*
22123+ * sigreturn is special because it needs to restore all registers on return.
22124+ * This cannot be done with SYSRET, so use the IRET return path instead.
22125+ */
22126+ENTRY(stub_rt_sigreturn)
22127+ CFI_STARTPROC
22128+ addq $8, %rsp
22129+ CFI_ADJUST_CFA_OFFSET -8
22130+ SAVE_REST
22131+ movq %rsp,%rdi
22132+ FIXUP_TOP_OF_STACK %r11
22133+ call sys_rt_sigreturn
22134+ movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
22135+ RESTORE_REST
22136+ jmp int_ret_from_sys_call
22137+ CFI_ENDPROC
22138+END(stub_rt_sigreturn)
22139+
22140+/* initial frame state for interrupts (and exceptions without error code) */
22141+#define INTR_FRAME _frame (RIP-0x10); \
22142+ CFI_REL_OFFSET rcx,0; \
22143+ CFI_REL_OFFSET r11,8
22144+
22145+/* initial frame state for exceptions with error code (and interrupts with
22146+ vector already pushed) */
22147+#define XCPT_FRAME _frame (RIP-0x18); \
22148+ CFI_REL_OFFSET rcx,0; \
22149+ CFI_REL_OFFSET r11,8
22150+
22151+/*
22152+ * Interrupt exit.
22153+ *
22154+ */
22155+
22156+retint_check:
22157+ CFI_DEFAULT_STACK adj=1
22158+ movl threadinfo_flags(%rcx),%edx
22159+ andl %edi,%edx
22160+ CFI_REMEMBER_STATE
22161+ jnz retint_careful
22162+retint_restore_args:
22163+ movl EFLAGS-REST_SKIP(%rsp), %eax
22164+ shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
22165+ XEN_GET_VCPU_INFO(%rsi)
22166+ andb evtchn_upcall_mask(%rsi),%al
22167+ andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
22168+ jnz restore_all_enable_events # != 0 => enable event delivery
22169+ XEN_PUT_VCPU_INFO(%rsi)
22170+
22171+ RESTORE_ARGS 0,8,0
22172+ HYPERVISOR_IRET 0
22173+
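The three instructions above fold the whole decision into %al: bit 9 of the saved EFLAGS (the IF flag of the frame being returned to) is ANDed with the vCPU's event mask, and a non-zero result takes the restore_all_enable_events path. As an illustrative C sketch only (not part of the patch), assuming the vcpu_info fields the XEN_* macros operate on:

	static inline int need_enable_events(unsigned long saved_eflags,
					     const vcpu_info_t *vcpu)
	{
		int iret_if = (saved_eflags >> 9) & 1;	/* EFLAGS.IF */

		/* Take the slow path only if the interrupted context had
		 * interrupts enabled while event delivery is still blocked. */
		return iret_if & vcpu->evtchn_upcall_mask;
	}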
22174+ /* edi: workmask, edx: work */
22175+retint_careful:
22176+ CFI_RESTORE_STATE
22177+ bt $TIF_NEED_RESCHED,%edx
22178+ jnc retint_signal
22179+ TRACE_IRQS_ON
22180+ XEN_UNBLOCK_EVENTS(%rsi)
22181+/* sti */
22182+ pushq %rdi
22183+ CFI_ADJUST_CFA_OFFSET 8
22184+ call schedule
22185+ popq %rdi
22186+ CFI_ADJUST_CFA_OFFSET -8
22187+ GET_THREAD_INFO(%rcx)
22188+ XEN_BLOCK_EVENTS(%rsi)
22189+/* cli */
22190+ TRACE_IRQS_OFF
22191+ jmp retint_check
22192+
22193+retint_signal:
22194+ testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
22195+ jz retint_restore_args
22196+ TRACE_IRQS_ON
22197+ XEN_UNBLOCK_EVENTS(%rsi)
22198+ SAVE_REST
22199+ movq $-1,ORIG_RAX(%rsp)
22200+ xorl %esi,%esi # oldset
22201+ movq %rsp,%rdi # &pt_regs
22202+ call do_notify_resume
22203+ RESTORE_REST
22204+ XEN_BLOCK_EVENTS(%rsi)
22205+ TRACE_IRQS_OFF
22206+ movl $_TIF_NEED_RESCHED,%edi
22207+ GET_THREAD_INFO(%rcx)
22208+ jmp retint_check
22209+
22210+#ifdef CONFIG_PREEMPT
22211+ /* Returning to kernel space. Check if we need preemption */
22212+ /* rcx: threadinfo. interrupts off. */
22213+ .p2align
22214+retint_kernel:
22215+ cmpl $0,threadinfo_preempt_count(%rcx)
22216+ jnz retint_restore_args
22217+ bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
22218+ jnc retint_restore_args
22219+ bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
22220+ jnc retint_restore_args
22221+ call preempt_schedule_irq
22222+ jmp retint_kernel /* check again */
22223+#endif
22224+
22225+ CFI_ENDPROC
22226+END(retint_check)
22227+
22228+#ifndef CONFIG_XEN
22229+/*
22230+ * APIC interrupts.
22231+ */
22232+ .macro apicinterrupt num,func
22233+ INTR_FRAME
22234+ pushq $~(\num)
22235+ CFI_ADJUST_CFA_OFFSET 8
22236+ interrupt \func
22237+ jmp error_entry
22238+ CFI_ENDPROC
22239+ .endm
22240+
22241+ENTRY(thermal_interrupt)
22242+ apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
22243+END(thermal_interrupt)
22244+
22245+ENTRY(threshold_interrupt)
22246+ apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
22247+END(threshold_interrupt)
22248+
22249+#ifdef CONFIG_SMP
22250+ENTRY(reschedule_interrupt)
22251+ apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
22252+END(reschedule_interrupt)
22253+
22254+ .macro INVALIDATE_ENTRY num
22255+ENTRY(invalidate_interrupt\num)
22256+ apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
22257+END(invalidate_interrupt\num)
22258+ .endm
22259+
22260+ INVALIDATE_ENTRY 0
22261+ INVALIDATE_ENTRY 1
22262+ INVALIDATE_ENTRY 2
22263+ INVALIDATE_ENTRY 3
22264+ INVALIDATE_ENTRY 4
22265+ INVALIDATE_ENTRY 5
22266+ INVALIDATE_ENTRY 6
22267+ INVALIDATE_ENTRY 7
22268+
22269+ENTRY(call_function_interrupt)
22270+ apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
22271+END(call_function_interrupt)
22272+#endif
22273+
22274+#ifdef CONFIG_X86_LOCAL_APIC
22275+ENTRY(apic_timer_interrupt)
22276+ apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
22277+END(apic_timer_interrupt)
22278+
22279+ENTRY(error_interrupt)
22280+ apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
22281+END(error_interrupt)
22282+
22283+ENTRY(spurious_interrupt)
22284+ apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
22285+END(spurious_interrupt)
22286+#endif
22287+#endif /* !CONFIG_XEN */
22288+
22289+/*
22290+ * Exception entry points.
22291+ */
22292+ .macro zeroentry sym
22293+ INTR_FRAME
22294+ movq (%rsp),%rcx
22295+ CFI_RESTORE rcx
22296+ movq 8(%rsp),%r11
22297+ CFI_RESTORE r11
22298+ addq $0x10,%rsp /* skip rcx and r11 */
22299+ CFI_ADJUST_CFA_OFFSET -0x10
22300+ pushq $0 /* push error code/oldrax */
22301+ CFI_ADJUST_CFA_OFFSET 8
22302+ pushq %rax /* push real oldrax to the rdi slot */
22303+ CFI_ADJUST_CFA_OFFSET 8
22304+ CFI_REL_OFFSET rax,0
22305+ leaq \sym(%rip),%rax
22306+ jmp error_entry
22307+ CFI_ENDPROC
22308+ .endm
22309+
22310+ .macro errorentry sym
22311+ XCPT_FRAME
22312+ movq (%rsp),%rcx
22313+ CFI_RESTORE rcx
22314+ movq 8(%rsp),%r11
22315+ CFI_RESTORE r11
22316+ addq $0x10,%rsp /* rsp points to the error code */
22317+ CFI_ADJUST_CFA_OFFSET -0x10
22318+ pushq %rax
22319+ CFI_ADJUST_CFA_OFFSET 8
22320+ CFI_REL_OFFSET rax,0
22321+ leaq \sym(%rip),%rax
22322+ jmp error_entry
22323+ CFI_ENDPROC
22324+ .endm
22325+
22326+#if 0 /* not XEN */
22327+ /* error code is on the stack already */
22328+ /* handle NMI like exceptions that can happen everywhere */
22329+ .macro paranoidentry sym, ist=0, irqtrace=1
22330+ movq (%rsp),%rcx
22331+ movq 8(%rsp),%r11
22332+ addq $0x10,%rsp /* skip rcx and r11 */
22333+ SAVE_ALL
22334+ cld
22335+#if 0 /* not XEN */
22336+ movl $1,%ebx
22337+ movl $MSR_GS_BASE,%ecx
22338+ rdmsr
22339+ testl %edx,%edx
22340+ js 1f
22341+ swapgs
22342+ xorl %ebx,%ebx
22343+1:
22344+#endif
22345+ .if \ist
22346+ movq %gs:pda_data_offset, %rbp
22347+ .endif
22348+ movq %rsp,%rdi
22349+ movq ORIG_RAX(%rsp),%rsi
22350+ movq $-1,ORIG_RAX(%rsp)
22351+ .if \ist
22352+ subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
22353+ .endif
22354+ call \sym
22355+ .if \ist
22356+ addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
22357+ .endif
22358+/* cli */
22359+ XEN_BLOCK_EVENTS(%rsi)
22360+ .if \irqtrace
22361+ TRACE_IRQS_OFF
22362+ .endif
22363+ .endm
22364+
22365+ /*
22366+ * "Paranoid" exit path from exception stack.
22367+ * Paranoid because this is used by NMIs and cannot take
22368+ * any kernel state for granted.
22369+ * We don't do kernel preemption checks here, because only
22370+ * NMI should be common and it does not enable IRQs and
22371+ * cannot get reschedule ticks.
22372+ *
22373+ * "trace" is 0 for the NMI handler only, because irq-tracing
22374+ * is fundamentally NMI-unsafe. (we cannot change the soft and
22375+ * hard flags at once, atomically)
22376+ */
22377+ .macro paranoidexit trace=1
22378+ /* ebx: no swapgs flag */
22379+paranoid_exit\trace:
22380+ testl %ebx,%ebx /* swapgs needed? */
22381+ jnz paranoid_restore\trace
22382+ testl $3,CS(%rsp)
22383+ jnz paranoid_userspace\trace
22384+paranoid_swapgs\trace:
22385+ TRACE_IRQS_IRETQ 0
22386+ swapgs
22387+paranoid_restore\trace:
22388+ RESTORE_ALL 8
22389+ iretq
22390+paranoid_userspace\trace:
22391+ GET_THREAD_INFO(%rcx)
22392+ movl threadinfo_flags(%rcx),%ebx
22393+ andl $_TIF_WORK_MASK,%ebx
22394+ jz paranoid_swapgs\trace
22395+ movq %rsp,%rdi /* &pt_regs */
22396+ call sync_regs
22397+ movq %rax,%rsp /* switch stack for scheduling */
22398+ testl $_TIF_NEED_RESCHED,%ebx
22399+ jnz paranoid_schedule\trace
22400+ movl %ebx,%edx /* arg3: thread flags */
22401+ .if \trace
22402+ TRACE_IRQS_ON
22403+ .endif
22404+ sti
22405+ xorl %esi,%esi /* arg2: oldset */
22406+ movq %rsp,%rdi /* arg1: &pt_regs */
22407+ call do_notify_resume
22408+ cli
22409+ .if \trace
22410+ TRACE_IRQS_OFF
22411+ .endif
22412+ jmp paranoid_userspace\trace
22413+paranoid_schedule\trace:
22414+ .if \trace
22415+ TRACE_IRQS_ON
22416+ .endif
22417+ sti
22418+ call schedule
22419+ cli
22420+ .if \trace
22421+ TRACE_IRQS_OFF
22422+ .endif
22423+ jmp paranoid_userspace\trace
22424+ CFI_ENDPROC
22425+ .endm
22426+#endif
22427+
22428+/*
22429+ * Exception entry point. This expects an error code/orig_rax on the stack
22430+ * and the exception handler in %rax.
22431+ */
22432+ENTRY(error_entry)
22433+ _frame RDI
22434+ CFI_REL_OFFSET rax,0
22435+ /* rdi slot contains rax, oldrax contains error code */
22436+ cld
22437+ subq $14*8,%rsp
22438+ CFI_ADJUST_CFA_OFFSET (14*8)
22439+ movq %rsi,13*8(%rsp)
22440+ CFI_REL_OFFSET rsi,RSI
22441+ movq 14*8(%rsp),%rsi /* load rax from rdi slot */
22442+ CFI_REGISTER rax,rsi
22443+ movq %rdx,12*8(%rsp)
22444+ CFI_REL_OFFSET rdx,RDX
22445+ movq %rcx,11*8(%rsp)
22446+ CFI_REL_OFFSET rcx,RCX
22447+ movq %rsi,10*8(%rsp) /* store rax */
22448+ CFI_REL_OFFSET rax,RAX
22449+ movq %r8, 9*8(%rsp)
22450+ CFI_REL_OFFSET r8,R8
22451+ movq %r9, 8*8(%rsp)
22452+ CFI_REL_OFFSET r9,R9
22453+ movq %r10,7*8(%rsp)
22454+ CFI_REL_OFFSET r10,R10
22455+ movq %r11,6*8(%rsp)
22456+ CFI_REL_OFFSET r11,R11
22457+ movq %rbx,5*8(%rsp)
22458+ CFI_REL_OFFSET rbx,RBX
22459+ movq %rbp,4*8(%rsp)
22460+ CFI_REL_OFFSET rbp,RBP
22461+ movq %r12,3*8(%rsp)
22462+ CFI_REL_OFFSET r12,R12
22463+ movq %r13,2*8(%rsp)
22464+ CFI_REL_OFFSET r13,R13
22465+ movq %r14,1*8(%rsp)
22466+ CFI_REL_OFFSET r14,R14
22467+ movq %r15,(%rsp)
22468+ CFI_REL_OFFSET r15,R15
22469+#if 0
22470+ cmpl $__KERNEL_CS,CS(%rsp)
22471+ CFI_REMEMBER_STATE
22472+ je error_kernelspace
22473+#endif
22474+error_call_handler:
22475+ movq %rdi, RDI(%rsp)
22476+ CFI_REL_OFFSET rdi,RDI
22477+ movq %rsp,%rdi
22478+ movq ORIG_RAX(%rsp),%rsi # get error code
22479+ movq $-1,ORIG_RAX(%rsp)
22480+ call *%rax
22481+error_exit:
22482+ RESTORE_REST
22483+/* cli */
22484+ XEN_BLOCK_EVENTS(%rsi)
22485+ TRACE_IRQS_OFF
22486+ GET_THREAD_INFO(%rcx)
22487+ testb $3,CS-ARGOFFSET(%rsp)
22488+ jz retint_kernel
22489+ movl threadinfo_flags(%rcx),%edx
22490+ movl $_TIF_WORK_MASK,%edi
22491+ andl %edi,%edx
22492+ jnz retint_careful
22493+ /*
22494+ * The iret might restore flags:
22495+ */
22496+ TRACE_IRQS_IRETQ
22497+ jmp retint_restore_args
22498+
22499+#if 0
22500+ /*
22501+ * We need to re-write the logic here because we don't do iretq
22502+ * to return to user mode. It's still possible that we get trap/fault
22503+ * in the kernel (when accessing buffers pointed to by system calls,
22504+ * for example).
22505+ *
22506+ */
22507+ CFI_RESTORE_STATE
22508+error_kernelspace:
22509+ incl %ebx
22510+ /* There are two places in the kernel that can potentially fault with
22511+ usergs. Handle them here. The exception handlers after
22512+ iret run with kernel gs again, so don't set the user space flag.
22513+	   B stepping K8s sometimes report a truncated RIP for IRET
22514+ exceptions returning to compat mode. Check for these here too. */
22515+ leaq iret_label(%rip),%rbp
22516+ cmpq %rbp,RIP(%rsp)
22517+ je error_swapgs
22518+ movl %ebp,%ebp /* zero extend */
22519+ cmpq %rbp,RIP(%rsp)
22520+ je error_swapgs
22521+ cmpq $gs_change,RIP(%rsp)
22522+ je error_swapgs
22523+ jmp error_sti
22524+#endif
22525+ CFI_ENDPROC
22526+END(error_entry)
22527+
22528+ENTRY(hypervisor_callback)
22529+ zeroentry do_hypervisor_callback
22530+END(hypervisor_callback)
22531+
22532+/*
22533+ * Copied from arch/xen/i386/kernel/entry.S
22534+ */
22535+# A note on the "critical region" in our callback handler.
22536+# We want to avoid stacking callback handlers due to events occurring
22537+# during handling of the last event. To do this, we keep events disabled
22538+# until we've done all processing. HOWEVER, we must enable events before
22539+# popping the stack frame (can't be done atomically) and so it would still
22540+# be possible to get enough handler activations to overflow the stack.
22541+# Although unlikely, bugs of that kind are hard to track down, so we'd
22542+# like to avoid the possibility.
22543+# So, on entry to the handler we detect whether we interrupted an
22544+# existing activation in its critical region -- if so, we pop the current
22545+# activation and restart the handler using the previous one.
22546+ENTRY(do_hypervisor_callback)   # do_hypervisor_callback(struct pt_regs *)
22547+ CFI_STARTPROC
22548+# Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
22549+# see the correct pointer to the pt_regs
22550+ movq %rdi, %rsp # we don't return, adjust the stack frame
22551+ CFI_ENDPROC
22552+ CFI_DEFAULT_STACK
22553+11: incl %gs:pda_irqcount
22554+ movq %rsp,%rbp
22555+ CFI_DEF_CFA_REGISTER rbp
22556+ cmovzq %gs:pda_irqstackptr,%rsp
22557+ pushq %rbp # backlink for old unwinder
22558+ call evtchn_do_upcall
22559+ popq %rsp
22560+ CFI_DEF_CFA_REGISTER rsp
22561+ decl %gs:pda_irqcount
22562+ jmp error_exit
22563+ CFI_ENDPROC
22564+END(do_hypervisor_callback)
22565+
22566+#ifdef CONFIG_X86_LOCAL_APIC
22567+KPROBE_ENTRY(nmi)
22568+ zeroentry do_nmi_callback
22569+ENTRY(do_nmi_callback)
22570+ CFI_STARTPROC
22571+ addq $8, %rsp
22572+ CFI_ENDPROC
22573+ CFI_DEFAULT_STACK
22574+ call do_nmi
22575+ orl $NMI_MASK,EFLAGS(%rsp)
22576+ RESTORE_REST
22577+ XEN_BLOCK_EVENTS(%rsi)
22578+ TRACE_IRQS_OFF
22579+ GET_THREAD_INFO(%rcx)
22580+ jmp retint_restore_args
22581+ CFI_ENDPROC
22582+ .previous .text
22583+END(nmi)
22584+#endif
22585+
22586+ ALIGN
22587+restore_all_enable_events:
22588+ CFI_DEFAULT_STACK adj=1
22589+ TRACE_IRQS_ON
22590+ XEN_UNBLOCK_EVENTS(%rsi) # %rsi is already set up...
22591+
22592+scrit: /**** START OF CRITICAL REGION ****/
22593+ XEN_TEST_PENDING(%rsi)
22594+ CFI_REMEMBER_STATE
22595+ jnz 14f # process more events if necessary...
22596+ XEN_PUT_VCPU_INFO(%rsi)
22597+ RESTORE_ARGS 0,8,0
22598+ HYPERVISOR_IRET 0
22599+
22600+ CFI_RESTORE_STATE
22601+14: XEN_LOCKED_BLOCK_EVENTS(%rsi)
22602+ XEN_PUT_VCPU_INFO(%rsi)
22603+ SAVE_REST
22604+ movq %rsp,%rdi # set the argument again
22605+ jmp 11b
22606+ CFI_ENDPROC
22607+ecrit: /**** END OF CRITICAL REGION ****/
22608+# At this point, unlike on x86-32, we don't do the fixup; it would only
22609+# complicate the code, and the stack frame is more complex on x86-64.
22610+# When the kernel is interrupted inside the critical section it simply
22611+# does an IRET, and everything is restored at that point, i.e. execution
22612+# resumes at the interrupted instruction with the same context.
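For reference, the unblock-then-recheck sequence around the scrit/ecrit markers can be read as the following C sketch (illustrative only, not part of the patch; it assumes the evtchn_upcall_mask/evtchn_upcall_pending fields of vcpu_info that the XEN_* macros manipulate):

	static inline int unblock_events_and_recheck(vcpu_info_t *vcpu)
	{
		vcpu->evtchn_upcall_mask = 0;		/* XEN_UNBLOCK_EVENTS */
		barrier();
		if (vcpu->evtchn_upcall_pending) {	/* XEN_TEST_PENDING */
			vcpu->evtchn_upcall_mask = 1;	/* XEN_LOCKED_BLOCK_EVENTS */
			return 1;	/* jump back to 11: and process more events */
		}
		return 0;		/* safe to RESTORE_ARGS / HYPERVISOR_IRET */
	}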
22613+
22614+# Hypervisor uses this for application faults while it executes.
22615+# We get here for two reasons:
22616+# 1. Fault while reloading DS, ES, FS or GS
22617+# 2. Fault while executing IRET
22618+# Category 1 we do not need to fix up as Xen has already reloaded all segment
22619+# registers that could be reloaded and zeroed the others.
22620+# Category 2 we fix up by killing the current process. We cannot use the
22621+# normal Linux return path in this case because if we use the IRET hypercall
22622+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
22623+# We distinguish between categories by comparing each saved segment register
22624+# with its current contents: any discrepancy means we are in category 1.
22625+ENTRY(failsafe_callback)
22626+ _frame (RIP-0x30)
22627+ CFI_REL_OFFSET rcx, 0
22628+ CFI_REL_OFFSET r11, 8
22629+ movw %ds,%cx
22630+ cmpw %cx,0x10(%rsp)
22631+ CFI_REMEMBER_STATE
22632+ jne 1f
22633+ movw %es,%cx
22634+ cmpw %cx,0x18(%rsp)
22635+ jne 1f
22636+ movw %fs,%cx
22637+ cmpw %cx,0x20(%rsp)
22638+ jne 1f
22639+ movw %gs,%cx
22640+ cmpw %cx,0x28(%rsp)
22641+ jne 1f
22642+ /* All segments match their saved values => Category 2 (Bad IRET). */
22643+ movq (%rsp),%rcx
22644+ CFI_RESTORE rcx
22645+ movq 8(%rsp),%r11
22646+ CFI_RESTORE r11
22647+ addq $0x30,%rsp
22648+ CFI_ADJUST_CFA_OFFSET -0x30
22649+ movq $11,%rdi /* SIGSEGV */
22650+ jmp do_exit
22651+ CFI_RESTORE_STATE
22652+1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
22653+ movq (%rsp),%rcx
22654+ CFI_RESTORE rcx
22655+ movq 8(%rsp),%r11
22656+ CFI_RESTORE r11
22657+ addq $0x30,%rsp
22658+ CFI_ADJUST_CFA_OFFSET -0x30
22659+ pushq $0
22660+ CFI_ADJUST_CFA_OFFSET 8
22661+ SAVE_ALL
22662+ jmp error_exit
22663+ CFI_ENDPROC
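The selector comparison above is the whole category test; as an illustrative C sketch (not part of the patch; the struct naming the four selectors Xen pushed at 0x10..0x28(%rsp) is hypothetical):

	struct failsafe_segs { unsigned short ds, es, fs, gs; };

	static int is_failed_iret(const struct failsafe_segs *saved)
	{
		unsigned short ds, es, fs, gs;

		asm volatile("movw %%ds,%0" : "=r" (ds));
		asm volatile("movw %%es,%0" : "=r" (es));
		asm volatile("movw %%fs,%0" : "=r" (fs));
		asm volatile("movw %%gs,%0" : "=r" (gs));
		/* Category 2 (failed IRET): every saved selector still matches
		 * the live one.  Category 1 (bad segment): at least one differs,
		 * because Xen already reloaded or zeroed it. */
		return saved->ds == ds && saved->es == es &&
		       saved->fs == fs && saved->gs == gs;
	}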
22664+#if 0
22665+ .section __ex_table,"a"
22666+ .align 8
22667+ .quad gs_change,bad_gs
22668+ .previous
22669+ .section .fixup,"ax"
22670+ /* running with kernelgs */
22671+bad_gs:
22672+/* swapgs */ /* switch back to user gs */
22673+ xorl %eax,%eax
22674+ movl %eax,%gs
22675+ jmp 2b
22676+ .previous
22677+#endif
22678+
22679+/*
22680+ * Create a kernel thread.
22681+ *
22682+ * C extern interface:
22683+ * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
22684+ *
22685+ * asm input arguments:
22686+ * rdi: fn, rsi: arg, rdx: flags
22687+ */
22688+ENTRY(kernel_thread)
22689+ CFI_STARTPROC
22690+ FAKE_STACK_FRAME $child_rip
22691+ SAVE_ALL
22692+
22693+ # rdi: flags, rsi: usp, rdx: will be &pt_regs
22694+ movq %rdx,%rdi
22695+ orq kernel_thread_flags(%rip),%rdi
22696+ movq $-1, %rsi
22697+ movq %rsp, %rdx
22698+
22699+ xorl %r8d,%r8d
22700+ xorl %r9d,%r9d
22701+
22702+ # clone now
22703+ call do_fork
22704+ movq %rax,RAX(%rsp)
22705+ xorl %edi,%edi
22706+
22707+ /*
22708+ * It isn't worth checking for a reschedule here, so within the
22709+ * x86_64 port you can rely on kernel_thread() not rescheduling the
22710+ * child before returning; this avoids the need for hacks such as
22711+ * forking off the per-CPU idle tasks.
22712+ * [Hopefully no generic code relies on the reschedule -AK]
22713+ */
22714+ RESTORE_ALL
22715+ UNFAKE_STACK_FRAME
22716+ ret
22717+ CFI_ENDPROC
22718+ENDPROC(kernel_thread)
22719+
22720+child_rip:
22721+ pushq $0 # fake return address
22722+ CFI_STARTPROC
22723+ /*
22724+ * Here we are in the child and the registers are set as they were
22725+ * at kernel_thread() invocation in the parent.
22726+ */
22727+ movq %rdi, %rax
22728+ movq %rsi, %rdi
22729+ call *%rax
22730+ # exit
22731+ xorl %edi, %edi
22732+ call do_exit
22733+ CFI_ENDPROC
22734+ENDPROC(child_rip)
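A hedged usage example for the C interface documented above; the worker function and clone flags are hypothetical, only kernel_thread() itself comes from this file:

	static int worker_fn(void *arg)
	{
		/* runs in the child set up via child_rip */
		return 0;
	}

	static void spawn_worker(void)
	{
		long pid = kernel_thread(worker_fn, NULL, CLONE_FS | CLONE_FILES);

		if (pid < 0)
			printk(KERN_ERR "kernel_thread failed: %ld\n", pid);
	}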
22735+
22736+/*
22737+ * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
22738+ *
22739+ * C extern interface:
22740+ * extern long execve(char *name, char **argv, char **envp)
22741+ *
22742+ * asm input arguments:
22743+ * rdi: name, rsi: argv, rdx: envp
22744+ *
22745+ * We want to fall back into:
22746+ * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
22747+ *
22748+ * do_sys_execve asm fallback arguments:
22749+ * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
22750+ */
22751+ENTRY(execve)
22752+ CFI_STARTPROC
22753+ FAKE_STACK_FRAME $0
22754+ SAVE_ALL
22755+ call sys_execve
22756+ movq %rax, RAX(%rsp)
22757+ RESTORE_REST
22758+ testq %rax,%rax
22759+ jne 1f
22760+ jmp int_ret_from_sys_call
22761+1: RESTORE_ARGS
22762+ UNFAKE_STACK_FRAME
22763+ ret
22764+ CFI_ENDPROC
22765+ENDPROC(execve)
22766+
22767+KPROBE_ENTRY(page_fault)
22768+ errorentry do_page_fault
22769+END(page_fault)
22770+ .previous .text
22771+
22772+ENTRY(coprocessor_error)
22773+ zeroentry do_coprocessor_error
22774+END(coprocessor_error)
22775+
22776+ENTRY(simd_coprocessor_error)
22777+ zeroentry do_simd_coprocessor_error
22778+END(simd_coprocessor_error)
22779+
22780+ENTRY(device_not_available)
22781+ zeroentry math_state_restore
22782+END(device_not_available)
22783+
22784+ /* runs on exception stack */
22785+KPROBE_ENTRY(debug)
22786+/* INTR_FRAME
22787+ pushq $0
22788+ CFI_ADJUST_CFA_OFFSET 8 */
22789+ zeroentry do_debug
22790+/* paranoidexit
22791+ CFI_ENDPROC */
22792+END(debug)
22793+ .previous .text
22794+
22795+#if 0
22796+ /* runs on exception stack */
22797+KPROBE_ENTRY(nmi)
22798+ INTR_FRAME
22799+ pushq $-1
22800+ CFI_ADJUST_CFA_OFFSET 8
22801+ paranoidentry do_nmi, 0, 0
22802+#ifdef CONFIG_TRACE_IRQFLAGS
22803+ paranoidexit 0
22804+#else
22805+ jmp paranoid_exit1
22806+ CFI_ENDPROC
22807+#endif
22808+END(nmi)
22809+ .previous .text
22810+#endif
22811+
22812+KPROBE_ENTRY(int3)
22813+/* INTR_FRAME
22814+ pushq $0
22815+ CFI_ADJUST_CFA_OFFSET 8 */
22816+ zeroentry do_int3
22817+/* jmp paranoid_exit1
22818+ CFI_ENDPROC */
22819+END(int3)
22820+ .previous .text
22821+
22822+ENTRY(overflow)
22823+ zeroentry do_overflow
22824+END(overflow)
22825+
22826+ENTRY(bounds)
22827+ zeroentry do_bounds
22828+END(bounds)
22829+
22830+ENTRY(invalid_op)
22831+ zeroentry do_invalid_op
22832+END(invalid_op)
22833+
22834+ENTRY(coprocessor_segment_overrun)
22835+ zeroentry do_coprocessor_segment_overrun
22836+END(coprocessor_segment_overrun)
22837+
22838+ENTRY(reserved)
22839+ zeroentry do_reserved
22840+END(reserved)
22841+
22842+#if 0
22843+ /* runs on exception stack */
22844+ENTRY(double_fault)
22845+ XCPT_FRAME
22846+ paranoidentry do_double_fault
22847+ jmp paranoid_exit1
22848+ CFI_ENDPROC
22849+END(double_fault)
22850+#endif
22851+
22852+ENTRY(invalid_TSS)
22853+ errorentry do_invalid_TSS
22854+END(invalid_TSS)
22855+
22856+ENTRY(segment_not_present)
22857+ errorentry do_segment_not_present
22858+END(segment_not_present)
22859+
22860+ /* runs on exception stack */
22861+ENTRY(stack_segment)
22862+/* XCPT_FRAME
22863+ paranoidentry do_stack_segment */
22864+ errorentry do_stack_segment
22865+/* jmp paranoid_exit1
22866+ CFI_ENDPROC */
22867+END(stack_segment)
22868+
22869+KPROBE_ENTRY(general_protection)
22870+ errorentry do_general_protection
22871+END(general_protection)
22872+ .previous .text
22873+
22874+ENTRY(alignment_check)
22875+ errorentry do_alignment_check
22876+END(alignment_check)
22877+
22878+ENTRY(divide_error)
22879+ zeroentry do_divide_error
22880+END(divide_error)
22881+
22882+ENTRY(spurious_interrupt_bug)
22883+ zeroentry do_spurious_interrupt_bug
22884+END(spurious_interrupt_bug)
22885+
22886+#ifdef CONFIG_X86_MCE
22887+ /* runs on exception stack */
22888+ENTRY(machine_check)
22889+ INTR_FRAME
22890+ pushq $0
22891+ CFI_ADJUST_CFA_OFFSET 8
22892+ paranoidentry do_machine_check
22893+ jmp paranoid_exit1
22894+ CFI_ENDPROC
22895+END(machine_check)
22896+#endif
22897+
22898+/* Call softirq on interrupt stack. Interrupts are off. */
22899+ENTRY(call_softirq)
22900+ CFI_STARTPROC
22901+ push %rbp
22902+ CFI_ADJUST_CFA_OFFSET 8
22903+ CFI_REL_OFFSET rbp,0
22904+ mov %rsp,%rbp
22905+ CFI_DEF_CFA_REGISTER rbp
22906+ incl %gs:pda_irqcount
22907+ cmove %gs:pda_irqstackptr,%rsp
22908+ push %rbp # backlink for old unwinder
22909+ call __do_softirq
22910+ leaveq
22911+ CFI_DEF_CFA_REGISTER rsp
22912+ CFI_ADJUST_CFA_OFFSET -8
22913+ decl %gs:pda_irqcount
22914+ ret
22915+ CFI_ENDPROC
22916+ENDPROC(call_softirq)
22917+
22918+#ifdef CONFIG_STACK_UNWIND
22919+ENTRY(arch_unwind_init_running)
22920+ CFI_STARTPROC
22921+ movq %r15, R15(%rdi)
22922+ movq %r14, R14(%rdi)
22923+ xchgq %rsi, %rdx
22924+ movq %r13, R13(%rdi)
22925+ movq %r12, R12(%rdi)
22926+ xorl %eax, %eax
22927+ movq %rbp, RBP(%rdi)
22928+ movq %rbx, RBX(%rdi)
22929+ movq (%rsp), %rcx
22930+ movq %rax, R11(%rdi)
22931+ movq %rax, R10(%rdi)
22932+ movq %rax, R9(%rdi)
22933+ movq %rax, R8(%rdi)
22934+ movq %rax, RAX(%rdi)
22935+ movq %rax, RCX(%rdi)
22936+ movq %rax, RDX(%rdi)
22937+ movq %rax, RSI(%rdi)
22938+ movq %rax, RDI(%rdi)
22939+ movq %rax, ORIG_RAX(%rdi)
22940+ movq %rcx, RIP(%rdi)
22941+ leaq 8(%rsp), %rcx
22942+ movq $__KERNEL_CS, CS(%rdi)
22943+ movq %rax, EFLAGS(%rdi)
22944+ movq %rcx, RSP(%rdi)
22945+ movq $__KERNEL_DS, SS(%rdi)
22946+ jmpq *%rdx
22947+ CFI_ENDPROC
22948+ENDPROC(arch_unwind_init_running)
22949+#endif
22950Index: head-2008-11-25/arch/x86/kernel/genapic_64-xen.c
22951===================================================================
22952--- /dev/null 1970-01-01 00:00:00.000000000 +0000
22953+++ head-2008-11-25/arch/x86/kernel/genapic_64-xen.c 2007-06-12 13:13:01.000000000 +0200
22954@@ -0,0 +1,143 @@
22955+/*
22956+ * Copyright 2004 James Cleverdon, IBM.
22957+ * Subject to the GNU Public License, v.2
22958+ *
22959+ * Generic APIC sub-arch probe layer.
22960+ *
22961+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
22962+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
22963+ * James Cleverdon.
22964+ */
22965+#include <linux/threads.h>
22966+#include <linux/cpumask.h>
22967+#include <linux/string.h>
22968+#include <linux/kernel.h>
22969+#include <linux/ctype.h>
22970+#include <linux/init.h>
22971+#include <linux/module.h>
22972+
22973+#include <asm/smp.h>
22974+#include <asm/ipi.h>
22975+
22976+#if defined(CONFIG_ACPI)
22977+#include <acpi/acpi_bus.h>
22978+#endif
22979+
22980+/* which logical CPU number maps to which CPU (physical APIC ID) */
22981+u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
22982+EXPORT_SYMBOL(x86_cpu_to_apicid);
22983+u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
22984+
22985+extern struct genapic apic_cluster;
22986+extern struct genapic apic_flat;
22987+extern struct genapic apic_physflat;
22988+
22989+#ifndef CONFIG_XEN
22990+struct genapic *genapic = &apic_flat;
22991+#else
22992+extern struct genapic apic_xen;
22993+struct genapic *genapic = &apic_xen;
22994+#endif
22995+
22996+
22997+/*
22998+ * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
22999+ */
23000+void __init clustered_apic_check(void)
23001+{
23002+#ifndef CONFIG_XEN
23003+ long i;
23004+ u8 clusters, max_cluster;
23005+ u8 id;
23006+ u8 cluster_cnt[NUM_APIC_CLUSTERS];
23007+ int max_apic = 0;
23008+
23009+#if defined(CONFIG_ACPI)
23010+ /*
23011+ * Some x86_64 machines use physical APIC mode regardless of how many
23012+ * procs/clusters are present (x86_64 ES7000 is an example).
23013+ */
23014+ if (acpi_fadt.revision > FADT2_REVISION_ID)
23015+ if (acpi_fadt.force_apic_physical_destination_mode) {
23016+ genapic = &apic_cluster;
23017+ goto print;
23018+ }
23019+#endif
23020+
23021+ memset(cluster_cnt, 0, sizeof(cluster_cnt));
23022+ for (i = 0; i < NR_CPUS; i++) {
23023+ id = bios_cpu_apicid[i];
23024+ if (id == BAD_APICID)
23025+ continue;
23026+ if (id > max_apic)
23027+ max_apic = id;
23028+ cluster_cnt[APIC_CLUSTERID(id)]++;
23029+ }
23030+
23031+ /* Don't use clustered mode on AMD platforms. */
23032+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
23033+ genapic = &apic_physflat;
23034+#ifndef CONFIG_HOTPLUG_CPU
23035+ /* In the CPU hotplug case we cannot use broadcast mode
23036+ because that opens a race when a CPU is removed.
23037+ Stay at physflat mode in this case.
23038+ It is bad to do this unconditionally though. Once
23039+ we have ACPI platform support for CPU hotplug
23040+	   we should detect hotplug capability from ACPI tables and
23041+ only do this when really needed. -AK */
23042+ if (max_apic <= 8)
23043+ genapic = &apic_flat;
23044+#endif
23045+ goto print;
23046+ }
23047+
23048+ clusters = 0;
23049+ max_cluster = 0;
23050+
23051+ for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
23052+ if (cluster_cnt[i] > 0) {
23053+ ++clusters;
23054+ if (cluster_cnt[i] > max_cluster)
23055+ max_cluster = cluster_cnt[i];
23056+ }
23057+ }
23058+
23059+ /*
23060+ * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
23061+ * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
23062+ * else physical mode.
23063+ * (We don't use lowest priority delivery + HW APIC IRQ steering, so
23064+ * can ignore the clustered logical case and go straight to physical.)
23065+ */
23066+ if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
23067+#ifdef CONFIG_HOTPLUG_CPU
23068+ /* Don't use APIC shortcuts in CPU hotplug to avoid races */
23069+ genapic = &apic_physflat;
23070+#else
23071+ genapic = &apic_flat;
23072+#endif
23073+ } else
23074+ genapic = &apic_cluster;
23075+
23076+print:
23077+#else
23078+ /* hardcode to xen apic functions */
23079+ genapic = &apic_xen;
23080+#endif
23081+ printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
23082+}
23083+
23084+/* Same for both flat and clustered. */
23085+
23086+#ifdef CONFIG_XEN
23087+extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
23088+#endif
23089+
23090+void send_IPI_self(int vector)
23091+{
23092+#ifndef CONFIG_XEN
23093+ __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
23094+#else
23095+ xen_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
23096+#endif
23097+}
23098Index: head-2008-11-25/arch/x86/kernel/genapic_xen_64.c
23099===================================================================
23100--- /dev/null 1970-01-01 00:00:00.000000000 +0000
23101+++ head-2008-11-25/arch/x86/kernel/genapic_xen_64.c 2007-06-12 13:13:01.000000000 +0200
23102@@ -0,0 +1,161 @@
23103+/*
23104+ * Copyright 2004 James Cleverdon, IBM.
23105+ * Subject to the GNU Public License, v.2
23106+ *
23107+ * Xen APIC subarch code. Maximum 8 CPUs, logical delivery.
23108+ *
23109+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
23110+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
23111+ * James Cleverdon.
23112+ *
23113+ * Hacked to pieces for Xen by Chris Wright.
23114+ */
23115+#include <linux/threads.h>
23116+#include <linux/cpumask.h>
23117+#include <linux/string.h>
23118+#include <linux/kernel.h>
23119+#include <linux/ctype.h>
23120+#include <linux/init.h>
23121+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
23122+#include <asm/smp.h>
23123+#include <asm/ipi.h>
23124+#else
23125+#include <asm/apic.h>
23126+#include <asm/apicdef.h>
23127+#include <asm/genapic.h>
23128+#endif
23129+#include <xen/evtchn.h>
23130+
23131+DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
23132+
23133+static inline void __send_IPI_one(unsigned int cpu, int vector)
23134+{
23135+ int irq = per_cpu(ipi_to_irq, cpu)[vector];
23136+ BUG_ON(irq < 0);
23137+ notify_remote_via_irq(irq);
23138+}
23139+
23140+void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
23141+{
23142+ int cpu;
23143+
23144+ switch (shortcut) {
23145+ case APIC_DEST_SELF:
23146+ __send_IPI_one(smp_processor_id(), vector);
23147+ break;
23148+ case APIC_DEST_ALLBUT:
23149+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
23150+ if (cpu == smp_processor_id())
23151+ continue;
23152+ if (cpu_isset(cpu, cpu_online_map)) {
23153+ __send_IPI_one(cpu, vector);
23154+ }
23155+ }
23156+ break;
23157+ case APIC_DEST_ALLINC:
23158+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
23159+ if (cpu_isset(cpu, cpu_online_map)) {
23160+ __send_IPI_one(cpu, vector);
23161+ }
23162+ }
23163+ break;
23164+ default:
23165+ printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
23166+ vector);
23167+ break;
23168+ }
23169+}
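A hedged usage example (the call site is illustrative; RESCHEDULE_VECTOR is the IPI vector the SMP code in this patch already uses): kick every other online CPU through the event-channel path above.

	static void kick_other_cpus(void)
	{
		xen_send_IPI_shortcut(APIC_DEST_ALLBUT, RESCHEDULE_VECTOR,
				      APIC_DEST_LOGICAL);
	}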
23170+
23171+static cpumask_t xen_target_cpus(void)
23172+{
23173+ return cpu_online_map;
23174+}
23175+
23176+/*
23177+ * Set up the logical destination ID.
23178+ * Do nothing, not called now.
23179+ */
23180+static void xen_init_apic_ldr(void)
23181+{
23182+ Dprintk("%s\n", __FUNCTION__);
23183+ return;
23184+}
23185+
23186+static void xen_send_IPI_allbutself(int vector)
23187+{
23188+ /*
23189+	 * If there are no other CPUs in the system then we get an APIC
23190+	 * send error if we try to broadcast, so we have to avoid sending
23191+	 * IPIs in this case.
23192+ */
23193+ Dprintk("%s\n", __FUNCTION__);
23194+ if (num_online_cpus() > 1)
23195+ xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
23196+}
23197+
23198+static void xen_send_IPI_all(int vector)
23199+{
23200+ Dprintk("%s\n", __FUNCTION__);
23201+ xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
23202+}
23203+
23204+static void xen_send_IPI_mask(cpumask_t cpumask, int vector)
23205+{
23206+ unsigned long mask = cpus_addr(cpumask)[0];
23207+ unsigned int cpu;
23208+ unsigned long flags;
23209+
23210+ Dprintk("%s\n", __FUNCTION__);
23211+ local_irq_save(flags);
23212+ WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
23213+
23214+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
23215+ if (cpu_isset(cpu, cpumask)) {
23216+ __send_IPI_one(cpu, vector);
23217+ }
23218+ }
23219+ local_irq_restore(flags);
23220+}
23221+
23222+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
23223+static int xen_apic_id_registered(void)
23224+{
23225+ /* better be set */
23226+ Dprintk("%s\n", __FUNCTION__);
23227+ return physid_isset(smp_processor_id(), phys_cpu_present_map);
23228+}
23229+#endif
23230+
23231+static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
23232+{
23233+ Dprintk("%s\n", __FUNCTION__);
23234+ return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
23235+}
23236+
23237+static unsigned int phys_pkg_id(int index_msb)
23238+{
23239+ u32 ebx;
23240+
23241+ Dprintk("%s\n", __FUNCTION__);
23242+ ebx = cpuid_ebx(1);
23243+ return ((ebx >> 24) & 0xFF) >> index_msb;
23244+}
23245+
23246+struct genapic apic_xen = {
23247+ .name = "xen",
23248+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
23249+ .int_delivery_mode = dest_LowestPrio,
23250+#endif
23251+ .int_dest_mode = (APIC_DEST_LOGICAL != 0),
23252+ .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
23253+ .target_cpus = xen_target_cpus,
23254+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
23255+ .apic_id_registered = xen_apic_id_registered,
23256+#endif
23257+ .init_apic_ldr = xen_init_apic_ldr,
23258+ .send_IPI_all = xen_send_IPI_all,
23259+ .send_IPI_allbutself = xen_send_IPI_allbutself,
23260+ .send_IPI_mask = xen_send_IPI_mask,
23261+ .cpu_mask_to_apicid = xen_cpu_mask_to_apicid,
23262+ .phys_pkg_id = phys_pkg_id,
23263+};
23264Index: head-2008-11-25/arch/x86/kernel/head_64-xen.S
23265===================================================================
23266--- /dev/null 1970-01-01 00:00:00.000000000 +0000
23267+++ head-2008-11-25/arch/x86/kernel/head_64-xen.S 2007-08-06 15:10:49.000000000 +0200
23268@@ -0,0 +1,214 @@
23269+/*
23270+ * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
23271+ *
23272+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
23273+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
23274+ * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
23275+ * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
23276+ *
23277+ * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $
23278+ *
23279+ * Jun Nakajima <jun.nakajima@intel.com>
23280+ * Modified for Xen
23281+ */
23282+
23283+
23284+#include <linux/linkage.h>
23285+#include <linux/threads.h>
23286+#include <linux/init.h>
23287+#include <linux/elfnote.h>
23288+#include <asm/desc.h>
23289+#include <asm/segment.h>
23290+#include <asm/page.h>
23291+#include <asm/msr.h>
23292+#include <asm/cache.h>
23293+#include <asm/dwarf2.h>
23294+#include <xen/interface/elfnote.h>
23295+
23296+ .section .bootstrap.text, "ax", @progbits
23297+ .code64
23298+ .globl startup_64
23299+startup_64:
23300+ movq $(init_thread_union+THREAD_SIZE-8),%rsp
23301+
23302+ /* rsi is pointer to startup info structure.
23303+ pass it to C */
23304+ movq %rsi,%rdi
23305+ pushq $0 # fake return address
23306+ jmp x86_64_start_kernel
23307+
23308+#ifdef CONFIG_ACPI_SLEEP
23309+.org 0xf00
23310+ .globl pGDT32
23311+pGDT32:
23312+ .word gdt_end-cpu_gdt_table-1
23313+ .long cpu_gdt_table-__START_KERNEL_map
23314+#endif
23315+ENTRY(stext)
23316+ENTRY(_stext)
23317+
23318+ $page = 0
23319+#define NEXT_PAGE(name) \
23320+ $page = $page + 1; \
23321+ .org $page * 0x1000; \
23322+ phys_##name = $page * 0x1000 + __PHYSICAL_START; \
23323+ENTRY(name)
23324+
23325+NEXT_PAGE(init_level4_pgt)
23326+ /* This gets initialized in x86_64_start_kernel */
23327+ .fill 512,8,0
23328+NEXT_PAGE(init_level4_user_pgt)
23329+ /*
23330+ * We update two pgd entries to make kernel and user pgd consistent
23331+ * at pgd_populate(). It can be used for kernel modules. So we place
23332+ * this page here for those cases to avoid memory corruption.
23333+ * We also use this page to establish the initial mapping for the
23334+ * vsyscall area.
23335+ */
23336+ .fill 512,8,0
23337+
23338+NEXT_PAGE(level3_kernel_pgt)
23339+ .fill 512,8,0
23340+
23341+ /*
23342+ * This is used for vsyscall area mapping as we have a different
23343+ * level4 page table for user.
23344+ */
23345+NEXT_PAGE(level3_user_pgt)
23346+ .fill 512,8,0
23347+
23348+NEXT_PAGE(level2_kernel_pgt)
23349+ .fill 512,8,0
23350+
23351+NEXT_PAGE(hypercall_page)
23352+ CFI_STARTPROC
23353+ .rept 0x1000 / 0x20
23354+ .skip 1 /* push %rcx */
23355+ CFI_ADJUST_CFA_OFFSET 8
23356+ CFI_REL_OFFSET rcx,0
23357+ .skip 2 /* push %r11 */
23358+ CFI_ADJUST_CFA_OFFSET 8
23359+	CFI_REL_OFFSET r11,0
23360+ .skip 5 /* mov $#,%eax */
23361+ .skip 2 /* syscall */
23362+ .skip 2 /* pop %r11 */
23363+ CFI_ADJUST_CFA_OFFSET -8
23364+ CFI_RESTORE r11
23365+ .skip 1 /* pop %rcx */
23366+ CFI_ADJUST_CFA_OFFSET -8
23367+ CFI_RESTORE rcx
23368+ .align 0x20,0 /* ret */
23369+ .endr
23370+ CFI_ENDPROC
23371+
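Each 32-byte slot above only reserves space (with matching CFI annotations) for the stub that Xen writes into the page at boot: push %rcx; push %r11; mov $nr,%eax; syscall; pop %r11; pop %rcx; ret. A hypercall therefore reduces to a call into hypercall_page + nr*32. A sketch of a two-argument wrapper, illustrative only -- the mach-xen hypercall headers provide the real _hypercall*() macros along these lines:

	#define __XH_STR1(x)	#x
	#define __XH_STR(x)	__XH_STR1(x)

	#define xen_hypercall2(nr, a1, a2)					\
	({									\
		long __res;							\
		register unsigned long __a1 asm("rdi") = (unsigned long)(a1);	\
		register unsigned long __a2 asm("rsi") = (unsigned long)(a2);	\
		asm volatile("call hypercall_page + (" __XH_STR(nr) " * 32)"	\
			     : "=a" (__res), "+r" (__a1), "+r" (__a2)		\
			     : : "memory");					\
		__res;								\
	})

This is the shape behind calls such as the HYPERVISOR_memory_op() invocation in head64-xen.c later in this patch.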
23372+#undef NEXT_PAGE
23373+
23374+ .data
23375+/* Just dummy symbol to allow compilation. Not used in sleep path */
23376+#ifdef CONFIG_ACPI_SLEEP
23377+ .align PAGE_SIZE
23378+ENTRY(wakeup_level4_pgt)
23379+ .fill 512,8,0
23380+#endif
23381+
23382+ .data
23383+
23384+ .align 16
23385+ .globl cpu_gdt_descr
23386+cpu_gdt_descr:
23387+ .word gdt_end-cpu_gdt_table-1
23388+gdt:
23389+ .quad cpu_gdt_table
23390+#ifdef CONFIG_SMP
23391+ .rept NR_CPUS-1
23392+ .word 0
23393+ .quad 0
23394+ .endr
23395+#endif
23396+
23397+/* We need valid kernel segments for data and code in long mode too;
23398+ * IRET will check the segment types.  kkeil 2000/10/28
23399+ * Also, sysret mandates a special GDT layout.
23400+ */
23401+
23402+ .section .data.page_aligned, "aw"
23403+ .align PAGE_SIZE
23404+
23405+/* The TLS descriptors are currently at a different place compared to i386.
23406+ Hopefully nobody expects them at a fixed place (Wine?) */
23407+
23408+ENTRY(cpu_gdt_table)
23409+ .quad 0x0000000000000000 /* NULL descriptor */
23410+ .quad 0x0 /* unused */
23411+ .quad 0x00af9a000000ffff /* __KERNEL_CS */
23412+ .quad 0x00cf92000000ffff /* __KERNEL_DS */
23413+ .quad 0x00cffa000000ffff /* __USER32_CS */
23414+ .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */
23415+ .quad 0x00affa000000ffff /* __USER_CS */
23416+ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
23417+ .quad 0,0 /* TSS */
23418+ .quad 0,0 /* LDT */
23419+ .quad 0,0,0 /* three TLS descriptors */
23420+ .quad 0 /* unused */
23421+gdt_end:
23422+ /* asm/segment.h:GDT_ENTRIES must match this */
23423+ /* This should be a multiple of the cache line size */
23424+ /* GDTs of other CPUs are now dynamically allocated */
23425+
23426+ /* zero the remaining page */
23427+ .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
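For reference, a worked decode of one entry above, assuming the standard x86 segment-descriptor layout -- __KERNEL_CS = 0x00af9a000000ffff:

	limit  = 0xfffff (bits 0-15, 48-51), scaled by G=1 to cover 4 GiB
	base   = 0x00000000 (bits 16-39, 56-63)
	access = 0x9a (bits 40-47): present, DPL 0, execute/read code segment
	flags  = 0xa (bits 52-55): G=1, D=0, L=1, i.e. a 64-bit code segment

The L=1/D=0 combination is what makes it a long-mode segment, and the fixed ordering relative to __KERNEL_DS and the user segments is the layout that sysret, per the comment above, depends on.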
23428+
23429+ .section .bss.page_aligned, "aw", @nobits
23430+ .align PAGE_SIZE
23431+ENTRY(empty_zero_page)
23432+ .skip PAGE_SIZE
23433+
23434+#if CONFIG_XEN_COMPAT <= 0x030002
23435+/*
23436+ * __xen_guest information
23437+ */
23438+.macro utoh value
23439+ .if (\value) < 0 || (\value) >= 0x10
23440+ utoh (((\value)>>4)&0x0fffffffffffffff)
23441+ .endif
23442+ .if ((\value) & 0xf) < 10
23443+ .byte '0' + ((\value) & 0xf)
23444+ .else
23445+ .byte 'A' + ((\value) & 0xf) - 10
23446+ .endif
23447+.endm
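The recursive utoh macro above emits its argument as upper-case hexadecimal ASCII at assembly time, most significant nibble first and without leading zeros; the same logic in C, as an illustrative sketch only:

	static void utoh_c(unsigned long value, void (*emit)(char))
	{
		if (value >= 0x10)
			utoh_c(value >> 4, emit);	/* high nibbles first */
		emit("0123456789ABCDEF"[value & 0xf]);
	}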
23448+
23449+.section __xen_guest
23450+ .ascii "GUEST_OS=linux,GUEST_VER=2.6"
23451+ .ascii ",XEN_VER=xen-3.0"
23452+ .ascii ",VIRT_BASE=0x"
23453+ utoh __START_KERNEL_map
23454+ .ascii ",ELF_PADDR_OFFSET=0x"
23455+ utoh __START_KERNEL_map
23456+ .ascii ",VIRT_ENTRY=0x"
23457+ utoh (__START_KERNEL_map + __PHYSICAL_START)
23458+ .ascii ",HYPERCALL_PAGE=0x"
23459+ utoh (phys_hypercall_page >> PAGE_SHIFT)
23460+ .ascii ",FEATURES=writable_page_tables"
23461+ .ascii "|writable_descriptor_tables"
23462+ .ascii "|auto_translated_physmap"
23463+ .ascii "|supervisor_mode_kernel"
23464+ .ascii ",LOADER=generic"
23465+ .byte 0
23466+#endif /* CONFIG_XEN_COMPAT <= 0x030002 */
23467+
23468+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux")
23469+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6")
23470+ ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0")
23471+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .quad, __START_KERNEL_map)
23472+#if CONFIG_XEN_COMPAT <= 0x030002
23473+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad, __START_KERNEL_map)
23474+#else
23475+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad, 0)
23476+#endif
23477+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad, startup_64)
23478+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad, hypercall_page)
23479+ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad, _PAGE_PRESENT,_PAGE_PRESENT)
23480+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
23481+ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic")
23482+ ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1)
23483Index: head-2008-11-25/arch/x86/kernel/head64-xen.c
23484===================================================================
23485--- /dev/null 1970-01-01 00:00:00.000000000 +0000
23486+++ head-2008-11-25/arch/x86/kernel/head64-xen.c 2007-06-12 13:13:01.000000000 +0200
23487@@ -0,0 +1,162 @@
23488+/*
23489+ * linux/arch/x86_64/kernel/head64.c -- prepare to run common code
23490+ *
23491+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
23492+ *
23493+ * Jun Nakajima <jun.nakajima@intel.com>
23494+ * Modified for Xen.
23495+ */
23496+
23497+#include <linux/init.h>
23498+#include <linux/linkage.h>
23499+#include <linux/types.h>
23500+#include <linux/kernel.h>
23501+#include <linux/string.h>
23502+#include <linux/percpu.h>
23503+#include <linux/module.h>
23504+
23505+#include <asm/processor.h>
23506+#include <asm/proto.h>
23507+#include <asm/smp.h>
23508+#include <asm/bootsetup.h>
23509+#include <asm/setup.h>
23510+#include <asm/desc.h>
23511+#include <asm/pgtable.h>
23512+#include <asm/sections.h>
23513+
23514+unsigned long start_pfn;
23515+
23516+/* Don't add a printk in there. printk relies on the PDA which is not initialized
23517+ yet. */
23518+#if 0
23519+static void __init clear_bss(void)
23520+{
23521+ memset(__bss_start, 0,
23522+ (unsigned long) __bss_stop - (unsigned long) __bss_start);
23523+}
23524+#endif
23525+
23526+#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
23527+#define OLD_CL_MAGIC_ADDR 0x90020
23528+#define OLD_CL_MAGIC 0xA33F
23529+#define OLD_CL_BASE_ADDR 0x90000
23530+#define OLD_CL_OFFSET 0x90022
23531+
23532+extern char saved_command_line[];
23533+
23534+static void __init copy_bootdata(char *real_mode_data)
23535+{
23536+#ifndef CONFIG_XEN
23537+ int new_data;
23538+ char * command_line;
23539+
23540+ memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
23541+ new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
23542+ if (!new_data) {
23543+ if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
23544+ printk("so old bootloader that it does not support commandline?!\n");
23545+ return;
23546+ }
23547+ new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
23548+ printk("old bootloader convention, maybe loadlin?\n");
23549+ }
23550+ command_line = (char *) ((u64)(new_data));
23551+ memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
23552+#else
23553+ int max_cmdline;
23554+
23555+ if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
23556+ max_cmdline = COMMAND_LINE_SIZE;
23557+ memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
23558+ saved_command_line[max_cmdline-1] = '\0';
23559+#endif
23560+ printk("Bootdata ok (command line is %s)\n", saved_command_line);
23561+}
23562+
23563+static void __init setup_boot_cpu_data(void)
23564+{
23565+ unsigned int dummy, eax;
23566+
23567+ /* get vendor info */
23568+ cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
23569+ (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
23570+ (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
23571+ (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
23572+
23573+ /* get cpu type */
23574+ cpuid(1, &eax, &dummy, &dummy,
23575+ (unsigned int *) &boot_cpu_data.x86_capability);
23576+ boot_cpu_data.x86 = (eax >> 8) & 0xf;
23577+ boot_cpu_data.x86_model = (eax >> 4) & 0xf;
23578+ boot_cpu_data.x86_mask = eax & 0xf;
23579+}
23580+
23581+#include <xen/interface/memory.h>
23582+unsigned long *machine_to_phys_mapping;
23583+EXPORT_SYMBOL(machine_to_phys_mapping);
23584+unsigned int machine_to_phys_order;
23585+EXPORT_SYMBOL(machine_to_phys_order);
23586+
23587+void __init x86_64_start_kernel(char * real_mode_data)
23588+{
23589+ struct xen_machphys_mapping mapping;
23590+ unsigned long machine_to_phys_nr_ents;
23591+ char *s;
23592+ int i;
23593+
23594+ setup_xen_features();
23595+
23596+ xen_start_info = (struct start_info *)real_mode_data;
23597+ if (!xen_feature(XENFEAT_auto_translated_physmap))
23598+ phys_to_machine_mapping =
23599+ (unsigned long *)xen_start_info->mfn_list;
23600+ start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
23601+ xen_start_info->nr_pt_frames;
23602+
23603+ machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
23604+ machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
23605+ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
23606+ machine_to_phys_mapping = (unsigned long *)mapping.v_start;
23607+ machine_to_phys_nr_ents = mapping.max_mfn + 1;
23608+ }
23609+ while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
23610+ machine_to_phys_order++;
23611+
23612+#if 0
23613+ for (i = 0; i < 256; i++)
23614+ set_intr_gate(i, early_idt_handler);
23615+ asm volatile("lidt %0" :: "m" (idt_descr));
23616+#endif
23617+
23618+ /*
23619+ * This must be called really, really early:
23620+ */
23621+ lockdep_init();
23622+
23623+ for (i = 0; i < NR_CPUS; i++)
23624+ cpu_pda(i) = &boot_cpu_pda[i];
23625+
23626+ pda_init(0);
23627+ copy_bootdata(real_mode_data);
23628+#ifdef CONFIG_SMP
23629+ cpu_set(0, cpu_online_map);
23630+#endif
23631+ s = strstr(saved_command_line, "earlyprintk=");
23632+ if (s != NULL)
23633+ setup_early_printk(strchr(s, '=') + 1);
23634+#ifdef CONFIG_NUMA
23635+ s = strstr(saved_command_line, "numa=");
23636+ if (s != NULL)
23637+ numa_setup(s+5);
23638+#endif
23639+#ifdef CONFIG_X86_IO_APIC
23640+ if (strstr(saved_command_line, "disableapic"))
23641+ disable_apic = 1;
23642+#endif
23643+ /* You need early console to see that */
23644+ if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
23645+ panic("Kernel too big for kernel mapping\n");
23646+
23647+ setup_boot_cpu_data();
23648+ start_kernel();
23649+}
23650Index: head-2008-11-25/arch/x86/kernel/io_apic_64-xen.c
23651===================================================================
23652--- /dev/null 1970-01-01 00:00:00.000000000 +0000
23653+++ head-2008-11-25/arch/x86/kernel/io_apic_64-xen.c 2008-11-25 12:22:34.000000000 +0100
23654@@ -0,0 +1,2268 @@
23655+/*
23656+ * Intel IO-APIC support for multi-Pentium hosts.
23657+ *
23658+ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
23659+ *
23660+ * Many thanks to Stig Venaas for trying out countless experimental
23661+ * patches and reporting/debugging problems patiently!
23662+ *
23663+ * (c) 1999, Multiple IO-APIC support, developed by
23664+ * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
23665+ * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
23666+ * further tested and cleaned up by Zach Brown <zab@redhat.com>
23667+ * and Ingo Molnar <mingo@redhat.com>
23668+ *
23669+ * Fixes
23670+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
23671+ * thanks to Eric Gilmore
23672+ * and Rolf G. Tews
23673+ * for testing these extensively
23674+ * Paul Diefenbaugh : Added full ACPI support
23675+ */
23676+
23677+#include <linux/mm.h>
23678+#include <linux/interrupt.h>
23679+#include <linux/init.h>
23680+#include <linux/delay.h>
23681+#include <linux/sched.h>
23682+#include <linux/smp_lock.h>
23683+#include <linux/mc146818rtc.h>
23684+#include <linux/acpi.h>
23685+#include <linux/sysdev.h>
23686+#ifdef CONFIG_ACPI
23687+#include <acpi/acpi_bus.h>
23688+#endif
23689+
23690+#include <asm/io.h>
23691+#include <asm/smp.h>
23692+#include <asm/desc.h>
23693+#include <asm/proto.h>
23694+#include <asm/mach_apic.h>
23695+#include <asm/acpi.h>
23696+#include <asm/dma.h>
23697+#include <asm/nmi.h>
23698+
23699+#define __apicdebuginit __init
23700+
23701+int sis_apic_bug; /* not actually supported, dummy for compile */
23702+
23703+static int no_timer_check;
23704+
23705+int disable_timer_pin_1 __initdata;
23706+
23707+#ifndef CONFIG_XEN
23708+int timer_over_8254 __initdata = 0;
23709+
23710+/* Where if anywhere is the i8259 connect in external int mode */
23711+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
23712+#endif
23713+
23714+static DEFINE_SPINLOCK(ioapic_lock);
23715+static DEFINE_SPINLOCK(vector_lock);
23716+
23717+/*
23718+ * # of IRQ routing registers
23719+ */
23720+int nr_ioapic_registers[MAX_IO_APICS];
23721+
23722+/*
23723+ * Rough estimation of how many shared IRQs there are, can
23724+ * be changed anytime.
23725+ */
23726+#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
23727+#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
23728+
23729+/*
23730+ * This is performance-critical; we want to do it in O(1).
23731+ *
23732+ * the indexing order of this array favors 1:1 mappings
23733+ * between pins and IRQs.
23734+ */
23735+
23736+static struct irq_pin_list {
23737+ short apic, pin, next;
23738+} irq_2_pin[PIN_MAP_SIZE];
23739+
23740+int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
23741+#ifdef CONFIG_PCI_MSI
23742+#define vector_to_irq(vector) \
23743+ (platform_legacy_irq(vector) ? vector : vector_irq[vector])
23744+#else
23745+#define vector_to_irq(vector) (vector)
23746+#endif
23747+
23748+#ifdef CONFIG_XEN
23749+
23750+#include <xen/interface/xen.h>
23751+#include <xen/interface/physdev.h>
23752+#include <xen/evtchn.h>
23753+
23754+/* Fake i8259 */
23755+#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
23756+#define disable_8259A_irq(_irq) ((void)0)
23757+#define i8259A_irq_pending(_irq) (0)
23758+
23759+unsigned long io_apic_irqs;
23760+
23761+static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
23762+{
23763+ struct physdev_apic apic_op;
23764+ int ret;
23765+
23766+ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
23767+ apic_op.reg = reg;
23768+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
23769+ if (ret)
23770+ return ret;
23771+ return apic_op.value;
23772+}
23773+
23774+static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
23775+{
23776+ struct physdev_apic apic_op;
23777+
23778+ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
23779+ apic_op.reg = reg;
23780+ apic_op.value = value;
23781+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
23782+}
23783+
23784+#define io_apic_read(a,r) xen_io_apic_read(a,r)
23785+#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
23786+
23787+#define clear_IO_APIC() ((void)0)
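A hedged usage example for the hypercall-backed accessors above (the helper name is illustrative; register index 0x01 is the IO-APIC version register in the standard register layout):

	static unsigned int ioapic_version(unsigned int apic)
	{
		return io_apic_read(apic, 0x01) & 0xff;
	}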
23788+
23789+#else
23790+
23791+#ifdef CONFIG_SMP
23792+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
23793+{
23794+ unsigned long flags;
23795+ unsigned int dest;
23796+ cpumask_t tmp;
23797+
23798+ cpus_and(tmp, mask, cpu_online_map);
23799+ if (cpus_empty(tmp))
23800+ tmp = TARGET_CPUS;
23801+
23802+ cpus_and(mask, tmp, CPU_MASK_ALL);
23803+
23804+ dest = cpu_mask_to_apicid(mask);
23805+
23806+ /*
23807+ * Only the high 8 bits are valid.
23808+ */
23809+ dest = SET_APIC_LOGICAL_ID(dest);
23810+
23811+ spin_lock_irqsave(&ioapic_lock, flags);
23812+ __DO_ACTION(1, = dest, )
23813+ set_irq_info(irq, mask);
23814+ spin_unlock_irqrestore(&ioapic_lock, flags);
23815+}
23816+#endif
23817+
23818+#endif /* !CONFIG_XEN */
23819+
23820+/*
23821+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
23822+ * shared ISA-space IRQs, so we have to support them. We are super
23823+ * fast in the common case, and fast for shared ISA-space IRQs.
23824+ */
23825+static void add_pin_to_irq(unsigned int irq, int apic, int pin)
23826+{
23827+ static int first_free_entry = NR_IRQS;
23828+ struct irq_pin_list *entry = irq_2_pin + irq;
23829+
23830+ BUG_ON(irq >= NR_IRQS);
23831+ while (entry->next)
23832+ entry = irq_2_pin + entry->next;
23833+
23834+ if (entry->pin != -1) {
23835+ entry->next = first_free_entry;
23836+ entry = irq_2_pin + entry->next;
23837+ if (++first_free_entry >= PIN_MAP_SIZE)
23838+ panic("io_apic.c: ran out of irq_2_pin entries!");
23839+ }
23840+ entry->apic = apic;
23841+ entry->pin = pin;
23842+}
23843+
23844+#ifndef CONFIG_XEN
23845+#define __DO_ACTION(R, ACTION, FINAL) \
23846+ \
23847+{ \
23848+ int pin; \
23849+ struct irq_pin_list *entry = irq_2_pin + irq; \
23850+ \
23851+ BUG_ON(irq >= NR_IRQS); \
23852+ for (;;) { \
23853+ unsigned int reg; \
23854+ pin = entry->pin; \
23855+ if (pin == -1) \
23856+ break; \
23857+ reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
23858+ reg ACTION; \
23859+ io_apic_modify(entry->apic, reg); \
23860+ if (!entry->next) \
23861+ break; \
23862+ entry = irq_2_pin + entry->next; \
23863+ } \
23864+ FINAL; \
23865+}
23866+
23867+#define DO_ACTION(name,R,ACTION, FINAL) \
23868+ \
23869+ static void name##_IO_APIC_irq (unsigned int irq) \
23870+ __DO_ACTION(R, ACTION, FINAL)
23871+
23872+DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
23873+ /* mask = 1 */
23874+DO_ACTION( __unmask, 0, &= 0xfffeffff, )
23875+ /* mask = 0 */
23876+
23877+static void mask_IO_APIC_irq (unsigned int irq)
23878+{
23879+ unsigned long flags;
23880+
23881+ spin_lock_irqsave(&ioapic_lock, flags);
23882+ __mask_IO_APIC_irq(irq);
23883+ spin_unlock_irqrestore(&ioapic_lock, flags);
23884+}
23885+
23886+static void unmask_IO_APIC_irq (unsigned int irq)
23887+{
23888+ unsigned long flags;
23889+
23890+ spin_lock_irqsave(&ioapic_lock, flags);
23891+ __unmask_IO_APIC_irq(irq);
23892+ spin_unlock_irqrestore(&ioapic_lock, flags);
23893+}
23894+
23895+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
23896+{
23897+ struct IO_APIC_route_entry entry;
23898+ unsigned long flags;
23899+
23900+ /* Check delivery_mode to be sure we're not clearing an SMI pin */
23901+ spin_lock_irqsave(&ioapic_lock, flags);
23902+ *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
23903+ *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
23904+ spin_unlock_irqrestore(&ioapic_lock, flags);
23905+ if (entry.delivery_mode == dest_SMI)
23906+ return;
23907+ /*
23908+ * Disable it in the IO-APIC irq-routing table:
23909+ */
23910+ memset(&entry, 0, sizeof(entry));
23911+ entry.mask = 1;
23912+ spin_lock_irqsave(&ioapic_lock, flags);
23913+ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
23914+ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
23915+ spin_unlock_irqrestore(&ioapic_lock, flags);
23916+}
23917+
23918+static void clear_IO_APIC (void)
23919+{
23920+ int apic, pin;
23921+
23922+ for (apic = 0; apic < nr_ioapics; apic++)
23923+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
23924+ clear_IO_APIC_pin(apic, pin);
23925+}
23926+
23927+#endif /* !CONFIG_XEN */
23928+
23929+static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
23930+
23931+/*
23932+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
23933+ * specific CPU-side IRQs.
23934+ */
23935+
23936+#define MAX_PIRQS 8
23937+static int pirq_entries [MAX_PIRQS];
23938+static int pirqs_enabled;
23939+int skip_ioapic_setup;
23940+int ioapic_force;
23941+
23942+/* dummy parsing: see setup.c */
23943+
23944+static int __init disable_ioapic_setup(char *str)
23945+{
23946+ skip_ioapic_setup = 1;
23947+ return 1;
23948+}
23949+
23950+static int __init enable_ioapic_setup(char *str)
23951+{
23952+ ioapic_force = 1;
23953+ skip_ioapic_setup = 0;
23954+ return 1;
23955+}
23956+
23957+__setup("noapic", disable_ioapic_setup);
23958+__setup("apic", enable_ioapic_setup);
23959+
23960+#ifndef CONFIG_XEN
23961+static int __init setup_disable_8254_timer(char *s)
23962+{
23963+ timer_over_8254 = -1;
23964+ return 1;
23965+}
23966+static int __init setup_enable_8254_timer(char *s)
23967+{
23968+ timer_over_8254 = 2;
23969+ return 1;
23970+}
23971+
23972+__setup("disable_8254_timer", setup_disable_8254_timer);
23973+__setup("enable_8254_timer", setup_enable_8254_timer);
23974+#endif /* !CONFIG_XEN */
23975+
23976+#include <asm/pci-direct.h>
23977+#include <linux/pci_ids.h>
23978+#include <linux/pci.h>
23979+
23980+
23981+#ifdef CONFIG_ACPI
23982+
23983+static int nvidia_hpet_detected __initdata;
23984+
23985+static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
23986+{
23987+ nvidia_hpet_detected = 1;
23988+ return 0;
23989+}
23990+#endif
23991+
23992+/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
23993+ off. Check for an Nvidia or VIA PCI bridge and turn it off.
23994+ Use pci direct infrastructure because this runs before the PCI subsystem.
23995+
23996+ Can be overwritten with "apic"
23997+
23998+ And another hack to disable the IOMMU on VIA chipsets.
23999+
24000+ ... and others. Really should move this somewhere else.
24001+
24002+ Kludge-O-Rama. */
24003+void __init check_ioapic(void)
24004+{
24005+ int num,slot,func;
24006+ /* Poor man's PCI discovery */
24007+ for (num = 0; num < 32; num++) {
24008+ for (slot = 0; slot < 32; slot++) {
24009+ for (func = 0; func < 8; func++) {
24010+ u32 class;
24011+ u32 vendor;
24012+ u8 type;
24013+ class = read_pci_config(num,slot,func,
24014+ PCI_CLASS_REVISION);
24015+ if (class == 0xffffffff)
24016+ break;
24017+
24018+ if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
24019+ continue;
24020+
24021+ vendor = read_pci_config(num, slot, func,
24022+ PCI_VENDOR_ID);
24023+ vendor &= 0xffff;
24024+ switch (vendor) {
24025+ case PCI_VENDOR_ID_VIA:
24026+#ifdef CONFIG_IOMMU
24027+ if ((end_pfn > MAX_DMA32_PFN ||
24028+ force_iommu) &&
24029+ !iommu_aperture_allowed) {
24030+ printk(KERN_INFO
24031+ "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n");
24032+ iommu_aperture_disabled = 1;
24033+ }
24034+#endif
24035+ return;
24036+ case PCI_VENDOR_ID_NVIDIA:
24037+#ifdef CONFIG_ACPI
24038+ /*
24039+ * All timer overrides on Nvidia are
24040+ * wrong unless HPET is enabled.
24041+ */
24042+ nvidia_hpet_detected = 0;
24043+ acpi_table_parse(ACPI_HPET,
24044+ nvidia_hpet_check);
24045+ if (nvidia_hpet_detected == 0) {
24046+ acpi_skip_timer_override = 1;
24047+ printk(KERN_INFO "Nvidia board "
24048+ "detected. Ignoring ACPI "
24049+ "timer override.\n");
24050+ }
24051+#endif
24052+ /* RED-PEN skip them on mptables too? */
24053+ return;
24054+ case PCI_VENDOR_ID_ATI:
24055+
24056+ /* This should actually be the default, but
24057+ for 2.6.16 let's do it only for ATI, where
24058+ it's really needed. */
24059+#ifndef CONFIG_XEN
24060+ if (timer_over_8254 == 1) {
24061+ timer_over_8254 = 0;
24062+ printk(KERN_INFO
24063+ "ATI board detected. Disabling timer routing over 8254.\n");
24064+ }
24065+#endif
24066+ return;
24067+ }
24068+
24069+
24070+ /* No multi-function device? */
24071+ type = read_pci_config_byte(num,slot,func,
24072+ PCI_HEADER_TYPE);
24073+ if (!(type & 0x80))
24074+ break;
24075+ }
24076+ }
24077+ }
24078+}
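+/*
+ * Note: read_pci_config()/read_pci_config_byte() come from
+ * <asm/pci-direct.h> and use type-1 configuration cycles (ports
+ * 0xCF8/0xCFC), so they work this early, before the PCI subsystem is
+ * initialized. Only buses 0-31 are scanned here, which is presumably
+ * enough for the host bridges this quirk is looking for.
+ */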
24079+
24080+static int __init ioapic_pirq_setup(char *str)
24081+{
24082+ int i, max;
24083+ int ints[MAX_PIRQS+1];
24084+
24085+ get_options(str, ARRAY_SIZE(ints), ints);
24086+
24087+ for (i = 0; i < MAX_PIRQS; i++)
24088+ pirq_entries[i] = -1;
24089+
24090+ pirqs_enabled = 1;
24091+ apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
24092+ max = MAX_PIRQS;
24093+ if (ints[0] < MAX_PIRQS)
24094+ max = ints[0];
24095+
24096+ for (i = 0; i < max; i++) {
24097+ apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
24098+ /*
24099+ * PIRQs are mapped upside down, usually.
24100+ */
24101+ pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
24102+ }
24103+ return 1;
24104+}
24105+
24106+__setup("pirq=", ioapic_pirq_setup);
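+/*
+ * Illustration (hypothetical values): booting with "pirq=10,11" yields
+ * ints[] = { 2, 10, 11 }, so the loop above sets pirq_entries[7] = 10
+ * and pirq_entries[6] = 11. pin_2_irq() then redirects IO-APIC pin 23
+ * to IRQ 10 and pin 22 to IRQ 11 ("mapped upside down"); an entry of 0
+ * disables the corresponding PIRQ instead.
+ */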
24107+
24108+/*
24109+ * Find the IRQ entry number of a certain pin.
24110+ */
24111+static int find_irq_entry(int apic, int pin, int type)
24112+{
24113+ int i;
24114+
24115+ for (i = 0; i < mp_irq_entries; i++)
24116+ if (mp_irqs[i].mpc_irqtype == type &&
24117+ (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
24118+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
24119+ mp_irqs[i].mpc_dstirq == pin)
24120+ return i;
24121+
24122+ return -1;
24123+}
24124+
24125+#ifndef CONFIG_XEN
24126+/*
24127+ * Find the pin to which IRQ[irq] (ISA) is connected
24128+ */
24129+static int __init find_isa_irq_pin(int irq, int type)
24130+{
24131+ int i;
24132+
24133+ for (i = 0; i < mp_irq_entries; i++) {
24134+ int lbus = mp_irqs[i].mpc_srcbus;
24135+
24136+ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
24137+ mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
24138+ mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
24139+ (mp_irqs[i].mpc_irqtype == type) &&
24140+ (mp_irqs[i].mpc_srcbusirq == irq))
24141+
24142+ return mp_irqs[i].mpc_dstirq;
24143+ }
24144+ return -1;
24145+}
24146+
24147+static int __init find_isa_irq_apic(int irq, int type)
24148+{
24149+ int i;
24150+
24151+ for (i = 0; i < mp_irq_entries; i++) {
24152+ int lbus = mp_irqs[i].mpc_srcbus;
24153+
24154+ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
24155+ mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
24156+ mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
24157+ (mp_irqs[i].mpc_irqtype == type) &&
24158+ (mp_irqs[i].mpc_srcbusirq == irq))
24159+ break;
24160+ }
24161+ if (i < mp_irq_entries) {
24162+ int apic;
24163+ for(apic = 0; apic < nr_ioapics; apic++) {
24164+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
24165+ return apic;
24166+ }
24167+ }
24168+
24169+ return -1;
24170+}
24171+#endif
24172+
24173+/*
24174+ * Find a specific PCI IRQ entry.
24175+ * Not an __init, possibly needed by modules
24176+ */
24177+static int pin_2_irq(int idx, int apic, int pin);
24178+
24179+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
24180+{
24181+ int apic, i, best_guess = -1;
24182+
24183+ apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
24184+ bus, slot, pin);
24185+ if (mp_bus_id_to_pci_bus[bus] == -1) {
24186+ apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
24187+ return -1;
24188+ }
24189+ for (i = 0; i < mp_irq_entries; i++) {
24190+ int lbus = mp_irqs[i].mpc_srcbus;
24191+
24192+ for (apic = 0; apic < nr_ioapics; apic++)
24193+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
24194+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
24195+ break;
24196+
24197+ if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
24198+ !mp_irqs[i].mpc_irqtype &&
24199+ (bus == lbus) &&
24200+ (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
24201+ int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
24202+
24203+ if (!(apic || IO_APIC_IRQ(irq)))
24204+ continue;
24205+
24206+ if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
24207+ return irq;
24208+ /*
24209+ * Use the first all-but-pin matching entry as a
24210+ * best-guess fuzzy result for broken mptables.
24211+ */
24212+ if (best_guess < 0)
24213+ best_guess = irq;
24214+ }
24215+ }
24216+ BUG_ON(best_guess >= NR_IRQS);
24217+ return best_guess;
24218+}
24219+
24220+/*
24221+ * EISA Edge/Level control register, ELCR
24222+ */
24223+static int EISA_ELCR(unsigned int irq)
24224+{
24225+ if (irq < 16) {
24226+ unsigned int port = 0x4d0 + (irq >> 3);
24227+ return (inb(port) >> (irq & 7)) & 1;
24228+ }
24229+ apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
24230+ return 0;
24231+}
24232+
24233+/* EISA interrupts are always polarity zero and can be edge or level
24234+ * trigger depending on the ELCR value. If an interrupt is listed as
24235+ * EISA conforming in the MP table, that means its trigger type must
24236+ * be read in from the ELCR */
24237+
24238+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
24239+#define default_EISA_polarity(idx) (0)
24240+
24241+/* ISA interrupts are always polarity zero edge triggered,
24242+ * when listed as conforming in the MP table. */
24243+
24244+#define default_ISA_trigger(idx) (0)
24245+#define default_ISA_polarity(idx) (0)
24246+
24247+/* PCI interrupts are always polarity one level triggered,
24248+ * when listed as conforming in the MP table. */
24249+
24250+#define default_PCI_trigger(idx) (1)
24251+#define default_PCI_polarity(idx) (1)
24252+
24253+/* MCA interrupts are always polarity zero level triggered,
24254+ * when listed as conforming in the MP table. */
24255+
24256+#define default_MCA_trigger(idx) (1)
24257+#define default_MCA_polarity(idx) (0)
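+/*
+ * Summary of the defaults above, keyed on mpc_irqflag (bits 1:0 =
+ * polarity, bits 3:2 = trigger, 0 = "conforms to bus"):
+ *
+ *   bus    polarity        trigger
+ *   ISA    active high     edge
+ *   EISA   active high     read from ELCR
+ *   PCI    active low      level
+ *   MCA    active high     level
+ */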
24258+
24259+static int __init MPBIOS_polarity(int idx)
24260+{
24261+ int bus = mp_irqs[idx].mpc_srcbus;
24262+ int polarity;
24263+
24264+ /*
24265+ * Determine IRQ line polarity (high active or low active):
24266+ */
24267+ switch (mp_irqs[idx].mpc_irqflag & 3)
24268+ {
24269+ case 0: /* conforms, ie. bus-type dependent polarity */
24270+ {
24271+ switch (mp_bus_id_to_type[bus])
24272+ {
24273+ case MP_BUS_ISA: /* ISA pin */
24274+ {
24275+ polarity = default_ISA_polarity(idx);
24276+ break;
24277+ }
24278+ case MP_BUS_EISA: /* EISA pin */
24279+ {
24280+ polarity = default_EISA_polarity(idx);
24281+ break;
24282+ }
24283+ case MP_BUS_PCI: /* PCI pin */
24284+ {
24285+ polarity = default_PCI_polarity(idx);
24286+ break;
24287+ }
24288+ case MP_BUS_MCA: /* MCA pin */
24289+ {
24290+ polarity = default_MCA_polarity(idx);
24291+ break;
24292+ }
24293+ default:
24294+ {
24295+ printk(KERN_WARNING "broken BIOS!!\n");
24296+ polarity = 1;
24297+ break;
24298+ }
24299+ }
24300+ break;
24301+ }
24302+ case 1: /* high active */
24303+ {
24304+ polarity = 0;
24305+ break;
24306+ }
24307+ case 2: /* reserved */
24308+ {
24309+ printk(KERN_WARNING "broken BIOS!!\n");
24310+ polarity = 1;
24311+ break;
24312+ }
24313+ case 3: /* low active */
24314+ {
24315+ polarity = 1;
24316+ break;
24317+ }
24318+ default: /* invalid */
24319+ {
24320+ printk(KERN_WARNING "broken BIOS!!\n");
24321+ polarity = 1;
24322+ break;
24323+ }
24324+ }
24325+ return polarity;
24326+}
24327+
24328+static int MPBIOS_trigger(int idx)
24329+{
24330+ int bus = mp_irqs[idx].mpc_srcbus;
24331+ int trigger;
24332+
24333+ /*
24334+ * Determine IRQ trigger mode (edge or level sensitive):
24335+ */
24336+ switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
24337+ {
24338+ case 0: /* conforms, ie. bus-type dependent */
24339+ {
24340+ switch (mp_bus_id_to_type[bus])
24341+ {
24342+ case MP_BUS_ISA: /* ISA pin */
24343+ {
24344+ trigger = default_ISA_trigger(idx);
24345+ break;
24346+ }
24347+ case MP_BUS_EISA: /* EISA pin */
24348+ {
24349+ trigger = default_EISA_trigger(idx);
24350+ break;
24351+ }
24352+ case MP_BUS_PCI: /* PCI pin */
24353+ {
24354+ trigger = default_PCI_trigger(idx);
24355+ break;
24356+ }
24357+ case MP_BUS_MCA: /* MCA pin */
24358+ {
24359+ trigger = default_MCA_trigger(idx);
24360+ break;
24361+ }
24362+ default:
24363+ {
24364+ printk(KERN_WARNING "broken BIOS!!\n");
24365+ trigger = 1;
24366+ break;
24367+ }
24368+ }
24369+ break;
24370+ }
24371+ case 1: /* edge */
24372+ {
24373+ trigger = 0;
24374+ break;
24375+ }
24376+ case 2: /* reserved */
24377+ {
24378+ printk(KERN_WARNING "broken BIOS!!\n");
24379+ trigger = 1;
24380+ break;
24381+ }
24382+ case 3: /* level */
24383+ {
24384+ trigger = 1;
24385+ break;
24386+ }
24387+ default: /* invalid */
24388+ {
24389+ printk(KERN_WARNING "broken BIOS!!\n");
24390+ trigger = 0;
24391+ break;
24392+ }
24393+ }
24394+ return trigger;
24395+}
24396+
24397+static inline int irq_polarity(int idx)
24398+{
24399+ return MPBIOS_polarity(idx);
24400+}
24401+
24402+static inline int irq_trigger(int idx)
24403+{
24404+ return MPBIOS_trigger(idx);
24405+}
24406+
24407+static int next_irq = 16;
24408+
24409+/*
24410+ * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ
24411+ * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
24412+ * from ACPI, which can reach 800 in large boxen.
24413+ *
24414+ * Compact the sparse GSI space into a sequential IRQ series and reuse
24415+ * vectors if possible.
24416+ */
24417+int gsi_irq_sharing(int gsi)
24418+{
24419+ int i, tries, vector;
24420+
24421+ BUG_ON(gsi >= NR_IRQ_VECTORS);
24422+
24423+ if (platform_legacy_irq(gsi))
24424+ return gsi;
24425+
24426+ if (gsi_2_irq[gsi] != 0xFF)
24427+ return (int)gsi_2_irq[gsi];
24428+
24429+ tries = NR_IRQS;
24430+ try_again:
24431+ vector = assign_irq_vector(gsi);
24432+
24433+ /*
24434+ * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
24435+ * use of vector and if found, return that IRQ. However, we never want
24436+ * to share legacy IRQs, which usually have a different trigger mode
24437+ * than PCI.
24438+ */
24439+ for (i = 0; i < NR_IRQS; i++)
24440+ if (IO_APIC_VECTOR(i) == vector)
24441+ break;
24442+ if (platform_legacy_irq(i)) {
24443+ if (--tries >= 0) {
24444+ IO_APIC_VECTOR(i) = 0;
24445+ goto try_again;
24446+ }
24447+ panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
24448+ }
24449+ if (i < NR_IRQS) {
24450+ gsi_2_irq[gsi] = i;
24451+ printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
24452+ gsi, vector, i);
24453+ return i;
24454+ }
24455+
24456+ i = next_irq++;
24457+ BUG_ON(i >= NR_IRQS);
24458+ gsi_2_irq[gsi] = i;
24459+ IO_APIC_VECTOR(i) = vector;
24460+ printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
24461+ gsi, vector, i);
24462+ return i;
24463+}
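+/*
+ * Worked example (hypothetical numbers): if GSI 40 is assigned vector
+ * 0xA9 and the scan finds non-legacy IRQ 17 already using that vector,
+ * GSI 40 is recorded as sharing IRQ 17. If no IRQ uses the vector, a
+ * fresh IRQ is taken from next_irq (which starts above the 16 legacy
+ * IRQs) and cached in gsi_2_irq[] for later lookups; legacy IRQs are
+ * never shared, so hitting one simply triggers a retry.
+ */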
24464+
24465+static int pin_2_irq(int idx, int apic, int pin)
24466+{
24467+ int irq, i;
24468+ int bus = mp_irqs[idx].mpc_srcbus;
24469+
24470+ /*
24471+ * Debugging check; we are in big trouble if this message pops up!
24472+ */
24473+ if (mp_irqs[idx].mpc_dstirq != pin)
24474+ printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
24475+
24476+ switch (mp_bus_id_to_type[bus])
24477+ {
24478+ case MP_BUS_ISA: /* ISA pin */
24479+ case MP_BUS_EISA:
24480+ case MP_BUS_MCA:
24481+ {
24482+ irq = mp_irqs[idx].mpc_srcbusirq;
24483+ break;
24484+ }
24485+ case MP_BUS_PCI: /* PCI pin */
24486+ {
24487+ /*
24488+ * PCI IRQs are mapped in order
24489+ */
24490+ i = irq = 0;
24491+ while (i < apic)
24492+ irq += nr_ioapic_registers[i++];
24493+ irq += pin;
24494+ irq = gsi_irq_sharing(irq);
24495+ break;
24496+ }
24497+ default:
24498+ {
24499+ printk(KERN_ERR "unknown bus type %d.\n",bus);
24500+ irq = 0;
24501+ break;
24502+ }
24503+ }
24504+ BUG_ON(irq >= NR_IRQS);
24505+
24506+ /*
24507+ * PCI IRQ command line redirection. Yes, limits are hardcoded.
24508+ */
24509+ if ((pin >= 16) && (pin <= 23)) {
24510+ if (pirq_entries[pin-16] != -1) {
24511+ if (!pirq_entries[pin-16]) {
24512+ apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
24513+ } else {
24514+ irq = pirq_entries[pin-16];
24515+ apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
24516+ pin-16, irq);
24517+ }
24518+ }
24519+ }
24520+ BUG_ON(irq >= NR_IRQS);
24521+ return irq;
24522+}
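+/*
+ * For illustration, assuming two IO-APICs with 24 pins each: apic 0
+ * covers raw GSIs 0-23 and apic 1 starts at 24, so (apic 1, pin 3)
+ * yields GSI 27 before gsi_irq_sharing() compacts it into the IRQ
+ * space. Pins 16-23 may additionally be overridden by the "pirq="
+ * command line handling above.
+ */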
24523+
24524+static inline int IO_APIC_irq_trigger(int irq)
24525+{
24526+ int apic, idx, pin;
24527+
24528+ for (apic = 0; apic < nr_ioapics; apic++) {
24529+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
24530+ idx = find_irq_entry(apic,pin,mp_INT);
24531+ if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
24532+ return irq_trigger(idx);
24533+ }
24534+ }
24535+ /*
24536+ * nonexistent IRQs are edge default
24537+ */
24538+ return 0;
24539+}
24540+
24541+/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
24542+u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
24543+
24544+int assign_irq_vector(int irq)
24545+{
24546+ unsigned long flags;
24547+ int vector;
24548+ struct physdev_irq irq_op;
24549+
24550+ BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
24551+
24552+ if (irq < PIRQ_BASE || irq - PIRQ_BASE > NR_PIRQS)
24553+ return -EINVAL;
24554+
24555+ spin_lock_irqsave(&vector_lock, flags);
24556+
24557+ if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
24558+ spin_unlock_irqrestore(&vector_lock, flags);
24559+ return IO_APIC_VECTOR(irq);
24560+ }
24561+
24562+ irq_op.irq = irq;
24563+ if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
24564+ spin_unlock_irqrestore(&vector_lock, flags);
24565+ return -ENOSPC;
24566+ }
24567+
24568+ vector = irq_op.vector;
24569+ vector_irq[vector] = irq;
24570+ if (irq != AUTO_ASSIGN)
24571+ IO_APIC_VECTOR(irq) = vector;
24572+
24573+ spin_unlock_irqrestore(&vector_lock, flags);
24574+
24575+ return vector;
24576+}
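+/*
+ * Note that in this Xen variant the vector is not picked from a local
+ * pool: the PHYSDEVOP_alloc_irq_vector hypercall asks the hypervisor
+ * for one, and the result is recorded in vector_irq[] and
+ * IO_APIC_VECTOR(irq) so the rest of the IO-APIC code can stay
+ * unchanged.
+ */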
24577+
24578+extern void (*interrupt[NR_IRQS])(void);
24579+#ifndef CONFIG_XEN
24580+static struct hw_interrupt_type ioapic_level_type;
24581+static struct hw_interrupt_type ioapic_edge_type;
24582+
24583+#define IOAPIC_AUTO -1
24584+#define IOAPIC_EDGE 0
24585+#define IOAPIC_LEVEL 1
24586+
24587+static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
24588+{
24589+ unsigned idx;
24590+
24591+ idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
24592+
24593+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
24594+ trigger == IOAPIC_LEVEL)
24595+ irq_desc[idx].chip = &ioapic_level_type;
24596+ else
24597+ irq_desc[idx].chip = &ioapic_edge_type;
24598+ set_intr_gate(vector, interrupt[idx]);
24599+}
24600+#else
24601+#define ioapic_register_intr(irq, vector, trigger) evtchn_register_pirq(irq)
24602+#endif /* !CONFIG_XEN */
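+/*
+ * Under Xen, physical interrupts reach the kernel as event channels
+ * rather than through IDT gates, so registering an IO-APIC interrupt
+ * reduces to marking the IRQ as a pirq to be delivered that way
+ * (evtchn_register_pirq()); the native edge/level chip selection above
+ * is not needed in that case.
+ */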
24603+
24604+static void __init setup_IO_APIC_irqs(void)
24605+{
24606+ struct IO_APIC_route_entry entry;
24607+ int apic, pin, idx, irq, first_notcon = 1, vector;
24608+ unsigned long flags;
24609+
24610+ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
24611+
24612+ for (apic = 0; apic < nr_ioapics; apic++) {
24613+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
24614+
24615+ /*
24616+ * add it to the IO-APIC irq-routing table:
24617+ */
24618+ memset(&entry,0,sizeof(entry));
24619+
24620+ entry.delivery_mode = INT_DELIVERY_MODE;
24621+ entry.dest_mode = INT_DEST_MODE;
24622+ entry.mask = 0; /* enable IRQ */
24623+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
24624+
24625+ idx = find_irq_entry(apic,pin,mp_INT);
24626+ if (idx == -1) {
24627+ if (first_notcon) {
24628+ apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
24629+ first_notcon = 0;
24630+ } else
24631+ apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
24632+ continue;
24633+ }
24634+
24635+ entry.trigger = irq_trigger(idx);
24636+ entry.polarity = irq_polarity(idx);
24637+
24638+ if (irq_trigger(idx)) {
24639+ entry.trigger = 1;
24640+ entry.mask = 1;
24641+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
24642+ }
24643+
24644+ irq = pin_2_irq(idx, apic, pin);
24645+ add_pin_to_irq(irq, apic, pin);
24646+
24647+ if (/* !apic && */ !IO_APIC_IRQ(irq))
24648+ continue;
24649+
24650+ if (IO_APIC_IRQ(irq)) {
24651+ vector = assign_irq_vector(irq);
24652+ entry.vector = vector;
24653+
24654+ ioapic_register_intr(irq, vector, IOAPIC_AUTO);
24655+ if (!apic && (irq < 16))
24656+ disable_8259A_irq(irq);
24657+ }
24658+ spin_lock_irqsave(&ioapic_lock, flags);
24659+ io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
24660+ io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
24661+ set_native_irq_info(irq, TARGET_CPUS);
24662+ spin_unlock_irqrestore(&ioapic_lock, flags);
24663+ }
24664+ }
24665+
24666+ if (!first_notcon)
24667+ apic_printk(APIC_VERBOSE," not connected.\n");
24668+}
24669+
24670+#ifndef CONFIG_XEN
24671+/*
24672+ * Set up the 8259A-master output pin as broadcast to all
24673+ * CPUs.
24674+ */
24675+static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
24676+{
24677+ struct IO_APIC_route_entry entry;
24678+ unsigned long flags;
24679+
24680+ memset(&entry,0,sizeof(entry));
24681+
24682+ disable_8259A_irq(0);
24683+
24684+ /* mask LVT0 */
24685+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
24686+
24687+ /*
24688+ * We use logical delivery to get the timer IRQ
24689+ * to the first CPU.
24690+ */
24691+ entry.dest_mode = INT_DEST_MODE;
24692+ entry.mask = 0; /* unmask IRQ now */
24693+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
24694+ entry.delivery_mode = INT_DELIVERY_MODE;
24695+ entry.polarity = 0;
24696+ entry.trigger = 0;
24697+ entry.vector = vector;
24698+
24699+ /*
24700+ * The timer IRQ doesn't have to know that behind the
24701+ * scene we have a 8259A-master in AEOI mode ...
24702+ */
24703+ irq_desc[0].chip = &ioapic_edge_type;
24704+
24705+ /*
24706+ * Add it to the IO-APIC irq-routing table:
24707+ */
24708+ spin_lock_irqsave(&ioapic_lock, flags);
24709+ io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
24710+ io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
24711+ spin_unlock_irqrestore(&ioapic_lock, flags);
24712+
24713+ enable_8259A_irq(0);
24714+}
24715+
24716+void __init UNEXPECTED_IO_APIC(void)
24717+{
24718+}
24719+
24720+void __apicdebuginit print_IO_APIC(void)
24721+{
24722+ int apic, i;
24723+ union IO_APIC_reg_00 reg_00;
24724+ union IO_APIC_reg_01 reg_01;
24725+ union IO_APIC_reg_02 reg_02;
24726+ unsigned long flags;
24727+
24728+ if (apic_verbosity == APIC_QUIET)
24729+ return;
24730+
24731+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
24732+ for (i = 0; i < nr_ioapics; i++)
24733+ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
24734+ mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
24735+
24736+ /*
24737+ * We are a bit conservative about what we expect. We have to
24738+ * know about every hardware change ASAP.
24739+ */
24740+ printk(KERN_INFO "testing the IO APIC.......................\n");
24741+
24742+ for (apic = 0; apic < nr_ioapics; apic++) {
24743+
24744+ spin_lock_irqsave(&ioapic_lock, flags);
24745+ reg_00.raw = io_apic_read(apic, 0);
24746+ reg_01.raw = io_apic_read(apic, 1);
24747+ if (reg_01.bits.version >= 0x10)
24748+ reg_02.raw = io_apic_read(apic, 2);
24749+ spin_unlock_irqrestore(&ioapic_lock, flags);
24750+
24751+ printk("\n");
24752+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
24753+ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
24754+ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
24755+ if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
24756+ UNEXPECTED_IO_APIC();
24757+
24758+ printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
24759+ printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
24760+ if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
24761+ (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
24762+ (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
24763+ (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
24764+ (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
24765+ (reg_01.bits.entries != 0x2E) &&
24766+ (reg_01.bits.entries != 0x3F) &&
24767+ (reg_01.bits.entries != 0x03)
24768+ )
24769+ UNEXPECTED_IO_APIC();
24770+
24771+ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
24772+ printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
24773+ if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
24774+ (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
24775+ (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
24776+ (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
24777+ (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
24778+ (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
24779+ )
24780+ UNEXPECTED_IO_APIC();
24781+ if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
24782+ UNEXPECTED_IO_APIC();
24783+
24784+ if (reg_01.bits.version >= 0x10) {
24785+ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
24786+ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
24787+ if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
24788+ UNEXPECTED_IO_APIC();
24789+ }
24790+
24791+ printk(KERN_DEBUG ".... IRQ redirection table:\n");
24792+
24793+ printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
24794+ " Stat Dest Deli Vect: \n");
24795+
24796+ for (i = 0; i <= reg_01.bits.entries; i++) {
24797+ struct IO_APIC_route_entry entry;
24798+
24799+ spin_lock_irqsave(&ioapic_lock, flags);
24800+ *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
24801+ *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
24802+ spin_unlock_irqrestore(&ioapic_lock, flags);
24803+
24804+ printk(KERN_DEBUG " %02x %03X %02X ",
24805+ i,
24806+ entry.dest.logical.logical_dest,
24807+ entry.dest.physical.physical_dest
24808+ );
24809+
24810+ printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
24811+ entry.mask,
24812+ entry.trigger,
24813+ entry.irr,
24814+ entry.polarity,
24815+ entry.delivery_status,
24816+ entry.dest_mode,
24817+ entry.delivery_mode,
24818+ entry.vector
24819+ );
24820+ }
24821+ }
24822+ if (use_pci_vector())
24823+ printk(KERN_INFO "Using vector-based indexing\n");
24824+ printk(KERN_DEBUG "IRQ to pin mappings:\n");
24825+ for (i = 0; i < NR_IRQS; i++) {
24826+ struct irq_pin_list *entry = irq_2_pin + i;
24827+ if (entry->pin < 0)
24828+ continue;
24829+ if (use_pci_vector() && !platform_legacy_irq(i))
24830+ printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
24831+ else
24832+ printk(KERN_DEBUG "IRQ%d ", i);
24833+ for (;;) {
24834+ printk("-> %d:%d", entry->apic, entry->pin);
24835+ if (!entry->next)
24836+ break;
24837+ entry = irq_2_pin + entry->next;
24838+ }
24839+ printk("\n");
24840+ }
24841+
24842+ printk(KERN_INFO ".................................... done.\n");
24843+
24844+ return;
24845+}
24846+
24847+static __apicdebuginit void print_APIC_bitfield (int base)
24848+{
24849+ unsigned int v;
24850+ int i, j;
24851+
24852+ if (apic_verbosity == APIC_QUIET)
24853+ return;
24854+
24855+ printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
24856+ for (i = 0; i < 8; i++) {
24857+ v = apic_read(base + i*0x10);
24858+ for (j = 0; j < 32; j++) {
24859+ if (v & (1<<j))
24860+ printk("1");
24861+ else
24862+ printk("0");
24863+ }
24864+ printk("\n");
24865+ }
24866+}
24867+
24868+void __apicdebuginit print_local_APIC(void * dummy)
24869+{
24870+ unsigned int v, ver, maxlvt;
24871+
24872+ if (apic_verbosity == APIC_QUIET)
24873+ return;
24874+
24875+ printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
24876+ smp_processor_id(), hard_smp_processor_id());
24877+ v = apic_read(APIC_ID);
24878+ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
24879+ v = apic_read(APIC_LVR);
24880+ printk(KERN_INFO "... APIC VERSION: %08x\n", v);
24881+ ver = GET_APIC_VERSION(v);
24882+ maxlvt = get_maxlvt();
24883+
24884+ v = apic_read(APIC_TASKPRI);
24885+ printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
24886+
24887+ v = apic_read(APIC_ARBPRI);
24888+ printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
24889+ v & APIC_ARBPRI_MASK);
24890+ v = apic_read(APIC_PROCPRI);
24891+ printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
24892+
24893+ v = apic_read(APIC_EOI);
24894+ printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
24895+ v = apic_read(APIC_RRR);
24896+ printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
24897+ v = apic_read(APIC_LDR);
24898+ printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
24899+ v = apic_read(APIC_DFR);
24900+ printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
24901+ v = apic_read(APIC_SPIV);
24902+ printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
24903+
24904+ printk(KERN_DEBUG "... APIC ISR field:\n");
24905+ print_APIC_bitfield(APIC_ISR);
24906+ printk(KERN_DEBUG "... APIC TMR field:\n");
24907+ print_APIC_bitfield(APIC_TMR);
24908+ printk(KERN_DEBUG "... APIC IRR field:\n");
24909+ print_APIC_bitfield(APIC_IRR);
24910+
24911+ v = apic_read(APIC_ESR);
24912+ printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
24913+
24914+ v = apic_read(APIC_ICR);
24915+ printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
24916+ v = apic_read(APIC_ICR2);
24917+ printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
24918+
24919+ v = apic_read(APIC_LVTT);
24920+ printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
24921+
24922+ if (maxlvt > 3) { /* PC is LVT#4. */
24923+ v = apic_read(APIC_LVTPC);
24924+ printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
24925+ }
24926+ v = apic_read(APIC_LVT0);
24927+ printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
24928+ v = apic_read(APIC_LVT1);
24929+ printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
24930+
24931+ if (maxlvt > 2) { /* ERR is LVT#3. */
24932+ v = apic_read(APIC_LVTERR);
24933+ printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
24934+ }
24935+
24936+ v = apic_read(APIC_TMICT);
24937+ printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
24938+ v = apic_read(APIC_TMCCT);
24939+ printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
24940+ v = apic_read(APIC_TDCR);
24941+ printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
24942+ printk("\n");
24943+}
24944+
24945+void print_all_local_APICs (void)
24946+{
24947+ on_each_cpu(print_local_APIC, NULL, 1, 1);
24948+}
24949+
24950+void __apicdebuginit print_PIC(void)
24951+{
24952+ unsigned int v;
24953+ unsigned long flags;
24954+
24955+ if (apic_verbosity == APIC_QUIET)
24956+ return;
24957+
24958+ printk(KERN_DEBUG "\nprinting PIC contents\n");
24959+
24960+ spin_lock_irqsave(&i8259A_lock, flags);
24961+
24962+ v = inb(0xa1) << 8 | inb(0x21);
24963+ printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
24964+
24965+ v = inb(0xa0) << 8 | inb(0x20);
24966+ printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
24967+
24968+ outb(0x0b,0xa0);
24969+ outb(0x0b,0x20);
24970+ v = inb(0xa0) << 8 | inb(0x20);
24971+ outb(0x0a,0xa0);
24972+ outb(0x0a,0x20);
24973+
24974+ spin_unlock_irqrestore(&i8259A_lock, flags);
24975+
24976+ printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
24977+
24978+ v = inb(0x4d1) << 8 | inb(0x4d0);
24979+ printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
24980+}
24981+#endif /* !CONFIG_XEN */
24982+
24983+static void __init enable_IO_APIC(void)
24984+{
24985+ union IO_APIC_reg_01 reg_01;
24986+#ifndef CONFIG_XEN
24987+ int i8259_apic, i8259_pin;
24988+#endif
24989+ int i, apic;
24990+ unsigned long flags;
24991+
24992+ for (i = 0; i < PIN_MAP_SIZE; i++) {
24993+ irq_2_pin[i].pin = -1;
24994+ irq_2_pin[i].next = 0;
24995+ }
24996+ if (!pirqs_enabled)
24997+ for (i = 0; i < MAX_PIRQS; i++)
24998+ pirq_entries[i] = -1;
24999+
25000+ /*
25001+ * The number of IO-APIC IRQ registers (== #pins):
25002+ */
25003+ for (apic = 0; apic < nr_ioapics; apic++) {
25004+ spin_lock_irqsave(&ioapic_lock, flags);
25005+ reg_01.raw = io_apic_read(apic, 1);
25006+ spin_unlock_irqrestore(&ioapic_lock, flags);
25007+ nr_ioapic_registers[apic] = reg_01.bits.entries+1;
25008+ }
25009+#ifndef CONFIG_XEN
25010+ for(apic = 0; apic < nr_ioapics; apic++) {
25011+ int pin;
25012+ /* See if any of the pins is in ExtINT mode */
25013+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
25014+ struct IO_APIC_route_entry entry;
25015+ spin_lock_irqsave(&ioapic_lock, flags);
25016+ *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
25017+ *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
25018+ spin_unlock_irqrestore(&ioapic_lock, flags);
25019+
25020+
25021+ /* If the interrupt line is enabled and in ExtInt mode,
25022+ * then I have found the pin where the i8259 is connected.
25023+ */
25024+ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
25025+ ioapic_i8259.apic = apic;
25026+ ioapic_i8259.pin = pin;
25027+ goto found_i8259;
25028+ }
25029+ }
25030+ }
25031+ found_i8259:
25032+ /* Look to see whether the MP table has reported the ExtINT */
25033+ i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
25034+ i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
25035+ /* Trust the MP table if nothing is set up in the hardware */
25036+ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
25037+ printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
25038+ ioapic_i8259.pin = i8259_pin;
25039+ ioapic_i8259.apic = i8259_apic;
25040+ }
25041+ /* Complain if the MP table and the hardware disagree */
25042+ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
25043+ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
25044+ {
25045+ printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
25046+ }
25047+#endif
25048+
25049+ /*
25050+ * Do not trust the IO-APIC being empty at bootup
25051+ */
25052+ clear_IO_APIC();
25053+}
25054+
25055+/*
25056+ * Not an __init, needed by the reboot code
25057+ */
25058+void disable_IO_APIC(void)
25059+{
25060+ /*
25061+ * Clear the IO-APIC before rebooting:
25062+ */
25063+ clear_IO_APIC();
25064+
25065+#ifndef CONFIG_XEN
25066+ /*
25067+ * If the i8259 is routed through an IOAPIC
25068+ * Put that IOAPIC in virtual wire mode
25069+ * so legacy interrupts can be delivered.
25070+ */
25071+ if (ioapic_i8259.pin != -1) {
25072+ struct IO_APIC_route_entry entry;
25073+ unsigned long flags;
25074+
25075+ memset(&entry, 0, sizeof(entry));
25076+ entry.mask = 0; /* Enabled */
25077+ entry.trigger = 0; /* Edge */
25078+ entry.irr = 0;
25079+ entry.polarity = 0; /* High */
25080+ entry.delivery_status = 0;
25081+ entry.dest_mode = 0; /* Physical */
25082+ entry.delivery_mode = dest_ExtINT; /* ExtInt */
25083+ entry.vector = 0;
25084+ entry.dest.physical.physical_dest =
25085+ GET_APIC_ID(apic_read(APIC_ID));
25086+
25087+ /*
25088+ * Add it to the IO-APIC irq-routing table:
25089+ */
25090+ spin_lock_irqsave(&ioapic_lock, flags);
25091+ io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
25092+ *(((int *)&entry)+1));
25093+ io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
25094+ *(((int *)&entry)+0));
25095+ spin_unlock_irqrestore(&ioapic_lock, flags);
25096+ }
25097+
25098+ disconnect_bsp_APIC(ioapic_i8259.pin != -1);
25099+#endif
25100+}
25101+
25102+/*
25103+ * function to set the IO-APIC physical IDs based on the
25104+ * values stored in the MPC table.
25105+ *
25106+ * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
25107+ */
25108+
25109+#ifndef CONFIG_XEN
25110+static void __init setup_ioapic_ids_from_mpc (void)
25111+{
25112+ union IO_APIC_reg_00 reg_00;
25113+ int apic;
25114+ int i;
25115+ unsigned char old_id;
25116+ unsigned long flags;
25117+
25118+ /*
25119+ * Set the IOAPIC ID to the value stored in the MPC table.
25120+ */
25121+ for (apic = 0; apic < nr_ioapics; apic++) {
25122+
25123+ /* Read the register 0 value */
25124+ spin_lock_irqsave(&ioapic_lock, flags);
25125+ reg_00.raw = io_apic_read(apic, 0);
25126+ spin_unlock_irqrestore(&ioapic_lock, flags);
25127+
25128+ old_id = mp_ioapics[apic].mpc_apicid;
25129+
25130+
25131+ printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
25132+
25133+
25134+ /*
25135+ * We need to adjust the IRQ routing table
25136+ * if the ID changed.
25137+ */
25138+ if (old_id != mp_ioapics[apic].mpc_apicid)
25139+ for (i = 0; i < mp_irq_entries; i++)
25140+ if (mp_irqs[i].mpc_dstapic == old_id)
25141+ mp_irqs[i].mpc_dstapic
25142+ = mp_ioapics[apic].mpc_apicid;
25143+
25144+ /*
25145+ * Read the right value from the MPC table and
25146+ * write it into the ID register.
25147+ */
25148+ apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
25149+ mp_ioapics[apic].mpc_apicid);
25150+
25151+ reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
25152+ spin_lock_irqsave(&ioapic_lock, flags);
25153+ io_apic_write(apic, 0, reg_00.raw);
25154+ spin_unlock_irqrestore(&ioapic_lock, flags);
25155+
25156+ /*
25157+ * Sanity check
25158+ */
25159+ spin_lock_irqsave(&ioapic_lock, flags);
25160+ reg_00.raw = io_apic_read(apic, 0);
25161+ spin_unlock_irqrestore(&ioapic_lock, flags);
25162+ if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
25163+ printk("could not set ID!\n");
25164+ else
25165+ apic_printk(APIC_VERBOSE," ok.\n");
25166+ }
25167+}
25168+#else
25169+static void __init setup_ioapic_ids_from_mpc(void) { }
25170+#endif
25171+
25172+/*
25173+ * There is a nasty bug in some older SMP boards: their mptable lies
25174+ * about the timer IRQ. We do the following to work around the situation:
25175+ *
25176+ * - timer IRQ defaults to IO-APIC IRQ
25177+ * - if this function detects that timer IRQs are defunct, then we fall
25178+ * back to ISA timer IRQs
25179+ */
25180+#ifndef CONFIG_XEN
25181+static int __init timer_irq_works(void)
25182+{
25183+ unsigned long t1 = jiffies;
25184+
25185+ local_irq_enable();
25186+ /* Let ten ticks pass... */
25187+ mdelay((10 * 1000) / HZ);
25188+
25189+ /*
25190+ * Expect a few ticks at least, to be sure some possible
25191+ * glue logic does not lock up after the first one or
25192+ * two ticks in a non-ExtINT mode. Also the local APIC
25193+ * might have cached one ExtINT interrupt. Finally, at
25194+ * least one tick may be lost due to delays.
25195+ */
25196+
25197+ /* jiffies wrap? */
25198+ if (jiffies - t1 > 4)
25199+ return 1;
25200+ return 0;
25201+}
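+/*
+ * Arithmetic check: mdelay((10 * 1000) / HZ) busy-waits for the
+ * wall-clock duration of ten ticks (e.g. 40ms at HZ=250, 100ms at
+ * HZ=100), and the "> 4" test then accepts the timer as working if at
+ * least half of those ticks were actually accounted in jiffies.
+ */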
25202+
25203+/*
25204+ * In the SMP+IOAPIC case it might happen that there are an unspecified
25205+ * number of pending IRQ events unhandled. These cases are very rare,
25206+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
25207+ * better to do it this way as thus we do not have to be aware of
25208+ * 'pending' interrupts in the IRQ path, except at this point.
25209+ */
25210+/*
25211+ * Edge triggered needs to resend any interrupt
25212+ * that was delayed but this is now handled in the device
25213+ * independent code.
25214+ */
25215+
25216+/*
25217+ * Starting up an edge-triggered IO-APIC interrupt is
25218+ * nasty - we need to make sure that we get the edge.
25219+ * If it is already asserted for some reason, we need to
25220+ * return 1 to indicate that it was pending.
25221+ *
25222+ * This is not complete - we should be able to fake
25223+ * an edge even if it isn't on the 8259A...
25224+ */
25225+
25226+static unsigned int startup_edge_ioapic_irq(unsigned int irq)
25227+{
25228+ int was_pending = 0;
25229+ unsigned long flags;
25230+
25231+ spin_lock_irqsave(&ioapic_lock, flags);
25232+ if (irq < 16) {
25233+ disable_8259A_irq(irq);
25234+ if (i8259A_irq_pending(irq))
25235+ was_pending = 1;
25236+ }
25237+ __unmask_IO_APIC_irq(irq);
25238+ spin_unlock_irqrestore(&ioapic_lock, flags);
25239+
25240+ return was_pending;
25241+}
25242+
25243+/*
25244+ * Once we have recorded IRQ_PENDING already, we can mask the
25245+ * interrupt for real. This prevents IRQ storms from unhandled
25246+ * devices.
25247+ */
25248+static void ack_edge_ioapic_irq(unsigned int irq)
25249+{
25250+ move_irq(irq);
25251+ if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
25252+ == (IRQ_PENDING | IRQ_DISABLED))
25253+ mask_IO_APIC_irq(irq);
25254+ ack_APIC_irq();
25255+}
25256+
25257+/*
25258+ * Level triggered interrupts can just be masked,
25259+ * and shutting down and starting up the interrupt
25260+ * is the same as enabling and disabling them -- except
25261+ * with a startup need to return a "was pending" value.
25262+ *
25263+ * Level triggered interrupts are special because we
25264+ * do not touch any IO-APIC register while handling
25265+ * them. We ack the APIC in the end-IRQ handler, not
25266+ * in the start-IRQ-handler. Protection against reentrance
25267+ * from the same interrupt is still provided, both by the
25268+ * generic IRQ layer and by the fact that an unacked local
25269+ * APIC does not accept IRQs.
25270+ */
25271+static unsigned int startup_level_ioapic_irq (unsigned int irq)
25272+{
25273+ unmask_IO_APIC_irq(irq);
25274+
25275+ return 0; /* don't check for pending */
25276+}
25277+
25278+static void end_level_ioapic_irq (unsigned int irq)
25279+{
25280+ move_irq(irq);
25281+ ack_APIC_irq();
25282+}
25283+
25284+#ifdef CONFIG_PCI_MSI
25285+static unsigned int startup_edge_ioapic_vector(unsigned int vector)
25286+{
25287+ int irq = vector_to_irq(vector);
25288+
25289+ return startup_edge_ioapic_irq(irq);
25290+}
25291+
25292+static void ack_edge_ioapic_vector(unsigned int vector)
25293+{
25294+ int irq = vector_to_irq(vector);
25295+
25296+ move_native_irq(vector);
25297+ ack_edge_ioapic_irq(irq);
25298+}
25299+
25300+static unsigned int startup_level_ioapic_vector (unsigned int vector)
25301+{
25302+ int irq = vector_to_irq(vector);
25303+
25304+ return startup_level_ioapic_irq (irq);
25305+}
25306+
25307+static void end_level_ioapic_vector (unsigned int vector)
25308+{
25309+ int irq = vector_to_irq(vector);
25310+
25311+ move_native_irq(vector);
25312+ end_level_ioapic_irq(irq);
25313+}
25314+
25315+static void mask_IO_APIC_vector (unsigned int vector)
25316+{
25317+ int irq = vector_to_irq(vector);
25318+
25319+ mask_IO_APIC_irq(irq);
25320+}
25321+
25322+static void unmask_IO_APIC_vector (unsigned int vector)
25323+{
25324+ int irq = vector_to_irq(vector);
25325+
25326+ unmask_IO_APIC_irq(irq);
25327+}
25328+
25329+#ifdef CONFIG_SMP
25330+static void set_ioapic_affinity_vector (unsigned int vector,
25331+ cpumask_t cpu_mask)
25332+{
25333+ int irq = vector_to_irq(vector);
25334+
25335+ set_native_irq_info(vector, cpu_mask);
25336+ set_ioapic_affinity_irq(irq, cpu_mask);
25337+}
25338+#endif // CONFIG_SMP
25339+#endif // CONFIG_PCI_MSI
25340+
25341+static int ioapic_retrigger(unsigned int irq)
25342+{
25343+ send_IPI_self(IO_APIC_VECTOR(irq));
25344+
25345+ return 1;
25346+}
25347+
25348+/*
25349+ * Level and edge triggered IO-APIC interrupts need different handling,
25350+ * so we use two separate IRQ descriptors. Edge triggered IRQs can be
25351+ * handled with the level-triggered descriptor, but that one has slightly
25352+ * more overhead. Level-triggered interrupts cannot be handled with the
25353+ * edge-triggered handler, without risking IRQ storms and other ugly
25354+ * races.
25355+ */
25356+
25357+static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
25358+ .typename = "IO-APIC-edge",
25359+ .startup = startup_edge_ioapic,
25360+ .shutdown = shutdown_edge_ioapic,
25361+ .enable = enable_edge_ioapic,
25362+ .disable = disable_edge_ioapic,
25363+ .ack = ack_edge_ioapic,
25364+ .end = end_edge_ioapic,
25365+#ifdef CONFIG_SMP
25366+ .set_affinity = set_ioapic_affinity,
25367+#endif
25368+ .retrigger = ioapic_retrigger,
25369+};
25370+
25371+static struct hw_interrupt_type ioapic_level_type __read_mostly = {
25372+ .typename = "IO-APIC-level",
25373+ .startup = startup_level_ioapic,
25374+ .shutdown = shutdown_level_ioapic,
25375+ .enable = enable_level_ioapic,
25376+ .disable = disable_level_ioapic,
25377+ .ack = mask_and_ack_level_ioapic,
25378+ .end = end_level_ioapic,
25379+#ifdef CONFIG_SMP
25380+ .set_affinity = set_ioapic_affinity,
25381+#endif
25382+ .retrigger = ioapic_retrigger,
25383+};
25384+#endif /* !CONFIG_XEN */
25385+
25386+static inline void init_IO_APIC_traps(void)
25387+{
25388+ int irq;
25389+
25390+ /*
25391+ * NOTE! The local APIC isn't very good at handling
25392+ * multiple interrupts at the same interrupt level.
25393+ * As the interrupt level is determined by taking the
25394+ * vector number and shifting that right by 4, we
25395+ * want to spread these out a bit so that they don't
25396+ * all fall in the same interrupt level.
25397+ *
25398+ * Also, we've got to be careful not to trash gate
25399+ * 0x80, because int 0x80 is hm, kind of importantish. ;)
25400+ */
25401+ for (irq = 0; irq < NR_IRQS ; irq++) {
25402+ int tmp = irq;
25403+ if (use_pci_vector()) {
25404+ if (!platform_legacy_irq(tmp))
25405+ if ((tmp = vector_to_irq(tmp)) == -1)
25406+ continue;
25407+ }
25408+ if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
25409+ /*
25410+ * Hmm.. We don't have an entry for this,
25411+ * so default to an old-fashioned 8259
25412+ * interrupt if we can..
25413+ */
25414+ if (irq < 16)
25415+ make_8259A_irq(irq);
25416+#ifndef CONFIG_XEN
25417+ else
25418+ /* Strange. Oh, well.. */
25419+ irq_desc[irq].chip = &no_irq_type;
25420+#endif
25421+ }
25422+ }
25423+}
25424+
25425+#ifndef CONFIG_XEN
25426+static void enable_lapic_irq (unsigned int irq)
25427+{
25428+ unsigned long v;
25429+
25430+ v = apic_read(APIC_LVT0);
25431+ apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
25432+}
25433+
25434+static void disable_lapic_irq (unsigned int irq)
25435+{
25436+ unsigned long v;
25437+
25438+ v = apic_read(APIC_LVT0);
25439+ apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
25440+}
25441+
25442+static void ack_lapic_irq (unsigned int irq)
25443+{
25444+ ack_APIC_irq();
25445+}
25446+
25447+static void end_lapic_irq (unsigned int i) { /* nothing */ }
25448+
25449+static struct hw_interrupt_type lapic_irq_type __read_mostly = {
25450+ .typename = "local-APIC-edge",
25451+ .startup = NULL, /* startup_irq() not used for IRQ0 */
25452+ .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
25453+ .enable = enable_lapic_irq,
25454+ .disable = disable_lapic_irq,
25455+ .ack = ack_lapic_irq,
25456+ .end = end_lapic_irq,
25457+};
25458+
25459+static void setup_nmi (void)
25460+{
25461+ /*
25462+ * Dirty trick to enable the NMI watchdog ...
25463+ * We put the 8259A master into AEOI mode and
25464+ * unmask on all local APICs LVT0 as NMI.
25465+ *
25466+ * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
25467+ * is from Maciej W. Rozycki - so we do not have to EOI from
25468+ * the NMI handler or the timer interrupt.
25469+ */
25470+ printk(KERN_INFO "activating NMI Watchdog ...");
25471+
25472+ enable_NMI_through_LVT0(NULL);
25473+
25474+ printk(" done.\n");
25475+}
25476+
25477+/*
25478+ * This looks a bit hackish but it's about the only way of sending
25479+ * a few INTA cycles to 8259As and any associated glue logic. ICR does
25480+ * not support the ExtINT mode, unfortunately. We need to send these
25481+ * cycles as some i82489DX-based boards have glue logic that keeps the
25482+ * 8259A interrupt line asserted until INTA. --macro
25483+ */
25484+static inline void unlock_ExtINT_logic(void)
25485+{
25486+ int apic, pin, i;
25487+ struct IO_APIC_route_entry entry0, entry1;
25488+ unsigned char save_control, save_freq_select;
25489+ unsigned long flags;
25490+
25491+ pin = find_isa_irq_pin(8, mp_INT);
25492+ apic = find_isa_irq_apic(8, mp_INT);
25493+ if (pin == -1)
25494+ return;
25495+
25496+ spin_lock_irqsave(&ioapic_lock, flags);
25497+ *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
25498+ *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
25499+ spin_unlock_irqrestore(&ioapic_lock, flags);
25500+ clear_IO_APIC_pin(apic, pin);
25501+
25502+ memset(&entry1, 0, sizeof(entry1));
25503+
25504+ entry1.dest_mode = 0; /* physical delivery */
25505+ entry1.mask = 0; /* unmask IRQ now */
25506+ entry1.dest.physical.physical_dest = hard_smp_processor_id();
25507+ entry1.delivery_mode = dest_ExtINT;
25508+ entry1.polarity = entry0.polarity;
25509+ entry1.trigger = 0;
25510+ entry1.vector = 0;
25511+
25512+ spin_lock_irqsave(&ioapic_lock, flags);
25513+ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
25514+ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
25515+ spin_unlock_irqrestore(&ioapic_lock, flags);
25516+
25517+ save_control = CMOS_READ(RTC_CONTROL);
25518+ save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
25519+ CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
25520+ RTC_FREQ_SELECT);
25521+ CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
25522+
25523+ i = 100;
25524+ while (i-- > 0) {
25525+ mdelay(10);
25526+ if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
25527+ i -= 10;
25528+ }
25529+
25530+ CMOS_WRITE(save_control, RTC_CONTROL);
25531+ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
25532+ clear_IO_APIC_pin(apic, pin);
25533+
25534+ spin_lock_irqsave(&ioapic_lock, flags);
25535+ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
25536+ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
25537+ spin_unlock_irqrestore(&ioapic_lock, flags);
25538+}
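+/*
+ * The CMOS writes above use the standard RTC divider encoding: a rate
+ * select of 0x6 should give a 1024Hz periodic interrupt
+ * (32768 >> (rate - 1)), and RTC_PIE enables it, so the polling loop
+ * sees RTC_PF set roughly every millisecond while the replacement
+ * ExtINT entry is in place.
+ */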
25539+
25540+int timer_uses_ioapic_pin_0;
25541+
25542+/*
25543+ * This code may look a bit paranoid, but it's supposed to cooperate with
25544+ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
25545+ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
25546+ * fanatically on his truly buggy board.
25547+ *
25548+ * FIXME: really need to revamp this for modern platforms only.
25549+ */
25550+static inline void check_timer(void)
25551+{
25552+ int apic1, pin1, apic2, pin2;
25553+ int vector;
25554+
25555+ /*
25556+ * get/set the timer IRQ vector:
25557+ */
25558+ disable_8259A_irq(0);
25559+ vector = assign_irq_vector(0);
25560+ set_intr_gate(vector, interrupt[0]);
25561+
25562+ /*
25563+ * Subtle, code in do_timer_interrupt() expects an AEOI
25564+ * mode for the 8259A whenever interrupts are routed
25565+ * through I/O APICs. Also IRQ0 has to be enabled in
25566+ * the 8259A which implies the virtual wire has to be
25567+ * disabled in the local APIC.
25568+ */
25569+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
25570+ init_8259A(1);
25571+ if (timer_over_8254 > 0)
25572+ enable_8259A_irq(0);
25573+
25574+ pin1 = find_isa_irq_pin(0, mp_INT);
25575+ apic1 = find_isa_irq_apic(0, mp_INT);
25576+ pin2 = ioapic_i8259.pin;
25577+ apic2 = ioapic_i8259.apic;
25578+
25579+ if (pin1 == 0)
25580+ timer_uses_ioapic_pin_0 = 1;
25581+
25582+ apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
25583+ vector, apic1, pin1, apic2, pin2);
25584+
25585+ if (pin1 != -1) {
25586+ /*
25587+ * Ok, does IRQ0 through the IOAPIC work?
25588+ */
25589+ unmask_IO_APIC_irq(0);
25590+ if (!no_timer_check && timer_irq_works()) {
25591+ nmi_watchdog_default();
25592+ if (nmi_watchdog == NMI_IO_APIC) {
25593+ disable_8259A_irq(0);
25594+ setup_nmi();
25595+ enable_8259A_irq(0);
25596+ }
25597+ if (disable_timer_pin_1 > 0)
25598+ clear_IO_APIC_pin(0, pin1);
25599+ return;
25600+ }
25601+ clear_IO_APIC_pin(apic1, pin1);
25602+ apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
25603+ "connected to IO-APIC\n");
25604+ }
25605+
25606+ apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
25607+ "through the 8259A ... ");
25608+ if (pin2 != -1) {
25609+ apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
25610+ apic2, pin2);
25611+ /*
25612+ * legacy devices should be connected to IO APIC #0
25613+ */
25614+ setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
25615+ if (timer_irq_works()) {
25616+ apic_printk(APIC_VERBOSE," works.\n");
25617+ nmi_watchdog_default();
25618+ if (nmi_watchdog == NMI_IO_APIC) {
25619+ setup_nmi();
25620+ }
25621+ return;
25622+ }
25623+ /*
25624+ * Cleanup, just in case ...
25625+ */
25626+ clear_IO_APIC_pin(apic2, pin2);
25627+ }
25628+ apic_printk(APIC_VERBOSE," failed.\n");
25629+
25630+ if (nmi_watchdog == NMI_IO_APIC) {
25631+ printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
25632+ nmi_watchdog = 0;
25633+ }
25634+
25635+ apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
25636+
25637+ disable_8259A_irq(0);
25638+ irq_desc[0].chip = &lapic_irq_type;
25639+ apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
25640+ enable_8259A_irq(0);
25641+
25642+ if (timer_irq_works()) {
25643+ apic_printk(APIC_VERBOSE," works.\n");
25644+ return;
25645+ }
25646+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
25647+ apic_printk(APIC_VERBOSE," failed.\n");
25648+
25649+ apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
25650+
25651+ init_8259A(0);
25652+ make_8259A_irq(0);
25653+ apic_write(APIC_LVT0, APIC_DM_EXTINT);
25654+
25655+ unlock_ExtINT_logic();
25656+
25657+ if (timer_irq_works()) {
25658+ apic_printk(APIC_VERBOSE," works.\n");
25659+ return;
25660+ }
25661+ apic_printk(APIC_VERBOSE," failed :(.\n");
25662+ panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
25663+}
25664+#else
25665+#define check_timer() ((void)0)
25666+int timer_uses_ioapic_pin_0 = 0;
25667+#endif /* !CONFIG_XEN */
25668+
25669+static int __init notimercheck(char *s)
25670+{
25671+ no_timer_check = 1;
25672+ return 1;
25673+}
25674+__setup("no_timer_check", notimercheck);
25675+
25676+/*
25677+ *
25678+ * IRQs that are handled by the PIC in the MPS IOAPIC case.
25679+ * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
25680+ * Linux doesn't really care, as it's not actually used
25681+ * for any interrupt handling anyway.
25682+ */
25683+#define PIC_IRQS (1<<2)
25684+
25685+void __init setup_IO_APIC(void)
25686+{
25687+ enable_IO_APIC();
25688+
25689+ if (acpi_ioapic)
25690+ io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
25691+ else
25692+ io_apic_irqs = ~PIC_IRQS;
25693+
25694+ apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
25695+
25696+ /*
25697+ * Set up the IO-APIC IRQ routing table.
25698+ */
25699+ if (!acpi_ioapic)
25700+ setup_ioapic_ids_from_mpc();
25701+#ifndef CONFIG_XEN
25702+ sync_Arb_IDs();
25703+#endif /* !CONFIG_XEN */
25704+ setup_IO_APIC_irqs();
25705+ init_IO_APIC_traps();
25706+ check_timer();
25707+ if (!acpi_ioapic)
25708+ print_IO_APIC();
25709+}
25710+
25711+struct sysfs_ioapic_data {
25712+ struct sys_device dev;
25713+ struct IO_APIC_route_entry entry[0];
25714+};
25715+static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
25716+
25717+static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
25718+{
25719+ struct IO_APIC_route_entry *entry;
25720+ struct sysfs_ioapic_data *data;
25721+ unsigned long flags;
25722+ int i;
25723+
25724+ data = container_of(dev, struct sysfs_ioapic_data, dev);
25725+ entry = data->entry;
25726+ spin_lock_irqsave(&ioapic_lock, flags);
25727+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
25728+ *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
25729+ *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
25730+ }
25731+ spin_unlock_irqrestore(&ioapic_lock, flags);
25732+
25733+ return 0;
25734+}
25735+
25736+static int ioapic_resume(struct sys_device *dev)
25737+{
25738+ struct IO_APIC_route_entry *entry;
25739+ struct sysfs_ioapic_data *data;
25740+ unsigned long flags;
25741+ union IO_APIC_reg_00 reg_00;
25742+ int i;
25743+
25744+ data = container_of(dev, struct sysfs_ioapic_data, dev);
25745+ entry = data->entry;
25746+
25747+ spin_lock_irqsave(&ioapic_lock, flags);
25748+ reg_00.raw = io_apic_read(dev->id, 0);
25749+ if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
25750+ reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
25751+ io_apic_write(dev->id, 0, reg_00.raw);
25752+ }
25753+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
25754+ io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
25755+ io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
25756+ }
25757+ spin_unlock_irqrestore(&ioapic_lock, flags);
25758+
25759+ return 0;
25760+}
25761+
25762+static struct sysdev_class ioapic_sysdev_class = {
25763+ set_kset_name("ioapic"),
25764+#ifndef CONFIG_XEN
25765+ .suspend = ioapic_suspend,
25766+ .resume = ioapic_resume,
25767+#endif
25768+};
25769+
25770+static int __init ioapic_init_sysfs(void)
25771+{
25772+ struct sys_device * dev;
25773+ int i, size, error = 0;
25774+
25775+ error = sysdev_class_register(&ioapic_sysdev_class);
25776+ if (error)
25777+ return error;
25778+
25779+ for (i = 0; i < nr_ioapics; i++ ) {
25780+ size = sizeof(struct sys_device) + nr_ioapic_registers[i]
25781+ * sizeof(struct IO_APIC_route_entry);
25782+ mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
25783+ if (!mp_ioapic_data[i]) {
25784+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
25785+ continue;
25786+ }
25787+ memset(mp_ioapic_data[i], 0, size);
25788+ dev = &mp_ioapic_data[i]->dev;
25789+ dev->id = i;
25790+ dev->cls = &ioapic_sysdev_class;
25791+ error = sysdev_register(dev);
25792+ if (error) {
25793+ kfree(mp_ioapic_data[i]);
25794+ mp_ioapic_data[i] = NULL;
25795+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
25796+ continue;
25797+ }
25798+ }
25799+
25800+ return 0;
25801+}
25802+
25803+device_initcall(ioapic_init_sysfs);
25804+
25805+/* --------------------------------------------------------------------------
25806+ ACPI-based IOAPIC Configuration
25807+ -------------------------------------------------------------------------- */
25808+
25809+#ifdef CONFIG_ACPI
25810+
25811+#define IO_APIC_MAX_ID 0xFE
25812+
25813+int __init io_apic_get_version (int ioapic)
25814+{
25815+ union IO_APIC_reg_01 reg_01;
25816+ unsigned long flags;
25817+
25818+ spin_lock_irqsave(&ioapic_lock, flags);
25819+ reg_01.raw = io_apic_read(ioapic, 1);
25820+ spin_unlock_irqrestore(&ioapic_lock, flags);
25821+
25822+ return reg_01.bits.version;
25823+}
25824+
25825+
25826+int __init io_apic_get_redir_entries (int ioapic)
25827+{
25828+ union IO_APIC_reg_01 reg_01;
25829+ unsigned long flags;
25830+
25831+ spin_lock_irqsave(&ioapic_lock, flags);
25832+ reg_01.raw = io_apic_read(ioapic, 1);
25833+ spin_unlock_irqrestore(&ioapic_lock, flags);
25834+
25835+ return reg_01.bits.entries;
25836+}
25837+
25838+
25839+int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
25840+{
25841+ struct IO_APIC_route_entry entry;
25842+ unsigned long flags;
25843+
25844+ if (!IO_APIC_IRQ(irq)) {
25845+ apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
25846+ ioapic);
25847+ return -EINVAL;
25848+ }
25849+
25850+ /*
25851+ * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
25852+ * Note that we mask (disable) IRQs now -- these get enabled when the
25853+ * corresponding device driver registers for this IRQ.
25854+ */
25855+
25856+ memset(&entry,0,sizeof(entry));
25857+
25858+ entry.delivery_mode = INT_DELIVERY_MODE;
25859+ entry.dest_mode = INT_DEST_MODE;
25860+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
25861+ entry.trigger = edge_level;
25862+ entry.polarity = active_high_low;
25863+ entry.mask = 1; /* Disabled (masked) */
25864+
25865+ irq = gsi_irq_sharing(irq);
25866+ /*
25867+ * IRQs < 16 are already in the irq_2_pin[] map
25868+ */
25869+ if (irq >= 16)
25870+ add_pin_to_irq(irq, ioapic, pin);
25871+
25872+ entry.vector = assign_irq_vector(irq);
25873+
25874+ apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
25875+ "IRQ %d Mode:%i Active:%i)\n", ioapic,
25876+ mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
25877+ edge_level, active_high_low);
25878+
25879+ ioapic_register_intr(irq, entry.vector, edge_level);
25880+
25881+ if (!ioapic && (irq < 16))
25882+ disable_8259A_irq(irq);
25883+
25884+ spin_lock_irqsave(&ioapic_lock, flags);
25885+ io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
25886+ io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
25887+ set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
25888+ spin_unlock_irqrestore(&ioapic_lock, flags);
25889+
25890+ return 0;
25891+}
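
io_apic_set_pci_routing() above pushes the 64-bit routing entry into the IOAPIC as two 32-bit indirect registers, 0x10+2*pin and 0x11+2*pin, writing the high half first and the low half, which carries the mask bit, last. A small sketch of that split, assuming a little-endian layout and with fake_io_apic_write() standing in for the real register accessor:

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t fake_regs[0x40];     /* assumption: stand-in indirect register window */

    static void fake_io_apic_write(unsigned reg, uint32_t val)
    {
        fake_regs[reg] = val;
    }

    /* A redirection entry is 64 bits wide, accessed as two 32-bit indirect registers. */
    union route_entry {
        uint64_t raw;
        uint32_t word[2];   /* on little-endian x86: word[0] = low half (mask bit 16),
                               word[1] = high half (destination) */
    };

    int main(void)
    {
        union route_entry e = { .raw = 0 };
        int pin = 3;

        e.raw |= 0x31;                /* vector */
        e.raw |= 1ull << 16;          /* mask: start disabled, like the original */

        fake_io_apic_write(0x11 + 2 * pin, e.word[1]);   /* high word first */
        fake_io_apic_write(0x10 + 2 * pin, e.word[0]);   /* low word, with the mask bit, last */
        printf("pin %d programmed at regs 0x%02x/0x%02x\n",
               pin, 0x10 + 2 * pin, 0x11 + 2 * pin);
        return 0;
    }
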
25892+
25893+#endif /* CONFIG_ACPI */
25894+
25895+
25896+#ifndef CONFIG_XEN
25897+/*
25898+ * This function currently is only a helper for the i386 smp boot process where
25899+ * we need to reprogram the ioredtbls to cater for the cpus which have come online
25900+ * so mask in all cases should simply be TARGET_CPUS
25901+ */
25902+#ifdef CONFIG_SMP
25903+void __init setup_ioapic_dest(void)
25904+{
25905+ int pin, ioapic, irq, irq_entry;
25906+
25907+ if (skip_ioapic_setup == 1)
25908+ return;
25909+
25910+ for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
25911+ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
25912+ irq_entry = find_irq_entry(ioapic, pin, mp_INT);
25913+ if (irq_entry == -1)
25914+ continue;
25915+ irq = pin_2_irq(irq_entry, ioapic, pin);
25916+ set_ioapic_affinity_irq(irq, TARGET_CPUS);
25917+ }
25918+
25919+ }
25920+}
25921+#endif
25922+#endif /* !CONFIG_XEN */
25923Index: head-2008-11-25/arch/x86/kernel/ioport_64-xen.c
25924===================================================================
25925--- /dev/null 1970-01-01 00:00:00.000000000 +0000
25926+++ head-2008-11-25/arch/x86/kernel/ioport_64-xen.c 2008-01-28 12:24:19.000000000 +0100
25927@@ -0,0 +1,100 @@
25928+/*
25929+ * linux/arch/x86_64/kernel/ioport.c
25930+ *
25931+ * This contains the io-permission bitmap code - written by obz, with changes
25932+ * by Linus.
25933+ */
25934+
25935+#include <linux/sched.h>
25936+#include <linux/kernel.h>
25937+#include <linux/capability.h>
25938+#include <linux/errno.h>
25939+#include <linux/types.h>
25940+#include <linux/ioport.h>
25941+#include <linux/mm.h>
25942+#include <linux/smp.h>
25943+#include <linux/smp_lock.h>
25944+#include <linux/stddef.h>
25945+#include <linux/slab.h>
25946+#include <linux/thread_info.h>
25947+#include <xen/interface/physdev.h>
25948+
25949+/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
25950+static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
25951+{
25952+ int i;
25953+
25954+ if (new_value)
25955+ for (i = base; i < base + extent; i++)
25956+ __set_bit(i, bitmap);
25957+ else
25958+ for (i = base; i < base + extent; i++)
25959+ clear_bit(i, bitmap);
25960+}
25961+
25962+/*
25963+ * this changes the io permissions bitmap in the current task.
25964+ */
25965+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
25966+{
25967+ struct thread_struct * t = &current->thread;
25968+ unsigned long *bitmap;
25969+ struct physdev_set_iobitmap set_iobitmap;
25970+
25971+ if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
25972+ return -EINVAL;
25973+ if (turn_on && !capable(CAP_SYS_RAWIO))
25974+ return -EPERM;
25975+
25976+ /*
25977+ * If it's the first ioperm() call in this thread's lifetime, set the
25978+ * IO bitmap up. ioperm() is much less timing critical than clone(),
25979+ * this is why we delay this operation until now:
25980+ */
25981+ if (!t->io_bitmap_ptr) {
25982+ bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
25983+ if (!bitmap)
25984+ return -ENOMEM;
25985+
25986+ memset(bitmap, 0xff, IO_BITMAP_BYTES);
25987+ t->io_bitmap_ptr = bitmap;
25988+
25989+ set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
25990+ set_iobitmap.nr_ports = IO_BITMAP_BITS;
25991+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
25992+ &set_iobitmap));
25993+ }
25994+
25995+ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
25996+
25997+ return 0;
25998+}
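
In sys_ioperm() above the freshly allocated bitmap is memset to 0xff (every port denied) and set_bitmap() is then called with the inverted turn_on flag, so granting access means clearing bits. A standalone sketch of that convention on a plain unsigned long array; the sizes and helper names here are illustrative, not the kernel's:

    #include <stdio.h>
    #include <string.h>
    #include <limits.h>

    #define IO_BITS  1024                                     /* illustrative, not IO_BITMAP_BITS */
    #define BPW      (CHAR_BIT * (int)sizeof(unsigned long))  /* bits per word */
    #define WORDS    (IO_BITS / BPW)

    static unsigned long bitmap[WORDS];

    static void set_range(unsigned long *bm, unsigned base, unsigned extent, int value)
    {
        for (unsigned i = base; i < base + extent; i++) {
            if (value)
                bm[i / BPW] |= 1UL << (i % BPW);
            else
                bm[i / BPW] &= ~(1UL << (i % BPW));
        }
    }

    static int port_denied(unsigned port)
    {
        return (bitmap[port / BPW] >> (port % BPW)) & 1;
    }

    int main(void)
    {
        memset(bitmap, 0xff, sizeof(bitmap));   /* all ports denied by default */
        set_range(bitmap, 0x70, 2, !1);         /* "ioperm(0x70, 2, 1)": note the inversion */
        printf("port 0x70 denied? %d, port 0x80 denied? %d\n",
               port_denied(0x70), port_denied(0x80));
        return 0;
    }
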
25999+
26000+/*
26001+ * sys_iopl has to be used when you want to access the IO ports
26002+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
26003+ * you'd need 8kB of bitmaps/process, which is a bit excessive.
26004+ *
26005+ */
26006+
26007+asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs)
26008+{
26009+ unsigned int old_iopl = current->thread.iopl;
26010+ struct physdev_set_iopl set_iopl;
26011+
26012+ if (new_iopl > 3)
26013+ return -EINVAL;
26014+
26015+ /* Need "raw I/O" privileges for direct port access. */
26016+ if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO))
26017+ return -EPERM;
26018+
26019+ /* Change our version of the privilege levels. */
26020+ current->thread.iopl = new_iopl;
26021+
26022+ /* Force the change at ring 0. */
26023+ set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl;
26024+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
26025+
26026+ return 0;
26027+}
26028Index: head-2008-11-25/arch/x86/kernel/irq_64-xen.c
26029===================================================================
26030--- /dev/null 1970-01-01 00:00:00.000000000 +0000
26031+++ head-2008-11-25/arch/x86/kernel/irq_64-xen.c 2008-10-29 09:55:56.000000000 +0100
26032@@ -0,0 +1,197 @@
26033+/*
26034+ * linux/arch/x86_64/kernel/irq.c
26035+ *
26036+ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
26037+ *
26038+ * This file contains the lowest level x86_64-specific interrupt
26039+ * entry and irq statistics code. All the remaining irq logic is
26040+ * done by the generic kernel/irq/ code and in the
26041+ * x86_64-specific irq controller code. (e.g. i8259.c and
26042+ * io_apic.c.)
26043+ */
26044+
26045+#include <linux/kernel_stat.h>
26046+#include <linux/interrupt.h>
26047+#include <linux/seq_file.h>
26048+#include <linux/module.h>
26049+#include <linux/delay.h>
26050+#include <asm/uaccess.h>
26051+#include <asm/io_apic.h>
26052+#include <asm/idle.h>
26053+
26054+atomic_t irq_err_count;
26055+#ifdef CONFIG_X86_IO_APIC
26056+#ifdef APIC_MISMATCH_DEBUG
26057+atomic_t irq_mis_count;
26058+#endif
26059+#endif
26060+
26061+#ifdef CONFIG_DEBUG_STACKOVERFLOW
26062+/*
26063+ * Probabilistic stack overflow check:
26064+ *
26065+ * Only check the stack in process context, because everything else
26066+ * runs on the big interrupt stacks. Checking reliably is too expensive,
26067+ * so we just check from interrupts.
26068+ */
26069+static inline void stack_overflow_check(struct pt_regs *regs)
26070+{
26071+ u64 curbase = (u64) current->thread_info;
26072+ static unsigned long warned = -60*HZ;
26073+
26074+ if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
26075+ regs->rsp < curbase + sizeof(struct thread_info) + 128 &&
26076+ time_after(jiffies, warned + 60*HZ)) {
26077+ printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
26078+ current->comm, curbase, regs->rsp);
26079+ show_stack(NULL,NULL);
26080+ warned = jiffies;
26081+ }
26082+}
26083+#endif
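
stack_overflow_check() above flags a near-overflow when the stack pointer has come within sizeof(struct thread_info) + 128 bytes of the base of the current thread's stack, and a static timestamp rate-limits the report to once per 60 seconds. A userspace sketch of the same proximity-plus-rate-limit shape; the guard size and the use of time() are stand-ins for the original's constants and jiffies:

    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>

    #define THREAD_SIZE  8192          /* illustrative stack size */
    #define GUARD_BYTES  (64 + 128)    /* stand-in for sizeof(struct thread_info) + 128 */

    static time_t last_warned;         /* zero, so the first hit warns immediately */

    static void check_stack(uint64_t stack_base, uint64_t sp)
    {
        time_t now = time(NULL);

        if (sp >= stack_base && sp <= stack_base + THREAD_SIZE &&
            sp < stack_base + GUARD_BYTES &&
            now - last_warned >= 60) {
            printf("near stack overflow: base=%#llx sp=%#llx\n",
                   (unsigned long long)stack_base, (unsigned long long)sp);
            last_warned = now;
        }
    }

    int main(void)
    {
        uint64_t base = 0x100000;

        check_stack(base, base + 100);    /* inside the guard zone: warns */
        check_stack(base, base + 100);    /* again within 60s: suppressed */
        check_stack(base, base + 4096);   /* plenty of headroom: silent */
        return 0;
    }
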
26084+
26085+/*
26086+ * Generic, controller-independent functions:
26087+ */
26088+
26089+int show_interrupts(struct seq_file *p, void *v)
26090+{
26091+ int i = *(loff_t *) v, j;
26092+ struct irqaction * action;
26093+ unsigned long flags;
26094+
26095+ if (i == 0) {
26096+ seq_printf(p, " ");
26097+ for_each_online_cpu(j)
26098+ seq_printf(p, "CPU%-8d",j);
26099+ seq_putc(p, '\n');
26100+ }
26101+
26102+ if (i < NR_IRQS) {
26103+ spin_lock_irqsave(&irq_desc[i].lock, flags);
26104+ action = irq_desc[i].action;
26105+ if (!action)
26106+ goto skip;
26107+ seq_printf(p, "%3d: ",i);
26108+#ifndef CONFIG_SMP
26109+ seq_printf(p, "%10u ", kstat_irqs(i));
26110+#else
26111+ for_each_online_cpu(j)
26112+ seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
26113+#endif
26114+ seq_printf(p, " %14s", irq_desc[i].chip->typename);
26115+
26116+ seq_printf(p, " %s", action->name);
26117+ for (action=action->next; action; action = action->next)
26118+ seq_printf(p, ", %s", action->name);
26119+ seq_putc(p, '\n');
26120+skip:
26121+ spin_unlock_irqrestore(&irq_desc[i].lock, flags);
26122+ } else if (i == NR_IRQS) {
26123+ seq_printf(p, "NMI: ");
26124+ for_each_online_cpu(j)
26125+ seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
26126+ seq_putc(p, '\n');
26127+#ifdef CONFIG_X86_LOCAL_APIC
26128+ seq_printf(p, "LOC: ");
26129+ for_each_online_cpu(j)
26130+ seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
26131+ seq_putc(p, '\n');
26132+#endif
26133+ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
26134+#ifdef CONFIG_X86_IO_APIC
26135+#ifdef APIC_MISMATCH_DEBUG
26136+ seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
26137+#endif
26138+#endif
26139+ }
26140+ return 0;
26141+}
26142+
26143+/*
26144+ * do_IRQ handles all normal device IRQ's (the special
26145+ * SMP cross-CPU interrupts have their own specific
26146+ * handlers).
26147+ */
26148+asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
26149+{
26150+ /* high bit used in ret_from_ code */
26151+ unsigned irq = ~regs->orig_rax;
26152+
26153+ if (unlikely(irq >= NR_IRQS)) {
26154+ printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
26155+ __FUNCTION__, irq);
26156+ BUG();
26157+ }
26158+
26159+ /*exit_idle();*/
26160+ /*irq_enter();*/
26161+#ifdef CONFIG_DEBUG_STACKOVERFLOW
26162+ stack_overflow_check(regs);
26163+#endif
26164+ __do_IRQ(irq, regs);
26165+ /*irq_exit();*/
26166+
26167+ return 1;
26168+}
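
do_IRQ() above recovers the IRQ number as ~regs->orig_rax; per the comment, the entry path stores the value bit-inverted so its high bit marks the frame as an interrupt rather than a syscall, and a second bitwise NOT undoes it. A tiny round-trip sketch (only the field name comes from the original):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        unsigned irq = 9;

        /* What the entry path stores: the value bit-inverted, which has the
         * high bit set for any small IRQ number. */
        uint64_t orig_rax = ~(uint64_t)irq;

        /* What do_IRQ() does to get the number back. */
        unsigned recovered = ~orig_rax;

        printf("stored %#llx, high bit %s, recovered IRQ %u\n",
               (unsigned long long)orig_rax,
               (orig_rax >> 63) ? "set" : "clear", recovered);
        return 0;
    }
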
26169+
26170+#ifdef CONFIG_HOTPLUG_CPU
26171+void fixup_irqs(cpumask_t map)
26172+{
26173+ unsigned int irq;
26174+ static int warned;
26175+
26176+ for (irq = 0; irq < NR_IRQS; irq++) {
26177+ cpumask_t mask;
26178+ if (irq == 2)
26179+ continue;
26180+
26181+ cpus_and(mask, irq_desc[irq].affinity, map);
26182+ if (any_online_cpu(mask) == NR_CPUS) {
26183+ /*printk("Breaking affinity for irq %i\n", irq);*/
26184+ mask = map;
26185+ }
26186+ if (irq_desc[irq].chip->set_affinity)
26187+ irq_desc[irq].chip->set_affinity(irq, mask);
26188+ else if (irq_desc[irq].action && !(warned++))
26189+ printk("Cannot set affinity for irq %i\n", irq);
26190+ }
26191+
26192+ /* That doesn't seem sufficient. Give it 1ms. */
26193+ local_irq_enable();
26194+ mdelay(1);
26195+ local_irq_disable();
26196+}
26197+#endif
26198+
26199+extern void call_softirq(void);
26200+
26201+asmlinkage void do_softirq(void)
26202+{
26203+ __u32 pending;
26204+ unsigned long flags;
26205+
26206+ if (in_interrupt())
26207+ return;
26208+
26209+ local_irq_save(flags);
26210+ pending = local_softirq_pending();
26211+ /* Switch to interrupt stack */
26212+ if (pending) {
26213+ call_softirq();
26214+ WARN_ON_ONCE(softirq_count());
26215+ }
26216+ local_irq_restore(flags);
26217+}
26218+EXPORT_SYMBOL(do_softirq);
26219+
26220+#ifndef CONFIG_X86_LOCAL_APIC
26221+/*
26222+ * 'what should we do if we get a hw irq event on an illegal vector'.
26223+ * each architecture has to answer this themselves.
26224+ */
26225+void ack_bad_irq(unsigned int irq)
26226+{
26227+ printk("unexpected IRQ trap at vector %02x\n", irq);
26228+}
26229+#endif
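
fixup_irqs() above, used on CPU hot-unplug, intersects each IRQ's affinity mask with the map of CPUs that remain and falls back to the whole remaining map when the intersection is empty. The same decision on plain bitmasks, with an unsigned long standing in for cpumask_t:

    #include <stdio.h>

    /* One bit per CPU; a stand-in for cpumask_t in this sketch. */
    static unsigned long fixup_affinity(unsigned long affinity, unsigned long online)
    {
        unsigned long mask = affinity & online;   /* cpus_and() in the original */

        if (mask == 0)            /* no online CPU left in the mask */
            mask = online;        /* break affinity: fall back to all remaining CPUs */
        return mask;
    }

    int main(void)
    {
        unsigned long online = 0x7;   /* CPUs 0-2 still online */

        printf("irq bound to cpu3 -> %#lx\n", fixup_affinity(0x8, online));  /* 0x7 */
        printf("irq bound to cpu1 -> %#lx\n", fixup_affinity(0x2, online));  /* 0x2 */
        return 0;
    }
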
26230Index: head-2008-11-25/arch/x86/kernel/ldt_64-xen.c
26231===================================================================
26232--- /dev/null 1970-01-01 00:00:00.000000000 +0000
26233+++ head-2008-11-25/arch/x86/kernel/ldt_64-xen.c 2007-06-12 13:13:01.000000000 +0200
26234@@ -0,0 +1,282 @@
26235+/*
26236+ * linux/arch/x86_64/kernel/ldt.c
26237+ *
26238+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
26239+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
26240+ * Copyright (C) 2002 Andi Kleen
26241+ *
26242+ * This handles calls from both 32bit and 64bit mode.
26243+ */
26244+
26245+#include <linux/errno.h>
26246+#include <linux/sched.h>
26247+#include <linux/string.h>
26248+#include <linux/mm.h>
26249+#include <linux/smp.h>
26250+#include <linux/smp_lock.h>
26251+#include <linux/vmalloc.h>
26252+#include <linux/slab.h>
26253+
26254+#include <asm/uaccess.h>
26255+#include <asm/system.h>
26256+#include <asm/ldt.h>
26257+#include <asm/desc.h>
26258+#include <asm/proto.h>
26259+#include <asm/pgalloc.h>
26260+
26261+#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
26261+#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
26262+static void flush_ldt(void *null)
26263+{
26264+ if (current->active_mm)
26265+ load_LDT(&current->active_mm->context);
26266+}
26267+#endif
26268+
26269+static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
26270+{
26271+ void *oldldt;
26272+ void *newldt;
26273+ unsigned oldsize;
26274+
26275+ if (mincount <= (unsigned)pc->size)
26276+ return 0;
26277+ oldsize = pc->size;
26278+ mincount = (mincount+511)&(~511);
26279+ if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
26280+ newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
26281+ else
26282+ newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
26283+
26284+ if (!newldt)
26285+ return -ENOMEM;
26286+
26287+ if (oldsize)
26288+ memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
26289+ oldldt = pc->ldt;
26290+ memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
26291+ wmb();
26292+ pc->ldt = newldt;
26293+ wmb();
26294+ pc->size = mincount;
26295+ wmb();
26296+ if (reload) {
26297+#ifdef CONFIG_SMP
26298+ cpumask_t mask;
26299+
26300+ preempt_disable();
26301+#endif
26302+ make_pages_readonly(
26303+ pc->ldt,
26304+ (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
26305+ XENFEAT_writable_descriptor_tables);
26306+ load_LDT(pc);
26307+#ifdef CONFIG_SMP
26308+ mask = cpumask_of_cpu(smp_processor_id());
26309+ if (!cpus_equal(current->mm->cpu_vm_mask, mask))
26310+ smp_call_function(flush_ldt, NULL, 1, 1);
26311+ preempt_enable();
26312+#endif
26313+ }
26314+ if (oldsize) {
26315+ make_pages_writable(
26316+ oldldt,
26317+ (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
26318+ XENFEAT_writable_descriptor_tables);
26319+ if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
26320+ vfree(oldldt);
26321+ else
26322+ kfree(oldldt);
26323+ }
26324+ return 0;
26325+}
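
alloc_ldt() above rounds the requested entry count up to a multiple of 512, then uses kmalloc() while the table still fits in one page and vmalloc() beyond that; under Xen the new table must additionally be made read-only before it can be loaded. A sketch of just the sizing decision, assuming LDT_ENTRY_SIZE is 8 bytes and PAGE_SIZE is 4096:

    #include <stdio.h>

    #define LDT_ENTRY_SIZE 8
    #define PAGE_SIZE      4096

    static const char *pick_allocator(unsigned mincount, unsigned *rounded)
    {
        /* Round up to the next multiple of 512 entries, as alloc_ldt() does. */
        *rounded = (mincount + 511) & ~511u;

        return (*rounded * LDT_ENTRY_SIZE > PAGE_SIZE) ? "vmalloc" : "kmalloc";
    }

    int main(void)
    {
        unsigned requests[] = { 1, 512, 513 };
        unsigned rounded;

        for (int i = 0; i < 3; i++) {
            const char *how = pick_allocator(requests[i], &rounded);
            printf("request %u -> %u entries (%u bytes) via %s\n",
                   requests[i], rounded, rounded * LDT_ENTRY_SIZE, how);
        }
        return 0;
    }
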
26326+
26327+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
26328+{
26329+ int err = alloc_ldt(new, old->size, 0);
26330+ if (err < 0)
26331+ return err;
26332+ memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
26333+ make_pages_readonly(
26334+ new->ldt,
26335+ (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
26336+ XENFEAT_writable_descriptor_tables);
26337+ return 0;
26338+}
26339+
26340+/*
26341+ * we do not have to muck with descriptors here, that is
26342+ * done in switch_mm() as needed.
26343+ */
26344+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
26345+{
26346+ struct mm_struct * old_mm;
26347+ int retval = 0;
26348+
26349+ memset(&mm->context, 0, sizeof(mm->context));
26350+ init_MUTEX(&mm->context.sem);
26351+ old_mm = current->mm;
26352+ if (old_mm && old_mm->context.size > 0) {
26353+ down(&old_mm->context.sem);
26354+ retval = copy_ldt(&mm->context, &old_mm->context);
26355+ up(&old_mm->context.sem);
26356+ }
26357+ if (retval == 0) {
26358+ spin_lock(&mm_unpinned_lock);
26359+ list_add(&mm->context.unpinned, &mm_unpinned);
26360+ spin_unlock(&mm_unpinned_lock);
26361+ }
26362+ return retval;
26363+}
26364+
26365+/*
26366+ *
26367+ * Don't touch the LDT register - we're already in the next thread.
26368+ */
26369+void destroy_context(struct mm_struct *mm)
26370+{
26371+ if (mm->context.size) {
26372+ if (mm == current->active_mm)
26373+ clear_LDT();
26374+ make_pages_writable(
26375+ mm->context.ldt,
26376+ (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
26377+ XENFEAT_writable_descriptor_tables);
26378+ if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
26379+ vfree(mm->context.ldt);
26380+ else
26381+ kfree(mm->context.ldt);
26382+ mm->context.size = 0;
26383+ }
26384+ if (!mm->context.pinned) {
26385+ spin_lock(&mm_unpinned_lock);
26386+ list_del(&mm->context.unpinned);
26387+ spin_unlock(&mm_unpinned_lock);
26388+ }
26389+}
26390+
26391+static int read_ldt(void __user * ptr, unsigned long bytecount)
26392+{
26393+ int err;
26394+ unsigned long size;
26395+ struct mm_struct * mm = current->mm;
26396+
26397+ if (!mm->context.size)
26398+ return 0;
26399+ if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
26400+ bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
26401+
26402+ down(&mm->context.sem);
26403+ size = mm->context.size*LDT_ENTRY_SIZE;
26404+ if (size > bytecount)
26405+ size = bytecount;
26406+
26407+ err = 0;
26408+ if (copy_to_user(ptr, mm->context.ldt, size))
26409+ err = -EFAULT;
26410+ up(&mm->context.sem);
26411+ if (err < 0)
26412+ goto error_return;
26413+ if (size != bytecount) {
26414+ /* zero-fill the rest */
26415+ if (clear_user(ptr+size, bytecount-size) != 0) {
26416+ err = -EFAULT;
26417+ goto error_return;
26418+ }
26419+ }
26420+ return bytecount;
26421+error_return:
26422+ return err;
26423+}
26424+
26425+static int read_default_ldt(void __user * ptr, unsigned long bytecount)
26426+{
26427+ /* Arbitrary number */
26428+ /* x86-64 default LDT is all zeros */
26429+ if (bytecount > 128)
26430+ bytecount = 128;
26431+ if (clear_user(ptr, bytecount))
26432+ return -EFAULT;
26433+ return bytecount;
26434+}
26435+
26436+static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
26437+{
26438+ struct task_struct *me = current;
26439+ struct mm_struct * mm = me->mm;
26440+ __u32 entry_1, entry_2, *lp;
26441+ unsigned long mach_lp;
26442+ int error;
26443+ struct user_desc ldt_info;
26444+
26445+ error = -EINVAL;
26446+
26447+ if (bytecount != sizeof(ldt_info))
26448+ goto out;
26449+ error = -EFAULT;
26450+ if (copy_from_user(&ldt_info, ptr, bytecount))
26451+ goto out;
26452+
26453+ error = -EINVAL;
26454+ if (ldt_info.entry_number >= LDT_ENTRIES)
26455+ goto out;
26456+ if (ldt_info.contents == 3) {
26457+ if (oldmode)
26458+ goto out;
26459+ if (ldt_info.seg_not_present == 0)
26460+ goto out;
26461+ }
26462+
26463+ down(&mm->context.sem);
26464+ if (ldt_info.entry_number >= (unsigned)mm->context.size) {
26465+ error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
26466+ if (error < 0)
26467+ goto out_unlock;
26468+ }
26469+
26470+ lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
26471+ mach_lp = arbitrary_virt_to_machine(lp);
26472+
26473+ /* Allow LDTs to be cleared by the user. */
26474+ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
26475+ if (oldmode || LDT_empty(&ldt_info)) {
26476+ entry_1 = 0;
26477+ entry_2 = 0;
26478+ goto install;
26479+ }
26480+ }
26481+
26482+ entry_1 = LDT_entry_a(&ldt_info);
26483+ entry_2 = LDT_entry_b(&ldt_info);
26484+ if (oldmode)
26485+ entry_2 &= ~(1 << 20);
26486+
26487+ /* Install the new entry ... */
26488+install:
26489+ error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
26490+
26491+out_unlock:
26492+ up(&mm->context.sem);
26493+out:
26494+ return error;
26495+}
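
write_ldt() above builds the descriptor as two 32-bit words via LDT_entry_a()/LDT_entry_b() and, rather than writing the read-only table directly, hands the combined 64-bit value to HYPERVISOR_update_descriptor(). The packing itself is a shift-and-or; the word values below are made up for illustration:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* Hypothetical low/high descriptor words; the real values come from
         * LDT_entry_a()/LDT_entry_b() on the user_desc the caller passed in. */
        uint32_t entry_1 = 0x0000ffff;   /* limit / base low */
        uint32_t entry_2 = 0x00cff200;   /* base high, type, DPL, flags */

        uint64_t desc = (uint64_t)entry_1 | ((uint64_t)entry_2 << 32);

        printf("64-bit descriptor passed to the hypervisor: %#018llx\n",
               (unsigned long long)desc);
        return 0;
    }
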
26496+
26497+asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
26498+{
26499+ int ret = -ENOSYS;
26500+
26501+ switch (func) {
26502+ case 0:
26503+ ret = read_ldt(ptr, bytecount);
26504+ break;
26505+ case 1:
26506+ ret = write_ldt(ptr, bytecount, 1);
26507+ break;
26508+ case 2:
26509+ ret = read_default_ldt(ptr, bytecount);
26510+ break;
26511+ case 0x11:
26512+ ret = write_ldt(ptr, bytecount, 0);
26513+ break;
26514+ }
26515+ return ret;
26516+}
26517Index: head-2008-11-25/arch/x86/kernel/mpparse_64-xen.c
26518===================================================================
26519--- /dev/null 1970-01-01 00:00:00.000000000 +0000
26520+++ head-2008-11-25/arch/x86/kernel/mpparse_64-xen.c 2007-06-12 13:13:01.000000000 +0200
26521@@ -0,0 +1,1011 @@
26522+/*
26523+ * Intel Multiprocessor Specification 1.1 and 1.4
26524+ * compliant MP-table parsing routines.
26525+ *
26526+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
26527+ * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
26528+ *
26529+ * Fixes
26530+ * Erich Boleyn : MP v1.4 and additional changes.
26531+ * Alan Cox : Added EBDA scanning
26532+ * Ingo Molnar : various cleanups and rewrites
26533+ * Maciej W. Rozycki: Bits for default MP configurations
26534+ * Paul Diefenbaugh: Added full ACPI support
26535+ */
26536+
26537+#include <linux/mm.h>
26538+#include <linux/init.h>
26539+#include <linux/delay.h>
26540+#include <linux/bootmem.h>
26541+#include <linux/smp_lock.h>
26542+#include <linux/kernel_stat.h>
26543+#include <linux/mc146818rtc.h>
26544+#include <linux/acpi.h>
26545+#include <linux/module.h>
26546+
26547+#include <asm/smp.h>
26548+#include <asm/mtrr.h>
26549+#include <asm/mpspec.h>
26550+#include <asm/pgalloc.h>
26551+#include <asm/io_apic.h>
26552+#include <asm/proto.h>
26553+#include <asm/acpi.h>
26554+
26555+/* Have we found an MP table */
26556+int smp_found_config;
26557+unsigned int __initdata maxcpus = NR_CPUS;
26558+
26559+int acpi_found_madt;
26560+
26561+/*
26562+ * Various Linux-internal data structures created from the
26563+ * MP-table.
26564+ */
26565+unsigned char apic_version [MAX_APICS];
26566+unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
26567+int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
26568+
26569+static int mp_current_pci_id = 0;
26570+/* I/O APIC entries */
26571+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
26572+
26573+/* # of MP IRQ source entries */
26574+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
26575+
26576+/* MP IRQ source entries */
26577+int mp_irq_entries;
26578+
26579+int nr_ioapics;
26580+int pic_mode;
26581+unsigned long mp_lapic_addr = 0;
26582+
26583+
26584+
26585+/* Processor that is doing the boot up */
26586+unsigned int boot_cpu_id = -1U;
26587+/* Internal processor count */
26588+unsigned int num_processors __initdata = 0;
26589+
26590+unsigned disabled_cpus __initdata;
26591+
26592+/* Bitmask of physically existing CPUs */
26593+physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
26594+
26595+/* ACPI MADT entry parsing functions */
26596+#ifdef CONFIG_ACPI
26597+extern struct acpi_boot_flags acpi_boot;
26598+#ifdef CONFIG_X86_LOCAL_APIC
26599+extern int acpi_parse_lapic (acpi_table_entry_header *header);
26600+extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
26601+extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
26602+#endif /*CONFIG_X86_LOCAL_APIC*/
26603+#ifdef CONFIG_X86_IO_APIC
26604+extern int acpi_parse_ioapic (acpi_table_entry_header *header);
26605+#endif /*CONFIG_X86_IO_APIC*/
26606+#endif /*CONFIG_ACPI*/
26607+
26608+u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
26609+
26610+
26611+/*
26612+ * Intel MP BIOS table parsing routines:
26613+ */
26614+
26615+/*
26616+ * Checksum an MP configuration block.
26617+ */
26618+
26619+static int __init mpf_checksum(unsigned char *mp, int len)
26620+{
26621+ int sum = 0;
26622+
26623+ while (len--)
26624+ sum += *mp++;
26625+
26626+ return sum & 0xFF;
26627+}
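
mpf_checksum() above implements the usual BIOS-table convention: all bytes of the structure, including the stored checksum byte, must sum to zero modulo 256. A standalone check against a fabricated 16-byte block:

    #include <stdio.h>

    /* Byte-sum modulo 256; a table is valid when this returns 0. */
    static int mpf_checksum(const unsigned char *mp, int len)
    {
        int sum = 0;

        while (len--)
            sum += *mp++;
        return sum & 0xFF;
    }

    int main(void)
    {
        unsigned char block[16] = { '_', 'M', 'P', '_', 0, 0, 1, 4 };
        int partial = mpf_checksum(block, 16);

        /* Store the byte that makes the whole block sum to zero. */
        block[10] = (unsigned char)(0x100 - partial);

        printf("checksum before fixup: %#x, after: %#x\n",
               partial, mpf_checksum(block, 16));
        return 0;
    }

Both smp_read_mpc() and smp_scan_config() require this to return zero before trusting anything else in the table.
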
26628+
26629+#ifndef CONFIG_XEN
26630+static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
26631+{
26632+ int cpu;
26633+ unsigned char ver;
26634+ cpumask_t tmp_map;
26635+
26636+ if (!(m->mpc_cpuflag & CPU_ENABLED)) {
26637+ disabled_cpus++;
26638+ return;
26639+ }
26640+
26641+ printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
26642+ m->mpc_apicid,
26643+ (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
26644+ (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
26645+ m->mpc_apicver);
26646+
26647+ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
26648+ Dprintk(" Bootup CPU\n");
26649+ boot_cpu_id = m->mpc_apicid;
26650+ }
26651+ if (num_processors >= NR_CPUS) {
26652+ printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
26653+ " Processor ignored.\n", NR_CPUS);
26654+ return;
26655+ }
26656+
26657+ num_processors++;
26658+ cpus_complement(tmp_map, cpu_present_map);
26659+ cpu = first_cpu(tmp_map);
26660+
26661+#if MAX_APICS < 255
26662+ if ((int)m->mpc_apicid > MAX_APICS) {
26663+ printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
26664+ m->mpc_apicid, MAX_APICS);
26665+ return;
26666+ }
26667+#endif
26668+ ver = m->mpc_apicver;
26669+
26670+ physid_set(m->mpc_apicid, phys_cpu_present_map);
26671+ /*
26672+ * Validate version
26673+ */
26674+ if (ver == 0x0) {
26675+ printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
26676+ ver = 0x10;
26677+ }
26678+ apic_version[m->mpc_apicid] = ver;
26679+ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
26680+ /*
26681+ * bios_cpu_apicid is required to have processors listed
26682+ * in same order as logical cpu numbers. Hence the first
26683+ * entry is BSP, and so on.
26684+ */
26685+ cpu = 0;
26686+ }
26687+ bios_cpu_apicid[cpu] = m->mpc_apicid;
26688+ x86_cpu_to_apicid[cpu] = m->mpc_apicid;
26689+
26690+ cpu_set(cpu, cpu_possible_map);
26691+ cpu_set(cpu, cpu_present_map);
26692+}
26693+#else
26694+static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
26695+{
26696+ num_processors++;
26697+}
26698+#endif /* CONFIG_XEN */
26699+
26700+static void __init MP_bus_info (struct mpc_config_bus *m)
26701+{
26702+ char str[7];
26703+
26704+ memcpy(str, m->mpc_bustype, 6);
26705+ str[6] = 0;
26706+ Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
26707+
26708+ if (strncmp(str, "ISA", 3) == 0) {
26709+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
26710+ } else if (strncmp(str, "EISA", 4) == 0) {
26711+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
26712+ } else if (strncmp(str, "PCI", 3) == 0) {
26713+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
26714+ mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
26715+ mp_current_pci_id++;
26716+ } else if (strncmp(str, "MCA", 3) == 0) {
26717+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
26718+ } else {
26719+ printk(KERN_ERR "Unknown bustype %s\n", str);
26720+ }
26721+}
26722+
26723+static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
26724+{
26725+ if (!(m->mpc_flags & MPC_APIC_USABLE))
26726+ return;
26727+
26728+ printk("I/O APIC #%d Version %d at 0x%X.\n",
26729+ m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
26730+ if (nr_ioapics >= MAX_IO_APICS) {
26731+ printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
26732+ MAX_IO_APICS, nr_ioapics);
26733+ panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
26734+ }
26735+ if (!m->mpc_apicaddr) {
26736+ printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
26737+ " found in MP table, skipping!\n");
26738+ return;
26739+ }
26740+ mp_ioapics[nr_ioapics] = *m;
26741+ nr_ioapics++;
26742+}
26743+
26744+static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
26745+{
26746+ mp_irqs [mp_irq_entries] = *m;
26747+ Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
26748+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
26749+ m->mpc_irqtype, m->mpc_irqflag & 3,
26750+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
26751+ m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
26752+ if (++mp_irq_entries >= MAX_IRQ_SOURCES)
26753+ panic("Max # of irq sources exceeded!!\n");
26754+}
26755+
26756+static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
26757+{
26758+ Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
26759+ " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
26760+ m->mpc_irqtype, m->mpc_irqflag & 3,
26761+ (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
26762+ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
26763+ /*
26764+ * Well it seems all SMP boards in existence
26765+ * use ExtINT/LVT1 == LINT0 and
26766+ * NMI/LVT2 == LINT1 - the following check
26767+ * will show us if this assumption is false.
26768+ * Until then we do not have to add baggage.
26769+ */
26770+ if ((m->mpc_irqtype == mp_ExtINT) &&
26771+ (m->mpc_destapiclint != 0))
26772+ BUG();
26773+ if ((m->mpc_irqtype == mp_NMI) &&
26774+ (m->mpc_destapiclint != 1))
26775+ BUG();
26776+}
26777+
26778+/*
26779+ * Read/parse the MPC
26780+ */
26781+
26782+static int __init smp_read_mpc(struct mp_config_table *mpc)
26783+{
26784+ char str[16];
26785+ int count=sizeof(*mpc);
26786+ unsigned char *mpt=((unsigned char *)mpc)+count;
26787+
26788+ if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
26789+ printk("SMP mptable: bad signature [%c%c%c%c]!\n",
26790+ mpc->mpc_signature[0],
26791+ mpc->mpc_signature[1],
26792+ mpc->mpc_signature[2],
26793+ mpc->mpc_signature[3]);
26794+ return 0;
26795+ }
26796+ if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
26797+ printk("SMP mptable: checksum error!\n");
26798+ return 0;
26799+ }
26800+ if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
26801+ printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
26802+ mpc->mpc_spec);
26803+ return 0;
26804+ }
26805+ if (!mpc->mpc_lapic) {
26806+ printk(KERN_ERR "SMP mptable: null local APIC address!\n");
26807+ return 0;
26808+ }
26809+ memcpy(str,mpc->mpc_oem,8);
26810+ str[8]=0;
26811+ printk(KERN_INFO "OEM ID: %s ",str);
26812+
26813+ memcpy(str,mpc->mpc_productid,12);
26814+ str[12]=0;
26815+ printk("Product ID: %s ",str);
26816+
26817+ printk("APIC at: 0x%X\n",mpc->mpc_lapic);
26818+
26819+ /* save the local APIC address, it might be non-default */
26820+ if (!acpi_lapic)
26821+ mp_lapic_addr = mpc->mpc_lapic;
26822+
26823+ /*
26824+ * Now process the configuration blocks.
26825+ */
26826+ while (count < mpc->mpc_length) {
26827+ switch(*mpt) {
26828+ case MP_PROCESSOR:
26829+ {
26830+ struct mpc_config_processor *m=
26831+ (struct mpc_config_processor *)mpt;
26832+ if (!acpi_lapic)
26833+ MP_processor_info(m);
26834+ mpt += sizeof(*m);
26835+ count += sizeof(*m);
26836+ break;
26837+ }
26838+ case MP_BUS:
26839+ {
26840+ struct mpc_config_bus *m=
26841+ (struct mpc_config_bus *)mpt;
26842+ MP_bus_info(m);
26843+ mpt += sizeof(*m);
26844+ count += sizeof(*m);
26845+ break;
26846+ }
26847+ case MP_IOAPIC:
26848+ {
26849+ struct mpc_config_ioapic *m=
26850+ (struct mpc_config_ioapic *)mpt;
26851+ MP_ioapic_info(m);
26852+ mpt+=sizeof(*m);
26853+ count+=sizeof(*m);
26854+ break;
26855+ }
26856+ case MP_INTSRC:
26857+ {
26858+ struct mpc_config_intsrc *m=
26859+ (struct mpc_config_intsrc *)mpt;
26860+
26861+ MP_intsrc_info(m);
26862+ mpt+=sizeof(*m);
26863+ count+=sizeof(*m);
26864+ break;
26865+ }
26866+ case MP_LINTSRC:
26867+ {
26868+ struct mpc_config_lintsrc *m=
26869+ (struct mpc_config_lintsrc *)mpt;
26870+ MP_lintsrc_info(m);
26871+ mpt+=sizeof(*m);
26872+ count+=sizeof(*m);
26873+ break;
26874+ }
26875+ }
26876+ }
26877+ clustered_apic_check();
26878+ if (!num_processors)
26879+ printk(KERN_ERR "SMP mptable: no processors registered!\n");
26880+ return num_processors;
26881+}
26882+
26883+static int __init ELCR_trigger(unsigned int irq)
26884+{
26885+ unsigned int port;
26886+
26887+ port = 0x4d0 + (irq >> 3);
26888+ return (inb(port) >> (irq & 7)) & 1;
26889+}
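
ELCR_trigger() above consults the chipset's edge/level control registers: two 8-bit ports at 0x4d0 and 0x4d1 holding one bit per ISA IRQ, with a set bit meaning level-triggered. The port/bit arithmetic on its own, without the inb() that needs I/O privilege:

    #include <stdio.h>

    /* Which ELCR port and bit describe a given ISA IRQ (0-15). */
    static void elcr_location(unsigned irq, unsigned *port, unsigned *bit)
    {
        *port = 0x4d0 + (irq >> 3);   /* IRQs 0-7 -> 0x4d0, IRQs 8-15 -> 0x4d1 */
        *bit  = irq & 7;
    }

    int main(void)
    {
        for (unsigned irq = 0; irq < 16; irq += 5) {
            unsigned port, bit;

            elcr_location(irq, &port, &bit);
            printf("IRQ %2u -> port %#x bit %u\n", irq, port, bit);
        }
        return 0;
    }
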
26890+
26891+static void __init construct_default_ioirq_mptable(int mpc_default_type)
26892+{
26893+ struct mpc_config_intsrc intsrc;
26894+ int i;
26895+ int ELCR_fallback = 0;
26896+
26897+ intsrc.mpc_type = MP_INTSRC;
26898+ intsrc.mpc_irqflag = 0; /* conforming */
26899+ intsrc.mpc_srcbus = 0;
26900+ intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
26901+
26902+ intsrc.mpc_irqtype = mp_INT;
26903+
26904+ /*
26905+ * If true, we have an ISA/PCI system with no IRQ entries
26906+ * in the MP table. To prevent the PCI interrupts from being set up
26907+ * incorrectly, we try to use the ELCR. The sanity check to see if
26908+ * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
26909+ * never be level sensitive, so we simply see if the ELCR agrees.
26910+ * If it does, we assume it's valid.
26911+ */
26912+ if (mpc_default_type == 5) {
26913+ printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
26914+
26915+ if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
26916+ printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
26917+ else {
26918+ printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
26919+ ELCR_fallback = 1;
26920+ }
26921+ }
26922+
26923+ for (i = 0; i < 16; i++) {
26924+ switch (mpc_default_type) {
26925+ case 2:
26926+ if (i == 0 || i == 13)
26927+ continue; /* IRQ0 & IRQ13 not connected */
26928+ /* fall through */
26929+ default:
26930+ if (i == 2)
26931+ continue; /* IRQ2 is never connected */
26932+ }
26933+
26934+ if (ELCR_fallback) {
26935+ /*
26936+ * If the ELCR indicates a level-sensitive interrupt, we
26937+ * copy that information over to the MP table in the
26938+ * irqflag field (level sensitive, active high polarity).
26939+ */
26940+ if (ELCR_trigger(i))
26941+ intsrc.mpc_irqflag = 13;
26942+ else
26943+ intsrc.mpc_irqflag = 0;
26944+ }
26945+
26946+ intsrc.mpc_srcbusirq = i;
26947+ intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
26948+ MP_intsrc_info(&intsrc);
26949+ }
26950+
26951+ intsrc.mpc_irqtype = mp_ExtINT;
26952+ intsrc.mpc_srcbusirq = 0;
26953+ intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
26954+ MP_intsrc_info(&intsrc);
26955+}
26956+
26957+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
26958+{
26959+ struct mpc_config_processor processor;
26960+ struct mpc_config_bus bus;
26961+ struct mpc_config_ioapic ioapic;
26962+ struct mpc_config_lintsrc lintsrc;
26963+ int linttypes[2] = { mp_ExtINT, mp_NMI };
26964+ int i;
26965+
26966+ /*
26967+ * local APIC has default address
26968+ */
26969+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
26970+
26971+ /*
26972+ * 2 CPUs, numbered 0 & 1.
26973+ */
26974+ processor.mpc_type = MP_PROCESSOR;
26975+ /* Either an integrated APIC or a discrete 82489DX. */
26976+ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
26977+ processor.mpc_cpuflag = CPU_ENABLED;
26978+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
26979+ (boot_cpu_data.x86_model << 4) |
26980+ boot_cpu_data.x86_mask;
26981+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
26982+ processor.mpc_reserved[0] = 0;
26983+ processor.mpc_reserved[1] = 0;
26984+ for (i = 0; i < 2; i++) {
26985+ processor.mpc_apicid = i;
26986+ MP_processor_info(&processor);
26987+ }
26988+
26989+ bus.mpc_type = MP_BUS;
26990+ bus.mpc_busid = 0;
26991+ switch (mpc_default_type) {
26992+ default:
26993+ printk(KERN_ERR "???\nUnknown standard configuration %d\n",
26994+ mpc_default_type);
26995+ /* fall through */
26996+ case 1:
26997+ case 5:
26998+ memcpy(bus.mpc_bustype, "ISA ", 6);
26999+ break;
27000+ case 2:
27001+ case 6:
27002+ case 3:
27003+ memcpy(bus.mpc_bustype, "EISA ", 6);
27004+ break;
27005+ case 4:
27006+ case 7:
27007+ memcpy(bus.mpc_bustype, "MCA ", 6);
27008+ }
27009+ MP_bus_info(&bus);
27010+ if (mpc_default_type > 4) {
27011+ bus.mpc_busid = 1;
27012+ memcpy(bus.mpc_bustype, "PCI ", 6);
27013+ MP_bus_info(&bus);
27014+ }
27015+
27016+ ioapic.mpc_type = MP_IOAPIC;
27017+ ioapic.mpc_apicid = 2;
27018+ ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
27019+ ioapic.mpc_flags = MPC_APIC_USABLE;
27020+ ioapic.mpc_apicaddr = 0xFEC00000;
27021+ MP_ioapic_info(&ioapic);
27022+
27023+ /*
27024+ * We set up most of the low 16 IO-APIC pins according to MPS rules.
27025+ */
27026+ construct_default_ioirq_mptable(mpc_default_type);
27027+
27028+ lintsrc.mpc_type = MP_LINTSRC;
27029+ lintsrc.mpc_irqflag = 0; /* conforming */
27030+ lintsrc.mpc_srcbusid = 0;
27031+ lintsrc.mpc_srcbusirq = 0;
27032+ lintsrc.mpc_destapic = MP_APIC_ALL;
27033+ for (i = 0; i < 2; i++) {
27034+ lintsrc.mpc_irqtype = linttypes[i];
27035+ lintsrc.mpc_destapiclint = i;
27036+ MP_lintsrc_info(&lintsrc);
27037+ }
27038+}
27039+
27040+static struct intel_mp_floating *mpf_found;
27041+
27042+/*
27043+ * Scan the memory blocks for an SMP configuration block.
27044+ */
27045+void __init get_smp_config (void)
27046+{
27047+ struct intel_mp_floating *mpf = mpf_found;
27048+
27049+ /*
27050+ * ACPI supports both logical (e.g. Hyper-Threading) and physical
27051+ * processors, where MPS only supports physical.
27052+ */
27053+ if (acpi_lapic && acpi_ioapic) {
27054+ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
27055+ return;
27056+ }
27057+ else if (acpi_lapic)
27058+ printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
27059+
27060+ printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
27061+ if (mpf->mpf_feature2 & (1<<7)) {
27062+ printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
27063+ pic_mode = 1;
27064+ } else {
27065+ printk(KERN_INFO " Virtual Wire compatibility mode.\n");
27066+ pic_mode = 0;
27067+ }
27068+
27069+ /*
27070+ * Now see if we need to read further.
27071+ */
27072+ if (mpf->mpf_feature1 != 0) {
27073+
27074+ printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
27075+ construct_default_ISA_mptable(mpf->mpf_feature1);
27076+
27077+ } else if (mpf->mpf_physptr) {
27078+
27079+ /*
27080+ * Read the physical hardware table. Anything here will
27081+ * override the defaults.
27082+ */
27083+ if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
27084+ smp_found_config = 0;
27085+ printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
27086+ printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
27087+ return;
27088+ }
27089+ /*
27090+ * If there are no explicit MP IRQ entries, then we are
27091+ * broken. We set up most of the low 16 IO-APIC pins to
27092+ * ISA defaults and hope it will work.
27093+ */
27094+ if (!mp_irq_entries) {
27095+ struct mpc_config_bus bus;
27096+
27097+ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
27098+
27099+ bus.mpc_type = MP_BUS;
27100+ bus.mpc_busid = 0;
27101+ memcpy(bus.mpc_bustype, "ISA ", 6);
27102+ MP_bus_info(&bus);
27103+
27104+ construct_default_ioirq_mptable(0);
27105+ }
27106+
27107+ } else
27108+ BUG();
27109+
27110+ printk(KERN_INFO "Processors: %d\n", num_processors);
27111+ /*
27112+ * Only use the first configuration found.
27113+ */
27114+}
27115+
27116+static int __init smp_scan_config (unsigned long base, unsigned long length)
27117+{
27118+ extern void __bad_mpf_size(void);
27119+ unsigned int *bp = isa_bus_to_virt(base);
27120+ struct intel_mp_floating *mpf;
27121+
27122+ Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
27123+ if (sizeof(*mpf) != 16)
27124+ __bad_mpf_size();
27125+
27126+ while (length > 0) {
27127+ mpf = (struct intel_mp_floating *)bp;
27128+ if ((*bp == SMP_MAGIC_IDENT) &&
27129+ (mpf->mpf_length == 1) &&
27130+ !mpf_checksum((unsigned char *)bp, 16) &&
27131+ ((mpf->mpf_specification == 1)
27132+ || (mpf->mpf_specification == 4)) ) {
27133+
27134+ smp_found_config = 1;
27135+ mpf_found = mpf;
27136+ return 1;
27137+ }
27138+ bp += 4;
27139+ length -= 16;
27140+ }
27141+ return 0;
27142+}
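
smp_scan_config() above slides a 16-byte window through low memory looking for the MP floating pointer structure: the "_MP_" signature (compared as one 32-bit word via SMP_MAGIC_IDENT), a length field of 1 (in 16-byte units), a zero byte-sum, and spec revision 1 or 4. A userspace sketch of the same scan over an in-memory buffer; the struct below is a cut-down stand-in for intel_mp_floating:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    struct mpf_stub {                 /* cut-down stand-in, exactly 16 bytes */
        char     signature[4];        /* "_MP_" */
        uint32_t physptr;             /* physical address of the MP config table */
        uint8_t  length;              /* in 16-byte units, must be 1 */
        uint8_t  specification;       /* 1 or 4 */
        uint8_t  checksum;
        uint8_t  feature[5];
    };

    static int byte_sum(const unsigned char *p, int len)
    {
        int sum = 0;

        while (len--)
            sum += *p++;
        return sum & 0xFF;
    }

    static long scan(const unsigned char *base, size_t length)
    {
        for (size_t off = 0; off + 16 <= length; off += 16) {
            const struct mpf_stub *mpf = (const struct mpf_stub *)(base + off);

            if (!memcmp(mpf->signature, "_MP_", 4) &&
                mpf->length == 1 &&
                !byte_sum(base + off, 16) &&
                (mpf->specification == 1 || mpf->specification == 4))
                return (long)off;
        }
        return -1;
    }

    int main(void)
    {
        static uint32_t backing[1024];                  /* zeroed, 4-byte aligned */
        unsigned char *mem = (unsigned char *)backing;
        struct mpf_stub f = { .signature = "_MP_", .length = 1, .specification = 4 };

        f.checksum = (uint8_t)(0x100 - byte_sum((unsigned char *)&f, 16));
        memcpy(mem + 0x400, &f, sizeof(f));

        printf("floating pointer found at offset %#lx\n", scan(mem, sizeof(backing)));
        return 0;
    }
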
27143+
27144+void __init find_intel_smp (void)
27145+{
27146+ unsigned int address;
27147+
27148+ /*
27149+ * FIXME: Linux assumes you have 640K of base ram..
27150+ * this continues the error...
27151+ *
27152+ * 1) Scan the bottom 1K for a signature
27153+ * 2) Scan the top 1K of base RAM
27154+ * 3) Scan the 64K of bios
27155+ */
27156+ if (smp_scan_config(0x0,0x400) ||
27157+ smp_scan_config(639*0x400,0x400) ||
27158+ smp_scan_config(0xF0000,0x10000))
27159+ return;
27160+ /*
27161+ * If it is an SMP machine we should know now, unless the
27162+ * configuration is in an EISA/MCA bus machine with an
27163+ * extended bios data area.
27164+ *
27165+ * there is a real-mode segmented pointer pointing to the
27166+ * 4K EBDA area at 0x40E, calculate and scan it here.
27167+ *
27168+ * NOTE! There are Linux loaders that will corrupt the EBDA
27169+ * area, and as such this kind of SMP config may be less
27170+ * trustworthy, simply because the SMP table may have been
27171+ * stomped on during early boot. These loaders are buggy and
27172+ * should be fixed.
27173+ */
27174+
27175+ address = *(unsigned short *)phys_to_virt(0x40E);
27176+ address <<= 4;
27177+ if (smp_scan_config(address, 0x1000))
27178+ return;
27179+
27180+ /* If we have come this far, we did not find an MP table */
27181+ printk(KERN_INFO "No mptable found.\n");
27182+}
27183+
27184+/*
27185+ * - Intel MP Configuration Table
27186+ */
27187+void __init find_smp_config (void)
27188+{
27189+#ifdef CONFIG_X86_LOCAL_APIC
27190+ find_intel_smp();
27191+#endif
27192+}
27193+
27194+
27195+/* --------------------------------------------------------------------------
27196+ ACPI-based MP Configuration
27197+ -------------------------------------------------------------------------- */
27198+
27199+#ifdef CONFIG_ACPI
27200+
27201+void __init mp_register_lapic_address (
27202+ u64 address)
27203+{
27204+#ifndef CONFIG_XEN
27205+ mp_lapic_addr = (unsigned long) address;
27206+
27207+ set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
27208+
27209+ if (boot_cpu_id == -1U)
27210+ boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
27211+
27212+ Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
27213+#endif
27214+}
27215+
27216+
27217+void __cpuinit mp_register_lapic (
27218+ u8 id,
27219+ u8 enabled)
27220+{
27221+ struct mpc_config_processor processor;
27222+ int boot_cpu = 0;
27223+
27224+ if (id >= MAX_APICS) {
27225+ printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
27226+ id, MAX_APICS);
27227+ return;
27228+ }
27229+
27230+ if (id == boot_cpu_physical_apicid)
27231+ boot_cpu = 1;
27232+
27233+#ifndef CONFIG_XEN
27234+ processor.mpc_type = MP_PROCESSOR;
27235+ processor.mpc_apicid = id;
27236+ processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
27237+ processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
27238+ processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
27239+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
27240+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
27241+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
27242+ processor.mpc_reserved[0] = 0;
27243+ processor.mpc_reserved[1] = 0;
27244+#endif
27245+
27246+ MP_processor_info(&processor);
27247+}
27248+
27249+#ifdef CONFIG_X86_IO_APIC
27250+
27251+#define MP_ISA_BUS 0
27252+#define MP_MAX_IOAPIC_PIN 127
27253+
27254+static struct mp_ioapic_routing {
27255+ int apic_id;
27256+ int gsi_start;
27257+ int gsi_end;
27258+ u32 pin_programmed[4];
27259+} mp_ioapic_routing[MAX_IO_APICS];
27260+
27261+
27262+static int mp_find_ioapic (
27263+ int gsi)
27264+{
27265+ int i = 0;
27266+
27267+ /* Find the IOAPIC that manages this GSI. */
27268+ for (i = 0; i < nr_ioapics; i++) {
27269+ if ((gsi >= mp_ioapic_routing[i].gsi_start)
27270+ && (gsi <= mp_ioapic_routing[i].gsi_end))
27271+ return i;
27272+ }
27273+
27274+ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
27275+
27276+ return -1;
27277+}
27278+
27279+
27280+void __init mp_register_ioapic (
27281+ u8 id,
27282+ u32 address,
27283+ u32 gsi_base)
27284+{
27285+ int idx = 0;
27286+
27287+ if (nr_ioapics >= MAX_IO_APICS) {
27288+ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
27289+ "(found %d)\n", MAX_IO_APICS, nr_ioapics);
27290+ panic("Recompile kernel with bigger MAX_IO_APICS!\n");
27291+ }
27292+ if (!address) {
27293+ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
27294+ " found in MADT table, skipping!\n");
27295+ return;
27296+ }
27297+
27298+ idx = nr_ioapics++;
27299+
27300+ mp_ioapics[idx].mpc_type = MP_IOAPIC;
27301+ mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
27302+ mp_ioapics[idx].mpc_apicaddr = address;
27303+
27304+#ifndef CONFIG_XEN
27305+ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
27306+#endif
27307+ mp_ioapics[idx].mpc_apicid = id;
27308+ mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
27309+
27310+ /*
27311+ * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
27312+ * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
27313+ */
27314+ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
27315+ mp_ioapic_routing[idx].gsi_start = gsi_base;
27316+ mp_ioapic_routing[idx].gsi_end = gsi_base +
27317+ io_apic_get_redir_entries(idx);
27318+
27319+ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
27320+ "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
27321+ mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
27322+ mp_ioapic_routing[idx].gsi_start,
27323+ mp_ioapic_routing[idx].gsi_end);
27324+
27325+ return;
27326+}
27327+
27328+
27329+void __init mp_override_legacy_irq (
27330+ u8 bus_irq,
27331+ u8 polarity,
27332+ u8 trigger,
27333+ u32 gsi)
27334+{
27335+ struct mpc_config_intsrc intsrc;
27336+ int ioapic = -1;
27337+ int pin = -1;
27338+
27339+ /*
27340+ * Convert 'gsi' to 'ioapic.pin'.
27341+ */
27342+ ioapic = mp_find_ioapic(gsi);
27343+ if (ioapic < 0)
27344+ return;
27345+ pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
27346+
27347+ /*
27348+ * TBD: This check is for faulty timer entries, where the override
27349+ * erroneously sets the trigger to level, resulting in a HUGE
27350+ * increase of timer interrupts!
27351+ */
27352+ if ((bus_irq == 0) && (trigger == 3))
27353+ trigger = 1;
27354+
27355+ intsrc.mpc_type = MP_INTSRC;
27356+ intsrc.mpc_irqtype = mp_INT;
27357+ intsrc.mpc_irqflag = (trigger << 2) | polarity;
27358+ intsrc.mpc_srcbus = MP_ISA_BUS;
27359+ intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
27360+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
27361+ intsrc.mpc_dstirq = pin; /* INTIN# */
27362+
27363+ Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
27364+ intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
27365+ (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
27366+ intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
27367+
27368+ mp_irqs[mp_irq_entries] = intsrc;
27369+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
27370+ panic("Max # of irq sources exceeded!\n");
27371+
27372+ return;
27373+}
27374+
27375+
27376+void __init mp_config_acpi_legacy_irqs (void)
27377+{
27378+ struct mpc_config_intsrc intsrc;
27379+ int i = 0;
27380+ int ioapic = -1;
27381+
27382+ /*
27383+ * Fabricate the legacy ISA bus (bus #31).
27384+ */
27385+ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
27386+ Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
27387+
27388+ /*
27389+ * Locate the IOAPIC that manages the ISA IRQs (0-15).
27390+ */
27391+ ioapic = mp_find_ioapic(0);
27392+ if (ioapic < 0)
27393+ return;
27394+
27395+ intsrc.mpc_type = MP_INTSRC;
27396+ intsrc.mpc_irqflag = 0; /* Conforming */
27397+ intsrc.mpc_srcbus = MP_ISA_BUS;
27398+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
27399+
27400+ /*
27401+	 * Use the default configuration for the IRQs 0-15 unless
27402+	 * overridden by (MADT) interrupt source override entries.
27403+ */
27404+ for (i = 0; i < 16; i++) {
27405+ int idx;
27406+
27407+ for (idx = 0; idx < mp_irq_entries; idx++) {
27408+ struct mpc_config_intsrc *irq = mp_irqs + idx;
27409+
27410+ /* Do we already have a mapping for this ISA IRQ? */
27411+ if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
27412+ break;
27413+
27414+ /* Do we already have a mapping for this IOAPIC pin */
27415+ if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
27416+ (irq->mpc_dstirq == i))
27417+ break;
27418+ }
27419+
27420+ if (idx != mp_irq_entries) {
27421+ printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
27422+ continue; /* IRQ already used */
27423+ }
27424+
27425+ intsrc.mpc_irqtype = mp_INT;
27426+ intsrc.mpc_srcbusirq = i; /* Identity mapped */
27427+ intsrc.mpc_dstirq = i;
27428+
27429+ Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
27430+ "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
27431+ (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
27432+ intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
27433+ intsrc.mpc_dstirq);
27434+
27435+ mp_irqs[mp_irq_entries] = intsrc;
27436+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
27437+ panic("Max # of irq sources exceeded!\n");
27438+ }
27439+
27440+ return;
27441+}
27442+
27443+#define MAX_GSI_NUM 4096
27444+
27445+int mp_register_gsi(u32 gsi, int triggering, int polarity)
27446+{
27447+ int ioapic = -1;
27448+ int ioapic_pin = 0;
27449+ int idx, bit = 0;
27450+ static int pci_irq = 16;
27451+ /*
27452+ * Mapping between Global System Interrupts, which
27453+ * represent all possible interrupts, to the IRQs
27454+ * assigned to actual devices.
27455+ */
27456+ static int gsi_to_irq[MAX_GSI_NUM];
27457+
27458+ if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
27459+ return gsi;
27460+
27461+ /* Don't set up the ACPI SCI because it's already set up */
27462+ if (acpi_fadt.sci_int == gsi)
27463+ return gsi;
27464+
27465+ ioapic = mp_find_ioapic(gsi);
27466+ if (ioapic < 0) {
27467+ printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
27468+ return gsi;
27469+ }
27470+
27471+ ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
27472+
27473+ /*
27474+ * Avoid pin reprogramming. PRTs typically include entries
27475+ * with redundant pin->gsi mappings (but unique PCI devices);
27476+ * we only program the IOAPIC on the first.
27477+ */
27478+ bit = ioapic_pin % 32;
27479+ idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
27480+ if (idx > 3) {
27481+ printk(KERN_ERR "Invalid reference to IOAPIC pin "
27482+ "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
27483+ ioapic_pin);
27484+ return gsi;
27485+ }
27486+ if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
27487+ Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
27488+ mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
27489+ return gsi_to_irq[gsi];
27490+ }
27491+
27492+ mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
27493+
27494+ if (triggering == ACPI_LEVEL_SENSITIVE) {
27495+ /*
27496+ * For PCI devices assign IRQs in order, avoiding gaps
27497+ * due to unused I/O APIC pins.
27498+ */
27499+ int irq = gsi;
27500+ if (gsi < MAX_GSI_NUM) {
27501+ /*
27502+ * Retain the VIA chipset work-around (gsi > 15), but
27503+ * avoid a problem where the 8254 timer (IRQ0) is setup
27504+ * via an override (so it's not on pin 0 of the ioapic),
27505+ * and at the same time, the pin 0 interrupt is a PCI
27506+ * type. The gsi > 15 test could cause these two pins
27507+ * to be shared as IRQ0, and they are not shareable.
27508+ * So test for this condition, and if necessary, avoid
27509+ * the pin collision.
27510+ */
27511+ if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
27512+ gsi = pci_irq++;
27513+ /*
27514+ * Don't assign IRQ used by ACPI SCI
27515+ */
27516+ if (gsi == acpi_fadt.sci_int)
27517+ gsi = pci_irq++;
27518+ gsi_to_irq[irq] = gsi;
27519+ } else {
27520+ printk(KERN_ERR "GSI %u is too high\n", gsi);
27521+ return gsi;
27522+ }
27523+ }
27524+
27525+ io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
27526+ triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
27527+ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
27528+ return gsi;
27529+}
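
mp_register_gsi() above keeps a per-IOAPIC bitmap, pin_programmed[4], so that the redundant pin-to-GSI mappings typical of PCI routing tables only program a pin once: word index pin/32, bit index pin%32, and a set bit short-circuits the call. The bookkeeping on its own:

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t pin_programmed[4];   /* covers pins 0-127, like the original */

    /* Returns 1 the first time a pin is seen, 0 on any repeat. */
    static int claim_pin(int pin)
    {
        int idx = pin / 32;   /* the original spells this (pin < 32) ? 0 : pin / 32 */
        int bit = pin % 32;

        if (pin_programmed[idx] & (1u << bit))
            return 0;                    /* already programmed, skip */
        pin_programmed[idx] |= 1u << bit;
        return 1;
    }

    int main(void)
    {
        printf("pin 19 first time: %d\n", claim_pin(19));   /* 1: program it */
        printf("pin 19 again:      %d\n", claim_pin(19));   /* 0: redundant PRT entry */
        printf("pin 40 first time: %d\n", claim_pin(40));   /* 1: word 1, bit 8 */
        return 0;
    }
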
27530+
27531+#endif /*CONFIG_X86_IO_APIC*/
27532+#endif /*CONFIG_ACPI*/
27533Index: head-2008-11-25/arch/x86/kernel/process_64-xen.c
27534===================================================================
27535--- /dev/null 1970-01-01 00:00:00.000000000 +0000
27536+++ head-2008-11-25/arch/x86/kernel/process_64-xen.c 2008-08-07 12:44:36.000000000 +0200
27537@@ -0,0 +1,848 @@
27538+/*
27539+ * linux/arch/x86-64/kernel/process.c
27540+ *
27541+ * Copyright (C) 1995 Linus Torvalds
27542+ *
27543+ * Pentium III FXSR, SSE support
27544+ * Gareth Hughes <gareth@valinux.com>, May 2000
27545+ *
27546+ * X86-64 port
27547+ * Andi Kleen.
27548+ *
27549+ * CPU hotplug support - ashok.raj@intel.com
27550+ *
27551+ * Jun Nakajima <jun.nakajima@intel.com>
27552+ * Modified for Xen
27553+ */
27554+
27555+/*
27556+ * This file handles the architecture-dependent parts of process handling..
27557+ */
27558+
27559+#include <stdarg.h>
27560+
27561+#include <linux/cpu.h>
27562+#include <linux/errno.h>
27563+#include <linux/sched.h>
27564+#include <linux/kernel.h>
27565+#include <linux/mm.h>
27566+#include <linux/elfcore.h>
27567+#include <linux/smp.h>
27568+#include <linux/slab.h>
27569+#include <linux/user.h>
27570+#include <linux/module.h>
27571+#include <linux/a.out.h>
27572+#include <linux/interrupt.h>
27573+#include <linux/delay.h>
27574+#include <linux/ptrace.h>
27575+#include <linux/utsname.h>
27576+#include <linux/random.h>
27577+#include <linux/notifier.h>
27578+#include <linux/kprobes.h>
27579+
27580+#include <asm/uaccess.h>
27581+#include <asm/pgtable.h>
27582+#include <asm/system.h>
27583+#include <asm/io.h>
27584+#include <asm/processor.h>
27585+#include <asm/i387.h>
27586+#include <asm/mmu_context.h>
27587+#include <asm/pda.h>
27588+#include <asm/prctl.h>
27589+#include <asm/kdebug.h>
27590+#include <xen/interface/platform.h>
27591+#include <xen/interface/physdev.h>
27592+#include <xen/interface/vcpu.h>
27593+#include <asm/desc.h>
27594+#include <asm/proto.h>
27595+#include <asm/hardirq.h>
27596+#include <asm/ia32.h>
27597+#include <asm/idle.h>
27598+
27599+#include <xen/cpu_hotplug.h>
27600+
27601+asmlinkage extern void ret_from_fork(void);
27602+
27603+unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
27604+
27605+unsigned long boot_option_idle_override = 0;
27606+EXPORT_SYMBOL(boot_option_idle_override);
27607+
27608+/*
27609+ * Powermanagement idle function, if any..
27610+ */
27611+void (*pm_idle)(void);
27612+EXPORT_SYMBOL(pm_idle);
27613+static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
27614+
27615+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
27616+
27617+void idle_notifier_register(struct notifier_block *n)
27618+{
27619+ atomic_notifier_chain_register(&idle_notifier, n);
27620+}
27621+EXPORT_SYMBOL_GPL(idle_notifier_register);
27622+
27623+void idle_notifier_unregister(struct notifier_block *n)
27624+{
27625+ atomic_notifier_chain_unregister(&idle_notifier, n);
27626+}
27627+EXPORT_SYMBOL(idle_notifier_unregister);
27628+
27629+enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
27630+static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
27631+
27632+void enter_idle(void)
27633+{
27634+ __get_cpu_var(idle_state) = CPU_IDLE;
27635+ atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
27636+}
27637+
27638+static void __exit_idle(void)
27639+{
27640+ __get_cpu_var(idle_state) = CPU_NOT_IDLE;
27641+ atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
27642+}
27643+
27644+/* Called from interrupts to signify idle end */
27645+void exit_idle(void)
27646+{
27647+ if (current->pid | read_pda(irqcount))
27648+ return;
27649+ __exit_idle();
27650+}
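
exit_idle() above bails out with if (current->pid | read_pda(irqcount)): a bitwise rather than logical OR, which is zero only when both values are zero (the idle task's PID and the nested-interrupt count), presumably folding the two tests into one. A tiny check that the two spellings agree for this kind of "both must be zero" test:

    #include <stdio.h>

    int main(void)
    {
        int samples[][2] = { {0, 0}, {0, 2}, {7, 0}, {7, 2} };

        for (int i = 0; i < 4; i++) {
            int pid = samples[i][0], irqcount = samples[i][1];
            int bitwise = (pid | irqcount) != 0;            /* what exit_idle() tests */
            int logical = (pid != 0) || (irqcount != 0);

            printf("pid=%d irqcount=%d -> bitwise=%d logical=%d\n",
                   pid, irqcount, bitwise, logical);
        }
        return 0;
    }
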
27651+
27652+/*
27653+ * On SMP it's slightly faster (but much more power-consuming!)
27654+ * to poll the ->need_resched flag instead of waiting for the
27655+ * cross-CPU IPI to arrive. Use this option with caution.
27656+ */
27657+static void poll_idle (void)
27658+{
27659+ local_irq_enable();
27660+
27661+ asm volatile(
27662+ "2:"
27663+ "testl %0,%1;"
27664+ "rep; nop;"
27665+ "je 2b;"
27666+ : :
27667+ "i" (_TIF_NEED_RESCHED),
27668+ "m" (current_thread_info()->flags));
27669+}
27670+
27671+static void xen_idle(void)
27672+{
27673+ local_irq_disable();
27674+
27675+ if (need_resched())
27676+ local_irq_enable();
27677+ else {
27678+ current_thread_info()->status &= ~TS_POLLING;
27679+ smp_mb__after_clear_bit();
27680+ safe_halt();
27681+ current_thread_info()->status |= TS_POLLING;
27682+ }
27683+}
27684+
27685+#ifdef CONFIG_HOTPLUG_CPU
27686+static inline void play_dead(void)
27687+{
27688+ idle_task_exit();
27689+ local_irq_disable();
27690+ cpu_clear(smp_processor_id(), cpu_initialized);
27691+ preempt_enable_no_resched();
27692+ VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
27693+ cpu_bringup();
27694+}
27695+#else
27696+static inline void play_dead(void)
27697+{
27698+ BUG();
27699+}
27700+#endif /* CONFIG_HOTPLUG_CPU */
27701+
27702+/*
27703+ * The idle thread. There's no useful work to be
27704+ * done, so just try to conserve power and have a
27705+ * low exit latency (ie sit in a loop waiting for
27706+ * somebody to say that they'd like to reschedule)
27707+ */
27708+void cpu_idle (void)
27709+{
27710+ current_thread_info()->status |= TS_POLLING;
27711+ /* endless idle loop with no priority at all */
27712+ while (1) {
27713+ while (!need_resched()) {
27714+ void (*idle)(void);
27715+
27716+ if (__get_cpu_var(cpu_idle_state))
27717+ __get_cpu_var(cpu_idle_state) = 0;
27718+ rmb();
27719+ idle = xen_idle; /* no alternatives */
27720+ if (cpu_is_offline(smp_processor_id()))
27721+ play_dead();
27722+ enter_idle();
27723+ idle();
27724+ __exit_idle();
27725+ }
27726+
27727+ preempt_enable_no_resched();
27728+ schedule();
27729+ preempt_disable();
27730+ }
27731+}
27732+
27733+void cpu_idle_wait(void)
27734+{
27735+ unsigned int cpu, this_cpu = get_cpu();
27736+ cpumask_t map;
27737+
27738+ set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
27739+ put_cpu();
27740+
27741+ cpus_clear(map);
27742+ for_each_online_cpu(cpu) {
27743+ per_cpu(cpu_idle_state, cpu) = 1;
27744+ cpu_set(cpu, map);
27745+ }
27746+
27747+ __get_cpu_var(cpu_idle_state) = 0;
27748+
27749+ wmb();
27750+ do {
27751+ ssleep(1);
27752+ for_each_online_cpu(cpu) {
27753+ if (cpu_isset(cpu, map) &&
27754+ !per_cpu(cpu_idle_state, cpu))
27755+ cpu_clear(cpu, map);
27756+ }
27757+ cpus_and(map, map, cpu_online_map);
27758+ } while (!cpus_empty(map));
27759+}
27760+EXPORT_SYMBOL_GPL(cpu_idle_wait);
27761+
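cpu_idle_wait() above is a simple handshake: it sets a per-CPU flag for every online CPU, and each CPU clears its own flag the next time it goes around the idle loop, so the caller knows every CPU has picked up a new idle routine. A minimal user-space sketch of the same handshake, with threads standing in for CPUs (illustrative only, not part of the patch; names are made up; build with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define NCPUS 4

static atomic_int idle_flag[NCPUS];     /* stands in for per_cpu(cpu_idle_state) */
static atomic_int stop;

static void *idle_loop(void *arg)       /* one "CPU" spinning in its idle loop */
{
    int cpu = *(int *)arg;

    while (!atomic_load(&stop)) {
        if (atomic_load(&idle_flag[cpu]))   /* the real loop clears its flag too */
            atomic_store(&idle_flag[cpu], 0);
        usleep(1000);
    }
    return NULL;
}

static void idle_wait(void)             /* mirrors cpu_idle_wait(): set, then poll */
{
    int cpu, pending;

    for (cpu = 0; cpu < NCPUS; cpu++)
        atomic_store(&idle_flag[cpu], 1);
    do {
        usleep(10000);                  /* the kernel sleeps with ssleep(1) */
        pending = 0;
        for (cpu = 0; cpu < NCPUS; cpu++)
            pending += atomic_load(&idle_flag[cpu]);
    } while (pending);
}

int main(void)
{
    pthread_t tid[NCPUS];
    int ids[NCPUS], i;

    for (i = 0; i < NCPUS; i++) {
        ids[i] = i;
        pthread_create(&tid[i], NULL, idle_loop, &ids[i]);
    }
    idle_wait();
    puts("every simulated CPU went through its idle loop");
    atomic_store(&stop, 1);
    for (i = 0; i < NCPUS; i++)
        pthread_join(tid[i], NULL);
    return 0;
}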
27762+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
27763+{
27764+}
27765+
27766+static int __init idle_setup (char *str)
27767+{
27768+ if (!strncmp(str, "poll", 4)) {
27769+ printk("using polling idle threads.\n");
27770+ pm_idle = poll_idle;
27771+ }
27772+
27773+ boot_option_idle_override = 1;
27774+ return 1;
27775+}
27776+
27777+__setup("idle=", idle_setup);
27778+
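idle_setup() above is the handler registered for the "idle=" boot option: when the string starts with "poll" it repoints pm_idle at poll_idle (in this Xen variant cpu_idle() always uses xen_idle, but the selection pattern is the interesting part). A tiny user-space sketch of the same option-to-function-pointer dispatch (illustrative only, not part of the patch; names are made up):

#include <stdio.h>
#include <string.h>

static void default_idle_sim(void) { puts("halt-style idle"); }
static void poll_idle_sim(void)    { puts("polling idle"); }

/* selected once at "boot", called from the idle loop afterwards */
static void (*pm_idle_sim)(void) = default_idle_sim;

static int idle_setup_sim(const char *str)
{
    if (!strncmp(str, "poll", 4)) {
        puts("using polling idle threads.");
        pm_idle_sim = poll_idle_sim;
    }
    return 1;
}

int main(void)
{
    idle_setup_sim("poll");     /* as if the command line said "idle=poll" */
    pm_idle_sim();              /* the idle loop would call this repeatedly */
    return 0;
}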
27779+/* Prints also some state that isn't saved in the pt_regs */
27780+void __show_regs(struct pt_regs * regs)
27781+{
27782+ unsigned long fs, gs, shadowgs;
27783+ unsigned int fsindex,gsindex;
27784+ unsigned int ds,cs,es;
27785+
27786+ printk("\n");
27787+ print_modules();
27788+ printk("Pid: %d, comm: %.20s %s %s %.*s\n",
27789+ current->pid, current->comm, print_tainted(),
27790+ system_utsname.release,
27791+ (int)strcspn(system_utsname.version, " "),
27792+ system_utsname.version);
27793+ printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
27794+ printk_address(regs->rip);
27795+ printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
27796+ regs->eflags);
27797+ printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
27798+ regs->rax, regs->rbx, regs->rcx);
27799+ printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
27800+ regs->rdx, regs->rsi, regs->rdi);
27801+ printk("RBP: %016lx R08: %016lx R09: %016lx\n",
27802+ regs->rbp, regs->r8, regs->r9);
27803+ printk("R10: %016lx R11: %016lx R12: %016lx\n",
27804+ regs->r10, regs->r11, regs->r12);
27805+ printk("R13: %016lx R14: %016lx R15: %016lx\n",
27806+ regs->r13, regs->r14, regs->r15);
27807+
27808+ asm("mov %%ds,%0" : "=r" (ds));
27809+ asm("mov %%cs,%0" : "=r" (cs));
27810+ asm("mov %%es,%0" : "=r" (es));
27811+ asm("mov %%fs,%0" : "=r" (fsindex));
27812+ asm("mov %%gs,%0" : "=r" (gsindex));
27813+
27814+ rdmsrl(MSR_FS_BASE, fs);
27815+ rdmsrl(MSR_GS_BASE, gs);
27816+ rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
27817+
27818+ printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
27819+ fs,fsindex,gs,gsindex,shadowgs);
27820+ printk("CS: %04x DS: %04x ES: %04x\n", cs, ds, es);
27821+
27822+}
27823+
27824+void show_regs(struct pt_regs *regs)
27825+{
27826+ printk("CPU %d:", smp_processor_id());
27827+ __show_regs(regs);
27828+ show_trace(NULL, regs, (void *)(regs + 1));
27829+}
27830+
27831+/*
27832+ * Free current thread data structures etc..
27833+ */
27834+void exit_thread(void)
27835+{
27836+ struct task_struct *me = current;
27837+ struct thread_struct *t = &me->thread;
27838+
27839+ if (me->thread.io_bitmap_ptr) {
27840+#ifndef CONFIG_X86_NO_TSS
27841+ struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
27842+#endif
27843+#ifdef CONFIG_XEN
27844+ struct physdev_set_iobitmap iobmp_op;
27845+ memset(&iobmp_op, 0, sizeof(iobmp_op));
27846+#endif
27847+
27848+ kfree(t->io_bitmap_ptr);
27849+ t->io_bitmap_ptr = NULL;
27850+ /*
27851+ * Careful, clear this in the TSS too:
27852+ */
27853+#ifndef CONFIG_X86_NO_TSS
27854+ memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
27855+ put_cpu();
27856+#endif
27857+#ifdef CONFIG_XEN
27858+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
27859+ &iobmp_op));
27860+#endif
27861+ t->io_bitmap_max = 0;
27862+ }
27863+}
27864+
27865+void load_gs_index(unsigned gs)
27866+{
27867+ WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
27868+}
27869+
27870+void flush_thread(void)
27871+{
27872+ struct task_struct *tsk = current;
27873+ struct thread_info *t = current_thread_info();
27874+
27875+ if (t->flags & _TIF_ABI_PENDING) {
27876+ t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
27877+ if (t->flags & _TIF_IA32)
27878+ current_thread_info()->status |= TS_COMPAT;
27879+ }
27880+
27881+ tsk->thread.debugreg0 = 0;
27882+ tsk->thread.debugreg1 = 0;
27883+ tsk->thread.debugreg2 = 0;
27884+ tsk->thread.debugreg3 = 0;
27885+ tsk->thread.debugreg6 = 0;
27886+ tsk->thread.debugreg7 = 0;
27887+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
27888+ /*
27889+ * Forget coprocessor state..
27890+ */
27891+ clear_fpu(tsk);
27892+ clear_used_math();
27893+}
27894+
27895+void release_thread(struct task_struct *dead_task)
27896+{
27897+ if (dead_task->mm) {
27898+ if (dead_task->mm->context.size) {
27899+ printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
27900+ dead_task->comm,
27901+ dead_task->mm->context.ldt,
27902+ dead_task->mm->context.size);
27903+ BUG();
27904+ }
27905+ }
27906+}
27907+
27908+static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
27909+{
27910+ struct user_desc ud = {
27911+ .base_addr = addr,
27912+ .limit = 0xfffff,
27913+ .seg_32bit = 1,
27914+ .limit_in_pages = 1,
27915+ .useable = 1,
27916+ };
27917+ struct n_desc_struct *desc = (void *)t->thread.tls_array;
27918+ desc += tls;
27919+ desc->a = LDT_entry_a(&ud);
27920+ desc->b = LDT_entry_b(&ud);
27921+}
27922+
27923+static inline u32 read_32bit_tls(struct task_struct *t, int tls)
27924+{
27925+ struct desc_struct *desc = (void *)t->thread.tls_array;
27926+ desc += tls;
27927+ return desc->base0 |
27928+ (((u32)desc->base1) << 16) |
27929+ (((u32)desc->base2) << 24);
27930+}
27931+
27932+/*
27933+ * This gets called before we allocate a new thread and copy
27934+ * the current task into it.
27935+ */
27936+void prepare_to_copy(struct task_struct *tsk)
27937+{
27938+ unlazy_fpu(tsk);
27939+}
27940+
27941+int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
27942+ unsigned long unused,
27943+ struct task_struct * p, struct pt_regs * regs)
27944+{
27945+ int err;
27946+ struct pt_regs * childregs;
27947+ struct task_struct *me = current;
27948+
27949+ childregs = ((struct pt_regs *)
27950+ (THREAD_SIZE + task_stack_page(p))) - 1;
27951+ *childregs = *regs;
27952+
27953+ childregs->rax = 0;
27954+ childregs->rsp = rsp;
27955+ if (rsp == ~0UL)
27956+ childregs->rsp = (unsigned long)childregs;
27957+
27958+ p->thread.rsp = (unsigned long) childregs;
27959+ p->thread.rsp0 = (unsigned long) (childregs+1);
27960+ p->thread.userrsp = me->thread.userrsp;
27961+
27962+ set_tsk_thread_flag(p, TIF_FORK);
27963+
27964+ p->thread.fs = me->thread.fs;
27965+ p->thread.gs = me->thread.gs;
27966+
27967+ asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
27968+ asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
27969+ asm("mov %%es,%0" : "=m" (p->thread.es));
27970+ asm("mov %%ds,%0" : "=m" (p->thread.ds));
27971+
27972+ if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
27973+ p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
27974+ if (!p->thread.io_bitmap_ptr) {
27975+ p->thread.io_bitmap_max = 0;
27976+ return -ENOMEM;
27977+ }
27978+ memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
27979+ IO_BITMAP_BYTES);
27980+ }
27981+
27982+ /*
27983+ * Set a new TLS for the child thread?
27984+ */
27985+ if (clone_flags & CLONE_SETTLS) {
27986+#ifdef CONFIG_IA32_EMULATION
27987+ if (test_thread_flag(TIF_IA32))
27988+ err = ia32_child_tls(p, childregs);
27989+ else
27990+#endif
27991+ err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
27992+ if (err)
27993+ goto out;
27994+ }
27995+ p->thread.iopl = current->thread.iopl;
27996+
27997+ err = 0;
27998+out:
27999+ if (err && p->thread.io_bitmap_ptr) {
28000+ kfree(p->thread.io_bitmap_ptr);
28001+ p->thread.io_bitmap_max = 0;
28002+ }
28003+ return err;
28004+}
28005+
28006+static inline void __save_init_fpu( struct task_struct *tsk )
28007+{
28008+ asm volatile( "rex64 ; fxsave %0 ; fnclex"
28009+ : "=m" (tsk->thread.i387.fxsave));
28010+ tsk->thread_info->status &= ~TS_USEDFPU;
28011+}
28012+
28013+/*
28014+ * switch_to(x,y) should switch tasks from x to y.
28015+ *
28016+ * This could still be optimized:
28017+ * - fold all the options into a flag word and test it with a single test.
28018+ * - could test fs/gs bitsliced
28019+ *
28020+ * Kprobes not supported here. Set the probe on schedule instead.
28021+ */
28022+__kprobes struct task_struct *
28023+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
28024+{
28025+ struct thread_struct *prev = &prev_p->thread,
28026+ *next = &next_p->thread;
28027+ int cpu = smp_processor_id();
28028+#ifndef CONFIG_X86_NO_TSS
28029+ struct tss_struct *tss = &per_cpu(init_tss, cpu);
28030+#endif
28031+#if CONFIG_XEN_COMPAT > 0x030002
28032+ struct physdev_set_iopl iopl_op;
28033+ struct physdev_set_iobitmap iobmp_op;
28034+#else
28035+ struct physdev_op _pdo[2], *pdo = _pdo;
28036+#define iopl_op pdo->u.set_iopl
28037+#define iobmp_op pdo->u.set_iobitmap
28038+#endif
28039+ multicall_entry_t _mcl[8], *mcl = _mcl;
28040+
28041+ /*
28042+ * This is basically '__unlazy_fpu', except that we queue a
28043+ * multicall to indicate FPU task switch, rather than
28044+ * synchronously trapping to Xen.
28045+ * The AMD workaround requires it to be after DS reload, or
28046+ * after DS has been cleared, which we do in __prepare_arch_switch.
28047+ */
28048+ if (prev_p->thread_info->status & TS_USEDFPU) {
28049+ __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
28050+ mcl->op = __HYPERVISOR_fpu_taskswitch;
28051+ mcl->args[0] = 1;
28052+ mcl++;
28053+ }
28054+
28055+ /*
28056+ * Reload esp0, LDT and the page table pointer:
28057+ */
28058+ mcl->op = __HYPERVISOR_stack_switch;
28059+ mcl->args[0] = __KERNEL_DS;
28060+ mcl->args[1] = next->rsp0;
28061+ mcl++;
28062+
28063+ /*
28064+ * Load the per-thread Thread-Local Storage descriptor.
28065+ * This is load_TLS(next, cpu) with multicalls.
28066+ */
28067+#define C(i) do { \
28068+ if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \
28069+ mcl->op = __HYPERVISOR_update_descriptor; \
28070+ mcl->args[0] = virt_to_machine( \
28071+ &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \
28072+ mcl->args[1] = next->tls_array[i]; \
28073+ mcl++; \
28074+ } \
28075+} while (0)
28076+ C(0); C(1); C(2);
28077+#undef C
28078+
28079+ if (unlikely(prev->iopl != next->iopl)) {
28080+ iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
28081+#if CONFIG_XEN_COMPAT > 0x030002
28082+ mcl->op = __HYPERVISOR_physdev_op;
28083+ mcl->args[0] = PHYSDEVOP_set_iopl;
28084+ mcl->args[1] = (unsigned long)&iopl_op;
28085+#else
28086+ mcl->op = __HYPERVISOR_physdev_op_compat;
28087+ pdo->cmd = PHYSDEVOP_set_iopl;
28088+ mcl->args[0] = (unsigned long)pdo++;
28089+#endif
28090+ mcl++;
28091+ }
28092+
28093+ if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
28094+ set_xen_guest_handle(iobmp_op.bitmap,
28095+ (char *)next->io_bitmap_ptr);
28096+ iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
28097+#if CONFIG_XEN_COMPAT > 0x030002
28098+ mcl->op = __HYPERVISOR_physdev_op;
28099+ mcl->args[0] = PHYSDEVOP_set_iobitmap;
28100+ mcl->args[1] = (unsigned long)&iobmp_op;
28101+#else
28102+ mcl->op = __HYPERVISOR_physdev_op_compat;
28103+ pdo->cmd = PHYSDEVOP_set_iobitmap;
28104+ mcl->args[0] = (unsigned long)pdo++;
28105+#endif
28106+ mcl++;
28107+ }
28108+
28109+#if CONFIG_XEN_COMPAT <= 0x030002
28110+ BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo));
28111+#endif
28112+ BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
28113+ if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
28114+ BUG();
28115+
28116+ /*
28117+ * Switch DS and ES.
28118+ * This won't pick up thread selector changes, but I guess that is ok.
28119+ */
28120+ if (unlikely(next->es))
28121+ loadsegment(es, next->es);
28122+
28123+ if (unlikely(next->ds))
28124+ loadsegment(ds, next->ds);
28125+
28126+ /*
28127+ * Switch FS and GS.
28128+ */
28129+ if (unlikely(next->fsindex))
28130+ loadsegment(fs, next->fsindex);
28131+
28132+ if (next->fs)
28133+ WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs));
28134+
28135+ if (unlikely(next->gsindex))
28136+ load_gs_index(next->gsindex);
28137+
28138+ if (next->gs)
28139+ WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs));
28140+
28141+ /*
28142+ * Switch the PDA context.
28143+ */
28144+ prev->userrsp = read_pda(oldrsp);
28145+ write_pda(oldrsp, next->userrsp);
28146+ write_pda(pcurrent, next_p);
28147+ write_pda(kernelstack,
28148+ task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
28149+
28150+ /*
28151+ * Now maybe reload the debug registers
28152+ */
28153+ if (unlikely(next->debugreg7)) {
28154+ set_debugreg(next->debugreg0, 0);
28155+ set_debugreg(next->debugreg1, 1);
28156+ set_debugreg(next->debugreg2, 2);
28157+ set_debugreg(next->debugreg3, 3);
28158+ /* no 4 and 5 */
28159+ set_debugreg(next->debugreg6, 6);
28160+ set_debugreg(next->debugreg7, 7);
28161+ }
28162+
28163+ return prev_p;
28164+}
28165+
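__switch_to() above batches the FPU switch, the stack switch, the TLS descriptor updates and any iopl/io-bitmap changes into one multicall: each needed operation is appended to a small fixed-size array and the whole list is handed to the hypervisor with a single HYPERVISOR_multicall_check(). A generic sketch of that build-then-flush pattern with made-up types (the real entries are multicall_entry_t and carry hypercall numbers):

#include <stdio.h>
#include <assert.h>

struct op { int code; long arg; };            /* stands in for multicall_entry_t */

static void flush(const struct op *ops, int n)
{
    for (int i = 0; i < n; i++)               /* the kernel issues one hypercall here */
        printf("op %d: code=%d arg=%ld\n", i, ops[i].code, ops[i].arg);
}

int main(void)
{
    struct op batch[8], *cur = batch;         /* mirrors: multicall_entry_t _mcl[8], *mcl = _mcl */
    int iopl_changed = 1;

    cur->code = 1; cur->arg = 0xdead; cur++;  /* always queued, like the stack switch */
    if (iopl_changed) {                       /* queued only when actually needed */
        cur->code = 2; cur->arg = 3; cur++;
    }

    /* same overflow check as BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl)) */
    assert(cur <= batch + sizeof(batch) / sizeof(batch[0]));

    flush(batch, (int)(cur - batch));         /* one flush for the whole batch */
    return 0;
}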
28166+/*
28167+ * sys_execve() executes a new program.
28168+ */
28169+asmlinkage
28170+long sys_execve(char __user *name, char __user * __user *argv,
28171+ char __user * __user *envp, struct pt_regs regs)
28172+{
28173+ long error;
28174+ char * filename;
28175+
28176+ filename = getname(name);
28177+ error = PTR_ERR(filename);
28178+ if (IS_ERR(filename))
28179+ return error;
28180+ error = do_execve(filename, argv, envp, &regs);
28181+ if (error == 0) {
28182+ task_lock(current);
28183+ current->ptrace &= ~PT_DTRACE;
28184+ task_unlock(current);
28185+ }
28186+ putname(filename);
28187+ return error;
28188+}
28189+
28190+void set_personality_64bit(void)
28191+{
28192+ /* inherit personality from parent */
28193+
28194+ /* Make sure to be in 64bit mode */
28195+ clear_thread_flag(TIF_IA32);
28196+
28197+ /* TBD: overwrites user setup. Should have two bits.
28198+ But 64bit processes have always behaved this way,
28199+ so it's not too bad. The main problem is just that
28200+ 32bit children are affected again. */
28201+ current->personality &= ~READ_IMPLIES_EXEC;
28202+}
28203+
28204+asmlinkage long sys_fork(struct pt_regs *regs)
28205+{
28206+ return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
28207+}
28208+
28209+asmlinkage long
28210+sys_clone(unsigned long clone_flags, unsigned long newsp,
28211+ void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
28212+{
28213+ if (!newsp)
28214+ newsp = regs->rsp;
28215+ return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
28216+}
28217+
28218+/*
28219+ * This is trivial, and on the face of it looks like it
28220+ * could equally well be done in user mode.
28221+ *
28222+ * Not so, for quite unobvious reasons - register pressure.
28223+ * In user mode vfork() cannot have a stack frame, and if
28224+ * done by calling the "clone()" system call directly, you
28225+ * do not have enough call-clobbered registers to hold all
28226+ * the information you need.
28227+ */
28228+asmlinkage long sys_vfork(struct pt_regs *regs)
28229+{
28230+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
28231+ NULL, NULL);
28232+}
28233+
28234+unsigned long get_wchan(struct task_struct *p)
28235+{
28236+ unsigned long stack;
28237+ u64 fp,rip;
28238+ int count = 0;
28239+
28240+ if (!p || p == current || p->state==TASK_RUNNING)
28241+ return 0;
28242+ stack = (unsigned long)task_stack_page(p);
28243+ if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
28244+ return 0;
28245+ fp = *(u64 *)(p->thread.rsp);
28246+ do {
28247+ if (fp < (unsigned long)stack ||
28248+ fp > (unsigned long)stack+THREAD_SIZE)
28249+ return 0;
28250+ rip = *(u64 *)(fp+8);
28251+ if (!in_sched_functions(rip))
28252+ return rip;
28253+ fp = *(u64 *)fp;
28254+ } while (count++ < 16);
28255+ return 0;
28256+}
28257+
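get_wchan() above walks the sleeping task's saved frame pointers: the word at fp is the caller's frame pointer, the word at fp+8 is the return address, and the walk stops at the first address outside the scheduler. A sketch of the same walk over a hand-built frame chain (illustrative only; the real code reads another task's kernel stack):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* a fake stack: each "frame" is two words, [saved_fp, return_address] */
    uint64_t stack[6];

    stack[4] = 0;                     stack[5] = 0x400300;  /* outermost frame */
    stack[2] = (uint64_t)&stack[4];   stack[3] = 0x400200;
    stack[0] = (uint64_t)&stack[2];   stack[1] = 0x400100;  /* innermost frame */

    uint64_t fp = (uint64_t)&stack[0];    /* get_wchan() derives this from the saved rsp */
    int count = 0;

    while (fp && count++ < 16) {          /* the kernel bounds the walk the same way */
        uint64_t rip = *(uint64_t *)(fp + 8);
        printf("frame %d: return address %#llx\n", count,
               (unsigned long long)rip);
        fp = *(uint64_t *)fp;             /* follow the saved frame pointer */
    }
    return 0;
}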
28258+long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
28259+{
28260+ int ret = 0;
28261+ int doit = task == current;
28262+ int cpu;
28263+
28264+ switch (code) {
28265+ case ARCH_SET_GS:
28266+ if (addr >= TASK_SIZE_OF(task))
28267+ return -EPERM;
28268+ cpu = get_cpu();
28269+ /* handle small bases via the GDT because that's faster to
28270+ switch. */
28271+ if (addr <= 0xffffffff) {
28272+ set_32bit_tls(task, GS_TLS, addr);
28273+ if (doit) {
28274+ load_TLS(&task->thread, cpu);
28275+ load_gs_index(GS_TLS_SEL);
28276+ }
28277+ task->thread.gsindex = GS_TLS_SEL;
28278+ task->thread.gs = 0;
28279+ } else {
28280+ task->thread.gsindex = 0;
28281+ task->thread.gs = addr;
28282+ if (doit) {
28283+ load_gs_index(0);
28284+ ret = HYPERVISOR_set_segment_base(
28285+ SEGBASE_GS_USER, addr);
28286+ }
28287+ }
28288+ put_cpu();
28289+ break;
28290+ case ARCH_SET_FS:
28291+ /* Not strictly needed for fs, but do it for symmetry
28292+ with gs */
28293+ if (addr >= TASK_SIZE_OF(task))
28294+ return -EPERM;
28295+ cpu = get_cpu();
28296+ /* handle small bases via the GDT because that's faster to
28297+ switch. */
28298+ if (addr <= 0xffffffff) {
28299+ set_32bit_tls(task, FS_TLS, addr);
28300+ if (doit) {
28301+ load_TLS(&task->thread, cpu);
28302+ asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
28303+ }
28304+ task->thread.fsindex = FS_TLS_SEL;
28305+ task->thread.fs = 0;
28306+ } else {
28307+ task->thread.fsindex = 0;
28308+ task->thread.fs = addr;
28309+ if (doit) {
28310+ /* set the selector to 0 to not confuse
28311+ __switch_to */
28312+ asm volatile("movl %0,%%fs" :: "r" (0));
28313+ ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
28314+ addr);
28315+ }
28316+ }
28317+ put_cpu();
28318+ break;
28319+ case ARCH_GET_FS: {
28320+ unsigned long base;
28321+ if (task->thread.fsindex == FS_TLS_SEL)
28322+ base = read_32bit_tls(task, FS_TLS);
28323+ else if (doit)
28324+ rdmsrl(MSR_FS_BASE, base);
28325+ else
28326+ base = task->thread.fs;
28327+ ret = put_user(base, (unsigned long __user *)addr);
28328+ break;
28329+ }
28330+ case ARCH_GET_GS: {
28331+ unsigned long base;
28332+ unsigned gsindex;
28333+ if (task->thread.gsindex == GS_TLS_SEL)
28334+ base = read_32bit_tls(task, GS_TLS);
28335+ else if (doit) {
28336+ asm("movl %%gs,%0" : "=r" (gsindex));
28337+ if (gsindex)
28338+ rdmsrl(MSR_KERNEL_GS_BASE, base);
28339+ else
28340+ base = task->thread.gs;
28341+ }
28342+ else
28343+ base = task->thread.gs;
28344+ ret = put_user(base, (unsigned long __user *)addr);
28345+ break;
28346+ }
28347+
28348+ default:
28349+ ret = -EINVAL;
28350+ break;
28351+ }
28352+
28353+ return ret;
28354+}
28355+
28356+long sys_arch_prctl(int code, unsigned long addr)
28357+{
28358+ return do_arch_prctl(current, code, addr);
28359+}
28360+
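do_arch_prctl()/sys_arch_prctl() above back the arch_prctl(2) system call that user space uses to read or set the FS/GS segment bases (with the small-base-via-GDT, large-base-via-hypercall split handled inside). Reading the bases from an ordinary process is harmless and shows the interface in use; a user-space sketch for x86-64 Linux (setting ARCH_SET_FS here would break the C library's TLS, so this only reads):

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/prctl.h>       /* ARCH_GET_FS, ARCH_GET_GS */

int main(void)
{
    unsigned long fs_base = 0, gs_base = 0;

    /* same entry point as sys_arch_prctl() in the hunk above */
    if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fs_base) != 0)
        perror("arch_prctl(ARCH_GET_FS)");
    if (syscall(SYS_arch_prctl, ARCH_GET_GS, &gs_base) != 0)
        perror("arch_prctl(ARCH_GET_GS)");

    printf("FS base: %#lx\nGS base: %#lx\n", fs_base, gs_base);
    return 0;
}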
28361+/*
28362+ * Capture the user space registers if the task is not running (in user space)
28363+ */
28364+int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
28365+{
28366+ struct pt_regs *pp, ptregs;
28367+
28368+ pp = task_pt_regs(tsk);
28369+
28370+ ptregs = *pp;
28371+ ptregs.cs &= 0xffff;
28372+ ptregs.ss &= 0xffff;
28373+
28374+ elf_core_copy_regs(regs, &ptregs);
28375+
28376+ boot_option_idle_override = 1;
28377+ return 1;
28378+}
28379+
28380+unsigned long arch_align_stack(unsigned long sp)
28381+{
28382+ if (randomize_va_space)
28383+ sp -= get_random_int() % 8192;
28384+ return sp & ~0xf;
28385+}
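arch_align_stack() above jitters the initial user stack pointer downward by up to 8 KiB when address-space randomisation is on, then rounds it to 16-byte alignment. The same arithmetic in a user-space sketch (illustrative only; rand() stands in for get_random_int()):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static unsigned long align_stack_sim(unsigned long sp, int randomize)
{
    if (randomize)
        sp -= (unsigned long)(rand() % 8192);  /* up to 8 KiB of downward jitter */
    return sp & ~0xfUL;                        /* keep 16-byte alignment */
}

int main(void)
{
    unsigned long sp = 0x7ffffffff000UL;       /* pretend top-of-stack */

    srand((unsigned)time(NULL));
    for (int i = 0; i < 3; i++)
        printf("aligned sp: %#lx\n", align_stack_sim(sp, 1));
    return 0;
}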
28386Index: head-2008-11-25/arch/x86/kernel/setup_64-xen.c
28387===================================================================
28388--- /dev/null 1970-01-01 00:00:00.000000000 +0000
28389+++ head-2008-11-25/arch/x86/kernel/setup_64-xen.c 2008-04-22 15:41:51.000000000 +0200
28390@@ -0,0 +1,1652 @@
28391+/*
28392+ * linux/arch/x86-64/kernel/setup.c
28393+ *
28394+ * Copyright (C) 1995 Linus Torvalds
28395+ *
28396+ * Nov 2001 Dave Jones <davej@suse.de>
28397+ * Forked from i386 setup code.
28398+ */
28399+
28400+/*
28401+ * This file handles the architecture-dependent parts of initialization
28402+ */
28403+
28404+#include <linux/errno.h>
28405+#include <linux/sched.h>
28406+#include <linux/kernel.h>
28407+#include <linux/mm.h>
28408+#include <linux/stddef.h>
28409+#include <linux/unistd.h>
28410+#include <linux/ptrace.h>
28411+#include <linux/slab.h>
28412+#include <linux/user.h>
28413+#include <linux/a.out.h>
28414+#include <linux/screen_info.h>
28415+#include <linux/ioport.h>
28416+#include <linux/delay.h>
28417+#include <linux/init.h>
28418+#include <linux/initrd.h>
28419+#include <linux/highmem.h>
28420+#include <linux/bootmem.h>
28421+#include <linux/module.h>
28422+#include <asm/processor.h>
28423+#include <linux/console.h>
28424+#include <linux/seq_file.h>
28425+#include <linux/crash_dump.h>
28426+#include <linux/root_dev.h>
28427+#include <linux/pci.h>
28428+#include <linux/acpi.h>
28429+#include <linux/kallsyms.h>
28430+#include <linux/edd.h>
28431+#include <linux/mmzone.h>
28432+#include <linux/kexec.h>
28433+#include <linux/cpufreq.h>
28434+#include <linux/dmi.h>
28435+#include <linux/dma-mapping.h>
28436+#include <linux/ctype.h>
28437+
28438+#include <asm/mtrr.h>
28439+#include <asm/uaccess.h>
28440+#include <asm/system.h>
28441+#include <asm/io.h>
28442+#include <asm/smp.h>
28443+#include <asm/msr.h>
28444+#include <asm/desc.h>
28445+#include <video/edid.h>
28446+#include <asm/e820.h>
28447+#include <asm/dma.h>
28448+#include <asm/mpspec.h>
28449+#include <asm/mmu_context.h>
28450+#include <asm/bootsetup.h>
28451+#include <asm/proto.h>
28452+#include <asm/setup.h>
28453+#include <asm/mach_apic.h>
28454+#include <asm/numa.h>
28455+#include <asm/sections.h>
28456+#include <asm/dmi.h>
28457+#ifdef CONFIG_XEN
28458+#include <linux/percpu.h>
28459+#include <xen/interface/physdev.h>
28460+#include "setup_arch_pre.h"
28461+#include <asm/hypervisor.h>
28462+#include <xen/interface/nmi.h>
28463+#include <xen/features.h>
28464+#include <xen/firmware.h>
28465+#include <xen/xencons.h>
28466+#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
28467+#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
28468+#include <asm/mach-xen/setup_arch_post.h>
28469+#include <xen/interface/memory.h>
28470+
28471+#ifdef CONFIG_XEN
28472+#include <xen/interface/kexec.h>
28473+#endif
28474+
28475+extern unsigned long start_pfn;
28476+extern struct edid_info edid_info;
28477+
28478+shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
28479+EXPORT_SYMBOL(HYPERVISOR_shared_info);
28480+
28481+extern char hypercall_page[PAGE_SIZE];
28482+EXPORT_SYMBOL(hypercall_page);
28483+
28484+static int xen_panic_event(struct notifier_block *, unsigned long, void *);
28485+static struct notifier_block xen_panic_block = {
28486+ xen_panic_event, NULL, 0 /* try to go last */
28487+};
28488+
28489+unsigned long *phys_to_machine_mapping;
28490+unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
28491+
28492+EXPORT_SYMBOL(phys_to_machine_mapping);
28493+
28494+DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
28495+DEFINE_PER_CPU(int, nr_multicall_ents);
28496+
28497+/* Raw start-of-day parameters from the hypervisor. */
28498+start_info_t *xen_start_info;
28499+EXPORT_SYMBOL(xen_start_info);
28500+#endif
28501+
28502+/*
28503+ * Machine setup..
28504+ */
28505+
28506+struct cpuinfo_x86 boot_cpu_data __read_mostly;
28507+EXPORT_SYMBOL(boot_cpu_data);
28508+
28509+unsigned long mmu_cr4_features;
28510+
28511+int acpi_disabled;
28512+EXPORT_SYMBOL(acpi_disabled);
28513+#ifdef CONFIG_ACPI
28514+extern int __initdata acpi_ht;
28515+extern acpi_interrupt_flags acpi_sci_flags;
28516+int __initdata acpi_force = 0;
28517+#endif
28518+
28519+int acpi_numa __initdata;
28520+
28521+/* Boot loader ID as an integer, for the benefit of proc_dointvec */
28522+int bootloader_type;
28523+
28524+unsigned long saved_video_mode;
28525+
28526+/*
28527+ * Early DMI memory
28528+ */
28529+int dmi_alloc_index;
28530+char dmi_alloc_data[DMI_MAX_DATA];
28531+
28532+/*
28533+ * Setup options
28534+ */
28535+struct screen_info screen_info;
28536+EXPORT_SYMBOL(screen_info);
28537+struct sys_desc_table_struct {
28538+ unsigned short length;
28539+ unsigned char table[0];
28540+};
28541+
28542+struct edid_info edid_info;
28543+EXPORT_SYMBOL_GPL(edid_info);
28544+struct e820map e820;
28545+#ifdef CONFIG_XEN
28546+struct e820map machine_e820;
28547+#endif
28548+
28549+extern int root_mountflags;
28550+
28551+char command_line[COMMAND_LINE_SIZE];
28552+
28553+struct resource standard_io_resources[] = {
28554+ { .name = "dma1", .start = 0x00, .end = 0x1f,
28555+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28556+ { .name = "pic1", .start = 0x20, .end = 0x21,
28557+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28558+ { .name = "timer0", .start = 0x40, .end = 0x43,
28559+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28560+ { .name = "timer1", .start = 0x50, .end = 0x53,
28561+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28562+ { .name = "keyboard", .start = 0x60, .end = 0x6f,
28563+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28564+ { .name = "dma page reg", .start = 0x80, .end = 0x8f,
28565+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28566+ { .name = "pic2", .start = 0xa0, .end = 0xa1,
28567+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28568+ { .name = "dma2", .start = 0xc0, .end = 0xdf,
28569+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28570+ { .name = "fpu", .start = 0xf0, .end = 0xff,
28571+ .flags = IORESOURCE_BUSY | IORESOURCE_IO }
28572+};
28573+
28574+#define STANDARD_IO_RESOURCES \
28575+ (sizeof standard_io_resources / sizeof standard_io_resources[0])
28576+
28577+#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
28578+
28579+struct resource data_resource = {
28580+ .name = "Kernel data",
28581+ .start = 0,
28582+ .end = 0,
28583+ .flags = IORESOURCE_RAM,
28584+};
28585+struct resource code_resource = {
28586+ .name = "Kernel code",
28587+ .start = 0,
28588+ .end = 0,
28589+ .flags = IORESOURCE_RAM,
28590+};
28591+
28592+#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
28593+
28594+static struct resource system_rom_resource = {
28595+ .name = "System ROM",
28596+ .start = 0xf0000,
28597+ .end = 0xfffff,
28598+ .flags = IORESOURCE_ROM,
28599+};
28600+
28601+static struct resource extension_rom_resource = {
28602+ .name = "Extension ROM",
28603+ .start = 0xe0000,
28604+ .end = 0xeffff,
28605+ .flags = IORESOURCE_ROM,
28606+};
28607+
28608+static struct resource adapter_rom_resources[] = {
28609+ { .name = "Adapter ROM", .start = 0xc8000, .end = 0,
28610+ .flags = IORESOURCE_ROM },
28611+ { .name = "Adapter ROM", .start = 0, .end = 0,
28612+ .flags = IORESOURCE_ROM },
28613+ { .name = "Adapter ROM", .start = 0, .end = 0,
28614+ .flags = IORESOURCE_ROM },
28615+ { .name = "Adapter ROM", .start = 0, .end = 0,
28616+ .flags = IORESOURCE_ROM },
28617+ { .name = "Adapter ROM", .start = 0, .end = 0,
28618+ .flags = IORESOURCE_ROM },
28619+ { .name = "Adapter ROM", .start = 0, .end = 0,
28620+ .flags = IORESOURCE_ROM }
28621+};
28622+
28623+#define ADAPTER_ROM_RESOURCES \
28624+ (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
28625+
28626+static struct resource video_rom_resource = {
28627+ .name = "Video ROM",
28628+ .start = 0xc0000,
28629+ .end = 0xc7fff,
28630+ .flags = IORESOURCE_ROM,
28631+};
28632+
28633+static struct resource video_ram_resource = {
28634+ .name = "Video RAM area",
28635+ .start = 0xa0000,
28636+ .end = 0xbffff,
28637+ .flags = IORESOURCE_RAM,
28638+};
28639+
28640+#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
28641+
28642+static int __init romchecksum(unsigned char *rom, unsigned long length)
28643+{
28644+ unsigned char *p, sum = 0;
28645+
28646+ for (p = rom; p < rom + length; p++)
28647+ sum += *p;
28648+ return sum == 0;
28649+}
28650+
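romchecksum() above accepts a ROM image only if all of its bytes sum to zero modulo 256 — the classic option-ROM checksum that probe_roms() below relies on. A small sketch that builds such an image and verifies it the same way (made-up data, not a real ROM):

#include <stdio.h>

/* same rule as romchecksum(): the byte sum over the region must be 0 mod 256 */
static int checksum_ok(const unsigned char *rom, unsigned long length)
{
    unsigned char sum = 0;
    for (const unsigned char *p = rom; p < rom + length; p++)
        sum += *p;
    return sum == 0;
}

int main(void)
{
    unsigned char rom[16] = { 0x55, 0xaa, 0x01, 0x12, 0x34 };  /* 0xaa55 signature plus payload */
    unsigned char sum = 0;

    for (int i = 0; i < 15; i++)
        sum += rom[i];
    rom[15] = (unsigned char)(0x100 - sum);    /* fix-up byte makes the total 0 mod 256 */

    printf("checksum %s\n", checksum_ok(rom, sizeof(rom)) ? "ok" : "bad");
    return 0;
}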
28651+static void __init probe_roms(void)
28652+{
28653+ unsigned long start, length, upper;
28654+ unsigned char *rom;
28655+ int i;
28656+
28657+#ifdef CONFIG_XEN
28658+ /* Nothing to do if not running in dom0. */
28659+ if (!is_initial_xendomain())
28660+ return;
28661+#endif
28662+
28663+ /* video rom */
28664+ upper = adapter_rom_resources[0].start;
28665+ for (start = video_rom_resource.start; start < upper; start += 2048) {
28666+ rom = isa_bus_to_virt(start);
28667+ if (!romsignature(rom))
28668+ continue;
28669+
28670+ video_rom_resource.start = start;
28671+
28672+ /* 0 < length <= 0x7f * 512, historically */
28673+ length = rom[2] * 512;
28674+
28675+ /* if checksum okay, trust length byte */
28676+ if (length && romchecksum(rom, length))
28677+ video_rom_resource.end = start + length - 1;
28678+
28679+ request_resource(&iomem_resource, &video_rom_resource);
28680+ break;
28681+ }
28682+
28683+ start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
28684+ if (start < upper)
28685+ start = upper;
28686+
28687+ /* system rom */
28688+ request_resource(&iomem_resource, &system_rom_resource);
28689+ upper = system_rom_resource.start;
28690+
28691+ /* check for extension rom (ignore length byte!) */
28692+ rom = isa_bus_to_virt(extension_rom_resource.start);
28693+ if (romsignature(rom)) {
28694+ length = extension_rom_resource.end - extension_rom_resource.start + 1;
28695+ if (romchecksum(rom, length)) {
28696+ request_resource(&iomem_resource, &extension_rom_resource);
28697+ upper = extension_rom_resource.start;
28698+ }
28699+ }
28700+
28701+ /* check for adapter roms on 2k boundaries */
28702+ for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
28703+ rom = isa_bus_to_virt(start);
28704+ if (!romsignature(rom))
28705+ continue;
28706+
28707+ /* 0 < length <= 0x7f * 512, historically */
28708+ length = rom[2] * 512;
28709+
28710+ /* but accept any length that fits if checksum okay */
28711+ if (!length || start + length > upper || !romchecksum(rom, length))
28712+ continue;
28713+
28714+ adapter_rom_resources[i].start = start;
28715+ adapter_rom_resources[i].end = start + length - 1;
28716+ request_resource(&iomem_resource, &adapter_rom_resources[i]);
28717+
28718+ start = adapter_rom_resources[i++].end & ~2047UL;
28719+ }
28720+}
28721+
28722+/* Check for full argument with no trailing characters */
28723+static int fullarg(char *p, char *arg)
28724+{
28725+ int l = strlen(arg);
28726+ return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l]));
28727+}
28728+
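fullarg() above matches a boot parameter only when the token is followed by whitespace or the end of the string, so that e.g. "acpi=off" is not also triggered by "acpi=offload". A quick user-space check of that behaviour (made-up strings):

#include <stdio.h>
#include <string.h>
#include <ctype.h>

/* same test as fullarg(): exact token, terminated by NUL or whitespace */
static int fullarg_sim(const char *p, const char *arg)
{
    size_t l = strlen(arg);
    return !memcmp(p, arg, l) && (p[l] == 0 || isspace((unsigned char)p[l]));
}

int main(void)
{
    printf("%d\n", fullarg_sim("acpi=off noapic", "acpi=off"));  /* 1: full match */
    printf("%d\n", fullarg_sim("acpi=offload",    "acpi=off"));  /* 0: trailing chars */
    printf("%d\n", fullarg_sim("acpi=off",        "acpi=off"));  /* 1: end of string */
    return 0;
}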
28729+static __init void parse_cmdline_early (char ** cmdline_p)
28730+{
28731+ char c = ' ', *to = command_line, *from = COMMAND_LINE;
28732+ int len = 0;
28733+ int userdef = 0;
28734+
28735+ for (;;) {
28736+ if (c != ' ')
28737+ goto next_char;
28738+
28739+#ifdef CONFIG_SMP
28740+ /*
28741+ * If the BIOS enumerates physical processors before logical,
28742+ * maxcpus=N at enumeration-time can be used to disable HT.
28743+ */
28744+ else if (!memcmp(from, "maxcpus=", 8)) {
28745+ extern unsigned int maxcpus;
28746+
28747+ maxcpus = simple_strtoul(from + 8, NULL, 0);
28748+ }
28749+#endif
28750+#ifdef CONFIG_ACPI
28751+ /* "acpi=off" disables both ACPI table parsing and interpreter init */
28752+ if (fullarg(from,"acpi=off"))
28753+ disable_acpi();
28754+
28755+ if (fullarg(from, "acpi=force")) {
28756+ /* add later when we do DMI horrors: */
28757+ acpi_force = 1;
28758+ acpi_disabled = 0;
28759+ }
28760+
28761+ /* acpi=ht just means: do ACPI MADT parsing
28762+ at bootup, but don't enable the full ACPI interpreter */
28763+ if (fullarg(from, "acpi=ht")) {
28764+ if (!acpi_force)
28765+ disable_acpi();
28766+ acpi_ht = 1;
28767+ }
28768+ else if (fullarg(from, "pci=noacpi"))
28769+ acpi_disable_pci();
28770+ else if (fullarg(from, "acpi=noirq"))
28771+ acpi_noirq_set();
28772+
28773+ else if (fullarg(from, "acpi_sci=edge"))
28774+ acpi_sci_flags.trigger = 1;
28775+ else if (fullarg(from, "acpi_sci=level"))
28776+ acpi_sci_flags.trigger = 3;
28777+ else if (fullarg(from, "acpi_sci=high"))
28778+ acpi_sci_flags.polarity = 1;
28779+ else if (fullarg(from, "acpi_sci=low"))
28780+ acpi_sci_flags.polarity = 3;
28781+
28782+ /* acpi=strict disables out-of-spec workarounds */
28783+ else if (fullarg(from, "acpi=strict")) {
28784+ acpi_strict = 1;
28785+ }
28786+#ifdef CONFIG_X86_IO_APIC
28787+ else if (fullarg(from, "acpi_skip_timer_override"))
28788+ acpi_skip_timer_override = 1;
28789+#endif
28790+#endif
28791+
28792+#ifndef CONFIG_XEN
28793+ if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) {
28794+ clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
28795+ disable_apic = 1;
28796+ }
28797+
28798+ if (fullarg(from, "noapic"))
28799+ skip_ioapic_setup = 1;
28800+
28801+ if (fullarg(from,"apic")) {
28802+ skip_ioapic_setup = 0;
28803+ ioapic_force = 1;
28804+ }
28805+#endif
28806+
28807+ if (!memcmp(from, "mem=", 4))
28808+ parse_memopt(from+4, &from);
28809+
28810+ if (!memcmp(from, "memmap=", 7)) {
28811+ /* exactmap option is for user defined memory */
28812+ if (!memcmp(from+7, "exactmap", 8)) {
28813+#ifdef CONFIG_CRASH_DUMP
28814+ /* If we are doing a crash dump, we
28815+ * still need to know the real mem
28816+ * size before original memory map is
28817+ * reset.
28818+ */
28819+ saved_max_pfn = e820_end_of_ram();
28820+#endif
28821+ from += 8+7;
28822+ end_pfn_map = 0;
28823+ e820.nr_map = 0;
28824+ userdef = 1;
28825+ }
28826+ else {
28827+ parse_memmapopt(from+7, &from);
28828+ userdef = 1;
28829+ }
28830+ }
28831+
28832+#ifdef CONFIG_NUMA
28833+ if (!memcmp(from, "numa=", 5))
28834+ numa_setup(from+5);
28835+#endif
28836+
28837+ if (!memcmp(from,"iommu=",6)) {
28838+ iommu_setup(from+6);
28839+ }
28840+
28841+ if (fullarg(from,"oops=panic"))
28842+ panic_on_oops = 1;
28843+
28844+ if (!memcmp(from, "noexec=", 7))
28845+ nonx_setup(from + 7);
28846+
28847+#ifdef CONFIG_KEXEC
28848+ /* crashkernel=size@addr specifies the location to reserve for
28849+ * a crash kernel. By reserving this memory we guarantee
28850+ * that linux never sets it up as a DMA target.
28851+ * Useful for holding code to do something appropriate
28852+ * after a kernel panic.
28853+ */
28854+ else if (!memcmp(from, "crashkernel=", 12)) {
28855+#ifndef CONFIG_XEN
28856+ unsigned long size, base;
28857+ size = memparse(from+12, &from);
28858+ if (*from == '@') {
28859+ base = memparse(from+1, &from);
28860+ /* FIXME: Do I want a sanity check
28861+ * to validate the memory range?
28862+ */
28863+ crashk_res.start = base;
28864+ crashk_res.end = base + size - 1;
28865+ }
28866+#else
28867+ printk("Ignoring crashkernel command line, "
28868+ "parameter will be supplied by xen\n");
28869+#endif
28870+ }
28871+#endif
28872+
28873+#ifdef CONFIG_PROC_VMCORE
28874+ /* elfcorehdr= specifies the location of elf core header
28875+ * stored by the crashed kernel. This option will be passed
28876+ * by kexec loader to the capture kernel.
28877+ */
28878+ else if(!memcmp(from, "elfcorehdr=", 11))
28879+ elfcorehdr_addr = memparse(from+11, &from);
28880+#endif
28881+
28882+#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
28883+ else if (!memcmp(from, "additional_cpus=", 16))
28884+ setup_additional_cpus(from+16);
28885+#endif
28886+
28887+ next_char:
28888+ c = *(from++);
28889+ if (!c)
28890+ break;
28891+ if (COMMAND_LINE_SIZE <= ++len)
28892+ break;
28893+ *(to++) = c;
28894+ }
28895+ if (userdef) {
28896+ printk(KERN_INFO "user-defined physical RAM map:\n");
28897+ e820_print_map("user");
28898+ }
28899+ *to = '\0';
28900+ *cmdline_p = command_line;
28901+}
28902+
28903+#ifndef CONFIG_NUMA
28904+static void __init
28905+contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
28906+{
28907+ unsigned long bootmap_size, bootmap;
28908+
28909+ bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
28910+ bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
28911+ if (bootmap == -1L)
28912+ panic("Cannot find bootmem map of size %ld\n",bootmap_size);
28913+ bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
28914+#ifdef CONFIG_XEN
28915+ e820_bootmem_free(NODE_DATA(0), 0, xen_start_info->nr_pages<<PAGE_SHIFT);
28916+#else
28917+ e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
28918+#endif
28919+ reserve_bootmem(bootmap, bootmap_size);
28920+}
28921+#endif
28922+
28923+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
28924+struct edd edd;
28925+#ifdef CONFIG_EDD_MODULE
28926+EXPORT_SYMBOL(edd);
28927+#endif
28928+#ifndef CONFIG_XEN
28929+/**
28930+ * copy_edd() - Copy the BIOS EDD information
28931+ * from boot_params into a safe place.
28932+ *
28933+ */
28934+static inline void copy_edd(void)
28935+{
28936+ memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
28937+ memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
28938+ edd.mbr_signature_nr = EDD_MBR_SIG_NR;
28939+ edd.edd_info_nr = EDD_NR;
28940+}
28941+#endif
28942+#else
28943+static inline void copy_edd(void)
28944+{
28945+}
28946+#endif
28947+
28948+#ifndef CONFIG_XEN
28949+#define EBDA_ADDR_POINTER 0x40E
28950+
28951+unsigned __initdata ebda_addr;
28952+unsigned __initdata ebda_size;
28953+
28954+static void discover_ebda(void)
28955+{
28956+ /*
28957+ * there is a real-mode segmented pointer pointing to the
28958+ * 4K EBDA area at 0x40E
28959+ */
28960+ ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER;
28961+ ebda_addr <<= 4;
28962+
28963+ ebda_size = *(unsigned short *)(unsigned long)ebda_addr;
28964+
28965+ /* Round EBDA up to pages */
28966+ if (ebda_size == 0)
28967+ ebda_size = 1;
28968+ ebda_size <<= 10;
28969+ ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
28970+ if (ebda_size > 64*1024)
28971+ ebda_size = 64*1024;
28972+}
28973+#else
28974+#define discover_ebda() ((void)0)
28975+#endif
28976+
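discover_ebda() above turns the real-mode segment stored at 0x40E into a linear address by shifting it left four bits, scales the EBDA's size word from KiB to bytes, rounds the region up to whole pages and caps it at 64 KiB. The same arithmetic in isolation (made-up values; no BIOS access):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

static unsigned long round_up_sim(unsigned long x, unsigned long a)
{
    return (x + a - 1) & ~(a - 1);
}

int main(void)
{
    unsigned long ebda_segment = 0x9fc0;                  /* typical value found at 0x40E */
    unsigned long ebda_addr = ebda_segment << 4;          /* real-mode segment -> linear */
    unsigned long ebda_kb = 1;                            /* first word of the EBDA: size in KiB */
    unsigned long ebda_size = ebda_kb << 10;

    ebda_size = round_up_sim(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
    if (ebda_size > 64 * 1024)
        ebda_size = 64 * 1024;

    printf("EBDA at %#lx, reserving %lu bytes\n", ebda_addr, ebda_size);
    return 0;
}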
28977+void __init setup_arch(char **cmdline_p)
28978+{
28979+#ifdef CONFIG_XEN
28980+ /* Register a call for panic conditions. */
28981+ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
28982+
28983+ ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
28984+ screen_info = SCREEN_INFO;
28985+
28986+ if (is_initial_xendomain()) {
28987+ const struct dom0_vga_console_info *info =
28988+ (void *)((char *)xen_start_info +
28989+ xen_start_info->console.dom0.info_off);
28990+
28991+ dom0_init_screen_info(info,
28992+ xen_start_info->console.dom0.info_size);
28993+ xen_start_info->console.domU.mfn = 0;
28994+ xen_start_info->console.domU.evtchn = 0;
28995+ } else
28996+ screen_info.orig_video_isVGA = 0;
28997+
28998+ copy_edid();
28999+
29000+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
29001+ VMASST_TYPE_writable_pagetables));
29002+
29003+ ARCH_SETUP
29004+#else
29005+ ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
29006+ screen_info = SCREEN_INFO;
29007+ edid_info = EDID_INFO;
29008+#endif /* !CONFIG_XEN */
29009+ saved_video_mode = SAVED_VIDEO_MODE;
29010+ bootloader_type = LOADER_TYPE;
29011+
29012+#ifdef CONFIG_BLK_DEV_RAM
29013+ rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
29014+ rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
29015+ rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
29016+#endif
29017+ setup_memory_region();
29018+ copy_edd();
29019+
29020+ if (!MOUNT_ROOT_RDONLY)
29021+ root_mountflags &= ~MS_RDONLY;
29022+ init_mm.start_code = (unsigned long) &_text;
29023+ init_mm.end_code = (unsigned long) &_etext;
29024+ init_mm.end_data = (unsigned long) &_edata;
29025+ init_mm.brk = (unsigned long) &_end;
29026+
29027+ code_resource.start = virt_to_phys(&_text);
29028+ code_resource.end = virt_to_phys(&_etext)-1;
29029+ data_resource.start = virt_to_phys(&_etext);
29030+ data_resource.end = virt_to_phys(&_edata)-1;
29031+
29032+ parse_cmdline_early(cmdline_p);
29033+
29034+ early_identify_cpu(&boot_cpu_data);
29035+
29036+ /*
29037+ * partially used pages are not usable - thus
29038+ * we are rounding upwards:
29039+ */
29040+ end_pfn = e820_end_of_ram();
29041+ num_physpages = end_pfn; /* for pfn_valid */
29042+
29043+ check_efer();
29044+
29045+ discover_ebda();
29046+
29047+ init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
29048+
29049+ if (is_initial_xendomain())
29050+ dmi_scan_machine();
29051+
29052+#ifdef CONFIG_ACPI_NUMA
29053+ /*
29054+ * Parse SRAT to discover nodes.
29055+ */
29056+ acpi_numa_init();
29057+#endif
29058+
29059+#ifdef CONFIG_NUMA
29060+ numa_initmem_init(0, end_pfn);
29061+#else
29062+ contig_initmem_init(0, end_pfn);
29063+#endif
29064+
29065+#ifdef CONFIG_XEN
29066+ /*
29067+ * Reserve kernel, physmap, start info, initial page tables, and
29068+ * direct mapping.
29069+ */
29070+ reserve_bootmem_generic(__pa_symbol(&_text),
29071+ (table_end << PAGE_SHIFT) - __pa_symbol(&_text));
29072+#else
29073+ /* Reserve direct mapping */
29074+ reserve_bootmem_generic(table_start << PAGE_SHIFT,
29075+ (table_end - table_start) << PAGE_SHIFT);
29076+
29077+ /* reserve kernel */
29078+ reserve_bootmem_generic(__pa_symbol(&_text),
29079+ __pa_symbol(&_end) - __pa_symbol(&_text));
29080+
29081+ /*
29082+ * reserve physical page 0 - it's a special BIOS page on many boxes,
29083+ * enabling clean reboots, SMP operation, laptop functions.
29084+ */
29085+ reserve_bootmem_generic(0, PAGE_SIZE);
29086+
29087+ /* reserve ebda region */
29088+ if (ebda_addr)
29089+ reserve_bootmem_generic(ebda_addr, ebda_size);
29090+
29091+#ifdef CONFIG_SMP
29092+ /*
29093+ * But first pinch a few for the stack/trampoline stuff
29094+ * FIXME: Don't need the extra page at 4K, but need to fix
29095+ * trampoline before removing it. (see the GDT stuff)
29096+ */
29097+ reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
29098+
29099+ /* Reserve SMP trampoline */
29100+ reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
29101+#endif
29102+#endif
29103+
29104+#ifdef CONFIG_ACPI_SLEEP
29105+ /*
29106+ * Reserve low memory region for sleep support.
29107+ */
29108+ acpi_reserve_bootmem();
29109+#endif
29110+#ifdef CONFIG_XEN
29111+#ifdef CONFIG_BLK_DEV_INITRD
29112+ if (xen_start_info->mod_start) {
29113+ if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
29114+ /*reserve_bootmem_generic(INITRD_START, INITRD_SIZE);*/
29115+ initrd_start = INITRD_START + PAGE_OFFSET;
29116+ initrd_end = initrd_start+INITRD_SIZE;
29117+ initrd_below_start_ok = 1;
29118+ } else {
29119+ printk(KERN_ERR "initrd extends beyond end of memory "
29120+ "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
29121+ (unsigned long)(INITRD_START + INITRD_SIZE),
29122+ (unsigned long)(end_pfn << PAGE_SHIFT));
29123+ initrd_start = 0;
29124+ }
29125+ }
29126+#endif
29127+#else /* CONFIG_XEN */
29128+#ifdef CONFIG_BLK_DEV_INITRD
29129+ if (LOADER_TYPE && INITRD_START) {
29130+ if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
29131+ reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
29132+ initrd_start =
29133+ INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
29134+ initrd_end = initrd_start+INITRD_SIZE;
29135+ }
29136+ else {
29137+ printk(KERN_ERR "initrd extends beyond end of memory "
29138+ "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
29139+ (unsigned long)(INITRD_START + INITRD_SIZE),
29140+ (unsigned long)(end_pfn << PAGE_SHIFT));
29141+ initrd_start = 0;
29142+ }
29143+ }
29144+#endif
29145+#endif /* !CONFIG_XEN */
29146+#ifdef CONFIG_KEXEC
29147+#ifdef CONFIG_XEN
29148+ xen_machine_kexec_setup_resources();
29149+#else
29150+ if (crashk_res.start != crashk_res.end) {
29151+ reserve_bootmem_generic(crashk_res.start,
29152+ crashk_res.end - crashk_res.start + 1);
29153+ }
29154+#endif
29155+#endif
29156+
29157+ paging_init();
29158+#ifdef CONFIG_X86_LOCAL_APIC
29159+ /*
29160+ * Find and reserve possible boot-time SMP configuration:
29161+ */
29162+ find_smp_config();
29163+#endif
29164+#ifdef CONFIG_XEN
29165+ {
29166+ int i, j, k, fpp;
29167+ unsigned long p2m_pages;
29168+
29169+ p2m_pages = end_pfn;
29170+ if (xen_start_info->nr_pages > end_pfn) {
29171+ /*
29172+ * the end_pfn was shrunk (probably by mem= or highmem=
29173+ * kernel parameter); shrink reservation with the HV
29174+ */
29175+ struct xen_memory_reservation reservation = {
29176+ .address_bits = 0,
29177+ .extent_order = 0,
29178+ .domid = DOMID_SELF
29179+ };
29180+ unsigned int difference;
29181+ int ret;
29182+
29183+ difference = xen_start_info->nr_pages - end_pfn;
29184+
29185+ set_xen_guest_handle(reservation.extent_start,
29186+ ((unsigned long *)xen_start_info->mfn_list) + end_pfn);
29187+ reservation.nr_extents = difference;
29188+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
29189+ &reservation);
29190+ BUG_ON (ret != difference);
29191+ }
29192+ else if (end_pfn > xen_start_info->nr_pages)
29193+ p2m_pages = xen_start_info->nr_pages;
29194+
29195+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
29196+ /* Make sure we have a large enough P->M table. */
29197+ phys_to_machine_mapping = alloc_bootmem_pages(
29198+ end_pfn * sizeof(unsigned long));
29199+ memset(phys_to_machine_mapping, ~0,
29200+ end_pfn * sizeof(unsigned long));
29201+ memcpy(phys_to_machine_mapping,
29202+ (unsigned long *)xen_start_info->mfn_list,
29203+ p2m_pages * sizeof(unsigned long));
29204+ free_bootmem(
29205+ __pa(xen_start_info->mfn_list),
29206+ PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
29207+ sizeof(unsigned long))));
29208+
29209+ /*
29210+ * Initialise the list of frames that specifies the
29211+ * list of frames that make up the p2m table. Used by
29212+ * save/restore.
29213+ */
29214+ pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
29215+
29216+ fpp = PAGE_SIZE/sizeof(unsigned long);
29217+ for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
29218+ if ((j % fpp) == 0) {
29219+ k++;
29220+ BUG_ON(k>=fpp);
29221+ pfn_to_mfn_frame_list[k] =
29222+ alloc_bootmem_pages(PAGE_SIZE);
29223+ pfn_to_mfn_frame_list_list[k] =
29224+ virt_to_mfn(pfn_to_mfn_frame_list[k]);
29225+ j=0;
29226+ }
29227+ pfn_to_mfn_frame_list[k][j] =
29228+ virt_to_mfn(&phys_to_machine_mapping[i]);
29229+ }
29230+ HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
29231+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
29232+ virt_to_mfn(pfn_to_mfn_frame_list_list);
29233+ }
29234+
29235+ /* Mark all ISA DMA channels in-use - using them wouldn't work. */
29236+ for (i = 0; i < MAX_DMA_CHANNELS; ++i)
29237+ if (i != 4 && request_dma(i, "xen") != 0)
29238+ BUG();
29239+ }
29240+
29241+ if (!is_initial_xendomain()) {
29242+ acpi_disabled = 1;
29243+#ifdef CONFIG_ACPI
29244+ acpi_ht = 0;
29245+#endif
29246+ }
29247+#endif
29248+
29249+#ifndef CONFIG_XEN
29250+ check_ioapic();
29251+#endif
29252+
29253+ zap_low_mappings(0);
29254+
29255+ /*
29256+ * set this early, so we don't allocate cpu0
29257+ * if MADT list doesn't list BSP first
29258+ * mpparse.c/MP_processor_info() allocates logical cpu numbers.
29259+ */
29260+ cpu_set(0, cpu_present_map);
29261+#ifdef CONFIG_ACPI
29262+ /*
29263+ * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
29264+ * Call this early for SRAT node setup.
29265+ */
29266+ acpi_boot_table_init();
29267+
29268+ /*
29269+ * Read APIC and some other early information from ACPI tables.
29270+ */
29271+ acpi_boot_init();
29272+#endif
29273+
29274+ init_cpu_to_node();
29275+
29276+#ifdef CONFIG_X86_LOCAL_APIC
29277+ /*
29278+ * get boot-time SMP configuration:
29279+ */
29280+ if (smp_found_config)
29281+ get_smp_config();
29282+#ifndef CONFIG_XEN
29283+ init_apic_mappings();
29284+#endif
29285+#endif
29286+#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
29287+ prefill_possible_map();
29288+#endif
29289+
29290+ /*
29291+ * Request address space for all standard RAM and ROM resources
29292+ * and also for regions reported as reserved by the e820.
29293+ */
29294+ probe_roms();
29295+#ifdef CONFIG_XEN
29296+ if (is_initial_xendomain())
29297+ e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
29298+#else
29299+ e820_reserve_resources(e820.map, e820.nr_map);
29300+#endif
29301+
29302+ request_resource(&iomem_resource, &video_ram_resource);
29303+
29304+ {
29305+ unsigned i;
29306+ /* request I/O space for devices used on all i[345]86 PCs */
29307+ for (i = 0; i < STANDARD_IO_RESOURCES; i++)
29308+ request_resource(&ioport_resource, &standard_io_resources[i]);
29309+ }
29310+
29311+#ifdef CONFIG_XEN
29312+ if (is_initial_xendomain())
29313+ e820_setup_gap(machine_e820.map, machine_e820.nr_map);
29314+#else
29315+ e820_setup_gap(e820.map, e820.nr_map);
29316+#endif
29317+
29318+#ifdef CONFIG_XEN
29319+ {
29320+ struct physdev_set_iopl set_iopl;
29321+
29322+ set_iopl.iopl = 1;
29323+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
29324+
29325+ if (is_initial_xendomain()) {
29326+#ifdef CONFIG_VT
29327+#if defined(CONFIG_VGA_CONSOLE)
29328+ conswitchp = &vga_con;
29329+#elif defined(CONFIG_DUMMY_CONSOLE)
29330+ conswitchp = &dummy_con;
29331+#endif
29332+#endif
29333+ } else {
29334+#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
29335+ conswitchp = &dummy_con;
29336+#endif
29337+ }
29338+ }
29339+#else /* CONFIG_XEN */
29340+
29341+#ifdef CONFIG_VT
29342+#if defined(CONFIG_VGA_CONSOLE)
29343+ conswitchp = &vga_con;
29344+#elif defined(CONFIG_DUMMY_CONSOLE)
29345+ conswitchp = &dummy_con;
29346+#endif
29347+#endif
29348+
29349+#endif /* !CONFIG_XEN */
29350+}
29351+
29352+#ifdef CONFIG_XEN
29353+static int
29354+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
29355+{
29356+ HYPERVISOR_shutdown(SHUTDOWN_crash);
29357+ /* we're never actually going to get here... */
29358+ return NOTIFY_DONE;
29359+}
29360+#endif /* CONFIG_XEN */
29361+
29362+
29363+static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
29364+{
29365+ unsigned int *v;
29366+
29367+ if (c->extended_cpuid_level < 0x80000004)
29368+ return 0;
29369+
29370+ v = (unsigned int *) c->x86_model_id;
29371+ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
29372+ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
29373+ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
29374+ c->x86_model_id[48] = 0;
29375+ return 1;
29376+}
29377+
29378+
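get_model_name() above pulls the 48-character processor brand string out of CPUID leaves 0x80000002-0x80000004. The same data is available from user space through GCC/Clang's <cpuid.h> helpers; a sketch for x86 (not part of the patch):

#include <stdio.h>
#include <string.h>
#include <cpuid.h>

int main(void)
{
    unsigned int v[12];
    char model_id[49];

    /* the brand string only exists if the extended CPUID level reaches 0x80000004 */
    if (__get_cpuid_max(0x80000000, NULL) < 0x80000004) {
        puts("no brand string");
        return 1;
    }
    __get_cpuid(0x80000002, &v[0], &v[1], &v[2],  &v[3]);
    __get_cpuid(0x80000003, &v[4], &v[5], &v[6],  &v[7]);
    __get_cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);

    memcpy(model_id, v, 48);
    model_id[48] = '\0';
    printf("%s\n", model_id);
    return 0;
}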
29379+static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
29380+{
29381+ unsigned int n, dummy, eax, ebx, ecx, edx;
29382+
29383+ n = c->extended_cpuid_level;
29384+
29385+ if (n >= 0x80000005) {
29386+ cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
29387+ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
29388+ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
29389+ c->x86_cache_size=(ecx>>24)+(edx>>24);
29390+ /* On K8 L1 TLB is inclusive, so don't count it */
29391+ c->x86_tlbsize = 0;
29392+ }
29393+
29394+ if (n >= 0x80000006) {
29395+ cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
29396+ ecx = cpuid_ecx(0x80000006);
29397+ c->x86_cache_size = ecx >> 16;
29398+ c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
29399+
29400+ printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
29401+ c->x86_cache_size, ecx & 0xFF);
29402+ }
29403+
29404+ if (n >= 0x80000007)
29405+ cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
29406+ if (n >= 0x80000008) {
29407+ cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
29408+ c->x86_virt_bits = (eax >> 8) & 0xff;
29409+ c->x86_phys_bits = eax & 0xff;
29410+ }
29411+}
29412+
29413+#ifdef CONFIG_NUMA
29414+static int nearby_node(int apicid)
29415+{
29416+ int i;
29417+ for (i = apicid - 1; i >= 0; i--) {
29418+ int node = apicid_to_node[i];
29419+ if (node != NUMA_NO_NODE && node_online(node))
29420+ return node;
29421+ }
29422+ for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
29423+ int node = apicid_to_node[i];
29424+ if (node != NUMA_NO_NODE && node_online(node))
29425+ return node;
29426+ }
29427+ return first_node(node_online_map); /* Shouldn't happen */
29428+}
29429+#endif
29430+
29431+/*
29432+ * On an AMD dual core setup the lower bits of the APIC id distinguish the cores.
29433+ * Assumes number of cores is a power of two.
29434+ */
29435+static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
29436+{
29437+#ifdef CONFIG_SMP
29438+ unsigned bits;
29439+#ifdef CONFIG_NUMA
29440+ int cpu = smp_processor_id();
29441+ int node = 0;
29442+ unsigned apicid = hard_smp_processor_id();
29443+#endif
29444+ unsigned ecx = cpuid_ecx(0x80000008);
29445+
29446+ c->x86_max_cores = (ecx & 0xff) + 1;
29447+
29448+ /* CPU telling us the core id bits shift? */
29449+ bits = (ecx >> 12) & 0xF;
29450+
29451+ /* Otherwise recompute */
29452+ if (bits == 0) {
29453+ while ((1 << bits) < c->x86_max_cores)
29454+ bits++;
29455+ }
29456+
29457+ /* Low order bits define the core id (index of core in socket) */
29458+ c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
29459+ /* Convert the APIC ID into the socket ID */
29460+ c->phys_proc_id = phys_pkg_id(bits);
29461+
29462+#ifdef CONFIG_NUMA
29463+ node = c->phys_proc_id;
29464+ if (apicid_to_node[apicid] != NUMA_NO_NODE)
29465+ node = apicid_to_node[apicid];
29466+ if (!node_online(node)) {
29467+ /* Two possibilities here:
29468+ - The CPU is missing memory and no node was created.
29469+ In that case try picking one from a nearby CPU
29470+ - The APIC IDs differ from the HyperTransport node IDs
29471+ which the K8 northbridge parsing fills in.
29472+ Assume they are all increased by a constant offset,
29473+ but in the same order as the HT nodeids.
29474+ If that doesn't result in a usable node fall back to the
29475+ path for the previous case. */
29476+ int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits);
29477+ if (ht_nodeid >= 0 &&
29478+ apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
29479+ node = apicid_to_node[ht_nodeid];
29480+ /* Pick a nearby node */
29481+ if (!node_online(node))
29482+ node = nearby_node(apicid);
29483+ }
29484+ numa_set_node(cpu, node);
29485+
29486+ printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
29487+#endif
29488+#endif
29489+}
29490+
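amd_detect_cmp() above splits the APIC id: it works out how many low bits are needed to number the cores in a package (from the core count reported by CPUID 0x80000008) and masks those off as the core id, leaving the upper bits to identify the package. The bit arithmetic with made-up values (a sketch, not the patch's exact code path):

#include <stdio.h>

int main(void)
{
    unsigned max_cores = 4;          /* cores per package, e.g. from CPUID 0x80000008 */
    unsigned apicid = 0x13;          /* made-up APIC id */
    unsigned bits = 0;

    while ((1u << bits) < max_cores) /* same recomputation as in amd_detect_cmp() */
        bits++;

    unsigned core_id = apicid & ((1u << bits) - 1);   /* low bits: core within the package */
    unsigned pkg_id  = apicid >> bits;                /* remaining bits: package id */

    printf("apicid %#x -> package %u, core %u\n", apicid, pkg_id, core_id);
    return 0;
}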
29491+static void __init init_amd(struct cpuinfo_x86 *c)
29492+{
29493+ unsigned level;
29494+
29495+#ifdef CONFIG_SMP
29496+ unsigned long value;
29497+
29498+ /*
29499+ * Disable TLB flush filter by setting HWCR.FFDIS on K8
29500+ * bit 6 of msr C001_0015
29501+ *
29502+ * Errata 63 for SH-B3 steppings
29503+ * Errata 122 for all steppings (F+ have it disabled by default)
29504+ */
29505+ if (c->x86 == 15) {
29506+ rdmsrl(MSR_K8_HWCR, value);
29507+ value |= 1 << 6;
29508+ wrmsrl(MSR_K8_HWCR, value);
29509+ }
29510+#endif
29511+
29512+ /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
29513+ 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
29514+ clear_bit(0*32+31, &c->x86_capability);
29515+
29516+ /* On C+ stepping K8 rep microcode works well for copy/memset */
29517+ level = cpuid_eax(1);
29518+ if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
29519+ set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
29520+
29521+ /* Enable workaround for FXSAVE leak */
29522+ if (c->x86 >= 6)
29523+ set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
29524+
29525+ level = get_model_name(c);
29526+ if (!level) {
29527+ switch (c->x86) {
29528+ case 15:
29529+ /* Should distinguish Models here, but this is only
29530+ a fallback anyways. */
29531+ strcpy(c->x86_model_id, "Hammer");
29532+ break;
29533+ }
29534+ }
29535+ display_cacheinfo(c);
29536+
29537+ /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
29538+ if (c->x86_power & (1<<8))
29539+ set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
29540+
29541+ /* Multi core CPU? */
29542+ if (c->extended_cpuid_level >= 0x80000008)
29543+ amd_detect_cmp(c);
29544+
29545+ /* Fix cpuid4 emulation for more */
29546+ num_cache_leaves = 3;
29547+}
29548+
29549+static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
29550+{
29551+#ifdef CONFIG_SMP
29552+ u32 eax, ebx, ecx, edx;
29553+ int index_msb, core_bits;
29554+
29555+ cpuid(1, &eax, &ebx, &ecx, &edx);
29556+
29557+
29558+ if (!cpu_has(c, X86_FEATURE_HT))
29559+ return;
29560+ if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
29561+ goto out;
29562+
29563+ smp_num_siblings = (ebx & 0xff0000) >> 16;
29564+
29565+ if (smp_num_siblings == 1) {
29566+ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
29567+ } else if (smp_num_siblings > 1 ) {
29568+
29569+ if (smp_num_siblings > NR_CPUS) {
29570+ printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
29571+ smp_num_siblings = 1;
29572+ return;
29573+ }
29574+
29575+ index_msb = get_count_order(smp_num_siblings);
29576+ c->phys_proc_id = phys_pkg_id(index_msb);
29577+
29578+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
29579+
29580+ index_msb = get_count_order(smp_num_siblings) ;
29581+
29582+ core_bits = get_count_order(c->x86_max_cores);
29583+
29584+ c->cpu_core_id = phys_pkg_id(index_msb) &
29585+ ((1 << core_bits) - 1);
29586+ }
29587+out:
29588+ if ((c->x86_max_cores * smp_num_siblings) > 1) {
29589+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
29590+ printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id);
29591+ }
29592+
29593+#endif
29594+}
29595+
29596+/*
29597+ * find out the number of processor cores on the die
29598+ */
29599+static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
29600+{
29601+ unsigned int eax, t;
29602+
29603+ if (c->cpuid_level < 4)
29604+ return 1;
29605+
29606+ cpuid_count(4, 0, &eax, &t, &t, &t);
29607+
29608+ if (eax & 0x1f)
29609+ return ((eax >> 26) + 1);
29610+ else
29611+ return 1;
29612+}
29613+
29614+static void srat_detect_node(void)
29615+{
29616+#ifdef CONFIG_NUMA
29617+ unsigned node;
29618+ int cpu = smp_processor_id();
29619+ int apicid = hard_smp_processor_id();
29620+
29621+ /* Don't do the funky fallback heuristics the AMD version employs
29622+ for now. */
29623+ node = apicid_to_node[apicid];
29624+ if (node == NUMA_NO_NODE)
29625+ node = first_node(node_online_map);
29626+ numa_set_node(cpu, node);
29627+
29628+ if (acpi_numa > 0)
29629+ printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
29630+#endif
29631+}
29632+
29633+static void __cpuinit init_intel(struct cpuinfo_x86 *c)
29634+{
29635+ /* Cache sizes */
29636+ unsigned n;
29637+
29638+ init_intel_cacheinfo(c);
29639+	if (c->cpuid_level > 9) {
29640+ unsigned eax = cpuid_eax(10);
29641+ /* Check for version and the number of counters */
29642+ if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
29643+ set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
29644+ }
29645+
29646+ n = c->extended_cpuid_level;
29647+ if (n >= 0x80000008) {
29648+ unsigned eax = cpuid_eax(0x80000008);
29649+ c->x86_virt_bits = (eax >> 8) & 0xff;
29650+ c->x86_phys_bits = eax & 0xff;
29651+ /* CPUID workaround for Intel 0F34 CPU */
29652+ if (c->x86_vendor == X86_VENDOR_INTEL &&
29653+ c->x86 == 0xF && c->x86_model == 0x3 &&
29654+ c->x86_mask == 0x4)
29655+ c->x86_phys_bits = 36;
29656+ }
29657+
29658+ if (c->x86 == 15)
29659+ c->x86_cache_alignment = c->x86_clflush_size * 2;
29660+ if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
29661+ (c->x86 == 0x6 && c->x86_model >= 0x0e))
29662+ set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
29663+ set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
29664+ c->x86_max_cores = intel_num_cpu_cores(c);
29665+
29666+ srat_detect_node();
29667+}
29668+
29669+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
29670+{
29671+ char *v = c->x86_vendor_id;
29672+
29673+ if (!strcmp(v, "AuthenticAMD"))
29674+ c->x86_vendor = X86_VENDOR_AMD;
29675+ else if (!strcmp(v, "GenuineIntel"))
29676+ c->x86_vendor = X86_VENDOR_INTEL;
29677+ else
29678+ c->x86_vendor = X86_VENDOR_UNKNOWN;
29679+}
29680+
29681+struct cpu_model_info {
29682+ int vendor;
29683+ int family;
29684+ char *model_names[16];
29685+};
29686+
29687+/* Do some early cpuid on the boot CPU to get some parameters that are
29688+ needed before check_bugs. Everything advanced is in identify_cpu
29689+ below. */
29690+void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
29691+{
29692+ u32 tfms;
29693+
29694+ c->loops_per_jiffy = loops_per_jiffy;
29695+ c->x86_cache_size = -1;
29696+ c->x86_vendor = X86_VENDOR_UNKNOWN;
29697+ c->x86_model = c->x86_mask = 0; /* So far unknown... */
29698+ c->x86_vendor_id[0] = '\0'; /* Unset */
29699+ c->x86_model_id[0] = '\0'; /* Unset */
29700+ c->x86_clflush_size = 64;
29701+ c->x86_cache_alignment = c->x86_clflush_size;
29702+ c->x86_max_cores = 1;
29703+ c->extended_cpuid_level = 0;
29704+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
29705+
29706+ /* Get vendor name */
29707+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
29708+ (unsigned int *)&c->x86_vendor_id[0],
29709+ (unsigned int *)&c->x86_vendor_id[8],
29710+ (unsigned int *)&c->x86_vendor_id[4]);
29711+
29712+ get_cpu_vendor(c);
29713+
29714+ /* Initialize the standard set of capabilities */
29715+ /* Note that the vendor-specific code below might override */
29716+
29717+ /* Intel-defined flags: level 0x00000001 */
29718+ if (c->cpuid_level >= 0x00000001) {
29719+ __u32 misc;
29720+ cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
29721+ &c->x86_capability[0]);
29722+ c->x86 = (tfms >> 8) & 0xf;
29723+ c->x86_model = (tfms >> 4) & 0xf;
29724+ c->x86_mask = tfms & 0xf;
29725+ if (c->x86 == 0xf)
29726+ c->x86 += (tfms >> 20) & 0xff;
29727+ if (c->x86 >= 0x6)
29728+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
29729+ if (c->x86_capability[0] & (1<<19))
29730+ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
29731+ } else {
29732+ /* Have CPUID level 0 only - unheard of */
29733+ c->x86 = 4;
29734+ }
29735+
29736+#ifdef CONFIG_SMP
29737+ c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
29738+#endif
29739+}
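
The family/model/stepping decode above can be exercised from user space as well; here is a minimal sketch, assuming GCC or Clang's <cpuid.h> (illustrative only, not kernel code), that applies the same leaf-1 bit layout, including the extended family/model adjustments.

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int family, model, stepping;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;

	family   = (eax >> 8) & 0xf;
	model    = (eax >> 4) & 0xf;
	stepping = eax & 0xf;
	if (family == 0xf)
		family += (eax >> 20) & 0xff;		/* extended family */
	if (family >= 0x6)
		model += ((eax >> 16) & 0xf) << 4;	/* extended model */

	printf("family 0x%x model 0x%x stepping 0x%x\n", family, model, stepping);
	return 0;
}
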
29740+
29741+/*
29742+ * This does the hard work of actually picking apart the CPU stuff...
29743+ */
29744+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
29745+{
29746+ int i;
29747+ u32 xlvl;
29748+
29749+ early_identify_cpu(c);
29750+
29751+ /* AMD-defined flags: level 0x80000001 */
29752+ xlvl = cpuid_eax(0x80000000);
29753+ c->extended_cpuid_level = xlvl;
29754+ if ((xlvl & 0xffff0000) == 0x80000000) {
29755+ if (xlvl >= 0x80000001) {
29756+ c->x86_capability[1] = cpuid_edx(0x80000001);
29757+ c->x86_capability[6] = cpuid_ecx(0x80000001);
29758+ }
29759+ if (xlvl >= 0x80000004)
29760+ get_model_name(c); /* Default name */
29761+ }
29762+
29763+ /* Transmeta-defined flags: level 0x80860001 */
29764+ xlvl = cpuid_eax(0x80860000);
29765+ if ((xlvl & 0xffff0000) == 0x80860000) {
29766+		/* Don't set extended_cpuid_level here for now, to avoid confusion. */
29767+ if (xlvl >= 0x80860001)
29768+ c->x86_capability[2] = cpuid_edx(0x80860001);
29769+ }
29770+
29771+ c->apicid = phys_pkg_id(0);
29772+
29773+ /*
29774+ * Vendor-specific initialization. In this section we
29775+ * canonicalize the feature flags, meaning if there are
29776+ * features a certain CPU supports which CPUID doesn't
29777+ * tell us, CPUID claiming incorrect flags, or other bugs,
29778+ * we handle them here.
29779+ *
29780+ * At the end of this section, c->x86_capability better
29781+ * indicate the features this CPU genuinely supports!
29782+ */
29783+ switch (c->x86_vendor) {
29784+ case X86_VENDOR_AMD:
29785+ init_amd(c);
29786+ break;
29787+
29788+ case X86_VENDOR_INTEL:
29789+ init_intel(c);
29790+ break;
29791+
29792+ case X86_VENDOR_UNKNOWN:
29793+ default:
29794+ display_cacheinfo(c);
29795+ break;
29796+ }
29797+
29798+ select_idle_routine(c);
29799+ detect_ht(c);
29800+
29801+ /*
29802+ * On SMP, boot_cpu_data holds the common feature set between
29803+ * all CPUs; so make sure that we indicate which features are
29804+ * common between the CPUs. The first time this routine gets
29805+ * executed, c == &boot_cpu_data.
29806+ */
29807+ if (c != &boot_cpu_data) {
29808+ /* AND the already accumulated flags with these */
29809+ for (i = 0 ; i < NCAPINTS ; i++)
29810+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
29811+ }
29812+
29813+#ifdef CONFIG_X86_MCE
29814+ mcheck_init(c);
29815+#endif
29816+ if (c == &boot_cpu_data)
29817+ mtrr_bp_init();
29818+ else
29819+ mtrr_ap_init();
29820+#ifdef CONFIG_NUMA
29821+ numa_add_cpu(smp_processor_id());
29822+#endif
29823+}
29824+
29825+
29826+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
29827+{
29828+ if (c->x86_model_id[0])
29829+ printk("%s", c->x86_model_id);
29830+
29831+ if (c->x86_mask || c->cpuid_level >= 0)
29832+ printk(" stepping %02x\n", c->x86_mask);
29833+ else
29834+ printk("\n");
29835+}
29836+
29837+/*
29838+ * Get CPU information for use by the procfs.
29839+ */
29840+
29841+static int show_cpuinfo(struct seq_file *m, void *v)
29842+{
29843+ struct cpuinfo_x86 *c = v;
29844+
29845+ /*
29846+ * These flag bits must match the definitions in <asm/cpufeature.h>.
29847+ * NULL means this bit is undefined or reserved; either way it doesn't
29848+ * have meaning as far as Linux is concerned. Note that it's important
29849+ * to realize there is a difference between this table and CPUID -- if
29850+ * applications want to get the raw CPUID data, they should access
29851+ * /dev/cpu/<cpu_nr>/cpuid instead.
29852+ */
29853+ static char *x86_cap_flags[] = {
29854+ /* Intel-defined */
29855+ "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
29856+ "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
29857+ "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
29858+ "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL,
29859+
29860+ /* AMD-defined */
29861+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29862+ NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
29863+ NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
29864+ NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow",
29865+
29866+ /* Transmeta-defined */
29867+ "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
29868+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29869+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29870+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29871+
29872+ /* Other (Linux-defined) */
29873+ "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL,
29874+ "constant_tsc", NULL, NULL,
29875+ "up", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29876+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29877+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29878+
29879+ /* Intel-defined (#2) */
29880+ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
29881+ "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
29882+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29883+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29884+
29885+ /* VIA/Cyrix/Centaur-defined */
29886+ NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
29887+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29888+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29889+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29890+
29891+ /* AMD-defined (#2) */
29892+ "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL,
29893+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29894+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29895+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29896+ };
29897+ static char *x86_power_flags[] = {
29898+ "ts", /* temperature sensor */
29899+ "fid", /* frequency id control */
29900+ "vid", /* voltage id control */
29901+ "ttp", /* thermal trip */
29902+ "tm",
29903+ "stc",
29904+ NULL,
29905+ /* nothing */ /* constant_tsc - moved to flags */
29906+ };
29907+
29908+
29909+#ifdef CONFIG_SMP
29910+ if (!cpu_online(c-cpu_data))
29911+ return 0;
29912+#endif
29913+
29914+ seq_printf(m,"processor\t: %u\n"
29915+ "vendor_id\t: %s\n"
29916+ "cpu family\t: %d\n"
29917+ "model\t\t: %d\n"
29918+ "model name\t: %s\n",
29919+ (unsigned)(c-cpu_data),
29920+ c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
29921+ c->x86,
29922+ (int)c->x86_model,
29923+ c->x86_model_id[0] ? c->x86_model_id : "unknown");
29924+
29925+ if (c->x86_mask || c->cpuid_level >= 0)
29926+ seq_printf(m, "stepping\t: %d\n", c->x86_mask);
29927+ else
29928+ seq_printf(m, "stepping\t: unknown\n");
29929+
29930+ if (cpu_has(c,X86_FEATURE_TSC)) {
29931+ unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
29932+ if (!freq)
29933+ freq = cpu_khz;
29934+ seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
29935+ freq / 1000, (freq % 1000));
29936+ }
29937+
29938+ /* Cache size */
29939+ if (c->x86_cache_size >= 0)
29940+ seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
29941+
29942+#ifdef CONFIG_SMP
29943+ if (smp_num_siblings * c->x86_max_cores > 1) {
29944+ int cpu = c - cpu_data;
29945+ seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
29946+ seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
29947+ seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
29948+ seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
29949+ }
29950+#endif
29951+
29952+ seq_printf(m,
29953+ "fpu\t\t: yes\n"
29954+ "fpu_exception\t: yes\n"
29955+ "cpuid level\t: %d\n"
29956+ "wp\t\t: yes\n"
29957+ "flags\t\t:",
29958+ c->cpuid_level);
29959+
29960+ {
29961+ int i;
29962+ for ( i = 0 ; i < 32*NCAPINTS ; i++ )
29963+ if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
29964+ seq_printf(m, " %s", x86_cap_flags[i]);
29965+ }
29966+
29967+ seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
29968+ c->loops_per_jiffy/(500000/HZ),
29969+ (c->loops_per_jiffy/(5000/HZ)) % 100);
29970+
29971+ if (c->x86_tlbsize > 0)
29972+ seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
29973+ seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
29974+ seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
29975+
29976+ seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
29977+ c->x86_phys_bits, c->x86_virt_bits);
29978+
29979+ seq_printf(m, "power management:");
29980+ {
29981+ unsigned i;
29982+ for (i = 0; i < 32; i++)
29983+ if (c->x86_power & (1 << i)) {
29984+ if (i < ARRAY_SIZE(x86_power_flags) &&
29985+ x86_power_flags[i])
29986+ seq_printf(m, "%s%s",
29987+ x86_power_flags[i][0]?" ":"",
29988+ x86_power_flags[i]);
29989+ else
29990+ seq_printf(m, " [%d]", i);
29991+ }
29992+ }
29993+
29994+ seq_printf(m, "\n\n");
29995+
29996+ return 0;
29997+}
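
The flags line above relies on a sparse name table: a bit is printed only if it is set in x86_capability and has a non-NULL entry in x86_cap_flags. A small user-space sketch of that idiom (table abbreviated to the first twelve leaf-1 names; illustrative only):

#include <stdio.h>

int main(void)
{
	static const char *flag_names[12] = {
		"fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
		"cx8", "apic", NULL, "sep",
	};
	unsigned int caps = (1u << 0) | (1u << 4) | (1u << 10) | (1u << 11);

	printf("flags\t\t:");
	for (unsigned int i = 0; i < 12; i++)
		if ((caps & (1u << i)) && flag_names[i])
			printf(" %s", flag_names[i]);	/* bit 10 stays hidden: no name */
	printf("\n");
	return 0;
}
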
29998+
29999+static void *c_start(struct seq_file *m, loff_t *pos)
30000+{
30001+ return *pos < NR_CPUS ? cpu_data + *pos : NULL;
30002+}
30003+
30004+static void *c_next(struct seq_file *m, void *v, loff_t *pos)
30005+{
30006+ ++*pos;
30007+ return c_start(m, pos);
30008+}
30009+
30010+static void c_stop(struct seq_file *m, void *v)
30011+{
30012+}
30013+
30014+struct seq_operations cpuinfo_op = {
30015+	.start = c_start,
30016+ .next = c_next,
30017+ .stop = c_stop,
30018+ .show = show_cpuinfo,
30019+};
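
A rough user-space analogue of the start/next/stop/show contract that cpuinfo_op implements (all names below are made up for illustration): start() validates a position, next() advances it and revalidates by calling start() again, and show() formats one record, which is the same shape as c_start(), c_next() and show_cpuinfo() above.

#include <stdio.h>

struct fake_cpu { int id; int khz; };

static struct fake_cpu fake_cpus[] = { { 0, 2400000 }, { 1, 2400000 } };
#define FAKE_NR_CPUS (sizeof(fake_cpus) / sizeof(fake_cpus[0]))

static void *fake_start(long *pos)
{
	return (*pos >= 0 && (size_t)*pos < FAKE_NR_CPUS) ? &fake_cpus[*pos] : NULL;
}

static void *fake_next(long *pos)
{
	++*pos;				/* advance, then bounds-check via start() */
	return fake_start(pos);
}

static int fake_show(void *v)
{
	const struct fake_cpu *c = v;

	printf("processor\t: %d\ncpu kHz\t\t: %d\n\n", c->id, c->khz);
	return 0;
}

int main(void)
{
	long pos = 0;

	for (void *v = fake_start(&pos); v; v = fake_next(&pos))
		fake_show(v);
	return 0;
}
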
30020+
30021+#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE)
30022+#include <linux/platform_device.h>
30023+static __init int add_pcspkr(void)
30024+{
30025+ struct platform_device *pd;
30026+ int ret;
30027+
30028+ if (!is_initial_xendomain())
30029+ return 0;
30030+
30031+ pd = platform_device_alloc("pcspkr", -1);
30032+ if (!pd)
30033+ return -ENOMEM;
30034+
30035+ ret = platform_device_add(pd);
30036+ if (ret)
30037+ platform_device_put(pd);
30038+
30039+ return ret;
30040+}
30041+device_initcall(add_pcspkr);
30042+#endif
30043Index: head-2008-11-25/arch/x86/kernel/setup64-xen.c
30044===================================================================
30045--- /dev/null 1970-01-01 00:00:00.000000000 +0000
30046+++ head-2008-11-25/arch/x86/kernel/setup64-xen.c 2008-01-28 12:24:19.000000000 +0100
30047@@ -0,0 +1,367 @@
30048+/*
30049+ * X86-64 specific CPU setup.
30050+ * Copyright (C) 1995 Linus Torvalds
30051+ * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
30052+ * See setup.c for older changelog.
30053+ *
30054+ * Jun Nakajima <jun.nakajima@intel.com>
30055+ * Modified for Xen
30056+ *
30057+ */
30058+#include <linux/init.h>
30059+#include <linux/kernel.h>
30060+#include <linux/sched.h>
30061+#include <linux/string.h>
30062+#include <linux/bootmem.h>
30063+#include <linux/bitops.h>
30064+#include <linux/module.h>
30065+#include <asm/bootsetup.h>
30066+#include <asm/pda.h>
30067+#include <asm/pgtable.h>
30068+#include <asm/processor.h>
30069+#include <asm/desc.h>
30070+#include <asm/atomic.h>
30071+#include <asm/mmu_context.h>
30072+#include <asm/smp.h>
30073+#include <asm/i387.h>
30074+#include <asm/percpu.h>
30075+#include <asm/proto.h>
30076+#include <asm/sections.h>
30077+#ifdef CONFIG_XEN
30078+#include <asm/hypervisor.h>
30079+#endif
30080+
30081+char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
30082+
30083+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
30084+
30085+struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
30086+EXPORT_SYMBOL(_cpu_pda);
30087+struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
30088+
30089+#ifndef CONFIG_X86_NO_IDT
30090+struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
30091+#endif
30092+
30093+char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
30094+
30095+unsigned long __supported_pte_mask __read_mostly = ~0UL;
30096+EXPORT_SYMBOL(__supported_pte_mask);
30097+static int do_not_nx __cpuinitdata = 0;
30098+
30099+/* noexec=on|off
30100+Control non-executable mappings for 64bit processes.
30101+
30102+on	Enable (default)
30103+off Disable
30104+*/
30105+int __init nonx_setup(char *str)
30106+{
30107+ if (!strncmp(str, "on", 2)) {
30108+ __supported_pte_mask |= _PAGE_NX;
30109+ do_not_nx = 0;
30110+ } else if (!strncmp(str, "off", 3)) {
30111+ do_not_nx = 1;
30112+ __supported_pte_mask &= ~_PAGE_NX;
30113+ }
30114+ return 1;
30115+}
30116+__setup("noexec=", nonx_setup); /* parsed early actually */
30117+
30118+int force_personality32 = 0;
30119+
30120+/* noexec32=on|off
30121+Control non-executable heap for 32bit processes.
30122+To control the stack too, use noexec=off.
30123+
30124+on PROT_READ does not imply PROT_EXEC for 32bit processes
30125+off PROT_READ implies PROT_EXEC (default)
30126+*/
30127+static int __init nonx32_setup(char *str)
30128+{
30129+ if (!strcmp(str, "on"))
30130+ force_personality32 &= ~READ_IMPLIES_EXEC;
30131+ else if (!strcmp(str, "off"))
30132+ force_personality32 |= READ_IMPLIES_EXEC;
30133+ return 1;
30134+}
30135+__setup("noexec32=", nonx32_setup);
30136+
30137+/*
30138+ * Great future plan:
30139+ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
30140+ * Always point %gs to its beginning
30141+ */
30142+void __init setup_per_cpu_areas(void)
30143+{
30144+ int i;
30145+ unsigned long size;
30146+
30147+#ifdef CONFIG_HOTPLUG_CPU
30148+ prefill_possible_map();
30149+#endif
30150+
30151+ /* Copy section for each CPU (we discard the original) */
30152+ size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
30153+#ifdef CONFIG_MODULES
30154+ if (size < PERCPU_ENOUGH_ROOM)
30155+ size = PERCPU_ENOUGH_ROOM;
30156+#endif
30157+
30158+ for_each_cpu_mask (i, cpu_possible_map) {
30159+ char *ptr;
30160+
30161+ if (!NODE_DATA(cpu_to_node(i))) {
30162+ printk("cpu with no node %d, num_online_nodes %d\n",
30163+ i, num_online_nodes());
30164+ ptr = alloc_bootmem(size);
30165+ } else {
30166+ ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
30167+ }
30168+ if (!ptr)
30169+ panic("Cannot allocate cpu data for CPU %d\n", i);
30170+ cpu_pda(i)->data_offset = ptr - __per_cpu_start;
30171+ memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
30172+ }
30173+}
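
The scheme above is "copy the per-CPU template once per CPU and remember how far each copy sits from the template"; a per-CPU access then adds that CPU's data_offset to the variable's template address. A user-space sketch of the idea, with made-up names (percpu_template, percpu_offset) standing in for __per_cpu_start and pda->data_offset:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NCPUS 4

static char percpu_template[64];	/* stands in for __per_cpu_start.._end */
static intptr_t percpu_offset[NCPUS];	/* plays the role of data_offset */

int main(void)
{
	for (int cpu = 0; cpu < NCPUS; cpu++) {
		char *copy = malloc(sizeof(percpu_template));

		if (!copy)
			return 1;
		memcpy(copy, percpu_template, sizeof(percpu_template));
		percpu_offset[cpu] = (intptr_t)copy - (intptr_t)percpu_template;
	}

	/* A per-CPU access is template address + that CPU's offset. */
	for (int cpu = 0; cpu < NCPUS; cpu++) {
		char *area = (char *)((intptr_t)percpu_template + percpu_offset[cpu]);

		snprintf(area, sizeof(percpu_template), "cpu%d private copy", cpu);
		printf("%s\n", area);
	}
	return 0;
}
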
30174+
30175+#ifdef CONFIG_XEN
30176+static void switch_pt(void)
30177+{
30178+ xen_pt_switch(__pa_symbol(init_level4_pgt));
30179+ xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
30180+}
30181+
30182+static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
30183+{
30184+ unsigned long frames[16];
30185+ unsigned long va;
30186+ int f;
30187+
30188+ for (va = gdt_descr->address, f = 0;
30189+ va < gdt_descr->address + gdt_descr->size;
30190+ va += PAGE_SIZE, f++) {
30191+ frames[f] = virt_to_mfn(va);
30192+ make_page_readonly(
30193+ (void *)va, XENFEAT_writable_descriptor_tables);
30194+ }
30195+ if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) /
30196+ sizeof (struct desc_struct)))
30197+ BUG();
30198+}
30199+#else
30200+static void switch_pt(void)
30201+{
30202+ asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
30203+}
30204+
30205+static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
30206+{
30207+ asm volatile("lgdt %0" :: "m" (*gdt_descr));
30208+ asm volatile("lidt %0" :: "m" (idt_descr));
30209+}
30210+#endif
30211+
30212+void pda_init(int cpu)
30213+{
30214+ struct x8664_pda *pda = cpu_pda(cpu);
30215+
30216+	/* Set up data that may be needed in __get_free_pages early */
30217+ asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
30218+#ifndef CONFIG_XEN
30219+ wrmsrl(MSR_GS_BASE, pda);
30220+#else
30221+ if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
30222+ (unsigned long)pda))
30223+ BUG();
30224+#endif
30225+ pda->cpunumber = cpu;
30226+ pda->irqcount = -1;
30227+ pda->kernelstack =
30228+ (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
30229+ pda->active_mm = &init_mm;
30230+ pda->mmu_state = 0;
30231+
30232+ if (cpu == 0) {
30233+#ifdef CONFIG_XEN
30234+ xen_init_pt();
30235+#endif
30236+ /* others are initialized in smpboot.c */
30237+ pda->pcurrent = &init_task;
30238+ pda->irqstackptr = boot_cpu_stack;
30239+ } else {
30240+ pda->irqstackptr = (char *)
30241+ __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
30242+ if (!pda->irqstackptr)
30243+ panic("cannot allocate irqstack for cpu %d", cpu);
30244+ }
30245+
30246+ switch_pt();
30247+
30248+ pda->irqstackptr += IRQSTACKSIZE-64;
30249+}
30250+
30251+#ifndef CONFIG_X86_NO_TSS
30252+char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
30253+__attribute__((section(".bss.page_aligned")));
30254+#endif
30255+
30256+/* May not be marked __init: used by software suspend */
30257+void syscall_init(void)
30258+{
30259+#ifndef CONFIG_XEN
30260+ /*
30261+	 * LSTAR and STAR live in a somewhat strange symbiosis.
30262+	 * They both write to the same internal register. STAR allows setting CS/DS,
30263+	 * but only a 32bit target. LSTAR sets the 64bit rip.
30264+ */
30265+ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
30266+ wrmsrl(MSR_LSTAR, system_call);
30267+
30268+ /* Flags to clear on syscall */
30269+ wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
30270+#endif
30271+#ifdef CONFIG_IA32_EMULATION
30272+ syscall32_cpu_init ();
30273+#endif
30274+}
30275+
30276+void __cpuinit check_efer(void)
30277+{
30278+ unsigned long efer;
30279+
30280+ rdmsrl(MSR_EFER, efer);
30281+ if (!(efer & EFER_NX) || do_not_nx) {
30282+ __supported_pte_mask &= ~_PAGE_NX;
30283+ }
30284+}
30285+
30286+unsigned long kernel_eflags;
30287+
30288+/*
30289+ * cpu_init() initializes state that is per-CPU. Some data is already
30290+ * initialized (naturally) in the bootstrap process, such as the GDT
30291+ * and IDT. We reload them nevertheless, this function acts as a
30292+ * 'CPU state barrier', nothing should get across.
30293+ * A lot of state is already set up in PDA init.
30294+ */
30295+void __cpuinit cpu_init (void)
30296+{
30297+ int cpu = stack_smp_processor_id();
30298+#ifndef CONFIG_X86_NO_TSS
30299+ struct tss_struct *t = &per_cpu(init_tss, cpu);
30300+ struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
30301+ unsigned long v;
30302+ char *estacks = NULL;
30303+ unsigned i;
30304+#endif
30305+ struct task_struct *me;
30306+
30307+ /* CPU 0 is initialised in head64.c */
30308+ if (cpu != 0) {
30309+ pda_init(cpu);
30310+ zap_low_mappings(cpu);
30311+ }
30312+#ifndef CONFIG_X86_NO_TSS
30313+ else
30314+ estacks = boot_exception_stacks;
30315+#endif
30316+
30317+ me = current;
30318+
30319+ if (cpu_test_and_set(cpu, cpu_initialized))
30320+ panic("CPU#%d already initialized!\n", cpu);
30321+
30322+ printk("Initializing CPU#%d\n", cpu);
30323+
30324+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
30325+
30326+ /*
30327+ * Initialize the per-CPU GDT with the boot GDT,
30328+ * and set up the GDT descriptor:
30329+ */
30330+#ifndef CONFIG_XEN
30331+ if (cpu)
30332+ memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
30333+#endif
30334+
30335+ cpu_gdt_descr[cpu].size = GDT_SIZE;
30336+ cpu_gdt_init(&cpu_gdt_descr[cpu]);
30337+
30338+ memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
30339+ syscall_init();
30340+
30341+ wrmsrl(MSR_FS_BASE, 0);
30342+ wrmsrl(MSR_KERNEL_GS_BASE, 0);
30343+ barrier();
30344+
30345+ check_efer();
30346+
30347+#ifndef CONFIG_X86_NO_TSS
30348+ /*
30349+ * set up and load the per-CPU TSS
30350+ */
30351+ for (v = 0; v < N_EXCEPTION_STACKS; v++) {
30352+ if (cpu) {
30353+ static const unsigned int order[N_EXCEPTION_STACKS] = {
30354+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
30355+ [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
30356+ };
30357+
30358+ estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
30359+ if (!estacks)
30360+ panic("Cannot allocate exception stack %ld %d\n",
30361+ v, cpu);
30362+ }
30363+ switch (v + 1) {
30364+#if DEBUG_STKSZ > EXCEPTION_STKSZ
30365+ case DEBUG_STACK:
30366+ cpu_pda(cpu)->debugstack = (unsigned long)estacks;
30367+ estacks += DEBUG_STKSZ;
30368+ break;
30369+#endif
30370+ default:
30371+ estacks += EXCEPTION_STKSZ;
30372+ break;
30373+ }
30374+ orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
30375+ }
30376+
30377+ t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
30378+ /*
30379+ * <= is required because the CPU will access up to
30380+ * 8 bits beyond the end of the IO permission bitmap.
30381+ */
30382+ for (i = 0; i <= IO_BITMAP_LONGS; i++)
30383+ t->io_bitmap[i] = ~0UL;
30384+#endif
30385+
30386+ atomic_inc(&init_mm.mm_count);
30387+ me->active_mm = &init_mm;
30388+ if (me->mm)
30389+ BUG();
30390+ enter_lazy_tlb(&init_mm, me);
30391+
30392+#ifndef CONFIG_X86_NO_TSS
30393+ set_tss_desc(cpu, t);
30394+#endif
30395+#ifndef CONFIG_XEN
30396+ load_TR_desc();
30397+#endif
30398+ load_LDT(&init_mm.context);
30399+
30400+ /*
30401+ * Clear all 6 debug registers:
30402+ */
30403+
30404+ set_debugreg(0UL, 0);
30405+ set_debugreg(0UL, 1);
30406+ set_debugreg(0UL, 2);
30407+ set_debugreg(0UL, 3);
30408+ set_debugreg(0UL, 6);
30409+ set_debugreg(0UL, 7);
30410+
30411+ fpu_init();
30412+
30413+ raw_local_save_flags(kernel_eflags);
30414+}
30415Index: head-2008-11-25/arch/x86/kernel/smp_64-xen.c
30416===================================================================
30417--- /dev/null 1970-01-01 00:00:00.000000000 +0000
30418+++ head-2008-11-25/arch/x86/kernel/smp_64-xen.c 2008-04-02 12:34:02.000000000 +0200
30419@@ -0,0 +1,575 @@
30420+/*
30421+ * Intel SMP support routines.
30422+ *
30423+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
30424+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
30425+ * (c) 2002,2003 Andi Kleen, SuSE Labs.
30426+ *
30427+ * This code is released under the GNU General Public License version 2 or
30428+ * later.
30429+ */
30430+
30431+#include <linux/init.h>
30432+
30433+#include <linux/mm.h>
30434+#include <linux/delay.h>
30435+#include <linux/spinlock.h>
30436+#include <linux/smp_lock.h>
30437+#include <linux/smp.h>
30438+#include <linux/kernel_stat.h>
30439+#include <linux/mc146818rtc.h>
30440+#include <linux/interrupt.h>
30441+
30442+#include <asm/mtrr.h>
30443+#include <asm/pgalloc.h>
30444+#include <asm/tlbflush.h>
30445+#include <asm/mach_apic.h>
30446+#include <asm/mmu_context.h>
30447+#include <asm/proto.h>
30448+#include <asm/apicdef.h>
30449+#include <asm/idle.h>
30450+#ifdef CONFIG_XEN
30451+#include <xen/evtchn.h>
30452+#endif
30453+
30454+#ifndef CONFIG_XEN
30455+/*
30456+ * Smarter SMP flushing macros.
30457+ * c/o Linus Torvalds.
30458+ *
30459+ * These mean you can really definitely utterly forget about
30460+ * writing to user space from interrupts. (It's not allowed anyway).
30461+ *
30462+ * Optimizations Manfred Spraul <manfred@colorfullife.com>
30463+ *
30464+ * More scalable flush, from Andi Kleen
30465+ *
30466+ * To avoid global state use 8 different call vectors.
30467+ * Each CPU uses a specific vector to trigger flushes on other
30468+ * CPUs. Depending on the received vector the target CPUs look into
30469+ * the right per cpu variable for the flush data.
30470+ *
30471+ * With more than 8 CPUs they are hashed to the 8 available
30472+ * vectors. The limited global vector space forces us to this right now.
30473+ * In future when interrupts are split into per CPU domains this could be
30474+ * fixed, at the cost of triggering multiple IPIs in some cases.
30475+ */
30476+
30477+union smp_flush_state {
30478+ struct {
30479+ cpumask_t flush_cpumask;
30480+ struct mm_struct *flush_mm;
30481+ unsigned long flush_va;
30482+#define FLUSH_ALL -1ULL
30483+ spinlock_t tlbstate_lock;
30484+ };
30485+ char pad[SMP_CACHE_BYTES];
30486+} ____cacheline_aligned;
30487+
30488+/* State is put into the per CPU data section, but padded
30489+ to a full cache line because other CPUs can access it and we don't
30490+ want false sharing in the per cpu data segment. */
30491+static DEFINE_PER_CPU(union smp_flush_state, flush_state);
30492+
30493+/*
30494+ * We cannot call mmdrop() because we are in interrupt context,
30495+ * instead update mm->cpu_vm_mask.
30496+ */
30497+static inline void leave_mm(unsigned long cpu)
30498+{
30499+ if (read_pda(mmu_state) == TLBSTATE_OK)
30500+ BUG();
30501+ cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
30502+ load_cr3(swapper_pg_dir);
30503+}
30504+
30505+/*
30506+ *
30507+ * The flush IPI assumes that a thread switch happens in this order:
30508+ * [cpu0: the cpu that switches]
30509+ * 1) switch_mm() either 1a) or 1b)
30510+ * 1a) thread switch to a different mm
30511+ * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
30512+ * Stop ipi delivery for the old mm. This is not synchronized with
30513+ * 	the other cpus, but smp_invalidate_interrupt ignores flush ipis
30514+ * for the wrong mm, and in the worst case we perform a superfluous
30515+ * tlb flush.
30516+ * 1a2) set cpu mmu_state to TLBSTATE_OK
30517+ * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
30518+ * was in lazy tlb mode.
30519+ * 1a3) update cpu active_mm
30520+ * Now cpu0 accepts tlb flushes for the new mm.
30521+ * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
30522+ * Now the other cpus will send tlb flush ipis.
30523+ * 1a5) change cr3.
30524+ * 1b) thread switch without mm change
30525+ * cpu active_mm is correct, cpu0 already handles
30526+ * flush ipis.
30527+ * 1b1) set cpu mmu_state to TLBSTATE_OK
30528+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
30529+ * Atomically set the bit [other cpus will start sending flush ipis],
30530+ * and test the bit.
30531+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
30532+ * 2) switch %%esp, ie current
30533+ *
30534+ * The interrupt must handle 2 special cases:
30535+ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
30536+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
30537+ * runs in kernel space, the cpu could load tlb entries for user space
30538+ * pages.
30539+ *
30540+ * The good news is that cpu mmu_state is local to each cpu, no
30541+ * write/read ordering problems.
30542+ */
30543+
30544+/*
30545+ * TLB flush IPI:
30546+ *
30547+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
30548+ * 2) Leave the mm if we are in the lazy tlb mode.
30549+ *
30550+ * Interrupts are disabled.
30551+ */
30552+
30553+asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
30554+{
30555+ int cpu;
30556+ int sender;
30557+ union smp_flush_state *f;
30558+
30559+ cpu = smp_processor_id();
30560+ /*
30561+ * orig_rax contains the negated interrupt vector.
30562+ * Use that to determine where the sender put the data.
30563+ */
30564+ sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
30565+ f = &per_cpu(flush_state, sender);
30566+
30567+ if (!cpu_isset(cpu, f->flush_cpumask))
30568+ goto out;
30569+ /*
30570+ * This was a BUG() but until someone can quote me the
30571+ * line from the intel manual that guarantees an IPI to
30572+	 * multiple CPUs is retried _only_ on the erroring CPUs,
30573+	 * it's staying as a return.
30574+ *
30575+ * BUG();
30576+ */
30577+
30578+ if (f->flush_mm == read_pda(active_mm)) {
30579+ if (read_pda(mmu_state) == TLBSTATE_OK) {
30580+ if (f->flush_va == FLUSH_ALL)
30581+ local_flush_tlb();
30582+ else
30583+ __flush_tlb_one(f->flush_va);
30584+ } else
30585+ leave_mm(cpu);
30586+ }
30587+out:
30588+ ack_APIC_irq();
30589+ cpu_clear(cpu, f->flush_cpumask);
30590+}
30591+
30592+static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
30593+ unsigned long va)
30594+{
30595+ int sender;
30596+ union smp_flush_state *f;
30597+
30598+ /* Caller has disabled preemption */
30599+ sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
30600+ f = &per_cpu(flush_state, sender);
30601+
30602+ /* Could avoid this lock when
30603+ num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
30604+ probably not worth checking this for a cache-hot lock. */
30605+ spin_lock(&f->tlbstate_lock);
30606+
30607+ f->flush_mm = mm;
30608+ f->flush_va = va;
30609+ cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
30610+
30611+ /*
30612+ * We have to send the IPI only to
30613+ * CPUs affected.
30614+ */
30615+ send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
30616+
30617+ while (!cpus_empty(f->flush_cpumask))
30618+ cpu_relax();
30619+
30620+ f->flush_mm = NULL;
30621+ f->flush_va = 0;
30622+ spin_unlock(&f->tlbstate_lock);
30623+}
30624+
30625+int __cpuinit init_smp_flush(void)
30626+{
30627+ int i;
30628+ for_each_cpu_mask(i, cpu_possible_map) {
30629+ spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
30630+ }
30631+ return 0;
30632+}
30633+
30634+core_initcall(init_smp_flush);
30635+
30636+void flush_tlb_current_task(void)
30637+{
30638+ struct mm_struct *mm = current->mm;
30639+ cpumask_t cpu_mask;
30640+
30641+ preempt_disable();
30642+ cpu_mask = mm->cpu_vm_mask;
30643+ cpu_clear(smp_processor_id(), cpu_mask);
30644+
30645+ local_flush_tlb();
30646+ if (!cpus_empty(cpu_mask))
30647+ flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
30648+ preempt_enable();
30649+}
30650+EXPORT_SYMBOL(flush_tlb_current_task);
30651+
30652+void flush_tlb_mm (struct mm_struct * mm)
30653+{
30654+ cpumask_t cpu_mask;
30655+
30656+ preempt_disable();
30657+ cpu_mask = mm->cpu_vm_mask;
30658+ cpu_clear(smp_processor_id(), cpu_mask);
30659+
30660+ if (current->active_mm == mm) {
30661+ if (current->mm)
30662+ local_flush_tlb();
30663+ else
30664+ leave_mm(smp_processor_id());
30665+ }
30666+ if (!cpus_empty(cpu_mask))
30667+ flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
30668+
30669+ preempt_enable();
30670+}
30671+EXPORT_SYMBOL(flush_tlb_mm);
30672+
30673+void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
30674+{
30675+ struct mm_struct *mm = vma->vm_mm;
30676+ cpumask_t cpu_mask;
30677+
30678+ preempt_disable();
30679+ cpu_mask = mm->cpu_vm_mask;
30680+ cpu_clear(smp_processor_id(), cpu_mask);
30681+
30682+ if (current->active_mm == mm) {
30683+		if (current->mm)
30684+ __flush_tlb_one(va);
30685+ else
30686+ leave_mm(smp_processor_id());
30687+ }
30688+
30689+ if (!cpus_empty(cpu_mask))
30690+ flush_tlb_others(cpu_mask, mm, va);
30691+
30692+ preempt_enable();
30693+}
30694+EXPORT_SYMBOL(flush_tlb_page);
30695+
30696+static void do_flush_tlb_all(void* info)
30697+{
30698+ unsigned long cpu = smp_processor_id();
30699+
30700+ __flush_tlb_all();
30701+ if (read_pda(mmu_state) == TLBSTATE_LAZY)
30702+ leave_mm(cpu);
30703+}
30704+
30705+void flush_tlb_all(void)
30706+{
30707+ on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
30708+}
30709+#endif /* Xen */
30710+
30711+/*
30712+ * this function sends a 'reschedule' IPI to another CPU.
30713+ * it goes straight through and wastes no time serializing
30714+ * anything. Worst case is that we lose a reschedule ...
30715+ */
30716+
30717+void smp_send_reschedule(int cpu)
30718+{
30719+ send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
30720+}
30721+
30722+/*
30723+ * Structure and data for smp_call_function(). This is designed to minimise
30724+ * static memory requirements. It also looks cleaner.
30725+ */
30726+static DEFINE_SPINLOCK(call_lock);
30727+
30728+struct call_data_struct {
30729+ void (*func) (void *info);
30730+ void *info;
30731+ atomic_t started;
30732+ atomic_t finished;
30733+ int wait;
30734+};
30735+
30736+static struct call_data_struct * call_data;
30737+
30738+void lock_ipi_call_lock(void)
30739+{
30740+ spin_lock_irq(&call_lock);
30741+}
30742+
30743+void unlock_ipi_call_lock(void)
30744+{
30745+ spin_unlock_irq(&call_lock);
30746+}
30747+
30748+/*
30749+ * this function sends a 'generic call function' IPI to one other CPU
30750+ * in the system.
30751+ *
30752+ * cpu is a standard Linux logical CPU number.
30753+ */
30754+static void
30755+__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
30756+ int nonatomic, int wait)
30757+{
30758+ struct call_data_struct data;
30759+ int cpus = 1;
30760+
30761+ data.func = func;
30762+ data.info = info;
30763+ atomic_set(&data.started, 0);
30764+ data.wait = wait;
30765+ if (wait)
30766+ atomic_set(&data.finished, 0);
30767+
30768+ call_data = &data;
30769+ wmb();
30770+ /* Send a message to all other CPUs and wait for them to respond */
30771+ send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
30772+
30773+ /* Wait for response */
30774+ while (atomic_read(&data.started) != cpus)
30775+ cpu_relax();
30776+
30777+ if (!wait)
30778+ return;
30779+
30780+ while (atomic_read(&data.finished) != cpus)
30781+ cpu_relax();
30782+}
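
The started/finished handshake above is what lets the caller keep the call data on its own stack: every target bumps "started" once it has copied func/info (so the data may be reused when wait == 0), and bumps "finished" only when the caller asked to wait. A user-space sketch of the same handshake using pthreads and C11 atomics (illustrative only; fake_call_data, remote_cpu and say_hello are invented names):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct fake_call_data {
	void (*func)(void *info);
	void *info;
	atomic_int started;
	atomic_int finished;
	int wait;
};

static _Atomic(struct fake_call_data *) fake_call_data;

static void say_hello(void *info)
{
	printf("remote: %s\n", (const char *)info);
}

static void *remote_cpu(void *unused)
{
	struct fake_call_data *d;

	(void)unused;
	while (!(d = atomic_load(&fake_call_data)))
		;				/* spin until the "IPI" arrives */

	void (*func)(void *) = d->func;
	void *info = d->info;
	int wait = d->wait;

	atomic_fetch_add(&d->started, 1);	/* caller may reuse d if !wait */
	func(info);
	if (wait)
		atomic_fetch_add(&d->finished, 1);
	return NULL;
}

int main(void)
{
	pthread_t t;
	struct fake_call_data data = { .func = say_hello, .info = "hello", .wait = 1 };

	pthread_create(&t, NULL, remote_cpu, NULL);
	atomic_store(&fake_call_data, &data);	/* stands in for send_IPI_mask() */

	while (atomic_load(&data.started) != 1)
		;				/* past here, data could be reused */
	while (data.wait && atomic_load(&data.finished) != 1)
		;
	pthread_join(t, NULL);
	return 0;
}
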
30783+
30784+/*
30785+ * smp_call_function_single - Run a function on another CPU
30786+ * @func: The function to run. This must be fast and non-blocking.
30787+ * @info: An arbitrary pointer to pass to the function.
30788+ * @nonatomic: Currently unused.
30789+ * @wait: If true, wait until function has completed on other CPUs.
30790+ *
30791+ * Returns 0 on success, else a negative status code.
30792+ *
30793+ * Does not return until the remote CPU is nearly ready to execute <func>,
30794+ * or is executing it, or has executed it.
30795+ */
30796+
30797+int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
30798+ int nonatomic, int wait)
30799+{
30800+ /* prevent preemption and reschedule on another processor */
30801+ int me = get_cpu();
30802+ if (cpu == me) {
30803+ WARN_ON(1);
30804+ put_cpu();
30805+ return -EBUSY;
30806+ }
30807+ spin_lock_bh(&call_lock);
30808+ __smp_call_function_single(cpu, func, info, nonatomic, wait);
30809+ spin_unlock_bh(&call_lock);
30810+ put_cpu();
30811+ return 0;
30812+}
30813+
30814+/*
30815+ * this function sends a 'generic call function' IPI to all other CPUs
30816+ * in the system.
30817+ */
30818+static void __smp_call_function (void (*func) (void *info), void *info,
30819+ int nonatomic, int wait)
30820+{
30821+ struct call_data_struct data;
30822+ int cpus = num_online_cpus()-1;
30823+
30824+ if (!cpus)
30825+ return;
30826+
30827+ data.func = func;
30828+ data.info = info;
30829+ atomic_set(&data.started, 0);
30830+ data.wait = wait;
30831+ if (wait)
30832+ atomic_set(&data.finished, 0);
30833+
30834+ call_data = &data;
30835+ wmb();
30836+ /* Send a message to all other CPUs and wait for them to respond */
30837+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
30838+
30839+ /* Wait for response */
30840+ while (atomic_read(&data.started) != cpus)
30841+ cpu_relax();
30842+
30843+ if (!wait)
30844+ return;
30845+
30846+ while (atomic_read(&data.finished) != cpus)
30847+ cpu_relax();
30848+}
30849+
30850+/*
30851+ * smp_call_function - run a function on all other CPUs.
30852+ * @func: The function to run. This must be fast and non-blocking.
30853+ * @info: An arbitrary pointer to pass to the function.
30854+ * @nonatomic: currently unused.
30855+ * @wait: If true, wait (atomically) until function has completed on other
30856+ * CPUs.
30857+ *
30858+ * Returns 0 on success, else a negative status code. Does not return until
30859+ * remote CPUs are nearly ready to execute func or are or have executed.
30860+ *
30861+ * You must not call this function with disabled interrupts or from a
30862+ * hardware interrupt handler or from a bottom half handler.
30863+ * Actually there are a few legal cases, like panic.
30864+ */
30865+int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
30866+ int wait)
30867+{
30868+ spin_lock(&call_lock);
30869+ __smp_call_function(func,info,nonatomic,wait);
30870+ spin_unlock(&call_lock);
30871+ return 0;
30872+}
30873+EXPORT_SYMBOL(smp_call_function);
30874+
30875+void smp_stop_cpu(void)
30876+{
30877+ unsigned long flags;
30878+ /*
30879+ * Remove this CPU:
30880+ */
30881+ cpu_clear(smp_processor_id(), cpu_online_map);
30882+ local_irq_save(flags);
30883+ disable_all_local_evtchn();
30884+ local_irq_restore(flags);
30885+}
30886+
30887+static void smp_really_stop_cpu(void *dummy)
30888+{
30889+ smp_stop_cpu();
30890+ for (;;)
30891+ halt();
30892+}
30893+
30894+void smp_send_stop(void)
30895+{
30896+ int nolock = 0;
30897+#ifndef CONFIG_XEN
30898+ if (reboot_force)
30899+ return;
30900+#endif
30901+ /* Don't deadlock on the call lock in panic */
30902+ if (!spin_trylock(&call_lock)) {
30903+		/* ignore locking because we have panicked anyway */
30904+ nolock = 1;
30905+ }
30906+ __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
30907+ if (!nolock)
30908+ spin_unlock(&call_lock);
30909+
30910+ local_irq_disable();
30911+ disable_all_local_evtchn();
30912+ local_irq_enable();
30913+}
30914+
30915+/*
30916+ * Reschedule callback. Nothing to do,
30917+ * all the work is done automatically when
30918+ * we return from the interrupt.
30919+ */
30920+#ifndef CONFIG_XEN
30921+asmlinkage void smp_reschedule_interrupt(void)
30922+#else
30923+asmlinkage irqreturn_t smp_reschedule_interrupt(void)
30924+#endif
30925+{
30926+#ifndef CONFIG_XEN
30927+ ack_APIC_irq();
30928+#else
30929+ return IRQ_HANDLED;
30930+#endif
30931+}
30932+
30933+#ifndef CONFIG_XEN
30934+asmlinkage void smp_call_function_interrupt(void)
30935+#else
30936+asmlinkage irqreturn_t smp_call_function_interrupt(void)
30937+#endif
30938+{
30939+ void (*func) (void *info) = call_data->func;
30940+ void *info = call_data->info;
30941+ int wait = call_data->wait;
30942+
30943+#ifndef CONFIG_XEN
30944+ ack_APIC_irq();
30945+#endif
30946+ /*
30947+ * Notify initiating CPU that I've grabbed the data and am
30948+ * about to execute the function
30949+ */
30950+ mb();
30951+ atomic_inc(&call_data->started);
30952+ /*
30953+ * At this point the info structure may be out of scope unless wait==1
30954+ */
30955+ exit_idle();
30956+ irq_enter();
30957+ (*func)(info);
30958+ irq_exit();
30959+ if (wait) {
30960+ mb();
30961+ atomic_inc(&call_data->finished);
30962+ }
30963+#ifdef CONFIG_XEN
30964+ return IRQ_HANDLED;
30965+#endif
30966+}
30967+
30968+int safe_smp_processor_id(void)
30969+{
30970+#ifdef CONFIG_XEN
30971+ return smp_processor_id();
30972+#else
30973+ unsigned apicid, i;
30974+
30975+ if (disable_apic)
30976+ return 0;
30977+
30978+ apicid = hard_smp_processor_id();
30979+ if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid)
30980+ return apicid;
30981+
30982+ for (i = 0; i < NR_CPUS; ++i) {
30983+ if (x86_cpu_to_apicid[i] == apicid)
30984+ return i;
30985+ }
30986+
30987+ /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI,
30988+ * or called too early. Either way, we must be CPU 0. */
30989+ if (x86_cpu_to_apicid[0] == BAD_APICID)
30990+ return 0;
30991+
30992+ return 0; /* Should not happen */
30993+#endif
30994+}
30995Index: head-2008-11-25/arch/x86/kernel/traps_64-xen.c
30996===================================================================
30997--- /dev/null 1970-01-01 00:00:00.000000000 +0000
30998+++ head-2008-11-25/arch/x86/kernel/traps_64-xen.c 2008-04-02 12:34:02.000000000 +0200
30999@@ -0,0 +1,1173 @@
31000+/*
31001+ * linux/arch/x86-64/traps.c
31002+ *
31003+ * Copyright (C) 1991, 1992 Linus Torvalds
31004+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
31005+ *
31006+ * Pentium III FXSR, SSE support
31007+ * Gareth Hughes <gareth@valinux.com>, May 2000
31008+ */
31009+
31010+/*
31011+ * 'Traps.c' handles hardware traps and faults after we have saved some
31012+ * state in 'entry.S'.
31013+ */
31014+#include <linux/sched.h>
31015+#include <linux/kernel.h>
31016+#include <linux/string.h>
31017+#include <linux/errno.h>
31018+#include <linux/ptrace.h>
31019+#include <linux/timer.h>
31020+#include <linux/mm.h>
31021+#include <linux/init.h>
31022+#include <linux/delay.h>
31023+#include <linux/spinlock.h>
31024+#include <linux/interrupt.h>
31025+#include <linux/module.h>
31026+#include <linux/moduleparam.h>
31027+#include <linux/nmi.h>
31028+#include <linux/kprobes.h>
31029+#include <linux/kexec.h>
31030+#include <linux/unwind.h>
31031+
31032+#include <asm/system.h>
31033+#include <asm/uaccess.h>
31034+#include <asm/io.h>
31035+#include <asm/atomic.h>
31036+#include <asm/debugreg.h>
31037+#include <asm/desc.h>
31038+#include <asm/i387.h>
31039+#include <asm/kdebug.h>
31040+#include <asm/processor.h>
31041+#include <asm/unwind.h>
31042+#include <asm/smp.h>
31043+#include <asm/pgalloc.h>
31044+#include <asm/pda.h>
31045+#include <asm/proto.h>
31046+#include <asm/nmi.h>
31047+
31048+asmlinkage void divide_error(void);
31049+asmlinkage void debug(void);
31050+asmlinkage void nmi(void);
31051+asmlinkage void int3(void);
31052+asmlinkage void overflow(void);
31053+asmlinkage void bounds(void);
31054+asmlinkage void invalid_op(void);
31055+asmlinkage void device_not_available(void);
31056+asmlinkage void double_fault(void);
31057+asmlinkage void coprocessor_segment_overrun(void);
31058+asmlinkage void invalid_TSS(void);
31059+asmlinkage void segment_not_present(void);
31060+asmlinkage void stack_segment(void);
31061+asmlinkage void general_protection(void);
31062+asmlinkage void page_fault(void);
31063+asmlinkage void coprocessor_error(void);
31064+asmlinkage void simd_coprocessor_error(void);
31065+asmlinkage void reserved(void);
31066+asmlinkage void alignment_check(void);
31067+asmlinkage void machine_check(void);
31068+asmlinkage void spurious_interrupt_bug(void);
31069+
31070+ATOMIC_NOTIFIER_HEAD(die_chain);
31071+EXPORT_SYMBOL(die_chain);
31072+
31073+int register_die_notifier(struct notifier_block *nb)
31074+{
31075+ vmalloc_sync_all();
31076+ return atomic_notifier_chain_register(&die_chain, nb);
31077+}
31078+EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
31079+
31080+int unregister_die_notifier(struct notifier_block *nb)
31081+{
31082+ return atomic_notifier_chain_unregister(&die_chain, nb);
31083+}
31084+EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
31085+
31086+static inline void conditional_sti(struct pt_regs *regs)
31087+{
31088+ if (regs->eflags & X86_EFLAGS_IF)
31089+ local_irq_enable();
31090+}
31091+
31092+static inline void preempt_conditional_sti(struct pt_regs *regs)
31093+{
31094+ preempt_disable();
31095+ if (regs->eflags & X86_EFLAGS_IF)
31096+ local_irq_enable();
31097+}
31098+
31099+static inline void preempt_conditional_cli(struct pt_regs *regs)
31100+{
31101+ if (regs->eflags & X86_EFLAGS_IF)
31102+ local_irq_disable();
31103+ /* Make sure to not schedule here because we could be running
31104+ on an exception stack. */
31105+ preempt_enable_no_resched();
31106+}
31107+
31108+static int kstack_depth_to_print = 12;
31109+#ifdef CONFIG_STACK_UNWIND
31110+static int call_trace = 1;
31111+#else
31112+#define call_trace (-1)
31113+#endif
31114+
31115+#ifdef CONFIG_KALLSYMS
31116+# include <linux/kallsyms.h>
31117+void printk_address(unsigned long address)
31118+{
31119+ unsigned long offset = 0, symsize;
31120+ const char *symname;
31121+ char *modname;
31122+ char *delim = ":";
31123+ char namebuf[128];
31124+
31125+ symname = kallsyms_lookup(address, &symsize, &offset,
31126+ &modname, namebuf);
31127+ if (!symname) {
31128+ printk(" [<%016lx>]\n", address);
31129+ return;
31130+ }
31131+ if (!modname)
31132+ modname = delim = "";
31133+ printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
31134+ address, delim, modname, delim, symname, offset, symsize);
31135+}
31136+#else
31137+void printk_address(unsigned long address)
31138+{
31139+ printk(" [<%016lx>]\n", address);
31140+}
31141+#endif
31142+
31143+static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
31144+ unsigned *usedp, const char **idp)
31145+{
31146+#ifndef CONFIG_X86_NO_TSS
31147+ static char ids[][8] = {
31148+ [DEBUG_STACK - 1] = "#DB",
31149+ [NMI_STACK - 1] = "NMI",
31150+ [DOUBLEFAULT_STACK - 1] = "#DF",
31151+ [STACKFAULT_STACK - 1] = "#SS",
31152+ [MCE_STACK - 1] = "#MC",
31153+#if DEBUG_STKSZ > EXCEPTION_STKSZ
31154+ [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
31155+#endif
31156+ };
31157+ unsigned k;
31158+
31159+ /*
31160+ * Iterate over all exception stacks, and figure out whether
31161+ * 'stack' is in one of them:
31162+ */
31163+ for (k = 0; k < N_EXCEPTION_STACKS; k++) {
31164+ unsigned long end;
31165+
31166+ /*
31167+ * set 'end' to the end of the exception stack.
31168+ */
31169+ switch (k + 1) {
31170+ /*
31171+		 * TODO: this block is not needed, I think, because
31172+ * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK]
31173+ * properly too.
31174+ */
31175+#if DEBUG_STKSZ > EXCEPTION_STKSZ
31176+ case DEBUG_STACK:
31177+ end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
31178+ break;
31179+#endif
31180+ default:
31181+ end = per_cpu(orig_ist, cpu).ist[k];
31182+ break;
31183+ }
31184+ /*
31185+ * Is 'stack' above this exception frame's end?
31186+ * If yes then skip to the next frame.
31187+ */
31188+ if (stack >= end)
31189+ continue;
31190+ /*
31191+ * Is 'stack' above this exception frame's start address?
31192+ * If yes then we found the right frame.
31193+ */
31194+ if (stack >= end - EXCEPTION_STKSZ) {
31195+ /*
31196+ * Make sure we only iterate through an exception
31197+ * stack once. If it comes up for the second time
31198+ * then there's something wrong going on - just
31199+ * break out and return NULL:
31200+ */
31201+ if (*usedp & (1U << k))
31202+ break;
31203+ *usedp |= 1U << k;
31204+ *idp = ids[k];
31205+ return (unsigned long *)end;
31206+ }
31207+ /*
31208+ * If this is a debug stack, and if it has a larger size than
31209+ * the usual exception stacks, then 'stack' might still
31210+ * be within the lower portion of the debug stack:
31211+ */
31212+#if DEBUG_STKSZ > EXCEPTION_STKSZ
31213+ if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
31214+ unsigned j = N_EXCEPTION_STACKS - 1;
31215+
31216+ /*
31217+ * Black magic. A large debug stack is composed of
31218+ * multiple exception stack entries, which we
31219+			 * iterate through now. Don't look:
31220+ */
31221+ do {
31222+ ++j;
31223+ end -= EXCEPTION_STKSZ;
31224+ ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
31225+ } while (stack < end - EXCEPTION_STKSZ);
31226+ if (*usedp & (1U << j))
31227+ break;
31228+ *usedp |= 1U << j;
31229+ *idp = ids[j];
31230+ return (unsigned long *)end;
31231+ }
31232+#endif
31233+ }
31234+#endif
31235+ return NULL;
31236+}
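
The essence of in_exception_stack() is: test the address against each [end - size, end) window, and keep a bitmask so that a window which shows up a second time is treated as corruption rather than revisited. A compact user-space sketch with invented names (stack_win, classify):

#include <stdio.h>

struct stack_win { unsigned long end; unsigned long size; const char *id; };

static const char *classify(unsigned long addr, const struct stack_win *w,
			    unsigned int n, unsigned int *usedp)
{
	for (unsigned int k = 0; k < n; k++) {
		if (addr >= w[k].end || addr < w[k].end - w[k].size)
			continue;		/* not inside this window */
		if (*usedp & (1U << k))
			break;			/* seen before: give up */
		*usedp |= 1U << k;
		return w[k].id;
	}
	return NULL;
}

int main(void)
{
	static const struct stack_win wins[] = {
		{ 0x2000, 0x1000, "#DB" },
		{ 0x4000, 0x1000, "NMI" },
	};
	unsigned int used = 0;

	printf("%s\n", classify(0x1800, wins, 2, &used));	/* "#DB" */
	printf("%s\n", classify(0x3f00, wins, 2, &used));	/* "NMI" */
	return 0;
}
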
31237+
31238+static int show_trace_unwind(struct unwind_frame_info *info, void *context)
31239+{
31240+ int n = 0;
31241+
31242+ while (unwind(info) == 0 && UNW_PC(info)) {
31243+ n++;
31244+ printk_address(UNW_PC(info));
31245+ if (arch_unw_user_mode(info))
31246+ break;
31247+ }
31248+ return n;
31249+}
31250+
31251+/*
31252+ * x86-64 can have up to three kernel stacks:
31253+ * process stack
31254+ * interrupt stack
31255+ * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
31256+ */
31257+
31258+void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack)
31259+{
31260+ const unsigned cpu = safe_smp_processor_id();
31261+ unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
31262+ unsigned used = 0;
31263+
31264+ printk("\nCall Trace:\n");
31265+
31266+ if (!tsk)
31267+ tsk = current;
31268+
31269+ if (call_trace >= 0) {
31270+ int unw_ret = 0;
31271+ struct unwind_frame_info info;
31272+
31273+ if (regs) {
31274+ if (unwind_init_frame_info(&info, tsk, regs) == 0)
31275+ unw_ret = show_trace_unwind(&info, NULL);
31276+ } else if (tsk == current)
31277+ unw_ret = unwind_init_running(&info, show_trace_unwind, NULL);
31278+ else {
31279+ if (unwind_init_blocked(&info, tsk) == 0)
31280+ unw_ret = show_trace_unwind(&info, NULL);
31281+ }
31282+ if (unw_ret > 0) {
31283+ if (call_trace == 1 && !arch_unw_user_mode(&info)) {
31284+ print_symbol("DWARF2 unwinder stuck at %s\n",
31285+ UNW_PC(&info));
31286+ if ((long)UNW_SP(&info) < 0) {
31287+ printk("Leftover inexact backtrace:\n");
31288+ stack = (unsigned long *)UNW_SP(&info);
31289+ } else
31290+ printk("Full inexact backtrace again:\n");
31291+ } else if (call_trace >= 1)
31292+ return;
31293+ else
31294+ printk("Full inexact backtrace again:\n");
31295+ } else
31296+ printk("Inexact backtrace:\n");
31297+ }
31298+
31299+ /*
31300+ * Print function call entries within a stack. 'cond' is the
31301+ * "end of stackframe" condition, that the 'stack++'
31302+ * iteration will eventually trigger.
31303+ */
31304+#define HANDLE_STACK(cond) \
31305+ do while (cond) { \
31306+ unsigned long addr = *stack++; \
31307+ if (kernel_text_address(addr)) { \
31308+ /* \
31309+ * If the address is either in the text segment of the \
31310+ * kernel, or in the region which contains vmalloc'ed \
31311+ * memory, it *may* be the address of a calling \
31312+ * routine; if so, print it so that someone tracing \
31313+ * down the cause of the crash will be able to figure \
31314+ * out the call path that was taken. \
31315+ */ \
31316+ printk_address(addr); \
31317+ } \
31318+ } while (0)
31319+
31320+ /*
31321+ * Print function call entries in all stacks, starting at the
31322+	 * current stack address. If the stacks consist of nested
31323+	 * exceptions, follow the links from one stack to the next.
31324+ */
31325+ for ( ; ; ) {
31326+ const char *id;
31327+ unsigned long *estack_end;
31328+ estack_end = in_exception_stack(cpu, (unsigned long)stack,
31329+ &used, &id);
31330+
31331+ if (estack_end) {
31332+ printk(" <%s>", id);
31333+ HANDLE_STACK (stack < estack_end);
31334+ printk(" <EOE>");
31335+ /*
31336+ * We link to the next stack via the
31337+ * second-to-last pointer (index -2 to end) in the
31338+ * exception stack:
31339+ */
31340+ stack = (unsigned long *) estack_end[-2];
31341+ continue;
31342+ }
31343+ if (irqstack_end) {
31344+ unsigned long *irqstack;
31345+ irqstack = irqstack_end -
31346+ (IRQSTACKSIZE - 64) / sizeof(*irqstack);
31347+
31348+ if (stack >= irqstack && stack < irqstack_end) {
31349+ printk(" <IRQ>");
31350+ HANDLE_STACK (stack < irqstack_end);
31351+ /*
31352+ * We link to the next stack (which would be
31353+ * the process stack normally) the last
31354+ * pointer (index -1 to end) in the IRQ stack:
31355+ */
31356+ stack = (unsigned long *) (irqstack_end[-1]);
31357+ irqstack_end = NULL;
31358+ printk(" <EOI>");
31359+ continue;
31360+ }
31361+ }
31362+ break;
31363+ }
31364+
31365+ /*
31366+ * This prints the process stack:
31367+ */
31368+ HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
31369+#undef HANDLE_STACK
31370+
31371+ printk("\n");
31372+}
31373+
31374+static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp)
31375+{
31376+ unsigned long *stack;
31377+ int i;
31378+ const int cpu = safe_smp_processor_id();
31379+ unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
31380+ unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
31381+
31382+ // debugging aid: "show_stack(NULL, NULL);" prints the
31383+ // back trace for this cpu.
31384+
31385+ if (rsp == NULL) {
31386+ if (tsk)
31387+ rsp = (unsigned long *)tsk->thread.rsp;
31388+ else
31389+ rsp = (unsigned long *)&rsp;
31390+ }
31391+
31392+ stack = rsp;
31393+ for(i=0; i < kstack_depth_to_print; i++) {
31394+ if (stack >= irqstack && stack <= irqstack_end) {
31395+ if (stack == irqstack_end) {
31396+ stack = (unsigned long *) (irqstack_end[-1]);
31397+ printk(" <EOI> ");
31398+ }
31399+ } else {
31400+ if (((long) stack & (THREAD_SIZE-1)) == 0)
31401+ break;
31402+ }
31403+ if (i && ((i % 4) == 0))
31404+ printk("\n");
31405+ printk(" %016lx", *stack++);
31406+ touch_nmi_watchdog();
31407+ }
31408+ show_trace(tsk, regs, rsp);
31409+}
31410+
31411+void show_stack(struct task_struct *tsk, unsigned long * rsp)
31412+{
31413+ _show_stack(tsk, NULL, rsp);
31414+}
31415+
31416+/*
31417+ * The architecture-independent dump_stack generator
31418+ */
31419+void dump_stack(void)
31420+{
31421+ unsigned long dummy;
31422+ show_trace(NULL, NULL, &dummy);
31423+}
31424+
31425+EXPORT_SYMBOL(dump_stack);
31426+
31427+void show_registers(struct pt_regs *regs)
31428+{
31429+ int i;
31430+ int in_kernel = !user_mode(regs);
31431+ unsigned long rsp;
31432+ const int cpu = safe_smp_processor_id();
31433+ struct task_struct *cur = cpu_pda(cpu)->pcurrent;
31434+
31435+ rsp = regs->rsp;
31436+
31437+ printk("CPU %d ", cpu);
31438+ __show_regs(regs);
31439+ printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
31440+ cur->comm, cur->pid, task_thread_info(cur), cur);
31441+
31442+ /*
31443+ * When in-kernel, we also print out the stack and code at the
31444+	 * time of the fault.
31445+ */
31446+ if (in_kernel) {
31447+
31448+ printk("Stack: ");
31449+ _show_stack(NULL, regs, (unsigned long*)rsp);
31450+
31451+ printk("\nCode: ");
31452+ if (regs->rip < PAGE_OFFSET)
31453+ goto bad;
31454+
31455+ for (i=0; i<20; i++) {
31456+ unsigned char c;
31457+ if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
31458+bad:
31459+ printk(" Bad RIP value.");
31460+ break;
31461+ }
31462+ printk("%02x ", c);
31463+ }
31464+ }
31465+ printk("\n");
31466+}
31467+
31468+void handle_BUG(struct pt_regs *regs)
31469+{
31470+ struct bug_frame f;
31471+ long len;
31472+ const char *prefix = "";
31473+
31474+ if (user_mode(regs))
31475+ return;
31476+ if (__copy_from_user(&f, (const void __user *) regs->rip,
31477+ sizeof(struct bug_frame)))
31478+ return;
31479+ if (f.filename >= 0 ||
31480+ f.ud2[0] != 0x0f || f.ud2[1] != 0x0b)
31481+ return;
31482+ len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1;
31483+ if (len < 0 || len >= PATH_MAX)
31484+ f.filename = (int)(long)"unmapped filename";
31485+ else if (len > 50) {
31486+ f.filename += len - 50;
31487+ prefix = "...";
31488+ }
31489+ printk("----------- [cut here ] --------- [please bite here ] ---------\n");
31490+ printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line);
31491+}
31492+
31493+#ifdef CONFIG_BUG
31494+void out_of_line_bug(void)
31495+{
31496+ BUG();
31497+}
31498+EXPORT_SYMBOL(out_of_line_bug);
31499+#endif
31500+
31501+static DEFINE_SPINLOCK(die_lock);
31502+static int die_owner = -1;
31503+static unsigned int die_nest_count;
31504+
31505+unsigned __kprobes long oops_begin(void)
31506+{
31507+ int cpu = safe_smp_processor_id();
31508+ unsigned long flags;
31509+
31510+ /* racy, but better than risking deadlock. */
31511+ local_irq_save(flags);
31512+ if (!spin_trylock(&die_lock)) {
31513+ if (cpu == die_owner)
31514+ /* nested oops. should stop eventually */;
31515+ else
31516+ spin_lock(&die_lock);
31517+ }
31518+ die_nest_count++;
31519+ die_owner = cpu;
31520+ console_verbose();
31521+ bust_spinlocks(1);
31522+ return flags;
31523+}
31524+
31525+void __kprobes oops_end(unsigned long flags)
31526+{
31527+ die_owner = -1;
31528+ bust_spinlocks(0);
31529+ die_nest_count--;
31530+ if (die_nest_count)
31531+ /* We still own the lock */
31532+ local_irq_restore(flags);
31533+ else
31534+ /* Nest count reaches zero, release the lock. */
31535+ spin_unlock_irqrestore(&die_lock, flags);
31536+ if (panic_on_oops)
31537+ panic("Fatal exception");
31538+}
31539+
31540+void __kprobes __die(const char * str, struct pt_regs * regs, long err)
31541+{
31542+ static int die_counter;
31543+ printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
31544+#ifdef CONFIG_PREEMPT
31545+ printk("PREEMPT ");
31546+#endif
31547+#ifdef CONFIG_SMP
31548+ printk("SMP ");
31549+#endif
31550+#ifdef CONFIG_DEBUG_PAGEALLOC
31551+ printk("DEBUG_PAGEALLOC");
31552+#endif
31553+ printk("\n");
31554+ notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
31555+ show_registers(regs);
31556+ /* Executive summary in case the oops scrolled away */
31557+ printk(KERN_ALERT "RIP ");
31558+ printk_address(regs->rip);
31559+ printk(" RSP <%016lx>\n", regs->rsp);
31560+ if (kexec_should_crash(current))
31561+ crash_kexec(regs);
31562+}
31563+
31564+void die(const char * str, struct pt_regs * regs, long err)
31565+{
31566+ unsigned long flags = oops_begin();
31567+
31568+ handle_BUG(regs);
31569+ __die(str, regs, err);
31570+ oops_end(flags);
31571+ do_exit(SIGSEGV);
31572+}
31573+
31574+#ifdef CONFIG_X86_LOCAL_APIC
31575+void __kprobes die_nmi(char *str, struct pt_regs *regs)
31576+{
31577+ unsigned long flags = oops_begin();
31578+
31579+ /*
31580+ * We are in trouble anyway, lets at least try
31581+ * to get a message out.
31582+ */
31583+ printk(str, safe_smp_processor_id());
31584+ show_registers(regs);
31585+ if (kexec_should_crash(current))
31586+ crash_kexec(regs);
31587+ if (panic_on_timeout || panic_on_oops)
31588+ panic("nmi watchdog");
31589+ printk("console shuts up ...\n");
31590+ oops_end(flags);
31591+ nmi_exit();
31592+ local_irq_enable();
31593+ do_exit(SIGSEGV);
31594+}
31595+#endif
31596+
31597+static void __kprobes do_trap(int trapnr, int signr, char *str,
31598+ struct pt_regs * regs, long error_code,
31599+ siginfo_t *info)
31600+{
31601+ struct task_struct *tsk = current;
31602+
31603+ tsk->thread.error_code = error_code;
31604+ tsk->thread.trap_no = trapnr;
31605+
31606+ if (user_mode(regs)) {
31607+ if (exception_trace && unhandled_signal(tsk, signr))
31608+ printk(KERN_INFO
31609+ "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
31610+ tsk->comm, tsk->pid, str,
31611+ regs->rip, regs->rsp, error_code);
31612+
31613+ if (info)
31614+ force_sig_info(signr, info, tsk);
31615+ else
31616+ force_sig(signr, tsk);
31617+ return;
31618+ }
31619+
31620+
31621+ /* kernel trap */
31622+ {
31623+ const struct exception_table_entry *fixup;
31624+ fixup = search_exception_tables(regs->rip);
31625+ if (fixup)
31626+ regs->rip = fixup->fixup;
31627+ else
31628+ die(str, regs, error_code);
31629+ return;
31630+ }
31631+}
31632+
31633+#define DO_ERROR(trapnr, signr, str, name) \
31634+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
31635+{ \
31636+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
31637+ == NOTIFY_STOP) \
31638+ return; \
31639+ conditional_sti(regs); \
31640+ do_trap(trapnr, signr, str, regs, error_code, NULL); \
31641+}
31642+
31643+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
31644+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
31645+{ \
31646+ siginfo_t info; \
31647+ info.si_signo = signr; \
31648+ info.si_errno = 0; \
31649+ info.si_code = sicode; \
31650+ info.si_addr = (void __user *)siaddr; \
31651+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
31652+ == NOTIFY_STOP) \
31653+ return; \
31654+ conditional_sti(regs); \
31655+ do_trap(trapnr, signr, str, regs, error_code, &info); \
31656+}
31657+
31658+DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
31659+DO_ERROR( 4, SIGSEGV, "overflow", overflow)
31660+DO_ERROR( 5, SIGSEGV, "bounds", bounds)
31661+DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
31662+DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
31663+DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
31664+DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
31665+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
31666+DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
31667+DO_ERROR(18, SIGSEGV, "reserved", reserved)
31668+
31669+/* Runs on IST stack */
31670+asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
31671+{
31672+ if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
31673+ 12, SIGBUS) == NOTIFY_STOP)
31674+ return;
31675+ preempt_conditional_sti(regs);
31676+ do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
31677+ preempt_conditional_cli(regs);
31678+}
31679+
31680+asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
31681+{
31682+ static const char str[] = "double fault";
31683+ struct task_struct *tsk = current;
31684+
31685+ /* Return not checked because double fault cannot be ignored */
31686+ notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
31687+
31688+ tsk->thread.error_code = error_code;
31689+ tsk->thread.trap_no = 8;
31690+
31691+ /* This is always a kernel trap and never fixable (and thus must
31692+ never return). */
31693+ for (;;)
31694+ die(str, regs, error_code);
31695+}
31696+
31697+asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
31698+ long error_code)
31699+{
31700+ struct task_struct *tsk = current;
31701+
31702+ conditional_sti(regs);
31703+
31704+ tsk->thread.error_code = error_code;
31705+ tsk->thread.trap_no = 13;
31706+
31707+ if (user_mode(regs)) {
31708+ if (exception_trace && unhandled_signal(tsk, SIGSEGV))
31709+ printk(KERN_INFO
31710+ "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
31711+ tsk->comm, tsk->pid,
31712+ regs->rip, regs->rsp, error_code);
31713+
31714+ force_sig(SIGSEGV, tsk);
31715+ return;
31716+ }
31717+
31718+ /* kernel gp */
31719+ {
31720+ const struct exception_table_entry *fixup;
31721+ fixup = search_exception_tables(regs->rip);
31722+ if (fixup) {
31723+ regs->rip = fixup->fixup;
31724+ return;
31725+ }
31726+ if (notify_die(DIE_GPF, "general protection fault", regs,
31727+ error_code, 13, SIGSEGV) == NOTIFY_STOP)
31728+ return;
31729+ die("general protection fault", regs, error_code);
31730+ }
31731+}
31732+
31733+static __kprobes void
31734+mem_parity_error(unsigned char reason, struct pt_regs * regs)
31735+{
31736+ printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
31737+ printk("You probably have a hardware problem with your RAM chips\n");
31738+
31739+#if 0 /* XEN */
31740+ /* Clear and disable the memory parity error line. */
31741+ reason = (reason & 0xf) | 4;
31742+ outb(reason, 0x61);
31743+#endif /* XEN */
31744+}
31745+
31746+static __kprobes void
31747+io_check_error(unsigned char reason, struct pt_regs * regs)
31748+{
31749+ printk("NMI: IOCK error (debug interrupt?)\n");
31750+ show_registers(regs);
31751+
31752+#if 0 /* XEN */
31753+ /* Re-enable the IOCK line, wait for a few seconds */
31754+ reason = (reason & 0xf) | 8;
31755+ outb(reason, 0x61);
31756+ mdelay(2000);
31757+ reason &= ~8;
31758+ outb(reason, 0x61);
31759+#endif /* XEN */
31760+}
31761+
31762+static __kprobes void
31763+unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
31764+{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
31765+ printk("Dazed and confused, but trying to continue\n");
31766+ printk("Do you have a strange power saving mode enabled?\n");
31767+}
31768+
31769+/* Runs on IST stack. This code must keep interrupts off all the time.
31770+ Nested NMIs are prevented by the CPU. */
31771+asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
31772+{
31773+ unsigned char reason = 0;
31774+ int cpu;
31775+
31776+ cpu = smp_processor_id();
31777+
31778+ /* Only the BSP gets external NMIs from the system. */
31779+ if (!cpu)
31780+ reason = get_nmi_reason();
31781+
31782+ if (!(reason & 0xc0)) {
31783+ if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
31784+ == NOTIFY_STOP)
31785+ return;
31786+#ifdef CONFIG_X86_LOCAL_APIC
31787+ /*
31788+ * Ok, so this is none of the documented NMI sources,
31789+ * so it must be the NMI watchdog.
31790+ */
31791+ if (nmi_watchdog > 0) {
31792+ nmi_watchdog_tick(regs,reason);
31793+ return;
31794+ }
31795+#endif
31796+ unknown_nmi_error(reason, regs);
31797+ return;
31798+ }
31799+ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
31800+ return;
31801+
31802+ /* AK: following checks seem to be broken on modern chipsets. FIXME */
31803+
31804+ if (reason & 0x80)
31805+ mem_parity_error(reason, regs);
31806+ if (reason & 0x40)
31807+ io_check_error(reason, regs);
31808+}
31809+
31810+/* runs on IST stack. */
31811+asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
31812+{
31813+ if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
31814+ return;
31815+ }
31816+ preempt_conditional_sti(regs);
31817+ do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
31818+ preempt_conditional_cli(regs);
31819+}
31820+
31821+/* Help handler running on IST stack to switch back to user stack
31822+ for scheduling or signal handling. The actual stack switch is done in
31823+ entry.S */
31824+asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
31825+{
31826+ struct pt_regs *regs = eregs;
31827+ /* Did already sync */
31828+ if (eregs == (struct pt_regs *)eregs->rsp)
31829+ ;
31830+ /* Exception from user space */
31831+ else if (user_mode(eregs))
31832+ regs = task_pt_regs(current);
31833+ /* Exception from kernel and interrupts are enabled. Move to
31834+ kernel process stack. */
31835+ else if (eregs->eflags & X86_EFLAGS_IF)
31836+ regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
31837+ if (eregs != regs)
31838+ *regs = *eregs;
31839+ return regs;
31840+}
31841+
31842+/* runs on IST stack. */
31843+asmlinkage void __kprobes do_debug(struct pt_regs * regs,
31844+ unsigned long error_code)
31845+{
31846+ unsigned long condition;
31847+ struct task_struct *tsk = current;
31848+ siginfo_t info;
31849+
31850+ get_debugreg(condition, 6);
31851+
31852+ if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
31853+ SIGTRAP) == NOTIFY_STOP)
31854+ return;
31855+
31856+ preempt_conditional_sti(regs);
31857+
31858+ /* Mask out spurious debug traps due to lazy DR7 setting */
31859+ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
31860+ if (!tsk->thread.debugreg7) {
31861+ goto clear_dr7;
31862+ }
31863+ }
31864+
31865+ tsk->thread.debugreg6 = condition;
31866+
31867+ /* Mask out spurious TF errors due to lazy TF clearing */
31868+ if (condition & DR_STEP) {
31869+ /*
31870+ * The TF error should be masked out only if the current
31871+ * process is not traced and if the TRAP flag has been set
31872+ * previously by a tracing process (condition detected by
31873+ * the PT_DTRACE flag); remember that the i386 TRAP flag
31874+ * can be modified by the process itself in user mode,
31875+ * allowing programs to debug themselves without the ptrace()
31876+ * interface.
31877+ */
31878+ if (!user_mode(regs))
31879+ goto clear_TF_reenable;
31880+ /*
31881+ * Was the TF flag set by a debugger? If so, clear it now,
31882+ * so that register information is correct.
31883+ */
31884+ if (tsk->ptrace & PT_DTRACE) {
31885+ regs->eflags &= ~TF_MASK;
31886+ tsk->ptrace &= ~PT_DTRACE;
31887+ }
31888+ }
31889+
31890+ /* Ok, finally something we can handle */
31891+ tsk->thread.trap_no = 1;
31892+ tsk->thread.error_code = error_code;
31893+ info.si_signo = SIGTRAP;
31894+ info.si_errno = 0;
31895+ info.si_code = TRAP_BRKPT;
31896+ info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
31897+ force_sig_info(SIGTRAP, &info, tsk);
31898+
31899+clear_dr7:
31900+ set_debugreg(0UL, 7);
31901+ preempt_conditional_cli(regs);
31902+ return;
31903+
31904+clear_TF_reenable:
31905+ set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
31906+ regs->eflags &= ~TF_MASK;
31907+ preempt_conditional_cli(regs);
31908+}
31909+
31910+static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
31911+{
31912+ const struct exception_table_entry *fixup;
31913+ fixup = search_exception_tables(regs->rip);
31914+ if (fixup) {
31915+ regs->rip = fixup->fixup;
31916+ return 1;
31917+ }
31918+ notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
31919+ /* Illegal floating point operation in the kernel */
31920+ current->thread.trap_no = trapnr;
31921+ die(str, regs, 0);
31922+ return 0;
31923+}
31924+
31925+/*
31926+ * Note that we play around with the 'TS' bit in an attempt to get
31927+ * the correct behaviour even in the presence of the asynchronous
31928+ * IRQ13 behaviour
31929+ */
31930+asmlinkage void do_coprocessor_error(struct pt_regs *regs)
31931+{
31932+ void __user *rip = (void __user *)(regs->rip);
31933+ struct task_struct * task;
31934+ siginfo_t info;
31935+ unsigned short cwd, swd;
31936+
31937+ conditional_sti(regs);
31938+ if (!user_mode(regs) &&
31939+ kernel_math_error(regs, "kernel x87 math error", 16))
31940+ return;
31941+
31942+ /*
31943+ * Save the info for the exception handler and clear the error.
31944+ */
31945+ task = current;
31946+ save_init_fpu(task);
31947+ task->thread.trap_no = 16;
31948+ task->thread.error_code = 0;
31949+ info.si_signo = SIGFPE;
31950+ info.si_errno = 0;
31951+ info.si_code = __SI_FAULT;
31952+ info.si_addr = rip;
31953+ /*
31954+ * (~cwd & swd) will mask out exceptions that are not set to unmasked
31955+ * status. 0x3f is the exception bits in these regs, 0x200 is the
31956+ * C1 reg you need in case of a stack fault, 0x040 is the stack
31957+ * fault bit. We should only be taking one exception at a time,
31958+ * so if this combination doesn't produce any single exception,
31959+ * then we have a bad program that isn't synchronizing its FPU usage
31960+ * and it will suffer the consequences since we won't be able to
31961+ * fully reproduce the context of the exception
31962+ */
31963+ cwd = get_fpu_cwd(task);
31964+ swd = get_fpu_swd(task);
31965+ switch (swd & ~cwd & 0x3f) {
31966+ case 0x000:
31967+ default:
31968+ break;
31969+ case 0x001: /* Invalid Op */
31970+ /*
31971+ * swd & 0x240 == 0x040: Stack Underflow
31972+ * swd & 0x240 == 0x240: Stack Overflow
31973+ * User must clear the SF bit (0x40) if set
31974+ */
31975+ info.si_code = FPE_FLTINV;
31976+ break;
31977+ case 0x002: /* Denormalize */
31978+ case 0x010: /* Underflow */
31979+ info.si_code = FPE_FLTUND;
31980+ break;
31981+ case 0x004: /* Zero Divide */
31982+ info.si_code = FPE_FLTDIV;
31983+ break;
31984+ case 0x008: /* Overflow */
31985+ info.si_code = FPE_FLTOVF;
31986+ break;
31987+ case 0x020: /* Precision */
31988+ info.si_code = FPE_FLTRES;
31989+ break;
31990+ }
31991+ force_sig_info(SIGFPE, &info, task);
31992+}
31993+
31994+asmlinkage void bad_intr(void)
31995+{
31996+ printk("bad interrupt");
31997+}
31998+
31999+asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
32000+{
32001+ void __user *rip = (void __user *)(regs->rip);
32002+ struct task_struct * task;
32003+ siginfo_t info;
32004+ unsigned short mxcsr;
32005+
32006+ conditional_sti(regs);
32007+ if (!user_mode(regs) &&
32008+ kernel_math_error(regs, "kernel simd math error", 19))
32009+ return;
32010+
32011+ /*
32012+ * Save the info for the exception handler and clear the error.
32013+ */
32014+ task = current;
32015+ save_init_fpu(task);
32016+ task->thread.trap_no = 19;
32017+ task->thread.error_code = 0;
32018+ info.si_signo = SIGFPE;
32019+ info.si_errno = 0;
32020+ info.si_code = __SI_FAULT;
32021+ info.si_addr = rip;
32022+ /*
32023+ * The SIMD FPU exceptions are handled a little differently, as there
32024+ * is only a single status/control register. Thus, to determine which
32025+ * unmasked exception was caught we must mask the exception mask bits
32026+ * at 0x1f80, and then use these to mask the exception bits at 0x3f.
32027+ */
32028+ mxcsr = get_fpu_mxcsr(task);
32029+ switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
32030+ case 0x000:
32031+ default:
32032+ break;
32033+ case 0x001: /* Invalid Op */
32034+ info.si_code = FPE_FLTINV;
32035+ break;
32036+ case 0x002: /* Denormalize */
32037+ case 0x010: /* Underflow */
32038+ info.si_code = FPE_FLTUND;
32039+ break;
32040+ case 0x004: /* Zero Divide */
32041+ info.si_code = FPE_FLTDIV;
32042+ break;
32043+ case 0x008: /* Overflow */
32044+ info.si_code = FPE_FLTOVF;
32045+ break;
32046+ case 0x020: /* Precision */
32047+ info.si_code = FPE_FLTRES;
32048+ break;
32049+ }
32050+ force_sig_info(SIGFPE, &info, task);
32051+}
32052+
32053+asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
32054+{
32055+}
32056+
32057+#if 0
32058+asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
32059+{
32060+}
32061+#endif
32062+
32063+asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
32064+{
32065+}
32066+
32067+/*
32068+ * 'math_state_restore()' saves the current math information in the
32069+ * old math state array, and gets the new ones from the current task
32070+ *
32071+ * Careful.. There are problems with IBM-designed IRQ13 behaviour.
32072+ * Don't touch unless you *really* know how it works.
32073+ */
32074+asmlinkage void math_state_restore(void)
32075+{
32076+ struct task_struct *me = current;
32077+ /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
32078+
32079+ if (!used_math())
32080+ init_fpu(me);
32081+ restore_fpu_checking(&me->thread.i387.fxsave);
32082+ task_thread_info(me)->status |= TS_USEDFPU;
32083+}
32084+
32085+
32086+/*
32087+ * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
32088+ * specify <dpl>|4 in the second field.
32089+ */
32090+static trap_info_t __cpuinitdata trap_table[] = {
32091+ { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error },
32092+ { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
32093+ { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
32094+ { 4, 3|4, __KERNEL_CS, (unsigned long)overflow },
32095+ { 5, 0|4, __KERNEL_CS, (unsigned long)bounds },
32096+ { 6, 0|4, __KERNEL_CS, (unsigned long)invalid_op },
32097+ { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available },
32098+ { 9, 0|4, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun},
32099+ { 10, 0|4, __KERNEL_CS, (unsigned long)invalid_TSS },
32100+ { 11, 0|4, __KERNEL_CS, (unsigned long)segment_not_present },
32101+ { 12, 0|4, __KERNEL_CS, (unsigned long)stack_segment },
32102+ { 13, 0|4, __KERNEL_CS, (unsigned long)general_protection },
32103+ { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault },
32104+ { 15, 0|4, __KERNEL_CS, (unsigned long)spurious_interrupt_bug },
32105+ { 16, 0|4, __KERNEL_CS, (unsigned long)coprocessor_error },
32106+ { 17, 0|4, __KERNEL_CS, (unsigned long)alignment_check },
32107+#ifdef CONFIG_X86_MCE
32108+ { 18, 0|4, __KERNEL_CS, (unsigned long)machine_check },
32109+#endif
32110+ { 19, 0|4, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
32111+#ifdef CONFIG_IA32_EMULATION
32112+ { IA32_SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)ia32_syscall},
32113+#endif
32114+ { 0, 0, 0, 0 }
32115+};
32116+
32117+void __init trap_init(void)
32118+{
32119+ int ret;
32120+
32121+ ret = HYPERVISOR_set_trap_table(trap_table);
32122+ if (ret)
32123+ printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
32124+
32125+ /*
32126+ * Should be a barrier for any external CPU state.
32127+ */
32128+ cpu_init();
32129+}
32130+
32131+void __cpuinit smp_trap_init(trap_info_t *trap_ctxt)
32132+{
32133+ const trap_info_t *t = trap_table;
32134+
32135+ for (t = trap_table; t->address; t++) {
32136+ trap_ctxt[t->vector].flags = t->flags;
32137+ trap_ctxt[t->vector].cs = t->cs;
32138+ trap_ctxt[t->vector].address = t->address;
32139+ }
32140+}
32141+
32142+
32143+/* Actual parsing is done early in setup.c. */
32144+static int __init oops_dummy(char *s)
32145+{
32146+ panic_on_oops = 1;
32147+ return 1;
32148+}
32149+__setup("oops=", oops_dummy);
32150+
32151+static int __init kstack_setup(char *s)
32152+{
32153+ kstack_depth_to_print = simple_strtoul(s,NULL,0);
32154+ return 1;
32155+}
32156+__setup("kstack=", kstack_setup);
32157+
32158+#ifdef CONFIG_STACK_UNWIND
32159+static int __init call_trace_setup(char *s)
32160+{
32161+ if (strcmp(s, "old") == 0)
32162+ call_trace = -1;
32163+ else if (strcmp(s, "both") == 0)
32164+ call_trace = 0;
32165+ else if (strcmp(s, "newfallback") == 0)
32166+ call_trace = 1;
32167+ else if (strcmp(s, "new") == 0)
32168+ call_trace = 2;
32169+ return 1;
32170+}
32171+__setup("call_trace=", call_trace_setup);
32172+#endif
32173Index: head-2008-11-25/arch/x86/kernel/vsyscall_64-xen.c
32174===================================================================
32175--- /dev/null 1970-01-01 00:00:00.000000000 +0000
32176+++ head-2008-11-25/arch/x86/kernel/vsyscall_64-xen.c 2007-06-18 08:38:13.000000000 +0200
32177@@ -0,0 +1,227 @@
32178+/*
32179+ * linux/arch/x86_64/kernel/vsyscall.c
32180+ *
32181+ * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
32182+ * Copyright 2003 Andi Kleen, SuSE Labs.
32183+ *
32184+ * Thanks to hpa@transmeta.com for some useful hint.
32185+ * Special thanks to Ingo Molnar for his early experience with
32186+ * a different vsyscall implementation for Linux/IA32 and for the name.
32187+ *
32188+ * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
32189+ * at virtual address -10Mbyte+1024bytes etc... There are at max 4
32190+ * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
32191+ * jumping out of line if necessary. We cannot add more with this
32192+ * mechanism because older kernels won't return -ENOSYS.
32193+ * If we want more than four we need a vDSO.
32194+ *
32195+ * Note: the concept clashes with user mode linux. If you use UML and
32196+ * want per guest time just set the kernel.vsyscall64 sysctl to 0.
32197+ */
32198+
32199+#include <linux/time.h>
32200+#include <linux/init.h>
32201+#include <linux/kernel.h>
32202+#include <linux/timer.h>
32203+#include <linux/seqlock.h>
32204+#include <linux/jiffies.h>
32205+#include <linux/sysctl.h>
32206+
32207+#include <asm/vsyscall.h>
32208+#include <asm/pgtable.h>
32209+#include <asm/page.h>
32210+#include <asm/fixmap.h>
32211+#include <asm/errno.h>
32212+#include <asm/io.h>
32213+
32214+#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
32215+
32216+int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
32217+seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
32218+
32219+#include <asm/unistd.h>
32220+
32221+static __always_inline void timeval_normalize(struct timeval * tv)
32222+{
32223+ time_t __sec;
32224+
32225+ __sec = tv->tv_usec / 1000000;
32226+ if (__sec) {
32227+ tv->tv_usec %= 1000000;
32228+ tv->tv_sec += __sec;
32229+ }
32230+}
32231+
32232+static __always_inline void do_vgettimeofday(struct timeval * tv)
32233+{
32234+ long sequence, t;
32235+ unsigned long sec, usec;
32236+
32237+ do {
32238+ sequence = read_seqbegin(&__xtime_lock);
32239+
32240+ sec = __xtime.tv_sec;
32241+ usec = (__xtime.tv_nsec / 1000) +
32242+ (__jiffies - __wall_jiffies) * (1000000 / HZ);
32243+
32244+ if (__vxtime.mode != VXTIME_HPET) {
32245+ t = get_cycles_sync();
32246+ if (t < __vxtime.last_tsc)
32247+ t = __vxtime.last_tsc;
32248+ usec += ((t - __vxtime.last_tsc) *
32249+ __vxtime.tsc_quot) >> 32;
32250+ /* See comment in x86_64 do_gettimeofday. */
32251+ } else {
32252+ usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
32253+ __vxtime.last) * __vxtime.quot) >> 32;
32254+ }
32255+ } while (read_seqretry(&__xtime_lock, sequence));
32256+
32257+ tv->tv_sec = sec + usec / 1000000;
32258+ tv->tv_usec = usec % 1000000;
32259+}
32260+
32261+/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
32262+static __always_inline void do_get_tz(struct timezone * tz)
32263+{
32264+ *tz = __sys_tz;
32265+}
32266+
32267+static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
32268+{
32269+ int ret;
32270+ asm volatile("vsysc2: syscall"
32271+ : "=a" (ret)
32272+ : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
32273+ return ret;
32274+}
32275+
32276+static __always_inline long time_syscall(long *t)
32277+{
32278+ long secs;
32279+ asm volatile("vsysc1: syscall"
32280+ : "=a" (secs)
32281+ : "0" (__NR_time),"D" (t) : __syscall_clobber);
32282+ return secs;
32283+}
32284+
32285+int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
32286+{
32287+ if (!__sysctl_vsyscall)
32288+ return gettimeofday(tv,tz);
32289+ if (tv)
32290+ do_vgettimeofday(tv);
32291+ if (tz)
32292+ do_get_tz(tz);
32293+ return 0;
32294+}
32295+
32296+/* This will break when the xtime seconds get inaccurate, but that is
32297+ * unlikely */
32298+time_t __vsyscall(1) vtime(time_t *t)
32299+{
32300+ if (!__sysctl_vsyscall)
32301+ return time_syscall(t);
32302+ else if (t)
32303+ *t = __xtime.tv_sec;
32304+ return __xtime.tv_sec;
32305+}
32306+
32307+long __vsyscall(2) venosys_0(void)
32308+{
32309+ return -ENOSYS;
32310+}
32311+
32312+long __vsyscall(3) venosys_1(void)
32313+{
32314+ return -ENOSYS;
32315+}
32316+
32317+#ifdef CONFIG_SYSCTL
32318+
32319+#define SYSCALL 0x050f
32320+#define NOP2 0x9090
32321+
32322+/*
32323+ * NOP out syscall in vsyscall page when not needed.
32324+ */
32325+static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
32326+ void __user *buffer, size_t *lenp, loff_t *ppos)
32327+{
32328+ extern u16 vsysc1, vsysc2;
32329+ u16 *map1, *map2;
32330+ int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
32331+ if (!write)
32332+ return ret;
32333+ /* gcc has some trouble with __va(__pa()), so just do it this
32334+ way. */
32335+ map1 = ioremap(__pa_symbol(&vsysc1), 2);
32336+ if (!map1)
32337+ return -ENOMEM;
32338+ map2 = ioremap(__pa_symbol(&vsysc2), 2);
32339+ if (!map2) {
32340+ ret = -ENOMEM;
32341+ goto out;
32342+ }
32343+ if (!sysctl_vsyscall) {
32344+ *map1 = SYSCALL;
32345+ *map2 = SYSCALL;
32346+ } else {
32347+ *map1 = NOP2;
32348+ *map2 = NOP2;
32349+ }
32350+ iounmap(map2);
32351+out:
32352+ iounmap(map1);
32353+ return ret;
32354+}
32355+
32356+static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
32357+ void __user *oldval, size_t __user *oldlenp,
32358+ void __user *newval, size_t newlen,
32359+ void **context)
32360+{
32361+ return -ENOSYS;
32362+}
32363+
32364+static ctl_table kernel_table2[] = {
32365+ { .ctl_name = 99, .procname = "vsyscall64",
32366+ .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
32367+ .strategy = vsyscall_sysctl_nostrat,
32368+ .proc_handler = vsyscall_sysctl_change },
32369+ { 0, }
32370+};
32371+
32372+static ctl_table kernel_root_table2[] = {
32373+ { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
32374+ .child = kernel_table2 },
32375+ { 0 },
32376+};
32377+
32378+#endif
32379+
32380+static void __init map_vsyscall(void)
32381+{
32382+ extern char __vsyscall_0;
32383+ unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
32384+
32385+ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
32386+}
32387+
32388+static int __init vsyscall_init(void)
32389+{
32390+ BUG_ON(((unsigned long) &vgettimeofday !=
32391+ VSYSCALL_ADDR(__NR_vgettimeofday)));
32392+ BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
32393+ BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
32394+ map_vsyscall();
32395+#ifdef CONFIG_XEN
32396+ sysctl_vsyscall = 0; /* disable vgettimeofday() */
32397+#endif
32398+#ifdef CONFIG_SYSCTL
32399+ register_sysctl_table(kernel_root_table2, 0);
32400+#endif
32401+ return 0;
32402+}
32403+
32404+__initcall(vsyscall_init);
32405Index: head-2008-11-25/arch/x86/kernel/xen_entry_64.S
32406===================================================================
32407--- /dev/null 1970-01-01 00:00:00.000000000 +0000
32408+++ head-2008-11-25/arch/x86/kernel/xen_entry_64.S 2008-04-02 12:34:02.000000000 +0200
32409@@ -0,0 +1,36 @@
32410+/*
32411+ * Copied from arch/xen/i386/kernel/entry.S
32412+ */
32413+/* Offsets into shared_info_t. */
32414+#define evtchn_upcall_pending /* 0 */
32415+#define evtchn_upcall_mask 1
32416+
32417+#define sizeof_vcpu_shift 6
32418+
32419+#ifdef CONFIG_SMP
32420+//#define preempt_disable(reg) incl threadinfo_preempt_count(reg)
32421+//#define preempt_enable(reg) decl threadinfo_preempt_count(reg)
32422+#define preempt_disable(reg)
32423+#define preempt_enable(reg)
32424+#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \
32425+ movq %gs:pda_cpunumber,reg ; \
32426+ shl $32, reg ; \
32427+ shr $32-sizeof_vcpu_shift,reg ; \
32428+ addq HYPERVISOR_shared_info,reg
32429+#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \
32430+#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
32431+#else
32432+#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
32433+#define XEN_PUT_VCPU_INFO(reg)
32434+#define XEN_PUT_VCPU_INFO_fixup
32435+#endif
32436+
32437+#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg)
32438+#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
32439+#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
32440+ XEN_LOCKED_BLOCK_EVENTS(reg) ; \
32441+ XEN_PUT_VCPU_INFO(reg)
32442+#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
32443+ XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \
32444+ XEN_PUT_VCPU_INFO(reg)
32445+#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg)
32446Index: head-2008-11-25/arch/x86/mm/fault_64-xen.c
32447===================================================================
32448--- /dev/null 1970-01-01 00:00:00.000000000 +0000
32449+++ head-2008-11-25/arch/x86/mm/fault_64-xen.c 2007-11-02 17:34:23.000000000 +0100
32450@@ -0,0 +1,724 @@
32451+/*
32452+ * linux/arch/x86-64/mm/fault.c
32453+ *
32454+ * Copyright (C) 1995 Linus Torvalds
32455+ * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
32456+ */
32457+
32458+#include <linux/signal.h>
32459+#include <linux/sched.h>
32460+#include <linux/kernel.h>
32461+#include <linux/errno.h>
32462+#include <linux/string.h>
32463+#include <linux/types.h>
32464+#include <linux/ptrace.h>
32465+#include <linux/mman.h>
32466+#include <linux/mm.h>
32467+#include <linux/smp.h>
32468+#include <linux/smp_lock.h>
32469+#include <linux/interrupt.h>
32470+#include <linux/init.h>
32471+#include <linux/tty.h>
32472+#include <linux/vt_kern.h> /* For unblank_screen() */
32473+#include <linux/compiler.h>
32474+#include <linux/module.h>
32475+#include <linux/kprobes.h>
32476+
32477+#include <asm/system.h>
32478+#include <asm/uaccess.h>
32479+#include <asm/pgalloc.h>
32480+#include <asm/smp.h>
32481+#include <asm/tlbflush.h>
32482+#include <asm/proto.h>
32483+#include <asm/kdebug.h>
32484+#include <asm-generic/sections.h>
32485+
32486+/* Page fault error code bits */
32487+#define PF_PROT (1<<0) /* or no page found */
32488+#define PF_WRITE (1<<1)
32489+#define PF_USER (1<<2)
32490+#define PF_RSVD (1<<3)
32491+#define PF_INSTR (1<<4)
32492+
32493+#ifdef CONFIG_KPROBES
32494+ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
32495+
32496+/* Hook to register for page fault notifications */
32497+int register_page_fault_notifier(struct notifier_block *nb)
32498+{
32499+ vmalloc_sync_all();
32500+ return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
32501+}
32502+
32503+int unregister_page_fault_notifier(struct notifier_block *nb)
32504+{
32505+ return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
32506+}
32507+
32508+static inline int notify_page_fault(enum die_val val, const char *str,
32509+ struct pt_regs *regs, long err, int trap, int sig)
32510+{
32511+ struct die_args args = {
32512+ .regs = regs,
32513+ .str = str,
32514+ .err = err,
32515+ .trapnr = trap,
32516+ .signr = sig
32517+ };
32518+ return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
32519+}
32520+#else
32521+static inline int notify_page_fault(enum die_val val, const char *str,
32522+ struct pt_regs *regs, long err, int trap, int sig)
32523+{
32524+ return NOTIFY_DONE;
32525+}
32526+#endif
32527+
32528+void bust_spinlocks(int yes)
32529+{
32530+ int loglevel_save = console_loglevel;
32531+ if (yes) {
32532+ oops_in_progress = 1;
32533+ } else {
32534+#ifdef CONFIG_VT
32535+ unblank_screen();
32536+#endif
32537+ oops_in_progress = 0;
32538+ /*
32539+ * OK, the message is on the console. Now we call printk()
32540+ * without oops_in_progress set so that printk will give klogd
32541+ * a poke. Hold onto your hats...
32542+ */
32543+ console_loglevel = 15; /* NMI oopser may have shut the console up */
32544+ printk(" ");
32545+ console_loglevel = loglevel_save;
32546+ }
32547+}
32548+
32549+/* Sometimes the CPU reports invalid exceptions on prefetch.
32550+ Check that here and ignore.
32551+ Opcode checker based on code by Richard Brunner */
32552+static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
32553+ unsigned long error_code)
32554+{
32555+ unsigned char *instr;
32556+ int scan_more = 1;
32557+ int prefetch = 0;
32558+ unsigned char *max_instr;
32559+
32560+ /* If it was an exec fault, ignore it */
32561+ if (error_code & PF_INSTR)
32562+ return 0;
32563+
32564+ instr = (unsigned char *)convert_rip_to_linear(current, regs);
32565+ max_instr = instr + 15;
32566+
32567+ if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
32568+ return 0;
32569+
32570+ while (scan_more && instr < max_instr) {
32571+ unsigned char opcode;
32572+ unsigned char instr_hi;
32573+ unsigned char instr_lo;
32574+
32575+ if (__get_user(opcode, instr))
32576+ break;
32577+
32578+ instr_hi = opcode & 0xf0;
32579+ instr_lo = opcode & 0x0f;
32580+ instr++;
32581+
32582+ switch (instr_hi) {
32583+ case 0x20:
32584+ case 0x30:
32585+ /* Values 0x26,0x2E,0x36,0x3E are valid x86
32586+ prefixes. In long mode, the CPU will signal
32587+ invalid opcode if some of these prefixes are
32588+ present so we will never get here anyway */
32589+ scan_more = ((instr_lo & 7) == 0x6);
32590+ break;
32591+
32592+ case 0x40:
32593+ /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
32594+ Need to figure out under what instruction mode the
32595+ instruction was issued ... */
32596+ /* Could check the LDT for lm, but for now it's good
32597+ enough to assume that long mode only uses well known
32598+ segments or kernel. */
32599+ scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
32600+ break;
32601+
32602+ case 0x60:
32603+ /* 0x64 thru 0x67 are valid prefixes in all modes. */
32604+ scan_more = (instr_lo & 0xC) == 0x4;
32605+ break;
32606+ case 0xF0:
32607+ /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
32608+ scan_more = !instr_lo || (instr_lo>>1) == 1;
32609+ break;
32610+ case 0x00:
32611+ /* Prefetch instruction is 0x0F0D or 0x0F18 */
32612+ scan_more = 0;
32613+ if (__get_user(opcode, instr))
32614+ break;
32615+ prefetch = (instr_lo == 0xF) &&
32616+ (opcode == 0x0D || opcode == 0x18);
32617+ break;
32618+ default:
32619+ scan_more = 0;
32620+ break;
32621+ }
32622+ }
32623+ return prefetch;
32624+}
32625+
32626+static int bad_address(void *p)
32627+{
32628+ unsigned long dummy;
32629+ return __get_user(dummy, (unsigned long *)p);
32630+}
32631+
32632+void dump_pagetable(unsigned long address)
32633+{
32634+ pgd_t *pgd;
32635+ pud_t *pud;
32636+ pmd_t *pmd;
32637+ pte_t *pte;
32638+
32639+ pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
32640+ pgd += pgd_index(address);
32641+ if (bad_address(pgd)) goto bad;
32642+ printk("PGD %lx ", pgd_val(*pgd));
32643+ if (!pgd_present(*pgd)) goto ret;
32644+
32645+ pud = pud_offset(pgd, address);
32646+ if (bad_address(pud)) goto bad;
32647+ printk("PUD %lx ", pud_val(*pud));
32648+ if (!pud_present(*pud)) goto ret;
32649+
32650+ pmd = pmd_offset(pud, address);
32651+ if (bad_address(pmd)) goto bad;
32652+ printk("PMD %lx ", pmd_val(*pmd));
32653+ if (!pmd_present(*pmd)) goto ret;
32654+
32655+ pte = pte_offset_kernel(pmd, address);
32656+ if (bad_address(pte)) goto bad;
32657+ printk("PTE %lx", pte_val(*pte));
32658+ret:
32659+ printk("\n");
32660+ return;
32661+bad:
32662+ printk("BAD\n");
32663+}
32664+
32665+static const char errata93_warning[] =
32666+KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
32667+KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
32668+KERN_ERR "******* Please consider a BIOS update.\n"
32669+KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
32670+
32671+/* Workaround for K8 erratum #93 & buggy BIOS.
32672+ BIOS SMM functions are required to use a specific workaround
32673+ to avoid corruption of the 64bit RIP register on C stepping K8.
32674+ A lot of BIOS that didn't get tested properly miss this.
32675+ The OS sees this as a page fault with the upper 32bits of RIP cleared.
32676+ Try to work around it here.
32677+ Note we only handle faults in kernel here. */
32678+
32679+static int is_errata93(struct pt_regs *regs, unsigned long address)
32680+{
32681+ static int warned;
32682+ if (address != regs->rip)
32683+ return 0;
32684+ if ((address >> 32) != 0)
32685+ return 0;
32686+ address |= 0xffffffffUL << 32;
32687+ if ((address >= (u64)_stext && address <= (u64)_etext) ||
32688+ (address >= MODULES_VADDR && address <= MODULES_END)) {
32689+ if (!warned) {
32690+ printk(errata93_warning);
32691+ warned = 1;
32692+ }
32693+ regs->rip = address;
32694+ return 1;
32695+ }
32696+ return 0;
32697+}
32698+
32699+int unhandled_signal(struct task_struct *tsk, int sig)
32700+{
32701+ if (tsk->pid == 1)
32702+ return 1;
32703+ if (tsk->ptrace & PT_PTRACED)
32704+ return 0;
32705+ return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
32706+ (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
32707+}
32708+
32709+static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
32710+ unsigned long error_code)
32711+{
32712+ unsigned long flags = oops_begin();
32713+ struct task_struct *tsk;
32714+
32715+ printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
32716+ current->comm, address);
32717+ dump_pagetable(address);
32718+ tsk = current;
32719+ tsk->thread.cr2 = address;
32720+ tsk->thread.trap_no = 14;
32721+ tsk->thread.error_code = error_code;
32722+ __die("Bad pagetable", regs, error_code);
32723+ oops_end(flags);
32724+ do_exit(SIGKILL);
32725+}
32726+
32727+/*
32728+ * Handle a fault on the vmalloc area
32729+ *
32730+ * This assumes no large pages in there.
32731+ */
32732+static int vmalloc_fault(unsigned long address)
32733+{
32734+ pgd_t *pgd, *pgd_ref;
32735+ pud_t *pud, *pud_ref;
32736+ pmd_t *pmd, *pmd_ref;
32737+ pte_t *pte, *pte_ref;
32738+
32739+ /* Copy kernel mappings over when needed. This can also
32740+ happen within a race in page table update. In the latter
32741+ case just flush. */
32742+
32743+ /* On Xen the line below does not always work. Needs investigating! */
32744+ /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
32745+ pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
32746+ pgd += pgd_index(address);
32747+ pgd_ref = pgd_offset_k(address);
32748+ if (pgd_none(*pgd_ref))
32749+ return -1;
32750+ if (pgd_none(*pgd))
32751+ set_pgd(pgd, *pgd_ref);
32752+ else
32753+ BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
32754+
32755+ /* Below here mismatches are bugs because these lower tables
32756+ are shared */
32757+
32758+ pud = pud_offset(pgd, address);
32759+ pud_ref = pud_offset(pgd_ref, address);
32760+ if (pud_none(*pud_ref))
32761+ return -1;
32762+ if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
32763+ BUG();
32764+ pmd = pmd_offset(pud, address);
32765+ pmd_ref = pmd_offset(pud_ref, address);
32766+ if (pmd_none(*pmd_ref))
32767+ return -1;
32768+ if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
32769+ BUG();
32770+ pte_ref = pte_offset_kernel(pmd_ref, address);
32771+ if (!pte_present(*pte_ref))
32772+ return -1;
32773+ pte = pte_offset_kernel(pmd, address);
32774+ /* Don't use pte_page here, because the mappings can point
32775+ outside mem_map, and the NUMA hash lookup cannot handle
32776+ that. */
32777+ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
32778+ BUG();
32779+ return 0;
32780+}
32781+
32782+int page_fault_trace = 0;
32783+int exception_trace = 1;
32784+
32785+
32786+#define MEM_VERBOSE 1
32787+
32788+#ifdef MEM_VERBOSE
32789+#define MEM_LOG(_f, _a...) \
32790+ printk("fault.c:[%d]-> " _f "\n", \
32791+ __LINE__ , ## _a )
32792+#else
32793+#define MEM_LOG(_f, _a...) ((void)0)
32794+#endif
32795+
32796+static int spurious_fault(struct pt_regs *regs,
32797+ unsigned long address,
32798+ unsigned long error_code)
32799+{
32800+ pgd_t *pgd;
32801+ pud_t *pud;
32802+ pmd_t *pmd;
32803+ pte_t *pte;
32804+
32805+#ifdef CONFIG_XEN
32806+ /* Faults in hypervisor area are never spurious. */
32807+ if ((address >= HYPERVISOR_VIRT_START) &&
32808+ (address < HYPERVISOR_VIRT_END))
32809+ return 0;
32810+#endif
32811+
32812+ /* Reserved-bit violation or user access to kernel space? */
32813+ if (error_code & (PF_RSVD|PF_USER))
32814+ return 0;
32815+
32816+ pgd = init_mm.pgd + pgd_index(address);
32817+ if (!pgd_present(*pgd))
32818+ return 0;
32819+
32820+ pud = pud_offset(pgd, address);
32821+ if (!pud_present(*pud))
32822+ return 0;
32823+
32824+ pmd = pmd_offset(pud, address);
32825+ if (!pmd_present(*pmd))
32826+ return 0;
32827+
32828+ pte = pte_offset_kernel(pmd, address);
32829+ if (!pte_present(*pte))
32830+ return 0;
32831+ if ((error_code & PF_WRITE) && !pte_write(*pte))
32832+ return 0;
32833+ if ((error_code & PF_INSTR) && (__pte_val(*pte) & _PAGE_NX))
32834+ return 0;
32835+
32836+ return 1;
32837+}
32838+
32839+/*
32840+ * This routine handles page faults. It determines the address,
32841+ * and the problem, and then passes it off to one of the appropriate
32842+ * routines.
32843+ */
32844+asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
32845+ unsigned long error_code)
32846+{
32847+ struct task_struct *tsk;
32848+ struct mm_struct *mm;
32849+ struct vm_area_struct * vma;
32850+ unsigned long address;
32851+ const struct exception_table_entry *fixup;
32852+ int write;
32853+ unsigned long flags;
32854+ siginfo_t info;
32855+
32856+ if (!user_mode(regs))
32857+ error_code &= ~PF_USER; /* means kernel */
32858+
32859+ tsk = current;
32860+ mm = tsk->mm;
32861+ prefetchw(&mm->mmap_sem);
32862+
32863+ /* get the address */
32864+ address = current_vcpu_info()->arch.cr2;
32865+
32866+ info.si_code = SEGV_MAPERR;
32867+
32868+
32869+ /*
32870+ * We fault-in kernel-space virtual memory on-demand. The
32871+ * 'reference' page table is init_mm.pgd.
32872+ *
32873+ * NOTE! We MUST NOT take any locks for this case. We may
32874+ * be in an interrupt or a critical region, and should
32875+ * only copy the information from the master page table,
32876+ * nothing more.
32877+ *
32878+ * This verifies that the fault happens in kernel space
32879+ * (error_code & 4) == 0, and that the fault was not a
32880+ * protection error (error_code & 9) == 0.
32881+ */
32882+ if (unlikely(address >= TASK_SIZE64)) {
32883+ /*
32884+ * Don't check for the module range here: its PML4
32885+ * is always initialized because it's shared with the main
32886+ * kernel text. Only vmalloc may need PML4 syncups.
32887+ */
32888+ if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
32889+ ((address >= VMALLOC_START && address < VMALLOC_END))) {
32890+ if (vmalloc_fault(address) >= 0)
32891+ return;
32892+ }
32893+ /* Can take a spurious fault if mapping changes R/O -> R/W. */
32894+ if (spurious_fault(regs, address, error_code))
32895+ return;
32896+ if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
32897+ SIGSEGV) == NOTIFY_STOP)
32898+ return;
32899+ /*
32900+ * Don't take the mm semaphore here. If we fixup a prefetch
32901+ * fault we could otherwise deadlock.
32902+ */
32903+ goto bad_area_nosemaphore;
32904+ }
32905+
32906+ if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
32907+ SIGSEGV) == NOTIFY_STOP)
32908+ return;
32909+
32910+ if (likely(regs->eflags & X86_EFLAGS_IF))
32911+ local_irq_enable();
32912+
32913+ if (unlikely(page_fault_trace))
32914+ printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
32915+ regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
32916+
32917+ if (unlikely(error_code & PF_RSVD))
32918+ pgtable_bad(address, regs, error_code);
32919+
32920+ /*
32921+ * If we're in an interrupt or have no user
32922+ * context, we must not take the fault..
32923+ */
32924+ if (unlikely(in_atomic() || !mm))
32925+ goto bad_area_nosemaphore;
32926+
32927+ again:
32928+ /* When running in the kernel we expect faults to occur only to
32929+ * addresses in user space. All other faults represent errors in the
32930+ * kernel and should generate an OOPS. Unfortunately, in the case of an
32931+ * erroneous fault occurring in a code path which already holds mmap_sem
32932+ * we will deadlock attempting to validate the fault against the
32933+ * address space. Luckily the kernel only validly references user
32934+ * space from well defined areas of code, which are listed in the
32935+ * exceptions table.
32936+ *
32937+ * As the vast majority of faults will be valid we will only perform
32938+ * the source reference check when there is a possibility of a deadlock.
32939+ * Attempt to lock the address space, if we cannot we then validate the
32940+ * source. If this is invalid we can skip the address space check,
32941+ * thus avoiding the deadlock.
32942+ */
32943+ if (!down_read_trylock(&mm->mmap_sem)) {
32944+ if ((error_code & PF_USER) == 0 &&
32945+ !search_exception_tables(regs->rip))
32946+ goto bad_area_nosemaphore;
32947+ down_read(&mm->mmap_sem);
32948+ }
32949+
32950+ vma = find_vma(mm, address);
32951+ if (!vma)
32952+ goto bad_area;
32953+ if (likely(vma->vm_start <= address))
32954+ goto good_area;
32955+ if (!(vma->vm_flags & VM_GROWSDOWN))
32956+ goto bad_area;
32957+ if (error_code & 4) {
32958+ /* Allow userspace just enough access below the stack pointer
32959+ * to let the 'enter' instruction work.
32960+ */
32961+ if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
32962+ goto bad_area;
32963+ }
32964+ if (expand_stack(vma, address))
32965+ goto bad_area;
32966+/*
32967+ * Ok, we have a good vm_area for this memory access, so
32968+ * we can handle it..
32969+ */
32970+good_area:
32971+ info.si_code = SEGV_ACCERR;
32972+ write = 0;
32973+ switch (error_code & (PF_PROT|PF_WRITE)) {
32974+ default: /* 3: write, present */
32975+ /* fall through */
32976+ case PF_WRITE: /* write, not present */
32977+ if (!(vma->vm_flags & VM_WRITE))
32978+ goto bad_area;
32979+ write++;
32980+ break;
32981+ case PF_PROT: /* read, present */
32982+ goto bad_area;
32983+ case 0: /* read, not present */
32984+ if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
32985+ goto bad_area;
32986+ }
32987+
32988+ /*
32989+ * If for any reason at all we couldn't handle the fault,
32990+ * make sure we exit gracefully rather than endlessly redo
32991+ * the fault.
32992+ */
32993+ switch (handle_mm_fault(mm, vma, address, write)) {
32994+ case VM_FAULT_MINOR:
32995+ tsk->min_flt++;
32996+ break;
32997+ case VM_FAULT_MAJOR:
32998+ tsk->maj_flt++;
32999+ break;
33000+ case VM_FAULT_SIGBUS:
33001+ goto do_sigbus;
33002+ default:
33003+ goto out_of_memory;
33004+ }
33005+
33006+ up_read(&mm->mmap_sem);
33007+ return;
33008+
33009+/*
33010+ * Something tried to access memory that isn't in our memory map..
33011+ * Fix it, but check if it's kernel or user first..
33012+ */
33013+bad_area:
33014+ up_read(&mm->mmap_sem);
33015+
33016+bad_area_nosemaphore:
33017+ /* User mode accesses just cause a SIGSEGV */
33018+ if (error_code & PF_USER) {
33019+ if (is_prefetch(regs, address, error_code))
33020+ return;
33021+
33022+ /* Work around K8 erratum #100: K8 in compat mode
33023+ occasionally jumps to illegal addresses >4GB. We
33024+ catch this here in the page fault handler because
33025+ these addresses are not reachable. Just detect this
33026+ case and return. Any code segment in LDT is
33027+ compatibility mode. */
33028+ if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
33029+ (address >> 32))
33030+ return;
33031+
33032+ if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
33033+ printk(
33034+ "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
33035+ tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
33036+ tsk->comm, tsk->pid, address, regs->rip,
33037+ regs->rsp, error_code);
33038+ }
33039+
33040+ tsk->thread.cr2 = address;
33041+ /* Kernel addresses are always protection faults */
33042+ tsk->thread.error_code = error_code | (address >= TASK_SIZE);
33043+ tsk->thread.trap_no = 14;
33044+ info.si_signo = SIGSEGV;
33045+ info.si_errno = 0;
33046+ /* info.si_code has been set above */
33047+ info.si_addr = (void __user *)address;
33048+ force_sig_info(SIGSEGV, &info, tsk);
33049+ return;
33050+ }
33051+
33052+no_context:
33053+
33054+ /* Are we prepared to handle this kernel fault? */
33055+ fixup = search_exception_tables(regs->rip);
33056+ if (fixup) {
33057+ regs->rip = fixup->fixup;
33058+ return;
33059+ }
33060+
33061+ /*
33062+ * Hall of shame of CPU/BIOS bugs.
33063+ */
33064+
33065+ if (is_prefetch(regs, address, error_code))
33066+ return;
33067+
33068+ if (is_errata93(regs, address))
33069+ return;
33070+
33071+/*
33072+ * Oops. The kernel tried to access some bad page. We'll have to
33073+ * terminate things with extreme prejudice.
33074+ */
33075+
33076+ flags = oops_begin();
33077+
33078+ if (address < PAGE_SIZE)
33079+ printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
33080+ else
33081+ printk(KERN_ALERT "Unable to handle kernel paging request");
33082+ printk(" at %016lx RIP: \n" KERN_ALERT,address);
33083+ printk_address(regs->rip);
33084+ dump_pagetable(address);
33085+ tsk->thread.cr2 = address;
33086+ tsk->thread.trap_no = 14;
33087+ tsk->thread.error_code = error_code;
33088+ __die("Oops", regs, error_code);
33089+ /* Executive summary in case the body of the oops scrolled away */
33090+ printk(KERN_EMERG "CR2: %016lx\n", address);
33091+ oops_end(flags);
33092+ do_exit(SIGKILL);
33093+
33094+/*
33095+ * We ran out of memory, or some other thing happened to us that made
33096+ * us unable to handle the page fault gracefully.
33097+ */
33098+out_of_memory:
33099+ up_read(&mm->mmap_sem);
33100+ if (current->pid == 1) {
33101+ yield();
33102+ goto again;
33103+ }
33104+ printk("VM: killing process %s\n", tsk->comm);
33105+ if (error_code & 4)
33106+ do_exit(SIGKILL);
33107+ goto no_context;
33108+
33109+do_sigbus:
33110+ up_read(&mm->mmap_sem);
33111+
33112+ /* Kernel mode? Handle exceptions or die */
33113+ if (!(error_code & PF_USER))
33114+ goto no_context;
33115+
33116+ tsk->thread.cr2 = address;
33117+ tsk->thread.error_code = error_code;
33118+ tsk->thread.trap_no = 14;
33119+ info.si_signo = SIGBUS;
33120+ info.si_errno = 0;
33121+ info.si_code = BUS_ADRERR;
33122+ info.si_addr = (void __user *)address;
33123+ force_sig_info(SIGBUS, &info, tsk);
33124+ return;
33125+}
33126+
33127+DEFINE_SPINLOCK(pgd_lock);
33128+struct page *pgd_list;
33129+
33130+void vmalloc_sync_all(void)
33131+{
33132+ /* Note that races in the updates of insync and start aren't
33133+ problematic:
33134+ insync can only get set bits added, and updates to start are only
33135+ improving performance (without affecting correctness if undone). */
33136+ static DECLARE_BITMAP(insync, PTRS_PER_PGD);
33137+ static unsigned long start = VMALLOC_START & PGDIR_MASK;
33138+ unsigned long address;
33139+
33140+ for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
33141+ if (!test_bit(pgd_index(address), insync)) {
33142+ const pgd_t *pgd_ref = pgd_offset_k(address);
33143+ struct page *page;
33144+
33145+ if (pgd_none(*pgd_ref))
33146+ continue;
33147+ spin_lock(&pgd_lock);
33148+ for (page = pgd_list; page;
33149+ page = (struct page *)page->index) {
33150+ pgd_t *pgd;
33151+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
33152+ if (pgd_none(*pgd))
33153+ set_pgd(pgd, *pgd_ref);
33154+ else
33155+ BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
33156+ }
33157+ spin_unlock(&pgd_lock);
33158+ set_bit(pgd_index(address), insync);
33159+ }
33160+ if (address == start)
33161+ start = address + PGDIR_SIZE;
33162+ }
33163+ /* Check that there is no need to do the same for the modules area. */
33164+ BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
33165+ BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
33166+ (__START_KERNEL & PGDIR_MASK)));
33167+}
33168+
33169+static int __init enable_pagefaulttrace(char *str)
33170+{
33171+ page_fault_trace = 1;
33172+ return 1;
33173+}
33174+__setup("pagefaulttrace", enable_pagefaulttrace);
33175Index: head-2008-11-25/arch/x86/mm/init_64-xen.c
33176===================================================================
33177--- /dev/null 1970-01-01 00:00:00.000000000 +0000
33178+++ head-2008-11-25/arch/x86/mm/init_64-xen.c 2008-10-29 09:55:56.000000000 +0100
33179@@ -0,0 +1,1206 @@
33180+/*
33181+ * linux/arch/x86_64/mm/init.c
33182+ *
33183+ * Copyright (C) 1995 Linus Torvalds
33184+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
33185+ * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
33186+ *
33187+ * Jun Nakajima <jun.nakajima@intel.com>
33188+ * Modified for Xen.
33189+ */
33190+
33191+#include <linux/signal.h>
33192+#include <linux/sched.h>
33193+#include <linux/kernel.h>
33194+#include <linux/errno.h>
33195+#include <linux/string.h>
33196+#include <linux/types.h>
33197+#include <linux/ptrace.h>
33198+#include <linux/mman.h>
33199+#include <linux/mm.h>
33200+#include <linux/swap.h>
33201+#include <linux/smp.h>
33202+#include <linux/init.h>
33203+#include <linux/pagemap.h>
33204+#include <linux/bootmem.h>
33205+#include <linux/proc_fs.h>
33206+#include <linux/pci.h>
33207+#include <linux/poison.h>
33208+#include <linux/dma-mapping.h>
33209+#include <linux/module.h>
33210+#include <linux/memory_hotplug.h>
33211+
33212+#include <asm/processor.h>
33213+#include <asm/system.h>
33214+#include <asm/uaccess.h>
33215+#include <asm/pgtable.h>
33216+#include <asm/pgalloc.h>
33217+#include <asm/dma.h>
33218+#include <asm/fixmap.h>
33219+#include <asm/e820.h>
33220+#include <asm/apic.h>
33221+#include <asm/tlb.h>
33222+#include <asm/mmu_context.h>
33223+#include <asm/proto.h>
33224+#include <asm/smp.h>
33225+#include <asm/sections.h>
33226+
33227+#include <xen/features.h>
33228+
33229+#ifndef Dprintk
33230+#define Dprintk(x...)
33231+#endif
33232+
33233+struct dma_mapping_ops* dma_ops;
33234+EXPORT_SYMBOL(dma_ops);
33235+
33236+#if CONFIG_XEN_COMPAT <= 0x030002
33237+unsigned int __kernel_page_user;
33238+EXPORT_SYMBOL(__kernel_page_user);
33239+#endif
33240+
33241+int after_bootmem;
33242+
33243+static unsigned long dma_reserve __initdata;
33244+
33245+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
33246+extern unsigned long start_pfn;
33247+
33248+/*
33249+ * Use this until direct mapping is established, i.e. before __va() is
33250+ * available in init_memory_mapping().
33251+ */
33252+
33253+#define addr_to_page(addr, page) \
33254+ (addr) &= PHYSICAL_PAGE_MASK; \
33255+ (page) = ((unsigned long *) ((unsigned long) \
33256+ (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
33257+ __START_KERNEL_map)))
33258+
33259+static void __meminit early_make_page_readonly(void *va, unsigned int feature)
33260+{
33261+ unsigned long addr, _va = (unsigned long)va;
33262+ pte_t pte, *ptep;
33263+ unsigned long *page = (unsigned long *) init_level4_pgt;
33264+
33265+ BUG_ON(after_bootmem);
33266+
33267+ if (xen_feature(feature))
33268+ return;
33269+
33270+ addr = (unsigned long) page[pgd_index(_va)];
33271+ addr_to_page(addr, page);
33272+
33273+ addr = page[pud_index(_va)];
33274+ addr_to_page(addr, page);
33275+
33276+ addr = page[pmd_index(_va)];
33277+ addr_to_page(addr, page);
33278+
33279+ ptep = (pte_t *) &page[pte_index(_va)];
33280+
33281+ pte.pte = ptep->pte & ~_PAGE_RW;
33282+ if (HYPERVISOR_update_va_mapping(_va, pte, 0))
33283+ BUG();
33284+}
33285+
33286+static void __make_page_readonly(void *va)
33287+{
33288+ pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
33289+ unsigned long addr = (unsigned long) va;
33290+
33291+ pgd = pgd_offset_k(addr);
33292+ pud = pud_offset(pgd, addr);
33293+ pmd = pmd_offset(pud, addr);
33294+ ptep = pte_offset_kernel(pmd, addr);
33295+
33296+ pte.pte = ptep->pte & ~_PAGE_RW;
33297+ if (HYPERVISOR_update_va_mapping(addr, pte, 0))
33298+ xen_l1_entry_update(ptep, pte); /* fallback */
33299+
33300+ if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
33301+ __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT));
33302+}
33303+
33304+static void __make_page_writable(void *va)
33305+{
33306+ pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
33307+ unsigned long addr = (unsigned long) va;
33308+
33309+ pgd = pgd_offset_k(addr);
33310+ pud = pud_offset(pgd, addr);
33311+ pmd = pmd_offset(pud, addr);
33312+ ptep = pte_offset_kernel(pmd, addr);
33313+
33314+ pte.pte = ptep->pte | _PAGE_RW;
33315+ if (HYPERVISOR_update_va_mapping(addr, pte, 0))
33316+ xen_l1_entry_update(ptep, pte); /* fallback */
33317+
33318+ if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
33319+ __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT));
33320+}
33321+
33322+void make_page_readonly(void *va, unsigned int feature)
33323+{
33324+ if (!xen_feature(feature))
33325+ __make_page_readonly(va);
33326+}
33327+
33328+void make_page_writable(void *va, unsigned int feature)
33329+{
33330+ if (!xen_feature(feature))
33331+ __make_page_writable(va);
33332+}
33333+
33334+void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
33335+{
33336+ if (xen_feature(feature))
33337+ return;
33338+
33339+ while (nr-- != 0) {
33340+ __make_page_readonly(va);
33341+ va = (void*)((unsigned long)va + PAGE_SIZE);
33342+ }
33343+}
33344+
33345+void make_pages_writable(void *va, unsigned nr, unsigned int feature)
33346+{
33347+ if (xen_feature(feature))
33348+ return;
33349+
33350+ while (nr-- != 0) {
33351+ __make_page_writable(va);
33352+ va = (void*)((unsigned long)va + PAGE_SIZE);
33353+ }
33354+}
33355+
33356+/*
33357+ * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
33358+ * physical space, so we can cache the location of the first one and move
33359+ * around without checking the pgd every time.
33360+ */
33361+
33362+void show_mem(void)
33363+{
33364+ long i, total = 0, reserved = 0;
33365+ long shared = 0, cached = 0;
33366+ pg_data_t *pgdat;
33367+ struct page *page;
33368+
33369+ printk(KERN_INFO "Mem-info:\n");
33370+ show_free_areas();
33371+ printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
33372+
33373+ for_each_online_pgdat(pgdat) {
33374+ for (i = 0; i < pgdat->node_spanned_pages; ++i) {
33375+ page = pfn_to_page(pgdat->node_start_pfn + i);
33376+ total++;
33377+ if (PageReserved(page))
33378+ reserved++;
33379+ else if (PageSwapCache(page))
33380+ cached++;
33381+ else if (page_count(page))
33382+ shared += page_count(page) - 1;
33383+ }
33384+ }
33385+ printk(KERN_INFO "%lu pages of RAM\n", total);
33386+ printk(KERN_INFO "%lu reserved pages\n",reserved);
33387+ printk(KERN_INFO "%lu pages shared\n",shared);
33388+ printk(KERN_INFO "%lu pages swap cached\n",cached);
33389+}
33390+
33391+
33392+static __init void *spp_getpage(void)
33393+{
33394+ void *ptr;
33395+ if (after_bootmem)
33396+ ptr = (void *) get_zeroed_page(GFP_ATOMIC);
33397+ else if (start_pfn < table_end) {
33398+ ptr = __va(start_pfn << PAGE_SHIFT);
33399+ start_pfn++;
33400+ memset(ptr, 0, PAGE_SIZE);
33401+ } else
33402+ ptr = alloc_bootmem_pages(PAGE_SIZE);
33403+ if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
33404+ panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
33405+
33406+ Dprintk("spp_getpage %p\n", ptr);
33407+ return ptr;
33408+}
33409+
33410+#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
33411+#define pud_offset_u(address) (level3_user_pgt + pud_index(address))
33412+
33413+static __init void set_pte_phys(unsigned long vaddr,
33414+ unsigned long phys, pgprot_t prot, int user_mode)
33415+{
33416+ pgd_t *pgd;
33417+ pud_t *pud;
33418+ pmd_t *pmd;
33419+ pte_t *pte, new_pte;
33420+
33421+ Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
33422+
33423+ pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
33424+ if (pgd_none(*pgd)) {
33425+ printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
33426+ return;
33427+ }
33428+ pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
33429+ if (pud_none(*pud)) {
33430+ pmd = (pmd_t *) spp_getpage();
33431+ make_page_readonly(pmd, XENFEAT_writable_page_tables);
33432+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
33433+ if (pmd != pmd_offset(pud, 0)) {
33434+ printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
33435+ return;
33436+ }
33437+ }
33438+ pmd = pmd_offset(pud, vaddr);
33439+ if (pmd_none(*pmd)) {
33440+ pte = (pte_t *) spp_getpage();
33441+ make_page_readonly(pte, XENFEAT_writable_page_tables);
33442+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
33443+ if (pte != pte_offset_kernel(pmd, 0)) {
33444+ printk("PAGETABLE BUG #02!\n");
33445+ return;
33446+ }
33447+ }
33448+ if (pgprot_val(prot))
33449+ new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
33450+ else
33451+ new_pte = __pte(0);
33452+
33453+ pte = pte_offset_kernel(pmd, vaddr);
33454+ if (!pte_none(*pte) && __pte_val(new_pte) &&
33455+ __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
33456+ pte_ERROR(*pte);
33457+ set_pte(pte, new_pte);
33458+
33459+ /*
33460+ * It's enough to flush this one mapping.
33461+ * (PGE mappings get flushed as well)
33462+ */
33463+ __flush_tlb_one(vaddr);
33464+}
33465+
33466+static __init void set_pte_phys_ma(unsigned long vaddr,
33467+ unsigned long phys, pgprot_t prot)
33468+{
33469+ pgd_t *pgd;
33470+ pud_t *pud;
33471+ pmd_t *pmd;
33472+ pte_t *pte, new_pte;
33473+
33474+ Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
33475+
33476+ pgd = pgd_offset_k(vaddr);
33477+ if (pgd_none(*pgd)) {
33478+ printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
33479+ return;
33480+ }
33481+ pud = pud_offset(pgd, vaddr);
33482+ if (pud_none(*pud)) {
33483+
33484+ pmd = (pmd_t *) spp_getpage();
33485+ make_page_readonly(pmd, XENFEAT_writable_page_tables);
33486+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
33487+ if (pmd != pmd_offset(pud, 0)) {
33488+ printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
33489+ return;
33490+ }
33491+ }
33492+ pmd = pmd_offset(pud, vaddr);
33493+ if (pmd_none(*pmd)) {
33494+ pte = (pte_t *) spp_getpage();
33495+ make_page_readonly(pte, XENFEAT_writable_page_tables);
33496+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
33497+ if (pte != pte_offset_kernel(pmd, 0)) {
33498+ printk("PAGETABLE BUG #02!\n");
33499+ return;
33500+ }
33501+ }
33502+ new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
33503+
33504+ pte = pte_offset_kernel(pmd, vaddr);
33505+ if (!pte_none(*pte) && __pte_val(new_pte) &&
33506+#ifdef CONFIG_ACPI
33507+ /* __acpi_map_table() fails to properly call clear_fixmap() */
33508+ (vaddr < __fix_to_virt(FIX_ACPI_END) ||
33509+ vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
33510+#endif
33511+ __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
33512+ pte_ERROR(*pte);
33513+ set_pte(pte, new_pte);
33514+
33515+ /*
33516+ * It's enough to flush this one mapping.
33517+ * (PGE mappings get flushed as well)
33518+ */
33519+ __flush_tlb_one(vaddr);
33520+}
33521+
33522+/* NOTE: this is meant to be run only at boot */
33523+void __init
33524+__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
33525+{
33526+ unsigned long address = __fix_to_virt(idx);
33527+
33528+ if (idx >= __end_of_fixed_addresses) {
33529+ printk("Invalid __set_fixmap\n");
33530+ return;
33531+ }
33532+ switch (idx) {
33533+ case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
33534+ set_pte_phys(address, phys, prot, 0);
33535+ set_pte_phys(address, phys, prot, 1);
33536+ break;
33537+ default:
33538+ set_pte_phys_ma(address, phys, prot);
33539+ break;
33540+ }
33541+}
33542+
33543+unsigned long __initdata table_start, table_end;
33544+
33545+static __meminit void *alloc_static_page(unsigned long *phys)
33546+{
33547+ unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
33548+
33549+ if (after_bootmem) {
33550+ void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
33551+
33552+ *phys = __pa(adr);
33553+ return adr;
33554+ }
33555+
33556+ *phys = start_pfn << PAGE_SHIFT;
33557+ start_pfn++;
33558+ memset((void *)va, 0, PAGE_SIZE);
33559+ return (void *)va;
33560+}
33561+
33562+#define PTE_SIZE PAGE_SIZE
33563+
33564+static inline int make_readonly(unsigned long paddr)
33565+{
33566+ extern char __vsyscall_0;
33567+ int readonly = 0;
33568+
33569+ /* Make new page tables read-only. */
33570+ if (!xen_feature(XENFEAT_writable_page_tables)
33571+ && (paddr >= (table_start << PAGE_SHIFT))
33572+ && (paddr < (table_end << PAGE_SHIFT)))
33573+ readonly = 1;
33574+ /* Make old page tables read-only. */
33575+ if (!xen_feature(XENFEAT_writable_page_tables)
33576+ && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
33577+ && (paddr < (start_pfn << PAGE_SHIFT)))
33578+ readonly = 1;
33579+
33580+ /*
33581+ * No need for writable mapping of kernel image. This also ensures that
33582+ * page and descriptor tables embedded inside don't have writable
33583+ * mappings. Exclude the vsyscall area here, allowing alternative
33584+ * instruction patching to work.
33585+ */
33586+ if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end))
33587+ && !(paddr >= __pa_symbol(&__vsyscall_0)
33588+ && paddr < __pa_symbol(&__vsyscall_0) + PAGE_SIZE))
33589+ readonly = 1;
33590+
33591+ return readonly;
33592+}
33593+
33594+#ifndef CONFIG_XEN
33595+/* Must run before zap_low_mappings */
33596+__init void *early_ioremap(unsigned long addr, unsigned long size)
33597+{
33598+ unsigned long map = round_down(addr, LARGE_PAGE_SIZE);
33599+
33600+ /* actually usually some more */
33601+ if (size >= LARGE_PAGE_SIZE) {
33602+ printk("SMBIOS area too long %lu\n", size);
33603+ return NULL;
33604+ }
33605+ set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
33606+ map += LARGE_PAGE_SIZE;
33607+ set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
33608+ __flush_tlb();
33609+ return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
33610+}
33611+
33612+/* To avoid virtual aliases later */
33613+__init void early_iounmap(void *addr, unsigned long size)
33614+{
33615+ if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
33616+ printk("early_iounmap: bad address %p\n", addr);
33617+ set_pmd(temp_mappings[0].pmd, __pmd(0));
33618+ set_pmd(temp_mappings[1].pmd, __pmd(0));
33619+ __flush_tlb();
33620+}
33621+#endif
33622+
33623+static void __meminit
33624+phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
33625+{
33626+ int i, k;
33627+
33628+ for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
33629+ unsigned long pte_phys;
33630+ pte_t *pte, *pte_save;
33631+
33632+ if (address >= end)
33633+ break;
33634+ pte = alloc_static_page(&pte_phys);
33635+ pte_save = pte;
33636+ for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
33637+ unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;
33638+
33639+ if (address >= (after_bootmem
33640+ ? end
33641+ : xen_start_info->nr_pages << PAGE_SHIFT))
33642+ pteval = 0;
33643+ else if (make_readonly(address))
33644+ pteval &= ~_PAGE_RW;
33645+ set_pte(pte, __pte(pteval & __supported_pte_mask));
33646+ }
33647+ if (!after_bootmem) {
33648+ early_make_page_readonly(pte_save, XENFEAT_writable_page_tables);
33649+ *pmd = __pmd(pte_phys | _KERNPG_TABLE);
33650+ } else {
33651+ make_page_readonly(pte_save, XENFEAT_writable_page_tables);
33652+ set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
33653+ }
33654+ }
33655+}
33656+
33657+static void __meminit
33658+phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
33659+{
33660+ pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
33661+
33662+ if (pmd_none(*pmd)) {
33663+ spin_lock(&init_mm.page_table_lock);
33664+ phys_pmd_init(pmd, address, end);
33665+ spin_unlock(&init_mm.page_table_lock);
33666+ __flush_tlb_all();
33667+ }
33668+}
33669+
33670+static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
33671+{
33672+ long i = pud_index(address);
33673+
33674+ pud = pud + i;
33675+
33676+ if (after_bootmem && pud_val(*pud)) {
33677+ phys_pmd_update(pud, address, end);
33678+ return;
33679+ }
33680+
33681+ for (; i < PTRS_PER_PUD; pud++, i++) {
33682+ unsigned long paddr, pmd_phys;
33683+ pmd_t *pmd;
33684+
33685+ paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
33686+ if (paddr >= end)
33687+ break;
33688+
33689+ pmd = alloc_static_page(&pmd_phys);
33690+
33691+ spin_lock(&init_mm.page_table_lock);
33692+ *pud = __pud(pmd_phys | _KERNPG_TABLE);
33693+ phys_pmd_init(pmd, paddr, end);
33694+ spin_unlock(&init_mm.page_table_lock);
33695+
33696+ early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
33697+ }
33698+ __flush_tlb();
33699+}
33700+
33701+void __init xen_init_pt(void)
33702+{
33703+ unsigned long addr, *page;
33704+
33705+ /* Find the initial pte page that was built for us. */
33706+ page = (unsigned long *)xen_start_info->pt_base;
33707+ addr = page[pgd_index(__START_KERNEL_map)];
33708+ addr_to_page(addr, page);
33709+ addr = page[pud_index(__START_KERNEL_map)];
33710+ addr_to_page(addr, page);
33711+
33712+#if CONFIG_XEN_COMPAT <= 0x030002
33713+ /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER
33714+ in kernel PTEs. We check that here. */
33715+ if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) {
33716+ unsigned long *pg;
33717+ pte_t pte;
33718+
33719+ /* Mess with the initial mapping of page 0. It's not needed. */
33720+ BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map);
33721+ addr = page[pmd_index(__START_KERNEL_map)];
33722+ addr_to_page(addr, pg);
33723+ pte.pte = pg[pte_index(__START_KERNEL_map)];
33724+ BUG_ON(!(pte.pte & _PAGE_PRESENT));
33725+
33726+ /* If _PAGE_USER isn't set, we obviously do not need it. */
33727+ if (pte.pte & _PAGE_USER) {
33728+ /* _PAGE_USER is needed, but is it set implicitly? */
33729+ pte.pte &= ~_PAGE_USER;
33730+ if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map,
33731+ pte, 0) != 0) ||
33732+ !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER))
33733+ /* We need to explicitly specify _PAGE_USER. */
33734+ __kernel_page_user = _PAGE_USER;
33735+ }
33736+ }
33737+#endif
33738+
33739+ /* Construct mapping of initial pte page in our own directories. */
33740+ init_level4_pgt[pgd_index(__START_KERNEL_map)] =
33741+ __pgd(__pa_symbol(level3_kernel_pgt) | _PAGE_TABLE);
33742+ level3_kernel_pgt[pud_index(__START_KERNEL_map)] =
33743+ __pud(__pa_symbol(level2_kernel_pgt) | _PAGE_TABLE);
33744+ memcpy(level2_kernel_pgt, page, PAGE_SIZE);
33745+
33746+ __user_pgd(init_level4_pgt)[pgd_index(VSYSCALL_START)] =
33747+ __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
33748+
33749+ early_make_page_readonly(init_level4_pgt,
33750+ XENFEAT_writable_page_tables);
33751+ early_make_page_readonly(__user_pgd(init_level4_pgt),
33752+ XENFEAT_writable_page_tables);
33753+ early_make_page_readonly(level3_kernel_pgt,
33754+ XENFEAT_writable_page_tables);
33755+ early_make_page_readonly(level3_user_pgt,
33756+ XENFEAT_writable_page_tables);
33757+ early_make_page_readonly(level2_kernel_pgt,
33758+ XENFEAT_writable_page_tables);
33759+
33760+ if (!xen_feature(XENFEAT_writable_page_tables)) {
33761+ xen_pgd_pin(__pa_symbol(init_level4_pgt));
33762+ xen_pgd_pin(__pa_symbol(__user_pgd(init_level4_pgt)));
33763+ }
33764+}
33765+
33766+static void __init extend_init_mapping(unsigned long tables_space)
33767+{
33768+ unsigned long va = __START_KERNEL_map;
33769+ unsigned long phys, addr, *pte_page;
33770+ pmd_t *pmd;
33771+ pte_t *pte, new_pte;
33772+ unsigned long *page = (unsigned long *)init_level4_pgt;
33773+
33774+ addr = page[pgd_index(va)];
33775+ addr_to_page(addr, page);
33776+ addr = page[pud_index(va)];
33777+ addr_to_page(addr, page);
33778+
33779+ /* Kill mapping of low 1MB. */
33780+ while (va < (unsigned long)&_text) {
33781+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
33782+ BUG();
33783+ va += PAGE_SIZE;
33784+ }
33785+
33786+ /* Ensure init mappings cover kernel text/data and initial tables. */
33787+ while (va < (__START_KERNEL_map
33788+ + (start_pfn << PAGE_SHIFT)
33789+ + tables_space)) {
33790+ pmd = (pmd_t *)&page[pmd_index(va)];
33791+ if (pmd_none(*pmd)) {
33792+ pte_page = alloc_static_page(&phys);
33793+ early_make_page_readonly(
33794+ pte_page, XENFEAT_writable_page_tables);
33795+ set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
33796+ } else {
33797+ addr = page[pmd_index(va)];
33798+ addr_to_page(addr, pte_page);
33799+ }
33800+ pte = (pte_t *)&pte_page[pte_index(va)];
33801+ if (pte_none(*pte)) {
33802+ new_pte = pfn_pte(
33803+ (va - __START_KERNEL_map) >> PAGE_SHIFT,
33804+ __pgprot(_KERNPG_TABLE));
33805+ xen_l1_entry_update(pte, new_pte);
33806+ }
33807+ va += PAGE_SIZE;
33808+ }
33809+
33810+ /* Finally, blow away any spurious initial mappings. */
33811+ while (1) {
33812+ pmd = (pmd_t *)&page[pmd_index(va)];
33813+ if (pmd_none(*pmd))
33814+ break;
33815+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
33816+ BUG();
33817+ va += PAGE_SIZE;
33818+ }
33819+}
33820+
33821+static void __init find_early_table_space(unsigned long end)
33822+{
33823+ unsigned long puds, pmds, ptes, tables;
33824+
33825+ puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
33826+ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
33827+ ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
33828+
33829+ tables = round_up(puds * 8, PAGE_SIZE) +
33830+ round_up(pmds * 8, PAGE_SIZE) +
33831+ round_up(ptes * 8, PAGE_SIZE);
33832+
33833+ extend_init_mapping(tables);
33834+
33835+ table_start = start_pfn;
33836+ table_end = table_start + (tables>>PAGE_SHIFT);
33837+
33838+ early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
33839+ end, table_start << PAGE_SHIFT,
33840+ (table_start << PAGE_SHIFT) + tables);
33841+}
33842+
33843+static void xen_finish_init_mapping(void)
33844+{
33845+ unsigned long i, start, end;
33846+
33847+ /* Re-vector virtual addresses pointing into the initial
33848+ mapping to the just-established permanent ones. */
33849+ xen_start_info = __va(__pa(xen_start_info));
33850+ xen_start_info->pt_base = (unsigned long)
33851+ __va(__pa(xen_start_info->pt_base));
33852+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
33853+ phys_to_machine_mapping =
33854+ __va(__pa(xen_start_info->mfn_list));
33855+ xen_start_info->mfn_list = (unsigned long)
33856+ phys_to_machine_mapping;
33857+ }
33858+ if (xen_start_info->mod_start)
33859+ xen_start_info->mod_start = (unsigned long)
33860+ __va(__pa(xen_start_info->mod_start));
33861+
33862+ /* Destroy the Xen-created mappings beyond the kernel image as
33863+ * well as the temporary mappings created above. Prevents
33864+ * overlap with modules area (if init mapping is very big).
33865+ */
33866+ start = PAGE_ALIGN((unsigned long)_end);
33867+ end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
33868+ for (; start < end; start += PAGE_SIZE)
33869+ if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
33870+ BUG();
33871+
33872+ /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
33873+ table_end = ~0UL;
33874+
33875+ /*
33876+ * Prefetch pte's for the bt_ioremap() area. It gets used before the
33877+ * boot-time allocator is online, so allocate-on-demand would fail.
33878+ */
33879+ for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
33880+ __set_fixmap(i, 0, __pgprot(0));
33881+
33882+ /* Switch to the real shared_info page, and clear the dummy page. */
33883+ set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
33884+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
33885+ memset(empty_zero_page, 0, sizeof(empty_zero_page));
33886+
33887+ /* Set up mapping of lowest 1MB of physical memory. */
33888+ for (i = 0; i < NR_FIX_ISAMAPS; i++)
33889+ if (is_initial_xendomain())
33890+ set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
33891+ else
33892+ __set_fixmap(FIX_ISAMAP_BEGIN - i,
33893+ virt_to_mfn(empty_zero_page)
33894+ << PAGE_SHIFT,
33895+ PAGE_KERNEL_RO);
33896+
33897+ /* Disable the 'start_pfn' allocator. */
33898+ table_end = start_pfn;
33899+}
33900+
33901+/* Set up the direct mapping of the physical memory at PAGE_OFFSET.
33902+ This runs before bootmem is initialized and gets pages directly from the
33903+ physical memory. To access them they are temporarily mapped. */
33904+void __meminit init_memory_mapping(unsigned long start, unsigned long end)
33905+{
33906+ unsigned long next;
33907+
33908+ Dprintk("init_memory_mapping\n");
33909+
33910+ /*
33911+ * Find space for the kernel direct mapping tables.
33912+ * Later we should allocate these tables in the local node of the memory
33913+ * being mapped. Unfortunately this currently happens before the nodes are
33914+ * discovered.
33915+ */
33916+ if (!after_bootmem)
33917+ find_early_table_space(end);
33918+
33919+ start = (unsigned long)__va(start);
33920+ end = (unsigned long)__va(end);
33921+
33922+ for (; start < end; start = next) {
33923+ unsigned long pud_phys;
33924+ pgd_t *pgd = pgd_offset_k(start);
33925+ pud_t *pud;
33926+
33927+ if (after_bootmem)
33928+ pud = pud_offset(pgd, start & PGDIR_MASK);
33929+ else
33930+ pud = alloc_static_page(&pud_phys);
33931+ next = start + PGDIR_SIZE;
33932+ if (next > end)
33933+ next = end;
33934+ phys_pud_init(pud, __pa(start), __pa(next));
33935+ if (!after_bootmem) {
33936+ early_make_page_readonly(pud, XENFEAT_writable_page_tables);
33937+ set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
33938+ }
33939+ }
33940+
33941+ if (!after_bootmem) {
33942+ BUG_ON(start_pfn != table_end);
33943+ xen_finish_init_mapping();
33944+ }
33945+
33946+ __flush_tlb_all();
33947+}
33948+
33949+void __cpuinit zap_low_mappings(int cpu)
33950+{
33951+ /* this is not required for Xen */
33952+#if 0
33953+ swap_low_mappings();
33954+#endif
33955+}
33956+
33957+/* Compute zone sizes for the DMA and DMA32 zones in a node. */
33958+__init void
33959+size_zones(unsigned long *z, unsigned long *h,
33960+ unsigned long start_pfn, unsigned long end_pfn)
33961+{
33962+ int i;
33963+ unsigned long w;
33964+
33965+ for (i = 0; i < MAX_NR_ZONES; i++)
33966+ z[i] = 0;
33967+
33968+ if (start_pfn < MAX_DMA_PFN)
33969+ z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
33970+ if (start_pfn < MAX_DMA32_PFN) {
33971+ unsigned long dma32_pfn = MAX_DMA32_PFN;
33972+ if (dma32_pfn > end_pfn)
33973+ dma32_pfn = end_pfn;
33974+ z[ZONE_DMA32] = dma32_pfn - start_pfn;
33975+ }
33976+ z[ZONE_NORMAL] = end_pfn - start_pfn;
33977+
33978+ /* Remove lower zones from higher ones. */
33979+ w = 0;
33980+ for (i = 0; i < MAX_NR_ZONES; i++) {
33981+ if (z[i])
33982+ z[i] -= w;
33983+ w += z[i];
33984+ }
33985+
33986+ /* Compute holes */
33987+ w = start_pfn;
33988+ for (i = 0; i < MAX_NR_ZONES; i++) {
33989+ unsigned long s = w;
33990+ w += z[i];
33991+ h[i] = e820_hole_size(s, w);
33992+ }
33993+
33994+ /* Add the space needed for mem_map to the holes too. */
33995+ for (i = 0; i < MAX_NR_ZONES; i++)
33996+ h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
33997+
33998+ /* The 16MB DMA zone has the kernel and other misc mappings.
33999+ Account them too */
34000+ if (h[ZONE_DMA]) {
34001+ h[ZONE_DMA] += dma_reserve;
34002+ if (h[ZONE_DMA] >= z[ZONE_DMA]) {
34003+ printk(KERN_WARNING
34004+ "Kernel too large and filling up ZONE_DMA?\n");
34005+ h[ZONE_DMA] = z[ZONE_DMA];
34006+ }
34007+ }
34008+}
34009+
34010+#ifndef CONFIG_NUMA
34011+void __init paging_init(void)
34012+{
34013+ unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
34014+
34015+ memory_present(0, 0, end_pfn);
34016+ sparse_init();
34017+ size_zones(zones, holes, 0, end_pfn);
34018+ free_area_init_node(0, NODE_DATA(0), zones,
34019+ __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
34020+
34021+ init_mm.context.pinned = 1;
34022+}
34023+#endif
34024+
34025+/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
34026+ from the CPU leading to inconsistent cache lines. address and size
34027+ must be aligned to 2MB boundaries.
34028+ Does nothing when the mapping doesn't exist. */
34029+void __init clear_kernel_mapping(unsigned long address, unsigned long size)
34030+{
34031+ unsigned long end = address + size;
34032+
34033+ BUG_ON(address & ~LARGE_PAGE_MASK);
34034+ BUG_ON(size & ~LARGE_PAGE_MASK);
34035+
34036+ for (; address < end; address += LARGE_PAGE_SIZE) {
34037+ pgd_t *pgd = pgd_offset_k(address);
34038+ pud_t *pud;
34039+ pmd_t *pmd;
34040+ if (pgd_none(*pgd))
34041+ continue;
34042+ pud = pud_offset(pgd, address);
34043+ if (pud_none(*pud))
34044+ continue;
34045+ pmd = pmd_offset(pud, address);
34046+ if (!pmd || pmd_none(*pmd))
34047+ continue;
34048+ if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) {
34049+ /* Could handle this, but it should not happen currently. */
34050+ printk(KERN_ERR
34051+ "clear_kernel_mapping: mapping has been split. will leak memory\n");
34052+ pmd_ERROR(*pmd);
34053+ }
34054+ set_pmd(pmd, __pmd(0));
34055+ }
34056+ __flush_tlb_all();
34057+}
34058+
34059+/*
34060+ * Memory hotplug specific functions
34061+ */
34062+void online_page(struct page *page)
34063+{
34064+ ClearPageReserved(page);
34065+ init_page_count(page);
34066+ __free_page(page);
34067+ totalram_pages++;
34068+ num_physpages++;
34069+}
34070+
34071+#ifdef CONFIG_MEMORY_HOTPLUG
34072+/*
34073+ * XXX: memory_add_physaddr_to_nid() is meant to find the node id from a
34074+ * physical address via the sysfs probe interface. If ACPI notifies a
34075+ * hot-add event, the node id can be found by searching the DSDT, but the
34076+ * probe interface does not carry a node id, so return 0 for now.
34077+ */
34078+#ifdef CONFIG_NUMA
34079+int memory_add_physaddr_to_nid(u64 start)
34080+{
34081+ return 0;
34082+}
34083+#endif
34084+
34085+/*
34086+ * Memory is always added to the NORMAL zone. This means you will never get
34087+ * additional DMA/DMA32 memory.
34088+ */
34089+int arch_add_memory(int nid, u64 start, u64 size)
34090+{
34091+ struct pglist_data *pgdat = NODE_DATA(nid);
34092+ struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
34093+ unsigned long start_pfn = start >> PAGE_SHIFT;
34094+ unsigned long nr_pages = size >> PAGE_SHIFT;
34095+ int ret;
34096+
34097+ ret = __add_pages(zone, start_pfn, nr_pages);
34098+ if (ret)
34099+ goto error;
34100+
34101+ init_memory_mapping(start, (start + size -1));
34102+
34103+ return ret;
34104+error:
34105+ printk("%s: Problem encountered in __add_pages!\n", __func__);
34106+ return ret;
34107+}
34108+EXPORT_SYMBOL_GPL(arch_add_memory);
34109+
34110+int remove_memory(u64 start, u64 size)
34111+{
34112+ return -EINVAL;
34113+}
34114+EXPORT_SYMBOL_GPL(remove_memory);
34115+
34116+#else /* CONFIG_MEMORY_HOTPLUG */
34117+/*
34118+ * Memory hot-add without sparsemem. The mem_maps have been allocated in
34119+ * advance; just online the pages.
34120+ */
34121+int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
34122+{
34123+ int err = -EIO;
34124+ unsigned long pfn;
34125+ unsigned long total = 0, mem = 0;
34126+ for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
34127+ if (pfn_valid(pfn)) {
34128+ online_page(pfn_to_page(pfn));
34129+ err = 0;
34130+ mem++;
34131+ }
34132+ total++;
34133+ }
34134+ if (!err) {
34135+ z->spanned_pages += total;
34136+ z->present_pages += mem;
34137+ z->zone_pgdat->node_spanned_pages += total;
34138+ z->zone_pgdat->node_present_pages += mem;
34139+ }
34140+ return err;
34141+}
34142+#endif /* CONFIG_MEMORY_HOTPLUG */
34143+
34144+static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
34145+ kcore_vsyscall;
34146+
34147+void __init mem_init(void)
34148+{
34149+ long codesize, reservedpages, datasize, initsize;
34150+ unsigned long pfn;
34151+
34152+ pci_iommu_alloc();
34153+
34154+ /* How many end-of-memory variables you have, grandma! */
34155+ max_low_pfn = end_pfn;
34156+ max_pfn = end_pfn;
34157+ num_physpages = end_pfn;
34158+ high_memory = (void *) __va(end_pfn * PAGE_SIZE);
34159+
34160+ /* clear the zero-page */
34161+ memset(empty_zero_page, 0, PAGE_SIZE);
34162+
34163+ reservedpages = 0;
34164+
34165+ /* this will put all low memory onto the freelists */
34166+#ifdef CONFIG_NUMA
34167+ totalram_pages = numa_free_all_bootmem();
34168+#else
34169+ totalram_pages = free_all_bootmem();
34170+#endif
34171+ /* XEN: init and count pages outside initial allocation. */
34172+ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
34173+ ClearPageReserved(pfn_to_page(pfn));
34174+ init_page_count(pfn_to_page(pfn));
34175+ totalram_pages++;
34176+ }
34177+ reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
34178+
34179+ after_bootmem = 1;
34180+
34181+ codesize = (unsigned long) &_etext - (unsigned long) &_text;
34182+ datasize = (unsigned long) &_edata - (unsigned long) &_etext;
34183+ initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
34184+
34185+ /* Register memory areas for /proc/kcore */
34186+ kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
34187+ kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
34188+ VMALLOC_END-VMALLOC_START);
34189+ kclist_add(&kcore_kernel, &_stext, _end - _stext);
34190+ kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
34191+ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
34192+ VSYSCALL_END - VSYSCALL_START);
34193+
34194+ printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
34195+ (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
34196+ end_pfn << (PAGE_SHIFT-10),
34197+ codesize >> 10,
34198+ reservedpages << (PAGE_SHIFT-10),
34199+ datasize >> 10,
34200+ initsize >> 10);
34201+
34202+#ifndef CONFIG_XEN
34203+#ifdef CONFIG_SMP
34204+ /*
34205+ * Sync boot_level4_pgt mappings with the init_level4_pgt
34206+ * except for the low identity mappings which are already zapped
34207+ * in init_level4_pgt. This sync-up is essential for AP's bringup
34208+ */
34209+ memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
34210+#endif
34211+#endif
34212+}
34213+
34214+void free_init_pages(char *what, unsigned long begin, unsigned long end)
34215+{
34216+ unsigned long addr;
34217+
34218+ if (begin >= end)
34219+ return;
34220+
34221+ printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
34222+ for (addr = begin; addr < end; addr += PAGE_SIZE) {
34223+ ClearPageReserved(virt_to_page(addr));
34224+ init_page_count(virt_to_page(addr));
34225+ memset((void *)(addr & ~(PAGE_SIZE-1)),
34226+ POISON_FREE_INITMEM, PAGE_SIZE);
34227+ if (addr >= __START_KERNEL_map) {
34228+ /* make_readonly() reports all kernel addresses. */
34229+ __make_page_writable(__va(__pa(addr)));
34230+ if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
34231+ pgd_t *pgd = pgd_offset_k(addr);
34232+ pud_t *pud = pud_offset(pgd, addr);
34233+ pmd_t *pmd = pmd_offset(pud, addr);
34234+ pte_t *pte = pte_offset_kernel(pmd, addr);
34235+
34236+ xen_l1_entry_update(pte, __pte(0)); /* fallback */
34237+ }
34238+ }
34239+ free_page(addr);
34240+ totalram_pages++;
34241+ }
34242+}
34243+
34244+void free_initmem(void)
34245+{
34246+ memset(__initdata_begin, POISON_FREE_INITDATA,
34247+ __initdata_end - __initdata_begin);
34248+ free_init_pages("unused kernel memory",
34249+ (unsigned long)(&__init_begin),
34250+ (unsigned long)(&__init_end));
34251+}
34252+
34253+#ifdef CONFIG_DEBUG_RODATA
34254+
34255+void mark_rodata_ro(void)
34256+{
34257+ unsigned long addr = (unsigned long)__start_rodata;
34258+
34259+ for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
34260+ change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
34261+
34262+ printk ("Write protecting the kernel read-only data: %luk\n",
34263+ (__end_rodata - __start_rodata) >> 10);
34264+
34265+ /*
34266+ * change_page_attr_addr() requires a global_flush_tlb() call after it.
34267+ * We do this after the printk so that if something went wrong in the
34268+ * change, the printk gets out at least to give a better debug hint
34269+ * of who is the culprit.
34270+ */
34271+ global_flush_tlb();
34272+}
34273+#endif
34274+
34275+#ifdef CONFIG_BLK_DEV_INITRD
34276+void free_initrd_mem(unsigned long start, unsigned long end)
34277+{
34278+ free_init_pages("initrd memory", start, end);
34279+}
34280+#endif
34281+
34282+void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
34283+{
34284+ /* Should check here against the e820 map to avoid double free */
34285+#ifdef CONFIG_NUMA
34286+ int nid = phys_to_nid(phys);
34287+ reserve_bootmem_node(NODE_DATA(nid), phys, len);
34288+#else
34289+ reserve_bootmem(phys, len);
34290+#endif
34291+ if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
34292+ dma_reserve += len / PAGE_SIZE;
34293+}
34294+
34295+int kern_addr_valid(unsigned long addr)
34296+{
34297+ unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
34298+ pgd_t *pgd;
34299+ pud_t *pud;
34300+ pmd_t *pmd;
34301+ pte_t *pte;
34302+
34303+ if (above != 0 && above != -1UL)
34304+ return 0;
34305+
34306+ pgd = pgd_offset_k(addr);
34307+ if (pgd_none(*pgd))
34308+ return 0;
34309+
34310+ pud = pud_offset(pgd, addr);
34311+ if (pud_none(*pud))
34312+ return 0;
34313+
34314+ pmd = pmd_offset(pud, addr);
34315+ if (pmd_none(*pmd))
34316+ return 0;
34317+ if (pmd_large(*pmd))
34318+ return pfn_valid(pmd_pfn(*pmd));
34319+
34320+ pte = pte_offset_kernel(pmd, addr);
34321+ if (pte_none(*pte))
34322+ return 0;
34323+ return pfn_valid(pte_pfn(*pte));
34324+}
34325+
34326+#ifdef CONFIG_SYSCTL
34327+#include <linux/sysctl.h>
34328+
34329+extern int exception_trace, page_fault_trace;
34330+
34331+static ctl_table debug_table2[] = {
34332+ { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
34333+ proc_dointvec },
34334+ { 0, }
34335+};
34336+
34337+static ctl_table debug_root_table2[] = {
34338+ { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
34339+ .child = debug_table2 },
34340+ { 0 },
34341+};
34342+
34343+static __init int x8664_sysctl_init(void)
34344+{
34345+ register_sysctl_table(debug_root_table2, 1);
34346+ return 0;
34347+}
34348+__initcall(x8664_sysctl_init);
34349+#endif
34350+
34351+/* A pseudo VMA to allow ptrace access to the vsyscall page. This only
34352+ covers the 64-bit vsyscall page now. 32-bit has a real VMA and does
34353+ not need special handling anymore. */
34354+
34355+static struct vm_area_struct gate_vma = {
34356+ .vm_start = VSYSCALL_START,
34357+ .vm_end = VSYSCALL_END,
34358+ .vm_page_prot = PAGE_READONLY
34359+};
34360+
34361+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
34362+{
34363+#ifdef CONFIG_IA32_EMULATION
34364+ if (test_tsk_thread_flag(tsk, TIF_IA32))
34365+ return NULL;
34366+#endif
34367+ return &gate_vma;
34368+}
34369+
34370+int in_gate_area(struct task_struct *task, unsigned long addr)
34371+{
34372+ struct vm_area_struct *vma = get_gate_vma(task);
34373+ if (!vma)
34374+ return 0;
34375+ return (addr >= vma->vm_start) && (addr < vma->vm_end);
34376+}
34377+
34378+/* Use this when you have no reliable task/vma, typically from interrupt
34379+ * context. It is less reliable than using the task's vma and may give
34380+ * false positives.
34381+ */
34382+int in_gate_area_no_task(unsigned long addr)
34383+{
34384+ return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
34385+}
34386Index: head-2008-11-25/arch/x86/mm/pageattr_64-xen.c
34387===================================================================
34388--- /dev/null 1970-01-01 00:00:00.000000000 +0000
34389+++ head-2008-11-25/arch/x86/mm/pageattr_64-xen.c 2008-07-21 11:00:32.000000000 +0200
34390@@ -0,0 +1,502 @@
34391+/*
34392+ * Copyright 2002 Andi Kleen, SuSE Labs.
34393+ * Thanks to Ben LaHaise for precious feedback.
34394+ */
34395+
34396+#include <linux/mm.h>
34397+#include <linux/sched.h>
34398+#include <linux/highmem.h>
34399+#include <linux/module.h>
34400+#include <linux/slab.h>
34401+#include <asm/uaccess.h>
34402+#include <asm/processor.h>
34403+#include <asm/tlbflush.h>
34404+#include <asm/io.h>
34405+
34406+#ifdef CONFIG_XEN
34407+#include <asm/pgalloc.h>
34408+#include <asm/mmu_context.h>
34409+
34410+LIST_HEAD(mm_unpinned);
34411+DEFINE_SPINLOCK(mm_unpinned_lock);
34412+
34413+static void _pin_lock(struct mm_struct *mm, int lock) {
34414+ if (lock)
34415+ spin_lock(&mm->page_table_lock);
34416+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
34417+ /* While mm->page_table_lock protects us against insertions and
34418+ * removals of higher level page table pages, it doesn't protect
34419+ * against updates of pte-s. Such updates, however, require the
34420+ * pte pages to be in consistent state (unpinned+writable or
34421+ * pinned+readonly). The pinning and attribute changes, however,
34422+ * cannot be done atomically, which is why such updates must be
34423+ * prevented from happening concurrently.
34424+ * Note that no pte lock can ever elsewhere be acquired nesting
34425+ * with an already acquired one in the same mm, or with the mm's
34426+ * page_table_lock already acquired, as that would break in the
34427+ * non-split case (where all these are actually resolving to the
34428+ * one page_table_lock). Thus acquiring all of them here is not
34429+ * going to result in deadlocks, and the order of acquires
34430+ * doesn't matter.
34431+ */
34432+ {
34433+ pgd_t *pgd = mm->pgd;
34434+ unsigned g;
34435+
34436+ for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
34437+ pud_t *pud;
34438+ unsigned u;
34439+
34440+ if (pgd_none(*pgd))
34441+ continue;
34442+ pud = pud_offset(pgd, 0);
34443+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
34444+ pmd_t *pmd;
34445+ unsigned m;
34446+
34447+ if (pud_none(*pud))
34448+ continue;
34449+ pmd = pmd_offset(pud, 0);
34450+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
34451+ spinlock_t *ptl;
34452+
34453+ if (pmd_none(*pmd))
34454+ continue;
34455+ ptl = pte_lockptr(0, pmd);
34456+ if (lock)
34457+ spin_lock(ptl);
34458+ else
34459+ spin_unlock(ptl);
34460+ }
34461+ }
34462+ }
34463+ }
34464+#endif
34465+ if (!lock)
34466+ spin_unlock(&mm->page_table_lock);
34467+}
34468+#define pin_lock(mm) _pin_lock(mm, 1)
34469+#define pin_unlock(mm) _pin_lock(mm, 0)
34470+
34471+#define PIN_BATCH 8
34472+static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
34473+
34474+static inline unsigned int mm_walk_set_prot(void *pt, pgprot_t flags,
34475+ unsigned int cpu, unsigned int seq)
34476+{
34477+ struct page *page = virt_to_page(pt);
34478+ unsigned long pfn = page_to_pfn(page);
34479+
34480+ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
34481+ (unsigned long)__va(pfn << PAGE_SHIFT),
34482+ pfn_pte(pfn, flags), 0);
34483+ if (unlikely(++seq == PIN_BATCH)) {
34484+ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
34485+ PIN_BATCH, NULL)))
34486+ BUG();
34487+ seq = 0;
34488+ }
34489+
34490+ return seq;
34491+}
34492+
34493+static void mm_walk(struct mm_struct *mm, pgprot_t flags)
34494+{
34495+ pgd_t *pgd;
34496+ pud_t *pud;
34497+ pmd_t *pmd;
34498+ pte_t *pte;
34499+ int g,u,m;
34500+ unsigned int cpu, seq;
34501+ multicall_entry_t *mcl;
34502+
34503+ pgd = mm->pgd;
34504+ cpu = get_cpu();
34505+
34506+ /*
34507+ * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
34508+ * be the 'current' task's pagetables (e.g., current may be 32-bit,
34509+ * but the pagetables may be for a 64-bit task).
34510+ * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
34511+ * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
34512+ */
34513+ for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
34514+ if (pgd_none(*pgd))
34515+ continue;
34516+ pud = pud_offset(pgd, 0);
34517+ if (PTRS_PER_PUD > 1) /* not folded */
34518+ seq = mm_walk_set_prot(pud,flags,cpu,seq);
34519+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
34520+ if (pud_none(*pud))
34521+ continue;
34522+ pmd = pmd_offset(pud, 0);
34523+ if (PTRS_PER_PMD > 1) /* not folded */
34524+ seq = mm_walk_set_prot(pmd,flags,cpu,seq);
34525+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
34526+ if (pmd_none(*pmd))
34527+ continue;
34528+ pte = pte_offset_kernel(pmd,0);
34529+ seq = mm_walk_set_prot(pte,flags,cpu,seq);
34530+ }
34531+ }
34532+ }
34533+
34534+ mcl = per_cpu(pb_mcl, cpu);
34535+ if (unlikely(seq > PIN_BATCH - 2)) {
34536+ if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
34537+ BUG();
34538+ seq = 0;
34539+ }
34540+ MULTI_update_va_mapping(mcl + seq,
34541+ (unsigned long)__user_pgd(mm->pgd),
34542+ pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, flags),
34543+ 0);
34544+ MULTI_update_va_mapping(mcl + seq + 1,
34545+ (unsigned long)mm->pgd,
34546+ pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, flags),
34547+ UVMF_TLB_FLUSH);
34548+ if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
34549+ BUG();
34550+
34551+ put_cpu();
34552+}
34553+
34554+void mm_pin(struct mm_struct *mm)
34555+{
34556+ if (xen_feature(XENFEAT_writable_page_tables))
34557+ return;
34558+
34559+ pin_lock(mm);
34560+
34561+ mm_walk(mm, PAGE_KERNEL_RO);
34562+ xen_pgd_pin(__pa(mm->pgd)); /* kernel */
34563+ xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */
34564+ mm->context.pinned = 1;
34565+ spin_lock(&mm_unpinned_lock);
34566+ list_del(&mm->context.unpinned);
34567+ spin_unlock(&mm_unpinned_lock);
34568+
34569+ pin_unlock(mm);
34570+}
34571+
34572+void mm_unpin(struct mm_struct *mm)
34573+{
34574+ if (xen_feature(XENFEAT_writable_page_tables))
34575+ return;
34576+
34577+ pin_lock(mm);
34578+
34579+ xen_pgd_unpin(__pa(mm->pgd));
34580+ xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
34581+ mm_walk(mm, PAGE_KERNEL);
34582+ mm->context.pinned = 0;
34583+ spin_lock(&mm_unpinned_lock);
34584+ list_add(&mm->context.unpinned, &mm_unpinned);
34585+ spin_unlock(&mm_unpinned_lock);
34586+
34587+ pin_unlock(mm);
34588+}
34589+
34590+void mm_pin_all(void)
34591+{
34592+ if (xen_feature(XENFEAT_writable_page_tables))
34593+ return;
34594+
34595+ /*
34596+ * Allow uninterrupted access to the mm_unpinned list. We don't
34597+ * actually take the mm_unpinned_lock as it is taken inside mm_pin().
34598+ * All other CPUs must be at a safe point (e.g., in stop_machine
34599+ * or offlined entirely).
34600+ */
34601+ preempt_disable();
34602+ while (!list_empty(&mm_unpinned))
34603+ mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
34604+ context.unpinned));
34605+ preempt_enable();
34606+}
34607+
34608+void _arch_dup_mmap(struct mm_struct *mm)
34609+{
34610+ if (!mm->context.pinned)
34611+ mm_pin(mm);
34612+}
34613+
34614+void _arch_exit_mmap(struct mm_struct *mm)
34615+{
34616+ struct task_struct *tsk = current;
34617+
34618+ task_lock(tsk);
34619+
34620+ /*
34621+ * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
34622+ * *much* faster this way, as having no tlb flushes means bigger wrpt batches.
34623+ */
34624+ if (tsk->active_mm == mm) {
34625+ tsk->active_mm = &init_mm;
34626+ atomic_inc(&init_mm.mm_count);
34627+
34628+ switch_mm(mm, &init_mm, tsk);
34629+
34630+ atomic_dec(&mm->mm_count);
34631+ BUG_ON(atomic_read(&mm->mm_count) == 0);
34632+ }
34633+
34634+ task_unlock(tsk);
34635+
34636+ if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) &&
34637+ !mm->context.has_foreign_mappings )
34638+ mm_unpin(mm);
34639+}
34640+
34641+struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
34642+{
34643+ struct page *pte;
34644+
34645+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
34646+ if (pte) {
34647+ SetPageForeign(pte, pte_free);
34648+ init_page_count(pte);
34649+ }
34650+ return pte;
34651+}
34652+
34653+void pte_free(struct page *pte)
34654+{
34655+ unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
34656+
34657+ if (!pte_write(*virt_to_ptep(va)))
34658+ if (HYPERVISOR_update_va_mapping(
34659+ va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0))
34660+ BUG();
34661+
34662+ ClearPageForeign(pte);
34663+ init_page_count(pte);
34664+
34665+ __free_page(pte);
34666+}
34667+#endif /* CONFIG_XEN */
34668+
34669+pte_t *lookup_address(unsigned long address)
34670+{
34671+ pgd_t *pgd = pgd_offset_k(address);
34672+ pud_t *pud;
34673+ pmd_t *pmd;
34674+ pte_t *pte;
34675+ if (pgd_none(*pgd))
34676+ return NULL;
34677+ pud = pud_offset(pgd, address);
34678+ if (!pud_present(*pud))
34679+ return NULL;
34680+ pmd = pmd_offset(pud, address);
34681+ if (!pmd_present(*pmd))
34682+ return NULL;
34683+ if (pmd_large(*pmd))
34684+ return (pte_t *)pmd;
34685+ pte = pte_offset_kernel(pmd, address);
34686+ if (pte && !pte_present(*pte))
34687+ pte = NULL;
34688+ return pte;
34689+}
34690+
34691+static struct page *split_large_page(unsigned long address, pgprot_t prot,
34692+ pgprot_t ref_prot)
34693+{
34694+ int i;
34695+ unsigned long addr;
34696+ struct page *base = alloc_pages(GFP_KERNEL, 0);
34697+ pte_t *pbase;
34698+ if (!base)
34699+ return NULL;
34700+ /*
34701+ * page_private is used to track the number of entries in
34702+ * the page table page that have non-standard attributes.
34703+ */
34704+ SetPagePrivate(base);
34705+ page_private(base) = 0;
34706+
34707+ address = __pa(address);
34708+ addr = address & LARGE_PAGE_MASK;
34709+ pbase = (pte_t *)page_address(base);
34710+ for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
34711+ pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
34712+ addr == address ? prot : ref_prot);
34713+ }
34714+ return base;
34715+}
34716+
34717+
34718+static void flush_kernel_map(void *address)
34719+{
34720+ if (0 && address && cpu_has_clflush) {
34721+ /* is this worth it? */
34722+ int i;
34723+ for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
34724+ asm volatile("clflush (%0)" :: "r" (address + i));
34725+ } else
34726+ asm volatile("wbinvd":::"memory");
34727+ if (address)
34728+ __flush_tlb_one(address);
34729+ else
34730+ __flush_tlb_all();
34731+}
34732+
34733+
34734+static inline void flush_map(unsigned long address)
34735+{
34736+ on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
34737+}
34738+
34739+static struct page *deferred_pages; /* protected by init_mm.mmap_sem */
34740+
34741+static inline void save_page(struct page *fpage)
34742+{
34743+ fpage->lru.next = (struct list_head *)deferred_pages;
34744+ deferred_pages = fpage;
34745+}
34746+
34747+/*
34748+ * No more special protections in this 2/4MB area - revert to a
34749+ * large page again.
34750+ */
34751+static void revert_page(unsigned long address, pgprot_t ref_prot)
34752+{
34753+ pgd_t *pgd;
34754+ pud_t *pud;
34755+ pmd_t *pmd;
34756+ pte_t large_pte;
34757+
34758+ pgd = pgd_offset_k(address);
34759+ BUG_ON(pgd_none(*pgd));
34760+ pud = pud_offset(pgd,address);
34761+ BUG_ON(pud_none(*pud));
34762+ pmd = pmd_offset(pud, address);
34763+ BUG_ON(__pmd_val(*pmd) & _PAGE_PSE);
34764+ pgprot_val(ref_prot) |= _PAGE_PSE;
34765+ large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
34766+ set_pte((pte_t *)pmd, large_pte);
34767+}
34768+
34769+static int
34770+__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
34771+ pgprot_t ref_prot)
34772+{
34773+ pte_t *kpte;
34774+ struct page *kpte_page;
34775+ unsigned kpte_flags;
34776+ pgprot_t ref_prot2;
34777+ kpte = lookup_address(address);
34778+ if (!kpte) return 0;
34779+ kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
34780+ kpte_flags = pte_val(*kpte);
34781+ if (pgprot_val(prot) != pgprot_val(ref_prot)) {
34782+ if ((kpte_flags & _PAGE_PSE) == 0) {
34783+ set_pte(kpte, pfn_pte(pfn, prot));
34784+ } else {
34785+ /*
34786+ * split_large_page will take the reference for this
34787+ * change_page_attr on the split page.
34788+ */
34789+
34790+ struct page *split;
34791+ ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
34792+
34793+ split = split_large_page(address, prot, ref_prot2);
34794+ if (!split)
34795+ return -ENOMEM;
34796+ set_pte(kpte,mk_pte(split, ref_prot2));
34797+ kpte_page = split;
34798+ }
34799+ page_private(kpte_page)++;
34800+ } else if ((kpte_flags & _PAGE_PSE) == 0) {
34801+ set_pte(kpte, pfn_pte(pfn, ref_prot));
34802+ BUG_ON(page_private(kpte_page) == 0);
34803+ page_private(kpte_page)--;
34804+ } else
34805+ BUG();
34806+
34807+ /* on x86-64 the direct mapping set at boot is not using 4k pages */
34808+ /*
34809+ * ..., but the XEN guest kernels (currently) do:
34810+ * If the pte was reserved, it means it was created at boot
34811+ * time (not via split_large_page) and in turn we must not
34812+ * replace it with a large page.
34813+ */
34814+#ifndef CONFIG_XEN
34815+ BUG_ON(PageReserved(kpte_page));
34816+#else
34817+ if (PageReserved(kpte_page))
34818+ return 0;
34819+#endif
34820+
34821+ if (page_private(kpte_page) == 0) {
34822+ save_page(kpte_page);
34823+ revert_page(address, ref_prot);
34824+ }
34825+ return 0;
34826+}
34827+
34828+/*
34829+ * Change the page attributes of a page in the linear mapping.
34830+ *
34831+ * This should be used when a page is mapped with a different caching policy
34832+ * than write-back somewhere - some CPUs do not like it when mappings with
34833+ * different caching policies exist. This changes the page attributes of the
34834+ * in kernel linear mapping too.
34835+ *
34836+ * The caller needs to ensure that there are no conflicting mappings elsewhere.
34837+ * This function only deals with the kernel linear map.
34838+ *
34839+ * Caller must call global_flush_tlb() after this.
34840+ */
34841+int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
34842+{
34843+ int err = 0;
34844+ int i;
34845+
34846+ down_write(&init_mm.mmap_sem);
34847+ for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
34848+ unsigned long pfn = __pa(address) >> PAGE_SHIFT;
34849+
34850+ err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
34851+ if (err)
34852+ break;
34853+ /* Handle the kernel mapping too, which aliases part of
34854+ * lowmem. */
34855+ if (__pa(address) < KERNEL_TEXT_SIZE) {
34856+ unsigned long addr2;
34857+ pgprot_t prot2 = prot;
34858+ addr2 = __START_KERNEL_map + __pa(address);
34859+ pgprot_val(prot2) &= ~_PAGE_NX;
34860+ err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
34861+ }
34862+ }
34863+ up_write(&init_mm.mmap_sem);
34864+ return err;
34865+}
34866+
34867+/* Don't call this for MMIO areas that may not have a mem_map entry */
34868+int change_page_attr(struct page *page, int numpages, pgprot_t prot)
34869+{
34870+ unsigned long addr = (unsigned long)page_address(page);
34871+ return change_page_attr_addr(addr, numpages, prot);
34872+}
34873+
34874+void global_flush_tlb(void)
34875+{
34876+ struct page *dpage;
34877+
34878+ down_read(&init_mm.mmap_sem);
34879+ dpage = xchg(&deferred_pages, NULL);
34880+ up_read(&init_mm.mmap_sem);
34881+
34882+ flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0);
34883+ while (dpage) {
34884+ struct page *tmp = dpage;
34885+ dpage = (struct page *)dpage->lru.next;
34886+ ClearPagePrivate(tmp);
34887+ __free_page(tmp);
34888+ }
34889+}
34890+
34891+EXPORT_SYMBOL(change_page_attr);
34892+EXPORT_SYMBOL(global_flush_tlb);
34893Index: head-2008-11-25/drivers/pci/msi-xen.c
34894===================================================================
34895--- /dev/null 1970-01-01 00:00:00.000000000 +0000
34896+++ head-2008-11-25/drivers/pci/msi-xen.c 2008-10-13 13:43:45.000000000 +0200
34897@@ -0,0 +1,809 @@
34898+/*
34899+ * File: msi.c
34900+ * Purpose: PCI Message Signaled Interrupt (MSI)
34901+ *
34902+ * Copyright (C) 2003-2004 Intel
34903+ * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)
34904+ */
34905+
34906+#include <linux/mm.h>
34907+#include <linux/irq.h>
34908+#include <linux/interrupt.h>
34909+#include <linux/init.h>
34910+#include <linux/ioport.h>
34911+#include <linux/smp_lock.h>
34912+#include <linux/pci.h>
34913+#include <linux/proc_fs.h>
34914+
34915+#include <xen/evtchn.h>
34916+
34917+#include <asm/errno.h>
34918+#include <asm/io.h>
34919+#include <asm/smp.h>
34920+
34921+#include "pci.h"
34922+#include "msi.h"
34923+
34924+static int pci_msi_enable = 1;
34925+
34926+static struct msi_ops *msi_ops;
34927+
34928+int msi_register(struct msi_ops *ops)
34929+{
34930+ msi_ops = ops;
34931+ return 0;
34932+}
34933+
34934+static LIST_HEAD(msi_dev_head);
34935+DEFINE_SPINLOCK(msi_dev_lock);
34936+
34937+struct msi_dev_list {
34938+ struct pci_dev *dev;
34939+ struct list_head list;
34940+ spinlock_t pirq_list_lock;
34941+ struct list_head pirq_list_head;
34942+};
34943+
34944+struct msi_pirq_entry {
34945+ struct list_head list;
34946+ int pirq;
34947+ int entry_nr;
34948+};
34949+
34950+static struct msi_dev_list *get_msi_dev_pirq_list(struct pci_dev *dev)
34951+{
34952+ struct msi_dev_list *msi_dev_list, *ret = NULL;
34953+ unsigned long flags;
34954+
34955+ spin_lock_irqsave(&msi_dev_lock, flags);
34956+
34957+ list_for_each_entry(msi_dev_list, &msi_dev_head, list)
34958+ if ( msi_dev_list->dev == dev )
34959+ ret = msi_dev_list;
34960+
34961+ if ( ret ) {
34962+ spin_unlock_irqrestore(&msi_dev_lock, flags);
34963+ return ret;
34964+ }
34965+
34966+ /* No msi_dev has been allocated for this device yet. */
34967+ ret = kzalloc(sizeof(struct msi_dev_list), GFP_ATOMIC);
34968+
34969+ /* Failed to allocate msi_dev structure */
34970+ if ( !ret ) {
34971+ spin_unlock_irqrestore(&msi_dev_lock, flags);
34972+ return NULL;
34973+ }
34974+
34975+ ret->dev = dev;
34976+ spin_lock_init(&ret->pirq_list_lock);
34977+ INIT_LIST_HEAD(&ret->pirq_list_head);
34978+ list_add_tail(&ret->list, &msi_dev_head);
34979+ spin_unlock_irqrestore(&msi_dev_lock, flags);
34980+ return ret;
34981+}
34982+
34983+static int attach_pirq_entry(int pirq, int entry_nr,
34984+ struct msi_dev_list *msi_dev_entry)
34985+{
34986+ struct msi_pirq_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
34987+ unsigned long flags;
34988+
34989+ if (!entry)
34990+ return -ENOMEM;
34991+ entry->pirq = pirq;
34992+ entry->entry_nr = entry_nr;
34993+ spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
34994+ list_add_tail(&entry->list, &msi_dev_entry->pirq_list_head);
34995+ spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
34996+ return 0;
34997+}
34998+
34999+static void detach_pirq_entry(int entry_nr,
35000+ struct msi_dev_list *msi_dev_entry)
35001+{
35002+ unsigned long flags;
35003+ struct msi_pirq_entry *pirq_entry;
35004+
35005+ list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) {
35006+ if (pirq_entry->entry_nr == entry_nr) {
35007+ spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
35008+ list_del(&pirq_entry->list);
35009+ spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
35010+ kfree(pirq_entry);
35011+ return;
35012+ }
35013+ }
35014+}
35015+
35016+/*
35017+ * pciback will provide the device's owner
35018+ */
35019+static int (*get_owner)(struct pci_dev *dev);
35020+
35021+int register_msi_get_owner(int (*func)(struct pci_dev *dev))
35022+{
35023+ if (get_owner) {
35024+ printk(KERN_WARNING "register msi_get_owner again\n");
35025+ return -EEXIST;
35026+ }
35027+ get_owner = func;
35028+ return 0;
35029+}
35030+
35031+int unregister_msi_get_owner(int (*func)(struct pci_dev *dev))
35032+{
35033+ if (get_owner != func)
35034+ return -EINVAL;
35035+ get_owner = NULL;
35036+ return 0;
35037+}
35038+
35039+static int msi_get_dev_owner(struct pci_dev *dev)
35040+{
35041+ int owner;
35042+
35043+ BUG_ON(!is_initial_xendomain());
35044+ if (get_owner && (owner = get_owner(dev)) >= 0) {
35045+ printk(KERN_INFO "get owner for dev %x get %x \n",
35046+ dev->devfn, owner);
35047+ return owner;
35048+ }
35049+
35050+ return DOMID_SELF;
35051+}
35052+
35053+static int msi_unmap_pirq(struct pci_dev *dev, int pirq)
35054+{
35055+ struct physdev_unmap_pirq unmap;
35056+ int rc;
35057+
35058+ unmap.domid = msi_get_dev_owner(dev);
35059+ /* See the comments in msi_map_pirq_to_vector: the input parameter pirq
35060+ * means an irq number only if the device belongs to dom0 itself.
35061+ */
35062+ unmap.pirq = (unmap.domid != DOMID_SELF)
35063+ ? pirq : evtchn_get_xen_pirq(pirq);
35064+
35065+ if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap)))
35066+ printk(KERN_WARNING "unmap irq %x failed\n", pirq);
35067+
35068+ if (rc < 0)
35069+ return rc;
35070+
35071+ if (unmap.domid == DOMID_SELF)
35072+ evtchn_map_pirq(pirq, 0);
35073+
35074+ return 0;
35075+}
35076+
35077+static u64 find_table_base(struct pci_dev *dev, int pos)
35078+{
35079+ u8 bar;
35080+ u32 reg;
35081+ unsigned long flags;
35082+
35083+ pci_read_config_dword(dev, msix_table_offset_reg(pos), &reg);
35084+ bar = reg & PCI_MSIX_FLAGS_BIRMASK;
35085+
35086+ flags = pci_resource_flags(dev, bar);
35087+ if (flags & (IORESOURCE_DISABLED | IORESOURCE_UNSET | IORESOURCE_BUSY))
35088+ return 0;
35089+
35090+ return pci_resource_start(dev, bar);
35091+}
35092+
35093+/*
35094+ * Protected by msi_lock
35095+ */
35096+static int msi_map_pirq_to_vector(struct pci_dev *dev, int pirq,
35097+ int entry_nr, u64 table_base)
35098+{
35099+ struct physdev_map_pirq map_irq;
35100+ int rc;
35101+ domid_t domid = DOMID_SELF;
35102+
35103+ domid = msi_get_dev_owner(dev);
35104+
35105+ map_irq.domid = domid;
35106+ map_irq.type = MAP_PIRQ_TYPE_MSI;
35107+ map_irq.index = -1;
35108+ map_irq.pirq = pirq < 0 ? -1 : evtchn_get_xen_pirq(pirq);
35109+ map_irq.bus = dev->bus->number;
35110+ map_irq.devfn = dev->devfn;
35111+ map_irq.entry_nr = entry_nr;
35112+ map_irq.table_base = table_base;
35113+
35114+ if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq)))
35115+ printk(KERN_WARNING "map irq failed\n");
35116+
35117+ if (rc < 0)
35118+ return rc;
35119+ /* This happens when MSI support is not enabled in Xen. */
35120+ if (rc == 0 && map_irq.pirq < 0)
35121+ return -ENOSYS;
35122+
35123+ BUG_ON(map_irq.pirq <= 0);
35124+
35125+ /* If mapping of this particular MSI is on behalf of another domain,
35126+ * we do not need to get an irq in dom0. This also implies:
35127+ * dev->irq in dom0 will be a 'Xen pirq' if this device belongs
35128+ * to another domain, and a 'Linux irq' if it belongs to dom0.
35129+ */
35130+ return ((domid != DOMID_SELF) ?
35131+ map_irq.pirq : evtchn_map_pirq(pirq, map_irq.pirq));
35132+}
35133+
35134+static int msi_map_vector(struct pci_dev *dev, int entry_nr, u64 table_base)
35135+{
35136+ return msi_map_pirq_to_vector(dev, -1, entry_nr, table_base);
35137+}
35138+
35139+static int msi_init(void)
35140+{
35141+ static int status = 0;
35142+
35143+ if (pci_msi_quirk) {
35144+ pci_msi_enable = 0;
35145+ printk(KERN_WARNING "PCI: MSI quirk detected. MSI disabled.\n");
35146+ status = -EINVAL;
35147+ }
35148+
35149+ return status;
35150+}
35151+
35152+void pci_scan_msi_device(struct pci_dev *dev) { }
35153+
35154+void disable_msi_mode(struct pci_dev *dev, int pos, int type)
35155+{
35156+ u16 control;
35157+
35158+ pci_read_config_word(dev, msi_control_reg(pos), &control);
35159+ if (type == PCI_CAP_ID_MSI) {
35160+ /* Clear the MSI enable bit */
35161+ msi_disable(control);
35162+ pci_write_config_word(dev, msi_control_reg(pos), control);
35163+ dev->msi_enabled = 0;
35164+ } else {
35165+ msix_disable(control);
35166+ pci_write_config_word(dev, msi_control_reg(pos), control);
35167+ dev->msix_enabled = 0;
35168+ }
35169+ if (pci_find_capability(dev, PCI_CAP_ID_EXP)) {
35170+ /* PCI Express Endpoint device detected */
35171+ pci_intx(dev, 1); /* enable intx */
35172+ }
35173+}
35174+
35175+static void enable_msi_mode(struct pci_dev *dev, int pos, int type)
35176+{
35177+ u16 control;
35178+
35179+ pci_read_config_word(dev, msi_control_reg(pos), &control);
35180+ if (type == PCI_CAP_ID_MSI) {
35181+ /* Set enabled bits to single MSI & enable MSI_enable bit */
35182+ msi_enable(control, 1);
35183+ pci_write_config_word(dev, msi_control_reg(pos), control);
35184+ dev->msi_enabled = 1;
35185+ } else {
35186+ msix_enable(control);
35187+ pci_write_config_word(dev, msi_control_reg(pos), control);
35188+ dev->msix_enabled = 1;
35189+ }
35190+ if (pci_find_capability(dev, PCI_CAP_ID_EXP)) {
35191+ /* PCI Express Endpoint device detected */
35192+ pci_intx(dev, 0); /* disable intx */
35193+ }
35194+}
35195+
35196+#ifdef CONFIG_PM
35197+int pci_save_msi_state(struct pci_dev *dev)
35198+{
35199+ int pos;
35200+
35201+ pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35202+ if (pos <= 0 || dev->no_msi)
35203+ return 0;
35204+
35205+ if (!dev->msi_enabled)
35206+ return 0;
35207+
35208+ /* Restore dev->irq to its default pin-assertion vector */
35209+ msi_unmap_pirq(dev, dev->irq);
35210+ /* Disable MSI mode */
35211+ disable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
35212+ /* Set the flags for use of restore */
35213+ dev->msi_enabled = 1;
35214+ return 0;
35215+}
35216+
35217+void pci_restore_msi_state(struct pci_dev *dev)
35218+{
35219+ int pos, pirq;
35220+
35221+ pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35222+ if (pos <= 0)
35223+ return;
35224+
35225+ if (!dev->msi_enabled)
35226+ return;
35227+
35228+ pirq = msi_map_pirq_to_vector(dev, dev->irq, 0, 0);
35229+ if (pirq < 0)
35230+ return;
35231+ enable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
35232+}
35233+
35234+int pci_save_msix_state(struct pci_dev *dev)
35235+{
35236+ int pos;
35237+ unsigned long flags;
35238+ struct msi_dev_list *msi_dev_entry;
35239+ struct msi_pirq_entry *pirq_entry, *tmp;
35240+
35241+ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35242+ if (pos <= 0 || dev->no_msi)
35243+ return 0;
35244+
35245+ /* save the capability */
35246+ if (!dev->msix_enabled)
35247+ return 0;
35248+
35249+ msi_dev_entry = get_msi_dev_pirq_list(dev);
35250+
35251+ spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
35252+ list_for_each_entry_safe(pirq_entry, tmp,
35253+ &msi_dev_entry->pirq_list_head, list)
35254+ msi_unmap_pirq(dev, pirq_entry->pirq);
35255+ spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
35256+
35257+ disable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
35258+ /* Set the flags for use of restore */
35259+ dev->msix_enabled = 1;
35260+
35261+ return 0;
35262+}
35263+
35264+void pci_restore_msix_state(struct pci_dev *dev)
35265+{
35266+ int pos;
35267+ unsigned long flags;
35268+ u64 table_base;
35269+ struct msi_dev_list *msi_dev_entry;
35270+ struct msi_pirq_entry *pirq_entry, *tmp;
35271+
35272+ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35273+ if (pos <= 0)
35274+ return;
35275+
35276+ if (!dev->msix_enabled)
35277+ return;
35278+
35279+ msi_dev_entry = get_msi_dev_pirq_list(dev);
35280+ table_base = find_table_base(dev, pos);
35281+ if (!table_base)
35282+ return;
35283+
35284+ spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
35285+ list_for_each_entry_safe(pirq_entry, tmp,
35286+ &msi_dev_entry->pirq_list_head, list) {
35287+ int rc = msi_map_pirq_to_vector(dev, pirq_entry->pirq,
35288+ pirq_entry->entry_nr, table_base);
35289+ if (rc < 0)
35290+ printk(KERN_WARNING
35291+ "%s: re-mapping irq #%d (pirq%d) failed: %d\n",
35292+ pci_name(dev), pirq_entry->entry_nr,
35293+ pirq_entry->pirq, rc);
35294+ }
35295+ spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
35296+
35297+ enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
35298+}
35299+#endif
35300+
35301+/**
35302+ * msi_capability_init - configure device's MSI capability structure
35303+ * @dev: pointer to the pci_dev data structure of MSI device function
35304+ *
35305+ * Set up the MSI capability structure of the device function with a
35306+ * single MSI vector, regardless of whether the device function is capable
35307+ * of handling multiple messages. A return of zero indicates successful
35308+ * setup of entry zero with the new MSI vector; non-zero indicates failure.
35309+ **/
35310+static int msi_capability_init(struct pci_dev *dev)
35311+{
35312+ int pos, pirq;
35313+ u16 control;
35314+
35315+ pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35316+ pci_read_config_word(dev, msi_control_reg(pos), &control);
35317+
35318+ pirq = msi_map_vector(dev, 0, 0);
35319+ if (pirq < 0)
35320+ return -EBUSY;
35321+
35322+ dev->irq = pirq;
35323+ /* Set MSI enabled bits */
35324+ enable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
35325+ dev->msi_enabled = 1;
35326+
35327+ return 0;
35328+}
35329+
35330+/**
35331+ * msix_capability_init - configure device's MSI-X capability
35332+ * @dev: pointer to the pci_dev data structure of MSI-X device function
35333+ * @entries: pointer to an array of struct msix_entry entries
35334+ * @nvec: number of @entries
35335+ *
35336+ * Set up the MSI-X capability structure of the device function with the
35337+ * requested entries. A return of zero indicates that all requested MSI-X
35338+ * entries were set up with allocated vectors; non-zero indicates failure.
35339+ **/
35340+static int msix_capability_init(struct pci_dev *dev,
35341+ struct msix_entry *entries, int nvec)
35342+{
35343+ u64 table_base;
35344+ int pirq, i, j, mapped, pos;
35345+ struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev);
35346+ struct msi_pirq_entry *pirq_entry;
35347+
35348+ if (!msi_dev_entry)
35349+ return -ENOMEM;
35350+
35351+ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35352+ table_base = find_table_base(dev, pos);
35353+ if (!table_base)
35354+ return -ENODEV;
35355+
35356+ /* MSI-X Table Initialization */
35357+ for (i = 0; i < nvec; i++) {
35358+ mapped = 0;
35359+ list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) {
35360+ if (pirq_entry->entry_nr == entries[i].entry) {
35361+ printk(KERN_WARNING "msix entry %d for dev %02x:%02x:%01x "
35362+ "was not freed before being acquired again\n", entries[i].entry,
35363+ dev->bus->number, PCI_SLOT(dev->devfn),
35364+ PCI_FUNC(dev->devfn));
35365+ (entries + i)->vector = pirq_entry->pirq;
35366+ mapped = 1;
35367+ break;
35368+ }
35369+ }
35370+ if (mapped)
35371+ continue;
35372+ pirq = msi_map_vector(dev, entries[i].entry, table_base);
35373+ if (pirq < 0)
35374+ break;
35375+ attach_pirq_entry(pirq, entries[i].entry, msi_dev_entry);
35376+ (entries + i)->vector = pirq;
35377+ }
35378+
35379+ if (i != nvec) {
35380+ for (j = --i; j >= 0; j--) {
35381+ msi_unmap_pirq(dev, entries[j].vector);
35382+ detach_pirq_entry(entries[j].entry, msi_dev_entry);
35383+ entries[j].vector = 0;
35384+ }
35385+ return -EBUSY;
35386+ }
35387+
35388+ enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
35389+ dev->msix_enabled = 1;
35390+
35391+ return 0;
35392+}
35393+
35394+/**
35395+ * pci_enable_msi - configure device's MSI capability structure
35396+ * @dev: pointer to the pci_dev data structure of MSI device function
35397+ *
35398+ * Set up the MSI capability structure of the device function with
35399+ * a single MSI vector when its software driver requests MSI mode
35400+ * for the hardware device function. A return of zero indicates
35401+ * successful setup of entry zero with the new MSI vector; non-zero
35402+ * indicates failure.
35403+ **/
35404+extern int pci_frontend_enable_msi(struct pci_dev *dev);
35405+int pci_enable_msi(struct pci_dev* dev)
35406+{
35407+ struct pci_bus *bus;
35408+ int pos, temp, status = -EINVAL;
35409+
35410+ if (!pci_msi_enable || !dev)
35411+ return status;
35412+
35413+ if (dev->no_msi)
35414+ return status;
35415+
35416+ for (bus = dev->bus; bus; bus = bus->parent)
35417+ if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI)
35418+ return -EINVAL;
35419+
35420+ status = msi_init();
35421+ if (status < 0)
35422+ return status;
35423+
35424+#ifdef CONFIG_XEN_PCIDEV_FRONTEND
35425+ if (!is_initial_xendomain())
35426+ {
35427+ int ret;
35428+
35429+ temp = dev->irq;
35430+ ret = pci_frontend_enable_msi(dev);
35431+ if (ret)
35432+ return ret;
35433+
35434+ dev->irq = evtchn_map_pirq(-1, dev->irq);
35435+ dev->irq_old = temp;
35436+
35437+ return ret;
35438+ }
35439+#endif
35440+
35441+ temp = dev->irq;
35442+
35443+ pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35444+ if (!pos)
35445+ return -EINVAL;
35446+
35447+ /* Check whether the driver has already requested MSI-X vectors */
35448+ if (dev->msix_enabled) {
35449+ printk(KERN_INFO "PCI: %s: Can't enable MSI. "
35450+ "Device already has MSI-X vectors assigned\n",
35451+ pci_name(dev));
35452+ dev->irq = temp;
35453+ return -EINVAL;
35454+ }
35455+
35456+ status = msi_capability_init(dev);
35457+ if ( !status )
35458+ dev->irq_old = temp;
35459+ else
35460+ dev->irq = temp;
35461+
35462+ return status;
35463+}
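/*
 * Editor's illustration (not part of the original patch): a typical driver
 * use of pci_enable_msi() in this kernel generation, assuming
 * <linux/pci.h> and <linux/interrupt.h> are included. example_isr and the
 * "example" name are placeholders; the two-argument handler signature is
 * the one used by kernels of this era.
 */
static irqreturn_t example_isr(int irq, void *data)
{
	return IRQ_HANDLED;
}

static int example_setup_irq(struct pci_dev *pdev)
{
	int rc = pci_enable_msi(pdev);

	if (rc)
		dev_info(&pdev->dev, "MSI unavailable, falling back to INTx\n");

	/* dev->irq is the MSI vector on success, the legacy irq otherwise */
	rc = request_irq(pdev->irq, example_isr, IRQF_SHARED, "example", pdev);
	if (rc && pdev->msi_enabled)
		pci_disable_msi(pdev);
	return rc;
}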
35464+
35465+extern void pci_frontend_disable_msi(struct pci_dev* dev);
35466+void pci_disable_msi(struct pci_dev* dev)
35467+{
35468+ int pos;
35469+ int pirq;
35470+
35471+ if (!pci_msi_enable)
35472+ return;
35473+ if (!dev)
35474+ return;
35475+
35476+#ifdef CONFIG_XEN_PCIDEV_FRONTEND
35477+ if (!is_initial_xendomain()) {
35478+ evtchn_map_pirq(dev->irq, 0);
35479+ pci_frontend_disable_msi(dev);
35480+ dev->irq = dev->irq_old;
35481+ return;
35482+ }
35483+#endif
35484+
35485+ pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35486+ if (!pos)
35487+ return;
35488+
35489+ pirq = dev->irq;
35490+ /* Restore dev->irq to its default pin-assertion vector */
35491+ dev->irq = dev->irq_old;
35492+ msi_unmap_pirq(dev, pirq);
35493+
35494+ /* Disable MSI mode */
35495+ disable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
35496+}
35497+
35498+/**
35499+ * pci_enable_msix - configure device's MSI-X capability structure
35500+ * @dev: pointer to the pci_dev data structure of MSI-X device function
35501+ * @entries: pointer to an array of MSI-X entries
35502+ * @nvec: number of MSI-X vectors requested for allocation by device driver
35503+ *
35504+ * Set up the MSI-X capability structure of the device function with the
35505+ * number of requested vectors when its software driver requests MSI-X
35506+ * mode for the hardware device function. A return of zero indicates
35507+ * successful configuration of the MSI-X capability structure with newly
35508+ * allocated MSI-X vectors. A return of < 0 indicates a failure, while a
35509+ * return of > 0 indicates that the request exceeds the number of vectors
35510+ * available; the driver should retry with at most the returned number
35511+ * of vectors.
35512+ **/
35513+extern int pci_frontend_enable_msix(struct pci_dev *dev,
35514+ struct msix_entry *entries, int nvec);
35515+int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec)
35516+{
35517+ struct pci_bus *bus;
35518+ int status, pos, nr_entries;
35519+ int i, j, temp;
35520+ u16 control;
35521+
35522+ if (!pci_msi_enable || !dev || !entries)
35523+ return -EINVAL;
35524+
35525+ if (dev->no_msi)
35526+ return -EINVAL;
35527+
35528+ for (bus = dev->bus; bus; bus = bus->parent)
35529+ if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI)
35530+ return -EINVAL;
35531+
35532+#ifdef CONFIG_XEN_PCIDEV_FRONTEND
35533+ if (!is_initial_xendomain()) {
35534+ struct msi_dev_list *msi_dev_entry;
35535+ struct msi_pirq_entry *pirq_entry;
35536+ int ret, irq;
35537+
35538+ ret = pci_frontend_enable_msix(dev, entries, nvec);
35539+ if (ret) {
35540+ printk(KERN_WARNING "pci_frontend_enable_msix failed: %x\n", ret);
35541+ return ret;
35542+ }
35543+
35544+ msi_dev_entry = get_msi_dev_pirq_list(dev);
35545+ for (i = 0; i < nvec; i++) {
35546+ int mapped = 0;
35547+
35548+ list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) {
35549+ if (pirq_entry->entry_nr == entries[i].entry) {
35550+ irq = pirq_entry->pirq;
35551+ BUG_ON(entries[i].vector != evtchn_get_xen_pirq(irq));
35552+ entries[i].vector = irq;
35553+ mapped = 1;
35554+ break;
35555+ }
35556+ }
35557+ if (mapped)
35558+ continue;
35559+ irq = evtchn_map_pirq(-1, entries[i].vector);
35560+ attach_pirq_entry(irq, entries[i].entry, msi_dev_entry);
35561+ entries[i].vector = irq;
35562+ }
35563+ return 0;
35564+ }
35565+#endif
35566+
35567+ status = msi_init();
35568+ if (status < 0)
35569+ return status;
35570+
35571+ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35572+ if (!pos)
35573+ return -EINVAL;
35574+
35575+ pci_read_config_word(dev, msi_control_reg(pos), &control);
35576+ nr_entries = multi_msix_capable(control);
35577+ if (nvec > nr_entries)
35578+ return -EINVAL;
35579+
35580+ /* Check for any invalid entries */
35581+ for (i = 0; i < nvec; i++) {
35582+ if (entries[i].entry >= nr_entries)
35583+ return -EINVAL; /* invalid entry */
35584+ for (j = i + 1; j < nvec; j++) {
35585+ if (entries[i].entry == entries[j].entry)
35586+ return -EINVAL; /* duplicate entry */
35587+ }
35588+ }
35589+
35590+ temp = dev->irq;
35591+ /* Check whether the driver has already requested an MSI vector */
35592+ if (dev->msi_enabled) {
35593+ printk(KERN_INFO "PCI: %s: Can't enable MSI-X. "
35594+ "Device already has an MSI vector assigned\n",
35595+ pci_name(dev));
35596+ dev->irq = temp;
35597+ return -EINVAL;
35598+ }
35599+
35600+ status = msix_capability_init(dev, entries, nvec);
35601+
35602+ if ( !status )
35603+ dev->irq_old = temp;
35604+ else
35605+ dev->irq = temp;
35606+
35607+ return status;
35608+}
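/*
 * Editor's illustration (not part of the original patch): how a driver of
 * this era would typically call pci_enable_msix(), retrying with fewer
 * vectors when a positive value is returned as described in the kernel-doc
 * above. The vector count of 4 is an arbitrary example.
 */
static int example_enable_msix(struct pci_dev *pdev, struct msix_entry *entries)
{
	int i, rc, nvec = 4;

	for (i = 0; i < nvec; i++)
		entries[i].entry = i;

	while (nvec > 0) {
		rc = pci_enable_msix(pdev, entries, nvec);
		if (rc == 0)
			return nvec;	/* entries[i].vector now holds the irqs */
		if (rc < 0)
			return rc;	/* hard failure */
		nvec = rc;		/* only 'rc' vectors available, retry */
	}
	return -ENOSPC;
}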
35609+
35610+extern void pci_frontend_disable_msix(struct pci_dev* dev);
35611+void pci_disable_msix(struct pci_dev* dev)
35612+{
35613+ int pos;
35614+ u16 control;
35615+
35617+ if (!pci_msi_enable)
35618+ return;
35619+ if (!dev)
35620+ return;
35621+
35622+#ifdef CONFIG_XEN_PCIDEV_FRONTEND
35623+ if (!is_initial_xendomain()) {
35624+ struct msi_dev_list *msi_dev_entry;
35625+ struct msi_pirq_entry *pirq_entry, *tmp;
35626+
35627+ pci_frontend_disable_msix(dev);
35628+
35629+ msi_dev_entry = get_msi_dev_pirq_list(dev);
35630+ list_for_each_entry_safe(pirq_entry, tmp,
35631+ &msi_dev_entry->pirq_list_head, list) {
35632+ evtchn_map_pirq(pirq_entry->pirq, 0);
35633+ list_del(&pirq_entry->list);
35634+ kfree(pirq_entry);
35635+ }
35636+
35637+ dev->irq = dev->irq_old;
35638+ return;
35639+ }
35640+#endif
35641+
35642+ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35643+ if (!pos)
35644+ return;
35645+
35646+ pci_read_config_word(dev, msi_control_reg(pos), &control);
35647+ if (!(control & PCI_MSIX_FLAGS_ENABLE))
35648+ return;
35649+
35650+ msi_remove_pci_irq_vectors(dev);
35651+
35652+ /* Disable MSI mode */
35653+ disable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
35654+}
35655+
35656+/**
35657+ * msi_remove_pci_irq_vectors - reclaim MSI(X) vectors to unused state
35658+ * @dev: pointer to the pci_dev data structure of MSI(X) device function
35659+ *
35660+ * Called during hotplug removal of the device function. All previously
35661+ * assigned MSI/MSI-X vectors, if allocated for this device function,
35662+ * are reclaimed to the unused state and may be reused later.
35664+ **/
35665+void msi_remove_pci_irq_vectors(struct pci_dev* dev)
35666+{
35667+ unsigned long flags;
35668+ struct msi_dev_list *msi_dev_entry;
35669+ struct msi_pirq_entry *pirq_entry, *tmp;
35670+
35671+ if (!pci_msi_enable || !dev)
35672+ return;
35673+
35674+ msi_dev_entry = get_msi_dev_pirq_list(dev);
35675+
35676+ spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
35677+ if (!list_empty(&msi_dev_entry->pirq_list_head))
35678+ {
35679+ printk(KERN_WARNING "msix pirqs for dev %02x:%02x:%01x were not freed "
35680+ "before being acquired again\n", dev->bus->number, PCI_SLOT(dev->devfn),
35681+ PCI_FUNC(dev->devfn));
35682+ list_for_each_entry_safe(pirq_entry, tmp,
35683+ &msi_dev_entry->pirq_list_head, list) {
35684+ msi_unmap_pirq(dev, pirq_entry->pirq);
35685+ list_del(&pirq_entry->list);
35686+ kfree(pirq_entry);
35687+ }
35688+ }
35689+ spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
35690+ dev->irq = dev->irq_old;
35691+}
35692+
35693+void pci_no_msi(void)
35694+{
35695+ pci_msi_enable = 0;
35696+}
35697+
35698+EXPORT_SYMBOL(pci_enable_msi);
35699+EXPORT_SYMBOL(pci_disable_msi);
35700+EXPORT_SYMBOL(pci_enable_msix);
35701+EXPORT_SYMBOL(pci_disable_msix);
35702+#ifdef CONFIG_XEN
35703+EXPORT_SYMBOL(register_msi_get_owner);
35704+EXPORT_SYMBOL(unregister_msi_get_owner);
35705+#endif
35706+
35707Index: head-2008-11-25/include/asm-x86/mach-xen/asm/agp.h
35708===================================================================
35709--- /dev/null 1970-01-01 00:00:00.000000000 +0000
35710+++ head-2008-11-25/include/asm-x86/mach-xen/asm/agp.h 2007-06-22 09:08:06.000000000 +0200
35711@@ -0,0 +1,44 @@
35712+#ifndef AGP_H
35713+#define AGP_H 1
35714+
35715+#include <asm/pgtable.h>
35716+#include <asm/cacheflush.h>
35717+#include <asm/system.h>
35718+
35719+/*
35720+ * Functions to keep the agpgart mappings coherent with the MMU.
35721+ * The GART gives the CPU a physical alias of pages in memory. The alias region is
35722+ * mapped uncacheable. Make sure there are no conflicting mappings
35723+ * with different cacheability attributes for the same page. This avoids
35724+ * data corruption on some CPUs.
35725+ */
35726+
35727+/* Caller's responsibility to call global_flush_tlb() for
35728+ * performance reasons */
35729+#define map_page_into_agp(page) ( \
35730+ xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \
35731+ ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE))
35732+#define unmap_page_from_agp(page) ( \
35733+ xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \
35734+ /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
35735+ change_page_attr(page, 1, PAGE_KERNEL))
35736+#define flush_agp_mappings() global_flush_tlb()
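/*
 * Editor's illustration (not part of the original patch): the calling
 * pattern the comment above asks for -- map a batch of pages, then do the
 * global TLB flush once for the whole batch. The function name is a
 * placeholder and error unwinding is omitted.
 */
static inline int example_agp_map_batch(struct page **pages, int n)
{
	int i, rc;

	for (i = 0; i < n; i++) {
		rc = map_page_into_agp(pages[i]);
		if (rc)
			return rc;
	}
	flush_agp_mappings();	/* i.e. global_flush_tlb(), once per batch */
	return 0;
}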
35737+
35738+/* Could use CLFLUSH here if the cpu supports it. But then it would
35739+ need to be called for each cacheline of the whole page so it may not be
35740+ worth it. Would need a page for it. */
35741+#define flush_agp_cache() wbinvd()
35742+
35743+/* Convert a physical address to an address suitable for the GART. */
35744+#define phys_to_gart(x) phys_to_machine(x)
35745+#define gart_to_phys(x) machine_to_phys(x)
35746+
35747+/* GATT allocation. Returns/accepts GATT kernel virtual address. */
35748+#define alloc_gatt_pages(order) ({ \
35749+ char *_t; dma_addr_t _d; \
35750+ _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL); \
35751+ _t; })
35752+#define free_gatt_pages(table, order) \
35753+ dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table))
35754+
35755+#endif
35756Index: head-2008-11-25/include/asm-x86/mach-xen/asm/desc_32.h
35757===================================================================
35758--- /dev/null 1970-01-01 00:00:00.000000000 +0000
35759+++ head-2008-11-25/include/asm-x86/mach-xen/asm/desc_32.h 2008-01-28 12:24:19.000000000 +0100
35760@@ -0,0 +1,166 @@
35761+#ifndef __ARCH_DESC_H
35762+#define __ARCH_DESC_H
35763+
35764+#include <asm/ldt.h>
35765+#include <asm/segment.h>
35766+
35767+#define CPU_16BIT_STACK_SIZE 1024
35768+
35769+#ifndef __ASSEMBLY__
35770+
35771+#include <linux/preempt.h>
35772+#include <linux/smp.h>
35773+
35774+#include <asm/mmu.h>
35775+
35776+extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
35777+
35778+DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
35779+
35780+struct Xgt_desc_struct {
35781+ unsigned short size;
35782+ unsigned long address __attribute__((packed));
35783+ unsigned short pad;
35784+} __attribute__ ((packed));
35785+
35786+extern struct Xgt_desc_struct idt_descr;
35787+DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
35788+
35789+
35790+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
35791+{
35792+ return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
35793+}
35794+
35795+#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
35796+#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
35797+
35798+#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
35799+#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
35800+#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
35801+#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
35802+
35803+#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
35804+#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
35805+#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
35806+#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
35807+
35808+/*
35809+ * This is the ldt that every process will get unless we need
35810+ * something other than this.
35811+ */
35812+extern struct desc_struct default_ldt[];
35813+extern void set_intr_gate(unsigned int irq, void * addr);
35814+
35815+#define _set_tssldt_desc(n,addr,limit,type) \
35816+__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
35817+ "movw %w1,2(%2)\n\t" \
35818+ "rorl $16,%1\n\t" \
35819+ "movb %b1,4(%2)\n\t" \
35820+ "movb %4,5(%2)\n\t" \
35821+ "movb $0,6(%2)\n\t" \
35822+ "movb %h1,7(%2)\n\t" \
35823+ "rorl $16,%1" \
35824+ : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
35825+
35826+#ifndef CONFIG_X86_NO_TSS
35827+static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
35828+{
35829+ _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
35830+ offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
35831+}
35832+
35833+#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
35834+#endif
35835+
35836+static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
35837+{
35838+ _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
35839+}
35840+
35841+#define LDT_entry_a(info) \
35842+ ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
35843+
35844+#define LDT_entry_b(info) \
35845+ (((info)->base_addr & 0xff000000) | \
35846+ (((info)->base_addr & 0x00ff0000) >> 16) | \
35847+ ((info)->limit & 0xf0000) | \
35848+ (((info)->read_exec_only ^ 1) << 9) | \
35849+ ((info)->contents << 10) | \
35850+ (((info)->seg_not_present ^ 1) << 15) | \
35851+ ((info)->seg_32bit << 22) | \
35852+ ((info)->limit_in_pages << 23) | \
35853+ ((info)->useable << 20) | \
35854+ 0x7000)
35855+
35856+#define LDT_empty(info) (\
35857+ (info)->base_addr == 0 && \
35858+ (info)->limit == 0 && \
35859+ (info)->contents == 0 && \
35860+ (info)->read_exec_only == 1 && \
35861+ (info)->seg_32bit == 0 && \
35862+ (info)->limit_in_pages == 0 && \
35863+ (info)->seg_not_present == 1 && \
35864+ (info)->useable == 0 )
35865+
35866+extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
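/*
 * Editor's illustration (not part of the original patch): how the
 * LDT_entry_a/LDT_entry_b/LDT_empty helpers above are typically combined
 * with write_ldt_entry() to install a descriptor built from a
 * struct user_desc (from <asm/ldt.h>); the function name is a placeholder.
 */
static inline int example_install_ldt_entry(void *ldt, int idx,
					    const struct user_desc *info)
{
	if (LDT_empty(info))
		return write_ldt_entry(ldt, idx, 0, 0);
	return write_ldt_entry(ldt, idx, LDT_entry_a(info), LDT_entry_b(info));
}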
35867+
35868+#if TLS_SIZE != 24
35869+# error update this code.
35870+#endif
35871+
35872+static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
35873+{
35874+#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \
35875+ *(u64 *)&t->tls_array[i])) \
35876+ BUG();
35877+ C(0); C(1); C(2);
35878+#undef C
35879+}
35880+
35881+static inline void clear_LDT(void)
35882+{
35883+ int cpu = get_cpu();
35884+
35885+ /*
35886+ * NB. We load the default_ldt for lcall7/27 handling on demand, as
35887+ * it slows down context switching. No one uses it anyway.
35888+ */
35889+ cpu = cpu; /* XXX avoid compiler warning */
35890+ xen_set_ldt(NULL, 0);
35891+ put_cpu();
35892+}
35893+
35894+/*
35895+ * load one particular LDT into the current CPU
35896+ */
35897+static inline void load_LDT_nolock(mm_context_t *pc, int cpu)
35898+{
35899+ void *segments = pc->ldt;
35900+ int count = pc->size;
35901+
35902+ if (likely(!count))
35903+ segments = NULL;
35904+
35905+ xen_set_ldt(segments, count);
35906+}
35907+
35908+static inline void load_LDT(mm_context_t *pc)
35909+{
35910+ int cpu = get_cpu();
35911+ load_LDT_nolock(pc, cpu);
35912+ put_cpu();
35913+}
35914+
35915+static inline unsigned long get_desc_base(unsigned long *desc)
35916+{
35917+ unsigned long base;
35918+ base = ((desc[0] >> 16) & 0x0000ffff) |
35919+ ((desc[1] << 16) & 0x00ff0000) |
35920+ (desc[1] & 0xff000000);
35921+ return base;
35922+}
35923+
35924+#endif /* !__ASSEMBLY__ */
35925+
35926+#endif
35927Index: head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_32.h
35928===================================================================
35929--- /dev/null 1970-01-01 00:00:00.000000000 +0000
35930+++ head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2008-04-02 12:34:02.000000000 +0200
35931@@ -0,0 +1,151 @@
35932+#ifndef _ASM_I386_DMA_MAPPING_H
35933+#define _ASM_I386_DMA_MAPPING_H
35934+
35935+/*
35936+ * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
35937+ * documentation.
35938+ */
35939+
35940+#include <linux/mm.h>
35941+#include <asm/cache.h>
35942+#include <asm/io.h>
35943+#include <asm/scatterlist.h>
35944+#include <asm/swiotlb.h>
35945+
35946+static inline int
35947+address_needs_mapping(struct device *hwdev, dma_addr_t addr)
35948+{
35949+ dma_addr_t mask = 0xffffffff;
35950+ /* If the device has a mask, use it, otherwise default to 32 bits */
35951+ if (hwdev && hwdev->dma_mask)
35952+ mask = *hwdev->dma_mask;
35953+ return (addr & ~mask) != 0;
35954+}
35955+
35956+extern int range_straddles_page_boundary(paddr_t p, size_t size);
35957+
35958+#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
35959+#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
35960+
35961+void *dma_alloc_coherent(struct device *dev, size_t size,
35962+ dma_addr_t *dma_handle, gfp_t flag);
35963+
35964+void dma_free_coherent(struct device *dev, size_t size,
35965+ void *vaddr, dma_addr_t dma_handle);
35966+
35967+extern dma_addr_t
35968+dma_map_single(struct device *dev, void *ptr, size_t size,
35969+ enum dma_data_direction direction);
35970+
35971+extern void
35972+dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
35973+ enum dma_data_direction direction);
35974+
35975+extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
35976+ int nents, enum dma_data_direction direction);
35977+extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg,
35978+ int nents, enum dma_data_direction direction);
35979+
35980+#ifdef CONFIG_HIGHMEM
35981+extern dma_addr_t
35982+dma_map_page(struct device *dev, struct page *page, unsigned long offset,
35983+ size_t size, enum dma_data_direction direction);
35984+
35985+extern void
35986+dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
35987+ enum dma_data_direction direction);
35988+#else
35989+#define dma_map_page(dev, page, offset, size, dir) \
35990+ dma_map_single(dev, page_address(page) + (offset), (size), (dir))
35991+#define dma_unmap_page dma_unmap_single
35992+#endif
35993+
35994+extern void
35995+dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
35996+ enum dma_data_direction direction);
35997+
35998+extern void
35999+dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
36000+ enum dma_data_direction direction);
36001+
36002+static inline void
36003+dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
36004+ unsigned long offset, size_t size,
36005+ enum dma_data_direction direction)
36006+{
36007+ dma_sync_single_for_cpu(dev, dma_handle+offset, size, direction);
36008+}
36009+
36010+static inline void
36011+dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
36012+ unsigned long offset, size_t size,
36013+ enum dma_data_direction direction)
36014+{
36015+ dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
36016+}
36017+
36018+static inline void
36019+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
36020+ enum dma_data_direction direction)
36021+{
36022+ if (swiotlb)
36023+ swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
36024+ flush_write_buffers();
36025+}
36026+
36027+static inline void
36028+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
36029+ enum dma_data_direction direction)
36030+{
36031+ if (swiotlb)
36032+ swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
36033+ flush_write_buffers();
36034+}
36035+
36036+extern int
36037+dma_mapping_error(dma_addr_t dma_addr);
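/*
 * Editor's illustration (not part of the original patch): the usual
 * streaming-DMA round trip with the interfaces declared above. 'dev',
 * 'buf' and 'len' are placeholders, DMA_TO_DEVICE comes from
 * <linux/dma-mapping.h>, and error handling is reduced to a minimum.
 */
static inline int example_dma_roundtrip(struct device *dev, void *buf,
					size_t len)
{
	dma_addr_t handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

	if (dma_mapping_error(handle))
		return -EIO;
	/* ... hand 'handle' to the device and wait for completion ... */
	dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
	return 0;
}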
36038+
36039+extern int
36040+dma_supported(struct device *dev, u64 mask);
36041+
36042+static inline int
36043+dma_set_mask(struct device *dev, u64 mask)
36044+{
36045+ if(!dev->dma_mask || !dma_supported(dev, mask))
36046+ return -EIO;
36047+
36048+ *dev->dma_mask = mask;
36049+
36050+ return 0;
36051+}
36052+
36053+static inline int
36054+dma_get_cache_alignment(void)
36055+{
36056+ /* no easy way to get cache size on all x86, so return the
36057+ * maximum possible, to be safe */
36058+ return (1 << INTERNODE_CACHE_SHIFT);
36059+}
36060+
36061+#define dma_is_consistent(d) (1)
36062+
36063+static inline void
36064+dma_cache_sync(void *vaddr, size_t size,
36065+ enum dma_data_direction direction)
36066+{
36067+ flush_write_buffers();
36068+}
36069+
36070+#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
36071+extern int
36072+dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
36073+ dma_addr_t device_addr, size_t size, int flags);
36074+
36075+extern void
36076+dma_release_declared_memory(struct device *dev);
36077+
36078+extern void *
36079+dma_mark_declared_memory_occupied(struct device *dev,
36080+ dma_addr_t device_addr, size_t size);
36081+
36082+#endif
36083Index: head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_32.h
36084===================================================================
36085--- /dev/null 1970-01-01 00:00:00.000000000 +0000
36086+++ head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_32.h 2007-06-12 13:14:02.000000000 +0200
36087@@ -0,0 +1,155 @@
36088+/*
36089+ * fixmap.h: compile-time virtual memory allocation
36090+ *
36091+ * This file is subject to the terms and conditions of the GNU General Public
36092+ * License. See the file "COPYING" in the main directory of this archive
36093+ * for more details.
36094+ *
36095+ * Copyright (C) 1998 Ingo Molnar
36096+ *
36097+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
36098+ */
36099+
36100+#ifndef _ASM_FIXMAP_H
36101+#define _ASM_FIXMAP_H
36102+
36103+
36104+/* used by vmalloc.c, vsyscall.lds.S.
36105+ *
36106+ * Leave one empty page between vmalloc'ed areas and
36107+ * the start of the fixmap.
36108+ */
36109+extern unsigned long __FIXADDR_TOP;
36110+
36111+#ifndef __ASSEMBLY__
36112+#include <linux/kernel.h>
36113+#include <asm/acpi.h>
36114+#include <asm/apicdef.h>
36115+#include <asm/page.h>
36116+#ifdef CONFIG_HIGHMEM
36117+#include <linux/threads.h>
36118+#include <asm/kmap_types.h>
36119+#endif
36120+
36121+/*
36122+ * Here we define all the compile-time 'special' virtual
36123+ * addresses. The point is to have a constant address at
36124+ * compile time, but to set the physical address only
36125+ * in the boot process. We allocate these special addresses
36126+ * from the end of virtual memory (0xfffff000) backwards.
36127+ * Also this lets us do fail-safe vmalloc(), we
36128+ * can guarantee that these special addresses and
36129+ * vmalloc()-ed addresses never overlap.
36130+ *
36131+ * these 'compile-time allocated' memory buffers are
36132+ * fixed-size 4k pages (or larger if used with an increment
36133+ * higher than 1). Use fixmap_set(idx,phys) to associate
36134+ * physical memory with fixmap indices.
36135+ *
36136+ * TLB entries of such buffers will not be flushed across
36137+ * task switches.
36138+ */
36139+enum fixed_addresses {
36140+ FIX_HOLE,
36141+ FIX_VDSO,
36142+#ifdef CONFIG_X86_LOCAL_APIC
36143+ FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
36144+#endif
36145+#ifdef CONFIG_X86_IO_APIC
36146+ FIX_IO_APIC_BASE_0,
36147+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
36148+#endif
36149+#ifdef CONFIG_X86_VISWS_APIC
36150+ FIX_CO_CPU, /* Cobalt timer */
36151+ FIX_CO_APIC, /* Cobalt APIC Redirection Table */
36152+ FIX_LI_PCIA, /* Lithium PCI Bridge A */
36153+ FIX_LI_PCIB, /* Lithium PCI Bridge B */
36154+#endif
36155+#ifdef CONFIG_X86_F00F_BUG
36156+ FIX_F00F_IDT, /* Virtual mapping for IDT */
36157+#endif
36158+#ifdef CONFIG_X86_CYCLONE_TIMER
36159+ FIX_CYCLONE_TIMER, /*cyclone timer register*/
36160+#endif
36161+#ifdef CONFIG_HIGHMEM
36162+ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
36163+ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
36164+#endif
36165+#ifdef CONFIG_ACPI
36166+ FIX_ACPI_BEGIN,
36167+ FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
36168+#endif
36169+#ifdef CONFIG_PCI_MMCONFIG
36170+ FIX_PCIE_MCFG,
36171+#endif
36172+ FIX_SHARED_INFO,
36173+#define NR_FIX_ISAMAPS 256
36174+ FIX_ISAMAP_END,
36175+ FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
36176+ __end_of_permanent_fixed_addresses,
36177+ /* temporary boot-time mappings, used before ioremap() is functional */
36178+#define NR_FIX_BTMAPS 16
36179+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
36180+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
36181+ FIX_WP_TEST,
36182+ __end_of_fixed_addresses
36183+};
36184+
36185+extern void set_fixaddr_top(unsigned long top);
36186+
36187+extern void __set_fixmap(enum fixed_addresses idx,
36188+ maddr_t phys, pgprot_t flags);
36189+
36190+#define set_fixmap(idx, phys) \
36191+ __set_fixmap(idx, phys, PAGE_KERNEL)
36192+/*
36193+ * Some hardware wants to get fixmapped without caching.
36194+ */
36195+#define set_fixmap_nocache(idx, phys) \
36196+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
36197+
36198+#define clear_fixmap(idx) \
36199+ __set_fixmap(idx, 0, __pgprot(0))
36200+
36201+#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
36202+
36203+#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
36204+#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
36205+#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
36206+#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
36207+
36208+#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
36209+#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
36210+
36211+extern void __this_fixmap_does_not_exist(void);
36212+
36213+/*
36214+ * 'index to address' translation. If anyone tries to use the idx
36215+ * directly without tranlation, we catch the bug with a NULL-deference
36216+ * kernel oops. Illegal ranges of incoming indices are caught too.
36217+ */
36218+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
36219+{
36220+ /*
36221+ * this branch gets completely eliminated after inlining,
36222+ * except when someone tries to use fixaddr indices in an
36223+ * illegal way. (such as mixing up address types or using
36224+ * out-of-range indices).
36225+ *
36226+ * If it doesn't get removed, the linker will complain
36227+ * loudly with a reasonably clear error message..
36228+ */
36229+ if (idx >= __end_of_fixed_addresses)
36230+ __this_fixmap_does_not_exist();
36231+
36232+ return __fix_to_virt(idx);
36233+}
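/*
 * Editor's illustration (not part of the original patch): typical use of
 * the fixmap API above -- bind a machine address to a fixed slot and obtain
 * its compile-time-constant virtual address. FIX_SHARED_INFO is used purely
 * as an example index and 'maddr' is a placeholder.
 */
static inline void *example_map_fixed_slot(maddr_t maddr)
{
	set_fixmap(FIX_SHARED_INFO, maddr);
	return (void *)fix_to_virt(FIX_SHARED_INFO);
}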
36234+
36235+static inline unsigned long virt_to_fix(const unsigned long vaddr)
36236+{
36237+ BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
36238+ return __virt_to_fix(vaddr);
36239+}
36240+
36241+#endif /* !__ASSEMBLY__ */
36242+#endif
36243Index: head-2008-11-25/include/asm-x86/mach-xen/asm/gnttab_dma.h
36244===================================================================
36245--- /dev/null 1970-01-01 00:00:00.000000000 +0000
36246+++ head-2008-11-25/include/asm-x86/mach-xen/asm/gnttab_dma.h 2007-08-06 15:10:49.000000000 +0200
36247@@ -0,0 +1,41 @@
36248+/*
36249+ * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
36250+ * Copyright (c) 2007 Isaku Yamahata <yamahata at valinux co jp>
36251+ * VA Linux Systems Japan K.K.
36252+ *
36253+ * This program is free software; you can redistribute it and/or modify
36254+ * it under the terms of the GNU General Public License as published by
36255+ * the Free Software Foundation; either version 2 of the License, or
36256+ * (at your option) any later version.
36257+ *
36258+ * This program is distributed in the hope that it will be useful,
36259+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
36260+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
36261+ * GNU General Public License for more details.
36262+ *
36263+ * You should have received a copy of the GNU General Public License
36264+ * along with this program; if not, write to the Free Software
36265+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
36266+ */
36267+
36268+#ifndef _ASM_I386_GNTTAB_DMA_H
36269+#define _ASM_I386_GNTTAB_DMA_H
36270+
36271+static inline int gnttab_dma_local_pfn(struct page *page)
36272+{
36273+ /* Has it become a local MFN? */
36274+ return pfn_valid(mfn_to_local_pfn(pfn_to_mfn(page_to_pfn(page))));
36275+}
36276+
36277+static inline maddr_t gnttab_dma_map_page(struct page *page)
36278+{
36279+ __gnttab_dma_map_page(page);
36280+ return ((maddr_t)pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT);
36281+}
36282+
36283+static inline void gnttab_dma_unmap_page(maddr_t maddr)
36284+{
36285+ __gnttab_dma_unmap_page(virt_to_page(bus_to_virt(maddr)));
36286+}
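/*
 * Editor's illustration (not part of the original patch): a grant-table
 * aware DMA mapping round trip using the helpers above; programming the
 * device with the bus address is elided.
 */
static inline void example_gnttab_dma_roundtrip(struct page *page)
{
	maddr_t bus = gnttab_dma_map_page(page);

	/* ... program the device with 'bus' and wait for the transfer ... */
	gnttab_dma_unmap_page(bus);
}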
36287+
36288+#endif /* _ASM_I386_GNTTAB_DMA_H */
36289Index: head-2008-11-25/include/asm-x86/mach-xen/asm/highmem.h
36290===================================================================
36291--- /dev/null 1970-01-01 00:00:00.000000000 +0000
36292+++ head-2008-11-25/include/asm-x86/mach-xen/asm/highmem.h 2008-10-29 09:55:56.000000000 +0100
36293@@ -0,0 +1,97 @@
36294+/*
36295+ * highmem.h: virtual kernel memory mappings for high memory
36296+ *
36297+ * Used in CONFIG_HIGHMEM systems for memory pages which
36298+ * are not addressable by direct kernel virtual addresses.
36299+ *
36300+ * Copyright (C) 1999 Gerhard Wichert, Siemens AG
36301+ * Gerhard.Wichert@pdb.siemens.de
36302+ *
36303+ *
36304+ * Redesigned the x86 32-bit VM architecture to deal with
36305+ * up to 16 Terabyte physical memory. With current x86 CPUs
36306+ * we now support up to 64 Gigabytes physical RAM.
36307+ *
36308+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
36309+ */
36310+
36311+#ifndef _ASM_HIGHMEM_H
36312+#define _ASM_HIGHMEM_H
36313+
36314+#ifdef __KERNEL__
36315+
36316+#include <linux/interrupt.h>
36317+#include <linux/threads.h>
36318+#include <asm/kmap_types.h>
36319+#include <asm/tlbflush.h>
36320+
36321+/* declarations for highmem.c */
36322+extern unsigned long highstart_pfn, highend_pfn;
36323+
36324+extern pte_t *kmap_pte;
36325+extern pgprot_t kmap_prot;
36326+extern pte_t *pkmap_page_table;
36327+
36328+/*
36329+ * Right now we initialize only a single pte table. It can be extended
36330+ * easily, subsequent pte tables have to be allocated in one physical
36331+ * chunk of RAM.
36332+ */
36333+#ifdef CONFIG_X86_PAE
36334+#define LAST_PKMAP 512
36335+#else
36336+#define LAST_PKMAP 1024
36337+#endif
36338+/*
36339+ * Ordering is:
36340+ *
36341+ * FIXADDR_TOP
36342+ * fixed_addresses
36343+ * FIXADDR_START
36344+ * temp fixed addresses
36345+ * FIXADDR_BOOT_START
36346+ * Persistent kmap area
36347+ * PKMAP_BASE
36348+ * VMALLOC_END
36349+ * Vmalloc area
36350+ * VMALLOC_START
36351+ * high_memory
36352+ */
36353+#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
36354+#define LAST_PKMAP_MASK (LAST_PKMAP-1)
36355+#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
36356+#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
36357+
36358+extern void * FASTCALL(kmap_high(struct page *page));
36359+extern void FASTCALL(kunmap_high(struct page *page));
36360+
36361+void *kmap(struct page *page);
36362+void kunmap(struct page *page);
36363+void *kmap_atomic(struct page *page, enum km_type type);
36364+void *kmap_atomic_pte(struct page *page, enum km_type type);
36365+void kunmap_atomic(void *kvaddr, enum km_type type);
36366+void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
36367+struct page *kmap_atomic_to_page(void *ptr);
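/*
 * Editor's illustration (not part of the original patch): the canonical
 * atomic-kmap pattern of this kernel generation -- map, touch, unmap,
 * without sleeping in between. KM_USER0 (from <asm/kmap_types.h>) is the
 * usual slot for process context and 'page' is a placeholder.
 */
static inline void example_zero_highpage(struct page *page)
{
	void *vaddr = kmap_atomic(page, KM_USER0);

	memset(vaddr, 0, PAGE_SIZE);
	kunmap_atomic(vaddr, KM_USER0);
}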
36368+
36369+#define flush_cache_kmaps() do { } while (0)
36370+
36371+void clear_highpage(struct page *);
36372+static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
36373+{
36374+ clear_highpage(page);
36375+}
36376+#define __HAVE_ARCH_CLEAR_HIGHPAGE
36377+#define __HAVE_ARCH_CLEAR_USER_HIGHPAGE
36378+
36379+void copy_highpage(struct page *to, struct page *from);
36380+static inline void copy_user_highpage(struct page *to, struct page *from,
36381+ unsigned long vaddr)
36382+{
36383+ copy_highpage(to, from);
36384+}
36385+#define __HAVE_ARCH_COPY_HIGHPAGE
36386+#define __HAVE_ARCH_COPY_USER_HIGHPAGE
36387+
36388+#endif /* __KERNEL__ */
36389+
36390+#endif /* _ASM_HIGHMEM_H */
36391Index: head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_32.h
36392===================================================================
36393--- /dev/null 1970-01-01 00:00:00.000000000 +0000
36394+++ head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_32.h 2008-11-25 12:22:34.000000000 +0100
36395@@ -0,0 +1,409 @@
36396+/******************************************************************************
36397+ * hypercall.h
36398+ *
36399+ * Linux-specific hypervisor handling.
36400+ *
36401+ * Copyright (c) 2002-2004, K A Fraser
36402+ *
36403+ * This program is free software; you can redistribute it and/or
36404+ * modify it under the terms of the GNU General Public License version 2
36405+ * as published by the Free Software Foundation; or, when distributed
36406+ * separately from the Linux kernel or incorporated into other
36407+ * software packages, subject to the following license:
36408+ *
36409+ * Permission is hereby granted, free of charge, to any person obtaining a copy
36410+ * of this source file (the "Software"), to deal in the Software without
36411+ * restriction, including without limitation the rights to use, copy, modify,
36412+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
36413+ * and to permit persons to whom the Software is furnished to do so, subject to
36414+ * the following conditions:
36415+ *
36416+ * The above copyright notice and this permission notice shall be included in
36417+ * all copies or substantial portions of the Software.
36418+ *
36419+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
36420+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36421+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36422+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36423+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
36424+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
36425+ * IN THE SOFTWARE.
36426+ */
36427+
36428+#ifndef __HYPERCALL_H__
36429+#define __HYPERCALL_H__
36430+
36431+#include <linux/string.h> /* memcpy() */
36432+#include <linux/stringify.h>
36433+
36434+#ifndef __HYPERVISOR_H__
36435+# error "please don't include this file directly"
36436+#endif
36437+
36438+#ifdef CONFIG_XEN
36439+#define HYPERCALL_STR(name) \
36440+ "call hypercall_page + ("__stringify(__HYPERVISOR_##name)" * 32)"
36441+#else
36442+#define HYPERCALL_STR(name) \
36443+ "mov hypercall_stubs,%%eax; " \
36444+ "add $("__stringify(__HYPERVISOR_##name)" * 32),%%eax; "\
36445+ "call *%%eax"
36446+#endif
36447+
36448+#define _hypercall0(type, name) \
36449+({ \
36450+ type __res; \
36451+ asm volatile ( \
36452+ HYPERCALL_STR(name) \
36453+ : "=a" (__res) \
36454+ : \
36455+ : "memory" ); \
36456+ __res; \
36457+})
36458+
36459+#define _hypercall1(type, name, a1) \
36460+({ \
36461+ type __res; \
36462+ long __ign1; \
36463+ asm volatile ( \
36464+ HYPERCALL_STR(name) \
36465+ : "=a" (__res), "=b" (__ign1) \
36466+ : "1" ((long)(a1)) \
36467+ : "memory" ); \
36468+ __res; \
36469+})
36470+
36471+#define _hypercall2(type, name, a1, a2) \
36472+({ \
36473+ type __res; \
36474+ long __ign1, __ign2; \
36475+ asm volatile ( \
36476+ HYPERCALL_STR(name) \
36477+ : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \
36478+ : "1" ((long)(a1)), "2" ((long)(a2)) \
36479+ : "memory" ); \
36480+ __res; \
36481+})
36482+
36483+#define _hypercall3(type, name, a1, a2, a3) \
36484+({ \
36485+ type __res; \
36486+ long __ign1, __ign2, __ign3; \
36487+ asm volatile ( \
36488+ HYPERCALL_STR(name) \
36489+ : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
36490+ "=d" (__ign3) \
36491+ : "1" ((long)(a1)), "2" ((long)(a2)), \
36492+ "3" ((long)(a3)) \
36493+ : "memory" ); \
36494+ __res; \
36495+})
36496+
36497+#define _hypercall4(type, name, a1, a2, a3, a4) \
36498+({ \
36499+ type __res; \
36500+ long __ign1, __ign2, __ign3, __ign4; \
36501+ asm volatile ( \
36502+ HYPERCALL_STR(name) \
36503+ : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
36504+ "=d" (__ign3), "=S" (__ign4) \
36505+ : "1" ((long)(a1)), "2" ((long)(a2)), \
36506+ "3" ((long)(a3)), "4" ((long)(a4)) \
36507+ : "memory" ); \
36508+ __res; \
36509+})
36510+
36511+#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
36512+({ \
36513+ type __res; \
36514+ long __ign1, __ign2, __ign3, __ign4, __ign5; \
36515+ asm volatile ( \
36516+ HYPERCALL_STR(name) \
36517+ : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
36518+ "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \
36519+ : "1" ((long)(a1)), "2" ((long)(a2)), \
36520+ "3" ((long)(a3)), "4" ((long)(a4)), \
36521+ "5" ((long)(a5)) \
36522+ : "memory" ); \
36523+ __res; \
36524+})
36525+
36526+static inline int __must_check
36527+HYPERVISOR_set_trap_table(
36528+ const trap_info_t *table)
36529+{
36530+ return _hypercall1(int, set_trap_table, table);
36531+}
36532+
36533+static inline int __must_check
36534+HYPERVISOR_mmu_update(
36535+ mmu_update_t *req, unsigned int count, unsigned int *success_count,
36536+ domid_t domid)
36537+{
36538+ return _hypercall4(int, mmu_update, req, count, success_count, domid);
36539+}
36540+
36541+static inline int __must_check
36542+HYPERVISOR_mmuext_op(
36543+ struct mmuext_op *op, unsigned int count, unsigned int *success_count,
36544+ domid_t domid)
36545+{
36546+ return _hypercall4(int, mmuext_op, op, count, success_count, domid);
36547+}
36548+
36549+static inline int __must_check
36550+HYPERVISOR_set_gdt(
36551+ unsigned long *frame_list, unsigned int entries)
36552+{
36553+ return _hypercall2(int, set_gdt, frame_list, entries);
36554+}
36555+
36556+static inline int __must_check
36557+HYPERVISOR_stack_switch(
36558+ unsigned long ss, unsigned long esp)
36559+{
36560+ return _hypercall2(int, stack_switch, ss, esp);
36561+}
36562+
36563+static inline int __must_check
36564+HYPERVISOR_set_callbacks(
36565+ unsigned long event_selector, unsigned long event_address,
36566+ unsigned long failsafe_selector, unsigned long failsafe_address)
36567+{
36568+ return _hypercall4(int, set_callbacks,
36569+ event_selector, event_address,
36570+ failsafe_selector, failsafe_address);
36571+}
36572+
36573+static inline int
36574+HYPERVISOR_fpu_taskswitch(
36575+ int set)
36576+{
36577+ return _hypercall1(int, fpu_taskswitch, set);
36578+}
36579+
36580+static inline int __must_check
36581+HYPERVISOR_sched_op_compat(
36582+ int cmd, unsigned long arg)
36583+{
36584+ return _hypercall2(int, sched_op_compat, cmd, arg);
36585+}
36586+
36587+static inline int __must_check
36588+HYPERVISOR_sched_op(
36589+ int cmd, void *arg)
36590+{
36591+ return _hypercall2(int, sched_op, cmd, arg);
36592+}
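/*
 * Editor's illustration (not part of the original patch): a common use of
 * the generic sched_op wrapper above -- voluntarily yielding the CPU back
 * to the hypervisor. SCHEDOP_yield comes from xen/interface/sched.h.
 */
static inline int example_xen_yield(void)
{
	return HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
}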
36593+
36594+static inline long __must_check
36595+HYPERVISOR_set_timer_op(
36596+ u64 timeout)
36597+{
36598+ unsigned long timeout_hi = (unsigned long)(timeout>>32);
36599+ unsigned long timeout_lo = (unsigned long)timeout;
36600+ return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
36601+}
36602+
36603+static inline int __must_check
36604+HYPERVISOR_platform_op(
36605+ struct xen_platform_op *platform_op)
36606+{
36607+ platform_op->interface_version = XENPF_INTERFACE_VERSION;
36608+ return _hypercall1(int, platform_op, platform_op);
36609+}
36610+
36611+static inline int __must_check
36612+HYPERVISOR_set_debugreg(
36613+ unsigned int reg, unsigned long value)
36614+{
36615+ return _hypercall2(int, set_debugreg, reg, value);
36616+}
36617+
36618+static inline unsigned long __must_check
36619+HYPERVISOR_get_debugreg(
36620+ unsigned int reg)
36621+{
36622+ return _hypercall1(unsigned long, get_debugreg, reg);
36623+}
36624+
36625+static inline int __must_check
36626+HYPERVISOR_update_descriptor(
36627+ u64 ma, u64 desc)
36628+{
36629+ return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
36630+}
36631+
36632+static inline int __must_check
36633+HYPERVISOR_memory_op(
36634+ unsigned int cmd, void *arg)
36635+{
36636+ return _hypercall2(int, memory_op, cmd, arg);
36637+}
36638+
36639+static inline int __must_check
36640+HYPERVISOR_multicall(
36641+ multicall_entry_t *call_list, unsigned int nr_calls)
36642+{
36643+ return _hypercall2(int, multicall, call_list, nr_calls);
36644+}
36645+
36646+static inline int __must_check
36647+HYPERVISOR_update_va_mapping(
36648+ unsigned long va, pte_t new_val, unsigned long flags)
36649+{
36650+ unsigned long pte_hi = 0;
36651+#ifdef CONFIG_X86_PAE
36652+ pte_hi = new_val.pte_high;
36653+#endif
36654+ return _hypercall4(int, update_va_mapping, va,
36655+ new_val.pte_low, pte_hi, flags);
36656+}
36657+
36658+static inline int __must_check
36659+HYPERVISOR_event_channel_op(
36660+ int cmd, void *arg)
36661+{
36662+ int rc = _hypercall2(int, event_channel_op, cmd, arg);
36663+
36664+#if CONFIG_XEN_COMPAT <= 0x030002
36665+ if (unlikely(rc == -ENOSYS)) {
36666+ struct evtchn_op op;
36667+ op.cmd = cmd;
36668+ memcpy(&op.u, arg, sizeof(op.u));
36669+ rc = _hypercall1(int, event_channel_op_compat, &op);
36670+ memcpy(arg, &op.u, sizeof(op.u));
36671+ }
36672+#endif
36673+
36674+ return rc;
36675+}
36676+
36677+static inline int __must_check
36678+HYPERVISOR_xen_version(
36679+ int cmd, void *arg)
36680+{
36681+ return _hypercall2(int, xen_version, cmd, arg);
36682+}
36683+
36684+static inline int __must_check
36685+HYPERVISOR_console_io(
36686+ int cmd, unsigned int count, char *str)
36687+{
36688+ return _hypercall3(int, console_io, cmd, count, str);
36689+}
36690+
36691+static inline int __must_check
36692+HYPERVISOR_physdev_op(
36693+ int cmd, void *arg)
36694+{
36695+ int rc = _hypercall2(int, physdev_op, cmd, arg);
36696+
36697+#if CONFIG_XEN_COMPAT <= 0x030002
36698+ if (unlikely(rc == -ENOSYS)) {
36699+ struct physdev_op op;
36700+ op.cmd = cmd;
36701+ memcpy(&op.u, arg, sizeof(op.u));
36702+ rc = _hypercall1(int, physdev_op_compat, &op);
36703+ memcpy(arg, &op.u, sizeof(op.u));
36704+ }
36705+#endif
36706+
36707+ return rc;
36708+}
36709+
36710+static inline int __must_check
36711+HYPERVISOR_grant_table_op(
36712+ unsigned int cmd, void *uop, unsigned int count)
36713+{
36714+ return _hypercall3(int, grant_table_op, cmd, uop, count);
36715+}
36716+
36717+static inline int __must_check
36718+HYPERVISOR_update_va_mapping_otherdomain(
36719+ unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
36720+{
36721+ unsigned long pte_hi = 0;
36722+#ifdef CONFIG_X86_PAE
36723+ pte_hi = new_val.pte_high;
36724+#endif
36725+ return _hypercall5(int, update_va_mapping_otherdomain, va,
36726+ new_val.pte_low, pte_hi, flags, domid);
36727+}
36728+
36729+static inline int __must_check
36730+HYPERVISOR_vm_assist(
36731+ unsigned int cmd, unsigned int type)
36732+{
36733+ return _hypercall2(int, vm_assist, cmd, type);
36734+}
36735+
36736+static inline int __must_check
36737+HYPERVISOR_vcpu_op(
36738+ int cmd, unsigned int vcpuid, void *extra_args)
36739+{
36740+ return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
36741+}
36742+
36743+static inline int __must_check
36744+HYPERVISOR_suspend(
36745+ unsigned long srec)
36746+{
36747+ struct sched_shutdown sched_shutdown = {
36748+ .reason = SHUTDOWN_suspend
36749+ };
36750+
36751+ int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
36752+ &sched_shutdown, srec);
36753+
36754+#if CONFIG_XEN_COMPAT <= 0x030002
36755+ if (rc == -ENOSYS)
36756+ rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
36757+ SHUTDOWN_suspend, srec);
36758+#endif
36759+
36760+ return rc;
36761+}
36762+
36763+#if CONFIG_XEN_COMPAT <= 0x030002
36764+static inline int
36765+HYPERVISOR_nmi_op(
36766+ unsigned long op, void *arg)
36767+{
36768+ return _hypercall2(int, nmi_op, op, arg);
36769+}
36770+#endif
36771+
36772+#ifndef CONFIG_XEN
36773+static inline unsigned long __must_check
36774+HYPERVISOR_hvm_op(
36775+ int op, void *arg)
36776+{
36777+ return _hypercall2(unsigned long, hvm_op, op, arg);
36778+}
36779+#endif
36780+
36781+static inline int __must_check
36782+HYPERVISOR_callback_op(
36783+ int cmd, const void *arg)
36784+{
36785+ return _hypercall2(int, callback_op, cmd, arg);
36786+}
36787+
36788+static inline int __must_check
36789+HYPERVISOR_xenoprof_op(
36790+ int op, void *arg)
36791+{
36792+ return _hypercall2(int, xenoprof_op, op, arg);
36793+}
36794+
36795+static inline int __must_check
36796+HYPERVISOR_kexec_op(
36797+ unsigned long op, void *args)
36798+{
36799+ return _hypercall2(int, kexec_op, op, args);
36800+}
36801+
36802+
36803+
36804+#endif /* __HYPERCALL_H__ */
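The event_channel_op, physdev_op and suspend wrappers above all follow one pattern: issue the current multiplexed hypercall first, and only if a pre-3.0.3 hypervisor answers -ENOSYS repackage the arguments into the old compat structure and retry. The sketch below models that fallback in plain user-space C; the names (do_op_new, do_op_compat, struct compat_op) are hypothetical stand-ins, not the real hypercall stubs.

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for the real hypercalls. */
static int new_abi_available = 0;

static int do_op_new(int cmd, void *arg)
{
	(void)cmd; (void)arg;
	return new_abi_available ? 0 : -ENOSYS;
}

struct compat_op {
	int cmd;
	char u[32];			/* union of per-command argument layouts */
};

static int do_op_compat(struct compat_op *op)
{
	(void)op;
	return 0;			/* the old hypervisor always understood this */
}

/* Mirrors HYPERVISOR_event_channel_op()/HYPERVISOR_physdev_op() above:
 * newer multiplexed call first, compat wrapper only on -ENOSYS. */
static int op_with_fallback(int cmd, void *arg, size_t len)
{
	int rc = do_op_new(cmd, arg);

	if (rc == -ENOSYS) {
		struct compat_op op;

		op.cmd = cmd;
		memcpy(op.u, arg, len);
		rc = do_op_compat(&op);
		memcpy(arg, op.u, len);	/* results come back through the copy */
	}
	return rc;
}

int main(void)
{
	char arg[32] = { 0 };
	printf("rc = %d\n", op_with_fallback(1, arg, sizeof(arg)));
	return 0;
}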
36805Index: head-2008-11-25/include/asm-x86/mach-xen/asm/hypervisor.h
36806===================================================================
36807--- /dev/null 1970-01-01 00:00:00.000000000 +0000
36808+++ head-2008-11-25/include/asm-x86/mach-xen/asm/hypervisor.h 2008-02-20 09:32:49.000000000 +0100
36809@@ -0,0 +1,259 @@
36810+/******************************************************************************
36811+ * hypervisor.h
36812+ *
36813+ * Linux-specific hypervisor handling.
36814+ *
36815+ * Copyright (c) 2002-2004, K A Fraser
36816+ *
36817+ * This program is free software; you can redistribute it and/or
36818+ * modify it under the terms of the GNU General Public License version 2
36819+ * as published by the Free Software Foundation; or, when distributed
36820+ * separately from the Linux kernel or incorporated into other
36821+ * software packages, subject to the following license:
36822+ *
36823+ * Permission is hereby granted, free of charge, to any person obtaining a copy
36824+ * of this source file (the "Software"), to deal in the Software without
36825+ * restriction, including without limitation the rights to use, copy, modify,
36826+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
36827+ * and to permit persons to whom the Software is furnished to do so, subject to
36828+ * the following conditions:
36829+ *
36830+ * The above copyright notice and this permission notice shall be included in
36831+ * all copies or substantial portions of the Software.
36832+ *
36833+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
36834+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36835+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36836+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36837+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
36838+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
36839+ * IN THE SOFTWARE.
36840+ */
36841+
36842+#ifndef __HYPERVISOR_H__
36843+#define __HYPERVISOR_H__
36844+
36845+#include <linux/types.h>
36846+#include <linux/kernel.h>
36847+#include <linux/version.h>
36848+#include <linux/errno.h>
36849+#include <xen/interface/xen.h>
36850+#include <xen/interface/platform.h>
36851+#include <xen/interface/event_channel.h>
36852+#include <xen/interface/physdev.h>
36853+#include <xen/interface/sched.h>
36854+#include <xen/interface/nmi.h>
36855+#include <asm/ptrace.h>
36856+#include <asm/page.h>
36857+#if defined(__i386__)
36858+# ifdef CONFIG_X86_PAE
36859+# include <asm-generic/pgtable-nopud.h>
36860+# else
36861+# include <asm-generic/pgtable-nopmd.h>
36862+# endif
36863+#elif defined(__x86_64__) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
36864+# include <asm-generic/pgtable-nopud.h>
36865+#endif
36866+
36867+extern shared_info_t *HYPERVISOR_shared_info;
36868+
36869+#define vcpu_info(cpu) (HYPERVISOR_shared_info->vcpu_info + (cpu))
36870+#ifdef CONFIG_SMP
36871+#define current_vcpu_info() vcpu_info(smp_processor_id())
36872+#else
36873+#define current_vcpu_info() vcpu_info(0)
36874+#endif
36875+
36876+#ifdef CONFIG_X86_32
36877+extern unsigned long hypervisor_virt_start;
36878+#endif
36879+
36880+/* arch/xen/i386/kernel/setup.c */
36881+extern start_info_t *xen_start_info;
36882+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
36883+#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
36884+#else
36885+#define is_initial_xendomain() 0
36886+#endif
36887+
36888+/* arch/xen/kernel/evtchn.c */
36889+/* Force a proper event-channel callback from Xen. */
36890+void force_evtchn_callback(void);
36891+
36892+/* arch/xen/kernel/process.c */
36893+void xen_cpu_idle (void);
36894+
36895+/* arch/xen/i386/kernel/hypervisor.c */
36896+void do_hypervisor_callback(struct pt_regs *regs);
36897+
36898+/* arch/xen/i386/mm/hypervisor.c */
36899+/*
36900+ * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should already
36901+ * be MACHINE addresses.
36902+ */
36903+
36904+void xen_pt_switch(unsigned long ptr);
36905+void xen_new_user_pt(unsigned long ptr); /* x86_64 only */
36906+void xen_load_gs(unsigned int selector); /* x86_64 only */
36907+void xen_tlb_flush(void);
36908+void xen_invlpg(unsigned long ptr);
36909+
36910+void xen_l1_entry_update(pte_t *ptr, pte_t val);
36911+void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
36912+void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */
36913+void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */
36914+void xen_pgd_pin(unsigned long ptr);
36915+void xen_pgd_unpin(unsigned long ptr);
36916+
36917+void xen_set_ldt(const void *ptr, unsigned int ents);
36918+
36919+#ifdef CONFIG_SMP
36920+#include <linux/cpumask.h>
36921+void xen_tlb_flush_all(void);
36922+void xen_invlpg_all(unsigned long ptr);
36923+void xen_tlb_flush_mask(cpumask_t *mask);
36924+void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr);
36925+#endif
36926+
36927+/* Returns zero on success else negative errno. */
36928+int xen_create_contiguous_region(
36929+ unsigned long vstart, unsigned int order, unsigned int address_bits);
36930+void xen_destroy_contiguous_region(
36931+ unsigned long vstart, unsigned int order);
36932+
36933+struct page;
36934+
36935+int xen_limit_pages_to_max_mfn(
36936+ struct page *pages, unsigned int order, unsigned int address_bits);
36937+
36938+/* Turn jiffies into Xen system time. */
36939+u64 jiffies_to_st(unsigned long jiffies);
36940+
36941+#ifdef CONFIG_XEN_SCRUB_PAGES
36942+void scrub_pages(void *, unsigned int);
36943+#else
36944+#define scrub_pages(_p,_n) ((void)0)
36945+#endif
36946+
36947+#include <xen/hypercall.h>
36948+
36949+#if defined(CONFIG_X86_64)
36950+#define MULTI_UVMFLAGS_INDEX 2
36951+#define MULTI_UVMDOMID_INDEX 3
36952+#else
36953+#define MULTI_UVMFLAGS_INDEX 3
36954+#define MULTI_UVMDOMID_INDEX 4
36955+#endif
36956+
36957+#ifdef CONFIG_XEN
36958+#define is_running_on_xen() 1
36959+#else
36960+extern char *hypercall_stubs;
36961+#define is_running_on_xen() (!!hypercall_stubs)
36962+#endif
36963+
36964+static inline int
36965+HYPERVISOR_yield(
36966+ void)
36967+{
36968+ int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
36969+
36970+#if CONFIG_XEN_COMPAT <= 0x030002
36971+ if (rc == -ENOSYS)
36972+ rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
36973+#endif
36974+
36975+ return rc;
36976+}
36977+
36978+static inline int
36979+HYPERVISOR_block(
36980+ void)
36981+{
36982+ int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
36983+
36984+#if CONFIG_XEN_COMPAT <= 0x030002
36985+ if (rc == -ENOSYS)
36986+ rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0);
36987+#endif
36988+
36989+ return rc;
36990+}
36991+
36992+static inline void /*__noreturn*/
36993+HYPERVISOR_shutdown(
36994+ unsigned int reason)
36995+{
36996+ struct sched_shutdown sched_shutdown = {
36997+ .reason = reason
36998+ };
36999+
37000+ VOID(HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown));
37001+#if CONFIG_XEN_COMPAT <= 0x030002
37002+ VOID(HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason));
37003+#endif
37004+ /* Don't recurse needlessly. */
37005+ BUG_ON(reason != SHUTDOWN_crash);
37006+ for(;;);
37007+}
37008+
37009+static inline int __must_check
37010+HYPERVISOR_poll(
37011+ evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
37012+{
37013+ int rc;
37014+ struct sched_poll sched_poll = {
37015+ .nr_ports = nr_ports,
37016+ .timeout = jiffies_to_st(timeout)
37017+ };
37018+ set_xen_guest_handle(sched_poll.ports, ports);
37019+
37020+ rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
37021+#if CONFIG_XEN_COMPAT <= 0x030002
37022+ if (rc == -ENOSYS)
37023+ rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
37024+#endif
37025+
37026+ return rc;
37027+}
37028+
37029+#ifdef CONFIG_XEN
37030+
37031+static inline void
37032+MULTI_update_va_mapping(
37033+ multicall_entry_t *mcl, unsigned long va,
37034+ pte_t new_val, unsigned long flags)
37035+{
37036+ mcl->op = __HYPERVISOR_update_va_mapping;
37037+ mcl->args[0] = va;
37038+#if defined(CONFIG_X86_64)
37039+ mcl->args[1] = new_val.pte;
37040+#elif defined(CONFIG_X86_PAE)
37041+ mcl->args[1] = new_val.pte_low;
37042+ mcl->args[2] = new_val.pte_high;
37043+#else
37044+ mcl->args[1] = new_val.pte_low;
37045+ mcl->args[2] = 0;
37046+#endif
37047+ mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
37048+}
37049+
37050+static inline void
37051+MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd,
37052+ void *uop, unsigned int count)
37053+{
37054+ mcl->op = __HYPERVISOR_grant_table_op;
37055+ mcl->args[0] = cmd;
37056+ mcl->args[1] = (unsigned long)uop;
37057+ mcl->args[2] = count;
37058+}
37059+
37060+#else /* !defined(CONFIG_XEN) */
37061+
37062+/* Multicalls not supported for HVM guests. */
37063+#define MULTI_update_va_mapping(a,b,c,d) ((void)0)
37064+#define MULTI_grant_table_op(a,b,c,d) ((void)0)
37065+
37066+#endif
37067+
37068+#endif /* __HYPERVISOR_H__ */
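MULTI_update_va_mapping above has to know where the flags argument lands: on 32-bit PAE the pte occupies two argument slots (low and high word), so MULTI_UVMFLAGS_INDEX is 3 rather than 2. A minimal sketch of that slot layout, assuming a simplified multicall entry and hypothetical names (struct mc_entry, fill_update_va_mapping):

#include <stdint.h>
#include <stdio.h>

/* Simplified multicall entry: op code plus argument slots, as above. */
struct mc_entry {
	unsigned long op;
	unsigned long args[6];
};

/* Hypothetical 32-bit PAE pte split into two machine words. */
struct pae_pte {
	uint32_t pte_low;
	uint32_t pte_high;
};

#define UVMFLAGS_INDEX_PAE 3	/* matches MULTI_UVMFLAGS_INDEX for 32-bit */

static void fill_update_va_mapping(struct mc_entry *mcl, unsigned long va,
				   struct pae_pte pte, unsigned long flags)
{
	mcl->op = 0;			/* stand-in for __HYPERVISOR_update_va_mapping */
	mcl->args[0] = va;
	mcl->args[1] = pte.pte_low;
	mcl->args[2] = pte.pte_high;	/* second slot taken by the high word */
	mcl->args[UVMFLAGS_INDEX_PAE] = flags;
}

int main(void)
{
	struct mc_entry e;
	struct pae_pte pte = { 0x1063u, 0x0u };

	fill_update_va_mapping(&e, 0xc0000000ul, pte, 0x2ul);
	printf("args: %#lx %#lx %#lx %#lx\n",
	       e.args[0], e.args[1], e.args[2], e.args[3]);
	return 0;
}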
37069Index: head-2008-11-25/include/asm-x86/mach-xen/asm/io_32.h
37070===================================================================
37071--- /dev/null 1970-01-01 00:00:00.000000000 +0000
37072+++ head-2008-11-25/include/asm-x86/mach-xen/asm/io_32.h 2007-08-16 18:07:01.000000000 +0200
37073@@ -0,0 +1,389 @@
37074+#ifndef _ASM_IO_H
37075+#define _ASM_IO_H
37076+
37077+#include <linux/string.h>
37078+#include <linux/compiler.h>
37079+
37080+/*
37081+ * This file contains the definitions for the x86 IO instructions
37082+ * inb/inw/inl/outb/outw/outl and the "string versions" of the same
37083+ * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
37084+ * versions of the single-IO instructions (inb_p/inw_p/..).
37085+ *
37086+ * This file is not meant to be obfuscating: it's just complicated
37087+ * to (a) handle it all in a way that makes gcc able to optimize it
37088+ * as well as possible and (b) trying to avoid writing the same thing
37089+ * over and over again with slight variations and possibly making a
37090+ * mistake somewhere.
37091+ */
37092+
37093+/*
37094+ * Thanks to James van Artsdalen for a better timing-fix than
37095+ * the two short jumps: using outb's to a nonexistent port seems
37096+ * to guarantee better timings even on fast machines.
37097+ *
37098+ * On the other hand, I'd like to be sure of a non-existent port:
37099+ * I feel a bit unsafe about using 0x80 (should be safe, though)
37100+ *
37101+ * Linus
37102+ */
37103+
37104+ /*
37105+ * Bit simplified and optimized by Jan Hubicka
37106+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
37107+ *
37108+ * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
37109+ * isa_read[wl] and isa_write[wl] fixed
37110+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
37111+ */
37112+
37113+#define IO_SPACE_LIMIT 0xffff
37114+
37115+#define XQUAD_PORTIO_BASE 0xfe400000
37116+#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
37117+
37118+#ifdef __KERNEL__
37119+
37120+#include <asm-generic/iomap.h>
37121+
37122+#include <linux/vmalloc.h>
37123+#include <asm/fixmap.h>
37124+
37125+/*
37126+ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
37127+ * access
37128+ */
37129+#define xlate_dev_mem_ptr(p) __va(p)
37130+
37131+/*
37132+ * Convert a virtual cached pointer to an uncached pointer
37133+ */
37134+#define xlate_dev_kmem_ptr(p) p
37135+
37136+/**
37137+ * virt_to_phys - map virtual addresses to physical
37138+ * @address: address to remap
37139+ *
37140+ * The returned physical address is the physical (CPU) mapping for
37141+ * the memory address given. It is only valid to use this function on
37142+ * addresses directly mapped or allocated via kmalloc.
37143+ *
37144+ * This function does not give bus mappings for DMA transfers. In
37145+ * almost all conceivable cases a device driver should not be using
37146+ * this function
37147+ */
37148+
37149+static inline unsigned long virt_to_phys(volatile void * address)
37150+{
37151+ return __pa(address);
37152+}
37153+
37154+/**
37155+ * phys_to_virt - map physical address to virtual
37156+ * @address: address to remap
37157+ *
37158+ * The returned virtual address is a current CPU mapping for
37159+ * the memory address given. It is only valid to use this function on
37160+ * addresses that have a kernel mapping
37161+ *
37162+ * This function does not handle bus mappings for DMA transfers. In
37163+ * almost all conceivable cases a device driver should not be using
37164+ * this function
37165+ */
37166+
37167+static inline void * phys_to_virt(unsigned long address)
37168+{
37169+ return __va(address);
37170+}
37171+
37172+/*
37173+ * Change "struct page" to physical address.
37174+ */
37175+#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
37176+#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
37177+#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
37178+
37179+#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \
37180+ (unsigned long) bio_offset((bio)))
37181+#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
37182+ (unsigned long) (bv)->bv_offset)
37183+
37184+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
37185+ (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
37186+ ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
37187+ bvec_to_pseudophys((vec2))))
37188+
37189+extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
37190+
37191+/**
37192+ * ioremap - map bus memory into CPU space
37193+ * @offset: bus address of the memory
37194+ * @size: size of the resource to map
37195+ *
37196+ * ioremap performs a platform specific sequence of operations to
37197+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
37198+ * writew/writel functions and the other mmio helpers. The returned
37199+ * address is not guaranteed to be usable directly as a virtual
37200+ * address.
37201+ */
37202+
37203+static inline void __iomem * ioremap(unsigned long offset, unsigned long size)
37204+{
37205+ return __ioremap(offset, size, 0);
37206+}
37207+
37208+extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size);
37209+extern void iounmap(volatile void __iomem *addr);
37210+
37211+/*
37212+ * bt_ioremap() and bt_iounmap() are for temporary early boot-time
37213+ * mappings, before the real ioremap() is functional.
37214+ * A boot-time mapping is currently limited to at most 16 pages.
37215+ */
37216+extern void *bt_ioremap(unsigned long offset, unsigned long size);
37217+extern void bt_iounmap(void *addr, unsigned long size);
37218+
37219+/* Use early IO mappings for DMI because it's initialized early */
37220+#define dmi_ioremap bt_ioremap
37221+#define dmi_iounmap bt_iounmap
37222+#define dmi_alloc alloc_bootmem
37223+
37224+/*
37225+ * ISA I/O bus memory addresses are 1:1 with the physical address.
37226+ */
37227+#define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); })
37228+#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
37229+#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
37230+
37231+/*
37232+ * However PCI ones are not necessarily 1:1 and therefore these interfaces
37233+ * are forbidden in portable PCI drivers.
37234+ *
37235+ * Allow them on x86 for legacy drivers, though.
37236+ */
37237+#define virt_to_bus(_x) phys_to_machine(__pa(_x))
37238+#define bus_to_virt(_x) __va(machine_to_phys(_x))
37239+
37240+/*
37241+ * readX/writeX() are used to access memory mapped devices. On some
37242+ * architectures the memory mapped IO stuff needs to be accessed
37243+ * differently. On the x86 architecture, we just read/write the
37244+ * memory location directly.
37245+ */
37246+
37247+static inline unsigned char readb(const volatile void __iomem *addr)
37248+{
37249+ return *(volatile unsigned char __force *) addr;
37250+}
37251+static inline unsigned short readw(const volatile void __iomem *addr)
37252+{
37253+ return *(volatile unsigned short __force *) addr;
37254+}
37255+static inline unsigned int readl(const volatile void __iomem *addr)
37256+{
37257+ return *(volatile unsigned int __force *) addr;
37258+}
37259+#define readb_relaxed(addr) readb(addr)
37260+#define readw_relaxed(addr) readw(addr)
37261+#define readl_relaxed(addr) readl(addr)
37262+#define __raw_readb readb
37263+#define __raw_readw readw
37264+#define __raw_readl readl
37265+
37266+static inline void writeb(unsigned char b, volatile void __iomem *addr)
37267+{
37268+ *(volatile unsigned char __force *) addr = b;
37269+}
37270+static inline void writew(unsigned short b, volatile void __iomem *addr)
37271+{
37272+ *(volatile unsigned short __force *) addr = b;
37273+}
37274+static inline void writel(unsigned int b, volatile void __iomem *addr)
37275+{
37276+ *(volatile unsigned int __force *) addr = b;
37277+}
37278+#define __raw_writeb writeb
37279+#define __raw_writew writew
37280+#define __raw_writel writel
37281+
37282+#define mmiowb()
37283+
37284+static inline void memset_io(volatile void __iomem *addr, unsigned char val, int count)
37285+{
37286+ memset((void __force *) addr, val, count);
37287+}
37288+static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
37289+{
37290+ __memcpy(dst, (void __force *) src, count);
37291+}
37292+static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int count)
37293+{
37294+ __memcpy((void __force *) dst, src, count);
37295+}
37296+
37297+/*
37298+ * ISA space is 'always mapped' on a typical x86 system, no need to
37299+ * explicitly ioremap() it. The fact that the ISA IO space is mapped
37300+ * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
37301+ * are physical addresses. The following constant pointer can be
37302+ * used as the IO-area pointer (it can be iounmapped as well, so the
37303+ * analogy with PCI is quite large):
37304+ */
37305+#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
37306+
37307+/*
37308+ * Again, i386 does not require mem IO specific functions.
37309+ */
37310+
37311+#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(b),(c),(d))
37312+
37313+/**
37314+ * check_signature - find BIOS signatures
37315+ * @io_addr: mmio address to check
37316+ * @signature: signature block
37317+ * @length: length of signature
37318+ *
37319+ * Perform a signature comparison with the mmio address io_addr. This
37320+ * address should have been obtained by ioremap.
37321+ * Returns 1 on a match.
37322+ */
37323+
37324+static inline int check_signature(volatile void __iomem * io_addr,
37325+ const unsigned char *signature, int length)
37326+{
37327+ int retval = 0;
37328+ do {
37329+ if (readb(io_addr) != *signature)
37330+ goto out;
37331+ io_addr++;
37332+ signature++;
37333+ length--;
37334+ } while (length);
37335+ retval = 1;
37336+out:
37337+ return retval;
37338+}
37339+
37340+/*
37341+ * Cache management
37342+ *
37343+ * This is needed for two cases
37344+ * 1. Out of order aware processors
37345+ * 2. Accidentally out of order processors (PPro errata #51)
37346+ */
37347+
37348+#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
37349+
37350+static inline void flush_write_buffers(void)
37351+{
37352+ __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
37353+}
37354+
37355+#define dma_cache_inv(_start,_size) flush_write_buffers()
37356+#define dma_cache_wback(_start,_size) flush_write_buffers()
37357+#define dma_cache_wback_inv(_start,_size) flush_write_buffers()
37358+
37359+#else
37360+
37361+/* Nothing to do */
37362+
37363+#define dma_cache_inv(_start,_size) do { } while (0)
37364+#define dma_cache_wback(_start,_size) do { } while (0)
37365+#define dma_cache_wback_inv(_start,_size) do { } while (0)
37366+#define flush_write_buffers()
37367+
37368+#endif
37369+
37370+#endif /* __KERNEL__ */
37371+
37372+#ifdef SLOW_IO_BY_JUMPING
37373+#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:"
37374+#else
37375+#define __SLOW_DOWN_IO "outb %%al,$0x80;"
37376+#endif
37377+
37378+static inline void slow_down_io(void) {
37379+ __asm__ __volatile__(
37380+ __SLOW_DOWN_IO
37381+#ifdef REALLY_SLOW_IO
37382+ __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
37383+#endif
37384+ : : );
37385+}
37386+
37387+#ifdef CONFIG_X86_NUMAQ
37388+extern void *xquad_portio; /* Where the IO area was mapped */
37389+#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
37390+#define __BUILDIO(bwl,bw,type) \
37391+static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \
37392+ if (xquad_portio) \
37393+ write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \
37394+ else \
37395+ out##bwl##_local(value, port); \
37396+} \
37397+static inline void out##bwl(unsigned type value, int port) { \
37398+ out##bwl##_quad(value, port, 0); \
37399+} \
37400+static inline unsigned type in##bwl##_quad(int port, int quad) { \
37401+ if (xquad_portio) \
37402+ return read##bwl(XQUAD_PORT_ADDR(port, quad)); \
37403+ else \
37404+ return in##bwl##_local(port); \
37405+} \
37406+static inline unsigned type in##bwl(int port) { \
37407+ return in##bwl##_quad(port, 0); \
37408+}
37409+#else
37410+#define __BUILDIO(bwl,bw,type) \
37411+static inline void out##bwl(unsigned type value, int port) { \
37412+ out##bwl##_local(value, port); \
37413+} \
37414+static inline unsigned type in##bwl(int port) { \
37415+ return in##bwl##_local(port); \
37416+}
37417+#endif
37418+
37419+
37420+#define BUILDIO(bwl,bw,type) \
37421+static inline void out##bwl##_local(unsigned type value, int port) { \
37422+ __asm__ __volatile__("out" #bwl " %" #bw "0, %w1" : : "a"(value), "Nd"(port)); \
37423+} \
37424+static inline unsigned type in##bwl##_local(int port) { \
37425+ unsigned type value; \
37426+ __asm__ __volatile__("in" #bwl " %w1, %" #bw "0" : "=a"(value) : "Nd"(port)); \
37427+ return value; \
37428+} \
37429+static inline void out##bwl##_local_p(unsigned type value, int port) { \
37430+ out##bwl##_local(value, port); \
37431+ slow_down_io(); \
37432+} \
37433+static inline unsigned type in##bwl##_local_p(int port) { \
37434+ unsigned type value = in##bwl##_local(port); \
37435+ slow_down_io(); \
37436+ return value; \
37437+} \
37438+__BUILDIO(bwl,bw,type) \
37439+static inline void out##bwl##_p(unsigned type value, int port) { \
37440+ out##bwl(value, port); \
37441+ slow_down_io(); \
37442+} \
37443+static inline unsigned type in##bwl##_p(int port) { \
37444+ unsigned type value = in##bwl(port); \
37445+ slow_down_io(); \
37446+ return value; \
37447+} \
37448+static inline void outs##bwl(int port, const void *addr, unsigned long count) { \
37449+ __asm__ __volatile__("rep; outs" #bwl : "+S"(addr), "+c"(count) : "d"(port)); \
37450+} \
37451+static inline void ins##bwl(int port, void *addr, unsigned long count) { \
37452+ __asm__ __volatile__("rep; ins" #bwl : "+D"(addr), "+c"(count) : "d"(port)); \
37453+}
37454+
37455+BUILDIO(b,b,char)
37456+BUILDIO(w,w,short)
37457+BUILDIO(l,,int)
37458+
37459+/* We will be supplying our own /dev/mem implementation */
37460+#define ARCH_HAS_DEV_MEM
37461+
37462+#endif
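The BUILDIO/__BUILDIO macros above generate the whole inb/outb family by token pasting the access width into each function name. The reduced sketch below shows the same generation technique against a fake in-memory port array instead of real in/out instructions; BUILD_FAKE_IO and fake_ports are hypothetical names for illustration only.

#include <stdio.h>

/* Generate out##bwl()/in##bwl() pairs the same way BUILDIO does,
 * but backed by a fake "port space" array instead of in/out instructions. */
static unsigned int fake_ports[65536];

#define BUILD_FAKE_IO(bwl, type)					\
static inline void out##bwl(unsigned type value, int port)		\
{									\
	fake_ports[port & 0xffff] = (unsigned int)value;		\
}									\
static inline unsigned type in##bwl(int port)				\
{									\
	return (unsigned type)fake_ports[port & 0xffff];		\
}

BUILD_FAKE_IO(b, char)
BUILD_FAKE_IO(w, short)
BUILD_FAKE_IO(l, int)

int main(void)
{
	outb(0x42, 0x80);
	outl(0xdeadbeef, 0x3f8);
	printf("inb(0x80)=%#x inl(0x3f8)=%#x\n", inb(0x80), inl(0x3f8));
	return 0;
}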
37463Index: head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_32.h
37464===================================================================
37465--- /dev/null 1970-01-01 00:00:00.000000000 +0000
37466+++ head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_32.h 2007-06-12 13:14:02.000000000 +0200
37467@@ -0,0 +1,127 @@
37468+/*
37469+ * include/asm-i386/irqflags.h
37470+ *
37471+ * IRQ flags handling
37472+ *
37473+ * This file gets included from lowlevel asm headers too, to provide
37474+ * wrapped versions of the local_irq_*() APIs, based on the
37475+ * raw_local_irq_*() functions from the lowlevel headers.
37476+ */
37477+#ifndef _ASM_IRQFLAGS_H
37478+#define _ASM_IRQFLAGS_H
37479+
37480+#ifndef __ASSEMBLY__
37481+
37482+/*
37483+ * The use of 'barrier' in the following reflects their use as local-lock
37484+ * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
37485+ * critical operations are executed. All critical operations must complete
37486+ * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
37487+ * includes these barriers, for example.
37488+ */
37489+
37490+#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
37491+
37492+#define raw_local_save_flags(flags) \
37493+ do { (flags) = __raw_local_save_flags(); } while (0)
37494+
37495+#define raw_local_irq_restore(x) \
37496+do { \
37497+ vcpu_info_t *_vcpu; \
37498+ barrier(); \
37499+ _vcpu = current_vcpu_info(); \
37500+ if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
37501+ barrier(); /* unmask then check (avoid races) */ \
37502+ if (unlikely(_vcpu->evtchn_upcall_pending)) \
37503+ force_evtchn_callback(); \
37504+ } \
37505+} while (0)
37506+
37507+#define raw_local_irq_disable() \
37508+do { \
37509+ current_vcpu_info()->evtchn_upcall_mask = 1; \
37510+ barrier(); \
37511+} while (0)
37512+
37513+#define raw_local_irq_enable() \
37514+do { \
37515+ vcpu_info_t *_vcpu; \
37516+ barrier(); \
37517+ _vcpu = current_vcpu_info(); \
37518+ _vcpu->evtchn_upcall_mask = 0; \
37519+ barrier(); /* unmask then check (avoid races) */ \
37520+ if (unlikely(_vcpu->evtchn_upcall_pending)) \
37521+ force_evtchn_callback(); \
37522+} while (0)
37523+
37524+/*
37525+ * Used in the idle loop; sti takes one instruction cycle
37526+ * to complete:
37527+ */
37528+void raw_safe_halt(void);
37529+
37530+/*
37531+ * Used when interrupts are already enabled or to
37532+ * shutdown the processor:
37533+ */
37534+void halt(void);
37535+
37536+static inline int raw_irqs_disabled_flags(unsigned long flags)
37537+{
37538+ return (flags != 0);
37539+}
37540+
37541+#define raw_irqs_disabled() \
37542+({ \
37543+ unsigned long flags = __raw_local_save_flags(); \
37544+ \
37545+ raw_irqs_disabled_flags(flags); \
37546+})
37547+
37548+/*
37549+ * For spinlocks, etc:
37550+ */
37551+#define __raw_local_irq_save() \
37552+({ \
37553+ unsigned long flags = __raw_local_save_flags(); \
37554+ \
37555+ raw_local_irq_disable(); \
37556+ \
37557+ flags; \
37558+})
37559+
37560+#define raw_local_irq_save(flags) \
37561+ do { (flags) = __raw_local_irq_save(); } while (0)
37562+
37563+#endif /* __ASSEMBLY__ */
37564+
37565+/*
37566+ * Do the CPU's IRQ-state tracing from assembly code. We call a
37567+ * C function, so save all the C-clobbered registers:
37568+ */
37569+#ifdef CONFIG_TRACE_IRQFLAGS
37570+
37571+# define TRACE_IRQS_ON \
37572+ pushl %eax; \
37573+ pushl %ecx; \
37574+ pushl %edx; \
37575+ call trace_hardirqs_on; \
37576+ popl %edx; \
37577+ popl %ecx; \
37578+ popl %eax;
37579+
37580+# define TRACE_IRQS_OFF \
37581+ pushl %eax; \
37582+ pushl %ecx; \
37583+ pushl %edx; \
37584+ call trace_hardirqs_off; \
37585+ popl %edx; \
37586+ popl %ecx; \
37587+ popl %eax;
37588+
37589+#else
37590+# define TRACE_IRQS_ON
37591+# define TRACE_IRQS_OFF
37592+#endif
37593+
37594+#endif
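The irqflags macros above replace cli/sti with writes to evtchn_upcall_mask, and the barriers enforce "unmask, then re-check for pending events" so an event arriving while masked is not lost. A single-vcpu user-space model of that save/disable/restore flow, with hypothetical names and a plain compiler barrier standing in for the real ones:

#include <stdio.h>

/* Single "vcpu" model of the Xen flag handling above: interrupts are a
 * software mask, and re-enabling must re-check for pending events. */
static struct {
	unsigned char evtchn_upcall_mask;	/* 1 = "interrupts disabled" */
	unsigned char evtchn_upcall_pending;
} vcpu;

#define barrier() __asm__ __volatile__("" ::: "memory")

static void force_evtchn_callback(void)
{
	vcpu.evtchn_upcall_pending = 0;
	printf("delivering pending event\n");
}

static unsigned long irq_save(void)
{
	unsigned long flags = vcpu.evtchn_upcall_mask;

	vcpu.evtchn_upcall_mask = 1;		/* like raw_local_irq_disable() */
	barrier();
	return flags;
}

static void irq_restore(unsigned long flags)
{
	barrier();
	vcpu.evtchn_upcall_mask = (unsigned char)flags;
	if (flags == 0) {
		barrier();			/* unmask, then check (avoid races) */
		if (vcpu.evtchn_upcall_pending)
			force_evtchn_callback();
	}
}

int main(void)
{
	unsigned long flags = irq_save();

	vcpu.evtchn_upcall_pending = 1;		/* event arrives while masked */
	irq_restore(flags);
	return 0;
}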
37595Index: head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_32.h
37596===================================================================
37597--- /dev/null 1970-01-01 00:00:00.000000000 +0000
37598+++ head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_32.h 2008-04-02 12:34:02.000000000 +0200
37599@@ -0,0 +1,193 @@
37600+#ifndef _I386_MADDR_H
37601+#define _I386_MADDR_H
37602+
37603+#include <xen/features.h>
37604+#include <xen/interface/xen.h>
37605+
37606+/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
37607+#define INVALID_P2M_ENTRY (~0UL)
37608+#define FOREIGN_FRAME_BIT (1UL<<31)
37609+#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
37610+
37611+/* Definitions for machine and pseudophysical addresses. */
37612+#ifdef CONFIG_X86_PAE
37613+typedef unsigned long long paddr_t;
37614+typedef unsigned long long maddr_t;
37615+#else
37616+typedef unsigned long paddr_t;
37617+typedef unsigned long maddr_t;
37618+#endif
37619+
37620+#ifdef CONFIG_XEN
37621+
37622+extern unsigned long *phys_to_machine_mapping;
37623+extern unsigned long max_mapnr;
37624+
37625+#undef machine_to_phys_mapping
37626+extern unsigned long *machine_to_phys_mapping;
37627+extern unsigned int machine_to_phys_order;
37628+
37629+static inline unsigned long pfn_to_mfn(unsigned long pfn)
37630+{
37631+ if (xen_feature(XENFEAT_auto_translated_physmap))
37632+ return pfn;
37633+ BUG_ON(max_mapnr && pfn >= max_mapnr);
37634+ return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
37635+}
37636+
37637+static inline int phys_to_machine_mapping_valid(unsigned long pfn)
37638+{
37639+ if (xen_feature(XENFEAT_auto_translated_physmap))
37640+ return 1;
37641+ BUG_ON(max_mapnr && pfn >= max_mapnr);
37642+ return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
37643+}
37644+
37645+static inline unsigned long mfn_to_pfn(unsigned long mfn)
37646+{
37647+ unsigned long pfn;
37648+
37649+ if (xen_feature(XENFEAT_auto_translated_physmap))
37650+ return mfn;
37651+
37652+ if (unlikely((mfn >> machine_to_phys_order) != 0))
37653+ return max_mapnr;
37654+
37655+ /* The array access can fail (e.g., device space beyond end of RAM). */
37656+ asm (
37657+ "1: movl %1,%0\n"
37658+ "2:\n"
37659+ ".section .fixup,\"ax\"\n"
37660+ "3: movl %2,%0\n"
37661+ " jmp 2b\n"
37662+ ".previous\n"
37663+ ".section __ex_table,\"a\"\n"
37664+ " .align 4\n"
37665+ " .long 1b,3b\n"
37666+ ".previous"
37667+ : "=r" (pfn)
37668+ : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
37669+
37670+ return pfn;
37671+}
37672+
37673+/*
37674+ * We detect special mappings in one of two ways:
37675+ * 1. If the MFN is an I/O page then Xen will set the m2p entry
37676+ * to be outside our maximum possible pseudophys range.
37677+ * 2. If the MFN belongs to a different domain then we will certainly
37678+ * not have MFN in our p2m table. Conversely, if the page is ours,
37679+ * then we'll have p2m(m2p(MFN))==MFN.
37680+ * If we detect a special mapping then it doesn't have a 'struct page'.
37681+ * We force !pfn_valid() by returning an out-of-range pointer.
37682+ *
37683+ * NB. These checks require that, for any MFN that is not in our reservation,
37684+ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
37685+ * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
37686+ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
37687+ *
37688+ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
37689+ * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
37690+ * require. In all the cases we care about, the FOREIGN_FRAME bit is
37691+ * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
37692+ */
37693+static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
37694+{
37695+ unsigned long pfn = mfn_to_pfn(mfn);
37696+ if ((pfn < max_mapnr)
37697+ && !xen_feature(XENFEAT_auto_translated_physmap)
37698+ && (phys_to_machine_mapping[pfn] != mfn))
37699+ return max_mapnr; /* force !pfn_valid() */
37700+ return pfn;
37701+}
37702+
37703+static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
37704+{
37705+ BUG_ON(max_mapnr && pfn >= max_mapnr);
37706+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
37707+ BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
37708+ return;
37709+ }
37710+ phys_to_machine_mapping[pfn] = mfn;
37711+}
37712+
37713+static inline maddr_t phys_to_machine(paddr_t phys)
37714+{
37715+ maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
37716+ machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
37717+ return machine;
37718+}
37719+
37720+static inline paddr_t machine_to_phys(maddr_t machine)
37721+{
37722+ paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
37723+ phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
37724+ return phys;
37725+}
37726+
37727+#ifdef CONFIG_X86_PAE
37728+static inline paddr_t pte_phys_to_machine(paddr_t phys)
37729+{
37730+ /*
37731+ * In PAE mode, the NX bit needs to be dealt with in the value
37732+ * passed to pfn_to_mfn(). On x86_64, we need to mask it off,
37733+ * but for i386 the conversion to ulong for the argument will
37734+ * clip it off.
37735+ */
37736+ maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
37737+ machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK);
37738+ return machine;
37739+}
37740+
37741+static inline paddr_t pte_machine_to_phys(maddr_t machine)
37742+{
37743+ /*
37744+ * In PAE mode, the NX bit needs to be dealt with in the value
37745+ * passed to mfn_to_pfn(). On x86_64, we need to mask it off,
37746+ * but for i386 the conversion to ulong for the argument will
37747+ * clip it off.
37748+ */
37749+ paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
37750+ phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
37751+ return phys;
37752+}
37753+#endif
37754+
37755+#ifdef CONFIG_X86_PAE
37756+#define __pte_ma(x) ((pte_t) { (x), (maddr_t)(x) >> 32 } )
37757+static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
37758+{
37759+ pte_t pte;
37760+
37761+ pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
37762+ (pgprot_val(pgprot) >> 32);
37763+ pte.pte_high &= (__supported_pte_mask >> 32);
37764+ pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
37765+ __supported_pte_mask;
37766+ return pte;
37767+}
37768+#else
37769+#define __pte_ma(x) ((pte_t) { (x) } )
37770+#define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
37771+#endif
37772+
37773+#else /* !CONFIG_XEN */
37774+
37775+#define pfn_to_mfn(pfn) (pfn)
37776+#define mfn_to_pfn(mfn) (mfn)
37777+#define mfn_to_local_pfn(mfn) (mfn)
37778+#define set_phys_to_machine(pfn, mfn) ((void)0)
37779+#define phys_to_machine_mapping_valid(pfn) (1)
37780+#define phys_to_machine(phys) ((maddr_t)(phys))
37781+#define machine_to_phys(mach) ((paddr_t)(mach))
37782+#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot)
37783+#define __pte_ma(x) __pte(x)
37784+
37785+#endif /* !CONFIG_XEN */
37786+
37787+/* VIRT <-> MACHINE conversion */
37788+#define virt_to_machine(v) (phys_to_machine(__pa(v)))
37789+#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
37790+#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
37791+
37792+#endif /* _I386_MADDR_H */
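maddr_32.h builds phys_to_machine() from a frame-number lookup (pfn_to_mfn) plus the untouched in-page offset. The sketch below is a minimal standalone model of that composition, assuming a tiny hypothetical four-entry p2m table rather than the real phys_to_machine_mapping array.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_MASK (~((1ul << PAGE_SHIFT) - 1))

/* Hypothetical 4-entry p2m table: guest pfn -> machine mfn. */
static unsigned long p2m[] = { 7, 3, 12, 5 };

static unsigned long pfn_to_mfn(unsigned long pfn)
{
	return p2m[pfn];
}

/* phys_to_machine(): translate the frame number, keep the in-page offset. */
static unsigned long phys_to_machine(unsigned long phys)
{
	unsigned long machine = pfn_to_mfn(phys >> PAGE_SHIFT);
	return (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
}

int main(void)
{
	unsigned long phys = (2ul << PAGE_SHIFT) | 0x123;	/* pfn 2, offset 0x123 */

	printf("phys %#lx -> machine %#lx\n", phys, phys_to_machine(phys));
	/* expected: mfn 12 with the same offset, i.e. 0xc123 */
	return 0;
}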
37793Index: head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_32.h
37794===================================================================
37795--- /dev/null 1970-01-01 00:00:00.000000000 +0000
37796+++ head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_32.h 2007-06-12 13:14:02.000000000 +0200
37797@@ -0,0 +1,108 @@
37798+#ifndef __I386_SCHED_H
37799+#define __I386_SCHED_H
37800+
37801+#include <asm/desc.h>
37802+#include <asm/atomic.h>
37803+#include <asm/pgalloc.h>
37804+#include <asm/tlbflush.h>
37805+
37806+/*
37807+ * Used for LDT copy/destruction.
37808+ */
37809+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
37810+void destroy_context(struct mm_struct *mm);
37811+
37812+
37813+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
37814+{
37815+#if 0 /* XEN: no lazy tlb */
37816+ unsigned cpu = smp_processor_id();
37817+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
37818+ per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
37819+#endif
37820+}
37821+
37822+#define prepare_arch_switch(next) __prepare_arch_switch()
37823+
37824+static inline void __prepare_arch_switch(void)
37825+{
37826+ /*
37827+ * Save away %fs and %gs. No need to save %es and %ds, as those
37828+ * are always kernel segments while inside the kernel. Must
37829+ * happen before reload of cr3/ldt (i.e., not in __switch_to).
37830+ */
37831+ asm volatile ( "mov %%fs,%0 ; mov %%gs,%1"
37832+ : "=m" (current->thread.fs),
37833+ "=m" (current->thread.gs));
37834+ asm volatile ( "movl %0,%%fs ; movl %0,%%gs"
37835+ : : "r" (0) );
37836+}
37837+
37838+extern void mm_pin(struct mm_struct *mm);
37839+extern void mm_unpin(struct mm_struct *mm);
37840+void mm_pin_all(void);
37841+
37842+static inline void switch_mm(struct mm_struct *prev,
37843+ struct mm_struct *next,
37844+ struct task_struct *tsk)
37845+{
37846+ int cpu = smp_processor_id();
37847+ struct mmuext_op _op[2], *op = _op;
37848+
37849+ if (likely(prev != next)) {
37850+ BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
37851+ !test_bit(PG_pinned, &virt_to_page(next->pgd)->flags));
37852+
37853+ /* stop flush ipis for the previous mm */
37854+ cpu_clear(cpu, prev->cpu_vm_mask);
37855+#if 0 /* XEN: no lazy tlb */
37856+ per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
37857+ per_cpu(cpu_tlbstate, cpu).active_mm = next;
37858+#endif
37859+ cpu_set(cpu, next->cpu_vm_mask);
37860+
37861+ /* Re-load page tables: load_cr3(next->pgd) */
37862+ op->cmd = MMUEXT_NEW_BASEPTR;
37863+ op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
37864+ op++;
37865+
37866+ /*
37867+ * load the LDT, if the LDT is different:
37868+ */
37869+ if (unlikely(prev->context.ldt != next->context.ldt)) {
37870+ /* load_LDT_nolock(&next->context, cpu) */
37871+ op->cmd = MMUEXT_SET_LDT;
37872+ op->arg1.linear_addr = (unsigned long)next->context.ldt;
37873+ op->arg2.nr_ents = next->context.size;
37874+ op++;
37875+ }
37876+
37877+ BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
37878+ }
37879+#if 0 /* XEN: no lazy tlb */
37880+ else {
37881+ per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
37882+ BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
37883+
37884+ if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
37885+ /* We were in lazy tlb mode and leave_mm disabled
37886+ * tlb flush IPI delivery. We must reload %cr3.
37887+ */
37888+ load_cr3(next->pgd);
37889+ load_LDT_nolock(&next->context, cpu);
37890+ }
37891+ }
37892+#endif
37893+}
37894+
37895+#define deactivate_mm(tsk, mm) \
37896+ asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
37897+
37898+static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
37899+{
37900+ if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
37901+ mm_pin(next);
37902+ switch_mm(prev, next, NULL);
37903+}
37904+
37905+#endif
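switch_mm() above queues the new base pointer and, only when it differs, the new LDT into one mmuext_op array and submits them with a single hypercall. A reduced model of that batching idea, with hypothetical op codes and a printf-based submit() in place of HYPERVISOR_mmuext_op():

#include <stdio.h>

/* Simplified batching model: queue operations, submit the array once,
 * the way switch_mm() above queues MMUEXT_NEW_BASEPTR and MMUEXT_SET_LDT. */
enum { OP_NEW_BASEPTR = 1, OP_SET_LDT = 2 };

struct op {
	int cmd;
	unsigned long arg;
};

static int submit(const struct op *ops, int count)
{
	for (int i = 0; i < count; i++)
		printf("op %d: cmd=%d arg=%#lx\n", i, ops[i].cmd, ops[i].arg);
	return 0;			/* one "hypercall" for the whole batch */
}

int main(void)
{
	struct op batch[2], *op = batch;
	int ldt_changed = 1;

	op->cmd = OP_NEW_BASEPTR;
	op->arg = 0x1234000ul;		/* hypothetical new page-table base */
	op++;

	if (ldt_changed) {		/* only queued when the LDT differs */
		op->cmd = OP_SET_LDT;
		op->arg = 0xbf000000ul;
		op++;
	}

	return submit(batch, (int)(op - batch));
}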
37906Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pci_32.h
37907===================================================================
37908--- /dev/null 1970-01-01 00:00:00.000000000 +0000
37909+++ head-2008-11-25/include/asm-x86/mach-xen/asm/pci_32.h 2007-09-14 11:14:51.000000000 +0200
37910@@ -0,0 +1,148 @@
37911+#ifndef __i386_PCI_H
37912+#define __i386_PCI_H
37913+
37914+
37915+#ifdef __KERNEL__
37916+#include <linux/mm.h> /* for struct page */
37917+
37918+/* Can be used to override the logic in pci_scan_bus for skipping
37919+ already-configured bus numbers - to be used for buggy BIOSes
37920+ or architectures with incomplete PCI setup by the loader */
37921+
37922+#ifdef CONFIG_PCI
37923+extern unsigned int pcibios_assign_all_busses(void);
37924+#else
37925+#define pcibios_assign_all_busses() 0
37926+#endif
37927+
37928+#include <asm/hypervisor.h>
37929+#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain())
37930+
37931+extern unsigned long pci_mem_start;
37932+#define PCIBIOS_MIN_IO 0x1000
37933+#define PCIBIOS_MIN_MEM (pci_mem_start)
37934+
37935+#define PCIBIOS_MIN_CARDBUS_IO 0x4000
37936+
37937+void pcibios_config_init(void);
37938+struct pci_bus * pcibios_scan_root(int bus);
37939+
37940+void pcibios_set_master(struct pci_dev *dev);
37941+void pcibios_penalize_isa_irq(int irq, int active);
37942+struct irq_routing_table *pcibios_get_irq_routing_table(void);
37943+int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
37944+
37945+/* Dynamic DMA mapping stuff.
37946+ * i386 has everything mapped statically.
37947+ */
37948+
37949+#include <linux/types.h>
37950+#include <linux/slab.h>
37951+#include <asm/scatterlist.h>
37952+#include <linux/string.h>
37953+#include <asm/io.h>
37954+
37955+struct pci_dev;
37956+
37957+#ifdef CONFIG_SWIOTLB
37958+
37959+
37960+/* On Xen we use SWIOTLB instead of blk-specific bounce buffers. */
37961+#define PCI_DMA_BUS_IS_PHYS (0)
37962+
37963+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
37964+ dma_addr_t ADDR_NAME;
37965+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
37966+ __u32 LEN_NAME;
37967+#define pci_unmap_addr(PTR, ADDR_NAME) \
37968+ ((PTR)->ADDR_NAME)
37969+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
37970+ (((PTR)->ADDR_NAME) = (VAL))
37971+#define pci_unmap_len(PTR, LEN_NAME) \
37972+ ((PTR)->LEN_NAME)
37973+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
37974+ (((PTR)->LEN_NAME) = (VAL))
37975+
37976+#else
37977+
37978+/* The PCI address space does equal the physical memory
37979+ * address space. The networking and block device layers use
37980+ * this boolean for bounce buffer decisions.
37981+ */
37982+#define PCI_DMA_BUS_IS_PHYS (1)
37983+
37984+/* pci_unmap_{page,single} is a nop so... */
37985+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
37986+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
37987+#define pci_unmap_addr(PTR, ADDR_NAME) (0)
37988+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
37989+#define pci_unmap_len(PTR, LEN_NAME) (0)
37990+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
37991+
37992+#endif
37993+
37994+/* This is always fine. */
37995+#define pci_dac_dma_supported(pci_dev, mask) (1)
37996+
37997+static inline dma64_addr_t
37998+pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
37999+{
38000+ return ((dma64_addr_t) page_to_phys(page) +
38001+ (dma64_addr_t) offset);
38002+}
38003+
38004+static inline struct page *
38005+pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
38006+{
38007+ return pfn_to_page(dma_addr >> PAGE_SHIFT);
38008+}
38009+
38010+static inline unsigned long
38011+pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
38012+{
38013+ return (dma_addr & ~PAGE_MASK);
38014+}
38015+
38016+static inline void
38017+pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
38018+{
38019+}
38020+
38021+static inline void
38022+pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
38023+{
38024+ flush_write_buffers();
38025+}
38026+
38027+#define HAVE_PCI_MMAP
38028+extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
38029+ enum pci_mmap_state mmap_state, int write_combine);
38030+
38031+
38032+static inline void pcibios_add_platform_entries(struct pci_dev *dev)
38033+{
38034+}
38035+
38036+#ifdef CONFIG_PCI
38037+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
38038+ enum pci_dma_burst_strategy *strat,
38039+ unsigned long *strategy_parameter)
38040+{
38041+ *strat = PCI_DMA_BURST_INFINITY;
38042+ *strategy_parameter = ~0UL;
38043+}
38044+#endif
38045+
38046+#endif /* __KERNEL__ */
38047+
38048+#ifdef CONFIG_XEN_PCIDEV_FRONTEND
38049+#include <xen/pcifront.h>
38050+#endif /* CONFIG_XEN_PCIDEV_FRONTEND */
38051+
38052+/* implement the pci_ DMA API in terms of the generic device dma_ one */
38053+#include <asm-generic/pci-dma-compat.h>
38054+
38055+/* generic pci stuff */
38056+#include <asm-generic/pci.h>
38057+
38058+#endif /* __i386_PCI_H */
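The DECLARE_PCI_UNMAP_ADDR/pci_unmap_* macros above either declare real storage (SWIOTLB case) or expand to nothing, so driver code compiles identically whether unmap state is kept or not. A small sketch of that pattern under a hypothetical NEED_UNMAP_STATE switch:

#include <stdio.h>

/* The same trick as DECLARE_PCI_UNMAP_ADDR above: when unmap state is not
 * needed, the macros expand to nothing and the accessors become no-ops. */
/* #define NEED_UNMAP_STATE */

#ifdef NEED_UNMAP_STATE
#define DECLARE_UNMAP_ADDR(name)	unsigned long name;
#define unmap_addr_set(ptr, name, val)	((ptr)->name = (val))
#define unmap_addr(ptr, name)		((ptr)->name)
#else
#define DECLARE_UNMAP_ADDR(name)
#define unmap_addr_set(ptr, name, val)	do { } while (0)
#define unmap_addr(ptr, name)		(0)
#endif

struct my_dma_state {
	int dummy;			/* whatever the driver always needs */
	DECLARE_UNMAP_ADDR(mapping)
};

int main(void)
{
	struct my_dma_state s = { 0 };

	unmap_addr_set(&s, mapping, 0x1000ul);
	printf("sizeof(state)=%zu unmap_addr=%#lx\n",
	       sizeof(s), (unsigned long)unmap_addr(&s, mapping));
	return 0;
}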
38059Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_32.h
38060===================================================================
38061--- /dev/null 1970-01-01 00:00:00.000000000 +0000
38062+++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_32.h 2008-07-21 11:00:33.000000000 +0200
38063@@ -0,0 +1,59 @@
38064+#ifndef _I386_PGALLOC_H
38065+#define _I386_PGALLOC_H
38066+
38067+#include <asm/fixmap.h>
38068+#include <linux/threads.h>
38069+#include <linux/mm.h> /* for struct page */
38070+#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
38071+
38072+#define pmd_populate_kernel(mm, pmd, pte) \
38073+ set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
38074+
38075+#define pmd_populate(mm, pmd, pte) \
38076+do { \
38077+ unsigned long pfn = page_to_pfn(pte); \
38078+ if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) { \
38079+ if (!PageHighMem(pte)) \
38080+ BUG_ON(HYPERVISOR_update_va_mapping( \
38081+ (unsigned long)__va(pfn << PAGE_SHIFT), \
38082+ pfn_pte(pfn, PAGE_KERNEL_RO), 0)); \
38083+ else if (!test_and_set_bit(PG_pinned, &pte->flags)) \
38084+ kmap_flush_unused(); \
38085+ set_pmd(pmd, \
38086+ __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT))); \
38087+ } else \
38088+ *(pmd) = __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT)); \
38089+} while (0)
38090+
38091+/*
38092+ * Allocate and free page tables.
38093+ */
38094+extern pgd_t *pgd_alloc(struct mm_struct *);
38095+extern void pgd_free(pgd_t *pgd);
38096+
38097+extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
38098+extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
38099+
38100+static inline void pte_free_kernel(pte_t *pte)
38101+{
38102+ make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
38103+ free_page((unsigned long)pte);
38104+}
38105+
38106+extern void pte_free(struct page *pte);
38107+
38108+#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
38109+
38110+#ifdef CONFIG_X86_PAE
38111+/*
38112+ * In the PAE case we free the pmds as part of the pgd.
38113+ */
38114+#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); })
38115+#define pmd_free(x) do { } while (0)
38116+#define __pmd_free_tlb(tlb,x) do { } while (0)
38117+#define pud_populate(mm, pmd, pte) BUG()
38118+#endif
38119+
38120+#define check_pgt_cache() do { } while (0)
38121+
38122+#endif /* _I386_PGALLOC_H */
38123Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h
38124===================================================================
38125--- /dev/null 1970-01-01 00:00:00.000000000 +0000
38126+++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h 2007-06-12 13:14:02.000000000 +0200
38127@@ -0,0 +1,24 @@
38128+#ifndef _I386_PGTABLE_3LEVEL_DEFS_H
38129+#define _I386_PGTABLE_3LEVEL_DEFS_H
38130+
38131+#define HAVE_SHARED_KERNEL_PMD 0
38132+
38133+/*
38134+ * PGDIR_SHIFT determines what a top-level page table entry can map
38135+ */
38136+#define PGDIR_SHIFT 30
38137+#define PTRS_PER_PGD 4
38138+
38139+/*
38140+ * PMD_SHIFT determines the size of the area a middle-level
38141+ * page table can map
38142+ */
38143+#define PMD_SHIFT 21
38144+#define PTRS_PER_PMD 512
38145+
38146+/*
38147+ * entries per page directory level
38148+ */
38149+#define PTRS_PER_PTE 512
38150+
38151+#endif /* _I386_PGTABLE_3LEVEL_DEFS_H */
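With PGDIR_SHIFT at 30 and PMD_SHIFT at 21 as defined above, a 32-bit virtual address under PAE splits into 2 pgd bits, 9 pmd bits, 9 pte bits and a 12-bit page offset. A quick arithmetic check of that split (constants copied from the header, address chosen arbitrarily):

#include <stdio.h>

#define PAGE_SHIFT  12
#define PMD_SHIFT   21		/* from pgtable-3level-defs.h above */
#define PGDIR_SHIFT 30

int main(void)
{
	unsigned long va = 0xc0123456ul;			/* arbitrary kernel address */

	unsigned long pgd_idx = va >> PGDIR_SHIFT;			/* 2 bits */
	unsigned long pmd_idx = (va >> PMD_SHIFT) & (512 - 1);		/* 9 bits */
	unsigned long pte_idx = (va >> PAGE_SHIFT) & (512 - 1);	/* 9 bits */
	unsigned long offset  = va & ((1ul << PAGE_SHIFT) - 1);	/* 12 bits */

	printf("pgd=%lu pmd=%lu pte=%lu offset=%#lx\n",
	       pgd_idx, pmd_idx, pte_idx, offset);
	return 0;
}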
38152Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level.h
38153===================================================================
38154--- /dev/null 1970-01-01 00:00:00.000000000 +0000
38155+++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level.h 2008-04-02 12:34:02.000000000 +0200
38156@@ -0,0 +1,211 @@
38157+#ifndef _I386_PGTABLE_3LEVEL_H
38158+#define _I386_PGTABLE_3LEVEL_H
38159+
38160+#include <asm-generic/pgtable-nopud.h>
38161+
38162+/*
38163+ * Intel Physical Address Extension (PAE) Mode - three-level page
38164+ * tables on PPro+ CPUs.
38165+ *
38166+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
38167+ */
38168+
38169+#define pte_ERROR(e) \
38170+ printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", __FILE__, __LINE__, \
38171+ &(e), __pte_val(e), pte_pfn(e))
38172+#define pmd_ERROR(e) \
38173+ printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
38174+ &(e), __pmd_val(e), (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
38175+#define pgd_ERROR(e) \
38176+ printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
38177+ &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
38178+
38179+#define pud_none(pud) 0
38180+#define pud_bad(pud) 0
38181+#define pud_present(pud) 1
38182+
38183+/*
38184+ * Is the pte executable?
38185+ */
38186+static inline int pte_x(pte_t pte)
38187+{
38188+ return !(__pte_val(pte) & _PAGE_NX);
38189+}
38190+
38191+/*
38192+ * All present user-pages with !NX bit are user-executable:
38193+ */
38194+static inline int pte_exec(pte_t pte)
38195+{
38196+ return pte_user(pte) && pte_x(pte);
38197+}
38198+/*
38199+ * All present pages with !NX bit are kernel-executable:
38200+ */
38201+static inline int pte_exec_kernel(pte_t pte)
38202+{
38203+ return pte_x(pte);
38204+}
38205+
38206+/* Rules for using set_pte: the pte being assigned *must* be
38207+ * either not present or in a state where the hardware will
38208+ * not attempt to update the pte. In places where this is
38209+ * not possible, use pte_get_and_clear to obtain the old pte
38210+ * value and then use set_pte to update it. -ben
38211+ */
38212+#define __HAVE_ARCH_SET_PTE_ATOMIC
38213+
38214+static inline void set_pte(pte_t *ptep, pte_t pte)
38215+{
38216+ ptep->pte_high = pte.pte_high;
38217+ smp_wmb();
38218+ ptep->pte_low = pte.pte_low;
38219+}
38220+#define set_pte_atomic(pteptr,pteval) \
38221+ set_64bit((unsigned long long *)(pteptr),__pte_val(pteval))
38222+
38223+#define set_pte_at(_mm,addr,ptep,pteval) do { \
38224+ if (((_mm) != current->mm && (_mm) != &init_mm) || \
38225+ HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
38226+ set_pte((ptep), (pteval)); \
38227+} while (0)
38228+
38229+#define set_pte_at_sync(_mm,addr,ptep,pteval) do { \
38230+ if (((_mm) != current->mm && (_mm) != &init_mm) || \
38231+ HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
38232+ set_pte((ptep), (pteval)); \
38233+ xen_invlpg((addr)); \
38234+ } \
38235+} while (0)
38236+
38237+#define set_pmd(pmdptr,pmdval) \
38238+ xen_l2_entry_update((pmdptr), (pmdval))
38239+#define set_pud(pudptr,pudval) \
38240+ xen_l3_entry_update((pudptr), (pudval))
38241+
38242+/*
38243+ * Pentium-II erratum A13: in PAE mode we explicitly have to flush
38244+ * the TLB via cr3 if the top-level pgd is changed...
38245+ * We do not let the generic code free and clear pgd entries due to
38246+ * this erratum.
38247+ */
38248+static inline void pud_clear (pud_t * pud) { }
38249+
38250+#define pud_page(pud) \
38251+((struct page *) __va(pud_val(pud) & PAGE_MASK))
38252+
38253+#define pud_page_kernel(pud) \
38254+((unsigned long) __va(pud_val(pud) & PAGE_MASK))
38255+
38256+
38257+/* Find an entry in the second-level page table.. */
38258+#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
38259+ pmd_index(address))
38260+
38261+static inline int pte_none(pte_t pte)
38262+{
38263+ return !(pte.pte_low | pte.pte_high);
38264+}
38265+
38266+/*
38267+ * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
38268+ * entry, so clear the bottom half first and enforce ordering with a compiler
38269+ * barrier.
38270+ */
38271+static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
38272+{
38273+ if ((mm != current->mm && mm != &init_mm)
38274+ || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
38275+ ptep->pte_low = 0;
38276+ smp_wmb();
38277+ ptep->pte_high = 0;
38278+ }
38279+}
38280+
38281+#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
38282+
38283+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
38284+{
38285+ pte_t pte = *ptep;
38286+ if (!pte_none(pte)) {
38287+ if ((mm != &init_mm) ||
38288+ HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
38289+ uint64_t val = __pte_val(pte);
38290+ if (__cmpxchg64(ptep, val, 0) != val) {
38291+ /* xchg acts as a barrier before the setting of the high bits */
38292+ pte.pte_low = xchg(&ptep->pte_low, 0);
38293+ pte.pte_high = ptep->pte_high;
38294+ ptep->pte_high = 0;
38295+ }
38296+ }
38297+ }
38298+ return pte;
38299+}
38300+
38301+#define ptep_clear_flush(vma, addr, ptep) \
38302+({ \
38303+ pte_t *__ptep = (ptep); \
38304+ pte_t __res = *__ptep; \
38305+ if (!pte_none(__res) && \
38306+ ((vma)->vm_mm != current->mm || \
38307+ HYPERVISOR_update_va_mapping(addr, __pte(0), \
38308+ (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
38309+ UVMF_INVLPG|UVMF_MULTI))) { \
38310+ __ptep->pte_low = 0; \
38311+ smp_wmb(); \
38312+ __ptep->pte_high = 0; \
38313+ flush_tlb_page(vma, addr); \
38314+ } \
38315+ __res; \
38316+})
38317+
38318+static inline int pte_same(pte_t a, pte_t b)
38319+{
38320+ return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
38321+}
38322+
38323+#define pte_page(x) pfn_to_page(pte_pfn(x))
38324+
38325+#define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \
38326+ ((_pte).pte_high << (32-PAGE_SHIFT)))
38327+#define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \
38328+ __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
38329+#define pte_pfn(_pte) ((_pte).pte_low & _PAGE_IO ? max_mapnr : \
38330+ (_pte).pte_low & _PAGE_PRESENT ? \
38331+ mfn_to_local_pfn(__pte_mfn(_pte)) : \
38332+ __pte_mfn(_pte))
38333+
38334+extern unsigned long long __supported_pte_mask;
38335+
38336+static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
38337+{
38338+ return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
38339+ pgprot_val(pgprot)) & __supported_pte_mask);
38340+}
38341+
38342+static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
38343+{
38344+ return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
38345+ pgprot_val(pgprot)) & __supported_pte_mask);
38346+}
38347+
38348+/*
38349+ * Bits 0, 6 and 7 are taken in the low part of the pte,
38350+ * put the 32 bits of offset into the high part.
38351+ */
38352+#define pte_to_pgoff(pte) ((pte).pte_high)
38353+#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
38354+#define PTE_FILE_MAX_BITS 32
38355+
38356+/* Encode and de-code a swap entry */
38357+#define __swp_type(x) (((x).val) & 0x1f)
38358+#define __swp_offset(x) ((x).val >> 5)
38359+#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
38360+#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
38361+#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val })
38362+
38363+#define __pmd_free_tlb(tlb, x) do { } while (0)
38364+
38365+void vmalloc_sync_all(void);
38366+
38367+#endif /* _I386_PGTABLE_3LEVEL_H */
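The set_pte()/pte_clear() rules above boil down to an ordering contract for the two-word PAE entry: write the high word before making the entry present, and drop the present (low) word first when clearing. The sketch below models just that ordering in plain C, with a compiler barrier standing in for smp_wmb() and hypothetical helper names:

#include <stdio.h>

/* Model of the two-word PAE pte update order above: on set, write the high
 * word first and make the entry "present" (low word) last; on clear, drop
 * the present bit (low word) first.  barrier() stands in for smp_wmb(). */
typedef struct { volatile unsigned int pte_low, pte_high; } pae_pte_t;

#define barrier() __asm__ __volatile__("" ::: "memory")

static void set_pte(pae_pte_t *ptep, unsigned int low, unsigned int high)
{
	ptep->pte_high = high;
	barrier();		/* high half visible before the present bit */
	ptep->pte_low = low;
}

static void clear_pte(pae_pte_t *ptep)
{
	ptep->pte_low = 0;	/* present bit goes away first */
	barrier();
	ptep->pte_high = 0;
}

int main(void)
{
	pae_pte_t pte = { 0, 0 };

	set_pte(&pte, 0x1063u, 0x7u);
	printf("low=%#x high=%#x\n", pte.pte_low, pte.pte_high);
	clear_pte(&pte);
	printf("low=%#x high=%#x\n", pte.pte_low, pte.pte_high);
	return 0;
}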
38368Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_32.h
38369===================================================================
38370--- /dev/null 1970-01-01 00:00:00.000000000 +0000
38371+++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_32.h 2008-07-21 11:00:33.000000000 +0200
38372@@ -0,0 +1,537 @@
38373+#ifndef _I386_PGTABLE_H
38374+#define _I386_PGTABLE_H
38375+
38376+#include <asm/hypervisor.h>
38377+
38378+/*
38379+ * The Linux memory management assumes a three-level page table setup. On
38380+ * the i386, we use that, but "fold" the mid level into the top-level page
38381+ * table, so that we physically have the same two-level page table as the
38382+ * i386 mmu expects.
38383+ *
38384+ * This file contains the functions and defines necessary to modify and use
38385+ * the i386 page table tree.
38386+ */
38387+#ifndef __ASSEMBLY__
38388+#include <asm/processor.h>
38389+#include <asm/fixmap.h>
38390+#include <linux/threads.h>
38391+
38392+#ifndef _I386_BITOPS_H
38393+#include <asm/bitops.h>
38394+#endif
38395+
38396+#include <linux/slab.h>
38397+#include <linux/list.h>
38398+#include <linux/spinlock.h>
38399+
38400+/* Is this pagetable pinned? */
38401+#define PG_pinned PG_arch_1
38402+
38403+struct mm_struct;
38404+struct vm_area_struct;
38405+
38406+/*
38407+ * ZERO_PAGE is a global shared page that is always zero: used
38408+ * for zero-mapped memory areas etc..
38409+ */
38410+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
38411+extern unsigned long empty_zero_page[1024];
38412+extern pgd_t *swapper_pg_dir;
38413+extern kmem_cache_t *pgd_cache;
38414+extern kmem_cache_t *pmd_cache;
38415+extern spinlock_t pgd_lock;
38416+extern struct page *pgd_list;
38417+
38418+void pmd_ctor(void *, kmem_cache_t *, unsigned long);
38419+void pgd_ctor(void *, kmem_cache_t *, unsigned long);
38420+void pgd_dtor(void *, kmem_cache_t *, unsigned long);
38421+void pgtable_cache_init(void);
38422+void paging_init(void);
38423+
38424+/*
38425+ * The Linux x86 paging architecture is 'compile-time dual-mode', it
38426+ * implements both the traditional 2-level x86 page tables and the
38427+ * newer 3-level PAE-mode page tables.
38428+ */
38429+#ifdef CONFIG_X86_PAE
38430+# include <asm/pgtable-3level-defs.h>
38431+# define PMD_SIZE (1UL << PMD_SHIFT)
38432+# define PMD_MASK (~(PMD_SIZE-1))
38433+#else
38434+# include <asm/pgtable-2level-defs.h>
38435+#endif
38436+
38437+#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
38438+#define PGDIR_MASK (~(PGDIR_SIZE-1))
38439+
38440+#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
38441+#define FIRST_USER_ADDRESS 0
38442+
38443+#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
38444+#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
38445+
38446+#define TWOLEVEL_PGDIR_SHIFT 22
38447+#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
38448+#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
38449+
38450+/* Just any arbitrary offset to the start of the vmalloc VM area: the
38451+ * current 8MB value just means that there will be an 8MB "hole" after the
38452+ * physical memory until the kernel virtual memory starts. That means that
38453+ * any out-of-bounds memory accesses will hopefully be caught.
38454+ * The vmalloc() routines leave a hole of 4kB between each vmalloced
38455+ * area for the same reason. ;)
38456+ */
38457+#define VMALLOC_OFFSET (8*1024*1024)
38458+#define VMALLOC_START (((unsigned long) high_memory + vmalloc_earlyreserve + \
38459+ 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
38460+#ifdef CONFIG_HIGHMEM
38461+# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
38462+#else
38463+# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
38464+#endif
38465+
38466+/*
38467+ * _PAGE_PSE set in the page directory entry just means that
38468+ * the page directory entry points directly to a 4MB-aligned block of
38469+ * memory.
38470+ */
38471+#define _PAGE_BIT_PRESENT 0
38472+#define _PAGE_BIT_RW 1
38473+#define _PAGE_BIT_USER 2
38474+#define _PAGE_BIT_PWT 3
38475+#define _PAGE_BIT_PCD 4
38476+#define _PAGE_BIT_ACCESSED 5
38477+#define _PAGE_BIT_DIRTY 6
38478+#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */
38479+#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
38480+/*#define _PAGE_BIT_UNUSED1 9*/ /* available for programmer */
38481+#define _PAGE_BIT_UNUSED2 10
38482+#define _PAGE_BIT_UNUSED3 11
38483+#define _PAGE_BIT_NX 63
38484+
38485+#define _PAGE_PRESENT 0x001
38486+#define _PAGE_RW 0x002
38487+#define _PAGE_USER 0x004
38488+#define _PAGE_PWT 0x008
38489+#define _PAGE_PCD 0x010
38490+#define _PAGE_ACCESSED 0x020
38491+#define _PAGE_DIRTY 0x040
38492+#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. */
38493+#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */
38494+/*#define _PAGE_UNUSED1 0x200*/ /* available for programmer */
38495+#define _PAGE_UNUSED2 0x400
38496+#define _PAGE_UNUSED3 0x800
38497+
38498+/* If _PAGE_PRESENT is clear, we use these: */
38499+#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
38500+#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE;
38501+ pte_present gives true */
38502+#ifdef CONFIG_X86_PAE
38503+#define _PAGE_NX (1ULL<<_PAGE_BIT_NX)
38504+#else
38505+#define _PAGE_NX 0
38506+#endif
38507+
38508+/* Mapped page is I/O or foreign and has no associated page struct. */
38509+#define _PAGE_IO 0x200
38510+
38511+#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
38512+#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
38513+#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
38514+
38515+#define PAGE_NONE \
38516+ __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
38517+#define PAGE_SHARED \
38518+ __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
38519+
38520+#define PAGE_SHARED_EXEC \
38521+ __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
38522+#define PAGE_COPY_NOEXEC \
38523+ __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
38524+#define PAGE_COPY_EXEC \
38525+ __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
38526+#define PAGE_COPY \
38527+ PAGE_COPY_NOEXEC
38528+#define PAGE_READONLY \
38529+ __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
38530+#define PAGE_READONLY_EXEC \
38531+ __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
38532+
38533+#define _PAGE_KERNEL \
38534+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
38535+#define _PAGE_KERNEL_EXEC \
38536+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
38537+
38538+extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
38539+#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
38540+#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
38541+#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
38542+#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
38543+
38544+#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
38545+#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
38546+#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
38547+#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
38548+#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
38549+#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
38550+
38551+/*
38552+ * The i386 can't do page protection for execute, and treats it the
38553+ * same as read. Also, write permissions imply read permissions.
38554+ * This is the closest we can get..
38555+ */
38556+#define __P000 PAGE_NONE
38557+#define __P001 PAGE_READONLY
38558+#define __P010 PAGE_COPY
38559+#define __P011 PAGE_COPY
38560+#define __P100 PAGE_READONLY_EXEC
38561+#define __P101 PAGE_READONLY_EXEC
38562+#define __P110 PAGE_COPY_EXEC
38563+#define __P111 PAGE_COPY_EXEC
38564+
38565+#define __S000 PAGE_NONE
38566+#define __S001 PAGE_READONLY
38567+#define __S010 PAGE_SHARED
38568+#define __S011 PAGE_SHARED
38569+#define __S100 PAGE_READONLY_EXEC
38570+#define __S101 PAGE_READONLY_EXEC
38571+#define __S110 PAGE_SHARED_EXEC
38572+#define __S111 PAGE_SHARED_EXEC
38573+
38574+/*
38575+ * Define this if things work differently on an i386 and an i486:
38576+ * it will (on an i486) warn about kernel memory accesses that are
38577+ * done without an 'access_ok(VERIFY_WRITE,..)'
38578+ */
38579+#undef TEST_ACCESS_OK
38580+
38581+/* The boot page tables (all created as a single array) */
38582+extern unsigned long pg0[];
38583+
38584+#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
38585+
38586+/* To avoid harmful races, pmd_none(x) should check only the lower half when PAE */
38587+#define pmd_none(x) (!(unsigned long)__pmd_val(x))
38588+#if CONFIG_XEN_COMPAT <= 0x030002
38589+/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
38590+ can temporarily clear it. */
38591+#define pmd_present(x) (__pmd_val(x))
38592+#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
38593+#else
38594+#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
38595+#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
38596+#endif
38597+
38598+
38599+#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
38600+
38601+/*
38602+ * The following only work if pte_present() is true.
38603+ * Undefined behaviour if not..
38604+ */
38605+static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
38606+static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
38607+static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; }
38608+static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; }
38609+static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; }
38610+static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; }
38611+
38612+/*
38613+ * The following only works if pte_present() is not true.
38614+ */
38615+static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; }
38616+
38617+static inline pte_t pte_rdprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; }
38618+static inline pte_t pte_exprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; }
38619+static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
38620+static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
38621+static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; }
38622+static inline pte_t pte_mkread(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; }
38623+static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; }
38624+static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; }
38625+static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
38626+static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
38627+static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; }
38628+
38629+#ifdef CONFIG_X86_PAE
38630+# include <asm/pgtable-3level.h>
38631+#else
38632+# include <asm/pgtable-2level.h>
38633+#endif
38634+
38635+#define ptep_test_and_clear_dirty(vma, addr, ptep) \
38636+({ \
38637+ pte_t __pte = *(ptep); \
38638+ int __ret = pte_dirty(__pte); \
38639+ if (__ret) { \
38640+ __pte = pte_mkclean(__pte); \
38641+ if ((vma)->vm_mm != current->mm || \
38642+ HYPERVISOR_update_va_mapping(addr, __pte, 0)) \
38643+ (ptep)->pte_low = __pte.pte_low; \
38644+ } \
38645+ __ret; \
38646+})
38647+
38648+#define ptep_test_and_clear_young(vma, addr, ptep) \
38649+({ \
38650+ pte_t __pte = *(ptep); \
38651+ int __ret = pte_young(__pte); \
38652+ if (__ret) \
38653+ __pte = pte_mkold(__pte); \
38654+ if ((vma)->vm_mm != current->mm || \
38655+ HYPERVISOR_update_va_mapping(addr, __pte, 0)) \
38656+ (ptep)->pte_low = __pte.pte_low; \
38657+ __ret; \
38658+})
38659+
38660+#define ptep_get_and_clear_full(mm, addr, ptep, full) \
38661+ ((full) ? ({ \
38662+ pte_t __res = *(ptep); \
38663+ if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) \
38664+ xen_l1_entry_update(ptep, __pte(0)); \
38665+ else \
38666+ *(ptep) = __pte(0); \
38667+ __res; \
38668+ }) : \
38669+ ptep_get_and_clear(mm, addr, ptep))
38670+
38671+static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
38672+{
38673+ pte_t pte = *ptep;
38674+ if (pte_write(pte))
38675+ set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
38676+}
38677+
38678+/*
38679+ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
38680+ *
38681+ * dst - pointer to pgd range anywhere on a pgd page
38682+ * src - ""
38683+ * count - the number of pgds to copy.
38684+ *
38685+ * dst and src can be on the same page, but the range must not overlap,
38686+ * and must not cross a page boundary.
38687+ */
38688+static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
38689+{
38690+ memcpy(dst, src, count * sizeof(pgd_t));
38691+}
38692+
38693+/*
38694+ * Macro to mark a page protection value as "uncacheable". On processors which do not support
38695+ * it, this is a no-op.
38696+ */
38697+#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \
38698+ ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))
38699+
38700+/*
38701+ * Conversion functions: convert a page and protection to a page entry,
38702+ * and a page entry and page directory to the page they refer to.
38703+ */
38704+
38705+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
38706+
38707+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
38708+{
38709+ /*
38710+ * Since this might change the present bit (which controls whether
38711+ * a pte_t object has undergone p2m translation), we must use
38712+ * pte_val() on the input pte and __pte() for the return value.
38713+ */
38714+ paddr_t pteval = pte_val(pte);
38715+
38716+ pteval &= _PAGE_CHG_MASK;
38717+ pteval |= pgprot_val(newprot);
38718+#ifdef CONFIG_X86_PAE
38719+ pteval &= __supported_pte_mask;
38720+#endif
38721+ return __pte(pteval);
38722+}
38723+
38724+#define pmd_large(pmd) \
38725+((__pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
38726+
38727+/*
38728+ * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
38729+ *
38730+ * this macro returns the index of the entry in the pgd page which would
38731+ * control the given virtual address
38732+ */
38733+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
38734+#define pgd_index_k(addr) pgd_index(addr)
38735+
38736+/*
38737+ * pgd_offset() returns a (pgd_t *)
38738+ * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
38739+ */
38740+#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
38741+
38742+/*
38743+ * a shortcut which implies the use of the kernel's pgd, instead
38744+ * of a process's
38745+ */
38746+#define pgd_offset_k(address) pgd_offset(&init_mm, address)
38747+
38748+/*
38749+ * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
38750+ *
38751+ * this macro returns the index of the entry in the pmd page which would
38752+ * control the given virtual address
38753+ */
38754+#define pmd_index(address) \
38755+ (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
38756+
38757+/*
38758+ * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
38759+ *
38760+ * this macro returns the index of the entry in the pte page which would
38761+ * control the given virtual address
38762+ */
38763+#define pte_index(address) \
38764+ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
38765+#define pte_offset_kernel(dir, address) \
38766+ ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address))
38767+
38768+#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
38769+
38770+#define pmd_page_kernel(pmd) \
38771+ ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
38772+
38773+/*
38774+ * Helper function that returns the kernel pagetable entry controlling
38775+ * the virtual address 'address'. NULL means no pagetable entry present.
38776+ * NOTE: the return type is pte_t but if the pmd is PSE then we return it
38777+ * as a pte too.
38778+ */
38779+extern pte_t *lookup_address(unsigned long address);
38780+
38781+/*
38782+ * Make a given kernel text page executable/non-executable.
38783+ * Returns the previous executability setting of that page (which
38784+ * is used to restore the previous state). Used by the SMP bootup code.
38785+ * NOTE: this is an __init function for security reasons.
38786+ */
38787+#ifdef CONFIG_X86_PAE
38788+ extern int set_kernel_exec(unsigned long vaddr, int enable);
38789+#else
38790+ static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
38791+#endif
38792+
38793+extern void noexec_setup(const char *str);
38794+
38795+#if defined(CONFIG_HIGHPTE)
38796+#define pte_offset_map(dir, address) \
38797+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \
38798+ pte_index(address))
38799+#define pte_offset_map_nested(dir, address) \
38800+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + \
38801+ pte_index(address))
38802+#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
38803+#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
38804+#else
38805+#define pte_offset_map(dir, address) \
38806+ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
38807+#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
38808+#define pte_unmap(pte) do { } while (0)
38809+#define pte_unmap_nested(pte) do { } while (0)
38810+#endif
38811+
38812+#define __HAVE_ARCH_PTEP_ESTABLISH
38813+#define ptep_establish(vma, address, ptep, pteval) \
38814+ do { \
38815+ if ( likely((vma)->vm_mm == current->mm) ) { \
38816+ BUG_ON(HYPERVISOR_update_va_mapping(address, \
38817+ pteval, \
38818+ (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
38819+ UVMF_INVLPG|UVMF_MULTI)); \
38820+ } else { \
38821+ xen_l1_entry_update(ptep, pteval); \
38822+ flush_tlb_page(vma, address); \
38823+ } \
38824+ } while (0)
38825+
38826+/*
38827+ * The i386 doesn't have any external MMU info: the kernel page
38828+ * tables contain all the necessary information.
38829+ *
38830+ * Also, we only update the dirty/accessed state if we set
38831+ * the dirty bit by hand in the kernel, since the hardware
38832+ * will do the accessed bit for us, and we don't want to
38833+ * race with other CPUs that might be updating the dirty
38834+ * bit at the same time.
38835+ */
38836+#define update_mmu_cache(vma,address,pte) do { } while (0)
38837+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
38838+#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
38839+ do { \
38840+ if (dirty) \
38841+ ptep_establish(vma, address, ptep, entry); \
38842+ } while (0)
38843+
38844+#include <xen/features.h>
38845+void make_lowmem_page_readonly(void *va, unsigned int feature);
38846+void make_lowmem_page_writable(void *va, unsigned int feature);
38847+void make_page_readonly(void *va, unsigned int feature);
38848+void make_page_writable(void *va, unsigned int feature);
38849+void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
38850+void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
38851+
38852+#define virt_to_ptep(va) \
38853+({ \
38854+ pte_t *__ptep = lookup_address((unsigned long)(va)); \
38855+ BUG_ON(!__ptep || !pte_present(*__ptep)); \
38856+ __ptep; \
38857+})
38858+
38859+#define arbitrary_virt_to_machine(va) \
38860+ (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
38861+ | ((unsigned long)(va) & (PAGE_SIZE - 1)))
38862+
38863+#endif /* !__ASSEMBLY__ */
38864+
38865+#ifdef CONFIG_FLATMEM
38866+#define kern_addr_valid(addr) (1)
38867+#endif /* CONFIG_FLATMEM */
38868+
38869+int direct_remap_pfn_range(struct vm_area_struct *vma,
38870+ unsigned long address,
38871+ unsigned long mfn,
38872+ unsigned long size,
38873+ pgprot_t prot,
38874+ domid_t domid);
38875+int direct_kernel_remap_pfn_range(unsigned long address,
38876+ unsigned long mfn,
38877+ unsigned long size,
38878+ pgprot_t prot,
38879+ domid_t domid);
38880+int create_lookup_pte_addr(struct mm_struct *mm,
38881+ unsigned long address,
38882+ uint64_t *ptep);
38883+int touch_pte_range(struct mm_struct *mm,
38884+ unsigned long address,
38885+ unsigned long size);
38886+
38887+int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
38888+ unsigned long addr, unsigned long end, pgprot_t newprot);
38889+
38890+#define arch_change_pte_range(mm, pmd, addr, end, newprot) \
38891+ xen_change_pte_range(mm, pmd, addr, end, newprot)
38892+
38893+#define io_remap_pfn_range(vma,from,pfn,size,prot) \
38894+direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
38895+
38896+#define MK_IOSPACE_PFN(space, pfn) (pfn)
38897+#define GET_IOSPACE(pfn) 0
38898+#define GET_PFN(pfn) (pfn)
38899+
38900+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
38901+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
38902+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
38903+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
38904+#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
38905+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
38906+#define __HAVE_ARCH_PTE_SAME
38907+#include <asm-generic/pgtable.h>
38908+
38909+#endif /* _I386_PGTABLE_H */
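
The pte helpers in the header above are plain bit manipulation on the low word of the pte. The stand-alone sketch below (hypothetical xpte_t type and X_PAGE_* constants, not the kernel's own definitions) illustrates the same accessor/modifier idiom in user space.

/* Stand-alone sketch (not kernel code): mimics the pte_low flag helpers
 * above using hypothetical types, to show the accessor pattern. */
#include <stdio.h>

#define X_PAGE_PRESENT  0x001
#define X_PAGE_RW       0x002
#define X_PAGE_DIRTY    0x040

typedef struct { unsigned long pte_low; } xpte_t;

static int xpte_write(xpte_t pte)        { return pte.pte_low & X_PAGE_RW; }
static xpte_t xpte_wrprotect(xpte_t pte) { pte.pte_low &= ~X_PAGE_RW; return pte; }
static xpte_t xpte_mkdirty(xpte_t pte)   { pte.pte_low |= X_PAGE_DIRTY; return pte; }

int main(void)
{
	xpte_t pte = { X_PAGE_PRESENT | X_PAGE_RW };

	pte = xpte_mkdirty(pte);      /* set the dirty bit */
	pte = xpte_wrprotect(pte);    /* clear the RW bit */
	printf("writable=%d raw=%#lx\n", xpte_write(pte), pte.pte_low);
	return 0;
}
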
38910Index: head-2008-11-25/include/asm-x86/mach-xen/asm/processor_32.h
38911===================================================================
38912--- /dev/null 1970-01-01 00:00:00.000000000 +0000
38913+++ head-2008-11-25/include/asm-x86/mach-xen/asm/processor_32.h 2008-01-28 12:24:19.000000000 +0100
38914@@ -0,0 +1,743 @@
38915+/*
38916+ * include/asm-i386/processor.h
38917+ *
38918+ * Copyright (C) 1994 Linus Torvalds
38919+ */
38920+
38921+#ifndef __ASM_I386_PROCESSOR_H
38922+#define __ASM_I386_PROCESSOR_H
38923+
38924+#include <asm/vm86.h>
38925+#include <asm/math_emu.h>
38926+#include <asm/segment.h>
38927+#include <asm/page.h>
38928+#include <asm/types.h>
38929+#include <asm/sigcontext.h>
38930+#include <asm/cpufeature.h>
38931+#include <asm/msr.h>
38932+#include <asm/system.h>
38933+#include <linux/cache.h>
38934+#include <linux/threads.h>
38935+#include <asm/percpu.h>
38936+#include <linux/cpumask.h>
38937+#include <xen/interface/physdev.h>
38938+
38939+/* flag for disabling the tsc */
38940+extern int tsc_disable;
38941+
38942+struct desc_struct {
38943+ unsigned long a,b;
38944+};
38945+
38946+#define desc_empty(desc) \
38947+ (!((desc)->a | (desc)->b))
38948+
38949+#define desc_equal(desc1, desc2) \
38950+ (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
38951+/*
38952+ * Default implementation of macro that returns current
38953+ * instruction pointer ("program counter").
38954+ */
38955+#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
38956+
38957+/*
38958+ * CPU type and hardware bug flags. Kept separately for each CPU.
38959+ * Members of this structure are referenced in head.S, so think twice
38960+ * before touching them. [mj]
38961+ */
38962+
38963+struct cpuinfo_x86 {
38964+ __u8 x86; /* CPU family */
38965+ __u8 x86_vendor; /* CPU vendor */
38966+ __u8 x86_model;
38967+ __u8 x86_mask;
38968+ char wp_works_ok; /* It doesn't on 386's */
38969+ char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
38970+ char hard_math;
38971+ char rfu;
38972+ int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
38973+ unsigned long x86_capability[NCAPINTS];
38974+ char x86_vendor_id[16];
38975+ char x86_model_id[64];
38976+ int x86_cache_size; /* in KB - valid for CPUS which support this
38977+ call */
38978+ int x86_cache_alignment; /* In bytes */
38979+ char fdiv_bug;
38980+ char f00f_bug;
38981+ char coma_bug;
38982+ char pad0;
38983+ int x86_power;
38984+ unsigned long loops_per_jiffy;
38985+#ifdef CONFIG_SMP
38986+ cpumask_t llc_shared_map; /* cpus sharing the last level cache */
38987+#endif
38988+ unsigned char x86_max_cores; /* cpuid returned max cores value */
38989+ unsigned char apicid;
38990+#ifdef CONFIG_SMP
38991+ unsigned char booted_cores; /* number of cores as seen by OS */
38992+ __u8 phys_proc_id; /* Physical processor id. */
38993+ __u8 cpu_core_id; /* Core id */
38994+#endif
38995+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
38996+
38997+#define X86_VENDOR_INTEL 0
38998+#define X86_VENDOR_CYRIX 1
38999+#define X86_VENDOR_AMD 2
39000+#define X86_VENDOR_UMC 3
39001+#define X86_VENDOR_NEXGEN 4
39002+#define X86_VENDOR_CENTAUR 5
39003+#define X86_VENDOR_RISE 6
39004+#define X86_VENDOR_TRANSMETA 7
39005+#define X86_VENDOR_NSC 8
39006+#define X86_VENDOR_NUM 9
39007+#define X86_VENDOR_UNKNOWN 0xff
39008+
39009+/*
39010+ * capabilities of CPUs
39011+ */
39012+
39013+extern struct cpuinfo_x86 boot_cpu_data;
39014+extern struct cpuinfo_x86 new_cpu_data;
39015+#ifndef CONFIG_X86_NO_TSS
39016+extern struct tss_struct doublefault_tss;
39017+DECLARE_PER_CPU(struct tss_struct, init_tss);
39018+#endif
39019+
39020+#ifdef CONFIG_SMP
39021+extern struct cpuinfo_x86 cpu_data[];
39022+#define current_cpu_data cpu_data[smp_processor_id()]
39023+#else
39024+#define cpu_data (&boot_cpu_data)
39025+#define current_cpu_data boot_cpu_data
39026+#endif
39027+
39028+extern int cpu_llc_id[NR_CPUS];
39029+extern char ignore_fpu_irq;
39030+
39031+extern void identify_cpu(struct cpuinfo_x86 *);
39032+extern void print_cpu_info(struct cpuinfo_x86 *);
39033+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
39034+extern unsigned short num_cache_leaves;
39035+
39036+#ifdef CONFIG_X86_HT
39037+extern void detect_ht(struct cpuinfo_x86 *c);
39038+#else
39039+static inline void detect_ht(struct cpuinfo_x86 *c) {}
39040+#endif
39041+
39042+/*
39043+ * EFLAGS bits
39044+ */
39045+#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
39046+#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
39047+#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
39048+#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
39049+#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
39050+#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
39051+#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
39052+#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
39053+#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
39054+#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
39055+#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
39056+#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
39057+#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
39058+#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
39059+#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
39060+#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
39061+#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
39062+
39063+/*
39064+ * Generic CPUID function
39065+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
39066+ * resulting in stale register contents being returned.
39067+ */
39068+static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
39069+{
39070+ __asm__(XEN_CPUID
39071+ : "=a" (*eax),
39072+ "=b" (*ebx),
39073+ "=c" (*ecx),
39074+ "=d" (*edx)
39075+ : "0" (op), "c"(0));
39076+}
39077+
39078+/* Some CPUID calls want 'count' to be placed in ecx */
39079+static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
39080+ int *edx)
39081+{
39082+ __asm__(XEN_CPUID
39083+ : "=a" (*eax),
39084+ "=b" (*ebx),
39085+ "=c" (*ecx),
39086+ "=d" (*edx)
39087+ : "0" (op), "c" (count));
39088+}
39089+
39090+/*
39091+ * CPUID functions returning a single datum
39092+ */
39093+static inline unsigned int cpuid_eax(unsigned int op)
39094+{
39095+ unsigned int eax;
39096+
39097+ __asm__(XEN_CPUID
39098+ : "=a" (eax)
39099+ : "0" (op)
39100+ : "bx", "cx", "dx");
39101+ return eax;
39102+}
39103+static inline unsigned int cpuid_ebx(unsigned int op)
39104+{
39105+ unsigned int eax, ebx;
39106+
39107+ __asm__(XEN_CPUID
39108+ : "=a" (eax), "=b" (ebx)
39109+ : "0" (op)
39110+ : "cx", "dx" );
39111+ return ebx;
39112+}
39113+static inline unsigned int cpuid_ecx(unsigned int op)
39114+{
39115+ unsigned int eax, ecx;
39116+
39117+ __asm__(XEN_CPUID
39118+ : "=a" (eax), "=c" (ecx)
39119+ : "0" (op)
39120+ : "bx", "dx" );
39121+ return ecx;
39122+}
39123+static inline unsigned int cpuid_edx(unsigned int op)
39124+{
39125+ unsigned int eax, edx;
39126+
39127+ __asm__(XEN_CPUID
39128+ : "=a" (eax), "=d" (edx)
39129+ : "0" (op)
39130+ : "bx", "cx");
39131+ return edx;
39132+}
39133+
39134+#define load_cr3(pgdir) write_cr3(__pa(pgdir))
39135+
39136+/*
39137+ * Intel CPU features in CR4
39138+ */
39139+#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
39140+#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
39141+#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
39142+#define X86_CR4_DE 0x0008 /* enable debugging extensions */
39143+#define X86_CR4_PSE 0x0010 /* enable page size extensions */
39144+#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
39145+#define X86_CR4_MCE 0x0040 /* Machine check enable */
39146+#define X86_CR4_PGE 0x0080 /* enable global pages */
39147+#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
39148+#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
39149+#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
39150+
39151+/*
39152+ * Save the cr4 feature set we're using (i.e.
39153+ * Pentium 4MB enable and PPro Global page
39154+ * enable), so that any CPUs that boot up
39155+ * after us can get the correct flags.
39156+ */
39157+extern unsigned long mmu_cr4_features;
39158+
39159+static inline void set_in_cr4 (unsigned long mask)
39160+{
39161+ unsigned cr4;
39162+ mmu_cr4_features |= mask;
39163+ cr4 = read_cr4();
39164+ cr4 |= mask;
39165+ write_cr4(cr4);
39166+}
39167+
39168+static inline void clear_in_cr4 (unsigned long mask)
39169+{
39170+ unsigned cr4;
39171+ mmu_cr4_features &= ~mask;
39172+ cr4 = read_cr4();
39173+ cr4 &= ~mask;
39174+ write_cr4(cr4);
39175+}
39176+
39177+/*
39178+ * NSC/Cyrix CPU configuration register indexes
39179+ */
39180+
39181+#define CX86_PCR0 0x20
39182+#define CX86_GCR 0xb8
39183+#define CX86_CCR0 0xc0
39184+#define CX86_CCR1 0xc1
39185+#define CX86_CCR2 0xc2
39186+#define CX86_CCR3 0xc3
39187+#define CX86_CCR4 0xe8
39188+#define CX86_CCR5 0xe9
39189+#define CX86_CCR6 0xea
39190+#define CX86_CCR7 0xeb
39191+#define CX86_PCR1 0xf0
39192+#define CX86_DIR0 0xfe
39193+#define CX86_DIR1 0xff
39194+#define CX86_ARR_BASE 0xc4
39195+#define CX86_RCR_BASE 0xdc
39196+
39197+/*
39198+ * NSC/Cyrix CPU indexed register access macros
39199+ */
39200+
39201+#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
39202+
39203+#define setCx86(reg, data) do { \
39204+ outb((reg), 0x22); \
39205+ outb((data), 0x23); \
39206+} while (0)
39207+
39208+/* Stop speculative execution */
39209+static inline void sync_core(void)
39210+{
39211+ int tmp;
39212+ asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
39213+}
39214+
39215+static inline void __monitor(const void *eax, unsigned long ecx,
39216+ unsigned long edx)
39217+{
39218+ /* "monitor %eax,%ecx,%edx;" */
39219+ asm volatile(
39220+ ".byte 0x0f,0x01,0xc8;"
39221+ : :"a" (eax), "c" (ecx), "d"(edx));
39222+}
39223+
39224+static inline void __mwait(unsigned long eax, unsigned long ecx)
39225+{
39226+ /* "mwait %eax,%ecx;" */
39227+ asm volatile(
39228+ ".byte 0x0f,0x01,0xc9;"
39229+ : :"a" (eax), "c" (ecx));
39230+}
39231+
39232+/* from system description table in BIOS. Mostly for MCA use, but
39233+others may find it useful. */
39234+extern unsigned int machine_id;
39235+extern unsigned int machine_submodel_id;
39236+extern unsigned int BIOS_revision;
39237+extern unsigned int mca_pentium_flag;
39238+
39239+/* Boot loader type from the setup header */
39240+extern int bootloader_type;
39241+
39242+/*
39243+ * User space process size: 3GB (default).
39244+ */
39245+#define TASK_SIZE (PAGE_OFFSET)
39246+
39247+/* This decides where the kernel will search for a free chunk of vm
39248+ * space during mmap's.
39249+ */
39250+#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
39251+
39252+#define HAVE_ARCH_PICK_MMAP_LAYOUT
39253+
39254+/*
39255+ * Size of io_bitmap.
39256+ */
39257+#define IO_BITMAP_BITS 65536
39258+#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
39259+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
39260+#ifndef CONFIG_X86_NO_TSS
39261+#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
39262+#endif
39263+#define INVALID_IO_BITMAP_OFFSET 0x8000
39264+#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
39265+
39266+struct i387_fsave_struct {
39267+ long cwd;
39268+ long swd;
39269+ long twd;
39270+ long fip;
39271+ long fcs;
39272+ long foo;
39273+ long fos;
39274+ long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
39275+ long status; /* software status information */
39276+};
39277+
39278+struct i387_fxsave_struct {
39279+ unsigned short cwd;
39280+ unsigned short swd;
39281+ unsigned short twd;
39282+ unsigned short fop;
39283+ long fip;
39284+ long fcs;
39285+ long foo;
39286+ long fos;
39287+ long mxcsr;
39288+ long mxcsr_mask;
39289+ long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
39290+ long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
39291+ long padding[56];
39292+} __attribute__ ((aligned (16)));
39293+
39294+struct i387_soft_struct {
39295+ long cwd;
39296+ long swd;
39297+ long twd;
39298+ long fip;
39299+ long fcs;
39300+ long foo;
39301+ long fos;
39302+ long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
39303+ unsigned char ftop, changed, lookahead, no_update, rm, alimit;
39304+ struct info *info;
39305+ unsigned long entry_eip;
39306+};
39307+
39308+union i387_union {
39309+ struct i387_fsave_struct fsave;
39310+ struct i387_fxsave_struct fxsave;
39311+ struct i387_soft_struct soft;
39312+};
39313+
39314+typedef struct {
39315+ unsigned long seg;
39316+} mm_segment_t;
39317+
39318+struct thread_struct;
39319+
39320+#ifndef CONFIG_X86_NO_TSS
39321+struct tss_struct {
39322+ unsigned short back_link,__blh;
39323+ unsigned long esp0;
39324+ unsigned short ss0,__ss0h;
39325+ unsigned long esp1;
39326+ unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
39327+ unsigned long esp2;
39328+ unsigned short ss2,__ss2h;
39329+ unsigned long __cr3;
39330+ unsigned long eip;
39331+ unsigned long eflags;
39332+ unsigned long eax,ecx,edx,ebx;
39333+ unsigned long esp;
39334+ unsigned long ebp;
39335+ unsigned long esi;
39336+ unsigned long edi;
39337+ unsigned short es, __esh;
39338+ unsigned short cs, __csh;
39339+ unsigned short ss, __ssh;
39340+ unsigned short ds, __dsh;
39341+ unsigned short fs, __fsh;
39342+ unsigned short gs, __gsh;
39343+ unsigned short ldt, __ldth;
39344+ unsigned short trace, io_bitmap_base;
39345+ /*
39346+ * The extra 1 is there because the CPU will access an
39347+ * additional byte beyond the end of the IO permission
39348+ * bitmap. The extra byte must be all 1 bits, and must
39349+ * be within the limit.
39350+ */
39351+ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
39352+ /*
39353+ * Cache the current maximum and the last task that used the bitmap:
39354+ */
39355+ unsigned long io_bitmap_max;
39356+ struct thread_struct *io_bitmap_owner;
39357+ /*
39358+ * pads the TSS to be cacheline-aligned (size is 0x100)
39359+ */
39360+ unsigned long __cacheline_filler[35];
39361+ /*
39362+ * .. and then another 0x100 bytes for emergency kernel stack
39363+ */
39364+ unsigned long stack[64];
39365+} __attribute__((packed));
39366+#endif
39367+
39368+#define ARCH_MIN_TASKALIGN 16
39369+
39370+struct thread_struct {
39371+/* cached TLS descriptors. */
39372+ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
39373+ unsigned long esp0;
39374+ unsigned long sysenter_cs;
39375+ unsigned long eip;
39376+ unsigned long esp;
39377+ unsigned long fs;
39378+ unsigned long gs;
39379+/* Hardware debugging registers */
39380+ unsigned long debugreg[8]; /* %%db0-7 debug registers */
39381+/* fault info */
39382+ unsigned long cr2, trap_no, error_code;
39383+/* floating point info */
39384+ union i387_union i387;
39385+/* virtual 86 mode info */
39386+ struct vm86_struct __user * vm86_info;
39387+ unsigned long screen_bitmap;
39388+ unsigned long v86flags, v86mask, saved_esp0;
39389+ unsigned int saved_fs, saved_gs;
39390+/* IO permissions */
39391+ unsigned long *io_bitmap_ptr;
39392+ unsigned long iopl;
39393+/* max allowed port in the bitmap, in bytes: */
39394+ unsigned long io_bitmap_max;
39395+};
39396+
39397+#define INIT_THREAD { \
39398+ .vm86_info = NULL, \
39399+ .sysenter_cs = __KERNEL_CS, \
39400+ .io_bitmap_ptr = NULL, \
39401+}
39402+
39403+#ifndef CONFIG_X86_NO_TSS
39404+/*
39405+ * Note that the .io_bitmap member must be extra-big. This is because
39406+ * the CPU will access an additional byte beyond the end of the IO
39407+ * permission bitmap. The extra byte must be all 1 bits, and must
39408+ * be within the limit.
39409+ */
39410+#define INIT_TSS { \
39411+ .esp0 = sizeof(init_stack) + (long)&init_stack, \
39412+ .ss0 = __KERNEL_DS, \
39413+ .ss1 = __KERNEL_CS, \
39414+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
39415+ .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
39416+}
39417+
39418+static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
39419+{
39420+ tss->esp0 = thread->esp0;
39421+ /* This can only happen when SEP is enabled, no need to test "SEP"arately */
39422+ if (unlikely(tss->ss1 != thread->sysenter_cs)) {
39423+ tss->ss1 = thread->sysenter_cs;
39424+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
39425+ }
39426+}
39427+#define load_esp0(tss, thread) \
39428+ __load_esp0(tss, thread)
39429+#else
39430+#define load_esp0(tss, thread) do { \
39431+ if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
39432+ BUG(); \
39433+} while (0)
39434+#endif
39435+
39436+#define start_thread(regs, new_eip, new_esp) do { \
39437+ __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \
39438+ set_fs(USER_DS); \
39439+ regs->xds = __USER_DS; \
39440+ regs->xes = __USER_DS; \
39441+ regs->xss = __USER_DS; \
39442+ regs->xcs = __USER_CS; \
39443+ regs->eip = new_eip; \
39444+ regs->esp = new_esp; \
39445+} while (0)
39446+
39447+/*
39448+ * These special macros can be used to get or set a debugging register
39449+ */
39450+#define get_debugreg(var, register) \
39451+ (var) = HYPERVISOR_get_debugreg((register))
39452+#define set_debugreg(value, register) \
39453+ WARN_ON(HYPERVISOR_set_debugreg((register), (value)))
39454+
39455+/*
39456+ * Set IOPL bits in EFLAGS from given mask
39457+ */
39458+static inline void set_iopl_mask(unsigned mask)
39459+{
39460+ struct physdev_set_iopl set_iopl;
39461+
39462+ /* Force the change at ring 0. */
39463+ set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
39464+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
39465+}
39466+
39467+/* Forward declaration, a strange C thing */
39468+struct task_struct;
39469+struct mm_struct;
39470+
39471+/* Free all resources held by a thread. */
39472+extern void release_thread(struct task_struct *);
39473+
39474+/* Prepare to copy thread state - unlazy all lazy status */
39475+extern void prepare_to_copy(struct task_struct *tsk);
39476+
39477+/*
39478+ * create a kernel thread without removing it from tasklists
39479+ */
39480+extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
39481+
39482+extern unsigned long thread_saved_pc(struct task_struct *tsk);
39483+void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack);
39484+
39485+unsigned long get_wchan(struct task_struct *p);
39486+
39487+#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
39488+#define KSTK_TOP(info) \
39489+({ \
39490+ unsigned long *__ptr = (unsigned long *)(info); \
39491+ (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
39492+})
39493+
39494+/*
39495+ * The below -8 is to reserve 8 bytes on top of the ring0 stack.
39496+ * This is necessary to guarantee that the entire "struct pt_regs"
39497+ * is accessible even if the CPU hasn't stored the SS/ESP registers
39498+ * on the stack (interrupt gate does not save these registers
39499+ * when switching to the same priv ring).
39500+ * Therefore beware: accessing the xss/esp fields of the
39501+ * "struct pt_regs" is possible, but they may contain
39502+ * completely wrong values.
39503+ */
39504+#define task_pt_regs(task) \
39505+({ \
39506+ struct pt_regs *__regs__; \
39507+ __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
39508+ __regs__ - 1; \
39509+})
39510+
39511+#define KSTK_EIP(task) (task_pt_regs(task)->eip)
39512+#define KSTK_ESP(task) (task_pt_regs(task)->esp)
39513+
39514+
39515+struct microcode_header {
39516+ unsigned int hdrver;
39517+ unsigned int rev;
39518+ unsigned int date;
39519+ unsigned int sig;
39520+ unsigned int cksum;
39521+ unsigned int ldrver;
39522+ unsigned int pf;
39523+ unsigned int datasize;
39524+ unsigned int totalsize;
39525+ unsigned int reserved[3];
39526+};
39527+
39528+struct microcode {
39529+ struct microcode_header hdr;
39530+ unsigned int bits[0];
39531+};
39532+
39533+typedef struct microcode microcode_t;
39534+typedef struct microcode_header microcode_header_t;
39535+
39536+/* microcode format is extended from prescott processors */
39537+struct extended_signature {
39538+ unsigned int sig;
39539+ unsigned int pf;
39540+ unsigned int cksum;
39541+};
39542+
39543+struct extended_sigtable {
39544+ unsigned int count;
39545+ unsigned int cksum;
39546+ unsigned int reserved[3];
39547+ struct extended_signature sigs[0];
39548+};
39549+
39550+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
39551+static inline void rep_nop(void)
39552+{
39553+ __asm__ __volatile__("rep;nop": : :"memory");
39554+}
39555+
39556+#define cpu_relax() rep_nop()
39557+
39558+/* generic versions from gas */
39559+#define GENERIC_NOP1 ".byte 0x90\n"
39560+#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
39561+#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
39562+#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
39563+#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
39564+#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
39565+#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
39566+#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
39567+
39568+/* Opteron nops */
39569+#define K8_NOP1 GENERIC_NOP1
39570+#define K8_NOP2 ".byte 0x66,0x90\n"
39571+#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
39572+#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
39573+#define K8_NOP5 K8_NOP3 K8_NOP2
39574+#define K8_NOP6 K8_NOP3 K8_NOP3
39575+#define K8_NOP7 K8_NOP4 K8_NOP3
39576+#define K8_NOP8 K8_NOP4 K8_NOP4
39577+
39578+/* K7 nops */
39579+/* uses eax dependencies (arbitrary choice) */
39580+#define K7_NOP1 GENERIC_NOP1
39581+#define K7_NOP2 ".byte 0x8b,0xc0\n"
39582+#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
39583+#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
39584+#define K7_NOP5 K7_NOP4 ASM_NOP1
39585+#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
39586+#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
39587+#define K7_NOP8 K7_NOP7 ASM_NOP1
39588+
39589+#ifdef CONFIG_MK8
39590+#define ASM_NOP1 K8_NOP1
39591+#define ASM_NOP2 K8_NOP2
39592+#define ASM_NOP3 K8_NOP3
39593+#define ASM_NOP4 K8_NOP4
39594+#define ASM_NOP5 K8_NOP5
39595+#define ASM_NOP6 K8_NOP6
39596+#define ASM_NOP7 K8_NOP7
39597+#define ASM_NOP8 K8_NOP8
39598+#elif defined(CONFIG_MK7)
39599+#define ASM_NOP1 K7_NOP1
39600+#define ASM_NOP2 K7_NOP2
39601+#define ASM_NOP3 K7_NOP3
39602+#define ASM_NOP4 K7_NOP4
39603+#define ASM_NOP5 K7_NOP5
39604+#define ASM_NOP6 K7_NOP6
39605+#define ASM_NOP7 K7_NOP7
39606+#define ASM_NOP8 K7_NOP8
39607+#else
39608+#define ASM_NOP1 GENERIC_NOP1
39609+#define ASM_NOP2 GENERIC_NOP2
39610+#define ASM_NOP3 GENERIC_NOP3
39611+#define ASM_NOP4 GENERIC_NOP4
39612+#define ASM_NOP5 GENERIC_NOP5
39613+#define ASM_NOP6 GENERIC_NOP6
39614+#define ASM_NOP7 GENERIC_NOP7
39615+#define ASM_NOP8 GENERIC_NOP8
39616+#endif
39617+
39618+#define ASM_NOP_MAX 8
39619+
39620+/* Prefetch instructions for Pentium III and AMD Athlon */
39621+/* It's not worth caring about 3dnow! prefetches for the K6
39622+ because they are microcoded there and very slow.
39623+ However, we don't currently do prefetches for pre-XP Athlons;
39624+ that should be fixed. */
39625+#define ARCH_HAS_PREFETCH
39626+static inline void prefetch(const void *x)
39627+{
39628+ alternative_input(ASM_NOP4,
39629+ "prefetchnta (%1)",
39630+ X86_FEATURE_XMM,
39631+ "r" (x));
39632+}
39633+
39634+#define ARCH_HAS_PREFETCH
39635+#define ARCH_HAS_PREFETCHW
39636+#define ARCH_HAS_SPINLOCK_PREFETCH
39637+
39638+/* 3dnow! prefetch to get an exclusive cache line. Useful for
39639+ spinlocks to avoid one state transition in the cache coherency protocol. */
39640+static inline void prefetchw(const void *x)
39641+{
39642+ alternative_input(ASM_NOP4,
39643+ "prefetchw (%1)",
39644+ X86_FEATURE_3DNOW,
39645+ "r" (x));
39646+}
39647+#define spin_lock_prefetch(x) prefetchw(x)
39648+
39649+extern void select_idle_routine(const struct cpuinfo_x86 *c);
39650+
39651+#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
39652+
39653+extern unsigned long boot_option_idle_override;
39654+extern void enable_sep_cpu(void);
39655+extern int sysenter_setup(void);
39656+
39657+#endif /* __ASM_I386_PROCESSOR_H */
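
The cpuid helpers above issue CPUID through the XEN_CPUID macro so the hypervisor can filter feature bits. As a rough user-space analogue (an assumption: GCC or clang on x86 using the native instruction via <cpuid.h>, not the Xen path), the sketch below shows what a single-datum helper such as cpuid_eax() reduces to.

/* Native-hardware sketch, not the Xen XEN_CPUID path used above. */
#include <stdio.h>
#include <cpuid.h>

static unsigned int sketch_cpuid_eax(unsigned int op)
{
	unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

	if (!__get_cpuid(op, &eax, &ebx, &ecx, &edx))
		return 0;	/* leaf not supported */
	return eax;		/* the "single datum" the helper returns */
}

int main(void)
{
	/* Leaf 0: EAX holds the maximum supported standard CPUID leaf. */
	printf("max standard leaf: %u\n", sketch_cpuid_eax(0));
	return 0;
}
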
39658Index: head-2008-11-25/include/asm-x86/mach-xen/asm/segment_32.h
39659===================================================================
39660--- /dev/null 1970-01-01 00:00:00.000000000 +0000
39661+++ head-2008-11-25/include/asm-x86/mach-xen/asm/segment_32.h 2007-06-12 13:14:02.000000000 +0200
39662@@ -0,0 +1,117 @@
39663+#ifndef _ASM_SEGMENT_H
39664+#define _ASM_SEGMENT_H
39665+
39666+/*
39667+ * The layout of the per-CPU GDT under Linux:
39668+ *
39669+ * 0 - null
39670+ * 1 - reserved
39671+ * 2 - reserved
39672+ * 3 - reserved
39673+ *
39674+ * 4 - unused <==== new cacheline
39675+ * 5 - unused
39676+ *
39677+ * ------- start of TLS (Thread-Local Storage) segments:
39678+ *
39679+ * 6 - TLS segment #1 [ glibc's TLS segment ]
39680+ * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
39681+ * 8 - TLS segment #3
39682+ * 9 - reserved
39683+ * 10 - reserved
39684+ * 11 - reserved
39685+ *
39686+ * ------- start of kernel segments:
39687+ *
39688+ * 12 - kernel code segment <==== new cacheline
39689+ * 13 - kernel data segment
39690+ * 14 - default user CS
39691+ * 15 - default user DS
39692+ * 16 - TSS
39693+ * 17 - LDT
39694+ * 18 - PNPBIOS support (16->32 gate)
39695+ * 19 - PNPBIOS support
39696+ * 20 - PNPBIOS support
39697+ * 21 - PNPBIOS support
39698+ * 22 - PNPBIOS support
39699+ * 23 - APM BIOS support
39700+ * 24 - APM BIOS support
39701+ * 25 - APM BIOS support
39702+ *
39703+ * 26 - ESPFIX small SS
39704+ * 27 - unused
39705+ * 28 - unused
39706+ * 29 - unused
39707+ * 30 - unused
39708+ * 31 - TSS for double fault handler
39709+ */
39710+#define GDT_ENTRY_TLS_ENTRIES 3
39711+#define GDT_ENTRY_TLS_MIN 6
39712+#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
39713+
39714+#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
39715+
39716+#define GDT_ENTRY_DEFAULT_USER_CS 14
39717+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
39718+
39719+#define GDT_ENTRY_DEFAULT_USER_DS 15
39720+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
39721+
39722+#define GDT_ENTRY_KERNEL_BASE 12
39723+
39724+#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
39725+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
39726+#define GET_KERNEL_CS() (__KERNEL_CS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
39727+
39728+#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
39729+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
39730+#define GET_KERNEL_DS() (__KERNEL_DS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
39731+
39732+#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
39733+#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
39734+
39735+#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
39736+#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
39737+
39738+#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
39739+#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
39740+
39741+#define GDT_ENTRY_DOUBLEFAULT_TSS 31
39742+
39743+/*
39744+ * The GDT has 32 entries
39745+ */
39746+#define GDT_ENTRIES 32
39747+
39748+#define GDT_SIZE (GDT_ENTRIES * 8)
39749+
39750+/* Simple and small GDT entries for booting only */
39751+
39752+#define GDT_ENTRY_BOOT_CS 2
39753+#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
39754+
39755+#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
39756+#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
39757+
39758+/* The PnP BIOS entries in the GDT */
39759+#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
39760+#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
39761+#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
39762+#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
39763+#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
39764+
39765+/* The PnP BIOS selectors */
39766+#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
39767+#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
39768+#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
39769+#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
39770+#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
39771+
39772+/*
39773+ * The interrupt descriptor table has room for 256 entries;
39774+ * the global descriptor table depends on the number
39775+ * of tasks we can have.
39776+ */
39777+#define IDT_ENTRIES 256
39778+
39779+#endif
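
The selector macros above all follow the same arithmetic: GDT index times 8, with the requested privilege level in the low two bits (the Xen GET_KERNEL_CS/DS variants simply force RPL 1 when not running a supervisor-mode kernel). A minimal sketch, using copies of two of the constants above (prefixed SKETCH_ to mark them as illustration only):

/* Stand-alone sketch: shows how the selector macros above are formed. */
#include <stdio.h>

#define SKETCH_GDT_ENTRY_KERNEL_CS       12
#define SKETCH_GDT_ENTRY_DEFAULT_USER_CS 14

static unsigned int selector(unsigned int gdt_index, unsigned int rpl)
{
	return gdt_index * 8 + rpl;	/* TI bit (LDT) left clear */
}

int main(void)
{
	printf("__KERNEL_CS = 0x%02x\n", selector(SKETCH_GDT_ENTRY_KERNEL_CS, 0));
	printf("__USER_CS   = 0x%02x\n", selector(SKETCH_GDT_ENTRY_DEFAULT_USER_CS, 3));
	return 0;
}
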
39780Index: head-2008-11-25/include/asm-x86/mach-xen/asm/smp_32.h
39781===================================================================
39782--- /dev/null 1970-01-01 00:00:00.000000000 +0000
39783+++ head-2008-11-25/include/asm-x86/mach-xen/asm/smp_32.h 2007-06-12 13:14:02.000000000 +0200
39784@@ -0,0 +1,103 @@
39785+#ifndef __ASM_SMP_H
39786+#define __ASM_SMP_H
39787+
39788+/*
39789+ * We need the APIC definitions automatically as part of 'smp.h'
39790+ */
39791+#ifndef __ASSEMBLY__
39792+#include <linux/kernel.h>
39793+#include <linux/threads.h>
39794+#include <linux/cpumask.h>
39795+#endif
39796+
39797+#ifdef CONFIG_X86_LOCAL_APIC
39798+#ifndef __ASSEMBLY__
39799+#include <asm/fixmap.h>
39800+#include <asm/bitops.h>
39801+#include <asm/mpspec.h>
39802+#ifdef CONFIG_X86_IO_APIC
39803+#include <asm/io_apic.h>
39804+#endif
39805+#include <asm/apic.h>
39806+#endif
39807+#endif
39808+
39809+#define BAD_APICID 0xFFu
39810+#ifdef CONFIG_SMP
39811+#ifndef __ASSEMBLY__
39812+
39813+/*
39814+ * Private routines/data
39815+ */
39816+
39817+extern void smp_alloc_memory(void);
39818+extern int pic_mode;
39819+extern int smp_num_siblings;
39820+extern cpumask_t cpu_sibling_map[];
39821+extern cpumask_t cpu_core_map[];
39822+
39823+extern void (*mtrr_hook) (void);
39824+extern void zap_low_mappings (void);
39825+extern void lock_ipi_call_lock(void);
39826+extern void unlock_ipi_call_lock(void);
39827+
39828+#define MAX_APICID 256
39829+extern u8 x86_cpu_to_apicid[];
39830+
39831+#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
39832+
39833+#ifdef CONFIG_HOTPLUG_CPU
39834+extern void cpu_exit_clear(void);
39835+extern void cpu_uninit(void);
39836+#endif
39837+
39838+/*
39839+ * This function is needed by all SMP systems. It must _always_ be valid
39840+ * from the initial startup. We map APIC_BASE very early in page_setup(),
39841+ * so this is correct in the x86 case.
39842+ */
39843+#define raw_smp_processor_id() (current_thread_info()->cpu)
39844+
39845+extern cpumask_t cpu_possible_map;
39846+#define cpu_callin_map cpu_possible_map
39847+
39848+/* We don't mark CPUs online until __cpu_up(), so we need another measure */
39849+static inline int num_booting_cpus(void)
39850+{
39851+ return cpus_weight(cpu_possible_map);
39852+}
39853+
39854+#ifdef CONFIG_X86_LOCAL_APIC
39855+
39856+#ifdef APIC_DEFINITION
39857+extern int hard_smp_processor_id(void);
39858+#else
39859+#include <mach_apicdef.h>
39860+static inline int hard_smp_processor_id(void)
39861+{
39862+ /* we don't want to mark this access volatile - bad code generation */
39863+ return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
39864+}
39865+#endif
39866+
39867+static __inline int logical_smp_processor_id(void)
39868+{
39869+ /* we don't want to mark this access volatile - bad code generation */
39870+ return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
39871+}
39872+
39873+#endif
39874+
39875+extern int __cpu_disable(void);
39876+extern void __cpu_die(unsigned int cpu);
39877+extern void prefill_possible_map(void);
39878+#endif /* !__ASSEMBLY__ */
39879+
39880+#else /* CONFIG_SMP */
39881+
39882+#define cpu_physical_id(cpu) boot_cpu_physical_apicid
39883+
39884+#define NO_PROC_ID 0xFF /* No processor magic marker */
39885+
39886+#endif
39887+#endif
39888Index: head-2008-11-25/include/asm-x86/mach-xen/asm/swiotlb_32.h
39889===================================================================
39890--- /dev/null 1970-01-01 00:00:00.000000000 +0000
39891+++ head-2008-11-25/include/asm-x86/mach-xen/asm/swiotlb_32.h 2007-06-12 13:14:02.000000000 +0200
39892@@ -0,0 +1,43 @@
39893+#ifndef _ASM_SWIOTLB_H
39894+#define _ASM_SWIOTLB_H 1
39895+
39896+/* SWIOTLB interface */
39897+
39898+extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, size_t size,
39899+ int dir);
39900+extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
39901+ size_t size, int dir);
39902+extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
39903+ dma_addr_t dev_addr,
39904+ size_t size, int dir);
39905+extern void swiotlb_sync_single_for_device(struct device *hwdev,
39906+ dma_addr_t dev_addr,
39907+ size_t size, int dir);
39908+extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
39909+ struct scatterlist *sg, int nelems,
39910+ int dir);
39911+extern void swiotlb_sync_sg_for_device(struct device *hwdev,
39912+ struct scatterlist *sg, int nelems,
39913+ int dir);
39914+extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
39915+ int nents, int direction);
39916+extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
39917+ int nents, int direction);
39918+extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr);
39919+#ifdef CONFIG_HIGHMEM
39920+extern dma_addr_t swiotlb_map_page(struct device *hwdev, struct page *page,
39921+ unsigned long offset, size_t size,
39922+ enum dma_data_direction direction);
39923+extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
39924+ size_t size, enum dma_data_direction direction);
39925+#endif
39926+extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
39927+extern void swiotlb_init(void);
39928+
39929+#ifdef CONFIG_SWIOTLB
39930+extern int swiotlb;
39931+#else
39932+#define swiotlb 0
39933+#endif
39934+
39935+#endif
39936Index: head-2008-11-25/include/asm-x86/mach-xen/asm/synch_bitops.h
39937===================================================================
39938--- /dev/null 1970-01-01 00:00:00.000000000 +0000
39939+++ head-2008-11-25/include/asm-x86/mach-xen/asm/synch_bitops.h 2008-04-02 12:34:02.000000000 +0200
39940@@ -0,0 +1,126 @@
39941+#ifndef __XEN_SYNCH_BITOPS_H__
39942+#define __XEN_SYNCH_BITOPS_H__
39943+
39944+/*
39945+ * Copyright 1992, Linus Torvalds.
39946+ * Heavily modified to provide guaranteed strong synchronisation
39947+ * when communicating with Xen or other guest OSes running on other CPUs.
39948+ */
39949+
39950+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
39951+#include <xen/platform-compat.h>
39952+#endif
39953+
39954+#define ADDR (*(volatile long *) addr)
39955+
39956+static __inline__ void synch_set_bit(int nr, volatile void * addr)
39957+{
39958+ __asm__ __volatile__ (
39959+ "lock btsl %1,%0"
39960+ : "+m" (ADDR) : "Ir" (nr) : "memory" );
39961+}
39962+
39963+static __inline__ void synch_clear_bit(int nr, volatile void * addr)
39964+{
39965+ __asm__ __volatile__ (
39966+ "lock btrl %1,%0"
39967+ : "+m" (ADDR) : "Ir" (nr) : "memory" );
39968+}
39969+
39970+static __inline__ void synch_change_bit(int nr, volatile void * addr)
39971+{
39972+ __asm__ __volatile__ (
39973+ "lock btcl %1,%0"
39974+ : "+m" (ADDR) : "Ir" (nr) : "memory" );
39975+}
39976+
39977+static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
39978+{
39979+ int oldbit;
39980+ __asm__ __volatile__ (
39981+ "lock btsl %2,%1\n\tsbbl %0,%0"
39982+ : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
39983+ return oldbit;
39984+}
39985+
39986+static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
39987+{
39988+ int oldbit;
39989+ __asm__ __volatile__ (
39990+ "lock btrl %2,%1\n\tsbbl %0,%0"
39991+ : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
39992+ return oldbit;
39993+}
39994+
39995+static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
39996+{
39997+ int oldbit;
39998+
39999+ __asm__ __volatile__ (
40000+ "lock btcl %2,%1\n\tsbbl %0,%0"
40001+ : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
40002+ return oldbit;
40003+}
40004+
40005+struct __synch_xchg_dummy { unsigned long a[100]; };
40006+#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x))
40007+
40008+#define synch_cmpxchg(ptr, old, new) \
40009+((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\
40010+ (unsigned long)(old), \
40011+ (unsigned long)(new), \
40012+ sizeof(*(ptr))))
40013+
40014+static inline unsigned long __synch_cmpxchg(volatile void *ptr,
40015+ unsigned long old,
40016+ unsigned long new, int size)
40017+{
40018+ unsigned long prev;
40019+ switch (size) {
40020+ case 1:
40021+ __asm__ __volatile__("lock; cmpxchgb %b1,%2"
40022+ : "=a"(prev)
40023+ : "q"(new), "m"(*__synch_xg(ptr)),
40024+ "0"(old)
40025+ : "memory");
40026+ return prev;
40027+ case 2:
40028+ __asm__ __volatile__("lock; cmpxchgw %w1,%2"
40029+ : "=a"(prev)
40030+ : "r"(new), "m"(*__synch_xg(ptr)),
40031+ "0"(old)
40032+ : "memory");
40033+ return prev;
40034+#ifdef CONFIG_X86_64
40035+ case 4:
40036+ __asm__ __volatile__("lock; cmpxchgl %k1,%2"
40037+ : "=a"(prev)
40038+ : "r"(new), "m"(*__synch_xg(ptr)),
40039+ "0"(old)
40040+ : "memory");
40041+ return prev;
40042+ case 8:
40043+ __asm__ __volatile__("lock; cmpxchgq %1,%2"
40044+ : "=a"(prev)
40045+ : "r"(new), "m"(*__synch_xg(ptr)),
40046+ "0"(old)
40047+ : "memory");
40048+ return prev;
40049+#else
40050+ case 4:
40051+ __asm__ __volatile__("lock; cmpxchgl %1,%2"
40052+ : "=a"(prev)
40053+ : "r"(new), "m"(*__synch_xg(ptr)),
40054+ "0"(old)
40055+ : "memory");
40056+ return prev;
40057+#endif
40058+ }
40059+ return old;
40060+}
40061+
40062+#define synch_test_bit test_bit
40063+
40064+#define synch_cmpxchg_subword synch_cmpxchg
40065+
40066+#endif /* __XEN_SYNCH_BITOPS_H__ */
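
The hand-coded lock-prefixed operations above have the same semantics as the compiler's atomic builtins. As an analogy only (an assumption: a GCC-compatible compiler; this is not the kernel's synch_* API), the sketch below uses __sync_val_compare_and_swap where the header hand-codes synch_cmpxchg().

/* Analogy sketch: compiler builtin with lock-prefixed CAS semantics. */
#include <stdio.h>

int main(void)
{
	unsigned long word = 5;

	/* Compare-and-swap: only stores 9 if the current value matches 5. */
	unsigned long prev = __sync_val_compare_and_swap(&word, 5UL, 9UL);

	printf("prev=%lu now=%lu\n", prev, word);	/* prev=5 now=9 */
	return 0;
}
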
40067Index: head-2008-11-25/include/asm-x86/mach-xen/asm/system_32.h
40068===================================================================
40069--- /dev/null 1970-01-01 00:00:00.000000000 +0000
40070+++ head-2008-11-25/include/asm-x86/mach-xen/asm/system_32.h 2007-06-12 13:14:02.000000000 +0200
40071@@ -0,0 +1,488 @@
40072+#ifndef __ASM_SYSTEM_H
40073+#define __ASM_SYSTEM_H
40074+
40075+#include <linux/kernel.h>
40076+#include <asm/segment.h>
40077+#include <asm/cpufeature.h>
40078+#include <linux/bitops.h> /* for LOCK_PREFIX */
40079+#include <asm/synch_bitops.h>
40080+#include <asm/hypervisor.h>
40081+
40082+#ifdef __KERNEL__
40083+
40084+struct task_struct; /* one of the stranger aspects of C forward declarations.. */
40085+extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
40086+
40087+/*
40088+ * Saving eflags is important. It not only switches IOPL between tasks,
40089+ * it also protects other tasks from the NT flag leaking through sysenter etc.
40090+ */
40091+#define switch_to(prev,next,last) do { \
40092+ unsigned long esi,edi; \
40093+ asm volatile("pushfl\n\t" /* Save flags */ \
40094+ "pushl %%ebp\n\t" \
40095+ "movl %%esp,%0\n\t" /* save ESP */ \
40096+ "movl %5,%%esp\n\t" /* restore ESP */ \
40097+ "movl $1f,%1\n\t" /* save EIP */ \
40098+ "pushl %6\n\t" /* restore EIP */ \
40099+ "jmp __switch_to\n" \
40100+ "1:\t" \
40101+ "popl %%ebp\n\t" \
40102+ "popfl" \
40103+ :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \
40104+ "=a" (last),"=S" (esi),"=D" (edi) \
40105+ :"m" (next->thread.esp),"m" (next->thread.eip), \
40106+ "2" (prev), "d" (next)); \
40107+} while (0)
40108+
40109+#define _set_base(addr,base) do { unsigned long __pr; \
40110+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
40111+ "rorl $16,%%edx\n\t" \
40112+ "movb %%dl,%2\n\t" \
40113+ "movb %%dh,%3" \
40114+ :"=&d" (__pr) \
40115+ :"m" (*((addr)+2)), \
40116+ "m" (*((addr)+4)), \
40117+ "m" (*((addr)+7)), \
40118+ "0" (base) \
40119+ ); } while(0)
40120+
40121+#define _set_limit(addr,limit) do { unsigned long __lr; \
40122+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
40123+ "rorl $16,%%edx\n\t" \
40124+ "movb %2,%%dh\n\t" \
40125+ "andb $0xf0,%%dh\n\t" \
40126+ "orb %%dh,%%dl\n\t" \
40127+ "movb %%dl,%2" \
40128+ :"=&d" (__lr) \
40129+ :"m" (*(addr)), \
40130+ "m" (*((addr)+6)), \
40131+ "0" (limit) \
40132+ ); } while(0)
40133+
40134+#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
40135+#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
40136+
40137+/*
40138+ * Load a segment. Fall back on loading the zero
40139+ * segment if something goes wrong..
40140+ */
40141+#define loadsegment(seg,value) \
40142+ asm volatile("\n" \
40143+ "1:\t" \
40144+ "mov %0,%%" #seg "\n" \
40145+ "2:\n" \
40146+ ".section .fixup,\"ax\"\n" \
40147+ "3:\t" \
40148+ "pushl $0\n\t" \
40149+ "popl %%" #seg "\n\t" \
40150+ "jmp 2b\n" \
40151+ ".previous\n" \
40152+ ".section __ex_table,\"a\"\n\t" \
40153+ ".align 4\n\t" \
40154+ ".long 1b,3b\n" \
40155+ ".previous" \
40156+ : :"rm" (value))
40157+
40158+/*
40159+ * Save a segment register away
40160+ */
40161+#define savesegment(seg, value) \
40162+ asm volatile("mov %%" #seg ",%0":"=rm" (value))
40163+
40164+#define read_cr0() ({ \
40165+ unsigned int __dummy; \
40166+ __asm__ __volatile__( \
40167+ "movl %%cr0,%0\n\t" \
40168+ :"=r" (__dummy)); \
40169+ __dummy; \
40170+})
40171+#define write_cr0(x) \
40172+ __asm__ __volatile__("movl %0,%%cr0": :"r" (x))
40173+
40174+#define read_cr2() (current_vcpu_info()->arch.cr2)
40175+#define write_cr2(x) \
40176+ __asm__ __volatile__("movl %0,%%cr2": :"r" (x))
40177+
40178+#define read_cr3() ({ \
40179+ unsigned int __dummy; \
40180+ __asm__ ( \
40181+ "movl %%cr3,%0\n\t" \
40182+ :"=r" (__dummy)); \
40183+ __dummy = xen_cr3_to_pfn(__dummy); \
40184+ mfn_to_pfn(__dummy) << PAGE_SHIFT; \
40185+})
40186+#define write_cr3(x) ({ \
40187+ unsigned int __dummy = pfn_to_mfn((x) >> PAGE_SHIFT); \
40188+ __dummy = xen_pfn_to_cr3(__dummy); \
40189+ __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy)); \
40190+})
40191+#define read_cr4() ({ \
40192+ unsigned int __dummy; \
40193+ __asm__( \
40194+ "movl %%cr4,%0\n\t" \
40195+ :"=r" (__dummy)); \
40196+ __dummy; \
40197+})
40198+#define read_cr4_safe() ({ \
40199+ unsigned int __dummy; \
40200+ /* This could fault if %cr4 does not exist */ \
40201+ __asm__("1: movl %%cr4, %0 \n" \
40202+ "2: \n" \
40203+ ".section __ex_table,\"a\" \n" \
40204+ ".long 1b,2b \n" \
40205+ ".previous \n" \
40206+ : "=r" (__dummy): "0" (0)); \
40207+ __dummy; \
40208+})
40209+
40210+#define write_cr4(x) \
40211+ __asm__ __volatile__("movl %0,%%cr4": :"r" (x))
40212+
40213+/*
40214+ * Clear and set 'TS' bit respectively
40215+ */
40216+#define clts() (HYPERVISOR_fpu_taskswitch(0))
40217+#define stts() (HYPERVISOR_fpu_taskswitch(1))
40218+
40219+#endif /* __KERNEL__ */
40220+
40221+#define wbinvd() \
40222+ __asm__ __volatile__ ("wbinvd": : :"memory")
40223+
40224+static inline unsigned long get_limit(unsigned long segment)
40225+{
40226+ unsigned long __limit;
40227+ __asm__("lsll %1,%0"
40228+ :"=r" (__limit):"r" (segment));
40229+ return __limit+1;
40230+}
40231+
40232+#define nop() __asm__ __volatile__ ("nop")
40233+
40234+#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
40235+
40236+#define tas(ptr) (xchg((ptr),1))
40237+
40238+struct __xchg_dummy { unsigned long a[100]; };
40239+#define __xg(x) ((struct __xchg_dummy *)(x))
40240+
40241+
40242+#ifdef CONFIG_X86_CMPXCHG64
40243+
40244+/*
40245+ * The semantics of CMPXCHG8B are a bit strange, which is why
40246+ * there is a loop and the loading of %%eax and %%edx has to
40247+ * be inside it. This inlines well in most cases; the cached
40248+ * cost is around 38 cycles. (In the future we might want
40249+ * to do a SIMD/3DNow!/MMX/FPU 64-bit store here, but that
40250+ * might have an implicit FPU save as a cost, so it is not
40251+ * clear which path to take.)
40252+ *
40253+ * cmpxchg8b must be used with the lock prefix here to allow
40254+ * the instruction to be executed atomically; see page 3-102
40255+ * of the instruction set reference 24319102.pdf. We need
40256+ * the reader side to see a coherent 64-bit value.
40257+ */
40258+static inline void __set_64bit (unsigned long long * ptr,
40259+ unsigned int low, unsigned int high)
40260+{
40261+ __asm__ __volatile__ (
40262+ "\n1:\t"
40263+ "movl (%0), %%eax\n\t"
40264+ "movl 4(%0), %%edx\n\t"
40265+ "lock cmpxchg8b (%0)\n\t"
40266+ "jnz 1b"
40267+ : /* no outputs */
40268+ : "D"(ptr),
40269+ "b"(low),
40270+ "c"(high)
40271+ : "ax","dx","memory");
40272+}
40273+
40274+static inline void __set_64bit_constant (unsigned long long *ptr,
40275+ unsigned long long value)
40276+{
40277+ __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL));
40278+}
40279+#define ll_low(x) *(((unsigned int*)&(x))+0)
40280+#define ll_high(x) *(((unsigned int*)&(x))+1)
40281+
40282+static inline void __set_64bit_var (unsigned long long *ptr,
40283+ unsigned long long value)
40284+{
40285+ __set_64bit(ptr,ll_low(value), ll_high(value));
40286+}
40287+
40288+#define set_64bit(ptr,value) \
40289+(__builtin_constant_p(value) ? \
40290+ __set_64bit_constant(ptr, value) : \
40291+ __set_64bit_var(ptr, value) )
40292+
40293+#define _set_64bit(ptr,value) \
40294+(__builtin_constant_p(value) ? \
40295+ __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \
40296+ __set_64bit(ptr, ll_low(value), ll_high(value)) )
40297+
40298+#endif
40299+
40300+/*
40301+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
40302+ * Note 2: xchg has a side effect, so the volatile attribute is necessary;
40303+ * strictly speaking the asm is incomplete, since *ptr is also an output argument. --ANK
40304+ */
40305+static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
40306+{
40307+ switch (size) {
40308+ case 1:
40309+ __asm__ __volatile__("xchgb %b0,%1"
40310+ :"=q" (x)
40311+ :"m" (*__xg(ptr)), "0" (x)
40312+ :"memory");
40313+ break;
40314+ case 2:
40315+ __asm__ __volatile__("xchgw %w0,%1"
40316+ :"=r" (x)
40317+ :"m" (*__xg(ptr)), "0" (x)
40318+ :"memory");
40319+ break;
40320+ case 4:
40321+ __asm__ __volatile__("xchgl %0,%1"
40322+ :"=r" (x)
40323+ :"m" (*__xg(ptr)), "0" (x)
40324+ :"memory");
40325+ break;
40326+ }
40327+ return x;
40328+}
40329+
40330+/*
40331+ * Atomic compare and exchange. Compare OLD with MEM, if identical,
40332+ * store NEW in MEM. Return the initial value in MEM. Success is
40333+ * indicated by comparing RETURN with OLD.
40334+ */
40335+
40336+#ifdef CONFIG_X86_CMPXCHG
40337+#define __HAVE_ARCH_CMPXCHG 1
40338+#define cmpxchg(ptr,o,n)\
40339+ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
40340+ (unsigned long)(n),sizeof(*(ptr))))
40341+#endif
40342+
40343+static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
40344+ unsigned long new, int size)
40345+{
40346+ unsigned long prev;
40347+ switch (size) {
40348+ case 1:
40349+ __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
40350+ : "=a"(prev)
40351+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
40352+ : "memory");
40353+ return prev;
40354+ case 2:
40355+ __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
40356+ : "=a"(prev)
40357+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
40358+ : "memory");
40359+ return prev;
40360+ case 4:
40361+ __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
40362+ : "=a"(prev)
40363+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
40364+ : "memory");
40365+ return prev;
40366+ }
40367+ return old;
40368+}
40369+
40370+#ifndef CONFIG_X86_CMPXCHG
40371+/*
40372+ * Building a kernel capable of running on an 80386. It may be necessary to
40373+ * simulate the cmpxchg on the 80386 CPU. For that purpose we define
40374+ * a function for each of the sizes we support.
40375+ */
40376+
40377+extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
40378+extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
40379+extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
40380+
40381+static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
40382+ unsigned long new, int size)
40383+{
40384+ switch (size) {
40385+ case 1:
40386+ return cmpxchg_386_u8(ptr, old, new);
40387+ case 2:
40388+ return cmpxchg_386_u16(ptr, old, new);
40389+ case 4:
40390+ return cmpxchg_386_u32(ptr, old, new);
40391+ }
40392+ return old;
40393+}
40394+
40395+#define cmpxchg(ptr,o,n) \
40396+({ \
40397+ __typeof__(*(ptr)) __ret; \
40398+ if (likely(boot_cpu_data.x86 > 3)) \
40399+ __ret = __cmpxchg((ptr), (unsigned long)(o), \
40400+ (unsigned long)(n), sizeof(*(ptr))); \
40401+ else \
40402+ __ret = cmpxchg_386((ptr), (unsigned long)(o), \
40403+ (unsigned long)(n), sizeof(*(ptr))); \
40404+ __ret; \
40405+})
40406+#endif
40407+
40408+#ifdef CONFIG_X86_CMPXCHG64
40409+
40410+static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old,
40411+ unsigned long long new)
40412+{
40413+ unsigned long long prev;
40414+ __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
40415+ : "=A"(prev)
40416+ : "b"((unsigned long)new),
40417+ "c"((unsigned long)(new >> 32)),
40418+ "m"(*__xg(ptr)),
40419+ "0"(old)
40420+ : "memory");
40421+ return prev;
40422+}
40423+
40424+#define cmpxchg64(ptr,o,n)\
40425+ ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
40426+ (unsigned long long)(n)))
40427+
40428+#endif
40429+
40430+/*
40431+ * Force strict CPU ordering.
40432+ * And yes, this is required on UP too when we're talking
40433+ * to devices.
40434+ *
40435+ * For now, "wmb()" doesn't actually do anything, as all
40436+ * Intel CPUs follow what Intel calls a *Processor Order*,
40437+ * in which all writes are seen in the program order even
40438+ * outside the CPU.
40439+ *
40440+ * I expect future Intel CPUs to have a weaker ordering,
40441+ * but I'd also expect them to finally get their act together
40442+ * and add some real memory barriers if so.
40443+ *
40444+ * Some non-Intel clones support out-of-order stores. wmb() ceases to be a
40445+ * no-op for these.
40446+ */
40447+
40448+
40449+/*
40450+ * Actually only lfence would be needed for mb() because all stores done
40451+ * by the kernel should already be ordered. But keep a full barrier for now.
40452+ */
40453+
40454+#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
40455+#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
40456+
40457+/**
40458+ * read_barrier_depends - Flush all pending reads that subsequent reads
40459+ * depend on.
40460+ *
40461+ * No data-dependent reads from memory-like regions are ever reordered
40462+ * over this barrier. All reads preceding this primitive are guaranteed
40463+ * to access memory (but not necessarily other CPUs' caches) before any
40464+ * reads following this primitive that depend on the data returned by
40465+ * any of the preceding reads. This primitive is much lighter weight than
40466+ * rmb() on most CPUs, and is never heavier weight than
40467+ * rmb().
40468+ *
40469+ * These ordering constraints are respected by both the local CPU
40470+ * and the compiler.
40471+ *
40472+ * Ordering is not guaranteed by anything other than these primitives,
40473+ * not even by data dependencies. See the documentation for
40474+ * memory_barrier() for examples and URLs to more information.
40475+ *
40476+ * For example, the following code would force ordering (the initial
40477+ * value of "a" is zero, "b" is one, and "p" is "&a"):
40478+ *
40479+ * <programlisting>
40480+ * CPU 0 CPU 1
40481+ *
40482+ * b = 2;
40483+ * memory_barrier();
40484+ * p = &b; q = p;
40485+ * read_barrier_depends();
40486+ * d = *q;
40487+ * </programlisting>
40488+ *
40489+ * because the read of "*q" depends on the read of "p" and these
40490+ * two reads are separated by a read_barrier_depends(). However,
40491+ * the following code, with the same initial values for "a" and "b":
40492+ *
40493+ * <programlisting>
40494+ * CPU 0 CPU 1
40495+ *
40496+ * a = 2;
40497+ * memory_barrier();
40498+ * b = 3; y = b;
40499+ * read_barrier_depends();
40500+ * x = a;
40501+ * </programlisting>
40502+ *
40503+ * does not enforce ordering, since there is no data dependency between
40504+ * the read of "a" and the read of "b". Therefore, on some CPUs, such
40505+ * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
40506+ * in cases like this where there are no data dependencies.
40507+ **/
40508+
40509+#define read_barrier_depends() do { } while(0)
40510+
40511+#ifdef CONFIG_X86_OOSTORE
40512+/* Actually there are no OOO-store-capable CPUs for now that do SSE,
40513+ but allow for the possibility already. */
40514+#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
40515+#else
40516+#define wmb() __asm__ __volatile__ ("": : :"memory")
40517+#endif
40518+
40519+#ifdef CONFIG_SMP
40520+#define smp_mb() mb()
40521+#define smp_rmb() rmb()
40522+#define smp_wmb() wmb()
40523+#define smp_read_barrier_depends() read_barrier_depends()
40524+#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
40525+#else
40526+#define smp_mb() barrier()
40527+#define smp_rmb() barrier()
40528+#define smp_wmb() barrier()
40529+#define smp_read_barrier_depends() do { } while(0)
40530+#define set_mb(var, value) do { var = value; barrier(); } while (0)
40531+#endif
40532+
40533+#include <linux/irqflags.h>
40534+
40535+/*
40536+ * disable hlt during certain critical i/o operations
40537+ */
40538+#define HAVE_DISABLE_HLT
40539+void disable_hlt(void);
40540+void enable_hlt(void);
40541+
40542+extern int es7000_plat;
40543+void cpu_idle_wait(void);
40544+
40545+/*
40546+ * On SMP systems, when the scheduler does migration-cost autodetection,
40547+ * it needs a way to flush as much of the CPU's caches as possible:
40548+ */
40549+static inline void sched_cacheflush(void)
40550+{
40551+ wbinvd();
40552+}
40553+
40554+extern unsigned long arch_align_stack(unsigned long sp);
40555+extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
40556+
40557+void default_idle(void);
40558+
40559+#endif
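For illustration (not part of the patch): cmpxchg() as defined above is normally used in a read/compute/retry loop, with the return value compared against the previously read value. A minimal sketch with a hypothetical counter:

/* Hypothetical example; 'counter' and this helper are not part of the patch. */
static unsigned long counter;

static unsigned long add_if_below(unsigned long limit)
{
	unsigned long old, new;

	do {
		old = counter;
		if (old >= limit)
			return old;		/* already at the limit */
		new = old + 1;
	} while (cmpxchg(&counter, old, new) != old);

	return new;
}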
40560Index: head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_32.h
40561===================================================================
40562--- /dev/null 1970-01-01 00:00:00.000000000 +0000
40563+++ head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_32.h 2007-11-26 16:59:25.000000000 +0100
40564@@ -0,0 +1,101 @@
40565+#ifndef _I386_TLBFLUSH_H
40566+#define _I386_TLBFLUSH_H
40567+
40568+#include <linux/mm.h>
40569+#include <asm/processor.h>
40570+
40571+#define __flush_tlb() xen_tlb_flush()
40572+#define __flush_tlb_global() xen_tlb_flush()
40573+#define __flush_tlb_all() xen_tlb_flush()
40574+
40575+extern unsigned long pgkern_mask;
40576+
40577+#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
40578+
40579+#define __flush_tlb_single(addr) xen_invlpg(addr)
40580+
40581+#define __flush_tlb_one(addr) __flush_tlb_single(addr)
40582+
40583+/*
40584+ * TLB flushing:
40585+ *
40586+ * - flush_tlb() flushes the current mm struct TLBs
40587+ * - flush_tlb_all() flushes all processes' TLBs
40588+ * - flush_tlb_mm(mm) flushes the specified mm context's TLBs
40589+ * - flush_tlb_page(vma, vmaddr) flushes one page
40590+ * - flush_tlb_range(vma, start, end) flushes a range of pages
40591+ * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
40592+ * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
40593+ *
40594+ * ..but the i386 has somewhat limited tlb flushing capabilities,
40595+ * and page-granular flushes are available only on i486 and up.
40596+ */
40597+
40598+#ifndef CONFIG_SMP
40599+
40600+#define flush_tlb() __flush_tlb()
40601+#define flush_tlb_all() __flush_tlb_all()
40602+#define local_flush_tlb() __flush_tlb()
40603+
40604+static inline void flush_tlb_mm(struct mm_struct *mm)
40605+{
40606+ if (mm == current->active_mm)
40607+ __flush_tlb();
40608+}
40609+
40610+static inline void flush_tlb_page(struct vm_area_struct *vma,
40611+ unsigned long addr)
40612+{
40613+ if (vma->vm_mm == current->active_mm)
40614+ __flush_tlb_one(addr);
40615+}
40616+
40617+static inline void flush_tlb_range(struct vm_area_struct *vma,
40618+ unsigned long start, unsigned long end)
40619+{
40620+ if (vma->vm_mm == current->active_mm)
40621+ __flush_tlb();
40622+}
40623+
40624+#else
40625+
40626+#include <asm/smp.h>
40627+
40628+#define local_flush_tlb() \
40629+ __flush_tlb()
40630+
40631+#define flush_tlb_all xen_tlb_flush_all
40632+#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
40633+#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
40634+#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
40635+
40636+#define flush_tlb() flush_tlb_current_task()
40637+
40638+static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
40639+{
40640+ flush_tlb_mm(vma->vm_mm);
40641+}
40642+
40643+#define TLBSTATE_OK 1
40644+#define TLBSTATE_LAZY 2
40645+
40646+struct tlb_state
40647+{
40648+ struct mm_struct *active_mm;
40649+ int state;
40650+ char __cacheline_padding[L1_CACHE_BYTES-8];
40651+};
40652+DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
40653+
40654+
40655+#endif
40656+
40657+#define flush_tlb_kernel_range(start, end) flush_tlb_all()
40658+
40659+static inline void flush_tlb_pgtables(struct mm_struct *mm,
40660+ unsigned long start, unsigned long end)
40661+{
40662+ /* i386 does not keep any page table caches in TLB */
40663+}
40664+
40665+#endif /* _I386_TLBFLUSH_H */
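For illustration (not part of the patch): callers are expected to pick the narrowest flush that still covers their page-table change. A hypothetical sketch:

/* Hypothetical helpers; neither function exists in this patch. */
static void after_single_pte_update(struct vm_area_struct *vma,
				    unsigned long addr)
{
	/* ... the PTE for 'addr' was just rewritten ... */
	flush_tlb_page(vma, addr);	/* single-page flush */
}

static void after_full_mm_update(struct mm_struct *mm)
{
	/* ... many mappings of 'mm' were just rewritten ... */
	flush_tlb_mm(mm);		/* per-mm flush */
}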
40666Index: head-2008-11-25/include/asm-x86/mach-xen/asm/vga.h
40667===================================================================
40668--- /dev/null 1970-01-01 00:00:00.000000000 +0000
40669+++ head-2008-11-25/include/asm-x86/mach-xen/asm/vga.h 2007-06-12 13:14:02.000000000 +0200
40670@@ -0,0 +1,20 @@
40671+/*
40672+ * Access to VGA videoram
40673+ *
40674+ * (c) 1998 Martin Mares <mj@ucw.cz>
40675+ */
40676+
40677+#ifndef _LINUX_ASM_VGA_H_
40678+#define _LINUX_ASM_VGA_H_
40679+
40680+/*
40681+ * On the PC, we can just recalculate addresses and then
40682+ * access the videoram directly without any black magic.
40683+ */
40684+
40685+#define VGA_MAP_MEM(x,s) (unsigned long)isa_bus_to_virt(x)
40686+
40687+#define vga_readb(x) (*(x))
40688+#define vga_writeb(x,y) (*(y) = (x))
40689+
40690+#endif
40691Index: head-2008-11-25/include/asm-x86/mach-xen/asm/xenoprof.h
40692===================================================================
40693--- /dev/null 1970-01-01 00:00:00.000000000 +0000
40694+++ head-2008-11-25/include/asm-x86/mach-xen/asm/xenoprof.h 2007-06-12 13:14:02.000000000 +0200
40695@@ -0,0 +1,48 @@
40696+/******************************************************************************
40697+ * asm-i386/mach-xen/asm/xenoprof.h
40698+ *
40699+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
40700+ * VA Linux Systems Japan K.K.
40701+ *
40702+ * This program is free software; you can redistribute it and/or modify
40703+ * it under the terms of the GNU General Public License as published by
40704+ * the Free Software Foundation; either version 2 of the License, or
40705+ * (at your option) any later version.
40706+ *
40707+ * This program is distributed in the hope that it will be useful,
40708+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
40709+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
40710+ * GNU General Public License for more details.
40711+ *
40712+ * You should have received a copy of the GNU General Public License
40713+ * along with this program; if not, write to the Free Software
40714+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
40715+ *
40716+ */
40717+#ifndef __ASM_XENOPROF_H__
40718+#define __ASM_XENOPROF_H__
40719+#ifdef CONFIG_XEN
40720+
40721+struct super_block;
40722+struct dentry;
40723+int xenoprof_create_files(struct super_block * sb, struct dentry * root);
40724+#define HAVE_XENOPROF_CREATE_FILES
40725+
40726+struct xenoprof_init;
40727+void xenoprof_arch_init_counter(struct xenoprof_init *init);
40728+void xenoprof_arch_counter(void);
40729+void xenoprof_arch_start(void);
40730+void xenoprof_arch_stop(void);
40731+
40732+struct xenoprof_arch_shared_buffer {
40733+ /* nothing */
40734+};
40735+struct xenoprof_shared_buffer;
40736+void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf);
40737+struct xenoprof_get_buffer;
40738+int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf);
40739+struct xenoprof_passive;
40740+int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf);
40741+
40742+#endif /* CONFIG_XEN */
40743+#endif /* __ASM_XENOPROF_H__ */
40744Index: head-2008-11-25/include/asm-x86/mach-xen/irq_vectors.h
40745===================================================================
40746--- /dev/null 1970-01-01 00:00:00.000000000 +0000
40747+++ head-2008-11-25/include/asm-x86/mach-xen/irq_vectors.h 2008-09-25 13:55:32.000000000 +0200
40748@@ -0,0 +1,125 @@
40749+/*
40750+ * This file should contain #defines for all of the interrupt vector
40751+ * numbers used by this architecture.
40752+ *
40753+ * In addition, there are some standard defines:
40754+ *
40755+ * FIRST_EXTERNAL_VECTOR:
40756+ * The first free place for external interrupts
40757+ *
40758+ * SYSCALL_VECTOR:
40759+ * The IRQ vector under which a syscall makes the
40760+ * user-to-kernel transition.
40761+ *
40762+ * TIMER_IRQ:
40763+ * The IRQ number the timer interrupt comes in at.
40764+ *
40765+ * NR_IRQS:
40766+ * The total number of interrupt vectors (including all the
40767+ * architecture specific interrupts) needed.
40768+ *
40769+ */
40770+#ifndef _ASM_IRQ_VECTORS_H
40771+#define _ASM_IRQ_VECTORS_H
40772+
40773+/*
40774+ * IDT vectors usable for external interrupt sources start
40775+ * at 0x20:
40776+ */
40777+#define FIRST_EXTERNAL_VECTOR 0x20
40778+
40779+#define SYSCALL_VECTOR 0x80
40780+
40781+/*
40782+ * Vectors 0x20-0x2f are used for ISA interrupts.
40783+ */
40784+
40785+#if 0
40786+/*
40787+ * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
40788+ *
40789+ * some of the following vectors are 'rare', they are merged
40790+ * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
40791+ * TLB, reschedule and local APIC vectors are performance-critical.
40792+ *
40793+ * Vectors 0xf0-0xfa are free (reserved for future Linux use).
40794+ */
40795+#define SPURIOUS_APIC_VECTOR 0xff
40796+#define ERROR_APIC_VECTOR 0xfe
40797+#define INVALIDATE_TLB_VECTOR 0xfd
40798+#define RESCHEDULE_VECTOR 0xfc
40799+#define CALL_FUNCTION_VECTOR 0xfb
40800+
40801+#define THERMAL_APIC_VECTOR 0xf0
40802+/*
40803+ * Local APIC timer IRQ vector is on a different priority level,
40804+ * to work around the 'lost local interrupt if more than 2 IRQ
40805+ * sources per level' errata.
40806+ */
40807+#define LOCAL_TIMER_VECTOR 0xef
40808+#endif
40809+
40810+#define SPURIOUS_APIC_VECTOR 0xff
40811+#define ERROR_APIC_VECTOR 0xfe
40812+
40813+/*
40814+ * First APIC vector available to drivers: (vectors 0x30-0xee)
40815+ * we start at 0x31 to spread out vectors evenly between priority
40816+ * levels. (0x80 is the syscall vector)
40817+ */
40818+#define FIRST_DEVICE_VECTOR 0x31
40819+#define FIRST_SYSTEM_VECTOR 0xef
40820+
40821+/*
40822+ * 16 8259A IRQs, 208 potential APIC interrupt sources.
40823+ * Right now the APIC is mostly only used for SMP.
40824+ * 256 vectors is an architectural limit. (we can have
40825+ * more than 256 devices theoretically, but they will
40826+ * have to use shared interrupts)
40827+ * Since vectors 0x00-0x1f are used/reserved for the CPU,
40828+ * the usable vector space is 0x20-0xff (224 vectors)
40829+ */
40830+
40831+#define RESCHEDULE_VECTOR 0
40832+#define CALL_FUNCTION_VECTOR 1
40833+#define NR_IPIS 2
40834+
40835+/*
40836+ * The maximum number of vectors supported by i386 processors
40837+ * is limited to 256. For processors other than i386, NR_VECTORS
40838+ * should be changed accordingly.
40839+ */
40840+#define NR_VECTORS 256
40841+
40842+#define FPU_IRQ 13
40843+
40844+#define FIRST_VM86_IRQ 3
40845+#define LAST_VM86_IRQ 15
40846+#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
40847+
40848+/*
40849+ * The flat IRQ space is divided into two regions:
40850+ * 1. A one-to-one mapping of real physical IRQs. This space is only used
40851+ * if we have physical device-access privilege. This region is at the
40852+ * start of the IRQ space so that existing device drivers do not need
40853+ * to be modified to translate physical IRQ numbers into our IRQ space.
40854+ * 2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
40855+ * are bound using the provided bind/unbind functions.
40856+ */
40857+
40858+#define PIRQ_BASE 0
40859+#if !defined(MAX_IO_APICS)
40860+# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
40861+#elif NR_CPUS < MAX_IO_APICS
40862+# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
40863+#else
40864+# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
40865+#endif
40866+
40867+#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
40868+#define NR_DYNIRQS 256
40869+
40870+#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
40871+#define NR_IRQ_VECTORS NR_IRQS
40872+
40873+#endif /* _ASM_IRQ_VECTORS_H */
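For illustration (not part of the patch): the flat IRQ space defined above places the NR_PIRQS physical IRQs first and the NR_DYNIRQS dynamic (event-channel) IRQs directly after them. Hypothetical conversion helpers would look like:

/* Hypothetical helpers, shown only to illustrate the layout above. */
static inline int pirq_to_irq(int pirq)
{
	return PIRQ_BASE + pirq;	/* 0 .. NR_PIRQS - 1 */
}

static inline int dynirq_to_irq(int dynirq)
{
	return DYNIRQ_BASE + dynirq;	/* NR_PIRQS .. NR_IRQS - 1 */
}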
40874Index: head-2008-11-25/include/asm-x86/mach-xen/mach_traps.h
40875===================================================================
40876--- /dev/null 1970-01-01 00:00:00.000000000 +0000
40877+++ head-2008-11-25/include/asm-x86/mach-xen/mach_traps.h 2007-06-12 13:14:02.000000000 +0200
40878@@ -0,0 +1,33 @@
40879+/*
40880+ * include/asm-xen/asm-i386/mach-xen/mach_traps.h
40881+ *
40882+ * Machine specific NMI handling for Xen
40883+ */
40884+#ifndef _MACH_TRAPS_H
40885+#define _MACH_TRAPS_H
40886+
40887+#include <linux/bitops.h>
40888+#include <xen/interface/nmi.h>
40889+
40890+static inline void clear_mem_error(unsigned char reason) {}
40891+static inline void clear_io_check_error(unsigned char reason) {}
40892+
40893+static inline unsigned char get_nmi_reason(void)
40894+{
40895+ shared_info_t *s = HYPERVISOR_shared_info;
40896+ unsigned char reason = 0;
40897+
40898+ /* construct a value which looks like it came from
40899+ * port 0x61.
40900+ */
40901+ if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
40902+ reason |= 0x40;
40903+ if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
40904+ reason |= 0x80;
40905+
40906+ return reason;
40907+}
40908+
40909+static inline void reassert_nmi(void) {}
40910+
40911+#endif /* !_MACH_TRAPS_H */
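For illustration (not part of the patch): get_nmi_reason() synthesises a port-0x61-style status byte from the shared info page, so a caller decodes it with the usual bit masks. A hypothetical check:

/* Hypothetical helper; not defined in this patch. */
static inline int nmi_was_memory_parity_error(void)
{
	/* Bit 7 of the synthesised port-0x61 value indicates a parity error. */
	return get_nmi_reason() & 0x80;
}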
40912Index: head-2008-11-25/include/asm-x86/mach-xen/setup_arch.h
40913===================================================================
40914--- /dev/null 1970-01-01 00:00:00.000000000 +0000
40915+++ head-2008-11-25/include/asm-x86/mach-xen/setup_arch.h 2007-06-12 13:14:02.000000000 +0200
40916@@ -0,0 +1,5 @@
40917+/* Hook to call BIOS initialisation function */
40918+
40919+#define ARCH_SETUP machine_specific_arch_setup();
40920+
40921+void __init machine_specific_arch_setup(void);
40922Index: head-2008-11-25/include/asm-x86/mach-xen/asm/desc_64.h
40923===================================================================
40924--- /dev/null 1970-01-01 00:00:00.000000000 +0000
40925+++ head-2008-11-25/include/asm-x86/mach-xen/asm/desc_64.h 2008-01-28 12:24:19.000000000 +0100
40926@@ -0,0 +1,265 @@
40927+/* Written 2000 by Andi Kleen */
40928+#ifndef __ARCH_DESC_H
40929+#define __ARCH_DESC_H
40930+
40931+#include <linux/threads.h>
40932+#include <asm/ldt.h>
40933+
40934+#ifndef __ASSEMBLY__
40935+
40936+#include <linux/string.h>
40937+#include <linux/smp.h>
40938+
40939+#include <asm/segment.h>
40940+#include <asm/mmu.h>
40941+
40942+// 8 byte segment descriptor
40943+struct desc_struct {
40944+ u16 limit0;
40945+ u16 base0;
40946+ unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
40947+ unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
40948+} __attribute__((packed));
40949+
40950+struct n_desc_struct {
40951+ unsigned int a,b;
40952+};
40953+
40954+enum {
40955+ GATE_INTERRUPT = 0xE,
40956+ GATE_TRAP = 0xF,
40957+ GATE_CALL = 0xC,
40958+};
40959+
40960+// 16byte gate
40961+struct gate_struct {
40962+ u16 offset_low;
40963+ u16 segment;
40964+ unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
40965+ u16 offset_middle;
40966+ u32 offset_high;
40967+ u32 zero1;
40968+} __attribute__((packed));
40969+
40970+#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF)
40971+#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF)
40972+#define PTR_HIGH(x) ((unsigned long)(x) >> 32)
40973+
40974+enum {
40975+ DESC_TSS = 0x9,
40976+ DESC_LDT = 0x2,
40977+};
40978+
40979+// LDT or TSS descriptor in the GDT. 16 bytes.
40980+struct ldttss_desc {
40981+ u16 limit0;
40982+ u16 base0;
40983+ unsigned base1 : 8, type : 5, dpl : 2, p : 1;
40984+ unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
40985+ u32 base3;
40986+ u32 zero1;
40987+} __attribute__((packed));
40988+
40989+struct desc_ptr {
40990+ unsigned short size;
40991+ unsigned long address;
40992+} __attribute__((packed)) ;
40993+
40994+extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
40995+
40996+extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
40997+
40998+#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
40999+#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
41000+
41001+static inline void clear_LDT(void)
41002+{
41003+ int cpu = get_cpu();
41004+
41005+ /*
41006+ * NB. We load the default_ldt for lcall7/27 handling on demand, as
41007+ * it slows down context switching. No one uses it anyway.
41008+ */
41009+ cpu = cpu; /* XXX avoid compiler warning */
41010+ xen_set_ldt(NULL, 0);
41011+ put_cpu();
41012+}
41013+
41014+/*
41015+ * This is the ldt that every process will get unless we need
41016+ * something other than this.
41017+ */
41018+extern struct desc_struct default_ldt[];
41019+#ifndef CONFIG_X86_NO_IDT
41020+extern struct gate_struct idt_table[];
41021+#endif
41022+extern struct desc_ptr cpu_gdt_descr[];
41023+
41024+/* the cpu gdt accessor */
41025+#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
41026+
41027+static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)
41028+{
41029+ struct gate_struct s;
41030+ s.offset_low = PTR_LOW(func);
41031+ s.segment = __KERNEL_CS;
41032+ s.ist = ist;
41033+ s.p = 1;
41034+ s.dpl = dpl;
41035+ s.zero0 = 0;
41036+ s.zero1 = 0;
41037+ s.type = type;
41038+ s.offset_middle = PTR_MIDDLE(func);
41039+ s.offset_high = PTR_HIGH(func);
41040+ /* does not need to be atomic because it is only done once at setup time */
41041+ memcpy(adr, &s, 16);
41042+}
41043+
41044+#ifndef CONFIG_X86_NO_IDT
41045+static inline void set_intr_gate(int nr, void *func)
41046+{
41047+ BUG_ON((unsigned)nr > 0xFF);
41048+ _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
41049+}
41050+
41051+static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
41052+{
41053+ BUG_ON((unsigned)nr > 0xFF);
41054+ _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
41055+}
41056+
41057+static inline void set_system_gate(int nr, void *func)
41058+{
41059+ BUG_ON((unsigned)nr > 0xFF);
41060+ _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
41061+}
41062+
41063+static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
41064+{
41065+ _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
41066+}
41067+#endif
41068+
41069+static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type,
41070+ unsigned size)
41071+{
41072+ struct ldttss_desc d;
41073+ memset(&d,0,sizeof(d));
41074+ d.limit0 = size & 0xFFFF;
41075+ d.base0 = PTR_LOW(tss);
41076+ d.base1 = PTR_MIDDLE(tss) & 0xFF;
41077+ d.type = type;
41078+ d.p = 1;
41079+ d.limit1 = (size >> 16) & 0xF;
41080+ d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF;
41081+ d.base3 = PTR_HIGH(tss);
41082+ memcpy(ptr, &d, 16);
41083+}
41084+
41085+#ifndef CONFIG_X86_NO_TSS
41086+static inline void set_tss_desc(unsigned cpu, void *addr)
41087+{
41088+ /*
41089+ * The sizeof(unsigned long) comes from an extra "long" at the end
41090+ * of the iobitmap. See the tss_struct definition in processor.h.
41091+ *
41092+ * The -1 is because the segment base+limit should point to the
41093+ * address of the last valid byte.
41094+ */
41095+ set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS],
41096+ (unsigned long)addr, DESC_TSS,
41097+ IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
41098+}
41099+#endif
41100+
41101+static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
41102+{
41103+ set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
41104+ DESC_LDT, size * 8 - 1);
41105+}
41106+
41107+static inline void set_seg_base(unsigned cpu, int entry, void *base)
41108+{
41109+ struct desc_struct *d = &cpu_gdt(cpu)[entry];
41110+ u32 addr = (u32)(u64)base;
41111+ BUG_ON((u64)base >> 32);
41112+ d->base0 = addr & 0xffff;
41113+ d->base1 = (addr >> 16) & 0xff;
41114+ d->base2 = (addr >> 24) & 0xff;
41115+}
41116+
41117+#define LDT_entry_a(info) \
41118+ ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
41119+/* Don't allow setting of the lm bit. It is useless anyway because
41120+ 64bit system calls require __USER_CS. */
41121+#define LDT_entry_b(info) \
41122+ (((info)->base_addr & 0xff000000) | \
41123+ (((info)->base_addr & 0x00ff0000) >> 16) | \
41124+ ((info)->limit & 0xf0000) | \
41125+ (((info)->read_exec_only ^ 1) << 9) | \
41126+ ((info)->contents << 10) | \
41127+ (((info)->seg_not_present ^ 1) << 15) | \
41128+ ((info)->seg_32bit << 22) | \
41129+ ((info)->limit_in_pages << 23) | \
41130+ ((info)->useable << 20) | \
41131+ /* ((info)->lm << 21) | */ \
41132+ 0x7000)
41133+
41134+#define LDT_empty(info) (\
41135+ (info)->base_addr == 0 && \
41136+ (info)->limit == 0 && \
41137+ (info)->contents == 0 && \
41138+ (info)->read_exec_only == 1 && \
41139+ (info)->seg_32bit == 0 && \
41140+ (info)->limit_in_pages == 0 && \
41141+ (info)->seg_not_present == 1 && \
41142+ (info)->useable == 0 && \
41143+ (info)->lm == 0)
41144+
41145+#if TLS_SIZE != 24
41146+# error update this code.
41147+#endif
41148+
41149+static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
41150+{
41151+#if 0
41152+ u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
41153+ gdt[0] = t->tls_array[0];
41154+ gdt[1] = t->tls_array[1];
41155+ gdt[2] = t->tls_array[2];
41156+#endif
41157+#define C(i) \
41158+ if (HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), \
41159+ t->tls_array[i])) \
41160+ BUG();
41161+
41162+ C(0); C(1); C(2);
41163+#undef C
41164+}
41165+
41166+/*
41167+ * load one particular LDT into the current CPU
41168+ */
41169+static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
41170+{
41171+ void *segments = pc->ldt;
41172+ int count = pc->size;
41173+
41174+ if (likely(!count))
41175+ segments = NULL;
41176+
41177+ xen_set_ldt(segments, count);
41178+}
41179+
41180+static inline void load_LDT(mm_context_t *pc)
41181+{
41182+ int cpu = get_cpu();
41183+ load_LDT_nolock(pc, cpu);
41184+ put_cpu();
41185+}
41186+
41187+extern struct desc_ptr idt_descr;
41188+
41189+#endif /* !__ASSEMBLY__ */
41190+
41191+#endif
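For illustration (not part of the patch): when CONFIG_X86_NO_IDT is not set, trap setup code installs handlers through the gate helpers above. A hypothetical sketch, assuming an externally defined entry point:

/* Hypothetical example; 'example_trap_entry' is assumed, not provided here. */
extern void example_trap_entry(void);

static void install_example_gates(void)
{
	set_intr_gate(14, example_trap_entry);		/* DPL 0 gate */
	set_system_gate(0x80, example_trap_entry);	/* DPL 3 gate */
}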
41192Index: head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_64.h
41193===================================================================
41194--- /dev/null 1970-01-01 00:00:00.000000000 +0000
41195+++ head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_64.h 2007-06-12 13:14:13.000000000 +0200
41196@@ -0,0 +1,207 @@
41197+#ifndef _X8664_DMA_MAPPING_H
41198+#define _X8664_DMA_MAPPING_H 1
41199+
41200+/*
41201+ * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
41202+ * documentation.
41203+ */
41204+
41205+
41206+#include <asm/scatterlist.h>
41207+#include <asm/io.h>
41208+#include <asm/swiotlb.h>
41209+
41210+struct dma_mapping_ops {
41211+ int (*mapping_error)(dma_addr_t dma_addr);
41212+ void* (*alloc_coherent)(struct device *dev, size_t size,
41213+ dma_addr_t *dma_handle, gfp_t gfp);
41214+ void (*free_coherent)(struct device *dev, size_t size,
41215+ void *vaddr, dma_addr_t dma_handle);
41216+ dma_addr_t (*map_single)(struct device *hwdev, void *ptr,
41217+ size_t size, int direction);
41218+ /* like map_single, but doesn't check the device mask */
41219+ dma_addr_t (*map_simple)(struct device *hwdev, char *ptr,
41220+ size_t size, int direction);
41221+ void (*unmap_single)(struct device *dev, dma_addr_t addr,
41222+ size_t size, int direction);
41223+ void (*sync_single_for_cpu)(struct device *hwdev,
41224+ dma_addr_t dma_handle, size_t size,
41225+ int direction);
41226+ void (*sync_single_for_device)(struct device *hwdev,
41227+ dma_addr_t dma_handle, size_t size,
41228+ int direction);
41229+ void (*sync_single_range_for_cpu)(struct device *hwdev,
41230+ dma_addr_t dma_handle, unsigned long offset,
41231+ size_t size, int direction);
41232+ void (*sync_single_range_for_device)(struct device *hwdev,
41233+ dma_addr_t dma_handle, unsigned long offset,
41234+ size_t size, int direction);
41235+ void (*sync_sg_for_cpu)(struct device *hwdev,
41236+ struct scatterlist *sg, int nelems,
41237+ int direction);
41238+ void (*sync_sg_for_device)(struct device *hwdev,
41239+ struct scatterlist *sg, int nelems,
41240+ int direction);
41241+ int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
41242+ int nents, int direction);
41243+ void (*unmap_sg)(struct device *hwdev,
41244+ struct scatterlist *sg, int nents,
41245+ int direction);
41246+ int (*dma_supported)(struct device *hwdev, u64 mask);
41247+ int is_phys;
41248+};
41249+
41250+extern dma_addr_t bad_dma_address;
41251+extern struct dma_mapping_ops* dma_ops;
41252+extern int iommu_merge;
41253+
41254+static inline int valid_dma_direction(int dma_direction)
41255+{
41256+ return ((dma_direction == DMA_BIDIRECTIONAL) ||
41257+ (dma_direction == DMA_TO_DEVICE) ||
41258+ (dma_direction == DMA_FROM_DEVICE));
41259+}
41260+
41261+#if 0
41262+static inline int dma_mapping_error(dma_addr_t dma_addr)
41263+{
41264+ if (dma_ops->mapping_error)
41265+ return dma_ops->mapping_error(dma_addr);
41266+
41267+ return (dma_addr == bad_dma_address);
41268+}
41269+
41270+extern void *dma_alloc_coherent(struct device *dev, size_t size,
41271+ dma_addr_t *dma_handle, gfp_t gfp);
41272+extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
41273+ dma_addr_t dma_handle);
41274+
41275+static inline dma_addr_t
41276+dma_map_single(struct device *hwdev, void *ptr, size_t size,
41277+ int direction)
41278+{
41279+ BUG_ON(!valid_dma_direction(direction));
41280+ return dma_ops->map_single(hwdev, ptr, size, direction);
41281+}
41282+
41283+static inline void
41284+dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
41285+ int direction)
41286+{
41287+ BUG_ON(!valid_dma_direction(direction));
41288+ dma_ops->unmap_single(dev, addr, size, direction);
41289+}
41290+
41291+#define dma_map_page(dev,page,offset,size,dir) \
41292+ dma_map_single((dev), page_address(page)+(offset), (size), (dir))
41293+
41294+#define dma_unmap_page dma_unmap_single
41295+
41296+static inline void
41297+dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
41298+ size_t size, int direction)
41299+{
41300+ BUG_ON(!valid_dma_direction(direction));
41301+ if (dma_ops->sync_single_for_cpu)
41302+ dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
41303+ direction);
41304+ flush_write_buffers();
41305+}
41306+
41307+static inline void
41308+dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
41309+ size_t size, int direction)
41310+{
41311+ BUG_ON(!valid_dma_direction(direction));
41312+ if (dma_ops->sync_single_for_device)
41313+ dma_ops->sync_single_for_device(hwdev, dma_handle, size,
41314+ direction);
41315+ flush_write_buffers();
41316+}
41317+
41318+static inline void
41319+dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
41320+ unsigned long offset, size_t size, int direction)
41321+{
41322+ BUG_ON(!valid_dma_direction(direction));
41323+ if (dma_ops->sync_single_range_for_cpu) {
41324+ dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction);
41325+ }
41326+
41327+ flush_write_buffers();
41328+}
41329+
41330+static inline void
41331+dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
41332+ unsigned long offset, size_t size, int direction)
41333+{
41334+ BUG_ON(!valid_dma_direction(direction));
41335+ if (dma_ops->sync_single_range_for_device)
41336+ dma_ops->sync_single_range_for_device(hwdev, dma_handle,
41337+ offset, size, direction);
41338+
41339+ flush_write_buffers();
41340+}
41341+
41342+static inline void
41343+dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
41344+ int nelems, int direction)
41345+{
41346+ BUG_ON(!valid_dma_direction(direction));
41347+ if (dma_ops->sync_sg_for_cpu)
41348+ dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
41349+ flush_write_buffers();
41350+}
41351+
41352+static inline void
41353+dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
41354+ int nelems, int direction)
41355+{
41356+ BUG_ON(!valid_dma_direction(direction));
41357+ if (dma_ops->sync_sg_for_device) {
41358+ dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
41359+ }
41360+
41361+ flush_write_buffers();
41362+}
41363+
41364+static inline int
41365+dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction)
41366+{
41367+ BUG_ON(!valid_dma_direction(direction));
41368+ return dma_ops->map_sg(hwdev, sg, nents, direction);
41369+}
41370+
41371+static inline void
41372+dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
41373+ int direction)
41374+{
41375+ BUG_ON(!valid_dma_direction(direction));
41376+ dma_ops->unmap_sg(hwdev, sg, nents, direction);
41377+}
41378+
41379+extern int dma_supported(struct device *hwdev, u64 mask);
41380+
41381+/* same for gart, swiotlb, and nommu */
41382+static inline int dma_get_cache_alignment(void)
41383+{
41384+ return boot_cpu_data.x86_clflush_size;
41385+}
41386+
41387+#define dma_is_consistent(h) 1
41388+
41389+extern int dma_set_mask(struct device *dev, u64 mask);
41390+
41391+static inline void
41392+dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir)
41393+{
41394+ flush_write_buffers();
41395+}
41396+
41397+extern struct device fallback_dev;
41398+extern int panic_on_overflow;
41399+#endif
41400+
41401+#endif /* _X8664_DMA_MAPPING_H */
41402+
41403+#include <asm-i386/mach-xen/asm/dma-mapping.h>
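For illustration (not part of the patch; note that the bulk of this header is compiled out with #if 0 and the effective definitions come from the included i386 header): a driver uses the streaming DMA calls declared above roughly as follows, with hypothetical names:

/* Hypothetical driver fragment. */
static dma_addr_t map_tx_buffer(struct device *dev, void *buf, size_t len)
{
	dma_addr_t handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

	/* ... program 'handle' into the hardware; once the transfer is
	 * done: dma_unmap_single(dev, handle, len, DMA_TO_DEVICE); */
	return handle;
}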
41404Index: head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_64.h
41405===================================================================
41406--- /dev/null 1970-01-01 00:00:00.000000000 +0000
41407+++ head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_64.h 2007-06-12 13:14:13.000000000 +0200
41408@@ -0,0 +1,112 @@
41409+/*
41410+ * fixmap.h: compile-time virtual memory allocation
41411+ *
41412+ * This file is subject to the terms and conditions of the GNU General Public
41413+ * License. See the file "COPYING" in the main directory of this archive
41414+ * for more details.
41415+ *
41416+ * Copyright (C) 1998 Ingo Molnar
41417+ */
41418+
41419+#ifndef _ASM_FIXMAP_H
41420+#define _ASM_FIXMAP_H
41421+
41422+#include <linux/kernel.h>
41423+#include <asm/apicdef.h>
41424+#include <asm/page.h>
41425+#include <asm/vsyscall.h>
41426+#include <asm/vsyscall32.h>
41427+#include <asm/acpi.h>
41428+
41429+/*
41430+ * Here we define all the compile-time 'special' virtual
41431+ * addresses. The point is to have a constant address at
41432+ * compile time, but to set the physical address only
41433+ * in the boot process.
41434+ *
41435+ * these 'compile-time allocated' memory buffers are
41436+ * fixed-size 4k pages (or larger if used with an increment
41437+ * higher than 1). Use set_fixmap(idx,phys) to associate
41438+ * physical memory with fixmap indices.
41439+ *
41440+ * TLB entries of such buffers will not be flushed across
41441+ * task switches.
41442+ */
41443+
41444+enum fixed_addresses {
41445+ VSYSCALL_LAST_PAGE,
41446+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
41447+ VSYSCALL_HPET,
41448+ FIX_HPET_BASE,
41449+#ifdef CONFIG_X86_LOCAL_APIC
41450+ FIX_APIC_BASE, /* local (CPU) APIC -- required for SMP or not */
41451+#endif
41452+#ifdef CONFIG_X86_IO_APIC
41453+ FIX_IO_APIC_BASE_0,
41454+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
41455+#endif
41456+#ifdef CONFIG_ACPI
41457+ FIX_ACPI_BEGIN,
41458+ FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
41459+#endif
41460+ FIX_SHARED_INFO,
41461+#define NR_FIX_ISAMAPS 256
41462+ FIX_ISAMAP_END,
41463+ FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
41464+ __end_of_permanent_fixed_addresses,
41465+ /* temporary boot-time mappings, used before ioremap() is functional */
41466+#define NR_FIX_BTMAPS 16
41467+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
41468+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
41469+ __end_of_fixed_addresses
41470+};
41471+
41472+extern void __set_fixmap (enum fixed_addresses idx,
41473+ unsigned long phys, pgprot_t flags);
41474+
41475+#define set_fixmap(idx, phys) \
41476+ __set_fixmap(idx, phys, PAGE_KERNEL)
41477+/*
41478+ * Some hardware wants to get fixmapped without caching.
41479+ */
41480+#define set_fixmap_nocache(idx, phys) \
41481+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
41482+
41483+#define clear_fixmap(idx) \
41484+ __set_fixmap(idx, 0, __pgprot(0))
41485+
41486+#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
41487+#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
41488+#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
41489+
41490+/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
41491+#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
41492+#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
41493+
41494+#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
41495+
41496+extern void __this_fixmap_does_not_exist(void);
41497+
41498+/*
41499+ * 'index to address' translation. If anyone tries to use the idx
41500+ * directly without translation, we catch the bug with a NULL-dereference
41501+ * kernel oops. Illegal ranges of incoming indices are caught too.
41502+ */
41503+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
41504+{
41505+ /*
41506+ * this branch gets completely eliminated after inlining,
41507+ * except when someone tries to use fixaddr indices in an
41508+ * illegal way. (such as mixing up address types or using
41509+ * out-of-range indices).
41510+ *
41511+ * If it doesn't get removed, the linker will complain
41512+ * loudly with a reasonably clear error message..
41513+ */
41514+ if (idx >= __end_of_fixed_addresses)
41515+ __this_fixmap_does_not_exist();
41516+
41517+ return __fix_to_virt(idx);
41518+}
41519+
41520+#endif
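For illustration (not part of the patch): a fixmap slot is bound to a physical page at run time and then accessed through its compile-time virtual address. A hypothetical sketch using the FIX_HPET_BASE slot defined above:

/* Hypothetical helper; not defined in this patch. */
static void __iomem *map_hpet_registers(unsigned long phys)
{
	set_fixmap_nocache(FIX_HPET_BASE, phys);
	return (void __iomem *)fix_to_virt(FIX_HPET_BASE);
}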
41521Index: head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_64.h
41522===================================================================
41523--- /dev/null 1970-01-01 00:00:00.000000000 +0000
41524+++ head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_64.h 2008-11-25 12:22:34.000000000 +0100
41525@@ -0,0 +1,408 @@
41526+/******************************************************************************
41527+ * hypercall.h
41528+ *
41529+ * Linux-specific hypervisor handling.
41530+ *
41531+ * Copyright (c) 2002-2004, K A Fraser
41532+ *
41533+ * 64-bit updates:
41534+ * Benjamin Liu <benjamin.liu@intel.com>
41535+ * Jun Nakajima <jun.nakajima@intel.com>
41536+ *
41537+ * This program is free software; you can redistribute it and/or
41538+ * modify it under the terms of the GNU General Public License version 2
41539+ * as published by the Free Software Foundation; or, when distributed
41540+ * separately from the Linux kernel or incorporated into other
41541+ * software packages, subject to the following license:
41542+ *
41543+ * Permission is hereby granted, free of charge, to any person obtaining a copy
41544+ * of this source file (the "Software"), to deal in the Software without
41545+ * restriction, including without limitation the rights to use, copy, modify,
41546+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
41547+ * and to permit persons to whom the Software is furnished to do so, subject to
41548+ * the following conditions:
41549+ *
41550+ * The above copyright notice and this permission notice shall be included in
41551+ * all copies or substantial portions of the Software.
41552+ *
41553+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41554+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
41555+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41556+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
41557+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
41558+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
41559+ * IN THE SOFTWARE.
41560+ */
41561+
41562+#ifndef __HYPERCALL_H__
41563+#define __HYPERCALL_H__
41564+
41565+#include <linux/string.h> /* memcpy() */
41566+#include <linux/stringify.h>
41567+
41568+#ifndef __HYPERVISOR_H__
41569+# error "please don't include this file directly"
41570+#endif
41571+
41572+#ifdef CONFIG_XEN
41573+#define HYPERCALL_STR(name) \
41574+ "call hypercall_page + ("__stringify(__HYPERVISOR_##name)" * 32)"
41575+#else
41576+#define HYPERCALL_STR(name) \
41577+ "mov $("__stringify(__HYPERVISOR_##name)" * 32),%%eax; "\
41578+ "add hypercall_stubs(%%rip),%%rax; " \
41579+ "call *%%rax"
41580+#endif
41581+
41582+#define _hypercall0(type, name) \
41583+({ \
41584+ type __res; \
41585+ asm volatile ( \
41586+ HYPERCALL_STR(name) \
41587+ : "=a" (__res) \
41588+ : \
41589+ : "memory" ); \
41590+ __res; \
41591+})
41592+
41593+#define _hypercall1(type, name, a1) \
41594+({ \
41595+ type __res; \
41596+ long __ign1; \
41597+ asm volatile ( \
41598+ HYPERCALL_STR(name) \
41599+ : "=a" (__res), "=D" (__ign1) \
41600+ : "1" ((long)(a1)) \
41601+ : "memory" ); \
41602+ __res; \
41603+})
41604+
41605+#define _hypercall2(type, name, a1, a2) \
41606+({ \
41607+ type __res; \
41608+ long __ign1, __ign2; \
41609+ asm volatile ( \
41610+ HYPERCALL_STR(name) \
41611+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2) \
41612+ : "1" ((long)(a1)), "2" ((long)(a2)) \
41613+ : "memory" ); \
41614+ __res; \
41615+})
41616+
41617+#define _hypercall3(type, name, a1, a2, a3) \
41618+({ \
41619+ type __res; \
41620+ long __ign1, __ign2, __ign3; \
41621+ asm volatile ( \
41622+ HYPERCALL_STR(name) \
41623+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
41624+ "=d" (__ign3) \
41625+ : "1" ((long)(a1)), "2" ((long)(a2)), \
41626+ "3" ((long)(a3)) \
41627+ : "memory" ); \
41628+ __res; \
41629+})
41630+
41631+#define _hypercall4(type, name, a1, a2, a3, a4) \
41632+({ \
41633+ type __res; \
41634+ long __ign1, __ign2, __ign3; \
41635+ register long __arg4 asm("r10") = (long)(a4); \
41636+ asm volatile ( \
41637+ HYPERCALL_STR(name) \
41638+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
41639+ "=d" (__ign3), "+r" (__arg4) \
41640+ : "1" ((long)(a1)), "2" ((long)(a2)), \
41641+ "3" ((long)(a3)) \
41642+ : "memory" ); \
41643+ __res; \
41644+})
41645+
41646+#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
41647+({ \
41648+ type __res; \
41649+ long __ign1, __ign2, __ign3; \
41650+ register long __arg4 asm("r10") = (long)(a4); \
41651+ register long __arg5 asm("r8") = (long)(a5); \
41652+ asm volatile ( \
41653+ HYPERCALL_STR(name) \
41654+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
41655+ "=d" (__ign3), "+r" (__arg4), "+r" (__arg5) \
41656+ : "1" ((long)(a1)), "2" ((long)(a2)), \
41657+ "3" ((long)(a3)) \
41658+ : "memory" ); \
41659+ __res; \
41660+})
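/*
 * Illustration only, not part of the original patch: with CONFIG_XEN,
 * _hypercall2(int, sched_op, cmd, arg) expands roughly to
 *
 *	asm volatile ( "call hypercall_page + (__HYPERVISOR_sched_op * 32)"
 *		: "=a" (__res), "=D" (__ign1), "=S" (__ign2)
 *		: "1" ((long)(cmd)), "2" ((long)(arg))
 *		: "memory" );
 *
 * i.e. the first two arguments are passed in %rdi/%rsi and the result is
 * returned in %eax, which is what the wrappers below rely on.
 */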
41661+
41662+static inline int __must_check
41663+HYPERVISOR_set_trap_table(
41664+ const trap_info_t *table)
41665+{
41666+ return _hypercall1(int, set_trap_table, table);
41667+}
41668+
41669+static inline int __must_check
41670+HYPERVISOR_mmu_update(
41671+ mmu_update_t *req, unsigned int count, unsigned int *success_count,
41672+ domid_t domid)
41673+{
41674+ return _hypercall4(int, mmu_update, req, count, success_count, domid);
41675+}
41676+
41677+static inline int __must_check
41678+HYPERVISOR_mmuext_op(
41679+ struct mmuext_op *op, unsigned int count, unsigned int *success_count,
41680+ domid_t domid)
41681+{
41682+ return _hypercall4(int, mmuext_op, op, count, success_count, domid);
41683+}
41684+
41685+static inline int __must_check
41686+HYPERVISOR_set_gdt(
41687+ unsigned long *frame_list, unsigned int entries)
41688+{
41689+ return _hypercall2(int, set_gdt, frame_list, entries);
41690+}
41691+
41692+static inline int __must_check
41693+HYPERVISOR_stack_switch(
41694+ unsigned long ss, unsigned long esp)
41695+{
41696+ return _hypercall2(int, stack_switch, ss, esp);
41697+}
41698+
41699+static inline int __must_check
41700+HYPERVISOR_set_callbacks(
41701+ unsigned long event_address, unsigned long failsafe_address,
41702+ unsigned long syscall_address)
41703+{
41704+ return _hypercall3(int, set_callbacks,
41705+ event_address, failsafe_address, syscall_address);
41706+}
41707+
41708+static inline int
41709+HYPERVISOR_fpu_taskswitch(
41710+ int set)
41711+{
41712+ return _hypercall1(int, fpu_taskswitch, set);
41713+}
41714+
41715+static inline int __must_check
41716+HYPERVISOR_sched_op_compat(
41717+ int cmd, unsigned long arg)
41718+{
41719+ return _hypercall2(int, sched_op_compat, cmd, arg);
41720+}
41721+
41722+static inline int __must_check
41723+HYPERVISOR_sched_op(
41724+ int cmd, void *arg)
41725+{
41726+ return _hypercall2(int, sched_op, cmd, arg);
41727+}
41728+
41729+static inline long __must_check
41730+HYPERVISOR_set_timer_op(
41731+ u64 timeout)
41732+{
41733+ return _hypercall1(long, set_timer_op, timeout);
41734+}
41735+
41736+static inline int __must_check
41737+HYPERVISOR_platform_op(
41738+ struct xen_platform_op *platform_op)
41739+{
41740+ platform_op->interface_version = XENPF_INTERFACE_VERSION;
41741+ return _hypercall1(int, platform_op, platform_op);
41742+}
41743+
41744+static inline int __must_check
41745+HYPERVISOR_set_debugreg(
41746+ unsigned int reg, unsigned long value)
41747+{
41748+ return _hypercall2(int, set_debugreg, reg, value);
41749+}
41750+
41751+static inline unsigned long __must_check
41752+HYPERVISOR_get_debugreg(
41753+ unsigned int reg)
41754+{
41755+ return _hypercall1(unsigned long, get_debugreg, reg);
41756+}
41757+
41758+static inline int __must_check
41759+HYPERVISOR_update_descriptor(
41760+ unsigned long ma, unsigned long word)
41761+{
41762+ return _hypercall2(int, update_descriptor, ma, word);
41763+}
41764+
41765+static inline int __must_check
41766+HYPERVISOR_memory_op(
41767+ unsigned int cmd, void *arg)
41768+{
41769+ return _hypercall2(int, memory_op, cmd, arg);
41770+}
41771+
41772+static inline int __must_check
41773+HYPERVISOR_multicall(
41774+ multicall_entry_t *call_list, unsigned int nr_calls)
41775+{
41776+ return _hypercall2(int, multicall, call_list, nr_calls);
41777+}
41778+
41779+static inline int __must_check
41780+HYPERVISOR_update_va_mapping(
41781+ unsigned long va, pte_t new_val, unsigned long flags)
41782+{
41783+ return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
41784+}
41785+
41786+static inline int __must_check
41787+HYPERVISOR_event_channel_op(
41788+ int cmd, void *arg)
41789+{
41790+ int rc = _hypercall2(int, event_channel_op, cmd, arg);
41791+
41792+#if CONFIG_XEN_COMPAT <= 0x030002
41793+ if (unlikely(rc == -ENOSYS)) {
41794+ struct evtchn_op op;
41795+ op.cmd = cmd;
41796+ memcpy(&op.u, arg, sizeof(op.u));
41797+ rc = _hypercall1(int, event_channel_op_compat, &op);
41798+ memcpy(arg, &op.u, sizeof(op.u));
41799+ }
41800+#endif
41801+
41802+ return rc;
41803+}
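+
+/*
+ * Illustration only: a typical caller binds an event channel roughly as
+ * follows (EVTCHNOP_alloc_unbound and struct evtchn_alloc_unbound are the
+ * generic Xen interface names; remote_domid and port are caller-local
+ * placeholders, and the exact struct passed depends on the cmd):
+ *
+ *	struct evtchn_alloc_unbound alloc = {
+ *		.dom		= DOMID_SELF,
+ *		.remote_dom	= remote_domid,
+ *	};
+ *	if (HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &alloc) == 0)
+ *		port = alloc.port;
+ *
+ * On hypervisors predating the multiplexed event_channel_op (Xen 3.0.2 and
+ * earlier), the -ENOSYS path above repacks the arguments into a struct
+ * evtchn_op and retries via event_channel_op_compat.
+ */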
41804+
41805+static inline int __must_check
41806+HYPERVISOR_xen_version(
41807+ int cmd, void *arg)
41808+{
41809+ return _hypercall2(int, xen_version, cmd, arg);
41810+}
41811+
41812+static inline int __must_check
41813+HYPERVISOR_console_io(
41814+ int cmd, unsigned int count, char *str)
41815+{
41816+ return _hypercall3(int, console_io, cmd, count, str);
41817+}
41818+
41819+static inline int __must_check
41820+HYPERVISOR_physdev_op(
41821+ int cmd, void *arg)
41822+{
41823+ int rc = _hypercall2(int, physdev_op, cmd, arg);
41824+
41825+#if CONFIG_XEN_COMPAT <= 0x030002
41826+ if (unlikely(rc == -ENOSYS)) {
41827+ struct physdev_op op;
41828+ op.cmd = cmd;
41829+ memcpy(&op.u, arg, sizeof(op.u));
41830+ rc = _hypercall1(int, physdev_op_compat, &op);
41831+ memcpy(arg, &op.u, sizeof(op.u));
41832+ }
41833+#endif
41834+
41835+ return rc;
41836+}
41837+
41838+static inline int __must_check
41839+HYPERVISOR_grant_table_op(
41840+ unsigned int cmd, void *uop, unsigned int count)
41841+{
41842+ return _hypercall3(int, grant_table_op, cmd, uop, count);
41843+}
41844+
41845+static inline int __must_check
41846+HYPERVISOR_update_va_mapping_otherdomain(
41847+ unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
41848+{
41849+ return _hypercall4(int, update_va_mapping_otherdomain, va,
41850+ new_val.pte, flags, domid);
41851+}
41852+
41853+static inline int __must_check
41854+HYPERVISOR_vm_assist(
41855+ unsigned int cmd, unsigned int type)
41856+{
41857+ return _hypercall2(int, vm_assist, cmd, type);
41858+}
41859+
41860+static inline int __must_check
41861+HYPERVISOR_vcpu_op(
41862+ int cmd, unsigned int vcpuid, void *extra_args)
41863+{
41864+ return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
41865+}
41866+
41867+static inline int __must_check
41868+HYPERVISOR_set_segment_base(
41869+ int reg, unsigned long value)
41870+{
41871+ return _hypercall2(int, set_segment_base, reg, value);
41872+}
41873+
41874+static inline int __must_check
41875+HYPERVISOR_suspend(
41876+ unsigned long srec)
41877+{
41878+ struct sched_shutdown sched_shutdown = {
41879+ .reason = SHUTDOWN_suspend
41880+ };
41881+
41882+ int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
41883+ &sched_shutdown, srec);
41884+
41885+#if CONFIG_XEN_COMPAT <= 0x030002
41886+ if (rc == -ENOSYS)
41887+ rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
41888+ SHUTDOWN_suspend, srec);
41889+#endif
41890+
41891+ return rc;
41892+}
41893+
41894+#if CONFIG_XEN_COMPAT <= 0x030002
41895+static inline int
41896+HYPERVISOR_nmi_op(
41897+ unsigned long op, void *arg)
41898+{
41899+ return _hypercall2(int, nmi_op, op, arg);
41900+}
41901+#endif
41902+
41903+#ifndef CONFIG_XEN
41904+static inline unsigned long __must_check
41905+HYPERVISOR_hvm_op(
41906+ int op, void *arg)
41907+{
41908+ return _hypercall2(unsigned long, hvm_op, op, arg);
41909+}
41910+#endif
41911+
41912+static inline int __must_check
41913+HYPERVISOR_callback_op(
41914+ int cmd, const void *arg)
41915+{
41916+ return _hypercall2(int, callback_op, cmd, arg);
41917+}
41918+
41919+static inline int __must_check
41920+HYPERVISOR_xenoprof_op(
41921+ int op, void *arg)
41922+{
41923+ return _hypercall2(int, xenoprof_op, op, arg);
41924+}
41925+
41926+static inline int __must_check
41927+HYPERVISOR_kexec_op(
41928+ unsigned long op, void *args)
41929+{
41930+ return _hypercall2(int, kexec_op, op, args);
41931+}
41932+
41933+#endif /* __HYPERCALL_H__ */
41934Index: head-2008-11-25/include/asm-x86/mach-xen/asm/io_64.h
41935===================================================================
41936--- /dev/null 1970-01-01 00:00:00.000000000 +0000
41937+++ head-2008-11-25/include/asm-x86/mach-xen/asm/io_64.h 2007-08-16 18:07:01.000000000 +0200
41938@@ -0,0 +1,329 @@
41939+#ifndef _ASM_IO_H
41940+#define _ASM_IO_H
41941+
41942+#include <asm/fixmap.h>
41943+
41944+/*
41945+ * This file contains the definitions for the x86 IO instructions
41946+ * inb/inw/inl/outb/outw/outl and the "string versions" of the same
41947+ * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
41948+ * versions of the single-IO instructions (inb_p/inw_p/..).
41949+ *
41950+ * This file is not meant to be obfuscating: it's just complicated
41951+ * to (a) handle it all in a way that makes gcc able to optimize it
41952+ * as well as possible and (b) avoid writing the same thing
41953+ * over and over again with slight variations and possibly making a
41954+ * mistake somewhere.
41955+ */
41956+
41957+/*
41958+ * Thanks to James van Artsdalen for a better timing-fix than
41959+ * the two short jumps: using outb's to a nonexistent port seems
41960+ * to guarantee better timings even on fast machines.
41961+ *
41962+ * On the other hand, I'd like to be sure of a non-existent port:
41963+ * I feel a bit unsafe about using 0x80 (should be safe, though)
41964+ *
41965+ * Linus
41966+ */
41967+
41968+ /*
41969+ * Bit simplified and optimized by Jan Hubicka
41970+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
41971+ *
41972+ * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
41973+ * isa_read[wl] and isa_write[wl] fixed
41974+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
41975+ */
41976+
41977+#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
41978+
41979+#ifdef REALLY_SLOW_IO
41980+#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
41981+#else
41982+#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
41983+#endif
41984+
41985+/*
41986+ * Talk about misusing macros..
41987+ */
41988+#define __OUT1(s,x) \
41989+static inline void out##s(unsigned x value, unsigned short port) {
41990+
41991+#define __OUT2(s,s1,s2) \
41992+__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
41993+
41994+#define __OUT(s,s1,x) \
41995+__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
41996+__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \
41997+
41998+#define __IN1(s) \
41999+static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
42000+
42001+#define __IN2(s,s1,s2) \
42002+__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
42003+
42004+#define __IN(s,s1,i...) \
42005+__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
42006+__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
42007+
42008+#define __INS(s) \
42009+static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
42010+{ __asm__ __volatile__ ("rep ; ins" #s \
42011+: "=D" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
42012+
42013+#define __OUTS(s) \
42014+static inline void outs##s(unsigned short port, const void * addr, unsigned long count) \
42015+{ __asm__ __volatile__ ("rep ; outs" #s \
42016+: "=S" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
42017+
42018+#define RETURN_TYPE unsigned char
42019+__IN(b,"")
42020+#undef RETURN_TYPE
42021+#define RETURN_TYPE unsigned short
42022+__IN(w,"")
42023+#undef RETURN_TYPE
42024+#define RETURN_TYPE unsigned int
42025+__IN(l,"")
42026+#undef RETURN_TYPE
42027+
42028+__OUT(b,"b",char)
42029+__OUT(w,"w",short)
42030+__OUT(l,,int)
42031+
42032+__INS(b)
42033+__INS(w)
42034+__INS(l)
42035+
42036+__OUTS(b)
42037+__OUTS(w)
42038+__OUTS(l)
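+
+/*
+ * For reference, the instantiations above expand (approximately) to the
+ * usual port-I/O helpers, e.g. __IN(b,"") yields
+ *
+ *	static inline unsigned char inb(unsigned short port);
+ *	static inline unsigned char inb_p(unsigned short port);
+ *
+ * and __OUT(b,"b",char) yields outb()/outb_p(), with the _p variants
+ * appending the dummy write(s) to port 0x80 from __FULL_SLOW_DOWN_IO.
+ */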
42039+
42040+#define IO_SPACE_LIMIT 0xffff
42041+
42042+#if defined(__KERNEL__) && __x86_64__
42043+
42044+#include <linux/vmalloc.h>
42045+
42046+#ifndef __i386__
42047+/*
42048+ * Change virtual addresses to physical addresses and vv.
42049+ * These are pretty trivial
42050+ */
42051+static inline unsigned long virt_to_phys(volatile void * address)
42052+{
42053+ return __pa(address);
42054+}
42055+
42056+static inline void * phys_to_virt(unsigned long address)
42057+{
42058+ return __va(address);
42059+}
42060+
42061+#define virt_to_bus(_x) phys_to_machine(__pa(_x))
42062+#define bus_to_virt(_x) __va(machine_to_phys(_x))
42063+#endif
42064+
42065+/*
42066+ * Change "struct page" to physical address.
42067+ */
42068+#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
42069+#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
42070+#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
42071+
42072+#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \
42073+ (unsigned long) bio_offset((bio)))
42074+#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
42075+ (unsigned long) (bv)->bv_offset)
42076+
42077+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
42078+ (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
42079+ ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
42080+ bvec_to_pseudophys((vec2))))
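+
+/*
+ * Note: under Xen two bio_vecs only merge when they are contiguous both in
+ * machine space (bvec_to_phys(), which maps through the page_to_phys()
+ * above, i.e. phys_to_machine()) and in pseudo-physical space
+ * (bvec_to_pseudophys()); frames contiguous in one space need not be
+ * contiguous in the other.
+ */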
42081+
42082+#include <asm-generic/iomap.h>
42083+
42084+extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags);
42085+
42086+static inline void __iomem * ioremap (unsigned long offset, unsigned long size)
42087+{
42088+ return __ioremap(offset, size, 0);
42089+}
42090+
42091+extern void *bt_ioremap(unsigned long addr, unsigned long size);
42092+extern void bt_iounmap(void *addr, unsigned long size);
42093+#define early_ioremap bt_ioremap
42094+#define early_iounmap bt_iounmap
42095+
42096+/*
42097+ * This one maps high address device memory and turns off caching for that area.
42098+ * It's useful if some control registers are in such an area and write combining
42099+ * or read caching is not desirable:
42100+ */
42101+extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
42102+extern void iounmap(volatile void __iomem *addr);
42103+
42104+/*
42105+ * ISA I/O bus memory addresses are 1:1 with the physical address.
42106+ */
42107+
42108+#define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); })
42109+#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
42110+#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
42111+
42112+/*
42113+ * However PCI ones are not necessarily 1:1 and therefore these interfaces
42114+ * are forbidden in portable PCI drivers.
42115+ *
42116+ * Allow them on x86 for legacy drivers, though.
42117+ */
42118+#define virt_to_bus(_x) phys_to_machine(__pa(_x))
42119+#define bus_to_virt(_x) __va(machine_to_phys(_x))
42120+
42121+/*
42122+ * readX/writeX() are used to access memory mapped devices. On some
42123+ * architectures the memory mapped IO stuff needs to be accessed
42124+ * differently. On the x86 architecture, we just read/write the
42125+ * memory location directly.
42126+ */
42127+
42128+static inline __u8 __readb(const volatile void __iomem *addr)
42129+{
42130+ return *(__force volatile __u8 *)addr;
42131+}
42132+static inline __u16 __readw(const volatile void __iomem *addr)
42133+{
42134+ return *(__force volatile __u16 *)addr;
42135+}
42136+static __always_inline __u32 __readl(const volatile void __iomem *addr)
42137+{
42138+ return *(__force volatile __u32 *)addr;
42139+}
42140+static inline __u64 __readq(const volatile void __iomem *addr)
42141+{
42142+ return *(__force volatile __u64 *)addr;
42143+}
42144+#define readb(x) __readb(x)
42145+#define readw(x) __readw(x)
42146+#define readl(x) __readl(x)
42147+#define readq(x) __readq(x)
42148+#define readb_relaxed(a) readb(a)
42149+#define readw_relaxed(a) readw(a)
42150+#define readl_relaxed(a) readl(a)
42151+#define readq_relaxed(a) readq(a)
42152+#define __raw_readb readb
42153+#define __raw_readw readw
42154+#define __raw_readl readl
42155+#define __raw_readq readq
42156+
42157+#define mmiowb()
42158+
42159+static inline void __writel(__u32 b, volatile void __iomem *addr)
42160+{
42161+ *(__force volatile __u32 *)addr = b;
42162+}
42163+static inline void __writeq(__u64 b, volatile void __iomem *addr)
42164+{
42165+ *(__force volatile __u64 *)addr = b;
42166+}
42167+static inline void __writeb(__u8 b, volatile void __iomem *addr)
42168+{
42169+ *(__force volatile __u8 *)addr = b;
42170+}
42171+static inline void __writew(__u16 b, volatile void __iomem *addr)
42172+{
42173+ *(__force volatile __u16 *)addr = b;
42174+}
42175+#define writeq(val,addr) __writeq((val),(addr))
42176+#define writel(val,addr) __writel((val),(addr))
42177+#define writew(val,addr) __writew((val),(addr))
42178+#define writeb(val,addr) __writeb((val),(addr))
42179+#define __raw_writeb writeb
42180+#define __raw_writew writew
42181+#define __raw_writel writel
42182+#define __raw_writeq writeq
42183+
42184+void __memcpy_fromio(void*,unsigned long,unsigned);
42185+void __memcpy_toio(unsigned long,const void*,unsigned);
42186+
42187+static inline void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned len)
42188+{
42189+ __memcpy_fromio(to,(unsigned long)from,len);
42190+}
42191+static inline void memcpy_toio(volatile void __iomem *to, const void *from, unsigned len)
42192+{
42193+ __memcpy_toio((unsigned long)to,from,len);
42194+}
42195+
42196+void memset_io(volatile void __iomem *a, int b, size_t c);
42197+
42198+/*
42199+ * ISA space is 'always mapped' on a typical x86 system, no need to
42200+ * explicitly ioremap() it. The fact that the ISA IO space is mapped
42201+ * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
42202+ * are physical addresses. The following constant pointer can be
42203+ * used as the IO-area pointer (it can be iounmapped as well, so the
42204+ * analogy with PCI is quite large):
42205+ */
42206+#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
42207+
42208+/*
42209+ * Again, x86-64 does not require memory-I/O-specific functions.
42210+ */
42211+
42212+#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void *)(b),(c),(d))
42213+
42214+/**
42215+ * check_signature - find BIOS signatures
42216+ * @io_addr: mmio address to check
42217+ * @signature: signature block
42218+ * @length: length of signature
42219+ *
42220+ * Perform a signature comparison with the mmio address io_addr. This
42221+ * address should have been obtained by ioremap.
42222+ * Returns 1 on a match.
42223+ */
42224+
42225+static inline int check_signature(void __iomem *io_addr,
42226+ const unsigned char *signature, int length)
42227+{
42228+ int retval = 0;
42229+ do {
42230+ if (readb(io_addr) != *signature)
42231+ goto out;
42232+ io_addr++;
42233+ signature++;
42234+ length--;
42235+ } while (length);
42236+ retval = 1;
42237+out:
42238+ return retval;
42239+}
42240+
42241+/* Nothing to do */
42242+
42243+#define dma_cache_inv(_start,_size) do { } while (0)
42244+#define dma_cache_wback(_start,_size) do { } while (0)
42245+#define dma_cache_wback_inv(_start,_size) do { } while (0)
42246+
42247+#define flush_write_buffers()
42248+
42249+extern int iommu_bio_merge;
42250+#define BIO_VMERGE_BOUNDARY iommu_bio_merge
42251+
42252+/*
42253+ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
42254+ * access
42255+ */
42256+#define xlate_dev_mem_ptr(p) __va(p)
42257+
42258+/*
42259+ * Convert a virtual cached pointer to an uncached pointer
42260+ */
42261+#define xlate_dev_kmem_ptr(p) p
42262+
42263+#endif /* __KERNEL__ */
42264+
42265+#define ARCH_HAS_DEV_MEM
42266+
42267+#endif
42268Index: head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_64.h
42269===================================================================
42270--- /dev/null 1970-01-01 00:00:00.000000000 +0000
42271+++ head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_64.h 2007-06-12 13:14:13.000000000 +0200
42272@@ -0,0 +1,139 @@
42273+/*
42274+ * include/asm-x86_64/irqflags.h
42275+ *
42276+ * IRQ flags handling
42277+ *
42278+ * This file gets included from lowlevel asm headers too, to provide
42279+ * wrapped versions of the local_irq_*() APIs, based on the
42280+ * raw_local_irq_*() functions from the lowlevel headers.
42281+ */
42282+#ifndef _ASM_IRQFLAGS_H
42283+#define _ASM_IRQFLAGS_H
42284+
42285+#ifndef __ASSEMBLY__
42286+/*
42287+ * Interrupt control:
42288+ */
42289+
42290+/*
42291+ * The uses of 'barrier' below reflect their role as local-lock
42292+ * operations. Reentrancy must be prevented (e.g., by __cli()) /before/ the
42293+ * critical operations that follow are executed. All critical operations
42294+ * must complete /before/ reentrancy is permitted again (e.g., by __sti()).
42295+ * The Alpha architecture also includes these barriers, for example.
42296+ */
42297+
42298+#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
42299+
42300+#define raw_local_save_flags(flags) \
42301+ do { (flags) = __raw_local_save_flags(); } while (0)
42302+
42303+#define raw_local_irq_restore(x) \
42304+do { \
42305+ vcpu_info_t *_vcpu; \
42306+ barrier(); \
42307+ _vcpu = current_vcpu_info(); \
42308+ if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
42309+ barrier(); /* unmask then check (avoid races) */ \
42310+ if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
42311+ force_evtchn_callback(); \
42312+ } \
42313+} while (0)
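+
+/*
+ * Sketch of the usual pairing built on these primitives (see
+ * raw_local_irq_save() further down):
+ *
+ *	unsigned long flags;
+ *
+ *	raw_local_irq_save(flags);
+ *	... critical section ...
+ *	raw_local_irq_restore(flags);
+ *
+ * Here "flags" holds the saved evtchn_upcall_mask byte rather than EFLAGS;
+ * restoring 0 re-enables event delivery and forces an upcall callback if
+ * an event became pending while delivery was masked.
+ */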
42314+
42315+#ifdef CONFIG_X86_VSMP
42316+
42317+/*
42318+ * Interrupt control for the VSMP architecture:
42319+ */
42320+
42321+static inline void raw_local_irq_disable(void)
42322+{
42323+ unsigned long flags = __raw_local_save_flags();
42324+
42325+ raw_local_irq_restore((flags & ~(1 << 9)) | (1 << 18));
42326+}
42327+
42328+static inline void raw_local_irq_enable(void)
42329+{
42330+ unsigned long flags = __raw_local_save_flags();
42331+
42332+ raw_local_irq_restore((flags | (1 << 9)) & ~(1 << 18));
42333+}
42334+
42335+static inline int raw_irqs_disabled_flags(unsigned long flags)
42336+{
42337+ return !(flags & (1<<9)) || (flags & (1 << 18));
42338+}
42339+
42340+#else /* CONFIG_X86_VSMP */
42341+
42342+#define raw_local_irq_disable() \
42343+do { \
42344+ current_vcpu_info()->evtchn_upcall_mask = 1; \
42345+ barrier(); \
42346+} while (0)
42347+
42348+#define raw_local_irq_enable() \
42349+do { \
42350+ vcpu_info_t *_vcpu; \
42351+ barrier(); \
42352+ _vcpu = current_vcpu_info(); \
42353+ _vcpu->evtchn_upcall_mask = 0; \
42354+ barrier(); /* unmask then check (avoid races) */ \
42355+ if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
42356+ force_evtchn_callback(); \
42357+} while (0)
42358+
42359+static inline int raw_irqs_disabled_flags(unsigned long flags)
42360+{
42361+ return (flags != 0);
42362+}
42363+
42364+#endif
42365+
42366+/*
42367+ * For spinlocks, etc.:
42368+ */
42369+
42370+#define __raw_local_irq_save() \
42371+({ \
42372+ unsigned long flags = __raw_local_save_flags(); \
42373+ \
42374+ raw_local_irq_disable(); \
42375+ \
42376+ flags; \
42377+})
42378+
42379+#define raw_local_irq_save(flags) \
42380+ do { (flags) = __raw_local_irq_save(); } while (0)
42381+
42382+#define raw_irqs_disabled() \
42383+({ \
42384+ unsigned long flags = __raw_local_save_flags(); \
42385+ \
42386+ raw_irqs_disabled_flags(flags); \
42387+})
42388+
42389+/*
42390+ * Used in the idle loop; sti takes one instruction cycle
42391+ * to complete:
42392+ */
42393+void raw_safe_halt(void);
42394+
42395+/*
42396+ * Used when interrupts are already enabled or to
42397+ * shutdown the processor:
42398+ */
42399+void halt(void);
42400+
42401+#else /* __ASSEMBLY__: */
42402+# ifdef CONFIG_TRACE_IRQFLAGS
42403+# define TRACE_IRQS_ON call trace_hardirqs_on_thunk
42404+# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk
42405+# else
42406+# define TRACE_IRQS_ON
42407+# define TRACE_IRQS_OFF
42408+# endif
42409+#endif
42410+
42411+#endif
42412Index: head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_64.h
42413===================================================================
42414--- /dev/null 1970-01-01 00:00:00.000000000 +0000
42415+++ head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_64.h 2007-06-12 13:14:13.000000000 +0200
42416@@ -0,0 +1,161 @@
42417+#ifndef _X86_64_MADDR_H
42418+#define _X86_64_MADDR_H
42419+
42420+#include <xen/features.h>
42421+#include <xen/interface/xen.h>
42422+
42423+/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
42424+#define INVALID_P2M_ENTRY (~0UL)
42425+#define FOREIGN_FRAME_BIT (1UL<<63)
42426+#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
42427+
42428+/* Definitions for machine and pseudophysical addresses. */
42429+typedef unsigned long paddr_t;
42430+typedef unsigned long maddr_t;
42431+
42432+#ifdef CONFIG_XEN
42433+
42434+extern unsigned long *phys_to_machine_mapping;
42435+
42436+#undef machine_to_phys_mapping
42437+extern unsigned long *machine_to_phys_mapping;
42438+extern unsigned int machine_to_phys_order;
42439+
42440+static inline unsigned long pfn_to_mfn(unsigned long pfn)
42441+{
42442+ if (xen_feature(XENFEAT_auto_translated_physmap))
42443+ return pfn;
42444+ BUG_ON(end_pfn && pfn >= end_pfn);
42445+ return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
42446+}
42447+
42448+static inline int phys_to_machine_mapping_valid(unsigned long pfn)
42449+{
42450+ if (xen_feature(XENFEAT_auto_translated_physmap))
42451+ return 1;
42452+ BUG_ON(end_pfn && pfn >= end_pfn);
42453+ return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
42454+}
42455+
42456+static inline unsigned long mfn_to_pfn(unsigned long mfn)
42457+{
42458+ unsigned long pfn;
42459+
42460+ if (xen_feature(XENFEAT_auto_translated_physmap))
42461+ return mfn;
42462+
42463+ if (unlikely((mfn >> machine_to_phys_order) != 0))
42464+ return end_pfn;
42465+
42466+ /* The array access can fail (e.g., device space beyond end of RAM). */
42467+ asm (
42468+ "1: movq %1,%0\n"
42469+ "2:\n"
42470+ ".section .fixup,\"ax\"\n"
42471+ "3: movq %2,%0\n"
42472+ " jmp 2b\n"
42473+ ".previous\n"
42474+ ".section __ex_table,\"a\"\n"
42475+ " .align 8\n"
42476+ " .quad 1b,3b\n"
42477+ ".previous"
42478+ : "=r" (pfn)
42479+ : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) );
42480+
42481+ return pfn;
42482+}
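+
+/*
+ * Note on the fixup above: if the machine_to_phys_mapping[] access at
+ * label 1 faults (e.g. for device space beyond the end of RAM), the
+ * __ex_table entry sends execution to label 3, which loads end_pfn and
+ * jumps back, so callers simply see an out-of-range pfn for which
+ * pfn_valid() fails instead of taking an oops.
+ */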
42483+
42484+/*
42485+ * We detect special mappings in one of two ways:
42486+ * 1. If the MFN is an I/O page then Xen will set the m2p entry
42487+ * to be outside our maximum possible pseudophys range.
42488+ * 2. If the MFN belongs to a different domain then we will certainly
42489+ * not have the MFN in our p2m table. Conversely, if the page is ours,
42490+ * then we'll have p2m(m2p(MFN))==MFN.
42491+ * If we detect a special mapping then it doesn't have a 'struct page'.
42492+ * We force !pfn_valid() by returning an out-of-range pfn.
42493+ *
42494+ * NB. These checks require that, for any MFN that is not in our reservation,
42495+ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
42496+ * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
42497+ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
42498+ *
42499+ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
42500+ * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
42501+ * require. In all the cases we care about, the FOREIGN_FRAME bit is
42502+ * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
42503+ */
42504+static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
42505+{
42506+ unsigned long pfn = mfn_to_pfn(mfn);
42507+ if ((pfn < end_pfn)
42508+ && !xen_feature(XENFEAT_auto_translated_physmap)
42509+ && (phys_to_machine_mapping[pfn] != mfn))
42510+ return end_pfn; /* force !pfn_valid() */
42511+ return pfn;
42512+}
42513+
42514+static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
42515+{
42516+ BUG_ON(end_pfn && pfn >= end_pfn);
42517+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
42518+ BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
42519+ return;
42520+ }
42521+ phys_to_machine_mapping[pfn] = mfn;
42522+}
42523+
42524+static inline maddr_t phys_to_machine(paddr_t phys)
42525+{
42526+ maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
42527+ machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
42528+ return machine;
42529+}
42530+
42531+static inline paddr_t machine_to_phys(maddr_t machine)
42532+{
42533+ paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
42534+ phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
42535+ return phys;
42536+}
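+
+/*
+ * Worked example (4 KiB pages): for pseudo-physical address 0x12345678,
+ * the pfn is 0x12345 and the in-page offset is 0x678, so phys_to_machine()
+ * returns (pfn_to_mfn(0x12345) << PAGE_SHIFT) | 0x678; machine_to_phys()
+ * performs the inverse split using mfn_to_pfn().
+ */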
42537+
42538+static inline paddr_t pte_phys_to_machine(paddr_t phys)
42539+{
42540+ maddr_t machine;
42541+ machine = pfn_to_mfn((phys & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
42542+ machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK);
42543+ return machine;
42544+}
42545+
42546+static inline paddr_t pte_machine_to_phys(maddr_t machine)
42547+{
42548+ paddr_t phys;
42549+ phys = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
42550+ phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
42551+ return phys;
42552+}
42553+
42554+#define __pte_ma(x) ((pte_t) { (x) } )
42555+#define pfn_pte_ma(pfn, prot) __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
42556+
42557+#else /* !CONFIG_XEN */
42558+
42559+#define pfn_to_mfn(pfn) (pfn)
42560+#define mfn_to_pfn(mfn) (mfn)
42561+#define mfn_to_local_pfn(mfn) (mfn)
42562+#define set_phys_to_machine(pfn, mfn) ((void)0)
42563+#define phys_to_machine_mapping_valid(pfn) (1)
42564+#define phys_to_machine(phys) ((maddr_t)(phys))
42565+#define machine_to_phys(mach) ((paddr_t)(mach))
42566+#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot)
42567+#define __pte_ma(x) __pte(x)
42568+
42569+#endif /* !CONFIG_XEN */
42570+
42571+/* VIRT <-> MACHINE conversion */
42572+#define virt_to_machine(v) (phys_to_machine(__pa(v)))
42573+#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
42574+#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
42575+
42576+#endif /* _X86_64_MADDR_H */
42577+
42578Index: head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_64.h
42579===================================================================
42580--- /dev/null 1970-01-01 00:00:00.000000000 +0000
42581+++ head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_64.h 2007-06-12 13:14:13.000000000 +0200
42582@@ -0,0 +1,136 @@
42583+#ifndef __X86_64_MMU_CONTEXT_H
42584+#define __X86_64_MMU_CONTEXT_H
42585+
42586+#include <asm/desc.h>
42587+#include <asm/atomic.h>
42588+#include <asm/pgalloc.h>
42589+#include <asm/page.h>
42590+#include <asm/pda.h>
42591+#include <asm/pgtable.h>
42592+#include <asm/tlbflush.h>
42593+
42594+/*
42595+ * possibly do the LDT unload here?
42596+ */
42597+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
42598+void destroy_context(struct mm_struct *mm);
42599+
42600+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
42601+{
42602+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
42603+ if (read_pda(mmu_state) == TLBSTATE_OK)
42604+ write_pda(mmu_state, TLBSTATE_LAZY);
42605+#endif
42606+}
42607+
42608+#define prepare_arch_switch(next) __prepare_arch_switch()
42609+
42610+static inline void __prepare_arch_switch(void)
42611+{
42612+ /*
42613+ * Save away %es, %ds, %fs and %gs. Must happen before reload
42614+ * of cr3/ldt (i.e., not in __switch_to).
42615+ */
42616+ __asm__ __volatile__ (
42617+ "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
42618+ : "=m" (current->thread.es),
42619+ "=m" (current->thread.ds),
42620+ "=m" (current->thread.fsindex),
42621+ "=m" (current->thread.gsindex) );
42622+
42623+ if (current->thread.ds)
42624+ __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
42625+
42626+ if (current->thread.es)
42627+ __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
42628+
42629+ if (current->thread.fsindex) {
42630+ __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
42631+ current->thread.fs = 0;
42632+ }
42633+
42634+ if (current->thread.gsindex) {
42635+ load_gs_index(0);
42636+ current->thread.gs = 0;
42637+ }
42638+}
42639+
42640+extern void mm_pin(struct mm_struct *mm);
42641+extern void mm_unpin(struct mm_struct *mm);
42642+void mm_pin_all(void);
42643+
42644+static inline void load_cr3(pgd_t *pgd)
42645+{
42646+ asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
42647+ "memory");
42648+}
42649+
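+/*
+ * switch_mm() below batches the MMU updates: it queues a new kernel base
+ * pointer, a new user base pointer and, when the LDT differs, an LDT
+ * switch as mmuext ops, then issues them with one HYPERVISOR_mmuext_op()
+ * call instead of up to three separate hypercalls.
+ */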
42650+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
42651+ struct task_struct *tsk)
42652+{
42653+ unsigned cpu = smp_processor_id();
42654+ struct mmuext_op _op[3], *op = _op;
42655+
42656+ if (likely(prev != next)) {
42657+ BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
42658+ !next->context.pinned);
42659+
42660+ /* stop flush ipis for the previous mm */
42661+ cpu_clear(cpu, prev->cpu_vm_mask);
42662+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
42663+ write_pda(mmu_state, TLBSTATE_OK);
42664+ write_pda(active_mm, next);
42665+#endif
42666+ cpu_set(cpu, next->cpu_vm_mask);
42667+
42668+ /* load_cr3(next->pgd) */
42669+ op->cmd = MMUEXT_NEW_BASEPTR;
42670+ op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
42671+ op++;
42672+
42673+ /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
42674+ op->cmd = MMUEXT_NEW_USER_BASEPTR;
42675+ op->arg1.mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT);
42676+ op++;
42677+
42678+ if (unlikely(next->context.ldt != prev->context.ldt)) {
42679+ /* load_LDT_nolock(&next->context, cpu) */
42680+ op->cmd = MMUEXT_SET_LDT;
42681+ op->arg1.linear_addr = (unsigned long)next->context.ldt;
42682+ op->arg2.nr_ents = next->context.size;
42683+ op++;
42684+ }
42685+
42686+ BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
42687+ }
42688+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
42689+ else {
42690+ write_pda(mmu_state, TLBSTATE_OK);
42691+ if (read_pda(active_mm) != next)
42692+ out_of_line_bug();
42693+ if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
42694+ /* We were in lazy tlb mode and leave_mm disabled
42695+ * tlb flush IPI delivery. We must reload CR3
42696+ * to make sure to use no freed page tables.
42697+ */
42698+ load_cr3(next->pgd);
42699+ xen_new_user_pt(__pa(__user_pgd(next->pgd)));
42700+ load_LDT_nolock(&next->context, cpu);
42701+ }
42702+ }
42703+#endif
42704+}
42705+
42706+#define deactivate_mm(tsk,mm) do { \
42707+ load_gs_index(0); \
42708+ asm volatile("movl %0,%%fs"::"r"(0)); \
42709+} while(0)
42710+
42711+static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
42712+{
42713+ if (!next->context.pinned)
42714+ mm_pin(next);
42715+ switch_mm(prev, next, NULL);
42716+}
42717+
42718+#endif
42719Index: head-2008-11-25/include/asm-x86/mach-xen/asm/page_64.h
42720===================================================================
42721--- /dev/null 1970-01-01 00:00:00.000000000 +0000
42722+++ head-2008-11-25/include/asm-x86/mach-xen/asm/page_64.h 2008-04-02 12:34:02.000000000 +0200
42723@@ -0,0 +1,212 @@
42724+#ifndef _X86_64_PAGE_H
42725+#define _X86_64_PAGE_H
42726+
42727+/* #include <linux/string.h> */
42728+#ifndef __ASSEMBLY__
42729+#include <linux/kernel.h>
42730+#include <linux/types.h>
42731+#include <asm/bug.h>
42732+#endif
42733+#include <xen/interface/xen.h>
42734+
42735+/*
42736+ * Need to repeat this here in order to not include pgtable.h (which in turn
42737+ * depends on definitions made here), but to be able to use the symbolic
42738+ * names below. The preprocessor will warn if the two definitions aren't identical.
42739+ */
42740+#define _PAGE_PRESENT 0x001
42741+#define _PAGE_IO 0x200
42742+
42743+/* PAGE_SHIFT determines the page size */
42744+#define PAGE_SHIFT 12
42745+#ifdef __ASSEMBLY__
42746+#define PAGE_SIZE (0x1 << PAGE_SHIFT)
42747+#else
42748+#define PAGE_SIZE (1UL << PAGE_SHIFT)
42749+#endif
42750+#define PAGE_MASK (~(PAGE_SIZE-1))
42751+
42752+/* See Documentation/x86_64/mm.txt for a description of the memory map. */
42753+#define __PHYSICAL_MASK_SHIFT 46
42754+#define __PHYSICAL_MASK ((1UL << __PHYSICAL_MASK_SHIFT) - 1)
42755+#define __VIRTUAL_MASK_SHIFT 48
42756+#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
42757+
42758+#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
42759+
42760+#define THREAD_ORDER 1
42761+#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
42762+#define CURRENT_MASK (~(THREAD_SIZE-1))
42763+
42764+#define EXCEPTION_STACK_ORDER 0
42765+#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
42766+
42767+#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
42768+#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
42769+
42770+#define IRQSTACK_ORDER 2
42771+#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
42772+
42773+#define STACKFAULT_STACK 1
42774+#define DOUBLEFAULT_STACK 2
42775+#define NMI_STACK 3
42776+#define DEBUG_STACK 4
42777+#define MCE_STACK 5
42778+#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
42779+
42780+#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
42781+#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
42782+
42783+#define HPAGE_SHIFT PMD_SHIFT
42784+#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
42785+#define HPAGE_MASK (~(HPAGE_SIZE - 1))
42786+#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
42787+
42788+#ifdef __KERNEL__
42789+#ifndef __ASSEMBLY__
42790+
42791+extern unsigned long end_pfn;
42792+
42793+#include <asm/maddr.h>
42794+
42795+void clear_page(void *);
42796+void copy_page(void *, void *);
42797+
42798+#define clear_user_page(page, vaddr, pg) clear_page(page)
42799+#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
42800+
42801+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
42802+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
42803+
42804+/*
42805+ * These are used to make use of C type-checking..
42806+ */
42807+typedef struct { unsigned long pte; } pte_t;
42808+typedef struct { unsigned long pmd; } pmd_t;
42809+typedef struct { unsigned long pud; } pud_t;
42810+typedef struct { unsigned long pgd; } pgd_t;
42811+#define PTE_MASK PHYSICAL_PAGE_MASK
42812+
42813+typedef struct { unsigned long pgprot; } pgprot_t;
42814+
42815+#define __pte_val(x) ((x).pte)
42816+#define pte_val(x) ((__pte_val(x) & (_PAGE_PRESENT|_PAGE_IO)) \
42817+ == _PAGE_PRESENT ? \
42818+ pte_machine_to_phys(__pte_val(x)) : \
42819+ __pte_val(x))
42820+
42821+#define __pmd_val(x) ((x).pmd)
42822+static inline unsigned long pmd_val(pmd_t x)
42823+{
42824+ unsigned long ret = __pmd_val(x);
42825+#if CONFIG_XEN_COMPAT <= 0x030002
42826+ if (ret) ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
42827+#else
42828+ if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
42829+#endif
42830+ return ret;
42831+}
42832+
42833+#define __pud_val(x) ((x).pud)
42834+static inline unsigned long pud_val(pud_t x)
42835+{
42836+ unsigned long ret = __pud_val(x);
42837+ if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
42838+ return ret;
42839+}
42840+
42841+#define __pgd_val(x) ((x).pgd)
42842+static inline unsigned long pgd_val(pgd_t x)
42843+{
42844+ unsigned long ret = __pgd_val(x);
42845+ if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
42846+ return ret;
42847+}
42848+
42849+#define pgprot_val(x) ((x).pgprot)
42850+
42851+static inline pte_t __pte(unsigned long x)
42852+{
42853+ if ((x & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
42854+ x = pte_phys_to_machine(x);
42855+ return ((pte_t) { (x) });
42856+}
42857+
42858+static inline pmd_t __pmd(unsigned long x)
42859+{
42860+ if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
42861+ return ((pmd_t) { (x) });
42862+}
42863+
42864+static inline pud_t __pud(unsigned long x)
42865+{
42866+ if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
42867+ return ((pud_t) { (x) });
42868+}
42869+
42870+static inline pgd_t __pgd(unsigned long x)
42871+{
42872+ if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
42873+ return ((pgd_t) { (x) });
42874+}
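+
+/*
+ * In other words, for an ordinary present mapping the stored machine value
+ * and the pseudo-physical value are converted back and forth transparently,
+ * so pte_val(__pte(x)) == x is expected to hold; _PAGE_IO and non-present
+ * entries are passed through unchanged since they have no pseudo-physical
+ * counterpart.
+ */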
42875+
42876+#define __pgprot(x) ((pgprot_t) { (x) } )
42877+
42878+#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START)
42879+#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
42880+#define __START_KERNEL_map 0xffffffff80000000UL
42881+#define __PAGE_OFFSET 0xffff880000000000UL
42882+
42883+#else
42884+#define __PHYSICAL_START CONFIG_PHYSICAL_START
42885+#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
42886+#define __START_KERNEL_map 0xffffffff80000000
42887+#define __PAGE_OFFSET 0xffff880000000000
42888+#endif /* !__ASSEMBLY__ */
42889+
42890+#if CONFIG_XEN_COMPAT <= 0x030002
42891+#undef LOAD_OFFSET
42892+#define LOAD_OFFSET 0
42893+#endif
42894+
42895+/* to align the pointer to the (next) page boundary */
42896+#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
42897+
42898+#define KERNEL_TEXT_SIZE (40UL*1024*1024)
42899+#define KERNEL_TEXT_START 0xffffffff80000000UL
42900+
42901+#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
42902+
42903+/* Note: __pa(&symbol_visible_to_c) should always be replaced with __pa_symbol.
42904+ Otherwise you risk miscompilation. */
42905+#define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET)
42906+/* __pa_symbol should be used for C visible symbols.
42907+ This seems to be the official gcc blessed way to do such arithmetic. */
42908+#define __pa_symbol(x) \
42909+ ({unsigned long v; \
42910+ asm("" : "=r" (v) : "0" (x)); \
42911+ __pa(v); })
42912+
42913+#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
42914+#define __boot_va(x) __va(x)
42915+#define __boot_pa(x) __pa(x)
42916+#ifdef CONFIG_FLATMEM
42917+#define pfn_valid(pfn) ((pfn) < end_pfn)
42918+#endif
42919+
42920+#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
42921+#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
42922+#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
42923+
42924+#define VM_DATA_DEFAULT_FLAGS \
42925+ (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
42926+ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
42927+
42928+#define __HAVE_ARCH_GATE_AREA 1
42929+
42930+#include <asm-generic/memory_model.h>
42931+#include <asm-generic/page.h>
42932+
42933+#endif /* __KERNEL__ */
42934+
42935+#endif /* _X86_64_PAGE_H */
42936Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pci_64.h
42937===================================================================
42938--- /dev/null 1970-01-01 00:00:00.000000000 +0000
42939+++ head-2008-11-25/include/asm-x86/mach-xen/asm/pci_64.h 2007-09-14 11:14:51.000000000 +0200
42940@@ -0,0 +1,168 @@
42941+#ifndef __x8664_PCI_H
42942+#define __x8664_PCI_H
42943+
42944+#include <asm/io.h>
42945+
42946+#ifdef __KERNEL__
42947+
42948+#include <linux/mm.h> /* for struct page */
42949+
42950+/* Can be used to override the logic in pci_scan_bus for skipping
42951+ already-configured bus numbers - to be used for buggy BIOSes
42952+ or architectures with incomplete PCI setup by the loader */
42953+
42954+#ifdef CONFIG_PCI
42955+extern unsigned int pcibios_assign_all_busses(void);
42956+#else
42957+#define pcibios_assign_all_busses() 0
42958+#endif
42959+
42960+#include <asm/hypervisor.h>
42961+#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain())
42962+
42963+extern unsigned long pci_mem_start;
42964+#define PCIBIOS_MIN_IO 0x1000
42965+#define PCIBIOS_MIN_MEM (pci_mem_start)
42966+
42967+#define PCIBIOS_MIN_CARDBUS_IO 0x4000
42968+
42969+void pcibios_config_init(void);
42970+struct pci_bus * pcibios_scan_root(int bus);
42971+extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value);
42972+extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value);
42973+
42974+void pcibios_set_master(struct pci_dev *dev);
42975+void pcibios_penalize_isa_irq(int irq, int active);
42976+struct irq_routing_table *pcibios_get_irq_routing_table(void);
42977+int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
42978+
42979+#include <linux/types.h>
42980+#include <linux/slab.h>
42981+#include <asm/scatterlist.h>
42982+#include <linux/string.h>
42983+#include <asm/page.h>
42984+
42985+extern void pci_iommu_alloc(void);
42986+extern int iommu_setup(char *opt);
42987+
42988+/* The PCI address space does equal the physical memory
42989+ * address space. The networking and block device layers use
42990+ * this boolean for bounce buffer decisions.
42991+ *
42992+ * On AMD64 it mostly equals, but we set it to zero if a hardware
42993+ * IOMMU (gart) or software IOMMU (swiotlb) is available.
42994+ */
42995+#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
42996+
42997+#if defined(CONFIG_IOMMU) || defined(CONFIG_CALGARY_IOMMU)
42998+
42999+/*
43000+ * x86-64 always supports DAC, but sometimes it is useful to force
43001+ * devices through the IOMMU to get automatic sg list merging.
43002+ * Optional right now.
43003+ */
43004+extern int iommu_sac_force;
43005+#define pci_dac_dma_supported(pci_dev, mask) (!iommu_sac_force)
43006+
43007+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
43008+ dma_addr_t ADDR_NAME;
43009+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
43010+ __u32 LEN_NAME;
43011+#define pci_unmap_addr(PTR, ADDR_NAME) \
43012+ ((PTR)->ADDR_NAME)
43013+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
43014+ (((PTR)->ADDR_NAME) = (VAL))
43015+#define pci_unmap_len(PTR, LEN_NAME) \
43016+ ((PTR)->LEN_NAME)
43017+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
43018+ (((PTR)->LEN_NAME) = (VAL))
43019+
43020+#elif defined(CONFIG_SWIOTLB)
43021+
43022+#define pci_dac_dma_supported(pci_dev, mask) 1
43023+
43024+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
43025+ dma_addr_t ADDR_NAME;
43026+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
43027+ __u32 LEN_NAME;
43028+#define pci_unmap_addr(PTR, ADDR_NAME) \
43029+ ((PTR)->ADDR_NAME)
43030+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
43031+ (((PTR)->ADDR_NAME) = (VAL))
43032+#define pci_unmap_len(PTR, LEN_NAME) \
43033+ ((PTR)->LEN_NAME)
43034+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
43035+ (((PTR)->LEN_NAME) = (VAL))
43036+
43037+#else
43038+/* No IOMMU */
43039+
43040+#define pci_dac_dma_supported(pci_dev, mask) 1
43041+
43042+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
43043+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
43044+#define pci_unmap_addr(PTR, ADDR_NAME) (0)
43045+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
43046+#define pci_unmap_len(PTR, LEN_NAME) (0)
43047+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
43048+
43049+#endif
43050+
43051+#include <asm-generic/pci-dma-compat.h>
43052+
43053+static inline dma64_addr_t
43054+pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
43055+{
43056+ return ((dma64_addr_t) page_to_phys(page) +
43057+ (dma64_addr_t) offset);
43058+}
43059+
43060+static inline struct page *
43061+pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
43062+{
43063+ return virt_to_page(__va(dma_addr));
43064+}
43065+
43066+static inline unsigned long
43067+pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
43068+{
43069+ return (dma_addr & ~PAGE_MASK);
43070+}
43071+
43072+static inline void
43073+pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
43074+{
43075+}
43076+
43077+static inline void
43078+pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
43079+{
43080+ flush_write_buffers();
43081+}
43082+
43083+#ifdef CONFIG_PCI
43084+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
43085+ enum pci_dma_burst_strategy *strat,
43086+ unsigned long *strategy_parameter)
43087+{
43088+ *strat = PCI_DMA_BURST_INFINITY;
43089+ *strategy_parameter = ~0UL;
43090+}
43091+#endif
43092+
43093+#define HAVE_PCI_MMAP
43094+extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
43095+ enum pci_mmap_state mmap_state, int write_combine);
43096+
43097+static inline void pcibios_add_platform_entries(struct pci_dev *dev)
43098+{
43099+}
43100+
43101+#endif /* __KERNEL__ */
43102+
43103+/* generic pci stuff */
43104+#ifdef CONFIG_PCI
43105+#include <asm-generic/pci.h>
43106+#endif
43107+
43108+#endif /* __x8664_PCI_H */
43109Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_64.h
43110===================================================================
43111--- /dev/null 1970-01-01 00:00:00.000000000 +0000
43112+++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_64.h 2007-06-18 08:38:13.000000000 +0200
43113@@ -0,0 +1,204 @@
43114+#ifndef _X86_64_PGALLOC_H
43115+#define _X86_64_PGALLOC_H
43116+
43117+#include <asm/fixmap.h>
43118+#include <asm/pda.h>
43119+#include <linux/threads.h>
43120+#include <linux/mm.h>
43121+#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
43122+
43123+#include <xen/features.h>
43124+void make_page_readonly(void *va, unsigned int feature);
43125+void make_page_writable(void *va, unsigned int feature);
43126+void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
43127+void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
43128+
43129+#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
43130+
43131+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
43132+{
43133+ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
43134+}
43135+
43136+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
43137+{
43138+ if (unlikely((mm)->context.pinned)) {
43139+ BUG_ON(HYPERVISOR_update_va_mapping(
43140+ (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
43141+ pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
43142+ set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
43143+ } else {
43144+ *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
43145+ }
43146+}
43147+
43148+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
43149+{
43150+ if (unlikely((mm)->context.pinned)) {
43151+ BUG_ON(HYPERVISOR_update_va_mapping(
43152+ (unsigned long)pmd,
43153+ pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
43154+ PAGE_KERNEL_RO), 0));
43155+ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
43156+ } else {
43157+ *(pud) = __pud(_PAGE_TABLE | __pa(pmd));
43158+ }
43159+}
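+
+/*
+ * In the populate functions above (and in pgd_populate() below), a pinned
+ * mm's new table page is first mapped read-only via
+ * HYPERVISOR_update_va_mapping() before it is hooked in, because Xen
+ * refuses to install a page-table page that is still mapped writable
+ * anywhere in the guest.
+ */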
43160+
43161+/*
43162+ * We need to use the batch mode here, but pgd_populate() won't
43163+ * be called frequently.
43164+ */
43165+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
43166+{
43167+ if (unlikely((mm)->context.pinned)) {
43168+ BUG_ON(HYPERVISOR_update_va_mapping(
43169+ (unsigned long)pud,
43170+ pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
43171+ PAGE_KERNEL_RO), 0));
43172+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
43173+ set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
43174+ } else {
43175+ *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
43176+ *(__user_pgd(pgd)) = *(pgd);
43177+ }
43178+}
43179+
43180+extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr);
43181+extern void pte_free(struct page *pte);
43182+
43183+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
43184+{
43185+ struct page *pg;
43186+
43187+ pg = pte_alloc_one(mm, addr);
43188+ return pg ? page_address(pg) : NULL;
43189+}
43190+
43191+static inline void pmd_free(pmd_t *pmd)
43192+{
43193+ BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
43194+ pte_free(virt_to_page(pmd));
43195+}
43196+
43197+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
43198+{
43199+ struct page *pg;
43200+
43201+ pg = pte_alloc_one(mm, addr);
43202+ return pg ? page_address(pg) : NULL;
43203+}
43204+
43205+static inline void pud_free(pud_t *pud)
43206+{
43207+ BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
43208+ pte_free(virt_to_page(pud));
43209+}
43210+
43211+static inline void pgd_list_add(pgd_t *pgd)
43212+{
43213+ struct page *page = virt_to_page(pgd);
43214+
43215+ spin_lock(&pgd_lock);
43216+ page->index = (pgoff_t)pgd_list;
43217+ if (pgd_list)
43218+ pgd_list->private = (unsigned long)&page->index;
43219+ pgd_list = page;
43220+ page->private = (unsigned long)&pgd_list;
43221+ spin_unlock(&pgd_lock);
43222+}
43223+
43224+static inline void pgd_list_del(pgd_t *pgd)
43225+{
43226+ struct page *next, **pprev, *page = virt_to_page(pgd);
43227+
43228+ spin_lock(&pgd_lock);
43229+ next = (struct page *)page->index;
43230+ pprev = (struct page **)page->private;
43231+ *pprev = next;
43232+ if (next)
43233+ next->private = (unsigned long)pprev;
43234+ spin_unlock(&pgd_lock);
43235+}
43236+
43237+static inline pgd_t *pgd_alloc(struct mm_struct *mm)
43238+{
43239+ /*
43240+ * We allocate two contiguous pages for kernel and user.
43241+ */
43242+ unsigned boundary;
43243+ pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1);
43244+ if (!pgd)
43245+ return NULL;
43246+ pgd_list_add(pgd);
43247+ /*
43248+ * Copy kernel pointers in from init.
43249+ * Could keep a freelist or slab cache of those because the kernel
43250+ * part never changes.
43251+ */
43252+ boundary = pgd_index(__PAGE_OFFSET);
43253+ memset(pgd, 0, boundary * sizeof(pgd_t));
43254+ memcpy(pgd + boundary,
43255+ init_level4_pgt + boundary,
43256+ (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
43257+
43258+ memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
43259+ /*
43260+ * Set level3_user_pgt for vsyscall area
43261+ */
43262+ __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
43263+ __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
43264+ return pgd;
43265+}
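+
+/*
+ * Rough layout after pgd_alloc() (two contiguous pages):
+ *
+ *	pgd                          __user_pgd(pgd) == pgd + PTRS_PER_PGD
+ *	+--------------------------+ +--------------------------+
+ *	| kernel PGD page          | | user PGD page            |
+ *	+--------------------------+ +--------------------------+
+ *
+ * The user page is cleared apart from the vsyscall slot; both pages may
+ * later be mapped read-only when the mm gets pinned (cf. the pte_write()
+ * checks in pgd_free() below).
+ */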
43266+
43267+static inline void pgd_free(pgd_t *pgd)
43268+{
43269+ pte_t *ptep = virt_to_ptep(pgd);
43270+
43271+ if (!pte_write(*ptep)) {
43272+ xen_pgd_unpin(__pa(pgd));
43273+ BUG_ON(HYPERVISOR_update_va_mapping(
43274+ (unsigned long)pgd,
43275+ pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
43276+ 0));
43277+ }
43278+
43279+ ptep = virt_to_ptep(__user_pgd(pgd));
43280+
43281+ if (!pte_write(*ptep)) {
43282+ xen_pgd_unpin(__pa(__user_pgd(pgd)));
43283+ BUG_ON(HYPERVISOR_update_va_mapping(
43284+ (unsigned long)__user_pgd(pgd),
43285+ pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT,
43286+ PAGE_KERNEL),
43287+ 0));
43288+ }
43289+
43290+ pgd_list_del(pgd);
43291+ free_pages((unsigned long)pgd, 1);
43292+}
43293+
43294+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
43295+{
43296+ pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
43297+ if (pte)
43298+ make_page_readonly(pte, XENFEAT_writable_page_tables);
43299+
43300+ return pte;
43301+}
43302+
43303+/* Should really implement gc for free page table pages. This could be
43304+ done with a reference count in struct page. */
43305+
43306+static inline void pte_free_kernel(pte_t *pte)
43307+{
43308+ BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
43309+ make_page_writable(pte, XENFEAT_writable_page_tables);
43310+ free_page((unsigned long)pte);
43311+}
43312+
43313+#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
43314+#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
43315+#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
43316+
43317+#endif /* _X86_64_PGALLOC_H */
43318Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_64.h
43319===================================================================
43320--- /dev/null 1970-01-01 00:00:00.000000000 +0000
43321+++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_64.h 2008-07-21 11:00:33.000000000 +0200
43322@@ -0,0 +1,583 @@
43323+#ifndef _X86_64_PGTABLE_H
43324+#define _X86_64_PGTABLE_H
43325+
43326+/*
43327+ * This file contains the functions and defines necessary to modify and use
43328+ * the x86-64 page table tree.
43329+ */
43330+#include <asm/processor.h>
43331+#include <asm/fixmap.h>
43332+#include <asm/bitops.h>
43333+#include <linux/threads.h>
43334+#include <linux/sched.h>
43335+#include <asm/pda.h>
43336+#ifdef CONFIG_XEN
43337+#include <asm/hypervisor.h>
43338+
43339+extern pud_t level3_user_pgt[512];
43340+
43341+extern void xen_init_pt(void);
43342+
43343+extern pte_t *lookup_address(unsigned long address);
43344+
43345+#define virt_to_ptep(va) \
43346+({ \
43347+ pte_t *__ptep = lookup_address((unsigned long)(va)); \
43348+ BUG_ON(!__ptep || !pte_present(*__ptep)); \
43349+ __ptep; \
43350+})
43351+
43352+#define arbitrary_virt_to_machine(va) \
43353+ (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
43354+ | ((unsigned long)(va) & (PAGE_SIZE - 1)))
43355+#endif
43356+
43357+extern pud_t level3_kernel_pgt[512];
43358+extern pud_t level3_physmem_pgt[512];
43359+extern pud_t level3_ident_pgt[512];
43360+extern pmd_t level2_kernel_pgt[512];
43361+extern pgd_t init_level4_pgt[];
43362+extern pgd_t boot_level4_pgt[];
43363+extern unsigned long __supported_pte_mask;
43364+
43365+#define swapper_pg_dir init_level4_pgt
43366+
43367+extern int nonx_setup(char *str);
43368+extern void paging_init(void);
43369+extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
43370+
43371+extern unsigned long pgkern_mask;
43372+
43373+/*
43374+ * ZERO_PAGE is a global shared page that is always zero: used
43375+ * for zero-mapped memory areas etc..
43376+ */
43377+extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
43378+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
43379+
43380+/*
43381+ * PGDIR_SHIFT determines what a top-level page table entry can map
43382+ */
43383+#define PGDIR_SHIFT 39
43384+#define PTRS_PER_PGD 512
43385+
43386+/*
43387+ * 3rd level page
43388+ */
43389+#define PUD_SHIFT 30
43390+#define PTRS_PER_PUD 512
43391+
43392+/*
43393+ * PMD_SHIFT determines the size of the area a middle-level
43394+ * page table can map
43395+ */
43396+#define PMD_SHIFT 21
43397+#define PTRS_PER_PMD 512
43398+
43399+/*
43400+ * entries per page directory level
43401+ */
43402+#define PTRS_PER_PTE 512
43403+
43404+#define pte_ERROR(e) \
43405+ printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
43406+ &(e), __pte_val(e), pte_pfn(e))
43407+#define pmd_ERROR(e) \
43408+ printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
43409+ &(e), __pmd_val(e), pmd_pfn(e))
43410+#define pud_ERROR(e) \
43411+ printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
43412+ &(e), __pud_val(e), (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
43413+#define pgd_ERROR(e) \
43414+ printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
43415+ &(e), __pgd_val(e), (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
43416+
43417+#define pgd_none(x) (!__pgd_val(x))
43418+#define pud_none(x) (!__pud_val(x))
43419+
43420+static inline void set_pte(pte_t *dst, pte_t val)
43421+{
43422+ *dst = val;
43423+}
43424+
43425+#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
43426+#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
43427+#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
43428+
43429+static inline void pud_clear (pud_t * pud)
43430+{
43431+ set_pud(pud, __pud(0));
43432+}
43433+
43434+#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
43435+
43436+static inline void pgd_clear (pgd_t * pgd)
43437+{
43438+ set_pgd(pgd, __pgd(0));
43439+ set_pgd(__user_pgd(pgd), __pgd(0));
43440+}
43441+
43442+#define pud_page(pud) \
43443+ ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
43444+
43445+#define pte_same(a, b) ((a).pte == (b).pte)
43446+
43447+#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
43448+
43449+#define PMD_SIZE (1UL << PMD_SHIFT)
43450+#define PMD_MASK (~(PMD_SIZE-1))
43451+#define PUD_SIZE (1UL << PUD_SHIFT)
43452+#define PUD_MASK (~(PUD_SIZE-1))
43453+#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
43454+#define PGDIR_MASK (~(PGDIR_SIZE-1))
43455+
43456+#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
43457+#define FIRST_USER_ADDRESS 0
43458+
43459+#ifndef __ASSEMBLY__
43460+#define MAXMEM 0x3fffffffffffUL
43461+#define VMALLOC_START 0xffffc20000000000UL
43462+#define VMALLOC_END 0xffffe1ffffffffffUL
43463+#define MODULES_VADDR 0xffffffff88000000UL
43464+#define MODULES_END 0xfffffffffff00000UL
43465+#define MODULES_LEN (MODULES_END - MODULES_VADDR)
43466+
43467+#define _PAGE_BIT_PRESENT 0
43468+#define _PAGE_BIT_RW 1
43469+#define _PAGE_BIT_USER 2
43470+#define _PAGE_BIT_PWT 3
43471+#define _PAGE_BIT_PCD 4
43472+#define _PAGE_BIT_ACCESSED 5
43473+#define _PAGE_BIT_DIRTY 6
43474+#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
43475+#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
43476+#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
43477+
43478+#define _PAGE_PRESENT 0x001
43479+#define _PAGE_RW 0x002
43480+#define _PAGE_USER 0x004
43481+#define _PAGE_PWT 0x008
43482+#define _PAGE_PCD 0x010
43483+#define _PAGE_ACCESSED 0x020
43484+#define _PAGE_DIRTY 0x040
43485+#define _PAGE_PSE 0x080 /* 2MB page */
43486+#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
43487+#define _PAGE_GLOBAL 0x100 /* Global TLB entry */
43488+
43489+#define _PAGE_PROTNONE 0x080 /* If not present */
43490+#define _PAGE_NX (1UL<<_PAGE_BIT_NX)
43491+
43492+/* Mapped page is I/O or foreign and has no associated page struct. */
43493+#define _PAGE_IO 0x200
43494+
43495+#if CONFIG_XEN_COMPAT <= 0x030002
43496+extern unsigned int __kernel_page_user;
43497+#else
43498+#define __kernel_page_user 0
43499+#endif
43500+
43501+#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
43502+#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
43503+
43504+#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
43505+
43506+#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
43507+#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
43508+#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
43509+#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
43510+#define PAGE_COPY PAGE_COPY_NOEXEC
43511+#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
43512+#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
43513+#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
43514+#define __PAGE_KERNEL \
43515+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
43516+#define __PAGE_KERNEL_EXEC \
43517+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
43518+#define __PAGE_KERNEL_NOCACHE \
43519+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
43520+#define __PAGE_KERNEL_RO \
43521+ (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
43522+#define __PAGE_KERNEL_VSYSCALL \
43523+ (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
43524+#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
43525+ (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
43526+#define __PAGE_KERNEL_LARGE \
43527+ (__PAGE_KERNEL | _PAGE_PSE)
43528+#define __PAGE_KERNEL_LARGE_EXEC \
43529+ (__PAGE_KERNEL_EXEC | _PAGE_PSE)
43530+
43531+/*
43532+ * We don't support GLOBAL page in xenolinux64
43533+ */
43534+#define MAKE_GLOBAL(x) __pgprot((x))
43535+
43536+#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
43537+#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
43538+#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
43539+#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
43540+#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
43541+#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
43542+#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
43543+#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
43544+
43545+/* xwr */
43546+#define __P000 PAGE_NONE
43547+#define __P001 PAGE_READONLY
43548+#define __P010 PAGE_COPY
43549+#define __P011 PAGE_COPY
43550+#define __P100 PAGE_READONLY_EXEC
43551+#define __P101 PAGE_READONLY_EXEC
43552+#define __P110 PAGE_COPY_EXEC
43553+#define __P111 PAGE_COPY_EXEC
43554+
43555+#define __S000 PAGE_NONE
43556+#define __S001 PAGE_READONLY
43557+#define __S010 PAGE_SHARED
43558+#define __S011 PAGE_SHARED
43559+#define __S100 PAGE_READONLY_EXEC
43560+#define __S101 PAGE_READONLY_EXEC
43561+#define __S110 PAGE_SHARED_EXEC
43562+#define __S111 PAGE_SHARED_EXEC
43563+
43564+static inline unsigned long pgd_bad(pgd_t pgd)
43565+{
43566+ unsigned long val = __pgd_val(pgd);
43567+ val &= ~PTE_MASK;
43568+ val &= ~(_PAGE_USER | _PAGE_DIRTY);
43569+ return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
43570+}
43571+
43572+static inline unsigned long pud_bad(pud_t pud)
43573+{
43574+ unsigned long val = __pud_val(pud);
43575+ val &= ~PTE_MASK;
43576+ val &= ~(_PAGE_USER | _PAGE_DIRTY);
43577+ return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
43578+}
43579+
43580+#define set_pte_at(_mm,addr,ptep,pteval) do { \
43581+ if (((_mm) != current->mm && (_mm) != &init_mm) || \
43582+ HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
43583+ set_pte((ptep), (pteval)); \
43584+} while (0)
43585+
43586+#define pte_none(x) (!(x).pte)
43587+#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
43588+#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
43589+
43590+#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
43591+
43592+#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
43593+#define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
43594+ __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
43595+#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? end_pfn : \
43596+ (_pte).pte & _PAGE_PRESENT ? \
43597+ mfn_to_local_pfn(__pte_mfn(_pte)) : \
43598+ __pte_mfn(_pte))
43599+
43600+#define pte_page(x) pfn_to_page(pte_pfn(x))
43601+
43602+static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
43603+{
43604+ unsigned long pte = page_nr << PAGE_SHIFT;
43605+ pte |= pgprot_val(pgprot);
43606+ pte &= __supported_pte_mask;
43607+ return __pte(pte);
43608+}
43609+
43610+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
43611+{
43612+ pte_t pte = *ptep;
43613+ if (!pte_none(pte)) {
43614+ if ((mm != &init_mm) ||
43615+ HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
43616+ pte = __pte_ma(xchg(&ptep->pte, 0));
43617+ }
43618+ return pte;
43619+}
43620+
43621+static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
43622+{
43623+ if (full) {
43624+ pte_t pte = *ptep;
43625+ if (mm->context.pinned)
43626+ xen_l1_entry_update(ptep, __pte(0));
43627+ else
43628+ *ptep = __pte(0);
43629+ return pte;
43630+ }
43631+ return ptep_get_and_clear(mm, addr, ptep);
43632+}
43633+
43634+#define ptep_clear_flush(vma, addr, ptep) \
43635+({ \
43636+ pte_t *__ptep = (ptep); \
43637+ pte_t __res = *__ptep; \
43638+ if (!pte_none(__res) && \
43639+ ((vma)->vm_mm != current->mm || \
43640+ HYPERVISOR_update_va_mapping(addr, __pte(0), \
43641+ (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
43642+ UVMF_INVLPG|UVMF_MULTI))) { \
43643+ __ptep->pte = 0; \
43644+ flush_tlb_page(vma, addr); \
43645+ } \
43646+ __res; \
43647+})
43648+
43649+/*
43650+ * The following only work if pte_present() is true.
43651+ * Undefined behaviour if not..
43652+ */
43653+#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
43654+static inline int pte_user(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
43655+static inline int pte_read(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
43656+static inline int pte_exec(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
43657+static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
43658+static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
43659+static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
43660+static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
43661+static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
43662+
43663+static inline pte_t pte_rdprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; }
43664+static inline pte_t pte_exprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; }
43665+static inline pte_t pte_mkclean(pte_t pte) { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
43666+static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
43667+static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; }
43668+static inline pte_t pte_mkread(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
43669+static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
43670+static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
43671+static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
43672+static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; }
43673+static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; }
43674+
43675+#define ptep_test_and_clear_dirty(vma, addr, ptep) \
43676+({ \
43677+ pte_t __pte = *(ptep); \
43678+ int __ret = pte_dirty(__pte); \
43679+ if (__ret) \
43680+ set_pte_at((vma)->vm_mm, addr, ptep, pte_mkclean(__pte)); \
43681+ __ret; \
43682+})
43683+
43684+#define ptep_test_and_clear_young(vma, addr, ptep) \
43685+({ \
43686+ pte_t __pte = *(ptep); \
43687+ int __ret = pte_young(__pte); \
43688+ if (__ret) \
43689+ set_pte_at((vma)->vm_mm, addr, ptep, pte_mkold(__pte)); \
43690+ __ret; \
43691+})
43692+
43693+static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
43694+{
43695+ pte_t pte = *ptep;
43696+ if (pte_write(pte))
43697+ set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
43698+}
43699+
43700+/*
43701+ * Macro to mark a page protection value as "uncacheable".
43702+ */
43703+#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
43704+
43705+static inline int pmd_large(pmd_t pte) {
43706+ return (__pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE;
43707+}
43708+
43709+
43710+/*
43711+ * Conversion functions: convert a page and protection to a page entry,
43712+ * and a page entry and page directory to the page they refer to.
43713+ */
43714+
43715+/*
43716+ * Level 4 access.
43717+ * Never use these in the common code.
43718+ */
43719+#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
43720+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
43721+#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
43722+#define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
43723+#define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
43724+#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
43725+
43726+/* PUD - Level3 access */
43727+/* to find an entry in a page-table-directory. */
43728+#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
43729+#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
43730+#define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
43731+
43732+/* PMD - Level 2 access */
43733+#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
43734+#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
43735+
43736+#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
43737+#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
43738+ pmd_index(address))
43739+#define pmd_none(x) (!__pmd_val(x))
43740+#if CONFIG_XEN_COMPAT <= 0x030002
43741+/* pmd_present doesn't just test the _PAGE_PRESENT bit, since writable
43742+ page tables (wr.p.t.) can temporarily clear it. */
43743+#define pmd_present(x) (__pmd_val(x))
43744+#else
43745+#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
43746+#endif
43747+#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
43748+#define pmd_bad(x) ((__pmd_val(x) & ~(PTE_MASK | _PAGE_USER | _PAGE_PRESENT)) \
43749+ != (_KERNPG_TABLE & ~(_PAGE_USER | _PAGE_PRESENT)))
43750+#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
43751+#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
43752+
43753+#define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
43754+#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
43755+#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
43756+
43757+/* PTE - Level 1 access. */
43758+
43759+/* page, protection -> pte */
43760+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
43761+#define mk_pte_huge(entry) (__pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
43762+
43763+/* physical address -> PTE */
43764+static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot)
43765+{
43766+ unsigned long pteval;
43767+ pteval = physpage | pgprot_val(pgprot);
43768+ return __pte(pteval);
43769+}
43770+
43771+/* Change flags of a PTE */
43772+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
43773+{
43774+ /*
43775+ * Since this might change the present bit (which controls whether
43776+ * a pte_t object has undergone p2m translation), we must use
43777+ * pte_val() on the input pte and __pte() for the return value.
43778+ */
43779+ unsigned long pteval = pte_val(pte);
43780+
43781+ pteval &= _PAGE_CHG_MASK;
43782+ pteval |= pgprot_val(newprot);
43783+ pteval &= __supported_pte_mask;
43784+ return __pte(pteval);
43785+}
43786+
43787+#define pte_index(address) \
43788+ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
43789+#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \
43790+ pte_index(address))
43791+
43792+/* x86-64 always has all page tables mapped. */
43793+#define pte_offset_map(dir,address) pte_offset_kernel(dir,address)
43794+#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address)
43795+#define pte_unmap(pte) /* NOP */
43796+#define pte_unmap_nested(pte) /* NOP */
43797+
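As a rough illustration of how the level accessors above combine, here is a hypothetical helper (not in the patch) that walks from the kernel pgd down to the pte mapping a kernel virtual address; it bails out on 2MB mappings for simplicity.

/*
 * Sketch: walk from the kernel page-table root to the pte for a kernel
 * virtual address, using the accessors defined above.
 */
static pte_t *lookup_kernel_pte(unsigned long address)
{
	pgd_t *pgd = pgd_offset_k(address);
	pud_t *pud;
	pmd_t *pmd;

	if (!pgd_present(*pgd))
		return NULL;
	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return NULL;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd) || pmd_large(*pmd))
		return NULL;	/* ignore 2MB mappings in this sketch */
	return pte_offset_kernel(pmd, address);
}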
43798+#define update_mmu_cache(vma,address,pte) do { } while (0)
43799+
43800+/*
43801+ * Rules for using ptep_establish: the pte MUST be a user pte, and
43802+ * must be a present->present transition.
43803+ */
43804+#define __HAVE_ARCH_PTEP_ESTABLISH
43805+#define ptep_establish(vma, address, ptep, pteval) \
43806+ do { \
43807+ if ( likely((vma)->vm_mm == current->mm) ) { \
43808+ BUG_ON(HYPERVISOR_update_va_mapping(address, \
43809+ pteval, \
43810+ (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
43811+ UVMF_INVLPG|UVMF_MULTI)); \
43812+ } else { \
43813+ xen_l1_entry_update(ptep, pteval); \
43814+ flush_tlb_page(vma, address); \
43815+ } \
43816+ } while (0)
43817+
43818+/* We only update the dirty/accessed state if we set
43819+ * the dirty bit by hand in the kernel, since the hardware
43820+ * will do the accessed bit for us, and we don't want to
43821+ * race with other CPU's that might be updating the dirty
43822+ * bit at the same time. */
43823+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
43824+#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
43825+ do { \
43826+ if (dirty) \
43827+ ptep_establish(vma, address, ptep, entry); \
43828+ } while (0)
43829+
43830+/* Encode and de-code a swap entry */
43831+#define __swp_type(x) (((x).val >> 1) & 0x3f)
43832+#define __swp_offset(x) ((x).val >> 8)
43833+#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
43834+#define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
43835+#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
43836+
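A small stand-alone sketch of the swap-entry bit layout encoded by the macros above, applying the same shifts to a plain unsigned long; the type and offset values are arbitrary.

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long type = 5, offset = 0x1234;
	unsigned long val = (type << 1) | (offset << 8);	/* __swp_entry  */

	assert(((val >> 1) & 0x3f) == type);			/* __swp_type   */
	assert((val >> 8) == offset);				/* __swp_offset */
	printf("entry 0x%lx -> type %lu, offset 0x%lx\n",
	       val, (val >> 1) & 0x3f, val >> 8);
	return 0;
}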
43837+extern spinlock_t pgd_lock;
43838+extern struct page *pgd_list;
43839+void vmalloc_sync_all(void);
43840+
43841+#endif /* !__ASSEMBLY__ */
43842+
43843+extern int kern_addr_valid(unsigned long addr);
43844+
43845+#define DOMID_LOCAL (0xFFFFU)
43846+
43847+struct vm_area_struct;
43848+
43849+int direct_remap_pfn_range(struct vm_area_struct *vma,
43850+ unsigned long address,
43851+ unsigned long mfn,
43852+ unsigned long size,
43853+ pgprot_t prot,
43854+ domid_t domid);
43855+
43856+int direct_kernel_remap_pfn_range(unsigned long address,
43857+ unsigned long mfn,
43858+ unsigned long size,
43859+ pgprot_t prot,
43860+ domid_t domid);
43861+
43862+int create_lookup_pte_addr(struct mm_struct *mm,
43863+ unsigned long address,
43864+ uint64_t *ptep);
43865+
43866+int touch_pte_range(struct mm_struct *mm,
43867+ unsigned long address,
43868+ unsigned long size);
43869+
43870+int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
43871+ unsigned long addr, unsigned long end, pgprot_t newprot);
43872+
43873+#define arch_change_pte_range(mm, pmd, addr, end, newprot) \
43874+ xen_change_pte_range(mm, pmd, addr, end, newprot)
43875+
43876+#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
43877+ direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
43878+
43879+#define MK_IOSPACE_PFN(space, pfn) (pfn)
43880+#define GET_IOSPACE(pfn) 0
43881+#define GET_PFN(pfn) (pfn)
43882+
43883+#define HAVE_ARCH_UNMAPPED_AREA
43884+
43885+#define pgtable_cache_init() do { } while (0)
43886+#define check_pgt_cache() do { } while (0)
43887+
43888+#define PAGE_AGP PAGE_KERNEL_NOCACHE
43889+#define HAVE_PAGE_AGP 1
43890+
43891+/* fs/proc/kcore.c */
43892+#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
43893+#define kc_offset_to_vaddr(o) \
43894+ (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
43895+
43896+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
43897+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
43898+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
43899+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
43900+#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
43901+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
43902+#define __HAVE_ARCH_PTE_SAME
43903+#include <asm-generic/pgtable.h>
43904+
43905+#endif /* _X86_64_PGTABLE_H */
43906Index: head-2008-11-25/include/asm-x86/mach-xen/asm/processor_64.h
43907===================================================================
43908--- /dev/null 1970-01-01 00:00:00.000000000 +0000
43909+++ head-2008-11-25/include/asm-x86/mach-xen/asm/processor_64.h 2008-03-06 08:54:32.000000000 +0100
43910@@ -0,0 +1,502 @@
43911+/*
43912+ * include/asm-x86_64/processor.h
43913+ *
43914+ * Copyright (C) 1994 Linus Torvalds
43915+ */
43916+
43917+#ifndef __ASM_X86_64_PROCESSOR_H
43918+#define __ASM_X86_64_PROCESSOR_H
43919+
43920+#include <asm/segment.h>
43921+#include <asm/page.h>
43922+#include <asm/types.h>
43923+#include <asm/sigcontext.h>
43924+#include <asm/cpufeature.h>
43925+#include <linux/threads.h>
43926+#include <asm/msr.h>
43927+#include <asm/current.h>
43928+#include <asm/system.h>
43929+#include <asm/mmsegment.h>
43930+#include <asm/percpu.h>
43931+#include <linux/personality.h>
43932+#include <linux/cpumask.h>
43933+
43934+#define TF_MASK 0x00000100
43935+#define IF_MASK 0x00000200
43936+#define IOPL_MASK 0x00003000
43937+#define NT_MASK 0x00004000
43938+#define VM_MASK 0x00020000
43939+#define AC_MASK 0x00040000
43940+#define VIF_MASK 0x00080000 /* virtual interrupt flag */
43941+#define VIP_MASK 0x00100000 /* virtual interrupt pending */
43942+#define ID_MASK 0x00200000
43943+
43944+#define desc_empty(desc) \
43945+ (!((desc)->a | (desc)->b))
43946+
43947+#define desc_equal(desc1, desc2) \
43948+ (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
43949+
43950+/*
43951+ * Default implementation of macro that returns current
43952+ * instruction pointer ("program counter").
43953+ */
43954+#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
43955+
43956+/*
43957+ * CPU type and hardware bug flags. Kept separately for each CPU.
43958+ */
43959+
43960+struct cpuinfo_x86 {
43961+ __u8 x86; /* CPU family */
43962+ __u8 x86_vendor; /* CPU vendor */
43963+ __u8 x86_model;
43964+ __u8 x86_mask;
43965+ int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
43966+ __u32 x86_capability[NCAPINTS];
43967+ char x86_vendor_id[16];
43968+ char x86_model_id[64];
43969+ int x86_cache_size; /* in KB */
43970+ int x86_clflush_size;
43971+ int x86_cache_alignment;
43972+	int	x86_tlbsize;	/* number of 4K pages in DTLB/ITLB combined */
43973+ __u8 x86_virt_bits, x86_phys_bits;
43974+ __u8 x86_max_cores; /* cpuid returned max cores value */
43975+ __u32 x86_power;
43976+ __u32 extended_cpuid_level; /* Max extended CPUID function supported */
43977+ unsigned long loops_per_jiffy;
43978+#ifdef CONFIG_SMP
43979+ cpumask_t llc_shared_map; /* cpus sharing the last level cache */
43980+#endif
43981+ __u8 apicid;
43982+#ifdef CONFIG_SMP
43983+ __u8 booted_cores; /* number of cores as seen by OS */
43984+ __u8 phys_proc_id; /* Physical Processor id. */
43985+ __u8 cpu_core_id; /* Core id. */
43986+#endif
43987+} ____cacheline_aligned;
43988+
43989+#define X86_VENDOR_INTEL 0
43990+#define X86_VENDOR_CYRIX 1
43991+#define X86_VENDOR_AMD 2
43992+#define X86_VENDOR_UMC 3
43993+#define X86_VENDOR_NEXGEN 4
43994+#define X86_VENDOR_CENTAUR 5
43995+#define X86_VENDOR_RISE 6
43996+#define X86_VENDOR_TRANSMETA 7
43997+#define X86_VENDOR_NUM 8
43998+#define X86_VENDOR_UNKNOWN 0xff
43999+
44000+#ifdef CONFIG_SMP
44001+extern struct cpuinfo_x86 cpu_data[];
44002+#define current_cpu_data cpu_data[smp_processor_id()]
44003+#else
44004+#define cpu_data (&boot_cpu_data)
44005+#define current_cpu_data boot_cpu_data
44006+#endif
44007+
44008+extern char ignore_irq13;
44009+
44010+extern void identify_cpu(struct cpuinfo_x86 *);
44011+extern void print_cpu_info(struct cpuinfo_x86 *);
44012+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
44013+extern unsigned short num_cache_leaves;
44014+
44015+/*
44016+ * EFLAGS bits
44017+ */
44018+#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
44019+#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
44020+#define X86_EFLAGS_AF	0x00000010 /* Auxiliary carry Flag */
44021+#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
44022+#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
44023+#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
44024+#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
44025+#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
44026+#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
44027+#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
44028+#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
44029+#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
44030+#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
44031+#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
44032+#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
44033+#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
44034+#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
44035+
44036+/*
44037+ * Intel CPU features in CR4
44038+ */
44039+#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
44040+#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
44041+#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
44042+#define X86_CR4_DE 0x0008 /* enable debugging extensions */
44043+#define X86_CR4_PSE 0x0010 /* enable page size extensions */
44044+#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
44045+#define X86_CR4_MCE 0x0040 /* Machine check enable */
44046+#define X86_CR4_PGE 0x0080 /* enable global pages */
44047+#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
44048+#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
44049+#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
44050+
44051+/*
44052+ * Save the cr4 feature set we're using (ie
44053+ * Pentium 4MB enable and PPro Global page
44054+ * enable), so that any CPU's that boot up
44055+ * after us can get the correct flags.
44056+ */
44057+extern unsigned long mmu_cr4_features;
44058+
44059+static inline void set_in_cr4 (unsigned long mask)
44060+{
44061+ mmu_cr4_features |= mask;
44062+ __asm__("movq %%cr4,%%rax\n\t"
44063+ "orq %0,%%rax\n\t"
44064+ "movq %%rax,%%cr4\n"
44065+ : : "irg" (mask)
44066+ :"ax");
44067+}
44068+
44069+static inline void clear_in_cr4 (unsigned long mask)
44070+{
44071+ mmu_cr4_features &= ~mask;
44072+ __asm__("movq %%cr4,%%rax\n\t"
44073+ "andq %0,%%rax\n\t"
44074+ "movq %%rax,%%cr4\n"
44075+ : : "irg" (~mask)
44076+ :"ax");
44077+}
44078+
44079+
44080+/*
44081+ * User space process size: 47 bits minus one guard page.
44082+ */
44083+#define TASK_SIZE64 (0x800000000000UL - 4096)
44084+
44085+/* This decides where the kernel will search for a free chunk of vm
44086+ * space during mmap's.
44087+ */
44088+#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
44089+
44090+#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
44091+#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
44092+
44093+#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3)
44094+
44095+/*
44096+ * Size of io_bitmap.
44097+ */
44098+#define IO_BITMAP_BITS 65536
44099+#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
44100+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
44101+#ifndef CONFIG_X86_NO_TSS
44102+#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
44103+#endif
44104+#define INVALID_IO_BITMAP_OFFSET 0x8000
44105+
44106+struct i387_fxsave_struct {
44107+ u16 cwd;
44108+ u16 swd;
44109+ u16 twd;
44110+ u16 fop;
44111+ u64 rip;
44112+ u64 rdp;
44113+ u32 mxcsr;
44114+ u32 mxcsr_mask;
44115+ u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
44116+ u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 128 bytes */
44117+ u32 padding[24];
44118+} __attribute__ ((aligned (16)));
44119+
44120+union i387_union {
44121+ struct i387_fxsave_struct fxsave;
44122+};
44123+
44124+#ifndef CONFIG_X86_NO_TSS
44125+struct tss_struct {
44126+ u32 reserved1;
44127+ u64 rsp0;
44128+ u64 rsp1;
44129+ u64 rsp2;
44130+ u64 reserved2;
44131+ u64 ist[7];
44132+ u32 reserved3;
44133+ u32 reserved4;
44134+ u16 reserved5;
44135+ u16 io_bitmap_base;
44136+ /*
44137+ * The extra 1 is there because the CPU will access an
44138+ * additional byte beyond the end of the IO permission
44139+ * bitmap. The extra byte must be all 1 bits, and must
44140+ * be within the limit. Thus we have:
44141+ *
44142+ * 128 bytes, the bitmap itself, for ports 0..0x3ff
44143+ * 8 bytes, for an extra "long" of ~0UL
44144+ */
44145+ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
44146+} __attribute__((packed)) ____cacheline_aligned;
44147+
44148+DECLARE_PER_CPU(struct tss_struct,init_tss);
44149+#endif
44150+
44151+
44152+extern struct cpuinfo_x86 boot_cpu_data;
44153+#ifndef CONFIG_X86_NO_TSS
44154+/* Save the original ist values for checking stack pointers during debugging */
44155+struct orig_ist {
44156+ unsigned long ist[7];
44157+};
44158+DECLARE_PER_CPU(struct orig_ist, orig_ist);
44159+#endif
44160+
44161+#ifdef CONFIG_X86_VSMP
44162+#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
44163+#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
44164+#else
44165+#define ARCH_MIN_TASKALIGN 16
44166+#define ARCH_MIN_MMSTRUCT_ALIGN 0
44167+#endif
44168+
44169+struct thread_struct {
44170+ unsigned long rsp0;
44171+ unsigned long rsp;
44172+ unsigned long userrsp; /* Copy from PDA */
44173+ unsigned long fs;
44174+ unsigned long gs;
44175+ unsigned short es, ds, fsindex, gsindex;
44176+/* Hardware debugging registers */
44177+ unsigned long debugreg0;
44178+ unsigned long debugreg1;
44179+ unsigned long debugreg2;
44180+ unsigned long debugreg3;
44181+ unsigned long debugreg6;
44182+ unsigned long debugreg7;
44183+/* fault info */
44184+ unsigned long cr2, trap_no, error_code;
44185+/* floating point info */
44186+ union i387_union i387 __attribute__((aligned(16)));
44187+/* IO permissions. The bitmap could be moved into the GDT; that would make
44188+   switching faster for a limited number of ioperm-using tasks. -AK */
44189+ int ioperm;
44190+ unsigned long *io_bitmap_ptr;
44191+ unsigned io_bitmap_max;
44192+/* cached TLS descriptors. */
44193+ u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
44194+ unsigned int iopl;
44195+} __attribute__((aligned(16)));
44196+
44197+#define INIT_THREAD { \
44198+ .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
44199+}
44200+
44201+#ifndef CONFIG_X86_NO_TSS
44202+#define INIT_TSS { \
44203+ .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
44204+}
44205+#endif
44206+
44207+#define INIT_MMAP \
44208+{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
44209+
44210+#define start_thread(regs,new_rip,new_rsp) do { \
44211+ asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
44212+ load_gs_index(0); \
44213+ (regs)->rip = (new_rip); \
44214+ (regs)->rsp = (new_rsp); \
44215+ write_pda(oldrsp, (new_rsp)); \
44216+ (regs)->cs = __USER_CS; \
44217+ (regs)->ss = __USER_DS; \
44218+ (regs)->eflags = 0x200; \
44219+ set_fs(USER_DS); \
44220+} while(0)
44221+
44222+#define get_debugreg(var, register) \
44223+ var = HYPERVISOR_get_debugreg(register)
44224+#define set_debugreg(value, register) do { \
44225+ if (HYPERVISOR_set_debugreg(register, value)) \
44226+ BUG(); \
44227+} while (0)
44228+
44229+struct task_struct;
44230+struct mm_struct;
44231+
44232+/* Free all resources held by a thread. */
44233+extern void release_thread(struct task_struct *);
44234+
44235+/* Prepare to copy thread state - unlazy all lazy status */
44236+extern void prepare_to_copy(struct task_struct *tsk);
44237+
44238+/*
44239+ * create a kernel thread without removing it from tasklists
44240+ */
44241+extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
44242+
44243+/*
44244+ * Return saved PC of a blocked thread.
44245+ * What is this good for? It will always be the scheduler or ret_from_fork.
44246+ */
44247+#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
44248+
44249+extern unsigned long get_wchan(struct task_struct *p);
44250+#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
44251+#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
44252+#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
44253+
44254+
44255+struct microcode_header {
44256+ unsigned int hdrver;
44257+ unsigned int rev;
44258+ unsigned int date;
44259+ unsigned int sig;
44260+ unsigned int cksum;
44261+ unsigned int ldrver;
44262+ unsigned int pf;
44263+ unsigned int datasize;
44264+ unsigned int totalsize;
44265+ unsigned int reserved[3];
44266+};
44267+
44268+struct microcode {
44269+ struct microcode_header hdr;
44270+ unsigned int bits[0];
44271+};
44272+
44273+typedef struct microcode microcode_t;
44274+typedef struct microcode_header microcode_header_t;
44275+
44276+/* microcode format is extended from prescott processors */
44277+struct extended_signature {
44278+ unsigned int sig;
44279+ unsigned int pf;
44280+ unsigned int cksum;
44281+};
44282+
44283+struct extended_sigtable {
44284+ unsigned int count;
44285+ unsigned int cksum;
44286+ unsigned int reserved[3];
44287+ struct extended_signature sigs[0];
44288+};
44289+
44290+
44291+#define ASM_NOP1 K8_NOP1
44292+#define ASM_NOP2 K8_NOP2
44293+#define ASM_NOP3 K8_NOP3
44294+#define ASM_NOP4 K8_NOP4
44295+#define ASM_NOP5 K8_NOP5
44296+#define ASM_NOP6 K8_NOP6
44297+#define ASM_NOP7 K8_NOP7
44298+#define ASM_NOP8 K8_NOP8
44299+
44300+/* Opteron nops */
44301+#define K8_NOP1 ".byte 0x90\n"
44302+#define K8_NOP2 ".byte 0x66,0x90\n"
44303+#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
44304+#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
44305+#define K8_NOP5 K8_NOP3 K8_NOP2
44306+#define K8_NOP6 K8_NOP3 K8_NOP3
44307+#define K8_NOP7 K8_NOP4 K8_NOP3
44308+#define K8_NOP8 K8_NOP4 K8_NOP4
44309+
44310+#define ASM_NOP_MAX 8
44311+
44312+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
44313+static inline void rep_nop(void)
44314+{
44315+ __asm__ __volatile__("rep;nop": : :"memory");
44316+}
44317+
44318+/* Stop speculative execution */
44319+static inline void sync_core(void)
44320+{
44321+ int tmp;
44322+ asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
44323+}
44324+
44325+#define cpu_has_fpu 1
44326+
44327+#define ARCH_HAS_PREFETCH
44328+static inline void prefetch(void *x)
44329+{
44330+ asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
44331+}
44332+
44333+#define ARCH_HAS_PREFETCHW 1
44334+static inline void prefetchw(void *x)
44335+{
44336+ alternative_input("prefetcht0 (%1)",
44337+ "prefetchw (%1)",
44338+ X86_FEATURE_3DNOW,
44339+ "r" (x));
44340+}
44341+
44342+#define ARCH_HAS_SPINLOCK_PREFETCH 1
44343+
44344+#define spin_lock_prefetch(x) prefetchw(x)
44345+
44346+#define cpu_relax() rep_nop()
44347+
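For illustration, the kind of busy-wait loop rep_nop()/cpu_relax() are intended for, sketched here with C11 atomics and a hypothetical ready flag rather than kernel primitives.

#include <stdatomic.h>

static atomic_int ready;	/* hypothetical flag set by another thread */

static void wait_for_ready(void)
{
	while (!atomic_load_explicit(&ready, memory_order_acquire))
		__asm__ __volatile__("rep; nop" ::: "memory");	/* i.e. cpu_relax() */
}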
44348+/*
44349+ * NSC/Cyrix CPU configuration register indexes
44350+ */
44351+#define CX86_CCR0 0xc0
44352+#define CX86_CCR1 0xc1
44353+#define CX86_CCR2 0xc2
44354+#define CX86_CCR3 0xc3
44355+#define CX86_CCR4 0xe8
44356+#define CX86_CCR5 0xe9
44357+#define CX86_CCR6 0xea
44358+#define CX86_CCR7 0xeb
44359+#define CX86_DIR0 0xfe
44360+#define CX86_DIR1 0xff
44361+#define CX86_ARR_BASE 0xc4
44362+#define CX86_RCR_BASE 0xdc
44363+
44364+/*
44365+ * NSC/Cyrix CPU indexed register access macros
44366+ */
44367+
44368+#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
44369+
44370+#define setCx86(reg, data) do { \
44371+ outb((reg), 0x22); \
44372+ outb((data), 0x23); \
44373+} while (0)
44374+
44375+static inline void serialize_cpu(void)
44376+{
44377+ __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
44378+}
44379+
44380+static inline void __monitor(const void *eax, unsigned long ecx,
44381+ unsigned long edx)
44382+{
44383+ /* "monitor %eax,%ecx,%edx;" */
44384+ asm volatile(
44385+ ".byte 0x0f,0x01,0xc8;"
44386+ : :"a" (eax), "c" (ecx), "d"(edx));
44387+}
44388+
44389+static inline void __mwait(unsigned long eax, unsigned long ecx)
44390+{
44391+ /* "mwait %eax,%ecx;" */
44392+ asm volatile(
44393+ ".byte 0x0f,0x01,0xc9;"
44394+ : :"a" (eax), "c" (ecx));
44395+}
44396+
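A sketch of the usual monitor/mwait idle pattern these helpers support; wakeup_flag stands in for the real wakeup condition and is purely illustrative.

static volatile int wakeup_flag;	/* illustrative wakeup condition */

static void mwait_idle_once(void)
{
	__monitor((const void *)&wakeup_flag, 0, 0);	/* arm monitoring on the flag's cache line */
	if (!wakeup_flag)
		__mwait(0, 0);				/* sleep until that line is written */
}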
44397+#define stack_current() \
44398+({ \
44399+ struct thread_info *ti; \
44400+ asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
44401+ ti->task; \
44402+})
44403+
44404+#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
44405+
44406+extern unsigned long boot_option_idle_override;
44407+/* Boot loader type from the setup header */
44408+extern int bootloader_type;
44409+
44410+#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
44411+
44412+#endif /* __ASM_X86_64_PROCESSOR_H */
44413Index: head-2008-11-25/include/asm-x86/mach-xen/asm/smp_64.h
44414===================================================================
44415--- /dev/null 1970-01-01 00:00:00.000000000 +0000
44416+++ head-2008-11-25/include/asm-x86/mach-xen/asm/smp_64.h 2007-06-12 13:14:13.000000000 +0200
44417@@ -0,0 +1,150 @@
44418+#ifndef __ASM_SMP_H
44419+#define __ASM_SMP_H
44420+
44421+/*
44422+ * We need the APIC definitions automatically as part of 'smp.h'
44423+ */
44424+#ifndef __ASSEMBLY__
44425+#include <linux/threads.h>
44426+#include <linux/cpumask.h>
44427+#include <linux/bitops.h>
44428+extern int disable_apic;
44429+#endif
44430+
44431+#ifdef CONFIG_X86_LOCAL_APIC
44432+#ifndef __ASSEMBLY__
44433+#include <asm/fixmap.h>
44434+#include <asm/mpspec.h>
44435+#ifdef CONFIG_X86_IO_APIC
44436+#include <asm/io_apic.h>
44437+#endif
44438+#include <asm/apic.h>
44439+#include <asm/thread_info.h>
44440+#endif
44441+#endif
44442+
44443+#ifdef CONFIG_SMP
44444+#ifndef ASSEMBLY
44445+
44446+#include <asm/pda.h>
44447+
44448+struct pt_regs;
44449+
44450+extern cpumask_t cpu_present_mask;
44451+extern cpumask_t cpu_possible_map;
44452+extern cpumask_t cpu_online_map;
44453+extern cpumask_t cpu_initialized;
44454+
44455+/*
44456+ * Private routines/data
44457+ */
44458+
44459+extern void smp_alloc_memory(void);
44460+extern volatile unsigned long smp_invalidate_needed;
44461+extern int pic_mode;
44462+extern void lock_ipi_call_lock(void);
44463+extern void unlock_ipi_call_lock(void);
44464+extern int smp_num_siblings;
44465+extern void smp_send_reschedule(int cpu);
44466+void smp_stop_cpu(void);
44467+extern int smp_call_function_single(int cpuid, void (*func) (void *info),
44468+ void *info, int retry, int wait);
44469+
44470+extern cpumask_t cpu_sibling_map[NR_CPUS];
44471+extern cpumask_t cpu_core_map[NR_CPUS];
44472+extern u8 cpu_llc_id[NR_CPUS];
44473+
44474+#define SMP_TRAMPOLINE_BASE 0x6000
44475+
44476+/*
44477+ * On x86 all CPUs are mapped 1:1 to the APIC space.
44478+ * This simplifies scheduling and IPI sending and
44479+ * compresses data structures.
44480+ */
44481+
44482+static inline int num_booting_cpus(void)
44483+{
44484+ return cpus_weight(cpu_possible_map);
44485+}
44486+
44487+#define raw_smp_processor_id() read_pda(cpunumber)
44488+
44489+#ifdef CONFIG_X86_LOCAL_APIC
44490+static inline int hard_smp_processor_id(void)
44491+{
44492+ /* we don't want to mark this access volatile - bad code generation */
44493+ return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
44494+}
44495+#endif
44496+
44497+extern int safe_smp_processor_id(void);
44498+extern int __cpu_disable(void);
44499+extern void __cpu_die(unsigned int cpu);
44500+extern void prefill_possible_map(void);
44501+extern unsigned num_processors;
44502+extern unsigned disabled_cpus;
44503+
44504+#endif /* !ASSEMBLY */
44505+
44506+#define NO_PROC_ID 0xFF /* No processor magic marker */
44507+
44508+#endif
44509+
44510+#ifndef ASSEMBLY
44511+/*
44512+ * Some lowlevel functions might want to know about
44513+ * the real APIC ID <-> CPU # mapping.
44514+ */
44515+extern u8 x86_cpu_to_apicid[NR_CPUS]; /* physical ID */
44516+extern u8 x86_cpu_to_log_apicid[NR_CPUS];
44517+extern u8 bios_cpu_apicid[];
44518+
44519+#ifdef CONFIG_X86_LOCAL_APIC
44520+static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
44521+{
44522+ return cpus_addr(cpumask)[0];
44523+}
44524+
44525+static inline int cpu_present_to_apicid(int mps_cpu)
44526+{
44527+ if (mps_cpu < NR_CPUS)
44528+ return (int)bios_cpu_apicid[mps_cpu];
44529+ else
44530+ return BAD_APICID;
44531+}
44532+#endif
44533+
44534+#endif /* !ASSEMBLY */
44535+
44536+#ifndef CONFIG_SMP
44537+#define stack_smp_processor_id() 0
44538+#define safe_smp_processor_id() 0
44539+#define cpu_logical_map(x) (x)
44540+#else
44541+#include <asm/thread_info.h>
44542+#define stack_smp_processor_id() \
44543+({ \
44544+ struct thread_info *ti; \
44545+ __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
44546+ ti->cpu; \
44547+})
44548+#endif
44549+
44550+#ifndef __ASSEMBLY__
44551+#ifdef CONFIG_X86_LOCAL_APIC
44552+static __inline int logical_smp_processor_id(void)
44553+{
44554+ /* we don't want to mark this access volatile - bad code generation */
44555+ return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
44556+}
44557+#endif
44558+#endif
44559+
44560+#ifdef CONFIG_SMP
44561+#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
44562+#else
44563+#define cpu_physical_id(cpu) boot_cpu_id
44564+#endif
44565+
44566+#endif
44567+
44568Index: head-2008-11-25/include/asm-x86/mach-xen/asm/system_64.h
44569===================================================================
44570--- /dev/null 1970-01-01 00:00:00.000000000 +0000
44571+++ head-2008-11-25/include/asm-x86/mach-xen/asm/system_64.h 2007-11-26 16:59:25.000000000 +0100
44572@@ -0,0 +1,256 @@
44573+#ifndef __ASM_SYSTEM_H
44574+#define __ASM_SYSTEM_H
44575+
44576+#include <linux/kernel.h>
44577+#include <asm/segment.h>
44578+#include <asm/alternative.h>
44579+
44580+#include <asm/synch_bitops.h>
44581+#include <asm/hypervisor.h>
44582+#include <xen/interface/arch-x86_64.h>
44583+
44584+#ifdef __KERNEL__
44585+
44586+#define __STR(x) #x
44587+#define STR(x) __STR(x)
44588+
44589+#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
44590+#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
44591+
44592+/* frame pointer must be last for get_wchan */
44593+#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
44594+#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\n\t"
44595+
44596+#define __EXTRA_CLOBBER \
44597+ ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
44598+
44599+#define switch_to(prev,next,last) \
44600+ asm volatile(SAVE_CONTEXT \
44601+ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
44602+ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
44603+ "call __switch_to\n\t" \
44604+ ".globl thread_return\n" \
44605+ "thread_return:\n\t" \
44606+ "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
44607+ "movq %P[thread_info](%%rsi),%%r8\n\t" \
44608+ LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
44609+ "movq %%rax,%%rdi\n\t" \
44610+ "jc ret_from_fork\n\t" \
44611+ RESTORE_CONTEXT \
44612+ : "=a" (last) \
44613+ : [next] "S" (next), [prev] "D" (prev), \
44614+ [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
44615+ [ti_flags] "i" (offsetof(struct thread_info, flags)),\
44616+ [tif_fork] "i" (TIF_FORK), \
44617+ [thread_info] "i" (offsetof(struct task_struct, thread_info)), \
44618+ [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
44619+ : "memory", "cc" __EXTRA_CLOBBER)
44620+
44621+extern void load_gs_index(unsigned);
44622+
44623+/*
44624+ * Load a segment. Fall back on loading the zero
44625+ * segment if something goes wrong..
44626+ */
44627+#define loadsegment(seg,value) \
44628+ asm volatile("\n" \
44629+ "1:\t" \
44630+ "movl %k0,%%" #seg "\n" \
44631+ "2:\n" \
44632+ ".section .fixup,\"ax\"\n" \
44633+ "3:\t" \
44634+ "movl %1,%%" #seg "\n\t" \
44635+ "jmp 2b\n" \
44636+ ".previous\n" \
44637+ ".section __ex_table,\"a\"\n\t" \
44638+ ".align 8\n\t" \
44639+ ".quad 1b,3b\n" \
44640+ ".previous" \
44641+ : :"r" (value), "r" (0))
44642+
44643+/*
44644+ * Clear and set 'TS' bit respectively
44645+ */
44646+#define clts() (HYPERVISOR_fpu_taskswitch(0))
44647+
44648+static inline unsigned long read_cr0(void)
44649+{
44650+ unsigned long cr0;
44651+ asm volatile("movq %%cr0,%0" : "=r" (cr0));
44652+ return cr0;
44653+}
44654+
44655+static inline void write_cr0(unsigned long val)
44656+{
44657+ asm volatile("movq %0,%%cr0" :: "r" (val));
44658+}
44659+
44660+#define read_cr3() ({ \
44661+ unsigned long __dummy; \
44662+ asm("movq %%cr3,%0" : "=r" (__dummy)); \
44663+ machine_to_phys(__dummy); \
44664+})
44665+
44666+static inline unsigned long read_cr4(void)
44667+{
44668+ unsigned long cr4;
44669+ asm("movq %%cr4,%0" : "=r" (cr4));
44670+ return cr4;
44671+}
44672+
44673+static inline void write_cr4(unsigned long val)
44674+{
44675+ asm volatile("movq %0,%%cr4" :: "r" (val));
44676+}
44677+
44678+#define stts() (HYPERVISOR_fpu_taskswitch(1))
44679+
44680+#define wbinvd() \
44681+ __asm__ __volatile__ ("wbinvd": : :"memory");
44682+
44683+/*
44684+ * On SMP systems, when the scheduler does migration-cost autodetection,
44685+ * it needs a way to flush as much of the CPU's caches as possible.
44686+ */
44687+static inline void sched_cacheflush(void)
44688+{
44689+ wbinvd();
44690+}
44691+
44692+#endif /* __KERNEL__ */
44693+
44694+#define nop() __asm__ __volatile__ ("nop")
44695+
44696+#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
44697+
44698+#define tas(ptr) (xchg((ptr),1))
44699+
44700+#define __xg(x) ((volatile long *)(x))
44701+
44702+static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
44703+{
44704+ *ptr = val;
44705+}
44706+
44707+#define _set_64bit set_64bit
44708+
44709+/*
44710+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
44711+ * Note 2: xchg has side effect, so that attribute volatile is necessary,
44712+ * but generally the primitive is invalid, *ptr is output argument. --ANK
44713+ */
44714+static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
44715+{
44716+ switch (size) {
44717+ case 1:
44718+ __asm__ __volatile__("xchgb %b0,%1"
44719+ :"=q" (x)
44720+ :"m" (*__xg(ptr)), "0" (x)
44721+ :"memory");
44722+ break;
44723+ case 2:
44724+ __asm__ __volatile__("xchgw %w0,%1"
44725+ :"=r" (x)
44726+ :"m" (*__xg(ptr)), "0" (x)
44727+ :"memory");
44728+ break;
44729+ case 4:
44730+ __asm__ __volatile__("xchgl %k0,%1"
44731+ :"=r" (x)
44732+ :"m" (*__xg(ptr)), "0" (x)
44733+ :"memory");
44734+ break;
44735+ case 8:
44736+ __asm__ __volatile__("xchgq %0,%1"
44737+ :"=r" (x)
44738+ :"m" (*__xg(ptr)), "0" (x)
44739+ :"memory");
44740+ break;
44741+ }
44742+ return x;
44743+}
44744+
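As a usage sketch, claiming a simple flag with the xchg() wrapper above; the busy word is hypothetical.

static volatile unsigned long busy;	/* hypothetical flag word */

static int try_claim(void)
{
	/* nonzero only for the caller that saw the flag clear first */
	return xchg(&busy, 1UL) == 0;
}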
44745+/*
44746+ * Atomic compare and exchange. Compare OLD with MEM, if identical,
44747+ * store NEW in MEM. Return the initial value in MEM. Success is
44748+ * indicated by comparing RETURN with OLD.
44749+ */
44750+
44751+#define __HAVE_ARCH_CMPXCHG 1
44752+
44753+static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
44754+ unsigned long new, int size)
44755+{
44756+ unsigned long prev;
44757+ switch (size) {
44758+ case 1:
44759+ __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
44760+ : "=a"(prev)
44761+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
44762+ : "memory");
44763+ return prev;
44764+ case 2:
44765+ __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
44766+ : "=a"(prev)
44767+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
44768+ : "memory");
44769+ return prev;
44770+ case 4:
44771+ __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2"
44772+ : "=a"(prev)
44773+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
44774+ : "memory");
44775+ return prev;
44776+ case 8:
44777+ __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2"
44778+ : "=a"(prev)
44779+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
44780+ : "memory");
44781+ return prev;
44782+ }
44783+ return old;
44784+}
44785+
44786+#define cmpxchg(ptr,o,n)\
44787+ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
44788+ (unsigned long)(n),sizeof(*(ptr))))
44789+
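A sketch of the standard retry loop built on the cmpxchg() wrapper above; the counter is hypothetical, and the loop simply retries until no other CPU intervened between the read and the update.

static unsigned long counter;	/* hypothetical shared counter */

static void counter_inc(void)
{
	unsigned long old, next;

	do {
		old = counter;
		next = old + 1;
	} while (cmpxchg(&counter, old, next) != old);	/* retry if another CPU won */
}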
44790+#ifdef CONFIG_SMP
44791+#define smp_mb() mb()
44792+#define smp_rmb() rmb()
44793+#define smp_wmb() wmb()
44794+#define smp_read_barrier_depends() do {} while(0)
44795+#else
44796+#define smp_mb() barrier()
44797+#define smp_rmb() barrier()
44798+#define smp_wmb() barrier()
44799+#define smp_read_barrier_depends() do {} while(0)
44800+#endif
44801+
44802+
44803+/*
44804+ * Force strict CPU ordering.
44805+ * And yes, this is required on UP too when we're talking
44806+ * to devices.
44807+ */
44808+#define mb() asm volatile("mfence":::"memory")
44809+#define rmb() asm volatile("lfence":::"memory")
44810+
44811+#ifdef CONFIG_UNORDERED_IO
44812+#define wmb() asm volatile("sfence" ::: "memory")
44813+#else
44814+#define wmb() asm volatile("" ::: "memory")
44815+#endif
44816+#define read_barrier_depends() do {} while(0)
44817+#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
44818+
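For reference, the producer/consumer pairing these barriers exist for, sketched with hypothetical data/flag variables: smp_wmb() orders the data store before the flag store, and smp_rmb() orders the flag read before the data read.

static int data, flag;		/* hypothetical shared variables */

static void producer(void)
{
	data = 42;
	smp_wmb();		/* publish data before the flag */
	flag = 1;
}

static int consumer(void)
{
	while (!flag)
		cpu_relax();	/* cpu_relax() from processor_64.h above */
	smp_rmb();		/* read the flag before reading the data */
	return data;
}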
44819+#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
44820+
44821+#include <linux/irqflags.h>
44822+
44823+void cpu_idle_wait(void);
44824+
44825+extern unsigned long arch_align_stack(unsigned long sp);
44826+extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
44827+
44828+#endif
44829Index: head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_64.h
44830===================================================================
44831--- /dev/null 1970-01-01 00:00:00.000000000 +0000
44832+++ head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_64.h 2007-11-26 16:59:25.000000000 +0100
44833@@ -0,0 +1,103 @@
44834+#ifndef _X8664_TLBFLUSH_H
44835+#define _X8664_TLBFLUSH_H
44836+
44837+#include <linux/mm.h>
44838+#include <asm/processor.h>
44839+
44840+#define __flush_tlb() xen_tlb_flush()
44841+
44842+/*
44843+ * Global pages have to be flushed a bit differently. Not a real
44844+ * performance problem because this does not happen often.
44845+ */
44846+#define __flush_tlb_global() xen_tlb_flush()
44847+
44848+
44849+extern unsigned long pgkern_mask;
44850+
44851+#define __flush_tlb_all() __flush_tlb_global()
44852+
44853+#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
44854+
44855+
44856+/*
44857+ * TLB flushing:
44858+ *
44859+ * - flush_tlb() flushes the current mm struct TLBs
44860+ * - flush_tlb_all() flushes all processes TLBs
44861+ * - flush_tlb_mm(mm) flushes the specified mm context TLB's
44862+ * - flush_tlb_page(vma, vmaddr) flushes one page
44863+ * - flush_tlb_range(vma, start, end) flushes a range of pages
44864+ * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
44865+ * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
44866+ *
44867+ * x86-64 can only flush individual pages or full VMs. For a range flush
44868+ * we always do the full VM. It might be worth testing whether a few
44869+ * INVLPGs in a row are a win for a small range.
44870+ */
44871+
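A sketch of how the single-page flush is typically paired with a pte update, using the ptep_set_wrprotect() helper defined earlier in this patch; the function and its arguments are illustrative.

static void wrprotect_one_page(struct vm_area_struct *vma,
			       unsigned long addr, pte_t *ptep)
{
	ptep_set_wrprotect(vma->vm_mm, addr, ptep);
	flush_tlb_page(vma, addr);	/* only this page's translation changed */
}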
44872+#ifndef CONFIG_SMP
44873+
44874+#define flush_tlb() __flush_tlb()
44875+#define flush_tlb_all() __flush_tlb_all()
44876+#define local_flush_tlb() __flush_tlb()
44877+
44878+static inline void flush_tlb_mm(struct mm_struct *mm)
44879+{
44880+ if (mm == current->active_mm)
44881+ __flush_tlb();
44882+}
44883+
44884+static inline void flush_tlb_page(struct vm_area_struct *vma,
44885+ unsigned long addr)
44886+{
44887+ if (vma->vm_mm == current->active_mm)
44888+ __flush_tlb_one(addr);
44889+}
44890+
44891+static inline void flush_tlb_range(struct vm_area_struct *vma,
44892+ unsigned long start, unsigned long end)
44893+{
44894+ if (vma->vm_mm == current->active_mm)
44895+ __flush_tlb();
44896+}
44897+
44898+#else
44899+
44900+#include <asm/smp.h>
44901+
44902+#define local_flush_tlb() \
44903+ __flush_tlb()
44904+
44905+#define flush_tlb_all xen_tlb_flush_all
44906+#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
44907+#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
44908+#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
44909+
44910+#define flush_tlb() flush_tlb_current_task()
44911+
44912+static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
44913+{
44914+ flush_tlb_mm(vma->vm_mm);
44915+}
44916+
44917+#define TLBSTATE_OK 1
44918+#define TLBSTATE_LAZY 2
44919+
44920+/* Roughly an IPI every 20MB with 4k pages for freeing page table
44921+ ranges. Cost is about 42k of memory for each CPU. */
44922+#define ARCH_FREE_PTE_NR 5350
44923+
44924+#endif
44925+
44926+#define flush_tlb_kernel_range(start, end) flush_tlb_all()
44927+
44928+static inline void flush_tlb_pgtables(struct mm_struct *mm,
44929+ unsigned long start, unsigned long end)
44930+{
44931+ /* x86_64 does not keep any page table caches in a software TLB.
44932+ The CPUs do in their hardware TLBs, but they are handled
44933+ by the normal TLB flushing algorithms. */
44934+}
44935+
44936+#endif /* _X8664_TLBFLUSH_H */
44937Index: head-2008-11-25/include/asm-x86/mach-xen/asm/xor_64.h
44938===================================================================
44939--- /dev/null 1970-01-01 00:00:00.000000000 +0000
44940+++ head-2008-11-25/include/asm-x86/mach-xen/asm/xor_64.h 2007-06-12 13:14:13.000000000 +0200
44941@@ -0,0 +1,328 @@
44942+/*
44943+ * x86-64 changes / gcc fixes from Andi Kleen.
44944+ * Copyright 2002 Andi Kleen, SuSE Labs.
44945+ *
44946+ * This hasn't been optimized for the hammer yet, but there are likely
44947+ * no advantages to be gained from x86-64 here anyway.
44948+ */
44949+
44950+typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
44951+
44952+/* Doesn't use gcc to save the XMM registers, because there is no easy way to
44953+ tell it to do a clts before the register saving. */
44954+#define XMMS_SAVE do { \
44955+ preempt_disable(); \
44956+ if (!(current_thread_info()->status & TS_USEDFPU)) \
44957+ clts(); \
44958+ __asm__ __volatile__ ( \
44959+ "movups %%xmm0,(%1) ;\n\t" \
44960+ "movups %%xmm1,0x10(%1) ;\n\t" \
44961+ "movups %%xmm2,0x20(%1) ;\n\t" \
44962+ "movups %%xmm3,0x30(%1) ;\n\t" \
44963+ : "=&r" (cr0) \
44964+ : "r" (xmm_save) \
44965+ : "memory"); \
44966+} while(0)
44967+
44968+#define XMMS_RESTORE do { \
44969+ asm volatile ( \
44970+ "sfence ;\n\t" \
44971+ "movups (%1),%%xmm0 ;\n\t" \
44972+ "movups 0x10(%1),%%xmm1 ;\n\t" \
44973+ "movups 0x20(%1),%%xmm2 ;\n\t" \
44974+ "movups 0x30(%1),%%xmm3 ;\n\t" \
44975+ : \
44976+ : "r" (cr0), "r" (xmm_save) \
44977+ : "memory"); \
44978+ if (!(current_thread_info()->status & TS_USEDFPU)) \
44979+ stts(); \
44980+ preempt_enable(); \
44981+} while(0)
44982+
44983+#define OFFS(x) "16*("#x")"
44984+#define PF_OFFS(x) "256+16*("#x")"
44985+#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
44986+#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
44987+#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
44988+#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
44989+#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
44990+#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
44991+#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
44992+#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
44993+#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
44994+#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
44995+#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
44996+#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
44997+#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
44998+
44999+
45000+static void
45001+xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
45002+{
45003+ unsigned int lines = bytes >> 8;
45004+ unsigned long cr0;
45005+ xmm_store_t xmm_save[4];
45006+
45007+ XMMS_SAVE;
45008+
45009+ asm volatile (
45010+#undef BLOCK
45011+#define BLOCK(i) \
45012+ LD(i,0) \
45013+ LD(i+1,1) \
45014+ PF1(i) \
45015+ PF1(i+2) \
45016+ LD(i+2,2) \
45017+ LD(i+3,3) \
45018+ PF0(i+4) \
45019+ PF0(i+6) \
45020+ XO1(i,0) \
45021+ XO1(i+1,1) \
45022+ XO1(i+2,2) \
45023+ XO1(i+3,3) \
45024+ ST(i,0) \
45025+ ST(i+1,1) \
45026+ ST(i+2,2) \
45027+ ST(i+3,3) \
45028+
45029+
45030+ PF0(0)
45031+ PF0(2)
45032+
45033+ " .align 32 ;\n"
45034+ " 1: ;\n"
45035+
45036+ BLOCK(0)
45037+ BLOCK(4)
45038+ BLOCK(8)
45039+ BLOCK(12)
45040+
45041+ " addq %[inc], %[p1] ;\n"
45042+ " addq %[inc], %[p2] ;\n"
45043+ " decl %[cnt] ; jnz 1b"
45044+ : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
45045+ : [inc] "r" (256UL)
45046+ : "memory");
45047+
45048+ XMMS_RESTORE;
45049+}
45050+
45051+static void
45052+xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
45053+ unsigned long *p3)
45054+{
45055+ unsigned int lines = bytes >> 8;
45056+ xmm_store_t xmm_save[4];
45057+ unsigned long cr0;
45058+
45059+ XMMS_SAVE;
45060+
45061+ __asm__ __volatile__ (
45062+#undef BLOCK
45063+#define BLOCK(i) \
45064+ PF1(i) \
45065+ PF1(i+2) \
45066+ LD(i,0) \
45067+ LD(i+1,1) \
45068+ LD(i+2,2) \
45069+ LD(i+3,3) \
45070+ PF2(i) \
45071+ PF2(i+2) \
45072+ PF0(i+4) \
45073+ PF0(i+6) \
45074+ XO1(i,0) \
45075+ XO1(i+1,1) \
45076+ XO1(i+2,2) \
45077+ XO1(i+3,3) \
45078+ XO2(i,0) \
45079+ XO2(i+1,1) \
45080+ XO2(i+2,2) \
45081+ XO2(i+3,3) \
45082+ ST(i,0) \
45083+ ST(i+1,1) \
45084+ ST(i+2,2) \
45085+ ST(i+3,3) \
45086+
45087+
45088+ PF0(0)
45089+ PF0(2)
45090+
45091+ " .align 32 ;\n"
45092+ " 1: ;\n"
45093+
45094+ BLOCK(0)
45095+ BLOCK(4)
45096+ BLOCK(8)
45097+ BLOCK(12)
45098+
45099+ " addq %[inc], %[p1] ;\n"
45100+ " addq %[inc], %[p2] ;\n"
45101+ " addq %[inc], %[p3] ;\n"
45102+ " decl %[cnt] ; jnz 1b"
45103+ : [cnt] "+r" (lines),
45104+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
45105+ : [inc] "r" (256UL)
45106+ : "memory");
45107+ XMMS_RESTORE;
45108+}
45109+
45110+static void
45111+xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
45112+ unsigned long *p3, unsigned long *p4)
45113+{
45114+ unsigned int lines = bytes >> 8;
45115+ xmm_store_t xmm_save[4];
45116+ unsigned long cr0;
45117+
45118+ XMMS_SAVE;
45119+
45120+ __asm__ __volatile__ (
45121+#undef BLOCK
45122+#define BLOCK(i) \
45123+ PF1(i) \
45124+ PF1(i+2) \
45125+ LD(i,0) \
45126+ LD(i+1,1) \
45127+ LD(i+2,2) \
45128+ LD(i+3,3) \
45129+ PF2(i) \
45130+ PF2(i+2) \
45131+ XO1(i,0) \
45132+ XO1(i+1,1) \
45133+ XO1(i+2,2) \
45134+ XO1(i+3,3) \
45135+ PF3(i) \
45136+ PF3(i+2) \
45137+ PF0(i+4) \
45138+ PF0(i+6) \
45139+ XO2(i,0) \
45140+ XO2(i+1,1) \
45141+ XO2(i+2,2) \
45142+ XO2(i+3,3) \
45143+ XO3(i,0) \
45144+ XO3(i+1,1) \
45145+ XO3(i+2,2) \
45146+ XO3(i+3,3) \
45147+ ST(i,0) \
45148+ ST(i+1,1) \
45149+ ST(i+2,2) \
45150+ ST(i+3,3) \
45151+
45152+
45153+ PF0(0)
45154+ PF0(2)
45155+
45156+ " .align 32 ;\n"
45157+ " 1: ;\n"
45158+
45159+ BLOCK(0)
45160+ BLOCK(4)
45161+ BLOCK(8)
45162+ BLOCK(12)
45163+
45164+ " addq %[inc], %[p1] ;\n"
45165+ " addq %[inc], %[p2] ;\n"
45166+ " addq %[inc], %[p3] ;\n"
45167+ " addq %[inc], %[p4] ;\n"
45168+ " decl %[cnt] ; jnz 1b"
45169+ : [cnt] "+c" (lines),
45170+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
45171+ : [inc] "r" (256UL)
45172+ : "memory" );
45173+
45174+ XMMS_RESTORE;
45175+}
45176+
45177+static void
45178+xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
45179+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
45180+{
45181+ unsigned int lines = bytes >> 8;
45182+ xmm_store_t xmm_save[4];
45183+ unsigned long cr0;
45184+
45185+ XMMS_SAVE;
45186+
45187+ __asm__ __volatile__ (
45188+#undef BLOCK
45189+#define BLOCK(i) \
45190+ PF1(i) \
45191+ PF1(i+2) \
45192+ LD(i,0) \
45193+ LD(i+1,1) \
45194+ LD(i+2,2) \
45195+ LD(i+3,3) \
45196+ PF2(i) \
45197+ PF2(i+2) \
45198+ XO1(i,0) \
45199+ XO1(i+1,1) \
45200+ XO1(i+2,2) \
45201+ XO1(i+3,3) \
45202+ PF3(i) \
45203+ PF3(i+2) \
45204+ XO2(i,0) \
45205+ XO2(i+1,1) \
45206+ XO2(i+2,2) \
45207+ XO2(i+3,3) \
45208+ PF4(i) \
45209+ PF4(i+2) \
45210+ PF0(i+4) \
45211+ PF0(i+6) \
45212+ XO3(i,0) \
45213+ XO3(i+1,1) \
45214+ XO3(i+2,2) \
45215+ XO3(i+3,3) \
45216+ XO4(i,0) \
45217+ XO4(i+1,1) \
45218+ XO4(i+2,2) \
45219+ XO4(i+3,3) \
45220+ ST(i,0) \
45221+ ST(i+1,1) \
45222+ ST(i+2,2) \
45223+ ST(i+3,3) \
45224+
45225+
45226+ PF0(0)
45227+ PF0(2)
45228+
45229+ " .align 32 ;\n"
45230+ " 1: ;\n"
45231+
45232+ BLOCK(0)
45233+ BLOCK(4)
45234+ BLOCK(8)
45235+ BLOCK(12)
45236+
45237+ " addq %[inc], %[p1] ;\n"
45238+ " addq %[inc], %[p2] ;\n"
45239+ " addq %[inc], %[p3] ;\n"
45240+ " addq %[inc], %[p4] ;\n"
45241+ " addq %[inc], %[p5] ;\n"
45242+ " decl %[cnt] ; jnz 1b"
45243+ : [cnt] "+c" (lines),
45244+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
45245+ [p5] "+r" (p5)
45246+ : [inc] "r" (256UL)
45247+ : "memory");
45248+
45249+ XMMS_RESTORE;
45250+}
45251+
45252+static struct xor_block_template xor_block_sse = {
45253+ .name = "generic_sse",
45254+ .do_2 = xor_sse_2,
45255+ .do_3 = xor_sse_3,
45256+ .do_4 = xor_sse_4,
45257+ .do_5 = xor_sse_5,
45258+};
45259+
45260+#undef XOR_TRY_TEMPLATES
45261+#define XOR_TRY_TEMPLATES \
45262+ do { \
45263+ xor_speed(&xor_block_sse); \
45264+ } while (0)
45265+
45266+/* We force the use of the SSE xor block because it can write around L2.
45267+ We may also be able to load into the L1 only depending on how the cpu
45268+ deals with a load to a line that is being prefetched. */
45269+#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
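For reference, a minimal sketch of how these routines end up being driven: the generic xor code calls the template's do_N hooks with page-sized buffers, and each routine works through the data in 256-byte lines (lines = bytes >> 8), so the byte count is assumed to be a multiple of 256.

/* Sketch: XOR one page of 'src' into 'dst' via the 2-source routine above
 * (dst[i] ^= src[i]); PAGE_SIZE is a multiple of the 256-byte line size. */
static void example_xor_page(unsigned long *dst, unsigned long *src)
{
	xor_sse_2(PAGE_SIZE, dst, src);
}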
45270Index: head-2008-11-25/include/asm-x86/mach-xen/mach_time.h
45271===================================================================
45272--- /dev/null 1970-01-01 00:00:00.000000000 +0000
45273+++ head-2008-11-25/include/asm-x86/mach-xen/mach_time.h 2007-06-12 13:14:13.000000000 +0200
45274@@ -0,0 +1,111 @@
45275+/*
45276+ * include/asm-i386/mach-default/mach_time.h
45277+ *
45278+ * Machine specific set RTC function for generic.
45279+ * Split out from time.c by Osamu Tomita <tomita@cinet.co.jp>
45280+ */
45281+#ifndef _MACH_TIME_H
45282+#define _MACH_TIME_H
45283+
45284+#include <asm-i386/mc146818rtc.h>
45285+
45286+/* timing window for the set_rtc_mmss() call: 500 ms (see below) */
45287+/* used in arch/i386/time.c::do_timer_interrupt() */
45288+#define USEC_AFTER 500000
45289+#define USEC_BEFORE 500000
45290+
45291+/*
45292+ * In order to set the CMOS clock precisely, set_rtc_mmss has to be
45293+ * called 500 ms after the second nowtime has started, because when
45294+ * nowtime is written into the registers of the CMOS clock, it will
45295+ * jump to the next second precisely 500 ms later. Check the Motorola
45296+ * MC146818A or Dallas DS12887 data sheet for details.
45297+ *
45298+ * BUG: This routine does not handle hour overflow properly; it just
45299+ * sets the minutes. Usually you'll only notice that after reboot!
45300+ */
45301+static inline int mach_set_rtc_mmss(unsigned long nowtime)
45302+{
45303+ int retval = 0;
45304+ int real_seconds, real_minutes, cmos_minutes;
45305+ unsigned char save_control, save_freq_select;
45306+
45307+ save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */
45308+ CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
45309+
45310+ save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */
45311+ CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
45312+
45313+ cmos_minutes = CMOS_READ(RTC_MINUTES);
45314+ if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
45315+ BCD_TO_BIN(cmos_minutes);
45316+
45317+ /*
45318+ * since we're only adjusting minutes and seconds,
45319+ * don't interfere with hour overflow. This avoids
45320+ * messing with unknown time zones but requires your
45321+ * RTC not to be off by more than 15 minutes
45322+ */
45323+ real_seconds = nowtime % 60;
45324+ real_minutes = nowtime / 60;
45325+ if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
45326+ real_minutes += 30; /* correct for half hour time zone */
45327+ real_minutes %= 60;
45328+
45329+ if (abs(real_minutes - cmos_minutes) < 30) {
45330+ if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
45331+ BIN_TO_BCD(real_seconds);
45332+ BIN_TO_BCD(real_minutes);
45333+ }
45334+ CMOS_WRITE(real_seconds,RTC_SECONDS);
45335+ CMOS_WRITE(real_minutes,RTC_MINUTES);
45336+ } else {
45337+ printk(KERN_WARNING
45338+ "set_rtc_mmss: can't update from %d to %d\n",
45339+ cmos_minutes, real_minutes);
45340+ retval = -1;
45341+ }
45342+
45343+ /* The following flags have to be released exactly in this order,
45344+ * otherwise the DS12887 (popular MC146818A clone with integrated
45345+ * battery and quartz) will not reset the oscillator and will not
45346+ * update precisely 500 ms later. You won't find this mentioned in
45347+ * the Dallas Semiconductor data sheets, but who believes data
45348+ * sheets anyway ... -- Markus Kuhn
45349+ */
45350+ CMOS_WRITE(save_control, RTC_CONTROL);
45351+ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
45352+
45353+ return retval;
45354+}
45355+
45356+static inline unsigned long mach_get_cmos_time(void)
45357+{
45358+ unsigned int year, mon, day, hour, min, sec;
45359+
45360+ do {
45361+ sec = CMOS_READ(RTC_SECONDS);
45362+ min = CMOS_READ(RTC_MINUTES);
45363+ hour = CMOS_READ(RTC_HOURS);
45364+ day = CMOS_READ(RTC_DAY_OF_MONTH);
45365+ mon = CMOS_READ(RTC_MONTH);
45366+ year = CMOS_READ(RTC_YEAR);
45367+ } while (sec != CMOS_READ(RTC_SECONDS));
45368+
45369+ if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
45370+ BCD_TO_BIN(sec);
45371+ BCD_TO_BIN(min);
45372+ BCD_TO_BIN(hour);
45373+ BCD_TO_BIN(day);
45374+ BCD_TO_BIN(mon);
45375+ BCD_TO_BIN(year);
45376+ }
45377+
45378+ year += 1900;
45379+ if (year < 1970)
45380+ year += 100;
45381+
45382+ return mktime(year, mon, day, hour, min, sec);
45383+}
45384+
45385+#endif /* !_MACH_TIME_H */
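A sketch of the caller-side check that USEC_AFTER/USEC_BEFORE describe, approximating do_timer_interrupt() in the i386 timer code (xtime and TICK_SIZE are assumptions taken from that caller, not defined in this header): the RTC is only written when the wall clock is roughly 500 ms into the current second.

/* Sketch: gate the RTC update on the 500 ms window described above. */
if ((xtime.tv_nsec / 1000) >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
    (xtime.tv_nsec / 1000) <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
	mach_set_rtc_mmss(xtime.tv_sec);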
45386Index: head-2008-11-25/include/asm-x86/mach-xen/mach_timer.h
45387===================================================================
45388--- /dev/null 1970-01-01 00:00:00.000000000 +0000
45389+++ head-2008-11-25/include/asm-x86/mach-xen/mach_timer.h 2007-06-12 13:14:13.000000000 +0200
45390@@ -0,0 +1,50 @@
45391+/*
45392+ * include/asm-i386/mach-default/mach_timer.h
45393+ *
45394+ * Machine specific calibrate_tsc() for generic.
45395+ * Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
45396+ */
45397+/* ------ Calibrate the TSC -------
45398+ * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
45399+ * Too much 64-bit arithmetic here to do this cleanly in C, and for
45400+ * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
45401+ * output busy loop as low as possible. We avoid reading the CTC registers
45402+ * directly because of the awkward 8-bit access mechanism of the 82C54
45403+ * device.
45404+ */
45405+#ifndef _MACH_TIMER_H
45406+#define _MACH_TIMER_H
45407+
45408+#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */
45409+#define CALIBRATE_LATCH \
45410+ ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
45411+
45412+static inline void mach_prepare_counter(void)
45413+{
45414+ /* Set the Gate high, disable speaker */
45415+ outb((inb(0x61) & ~0x02) | 0x01, 0x61);
45416+
45417+ /*
45418+ * Now let's take care of CTC channel 2
45419+ *
45420+ * Set the Gate high, program CTC channel 2 for mode 0,
45421+ * (interrupt on terminal count mode), binary count,
45422+ * load 5 * LATCH count, (LSB and MSB) to begin countdown.
45423+ *
45424+ * Some devices need a delay here.
45425+ */
45426+ outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */
45427+ outb_p(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */
45428+ outb_p(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */
45429+}
45430+
45431+static inline void mach_countup(unsigned long *count_p)
45432+{
45433+ unsigned long count = 0;
45434+ do {
45435+ count++;
45436+ } while ((inb_p(0x61) & 0x20) == 0);
45437+ *count_p = count;
45438+}
45439+
45440+#endif /* !_MACH_TIMER_H */
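A sketch of how these helpers are combined to calibrate the TSC, mirroring the i386 calibrate_tsc() flow (rdtscll() is assumed from <asm/msr.h>; scaling the result into the 2^32/(TSC clocks per usec) quotient is omitted):

/* Sketch: count TSC cycles across the CALIBRATE_TIME_MSEC interval. */
static unsigned long long example_tsc_delta(void)
{
	unsigned long long start, end;
	unsigned long count;

	mach_prepare_counter();		/* arm PIT channel 2 with CALIBRATE_LATCH */
	rdtscll(start);
	mach_countup(&count);		/* busy-wait until the channel expires */
	rdtscll(end);

	return end - start;		/* cycles elapsed in CALIBRATE_TIME_MSEC ms */
}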
45441Index: head-2008-11-25/include/asm-x86/mach-xen/setup_arch_post.h
45442===================================================================
45443--- /dev/null 1970-01-01 00:00:00.000000000 +0000
45444+++ head-2008-11-25/include/asm-x86/mach-xen/setup_arch_post.h 2007-06-12 13:14:13.000000000 +0200
45445@@ -0,0 +1,63 @@
45446+/**
45447+ * machine_specific_* - Hooks for machine specific setup.
45448+ *
45449+ * Description:
45450+ * This is included late in kernel/setup.c so that it can make
45451+ * use of all of the static functions.
45452+ **/
45453+
45454+#include <xen/interface/callback.h>
45455+
45456+extern void hypervisor_callback(void);
45457+extern void failsafe_callback(void);
45458+extern void nmi(void);
45459+
45460+static void __init machine_specific_arch_setup(void)
45461+{
45462+ int ret;
45463+ static struct callback_register __initdata event = {
45464+ .type = CALLBACKTYPE_event,
45465+ .address = (unsigned long) hypervisor_callback,
45466+ };
45467+ static struct callback_register __initdata failsafe = {
45468+ .type = CALLBACKTYPE_failsafe,
45469+ .address = (unsigned long)failsafe_callback,
45470+ };
45471+ static struct callback_register __initdata syscall = {
45472+ .type = CALLBACKTYPE_syscall,
45473+ .address = (unsigned long)system_call,
45474+ };
45475+#ifdef CONFIG_X86_LOCAL_APIC
45476+ static struct callback_register __initdata nmi_cb = {
45477+ .type = CALLBACKTYPE_nmi,
45478+ .address = (unsigned long)nmi,
45479+ };
45480+#endif
45481+
45482+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
45483+ if (ret == 0)
45484+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
45485+ if (ret == 0)
45486+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
45487+#if CONFIG_XEN_COMPAT <= 0x030002
45488+ if (ret == -ENOSYS)
45489+ ret = HYPERVISOR_set_callbacks(
45490+ event.address,
45491+ failsafe.address,
45492+ syscall.address);
45493+#endif
45494+ BUG_ON(ret);
45495+
45496+#ifdef CONFIG_X86_LOCAL_APIC
45497+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
45498+#if CONFIG_XEN_COMPAT <= 0x030002
45499+ if (ret == -ENOSYS) {
45500+ static struct xennmi_callback __initdata cb = {
45501+ .handler_address = (unsigned long)nmi
45502+ };
45503+
45504+ HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
45505+ }
45506+#endif
45507+#endif
45508+}
45509Index: head-2008-11-25/include/asm-x86/mach-xen/setup_arch_pre.h
45510===================================================================
45511--- /dev/null 1970-01-01 00:00:00.000000000 +0000
45512+++ head-2008-11-25/include/asm-x86/mach-xen/setup_arch_pre.h 2007-06-12 13:14:13.000000000 +0200
45513@@ -0,0 +1,5 @@
45514+/* Hook to call BIOS initialisation function */
45515+
45516+#define ARCH_SETUP machine_specific_arch_setup();
45517+
45518+static void __init machine_specific_arch_setup(void);
45519Index: head-2008-11-25/include/xen/blkif.h
45520===================================================================
45521--- /dev/null 1970-01-01 00:00:00.000000000 +0000
45522+++ head-2008-11-25/include/xen/blkif.h 2008-07-21 11:00:33.000000000 +0200
45523@@ -0,0 +1,123 @@
45524+/*
45525+ * Permission is hereby granted, free of charge, to any person obtaining a copy
45526+ * of this software and associated documentation files (the "Software"), to
45527+ * deal in the Software without restriction, including without limitation the
45528+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
45529+ * sell copies of the Software, and to permit persons to whom the Software is
45530+ * furnished to do so, subject to the following conditions:
45531+ *
45532+ * The above copyright notice and this permission notice shall be included in
45533+ * all copies or substantial portions of the Software.
45534+ *
45535+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45536+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
45537+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45538+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
45539+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
45540+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
45541+ * DEALINGS IN THE SOFTWARE.
45542+ */
45543+
45544+#ifndef __XEN_BLKIF_H__
45545+#define __XEN_BLKIF_H__
45546+
45547+#include <xen/interface/io/ring.h>
45548+#include <xen/interface/io/blkif.h>
45549+#include <xen/interface/io/protocols.h>
45550+
45551+/* Not a real protocol. Used to generate ring structs which contain
45552+ * only the elements common to all protocols. This way we get a
45553+ * compiler-checkable way to use common struct elements, so we can
45554+ * avoid using switch(protocol) in a number of places. */
45555+struct blkif_common_request {
45556+ char dummy;
45557+};
45558+struct blkif_common_response {
45559+ char dummy;
45560+};
45561+
45562+/* i386 protocol version */
45563+#pragma pack(push, 4)
45564+struct blkif_x86_32_request {
45565+ uint8_t operation; /* BLKIF_OP_??? */
45566+ uint8_t nr_segments; /* number of segments */
45567+ blkif_vdev_t handle; /* only for read/write requests */
45568+ uint64_t id; /* private guest value, echoed in resp */
45569+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
45570+ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
45571+};
45572+struct blkif_x86_32_response {
45573+ uint64_t id; /* copied from request */
45574+ uint8_t operation; /* copied from request */
45575+ int16_t status; /* BLKIF_RSP_??? */
45576+};
45577+typedef struct blkif_x86_32_request blkif_x86_32_request_t;
45578+typedef struct blkif_x86_32_response blkif_x86_32_response_t;
45579+#pragma pack(pop)
45580+
45581+/* x86_64 protocol version */
45582+struct blkif_x86_64_request {
45583+ uint8_t operation; /* BLKIF_OP_??? */
45584+ uint8_t nr_segments; /* number of segments */
45585+ blkif_vdev_t handle; /* only for read/write requests */
45586+ uint64_t __attribute__((__aligned__(8))) id;
45587+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
45588+ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
45589+};
45590+struct blkif_x86_64_response {
45591+ uint64_t __attribute__((__aligned__(8))) id;
45592+ uint8_t operation; /* copied from request */
45593+ int16_t status; /* BLKIF_RSP_??? */
45594+};
45595+typedef struct blkif_x86_64_request blkif_x86_64_request_t;
45596+typedef struct blkif_x86_64_response blkif_x86_64_response_t;
45597+
45598+DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response);
45599+DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response);
45600+DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response);
45601+
45602+union blkif_back_rings {
45603+ blkif_back_ring_t native;
45604+ blkif_common_back_ring_t common;
45605+ blkif_x86_32_back_ring_t x86_32;
45606+ blkif_x86_64_back_ring_t x86_64;
45607+};
45608+typedef union blkif_back_rings blkif_back_rings_t;
45609+
45610+enum blkif_protocol {
45611+ BLKIF_PROTOCOL_NATIVE = 1,
45612+ BLKIF_PROTOCOL_X86_32 = 2,
45613+ BLKIF_PROTOCOL_X86_64 = 3,
45614+};
45615+
45616+static inline void blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src)
45617+{
45618+ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
45619+ dst->operation = src->operation;
45620+ dst->nr_segments = src->nr_segments;
45621+ dst->handle = src->handle;
45622+ dst->id = src->id;
45623+ dst->sector_number = src->sector_number;
45624+ barrier();
45625+ if (n > dst->nr_segments)
45626+ n = dst->nr_segments;
45627+ for (i = 0; i < n; i++)
45628+ dst->seg[i] = src->seg[i];
45629+}
45630+
45631+static inline void blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src)
45632+{
45633+ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
45634+ dst->operation = src->operation;
45635+ dst->nr_segments = src->nr_segments;
45636+ dst->handle = src->handle;
45637+ dst->id = src->id;
45638+ dst->sector_number = src->sector_number;
45639+ barrier();
45640+ if (n > dst->nr_segments)
45641+ n = dst->nr_segments;
45642+ for (i = 0; i < n; i++)
45643+ dst->seg[i] = src->seg[i];
45644+}
45645+
45646+#endif /* __XEN_BLKIF_H__ */
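A sketch of how a block backend typically uses these helpers: pick the ring flavour negotiated with the frontend and normalise each incoming request into the native layout before processing ('proto' and 'rings' stand in for the backend's per-interface state; RING_GET_REQUEST and RING_IDX come from xen/interface/io/ring.h).

/* Sketch: copy a request from whichever ABI the frontend speaks. */
static void example_copy_request(blkif_request_t *req,
				 union blkif_back_rings *rings,
				 enum blkif_protocol proto, RING_IDX rc)
{
	switch (proto) {
	case BLKIF_PROTOCOL_NATIVE:
		memcpy(req, RING_GET_REQUEST(&rings->native, rc), sizeof(*req));
		break;
	case BLKIF_PROTOCOL_X86_32:
		blkif_get_x86_32_req(req, RING_GET_REQUEST(&rings->x86_32, rc));
		break;
	case BLKIF_PROTOCOL_X86_64:
		blkif_get_x86_64_req(req, RING_GET_REQUEST(&rings->x86_64, rc));
		break;
	}
}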
45647Index: head-2008-11-25/include/xen/compat_ioctl.h
45648===================================================================
45649--- /dev/null 1970-01-01 00:00:00.000000000 +0000
45650+++ head-2008-11-25/include/xen/compat_ioctl.h 2007-07-10 09:42:30.000000000 +0200
45651@@ -0,0 +1,45 @@
45652+/*
45653+ * This program is free software; you can redistribute it and/or
45654+ * modify it under the terms of the GNU General Public License as
45655+ * published by the Free Software Foundation; either version 2 of the
45656+ * License, or (at your option) any later version.
45657+ *
45658+ * This program is distributed in the hope that it will be useful,
45659+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
45660+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
45661+ * GNU General Public License for more details.
45662+ *
45663+ * You should have received a copy of the GNU General Public License
45664+ * along with this program; if not, write to the Free Software
45665+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
45666+ *
45667+ * Copyright IBM Corp. 2007
45668+ *
45669+ * Authors: Jimi Xenidis <jimix@watson.ibm.com>
45670+ * Hollis Blanchard <hollisb@us.ibm.com>
45671+ */
45672+
45673+#ifndef __LINUX_XEN_COMPAT_H__
45674+#define __LINUX_XEN_COMPAT_H__
45675+
45676+#include <linux/compat.h>
45677+
45678+extern int privcmd_ioctl_32(int fd, unsigned int cmd, unsigned long arg);
45679+struct privcmd_mmap_32 {
45680+ int num;
45681+ domid_t dom;
45682+ compat_uptr_t entry;
45683+};
45684+
45685+struct privcmd_mmapbatch_32 {
45686+ int num; /* number of pages to populate */
45687+ domid_t dom; /* target domain */
45688+ __u64 addr; /* virtual address */
45689+ compat_uptr_t arr; /* array of mfns - top nibble set on err */
45690+};
45691+#define IOCTL_PRIVCMD_MMAP_32 \
45692+ _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap_32))
45693+#define IOCTL_PRIVCMD_MMAPBATCH_32 \
45694+ _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch_32))
45695+
45696+#endif /* __LINUX_XEN_COMPAT_H__ */
45697Index: head-2008-11-25/include/xen/cpu_hotplug.h
45698===================================================================
45699--- /dev/null 1970-01-01 00:00:00.000000000 +0000
45700+++ head-2008-11-25/include/xen/cpu_hotplug.h 2007-08-16 18:07:01.000000000 +0200
45701@@ -0,0 +1,41 @@
45702+#ifndef __XEN_CPU_HOTPLUG_H__
45703+#define __XEN_CPU_HOTPLUG_H__
45704+
45705+#include <linux/kernel.h>
45706+#include <linux/cpumask.h>
45707+
45708+#if defined(CONFIG_X86) && defined(CONFIG_SMP)
45709+extern cpumask_t cpu_initialized_map;
45710+#endif
45711+
45712+#if defined(CONFIG_HOTPLUG_CPU)
45713+
45714+int cpu_up_check(unsigned int cpu);
45715+void init_xenbus_allowed_cpumask(void);
45716+int smp_suspend(void);
45717+void smp_resume(void);
45718+
45719+void cpu_bringup(void);
45720+
45721+#else /* !defined(CONFIG_HOTPLUG_CPU) */
45722+
45723+#define cpu_up_check(cpu) (0)
45724+#define init_xenbus_allowed_cpumask() ((void)0)
45725+
45726+static inline int smp_suspend(void)
45727+{
45728+ if (num_online_cpus() > 1) {
45729+ printk(KERN_WARNING "Can't suspend SMP guests "
45730+ "without CONFIG_HOTPLUG_CPU\n");
45731+ return -EOPNOTSUPP;
45732+ }
45733+ return 0;
45734+}
45735+
45736+static inline void smp_resume(void)
45737+{
45738+}
45739+
45740+#endif /* !defined(CONFIG_HOTPLUG_CPU) */
45741+
45742+#endif /* __XEN_CPU_HOTPLUG_H__ */
45743Index: head-2008-11-25/include/xen/driver_util.h
45744===================================================================
45745--- /dev/null 1970-01-01 00:00:00.000000000 +0000
45746+++ head-2008-11-25/include/xen/driver_util.h 2007-06-12 13:14:19.000000000 +0200
45747@@ -0,0 +1,14 @@
45748+
45749+#ifndef __ASM_XEN_DRIVER_UTIL_H__
45750+#define __ASM_XEN_DRIVER_UTIL_H__
45751+
45752+#include <linux/vmalloc.h>
45753+#include <linux/device.h>
45754+
45755+/* Allocate/destroy a 'vmalloc' VM area. */
45756+extern struct vm_struct *alloc_vm_area(unsigned long size);
45757+extern void free_vm_area(struct vm_struct *area);
45758+
45759+extern struct class *get_xen_class(void);
45760+
45761+#endif /* __ASM_XEN_DRIVER_UTIL_H__ */
45762Index: head-2008-11-25/include/xen/evtchn.h
45763===================================================================
45764--- /dev/null 1970-01-01 00:00:00.000000000 +0000
45765+++ head-2008-11-25/include/xen/evtchn.h 2008-09-15 13:40:15.000000000 +0200
45766@@ -0,0 +1,160 @@
45767+/******************************************************************************
45768+ * evtchn.h
45769+ *
45770+ * Communication via Xen event channels.
45771+ * Also definitions for the device that demuxes notifications to userspace.
45772+ *
45773+ * Copyright (c) 2004-2005, K A Fraser
45774+ *
45775+ * This program is free software; you can redistribute it and/or
45776+ * modify it under the terms of the GNU General Public License version 2
45777+ * as published by the Free Software Foundation; or, when distributed
45778+ * separately from the Linux kernel or incorporated into other
45779+ * software packages, subject to the following license:
45780+ *
45781+ * Permission is hereby granted, free of charge, to any person obtaining a copy
45782+ * of this source file (the "Software"), to deal in the Software without
45783+ * restriction, including without limitation the rights to use, copy, modify,
45784+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
45785+ * and to permit persons to whom the Software is furnished to do so, subject to
45786+ * the following conditions:
45787+ *
45788+ * The above copyright notice and this permission notice shall be included in
45789+ * all copies or substantial portions of the Software.
45790+ *
45791+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45792+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
45793+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45794+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
45795+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
45796+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
45797+ * IN THE SOFTWARE.
45798+ */
45799+
45800+#ifndef __ASM_EVTCHN_H__
45801+#define __ASM_EVTCHN_H__
45802+
45803+#include <linux/interrupt.h>
45804+#include <asm/hypervisor.h>
45805+#include <asm/ptrace.h>
45806+#include <asm/synch_bitops.h>
45807+#include <xen/interface/event_channel.h>
45808+#include <linux/smp.h>
45809+
45810+/*
45811+ * LOW-LEVEL DEFINITIONS
45812+ */
45813+
45814+/*
45815+ * Dynamically bind an event source to an IRQ-like callback handler.
45816+ * On some platforms this may not be implemented via the Linux IRQ subsystem.
45817+ * The IRQ argument passed to the callback handler is the same as returned
45818+ * from the bind call. It may not correspond to a Linux IRQ number.
45819+ * Returns IRQ or negative errno.
45820+ */
45821+int bind_caller_port_to_irqhandler(
45822+ unsigned int caller_port,
45823+ irqreturn_t (*handler)(int, void *, struct pt_regs *),
45824+ unsigned long irqflags,
45825+ const char *devname,
45826+ void *dev_id);
45827+int bind_listening_port_to_irqhandler(
45828+ unsigned int remote_domain,
45829+ irqreturn_t (*handler)(int, void *, struct pt_regs *),
45830+ unsigned long irqflags,
45831+ const char *devname,
45832+ void *dev_id);
45833+int bind_interdomain_evtchn_to_irqhandler(
45834+ unsigned int remote_domain,
45835+ unsigned int remote_port,
45836+ irqreturn_t (*handler)(int, void *, struct pt_regs *),
45837+ unsigned long irqflags,
45838+ const char *devname,
45839+ void *dev_id);
45840+int bind_virq_to_irqhandler(
45841+ unsigned int virq,
45842+ unsigned int cpu,
45843+ irqreturn_t (*handler)(int, void *, struct pt_regs *),
45844+ unsigned long irqflags,
45845+ const char *devname,
45846+ void *dev_id);
45847+int bind_ipi_to_irqhandler(
45848+ unsigned int ipi,
45849+ unsigned int cpu,
45850+ irqreturn_t (*handler)(int, void *, struct pt_regs *),
45851+ unsigned long irqflags,
45852+ const char *devname,
45853+ void *dev_id);
45854+
45855+/*
45856+ * Common unbind function for all event sources. Takes IRQ to unbind from.
45857+ * Automatically closes the underlying event channel (except for bindings
45858+ * made with bind_caller_port_to_irqhandler()).
45859+ */
45860+void unbind_from_irqhandler(unsigned int irq, void *dev_id);
45861+
45862+void irq_resume(void);
45863+
45864+/* Entry point for notifications into Linux subsystems. */
45865+asmlinkage void evtchn_do_upcall(struct pt_regs *regs);
45866+
45867+/* Entry point for notifications into the userland character device. */
45868+void evtchn_device_upcall(int port);
45869+
45870+/* Mark a PIRQ as unavailable for dynamic allocation. */
45871+void evtchn_register_pirq(int irq);
45872+/* Map a Xen-supplied PIRQ to a dynamically allocated one. */
45873+int evtchn_map_pirq(int irq, int xen_pirq);
45874+/* Look up a Xen-supplied PIRQ for a dynamically allocated one. */
45875+int evtchn_get_xen_pirq(int irq);
45876+
45877+void mask_evtchn(int port);
45878+void disable_all_local_evtchn(void);
45879+void unmask_evtchn(int port);
45880+
45881+#ifdef CONFIG_SMP
45882+void rebind_evtchn_to_cpu(int port, unsigned int cpu);
45883+#else
45884+#define rebind_evtchn_to_cpu(port, cpu) ((void)0)
45885+#endif
45886+
45887+static inline int test_and_set_evtchn_mask(int port)
45888+{
45889+ shared_info_t *s = HYPERVISOR_shared_info;
45890+ return synch_test_and_set_bit(port, s->evtchn_mask);
45891+}
45892+
45893+static inline void clear_evtchn(int port)
45894+{
45895+ shared_info_t *s = HYPERVISOR_shared_info;
45896+ synch_clear_bit(port, s->evtchn_pending);
45897+}
45898+
45899+static inline void notify_remote_via_evtchn(int port)
45900+{
45901+ struct evtchn_send send = { .port = port };
45902+ VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send));
45903+}
45904+
45905+/*
45906+ * Use these to access the event channel underlying the IRQ handle returned
45907+ * by bind_*_to_irqhandler().
45908+ */
45909+void notify_remote_via_irq(int irq);
45910+int irq_to_evtchn_port(int irq);
45911+
45912+#define PIRQ_SET_MAPPING 0x0
45913+#define PIRQ_CLEAR_MAPPING 0x1
45914+#define PIRQ_GET_MAPPING 0x3
45915+int pirq_mapstatus(int pirq, int action);
45916+int set_pirq_hw_action(int pirq, int (*action)(int pirq, int action));
45917+int clear_pirq_hw_action(int pirq);
45918+
45919+#define PIRQ_STARTUP 1
45920+#define PIRQ_SHUTDOWN 2
45921+#define PIRQ_ENABLE 3
45922+#define PIRQ_DISABLE 4
45923+#define PIRQ_END 5
45924+#define PIRQ_ACK 6
45925+
45926+#endif /* __ASM_EVTCHN_H__ */
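A sketch of the usual driver-side pattern for the bind helpers declared above: bind an event source to a handler at init time and tear it down with unbind_from_irqhandler() (the VIRQ number and handler body are placeholders).

/* Sketch: handler signature matches the declarations above. */
static irqreturn_t example_handler(int irq, void *dev_id, struct pt_regs *regs)
{
	return IRQ_HANDLED;
}

static int example_bind(unsigned int virq, unsigned int cpu)
{
	int irq = bind_virq_to_irqhandler(virq, cpu, example_handler,
					  0 /* irqflags */, "example", NULL);

	if (irq < 0)
		return irq;
	/* ... event source in use ... */
	unbind_from_irqhandler(irq, NULL);
	return 0;
}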
45927Index: head-2008-11-25/include/xen/firmware.h
45928===================================================================
45929--- /dev/null 1970-01-01 00:00:00.000000000 +0000
45930+++ head-2008-11-25/include/xen/firmware.h 2007-07-02 08:16:19.000000000 +0200
45931@@ -0,0 +1,10 @@
45932+#ifndef __XEN_FIRMWARE_H__
45933+#define __XEN_FIRMWARE_H__
45934+
45935+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
45936+void copy_edd(void);
45937+#endif
45938+
45939+void copy_edid(void);
45940+
45941+#endif /* __XEN_FIRMWARE_H__ */
45942Index: head-2008-11-25/include/xen/gnttab.h
45943===================================================================
45944--- /dev/null 1970-01-01 00:00:00.000000000 +0000
45945+++ head-2008-11-25/include/xen/gnttab.h 2008-11-04 11:13:10.000000000 +0100
45946@@ -0,0 +1,164 @@
45947+/******************************************************************************
45948+ * gnttab.h
45949+ *
45950+ * Two sets of functionality:
45951+ * 1. Granting foreign access to our memory reservation.
45952+ * 2. Accessing others' memory reservations via grant references.
45953+ * (i.e., mechanisms for both sender and recipient of grant references)
45954+ *
45955+ * Copyright (c) 2004-2005, K A Fraser
45956+ * Copyright (c) 2005, Christopher Clark
45957+ *
45958+ * This program is free software; you can redistribute it and/or
45959+ * modify it under the terms of the GNU General Public License version 2
45960+ * as published by the Free Software Foundation; or, when distributed
45961+ * separately from the Linux kernel or incorporated into other
45962+ * software packages, subject to the following license:
45963+ *
45964+ * Permission is hereby granted, free of charge, to any person obtaining a copy
45965+ * of this source file (the "Software"), to deal in the Software without
45966+ * restriction, including without limitation the rights to use, copy, modify,
45967+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
45968+ * and to permit persons to whom the Software is furnished to do so, subject to
45969+ * the following conditions:
45970+ *
45971+ * The above copyright notice and this permission notice shall be included in
45972+ * all copies or substantial portions of the Software.
45973+ *
45974+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45975+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
45976+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45977+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
45978+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
45979+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
45980+ * IN THE SOFTWARE.
45981+ */
45982+
45983+#ifndef __ASM_GNTTAB_H__
45984+#define __ASM_GNTTAB_H__
45985+
45986+#include <asm/hypervisor.h>
45987+#include <asm/maddr.h> /* maddr_t */
45988+#include <linux/mm.h>
45989+#include <xen/interface/grant_table.h>
45990+#include <xen/features.h>
45991+
45992+struct gnttab_free_callback {
45993+ struct gnttab_free_callback *next;
45994+ void (*fn)(void *);
45995+ void *arg;
45996+ u16 count;
45997+ u8 queued;
45998+};
45999+
46000+int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
46001+ int flags);
46002+
46003+/*
46004+ * End access through the given grant reference, iff the grant entry is no
46005+ * longer in use. Return 1 if the grant entry was freed, 0 if it is still in
46006+ * use.
46007+ */
46008+int gnttab_end_foreign_access_ref(grant_ref_t ref);
46009+
46010+/*
46011+ * Eventually end access through the given grant reference, and once that
46012+ * access has been ended, free the given page too. Access will be ended
46013+ * immediately iff the grant entry is not in use, otherwise it will happen
46014+ * some time later. page may be 0, in which case no freeing will occur.
46015+ */
46016+void gnttab_end_foreign_access(grant_ref_t ref, unsigned long page);
46017+
46018+int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
46019+
46020+unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
46021+unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
46022+
46023+int gnttab_query_foreign_access(grant_ref_t ref);
46024+
46025+/*
46026+ * operations on reserved batches of grant references
46027+ */
46028+int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
46029+
46030+void gnttab_free_grant_reference(grant_ref_t ref);
46031+
46032+void gnttab_free_grant_references(grant_ref_t head);
46033+
46034+int gnttab_empty_grant_references(const grant_ref_t *pprivate_head);
46035+
46036+int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
46037+
46038+void gnttab_release_grant_reference(grant_ref_t *private_head,
46039+ grant_ref_t release);
46040+
46041+void gnttab_request_free_callback(struct gnttab_free_callback *callback,
46042+ void (*fn)(void *), void *arg, u16 count);
46043+void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
46044+
46045+void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
46046+ unsigned long frame, int flags);
46047+
46048+void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
46049+ unsigned long pfn);
46050+
46051+int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep);
46052+void __gnttab_dma_map_page(struct page *page);
46053+static inline void __gnttab_dma_unmap_page(struct page *page)
46054+{
46055+}
46056+
46057+void gnttab_reset_grant_page(struct page *page);
46058+
46059+int gnttab_suspend(void);
46060+int gnttab_resume(void);
46061+
46062+void *arch_gnttab_alloc_shared(unsigned long *frames);
46063+
46064+static inline void
46065+gnttab_set_map_op(struct gnttab_map_grant_ref *map, maddr_t addr,
46066+ uint32_t flags, grant_ref_t ref, domid_t domid)
46067+{
46068+ if (flags & GNTMAP_contains_pte)
46069+ map->host_addr = addr;
46070+ else if (xen_feature(XENFEAT_auto_translated_physmap))
46071+ map->host_addr = __pa(addr);
46072+ else
46073+ map->host_addr = addr;
46074+
46075+ map->flags = flags;
46076+ map->ref = ref;
46077+ map->dom = domid;
46078+}
46079+
46080+static inline void
46081+gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, maddr_t addr,
46082+ uint32_t flags, grant_handle_t handle)
46083+{
46084+ if (flags & GNTMAP_contains_pte)
46085+ unmap->host_addr = addr;
46086+ else if (xen_feature(XENFEAT_auto_translated_physmap))
46087+ unmap->host_addr = __pa(addr);
46088+ else
46089+ unmap->host_addr = addr;
46090+
46091+ unmap->handle = handle;
46092+ unmap->dev_bus_addr = 0;
46093+}
46094+
46095+static inline void
46096+gnttab_set_replace_op(struct gnttab_unmap_and_replace *unmap, maddr_t addr,
46097+ maddr_t new_addr, grant_handle_t handle)
46098+{
46099+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
46100+ unmap->host_addr = __pa(addr);
46101+ unmap->new_addr = __pa(new_addr);
46102+ } else {
46103+ unmap->host_addr = addr;
46104+ unmap->new_addr = new_addr;
46105+ }
46106+
46107+ unmap->handle = handle;
46108+}
46109+
46110+#endif /* __ASM_GNTTAB_H__ */
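A sketch of the sender-side grant lifecycle built from the calls above: grant one of our frames to a peer domain, hand the returned reference to the peer out of band (typically via xenstore), and revoke access when finished.

/* Sketch: grant writable access to 'frame' and later end it; a negative
 * return from gnttab_grant_foreign_access() indicates failure. */
static int example_grant(domid_t peer, unsigned long frame)
{
	int ref = gnttab_grant_foreign_access(peer, frame, 0 /* writable */);

	if (ref < 0)
		return ref;
	/* ... peer maps the frame using 'ref' ... */
	gnttab_end_foreign_access(ref, 0 /* no page to free */);
	return 0;
}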
46111Index: head-2008-11-25/include/xen/hvm.h
46112===================================================================
46113--- /dev/null 1970-01-01 00:00:00.000000000 +0000
46114+++ head-2008-11-25/include/xen/hvm.h 2007-06-12 13:14:19.000000000 +0200
46115@@ -0,0 +1,23 @@
46116+/* Simple wrappers around HVM functions */
46117+#ifndef XEN_HVM_H__
46118+#define XEN_HVM_H__
46119+
46120+#include <xen/interface/hvm/params.h>
46121+
46122+static inline unsigned long hvm_get_parameter(int idx)
46123+{
46124+ struct xen_hvm_param xhv;
46125+ int r;
46126+
46127+ xhv.domid = DOMID_SELF;
46128+ xhv.index = idx;
46129+ r = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv);
46130+ if (r < 0) {
46131+ printk(KERN_ERR "cannot get hvm parameter %d: %d.\n",
46132+ idx, r);
46133+ return 0;
46134+ }
46135+ return xhv.value;
46136+}
46137+
46138+#endif /* XEN_HVM_H__ */
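For example, an HVM guest might read back one of the parameters the toolstack configured; HVM_PARAM_CALLBACK_IRQ is assumed to be among the indices defined in xen/interface/hvm/params.h.

/* Sketch: returns 0 on failure, per the wrapper above. */
unsigned long callback = hvm_get_parameter(HVM_PARAM_CALLBACK_IRQ);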
46139Index: head-2008-11-25/include/xen/hypercall.h
46140===================================================================
46141--- /dev/null 1970-01-01 00:00:00.000000000 +0000
46142+++ head-2008-11-25/include/xen/hypercall.h 2008-01-28 12:24:19.000000000 +0100
46143@@ -0,0 +1,30 @@
46144+#ifndef __XEN_HYPERCALL_H__
46145+#define __XEN_HYPERCALL_H__
46146+
46147+#include <asm/hypercall.h>
46148+
46149+static inline int __must_check
46150+HYPERVISOR_multicall_check(
46151+ multicall_entry_t *call_list, unsigned int nr_calls,
46152+ const unsigned long *rc_list)
46153+{
46154+ int rc = HYPERVISOR_multicall(call_list, nr_calls);
46155+
46156+ if (unlikely(rc < 0))
46157+ return rc;
46158+ BUG_ON(rc);
46159+ BUG_ON((int)nr_calls < 0);
46160+
46161+ for ( ; nr_calls > 0; --nr_calls, ++call_list)
46162+ if (unlikely(call_list->result != (rc_list ? *rc_list++ : 0)))
46163+ return nr_calls;
46164+
46165+ return 0;
46166+}
46167+
46168+/* A construct to ignore the return value of hypercall wrappers in a few
46169+ * exceptional cases (simply casting the function result to void doesn't
46170+ * avoid the compiler warning): */
46171+#define VOID(expr) ((void)((expr)?:0))
46172+
46173+#endif /* __XEN_HYPERCALL_H__ */
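A sketch of the intended use of HYPERVISOR_multicall_check(): submit a small batch and treat a non-zero return as a failed entry (the value counts entries from the failing one to the end of the list; passing NULL for rc_list means every entry is expected to return zero).

/* Sketch: 'calls' is assumed to have been filled in with two entries. */
multicall_entry_t calls[2];
/* ... populate calls[0] and calls[1] ... */
if (HYPERVISOR_multicall_check(calls, 2, NULL))
	BUG();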
46174Index: head-2008-11-25/include/xen/hypervisor_sysfs.h
46175===================================================================
46176--- /dev/null 1970-01-01 00:00:00.000000000 +0000
46177+++ head-2008-11-25/include/xen/hypervisor_sysfs.h 2007-06-22 09:08:06.000000000 +0200
46178@@ -0,0 +1,30 @@
46179+/*
46180+ * copyright (c) 2006 IBM Corporation
46181+ * Authored by: Mike D. Day <ncmike@us.ibm.com>
46182+ *
46183+ * This program is free software; you can redistribute it and/or modify
46184+ * it under the terms of the GNU General Public License version 2 as
46185+ * published by the Free Software Foundation.
46186+ */
46187+
46188+#ifndef _HYP_SYSFS_H_
46189+#define _HYP_SYSFS_H_
46190+
46191+#include <linux/kobject.h>
46192+#include <linux/sysfs.h>
46193+
46194+#define HYPERVISOR_ATTR_RO(_name) \
46195+static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name)
46196+
46197+#define HYPERVISOR_ATTR_RW(_name) \
46198+static struct hyp_sysfs_attr _name##_attr = \
46199+ __ATTR(_name, 0644, _name##_show, _name##_store)
46200+
46201+struct hyp_sysfs_attr {
46202+ struct attribute attr;
46203+ ssize_t (*show)(struct hyp_sysfs_attr *, char *);
46204+ ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
46205+ void *hyp_attr_data;
46206+};
46207+
46208+#endif /* _HYP_SYSFS_H_ */
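A sketch of how the attribute helpers above are used: define a show routine named <attr>_show, declare the attribute with HYPERVISOR_ATTR_RO(), and register the resulting <attr>_attr.attr with sysfs (the registration call itself lives outside this header).

/* Sketch: a read-only "example" attribute. */
static ssize_t example_show(struct hyp_sysfs_attr *attr, char *buffer)
{
	return sprintf(buffer, "42\n");
}
HYPERVISOR_ATTR_RO(example);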
46209Index: head-2008-11-25/include/xen/pcifront.h
46210===================================================================
46211--- /dev/null 1970-01-01 00:00:00.000000000 +0000
46212+++ head-2008-11-25/include/xen/pcifront.h 2007-06-18 08:38:13.000000000 +0200
46213@@ -0,0 +1,83 @@
46214+/*
46215+ * PCI Frontend - arch-dependent declarations
46216+ *
46217+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
46218+ */
46219+#ifndef __XEN_ASM_PCIFRONT_H__
46220+#define __XEN_ASM_PCIFRONT_H__
46221+
46222+#include <linux/spinlock.h>
46223+
46224+#ifdef __KERNEL__
46225+
46226+#ifndef __ia64__
46227+
46228+struct pcifront_device;
46229+struct pci_bus;
46230+
46231+struct pcifront_sd {
46232+ int domain;
46233+ struct pcifront_device *pdev;
46234+};
46235+
46236+static inline struct pcifront_device *
46237+pcifront_get_pdev(struct pcifront_sd *sd)
46238+{
46239+ return sd->pdev;
46240+}
46241+
46242+static inline void pcifront_init_sd(struct pcifront_sd *sd,
46243+ unsigned int domain, unsigned int bus,
46244+ struct pcifront_device *pdev)
46245+{
46246+ sd->domain = domain;
46247+ sd->pdev = pdev;
46248+}
46249+
46250+#if defined(CONFIG_PCI_DOMAINS)
46251+static inline int pci_domain_nr(struct pci_bus *bus)
46252+{
46253+ struct pcifront_sd *sd = bus->sysdata;
46254+ return sd->domain;
46255+}
46256+static inline int pci_proc_domain(struct pci_bus *bus)
46257+{
46258+ return pci_domain_nr(bus);
46259+}
46260+#endif /* CONFIG_PCI_DOMAINS */
46261+
46262+static inline void pcifront_setup_root_resources(struct pci_bus *bus,
46263+ struct pcifront_sd *sd)
46264+{
46265+}
46266+
46267+#else /* __ia64__ */
46268+
46269+#include <linux/acpi.h>
46270+#include <asm/pci.h>
46271+#define pcifront_sd pci_controller
46272+
46273+extern void xen_add_resource(struct pci_controller *, unsigned int,
46274+ unsigned int, struct acpi_resource *);
46275+extern void xen_pcibios_setup_root_windows(struct pci_bus *,
46276+ struct pci_controller *);
46277+
46278+static inline struct pcifront_device *
46279+pcifront_get_pdev(struct pcifront_sd *sd)
46280+{
46281+ return (struct pcifront_device *)sd->platform_data;
46282+}
46283+
46284+static inline void pcifront_setup_root_resources(struct pci_bus *bus,
46285+ struct pcifront_sd *sd)
46286+{
46287+ xen_pcibios_setup_root_windows(bus, sd);
46288+}
46289+
46290+#endif /* __ia64__ */
46291+
46292+extern struct rw_semaphore pci_bus_sem;
46293+
46294+#endif /* __KERNEL__ */
46295+
46296+#endif /* __XEN_ASM_PCIFRONT_H__ */
46297Index: head-2008-11-25/include/xen/public/evtchn.h
46298===================================================================
46299--- /dev/null 1970-01-01 00:00:00.000000000 +0000
46300+++ head-2008-11-25/include/xen/public/evtchn.h 2007-06-12 13:14:19.000000000 +0200
46301@@ -0,0 +1,88 @@
46302+/******************************************************************************
46303+ * evtchn.h
46304+ *
46305+ * Interface to /dev/xen/evtchn.
46306+ *
46307+ * Copyright (c) 2003-2005, K A Fraser
46308+ *
46309+ * This program is free software; you can redistribute it and/or
46310+ * modify it under the terms of the GNU General Public License version 2
46311+ * as published by the Free Software Foundation; or, when distributed
46312+ * separately from the Linux kernel or incorporated into other
46313+ * software packages, subject to the following license:
46314+ *
46315+ * Permission is hereby granted, free of charge, to any person obtaining a copy
46316+ * of this source file (the "Software"), to deal in the Software without
46317+ * restriction, including without limitation the rights to use, copy, modify,
46318+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
46319+ * and to permit persons to whom the Software is furnished to do so, subject to
46320+ * the following conditions:
46321+ *
46322+ * The above copyright notice and this permission notice shall be included in
46323+ * all copies or substantial portions of the Software.
46324+ *
46325+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46326+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46327+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
46328+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46329+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
46330+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
46331+ * IN THE SOFTWARE.
46332+ */
46333+
46334+#ifndef __LINUX_PUBLIC_EVTCHN_H__
46335+#define __LINUX_PUBLIC_EVTCHN_H__
46336+
46337+/*
46338+ * Bind a fresh port to VIRQ @virq.
46339+ * Return allocated port.
46340+ */
46341+#define IOCTL_EVTCHN_BIND_VIRQ \
46342+ _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
46343+struct ioctl_evtchn_bind_virq {
46344+ unsigned int virq;
46345+};
46346+
46347+/*
46348+ * Bind a fresh port to remote <@remote_domain, @remote_port>.
46349+ * Return allocated port.
46350+ */
46351+#define IOCTL_EVTCHN_BIND_INTERDOMAIN \
46352+ _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
46353+struct ioctl_evtchn_bind_interdomain {
46354+ unsigned int remote_domain, remote_port;
46355+};
46356+
46357+/*
46358+ * Allocate a fresh port for binding to @remote_domain.
46359+ * Return allocated port.
46360+ */
46361+#define IOCTL_EVTCHN_BIND_UNBOUND_PORT \
46362+ _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
46363+struct ioctl_evtchn_bind_unbound_port {
46364+ unsigned int remote_domain;
46365+};
46366+
46367+/*
46368+ * Unbind previously allocated @port.
46369+ */
46370+#define IOCTL_EVTCHN_UNBIND \
46371+ _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
46372+struct ioctl_evtchn_unbind {
46373+ unsigned int port;
46374+};
46375+
46376+/*
46377+ * Send an event notification to previously allocated @port.
46378+ */
46379+#define IOCTL_EVTCHN_NOTIFY \
46380+ _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
46381+struct ioctl_evtchn_notify {
46382+ unsigned int port;
46383+};
46384+
46385+/* Clear and reinitialise the event buffer. Clear error condition. */
46386+#define IOCTL_EVTCHN_RESET \
46387+ _IOC(_IOC_NONE, 'E', 5, 0)
46388+
46389+#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
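A user-space sketch of the bind ioctls above (error handling omitted; that read() returns pending port numbers as 32-bit values is an assumption about the demux device, not something this header states).

/* Sketch (user space): bind to a VIRQ and wait for one event. */
int fd = open("/dev/xen/evtchn", O_RDWR);
struct ioctl_evtchn_bind_virq bind = { .virq = virq };
int port = ioctl(fd, IOCTL_EVTCHN_BIND_VIRQ, &bind);	/* allocated port */
uint32_t pending;

if (port >= 0)
	read(fd, &pending, sizeof(pending));	/* blocks until the port fires */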
46390Index: head-2008-11-25/include/xen/public/gntdev.h
46391===================================================================
46392--- /dev/null 1970-01-01 00:00:00.000000000 +0000
46393+++ head-2008-11-25/include/xen/public/gntdev.h 2008-04-02 12:34:02.000000000 +0200
46394@@ -0,0 +1,119 @@
46395+/******************************************************************************
46396+ * gntdev.h
46397+ *
46398+ * Interface to /dev/xen/gntdev.
46399+ *
46400+ * Copyright (c) 2007, D G Murray
46401+ *
46402+ * This program is free software; you can redistribute it and/or
46403+ * modify it under the terms of the GNU General Public License version 2
46404+ * as published by the Free Software Foundation; or, when distributed
46405+ * separately from the Linux kernel or incorporated into other
46406+ * software packages, subject to the following license:
46407+ *
46408+ * Permission is hereby granted, free of charge, to any person obtaining a copy
46409+ * of this source file (the "Software"), to deal in the Software without
46410+ * restriction, including without limitation the rights to use, copy, modify,
46411+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
46412+ * and to permit persons to whom the Software is furnished to do so, subject to
46413+ * the following conditions:
46414+ *
46415+ * The above copyright notice and this permission notice shall be included in
46416+ * all copies or substantial portions of the Software.
46417+ *
46418+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46419+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46420+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
46421+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46422+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
46423+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
46424+ * IN THE SOFTWARE.
46425+ */
46426+
46427+#ifndef __LINUX_PUBLIC_GNTDEV_H__
46428+#define __LINUX_PUBLIC_GNTDEV_H__
46429+
46430+struct ioctl_gntdev_grant_ref {
46431+ /* The domain ID of the grant to be mapped. */
46432+ uint32_t domid;
46433+ /* The grant reference of the grant to be mapped. */
46434+ uint32_t ref;
46435+};
46436+
46437+/*
46438+ * Inserts the grant references into the mapping table of an instance
46439+ * of gntdev. N.B. This does not perform the mapping, which is deferred
46440+ * until mmap() is called with @index as the offset.
46441+ */
46442+#define IOCTL_GNTDEV_MAP_GRANT_REF \
46443+_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
46444+struct ioctl_gntdev_map_grant_ref {
46445+ /* IN parameters */
46446+ /* The number of grants to be mapped. */
46447+ uint32_t count;
46448+ uint32_t pad;
46449+ /* OUT parameters */
46450+ /* The offset to be used on a subsequent call to mmap(). */
46451+ uint64_t index;
46452+ /* Variable IN parameter. */
46453+ /* Array of grant references, of size @count. */
46454+ struct ioctl_gntdev_grant_ref refs[1];
46455+};
46456+
46457+/*
46458+ * Removes the grant references from the mapping table of an instance of
46459+ * gntdev. N.B. munmap() must be called on the relevant virtual address(es)
46460+ * before this ioctl is called, or an error will result.
46461+ */
46462+#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
46463+_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
46464+struct ioctl_gntdev_unmap_grant_ref {
46465+ /* IN parameters */
46466+	/* The offset returned by the corresponding map operation. */
46467+ uint64_t index;
46468+ /* The number of pages to be unmapped. */
46469+ uint32_t count;
46470+ uint32_t pad;
46471+};
46472+
46473+/*
46474+ * Returns the offset in the driver's address space that corresponds
46475+ * to @vaddr. This can be used to perform a munmap(), followed by an
46476+ * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
46477+ * the caller. The number of pages that were allocated at the same time as
46478+ * @vaddr is returned in @count.
46479+ *
46480+ * N.B. Where more than one page has been mapped into a contiguous range, the
46481+ * supplied @vaddr must correspond to the start of the range; otherwise
46482+ * an error will result. It is only possible to munmap() the entire
46483+ * contiguously-allocated range at once, and not any subrange thereof.
46484+ */
46485+#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
46486+_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
46487+struct ioctl_gntdev_get_offset_for_vaddr {
46488+ /* IN parameters */
46489+ /* The virtual address of the first mapped page in a range. */
46490+ uint64_t vaddr;
46491+ /* OUT parameters */
46492+ /* The offset that was used in the initial mmap() operation. */
46493+ uint64_t offset;
46494+ /* The number of pages mapped in the VM area that begins at @vaddr. */
46495+ uint32_t count;
46496+ uint32_t pad;
46497+};
46498+
46499+/*
46500+ * Sets the maximum number of grants that may be mapped at once by this gntdev
46501+ * instance.
46502+ *
46503+ * N.B. This must be called before any other ioctl is performed on the device.
46504+ */
46505+#define IOCTL_GNTDEV_SET_MAX_GRANTS \
46506+_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
46507+struct ioctl_gntdev_set_max_grants {
46508+ /* IN parameter */
46509+ /* The maximum number of grants that may be mapped at once. */
46510+ uint32_t count;
46511+};
46512+
46513+#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
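A user-space sketch of the map/mmap handshake these ioctls describe: insert one grant reference, then mmap() the device at the returned index to reach the foreign page ('fd' is an open /dev/xen/gntdev descriptor; 'remote_domid' and 'gref' are placeholders).

/* Sketch (user space): map a single foreign grant. */
struct ioctl_gntdev_map_grant_ref map = {
	.count   = 1,
	.refs[0] = { .domid = remote_domid, .ref = gref },
};

if (ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map) == 0) {
	void *addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, map.index);
	/* ... use addr; munmap() before IOCTL_GNTDEV_UNMAP_GRANT_REF ... */
}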
46514Index: head-2008-11-25/include/xen/public/privcmd.h
46515===================================================================
46516--- /dev/null 1970-01-01 00:00:00.000000000 +0000
46517+++ head-2008-11-25/include/xen/public/privcmd.h 2007-06-12 13:14:19.000000000 +0200
46518@@ -0,0 +1,79 @@
46519+/******************************************************************************
46520+ * privcmd.h
46521+ *
46522+ * Interface to /proc/xen/privcmd.
46523+ *
46524+ * Copyright (c) 2003-2005, K A Fraser
46525+ *
46526+ * This program is free software; you can redistribute it and/or
46527+ * modify it under the terms of the GNU General Public License version 2
46528+ * as published by the Free Software Foundation; or, when distributed
46529+ * separately from the Linux kernel or incorporated into other
46530+ * software packages, subject to the following license:
46531+ *
46532+ * Permission is hereby granted, free of charge, to any person obtaining a copy
46533+ * of this source file (the "Software"), to deal in the Software without
46534+ * restriction, including without limitation the rights to use, copy, modify,
46535+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
46536+ * and to permit persons to whom the Software is furnished to do so, subject to
46537+ * the following conditions:
46538+ *
46539+ * The above copyright notice and this permission notice shall be included in
46540+ * all copies or substantial portions of the Software.
46541+ *
46542+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46543+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46544+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
46545+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46546+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
46547+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
46548+ * IN THE SOFTWARE.
46549+ */
46550+
46551+#ifndef __LINUX_PUBLIC_PRIVCMD_H__
46552+#define __LINUX_PUBLIC_PRIVCMD_H__
46553+
46554+#include <linux/types.h>
46555+
46556+#ifndef __user
46557+#define __user
46558+#endif
46559+
46560+typedef struct privcmd_hypercall
46561+{
46562+ __u64 op;
46563+ __u64 arg[5];
46564+} privcmd_hypercall_t;
46565+
46566+typedef struct privcmd_mmap_entry {
46567+ __u64 va;
46568+ __u64 mfn;
46569+ __u64 npages;
46570+} privcmd_mmap_entry_t;
46571+
46572+typedef struct privcmd_mmap {
46573+ int num;
46574+ domid_t dom; /* target domain */
46575+ privcmd_mmap_entry_t __user *entry;
46576+} privcmd_mmap_t;
46577+
46578+typedef struct privcmd_mmapbatch {
46579+ int num; /* number of pages to populate */
46580+ domid_t dom; /* target domain */
46581+ __u64 addr; /* virtual address */
46582+ xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
46583+} privcmd_mmapbatch_t;
46584+
46585+/*
46586+ * @cmd: IOCTL_PRIVCMD_HYPERCALL
46587+ * @arg: &privcmd_hypercall_t
46588+ * Return: Value returned from execution of the specified hypercall.
46589+ */
46590+#define IOCTL_PRIVCMD_HYPERCALL \
46591+ _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t))
46592+#define IOCTL_PRIVCMD_MMAP \
46593+ _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t))
46594+#define IOCTL_PRIVCMD_MMAPBATCH \
46595+ _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t))
46596+
46597+#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
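A user-space sketch of IOCTL_PRIVCMD_HYPERCALL as a privileged toolstack would issue it ('fd' is an open /proc/xen/privcmd descriptor; __HYPERVISOR_xen_version and XENVER_version are assumed from the public Xen interface headers).

/* Sketch (user space, privileged): ask Xen for its version number. */
privcmd_hypercall_t call = {
	.op  = __HYPERVISOR_xen_version,
	.arg = { XENVER_version, 0 },
};
int version = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);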
46598Index: head-2008-11-25/include/xen/xen_proc.h
46599===================================================================
46600--- /dev/null 1970-01-01 00:00:00.000000000 +0000
46601+++ head-2008-11-25/include/xen/xen_proc.h 2007-06-12 13:14:19.000000000 +0200
46602@@ -0,0 +1,12 @@
46603+
46604+#ifndef __ASM_XEN_PROC_H__
46605+#define __ASM_XEN_PROC_H__
46606+
46607+#include <linux/proc_fs.h>
46608+
46609+extern struct proc_dir_entry *create_xen_proc_entry(
46610+ const char *name, mode_t mode);
46611+extern void remove_xen_proc_entry(
46612+ const char *name);
46613+
46614+#endif /* __ASM_XEN_PROC_H__ */
46615Index: head-2008-11-25/include/xen/xencons.h
46616===================================================================
46617--- /dev/null 1970-01-01 00:00:00.000000000 +0000
46618+++ head-2008-11-25/include/xen/xencons.h 2007-10-15 09:39:38.000000000 +0200
46619@@ -0,0 +1,17 @@
46620+#ifndef __ASM_XENCONS_H__
46621+#define __ASM_XENCONS_H__
46622+
46623+struct dom0_vga_console_info;
46624+void dom0_init_screen_info(const struct dom0_vga_console_info *, size_t);
46625+
46626+void xencons_force_flush(void);
46627+void xencons_resume(void);
46628+
46629+/* Interrupt work hooks. Receive data, or kick data out. */
46630+void xencons_rx(char *buf, unsigned len, struct pt_regs *regs);
46631+void xencons_tx(void);
46632+
46633+int xencons_ring_init(void);
46634+int xencons_ring_send(const char *data, unsigned len);
46635+
46636+#endif /* __ASM_XENCONS_H__ */
46637Index: head-2008-11-25/include/xen/xenoprof.h
46638===================================================================
46639--- /dev/null 1970-01-01 00:00:00.000000000 +0000
46640+++ head-2008-11-25/include/xen/xenoprof.h 2007-06-12 13:14:19.000000000 +0200
46641@@ -0,0 +1,42 @@
46642+/******************************************************************************
46643+ * xen/xenoprof.h
46644+ *
46645+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
46646+ * VA Linux Systems Japan K.K.
46647+ *
46648+ * This program is free software; you can redistribute it and/or modify
46649+ * it under the terms of the GNU General Public License as published by
46650+ * the Free Software Foundation; either version 2 of the License, or
46651+ * (at your option) any later version.
46652+ *
46653+ * This program is distributed in the hope that it will be useful,
46654+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
46655+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
46656+ * GNU General Public License for more details.
46657+ *
46658+ * You should have received a copy of the GNU General Public License
46659+ * along with this program; if not, write to the Free Software
46660+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
46661+ *
46662+ */
46663+
46664+#ifndef __XEN_XENOPROF_H__
46665+#define __XEN_XENOPROF_H__
46666+#ifdef CONFIG_XEN
46667+
46668+#include <asm/xenoprof.h>
46669+
46670+struct oprofile_operations;
46671+int xenoprofile_init(struct oprofile_operations * ops);
46672+void xenoprofile_exit(void);
46673+
46674+struct xenoprof_shared_buffer {
46675+ char *buffer;
46676+ struct xenoprof_arch_shared_buffer arch;
46677+};
46678+#else
46679+#define xenoprofile_init(ops) (-ENOSYS)
46680+#define xenoprofile_exit() do { } while (0)
46681+
46682+#endif /* CONFIG_XEN */
46683+#endif /* __XEN_XENOPROF_H__ */
46684Index: head-2008-11-25/lib/swiotlb-xen.c
46685===================================================================
46686--- /dev/null 1970-01-01 00:00:00.000000000 +0000
46687+++ head-2008-11-25/lib/swiotlb-xen.c 2008-09-15 13:40:15.000000000 +0200
46688@@ -0,0 +1,739 @@
46689+/*
46690+ * Dynamic DMA mapping support.
46691+ *
46692+ * This implementation is a fallback for platforms that do not support
46693+ * I/O TLBs (aka DMA address translation hardware).
46694+ * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
46695+ * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
46696+ * Copyright (C) 2000, 2003 Hewlett-Packard Co
46697+ * David Mosberger-Tang <davidm@hpl.hp.com>
46698+ * Copyright (C) 2005 Keir Fraser <keir@xensource.com>
46699+ */
46700+
46701+#include <linux/cache.h>
46702+#include <linux/mm.h>
46703+#include <linux/module.h>
46704+#include <linux/pci.h>
46705+#include <linux/spinlock.h>
46706+#include <linux/string.h>
46707+#include <linux/types.h>
46708+#include <linux/ctype.h>
46709+#include <linux/init.h>
46710+#include <linux/bootmem.h>
46711+#include <linux/highmem.h>
46712+#include <asm/io.h>
46713+#include <asm/pci.h>
46714+#include <asm/dma.h>
46715+#include <asm/uaccess.h>
46716+#include <xen/gnttab.h>
46717+#include <xen/interface/memory.h>
46718+#include <asm-i386/mach-xen/asm/gnttab_dma.h>
46719+
46720+int swiotlb;
46721+EXPORT_SYMBOL(swiotlb);
46722+
46723+#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
46724+
46725+/*
46726+ * Maximum allowable number of contiguous slabs to map,
46727+ * must be a power of 2. What is the appropriate value?
46728+ * The complexity of {map,unmap}_single is linearly dependent on this value.
46729+ */
46730+#define IO_TLB_SEGSIZE 128
46731+
46732+/*
46733+ * Log of the size of each IO TLB slab. The number of slabs is command-line
46734+ * controllable.
46735+ */
46736+#define IO_TLB_SHIFT 11
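For concreteness, the sizing these constants imply (a worked example assuming the 4 KiB x86 page size; the small program below is illustration only, not part of the patch): one slab is 1 << IO_TLB_SHIFT = 2 KiB, one segment is IO_TLB_SEGSIZE slabs = 256 KiB, and the 64 MB default aperture chosen by swiotlb_init() below comes to 32768 slabs, i.e. 256 segments.

#include <stdio.h>

#define IO_TLB_SEGSIZE 128
#define IO_TLB_SHIFT   11

int main(void)
{
	unsigned long slab_bytes = 1UL << IO_TLB_SHIFT;         /* 2048 */
	unsigned long seg_bytes  = IO_TLB_SEGSIZE * slab_bytes; /* 262144 = 256 KiB */
	unsigned long def_bytes  = 64UL << 20;                  /* 64 MB default aperture */
	unsigned long nslabs     = def_bytes >> IO_TLB_SHIFT;   /* 32768 */

	printf("slab=%lu bytes, segment=%lu bytes, default aperture=%lu slabs (%lu segments)\n",
	       slab_bytes, seg_bytes, nslabs, nslabs / IO_TLB_SEGSIZE);
	return 0;
}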
46737+
46738+int swiotlb_force;
46739+
46740+static char *iotlb_virt_start;
46741+static unsigned long iotlb_nslabs;
46742+
46743+/*
46744+ * Used to do a quick range check in swiotlb_unmap_single and
46745+ * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
46746+ * API.
46747+ */
46748+static unsigned long iotlb_pfn_start, iotlb_pfn_end;
46749+
46750+/* Does the given dma address reside within the swiotlb aperture? */
46751+static inline int in_swiotlb_aperture(dma_addr_t dev_addr)
46752+{
46753+ unsigned long pfn = mfn_to_local_pfn(dev_addr >> PAGE_SHIFT);
46754+ return (pfn_valid(pfn)
46755+ && (pfn >= iotlb_pfn_start)
46756+ && (pfn < iotlb_pfn_end));
46757+}
46758+
46759+/*
46760+ * When the IOMMU overflows we return a fallback buffer. This sets the size.
46761+ */
46762+static unsigned long io_tlb_overflow = 32*1024;
46763+
46764+void *io_tlb_overflow_buffer;
46765+
46766+/*
46767+ * This is a free list describing the number of free entries available from
46768+ * each index
46769+ */
46770+static unsigned int *io_tlb_list;
46771+static unsigned int io_tlb_index;
46772+
46773+/*
46774+ * We need to save away the original address corresponding to a mapped entry
46775+ * for the sync operations.
46776+ */
46777+static struct phys_addr {
46778+ struct page *page;
46779+ unsigned int offset;
46780+} *io_tlb_orig_addr;
46781+
46782+/*
46783+ * Protect the above data structures in the map and unmap calls
46784+ */
46785+static DEFINE_SPINLOCK(io_tlb_lock);
46786+
46787+static unsigned int dma_bits;
46788+static unsigned int __initdata max_dma_bits = 32;
46789+static int __init
46790+setup_dma_bits(char *str)
46791+{
46792+ max_dma_bits = simple_strtoul(str, NULL, 0);
46793+ return 0;
46794+}
46795+__setup("dma_bits=", setup_dma_bits);
46796+
46797+static int __init
46798+setup_io_tlb_npages(char *str)
46799+{
46800+ /* Unlike ia64, the argument is the aperture size in megabytes, not 'slabs'! */
46801+ if (isdigit(*str)) {
46802+ iotlb_nslabs = simple_strtoul(str, &str, 0) <<
46803+ (20 - IO_TLB_SHIFT);
46804+ iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
46805+ }
46806+ if (*str == ',')
46807+ ++str;
46808+ /*
46809+ * NB. 'force' enables the swiotlb, but doesn't force its use for
46810+ * every DMA like it does on native Linux. 'off' forcibly disables
46811+ * use of the swiotlb.
46812+ */
46813+ if (!strcmp(str, "force"))
46814+ swiotlb_force = 1;
46815+ else if (!strcmp(str, "off"))
46816+ swiotlb_force = -1;
46817+ return 1;
46818+}
46819+__setup("swiotlb=", setup_io_tlb_npages);
46820+/* make io_tlb_overflow tunable too? */
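As a worked example of this parser, a user-space replica is sketched below (illustration only; the boot strings are hypothetical): "swiotlb=64,force" yields 64 << (20 - IO_TLB_SHIFT) = 32768 slabs, i.e. a 64 MB aperture already aligned to IO_TLB_SEGSIZE, with swiotlb_force = 1, while "swiotlb=off" leaves the slab count alone and sets swiotlb_force = -1 so swiotlb_init() keeps the swiotlb disabled.

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define IO_TLB_SEGSIZE 128
#define IO_TLB_SHIFT   11
#define ALIGN(x, a)    (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

/* User-space replica of setup_io_tlb_npages(), for illustration only. */
static void parse_swiotlb(const char *arg)
{
	unsigned long nslabs = 0;
	int force = 0;
	char *str = strdup(arg);
	char *p = str;

	if (!str)
		return;
	if (isdigit((unsigned char)*p)) {
		nslabs = strtoul(p, &p, 0) << (20 - IO_TLB_SHIFT);
		nslabs = ALIGN(nslabs, IO_TLB_SEGSIZE);
	}
	if (*p == ',')
		++p;
	if (!strcmp(p, "force"))
		force = 1;
	else if (!strcmp(p, "off"))
		force = -1;

	printf("\"swiotlb=%s\" -> nslabs=%lu, force=%d\n", arg, nslabs, force);
	free(str);
}

int main(void)
{
	parse_swiotlb("64,force");	/* nslabs=32768, force=1  */
	parse_swiotlb("off");		/* nslabs=0,     force=-1 */
	return 0;
}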
46821+
46822+/*
46823+ * Statically reserve bounce buffer space and initialize bounce buffer data
46824+ * structures for the software IO TLB used to implement the PCI DMA API.
46825+ */
46826+void
46827+swiotlb_init_with_default_size (size_t default_size)
46828+{
46829+ unsigned long i, bytes;
46830+ int rc;
46831+
46832+ if (!iotlb_nslabs) {
46833+ iotlb_nslabs = (default_size >> IO_TLB_SHIFT);
46834+ iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
46835+ }
46836+
46837+ bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT);
46838+
46839+ /*
46840+ * Get IO TLB memory from the low pages
46841+ */
46842+ iotlb_virt_start = alloc_bootmem_low_pages(bytes);
46843+ if (!iotlb_virt_start)
46844+ panic("Cannot allocate SWIOTLB buffer!\n");
46845+
46846+ dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
46847+ for (i = 0; i < iotlb_nslabs; i += IO_TLB_SEGSIZE) {
46848+ do {
46849+ rc = xen_create_contiguous_region(
46850+ (unsigned long)iotlb_virt_start + (i << IO_TLB_SHIFT),
46851+ get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT),
46852+ dma_bits);
46853+ } while (rc && dma_bits++ < max_dma_bits);
46854+ if (rc) {
46855+ if (i == 0)
46856+ panic("No suitable physical memory available for SWIOTLB buffer!\n"
46857+ "Use dom0_mem Xen boot parameter to reserve\n"
46858+ "some DMA memory (e.g., dom0_mem=-128M).\n");
46859+ iotlb_nslabs = i;
46860+ i <<= IO_TLB_SHIFT;
46861+ free_bootmem(__pa(iotlb_virt_start + i), bytes - i);
46862+ bytes = i;
46863+ for (dma_bits = 0; i > 0; i -= IO_TLB_SEGSIZE << IO_TLB_SHIFT) {
46864+ unsigned int bits = fls64(virt_to_bus(iotlb_virt_start + i - 1));
46865+
46866+ if (bits > dma_bits)
46867+ dma_bits = bits;
46868+ }
46869+ break;
46870+ }
46871+ }
46872+
46873+ /*
46874+ * Allocate and initialize the free list array. This array is used
46875+ * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE.
46876+ */
46877+ io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int));
46878+ for (i = 0; i < iotlb_nslabs; i++)
46879+ io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
46880+ io_tlb_index = 0;
46881+ io_tlb_orig_addr = alloc_bootmem(
46882+ iotlb_nslabs * sizeof(*io_tlb_orig_addr));
46883+
46884+ /*
46885+ * Get the overflow emergency buffer
46886+ */
46887+ io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
46888+ if (!io_tlb_overflow_buffer)
46889+ panic("Cannot allocate SWIOTLB overflow buffer!\n");
46890+
46891+ do {
46892+ rc = xen_create_contiguous_region(
46893+ (unsigned long)io_tlb_overflow_buffer,
46894+ get_order(io_tlb_overflow),
46895+ dma_bits);
46896+ } while (rc && dma_bits++ < max_dma_bits);
46897+ if (rc)
46898+ panic("No suitable physical memory available for SWIOTLB overflow buffer!\n");
46899+
46900+ iotlb_pfn_start = __pa(iotlb_virt_start) >> PAGE_SHIFT;
46901+ iotlb_pfn_end = iotlb_pfn_start + (bytes >> PAGE_SHIFT);
46902+
46903+ printk(KERN_INFO "Software IO TLB enabled: \n"
46904+ " Aperture: %lu megabytes\n"
46905+ " Kernel range: %p - %p\n"
46906+ " Address size: %u bits\n",
46907+ bytes >> 20,
46908+ iotlb_virt_start, iotlb_virt_start + bytes,
46909+ dma_bits);
46910+}
46911+
46912+void
46913+swiotlb_init(void)
46914+{
46915+ long ram_end;
46916+ size_t defsz = 64 * (1 << 20); /* 64MB default size */
46917+
46918+ if (swiotlb_force == 1) {
46919+ swiotlb = 1;
46920+ } else if ((swiotlb_force != -1) &&
46921+ is_running_on_xen() &&
46922+ is_initial_xendomain()) {
46923+ /* Domain 0 always has a swiotlb. */
46924+ ram_end = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
46925+ if (ram_end <= 0x7ffff)
46926+ defsz = 2 * (1 << 20); /* 2MB on systems with <2GB of RAM. */
46927+ swiotlb = 1;
46928+ }
46929+
46930+ if (swiotlb)
46931+ swiotlb_init_with_default_size(defsz);
46932+ else
46933+ printk(KERN_INFO "Software IO TLB disabled\n");
46934+}
46935+
46936+/*
46937+ * We use __copy_to_user_inatomic to transfer to the host buffer because the
46938+ * buffer may be mapped read-only (e.g., in blkback driver) but lower-level
46939+ * drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an
46940+ * unnecessary copy from the aperture to the host buffer, and a page fault.
46941+ */
46942+static void
46943+__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir)
46944+{
46945+ if (PageHighMem(buffer.page)) {
46946+ size_t len, bytes;
46947+ char *dev, *host, *kmp;
46948+ len = size;
46949+ while (len != 0) {
46950+ unsigned long flags;
46951+
46952+ if (((bytes = len) + buffer.offset) > PAGE_SIZE)
46953+ bytes = PAGE_SIZE - buffer.offset;
46954+ local_irq_save(flags); /* protects KM_BOUNCE_READ */
46955+ kmp = kmap_atomic(buffer.page, KM_BOUNCE_READ);
46956+ dev = dma_addr + size - len;
46957+ host = kmp + buffer.offset;
46958+ if (dir == DMA_FROM_DEVICE) {
46959+ if (__copy_to_user_inatomic(host, dev, bytes))
46960+ /* inaccessible */;
46961+ } else
46962+ memcpy(dev, host, bytes);
46963+ kunmap_atomic(kmp, KM_BOUNCE_READ);
46964+ local_irq_restore(flags);
46965+ len -= bytes;
46966+ buffer.page++;
46967+ buffer.offset = 0;
46968+ }
46969+ } else {
46970+ char *host = (char *)phys_to_virt(
46971+ page_to_pseudophys(buffer.page)) + buffer.offset;
46972+ if (dir == DMA_FROM_DEVICE) {
46973+ if (__copy_to_user_inatomic(host, dma_addr, size))
46974+ /* inaccessible */;
46975+ } else if (dir == DMA_TO_DEVICE)
46976+ memcpy(dma_addr, host, size);
46977+ }
46978+}
46979+
46980+/*
46981+ * Allocates bounce buffer and returns its kernel virtual address.
46982+ */
46983+static void *
46984+map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir)
46985+{
46986+ unsigned long flags;
46987+ char *dma_addr;
46988+ unsigned int nslots, stride, index, wrap;
46989+ struct phys_addr slot_buf;
46990+ int i;
46991+
46992+ /*
46993+ * For mappings greater than a page, we limit the stride (and
46994+ * hence alignment) to a page size.
46995+ */
46996+ nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
46997+ if (size > PAGE_SIZE)
46998+ stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
46999+ else
47000+ stride = 1;
47001+
47002+ BUG_ON(!nslots);
47003+
47004+ /*
47005+ * Find a suitable number of IO TLB entries that will fit this
47006+ * request and allocate a buffer from that IO TLB pool.
47007+ */
47008+ spin_lock_irqsave(&io_tlb_lock, flags);
47009+ {
47010+ wrap = index = ALIGN(io_tlb_index, stride);
47011+
47012+ if (index >= iotlb_nslabs)
47013+ wrap = index = 0;
47014+
47015+ do {
47016+ /*
47017+ * If we find a slot that indicates we have 'nslots'
47018+ * number of contiguous buffers, we allocate the
47019+ * buffers from that slot and mark the entries as '0'
47020+ * indicating unavailable.
47021+ */
47022+ if (io_tlb_list[index] >= nslots) {
47023+ int count = 0;
47024+
47025+ for (i = index; i < (int)(index + nslots); i++)
47026+ io_tlb_list[i] = 0;
47027+ for (i = index - 1;
47028+ (OFFSET(i, IO_TLB_SEGSIZE) !=
47029+ IO_TLB_SEGSIZE -1) && io_tlb_list[i];
47030+ i--)
47031+ io_tlb_list[i] = ++count;
47032+ dma_addr = iotlb_virt_start +
47033+ (index << IO_TLB_SHIFT);
47034+
47035+ /*
47036+ * Update the indices to avoid searching in
47037+ * the next round.
47038+ */
47039+ io_tlb_index =
47040+ ((index + nslots) < iotlb_nslabs
47041+ ? (index + nslots) : 0);
47042+
47043+ goto found;
47044+ }
47045+ index += stride;
47046+ if (index >= iotlb_nslabs)
47047+ index = 0;
47048+ } while (index != wrap);
47049+
47050+ spin_unlock_irqrestore(&io_tlb_lock, flags);
47051+ return NULL;
47052+ }
47053+ found:
47054+ spin_unlock_irqrestore(&io_tlb_lock, flags);
47055+
47056+ /*
47057+ * Save away the mapping from the original address to the DMA address.
47058+ * This is needed when we sync the memory. Then we sync the buffer if
47059+ * needed.
47060+ */
47061+ slot_buf = buffer;
47062+ for (i = 0; i < nslots; i++) {
47063+ slot_buf.page += slot_buf.offset >> PAGE_SHIFT;
47064+ slot_buf.offset &= PAGE_SIZE - 1;
47065+ io_tlb_orig_addr[index+i] = slot_buf;
47066+ slot_buf.offset += 1 << IO_TLB_SHIFT;
47067+ }
47068+ if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL))
47069+ __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
47070+
47071+ return dma_addr;
47072+}
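The counter scheme behind io_tlb_list[] is easier to see in isolation, so the following stand-alone user-space model (illustration only; SEGSIZE and NSLABS are toy values, not the kernel's) reproduces the allocation step above: a non-zero entry i means "this many contiguous free slots start at i, counted up to the end of i's segment", so a request only has to inspect one entry, and the back-walk after zeroing the allocated slots keeps that invariant for the free run just before the block.

#include <assert.h>
#include <stdio.h>

#define SEGSIZE 8			/* toy stand-in for IO_TLB_SEGSIZE */
#define NSLABS  16			/* two toy segments */
#define OFFSET(val, align) ((val) & ((align) - 1))

static unsigned int list[NSLABS];	/* free-run counters, as in io_tlb_list[] */

/* Zero means "slot busy"; non-zero means "this many free slots start here,
 * counted up to the end of this slot's segment". */
static void init(void)
{
	int i;

	for (i = 0; i < NSLABS; i++)
		list[i] = SEGSIZE - OFFSET(i, SEGSIZE);
}

/* Length of the free run starting at i, clipped to i's segment. */
static unsigned int free_run(int i)
{
	int seg_end = (i / SEGSIZE + 1) * SEGSIZE;
	unsigned int run = 0;

	while (i < seg_end && list[i++])
		run++;
	return run;
}

/* Mark nslots busy starting at index, as map_single() does: zero the
 * allocated entries, then shrink the counters of the free run immediately
 * preceding the block, never crossing a segment boundary. */
static void alloc_at(int index, int nslots)
{
	int i, count = 0;

	for (i = index; i < index + nslots; i++)
		list[i] = 0;
	for (i = index - 1;
	     OFFSET(i, SEGSIZE) != SEGSIZE - 1 && list[i];
	     i--)
		list[i] = ++count;
}

int main(void)
{
	int i;

	init();
	alloc_at(2, 2);				/* take slots 2 and 3 */
	for (i = 0; i < SEGSIZE; i++) {		/* expect: 2 1 0 0 4 3 2 1 */
		printf("%u ", list[i]);
		if (list[i])
			assert(list[i] == free_run(i));
	}
	printf("\n");
	return 0;
}

Capping runs at segment boundaries is what bounds the back-walk, and hence the map/unmap cost, to IO_TLB_SEGSIZE entries.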
47073+
47074+static struct phys_addr dma_addr_to_phys_addr(char *dma_addr)
47075+{
47076+ int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
47077+ struct phys_addr buffer = io_tlb_orig_addr[index];
47078+ buffer.offset += (long)dma_addr & ((1 << IO_TLB_SHIFT) - 1);
47079+ buffer.page += buffer.offset >> PAGE_SHIFT;
47080+ buffer.offset &= PAGE_SIZE - 1;
47081+ return buffer;
47082+}
47083+
47084+/*
47085+ * dma_addr is the kernel virtual address of the bounce buffer to unmap.
47086+ */
47087+static void
47088+unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
47089+{
47090+ unsigned long flags;
47091+ int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
47092+ int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
47093+ struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr);
47094+
47095+ /*
47096+ * First, sync the memory before unmapping the entry
47097+ */
47098+ if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))
47099+ __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
47100+
47101+ /*
47102+ * Return the buffer to the free list by setting the corresponding
47103+ * entries to indicate the number of contiguous entries available.
47104+ * While returning the entries to the free list, we merge the entries
47105+ * with slots below and above the pool being returned.
47106+ */
47107+ spin_lock_irqsave(&io_tlb_lock, flags);
47108+ {
47109+ count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
47110+ io_tlb_list[index + nslots] : 0);
47111+ /*
47112+ * Step 1: return the slots to the free list, merging the
47113+ * slots with succeeding slots
47114+ */
47115+ for (i = index + nslots - 1; i >= index; i--)
47116+ io_tlb_list[i] = ++count;
47117+ /*
47118+ * Step 2: merge the returned slots with the preceding slots,
47119+ * if available (non-zero)
47120+ */
47121+ for (i = index - 1;
47122+ (OFFSET(i, IO_TLB_SEGSIZE) !=
47123+ IO_TLB_SEGSIZE -1) && io_tlb_list[i];
47124+ i--)
47125+ io_tlb_list[i] = ++count;
47126+ }
47127+ spin_unlock_irqrestore(&io_tlb_lock, flags);
47128+}
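Continuing the toy model sketched after map_single() above: with a segment size of 8 and slots 2 and 3 busy, the counters read 2 1 0 0 4 3 2 1. Freeing the two slots starts count at io_tlb_list[index + nslots], here list[4] = 4 (the free run beginning just past the block), then writes ++count backwards over the freed slots (list[3] = 5, list[2] = 6) and on across the preceding free run (list[1] = 7, list[0] = 8), restoring the fully free pattern 8 7 6 5 4 3 2 1 for that segment. The OFFSET(i, IO_TLB_SEGSIZE) test is what stops the merge at a segment boundary, so a free run never spans segments.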
47129+
47130+static void
47131+sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
47132+{
47133+ struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr);
47134+ BUG_ON((dir != DMA_FROM_DEVICE) && (dir != DMA_TO_DEVICE));
47135+ __sync_single(buffer, dma_addr, size, dir);
47136+}
47137+
47138+static void
47139+swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
47140+{
47141+ /*
47142+ * Ran out of IOMMU space for this operation. This is very bad.
47143+ * Unfortunately the drivers cannot handle this operation properly
47144+ * unless they check for pci_dma_mapping_error (most don't).
47145+ * When the mapping is small enough, return a static buffer to limit
47146+ * the damage, or panic when the transfer is too big.
47147+ */
47148+ printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at "
47149+ "device %s\n", (unsigned long)size, dev ? dev->bus_id : "?");
47150+
47151+ if (size > io_tlb_overflow && do_panic) {
47152+ if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
47153+ panic("PCI-DMA: Memory would be corrupted\n");
47154+ if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
47155+ panic("PCI-DMA: Random memory would be DMAed\n");
47156+ }
47157+}
47158+
47159+/*
47160+ * Map a single buffer of the indicated size for DMA in streaming mode. The
47161+ * PCI address to use is returned.
47162+ *
47163+ * Once the device is given the dma address, the device owns this memory until
47164+ * either swiotlb_unmap_single or a swiotlb_sync_single_for_* call is performed.
47165+ */
47166+dma_addr_t
47167+swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
47168+{
47169+ dma_addr_t dev_addr = gnttab_dma_map_page(virt_to_page(ptr)) +
47170+ offset_in_page(ptr);
47171+ void *map;
47172+ struct phys_addr buffer;
47173+
47174+ BUG_ON(dir == DMA_NONE);
47175+
47176+ /*
47177+ * If the pointer passed in happens to be in the device's DMA window,
47178+ * we can safely return the device addr and not worry about bounce
47179+ * buffering it.
47180+ */
47181+ if (!range_straddles_page_boundary(__pa(ptr), size) &&
47182+ !address_needs_mapping(hwdev, dev_addr))
47183+ return dev_addr;
47184+
47185+ /*
47186+ * Oh well, have to allocate and map a bounce buffer.
47187+ */
47188+ gnttab_dma_unmap_page(dev_addr);
47189+ buffer.page = virt_to_page(ptr);
47190+ buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
47191+ map = map_single(hwdev, buffer, size, dir);
47192+ if (!map) {
47193+ swiotlb_full(hwdev, size, dir, 1);
47194+ map = io_tlb_overflow_buffer;
47195+ }
47196+
47197+ dev_addr = virt_to_bus(map);
47198+ return dev_addr;
47199+}
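For illustration, a hedged sketch of the calling convention for this map/unmap pair (normally reached through the dma_map_single()/pci_map_single() wrappers rather than called directly). The helper, device pointer and buffer length are hypothetical, and the swiotlb prototypes are assumed to be visible, e.g. via asm/swiotlb.h:

#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/slab.h>

/* Hypothetical helper: map a freshly allocated buffer for a device write. */
static int example_send(struct device *dev, size_t len)
{
	void *buf = kmalloc(len, GFP_KERNEL);
	dma_addr_t handle;

	if (!buf)
		return -ENOMEM;

	handle = swiotlb_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (swiotlb_dma_mapping_error(handle)) {
		/* the mapping fell back to the overflow buffer: give up */
		kfree(buf);
		return -EIO;
	}

	/* ... hand 'handle' to the device and wait for the transfer ... */

	swiotlb_unmap_single(dev, handle, len, DMA_TO_DEVICE);
	kfree(buf);
	return 0;
}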
47200+
47201+/*
47202+ * Unmap a single streaming mode DMA translation. The dma_addr and size must
47203+ * match what was provided in a previous swiotlb_map_single call. All
47204+ * other usages are undefined.
47205+ *
47206+ * After this call, reads by the cpu to the buffer are guaranteed to see
47207+ * whatever the device wrote there.
47208+ */
47209+void
47210+swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
47211+ int dir)
47212+{
47213+ BUG_ON(dir == DMA_NONE);
47214+ if (in_swiotlb_aperture(dev_addr))
47215+ unmap_single(hwdev, bus_to_virt(dev_addr), size, dir);
47216+ else
47217+ gnttab_dma_unmap_page(dev_addr);
47218+}
47219+
47220+/*
47221+ * Make physical memory consistent for a single streaming mode DMA translation
47222+ * after a transfer.
47223+ *
47224+ * If you perform a swiotlb_map_single() but wish to interrogate the buffer
47225+ * using the CPU, yet do not wish to tear down the PCI DMA mapping, you must
47226+ * call this function before doing so. At the next point you give the PCI DMA
47227+ * address back to the card, you must first perform a
47228+ * swiotlb_sync_single_for_device(), and then the device again owns the buffer.
47229+ */
47230+void
47231+swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
47232+ size_t size, int dir)
47233+{
47234+ BUG_ON(dir == DMA_NONE);
47235+ if (in_swiotlb_aperture(dev_addr))
47236+ sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
47237+}
47238+
47239+void
47240+swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
47241+ size_t size, int dir)
47242+{
47243+ BUG_ON(dir == DMA_NONE);
47244+ if (in_swiotlb_aperture(dev_addr))
47245+ sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
47246+}
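A sketch of the ownership round-trip described in the comment above, for a buffer the device fills and the CPU then inspects before handing it back. The helper is hypothetical, and dev/handle/buf/len are assumed to come from an earlier swiotlb_map_single() call made with DMA_FROM_DEVICE:

#include <linux/dma-mapping.h>

/* Hypothetical poll routine: inspect a DMA_FROM_DEVICE buffer in place. */
static void example_poll(struct device *dev, dma_addr_t handle,
			 void *buf, size_t len)
{
	/* take the buffer back from the device before the CPU reads it */
	swiotlb_sync_single_for_cpu(dev, handle, len, DMA_FROM_DEVICE);

	/* ... examine buf[0..len) here ... */

	/* return ownership to the device for the next transfer */
	swiotlb_sync_single_for_device(dev, handle, len, DMA_FROM_DEVICE);
}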
47247+
47248+/*
47249+ * Map a set of buffers described by scatterlist in streaming mode for DMA.
47250+ * This is the scatter-gather version of the above swiotlb_map_single
47251+ * interface. Here the scatter gather list elements are each tagged with the
47252+ * appropriate dma address and length. They are obtained via
47253+ * sg_dma_{address,length}(SG).
47254+ *
47255+ * NOTE: An implementation may be able to use a smaller number of
47256+ * DMA address/length pairs than there are SG table elements.
47257+ * (for example via virtual mapping capabilities)
47258+ * The routine returns the number of addr/length pairs actually
47259+ * used, at most nents.
47260+ *
47261+ * Device ownership issues as mentioned above for swiotlb_map_single are the
47262+ * same here.
47263+ */
47264+int
47265+swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
47266+ int dir)
47267+{
47268+ struct phys_addr buffer;
47269+ dma_addr_t dev_addr;
47270+ char *map;
47271+ int i;
47272+
47273+ BUG_ON(dir == DMA_NONE);
47274+
47275+ for (i = 0; i < nelems; i++, sg++) {
47276+ dev_addr = gnttab_dma_map_page(sg->page) + sg->offset;
47277+
47278+ if (range_straddles_page_boundary(page_to_pseudophys(sg->page)
47279+ + sg->offset, sg->length)
47280+ || address_needs_mapping(hwdev, dev_addr)) {
47281+ gnttab_dma_unmap_page(dev_addr);
47282+ buffer.page = sg->page;
47283+ buffer.offset = sg->offset;
47284+ map = map_single(hwdev, buffer, sg->length, dir);
47285+ if (!map) {
47286+ /* Don't panic here, we expect map_sg users
47287+ to do proper error handling. */
47288+ swiotlb_full(hwdev, sg->length, dir, 0);
47289+ swiotlb_unmap_sg(hwdev, sg - i, i, dir);
47290+ sg[0].dma_length = 0;
47291+ return 0;
47292+ }
47293+ sg->dma_address = (dma_addr_t)virt_to_bus(map);
47294+ } else
47295+ sg->dma_address = dev_addr;
47296+ sg->dma_length = sg->length;
47297+ }
47298+ return nelems;
47299+}
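For illustration, a hedged sketch of the scatter-gather convention as this file uses it: the caller fills the 2.6.18-era page/offset/length fields and reads the results back through dma_address/dma_length. The helper and the two pages are hypothetical, and the swiotlb prototypes are again assumed visible:

#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/scatterlist.h>
#include <linux/string.h>

/* Hypothetical helper: map two single-page buffers for a device read. */
static int example_map_pages(struct device *dev, struct page *p0,
			     struct page *p1)
{
	struct scatterlist sg[2];
	int i, n;

	memset(sg, 0, sizeof(sg));
	sg[0].page = p0; sg[0].offset = 0; sg[0].length = PAGE_SIZE;
	sg[1].page = p1; sg[1].offset = 0; sg[1].length = PAGE_SIZE;

	n = swiotlb_map_sg(dev, sg, 2, DMA_TO_DEVICE);
	if (n == 0)
		return -EIO;	/* sg[0].dma_length is 0 on failure */

	for (i = 0; i < n; i++) {
		/* program the device with sg[i].dma_address / sg[i].dma_length */
	}

	swiotlb_unmap_sg(dev, sg, 2, DMA_TO_DEVICE);
	return 0;
}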
47300+
47301+/*
47302+ * Unmap a set of streaming mode DMA translations. Again, cpu read rules
47303+ * concerning calls here are the same as for swiotlb_unmap_single() above.
47304+ */
47305+void
47306+swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
47307+ int dir)
47308+{
47309+ int i;
47310+
47311+ BUG_ON(dir == DMA_NONE);
47312+
47313+ for (i = 0; i < nelems; i++, sg++)
47314+ if (in_swiotlb_aperture(sg->dma_address))
47315+ unmap_single(hwdev,
47316+ (void *)bus_to_virt(sg->dma_address),
47317+ sg->dma_length, dir);
47318+ else
47319+ gnttab_dma_unmap_page(sg->dma_address);
47320+}
47321+
47322+/*
47323+ * Make physical memory consistent for a set of streaming mode DMA translations
47324+ * after a transfer.
47325+ *
47326+ * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
47327+ * and usage.
47328+ */
47329+void
47330+swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
47331+ int nelems, int dir)
47332+{
47333+ int i;
47334+
47335+ BUG_ON(dir == DMA_NONE);
47336+
47337+ for (i = 0; i < nelems; i++, sg++)
47338+ if (in_swiotlb_aperture(sg->dma_address))
47339+ sync_single(hwdev,
47340+ (void *)bus_to_virt(sg->dma_address),
47341+ sg->dma_length, dir);
47342+}
47343+
47344+void
47345+swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
47346+ int nelems, int dir)
47347+{
47348+ int i;
47349+
47350+ BUG_ON(dir == DMA_NONE);
47351+
47352+ for (i = 0; i < nelems; i++, sg++)
47353+ if (in_swiotlb_aperture(sg->dma_address))
47354+ sync_single(hwdev,
47355+ (void *)bus_to_virt(sg->dma_address),
47356+ sg->dma_length, dir);
47357+}
47358+
47359+#ifdef CONFIG_HIGHMEM
47360+
47361+dma_addr_t
47362+swiotlb_map_page(struct device *hwdev, struct page *page,
47363+ unsigned long offset, size_t size,
47364+ enum dma_data_direction direction)
47365+{
47366+ struct phys_addr buffer;
47367+ dma_addr_t dev_addr;
47368+ char *map;
47369+
47370+ dev_addr = gnttab_dma_map_page(page) + offset;
47371+ if (address_needs_mapping(hwdev, dev_addr)) {
47372+ gnttab_dma_unmap_page(dev_addr);
47373+ buffer.page = page;
47374+ buffer.offset = offset;
47375+ map = map_single(hwdev, buffer, size, direction);
47376+ if (!map) {
47377+ swiotlb_full(hwdev, size, direction, 1);
47378+ map = io_tlb_overflow_buffer;
47379+ }
47380+ dev_addr = (dma_addr_t)virt_to_bus(map);
47381+ }
47382+
47383+ return dev_addr;
47384+}
47385+
47386+void
47387+swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
47388+ size_t size, enum dma_data_direction direction)
47389+{
47390+ BUG_ON(direction == DMA_NONE);
47391+ if (in_swiotlb_aperture(dma_address))
47392+ unmap_single(hwdev, bus_to_virt(dma_address), size, direction);
47393+ else
47394+ gnttab_dma_unmap_page(dma_address);
47395+}
47396+
47397+#endif
47398+
47399+int
47400+swiotlb_dma_mapping_error(dma_addr_t dma_addr)
47401+{
47402+ return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
47403+}
47404+
47405+/*
47406+ * Return whether the given PCI device DMA address mask can be supported
47407+ * properly. For example, if your device can only drive the low 24 bits
47408+ * during PCI bus mastering, then you would pass 0x00ffffff as the mask to
47409+ * this function.
47410+ */
47411+int
47412+swiotlb_dma_supported (struct device *hwdev, u64 mask)
47413+{
47414+ return (mask >= ((1UL << dma_bits) - 1));
47415+}
47416+
47417+EXPORT_SYMBOL(swiotlb_init);
47418+EXPORT_SYMBOL(swiotlb_map_single);
47419+EXPORT_SYMBOL(swiotlb_unmap_single);
47420+EXPORT_SYMBOL(swiotlb_map_sg);
47421+EXPORT_SYMBOL(swiotlb_unmap_sg);
47422+EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
47423+EXPORT_SYMBOL(swiotlb_sync_single_for_device);
47424+EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
47425+EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
47426+EXPORT_SYMBOL(swiotlb_dma_mapping_error);
47427+EXPORT_SYMBOL(swiotlb_dma_supported);
47428Index: head-2008-11-25/scripts/Makefile.xen.awk
47429===================================================================
47430--- /dev/null 1970-01-01 00:00:00.000000000 +0000
47431+++ head-2008-11-25/scripts/Makefile.xen.awk 2007-08-06 15:10:49.000000000 +0200
47432@@ -0,0 +1,34 @@
47433+BEGIN {
47434+ is_rule = 0
47435+}
47436+
47437+/^[[:space:]]*#/ {
47438+ next
47439+}
47440+
47441+/^[[:space:]]*$/ {
47442+ if (is_rule)
47443+ print("")
47444+ is_rule = 0
47445+ next
47446+}
47447+
47448+/:[[:space:]]*%\.[cS][[:space:]]/ {
47449+ line = gensub(/%.([cS])/, "%-xen.\\1", "g", $0)
47450+ line = gensub(/(single-used-m)/, "xen-\\1", "g", line)
47451+ print line
47452+ is_rule = 1
47453+ next
47454+}
47455+
47456+/^[^\t]$/ {
47457+ if (is_rule)
47458+ print("")
47459+ is_rule = 0
47460+ next
47461+}
47462+
47463+is_rule {
47464+ print $0
47465+ next
47466+}
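To see the transformation concretely, take a hypothetical pattern rule of the kind this script targets in scripts/Makefile.build:

	%.o: %.c FORCE

The rule-header block above matches it (a colon followed by %.c or %.S) and re-emits it with the source pattern renamed:

	%.o: %-xen.c FORCE

It then sets is_rule, so the tab-indented recipe lines that follow are copied through unchanged until the next blank line; any single-used-m reference on such a header line is likewise rewritten to xen-single-used-m. Comment lines and everything outside a matched rule are dropped, so the generated Makefile.xen ends up containing only the rewritten rules that build from the *-xen source files.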