src/patches/60008_xen3-auto-xen-arch.patch1
1 Subject: xen3 xen-arch
2 From: http://xenbits.xensource.com/linux-2.6.18-xen.hg (tip 728:832aac894efd)
3 Patch-mainline: obsolete
4 Acked-by: jbeulich@novell.com
5
6 List of files that have Xen derivatives (possibly created while merging
7 newer kernel versions), for xen-port-patches.py to pick up (i.e. this
8 list must be retained here until the XenSource tree has these files in
9 the right places):
10 +++ linux/arch/x86/kernel/acpi/sleep-xen.c
11 +++ linux/arch/x86/kernel/cpu/common_64-xen.c
12 +++ linux/arch/x86/kernel/e820-xen.c
13 +++ linux/arch/x86/kernel/head-xen.c
14 +++ linux/arch/x86/kernel/head32-xen.c
15 +++ linux/arch/x86/kernel/ioport-xen.c
16 +++ linux/arch/x86/kernel/ipi-xen.c
17 +++ linux/arch/x86/kernel/ldt-xen.c
18 +++ linux/arch/x86/kernel/mpparse-xen.c
19 +++ linux/arch/x86/kernel/pci-nommu-xen.c
20 +++ linux/arch/x86/kernel/process-xen.c
21 +++ linux/arch/x86/kernel/setup-xen.c
22 +++ linux/arch/x86/kernel/setup_percpu-xen.c
23 +++ linux/arch/x86/kernel/smp-xen.c
24 +++ linux/arch/x86/mm/fault-xen.c
25 +++ linux/arch/x86/mm/ioremap-xen.c
26 +++ linux/arch/x86/mm/pageattr-xen.c
27 +++ linux/arch/x86/mm/pat-xen.c
28 +++ linux/arch/x86/mm/pgtable-xen.c
29 +++ linux/arch/x86/vdso/vdso32-setup-xen.c
30 +++ linux/drivers/char/mem-xen.c
31 +++ linux/include/asm-x86/mach-xen/asm/desc.h
32 +++ linux/include/asm-x86/mach-xen/asm/dma-mapping.h
33 +++ linux/include/asm-x86/mach-xen/asm/fixmap.h
34 +++ linux/include/asm-x86/mach-xen/asm/io.h
35 +++ linux/include/asm-x86/mach-xen/asm/irq_vectors.h
36 +++ linux/include/asm-x86/mach-xen/asm/irqflags.h
37 +++ linux/include/asm-x86/mach-xen/asm/mmu_context.h
38 +++ linux/include/asm-x86/mach-xen/asm/page.h
39 +++ linux/include/asm-x86/mach-xen/asm/pci.h
40 +++ linux/include/asm-x86/mach-xen/asm/pgalloc.h
41 +++ linux/include/asm-x86/mach-xen/asm/pgtable.h
42 +++ linux/include/asm-x86/mach-xen/asm/processor.h
43 +++ linux/include/asm-x86/mach-xen/asm/segment.h
44 +++ linux/include/asm-x86/mach-xen/asm/smp.h
45 +++ linux/include/asm-x86/mach-xen/asm/spinlock.h
46 +++ linux/include/asm-x86/mach-xen/asm/swiotlb.h
47 +++ linux/include/asm-x86/mach-xen/asm/system.h
48 +++ linux/include/asm-x86/mach-xen/asm/tlbflush.h
49 +++ linux/include/asm-x86/mach-xen/asm/xor.h
50
51 List of files folded into their native counterparts (and hence removed
52 from this patch so that xen-port-patches.py does not needlessly pick them
53 up; for reference, each entry is prefixed with the version in which the removal occurred):
54 2.6.18/include/asm-x86/mach-xen/asm/pgtable-2level.h
55 2.6.18/include/asm-x86/mach-xen/asm/pgtable-2level-defs.h
56 2.6.19/include/asm-x86/mach-xen/asm/ptrace.h
57 2.6.23/arch/x86/kernel/vsyscall-note_32-xen.S
58 2.6.23/include/asm-x86/mach-xen/asm/ptrace_64.h
59 2.6.24/arch/x86/kernel/early_printk_32-xen.c
60 2.6.24/include/asm-x86/mach-xen/asm/arch_hooks_64.h
61 2.6.24/include/asm-x86/mach-xen/asm/bootsetup_64.h
62 2.6.24/include/asm-x86/mach-xen/asm/mmu_32.h
63 2.6.24/include/asm-x86/mach-xen/asm/mmu_64.h
64 2.6.24/include/asm-x86/mach-xen/asm/nmi_64.h
65 2.6.24/include/asm-x86/mach-xen/asm/setup.h
66 2.6.24/include/asm-x86/mach-xen/asm/time_64.h (added in 2.6.20)
67 2.6.25/arch/x86/ia32/syscall32-xen.c
68 2.6.25/arch/x86/ia32/syscall32_syscall-xen.S
69 2.6.25/arch/x86/ia32/vsyscall-int80.S
70 2.6.25/arch/x86/kernel/acpi/boot-xen.c
71 2.6.25/include/asm-x86/mach-xen/asm/msr.h
72 2.6.25/include/asm-x86/mach-xen/asm/page_32.h
73 2.6.25/include/asm-x86/mach-xen/asm/spinlock_32.h
74 2.6.25/include/asm-x86/mach-xen/asm/timer.h (added in 2.6.24)
75 2.6.25/include/asm-x86/mach-xen/asm/timer_64.h
76 2.6.26/arch/x86/kernel/pci-dma_32-xen.c
77 2.6.26/arch/x86/kernel/pci-swiotlb_64-xen.c
78 2.6.26/include/asm-x86/mach-xen/asm/dma-mapping_32.h
79 2.6.26/include/asm-x86/mach-xen/asm/dma-mapping_64.h
80 2.6.26/include/asm-x86/mach-xen/asm/nmi.h (added in 2.6.24)
81 2.6.26/include/asm-x86/mach-xen/asm/scatterlist.h (added in 2.6.24)
82 2.6.26/include/asm-x86/mach-xen/asm/scatterlist_32.h
83 2.6.26/include/xen/xencomm.h
84 2.6.27/arch/x86/kernel/e820_32-xen.c
85 2.6.27/include/asm-x86/mach-xen/asm/e820.h (added in 2.6.24)
86 2.6.27/include/asm-x86/mach-xen/asm/e820_64.h
87 2.6.27/include/asm-x86/mach-xen/asm/hw_irq.h (added in 2.6.24)
88 2.6.27/include/asm-x86/mach-xen/asm/hw_irq_32.h
89 2.6.27/include/asm-x86/mach-xen/asm/hw_irq_64.h
90 2.6.27/include/asm-x86/mach-xen/asm/irq.h (added in 2.6.24)
91 2.6.27/include/asm-x86/mach-xen/asm/irq_64.h
92
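
Note: xen-port-patches.py itself is not shipped in this patch; the lists above
only tell it which files to consider. As a rough illustration (a minimal
sketch under that assumption, not the real script), a helper along these
lines could collect the "+++ linux/..." entries from a header like this one
and map each Xen variant back to a guessed native counterpart:

    #!/usr/bin/env python
    # Illustrative sketch only -- NOT the actual xen-port-patches.py.
    # It extracts the file paths listed on "+++ linux/..." lines in the
    # free-form header of a patch such as this one, stopping at the first
    # real diff section, and guesses the native counterpart of each
    # "-xen" variant by stripping the suffix.
    import re
    import sys

    def collect_xen_variants(patch_path):
        """Return the paths named on '+++ linux/...' header lines."""
        paths = []
        with open(patch_path) as fh:
            for line in fh:
                if line.startswith('Index:'):   # first real diff reached
                    break
                m = re.match(r'\+\+\+ (linux/\S+)', line)
                if m:
                    paths.append(m.group(1))
        return paths

    if __name__ == '__main__':
        for p in collect_xen_variants(sys.argv[1]):
            native = re.sub(r'-xen(\.[chS])$', r'\1', p)
            print('%s -> %s' % (p, native))

Run against this file it would print, for the first entry,
"linux/arch/x86/kernel/acpi/sleep-xen.c -> linux/arch/x86/kernel/acpi/sleep.c";
entries without a "-xen" suffix (the mach-xen headers) are left unchanged.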
93 Index: head-2008-11-25/arch/x86/kernel/acpi/processor_extcntl_xen.c
94 ===================================================================
95 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
96 +++ head-2008-11-25/arch/x86/kernel/acpi/processor_extcntl_xen.c 2008-10-01 15:43:24.000000000 +0200
97 @@ -0,0 +1,209 @@
98 +/*
99 + * processor_extcntl_xen.c - interface to notify Xen
100 + *
101 + * Copyright (C) 2008, Intel corporation
102 + *
103 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
104 + *
105 + * This program is free software; you can redistribute it and/or modify
106 + * it under the terms of the GNU General Public License as published by
107 + * the Free Software Foundation; either version 2 of the License, or (at
108 + * your option) any later version.
109 + *
110 + * This program is distributed in the hope that it will be useful, but
111 + * WITHOUT ANY WARRANTY; without even the implied warranty of
112 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
113 + * General Public License for more details.
114 + *
115 + * You should have received a copy of the GNU General Public License along
116 + * with this program; if not, write to the Free Software Foundation, Inc.,
117 + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
118 + *
119 + */
120 +
121 +#include <linux/kernel.h>
122 +#include <linux/init.h>
123 +#include <linux/types.h>
124 +#include <linux/acpi.h>
125 +#include <linux/pm.h>
126 +#include <linux/cpu.h>
127 +
128 +#include <linux/cpufreq.h>
129 +#include <acpi/processor.h>
130 +#include <asm/hypercall.h>
131 +
132 +static int xen_cx_notifier(struct acpi_processor *pr, int action)
133 +{
134 + int ret, count = 0, i;
135 + xen_platform_op_t op = {
136 + .cmd = XENPF_set_processor_pminfo,
137 + .interface_version = XENPF_INTERFACE_VERSION,
138 + .u.set_pminfo.id = pr->acpi_id,
139 + .u.set_pminfo.type = XEN_PM_CX,
140 + };
141 + struct xen_processor_cx *data, *buf;
142 + struct acpi_processor_cx *cx;
143 +
144 + if (action == PROCESSOR_PM_CHANGE)
145 + return -EINVAL;
146 +
147 + /* Convert to Xen defined structure and hypercall */
148 + buf = kzalloc(pr->power.count * sizeof(struct xen_processor_cx),
149 + GFP_KERNEL);
150 + if (!buf)
151 + return -ENOMEM;
152 +
153 + data = buf;
154 + for (i = 1; i <= pr->power.count; i++) {
155 + cx = &pr->power.states[i];
156 + /* Skip invalid cstate entry */
157 + if (!cx->valid)
158 + continue;
159 +
160 + data->type = cx->type;
161 + data->latency = cx->latency;
162 + data->power = cx->power;
163 + data->reg.space_id = cx->reg.space_id;
164 + data->reg.bit_width = cx->reg.bit_width;
165 + data->reg.bit_offset = cx->reg.bit_offset;
166 + data->reg.access_size = cx->reg.reserved;
167 + data->reg.address = cx->reg.address;
168 +
169 + /* Get dependency relationships */
170 + if (cx->csd_count) {
171 + printk("Wow! _CSD is found. Not support for now!\n");
172 + kfree(buf);
173 + return -EINVAL;
174 + } else {
175 + data->dpcnt = 0;
176 + set_xen_guest_handle(data->dp, NULL);
177 + }
178 +
179 + data++;
180 + count++;
181 + }
182 +
183 + if (!count) {
184 + printk("No available Cx info for cpu %d\n", pr->acpi_id);
185 + kfree(buf);
186 + return -EINVAL;
187 + }
188 +
189 + op.u.set_pminfo.power.count = count;
190 + op.u.set_pminfo.power.flags.bm_control = pr->flags.bm_control;
191 + op.u.set_pminfo.power.flags.bm_check = pr->flags.bm_check;
192 + op.u.set_pminfo.power.flags.has_cst = pr->flags.has_cst;
193 + op.u.set_pminfo.power.flags.power_setup_done = pr->flags.power_setup_done;
194 +
195 + set_xen_guest_handle(op.u.set_pminfo.power.states, buf);
196 + ret = HYPERVISOR_platform_op(&op);
197 + kfree(buf);
198 + return ret;
199 +}
200 +
201 +static int xen_px_notifier(struct acpi_processor *pr, int action)
202 +{
203 + int ret = -EINVAL;
204 + xen_platform_op_t op = {
205 + .cmd = XENPF_set_processor_pminfo,
206 + .interface_version = XENPF_INTERFACE_VERSION,
207 + .u.set_pminfo.id = pr->acpi_id,
208 + .u.set_pminfo.type = XEN_PM_PX,
209 + };
210 + struct xen_processor_performance *perf;
211 + struct xen_processor_px *states = NULL;
212 + struct acpi_processor_performance *px;
213 + struct acpi_psd_package *pdomain;
214 +
215 + if (!pr)
216 + return -EINVAL;
217 +
218 + perf = &op.u.set_pminfo.perf;
219 + px = pr->performance;
220 +
221 + switch(action) {
222 + case PROCESSOR_PM_CHANGE:
223 + /* ppc dynamic handle */
224 + perf->flags = XEN_PX_PPC;
225 + perf->platform_limit = pr->performance_platform_limit;
226 +
227 + ret = HYPERVISOR_platform_op(&op);
228 + break;
229 +
230 + case PROCESSOR_PM_INIT:
231 + /* px normal init */
232 + perf->flags = XEN_PX_PPC |
233 + XEN_PX_PCT |
234 + XEN_PX_PSS |
235 + XEN_PX_PSD;
236 +
237 + /* ppc */
238 + perf->platform_limit = pr->performance_platform_limit;
239 +
240 + /* pct */
241 + xen_convert_pct_reg(&perf->control_register, &px->control_register);
242 + xen_convert_pct_reg(&perf->status_register, &px->status_register);
243 +
244 + /* pss */
245 + perf->state_count = px->state_count;
246 + states = kzalloc(px->state_count*sizeof(xen_processor_px_t),GFP_KERNEL);
247 + if (!states)
248 + return -ENOMEM;
249 + xen_convert_pss_states(states, px->states, px->state_count);
250 + set_xen_guest_handle(perf->states, states);
251 +
252 + /* psd */
253 + pdomain = &px->domain_info;
254 + xen_convert_psd_pack(&perf->domain_info, pdomain);
255 + if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL)
256 + perf->shared_type = CPUFREQ_SHARED_TYPE_ALL;
257 + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY)
258 + perf->shared_type = CPUFREQ_SHARED_TYPE_ANY;
259 + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL)
260 + perf->shared_type = CPUFREQ_SHARED_TYPE_HW;
261 + else {
262 + ret = -ENODEV;
263 + kfree(states);
264 + break;
265 + }
266 +
267 + ret = HYPERVISOR_platform_op(&op);
268 + kfree(states);
269 + break;
270 +
271 + default:
272 + break;
273 + }
274 +
275 + return ret;
276 +}
277 +
278 +static int xen_tx_notifier(struct acpi_processor *pr, int action)
279 +{
280 + return -EINVAL;
281 +}
282 +static int xen_hotplug_notifier(struct acpi_processor *pr, int event)
283 +{
284 + return -EINVAL;
285 +}
286 +
287 +static struct processor_extcntl_ops xen_extcntl_ops = {
288 + .hotplug = xen_hotplug_notifier,
289 +};
290 +
291 +void arch_acpi_processor_init_extcntl(const struct processor_extcntl_ops **ops)
292 +{
293 + unsigned int pmbits = (xen_start_info->flags & SIF_PM_MASK) >> 8;
294 +
295 + if (!pmbits)
296 + return;
297 + if (pmbits & XEN_PROCESSOR_PM_CX)
298 + xen_extcntl_ops.pm_ops[PM_TYPE_IDLE] = xen_cx_notifier;
299 + if (pmbits & XEN_PROCESSOR_PM_PX)
300 + xen_extcntl_ops.pm_ops[PM_TYPE_PERF] = xen_px_notifier;
301 + if (pmbits & XEN_PROCESSOR_PM_TX)
302 + xen_extcntl_ops.pm_ops[PM_TYPE_THR] = xen_tx_notifier;
303 +
304 + *ops = &xen_extcntl_ops;
305 +}
306 +EXPORT_SYMBOL(arch_acpi_processor_init_extcntl);
307 Index: head-2008-11-25/arch/x86/kernel/acpi/sleep_32-xen.c
308 ===================================================================
309 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
310 +++ head-2008-11-25/arch/x86/kernel/acpi/sleep_32-xen.c 2008-04-15 09:29:41.000000000 +0200
311 @@ -0,0 +1,113 @@
312 +/*
313 + * sleep.c - x86-specific ACPI sleep support.
314 + *
315 + * Copyright (C) 2001-2003 Patrick Mochel
316 + * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
317 + */
318 +
319 +#include <linux/acpi.h>
320 +#include <linux/bootmem.h>
321 +#include <linux/dmi.h>
322 +#include <linux/cpumask.h>
323 +
324 +#include <asm/smp.h>
325 +
326 +#ifndef CONFIG_ACPI_PV_SLEEP
327 +/* address in low memory of the wakeup routine. */
328 +unsigned long acpi_wakeup_address = 0;
329 +unsigned long acpi_video_flags;
330 +extern char wakeup_start, wakeup_end;
331 +
332 +extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
333 +#endif
334 +
335 +/**
336 + * acpi_save_state_mem - save kernel state
337 + *
338 + * Create an identity mapped page table and copy the wakeup routine to
339 + * low memory.
340 + */
341 +int acpi_save_state_mem(void)
342 +{
343 +#ifndef CONFIG_ACPI_PV_SLEEP
344 + if (!acpi_wakeup_address)
345 + return 1;
346 + memcpy((void *)acpi_wakeup_address, &wakeup_start,
347 + &wakeup_end - &wakeup_start);
348 + acpi_copy_wakeup_routine(acpi_wakeup_address);
349 +#endif
350 + return 0;
351 +}
352 +
353 +/*
354 + * acpi_restore_state - undo effects of acpi_save_state_mem
355 + */
356 +void acpi_restore_state_mem(void)
357 +{
358 +}
359 +
360 +/**
361 + * acpi_reserve_bootmem - do _very_ early ACPI initialisation
362 + *
363 + * We allocate a page from the first 1MB of memory for the wakeup
364 + * routine for when we come back from a sleep state. The
365 + * runtime allocator allows specification of <16MB pages, but not
366 + * <1MB pages.
367 + */
368 +void __init acpi_reserve_bootmem(void)
369 +{
370 +#ifndef CONFIG_ACPI_PV_SLEEP
371 + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
372 + printk(KERN_ERR
373 + "ACPI: Wakeup code way too big, S3 disabled.\n");
374 + return;
375 + }
376 +
377 + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
378 + if (!acpi_wakeup_address)
379 + printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
380 +#endif
381 +}
382 +
383 +#ifndef CONFIG_ACPI_PV_SLEEP
384 +static int __init acpi_sleep_setup(char *str)
385 +{
386 + while ((str != NULL) && (*str != '\0')) {
387 + if (strncmp(str, "s3_bios", 7) == 0)
388 + acpi_video_flags = 1;
389 + if (strncmp(str, "s3_mode", 7) == 0)
390 + acpi_video_flags |= 2;
391 + str = strchr(str, ',');
392 + if (str != NULL)
393 + str += strspn(str, ", \t");
394 + }
395 + return 1;
396 +}
397 +
398 +__setup("acpi_sleep=", acpi_sleep_setup);
399 +
400 +static __init int reset_videomode_after_s3(struct dmi_system_id *d)
401 +{
402 + acpi_video_flags |= 2;
403 + return 0;
404 +}
405 +
406 +static __initdata struct dmi_system_id acpisleep_dmi_table[] = {
407 + { /* Reset video mode after returning from ACPI S3 sleep */
408 + .callback = reset_videomode_after_s3,
409 + .ident = "Toshiba Satellite 4030cdt",
410 + .matches = {
411 + DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
412 + },
413 + },
414 + {}
415 +};
416 +
417 +static int __init acpisleep_dmi_init(void)
418 +{
419 + dmi_check_system(acpisleep_dmi_table);
420 + return 0;
421 +}
422 +
423 +core_initcall(acpisleep_dmi_init);
424 +#endif /* CONFIG_ACPI_PV_SLEEP */
425 Index: head-2008-11-25/arch/x86/kernel/apic_32-xen.c
426 ===================================================================
427 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
428 +++ head-2008-11-25/arch/x86/kernel/apic_32-xen.c 2007-06-12 13:12:48.000000000 +0200
429 @@ -0,0 +1,155 @@
430 +/*
431 + * Local APIC handling, local APIC timers
432 + *
433 + * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
434 + *
435 + * Fixes
436 + * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
437 + * thanks to Eric Gilmore
438 + * and Rolf G. Tews
439 + * for testing these extensively.
440 + * Maciej W. Rozycki : Various updates and fixes.
441 + * Mikael Pettersson : Power Management for UP-APIC.
442 + * Pavel Machek and
443 + * Mikael Pettersson : PM converted to driver model.
444 + */
445 +
446 +#include <linux/init.h>
447 +
448 +#include <linux/mm.h>
449 +#include <linux/delay.h>
450 +#include <linux/bootmem.h>
451 +#include <linux/smp_lock.h>
452 +#include <linux/interrupt.h>
453 +#include <linux/mc146818rtc.h>
454 +#include <linux/kernel_stat.h>
455 +#include <linux/sysdev.h>
456 +#include <linux/cpu.h>
457 +#include <linux/module.h>
458 +
459 +#include <asm/atomic.h>
460 +#include <asm/smp.h>
461 +#include <asm/mtrr.h>
462 +#include <asm/mpspec.h>
463 +#include <asm/desc.h>
464 +#include <asm/arch_hooks.h>
465 +#include <asm/hpet.h>
466 +#include <asm/i8253.h>
467 +#include <asm/nmi.h>
468 +
469 +#include <mach_apic.h>
470 +#include <mach_apicdef.h>
471 +#include <mach_ipi.h>
472 +
473 +#include "io_ports.h"
474 +
475 +#ifndef CONFIG_XEN
476 +/*
477 + * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
478 + * IPIs in place of local APIC timers
479 + */
480 +static cpumask_t timer_bcast_ipi;
481 +#endif
482 +
483 +/*
484 + * Knob to control our willingness to enable the local APIC.
485 + */
486 +int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
487 +
488 +/*
489 + * Debug level
490 + */
491 +int apic_verbosity;
492 +
493 +#ifndef CONFIG_XEN
494 +static int modern_apic(void)
495 +{
496 + unsigned int lvr, version;
497 + /* AMD systems use old APIC versions, so check the CPU */
498 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
499 + boot_cpu_data.x86 >= 0xf)
500 + return 1;
501 + lvr = apic_read(APIC_LVR);
502 + version = GET_APIC_VERSION(lvr);
503 + return version >= 0x14;
504 +}
505 +#endif /* !CONFIG_XEN */
506 +
507 +/*
508 + * 'what should we do if we get a hw irq event on an illegal vector'.
509 + * each architecture has to answer this themselves.
510 + */
511 +void ack_bad_irq(unsigned int irq)
512 +{
513 + printk("unexpected IRQ trap at vector %02x\n", irq);
514 + /*
515 + * Currently unexpected vectors happen only on SMP and APIC.
516 + * We _must_ ack these because every local APIC has only N
517 + * irq slots per priority level, and a 'hanging, unacked' IRQ
518 + * holds up an irq slot - in excessive cases (when multiple
519 + * unexpected vectors occur) that might lock up the APIC
520 + * completely.
521 + * But only ack when the APIC is enabled -AK
522 + */
523 + if (cpu_has_apic)
524 + ack_APIC_irq();
525 +}
526 +
527 +int get_physical_broadcast(void)
528 +{
529 + return 0xff;
530 +}
531 +
532 +#ifndef CONFIG_XEN
533 +#ifndef CONFIG_SMP
534 +static void up_apic_timer_interrupt_call(struct pt_regs *regs)
535 +{
536 + int cpu = smp_processor_id();
537 +
538 + /*
539 + * the NMI deadlock-detector uses this.
540 + */
541 + per_cpu(irq_stat, cpu).apic_timer_irqs++;
542 +
543 + smp_local_timer_interrupt(regs);
544 +}
545 +#endif
546 +
547 +void smp_send_timer_broadcast_ipi(struct pt_regs *regs)
548 +{
549 + cpumask_t mask;
550 +
551 + cpus_and(mask, cpu_online_map, timer_bcast_ipi);
552 + if (!cpus_empty(mask)) {
553 +#ifdef CONFIG_SMP
554 + send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
555 +#else
556 + /*
557 + * We can directly call the apic timer interrupt handler
558 + * in UP case. Minus all irq related functions
559 + */
560 + up_apic_timer_interrupt_call(regs);
561 +#endif
562 + }
563 +}
564 +#endif
565 +
566 +int setup_profiling_timer(unsigned int multiplier)
567 +{
568 + return -EINVAL;
569 +}
570 +
571 +/*
572 + * This initializes the IO-APIC and APIC hardware if this is
573 + * a UP kernel.
574 + */
575 +int __init APIC_init_uniprocessor (void)
576 +{
577 +#ifdef CONFIG_X86_IO_APIC
578 + if (smp_found_config)
579 + if (!skip_ioapic_setup && nr_ioapics)
580 + setup_IO_APIC();
581 +#endif
582 +
583 + return 0;
584 +}
585 Index: head-2008-11-25/arch/x86/kernel/cpu/common-xen.c
586 ===================================================================
587 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
588 +++ head-2008-11-25/arch/x86/kernel/cpu/common-xen.c 2007-12-10 08:47:31.000000000 +0100
589 @@ -0,0 +1,743 @@
590 +#include <linux/init.h>
591 +#include <linux/string.h>
592 +#include <linux/delay.h>
593 +#include <linux/smp.h>
594 +#include <linux/module.h>
595 +#include <linux/percpu.h>
596 +#include <linux/bootmem.h>
597 +#include <asm/semaphore.h>
598 +#include <asm/processor.h>
599 +#include <asm/i387.h>
600 +#include <asm/msr.h>
601 +#include <asm/io.h>
602 +#include <asm/mmu_context.h>
603 +#include <asm/mtrr.h>
604 +#include <asm/mce.h>
605 +#ifdef CONFIG_X86_LOCAL_APIC
606 +#include <asm/mpspec.h>
607 +#include <asm/apic.h>
608 +#include <mach_apic.h>
609 +#else
610 +#ifdef CONFIG_XEN
611 +#define phys_pkg_id(a,b) a
612 +#endif
613 +#endif
614 +#include <asm/hypervisor.h>
615 +
616 +#include "cpu.h"
617 +
618 +DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
619 +EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
620 +
621 +#ifndef CONFIG_XEN
622 +DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
623 +EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
624 +#endif
625 +
626 +static int cachesize_override __cpuinitdata = -1;
627 +static int disable_x86_fxsr __cpuinitdata;
628 +static int disable_x86_serial_nr __cpuinitdata = 1;
629 +static int disable_x86_sep __cpuinitdata;
630 +
631 +struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
632 +
633 +extern int disable_pse;
634 +
635 +static void default_init(struct cpuinfo_x86 * c)
636 +{
637 + /* Not much we can do here... */
638 + /* Check if at least it has cpuid */
639 + if (c->cpuid_level == -1) {
640 + /* No cpuid. It must be an ancient CPU */
641 + if (c->x86 == 4)
642 + strcpy(c->x86_model_id, "486");
643 + else if (c->x86 == 3)
644 + strcpy(c->x86_model_id, "386");
645 + }
646 +}
647 +
648 +static struct cpu_dev default_cpu = {
649 + .c_init = default_init,
650 + .c_vendor = "Unknown",
651 +};
652 +static struct cpu_dev * this_cpu = &default_cpu;
653 +
654 +static int __init cachesize_setup(char *str)
655 +{
656 + get_option (&str, &cachesize_override);
657 + return 1;
658 +}
659 +__setup("cachesize=", cachesize_setup);
660 +
661 +int __cpuinit get_model_name(struct cpuinfo_x86 *c)
662 +{
663 + unsigned int *v;
664 + char *p, *q;
665 +
666 + if (cpuid_eax(0x80000000) < 0x80000004)
667 + return 0;
668 +
669 + v = (unsigned int *) c->x86_model_id;
670 + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
671 + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
672 + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
673 + c->x86_model_id[48] = 0;
674 +
675 + /* Intel chips right-justify this string for some dumb reason;
676 + undo that brain damage */
677 + p = q = &c->x86_model_id[0];
678 + while ( *p == ' ' )
679 + p++;
680 + if ( p != q ) {
681 + while ( *p )
682 + *q++ = *p++;
683 + while ( q <= &c->x86_model_id[48] )
684 + *q++ = '\0'; /* Zero-pad the rest */
685 + }
686 +
687 + return 1;
688 +}
689 +
690 +
691 +void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
692 +{
693 + unsigned int n, dummy, ecx, edx, l2size;
694 +
695 + n = cpuid_eax(0x80000000);
696 +
697 + if (n >= 0x80000005) {
698 + cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
699 + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
700 + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
701 + c->x86_cache_size=(ecx>>24)+(edx>>24);
702 + }
703 +
704 + if (n < 0x80000006) /* Some chips just has a large L1. */
705 + return;
706 +
707 + ecx = cpuid_ecx(0x80000006);
708 + l2size = ecx >> 16;
709 +
710 + /* do processor-specific cache resizing */
711 + if (this_cpu->c_size_cache)
712 + l2size = this_cpu->c_size_cache(c,l2size);
713 +
714 + /* Allow user to override all this if necessary. */
715 + if (cachesize_override != -1)
716 + l2size = cachesize_override;
717 +
718 + if ( l2size == 0 )
719 + return; /* Again, no L2 cache is possible */
720 +
721 + c->x86_cache_size = l2size;
722 +
723 + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
724 + l2size, ecx & 0xFF);
725 +}
726 +
727 +/* Naming convention should be: <Name> [(<Codename>)] */
728 +/* This table only is used unless init_<vendor>() below doesn't set it; */
729 +/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
730 +
731 +/* Look up CPU names by table lookup. */
732 +static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
733 +{
734 + struct cpu_model_info *info;
735 +
736 + if ( c->x86_model >= 16 )
737 + return NULL; /* Range check */
738 +
739 + if (!this_cpu)
740 + return NULL;
741 +
742 + info = this_cpu->c_models;
743 +
744 + while (info && info->family) {
745 + if (info->family == c->x86)
746 + return info->model_names[c->x86_model];
747 + info++;
748 + }
749 + return NULL; /* Not found */
750 +}
751 +
752 +
753 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
754 +{
755 + char *v = c->x86_vendor_id;
756 + int i;
757 + static int printed;
758 +
759 + for (i = 0; i < X86_VENDOR_NUM; i++) {
760 + if (cpu_devs[i]) {
761 + if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
762 + (cpu_devs[i]->c_ident[1] &&
763 + !strcmp(v,cpu_devs[i]->c_ident[1]))) {
764 + c->x86_vendor = i;
765 + if (!early)
766 + this_cpu = cpu_devs[i];
767 + return;
768 + }
769 + }
770 + }
771 + if (!printed) {
772 + printed++;
773 + printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
774 + printk(KERN_ERR "CPU: Your system may be unstable.\n");
775 + }
776 + c->x86_vendor = X86_VENDOR_UNKNOWN;
777 + this_cpu = &default_cpu;
778 +}
779 +
780 +
781 +static int __init x86_fxsr_setup(char * s)
782 +{
783 + disable_x86_fxsr = 1;
784 + return 1;
785 +}
786 +__setup("nofxsr", x86_fxsr_setup);
787 +
788 +
789 +static int __init x86_sep_setup(char * s)
790 +{
791 + disable_x86_sep = 1;
792 + return 1;
793 +}
794 +__setup("nosep", x86_sep_setup);
795 +
796 +
797 +/* Standard macro to see if a specific flag is changeable */
798 +static inline int flag_is_changeable_p(u32 flag)
799 +{
800 + u32 f1, f2;
801 +
802 + asm("pushfl\n\t"
803 + "pushfl\n\t"
804 + "popl %0\n\t"
805 + "movl %0,%1\n\t"
806 + "xorl %2,%0\n\t"
807 + "pushl %0\n\t"
808 + "popfl\n\t"
809 + "pushfl\n\t"
810 + "popl %0\n\t"
811 + "popfl\n\t"
812 + : "=&r" (f1), "=&r" (f2)
813 + : "ir" (flag));
814 +
815 + return ((f1^f2) & flag) != 0;
816 +}
817 +
818 +
819 +/* Probe for the CPUID instruction */
820 +static int __cpuinit have_cpuid_p(void)
821 +{
822 + return flag_is_changeable_p(X86_EFLAGS_ID);
823 +}
824 +
825 +/* Do minimum CPU detection early.
826 + Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
827 + The others are not touched to avoid unwanted side effects.
828 +
829 + WARNING: this function is only called on the BP. Don't add code here
830 + that is supposed to run on all CPUs. */
831 +static void __init early_cpu_detect(void)
832 +{
833 + struct cpuinfo_x86 *c = &boot_cpu_data;
834 +
835 + c->x86_cache_alignment = 32;
836 +
837 + if (!have_cpuid_p())
838 + return;
839 +
840 + /* Get vendor name */
841 + cpuid(0x00000000, &c->cpuid_level,
842 + (int *)&c->x86_vendor_id[0],
843 + (int *)&c->x86_vendor_id[8],
844 + (int *)&c->x86_vendor_id[4]);
845 +
846 + get_cpu_vendor(c, 1);
847 +
848 + c->x86 = 4;
849 + if (c->cpuid_level >= 0x00000001) {
850 + u32 junk, tfms, cap0, misc;
851 + cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
852 + c->x86 = (tfms >> 8) & 15;
853 + c->x86_model = (tfms >> 4) & 15;
854 + if (c->x86 == 0xf)
855 + c->x86 += (tfms >> 20) & 0xff;
856 + if (c->x86 >= 0x6)
857 + c->x86_model += ((tfms >> 16) & 0xF) << 4;
858 + c->x86_mask = tfms & 15;
859 + if (cap0 & (1<<19))
860 + c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
861 + }
862 +}
863 +
864 +void __cpuinit generic_identify(struct cpuinfo_x86 * c)
865 +{
866 + u32 tfms, xlvl;
867 + int ebx;
868 +
869 + if (have_cpuid_p()) {
870 + /* Get vendor name */
871 + cpuid(0x00000000, &c->cpuid_level,
872 + (int *)&c->x86_vendor_id[0],
873 + (int *)&c->x86_vendor_id[8],
874 + (int *)&c->x86_vendor_id[4]);
875 +
876 + get_cpu_vendor(c, 0);
877 + /* Initialize the standard set of capabilities */
878 + /* Note that the vendor-specific code below might override */
879 +
880 + /* Intel-defined flags: level 0x00000001 */
881 + if ( c->cpuid_level >= 0x00000001 ) {
882 + u32 capability, excap;
883 + cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
884 + c->x86_capability[0] = capability;
885 + c->x86_capability[4] = excap;
886 + c->x86 = (tfms >> 8) & 15;
887 + c->x86_model = (tfms >> 4) & 15;
888 + if (c->x86 == 0xf)
889 + c->x86 += (tfms >> 20) & 0xff;
890 + if (c->x86 >= 0x6)
891 + c->x86_model += ((tfms >> 16) & 0xF) << 4;
892 + c->x86_mask = tfms & 15;
893 +#ifdef CONFIG_X86_HT
894 + c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
895 +#else
896 + c->apicid = (ebx >> 24) & 0xFF;
897 +#endif
898 + } else {
899 + /* Have CPUID level 0 only - unheard of */
900 + c->x86 = 4;
901 + }
902 +
903 + /* AMD-defined flags: level 0x80000001 */
904 + xlvl = cpuid_eax(0x80000000);
905 + if ( (xlvl & 0xffff0000) == 0x80000000 ) {
906 + if ( xlvl >= 0x80000001 ) {
907 + c->x86_capability[1] = cpuid_edx(0x80000001);
908 + c->x86_capability[6] = cpuid_ecx(0x80000001);
909 + }
910 + if ( xlvl >= 0x80000004 )
911 + get_model_name(c); /* Default name */
912 + }
913 + }
914 +
915 + early_intel_workaround(c);
916 +
917 +#ifdef CONFIG_X86_HT
918 + c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
919 +#endif
920 +}
921 +
922 +static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
923 +{
924 + if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
925 + /* Disable processor serial number */
926 + unsigned long lo,hi;
927 + rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
928 + lo |= 0x200000;
929 + wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
930 + printk(KERN_NOTICE "CPU serial number disabled.\n");
931 + clear_bit(X86_FEATURE_PN, c->x86_capability);
932 +
933 + /* Disabling the serial number may affect the cpuid level */
934 + c->cpuid_level = cpuid_eax(0);
935 + }
936 +}
937 +
938 +static int __init x86_serial_nr_setup(char *s)
939 +{
940 + disable_x86_serial_nr = 0;
941 + return 1;
942 +}
943 +__setup("serialnumber", x86_serial_nr_setup);
944 +
945 +
946 +
947 +/*
948 + * This does the hard work of actually picking apart the CPU stuff...
949 + */
950 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
951 +{
952 + int i;
953 +
954 + c->loops_per_jiffy = loops_per_jiffy;
955 + c->x86_cache_size = -1;
956 + c->x86_vendor = X86_VENDOR_UNKNOWN;
957 + c->cpuid_level = -1; /* CPUID not detected */
958 + c->x86_model = c->x86_mask = 0; /* So far unknown... */
959 + c->x86_vendor_id[0] = '\0'; /* Unset */
960 + c->x86_model_id[0] = '\0'; /* Unset */
961 + c->x86_max_cores = 1;
962 + memset(&c->x86_capability, 0, sizeof c->x86_capability);
963 +
964 + if (!have_cpuid_p()) {
965 + /* First of all, decide if this is a 486 or higher */
966 + /* It's a 486 if we can modify the AC flag */
967 + if ( flag_is_changeable_p(X86_EFLAGS_AC) )
968 + c->x86 = 4;
969 + else
970 + c->x86 = 3;
971 + }
972 +
973 + generic_identify(c);
974 +
975 + printk(KERN_DEBUG "CPU: After generic identify, caps:");
976 + for (i = 0; i < NCAPINTS; i++)
977 + printk(" %08lx", c->x86_capability[i]);
978 + printk("\n");
979 +
980 + if (this_cpu->c_identify) {
981 + this_cpu->c_identify(c);
982 +
983 + printk(KERN_DEBUG "CPU: After vendor identify, caps:");
984 + for (i = 0; i < NCAPINTS; i++)
985 + printk(" %08lx", c->x86_capability[i]);
986 + printk("\n");
987 + }
988 +
989 + /*
990 + * Vendor-specific initialization. In this section we
991 + * canonicalize the feature flags, meaning if there are
992 + * features a certain CPU supports which CPUID doesn't
993 + * tell us, CPUID claiming incorrect flags, or other bugs,
994 + * we handle them here.
995 + *
996 + * At the end of this section, c->x86_capability better
997 + * indicate the features this CPU genuinely supports!
998 + */
999 + if (this_cpu->c_init)
1000 + this_cpu->c_init(c);
1001 +
1002 + /* Disable the PN if appropriate */
1003 + squash_the_stupid_serial_number(c);
1004 +
1005 + /*
1006 + * The vendor-specific functions might have changed features. Now
1007 + * we do "generic changes."
1008 + */
1009 +
1010 + /* TSC disabled? */
1011 + if ( tsc_disable )
1012 + clear_bit(X86_FEATURE_TSC, c->x86_capability);
1013 +
1014 + /* FXSR disabled? */
1015 + if (disable_x86_fxsr) {
1016 + clear_bit(X86_FEATURE_FXSR, c->x86_capability);
1017 + clear_bit(X86_FEATURE_XMM, c->x86_capability);
1018 + }
1019 +
1020 + /* SEP disabled? */
1021 + if (disable_x86_sep)
1022 + clear_bit(X86_FEATURE_SEP, c->x86_capability);
1023 +
1024 + if (disable_pse)
1025 + clear_bit(X86_FEATURE_PSE, c->x86_capability);
1026 +
1027 + /* If the model name is still unset, do table lookup. */
1028 + if ( !c->x86_model_id[0] ) {
1029 + char *p;
1030 + p = table_lookup_model(c);
1031 + if ( p )
1032 + strcpy(c->x86_model_id, p);
1033 + else
1034 + /* Last resort... */
1035 + sprintf(c->x86_model_id, "%02x/%02x",
1036 + c->x86, c->x86_model);
1037 + }
1038 +
1039 + /* Now the feature flags better reflect actual CPU features! */
1040 +
1041 + printk(KERN_DEBUG "CPU: After all inits, caps:");
1042 + for (i = 0; i < NCAPINTS; i++)
1043 + printk(" %08lx", c->x86_capability[i]);
1044 + printk("\n");
1045 +
1046 + /*
1047 + * On SMP, boot_cpu_data holds the common feature set between
1048 + * all CPUs; so make sure that we indicate which features are
1049 + * common between the CPUs. The first time this routine gets
1050 + * executed, c == &boot_cpu_data.
1051 + */
1052 + if ( c != &boot_cpu_data ) {
1053 + /* AND the already accumulated flags with these */
1054 + for ( i = 0 ; i < NCAPINTS ; i++ )
1055 + boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
1056 + }
1057 +
1058 + /* Init Machine Check Exception if available. */
1059 + mcheck_init(c);
1060 +
1061 + if (c == &boot_cpu_data)
1062 + sysenter_setup();
1063 + enable_sep_cpu();
1064 +
1065 + if (c == &boot_cpu_data)
1066 + mtrr_bp_init();
1067 + else
1068 + mtrr_ap_init();
1069 +}
1070 +
1071 +#ifdef CONFIG_X86_HT
1072 +void __cpuinit detect_ht(struct cpuinfo_x86 *c)
1073 +{
1074 + u32 eax, ebx, ecx, edx;
1075 + int index_msb, core_bits;
1076 +
1077 + cpuid(1, &eax, &ebx, &ecx, &edx);
1078 +
1079 + if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
1080 + return;
1081 +
1082 + smp_num_siblings = (ebx & 0xff0000) >> 16;
1083 +
1084 + if (smp_num_siblings == 1) {
1085 + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
1086 + } else if (smp_num_siblings > 1 ) {
1087 +
1088 + if (smp_num_siblings > NR_CPUS) {
1089 + printk(KERN_WARNING "CPU: Unsupported number of the "
1090 + "siblings %d", smp_num_siblings);
1091 + smp_num_siblings = 1;
1092 + return;
1093 + }
1094 +
1095 + index_msb = get_count_order(smp_num_siblings);
1096 + c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
1097 +
1098 + printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
1099 + c->phys_proc_id);
1100 +
1101 + smp_num_siblings = smp_num_siblings / c->x86_max_cores;
1102 +
1103 + index_msb = get_count_order(smp_num_siblings) ;
1104 +
1105 + core_bits = get_count_order(c->x86_max_cores);
1106 +
1107 + c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
1108 + ((1 << core_bits) - 1);
1109 +
1110 + if (c->x86_max_cores > 1)
1111 + printk(KERN_INFO "CPU: Processor Core ID: %d\n",
1112 + c->cpu_core_id);
1113 + }
1114 +}
1115 +#endif
1116 +
1117 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1118 +{
1119 + char *vendor = NULL;
1120 +
1121 + if (c->x86_vendor < X86_VENDOR_NUM)
1122 + vendor = this_cpu->c_vendor;
1123 + else if (c->cpuid_level >= 0)
1124 + vendor = c->x86_vendor_id;
1125 +
1126 + if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
1127 + printk("%s ", vendor);
1128 +
1129 + if (!c->x86_model_id[0])
1130 + printk("%d86", c->x86);
1131 + else
1132 + printk("%s", c->x86_model_id);
1133 +
1134 + if (c->x86_mask || c->cpuid_level >= 0)
1135 + printk(" stepping %02x\n", c->x86_mask);
1136 + else
1137 + printk("\n");
1138 +}
1139 +
1140 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
1141 +
1142 +/* This is hacky. :)
1143 + * We're emulating future behavior.
1144 + * In the future, the cpu-specific init functions will be called implicitly
1145 + * via the magic of initcalls.
1146 + * They will insert themselves into the cpu_devs structure.
1147 + * Then, when cpu_init() is called, we can just iterate over that array.
1148 + */
1149 +
1150 +extern int intel_cpu_init(void);
1151 +extern int cyrix_init_cpu(void);
1152 +extern int nsc_init_cpu(void);
1153 +extern int amd_init_cpu(void);
1154 +extern int centaur_init_cpu(void);
1155 +extern int transmeta_init_cpu(void);
1156 +extern int rise_init_cpu(void);
1157 +extern int nexgen_init_cpu(void);
1158 +extern int umc_init_cpu(void);
1159 +
1160 +void __init early_cpu_init(void)
1161 +{
1162 + intel_cpu_init();
1163 + cyrix_init_cpu();
1164 + nsc_init_cpu();
1165 + amd_init_cpu();
1166 + centaur_init_cpu();
1167 + transmeta_init_cpu();
1168 + rise_init_cpu();
1169 + nexgen_init_cpu();
1170 + umc_init_cpu();
1171 + early_cpu_detect();
1172 +
1173 +#ifdef CONFIG_DEBUG_PAGEALLOC
1174 + /* pse is not compatible with on-the-fly unmapping,
1175 + * disable it even if the cpus claim to support it.
1176 + */
1177 + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
1178 + disable_pse = 1;
1179 +#endif
1180 +}
1181 +
1182 +static void __cpuinit cpu_gdt_init(const struct Xgt_desc_struct *gdt_descr)
1183 +{
1184 + unsigned long frames[16];
1185 + unsigned long va;
1186 + int f;
1187 +
1188 + for (va = gdt_descr->address, f = 0;
1189 + va < gdt_descr->address + gdt_descr->size;
1190 + va += PAGE_SIZE, f++) {
1191 + frames[f] = virt_to_mfn(va);
1192 + make_lowmem_page_readonly(
1193 + (void *)va, XENFEAT_writable_descriptor_tables);
1194 + }
1195 + if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) / 8))
1196 + BUG();
1197 +}
1198 +
1199 +/*
1200 + * cpu_init() initializes state that is per-CPU. Some data is already
1201 + * initialized (naturally) in the bootstrap process, such as the GDT
1202 + * and IDT. We reload them nevertheless, this function acts as a
1203 + * 'CPU state barrier', nothing should get across.
1204 + */
1205 +void __cpuinit cpu_init(void)
1206 +{
1207 + int cpu = smp_processor_id();
1208 +#ifndef CONFIG_X86_NO_TSS
1209 + struct tss_struct * t = &per_cpu(init_tss, cpu);
1210 +#endif
1211 + struct thread_struct *thread = &current->thread;
1212 + struct desc_struct *gdt;
1213 + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
1214 +
1215 + if (cpu_test_and_set(cpu, cpu_initialized)) {
1216 + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
1217 + for (;;) local_irq_enable();
1218 + }
1219 + printk(KERN_INFO "Initializing CPU#%d\n", cpu);
1220 +
1221 + if (cpu_has_vme || cpu_has_de)
1222 + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1223 + if (tsc_disable && cpu_has_tsc) {
1224 + printk(KERN_NOTICE "Disabling TSC...\n");
1225 + /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
1226 + clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
1227 + set_in_cr4(X86_CR4_TSD);
1228 + }
1229 +
1230 +#ifndef CONFIG_XEN
1231 + /* The CPU hotplug case */
1232 + if (cpu_gdt_descr->address) {
1233 + gdt = (struct desc_struct *)cpu_gdt_descr->address;
1234 + memset(gdt, 0, PAGE_SIZE);
1235 + goto old_gdt;
1236 + }
1237 + /*
1238 + * This is a horrible hack to allocate the GDT. The problem
1239 + * is that cpu_init() is called really early for the boot CPU
1240 + * (and hence needs bootmem) but much later for the secondary
1241 + * CPUs, when bootmem will have gone away
1242 + */
1243 + if (NODE_DATA(0)->bdata->node_bootmem_map) {
1244 + gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
1245 + /* alloc_bootmem_pages panics on failure, so no check */
1246 + memset(gdt, 0, PAGE_SIZE);
1247 + } else {
1248 + gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
1249 + if (unlikely(!gdt)) {
1250 + printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
1251 + for (;;)
1252 + local_irq_enable();
1253 + }
1254 + }
1255 +old_gdt:
1256 + /*
1257 + * Initialize the per-CPU GDT with the boot GDT,
1258 + * and set up the GDT descriptor:
1259 + */
1260 + memcpy(gdt, cpu_gdt_table, GDT_SIZE);
1261 +
1262 + /* Set up GDT entry for 16bit stack */
1263 + *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
1264 + ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
1265 + ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
1266 + (CPU_16BIT_STACK_SIZE - 1);
1267 +
1268 + cpu_gdt_descr->size = GDT_SIZE - 1;
1269 + cpu_gdt_descr->address = (unsigned long)gdt;
1270 +#else
1271 + if (cpu == 0 && cpu_gdt_descr->address == 0) {
1272 + gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
1273 + /* alloc_bootmem_pages panics on failure, so no check */
1274 + memset(gdt, 0, PAGE_SIZE);
1275 +
1276 + memcpy(gdt, cpu_gdt_table, GDT_SIZE);
1277 +
1278 + cpu_gdt_descr->size = GDT_SIZE;
1279 + cpu_gdt_descr->address = (unsigned long)gdt;
1280 + }
1281 +#endif
1282 +
1283 + cpu_gdt_init(cpu_gdt_descr);
1284 +
1285 + /*
1286 + * Set up and load the per-CPU TSS and LDT
1287 + */
1288 + atomic_inc(&init_mm.mm_count);
1289 + current->active_mm = &init_mm;
1290 + if (current->mm)
1291 + BUG();
1292 + enter_lazy_tlb(&init_mm, current);
1293 +
1294 + load_esp0(t, thread);
1295 +
1296 + load_LDT(&init_mm.context);
1297 +
1298 +#ifdef CONFIG_DOUBLEFAULT
1299 + /* Set up doublefault TSS pointer in the GDT */
1300 + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
1301 +#endif
1302 +
1303 + /* Clear %fs and %gs. */
1304 + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
1305 +
1306 + /* Clear all 6 debug registers: */
1307 + set_debugreg(0, 0);
1308 + set_debugreg(0, 1);
1309 + set_debugreg(0, 2);
1310 + set_debugreg(0, 3);
1311 + set_debugreg(0, 6);
1312 + set_debugreg(0, 7);
1313 +
1314 + /*
1315 + * Force FPU initialization:
1316 + */
1317 + current_thread_info()->status = 0;
1318 + clear_used_math();
1319 + mxcsr_feature_mask_init();
1320 +}
1321 +
1322 +#ifdef CONFIG_HOTPLUG_CPU
1323 +void __cpuinit cpu_uninit(void)
1324 +{
1325 + int cpu = raw_smp_processor_id();
1326 + cpu_clear(cpu, cpu_initialized);
1327 +
1328 + /* lazy TLB state */
1329 + per_cpu(cpu_tlbstate, cpu).state = 0;
1330 + per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
1331 +}
1332 +#endif
1333 Index: head-2008-11-25/arch/x86/kernel/cpu/mtrr/main-xen.c
1334 ===================================================================
1335 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
1336 +++ head-2008-11-25/arch/x86/kernel/cpu/mtrr/main-xen.c 2008-01-28 12:24:18.000000000 +0100
1337 @@ -0,0 +1,198 @@
1338 +#include <linux/init.h>
1339 +#include <linux/proc_fs.h>
1340 +#include <linux/ctype.h>
1341 +#include <linux/module.h>
1342 +#include <linux/seq_file.h>
1343 +#include <asm/uaccess.h>
1344 +#include <linux/mutex.h>
1345 +
1346 +#include <asm/mtrr.h>
1347 +#include "mtrr.h"
1348 +
1349 +static DEFINE_MUTEX(mtrr_mutex);
1350 +
1351 +void generic_get_mtrr(unsigned int reg, unsigned long *base,
1352 + unsigned int *size, mtrr_type * type)
1353 +{
1354 + struct xen_platform_op op;
1355 +
1356 + op.cmd = XENPF_read_memtype;
1357 + op.u.read_memtype.reg = reg;
1358 + if (unlikely(HYPERVISOR_platform_op(&op)))
1359 + memset(&op.u.read_memtype, 0, sizeof(op.u.read_memtype));
1360 +
1361 + *size = op.u.read_memtype.nr_mfns;
1362 + *base = op.u.read_memtype.mfn;
1363 + *type = op.u.read_memtype.type;
1364 +}
1365 +
1366 +struct mtrr_ops generic_mtrr_ops = {
1367 + .use_intel_if = 1,
1368 + .get = generic_get_mtrr,
1369 +};
1370 +
1371 +struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
1372 +unsigned int num_var_ranges;
1373 +unsigned int *usage_table;
1374 +
1375 +static void __init set_num_var_ranges(void)
1376 +{
1377 + struct xen_platform_op op;
1378 +
1379 + for (num_var_ranges = 0; ; num_var_ranges++) {
1380 + op.cmd = XENPF_read_memtype;
1381 + op.u.read_memtype.reg = num_var_ranges;
1382 + if (HYPERVISOR_platform_op(&op) != 0)
1383 + break;
1384 + }
1385 +}
1386 +
1387 +static void __init init_table(void)
1388 +{
1389 + int i, max;
1390 +
1391 + max = num_var_ranges;
1392 + if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
1393 + == NULL) {
1394 + printk(KERN_ERR "mtrr: could not allocate\n");
1395 + return;
1396 + }
1397 + for (i = 0; i < max; i++)
1398 + usage_table[i] = 0;
1399 +}
1400 +
1401 +int mtrr_add_page(unsigned long base, unsigned long size,
1402 + unsigned int type, char increment)
1403 +{
1404 + int error;
1405 + struct xen_platform_op op;
1406 +
1407 + mutex_lock(&mtrr_mutex);
1408 +
1409 + op.cmd = XENPF_add_memtype;
1410 + op.u.add_memtype.mfn = base;
1411 + op.u.add_memtype.nr_mfns = size;
1412 + op.u.add_memtype.type = type;
1413 + error = HYPERVISOR_platform_op(&op);
1414 + if (error) {
1415 + mutex_unlock(&mtrr_mutex);
1416 + BUG_ON(error > 0);
1417 + return error;
1418 + }
1419 +
1420 + if (increment)
1421 + ++usage_table[op.u.add_memtype.reg];
1422 +
1423 + mutex_unlock(&mtrr_mutex);
1424 +
1425 + return op.u.add_memtype.reg;
1426 +}
1427 +
1428 +static int mtrr_check(unsigned long base, unsigned long size)
1429 +{
1430 + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
1431 + printk(KERN_WARNING
1432 + "mtrr: size and base must be multiples of 4 kiB\n");
1433 + printk(KERN_DEBUG
1434 + "mtrr: size: 0x%lx base: 0x%lx\n", size, base);
1435 + dump_stack();
1436 + return -1;
1437 + }
1438 + return 0;
1439 +}
1440 +
1441 +int
1442 +mtrr_add(unsigned long base, unsigned long size, unsigned int type,
1443 + char increment)
1444 +{
1445 + if (mtrr_check(base, size))
1446 + return -EINVAL;
1447 + return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
1448 + increment);
1449 +}
1450 +
1451 +int mtrr_del_page(int reg, unsigned long base, unsigned long size)
1452 +{
1453 + unsigned i;
1454 + mtrr_type ltype;
1455 + unsigned long lbase;
1456 + unsigned int lsize;
1457 + int error = -EINVAL;
1458 + struct xen_platform_op op;
1459 +
1460 + mutex_lock(&mtrr_mutex);
1461 +
1462 + if (reg < 0) {
1463 + /* Search for existing MTRR */
1464 + for (i = 0; i < num_var_ranges; ++i) {
1465 + mtrr_if->get(i, &lbase, &lsize, &ltype);
1466 + if (lbase == base && lsize == size) {
1467 + reg = i;
1468 + break;
1469 + }
1470 + }
1471 + if (reg < 0) {
1472 + printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
1473 + size);
1474 + goto out;
1475 + }
1476 + }
1477 + if (usage_table[reg] < 1) {
1478 + printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
1479 + goto out;
1480 + }
1481 + if (--usage_table[reg] < 1) {
1482 + op.cmd = XENPF_del_memtype;
1483 + op.u.del_memtype.handle = 0;
1484 + op.u.del_memtype.reg = reg;
1485 + error = HYPERVISOR_platform_op(&op);
1486 + if (error) {
1487 + BUG_ON(error > 0);
1488 + goto out;
1489 + }
1490 + }
1491 + error = reg;
1492 + out:
1493 + mutex_unlock(&mtrr_mutex);
1494 + return error;
1495 +}
1496 +
1497 +int
1498 +mtrr_del(int reg, unsigned long base, unsigned long size)
1499 +{
1500 + if (mtrr_check(base, size))
1501 + return -EINVAL;
1502 + return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
1503 +}
1504 +
1505 +EXPORT_SYMBOL(mtrr_add);
1506 +EXPORT_SYMBOL(mtrr_del);
1507 +
1508 +void __init mtrr_bp_init(void)
1509 +{
1510 +}
1511 +
1512 +void mtrr_ap_init(void)
1513 +{
1514 +}
1515 +
1516 +static int __init mtrr_init(void)
1517 +{
1518 + struct cpuinfo_x86 *c = &boot_cpu_data;
1519 +
1520 + if (!is_initial_xendomain())
1521 + return -ENODEV;
1522 +
1523 + if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
1524 + (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
1525 + (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
1526 + (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
1527 + return -ENODEV;
1528 +
1529 + set_num_var_ranges();
1530 + init_table();
1531 +
1532 + return 0;
1533 +}
1534 +
1535 +subsys_initcall(mtrr_init);
1536 Index: head-2008-11-25/arch/x86/kernel/entry_32-xen.S
1537 ===================================================================
1538 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
1539 +++ head-2008-11-25/arch/x86/kernel/entry_32-xen.S 2007-12-10 08:47:31.000000000 +0100
1540 @@ -0,0 +1,1238 @@
1541 +/*
1542 + * linux/arch/i386/entry.S
1543 + *
1544 + * Copyright (C) 1991, 1992 Linus Torvalds
1545 + */
1546 +
1547 +/*
1548 + * entry.S contains the system-call and fault low-level handling routines.
1549 + * This also contains the timer-interrupt handler, as well as all interrupts
1550 + * and faults that can result in a task-switch.
1551 + *
1552 + * NOTE: This code handles signal-recognition, which happens every time
1553 + * after a timer-interrupt and after each system call.
1554 + *
1555 + * I changed all the .align's to 4 (16 byte alignment), as that's faster
1556 + * on a 486.
1557 + *
1558 + * Stack layout in 'ret_from_system_call':
1559 + * ptrace needs to have all regs on the stack.
1560 + * if the order here is changed, it needs to be
1561 + * updated in fork.c:copy_process, signal.c:do_signal,
1562 + * ptrace.c and ptrace.h
1563 + *
1564 + * 0(%esp) - %ebx
1565 + * 4(%esp) - %ecx
1566 + * 8(%esp) - %edx
1567 + * C(%esp) - %esi
1568 + * 10(%esp) - %edi
1569 + * 14(%esp) - %ebp
1570 + * 18(%esp) - %eax
1571 + * 1C(%esp) - %ds
1572 + * 20(%esp) - %es
1573 + * 24(%esp) - orig_eax
1574 + * 28(%esp) - %eip
1575 + * 2C(%esp) - %cs
1576 + * 30(%esp) - %eflags
1577 + * 34(%esp) - %oldesp
1578 + * 38(%esp) - %oldss
1579 + *
1580 + * "current" is in register %ebx during any slow entries.
1581 + */
1582 +
1583 +#include <linux/linkage.h>
1584 +#include <asm/thread_info.h>
1585 +#include <asm/irqflags.h>
1586 +#include <asm/errno.h>
1587 +#include <asm/segment.h>
1588 +#include <asm/smp.h>
1589 +#include <asm/page.h>
1590 +#include <asm/desc.h>
1591 +#include <asm/dwarf2.h>
1592 +#include "irq_vectors.h"
1593 +#include <xen/interface/xen.h>
1594 +
1595 +#define nr_syscalls ((syscall_table_size)/4)
1596 +
1597 +EBX = 0x00
1598 +ECX = 0x04
1599 +EDX = 0x08
1600 +ESI = 0x0C
1601 +EDI = 0x10
1602 +EBP = 0x14
1603 +EAX = 0x18
1604 +DS = 0x1C
1605 +ES = 0x20
1606 +ORIG_EAX = 0x24
1607 +EIP = 0x28
1608 +CS = 0x2C
1609 +EFLAGS = 0x30
1610 +OLDESP = 0x34
1611 +OLDSS = 0x38
1612 +
1613 +CF_MASK = 0x00000001
1614 +TF_MASK = 0x00000100
1615 +IF_MASK = 0x00000200
1616 +DF_MASK = 0x00000400
1617 +NT_MASK = 0x00004000
1618 +VM_MASK = 0x00020000
1619 +/* Pseudo-eflags. */
1620 +NMI_MASK = 0x80000000
1621 +
1622 +#ifndef CONFIG_XEN
1623 +#define DISABLE_INTERRUPTS cli
1624 +#define ENABLE_INTERRUPTS sti
1625 +#else
1626 +/* Offsets into shared_info_t. */
1627 +#define evtchn_upcall_pending /* 0 */
1628 +#define evtchn_upcall_mask 1
1629 +
1630 +#define sizeof_vcpu_shift 6
1631 +
1632 +#ifdef CONFIG_SMP
1633 +#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
1634 + shl $sizeof_vcpu_shift,%esi ; \
1635 + addl HYPERVISOR_shared_info,%esi
1636 +#else
1637 +#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
1638 +#endif
1639 +
1640 +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
1641 +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
1642 +#define DISABLE_INTERRUPTS GET_VCPU_INFO ; \
1643 + __DISABLE_INTERRUPTS
1644 +#define ENABLE_INTERRUPTS GET_VCPU_INFO ; \
1645 + __ENABLE_INTERRUPTS
1646 +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
1647 +#endif
1648 +
1649 +#ifdef CONFIG_PREEMPT
1650 +#define preempt_stop cli; TRACE_IRQS_OFF
1651 +#else
1652 +#define preempt_stop
1653 +#define resume_kernel restore_nocheck
1654 +#endif
1655 +
1656 +.macro TRACE_IRQS_IRET
1657 +#ifdef CONFIG_TRACE_IRQFLAGS
1658 + testl $IF_MASK,EFLAGS(%esp) # interrupts off?
1659 + jz 1f
1660 + TRACE_IRQS_ON
1661 +1:
1662 +#endif
1663 +.endm
1664 +
1665 +#ifdef CONFIG_VM86
1666 +#define resume_userspace_sig check_userspace
1667 +#else
1668 +#define resume_userspace_sig resume_userspace
1669 +#endif
1670 +
1671 +#define SAVE_ALL \
1672 + cld; \
1673 + pushl %es; \
1674 + CFI_ADJUST_CFA_OFFSET 4;\
1675 + /*CFI_REL_OFFSET es, 0;*/\
1676 + pushl %ds; \
1677 + CFI_ADJUST_CFA_OFFSET 4;\
1678 + /*CFI_REL_OFFSET ds, 0;*/\
1679 + pushl %eax; \
1680 + CFI_ADJUST_CFA_OFFSET 4;\
1681 + CFI_REL_OFFSET eax, 0;\
1682 + pushl %ebp; \
1683 + CFI_ADJUST_CFA_OFFSET 4;\
1684 + CFI_REL_OFFSET ebp, 0;\
1685 + pushl %edi; \
1686 + CFI_ADJUST_CFA_OFFSET 4;\
1687 + CFI_REL_OFFSET edi, 0;\
1688 + pushl %esi; \
1689 + CFI_ADJUST_CFA_OFFSET 4;\
1690 + CFI_REL_OFFSET esi, 0;\
1691 + pushl %edx; \
1692 + CFI_ADJUST_CFA_OFFSET 4;\
1693 + CFI_REL_OFFSET edx, 0;\
1694 + pushl %ecx; \
1695 + CFI_ADJUST_CFA_OFFSET 4;\
1696 + CFI_REL_OFFSET ecx, 0;\
1697 + pushl %ebx; \
1698 + CFI_ADJUST_CFA_OFFSET 4;\
1699 + CFI_REL_OFFSET ebx, 0;\
1700 + movl $(__USER_DS), %edx; \
1701 + movl %edx, %ds; \
1702 + movl %edx, %es;
1703 +
1704 +#define RESTORE_INT_REGS \
1705 + popl %ebx; \
1706 + CFI_ADJUST_CFA_OFFSET -4;\
1707 + CFI_RESTORE ebx;\
1708 + popl %ecx; \
1709 + CFI_ADJUST_CFA_OFFSET -4;\
1710 + CFI_RESTORE ecx;\
1711 + popl %edx; \
1712 + CFI_ADJUST_CFA_OFFSET -4;\
1713 + CFI_RESTORE edx;\
1714 + popl %esi; \
1715 + CFI_ADJUST_CFA_OFFSET -4;\
1716 + CFI_RESTORE esi;\
1717 + popl %edi; \
1718 + CFI_ADJUST_CFA_OFFSET -4;\
1719 + CFI_RESTORE edi;\
1720 + popl %ebp; \
1721 + CFI_ADJUST_CFA_OFFSET -4;\
1722 + CFI_RESTORE ebp;\
1723 + popl %eax; \
1724 + CFI_ADJUST_CFA_OFFSET -4;\
1725 + CFI_RESTORE eax
1726 +
1727 +#define RESTORE_REGS \
1728 + RESTORE_INT_REGS; \
1729 +1: popl %ds; \
1730 + CFI_ADJUST_CFA_OFFSET -4;\
1731 + /*CFI_RESTORE ds;*/\
1732 +2: popl %es; \
1733 + CFI_ADJUST_CFA_OFFSET -4;\
1734 + /*CFI_RESTORE es;*/\
1735 +.section .fixup,"ax"; \
1736 +3: movl $0,(%esp); \
1737 + jmp 1b; \
1738 +4: movl $0,(%esp); \
1739 + jmp 2b; \
1740 +.previous; \
1741 +.section __ex_table,"a";\
1742 + .align 4; \
1743 + .long 1b,3b; \
1744 + .long 2b,4b; \
1745 +.previous
1746 +
1747 +#define RING0_INT_FRAME \
1748 + CFI_STARTPROC simple;\
1749 + CFI_DEF_CFA esp, 3*4;\
1750 + /*CFI_OFFSET cs, -2*4;*/\
1751 + CFI_OFFSET eip, -3*4
1752 +
1753 +#define RING0_EC_FRAME \
1754 + CFI_STARTPROC simple;\
1755 + CFI_DEF_CFA esp, 4*4;\
1756 + /*CFI_OFFSET cs, -2*4;*/\
1757 + CFI_OFFSET eip, -3*4
1758 +
1759 +#define RING0_PTREGS_FRAME \
1760 + CFI_STARTPROC simple;\
1761 + CFI_DEF_CFA esp, OLDESP-EBX;\
1762 + /*CFI_OFFSET cs, CS-OLDESP;*/\
1763 + CFI_OFFSET eip, EIP-OLDESP;\
1764 + /*CFI_OFFSET es, ES-OLDESP;*/\
1765 + /*CFI_OFFSET ds, DS-OLDESP;*/\
1766 + CFI_OFFSET eax, EAX-OLDESP;\
1767 + CFI_OFFSET ebp, EBP-OLDESP;\
1768 + CFI_OFFSET edi, EDI-OLDESP;\
1769 + CFI_OFFSET esi, ESI-OLDESP;\
1770 + CFI_OFFSET edx, EDX-OLDESP;\
1771 + CFI_OFFSET ecx, ECX-OLDESP;\
1772 + CFI_OFFSET ebx, EBX-OLDESP
1773 +
1774 +ENTRY(ret_from_fork)
1775 + CFI_STARTPROC
1776 + pushl %eax
1777 + CFI_ADJUST_CFA_OFFSET 4
1778 + call schedule_tail
1779 + GET_THREAD_INFO(%ebp)
1780 + popl %eax
1781 + CFI_ADJUST_CFA_OFFSET -4
1782 + pushl $0x0202 # Reset kernel eflags
1783 + CFI_ADJUST_CFA_OFFSET 4
1784 + popfl
1785 + CFI_ADJUST_CFA_OFFSET -4
1786 + jmp syscall_exit
1787 + CFI_ENDPROC
1788 +
1789 +/*
1790 + * Return to user mode is not as complex as all this looks,
1791 + * but we want the default path for a system call return to
1792 + * go as quickly as possible which is why some of this is
1793 + * less clear than it otherwise should be.
1794 + */
1795 +
1796 + # userspace resumption stub bypassing syscall exit tracing
1797 + ALIGN
1798 + RING0_PTREGS_FRAME
1799 +ret_from_exception:
1800 + preempt_stop
1801 +ret_from_intr:
1802 + GET_THREAD_INFO(%ebp)
1803 +check_userspace:
1804 + movl EFLAGS(%esp), %eax # mix EFLAGS and CS
1805 + movb CS(%esp), %al
1806 + testl $(VM_MASK | 2), %eax
1807 + jz resume_kernel
1808 +ENTRY(resume_userspace)
1809 + DISABLE_INTERRUPTS # make sure we don't miss an interrupt
1810 + # setting need_resched or sigpending
1811 + # between sampling and the iret
1812 + movl TI_flags(%ebp), %ecx
1813 + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
1814 + # int/exception return?
1815 + jne work_pending
1816 + jmp restore_all
1817 +
1818 +#ifdef CONFIG_PREEMPT
1819 +ENTRY(resume_kernel)
1820 + cli
1821 + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
1822 + jnz restore_nocheck
1823 +need_resched:
1824 + movl TI_flags(%ebp), %ecx # need_resched set ?
1825 + testb $_TIF_NEED_RESCHED, %cl
1826 + jz restore_all
1827 + testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ?
1828 + jz restore_all
1829 + call preempt_schedule_irq
1830 + jmp need_resched
1831 +#endif
1832 + CFI_ENDPROC
1833 +
1834 +/* SYSENTER_RETURN points to after the "sysenter" instruction in
1835 + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
1836 +
1837 + # sysenter call handler stub
1838 +ENTRY(sysenter_entry)
1839 + CFI_STARTPROC simple
1840 + CFI_DEF_CFA esp, 0
1841 + CFI_REGISTER esp, ebp
1842 + movl SYSENTER_stack_esp0(%esp),%esp
1843 +sysenter_past_esp:
1844 + /*
1845 + * No need to follow this irqs on/off section: the syscall
1846 + * disabled irqs and here we enable it straight after entry:
1847 + */
1848 + sti
1849 + pushl $(__USER_DS)
1850 + CFI_ADJUST_CFA_OFFSET 4
1851 + /*CFI_REL_OFFSET ss, 0*/
1852 + pushl %ebp
1853 + CFI_ADJUST_CFA_OFFSET 4
1854 + CFI_REL_OFFSET esp, 0
1855 + pushfl
1856 + CFI_ADJUST_CFA_OFFSET 4
1857 + pushl $(__USER_CS)
1858 + CFI_ADJUST_CFA_OFFSET 4
1859 + /*CFI_REL_OFFSET cs, 0*/
1860 + /*
1861 + * Push current_thread_info()->sysenter_return to the stack.
1862 + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
1863 + * pushed above; +8 corresponds to copy_thread's esp0 setting.
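	 * (Worked out, assuming the usual i386 stack layout: copy_thread()
	 * leaves esp0 = thread_info + THREAD_SIZE - 8 and the four pushes
	 * above leave %esp = esp0 - 4*4, so
	 *   %esp + (TI_sysenter_return - THREAD_SIZE + 8 + 4*4)
	 * is exactly &current_thread_info()->sysenter_return.)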
1864 + */
1865 + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
1866 + CFI_ADJUST_CFA_OFFSET 4
1867 + CFI_REL_OFFSET eip, 0
1868 +
1869 +/*
1870 + * Load the potential sixth argument from user stack.
1871 + * Careful about security.
1872 + */
1873 + cmpl $__PAGE_OFFSET-3,%ebp
1874 + jae syscall_fault
1875 +1: movl (%ebp),%ebp
1876 +.section __ex_table,"a"
1877 + .align 4
1878 + .long 1b,syscall_fault
1879 +.previous
1880 +
1881 + pushl %eax
1882 + CFI_ADJUST_CFA_OFFSET 4
1883 + SAVE_ALL
1884 + GET_THREAD_INFO(%ebp)
1885 +
1886 + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
1887 + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
1888 + jnz syscall_trace_entry
1889 + cmpl $(nr_syscalls), %eax
1890 + jae syscall_badsys
1891 + call *sys_call_table(,%eax,4)
1892 + movl %eax,EAX(%esp)
1893 + DISABLE_INTERRUPTS
1894 + TRACE_IRQS_OFF
1895 + movl TI_flags(%ebp), %ecx
1896 + testw $_TIF_ALLWORK_MASK, %cx
1897 + jne syscall_exit_work
1898 +/* if something modifies registers it must also disable sysexit */
1899 + movl EIP(%esp), %edx
1900 + movl OLDESP(%esp), %ecx
1901 + xorl %ebp,%ebp
1902 +#ifdef CONFIG_XEN
1903 + TRACE_IRQS_ON
1904 + __ENABLE_INTERRUPTS
1905 +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/
1906 + __TEST_PENDING
1907 + jnz 14f # process more events if necessary...
1908 + movl ESI(%esp), %esi
1909 + sysexit
1910 +14: __DISABLE_INTERRUPTS
1911 + TRACE_IRQS_OFF
1912 +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/
1913 + push %esp
1914 + call evtchn_do_upcall
1915 + add $4,%esp
1916 + jmp ret_from_intr
1917 +#else
1918 + TRACE_IRQS_ON
1919 + sti
1920 + sysexit
1921 +#endif /* !CONFIG_XEN */
1922 + CFI_ENDPROC
1923 +
1924 + # pv sysenter call handler stub
1925 +ENTRY(sysenter_entry_pv)
1926 + RING0_INT_FRAME
1927 + movl $__USER_DS,16(%esp)
1928 + movl %ebp,12(%esp)
1929 + movl $__USER_CS,4(%esp)
1930 + addl $4,%esp
1931 + /* +5*4 is SS:ESP,EFLAGS,CS:EIP. +8 is esp0 setting. */
1932 + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
1933 +/*
1934 + * Load the potential sixth argument from user stack.
1935 + * Careful about security.
1936 + */
1937 + cmpl $__PAGE_OFFSET-3,%ebp
1938 + jae syscall_fault
1939 +1: movl (%ebp),%ebp
1940 +.section __ex_table,"a"
1941 + .align 4
1942 + .long 1b,syscall_fault
1943 +.previous
1944 + /* fall through */
1945 + CFI_ENDPROC
1946 +ENDPROC(sysenter_entry_pv)
1947 +
1948 + # system call handler stub
1949 +ENTRY(system_call)
1950 + RING0_INT_FRAME # can't unwind into user space anyway
1951 + pushl %eax # save orig_eax
1952 + CFI_ADJUST_CFA_OFFSET 4
1953 + SAVE_ALL
1954 + GET_THREAD_INFO(%ebp)
1955 + testl $TF_MASK,EFLAGS(%esp)
1956 + jz no_singlestep
1957 + orl $_TIF_SINGLESTEP,TI_flags(%ebp)
1958 +no_singlestep:
1959 + # system call tracing in operation / emulation
1960 + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
1961 + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
1962 + jnz syscall_trace_entry
1963 + cmpl $(nr_syscalls), %eax
1964 + jae syscall_badsys
1965 +syscall_call:
1966 + call *sys_call_table(,%eax,4)
1967 + movl %eax,EAX(%esp) # store the return value
1968 +syscall_exit:
1969 + DISABLE_INTERRUPTS # make sure we don't miss an interrupt
1970 + # setting need_resched or sigpending
1971 + # between sampling and the iret
1972 + TRACE_IRQS_OFF
1973 + movl TI_flags(%ebp), %ecx
1974 + testw $_TIF_ALLWORK_MASK, %cx # current->work
1975 + jne syscall_exit_work
1976 +
1977 +restore_all:
1978 +#ifndef CONFIG_XEN
1979 + movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
1980 + # Warning: OLDSS(%esp) contains the wrong/random values if we
1981 + # are returning to the kernel.
1982 + # See comments in process.c:copy_thread() for details.
1983 + movb OLDSS(%esp), %ah
1984 + movb CS(%esp), %al
1985 + andl $(VM_MASK | (4 << 8) | 3), %eax
1986 + cmpl $((4 << 8) | 3), %eax
1987 + CFI_REMEMBER_STATE
1988 + je ldt_ss # returning to user-space with LDT SS
1989 +restore_nocheck:
1990 +#else
1991 +restore_nocheck:
1992 + movl EFLAGS(%esp), %eax
1993 + testl $(VM_MASK|NMI_MASK), %eax
1994 + CFI_REMEMBER_STATE
1995 + jnz hypervisor_iret
1996 + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
1997 + GET_VCPU_INFO
1998 + andb evtchn_upcall_mask(%esi),%al
1999 + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
2000 + CFI_REMEMBER_STATE
2001 + jnz restore_all_enable_events # != 0 => enable event delivery
2002 +#endif
2003 + TRACE_IRQS_IRET
2004 +restore_nocheck_notrace:
2005 + RESTORE_REGS
2006 + addl $4, %esp
2007 + CFI_ADJUST_CFA_OFFSET -4
2008 +1: iret
2009 +.section .fixup,"ax"
2010 +iret_exc:
2011 +#ifndef CONFIG_XEN
2012 + TRACE_IRQS_ON
2013 + sti
2014 +#endif
2015 + pushl $0 # no error code
2016 + pushl $do_iret_error
2017 + jmp error_code
2018 +.previous
2019 +.section __ex_table,"a"
2020 + .align 4
2021 + .long 1b,iret_exc
2022 +.previous
2023 +
2024 + CFI_RESTORE_STATE
2025 +#ifndef CONFIG_XEN
2026 +ldt_ss:
2027 + larl OLDSS(%esp), %eax
2028 + jnz restore_nocheck
2029 + testl $0x00400000, %eax # returning to 32bit stack?
2030 +	jnz restore_nocheck	# all right, normal return
2031 + /* If returning to userspace with 16bit stack,
2032 + * try to fix the higher word of ESP, as the CPU
2033 + * won't restore it.
2034 + * This is an "official" bug of all the x86-compatible
2035 + * CPUs, which we can try to work around to make
2036 + * dosemu and wine happy. */
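	/* (Illustration: the bug is that an iret to a 16-bit SS reloads only
	 * the low 16 bits of ESP and leaves the high word untouched, so the
	 * code below stages the final iret on the small per-CPU 16-bit stack
	 * reached via lss; see __ESPFIX_SS and CPU_16BIT_STACK_SIZE below.) */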
2037 + subl $8, %esp # reserve space for switch16 pointer
2038 + CFI_ADJUST_CFA_OFFSET 8
2039 + cli
2040 + TRACE_IRQS_OFF
2041 + movl %esp, %eax
2042 + /* Set up the 16bit stack frame with switch32 pointer on top,
2043 + * and a switch16 pointer on top of the current frame. */
2044 + call setup_x86_bogus_stack
2045 + CFI_ADJUST_CFA_OFFSET -8 # frame has moved
2046 + TRACE_IRQS_IRET
2047 + RESTORE_REGS
2048 + lss 20+4(%esp), %esp # switch to 16bit stack
2049 +1: iret
2050 +.section __ex_table,"a"
2051 + .align 4
2052 + .long 1b,iret_exc
2053 +.previous
2054 +#else
2055 + ALIGN
2056 +restore_all_enable_events:
2057 + TRACE_IRQS_ON
2058 + __ENABLE_INTERRUPTS
2059 +scrit: /**** START OF CRITICAL REGION ****/
2060 + __TEST_PENDING
2061 + jnz 14f # process more events if necessary...
2062 + RESTORE_REGS
2063 + addl $4, %esp
2064 + CFI_ADJUST_CFA_OFFSET -4
2065 +1: iret
2066 +.section __ex_table,"a"
2067 + .align 4
2068 + .long 1b,iret_exc
2069 +.previous
2070 +14: __DISABLE_INTERRUPTS
2071 + TRACE_IRQS_OFF
2072 + jmp 11f
2073 +ecrit: /**** END OF CRITICAL REGION ****/
2074 +
2075 + CFI_RESTORE_STATE
2076 +hypervisor_iret:
2077 + andl $~NMI_MASK, EFLAGS(%esp)
2078 + RESTORE_REGS
2079 + addl $4, %esp
2080 + CFI_ADJUST_CFA_OFFSET -4
2081 + jmp hypercall_page + (__HYPERVISOR_iret * 32)
2082 +#endif
2083 + CFI_ENDPROC
2084 +
2085 + # perform work that needs to be done immediately before resumption
2086 + ALIGN
2087 + RING0_PTREGS_FRAME # can't unwind into user space anyway
2088 +work_pending:
2089 + testb $_TIF_NEED_RESCHED, %cl
2090 + jz work_notifysig
2091 +work_resched:
2092 + call schedule
2093 + DISABLE_INTERRUPTS # make sure we don't miss an interrupt
2094 + # setting need_resched or sigpending
2095 + # between sampling and the iret
2096 + TRACE_IRQS_OFF
2097 + movl TI_flags(%ebp), %ecx
2098 + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
2099 + # than syscall tracing?
2100 + jz restore_all
2101 + testb $_TIF_NEED_RESCHED, %cl
2102 + jnz work_resched
2103 +
2104 +work_notifysig: # deal with pending signals and
2105 + # notify-resume requests
2106 + testl $VM_MASK, EFLAGS(%esp)
2107 + movl %esp, %eax
2108 + jne work_notifysig_v86 # returning to kernel-space or
2109 + # vm86-space
2110 + xorl %edx, %edx
2111 + call do_notify_resume
2112 + jmp resume_userspace_sig
2113 +
2114 + ALIGN
2115 +work_notifysig_v86:
2116 +#ifdef CONFIG_VM86
2117 + pushl %ecx # save ti_flags for do_notify_resume
2118 + CFI_ADJUST_CFA_OFFSET 4
2119 + call save_v86_state # %eax contains pt_regs pointer
2120 + popl %ecx
2121 + CFI_ADJUST_CFA_OFFSET -4
2122 + movl %eax, %esp
2123 + xorl %edx, %edx
2124 + call do_notify_resume
2125 + jmp resume_userspace_sig
2126 +#endif
2127 +
2128 + # perform syscall exit tracing
2129 + ALIGN
2130 +syscall_trace_entry:
2131 + movl $-ENOSYS,EAX(%esp)
2132 + movl %esp, %eax
2133 + xorl %edx,%edx
2134 + call do_syscall_trace
2135 + cmpl $0, %eax
2136 + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
2137 + # so must skip actual syscall
2138 + movl ORIG_EAX(%esp), %eax
2139 + cmpl $(nr_syscalls), %eax
2140 + jnae syscall_call
2141 + jmp syscall_exit
2142 +
2143 + # perform syscall exit tracing
2144 + ALIGN
2145 +syscall_exit_work:
2146 + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
2147 + jz work_pending
2148 + TRACE_IRQS_ON
2149 + ENABLE_INTERRUPTS # could let do_syscall_trace() call
2150 + # schedule() instead
2151 + movl %esp, %eax
2152 + movl $1, %edx
2153 + call do_syscall_trace
2154 + jmp resume_userspace
2155 + CFI_ENDPROC
2156 +
2157 + RING0_INT_FRAME # can't unwind into user space anyway
2158 +syscall_fault:
2159 + pushl %eax # save orig_eax
2160 + CFI_ADJUST_CFA_OFFSET 4
2161 + SAVE_ALL
2162 + GET_THREAD_INFO(%ebp)
2163 + movl $-EFAULT,EAX(%esp)
2164 + jmp resume_userspace
2165 +
2166 +syscall_badsys:
2167 + movl $-ENOSYS,EAX(%esp)
2168 + jmp resume_userspace
2169 + CFI_ENDPROC
2170 +
2171 +#ifndef CONFIG_XEN
2172 +#define FIXUP_ESPFIX_STACK \
2173 + movl %esp, %eax; \
2174 + /* switch to 32bit stack using the pointer on top of 16bit stack */ \
2175 + lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
2176 + /* copy data from 16bit stack to 32bit stack */ \
2177 + call fixup_x86_bogus_stack; \
2178 + /* put ESP to the proper location */ \
2179 + movl %eax, %esp;
2180 +#define UNWIND_ESPFIX_STACK \
2181 + pushl %eax; \
2182 + CFI_ADJUST_CFA_OFFSET 4; \
2183 + movl %ss, %eax; \
2184 + /* see if on 16bit stack */ \
2185 + cmpw $__ESPFIX_SS, %ax; \
2186 + je 28f; \
2187 +27: popl %eax; \
2188 + CFI_ADJUST_CFA_OFFSET -4; \
2189 +.section .fixup,"ax"; \
2190 +28: movl $__KERNEL_DS, %eax; \
2191 + movl %eax, %ds; \
2192 + movl %eax, %es; \
2193 + /* switch to 32bit stack */ \
2194 + FIXUP_ESPFIX_STACK; \
2195 + jmp 27b; \
2196 +.previous
2197 +
2198 +/*
2199 + * Build the entry stubs and pointer table with
2200 + * some assembler magic.
2201 + */
2202 +.data
2203 +ENTRY(interrupt)
2204 +.text
2205 +
2206 +vector=0
2207 +ENTRY(irq_entries_start)
2208 + RING0_INT_FRAME
2209 +.rept NR_IRQS
2210 + ALIGN
2211 + .if vector
2212 + CFI_ADJUST_CFA_OFFSET -4
2213 + .endif
2214 +1: pushl $~(vector)
2215 + CFI_ADJUST_CFA_OFFSET 4
2216 + jmp common_interrupt
2217 +.data
2218 + .long 1b
2219 +.text
2220 +vector=vector+1
2221 +.endr
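
/*
 * For illustration, the first pass of the .rept above expands to roughly:
 *
 *	1:	pushl $~(0)
 *		jmp common_interrupt
 *	.data
 *		.long 1b
 *	.text
 *
 * i.e. each stub pushes the complemented vector number and jumps to
 * common_interrupt, while the .data/.long lines collect the stub addresses
 * into the interrupt[] table declared at ENTRY(interrupt) above.
 */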
2222 +
2223 +/*
2224 + * the CPU automatically disables interrupts when executing an IRQ vector,
2225 + * so IRQ-flags tracing has to follow that:
2226 + */
2227 + ALIGN
2228 +common_interrupt:
2229 + SAVE_ALL
2230 + TRACE_IRQS_OFF
2231 + movl %esp,%eax
2232 + call do_IRQ
2233 + jmp ret_from_intr
2234 + CFI_ENDPROC
2235 +
2236 +#define BUILD_INTERRUPT(name, nr) \
2237 +ENTRY(name) \
2238 + RING0_INT_FRAME; \
2239 + pushl $~(nr); \
2240 + CFI_ADJUST_CFA_OFFSET 4; \
2241 + SAVE_ALL; \
2242 + TRACE_IRQS_OFF \
2243 + movl %esp,%eax; \
2244 + call smp_/**/name; \
2245 + jmp ret_from_intr; \
2246 + CFI_ENDPROC
2247 +
2248 +/* The include is where all of the SMP etc. interrupts come from */
2249 +#include "entry_arch.h"
2250 +#else
2251 +#define UNWIND_ESPFIX_STACK
2252 +#endif
2253 +
2254 +ENTRY(divide_error)
2255 + RING0_INT_FRAME
2256 + pushl $0 # no error code
2257 + CFI_ADJUST_CFA_OFFSET 4
2258 + pushl $do_divide_error
2259 + CFI_ADJUST_CFA_OFFSET 4
2260 + ALIGN
2261 +error_code:
2262 + pushl %ds
2263 + CFI_ADJUST_CFA_OFFSET 4
2264 + /*CFI_REL_OFFSET ds, 0*/
2265 + pushl %eax
2266 + CFI_ADJUST_CFA_OFFSET 4
2267 + CFI_REL_OFFSET eax, 0
2268 + xorl %eax, %eax
2269 + pushl %ebp
2270 + CFI_ADJUST_CFA_OFFSET 4
2271 + CFI_REL_OFFSET ebp, 0
2272 + pushl %edi
2273 + CFI_ADJUST_CFA_OFFSET 4
2274 + CFI_REL_OFFSET edi, 0
2275 + pushl %esi
2276 + CFI_ADJUST_CFA_OFFSET 4
2277 + CFI_REL_OFFSET esi, 0
2278 + pushl %edx
2279 + CFI_ADJUST_CFA_OFFSET 4
2280 + CFI_REL_OFFSET edx, 0
2281 + decl %eax # eax = -1
2282 + pushl %ecx
2283 + CFI_ADJUST_CFA_OFFSET 4
2284 + CFI_REL_OFFSET ecx, 0
2285 + pushl %ebx
2286 + CFI_ADJUST_CFA_OFFSET 4
2287 + CFI_REL_OFFSET ebx, 0
2288 + cld
2289 + pushl %es
2290 + CFI_ADJUST_CFA_OFFSET 4
2291 + /*CFI_REL_OFFSET es, 0*/
2292 + UNWIND_ESPFIX_STACK
2293 + popl %ecx
2294 + CFI_ADJUST_CFA_OFFSET -4
2295 + /*CFI_REGISTER es, ecx*/
2296 + movl ES(%esp), %edi # get the function address
2297 + movl ORIG_EAX(%esp), %edx # get the error code
2298 + movl %eax, ORIG_EAX(%esp)
2299 + movl %ecx, ES(%esp)
2300 + /*CFI_REL_OFFSET es, ES*/
2301 + movl $(__USER_DS), %ecx
2302 + movl %ecx, %ds
2303 + movl %ecx, %es
2304 + movl %esp,%eax # pt_regs pointer
2305 + call *%edi
2306 + jmp ret_from_exception
2307 + CFI_ENDPROC
2308 +
2309 +#ifdef CONFIG_XEN
2310 +# A note on the "critical region" in our callback handler.
2311 +# We want to avoid stacking callback handlers due to events occurring
2312 +# during handling of the last event. To do this, we keep events disabled
2313 +# until we've done all processing. HOWEVER, we must enable events before
2314 +# popping the stack frame (can't be done atomically) and so it would still
2315 +# be possible to get enough handler activations to overflow the stack.
2316 +# Although unlikely, bugs of that kind are hard to track down, so we'd
2317 +# like to avoid the possibility.
2318 +# So, on entry to the handler we detect whether we interrupted an
2319 +# existing activation in its critical region -- if so, we pop the current
2320 +# activation and restart the handler using the previous one.
2321 +#
2322 +# The sysexit critical region is slightly different. sysexit
2323 +# atomically removes the entire stack frame. If we interrupt in the
2324 +# critical region we know that the entire frame is present and correct
2325 +# so we can simply throw away the new one.
2326 +ENTRY(hypervisor_callback)
2327 + RING0_INT_FRAME
2328 + pushl %eax
2329 + CFI_ADJUST_CFA_OFFSET 4
2330 + SAVE_ALL
2331 + movl EIP(%esp),%eax
2332 + cmpl $scrit,%eax
2333 + jb 11f
2334 + cmpl $ecrit,%eax
2335 + jb critical_region_fixup
2336 + cmpl $sysexit_scrit,%eax
2337 + jb 11f
2338 + cmpl $sysexit_ecrit,%eax
2339 + ja 11f
2340 + addl $OLDESP,%esp # Remove eflags...ebx from stack frame.
2341 +11: push %esp
2342 + CFI_ADJUST_CFA_OFFSET 4
2343 + call evtchn_do_upcall
2344 + add $4,%esp
2345 + CFI_ADJUST_CFA_OFFSET -4
2346 + jmp ret_from_intr
2347 + CFI_ENDPROC
2348 +
2349 +# [How we do the fixup]. We want to merge the current stack frame with the
2350 +# just-interrupted frame. How we do this depends on where in the critical
2351 +# region the interrupted handler was executing, and so how many saved
2352 +# registers are in each frame. We do this quickly using the lookup table
2353 +# 'critical_fixup_table'. For each byte offset in the critical region, it
2354 +# provides the number of bytes which have already been popped from the
2355 +# interrupted stack frame.
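# Roughly, in C (illustrative sketch only; 'frame' is a byte pointer to the
# new SAVE_ALL frame and 'n' is the table value):
#	memmove(frame + OLDESP - n, frame, n);	/* refill the popped slots  */
#	esp = frame + OLDESP - n;		/* restart at 11: with the  */
#						/* merged interrupted frame */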
2356 +critical_region_fixup:
2357 +	movzbl critical_fixup_table-scrit(%eax),%ecx # %ecx contains num bytes popped
2358 + cmpb $0xff,%cl # 0xff => vcpu_info critical region
2359 + jne 15f
2360 + xorl %ecx,%ecx
2361 +15: leal (%esp,%ecx),%esi # %esi points at end of src region
2362 + leal OLDESP(%esp),%edi # %edi points at end of dst region
2363 +	shrl $2,%ecx			# convert bytes to words
2364 + je 17f # skip loop if nothing to copy
2365 +16: subl $4,%esi # pre-decrementing copy loop
2366 + subl $4,%edi
2367 + movl (%esi),%eax
2368 + movl %eax,(%edi)
2369 + loop 16b
2370 +17: movl %edi,%esp # final %edi is top of merged stack
2371 + jmp 11b
2372 +
2373 +.section .rodata,"a"
2374 +critical_fixup_table:
2375 + .byte 0xff,0xff,0xff # testb $0xff,(%esi) = __TEST_PENDING
2376 + .byte 0xff,0xff # jnz 14f
2377 + .byte 0x00 # pop %ebx
2378 + .byte 0x04 # pop %ecx
2379 + .byte 0x08 # pop %edx
2380 + .byte 0x0c # pop %esi
2381 + .byte 0x10 # pop %edi
2382 + .byte 0x14 # pop %ebp
2383 + .byte 0x18 # pop %eax
2384 + .byte 0x1c # pop %ds
2385 + .byte 0x20 # pop %es
2386 + .byte 0x24,0x24,0x24 # add $4,%esp
2387 + .byte 0x28 # iret
2388 + .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi)
2389 + .byte 0x00,0x00 # jmp 11b
2390 +.previous
2391 +
2392 +# Hypervisor uses this for application faults while it executes.
2393 +# We get here for two reasons:
2394 +# 1. Fault while reloading DS, ES, FS or GS
2395 +# 2. Fault while executing IRET
2396 +# Category 1 we fix up by reattempting the load, and zeroing the segment
2397 +# register if the load fails.
2398 +# Category 2 we fix up by jumping to do_iret_error. We cannot use the
2399 +# normal Linux return path in this case because if we use the IRET hypercall
2400 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
2401 +# We distinguish between categories by maintaining a status value in EAX.
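# (Illustration: EAX is set to 1 before the four segment reloads below; if a
#  reload faults, its fixup zeroes EAX and the offending saved selector and
#  retries, so EAX == 0 afterwards means Category 1, while EAX still being 1
#  means all four reloads succeeded and the original fault must have been the
#  IRET itself, i.e. Category 2.)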
2402 +ENTRY(failsafe_callback)
2403 + pushl %eax
2404 + movl $1,%eax
2405 +1: mov 4(%esp),%ds
2406 +2: mov 8(%esp),%es
2407 +3: mov 12(%esp),%fs
2408 +4: mov 16(%esp),%gs
2409 + testl %eax,%eax
2410 + popl %eax
2411 + jz 5f
2412 + addl $16,%esp # EAX != 0 => Category 2 (Bad IRET)
2413 + jmp iret_exc
2414 +5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment)
2415 + RING0_INT_FRAME
2416 + pushl $0
2417 + SAVE_ALL
2418 + jmp ret_from_exception
2419 +.section .fixup,"ax"; \
2420 +6: xorl %eax,%eax; \
2421 + movl %eax,4(%esp); \
2422 + jmp 1b; \
2423 +7: xorl %eax,%eax; \
2424 + movl %eax,8(%esp); \
2425 + jmp 2b; \
2426 +8: xorl %eax,%eax; \
2427 + movl %eax,12(%esp); \
2428 + jmp 3b; \
2429 +9: xorl %eax,%eax; \
2430 + movl %eax,16(%esp); \
2431 + jmp 4b; \
2432 +.previous; \
2433 +.section __ex_table,"a"; \
2434 + .align 4; \
2435 + .long 1b,6b; \
2436 + .long 2b,7b; \
2437 + .long 3b,8b; \
2438 + .long 4b,9b; \
2439 +.previous
2440 +#endif
2441 + CFI_ENDPROC
2442 +
2443 +ENTRY(coprocessor_error)
2444 + RING0_INT_FRAME
2445 + pushl $0
2446 + CFI_ADJUST_CFA_OFFSET 4
2447 + pushl $do_coprocessor_error
2448 + CFI_ADJUST_CFA_OFFSET 4
2449 + jmp error_code
2450 + CFI_ENDPROC
2451 +
2452 +ENTRY(simd_coprocessor_error)
2453 + RING0_INT_FRAME
2454 + pushl $0
2455 + CFI_ADJUST_CFA_OFFSET 4
2456 + pushl $do_simd_coprocessor_error
2457 + CFI_ADJUST_CFA_OFFSET 4
2458 + jmp error_code
2459 + CFI_ENDPROC
2460 +
2461 +ENTRY(device_not_available)
2462 + RING0_INT_FRAME
2463 + pushl $-1 # mark this as an int
2464 + CFI_ADJUST_CFA_OFFSET 4
2465 + SAVE_ALL
2466 +#ifndef CONFIG_XEN
2467 + movl %cr0, %eax
2468 + testl $0x4, %eax # EM (math emulation bit)
2469 + je device_available_emulate
2470 + pushl $0 # temporary storage for ORIG_EIP
2471 + CFI_ADJUST_CFA_OFFSET 4
2472 + call math_emulate
2473 + addl $4, %esp
2474 + CFI_ADJUST_CFA_OFFSET -4
2475 + jmp ret_from_exception
2476 +device_available_emulate:
2477 +#endif
2478 + preempt_stop
2479 + call math_state_restore
2480 + jmp ret_from_exception
2481 + CFI_ENDPROC
2482 +
2483 +#ifndef CONFIG_XEN
2484 +/*
2485 + * Debug traps and NMI can happen at the one SYSENTER instruction
2486 + * that sets up the real kernel stack. Check here, since we can't
2487 + * allow the wrong stack to be used.
2488 + *
2489 + * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
2490 + * already pushed 3 words if it hits on the sysenter instruction:
2491 + * eflags, cs and eip.
2492 + *
2493 + * We just load the right stack, and push the three (known) values
2494 + * by hand onto the new stack - while updating the return eip past
2495 + * the instruction that would have done it for sysenter.
2496 + */
2497 +#define FIX_STACK(offset, ok, label) \
2498 + cmpw $__KERNEL_CS,4(%esp); \
2499 + jne ok; \
2500 +label: \
2501 + movl SYSENTER_stack_esp0+offset(%esp),%esp; \
2502 + pushfl; \
2503 + pushl $__KERNEL_CS; \
2504 + pushl $sysenter_past_esp
2505 +#endif /* CONFIG_XEN */
2506 +
2507 +KPROBE_ENTRY(debug)
2508 + RING0_INT_FRAME
2509 +#ifndef CONFIG_XEN
2510 + cmpl $sysenter_entry,(%esp)
2511 + jne debug_stack_correct
2512 + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
2513 +debug_stack_correct:
2514 +#endif /* !CONFIG_XEN */
2515 + pushl $-1 # mark this as an int
2516 + CFI_ADJUST_CFA_OFFSET 4
2517 + SAVE_ALL
2518 + xorl %edx,%edx # error code 0
2519 + movl %esp,%eax # pt_regs pointer
2520 + call do_debug
2521 + jmp ret_from_exception
2522 + CFI_ENDPROC
2523 + .previous .text
2524 +#ifndef CONFIG_XEN
2525 +/*
2526 + * NMI is doubly nasty. It can happen _while_ we're handling
2527 + * a debug fault, and the debug fault hasn't yet been able to
2528 + * clear up the stack. So we first check whether we got an
2529 + * NMI on the sysenter entry path, but after that we need to
2530 + * check whether we got an NMI on the debug path where the debug
2531 + * fault happened on the sysenter path.
2532 + */
2533 +ENTRY(nmi)
2534 + RING0_INT_FRAME
2535 + pushl %eax
2536 + CFI_ADJUST_CFA_OFFSET 4
2537 + movl %ss, %eax
2538 + cmpw $__ESPFIX_SS, %ax
2539 + popl %eax
2540 + CFI_ADJUST_CFA_OFFSET -4
2541 + je nmi_16bit_stack
2542 + cmpl $sysenter_entry,(%esp)
2543 + je nmi_stack_fixup
2544 + pushl %eax
2545 + CFI_ADJUST_CFA_OFFSET 4
2546 + movl %esp,%eax
2547 + /* Do not access memory above the end of our stack page,
2548 + * it might not exist.
2549 + */
2550 + andl $(THREAD_SIZE-1),%eax
2551 + cmpl $(THREAD_SIZE-20),%eax
2552 + popl %eax
2553 + CFI_ADJUST_CFA_OFFSET -4
2554 + jae nmi_stack_correct
2555 + cmpl $sysenter_entry,12(%esp)
2556 + je nmi_debug_stack_check
2557 +nmi_stack_correct:
2558 + pushl %eax
2559 + CFI_ADJUST_CFA_OFFSET 4
2560 + SAVE_ALL
2561 + xorl %edx,%edx # zero error code
2562 + movl %esp,%eax # pt_regs pointer
2563 + call do_nmi
2564 + jmp restore_nocheck_notrace
2565 + CFI_ENDPROC
2566 +
2567 +nmi_stack_fixup:
2568 + FIX_STACK(12,nmi_stack_correct, 1)
2569 + jmp nmi_stack_correct
2570 +nmi_debug_stack_check:
2571 + cmpw $__KERNEL_CS,16(%esp)
2572 + jne nmi_stack_correct
2573 + cmpl $debug,(%esp)
2574 + jb nmi_stack_correct
2575 + cmpl $debug_esp_fix_insn,(%esp)
2576 + ja nmi_stack_correct
2577 + FIX_STACK(24,nmi_stack_correct, 1)
2578 + jmp nmi_stack_correct
2579 +
2580 +nmi_16bit_stack:
2581 + RING0_INT_FRAME
2582 + /* create the pointer to lss back */
2583 + pushl %ss
2584 + CFI_ADJUST_CFA_OFFSET 4
2585 + pushl %esp
2586 + CFI_ADJUST_CFA_OFFSET 4
2587 + movzwl %sp, %esp
2588 + addw $4, (%esp)
2589 + /* copy the iret frame of 12 bytes */
2590 + .rept 3
2591 + pushl 16(%esp)
2592 + CFI_ADJUST_CFA_OFFSET 4
2593 + .endr
2594 + pushl %eax
2595 + CFI_ADJUST_CFA_OFFSET 4
2596 + SAVE_ALL
2597 + FIXUP_ESPFIX_STACK # %eax == %esp
2598 + CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved
2599 + xorl %edx,%edx # zero error code
2600 + call do_nmi
2601 + RESTORE_REGS
2602 + lss 12+4(%esp), %esp # back to 16bit stack
2603 +1: iret
2604 + CFI_ENDPROC
2605 +.section __ex_table,"a"
2606 + .align 4
2607 + .long 1b,iret_exc
2608 +.previous
2609 +#else
2610 +ENTRY(nmi)
2611 + RING0_INT_FRAME
2612 + pushl %eax
2613 + CFI_ADJUST_CFA_OFFSET 4
2614 + SAVE_ALL
2615 + xorl %edx,%edx # zero error code
2616 + movl %esp,%eax # pt_regs pointer
2617 + call do_nmi
2618 + orl $NMI_MASK, EFLAGS(%esp)
2619 + jmp restore_all
2620 + CFI_ENDPROC
2621 +#endif
2622 +
2623 +KPROBE_ENTRY(int3)
2624 + RING0_INT_FRAME
2625 + pushl $-1 # mark this as an int
2626 + CFI_ADJUST_CFA_OFFSET 4
2627 + SAVE_ALL
2628 + xorl %edx,%edx # zero error code
2629 + movl %esp,%eax # pt_regs pointer
2630 + call do_int3
2631 + jmp ret_from_exception
2632 + CFI_ENDPROC
2633 + .previous .text
2634 +
2635 +ENTRY(overflow)
2636 + RING0_INT_FRAME
2637 + pushl $0
2638 + CFI_ADJUST_CFA_OFFSET 4
2639 + pushl $do_overflow
2640 + CFI_ADJUST_CFA_OFFSET 4
2641 + jmp error_code
2642 + CFI_ENDPROC
2643 +
2644 +ENTRY(bounds)
2645 + RING0_INT_FRAME
2646 + pushl $0
2647 + CFI_ADJUST_CFA_OFFSET 4
2648 + pushl $do_bounds
2649 + CFI_ADJUST_CFA_OFFSET 4
2650 + jmp error_code
2651 + CFI_ENDPROC
2652 +
2653 +ENTRY(invalid_op)
2654 + RING0_INT_FRAME
2655 + pushl $0
2656 + CFI_ADJUST_CFA_OFFSET 4
2657 + pushl $do_invalid_op
2658 + CFI_ADJUST_CFA_OFFSET 4
2659 + jmp error_code
2660 + CFI_ENDPROC
2661 +
2662 +ENTRY(coprocessor_segment_overrun)
2663 + RING0_INT_FRAME
2664 + pushl $0
2665 + CFI_ADJUST_CFA_OFFSET 4
2666 + pushl $do_coprocessor_segment_overrun
2667 + CFI_ADJUST_CFA_OFFSET 4
2668 + jmp error_code
2669 + CFI_ENDPROC
2670 +
2671 +ENTRY(invalid_TSS)
2672 + RING0_EC_FRAME
2673 + pushl $do_invalid_TSS
2674 + CFI_ADJUST_CFA_OFFSET 4
2675 + jmp error_code
2676 + CFI_ENDPROC
2677 +
2678 +ENTRY(segment_not_present)
2679 + RING0_EC_FRAME
2680 + pushl $do_segment_not_present
2681 + CFI_ADJUST_CFA_OFFSET 4
2682 + jmp error_code
2683 + CFI_ENDPROC
2684 +
2685 +ENTRY(stack_segment)
2686 + RING0_EC_FRAME
2687 + pushl $do_stack_segment
2688 + CFI_ADJUST_CFA_OFFSET 4
2689 + jmp error_code
2690 + CFI_ENDPROC
2691 +
2692 +KPROBE_ENTRY(general_protection)
2693 + RING0_EC_FRAME
2694 + pushl $do_general_protection
2695 + CFI_ADJUST_CFA_OFFSET 4
2696 + jmp error_code
2697 + CFI_ENDPROC
2698 + .previous .text
2699 +
2700 +ENTRY(alignment_check)
2701 + RING0_EC_FRAME
2702 + pushl $do_alignment_check
2703 + CFI_ADJUST_CFA_OFFSET 4
2704 + jmp error_code
2705 + CFI_ENDPROC
2706 +
2707 +KPROBE_ENTRY(page_fault)
2708 + RING0_EC_FRAME
2709 + pushl $do_page_fault
2710 + CFI_ADJUST_CFA_OFFSET 4
2711 + jmp error_code
2712 + CFI_ENDPROC
2713 + .previous .text
2714 +
2715 +#ifdef CONFIG_X86_MCE
2716 +ENTRY(machine_check)
2717 + RING0_INT_FRAME
2718 + pushl $0
2719 + CFI_ADJUST_CFA_OFFSET 4
2720 + pushl machine_check_vector
2721 + CFI_ADJUST_CFA_OFFSET 4
2722 + jmp error_code
2723 + CFI_ENDPROC
2724 +#endif
2725 +
2726 +#ifndef CONFIG_XEN
2727 +ENTRY(spurious_interrupt_bug)
2728 + RING0_INT_FRAME
2729 + pushl $0
2730 + CFI_ADJUST_CFA_OFFSET 4
2731 + pushl $do_spurious_interrupt_bug
2732 + CFI_ADJUST_CFA_OFFSET 4
2733 + jmp error_code
2734 + CFI_ENDPROC
2735 +#endif /* !CONFIG_XEN */
2736 +
2737 +#ifdef CONFIG_STACK_UNWIND
2738 +ENTRY(arch_unwind_init_running)
2739 + CFI_STARTPROC
2740 + movl 4(%esp), %edx
2741 + movl (%esp), %ecx
2742 + leal 4(%esp), %eax
2743 + movl %ebx, EBX(%edx)
2744 + xorl %ebx, %ebx
2745 + movl %ebx, ECX(%edx)
2746 + movl %ebx, EDX(%edx)
2747 + movl %esi, ESI(%edx)
2748 + movl %edi, EDI(%edx)
2749 + movl %ebp, EBP(%edx)
2750 + movl %ebx, EAX(%edx)
2751 + movl $__USER_DS, DS(%edx)
2752 + movl $__USER_DS, ES(%edx)
2753 + movl %ebx, ORIG_EAX(%edx)
2754 + movl %ecx, EIP(%edx)
2755 + movl 12(%esp), %ecx
2756 + movl $__KERNEL_CS, CS(%edx)
2757 + movl %ebx, EFLAGS(%edx)
2758 + movl %eax, OLDESP(%edx)
2759 + movl 8(%esp), %eax
2760 + movl %ecx, 8(%esp)
2761 + movl EBX(%edx), %ebx
2762 + movl $__KERNEL_DS, OLDSS(%edx)
2763 + jmpl *%eax
2764 + CFI_ENDPROC
2765 +ENDPROC(arch_unwind_init_running)
2766 +#endif
2767 +
2768 +ENTRY(fixup_4gb_segment)
2769 + RING0_EC_FRAME
2770 + pushl $do_fixup_4gb_segment
2771 + CFI_ADJUST_CFA_OFFSET 4
2772 + jmp error_code
2773 + CFI_ENDPROC
2774 +
2775 +.section .rodata,"a"
2776 +#include "syscall_table.S"
2777 +
2778 +syscall_table_size=(.-sys_call_table)
2779 Index: head-2008-11-25/arch/x86/kernel/fixup.c
2780 ===================================================================
2781 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
2782 +++ head-2008-11-25/arch/x86/kernel/fixup.c 2008-01-28 12:24:18.000000000 +0100
2783 @@ -0,0 +1,88 @@
2784 +/******************************************************************************
2785 + * fixup.c
2786 + *
2787 + * Binary-rewriting of certain IA32 instructions, on notification by Xen.
2788 + * Used to avoid repeated slow emulation of common instructions used by the
2789 + * user-space TLS (Thread-Local Storage) libraries.
2790 + *
2791 + * **** NOTE ****
2792 + * Issues with the binary rewriting have caused it to be removed. Instead
2793 + * we rely on Xen's emulator to boot the kernel, and then print a banner
2794 + * message recommending that the user disable /lib/tls.
2795 + *
2796 + * Copyright (c) 2004, K A Fraser
2797 + *
2798 + * This program is free software; you can redistribute it and/or modify
2799 + * it under the terms of the GNU General Public License as published by
2800 + * the Free Software Foundation; either version 2 of the License, or
2801 + * (at your option) any later version.
2802 + *
2803 + * This program is distributed in the hope that it will be useful,
2804 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
2805 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2806 + * GNU General Public License for more details.
2807 + *
2808 + * You should have received a copy of the GNU General Public License
2809 + * along with this program; if not, write to the Free Software
2810 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2811 + */
2812 +
2813 +#include <linux/init.h>
2814 +#include <linux/sched.h>
2815 +#include <linux/slab.h>
2816 +#include <linux/kernel.h>
2817 +#include <linux/delay.h>
2818 +#include <linux/version.h>
2819 +
2820 +#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
2821 +
2822 +fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
2823 +{
2824 + static unsigned long printed = 0;
2825 + char info[100];
2826 + int i;
2827 +
2828 + /* Ignore statically-linked init. */
2829 + if (current->tgid == 1)
2830 + return;
2831 +
2832 + VOID(HYPERVISOR_vm_assist(VMASST_CMD_disable,
2833 + VMASST_TYPE_4gb_segments_notify));
2834 +
2835 + if (test_and_set_bit(0, &printed))
2836 + return;
2837 +
2838 + sprintf(info, "%s (pid=%d)", current->comm, current->tgid);
2839 +
2840 + DP("");
2841 + DP("***************************************************************");
2842 + DP("***************************************************************");
2843 + DP("** WARNING: Currently emulating unsupported memory accesses **");
2844 + DP("** in /lib/tls glibc libraries. The emulation is **");
2845 + DP("** slow. To ensure full performance you should **");
2846 + DP("** install a 'xen-friendly' (nosegneg) version of **");
2847 + DP("** the library, or disable tls support by executing **");
2848 + DP("** the following as root: **");
2849 + DP("** mv /lib/tls /lib/tls.disabled **");
2850 + DP("** Offending process: %-38.38s **", info);
2851 + DP("***************************************************************");
2852 + DP("***************************************************************");
2853 + DP("");
2854 +
2855 + for (i = 5; i > 0; i--) {
2856 + touch_softlockup_watchdog();
2857 + printk("Pausing... %d", i);
2858 + mdelay(1000);
2859 + printk("\b\b\b\b\b\b\b\b\b\b\b\b");
2860 + }
2861 +
2862 + printk("Continuing...\n\n");
2863 +}
2864 +
2865 +static int __init fixup_init(void)
2866 +{
2867 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
2868 + VMASST_TYPE_4gb_segments_notify));
2869 + return 0;
2870 +}
2871 +__initcall(fixup_init);
2872 Index: head-2008-11-25/arch/x86/kernel/head_32-xen.S
2873 ===================================================================
2874 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
2875 +++ head-2008-11-25/arch/x86/kernel/head_32-xen.S 2007-06-12 13:12:48.000000000 +0200
2876 @@ -0,0 +1,207 @@
2877 +
2878 +
2879 +.text
2880 +#include <linux/elfnote.h>
2881 +#include <linux/threads.h>
2882 +#include <linux/linkage.h>
2883 +#include <asm/segment.h>
2884 +#include <asm/page.h>
2885 +#include <asm/cache.h>
2886 +#include <asm/thread_info.h>
2887 +#include <asm/asm-offsets.h>
2888 +#include <asm/dwarf2.h>
2889 +#include <xen/interface/xen.h>
2890 +#include <xen/interface/elfnote.h>
2891 +
2892 +/*
2893 + * References to members of the new_cpu_data structure.
2894 + */
2895 +
2896 +#define X86 new_cpu_data+CPUINFO_x86
2897 +#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
2898 +#define X86_MODEL new_cpu_data+CPUINFO_x86_model
2899 +#define X86_MASK new_cpu_data+CPUINFO_x86_mask
2900 +#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
2901 +#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
2902 +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
2903 +#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
2904 +
2905 +#define VIRT_ENTRY_OFFSET 0x0
2906 +.org VIRT_ENTRY_OFFSET
2907 +ENTRY(startup_32)
2908 + movl %esi,xen_start_info
2909 + cld
2910 +
2911 + /* Set up the stack pointer */
2912 + movl $(init_thread_union+THREAD_SIZE),%esp
2913 +
2914 + /* get vendor info */
2915 + xorl %eax,%eax # call CPUID with 0 -> return vendor ID
2916 + XEN_CPUID
2917 + movl %eax,X86_CPUID # save CPUID level
2918 + movl %ebx,X86_VENDOR_ID # lo 4 chars
2919 + movl %edx,X86_VENDOR_ID+4 # next 4 chars
2920 + movl %ecx,X86_VENDOR_ID+8 # last 4 chars
2921 +
2922 + movl $1,%eax # Use the CPUID instruction to get CPU type
2923 + XEN_CPUID
2924 + movb %al,%cl # save reg for future use
2925 + andb $0x0f,%ah # mask processor family
2926 + movb %ah,X86
2927 + andb $0xf0,%al # mask model
2928 + shrb $4,%al
2929 + movb %al,X86_MODEL
2930 +	andb $0x0f,%cl		# mask off the stepping (mask revision)
2931 + movb %cl,X86_MASK
2932 + movl %edx,X86_CAPABILITY
2933 +
2934 + movb $1,X86_HARD_MATH
2935 +
2936 + xorl %eax,%eax # Clear FS/GS and LDT
2937 + movl %eax,%fs
2938 + movl %eax,%gs
2939 + cld # gcc2 wants the direction flag cleared at all times
2940 +
2941 + pushl %eax # fake return address
2942 + jmp start_kernel
2943 +
2944 +#define HYPERCALL_PAGE_OFFSET 0x1000
2945 +.org HYPERCALL_PAGE_OFFSET
2946 +ENTRY(hypercall_page)
2947 + CFI_STARTPROC
2948 +.skip 0x1000
2949 + CFI_ENDPROC
2950 +
2951 +/*
2952 + * Real beginning of normal "text" segment
2953 + */
2954 +ENTRY(stext)
2955 +ENTRY(_stext)
2956 +
2957 +/*
2958 + * BSS section
2959 + */
2960 +.section ".bss.page_aligned","w"
2961 +ENTRY(empty_zero_page)
2962 + .fill 4096,1,0
2963 +
2964 +/*
2965 + * This starts the data section.
2966 + */
2967 +.data
2968 +
2969 +/*
2970 + * The Global Descriptor Table contains 32 quadwords, per-CPU.
2971 + */
2972 + .align L1_CACHE_BYTES
2973 +ENTRY(cpu_gdt_table)
2974 + .quad 0x0000000000000000 /* NULL descriptor */
2975 + .quad 0x0000000000000000 /* 0x0b reserved */
2976 + .quad 0x0000000000000000 /* 0x13 reserved */
2977 + .quad 0x0000000000000000 /* 0x1b reserved */
2978 + .quad 0x0000000000000000 /* 0x20 unused */
2979 + .quad 0x0000000000000000 /* 0x28 unused */
2980 + .quad 0x0000000000000000 /* 0x33 TLS entry 1 */
2981 + .quad 0x0000000000000000 /* 0x3b TLS entry 2 */
2982 + .quad 0x0000000000000000 /* 0x43 TLS entry 3 */
2983 + .quad 0x0000000000000000 /* 0x4b reserved */
2984 + .quad 0x0000000000000000 /* 0x53 reserved */
2985 + .quad 0x0000000000000000 /* 0x5b reserved */
2986 +
2987 + .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
2988 + .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
2989 + .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
2990 + .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
2991 +
2992 + .quad 0x0000000000000000 /* 0x80 TSS descriptor */
2993 + .quad 0x0000000000000000 /* 0x88 LDT descriptor */
2994 +
2995 + /*
2996 + * Segments used for calling PnP BIOS have byte granularity.
2997 + * The code and data segments have fixed 64k limits,
2998 + * the transfer segment sizes are set at run time.
2999 + */
3000 + .quad 0x0000000000000000 /* 0x90 32-bit code */
3001 + .quad 0x0000000000000000 /* 0x98 16-bit code */
3002 + .quad 0x0000000000000000 /* 0xa0 16-bit data */
3003 + .quad 0x0000000000000000 /* 0xa8 16-bit data */
3004 + .quad 0x0000000000000000 /* 0xb0 16-bit data */
3005 +
3006 + /*
3007 + * The APM segments have byte granularity and their bases
3008 + * are set at run time. All have 64k limits.
3009 + */
3010 + .quad 0x0000000000000000 /* 0xb8 APM CS code */
3011 + .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */
3012 + .quad 0x0000000000000000 /* 0xc8 APM DS data */
3013 +
3014 + .quad 0x0000000000000000 /* 0xd0 - ESPFIX 16-bit SS */
3015 + .quad 0x0000000000000000 /* 0xd8 - unused */
3016 + .quad 0x0000000000000000 /* 0xe0 - unused */
3017 + .quad 0x0000000000000000 /* 0xe8 - unused */
3018 + .quad 0x0000000000000000 /* 0xf0 - unused */
3019 + .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
3020 +
3021 +#if CONFIG_XEN_COMPAT <= 0x030002
3022 +/*
3023 + * __xen_guest information
3024 + */
3025 +.macro utoa value
3026 + .if (\value) < 0 || (\value) >= 0x10
3027 + utoa (((\value)>>4)&0x0fffffff)
3028 + .endif
3029 + .if ((\value) & 0xf) < 10
3030 + .byte '0' + ((\value) & 0xf)
3031 + .else
3032 + .byte 'A' + ((\value) & 0xf) - 10
3033 + .endif
3034 +.endm
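# For illustration: "utoa 0xC0000000" recurses down to the top nibble and then
# emits 'C' followed by seven '0's, i.e. the ASCII string "C0000000" used in
# the __xen_guest section below.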
3035 +
3036 +.section __xen_guest
3037 + .ascii "GUEST_OS=linux,GUEST_VER=2.6"
3038 + .ascii ",XEN_VER=xen-3.0"
3039 + .ascii ",VIRT_BASE=0x"
3040 + utoa __PAGE_OFFSET
3041 + .ascii ",ELF_PADDR_OFFSET=0x"
3042 + utoa __PAGE_OFFSET
3043 + .ascii ",VIRT_ENTRY=0x"
3044 + utoa (__PAGE_OFFSET + __PHYSICAL_START + VIRT_ENTRY_OFFSET)
3045 + .ascii ",HYPERCALL_PAGE=0x"
3046 + utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
3047 + .ascii ",FEATURES=writable_page_tables"
3048 + .ascii "|writable_descriptor_tables"
3049 + .ascii "|auto_translated_physmap"
3050 + .ascii "|pae_pgdir_above_4gb"
3051 + .ascii "|supervisor_mode_kernel"
3052 +#ifdef CONFIG_X86_PAE
3053 + .ascii ",PAE=yes[extended-cr3]"
3054 +#else
3055 + .ascii ",PAE=no"
3056 +#endif
3057 + .ascii ",LOADER=generic"
3058 + .byte 0
3059 +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */
3060 +
3061 +
3062 + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux")
3063 + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6")
3064 + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0")
3065 + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, __PAGE_OFFSET)
3066 +#if CONFIG_XEN_COMPAT <= 0x030002
3067 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, __PAGE_OFFSET)
3068 +#else
3069 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, 0)
3070 +#endif
3071 + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, startup_32)
3072 + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page)
3073 + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START)
3074 + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
3075 +#ifdef CONFIG_X86_PAE
3076 + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes")
3077 + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad, _PAGE_PRESENT,_PAGE_PRESENT)
3078 +#else
3079 + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no")
3080 + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, _PAGE_PRESENT,_PAGE_PRESENT)
3081 +#endif
3082 + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic")
3083 + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1)
3084 Index: head-2008-11-25/arch/x86/kernel/init_task-xen.c
3085 ===================================================================
3086 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
3087 +++ head-2008-11-25/arch/x86/kernel/init_task-xen.c 2007-06-12 13:12:48.000000000 +0200
3088 @@ -0,0 +1,51 @@
3089 +#include <linux/mm.h>
3090 +#include <linux/module.h>
3091 +#include <linux/sched.h>
3092 +#include <linux/init.h>
3093 +#include <linux/init_task.h>
3094 +#include <linux/fs.h>
3095 +#include <linux/mqueue.h>
3096 +
3097 +#include <asm/uaccess.h>
3098 +#include <asm/pgtable.h>
3099 +#include <asm/desc.h>
3100 +
3101 +static struct fs_struct init_fs = INIT_FS;
3102 +static struct files_struct init_files = INIT_FILES;
3103 +static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
3104 +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
3105 +
3106 +#define swapper_pg_dir ((pgd_t *)NULL)
3107 +struct mm_struct init_mm = INIT_MM(init_mm);
3108 +#undef swapper_pg_dir
3109 +
3110 +EXPORT_SYMBOL(init_mm);
3111 +
3112 +/*
3113 + * Initial thread structure.
3114 + *
3115 + * We need to make sure that this is THREAD_SIZE aligned due to the
3116 + * way process stacks are handled. This is done by having a special
3117 + * "init_task" linker map entry..
3118 + */
3119 +union thread_union init_thread_union
3120 + __attribute__((__section__(".data.init_task"))) =
3121 + { INIT_THREAD_INFO(init_task) };
3122 +
3123 +/*
3124 + * Initial task structure.
3125 + *
3126 + * All other task structs will be allocated on slabs in fork.c
3127 + */
3128 +struct task_struct init_task = INIT_TASK(init_task);
3129 +
3130 +EXPORT_SYMBOL(init_task);
3131 +
3132 +#ifndef CONFIG_X86_NO_TSS
3133 +/*
3134 + * per-CPU TSS segments. Threads are completely 'soft' on Linux,
3135 + * no more per-task TSS's.
3136 + */
3137 +DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
3138 +#endif
3139 +
3140 Index: head-2008-11-25/arch/x86/kernel/io_apic_32-xen.c
3141 ===================================================================
3142 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
3143 +++ head-2008-11-25/arch/x86/kernel/io_apic_32-xen.c 2008-11-25 12:22:34.000000000 +0100
3144 @@ -0,0 +1,2776 @@
3145 +/*
3146 + * Intel IO-APIC support for multi-Pentium hosts.
3147 + *
3148 + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
3149 + *
3150 + * Many thanks to Stig Venaas for trying out countless experimental
3151 + * patches and reporting/debugging problems patiently!
3152 + *
3153 + * (c) 1999, Multiple IO-APIC support, developed by
3154 + * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
3155 + * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
3156 + * further tested and cleaned up by Zach Brown <zab@redhat.com>
3157 + * and Ingo Molnar <mingo@redhat.com>
3158 + *
3159 + * Fixes
3160 + * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
3161 + * thanks to Eric Gilmore
3162 + * and Rolf G. Tews
3163 + * for testing these extensively
3164 + * Paul Diefenbaugh : Added full ACPI support
3165 + */
3166 +
3167 +#include <linux/mm.h>
3168 +#include <linux/interrupt.h>
3169 +#include <linux/init.h>
3170 +#include <linux/delay.h>
3171 +#include <linux/sched.h>
3172 +#include <linux/smp_lock.h>
3173 +#include <linux/mc146818rtc.h>
3174 +#include <linux/compiler.h>
3175 +#include <linux/acpi.h>
3176 +#include <linux/module.h>
3177 +#include <linux/sysdev.h>
3178 +
3179 +#include <asm/io.h>
3180 +#include <asm/smp.h>
3181 +#include <asm/desc.h>
3182 +#include <asm/timer.h>
3183 +#include <asm/i8259.h>
3184 +#include <asm/nmi.h>
3185 +
3186 +#include <mach_apic.h>
3187 +
3188 +#include "io_ports.h"
3189 +
3190 +#ifdef CONFIG_XEN
3191 +
3192 +#include <xen/interface/xen.h>
3193 +#include <xen/interface/physdev.h>
3194 +#include <xen/evtchn.h>
3195 +
3196 +/* Fake i8259 */
3197 +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
3198 +#define disable_8259A_irq(_irq) ((void)0)
3199 +#define i8259A_irq_pending(_irq) (0)
3200 +
3201 +unsigned long io_apic_irqs;
3202 +
3203 +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
3204 +{
3205 + struct physdev_apic apic_op;
3206 + int ret;
3207 +
3208 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
3209 + apic_op.reg = reg;
3210 + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
3211 + if (ret)
3212 + return ret;
3213 + return apic_op.value;
3214 +}
3215 +
3216 +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
3217 +{
3218 + struct physdev_apic apic_op;
3219 +
3220 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
3221 + apic_op.reg = reg;
3222 + apic_op.value = value;
3223 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
3224 +}
3225 +
3226 +#define io_apic_read(a,r) xen_io_apic_read(a,r)
3227 +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
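
/*
 * With the two defines above, every io_apic_read()/io_apic_write() in the
 * rest of this file goes through the PHYSDEVOP_apic_read/PHYSDEVOP_apic_write
 * hypercalls rather than touching the IO-APIC registers directly, since those
 * are owned by the hypervisor.
 */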
3228 +
3229 +#endif /* CONFIG_XEN */
3230 +
3231 +int (*ioapic_renumber_irq)(int ioapic, int irq);
3232 +atomic_t irq_mis_count;
3233 +
3234 +/* Where, if anywhere, the i8259 is connected in external int mode */
3235 +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
3236 +
3237 +static DEFINE_SPINLOCK(ioapic_lock);
3238 +static DEFINE_SPINLOCK(vector_lock);
3239 +
3240 +int timer_over_8254 __initdata = 1;
3241 +
3242 +/*
3243 + * Is the SiS APIC rmw bug present ?
3244 + * -1 = don't know, 0 = no, 1 = yes
3245 + */
3246 +int sis_apic_bug = -1;
3247 +
3248 +/*
3249 + * # of IRQ routing registers
3250 + */
3251 +int nr_ioapic_registers[MAX_IO_APICS];
3252 +
3253 +int disable_timer_pin_1 __initdata;
3254 +
3255 +/*
3256 + * Rough estimate of how many shared IRQs there are; this can
3257 + * be changed at any time.
3258 + */
3259 +#define MAX_PLUS_SHARED_IRQS NR_IRQS
3260 +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
3261 +
3262 +/*
3263 + * This is performance-critical, we want to do it O(1)
3264 + *
3265 + * the indexing order of this array favors 1:1 mappings
3266 + * between pins and IRQs.
3267 + */
3268 +
3269 +static struct irq_pin_list {
3270 + int apic, pin, next;
3271 +} irq_2_pin[PIN_MAP_SIZE];
3272 +
3273 +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
3274 +#ifdef CONFIG_PCI_MSI
3275 +#define vector_to_irq(vector) \
3276 + (platform_legacy_irq(vector) ? vector : vector_irq[vector])
3277 +#else
3278 +#define vector_to_irq(vector) (vector)
3279 +#endif
3280 +
3281 +/*
3282 + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
3283 + * shared ISA-space IRQs, so we have to support them. We are super
3284 + * fast in the common case, and fast for shared ISA-space IRQs.
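 *
 * Layout illustration (made-up numbers): an IRQ wired to two pins, say
 * (apic 0, pin 3) and (apic 1, pin 11), ends up as the chain
 *	irq_2_pin[irq] = { .apic = 0, .pin = 3,  .next = N }
 *	irq_2_pin[N]   = { .apic = 1, .pin = 11, .next = 0 }
 * where N is an overflow slot (>= NR_IRQS) handed out by add_pin_to_irq()
 * below and .next == 0 terminates the chain.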
3285 + */
3286 +static void add_pin_to_irq(unsigned int irq, int apic, int pin)
3287 +{
3288 + static int first_free_entry = NR_IRQS;
3289 + struct irq_pin_list *entry = irq_2_pin + irq;
3290 +
3291 + while (entry->next)
3292 + entry = irq_2_pin + entry->next;
3293 +
3294 + if (entry->pin != -1) {
3295 + entry->next = first_free_entry;
3296 + entry = irq_2_pin + entry->next;
3297 + if (++first_free_entry >= PIN_MAP_SIZE)
3298 + panic("io_apic.c: whoops");
3299 + }
3300 + entry->apic = apic;
3301 + entry->pin = pin;
3302 +}
3303 +
3304 +#ifdef CONFIG_XEN
3305 +#define clear_IO_APIC() ((void)0)
3306 +#else
3307 +/*
3308 + * Reroute an IRQ to a different pin.
3309 + */
3310 +static void __init replace_pin_at_irq(unsigned int irq,
3311 + int oldapic, int oldpin,
3312 + int newapic, int newpin)
3313 +{
3314 + struct irq_pin_list *entry = irq_2_pin + irq;
3315 +
3316 + while (1) {
3317 + if (entry->apic == oldapic && entry->pin == oldpin) {
3318 + entry->apic = newapic;
3319 + entry->pin = newpin;
3320 + }
3321 + if (!entry->next)
3322 + break;
3323 + entry = irq_2_pin + entry->next;
3324 + }
3325 +}
3326 +
3327 +static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
3328 +{
3329 + struct irq_pin_list *entry = irq_2_pin + irq;
3330 + unsigned int pin, reg;
3331 +
3332 + for (;;) {
3333 + pin = entry->pin;
3334 + if (pin == -1)
3335 + break;
3336 + reg = io_apic_read(entry->apic, 0x10 + pin*2);
3337 + reg &= ~disable;
3338 + reg |= enable;
3339 + io_apic_modify(entry->apic, 0x10 + pin*2, reg);
3340 + if (!entry->next)
3341 + break;
3342 + entry = irq_2_pin + entry->next;
3343 + }
3344 +}
3345 +
3346 +/* mask = 1 */
3347 +static void __mask_IO_APIC_irq (unsigned int irq)
3348 +{
3349 + __modify_IO_APIC_irq(irq, 0x00010000, 0);
3350 +}
3351 +
3352 +/* mask = 0 */
3353 +static void __unmask_IO_APIC_irq (unsigned int irq)
3354 +{
3355 + __modify_IO_APIC_irq(irq, 0, 0x00010000);
3356 +}
3357 +
3358 +/* mask = 1, trigger = 0 */
3359 +static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
3360 +{
3361 + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
3362 +}
3363 +
3364 +/* mask = 0, trigger = 1 */
3365 +static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
3366 +{
3367 + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
3368 +}
3369 +
3370 +static void mask_IO_APIC_irq (unsigned int irq)
3371 +{
3372 + unsigned long flags;
3373 +
3374 + spin_lock_irqsave(&ioapic_lock, flags);
3375 + __mask_IO_APIC_irq(irq);
3376 + spin_unlock_irqrestore(&ioapic_lock, flags);
3377 +}
3378 +
3379 +static void unmask_IO_APIC_irq (unsigned int irq)
3380 +{
3381 + unsigned long flags;
3382 +
3383 + spin_lock_irqsave(&ioapic_lock, flags);
3384 + __unmask_IO_APIC_irq(irq);
3385 + spin_unlock_irqrestore(&ioapic_lock, flags);
3386 +}
3387 +
3388 +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
3389 +{
3390 + struct IO_APIC_route_entry entry;
3391 + unsigned long flags;
3392 +
3393 + /* Check delivery_mode to be sure we're not clearing an SMI pin */
3394 + spin_lock_irqsave(&ioapic_lock, flags);
3395 + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
3396 + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
3397 + spin_unlock_irqrestore(&ioapic_lock, flags);
3398 + if (entry.delivery_mode == dest_SMI)
3399 + return;
3400 +
3401 + /*
3402 + * Disable it in the IO-APIC irq-routing table:
3403 + */
3404 + memset(&entry, 0, sizeof(entry));
3405 + entry.mask = 1;
3406 + spin_lock_irqsave(&ioapic_lock, flags);
3407 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
3408 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
3409 + spin_unlock_irqrestore(&ioapic_lock, flags);
3410 +}
3411 +
3412 +static void clear_IO_APIC (void)
3413 +{
3414 + int apic, pin;
3415 +
3416 + for (apic = 0; apic < nr_ioapics; apic++)
3417 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
3418 + clear_IO_APIC_pin(apic, pin);
3419 +}
3420 +
3421 +#ifdef CONFIG_SMP
3422 +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
3423 +{
3424 + unsigned long flags;
3425 + int pin;
3426 + struct irq_pin_list *entry = irq_2_pin + irq;
3427 + unsigned int apicid_value;
3428 + cpumask_t tmp;
3429 +
3430 + cpus_and(tmp, cpumask, cpu_online_map);
3431 + if (cpus_empty(tmp))
3432 + tmp = TARGET_CPUS;
3433 +
3434 + cpus_and(cpumask, tmp, CPU_MASK_ALL);
3435 +
3436 + apicid_value = cpu_mask_to_apicid(cpumask);
3437 + /* Prepare to do the io_apic_write */
3438 + apicid_value = apicid_value << 24;
3439 + spin_lock_irqsave(&ioapic_lock, flags);
3440 + for (;;) {
3441 + pin = entry->pin;
3442 + if (pin == -1)
3443 + break;
3444 + io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
3445 + if (!entry->next)
3446 + break;
3447 + entry = irq_2_pin + entry->next;
3448 + }
3449 + set_irq_info(irq, cpumask);
3450 + spin_unlock_irqrestore(&ioapic_lock, flags);
3451 +}
3452 +
3453 +#if defined(CONFIG_IRQBALANCE)
3454 +# include <asm/processor.h> /* kernel_thread() */
3455 +# include <linux/kernel_stat.h> /* kstat */
3456 +# include <linux/slab.h> /* kmalloc() */
3457 +# include <linux/timer.h> /* time_after() */
3458 +
3459 +#ifdef CONFIG_BALANCED_IRQ_DEBUG
3460 +# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
3461 +# define Dprintk(x...) do { TDprintk(x); } while (0)
3462 +# else
3463 +# define TDprintk(x...)
3464 +# define Dprintk(x...)
3465 +# endif
3466 +
3467 +#define IRQBALANCE_CHECK_ARCH -999
3468 +#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
3469 +#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
3470 +#define BALANCED_IRQ_MORE_DELTA (HZ/10)
3471 +#define BALANCED_IRQ_LESS_DELTA (HZ)
3472 +
3473 +static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
3474 +static int physical_balance __read_mostly;
3475 +static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
3476 +
3477 +static struct irq_cpu_info {
3478 + unsigned long * last_irq;
3479 + unsigned long * irq_delta;
3480 + unsigned long irq;
3481 +} irq_cpu_data[NR_CPUS];
3482 +
3483 +#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
3484 +#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
3485 +#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
3486 +
3487 +#define IDLE_ENOUGH(cpu,now) \
3488 + (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
3489 +
3490 +#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
3491 +
3492 +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
3493 +
3494 +static cpumask_t balance_irq_affinity[NR_IRQS] = {
3495 + [0 ... NR_IRQS-1] = CPU_MASK_ALL
3496 +};
3497 +
3498 +void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
3499 +{
3500 + balance_irq_affinity[irq] = mask;
3501 +}
3502 +
3503 +static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
3504 + unsigned long now, int direction)
3505 +{
3506 + int search_idle = 1;
3507 + int cpu = curr_cpu;
3508 +
3509 + goto inside;
3510 +
3511 + do {
3512 + if (unlikely(cpu == curr_cpu))
3513 + search_idle = 0;
3514 +inside:
3515 + if (direction == 1) {
3516 + cpu++;
3517 + if (cpu >= NR_CPUS)
3518 + cpu = 0;
3519 + } else {
3520 + cpu--;
3521 + if (cpu == -1)
3522 + cpu = NR_CPUS-1;
3523 + }
3524 + } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
3525 + (search_idle && !IDLE_ENOUGH(cpu,now)));
3526 +
3527 + return cpu;
3528 +}
3529 +
3530 +static inline void balance_irq(int cpu, int irq)
3531 +{
3532 + unsigned long now = jiffies;
3533 + cpumask_t allowed_mask;
3534 + unsigned int new_cpu;
3535 +
3536 + if (irqbalance_disabled)
3537 + return;
3538 +
3539 + cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
3540 + new_cpu = move(cpu, allowed_mask, now, 1);
3541 + if (cpu != new_cpu) {
3542 + set_pending_irq(irq, cpumask_of_cpu(new_cpu));
3543 + }
3544 +}
3545 +
3546 +static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
3547 +{
3548 + int i, j;
3549 + Dprintk("Rotating IRQs among CPUs.\n");
3550 + for_each_online_cpu(i) {
3551 + for (j = 0; j < NR_IRQS; j++) {
3552 + if (!irq_desc[j].action)
3553 + continue;
3554 + /* Is it a significant load ? */
3555 + if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
3556 + useful_load_threshold)
3557 + continue;
3558 + balance_irq(i, j);
3559 + }
3560 + }
3561 + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
3562 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
3563 + return;
3564 +}
3565 +
3566 +static void do_irq_balance(void)
3567 +{
3568 + int i, j;
3569 + unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
3570 + unsigned long move_this_load = 0;
3571 + int max_loaded = 0, min_loaded = 0;
3572 + int load;
3573 + unsigned long useful_load_threshold = balanced_irq_interval + 10;
3574 + int selected_irq;
3575 + int tmp_loaded, first_attempt = 1;
3576 + unsigned long tmp_cpu_irq;
3577 + unsigned long imbalance = 0;
3578 + cpumask_t allowed_mask, target_cpu_mask, tmp;
3579 +
3580 + for_each_possible_cpu(i) {
3581 + int package_index;
3582 + CPU_IRQ(i) = 0;
3583 + if (!cpu_online(i))
3584 + continue;
3585 + package_index = CPU_TO_PACKAGEINDEX(i);
3586 + for (j = 0; j < NR_IRQS; j++) {
3587 + unsigned long value_now, delta;
3588 + /* Is this an active IRQ? */
3589 + if (!irq_desc[j].action)
3590 + continue;
3591 + if ( package_index == i )
3592 + IRQ_DELTA(package_index,j) = 0;
3593 + /* Determine the total count per processor per IRQ */
3594 + value_now = (unsigned long) kstat_cpu(i).irqs[j];
3595 +
3596 + /* Determine the activity per processor per IRQ */
3597 + delta = value_now - LAST_CPU_IRQ(i,j);
3598 +
3599 + /* Update last_cpu_irq[][] for the next time */
3600 + LAST_CPU_IRQ(i,j) = value_now;
3601 +
3602 + /* Ignore IRQs whose rate is less than the clock */
3603 + if (delta < useful_load_threshold)
3604 + continue;
3605 + /* update the load for the processor or package total */
3606 + IRQ_DELTA(package_index,j) += delta;
3607 +
3608 + /* Keep track of the higher numbered sibling as well */
3609 + if (i != package_index)
3610 + CPU_IRQ(i) += delta;
3611 + /*
3612 + * We have sibling A and sibling B in the package
3613 + *
3614 + * cpu_irq[A] = load for cpu A + load for cpu B
3615 + * cpu_irq[B] = load for cpu B
3616 + */
3617 + CPU_IRQ(package_index) += delta;
3618 + }
3619 + }
3620 + /* Find the least loaded processor package */
3621 + for_each_online_cpu(i) {
3622 + if (i != CPU_TO_PACKAGEINDEX(i))
3623 + continue;
3624 + if (min_cpu_irq > CPU_IRQ(i)) {
3625 + min_cpu_irq = CPU_IRQ(i);
3626 + min_loaded = i;
3627 + }
3628 + }
3629 + max_cpu_irq = ULONG_MAX;
3630 +
3631 +tryanothercpu:
3632 + /* Look for heaviest loaded processor.
3633 + * We may come back to get the next heaviest loaded processor.
3634 + * Skip processors with trivial loads.
3635 + */
3636 + tmp_cpu_irq = 0;
3637 + tmp_loaded = -1;
3638 + for_each_online_cpu(i) {
3639 + if (i != CPU_TO_PACKAGEINDEX(i))
3640 + continue;
3641 + if (max_cpu_irq <= CPU_IRQ(i))
3642 + continue;
3643 + if (tmp_cpu_irq < CPU_IRQ(i)) {
3644 + tmp_cpu_irq = CPU_IRQ(i);
3645 + tmp_loaded = i;
3646 + }
3647 + }
3648 +
3649 + if (tmp_loaded == -1) {
3650 +		/* In the case of a small number of heavy interrupt sources,
3651 +		 * some of the cpus end up loaded too much. We use Ingo's original
3652 +		 * approach to rotate them around.
3653 + */
3654 + if (!first_attempt && imbalance >= useful_load_threshold) {
3655 + rotate_irqs_among_cpus(useful_load_threshold);
3656 + return;
3657 + }
3658 + goto not_worth_the_effort;
3659 + }
3660 +
3661 + first_attempt = 0; /* heaviest search */
3662 + max_cpu_irq = tmp_cpu_irq; /* load */
3663 + max_loaded = tmp_loaded; /* processor */
3664 + imbalance = (max_cpu_irq - min_cpu_irq) / 2;
3665 +
3666 + Dprintk("max_loaded cpu = %d\n", max_loaded);
3667 + Dprintk("min_loaded cpu = %d\n", min_loaded);
3668 + Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq);
3669 + Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq);
3670 + Dprintk("load imbalance = %lu\n", imbalance);
3671 +
3672 +	/* If the imbalance is less than approx 10% of the max load, we
3673 +	 * observe diminishing returns, so quit.
3674 + */
3675 + if (imbalance < (max_cpu_irq >> 3)) {
3676 + Dprintk("Imbalance too trivial\n");
3677 + goto not_worth_the_effort;
3678 + }
3679 +
3680 +tryanotherirq:
3681 + /* if we select an IRQ to move that can't go where we want, then
3682 + * see if there is another one to try.
3683 + */
3684 + move_this_load = 0;
3685 + selected_irq = -1;
3686 + for (j = 0; j < NR_IRQS; j++) {
3687 + /* Is this an active IRQ? */
3688 + if (!irq_desc[j].action)
3689 + continue;
3690 + if (imbalance <= IRQ_DELTA(max_loaded,j))
3691 + continue;
3692 + /* Try to find the IRQ that is closest to the imbalance
3693 + * without going over.
3694 + */
3695 + if (move_this_load < IRQ_DELTA(max_loaded,j)) {
3696 + move_this_load = IRQ_DELTA(max_loaded,j);
3697 + selected_irq = j;
3698 + }
3699 + }
3700 + if (selected_irq == -1) {
3701 + goto tryanothercpu;
3702 + }
3703 +
3704 + imbalance = move_this_load;
3705 +
3706 +	/* For the physical_balance case, we accumulated both load
3707 +	 * values in one of the siblings' cpu_irq[],
3708 + * to use the same code for physical and logical processors
3709 + * as much as possible.
3710 + *
3711 + * NOTE: the cpu_irq[] array holds the sum of the load for
3712 + * sibling A and sibling B in the slot for the lowest numbered
3713 + * sibling (A), _AND_ the load for sibling B in the slot for
3714 + * the higher numbered sibling.
3715 + *
3716 + * We seek the least loaded sibling by making the comparison
3717 + * (A+B)/2 vs B
3718 + */
3719 + load = CPU_IRQ(min_loaded) >> 1;
3720 + for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
3721 + if (load > CPU_IRQ(j)) {
3722 + /* This won't change cpu_sibling_map[min_loaded] */
3723 + load = CPU_IRQ(j);
3724 + min_loaded = j;
3725 + }
3726 + }
3727 +
3728 + cpus_and(allowed_mask,
3729 + cpu_online_map,
3730 + balance_irq_affinity[selected_irq]);
3731 + target_cpu_mask = cpumask_of_cpu(min_loaded);
3732 + cpus_and(tmp, target_cpu_mask, allowed_mask);
3733 +
3734 + if (!cpus_empty(tmp)) {
3735 +
3736 + Dprintk("irq = %d moved to cpu = %d\n",
3737 + selected_irq, min_loaded);
3738 + /* mark for change destination */
3739 + set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
3740 +
3741 + /* Since we made a change, come back sooner to
3742 + * check for more variation.
3743 + */
3744 + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
3745 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
3746 + return;
3747 + }
3748 + goto tryanotherirq;
3749 +
3750 +not_worth_the_effort:
3751 + /*
3752 + * if we did not find an IRQ to move, then adjust the time interval
3753 + * upward
3754 + */
3755 + balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
3756 + balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
3757 + Dprintk("IRQ worth rotating not found\n");
3758 + return;
3759 +}
3760 +
3761 +static int balanced_irq(void *unused)
3762 +{
3763 + int i;
3764 + unsigned long prev_balance_time = jiffies;
3765 + long time_remaining = balanced_irq_interval;
3766 +
3767 + daemonize("kirqd");
3768 +
3769 + /* push everything to CPU 0 to give us a starting point. */
3770 + for (i = 0 ; i < NR_IRQS ; i++) {
3771 + irq_desc[i].pending_mask = cpumask_of_cpu(0);
3772 + set_pending_irq(i, cpumask_of_cpu(0));
3773 + }
3774 +
3775 + for ( ; ; ) {
3776 + time_remaining = schedule_timeout_interruptible(time_remaining);
3777 + try_to_freeze();
3778 + if (time_after(jiffies,
3779 + prev_balance_time+balanced_irq_interval)) {
3780 + preempt_disable();
3781 + do_irq_balance();
3782 + prev_balance_time = jiffies;
3783 + time_remaining = balanced_irq_interval;
3784 + preempt_enable();
3785 + }
3786 + }
3787 + return 0;
3788 +}
3789 +
3790 +static int __init balanced_irq_init(void)
3791 +{
3792 + int i;
3793 + struct cpuinfo_x86 *c;
3794 + cpumask_t tmp;
3795 +
3796 + cpus_shift_right(tmp, cpu_online_map, 2);
3797 + c = &boot_cpu_data;
3798 + /* When not overwritten by the command line ask subarchitecture. */
3799 + if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
3800 + irqbalance_disabled = NO_BALANCE_IRQ;
3801 + if (irqbalance_disabled)
3802 + return 0;
3803 +
3804 + /* disable irqbalance completely if there is only one processor online */
3805 + if (num_online_cpus() < 2) {
3806 + irqbalance_disabled = 1;
3807 + return 0;
3808 + }
3809 + /*
3810 + * Enable physical balance only if more than 1 physical processor
3811 + * is present
3812 + */
3813 + if (smp_num_siblings > 1 && !cpus_empty(tmp))
3814 + physical_balance = 1;
3815 +
3816 + for_each_online_cpu(i) {
3817 + irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
3818 + irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
3819 + if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
3820 + printk(KERN_ERR "balanced_irq_init: out of memory");
3821 + goto failed;
3822 + }
3823 + memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
3824 + memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
3825 + }
3826 +
3827 + printk(KERN_INFO "Starting balanced_irq\n");
3828 + if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0)
3829 + return 0;
3830 + else
3831 + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
3832 +failed:
3833 + for_each_possible_cpu(i) {
3834 + kfree(irq_cpu_data[i].irq_delta);
3835 + irq_cpu_data[i].irq_delta = NULL;
3836 + kfree(irq_cpu_data[i].last_irq);
3837 + irq_cpu_data[i].last_irq = NULL;
3838 + }
3839 + return 0;
3840 +}
3841 +
3842 +int __init irqbalance_disable(char *str)
3843 +{
3844 + irqbalance_disabled = 1;
3845 + return 1;
3846 +}
3847 +
3848 +__setup("noirqbalance", irqbalance_disable);
3849 +
3850 +late_initcall(balanced_irq_init);
3851 +#endif /* CONFIG_IRQBALANCE */
3852 +#endif /* CONFIG_SMP */
3853 +#endif
3854 +
3855 +#ifndef CONFIG_SMP
3856 +void fastcall send_IPI_self(int vector)
3857 +{
3858 +#ifndef CONFIG_XEN
3859 + unsigned int cfg;
3860 +
3861 + /*
3862 + * Wait for idle.
3863 + */
3864 + apic_wait_icr_idle();
3865 + cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
3866 + /*
3867 + * Send the IPI. The write to APIC_ICR fires this off.
3868 + */
3869 + apic_write_around(APIC_ICR, cfg);
3870 +#endif
3871 +}
3872 +#endif /* !CONFIG_SMP */
3873 +
3874 +
3875 +/*
3876 + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
3877 + * specific CPU-side IRQs.
3878 + */
3879 +
3880 +#define MAX_PIRQS 8
3881 +static int pirq_entries [MAX_PIRQS];
3882 +static int pirqs_enabled;
3883 +int skip_ioapic_setup;
3884 +
3885 +static int __init ioapic_setup(char *str)
3886 +{
3887 + skip_ioapic_setup = 1;
3888 + return 1;
3889 +}
3890 +
3891 +__setup("noapic", ioapic_setup);
3892 +
3893 +static int __init ioapic_pirq_setup(char *str)
3894 +{
3895 + int i, max;
3896 + int ints[MAX_PIRQS+1];
3897 +
3898 + get_options(str, ARRAY_SIZE(ints), ints);
3899 +
3900 + for (i = 0; i < MAX_PIRQS; i++)
3901 + pirq_entries[i] = -1;
3902 +
3903 + pirqs_enabled = 1;
3904 + apic_printk(APIC_VERBOSE, KERN_INFO
3905 + "PIRQ redirection, working around broken MP-BIOS.\n");
3906 + max = MAX_PIRQS;
3907 + if (ints[0] < MAX_PIRQS)
3908 + max = ints[0];
3909 +
3910 + for (i = 0; i < max; i++) {
3911 + apic_printk(APIC_VERBOSE, KERN_DEBUG
3912 + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
3913 + /*
3914 + * PIRQs are mapped upside down, usually.
3915 + */
3916 + pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
3917 + }
3918 + return 1;
3919 +}
3920 +
3921 +__setup("pirq=", ioapic_pirq_setup);
3922 +
3923 +/*
3924 + * Find the IRQ entry number of a certain pin.
3925 + */
3926 +static int find_irq_entry(int apic, int pin, int type)
3927 +{
3928 + int i;
3929 +
3930 + for (i = 0; i < mp_irq_entries; i++)
3931 + if (mp_irqs[i].mpc_irqtype == type &&
3932 + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
3933 + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
3934 + mp_irqs[i].mpc_dstirq == pin)
3935 + return i;
3936 +
3937 + return -1;
3938 +}
3939 +
3940 +/*
3941 + * Find the pin to which IRQ[irq] (ISA) is connected
3942 + */
3943 +static int __init find_isa_irq_pin(int irq, int type)
3944 +{
3945 + int i;
3946 +
3947 + for (i = 0; i < mp_irq_entries; i++) {
3948 + int lbus = mp_irqs[i].mpc_srcbus;
3949 +
3950 + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
3951 + mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
3952 + mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
3953 + mp_bus_id_to_type[lbus] == MP_BUS_NEC98
3954 + ) &&
3955 + (mp_irqs[i].mpc_irqtype == type) &&
3956 + (mp_irqs[i].mpc_srcbusirq == irq))
3957 +
3958 + return mp_irqs[i].mpc_dstirq;
3959 + }
3960 + return -1;
3961 +}
3962 +
3963 +static int __init find_isa_irq_apic(int irq, int type)
3964 +{
3965 + int i;
3966 +
3967 + for (i = 0; i < mp_irq_entries; i++) {
3968 + int lbus = mp_irqs[i].mpc_srcbus;
3969 +
3970 + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
3971 + mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
3972 + mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
3973 + mp_bus_id_to_type[lbus] == MP_BUS_NEC98
3974 + ) &&
3975 + (mp_irqs[i].mpc_irqtype == type) &&
3976 + (mp_irqs[i].mpc_srcbusirq == irq))
3977 + break;
3978 + }
3979 + if (i < mp_irq_entries) {
3980 + int apic;
3981 + for(apic = 0; apic < nr_ioapics; apic++) {
3982 + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
3983 + return apic;
3984 + }
3985 + }
3986 +
3987 + return -1;
3988 +}
3989 +
3990 +/*
3991 + * Find a specific PCI IRQ entry.
3992 + * Not an __init, possibly needed by modules
3993 + */
3994 +static int pin_2_irq(int idx, int apic, int pin);
3995 +
3996 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
3997 +{
3998 + int apic, i, best_guess = -1;
3999 +
4000 + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
4001 + "slot:%d, pin:%d.\n", bus, slot, pin);
4002 + if (mp_bus_id_to_pci_bus[bus] == -1) {
4003 + printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
4004 + return -1;
4005 + }
4006 + for (i = 0; i < mp_irq_entries; i++) {
4007 + int lbus = mp_irqs[i].mpc_srcbus;
4008 +
4009 + for (apic = 0; apic < nr_ioapics; apic++)
4010 + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
4011 + mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
4012 + break;
4013 +
4014 + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
4015 + !mp_irqs[i].mpc_irqtype &&
4016 + (bus == lbus) &&
4017 + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
4018 + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
4019 +
4020 + if (!(apic || IO_APIC_IRQ(irq)))
4021 + continue;
4022 +
4023 + if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
4024 + return irq;
4025 + /*
4026 + * Use the first all-but-pin matching entry as a
4027 + * best-guess fuzzy result for broken mptables.
4028 + */
4029 + if (best_guess < 0)
4030 + best_guess = irq;
4031 + }
4032 + }
4033 + return best_guess;
4034 +}
4035 +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
4036 +
4037 +/*
4038 + * This function currently is only a helper for the i386 smp boot process, where
4039 + * we need to reprogram the ioredtbls to cater for the cpus which have come online,
4040 + * so the mask in all cases should simply be TARGET_CPUS.
4041 + */
4042 +#ifdef CONFIG_SMP
4043 +#ifndef CONFIG_XEN
4044 +void __init setup_ioapic_dest(void)
4045 +{
4046 + int pin, ioapic, irq, irq_entry;
4047 +
4048 + if (skip_ioapic_setup == 1)
4049 + return;
4050 +
4051 + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
4052 + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4053 + irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4054 + if (irq_entry == -1)
4055 + continue;
4056 + irq = pin_2_irq(irq_entry, ioapic, pin);
4057 + set_ioapic_affinity_irq(irq, TARGET_CPUS);
4058 + }
4059 +
4060 + }
4061 +}
4062 +#endif /* !CONFIG_XEN */
4063 +#endif
4064 +
4065 +/*
4066 + * EISA Edge/Level control register, ELCR
4067 + */
4068 +static int EISA_ELCR(unsigned int irq)
4069 +{
4070 + if (irq < 16) {
4071 + unsigned int port = 0x4d0 + (irq >> 3);
4072 + return (inb(port) >> (irq & 7)) & 1;
4073 + }
4074 + apic_printk(APIC_VERBOSE, KERN_INFO
4075 + "Broken MPtable reports ISA irq %d\n", irq);
4076 + return 0;
4077 +}
4078 +
4079 +/* EISA interrupts are always polarity zero and can be edge or level
4080 + * trigger depending on the ELCR value. If an interrupt is listed as
4081 + * EISA conforming in the MP table, that means its trigger type must
4082 + * be read in from the ELCR */
4083 +
4084 +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
4085 +#define default_EISA_polarity(idx) (0)
4086 +
4087 +/* ISA interrupts are always polarity zero edge triggered,
4088 + * when listed as conforming in the MP table. */
4089 +
4090 +#define default_ISA_trigger(idx) (0)
4091 +#define default_ISA_polarity(idx) (0)
4092 +
4093 +/* PCI interrupts are always polarity one level triggered,
4094 + * when listed as conforming in the MP table. */
4095 +
4096 +#define default_PCI_trigger(idx) (1)
4097 +#define default_PCI_polarity(idx) (1)
4098 +
4099 +/* MCA interrupts are always polarity zero level triggered,
4100 + * when listed as conforming in the MP table. */
4101 +
4102 +#define default_MCA_trigger(idx) (1)
4103 +#define default_MCA_polarity(idx) (0)
4104 +
4105 +/* NEC98 interrupts are always polarity zero edge triggered,
4106 + * when listed as conforming in the MP table. */
4107 +
4108 +#define default_NEC98_trigger(idx) (0)
4109 +#define default_NEC98_polarity(idx) (0)
4110 +
4111 +static int __init MPBIOS_polarity(int idx)
4112 +{
4113 + int bus = mp_irqs[idx].mpc_srcbus;
4114 + int polarity;
4115 +
4116 + /*
4117 + * Determine IRQ line polarity (high active or low active):
4118 + */
4119 + switch (mp_irqs[idx].mpc_irqflag & 3)
4120 + {
4121 + case 0: /* conforms, ie. bus-type dependent polarity */
4122 + {
4123 + switch (mp_bus_id_to_type[bus])
4124 + {
4125 + case MP_BUS_ISA: /* ISA pin */
4126 + {
4127 + polarity = default_ISA_polarity(idx);
4128 + break;
4129 + }
4130 + case MP_BUS_EISA: /* EISA pin */
4131 + {
4132 + polarity = default_EISA_polarity(idx);
4133 + break;
4134 + }
4135 + case MP_BUS_PCI: /* PCI pin */
4136 + {
4137 + polarity = default_PCI_polarity(idx);
4138 + break;
4139 + }
4140 + case MP_BUS_MCA: /* MCA pin */
4141 + {
4142 + polarity = default_MCA_polarity(idx);
4143 + break;
4144 + }
4145 + case MP_BUS_NEC98: /* NEC 98 pin */
4146 + {
4147 + polarity = default_NEC98_polarity(idx);
4148 + break;
4149 + }
4150 + default:
4151 + {
4152 + printk(KERN_WARNING "broken BIOS!!\n");
4153 + polarity = 1;
4154 + break;
4155 + }
4156 + }
4157 + break;
4158 + }
4159 + case 1: /* high active */
4160 + {
4161 + polarity = 0;
4162 + break;
4163 + }
4164 + case 2: /* reserved */
4165 + {
4166 + printk(KERN_WARNING "broken BIOS!!\n");
4167 + polarity = 1;
4168 + break;
4169 + }
4170 + case 3: /* low active */
4171 + {
4172 + polarity = 1;
4173 + break;
4174 + }
4175 + default: /* invalid */
4176 + {
4177 + printk(KERN_WARNING "broken BIOS!!\n");
4178 + polarity = 1;
4179 + break;
4180 + }
4181 + }
4182 + return polarity;
4183 +}
4184 +
4185 +static int MPBIOS_trigger(int idx)
4186 +{
4187 + int bus = mp_irqs[idx].mpc_srcbus;
4188 + int trigger;
4189 +
4190 + /*
4191 + * Determine IRQ trigger mode (edge or level sensitive):
4192 + */
4193 + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
4194 + {
4195 + case 0: /* conforms, ie. bus-type dependent */
4196 + {
4197 + switch (mp_bus_id_to_type[bus])
4198 + {
4199 + case MP_BUS_ISA: /* ISA pin */
4200 + {
4201 + trigger = default_ISA_trigger(idx);
4202 + break;
4203 + }
4204 + case MP_BUS_EISA: /* EISA pin */
4205 + {
4206 + trigger = default_EISA_trigger(idx);
4207 + break;
4208 + }
4209 + case MP_BUS_PCI: /* PCI pin */
4210 + {
4211 + trigger = default_PCI_trigger(idx);
4212 + break;
4213 + }
4214 + case MP_BUS_MCA: /* MCA pin */
4215 + {
4216 + trigger = default_MCA_trigger(idx);
4217 + break;
4218 + }
4219 + case MP_BUS_NEC98: /* NEC 98 pin */
4220 + {
4221 + trigger = default_NEC98_trigger(idx);
4222 + break;
4223 + }
4224 + default:
4225 + {
4226 + printk(KERN_WARNING "broken BIOS!!\n");
4227 + trigger = 1;
4228 + break;
4229 + }
4230 + }
4231 + break;
4232 + }
4233 + case 1: /* edge */
4234 + {
4235 + trigger = 0;
4236 + break;
4237 + }
4238 + case 2: /* reserved */
4239 + {
4240 + printk(KERN_WARNING "broken BIOS!!\n");
4241 + trigger = 1;
4242 + break;
4243 + }
4244 + case 3: /* level */
4245 + {
4246 + trigger = 1;
4247 + break;
4248 + }
4249 + default: /* invalid */
4250 + {
4251 + printk(KERN_WARNING "broken BIOS!!\n");
4252 + trigger = 0;
4253 + break;
4254 + }
4255 + }
4256 + return trigger;
4257 +}
4258 +
4259 +static inline int irq_polarity(int idx)
4260 +{
4261 + return MPBIOS_polarity(idx);
4262 +}
4263 +
4264 +static inline int irq_trigger(int idx)
4265 +{
4266 + return MPBIOS_trigger(idx);
4267 +}
4268 +
4269 +static int pin_2_irq(int idx, int apic, int pin)
4270 +{
4271 + int irq, i;
4272 + int bus = mp_irqs[idx].mpc_srcbus;
4273 +
4274 + /*
4275 + * Debugging check, we are in big trouble if this message pops up!
4276 + */
4277 + if (mp_irqs[idx].mpc_dstirq != pin)
4278 + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
4279 +
4280 + switch (mp_bus_id_to_type[bus])
4281 + {
4282 + case MP_BUS_ISA: /* ISA pin */
4283 + case MP_BUS_EISA:
4284 + case MP_BUS_MCA:
4285 + case MP_BUS_NEC98:
4286 + {
4287 + irq = mp_irqs[idx].mpc_srcbusirq;
4288 + break;
4289 + }
4290 + case MP_BUS_PCI: /* PCI pin */
4291 + {
4292 + /*
4293 + * PCI IRQs are mapped in order
4294 + */
4295 + i = irq = 0;
4296 + while (i < apic)
4297 + irq += nr_ioapic_registers[i++];
4298 + irq += pin;
4299 +
4300 + /*
4301 + * For MPS mode, so far only needed by ES7000 platform
4302 + */
4303 + if (ioapic_renumber_irq)
4304 + irq = ioapic_renumber_irq(apic, irq);
4305 +
4306 + break;
4307 + }
4308 + default:
4309 + {
4310 + printk(KERN_ERR "unknown bus type %d.\n",bus);
4311 + irq = 0;
4312 + break;
4313 + }
4314 + }
4315 +
4316 + /*
4317 + * PCI IRQ command line redirection. Yes, limits are hardcoded.
4318 + */
4319 + if ((pin >= 16) && (pin <= 23)) {
4320 + if (pirq_entries[pin-16] != -1) {
4321 + if (!pirq_entries[pin-16]) {
4322 + apic_printk(APIC_VERBOSE, KERN_DEBUG
4323 + "disabling PIRQ%d\n", pin-16);
4324 + } else {
4325 + irq = pirq_entries[pin-16];
4326 + apic_printk(APIC_VERBOSE, KERN_DEBUG
4327 + "using PIRQ%d -> IRQ %d\n",
4328 + pin-16, irq);
4329 + }
4330 + }
4331 + }
4332 + return irq;
4333 +}
4334 +
4335 +static inline int IO_APIC_irq_trigger(int irq)
4336 +{
4337 + int apic, idx, pin;
4338 +
4339 + for (apic = 0; apic < nr_ioapics; apic++) {
4340 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
4341 + idx = find_irq_entry(apic,pin,mp_INT);
4342 + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
4343 + return irq_trigger(idx);
4344 + }
4345 + }
4346 + /*
4347 + * nonexistent IRQs are edge default
4348 + */
4349 + return 0;
4350 +}
4351 +
4352 +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
4353 +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
4354 +
4355 +int assign_irq_vector(int irq)
4356 +{
4357 + unsigned long flags;
4358 + int vector;
4359 + struct physdev_irq irq_op;
4360 +
4361 + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
4362 +
4363 + if (irq < PIRQ_BASE || irq - PIRQ_BASE > NR_PIRQS)
4364 + return -EINVAL;
4365 +
4366 + spin_lock_irqsave(&vector_lock, flags);
4367 +
4368 + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
4369 + spin_unlock_irqrestore(&vector_lock, flags);
4370 + return IO_APIC_VECTOR(irq);
4371 + }
4372 +
4373 + irq_op.irq = irq;
4374 + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
4375 + spin_unlock_irqrestore(&vector_lock, flags);
4376 + return -ENOSPC;
4377 + }
4378 +
4379 + vector = irq_op.vector;
4380 + vector_irq[vector] = irq;
4381 + if (irq != AUTO_ASSIGN)
4382 + IO_APIC_VECTOR(irq) = vector;
4383 +
4384 + spin_unlock_irqrestore(&vector_lock, flags);
4385 +
4386 + return vector;
4387 +}
4388 +
4389 +#ifndef CONFIG_XEN
4390 +static struct hw_interrupt_type ioapic_level_type;
4391 +static struct hw_interrupt_type ioapic_edge_type;
4392 +
4393 +#define IOAPIC_AUTO -1
4394 +#define IOAPIC_EDGE 0
4395 +#define IOAPIC_LEVEL 1
4396 +
4397 +static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
4398 +{
4399 + unsigned idx;
4400 +
4401 + idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
4402 +
4403 + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
4404 + trigger == IOAPIC_LEVEL)
4405 + irq_desc[idx].chip = &ioapic_level_type;
4406 + else
4407 + irq_desc[idx].chip = &ioapic_edge_type;
4408 + set_intr_gate(vector, interrupt[idx]);
4409 +}
4410 +#else
4411 +#define ioapic_register_intr(irq, vector, trigger) evtchn_register_pirq(irq)
4412 +#endif
4413 +
4414 +static void __init setup_IO_APIC_irqs(void)
4415 +{
4416 + struct IO_APIC_route_entry entry;
4417 + int apic, pin, idx, irq, first_notcon = 1, vector;
4418 + unsigned long flags;
4419 +
4420 + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
4421 +
4422 + for (apic = 0; apic < nr_ioapics; apic++) {
4423 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
4424 +
4425 + /*
4426 + * add it to the IO-APIC irq-routing table:
4427 + */
4428 + memset(&entry,0,sizeof(entry));
4429 +
4430 + entry.delivery_mode = INT_DELIVERY_MODE;
4431 + entry.dest_mode = INT_DEST_MODE;
4432 + entry.mask = 0; /* enable IRQ */
4433 + entry.dest.logical.logical_dest =
4434 + cpu_mask_to_apicid(TARGET_CPUS);
4435 +
4436 + idx = find_irq_entry(apic,pin,mp_INT);
4437 + if (idx == -1) {
4438 + if (first_notcon) {
4439 + apic_printk(APIC_VERBOSE, KERN_DEBUG
4440 + " IO-APIC (apicid-pin) %d-%d",
4441 + mp_ioapics[apic].mpc_apicid,
4442 + pin);
4443 + first_notcon = 0;
4444 + } else
4445 + apic_printk(APIC_VERBOSE, ", %d-%d",
4446 + mp_ioapics[apic].mpc_apicid, pin);
4447 + continue;
4448 + }
4449 +
4450 + entry.trigger = irq_trigger(idx);
4451 + entry.polarity = irq_polarity(idx);
4452 +
4453 + if (irq_trigger(idx)) {
4454 + entry.trigger = 1;
4455 + entry.mask = 1;
4456 + }
4457 +
4458 + irq = pin_2_irq(idx, apic, pin);
4459 + /*
4460 + * skip adding the timer int on secondary nodes, which causes
4461 + * a small but painful rift in the time-space continuum
4462 + */
4463 + if (multi_timer_check(apic, irq))
4464 + continue;
4465 + else
4466 + add_pin_to_irq(irq, apic, pin);
4467 +
4468 + if (/*!apic &&*/ !IO_APIC_IRQ(irq))
4469 + continue;
4470 +
4471 + if (IO_APIC_IRQ(irq)) {
4472 + vector = assign_irq_vector(irq);
4473 + entry.vector = vector;
4474 + ioapic_register_intr(irq, vector, IOAPIC_AUTO);
4475 +
4476 + if (!apic && (irq < 16))
4477 + disable_8259A_irq(irq);
4478 + }
4479 + spin_lock_irqsave(&ioapic_lock, flags);
4480 + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
4481 + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
4482 + set_native_irq_info(irq, TARGET_CPUS);
4483 + spin_unlock_irqrestore(&ioapic_lock, flags);
4484 + }
4485 + }
4486 +
4487 + if (!first_notcon)
4488 + apic_printk(APIC_VERBOSE, " not connected.\n");
4489 +}
4490 +
4491 +/*
4492 + * Set up the 8259A-master output pin:
4493 + */
4494 +#ifndef CONFIG_XEN
4495 +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
4496 +{
4497 + struct IO_APIC_route_entry entry;
4498 + unsigned long flags;
4499 +
4500 + memset(&entry,0,sizeof(entry));
4501 +
4502 + disable_8259A_irq(0);
4503 +
4504 + /* mask LVT0 */
4505 + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
4506 +
4507 + /*
4508 + * We use logical delivery to get the timer IRQ
4509 + * to the first CPU.
4510 + */
4511 + entry.dest_mode = INT_DEST_MODE;
4512 + entry.mask = 0; /* unmask IRQ now */
4513 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
4514 + entry.delivery_mode = INT_DELIVERY_MODE;
4515 + entry.polarity = 0;
4516 + entry.trigger = 0;
4517 + entry.vector = vector;
4518 +
4519 + /*
4520 + * The timer IRQ doesn't have to know that behind the
4521 + * scene we have a 8259A-master in AEOI mode ...
4522 + */
4523 + irq_desc[0].chip = &ioapic_edge_type;
4524 +
4525 + /*
4526 + * Add it to the IO-APIC irq-routing table:
4527 + */
4528 + spin_lock_irqsave(&ioapic_lock, flags);
4529 + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
4530 + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
4531 + spin_unlock_irqrestore(&ioapic_lock, flags);
4532 +
4533 + enable_8259A_irq(0);
4534 +}
4535 +
4536 +static inline void UNEXPECTED_IO_APIC(void)
4537 +{
4538 +}
4539 +
4540 +void __init print_IO_APIC(void)
4541 +{
4542 + int apic, i;
4543 + union IO_APIC_reg_00 reg_00;
4544 + union IO_APIC_reg_01 reg_01;
4545 + union IO_APIC_reg_02 reg_02;
4546 + union IO_APIC_reg_03 reg_03;
4547 + unsigned long flags;
4548 +
4549 + if (apic_verbosity == APIC_QUIET)
4550 + return;
4551 +
4552 + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
4553 + for (i = 0; i < nr_ioapics; i++)
4554 + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
4555 + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
4556 +
4557 + /*
4558 + * We are a bit conservative about what we expect. We have to
4559 + * know about every hardware change ASAP.
4560 + */
4561 + printk(KERN_INFO "testing the IO APIC.......................\n");
4562 +
4563 + for (apic = 0; apic < nr_ioapics; apic++) {
4564 +
4565 + spin_lock_irqsave(&ioapic_lock, flags);
4566 + reg_00.raw = io_apic_read(apic, 0);
4567 + reg_01.raw = io_apic_read(apic, 1);
4568 + if (reg_01.bits.version >= 0x10)
4569 + reg_02.raw = io_apic_read(apic, 2);
4570 + if (reg_01.bits.version >= 0x20)
4571 + reg_03.raw = io_apic_read(apic, 3);
4572 + spin_unlock_irqrestore(&ioapic_lock, flags);
4573 +
4574 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
4575 + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
4576 + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
4577 + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
4578 + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
4579 + if (reg_00.bits.ID >= get_physical_broadcast())
4580 + UNEXPECTED_IO_APIC();
4581 + if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
4582 + UNEXPECTED_IO_APIC();
4583 +
4584 + printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
4585 + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
4586 + if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
4587 + (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
4588 + (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
4589 + (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
4590 + (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
4591 + (reg_01.bits.entries != 0x2E) &&
4592 + (reg_01.bits.entries != 0x3F)
4593 + )
4594 + UNEXPECTED_IO_APIC();
4595 +
4596 + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
4597 + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
4598 + if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
4599 + (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
4600 + (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
4601 + (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
4602 + (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
4603 + )
4604 + UNEXPECTED_IO_APIC();
4605 + if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
4606 + UNEXPECTED_IO_APIC();
4607 +
4608 + /*
4609 + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
4610 + * but the value of reg_02 is read as the previous read register
4611 + * value, so ignore it if reg_02 == reg_01.
4612 + */
4613 + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
4614 + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
4615 + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
4616 + if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
4617 + UNEXPECTED_IO_APIC();
4618 + }
4619 +
4620 + /*
4621 + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
4622 + * or reg_03, but the value of reg_0[23] is read as the previous read
4623 + * register value, so ignore it if reg_03 == reg_0[12].
4624 + */
4625 + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
4626 + reg_03.raw != reg_01.raw) {
4627 + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
4628 + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
4629 + if (reg_03.bits.__reserved_1)
4630 + UNEXPECTED_IO_APIC();
4631 + }
4632 +
4633 + printk(KERN_DEBUG ".... IRQ redirection table:\n");
4634 +
4635 + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
4636 + " Stat Dest Deli Vect: \n");
4637 +
4638 + for (i = 0; i <= reg_01.bits.entries; i++) {
4639 + struct IO_APIC_route_entry entry;
4640 +
4641 + spin_lock_irqsave(&ioapic_lock, flags);
4642 + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
4643 + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
4644 + spin_unlock_irqrestore(&ioapic_lock, flags);
4645 +
4646 + printk(KERN_DEBUG " %02x %03X %02X ",
4647 + i,
4648 + entry.dest.logical.logical_dest,
4649 + entry.dest.physical.physical_dest
4650 + );
4651 +
4652 + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
4653 + entry.mask,
4654 + entry.trigger,
4655 + entry.irr,
4656 + entry.polarity,
4657 + entry.delivery_status,
4658 + entry.dest_mode,
4659 + entry.delivery_mode,
4660 + entry.vector
4661 + );
4662 + }
4663 + }
4664 + if (use_pci_vector())
4665 + printk(KERN_INFO "Using vector-based indexing\n");
4666 + printk(KERN_DEBUG "IRQ to pin mappings:\n");
4667 + for (i = 0; i < NR_IRQS; i++) {
4668 + struct irq_pin_list *entry = irq_2_pin + i;
4669 + if (entry->pin < 0)
4670 + continue;
4671 + if (use_pci_vector() && !platform_legacy_irq(i))
4672 + printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
4673 + else
4674 + printk(KERN_DEBUG "IRQ%d ", i);
4675 + for (;;) {
4676 + printk("-> %d:%d", entry->apic, entry->pin);
4677 + if (!entry->next)
4678 + break;
4679 + entry = irq_2_pin + entry->next;
4680 + }
4681 + printk("\n");
4682 + }
4683 +
4684 + printk(KERN_INFO ".................................... done.\n");
4685 +
4686 + return;
4687 +}
4688 +
4689 +static void print_APIC_bitfield (int base)
4690 +{
4691 + unsigned int v;
4692 + int i, j;
4693 +
4694 + if (apic_verbosity == APIC_QUIET)
4695 + return;
4696 +
4697 + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
4698 + for (i = 0; i < 8; i++) {
4699 + v = apic_read(base + i*0x10);
4700 + for (j = 0; j < 32; j++) {
4701 + if (v & (1<<j))
4702 + printk("1");
4703 + else
4704 + printk("0");
4705 + }
4706 + printk("\n");
4707 + }
4708 +}
4709 +
4710 +void /*__init*/ print_local_APIC(void * dummy)
4711 +{
4712 + unsigned int v, ver, maxlvt;
4713 +
4714 + if (apic_verbosity == APIC_QUIET)
4715 + return;
4716 +
4717 + printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
4718 + smp_processor_id(), hard_smp_processor_id());
4719 + v = apic_read(APIC_ID);
4720 + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
4721 + v = apic_read(APIC_LVR);
4722 + printk(KERN_INFO "... APIC VERSION: %08x\n", v);
4723 + ver = GET_APIC_VERSION(v);
4724 + maxlvt = get_maxlvt();
4725 +
4726 + v = apic_read(APIC_TASKPRI);
4727 + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
4728 +
4729 + if (APIC_INTEGRATED(ver)) { /* !82489DX */
4730 + v = apic_read(APIC_ARBPRI);
4731 + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
4732 + v & APIC_ARBPRI_MASK);
4733 + v = apic_read(APIC_PROCPRI);
4734 + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
4735 + }
4736 +
4737 + v = apic_read(APIC_EOI);
4738 + printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
4739 + v = apic_read(APIC_RRR);
4740 + printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
4741 + v = apic_read(APIC_LDR);
4742 + printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
4743 + v = apic_read(APIC_DFR);
4744 + printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
4745 + v = apic_read(APIC_SPIV);
4746 + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
4747 +
4748 + printk(KERN_DEBUG "... APIC ISR field:\n");
4749 + print_APIC_bitfield(APIC_ISR);
4750 + printk(KERN_DEBUG "... APIC TMR field:\n");
4751 + print_APIC_bitfield(APIC_TMR);
4752 + printk(KERN_DEBUG "... APIC IRR field:\n");
4753 + print_APIC_bitfield(APIC_IRR);
4754 +
4755 + if (APIC_INTEGRATED(ver)) { /* !82489DX */
4756 + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
4757 + apic_write(APIC_ESR, 0);
4758 + v = apic_read(APIC_ESR);
4759 + printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
4760 + }
4761 +
4762 + v = apic_read(APIC_ICR);
4763 + printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
4764 + v = apic_read(APIC_ICR2);
4765 + printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
4766 +
4767 + v = apic_read(APIC_LVTT);
4768 + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
4769 +
4770 + if (maxlvt > 3) { /* PC is LVT#4. */
4771 + v = apic_read(APIC_LVTPC);
4772 + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
4773 + }
4774 + v = apic_read(APIC_LVT0);
4775 + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
4776 + v = apic_read(APIC_LVT1);
4777 + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
4778 +
4779 + if (maxlvt > 2) { /* ERR is LVT#3. */
4780 + v = apic_read(APIC_LVTERR);
4781 + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
4782 + }
4783 +
4784 + v = apic_read(APIC_TMICT);
4785 + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
4786 + v = apic_read(APIC_TMCCT);
4787 + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
4788 + v = apic_read(APIC_TDCR);
4789 + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
4790 + printk("\n");
4791 +}
4792 +
4793 +void print_all_local_APICs (void)
4794 +{
4795 + on_each_cpu(print_local_APIC, NULL, 1, 1);
4796 +}
4797 +
4798 +void /*__init*/ print_PIC(void)
4799 +{
4800 + unsigned int v;
4801 + unsigned long flags;
4802 +
4803 + if (apic_verbosity == APIC_QUIET)
4804 + return;
4805 +
4806 + printk(KERN_DEBUG "\nprinting PIC contents\n");
4807 +
4808 + spin_lock_irqsave(&i8259A_lock, flags);
4809 +
4810 + v = inb(0xa1) << 8 | inb(0x21);
4811 + printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
4812 +
4813 + v = inb(0xa0) << 8 | inb(0x20);
4814 + printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
4815 +
4816 + outb(0x0b,0xa0);
4817 + outb(0x0b,0x20);
4818 + v = inb(0xa0) << 8 | inb(0x20);
4819 + outb(0x0a,0xa0);
4820 + outb(0x0a,0x20);
4821 +
4822 + spin_unlock_irqrestore(&i8259A_lock, flags);
4823 +
4824 + printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
4825 +
4826 + v = inb(0x4d1) << 8 | inb(0x4d0);
4827 + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
4828 +}
4829 +#endif /* !CONFIG_XEN */
4830 +
4831 +static void __init enable_IO_APIC(void)
4832 +{
4833 + union IO_APIC_reg_01 reg_01;
4834 + int i8259_apic, i8259_pin;
4835 + int i, apic;
4836 + unsigned long flags;
4837 +
4838 + for (i = 0; i < PIN_MAP_SIZE; i++) {
4839 + irq_2_pin[i].pin = -1;
4840 + irq_2_pin[i].next = 0;
4841 + }
4842 + if (!pirqs_enabled)
4843 + for (i = 0; i < MAX_PIRQS; i++)
4844 + pirq_entries[i] = -1;
4845 +
4846 + /*
4847 + * The number of IO-APIC IRQ registers (== #pins):
4848 + */
4849 + for (apic = 0; apic < nr_ioapics; apic++) {
4850 + spin_lock_irqsave(&ioapic_lock, flags);
4851 + reg_01.raw = io_apic_read(apic, 1);
4852 + spin_unlock_irqrestore(&ioapic_lock, flags);
4853 + nr_ioapic_registers[apic] = reg_01.bits.entries+1;
4854 + }
4855 + for(apic = 0; apic < nr_ioapics; apic++) {
4856 + int pin;
4857 + /* See if any of the pins is in ExtINT mode */
4858 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
4859 + struct IO_APIC_route_entry entry;
4860 + spin_lock_irqsave(&ioapic_lock, flags);
4861 + *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
4862 + *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
4863 + spin_unlock_irqrestore(&ioapic_lock, flags);
4864 +
4865 +
4866 + /* If the interrupt line is enabled and in ExtInt mode
4867 + * I have found the pin where the i8259 is connected.
4868 + */
4869 + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
4870 + ioapic_i8259.apic = apic;
4871 + ioapic_i8259.pin = pin;
4872 + goto found_i8259;
4873 + }
4874 + }
4875 + }
4876 + found_i8259:
4877 +	/* Look to see if the MP table has reported the ExtINT */
4878 +	/* If we could not find the appropriate pin by looking at the ioapic,
4879 +	 * the i8259 probably is not connected to the ioapic, but give the
4880 +	 * mptable a chance anyway.
4881 + */
4882 + i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
4883 + i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
4884 + /* Trust the MP table if nothing is setup in the hardware */
4885 + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
4886 + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
4887 + ioapic_i8259.pin = i8259_pin;
4888 + ioapic_i8259.apic = i8259_apic;
4889 + }
4890 + /* Complain if the MP table and the hardware disagree */
4891 + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
4892 + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
4893 + {
4894 + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
4895 + }
4896 +
4897 + /*
4898 + * Do not trust the IO-APIC being empty at bootup
4899 + */
4900 + clear_IO_APIC();
4901 +}
4902 +
4903 +/*
4904 + * Not an __init, needed by the reboot code
4905 + */
4906 +void disable_IO_APIC(void)
4907 +{
4908 + /*
4909 + * Clear the IO-APIC before rebooting:
4910 + */
4911 + clear_IO_APIC();
4912 +
4913 +#ifndef CONFIG_XEN
4914 + /*
4915 + * If the i8259 is routed through an IOAPIC
4916 + * Put that IOAPIC in virtual wire mode
4917 + * so legacy interrupts can be delivered.
4918 + */
4919 + if (ioapic_i8259.pin != -1) {
4920 + struct IO_APIC_route_entry entry;
4921 + unsigned long flags;
4922 +
4923 + memset(&entry, 0, sizeof(entry));
4924 + entry.mask = 0; /* Enabled */
4925 + entry.trigger = 0; /* Edge */
4926 + entry.irr = 0;
4927 + entry.polarity = 0; /* High */
4928 + entry.delivery_status = 0;
4929 + entry.dest_mode = 0; /* Physical */
4930 + entry.delivery_mode = dest_ExtINT; /* ExtInt */
4931 + entry.vector = 0;
4932 + entry.dest.physical.physical_dest =
4933 + GET_APIC_ID(apic_read(APIC_ID));
4934 +
4935 + /*
4936 + * Add it to the IO-APIC irq-routing table:
4937 + */
4938 + spin_lock_irqsave(&ioapic_lock, flags);
4939 + io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
4940 + *(((int *)&entry)+1));
4941 + io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
4942 + *(((int *)&entry)+0));
4943 + spin_unlock_irqrestore(&ioapic_lock, flags);
4944 + }
4945 + disconnect_bsp_APIC(ioapic_i8259.pin != -1);
4946 +#endif
4947 +}
4948 +
4949 +/*
4950 + * function to set the IO-APIC physical IDs based on the
4951 + * values stored in the MPC table.
4952 + *
4953 + * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
4954 + */
4955 +
4956 +#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
4957 +static void __init setup_ioapic_ids_from_mpc(void)
4958 +{
4959 + union IO_APIC_reg_00 reg_00;
4960 + physid_mask_t phys_id_present_map;
4961 + int apic;
4962 + int i;
4963 + unsigned char old_id;
4964 + unsigned long flags;
4965 +
4966 + /*
4967 + * Don't check I/O APIC IDs for xAPIC systems. They have
4968 + * no meaning without the serial APIC bus.
4969 + */
4970 + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
4971 + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
4972 + return;
4973 + /*
4974 + * This is broken; anything with a real cpu count has to
4975 + * circumvent this idiocy regardless.
4976 + */
4977 + phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
4978 +
4979 + /*
4980 + * Set the IOAPIC ID to the value stored in the MPC table.
4981 + */
4982 + for (apic = 0; apic < nr_ioapics; apic++) {
4983 +
4984 + /* Read the register 0 value */
4985 + spin_lock_irqsave(&ioapic_lock, flags);
4986 + reg_00.raw = io_apic_read(apic, 0);
4987 + spin_unlock_irqrestore(&ioapic_lock, flags);
4988 +
4989 + old_id = mp_ioapics[apic].mpc_apicid;
4990 +
4991 + if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
4992 + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
4993 + apic, mp_ioapics[apic].mpc_apicid);
4994 + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
4995 + reg_00.bits.ID);
4996 + mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
4997 + }
4998 +
4999 + /*
5000 + * Sanity check, is the ID really free? Every APIC in a
5001 + * system must have a unique ID or we get lots of nice
5002 + * 'stuck on smp_invalidate_needed IPI wait' messages.
5003 + */
5004 + if (check_apicid_used(phys_id_present_map,
5005 + mp_ioapics[apic].mpc_apicid)) {
5006 + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
5007 + apic, mp_ioapics[apic].mpc_apicid);
5008 + for (i = 0; i < get_physical_broadcast(); i++)
5009 + if (!physid_isset(i, phys_id_present_map))
5010 + break;
5011 + if (i >= get_physical_broadcast())
5012 + panic("Max APIC ID exceeded!\n");
5013 + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
5014 + i);
5015 + physid_set(i, phys_id_present_map);
5016 + mp_ioapics[apic].mpc_apicid = i;
5017 + } else {
5018 + physid_mask_t tmp;
5019 + tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
5020 + apic_printk(APIC_VERBOSE, "Setting %d in the "
5021 + "phys_id_present_map\n",
5022 + mp_ioapics[apic].mpc_apicid);
5023 + physids_or(phys_id_present_map, phys_id_present_map, tmp);
5024 + }
5025 +
5026 +
5027 + /*
5028 + * We need to adjust the IRQ routing table
5029 + * if the ID changed.
5030 + */
5031 + if (old_id != mp_ioapics[apic].mpc_apicid)
5032 + for (i = 0; i < mp_irq_entries; i++)
5033 + if (mp_irqs[i].mpc_dstapic == old_id)
5034 + mp_irqs[i].mpc_dstapic
5035 + = mp_ioapics[apic].mpc_apicid;
5036 +
5037 + /*
5038 + * Read the right value from the MPC table and
5039 + * write it into the ID register.
5040 + */
5041 + apic_printk(APIC_VERBOSE, KERN_INFO
5042 + "...changing IO-APIC physical APIC ID to %d ...",
5043 + mp_ioapics[apic].mpc_apicid);
5044 +
5045 + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
5046 + spin_lock_irqsave(&ioapic_lock, flags);
5047 + io_apic_write(apic, 0, reg_00.raw);
5048 + spin_unlock_irqrestore(&ioapic_lock, flags);
5049 +
5050 + /*
5051 + * Sanity check
5052 + */
5053 + spin_lock_irqsave(&ioapic_lock, flags);
5054 + reg_00.raw = io_apic_read(apic, 0);
5055 + spin_unlock_irqrestore(&ioapic_lock, flags);
5056 + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
5057 + printk("could not set ID!\n");
5058 + else
5059 + apic_printk(APIC_VERBOSE, " ok.\n");
5060 + }
5061 +}
5062 +#else
5063 +static void __init setup_ioapic_ids_from_mpc(void) { }
5064 +#endif
5065 +
5066 +#ifndef CONFIG_XEN
5067 +/*
5068 + * There is a nasty bug in some older SMP boards, their mptable lies
5069 + * about the timer IRQ. We do the following to work around the situation:
5070 + *
5071 + * - timer IRQ defaults to IO-APIC IRQ
5072 + * - if this function detects that timer IRQs are defunct, then we fall
5073 + * back to ISA timer IRQs
5074 + */
5075 +static int __init timer_irq_works(void)
5076 +{
5077 + unsigned long t1 = jiffies;
5078 +
5079 + local_irq_enable();
5080 + /* Let ten ticks pass... */
5081 + mdelay((10 * 1000) / HZ);
5082 +
5083 + /*
5084 + * Expect a few ticks at least, to be sure some possible
5085 +	 * glue logic does not lock up after the first one or two
5086 + * ticks in a non-ExtINT mode. Also the local APIC
5087 + * might have cached one ExtINT interrupt. Finally, at
5088 + * least one tick may be lost due to delays.
5089 + */
5090 + if (jiffies - t1 > 4)
5091 + return 1;
5092 +
5093 + return 0;
5094 +}
5095 +
5096 +/*
5097 + * In the SMP+IOAPIC case it might happen that there are an unspecified
5098 + * number of pending IRQ events unhandled. These cases are very rare,
5099 + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
5100 + * better to do it this way as thus we do not have to be aware of
5101 + * 'pending' interrupts in the IRQ path, except at this point.
5102 + */
5103 +/*
5104 + * Edge triggered needs to resend any interrupt
5105 + * that was delayed but this is now handled in the device
5106 + * independent code.
5107 + */
5108 +
5109 +/*
5110 + * Starting up an edge-triggered IO-APIC interrupt is
5111 + * nasty - we need to make sure that we get the edge.
5112 + * If it is already asserted for some reason, we need to
5113 + * return 1 to indicate that it was pending.
5114 + *
5115 + * This is not complete - we should be able to fake
5116 + * an edge even if it isn't on the 8259A...
5117 + */
5118 +static unsigned int startup_edge_ioapic_irq(unsigned int irq)
5119 +{
5120 + int was_pending = 0;
5121 + unsigned long flags;
5122 +
5123 + spin_lock_irqsave(&ioapic_lock, flags);
5124 + if (irq < 16) {
5125 + disable_8259A_irq(irq);
5126 + if (i8259A_irq_pending(irq))
5127 + was_pending = 1;
5128 + }
5129 + __unmask_IO_APIC_irq(irq);
5130 + spin_unlock_irqrestore(&ioapic_lock, flags);
5131 +
5132 + return was_pending;
5133 +}
5134 +
5135 +/*
5136 + * Once we have recorded IRQ_PENDING already, we can mask the
5137 + * interrupt for real. This prevents IRQ storms from unhandled
5138 + * devices.
5139 + */
5140 +static void ack_edge_ioapic_irq(unsigned int irq)
5141 +{
5142 + move_irq(irq);
5143 + if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
5144 + == (IRQ_PENDING | IRQ_DISABLED))
5145 + mask_IO_APIC_irq(irq);
5146 + ack_APIC_irq();
5147 +}
5148 +
5149 +/*
5150 + * Level triggered interrupts can just be masked,
5151 + * and shutting down and starting up the interrupt
5152 + * is the same as enabling and disabling them -- except
5153 + * that startup needs to return a "was pending" value.
5154 + *
5155 + * Level triggered interrupts are special because we
5156 + * do not touch any IO-APIC register while handling
5157 + * them. We ack the APIC in the end-IRQ handler, not
5158 + * in the start-IRQ-handler. Protection against reentrance
5159 + * from the same interrupt is still provided, both by the
5160 + * generic IRQ layer and by the fact that an unacked local
5161 + * APIC does not accept IRQs.
5162 + */
5163 +static unsigned int startup_level_ioapic_irq (unsigned int irq)
5164 +{
5165 + unmask_IO_APIC_irq(irq);
5166 +
5167 + return 0; /* don't check for pending */
5168 +}
5169 +
5170 +static void end_level_ioapic_irq (unsigned int irq)
5171 +{
5172 + unsigned long v;
5173 + int i;
5174 +
5175 + move_irq(irq);
5176 +/*
5177 + * It appears there is an erratum which affects at least version 0x11
5178 + * of I/O APIC (that's the 82093AA and cores integrated into various
5179 + * chipsets). Under certain conditions a level-triggered interrupt is
5180 + * erroneously delivered as edge-triggered one but the respective IRR
5181 + * bit gets set nevertheless. As a result the I/O unit expects an EOI
5182 + * message but it will never arrive and further interrupts are blocked
5183 + * from the source. The exact reason is so far unknown, but the
5184 + * phenomenon was observed when two consecutive interrupt requests
5185 + * from a given source get delivered to the same CPU and the source is
5186 + * temporarily disabled in between.
5187 + *
5188 + * A workaround is to simulate an EOI message manually. We achieve it
5189 + * by setting the trigger mode to edge and then to level when the edge
5190 + * trigger mode gets detected in the TMR of a local APIC for a
5191 + * level-triggered interrupt. We mask the source for the time of the
5192 + * operation to prevent an edge-triggered interrupt escaping meanwhile.
5193 + * The idea is from Manfred Spraul. --macro
5194 + */
5195 + i = IO_APIC_VECTOR(irq);
5196 +
5197 + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
5198 +
5199 + ack_APIC_irq();
5200 +
5201 + if (!(v & (1 << (i & 0x1f)))) {
5202 + atomic_inc(&irq_mis_count);
5203 + spin_lock(&ioapic_lock);
5204 + __mask_and_edge_IO_APIC_irq(irq);
5205 + __unmask_and_level_IO_APIC_irq(irq);
5206 + spin_unlock(&ioapic_lock);
5207 + }
5208 +}
5209 +
5210 +#ifdef CONFIG_PCI_MSI
5211 +static unsigned int startup_edge_ioapic_vector(unsigned int vector)
5212 +{
5213 + int irq = vector_to_irq(vector);
5214 +
5215 + return startup_edge_ioapic_irq(irq);
5216 +}
5217 +
5218 +static void ack_edge_ioapic_vector(unsigned int vector)
5219 +{
5220 + int irq = vector_to_irq(vector);
5221 +
5222 + move_native_irq(vector);
5223 + ack_edge_ioapic_irq(irq);
5224 +}
5225 +
5226 +static unsigned int startup_level_ioapic_vector (unsigned int vector)
5227 +{
5228 + int irq = vector_to_irq(vector);
5229 +
5230 + return startup_level_ioapic_irq (irq);
5231 +}
5232 +
5233 +static void end_level_ioapic_vector (unsigned int vector)
5234 +{
5235 + int irq = vector_to_irq(vector);
5236 +
5237 + move_native_irq(vector);
5238 + end_level_ioapic_irq(irq);
5239 +}
5240 +
5241 +static void mask_IO_APIC_vector (unsigned int vector)
5242 +{
5243 + int irq = vector_to_irq(vector);
5244 +
5245 + mask_IO_APIC_irq(irq);
5246 +}
5247 +
5248 +static void unmask_IO_APIC_vector (unsigned int vector)
5249 +{
5250 + int irq = vector_to_irq(vector);
5251 +
5252 + unmask_IO_APIC_irq(irq);
5253 +}
5254 +
5255 +#ifdef CONFIG_SMP
5256 +static void set_ioapic_affinity_vector (unsigned int vector,
5257 + cpumask_t cpu_mask)
5258 +{
5259 + int irq = vector_to_irq(vector);
5260 +
5261 + set_native_irq_info(vector, cpu_mask);
5262 + set_ioapic_affinity_irq(irq, cpu_mask);
5263 +}
5264 +#endif
5265 +#endif
5266 +
5267 +static int ioapic_retrigger(unsigned int irq)
5268 +{
5269 + send_IPI_self(IO_APIC_VECTOR(irq));
5270 +
5271 + return 1;
5272 +}
5273 +
5274 +/*
5275 + * Level and edge triggered IO-APIC interrupts need different handling,
5276 + * so we use two separate IRQ descriptors. Edge triggered IRQs can be
5277 + * handled with the level-triggered descriptor, but that one has slightly
5278 + * more overhead. Level-triggered interrupts cannot be handled with the
5279 + * edge-triggered handler, without risking IRQ storms and other ugly
5280 + * races.
5281 + */
5282 +static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
5283 + .typename = "IO-APIC-edge",
5284 + .startup = startup_edge_ioapic,
5285 + .shutdown = shutdown_edge_ioapic,
5286 + .enable = enable_edge_ioapic,
5287 + .disable = disable_edge_ioapic,
5288 + .ack = ack_edge_ioapic,
5289 + .end = end_edge_ioapic,
5290 +#ifdef CONFIG_SMP
5291 + .set_affinity = set_ioapic_affinity,
5292 +#endif
5293 + .retrigger = ioapic_retrigger,
5294 +};
5295 +
5296 +static struct hw_interrupt_type ioapic_level_type __read_mostly = {
5297 + .typename = "IO-APIC-level",
5298 + .startup = startup_level_ioapic,
5299 + .shutdown = shutdown_level_ioapic,
5300 + .enable = enable_level_ioapic,
5301 + .disable = disable_level_ioapic,
5302 + .ack = mask_and_ack_level_ioapic,
5303 + .end = end_level_ioapic,
5304 +#ifdef CONFIG_SMP
5305 + .set_affinity = set_ioapic_affinity,
5306 +#endif
5307 + .retrigger = ioapic_retrigger,
5308 +};
5309 +#endif /* !CONFIG_XEN */
5310 +
5311 +static inline void init_IO_APIC_traps(void)
5312 +{
5313 + int irq;
5314 +
5315 + /*
5316 + * NOTE! The local APIC isn't very good at handling
5317 + * multiple interrupts at the same interrupt level.
5318 + * As the interrupt level is determined by taking the
5319 + * vector number and shifting that right by 4, we
5320 + * want to spread these out a bit so that they don't
5321 + * all fall in the same interrupt level.
5322 + *
5323 + * Also, we've got to be careful not to trash gate
5324 + * 0x80, because int 0x80 is hm, kind of importantish. ;)
5325 + */
5326 + for (irq = 0; irq < NR_IRQS ; irq++) {
5327 + int tmp = irq;
5328 + if (use_pci_vector()) {
5329 + if (!platform_legacy_irq(tmp))
5330 + if ((tmp = vector_to_irq(tmp)) == -1)
5331 + continue;
5332 + }
5333 + if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
5334 + /*
5335 + * Hmm.. We don't have an entry for this,
5336 + * so default to an old-fashioned 8259
5337 + * interrupt if we can..
5338 + */
5339 + if (irq < 16)
5340 + make_8259A_irq(irq);
5341 +#ifndef CONFIG_XEN
5342 + else
5343 + /* Strange. Oh, well.. */
5344 + irq_desc[irq].chip = &no_irq_type;
5345 +#endif
5346 + }
5347 + }
5348 +}
5349 +
5350 +#ifndef CONFIG_XEN
5351 +static void enable_lapic_irq (unsigned int irq)
5352 +{
5353 + unsigned long v;
5354 +
5355 + v = apic_read(APIC_LVT0);
5356 + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
5357 +}
5358 +
5359 +static void disable_lapic_irq (unsigned int irq)
5360 +{
5361 + unsigned long v;
5362 +
5363 + v = apic_read(APIC_LVT0);
5364 + apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
5365 +}
5366 +
5367 +static void ack_lapic_irq (unsigned int irq)
5368 +{
5369 + ack_APIC_irq();
5370 +}
5371 +
5372 +static void end_lapic_irq (unsigned int i) { /* nothing */ }
5373 +
5374 +static struct hw_interrupt_type lapic_irq_type __read_mostly = {
5375 + .typename = "local-APIC-edge",
5376 + .startup = NULL, /* startup_irq() not used for IRQ0 */
5377 + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
5378 + .enable = enable_lapic_irq,
5379 + .disable = disable_lapic_irq,
5380 + .ack = ack_lapic_irq,
5381 + .end = end_lapic_irq
5382 +};
5383 +
5384 +static void setup_nmi (void)
5385 +{
5386 + /*
5387 + * Dirty trick to enable the NMI watchdog ...
5388 + * We put the 8259A master into AEOI mode and
5389 + * unmask on all local APICs LVT0 as NMI.
5390 + *
5391 + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
5392 + * is from Maciej W. Rozycki - so we do not have to EOI from
5393 + * the NMI handler or the timer interrupt.
5394 + */
5395 + apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
5396 +
5397 + on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
5398 +
5399 + apic_printk(APIC_VERBOSE, " done.\n");
5400 +}
5401 +
5402 +/*
5403 + * This looks a bit hackish but it's about the only way of sending
5404 + * a few INTA cycles to 8259As and any associated glue logic. ICR does
5405 + * not support the ExtINT mode, unfortunately. We need to send these
5406 + * cycles as some i82489DX-based boards have glue logic that keeps the
5407 + * 8259A interrupt line asserted until INTA. --macro
5408 + */
5409 +static inline void unlock_ExtINT_logic(void)
5410 +{
5411 + int apic, pin, i;
5412 + struct IO_APIC_route_entry entry0, entry1;
5413 + unsigned char save_control, save_freq_select;
5414 + unsigned long flags;
5415 +
5416 + pin = find_isa_irq_pin(8, mp_INT);
5417 + apic = find_isa_irq_apic(8, mp_INT);
5418 + if (pin == -1)
5419 + return;
5420 +
5421 + spin_lock_irqsave(&ioapic_lock, flags);
5422 + *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
5423 + *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
5424 + spin_unlock_irqrestore(&ioapic_lock, flags);
5425 + clear_IO_APIC_pin(apic, pin);
5426 +
5427 + memset(&entry1, 0, sizeof(entry1));
5428 +
5429 + entry1.dest_mode = 0; /* physical delivery */
5430 + entry1.mask = 0; /* unmask IRQ now */
5431 + entry1.dest.physical.physical_dest = hard_smp_processor_id();
5432 + entry1.delivery_mode = dest_ExtINT;
5433 + entry1.polarity = entry0.polarity;
5434 + entry1.trigger = 0;
5435 + entry1.vector = 0;
5436 +
5437 + spin_lock_irqsave(&ioapic_lock, flags);
5438 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
5439 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
5440 + spin_unlock_irqrestore(&ioapic_lock, flags);
5441 +
5442 + save_control = CMOS_READ(RTC_CONTROL);
5443 + save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
5444 + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
5445 + RTC_FREQ_SELECT);
5446 + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
5447 +
5448 + i = 100;
5449 + while (i-- > 0) {
5450 + mdelay(10);
5451 + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
5452 + i -= 10;
5453 + }
5454 +
5455 + CMOS_WRITE(save_control, RTC_CONTROL);
5456 + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
5457 + clear_IO_APIC_pin(apic, pin);
5458 +
5459 + spin_lock_irqsave(&ioapic_lock, flags);
5460 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
5461 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
5462 + spin_unlock_irqrestore(&ioapic_lock, flags);
5463 +}
5464 +
5465 +int timer_uses_ioapic_pin_0;
5466 +
5467 +/*
5468 + * This code may look a bit paranoid, but it's supposed to cooperate with
5469 + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
5470 + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
5471 + * fanatically on his truly buggy board.
5472 + */
5473 +static inline void check_timer(void)
5474 +{
5475 + int apic1, pin1, apic2, pin2;
5476 + int vector;
5477 +
5478 + /*
5479 + * get/set the timer IRQ vector:
5480 + */
5481 + disable_8259A_irq(0);
5482 + vector = assign_irq_vector(0);
5483 + set_intr_gate(vector, interrupt[0]);
5484 +
5485 + /*
5486 + * Subtle, code in do_timer_interrupt() expects an AEOI
5487 + * mode for the 8259A whenever interrupts are routed
5488 + * through I/O APICs. Also IRQ0 has to be enabled in
5489 + * the 8259A which implies the virtual wire has to be
5490 + * disabled in the local APIC.
5491 + */
5492 + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
5493 + init_8259A(1);
5494 + timer_ack = 1;
5495 + if (timer_over_8254 > 0)
5496 + enable_8259A_irq(0);
5497 +
5498 + pin1 = find_isa_irq_pin(0, mp_INT);
5499 + apic1 = find_isa_irq_apic(0, mp_INT);
5500 + pin2 = ioapic_i8259.pin;
5501 + apic2 = ioapic_i8259.apic;
5502 +
5503 + if (pin1 == 0)
5504 + timer_uses_ioapic_pin_0 = 1;
5505 +
5506 + printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
5507 + vector, apic1, pin1, apic2, pin2);
5508 +
5509 + if (pin1 != -1) {
5510 + /*
5511 + * Ok, does IRQ0 through the IOAPIC work?
5512 + */
5513 + unmask_IO_APIC_irq(0);
5514 + if (timer_irq_works()) {
5515 + if (nmi_watchdog == NMI_IO_APIC) {
5516 + disable_8259A_irq(0);
5517 + setup_nmi();
5518 + enable_8259A_irq(0);
5519 + }
5520 + if (disable_timer_pin_1 > 0)
5521 + clear_IO_APIC_pin(0, pin1);
5522 + return;
5523 + }
5524 + clear_IO_APIC_pin(apic1, pin1);
5525 + printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
5526 + "IO-APIC\n");
5527 + }
5528 +
5529 + printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
5530 + if (pin2 != -1) {
5531 + printk("\n..... (found pin %d) ...", pin2);
5532 + /*
5533 + * legacy devices should be connected to IO APIC #0
5534 + */
5535 + setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
5536 + if (timer_irq_works()) {
5537 + printk("works.\n");
5538 + if (pin1 != -1)
5539 + replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
5540 + else
5541 + add_pin_to_irq(0, apic2, pin2);
5542 + if (nmi_watchdog == NMI_IO_APIC) {
5543 + setup_nmi();
5544 + }
5545 + return;
5546 + }
5547 + /*
5548 + * Cleanup, just in case ...
5549 + */
5550 + clear_IO_APIC_pin(apic2, pin2);
5551 + }
5552 + printk(" failed.\n");
5553 +
5554 + if (nmi_watchdog == NMI_IO_APIC) {
5555 + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
5556 + nmi_watchdog = 0;
5557 + }
5558 +
5559 + printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
5560 +
5561 + disable_8259A_irq(0);
5562 + irq_desc[0].chip = &lapic_irq_type;
5563 + apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
5564 + enable_8259A_irq(0);
5565 +
5566 + if (timer_irq_works()) {
5567 + printk(" works.\n");
5568 + return;
5569 + }
5570 + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
5571 + printk(" failed.\n");
5572 +
5573 + printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
5574 +
5575 + timer_ack = 0;
5576 + init_8259A(0);
5577 + make_8259A_irq(0);
5578 + apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
5579 +
5580 + unlock_ExtINT_logic();
5581 +
5582 + if (timer_irq_works()) {
5583 + printk(" works.\n");
5584 + return;
5585 + }
5586 + printk(" failed :(.\n");
5587 + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
5588 + "report. Then try booting with the 'noapic' option");
5589 +}
5590 +#else
5591 +int timer_uses_ioapic_pin_0 = 0;
5592 +#define check_timer() ((void)0)
5593 +#endif
5594 +
5595 +/*
5596 + *
5597 + * IRQ's that are handled by the PIC in the MPS IOAPIC case.
5598 + * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
5599 + * Linux doesn't really care, as it's not actually used
5600 + * for any interrupt handling anyway.
5601 + */
5602 +#define PIC_IRQS (1 << PIC_CASCADE_IR)
5603 +
5604 +void __init setup_IO_APIC(void)
5605 +{
5606 + enable_IO_APIC();
5607 +
5608 + if (acpi_ioapic)
5609 + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
5610 + else
5611 + io_apic_irqs = ~PIC_IRQS;
5612 +
5613 + printk("ENABLING IO-APIC IRQs\n");
5614 +
5615 + /*
5616 + * Set up IO-APIC IRQ routing.
5617 + */
5618 + if (!acpi_ioapic)
5619 + setup_ioapic_ids_from_mpc();
5620 +#ifndef CONFIG_XEN
5621 + sync_Arb_IDs();
5622 +#endif
5623 + setup_IO_APIC_irqs();
5624 + init_IO_APIC_traps();
5625 + check_timer();
5626 + if (!acpi_ioapic)
5627 + print_IO_APIC();
5628 +}
5629 +
5630 +static int __init setup_disable_8254_timer(char *s)
5631 +{
5632 + timer_over_8254 = -1;
5633 + return 1;
5634 +}
5635 +static int __init setup_enable_8254_timer(char *s)
5636 +{
5637 + timer_over_8254 = 2;
5638 + return 1;
5639 +}
5640 +
5641 +__setup("disable_8254_timer", setup_disable_8254_timer);
5642 +__setup("enable_8254_timer", setup_enable_8254_timer);
5643 +
5644 +/*
5645 + * Called after all the initialization is done. If we didn't find any
5646 + * APIC bugs then we can allow the modify fast path
5647 + */
5648 +
5649 +static int __init io_apic_bug_finalize(void)
5650 +{
5651 + if(sis_apic_bug == -1)
5652 + sis_apic_bug = 0;
5653 + if (is_initial_xendomain()) {
5654 + struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
5655 + op.u.platform_quirk.quirk_id = sis_apic_bug ?
5656 + QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL;
5657 + VOID(HYPERVISOR_platform_op(&op));
5658 + }
5659 + return 0;
5660 +}
5661 +
5662 +late_initcall(io_apic_bug_finalize);
5663 +
5664 +struct sysfs_ioapic_data {
5665 + struct sys_device dev;
5666 + struct IO_APIC_route_entry entry[0];
5667 +};
5668 +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
5669 +
5670 +static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
5671 +{
5672 + struct IO_APIC_route_entry *entry;
5673 + struct sysfs_ioapic_data *data;
5674 + unsigned long flags;
5675 + int i;
5676 +
5677 + data = container_of(dev, struct sysfs_ioapic_data, dev);
5678 + entry = data->entry;
5679 + spin_lock_irqsave(&ioapic_lock, flags);
5680 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
5681 + *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
5682 + *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
5683 + }
5684 + spin_unlock_irqrestore(&ioapic_lock, flags);
5685 +
5686 + return 0;
5687 +}
5688 +
5689 +static int ioapic_resume(struct sys_device *dev)
5690 +{
5691 + struct IO_APIC_route_entry *entry;
5692 + struct sysfs_ioapic_data *data;
5693 + unsigned long flags;
5694 + union IO_APIC_reg_00 reg_00;
5695 + int i;
5696 +
5697 + data = container_of(dev, struct sysfs_ioapic_data, dev);
5698 + entry = data->entry;
5699 +
5700 + spin_lock_irqsave(&ioapic_lock, flags);
5701 + reg_00.raw = io_apic_read(dev->id, 0);
5702 + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
5703 + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
5704 + io_apic_write(dev->id, 0, reg_00.raw);
5705 + }
5706 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
5707 + io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
5708 + io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
5709 + }
5710 + spin_unlock_irqrestore(&ioapic_lock, flags);
5711 +
5712 + return 0;
5713 +}
5714 +
5715 +static struct sysdev_class ioapic_sysdev_class = {
5716 + set_kset_name("ioapic"),
5717 +#ifndef CONFIG_XEN
5718 + .suspend = ioapic_suspend,
5719 + .resume = ioapic_resume,
5720 +#endif
5721 +};
5722 +
5723 +static int __init ioapic_init_sysfs(void)
5724 +{
5725 + struct sys_device * dev;
5726 + int i, size, error = 0;
5727 +
5728 + error = sysdev_class_register(&ioapic_sysdev_class);
5729 + if (error)
5730 + return error;
5731 +
5732 + for (i = 0; i < nr_ioapics; i++ ) {
5733 + size = sizeof(struct sys_device) + nr_ioapic_registers[i]
5734 + * sizeof(struct IO_APIC_route_entry);
5735 + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
5736 + if (!mp_ioapic_data[i]) {
5737 + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
5738 + continue;
5739 + }
5740 + memset(mp_ioapic_data[i], 0, size);
5741 + dev = &mp_ioapic_data[i]->dev;
5742 + dev->id = i;
5743 + dev->cls = &ioapic_sysdev_class;
5744 + error = sysdev_register(dev);
5745 + if (error) {
5746 + kfree(mp_ioapic_data[i]);
5747 + mp_ioapic_data[i] = NULL;
5748 + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
5749 + continue;
5750 + }
5751 + }
5752 +
5753 + return 0;
5754 +}
5755 +
5756 +device_initcall(ioapic_init_sysfs);
5757 +
5758 +/* --------------------------------------------------------------------------
5759 + ACPI-based IOAPIC Configuration
5760 + -------------------------------------------------------------------------- */
5761 +
5762 +#ifdef CONFIG_ACPI
5763 +
5764 +int __init io_apic_get_unique_id (int ioapic, int apic_id)
5765 +{
5766 +#ifndef CONFIG_XEN
5767 + union IO_APIC_reg_00 reg_00;
5768 + static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
5769 + physid_mask_t tmp;
5770 + unsigned long flags;
5771 + int i = 0;
5772 +
5773 + /*
5774 + * The P4 platform supports up to 256 APIC IDs on two separate APIC
5775 + * buses (one for LAPICs, one for IOAPICs), where its predecessors only
5776 + * support up to 16 on one shared APIC bus.
5777 + *
5778 + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
5779 + * advantage of new APIC bus architecture.
5780 + */
5781 +
5782 + if (physids_empty(apic_id_map))
5783 + apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
5784 +
5785 + spin_lock_irqsave(&ioapic_lock, flags);
5786 + reg_00.raw = io_apic_read(ioapic, 0);
5787 + spin_unlock_irqrestore(&ioapic_lock, flags);
5788 +
5789 + if (apic_id >= get_physical_broadcast()) {
5790 + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
5791 + "%d\n", ioapic, apic_id, reg_00.bits.ID);
5792 + apic_id = reg_00.bits.ID;
5793 + }
5794 +
5795 + /*
5796 + * Every APIC in a system must have a unique ID or we get lots of nice
5797 + * 'stuck on smp_invalidate_needed IPI wait' messages.
5798 + */
5799 + if (check_apicid_used(apic_id_map, apic_id)) {
5800 +
5801 + for (i = 0; i < get_physical_broadcast(); i++) {
5802 + if (!check_apicid_used(apic_id_map, i))
5803 + break;
5804 + }
5805 +
5806 + if (i == get_physical_broadcast())
5807 + panic("Max apic_id exceeded!\n");
5808 +
5809 + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
5810 + "trying %d\n", ioapic, apic_id, i);
5811 +
5812 + apic_id = i;
5813 + }
5814 +
5815 + tmp = apicid_to_cpu_present(apic_id);
5816 + physids_or(apic_id_map, apic_id_map, tmp);
5817 +
5818 + if (reg_00.bits.ID != apic_id) {
5819 + reg_00.bits.ID = apic_id;
5820 +
5821 + spin_lock_irqsave(&ioapic_lock, flags);
5822 + io_apic_write(ioapic, 0, reg_00.raw);
5823 + reg_00.raw = io_apic_read(ioapic, 0);
5824 + spin_unlock_irqrestore(&ioapic_lock, flags);
5825 +
5826 + /* Sanity check */
5827 + if (reg_00.bits.ID != apic_id) {
5828 + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
5829 + return -1;
5830 + }
5831 + }
5832 +
5833 + apic_printk(APIC_VERBOSE, KERN_INFO
5834 + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
5835 +#endif /* !CONFIG_XEN */
5836 +
5837 + return apic_id;
5838 +}
5839 +
5840 +
5841 +int __init io_apic_get_version (int ioapic)
5842 +{
5843 + union IO_APIC_reg_01 reg_01;
5844 + unsigned long flags;
5845 +
5846 + spin_lock_irqsave(&ioapic_lock, flags);
5847 + reg_01.raw = io_apic_read(ioapic, 1);
5848 + spin_unlock_irqrestore(&ioapic_lock, flags);
5849 +
5850 + return reg_01.bits.version;
5851 +}
5852 +
5853 +
5854 +int __init io_apic_get_redir_entries (int ioapic)
5855 +{
5856 + union IO_APIC_reg_01 reg_01;
5857 + unsigned long flags;
5858 +
5859 + spin_lock_irqsave(&ioapic_lock, flags);
5860 + reg_01.raw = io_apic_read(ioapic, 1);
5861 + spin_unlock_irqrestore(&ioapic_lock, flags);
5862 +
5863 + return reg_01.bits.entries;
5864 +}
5865 +
5866 +
5867 +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
5868 +{
5869 + struct IO_APIC_route_entry entry;
5870 + unsigned long flags;
5871 +
5872 + if (!IO_APIC_IRQ(irq)) {
5873 + printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
5874 + ioapic);
5875 + return -EINVAL;
5876 + }
5877 +
5878 + /*
5879 + * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
5880 + * Note that we mask (disable) IRQs now -- these get enabled when the
5881 + * corresponding device driver registers for this IRQ.
5882 + */
5883 +
5884 + memset(&entry,0,sizeof(entry));
5885 +
5886 + entry.delivery_mode = INT_DELIVERY_MODE;
5887 + entry.dest_mode = INT_DEST_MODE;
5888 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
5889 + entry.trigger = edge_level;
5890 + entry.polarity = active_high_low;
5891 + entry.mask = 1;
5892 +
5893 + /*
5894 + * IRQs < 16 are already in the irq_2_pin[] map
5895 + */
5896 + if (irq >= 16)
5897 + add_pin_to_irq(irq, ioapic, pin);
5898 +
5899 + entry.vector = assign_irq_vector(irq);
5900 +
5901 + apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
5902 + "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
5903 + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
5904 + edge_level, active_high_low);
5905 +
5906 + ioapic_register_intr(irq, entry.vector, edge_level);
5907 +
5908 + if (!ioapic && (irq < 16))
5909 + disable_8259A_irq(irq);
5910 +
5911 + spin_lock_irqsave(&ioapic_lock, flags);
5912 + io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
5913 + io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
5914 + set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
5915 + spin_unlock_irqrestore(&ioapic_lock, flags);
5916 +
5917 + return 0;
5918 +}
5919 +
5920 +#endif /* CONFIG_ACPI */
5921 Index: head-2008-11-25/arch/x86/kernel/ioport_32-xen.c
5922 ===================================================================
5923 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
5924 +++ head-2008-11-25/arch/x86/kernel/ioport_32-xen.c 2008-01-28 12:24:19.000000000 +0100
5925 @@ -0,0 +1,123 @@
5926 +/*
5927 + * linux/arch/i386/kernel/ioport.c
5928 + *
5929 + * This contains the io-permission bitmap code - written by obz, with changes
5930 + * by Linus.
5931 + */
5932 +
5933 +#include <linux/sched.h>
5934 +#include <linux/kernel.h>
5935 +#include <linux/capability.h>
5936 +#include <linux/errno.h>
5937 +#include <linux/types.h>
5938 +#include <linux/ioport.h>
5939 +#include <linux/smp.h>
5940 +#include <linux/smp_lock.h>
5941 +#include <linux/stddef.h>
5942 +#include <linux/slab.h>
5943 +#include <linux/thread_info.h>
5944 +#include <xen/interface/physdev.h>
5945 +
5946 +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
5947 +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
5948 +{
5949 + unsigned long mask;
5950 + unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
5951 + unsigned int low_index = base & (BITS_PER_LONG-1);
5952 + int length = low_index + extent;
5953 +
5954 + if (low_index != 0) {
5955 + mask = (~0UL << low_index);
5956 + if (length < BITS_PER_LONG)
5957 + mask &= ~(~0UL << length);
5958 + if (new_value)
5959 + *bitmap_base++ |= mask;
5960 + else
5961 + *bitmap_base++ &= ~mask;
5962 + length -= BITS_PER_LONG;
5963 + }
5964 +
5965 + mask = (new_value ? ~0UL : 0UL);
5966 + while (length >= BITS_PER_LONG) {
5967 + *bitmap_base++ = mask;
5968 + length -= BITS_PER_LONG;
5969 + }
5970 +
5971 + if (length > 0) {
5972 + mask = ~(~0UL << length);
5973 + if (new_value)
5974 + *bitmap_base++ |= mask;
5975 + else
5976 + *bitmap_base++ &= ~mask;
5977 + }
5978 +}
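
set_bitmap() above updates an arbitrary run of bits that may start and end mid-word: a partial leading word, any number of full words, then a partial trailing word. A stand-alone user-space rendition of the same masking logic, shown only to make the boundary handling easy to experiment with (the names here are illustrative, not kernel API):

#include <stdio.h>
#include <limits.h>

enum { BITS_PER_LONG = sizeof(unsigned long) * CHAR_BIT };

/* Same structure as the kernel helper: leading partial word, full words,
 * trailing partial word. */
static void set_bits(unsigned long *bitmap, unsigned int base,
                     unsigned int extent, int new_value)
{
        unsigned long mask;
        unsigned long *p = bitmap + (base / BITS_PER_LONG);
        unsigned int low = base & (BITS_PER_LONG - 1);
        int length = low + extent;

        if (low != 0) {
                mask = ~0UL << low;
                if (length < BITS_PER_LONG)
                        mask &= ~(~0UL << length);
                if (new_value)
                        *p++ |= mask;
                else
                        *p++ &= ~mask;
                length -= BITS_PER_LONG;
        }
        mask = new_value ? ~0UL : 0UL;
        while (length >= BITS_PER_LONG) {
                *p++ = mask;
                length -= BITS_PER_LONG;
        }
        if (length > 0) {
                mask = ~(~0UL << length);
                if (new_value)
                        *p |= mask;
                else
                        *p &= ~mask;
        }
}

int main(void)
{
        unsigned long map[4] = { 0 };

        set_bits(map, 60, 10, 1);       /* a run that straddles a word boundary */
        printf("%#lx %#lx %#lx\n", map[0], map[1], map[2]);
        return 0;
}
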
5979 +
5980 +
5981 +/*
5982 + * this changes the io permissions bitmap in the current task.
5983 + */
5984 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
5985 +{
5986 + struct thread_struct * t = &current->thread;
5987 + unsigned long *bitmap;
5988 + struct physdev_set_iobitmap set_iobitmap;
5989 +
5990 + if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
5991 + return -EINVAL;
5992 + if (turn_on && !capable(CAP_SYS_RAWIO))
5993 + return -EPERM;
5994 +
5995 + /*
5996 + * If it's the first ioperm() call in this thread's lifetime, set the
5997 + * IO bitmap up. ioperm() is much less timing critical than clone(),
5998 + * this is why we delay this operation until now:
5999 + */
6000 + if (!t->io_bitmap_ptr) {
6001 + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
6002 + if (!bitmap)
6003 + return -ENOMEM;
6004 +
6005 + memset(bitmap, 0xff, IO_BITMAP_BYTES);
6006 + t->io_bitmap_ptr = bitmap;
6007 + set_thread_flag(TIF_IO_BITMAP);
6008 +
6009 + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
6010 + set_iobitmap.nr_ports = IO_BITMAP_BITS;
6011 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
6012 + &set_iobitmap));
6013 + }
6014 +
6015 + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
6016 +
6017 + return 0;
6018 +}
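
On the user-space side this path is reached through ioperm(2): the bitmap is allocated lazily on the first call, handed to the hypervisor once, and individual port ranges are then cleared (access allowed) or set (access trapped) by set_bitmap(). A minimal caller sketch, assuming glibc's <sys/io.h> wrapper and a process holding CAP_SYS_RAWIO; port 0x378 is only an illustrative choice:

/* Request access to three legacy parallel-port registers and poke one. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/io.h>

int main(void)
{
        if (ioperm(0x378, 3, 1)) {      /* ports 0x378..0x37a, turn_on = 1 */
                perror("ioperm");
                return EXIT_FAILURE;
        }
        outb(0x00, 0x378);              /* now permitted by the IO bitmap */
        ioperm(0x378, 3, 0);            /* drop access again */
        return 0;
}
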
6019 +
6020 +/*
6021 + * sys_iopl has to be used when you want to access the IO ports
6022 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
6023 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
6024 + *
6025 + * Here we just change the eflags value on the stack: we allow
6026 + * only the super-user to do it. This depends on the stack-layout
6027 + * on system-call entry - see also fork() and the signal handling
6028 + * code.
6029 + */
6030 +
6031 +asmlinkage long sys_iopl(unsigned long unused)
6032 +{
6033 + volatile struct pt_regs * regs = (struct pt_regs *) &unused;
6034 + unsigned int level = regs->ebx;
6035 + struct thread_struct *t = &current->thread;
6036 + unsigned int old = (t->iopl >> 12) & 3;
6037 +
6038 + if (level > 3)
6039 + return -EINVAL;
6040 + /* Trying to gain more privileges? */
6041 + if (level > old) {
6042 + if (!capable(CAP_SYS_RAWIO))
6043 + return -EPERM;
6044 + }
6045 + t->iopl = level << 12;
6046 + set_iopl_mask(t->iopl);
6047 + return 0;
6048 +}
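
Note that in this Xen port the requested level is recorded in the thread struct and applied through set_iopl_mask() rather than by patching eflags on the kernel stack. From user space the interface is the usual iopl(2); a hedged sketch, assuming glibc's <sys/io.h> wrapper and root privileges:

/* Raise the I/O privilege level so in/out work on any port (use with care:
 * level 3 also allows user space to disable interrupts on native kernels). */
#include <stdio.h>
#include <sys/io.h>

int main(void)
{
        if (iopl(3)) {                  /* level 3 = unrestricted port access */
                perror("iopl");
                return 1;
        }
        /* ... port I/O anywhere in 0x0000-0xffff is now permitted ... */
        iopl(0);
        return 0;
}
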
6049 Index: head-2008-11-25/arch/x86/kernel/irq_32-xen.c
6050 ===================================================================
6051 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
6052 +++ head-2008-11-25/arch/x86/kernel/irq_32-xen.c 2008-10-29 09:55:56.000000000 +0100
6053 @@ -0,0 +1,324 @@
6054 +/*
6055 + * linux/arch/i386/kernel/irq.c
6056 + *
6057 + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
6058 + *
6059 + * This file contains the lowest level x86-specific interrupt
6060 + * entry, irq-stacks and irq statistics code. All the remaining
6061 + * irq logic is done by the generic kernel/irq/ code and
6062 + * by the x86-specific irq controller code. (e.g. i8259.c and
6063 + * io_apic.c.)
6064 + */
6065 +
6066 +#include <asm/uaccess.h>
6067 +#include <linux/module.h>
6068 +#include <linux/seq_file.h>
6069 +#include <linux/interrupt.h>
6070 +#include <linux/kernel_stat.h>
6071 +#include <linux/notifier.h>
6072 +#include <linux/cpu.h>
6073 +#include <linux/delay.h>
6074 +
6075 +DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
6076 +EXPORT_PER_CPU_SYMBOL(irq_stat);
6077 +
6078 +#ifndef CONFIG_X86_LOCAL_APIC
6079 +/*
6080 + * 'what should we do if we get a hw irq event on an illegal vector'.
6081 + * Each architecture has to answer this itself.
6082 + */
6083 +void ack_bad_irq(unsigned int irq)
6084 +{
6085 + printk("unexpected IRQ trap at vector %02x\n", irq);
6086 +}
6087 +#endif
6088 +
6089 +#ifdef CONFIG_4KSTACKS
6090 +/*
6091 + * per-CPU IRQ handling contexts (thread information and stack)
6092 + */
6093 +union irq_ctx {
6094 + struct thread_info tinfo;
6095 + u32 stack[THREAD_SIZE/sizeof(u32)];
6096 +};
6097 +
6098 +static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
6099 +static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
6100 +#endif
6101 +
6102 +/*
6103 + * do_IRQ handles all normal device IRQs (the special
6104 + * SMP cross-CPU interrupts have their own specific
6105 + * handlers).
6106 + */
6107 +fastcall unsigned int do_IRQ(struct pt_regs *regs)
6108 +{
6109 + /* high bit used in ret_from_ code */
6110 + int irq = ~regs->orig_eax;
6111 +#ifdef CONFIG_4KSTACKS
6112 + union irq_ctx *curctx, *irqctx;
6113 + u32 *isp;
6114 +#endif
6115 +
6116 + if (unlikely((unsigned)irq >= NR_IRQS)) {
6117 + printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
6118 + __FUNCTION__, irq);
6119 + BUG();
6120 + }
6121 +
6122 + /*irq_enter();*/
6123 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
6124 + /* Debugging check for stack overflow: is there less than 1KB free? */
6125 + {
6126 + long esp;
6127 +
6128 + __asm__ __volatile__("andl %%esp,%0" :
6129 + "=r" (esp) : "0" (THREAD_SIZE - 1));
6130 + if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
6131 + printk("do_IRQ: stack overflow: %ld\n",
6132 + esp - sizeof(struct thread_info));
6133 + dump_stack();
6134 + }
6135 + }
6136 +#endif
6137 +
6138 +#ifdef CONFIG_4KSTACKS
6139 +
6140 + curctx = (union irq_ctx *) current_thread_info();
6141 + irqctx = hardirq_ctx[smp_processor_id()];
6142 +
6143 + /*
6144 + * this is where we switch to the IRQ stack. However, if we are
6145 + * already using the IRQ stack (because we interrupted a hardirq
6146 + * handler) we can't do that and just have to keep using the
6147 + * current stack (which is the irq stack already after all)
6148 + */
6149 + if (curctx != irqctx) {
6150 + int arg1, arg2, ebx;
6151 +
6152 + /* build the stack frame on the IRQ stack */
6153 + isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
6154 + irqctx->tinfo.task = curctx->tinfo.task;
6155 + irqctx->tinfo.previous_esp = current_stack_pointer;
6156 +
6157 + /*
6158 + * Copy the softirq bits in preempt_count so that the
6159 + * softirq checks work in the hardirq context.
6160 + */
6161 + irqctx->tinfo.preempt_count =
6162 + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
6163 + (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
6164 +
6165 + asm volatile(
6166 + " xchgl %%ebx,%%esp \n"
6167 + " call __do_IRQ \n"
6168 + " movl %%ebx,%%esp \n"
6169 + : "=a" (arg1), "=d" (arg2), "=b" (ebx)
6170 + : "0" (irq), "1" (regs), "2" (isp)
6171 + : "memory", "cc", "ecx"
6172 + );
6173 + } else
6174 +#endif
6175 + __do_IRQ(irq, regs);
6176 +
6177 + /*irq_exit();*/
6178 +
6179 + return 1;
6180 +}
6181 +
6182 +#ifdef CONFIG_4KSTACKS
6183 +
6184 +/*
6185 + * These should really be __section__(".bss.page_aligned") as well, but
6186 + * gcc 3.0 and earlier don't handle that correctly.
6187 + */
6188 +static char softirq_stack[NR_CPUS * THREAD_SIZE]
6189 + __attribute__((__aligned__(THREAD_SIZE)));
6190 +
6191 +static char hardirq_stack[NR_CPUS * THREAD_SIZE]
6192 + __attribute__((__aligned__(THREAD_SIZE)));
6193 +
6194 +/*
6195 + * allocate per-cpu stacks for hardirq and for softirq processing
6196 + */
6197 +void irq_ctx_init(int cpu)
6198 +{
6199 + union irq_ctx *irqctx;
6200 +
6201 + if (hardirq_ctx[cpu])
6202 + return;
6203 +
6204 + irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
6205 + irqctx->tinfo.task = NULL;
6206 + irqctx->tinfo.exec_domain = NULL;
6207 + irqctx->tinfo.cpu = cpu;
6208 + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
6209 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
6210 +
6211 + hardirq_ctx[cpu] = irqctx;
6212 +
6213 + irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
6214 + irqctx->tinfo.task = NULL;
6215 + irqctx->tinfo.exec_domain = NULL;
6216 + irqctx->tinfo.cpu = cpu;
6217 + irqctx->tinfo.preempt_count = 0;
6218 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
6219 +
6220 + softirq_ctx[cpu] = irqctx;
6221 +
6222 + printk("CPU %u irqstacks, hard=%p soft=%p\n",
6223 + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
6224 +}
6225 +
6226 +void irq_ctx_exit(int cpu)
6227 +{
6228 + hardirq_ctx[cpu] = NULL;
6229 +}
6230 +
6231 +extern asmlinkage void __do_softirq(void);
6232 +
6233 +asmlinkage void do_softirq(void)
6234 +{
6235 + unsigned long flags;
6236 + struct thread_info *curctx;
6237 + union irq_ctx *irqctx;
6238 + u32 *isp;
6239 +
6240 + if (in_interrupt())
6241 + return;
6242 +
6243 + local_irq_save(flags);
6244 +
6245 + if (local_softirq_pending()) {
6246 + curctx = current_thread_info();
6247 + irqctx = softirq_ctx[smp_processor_id()];
6248 + irqctx->tinfo.task = curctx->task;
6249 + irqctx->tinfo.previous_esp = current_stack_pointer;
6250 +
6251 + /* build the stack frame on the softirq stack */
6252 + isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
6253 +
6254 + asm volatile(
6255 + " xchgl %%ebx,%%esp \n"
6256 + " call __do_softirq \n"
6257 + " movl %%ebx,%%esp \n"
6258 + : "=b"(isp)
6259 + : "0"(isp)
6260 + : "memory", "cc", "edx", "ecx", "eax"
6261 + );
6262 + /*
6263 + * Shouldn't happen, we returned above if in_interrupt():
6264 + */
6265 + WARN_ON_ONCE(softirq_count());
6266 + }
6267 +
6268 + local_irq_restore(flags);
6269 +}
6270 +
6271 +EXPORT_SYMBOL(do_softirq);
6272 +#endif
6273 +
6274 +/*
6275 + * Interrupt statistics:
6276 + */
6277 +
6278 +atomic_t irq_err_count;
6279 +
6280 +/*
6281 + * /proc/interrupts printing:
6282 + */
6283 +
6284 +int show_interrupts(struct seq_file *p, void *v)
6285 +{
6286 + int i = *(loff_t *) v, j;
6287 + struct irqaction * action;
6288 + unsigned long flags;
6289 +
6290 + if (i == 0) {
6291 + seq_printf(p, " ");
6292 + for_each_online_cpu(j)
6293 + seq_printf(p, "CPU%-8d",j);
6294 + seq_putc(p, '\n');
6295 + }
6296 +
6297 + if (i < NR_IRQS) {
6298 + spin_lock_irqsave(&irq_desc[i].lock, flags);
6299 + action = irq_desc[i].action;
6300 + if (!action)
6301 + goto skip;
6302 + seq_printf(p, "%3d: ",i);
6303 +#ifndef CONFIG_SMP
6304 + seq_printf(p, "%10u ", kstat_irqs(i));
6305 +#else
6306 + for_each_online_cpu(j)
6307 + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
6308 +#endif
6309 + seq_printf(p, " %14s", irq_desc[i].chip->typename);
6310 + seq_printf(p, " %s", action->name);
6311 +
6312 + for (action=action->next; action; action = action->next)
6313 + seq_printf(p, ", %s", action->name);
6314 +
6315 + seq_putc(p, '\n');
6316 +skip:
6317 + spin_unlock_irqrestore(&irq_desc[i].lock, flags);
6318 + } else if (i == NR_IRQS) {
6319 + seq_printf(p, "NMI: ");
6320 + for_each_online_cpu(j)
6321 + seq_printf(p, "%10u ", nmi_count(j));
6322 + seq_putc(p, '\n');
6323 +#ifdef CONFIG_X86_LOCAL_APIC
6324 + seq_printf(p, "LOC: ");
6325 + for_each_online_cpu(j)
6326 + seq_printf(p, "%10u ",
6327 + per_cpu(irq_stat,j).apic_timer_irqs);
6328 + seq_putc(p, '\n');
6329 +#endif
6330 + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
6331 +#if defined(CONFIG_X86_IO_APIC)
6332 + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
6333 +#endif
6334 + }
6335 + return 0;
6336 +}
6337 +
6338 +#ifdef CONFIG_HOTPLUG_CPU
6339 +
6340 +void fixup_irqs(cpumask_t map)
6341 +{
6342 + unsigned int irq;
6343 + static int warned;
6344 +
6345 + for (irq = 0; irq < NR_IRQS; irq++) {
6346 + cpumask_t mask;
6347 + if (irq == 2)
6348 + continue;
6349 +
6350 + cpus_and(mask, irq_desc[irq].affinity, map);
6351 + if (any_online_cpu(mask) == NR_CPUS) {
6352 + /*printk("Breaking affinity for irq %i\n", irq);*/
6353 + mask = map;
6354 + }
6355 + if (irq_desc[irq].chip->set_affinity)
6356 + irq_desc[irq].chip->set_affinity(irq, mask);
6357 + else if (irq_desc[irq].action && !(warned++))
6358 + printk("Cannot set affinity for irq %i\n", irq);
6359 + }
6360 +
6361 +#if 0
6362 + barrier();
6363 + /* Ingo Molnar says: "after the IO-APIC masks have been redirected
6364 + [note the nop - the interrupt-enable boundary on x86 is two
6365 + instructions from sti] - to flush out pending hardirqs and
6366 + IPIs. After this point nothing is supposed to reach this CPU." */
6367 + __asm__ __volatile__("sti; nop; cli");
6368 + barrier();
6369 +#else
6370 + /* That doesn't seem sufficient. Give it 1ms. */
6371 + local_irq_enable();
6372 + mdelay(1);
6373 + local_irq_disable();
6374 +#endif
6375 +}
6376 +#endif
6377 +
6378 Index: head-2008-11-25/arch/x86/kernel/ldt_32-xen.c
6379 ===================================================================
6380 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
6381 +++ head-2008-11-25/arch/x86/kernel/ldt_32-xen.c 2007-06-12 13:12:48.000000000 +0200
6382 @@ -0,0 +1,270 @@
6383 +/*
6384 + * linux/kernel/ldt.c
6385 + *
6386 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
6387 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
6388 + */
6389 +
6390 +#include <linux/errno.h>
6391 +#include <linux/sched.h>
6392 +#include <linux/string.h>
6393 +#include <linux/mm.h>
6394 +#include <linux/smp.h>
6395 +#include <linux/smp_lock.h>
6396 +#include <linux/vmalloc.h>
6397 +#include <linux/slab.h>
6398 +
6399 +#include <asm/uaccess.h>
6400 +#include <asm/system.h>
6401 +#include <asm/ldt.h>
6402 +#include <asm/desc.h>
6403 +#include <asm/mmu_context.h>
6404 +
6405 +#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
6406 +static void flush_ldt(void *null)
6407 +{
6408 + if (current->active_mm)
6409 + load_LDT(&current->active_mm->context);
6410 +}
6411 +#endif
6412 +
6413 +static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
6414 +{
6415 + void *oldldt;
6416 + void *newldt;
6417 + int oldsize;
6418 +
6419 + if (mincount <= pc->size)
6420 + return 0;
6421 + oldsize = pc->size;
6422 + mincount = (mincount+511)&(~511);
6423 + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
6424 + newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
6425 + else
6426 + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
6427 +
6428 + if (!newldt)
6429 + return -ENOMEM;
6430 +
6431 + if (oldsize)
6432 + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
6433 + oldldt = pc->ldt;
6434 + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
6435 + pc->ldt = newldt;
6436 + wmb();
6437 + pc->size = mincount;
6438 + wmb();
6439 +
6440 + if (reload) {
6441 +#ifdef CONFIG_SMP
6442 + cpumask_t mask;
6443 + preempt_disable();
6444 +#endif
6445 + make_pages_readonly(
6446 + pc->ldt,
6447 + (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
6448 + XENFEAT_writable_descriptor_tables);
6449 + load_LDT(pc);
6450 +#ifdef CONFIG_SMP
6451 + mask = cpumask_of_cpu(smp_processor_id());
6452 + if (!cpus_equal(current->mm->cpu_vm_mask, mask))
6453 + smp_call_function(flush_ldt, NULL, 1, 1);
6454 + preempt_enable();
6455 +#endif
6456 + }
6457 + if (oldsize) {
6458 + make_pages_writable(
6459 + oldldt,
6460 + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
6461 + XENFEAT_writable_descriptor_tables);
6462 + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
6463 + vfree(oldldt);
6464 + else
6465 + kfree(oldldt);
6466 + }
6467 + return 0;
6468 +}
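
alloc_ldt() grows the table in chunks of 512 entries via the (mincount+511)&(~511) rounding; at 8 bytes per descriptor that is exactly one 4 KiB page per chunk, which keeps the make_pages_readonly()/make_pages_writable() calls page-aligned. A tiny illustration of the arithmetic only (LDT_ENTRY_SIZE taken as 8, as on x86):

#include <stdio.h>

#define LDT_ENTRY_SIZE 8        /* assumed: 8-byte x86 descriptors */

int main(void)
{
        int wanted[] = { 1, 512, 513, 2000 };

        for (int i = 0; i < 4; i++) {
                int rounded = (wanted[i] + 511) & ~511;
                printf("want %4d -> alloc %4d entries (%5d bytes)\n",
                       wanted[i], rounded, rounded * LDT_ENTRY_SIZE);
        }
        return 0;
}
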
6469 +
6470 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
6471 +{
6472 + int err = alloc_ldt(new, old->size, 0);
6473 + if (err < 0)
6474 + return err;
6475 + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
6476 + make_pages_readonly(
6477 + new->ldt,
6478 + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
6479 + XENFEAT_writable_descriptor_tables);
6480 + return 0;
6481 +}
6482 +
6483 +/*
6484 + * we do not have to muck with descriptors here, that is
6485 + * done in switch_mm() as needed.
6486 + */
6487 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
6488 +{
6489 + struct mm_struct * old_mm;
6490 + int retval = 0;
6491 +
6492 + init_MUTEX(&mm->context.sem);
6493 + mm->context.size = 0;
6494 + mm->context.has_foreign_mappings = 0;
6495 + old_mm = current->mm;
6496 + if (old_mm && old_mm->context.size > 0) {
6497 + down(&old_mm->context.sem);
6498 + retval = copy_ldt(&mm->context, &old_mm->context);
6499 + up(&old_mm->context.sem);
6500 + }
6501 + return retval;
6502 +}
6503 +
6504 +/*
6505 + * No need to lock the MM as we are the last user
6506 + */
6507 +void destroy_context(struct mm_struct *mm)
6508 +{
6509 + if (mm->context.size) {
6510 + if (mm == current->active_mm)
6511 + clear_LDT();
6512 + make_pages_writable(
6513 + mm->context.ldt,
6514 + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
6515 + XENFEAT_writable_descriptor_tables);
6516 + if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
6517 + vfree(mm->context.ldt);
6518 + else
6519 + kfree(mm->context.ldt);
6520 + mm->context.size = 0;
6521 + }
6522 +}
6523 +
6524 +static int read_ldt(void __user * ptr, unsigned long bytecount)
6525 +{
6526 + int err;
6527 + unsigned long size;
6528 + struct mm_struct * mm = current->mm;
6529 +
6530 + if (!mm->context.size)
6531 + return 0;
6532 + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
6533 + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
6534 +
6535 + down(&mm->context.sem);
6536 + size = mm->context.size*LDT_ENTRY_SIZE;
6537 + if (size > bytecount)
6538 + size = bytecount;
6539 +
6540 + err = 0;
6541 + if (copy_to_user(ptr, mm->context.ldt, size))
6542 + err = -EFAULT;
6543 + up(&mm->context.sem);
6544 + if (err < 0)
6545 + goto error_return;
6546 + if (size != bytecount) {
6547 + /* zero-fill the rest */
6548 + if (clear_user(ptr+size, bytecount-size) != 0) {
6549 + err = -EFAULT;
6550 + goto error_return;
6551 + }
6552 + }
6553 + return bytecount;
6554 +error_return:
6555 + return err;
6556 +}
6557 +
6558 +static int read_default_ldt(void __user * ptr, unsigned long bytecount)
6559 +{
6560 + int err;
6561 + unsigned long size;
6562 + void *address;
6563 +
6564 + err = 0;
6565 + address = &default_ldt[0];
6566 + size = 5*sizeof(struct desc_struct);
6567 + if (size > bytecount)
6568 + size = bytecount;
6569 +
6570 + err = size;
6571 + if (copy_to_user(ptr, address, size))
6572 + err = -EFAULT;
6573 +
6574 + return err;
6575 +}
6576 +
6577 +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
6578 +{
6579 + struct mm_struct * mm = current->mm;
6580 + __u32 entry_1, entry_2;
6581 + int error;
6582 + struct user_desc ldt_info;
6583 +
6584 + error = -EINVAL;
6585 + if (bytecount != sizeof(ldt_info))
6586 + goto out;
6587 + error = -EFAULT;
6588 + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
6589 + goto out;
6590 +
6591 + error = -EINVAL;
6592 + if (ldt_info.entry_number >= LDT_ENTRIES)
6593 + goto out;
6594 + if (ldt_info.contents == 3) {
6595 + if (oldmode)
6596 + goto out;
6597 + if (ldt_info.seg_not_present == 0)
6598 + goto out;
6599 + }
6600 +
6601 + down(&mm->context.sem);
6602 + if (ldt_info.entry_number >= mm->context.size) {
6603 + error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
6604 + if (error < 0)
6605 + goto out_unlock;
6606 + }
6607 +
6608 + /* Allow LDTs to be cleared by the user. */
6609 + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
6610 + if (oldmode || LDT_empty(&ldt_info)) {
6611 + entry_1 = 0;
6612 + entry_2 = 0;
6613 + goto install;
6614 + }
6615 + }
6616 +
6617 + entry_1 = LDT_entry_a(&ldt_info);
6618 + entry_2 = LDT_entry_b(&ldt_info);
6619 + if (oldmode)
6620 + entry_2 &= ~(1 << 20);
6621 +
6622 + /* Install the new entry ... */
6623 +install:
6624 + error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
6625 + entry_1, entry_2);
6626 +
6627 +out_unlock:
6628 + up(&mm->context.sem);
6629 +out:
6630 + return error;
6631 +}
6632 +
6633 +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
6634 +{
6635 + int ret = -ENOSYS;
6636 +
6637 + switch (func) {
6638 + case 0:
6639 + ret = read_ldt(ptr, bytecount);
6640 + break;
6641 + case 1:
6642 + ret = write_ldt(ptr, bytecount, 1);
6643 + break;
6644 + case 2:
6645 + ret = read_default_ldt(ptr, bytecount);
6646 + break;
6647 + case 0x11:
6648 + ret = write_ldt(ptr, bytecount, 0);
6649 + break;
6650 + }
6651 + return ret;
6652 +}
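
sys_modify_ldt() above dispatches on func: 0 reads the current LDT, 1 and 0x11 install an entry (0x11 being the newer variant that keeps the 'useable' bit), and 2 returns the default LDT. A hedged user-space sketch of installing one 32-bit data segment, assuming <asm/ldt.h> provides struct user_desc as on mainline x86; the base address is purely illustrative:

#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <asm/ldt.h>

int main(void)
{
        struct user_desc d;

        memset(&d, 0, sizeof(d));
        d.entry_number   = 0;
        d.base_addr      = 0x10000000;  /* illustrative base */
        d.limit          = 0xfffff;
        d.seg_32bit      = 1;
        d.limit_in_pages = 1;
        d.useable        = 1;

        /* func 0x11: write one LDT entry, honouring the 'useable' bit. */
        if (syscall(SYS_modify_ldt, 0x11, &d, sizeof(d)) != 0) {
                perror("modify_ldt");
                return 1;
        }
        /* Selector for this entry would be (0 << 3) | 0x7 (LDT, RPL 3). */
        printf("LDT entry %u installed\n", d.entry_number);
        return 0;
}
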
6653 Index: head-2008-11-25/arch/x86/kernel/microcode-xen.c
6654 ===================================================================
6655 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
6656 +++ head-2008-11-25/arch/x86/kernel/microcode-xen.c 2007-06-12 13:12:48.000000000 +0200
6657 @@ -0,0 +1,144 @@
6658 +/*
6659 + * Intel CPU Microcode Update Driver for Linux
6660 + *
6661 + * Copyright (C) 2000-2004 Tigran Aivazian
6662 + *
6663 + * This driver allows upgrading microcode on Intel processors
6664 + * belonging to the IA-32 family - PentiumPro, Pentium II,
6665 + * Pentium III, Xeon, Pentium 4, etc.
6666 + *
6667 + * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
6668 + * Order Number 245472 or free download from:
6669 + *
6670 + * http://developer.intel.com/design/pentium4/manuals/245472.htm
6671 + *
6672 + * For more information, go to http://www.urbanmyth.org/microcode
6673 + *
6674 + * This program is free software; you can redistribute it and/or
6675 + * modify it under the terms of the GNU General Public License
6676 + * as published by the Free Software Foundation; either version
6677 + * 2 of the License, or (at your option) any later version.
6678 + */
6679 +
6680 +//#define DEBUG /* pr_debug */
6681 +#include <linux/capability.h>
6682 +#include <linux/kernel.h>
6683 +#include <linux/init.h>
6684 +#include <linux/sched.h>
6685 +#include <linux/cpumask.h>
6686 +#include <linux/module.h>
6687 +#include <linux/slab.h>
6688 +#include <linux/vmalloc.h>
6689 +#include <linux/miscdevice.h>
6690 +#include <linux/spinlock.h>
6691 +#include <linux/mm.h>
6692 +#include <linux/mutex.h>
6693 +#include <linux/syscalls.h>
6694 +
6695 +#include <asm/msr.h>
6696 +#include <asm/uaccess.h>
6697 +#include <asm/processor.h>
6698 +
6699 +MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
6700 +MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
6701 +MODULE_LICENSE("GPL");
6702 +
6703 +static int verbose;
6704 +module_param(verbose, int, 0644);
6705 +
6706 +#define MICROCODE_VERSION "1.14a-xen"
6707 +
6708 +#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
6709 +#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
6710 +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
6711 +
6712 +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
6713 +static DEFINE_MUTEX(microcode_mutex);
6714 +
6715 +static int microcode_open (struct inode *unused1, struct file *unused2)
6716 +{
6717 + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
6718 +}
6719 +
6720 +
6721 +static int do_microcode_update (const void __user *ubuf, size_t len)
6722 +{
6723 + int err;
6724 + void *kbuf;
6725 +
6726 + kbuf = vmalloc(len);
6727 + if (!kbuf)
6728 + return -ENOMEM;
6729 +
6730 + if (copy_from_user(kbuf, ubuf, len) == 0) {
6731 + struct xen_platform_op op;
6732 +
6733 + op.cmd = XENPF_microcode_update;
6734 + set_xen_guest_handle(op.u.microcode.data, kbuf);
6735 + op.u.microcode.length = len;
6736 + err = HYPERVISOR_platform_op(&op);
6737 + } else
6738 + err = -EFAULT;
6739 +
6740 + vfree(kbuf);
6741 +
6742 + return err;
6743 +}
6744 +
6745 +static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
6746 +{
6747 + ssize_t ret;
6748 +
6749 + if (len < MC_HEADER_SIZE) {
6750 + printk(KERN_ERR "microcode: not enough data\n");
6751 + return -EINVAL;
6752 + }
6753 +
6754 + mutex_lock(&microcode_mutex);
6755 +
6756 + ret = do_microcode_update(buf, len);
6757 + if (!ret)
6758 + ret = (ssize_t)len;
6759 +
6760 + mutex_unlock(&microcode_mutex);
6761 +
6762 + return ret;
6763 +}
6764 +
6765 +static struct file_operations microcode_fops = {
6766 + .owner = THIS_MODULE,
6767 + .write = microcode_write,
6768 + .open = microcode_open,
6769 +};
6770 +
6771 +static struct miscdevice microcode_dev = {
6772 + .minor = MICROCODE_MINOR,
6773 + .name = "microcode",
6774 + .fops = &microcode_fops,
6775 +};
6776 +
6777 +static int __init microcode_init (void)
6778 +{
6779 + int error;
6780 +
6781 + error = misc_register(&microcode_dev);
6782 + if (error) {
6783 + printk(KERN_ERR
6784 + "microcode: can't misc_register on minor=%d\n",
6785 + MICROCODE_MINOR);
6786 + return error;
6787 + }
6788 +
6789 + printk(KERN_INFO
6790 + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
6791 + return 0;
6792 +}
6793 +
6794 +static void __exit microcode_exit (void)
6795 +{
6796 + misc_deregister(&microcode_dev);
6797 +}
6798 +
6799 +module_init(microcode_init)
6800 +module_exit(microcode_exit)
6801 +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
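
The driver above simply forwards whatever user space writes (after a minimum-size check against MC_HEADER_SIZE) to the hypervisor via XENPF_microcode_update, so updating microcode from dom0 userland amounts to one write() on the misc device. A hedged sketch, assuming the conventional /dev/cpu/microcode node named in the source and that "microcode.dat" is a hypothetical, suitably formatted update image (its exact format is whatever the hypervisor accepts and is outside this example's scope):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        FILE *f = fopen("microcode.dat", "rb");   /* hypothetical image */
        if (!f) { perror("microcode.dat"); return 1; }

        fseek(f, 0, SEEK_END);
        long len = ftell(f);
        rewind(f);

        char *buf = malloc(len);
        if (!buf || fread(buf, 1, len, f) != (size_t)len) {
                perror("read");
                return 1;
        }
        fclose(f);

        int fd = open("/dev/cpu/microcode", O_WRONLY);
        if (fd < 0) { perror("/dev/cpu/microcode"); return 1; }
        if (write(fd, buf, len) != len)             /* one write, whole image */
                perror("write");
        close(fd);
        free(buf);
        return 0;
}
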
6802 Index: head-2008-11-25/arch/x86/kernel/mpparse_32-xen.c
6803 ===================================================================
6804 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
6805 +++ head-2008-11-25/arch/x86/kernel/mpparse_32-xen.c 2007-06-12 13:12:48.000000000 +0200
6806 @@ -0,0 +1,1185 @@
6807 +/*
6808 + * Intel Multiprocessor Specification 1.1 and 1.4
6809 + * compliant MP-table parsing routines.
6810 + *
6811 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
6812 + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6813 + *
6814 + * Fixes
6815 + * Erich Boleyn : MP v1.4 and additional changes.
6816 + * Alan Cox : Added EBDA scanning
6817 + * Ingo Molnar : various cleanups and rewrites
6818 + * Maciej W. Rozycki: Bits for default MP configurations
6819 + * Paul Diefenbaugh: Added full ACPI support
6820 + */
6821 +
6822 +#include <linux/mm.h>
6823 +#include <linux/init.h>
6824 +#include <linux/acpi.h>
6825 +#include <linux/delay.h>
6826 +#include <linux/bootmem.h>
6827 +#include <linux/smp_lock.h>
6828 +#include <linux/kernel_stat.h>
6829 +#include <linux/mc146818rtc.h>
6830 +#include <linux/bitops.h>
6831 +
6832 +#include <asm/smp.h>
6833 +#include <asm/acpi.h>
6834 +#include <asm/mtrr.h>
6835 +#include <asm/mpspec.h>
6836 +#include <asm/io_apic.h>
6837 +
6838 +#include <mach_apic.h>
6839 +#include <mach_mpparse.h>
6840 +#include <bios_ebda.h>
6841 +
6842 +/* Have we found an MP table */
6843 +int smp_found_config;
6844 +unsigned int __initdata maxcpus = NR_CPUS;
6845 +
6846 +/*
6847 + * Various Linux-internal data structures created from the
6848 + * MP-table.
6849 + */
6850 +int apic_version [MAX_APICS];
6851 +int mp_bus_id_to_type [MAX_MP_BUSSES];
6852 +int mp_bus_id_to_node [MAX_MP_BUSSES];
6853 +int mp_bus_id_to_local [MAX_MP_BUSSES];
6854 +int quad_local_to_mp_bus_id [NR_CPUS/4][4];
6855 +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
6856 +static int mp_current_pci_id;
6857 +
6858 +/* I/O APIC entries */
6859 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
6860 +
6861 +/* # of MP IRQ source entries */
6862 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6863 +
6864 +/* MP IRQ source entries */
6865 +int mp_irq_entries;
6866 +
6867 +int nr_ioapics;
6868 +
6869 +int pic_mode;
6870 +unsigned long mp_lapic_addr;
6871 +
6872 +unsigned int def_to_bigsmp = 0;
6873 +
6874 +/* Processor that is doing the boot up */
6875 +unsigned int boot_cpu_physical_apicid = -1U;
6876 +/* Internal processor count */
6877 +static unsigned int __devinitdata num_processors;
6878 +
6879 +/* Bitmask of physically existing CPUs */
6880 +physid_mask_t phys_cpu_present_map;
6881 +
6882 +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
6883 +
6884 +/*
6885 + * Intel MP BIOS table parsing routines:
6886 + */
6887 +
6888 +
6889 +/*
6890 + * Checksum an MP configuration block.
6891 + */
6892 +
6893 +static int __init mpf_checksum(unsigned char *mp, int len)
6894 +{
6895 + int sum = 0;
6896 +
6897 + while (len--)
6898 + sum += *mp++;
6899 +
6900 + return sum & 0xFF;
6901 +}
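
mpf_checksum() returns the byte sum modulo 256, so a well-formed MP structure (whose checksum byte the BIOS chooses to cancel the rest) yields 0 and anything non-zero is rejected. A small, self-contained illustration with a hypothetical four-byte record; the payload bytes are made up for the example:

#include <stdio.h>

static int byte_sum(const unsigned char *p, int len)
{
        int sum = 0;

        while (len--)
                sum += *p++;
        return sum & 0xFF;
}

int main(void)
{
        /* three payload bytes plus one checksum slot (hypothetical record) */
        unsigned char table[4] = { 0x5f, 0x4d, 0x50, 0x00 };

        table[3] = (unsigned char)(0x100 - byte_sum(table, 3));  /* fix up */
        printf("sum over all bytes = %#x (0 means valid)\n",
               byte_sum(table, sizeof(table)));
        return 0;
}
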
6902 +
6903 +/*
6904 + * Have to match translation table entries to main table entries by counter
6905 + * hence the mpc_record variable .... can't see a less disgusting way of
6906 + * doing this ....
6907 + */
6908 +
6909 +static int mpc_record;
6910 +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;
6911 +
6912 +#ifndef CONFIG_XEN
6913 +static void __devinit MP_processor_info (struct mpc_config_processor *m)
6914 +{
6915 + int ver, apicid;
6916 + physid_mask_t phys_cpu;
6917 +
6918 + if (!(m->mpc_cpuflag & CPU_ENABLED))
6919 + return;
6920 +
6921 + apicid = mpc_apic_id(m, translation_table[mpc_record]);
6922 +
6923 + if (m->mpc_featureflag&(1<<0))
6924 + Dprintk(" Floating point unit present.\n");
6925 + if (m->mpc_featureflag&(1<<7))
6926 + Dprintk(" Machine Exception supported.\n");
6927 + if (m->mpc_featureflag&(1<<8))
6928 + Dprintk(" 64 bit compare & exchange supported.\n");
6929 + if (m->mpc_featureflag&(1<<9))
6930 + Dprintk(" Internal APIC present.\n");
6931 + if (m->mpc_featureflag&(1<<11))
6932 + Dprintk(" SEP present.\n");
6933 + if (m->mpc_featureflag&(1<<12))
6934 + Dprintk(" MTRR present.\n");
6935 + if (m->mpc_featureflag&(1<<13))
6936 + Dprintk(" PGE present.\n");
6937 + if (m->mpc_featureflag&(1<<14))
6938 + Dprintk(" MCA present.\n");
6939 + if (m->mpc_featureflag&(1<<15))
6940 + Dprintk(" CMOV present.\n");
6941 + if (m->mpc_featureflag&(1<<16))
6942 + Dprintk(" PAT present.\n");
6943 + if (m->mpc_featureflag&(1<<17))
6944 + Dprintk(" PSE present.\n");
6945 + if (m->mpc_featureflag&(1<<18))
6946 + Dprintk(" PSN present.\n");
6947 + if (m->mpc_featureflag&(1<<19))
6948 + Dprintk(" Cache Line Flush Instruction present.\n");
6949 + /* 20 Reserved */
6950 + if (m->mpc_featureflag&(1<<21))
6951 + Dprintk(" Debug Trace and EMON Store present.\n");
6952 + if (m->mpc_featureflag&(1<<22))
6953 + Dprintk(" ACPI Thermal Throttle Registers present.\n");
6954 + if (m->mpc_featureflag&(1<<23))
6955 + Dprintk(" MMX present.\n");
6956 + if (m->mpc_featureflag&(1<<24))
6957 + Dprintk(" FXSR present.\n");
6958 + if (m->mpc_featureflag&(1<<25))
6959 + Dprintk(" XMM present.\n");
6960 + if (m->mpc_featureflag&(1<<26))
6961 + Dprintk(" Willamette New Instructions present.\n");
6962 + if (m->mpc_featureflag&(1<<27))
6963 + Dprintk(" Self Snoop present.\n");
6964 + if (m->mpc_featureflag&(1<<28))
6965 + Dprintk(" HT present.\n");
6966 + if (m->mpc_featureflag&(1<<29))
6967 + Dprintk(" Thermal Monitor present.\n");
6968 + /* 30, 31 Reserved */
6969 +
6970 +
6971 + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
6972 + Dprintk(" Bootup CPU\n");
6973 + boot_cpu_physical_apicid = m->mpc_apicid;
6974 + }
6975 +
6976 + ver = m->mpc_apicver;
6977 +
6978 + /*
6979 + * Validate version
6980 + */
6981 + if (ver == 0x0) {
6982 + printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
6983 + "fixing up to 0x10. (tell your hw vendor)\n",
6984 + m->mpc_apicid);
6985 + ver = 0x10;
6986 + }
6987 + apic_version[m->mpc_apicid] = ver;
6988 +
6989 + phys_cpu = apicid_to_cpu_present(apicid);
6990 + physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
6991 +
6992 + if (num_processors >= NR_CPUS) {
6993 + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
6994 + " Processor ignored.\n", NR_CPUS);
6995 + return;
6996 + }
6997 +
6998 + if (num_processors >= maxcpus) {
6999 + printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
7000 + " Processor ignored.\n", maxcpus);
7001 + return;
7002 + }
7003 +
7004 + cpu_set(num_processors, cpu_possible_map);
7005 + num_processors++;
7006 +
7007 + /*
7008 + * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
7009 + * but we need to work out other dependencies like SMP_SUSPEND etc
7010 + * before this can be done without some confusion.
7011 + * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
7012 + * - Ashok Raj <ashok.raj@intel.com>
7013 + */
7014 + if (num_processors > 8) {
7015 + switch (boot_cpu_data.x86_vendor) {
7016 + case X86_VENDOR_INTEL:
7017 + if (!APIC_XAPIC(ver)) {
7018 + def_to_bigsmp = 0;
7019 + break;
7020 + }
7021 + /* If P4 and above fall through */
7022 + case X86_VENDOR_AMD:
7023 + def_to_bigsmp = 1;
7024 + }
7025 + }
7026 + bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
7027 +}
7028 +#else
7029 +void __init MP_processor_info (struct mpc_config_processor *m)
7030 +{
7031 + num_processors++;
7032 +}
7033 +#endif /* CONFIG_XEN */
7034 +
7035 +static void __init MP_bus_info (struct mpc_config_bus *m)
7036 +{
7037 + char str[7];
7038 +
7039 + memcpy(str, m->mpc_bustype, 6);
7040 + str[6] = 0;
7041 +
7042 + mpc_oem_bus_info(m, str, translation_table[mpc_record]);
7043 +
7044 + if (m->mpc_busid >= MAX_MP_BUSSES) {
7045 + printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
7046 + " is too large, max. supported is %d\n",
7047 + m->mpc_busid, str, MAX_MP_BUSSES - 1);
7048 + return;
7049 + }
7050 +
7051 + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
7052 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
7053 + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
7054 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
7055 + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
7056 + mpc_oem_pci_bus(m, translation_table[mpc_record]);
7057 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
7058 + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
7059 + mp_current_pci_id++;
7060 + } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
7061 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
7062 + } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
7063 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
7064 + } else {
7065 + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
7066 + }
7067 +}
7068 +
7069 +static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
7070 +{
7071 + if (!(m->mpc_flags & MPC_APIC_USABLE))
7072 + return;
7073 +
7074 + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
7075 + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
7076 + if (nr_ioapics >= MAX_IO_APICS) {
7077 + printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
7078 + MAX_IO_APICS, nr_ioapics);
7079 + panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
7080 + }
7081 + if (!m->mpc_apicaddr) {
7082 + printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
7083 + " found in MP table, skipping!\n");
7084 + return;
7085 + }
7086 + mp_ioapics[nr_ioapics] = *m;
7087 + nr_ioapics++;
7088 +}
7089 +
7090 +static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
7091 +{
7092 + mp_irqs [mp_irq_entries] = *m;
7093 + Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
7094 + " IRQ %02x, APIC ID %x, APIC INT %02x\n",
7095 + m->mpc_irqtype, m->mpc_irqflag & 3,
7096 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
7097 + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
7098 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
7099 + panic("Max # of irq sources exceeded!!\n");
7100 +}
7101 +
7102 +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
7103 +{
7104 + Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
7105 + " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
7106 + m->mpc_irqtype, m->mpc_irqflag & 3,
7107 + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
7108 + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
7109 + /*
7110 + * Well it seems all SMP boards in existence
7111 + * use ExtINT/LVT1 == LINT0 and
7112 + * NMI/LVT2 == LINT1 - the following check
7113 + * will show us if this assumption is false.
7114 + * Until then we do not have to add baggage.
7115 + */
7116 + if ((m->mpc_irqtype == mp_ExtINT) &&
7117 + (m->mpc_destapiclint != 0))
7118 + BUG();
7119 + if ((m->mpc_irqtype == mp_NMI) &&
7120 + (m->mpc_destapiclint != 1))
7121 + BUG();
7122 +}
7123 +
7124 +#ifdef CONFIG_X86_NUMAQ
7125 +static void __init MP_translation_info (struct mpc_config_translation *m)
7126 +{
7127 + printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
7128 +
7129 + if (mpc_record >= MAX_MPC_ENTRY)
7130 + printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
7131 + else
7132 + translation_table[mpc_record] = m; /* stash this for later */
7133 + if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
7134 + node_set_online(m->trans_quad);
7135 +}
7136 +
7137 +/*
7138 + * Read/parse the MPC oem tables
7139 + */
7140 +
7141 +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
7142 + unsigned short oemsize)
7143 +{
7144 + int count = sizeof (*oemtable); /* the header size */
7145 + unsigned char *oemptr = ((unsigned char *)oemtable)+count;
7146 +
7147 + mpc_record = 0;
7148 + printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
7149 + if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
7150 + {
7151 + printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
7152 + oemtable->oem_signature[0],
7153 + oemtable->oem_signature[1],
7154 + oemtable->oem_signature[2],
7155 + oemtable->oem_signature[3]);
7156 + return;
7157 + }
7158 + if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
7159 + {
7160 + printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
7161 + return;
7162 + }
7163 + while (count < oemtable->oem_length) {
7164 + switch (*oemptr) {
7165 + case MP_TRANSLATION:
7166 + {
7167 + struct mpc_config_translation *m=
7168 + (struct mpc_config_translation *)oemptr;
7169 + MP_translation_info(m);
7170 + oemptr += sizeof(*m);
7171 + count += sizeof(*m);
7172 + ++mpc_record;
7173 + break;
7174 + }
7175 + default:
7176 + {
7177 + printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
7178 + return;
7179 + }
7180 + }
7181 + }
7182 +}
7183 +
7184 +static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
7185 + char *productid)
7186 +{
7187 + if (strncmp(oem, "IBM NUMA", 8))
7188 + printk("Warning! May not be a NUMA-Q system!\n");
7189 + if (mpc->mpc_oemptr)
7190 + smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
7191 + mpc->mpc_oemsize);
7192 +}
7193 +#endif /* CONFIG_X86_NUMAQ */
7194 +
7195 +/*
7196 + * Read/parse the MPC
7197 + */
7198 +
7199 +static int __init smp_read_mpc(struct mp_config_table *mpc)
7200 +{
7201 + char str[16];
7202 + char oem[10];
7203 + int count=sizeof(*mpc);
7204 + unsigned char *mpt=((unsigned char *)mpc)+count;
7205 +
7206 + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
7207 + printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
7208 + *(u32 *)mpc->mpc_signature);
7209 + return 0;
7210 + }
7211 + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
7212 + printk(KERN_ERR "SMP mptable: checksum error!\n");
7213 + return 0;
7214 + }
7215 + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
7216 + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
7217 + mpc->mpc_spec);
7218 + return 0;
7219 + }
7220 + if (!mpc->mpc_lapic) {
7221 + printk(KERN_ERR "SMP mptable: null local APIC address!\n");
7222 + return 0;
7223 + }
7224 + memcpy(oem,mpc->mpc_oem,8);
7225 + oem[8]=0;
7226 + printk(KERN_INFO "OEM ID: %s ",oem);
7227 +
7228 + memcpy(str,mpc->mpc_productid,12);
7229 + str[12]=0;
7230 + printk("Product ID: %s ",str);
7231 +
7232 + mps_oem_check(mpc, oem, str);
7233 +
7234 + printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
7235 +
7236 + /*
7237 + * Save the local APIC address (it might be non-default) -- but only
7238 + * if we're not using ACPI.
7239 + */
7240 + if (!acpi_lapic)
7241 + mp_lapic_addr = mpc->mpc_lapic;
7242 +
7243 + /*
7244 + * Now process the configuration blocks.
7245 + */
7246 + mpc_record = 0;
7247 + while (count < mpc->mpc_length) {
7248 + switch(*mpt) {
7249 + case MP_PROCESSOR:
7250 + {
7251 + struct mpc_config_processor *m=
7252 + (struct mpc_config_processor *)mpt;
7253 + /* ACPI may have already provided this data */
7254 + if (!acpi_lapic)
7255 + MP_processor_info(m);
7256 + mpt += sizeof(*m);
7257 + count += sizeof(*m);
7258 + break;
7259 + }
7260 + case MP_BUS:
7261 + {
7262 + struct mpc_config_bus *m=
7263 + (struct mpc_config_bus *)mpt;
7264 + MP_bus_info(m);
7265 + mpt += sizeof(*m);
7266 + count += sizeof(*m);
7267 + break;
7268 + }
7269 + case MP_IOAPIC:
7270 + {
7271 + struct mpc_config_ioapic *m=
7272 + (struct mpc_config_ioapic *)mpt;
7273 + MP_ioapic_info(m);
7274 + mpt+=sizeof(*m);
7275 + count+=sizeof(*m);
7276 + break;
7277 + }
7278 + case MP_INTSRC:
7279 + {
7280 + struct mpc_config_intsrc *m=
7281 + (struct mpc_config_intsrc *)mpt;
7282 +
7283 + MP_intsrc_info(m);
7284 + mpt+=sizeof(*m);
7285 + count+=sizeof(*m);
7286 + break;
7287 + }
7288 + case MP_LINTSRC:
7289 + {
7290 + struct mpc_config_lintsrc *m=
7291 + (struct mpc_config_lintsrc *)mpt;
7292 + MP_lintsrc_info(m);
7293 + mpt+=sizeof(*m);
7294 + count+=sizeof(*m);
7295 + break;
7296 + }
7297 + default:
7298 + {
7299 + count = mpc->mpc_length;
7300 + break;
7301 + }
7302 + }
7303 + ++mpc_record;
7304 + }
7305 + clustered_apic_check();
7306 + if (!num_processors)
7307 + printk(KERN_ERR "SMP mptable: no processors registered!\n");
7308 + return num_processors;
7309 +}
7310 +
7311 +static int __init ELCR_trigger(unsigned int irq)
7312 +{
7313 + unsigned int port;
7314 +
7315 + port = 0x4d0 + (irq >> 3);
7316 + return (inb(port) >> (irq & 7)) & 1;
7317 +}
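Aside: ELCR_trigger() above indexes the chipset's two Edge/Level Control Registers at I/O ports 0x4d0 and 0x4d1, one bit per ISA IRQ. A minimal worked example of the addressing (illustrative only, not from the patch):

	/* IRQ 3  -> port 0x4d0 (0x4d0 + (3 >> 3)),  bit 3 (3 & 7)  */
	/* IRQ 10 -> port 0x4d1 (0x4d0 + (10 >> 3)), bit 2 (10 & 7) */
	int irq10_level = ELCR_trigger(10);	/* non-zero if IRQ 10 is level-triggered */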
7318 +
7319 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
7320 +{
7321 + struct mpc_config_intsrc intsrc;
7322 + int i;
7323 + int ELCR_fallback = 0;
7324 +
7325 + intsrc.mpc_type = MP_INTSRC;
7326 + intsrc.mpc_irqflag = 0; /* conforming */
7327 + intsrc.mpc_srcbus = 0;
7328 + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
7329 +
7330 + intsrc.mpc_irqtype = mp_INT;
7331 +
7332 + /*
7333 + * If true, we have an ISA/PCI system with no IRQ entries
7334 + * in the MP table. To prevent the PCI interrupts from being set up
7335 + * incorrectly, we try to use the ELCR. The sanity check to see if
7336 + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
7337 + * never be level sensitive, so we simply see if the ELCR agrees.
7338 + * If it does, we assume it's valid.
7339 + */
7340 + if (mpc_default_type == 5) {
7341 + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
7342 +
7343 + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
7344 + printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
7345 + else {
7346 + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
7347 + ELCR_fallback = 1;
7348 + }
7349 + }
7350 +
7351 + for (i = 0; i < 16; i++) {
7352 + switch (mpc_default_type) {
7353 + case 2:
7354 + if (i == 0 || i == 13)
7355 + continue; /* IRQ0 & IRQ13 not connected */
7356 + /* fall through */
7357 + default:
7358 + if (i == 2)
7359 + continue; /* IRQ2 is never connected */
7360 + }
7361 +
7362 + if (ELCR_fallback) {
7363 + /*
7364 + * If the ELCR indicates a level-sensitive interrupt, we
7365 + * copy that information over to the MP table in the
7366 + * irqflag field (level sensitive, active high polarity).
7367 + */
7368 + if (ELCR_trigger(i))
7369 + intsrc.mpc_irqflag = 13;
7370 + else
7371 + intsrc.mpc_irqflag = 0;
7372 + }
7373 +
7374 + intsrc.mpc_srcbusirq = i;
7375 + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
7376 + MP_intsrc_info(&intsrc);
7377 + }
7378 +
7379 + intsrc.mpc_irqtype = mp_ExtINT;
7380 + intsrc.mpc_srcbusirq = 0;
7381 + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
7382 + MP_intsrc_info(&intsrc);
7383 +}
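Aside: the constant 13 assigned to mpc_irqflag above is simply the MP-spec encoding (trigger << 2) | polarity, the same encoding mp_override_legacy_irq() builds later in this file. A sketch with a hypothetical helper macro (illustrative only, not from the patch):

#define MPS_IRQFLAG(trigger, polarity)	(((trigger) << 2) | (polarity))
/* MPS_IRQFLAG(3, 1) == 13 : level-triggered, active-high (the ELCR case above) */
/* MPS_IRQFLAG(0, 0) == 0  : conforms to the bus default */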
7384 +
7385 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
7386 +{
7387 + struct mpc_config_processor processor;
7388 + struct mpc_config_bus bus;
7389 + struct mpc_config_ioapic ioapic;
7390 + struct mpc_config_lintsrc lintsrc;
7391 + int linttypes[2] = { mp_ExtINT, mp_NMI };
7392 + int i;
7393 +
7394 + /*
7395 + * local APIC has default address
7396 + */
7397 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
7398 +
7399 + /*
7400 + * 2 CPUs, numbered 0 & 1.
7401 + */
7402 + processor.mpc_type = MP_PROCESSOR;
7403 + /* Either an integrated APIC or a discrete 82489DX. */
7404 + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
7405 + processor.mpc_cpuflag = CPU_ENABLED;
7406 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
7407 + (boot_cpu_data.x86_model << 4) |
7408 + boot_cpu_data.x86_mask;
7409 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
7410 + processor.mpc_reserved[0] = 0;
7411 + processor.mpc_reserved[1] = 0;
7412 + for (i = 0; i < 2; i++) {
7413 + processor.mpc_apicid = i;
7414 + MP_processor_info(&processor);
7415 + }
7416 +
7417 + bus.mpc_type = MP_BUS;
7418 + bus.mpc_busid = 0;
7419 + switch (mpc_default_type) {
7420 + default:
7421 + printk("???\n");
7422 + printk(KERN_ERR "Unknown standard configuration %d\n",
7423 + mpc_default_type);
7424 + /* fall through */
7425 + case 1:
7426 + case 5:
7427 + memcpy(bus.mpc_bustype, "ISA ", 6);
7428 + break;
7429 + case 2:
7430 + case 6:
7431 + case 3:
7432 + memcpy(bus.mpc_bustype, "EISA ", 6);
7433 + break;
7434 + case 4:
7435 + case 7:
7436 + memcpy(bus.mpc_bustype, "MCA ", 6);
7437 + }
7438 + MP_bus_info(&bus);
7439 + if (mpc_default_type > 4) {
7440 + bus.mpc_busid = 1;
7441 + memcpy(bus.mpc_bustype, "PCI ", 6);
7442 + MP_bus_info(&bus);
7443 + }
7444 +
7445 + ioapic.mpc_type = MP_IOAPIC;
7446 + ioapic.mpc_apicid = 2;
7447 + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
7448 + ioapic.mpc_flags = MPC_APIC_USABLE;
7449 + ioapic.mpc_apicaddr = 0xFEC00000;
7450 + MP_ioapic_info(&ioapic);
7451 +
7452 + /*
7453 + * We set up most of the low 16 IO-APIC pins according to MPS rules.
7454 + */
7455 + construct_default_ioirq_mptable(mpc_default_type);
7456 +
7457 + lintsrc.mpc_type = MP_LINTSRC;
7458 + lintsrc.mpc_irqflag = 0; /* conforming */
7459 + lintsrc.mpc_srcbusid = 0;
7460 + lintsrc.mpc_srcbusirq = 0;
7461 + lintsrc.mpc_destapic = MP_APIC_ALL;
7462 + for (i = 0; i < 2; i++) {
7463 + lintsrc.mpc_irqtype = linttypes[i];
7464 + lintsrc.mpc_destapiclint = i;
7465 + MP_lintsrc_info(&lintsrc);
7466 + }
7467 +}
7468 +
7469 +static struct intel_mp_floating *mpf_found;
7470 +
7471 +/*
7472 + * Scan the memory blocks for an SMP configuration block.
7473 + */
7474 +void __init get_smp_config (void)
7475 +{
7476 + struct intel_mp_floating *mpf = mpf_found;
7477 +
7478 + /*
7479 + * ACPI supports both logical (e.g. Hyper-Threading) and physical
7480 +	 * processors, whereas MPS only supports physical.
7481 + */
7482 + if (acpi_lapic && acpi_ioapic) {
7483 + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
7484 + return;
7485 + }
7486 + else if (acpi_lapic)
7487 + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
7488 +
7489 + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
7490 + if (mpf->mpf_feature2 & (1<<7)) {
7491 + printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
7492 + pic_mode = 1;
7493 + } else {
7494 + printk(KERN_INFO " Virtual Wire compatibility mode.\n");
7495 + pic_mode = 0;
7496 + }
7497 +
7498 + /*
7499 + * Now see if we need to read further.
7500 + */
7501 + if (mpf->mpf_feature1 != 0) {
7502 +
7503 + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
7504 + construct_default_ISA_mptable(mpf->mpf_feature1);
7505 +
7506 + } else if (mpf->mpf_physptr) {
7507 +
7508 + /*
7509 + * Read the physical hardware table. Anything here will
7510 + * override the defaults.
7511 + */
7512 + if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
7513 + smp_found_config = 0;
7514 + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
7515 + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
7516 + return;
7517 + }
7518 + /*
7519 + * If there are no explicit MP IRQ entries, then we are
7520 + * broken. We set up most of the low 16 IO-APIC pins to
7521 + * ISA defaults and hope it will work.
7522 + */
7523 + if (!mp_irq_entries) {
7524 + struct mpc_config_bus bus;
7525 +
7526 + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
7527 +
7528 + bus.mpc_type = MP_BUS;
7529 + bus.mpc_busid = 0;
7530 + memcpy(bus.mpc_bustype, "ISA ", 6);
7531 + MP_bus_info(&bus);
7532 +
7533 + construct_default_ioirq_mptable(0);
7534 + }
7535 +
7536 + } else
7537 + BUG();
7538 +
7539 + printk(KERN_INFO "Processors: %d\n", num_processors);
7540 + /*
7541 + * Only use the first configuration found.
7542 + */
7543 +}
7544 +
7545 +static int __init smp_scan_config (unsigned long base, unsigned long length)
7546 +{
7547 + unsigned long *bp = isa_bus_to_virt(base);
7548 + struct intel_mp_floating *mpf;
7549 +
7550 + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
7551 + if (sizeof(*mpf) != 16)
7552 + printk("Error: MPF size\n");
7553 +
7554 + while (length > 0) {
7555 + mpf = (struct intel_mp_floating *)bp;
7556 + if ((*bp == SMP_MAGIC_IDENT) &&
7557 + (mpf->mpf_length == 1) &&
7558 + !mpf_checksum((unsigned char *)bp, 16) &&
7559 + ((mpf->mpf_specification == 1)
7560 + || (mpf->mpf_specification == 4)) ) {
7561 +
7562 + smp_found_config = 1;
7563 +#ifndef CONFIG_XEN
7564 + printk(KERN_INFO "found SMP MP-table at %08lx\n",
7565 + virt_to_phys(mpf));
7566 + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
7567 + if (mpf->mpf_physptr) {
7568 + /*
7569 +				 * We cannot access the MPC table to compute
7570 +				 * its size yet, as only a few megabytes from
7571 +				 * the bottom of memory are mapped now.
7572 +				 * The PC-9800's MPC table sits at the very end
7573 +				 * of physical memory, so simply reserving
7574 +				 * PAGE_SIZE from mpf->mpf_physptr would BUG()
7575 +				 * in reserve_bootmem.
7576 + */
7577 + unsigned long size = PAGE_SIZE;
7578 + unsigned long end = max_low_pfn * PAGE_SIZE;
7579 + if (mpf->mpf_physptr + size > end)
7580 + size = end - mpf->mpf_physptr;
7581 + reserve_bootmem(mpf->mpf_physptr, size);
7582 + }
7583 +#else
7584 + printk(KERN_INFO "found SMP MP-table at %08lx\n",
7585 + ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
7586 +#endif
7587 +
7588 + mpf_found = mpf;
7589 + return 1;
7590 + }
7591 + bp += 4;
7592 + length -= 16;
7593 + }
7594 + return 0;
7595 +}
7596 +
7597 +void __init find_smp_config (void)
7598 +{
7599 +#ifndef CONFIG_XEN
7600 + unsigned int address;
7601 +#endif
7602 +
7603 + /*
7604 + * FIXME: Linux assumes you have 640K of base ram..
7605 + * this continues the error...
7606 + *
7607 + * 1) Scan the bottom 1K for a signature
7608 + * 2) Scan the top 1K of base RAM
7609 + * 3) Scan the 64K of bios
7610 + */
7611 + if (smp_scan_config(0x0,0x400) ||
7612 + smp_scan_config(639*0x400,0x400) ||
7613 + smp_scan_config(0xF0000,0x10000))
7614 + return;
7615 + /*
7616 + * If it is an SMP machine we should know now, unless the
7617 + * configuration is in an EISA/MCA bus machine with an
7618 + * extended bios data area.
7619 + *
7620 + * there is a real-mode segmented pointer pointing to the
7621 + * 4K EBDA area at 0x40E, calculate and scan it here.
7622 + *
7623 + * NOTE! There are Linux loaders that will corrupt the EBDA
7624 + * area, and as such this kind of SMP config may be less
7625 + * trustworthy, simply because the SMP table may have been
7626 + * stomped on during early boot. These loaders are buggy and
7627 + * should be fixed.
7628 + *
7629 + * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
7630 + */
7631 +
7632 +#ifndef CONFIG_XEN
7633 + address = get_bios_ebda();
7634 + if (address)
7635 + smp_scan_config(address, 0x400);
7636 +#endif
7637 +}
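Aside: in the non-Xen branch above, get_bios_ebda() resolves the real-mode segment value stored at physical address 0x40E (the pointer mentioned in the comment) into a linear address. Roughly, and only as an illustrative sketch, it amounts to:

	unsigned int ebda_segment = *(unsigned short *)phys_to_virt(0x40E);
	unsigned int ebda_address = ebda_segment << 4;	/* real-mode segment -> linear */
	if (ebda_address)
		smp_scan_config(ebda_address, 0x400);	/* MP 1.4: scan only the first 1K */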
7638 +
7639 +int es7000_plat;
7640 +
7641 +/* --------------------------------------------------------------------------
7642 + ACPI-based MP Configuration
7643 + -------------------------------------------------------------------------- */
7644 +
7645 +#ifdef CONFIG_ACPI
7646 +
7647 +void __init mp_register_lapic_address (
7648 + u64 address)
7649 +{
7650 +#ifndef CONFIG_XEN
7651 + mp_lapic_addr = (unsigned long) address;
7652 +
7653 + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
7654 +
7655 + if (boot_cpu_physical_apicid == -1U)
7656 + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
7657 +
7658 + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
7659 +#endif
7660 +}
7661 +
7662 +
7663 +void __devinit mp_register_lapic (
7664 + u8 id,
7665 + u8 enabled)
7666 +{
7667 + struct mpc_config_processor processor;
7668 + int boot_cpu = 0;
7669 +
7670 + if (MAX_APICS - id <= 0) {
7671 + printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
7672 + id, MAX_APICS);
7673 + return;
7674 + }
7675 +
7676 + if (id == boot_cpu_physical_apicid)
7677 + boot_cpu = 1;
7678 +
7679 +#ifndef CONFIG_XEN
7680 + processor.mpc_type = MP_PROCESSOR;
7681 + processor.mpc_apicid = id;
7682 + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
7683 + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
7684 + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
7685 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
7686 + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
7687 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
7688 + processor.mpc_reserved[0] = 0;
7689 + processor.mpc_reserved[1] = 0;
7690 +#endif
7691 +
7692 + MP_processor_info(&processor);
7693 +}
7694 +
7695 +#ifdef CONFIG_X86_IO_APIC
7696 +
7697 +#define MP_ISA_BUS 0
7698 +#define MP_MAX_IOAPIC_PIN 127
7699 +
7700 +static struct mp_ioapic_routing {
7701 + int apic_id;
7702 + int gsi_base;
7703 + int gsi_end;
7704 + u32 pin_programmed[4];
7705 +} mp_ioapic_routing[MAX_IO_APICS];
7706 +
7707 +
7708 +static int mp_find_ioapic (
7709 + int gsi)
7710 +{
7711 + int i = 0;
7712 +
7713 + /* Find the IOAPIC that manages this GSI. */
7714 + for (i = 0; i < nr_ioapics; i++) {
7715 + if ((gsi >= mp_ioapic_routing[i].gsi_base)
7716 + && (gsi <= mp_ioapic_routing[i].gsi_end))
7717 + return i;
7718 + }
7719 +
7720 + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
7721 +
7722 + return -1;
7723 +}
7724 +
7725 +
7726 +void __init mp_register_ioapic (
7727 + u8 id,
7728 + u32 address,
7729 + u32 gsi_base)
7730 +{
7731 + int idx = 0;
7732 + int tmpid;
7733 +
7734 + if (nr_ioapics >= MAX_IO_APICS) {
7735 + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
7736 + "(found %d)\n", MAX_IO_APICS, nr_ioapics);
7737 + panic("Recompile kernel with bigger MAX_IO_APICS!\n");
7738 + }
7739 + if (!address) {
7740 + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
7741 + " found in MADT table, skipping!\n");
7742 + return;
7743 + }
7744 +
7745 + idx = nr_ioapics++;
7746 +
7747 + mp_ioapics[idx].mpc_type = MP_IOAPIC;
7748 + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
7749 + mp_ioapics[idx].mpc_apicaddr = address;
7750 +
7751 +#ifndef CONFIG_XEN
7752 + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
7753 +#endif
7754 + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
7755 + && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
7756 + tmpid = io_apic_get_unique_id(idx, id);
7757 + else
7758 + tmpid = id;
7759 + if (tmpid == -1) {
7760 + nr_ioapics--;
7761 + return;
7762 + }
7763 + mp_ioapics[idx].mpc_apicid = tmpid;
7764 + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
7765 +
7766 + /*
7767 + * Build basic GSI lookup table to facilitate gsi->io_apic lookups
7768 + * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
7769 + */
7770 + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
7771 + mp_ioapic_routing[idx].gsi_base = gsi_base;
7772 + mp_ioapic_routing[idx].gsi_end = gsi_base +
7773 + io_apic_get_redir_entries(idx);
7774 +
7775 + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
7776 + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
7777 + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
7778 + mp_ioapic_routing[idx].gsi_base,
7779 + mp_ioapic_routing[idx].gsi_end);
7780 +
7781 + return;
7782 +}
7783 +
7784 +
7785 +void __init mp_override_legacy_irq (
7786 + u8 bus_irq,
7787 + u8 polarity,
7788 + u8 trigger,
7789 + u32 gsi)
7790 +{
7791 + struct mpc_config_intsrc intsrc;
7792 + int ioapic = -1;
7793 + int pin = -1;
7794 +
7795 + /*
7796 + * Convert 'gsi' to 'ioapic.pin'.
7797 + */
7798 + ioapic = mp_find_ioapic(gsi);
7799 + if (ioapic < 0)
7800 + return;
7801 + pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
7802 +
7803 + /*
7804 + * TBD: This check is for faulty timer entries, where the override
7805 + * erroneously sets the trigger to level, resulting in a HUGE
7806 + * increase of timer interrupts!
7807 + */
7808 + if ((bus_irq == 0) && (trigger == 3))
7809 + trigger = 1;
7810 +
7811 + intsrc.mpc_type = MP_INTSRC;
7812 + intsrc.mpc_irqtype = mp_INT;
7813 + intsrc.mpc_irqflag = (trigger << 2) | polarity;
7814 + intsrc.mpc_srcbus = MP_ISA_BUS;
7815 + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
7816 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
7817 + intsrc.mpc_dstirq = pin; /* INTIN# */
7818 +
7819 + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
7820 + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
7821 + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
7822 + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
7823 +
7824 + mp_irqs[mp_irq_entries] = intsrc;
7825 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
7826 + panic("Max # of irq sources exceeded!\n");
7827 +
7828 + return;
7829 +}
7830 +
7831 +void __init mp_config_acpi_legacy_irqs (void)
7832 +{
7833 + struct mpc_config_intsrc intsrc;
7834 + int i = 0;
7835 + int ioapic = -1;
7836 +
7837 + /*
7838 + * Fabricate the legacy ISA bus (bus #31).
7839 + */
7840 + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
7841 + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
7842 +
7843 + /*
7844 + * Older generations of ES7000 have no legacy identity mappings
7845 + */
7846 + if (es7000_plat == 1)
7847 + return;
7848 +
7849 + /*
7850 + * Locate the IOAPIC that manages the ISA IRQs (0-15).
7851 + */
7852 + ioapic = mp_find_ioapic(0);
7853 + if (ioapic < 0)
7854 + return;
7855 +
7856 + intsrc.mpc_type = MP_INTSRC;
7857 + intsrc.mpc_irqflag = 0; /* Conforming */
7858 + intsrc.mpc_srcbus = MP_ISA_BUS;
7859 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
7860 +
7861 + /*
7862 +	 * Use the default configuration for IRQs 0-15, unless
7863 +	 * overridden by (MADT) interrupt source override entries.
7864 + */
7865 + for (i = 0; i < 16; i++) {
7866 + int idx;
7867 +
7868 + for (idx = 0; idx < mp_irq_entries; idx++) {
7869 + struct mpc_config_intsrc *irq = mp_irqs + idx;
7870 +
7871 + /* Do we already have a mapping for this ISA IRQ? */
7872 + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
7873 + break;
7874 +
7875 + /* Do we already have a mapping for this IOAPIC pin */
7876 + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
7877 + (irq->mpc_dstirq == i))
7878 + break;
7879 + }
7880 +
7881 + if (idx != mp_irq_entries) {
7882 + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
7883 + continue; /* IRQ already used */
7884 + }
7885 +
7886 + intsrc.mpc_irqtype = mp_INT;
7887 + intsrc.mpc_srcbusirq = i; /* Identity mapped */
7888 + intsrc.mpc_dstirq = i;
7889 +
7890 + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
7891 + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
7892 + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
7893 + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
7894 + intsrc.mpc_dstirq);
7895 +
7896 + mp_irqs[mp_irq_entries] = intsrc;
7897 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
7898 + panic("Max # of irq sources exceeded!\n");
7899 + }
7900 +}
7901 +
7902 +#define MAX_GSI_NUM 4096
7903 +
7904 +int mp_register_gsi (u32 gsi, int triggering, int polarity)
7905 +{
7906 + int ioapic = -1;
7907 + int ioapic_pin = 0;
7908 + int idx, bit = 0;
7909 + static int pci_irq = 16;
7910 + /*
7911 +	 * Mapping between Global System Interrupts, which
7912 + * represent all possible interrupts, and IRQs
7913 + * assigned to actual devices.
7914 + */
7915 + static int gsi_to_irq[MAX_GSI_NUM];
7916 +
7917 + /* Don't set up the ACPI SCI because it's already set up */
7918 + if (acpi_fadt.sci_int == gsi)
7919 + return gsi;
7920 +
7921 + ioapic = mp_find_ioapic(gsi);
7922 + if (ioapic < 0) {
7923 + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
7924 + return gsi;
7925 + }
7926 +
7927 + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
7928 +
7929 + if (ioapic_renumber_irq)
7930 + gsi = ioapic_renumber_irq(ioapic, gsi);
7931 +
7932 + /*
7933 + * Avoid pin reprogramming. PRTs typically include entries
7934 + * with redundant pin->gsi mappings (but unique PCI devices);
7935 + * we only program the IOAPIC on the first.
7936 + */
7937 + bit = ioapic_pin % 32;
7938 + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
7939 + if (idx > 3) {
7940 + printk(KERN_ERR "Invalid reference to IOAPIC pin "
7941 + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
7942 + ioapic_pin);
7943 + return gsi;
7944 + }
7945 + if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
7946 + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
7947 + mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
7948 + return gsi_to_irq[gsi];
7949 + }
7950 +
7951 + mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
7952 +
7953 + if (triggering == ACPI_LEVEL_SENSITIVE) {
7954 + /*
7955 + * For PCI devices assign IRQs in order, avoiding gaps
7956 + * due to unused I/O APIC pins.
7957 + */
7958 + int irq = gsi;
7959 + if (gsi < MAX_GSI_NUM) {
7960 + /*
7961 + * Retain the VIA chipset work-around (gsi > 15), but
7962 +			 * avoid a problem where the 8254 timer (IRQ0) is set up
7963 + * via an override (so it's not on pin 0 of the ioapic),
7964 + * and at the same time, the pin 0 interrupt is a PCI
7965 + * type. The gsi > 15 test could cause these two pins
7966 + * to be shared as IRQ0, and they are not shareable.
7967 + * So test for this condition, and if necessary, avoid
7968 + * the pin collision.
7969 + */
7970 + if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
7971 + gsi = pci_irq++;
7972 + /*
7973 + * Don't assign IRQ used by ACPI SCI
7974 + */
7975 + if (gsi == acpi_fadt.sci_int)
7976 + gsi = pci_irq++;
7977 + gsi_to_irq[irq] = gsi;
7978 + } else {
7979 + printk(KERN_ERR "GSI %u is too high\n", gsi);
7980 + return gsi;
7981 + }
7982 + }
7983 +
7984 + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
7985 + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
7986 + polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
7987 + return gsi;
7988 +}
7989 +
7990 +#endif /* CONFIG_X86_IO_APIC */
7991 +#endif /* CONFIG_ACPI */
7992 Index: head-2008-11-25/arch/x86/kernel/pci-dma-xen.c
7993 ===================================================================
7994 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
7995 +++ head-2008-11-25/arch/x86/kernel/pci-dma-xen.c 2008-10-29 09:55:56.000000000 +0100
7996 @@ -0,0 +1,409 @@
7997 +/*
7998 + * Dynamic DMA mapping support.
7999 + *
8000 + * On i386 there is no hardware dynamic DMA address translation,
8001 + * so consistent alloc/free are merely page allocation/freeing.
8002 + * The rest of the dynamic DMA mapping interface is implemented
8003 + * in asm/pci.h.
8004 + */
8005 +
8006 +#include <linux/types.h>
8007 +#include <linux/mm.h>
8008 +#include <linux/string.h>
8009 +#include <linux/pci.h>
8010 +#include <linux/module.h>
8011 +#include <linux/version.h>
8012 +#include <asm/io.h>
8013 +#include <xen/balloon.h>
8014 +#include <xen/gnttab.h>
8015 +#include <asm/swiotlb.h>
8016 +#include <asm/tlbflush.h>
8017 +#include <asm-i386/mach-xen/asm/swiotlb.h>
8018 +#include <asm-i386/mach-xen/asm/gnttab_dma.h>
8019 +#include <asm/bug.h>
8020 +
8021 +#ifdef __x86_64__
8022 +#include <asm/proto.h>
8023 +
8024 +int iommu_merge __read_mostly = 0;
8025 +EXPORT_SYMBOL(iommu_merge);
8026 +
8027 +dma_addr_t bad_dma_address __read_mostly;
8028 +EXPORT_SYMBOL(bad_dma_address);
8029 +
8030 +/* This tells the BIO block layer to assume merging. Default to off
8031 + because we cannot guarantee merging later. */
8032 +int iommu_bio_merge __read_mostly = 0;
8033 +EXPORT_SYMBOL(iommu_bio_merge);
8034 +
8035 +int force_iommu __read_mostly= 0;
8036 +
8037 +__init int iommu_setup(char *p)
8038 +{
8039 + return 1;
8040 +}
8041 +
8042 +void __init pci_iommu_alloc(void)
8043 +{
8044 +#ifdef CONFIG_SWIOTLB
8045 + pci_swiotlb_init();
8046 +#endif
8047 +}
8048 +
8049 +static int __init pci_iommu_init(void)
8050 +{
8051 + no_iommu_init();
8052 + return 0;
8053 +}
8054 +
8055 +/* Must execute after PCI subsystem */
8056 +fs_initcall(pci_iommu_init);
8057 +#endif
8058 +
8059 +struct dma_coherent_mem {
8060 + void *virt_base;
8061 + u32 device_base;
8062 + int size;
8063 + int flags;
8064 + unsigned long *bitmap;
8065 +};
8066 +
8067 +#define IOMMU_BUG_ON(test) \
8068 +do { \
8069 + if (unlikely(test)) { \
8070 + printk(KERN_ALERT "Fatal DMA error! " \
8071 + "Please use 'swiotlb=force'\n"); \
8072 + BUG(); \
8073 + } \
8074 +} while (0)
8075 +
8076 +static int check_pages_physically_contiguous(unsigned long pfn,
8077 + unsigned int offset,
8078 + size_t length)
8079 +{
8080 + unsigned long next_mfn;
8081 + int i;
8082 + int nr_pages;
8083 +
8084 + next_mfn = pfn_to_mfn(pfn);
8085 + nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
8086 +
8087 + for (i = 1; i < nr_pages; i++) {
8088 + if (pfn_to_mfn(++pfn) != ++next_mfn)
8089 + return 0;
8090 + }
8091 + return 1;
8092 +}
8093 +
8094 +int range_straddles_page_boundary(paddr_t p, size_t size)
8095 +{
8096 + unsigned long pfn = p >> PAGE_SHIFT;
8097 + unsigned int offset = p & ~PAGE_MASK;
8098 +
8099 + return ((offset + size > PAGE_SIZE) &&
8100 + !check_pages_physically_contiguous(pfn, offset, size));
8101 +}
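Aside: under Xen the pseudo-physical frames a domain sees need not map to contiguous machine frames, which is why the helpers above walk pfn_to_mfn() across the buffer. A small illustrative check ('buf' is a hypothetical kernel buffer, not from the patch):

	/* A 0x300-byte region starting at page offset 0xf00 crosses one page
	 * boundary, so it is only a valid single DMA target if the two
	 * underlying machine frames happen to be adjacent: */
	if (range_straddles_page_boundary(__pa(buf), 0x300))
		printk(KERN_DEBUG "buffer not machine-contiguous, needs bouncing\n");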
8102 +
8103 +int
8104 +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
8105 + enum dma_data_direction direction)
8106 +{
8107 + int i, rc;
8108 +
8109 + if (direction == DMA_NONE)
8110 + BUG();
8111 + WARN_ON(nents == 0 || sg[0].length == 0);
8112 +
8113 + if (swiotlb) {
8114 + rc = swiotlb_map_sg(hwdev, sg, nents, direction);
8115 + } else {
8116 + for (i = 0; i < nents; i++ ) {
8117 + BUG_ON(!sg[i].page);
8118 + sg[i].dma_address =
8119 + gnttab_dma_map_page(sg[i].page) + sg[i].offset;
8120 + sg[i].dma_length = sg[i].length;
8121 + IOMMU_BUG_ON(address_needs_mapping(
8122 + hwdev, sg[i].dma_address));
8123 + IOMMU_BUG_ON(range_straddles_page_boundary(
8124 + page_to_pseudophys(sg[i].page) + sg[i].offset,
8125 + sg[i].length));
8126 + }
8127 + rc = nents;
8128 + }
8129 +
8130 + flush_write_buffers();
8131 + return rc;
8132 +}
8133 +EXPORT_SYMBOL(dma_map_sg);
8134 +
8135 +void
8136 +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
8137 + enum dma_data_direction direction)
8138 +{
8139 + int i;
8140 +
8141 + BUG_ON(direction == DMA_NONE);
8142 + if (swiotlb)
8143 + swiotlb_unmap_sg(hwdev, sg, nents, direction);
8144 + else {
8145 + for (i = 0; i < nents; i++ )
8146 + gnttab_dma_unmap_page(sg[i].dma_address);
8147 + }
8148 +}
8149 +EXPORT_SYMBOL(dma_unmap_sg);
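A minimal usage sketch for the scatter-gather pair above (illustrative only, not from the patch; 'pdev', 'sglist' and 'nents' are hypothetical driver state):

	int n = dma_map_sg(&pdev->dev, sglist, nents, DMA_TO_DEVICE);
	if (n == 0)
		return -EIO;
	/* ... hand sg[i].dma_address / sg[i].dma_length to the device ... */
	dma_unmap_sg(&pdev->dev, sglist, nents, DMA_TO_DEVICE);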
8150 +
8151 +#ifdef CONFIG_HIGHMEM
8152 +dma_addr_t
8153 +dma_map_page(struct device *dev, struct page *page, unsigned long offset,
8154 + size_t size, enum dma_data_direction direction)
8155 +{
8156 + dma_addr_t dma_addr;
8157 +
8158 + BUG_ON(direction == DMA_NONE);
8159 +
8160 + if (swiotlb) {
8161 + dma_addr = swiotlb_map_page(
8162 + dev, page, offset, size, direction);
8163 + } else {
8164 + dma_addr = gnttab_dma_map_page(page) + offset;
8165 + IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr));
8166 + }
8167 +
8168 + return dma_addr;
8169 +}
8170 +EXPORT_SYMBOL(dma_map_page);
8171 +
8172 +void
8173 +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
8174 + enum dma_data_direction direction)
8175 +{
8176 + BUG_ON(direction == DMA_NONE);
8177 + if (swiotlb)
8178 + swiotlb_unmap_page(dev, dma_address, size, direction);
8179 + else
8180 + gnttab_dma_unmap_page(dma_address);
8181 +}
8182 +EXPORT_SYMBOL(dma_unmap_page);
8183 +#endif /* CONFIG_HIGHMEM */
8184 +
8185 +int
8186 +dma_mapping_error(dma_addr_t dma_addr)
8187 +{
8188 + if (swiotlb)
8189 + return swiotlb_dma_mapping_error(dma_addr);
8190 + return 0;
8191 +}
8192 +EXPORT_SYMBOL(dma_mapping_error);
8193 +
8194 +int
8195 +dma_supported(struct device *dev, u64 mask)
8196 +{
8197 + if (swiotlb)
8198 + return swiotlb_dma_supported(dev, mask);
8199 + /*
8200 + * By default we'll BUG when an infeasible DMA is requested, and
8201 + * request swiotlb=force (see IOMMU_BUG_ON).
8202 + */
8203 + return 1;
8204 +}
8205 +EXPORT_SYMBOL(dma_supported);
8206 +
8207 +void *dma_alloc_coherent(struct device *dev, size_t size,
8208 + dma_addr_t *dma_handle, gfp_t gfp)
8209 +{
8210 + void *ret;
8211 + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
8212 + unsigned int order = get_order(size);
8213 + unsigned long vstart;
8214 + u64 mask;
8215 +
8216 + /* ignore region specifiers */
8217 + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
8218 +
8219 + if (mem) {
8220 + int page = bitmap_find_free_region(mem->bitmap, mem->size,
8221 + order);
8222 + if (page >= 0) {
8223 + *dma_handle = mem->device_base + (page << PAGE_SHIFT);
8224 + ret = mem->virt_base + (page << PAGE_SHIFT);
8225 + memset(ret, 0, size);
8226 + return ret;
8227 + }
8228 + if (mem->flags & DMA_MEMORY_EXCLUSIVE)
8229 + return NULL;
8230 + }
8231 +
8232 + if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
8233 + gfp |= GFP_DMA;
8234 +
8235 + vstart = __get_free_pages(gfp, order);
8236 + ret = (void *)vstart;
8237 +
8238 + if (dev != NULL && dev->coherent_dma_mask)
8239 + mask = dev->coherent_dma_mask;
8240 + else
8241 + mask = 0xffffffff;
8242 +
8243 + if (ret != NULL) {
8244 + if (xen_create_contiguous_region(vstart, order,
8245 + fls64(mask)) != 0) {
8246 + free_pages(vstart, order);
8247 + return NULL;
8248 + }
8249 + memset(ret, 0, size);
8250 + *dma_handle = virt_to_bus(ret);
8251 + }
8252 + return ret;
8253 +}
8254 +EXPORT_SYMBOL(dma_alloc_coherent);
8255 +
8256 +void dma_free_coherent(struct device *dev, size_t size,
8257 + void *vaddr, dma_addr_t dma_handle)
8258 +{
8259 + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
8260 + int order = get_order(size);
8261 +
8262 + if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
8263 + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
8264 +
8265 + bitmap_release_region(mem->bitmap, page, order);
8266 + } else {
8267 + xen_destroy_contiguous_region((unsigned long)vaddr, order);
8268 + free_pages((unsigned long)vaddr, order);
8269 + }
8270 +}
8271 +EXPORT_SYMBOL(dma_free_coherent);
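A minimal usage sketch for the coherent allocator above (illustrative only, not from the patch; 'pdev' is a hypothetical PCI device):

	dma_addr_t ring_dma;
	void *ring = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, &ring_dma, GFP_KERNEL);
	if (!ring)
		return -ENOMEM;
	/* ... give ring_dma to the hardware, access ring through the CPU pointer ... */
	dma_free_coherent(&pdev->dev, PAGE_SIZE, ring, ring_dma);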
8272 +
8273 +#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
8274 +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
8275 + dma_addr_t device_addr, size_t size, int flags)
8276 +{
8277 + void __iomem *mem_base;
8278 + int pages = size >> PAGE_SHIFT;
8279 + int bitmap_size = (pages + 31)/32;
8280 +
8281 + if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
8282 + goto out;
8283 + if (!size)
8284 + goto out;
8285 + if (dev->dma_mem)
8286 + goto out;
8287 +
8288 + /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
8289 +
8290 + mem_base = ioremap(bus_addr, size);
8291 + if (!mem_base)
8292 + goto out;
8293 +
8294 + dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
8295 + if (!dev->dma_mem)
8296 + goto out;
8297 + memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
8298 + dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
8299 + if (!dev->dma_mem->bitmap)
8300 + goto free1_out;
8301 + memset(dev->dma_mem->bitmap, 0, bitmap_size);
8302 +
8303 + dev->dma_mem->virt_base = mem_base;
8304 + dev->dma_mem->device_base = device_addr;
8305 + dev->dma_mem->size = pages;
8306 + dev->dma_mem->flags = flags;
8307 +
8308 + if (flags & DMA_MEMORY_MAP)
8309 + return DMA_MEMORY_MAP;
8310 +
8311 + return DMA_MEMORY_IO;
8312 +
8313 + free1_out:
8314 + kfree(dev->dma_mem->bitmap);
8315 + out:
8316 + return 0;
8317 +}
8318 +EXPORT_SYMBOL(dma_declare_coherent_memory);
8319 +
8320 +void dma_release_declared_memory(struct device *dev)
8321 +{
8322 + struct dma_coherent_mem *mem = dev->dma_mem;
8323 +
8324 + if(!mem)
8325 + return;
8326 + dev->dma_mem = NULL;
8327 + iounmap(mem->virt_base);
8328 + kfree(mem->bitmap);
8329 + kfree(mem);
8330 +}
8331 +EXPORT_SYMBOL(dma_release_declared_memory);
8332 +
8333 +void *dma_mark_declared_memory_occupied(struct device *dev,
8334 + dma_addr_t device_addr, size_t size)
8335 +{
8336 + struct dma_coherent_mem *mem = dev->dma_mem;
8337 + int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
8338 + int pos, err;
8339 +
8340 + if (!mem)
8341 + return ERR_PTR(-EINVAL);
8342 +
8343 + pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
8344 + err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
8345 + if (err != 0)
8346 + return ERR_PTR(err);
8347 + return mem->virt_base + (pos << PAGE_SHIFT);
8348 +}
8349 +EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
8350 +#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
8351 +
8352 +dma_addr_t
8353 +dma_map_single(struct device *dev, void *ptr, size_t size,
8354 + enum dma_data_direction direction)
8355 +{
8356 + dma_addr_t dma;
8357 +
8358 + if (direction == DMA_NONE)
8359 + BUG();
8360 + WARN_ON(size == 0);
8361 +
8362 + if (swiotlb) {
8363 + dma = swiotlb_map_single(dev, ptr, size, direction);
8364 + } else {
8365 + dma = gnttab_dma_map_page(virt_to_page(ptr)) +
8366 + offset_in_page(ptr);
8367 + IOMMU_BUG_ON(range_straddles_page_boundary(__pa(ptr), size));
8368 + IOMMU_BUG_ON(address_needs_mapping(dev, dma));
8369 + }
8370 +
8371 + flush_write_buffers();
8372 + return dma;
8373 +}
8374 +EXPORT_SYMBOL(dma_map_single);
8375 +
8376 +void
8377 +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
8378 + enum dma_data_direction direction)
8379 +{
8380 + if (direction == DMA_NONE)
8381 + BUG();
8382 + if (swiotlb)
8383 + swiotlb_unmap_single(dev, dma_addr, size, direction);
8384 + else
8385 + gnttab_dma_unmap_page(dma_addr);
8386 +}
8387 +EXPORT_SYMBOL(dma_unmap_single);
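A minimal usage sketch for the streaming single-buffer path above, including the error check exported earlier in this file (illustrative only, not from the patch; 'pdev', 'buf' and 'len' are hypothetical):

	dma_addr_t handle = dma_map_single(&pdev->dev, buf, len, DMA_FROM_DEVICE);
	if (dma_mapping_error(handle))
		return -EIO;
	/* ... device writes into buf by DMA ... */
	dma_unmap_single(&pdev->dev, handle, len, DMA_FROM_DEVICE);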
8388 +
8389 +void
8390 +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
8391 + enum dma_data_direction direction)
8392 +{
8393 + if (swiotlb)
8394 + swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction);
8395 +}
8396 +EXPORT_SYMBOL(dma_sync_single_for_cpu);
8397 +
8398 +void
8399 +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
8400 + enum dma_data_direction direction)
8401 +{
8402 + if (swiotlb)
8403 + swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
8404 +}
8405 +EXPORT_SYMBOL(dma_sync_single_for_device);
8406 Index: head-2008-11-25/arch/x86/kernel/process_32-xen.c
8407 ===================================================================
8408 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
8409 +++ head-2008-11-25/arch/x86/kernel/process_32-xen.c 2008-07-21 11:00:32.000000000 +0200
8410 @@ -0,0 +1,877 @@
8411 +/*
8412 + * linux/arch/i386/kernel/process.c
8413 + *
8414 + * Copyright (C) 1995 Linus Torvalds
8415 + *
8416 + * Pentium III FXSR, SSE support
8417 + * Gareth Hughes <gareth@valinux.com>, May 2000
8418 + */
8419 +
8420 +/*
8421 + * This file handles the architecture-dependent parts of process handling..
8422 + */
8423 +
8424 +#include <stdarg.h>
8425 +
8426 +#include <linux/cpu.h>
8427 +#include <linux/errno.h>
8428 +#include <linux/sched.h>
8429 +#include <linux/fs.h>
8430 +#include <linux/kernel.h>
8431 +#include <linux/mm.h>
8432 +#include <linux/elfcore.h>
8433 +#include <linux/smp.h>
8434 +#include <linux/smp_lock.h>
8435 +#include <linux/stddef.h>
8436 +#include <linux/slab.h>
8437 +#include <linux/vmalloc.h>
8438 +#include <linux/user.h>
8439 +#include <linux/a.out.h>
8440 +#include <linux/interrupt.h>
8441 +#include <linux/utsname.h>
8442 +#include <linux/delay.h>
8443 +#include <linux/reboot.h>
8444 +#include <linux/init.h>
8445 +#include <linux/mc146818rtc.h>
8446 +#include <linux/module.h>
8447 +#include <linux/kallsyms.h>
8448 +#include <linux/ptrace.h>
8449 +#include <linux/random.h>
8450 +
8451 +#include <asm/uaccess.h>
8452 +#include <asm/pgtable.h>
8453 +#include <asm/system.h>
8454 +#include <asm/io.h>
8455 +#include <asm/ldt.h>
8456 +#include <asm/processor.h>
8457 +#include <asm/i387.h>
8458 +#include <asm/desc.h>
8459 +#include <asm/vm86.h>
8460 +#ifdef CONFIG_MATH_EMULATION
8461 +#include <asm/math_emu.h>
8462 +#endif
8463 +
8464 +#include <xen/interface/physdev.h>
8465 +#include <xen/interface/vcpu.h>
8466 +#include <xen/cpu_hotplug.h>
8467 +
8468 +#include <linux/err.h>
8469 +
8470 +#include <asm/tlbflush.h>
8471 +#include <asm/cpu.h>
8472 +
8473 +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
8474 +
8475 +static int hlt_counter;
8476 +
8477 +unsigned long boot_option_idle_override = 0;
8478 +EXPORT_SYMBOL(boot_option_idle_override);
8479 +
8480 +/*
8481 + * Return saved PC of a blocked thread.
8482 + */
8483 +unsigned long thread_saved_pc(struct task_struct *tsk)
8484 +{
8485 + return ((unsigned long *)tsk->thread.esp)[3];
8486 +}
8487 +
8488 +/*
8489 + * Power management idle function, if any.
8490 + */
8491 +void (*pm_idle)(void);
8492 +EXPORT_SYMBOL(pm_idle);
8493 +static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
8494 +
8495 +void disable_hlt(void)
8496 +{
8497 + hlt_counter++;
8498 +}
8499 +
8500 +EXPORT_SYMBOL(disable_hlt);
8501 +
8502 +void enable_hlt(void)
8503 +{
8504 + hlt_counter--;
8505 +}
8506 +
8507 +EXPORT_SYMBOL(enable_hlt);
8508 +
8509 +/*
8510 + * On SMP it's slightly faster (but much more power-consuming!)
8511 + * to poll the ->work.need_resched flag instead of waiting for the
8512 + * cross-CPU IPI to arrive. Use this option with caution.
8513 + */
8514 +static void poll_idle (void)
8515 +{
8516 + local_irq_enable();
8517 +
8518 + asm volatile(
8519 + "2:"
8520 + "testl %0, %1;"
8521 + "rep; nop;"
8522 + "je 2b;"
8523 + : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
8524 +}
8525 +
8526 +static void xen_idle(void)
8527 +{
8528 + local_irq_disable();
8529 +
8530 + if (need_resched())
8531 + local_irq_enable();
8532 + else {
8533 + current_thread_info()->status &= ~TS_POLLING;
8534 + smp_mb__after_clear_bit();
8535 + safe_halt();
8536 + current_thread_info()->status |= TS_POLLING;
8537 + }
8538 +}
8539 +#ifdef CONFIG_APM_MODULE
8540 +EXPORT_SYMBOL(default_idle);
8541 +#endif
8542 +
8543 +#ifdef CONFIG_HOTPLUG_CPU
8544 +extern cpumask_t cpu_initialized;
8545 +static inline void play_dead(void)
8546 +{
8547 + idle_task_exit();
8548 + local_irq_disable();
8549 + cpu_clear(smp_processor_id(), cpu_initialized);
8550 + preempt_enable_no_resched();
8551 + VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
8552 + cpu_bringup();
8553 +}
8554 +#else
8555 +static inline void play_dead(void)
8556 +{
8557 + BUG();
8558 +}
8559 +#endif /* CONFIG_HOTPLUG_CPU */
8560 +
8561 +/*
8562 + * The idle thread. There's no useful work to be
8563 + * done, so just try to conserve power and have a
8564 + * low exit latency (ie sit in a loop waiting for
8565 + * somebody to say that they'd like to reschedule)
8566 + */
8567 +void cpu_idle(void)
8568 +{
8569 + int cpu = smp_processor_id();
8570 +
8571 + current_thread_info()->status |= TS_POLLING;
8572 +
8573 + /* endless idle loop with no priority at all */
8574 + while (1) {
8575 + while (!need_resched()) {
8576 + void (*idle)(void);
8577 +
8578 + if (__get_cpu_var(cpu_idle_state))
8579 + __get_cpu_var(cpu_idle_state) = 0;
8580 +
8581 + rmb();
8582 + idle = xen_idle; /* no alternatives */
8583 +
8584 + if (cpu_is_offline(cpu))
8585 + play_dead();
8586 +
8587 + __get_cpu_var(irq_stat).idle_timestamp = jiffies;
8588 + idle();
8589 + }
8590 + preempt_enable_no_resched();
8591 + schedule();
8592 + preempt_disable();
8593 + }
8594 +}
8595 +
8596 +void cpu_idle_wait(void)
8597 +{
8598 + unsigned int cpu, this_cpu = get_cpu();
8599 + cpumask_t map;
8600 +
8601 + set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
8602 + put_cpu();
8603 +
8604 + cpus_clear(map);
8605 + for_each_online_cpu(cpu) {
8606 + per_cpu(cpu_idle_state, cpu) = 1;
8607 + cpu_set(cpu, map);
8608 + }
8609 +
8610 + __get_cpu_var(cpu_idle_state) = 0;
8611 +
8612 + wmb();
8613 + do {
8614 + ssleep(1);
8615 + for_each_online_cpu(cpu) {
8616 + if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
8617 + cpu_clear(cpu, map);
8618 + }
8619 + cpus_and(map, map, cpu_online_map);
8620 + } while (!cpus_empty(map));
8621 +}
8622 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
8623 +
8624 +void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
8625 +{
8626 +}
8627 +
8628 +static int __init idle_setup (char *str)
8629 +{
8630 + if (!strncmp(str, "poll", 4)) {
8631 + printk("using polling idle threads.\n");
8632 + pm_idle = poll_idle;
8633 + }
8634 +
8635 + boot_option_idle_override = 1;
8636 + return 1;
8637 +}
8638 +
8639 +__setup("idle=", idle_setup);
8640 +
8641 +void show_regs(struct pt_regs * regs)
8642 +{
8643 + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
8644 +
8645 + printk("\n");
8646 + printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
8647 + printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
8648 + print_symbol("EIP is at %s\n", regs->eip);
8649 +
8650 + if (user_mode_vm(regs))
8651 + printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
8652 + printk(" EFLAGS: %08lx %s (%s %.*s)\n",
8653 + regs->eflags, print_tainted(), system_utsname.release,
8654 + (int)strcspn(system_utsname.version, " "),
8655 + system_utsname.version);
8656 + printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
8657 + regs->eax,regs->ebx,regs->ecx,regs->edx);
8658 + printk("ESI: %08lx EDI: %08lx EBP: %08lx",
8659 + regs->esi, regs->edi, regs->ebp);
8660 + printk(" DS: %04x ES: %04x\n",
8661 + 0xffff & regs->xds,0xffff & regs->xes);
8662 +
8663 + cr0 = read_cr0();
8664 + cr2 = read_cr2();
8665 + cr3 = read_cr3();
8666 + cr4 = read_cr4_safe();
8667 + printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
8668 + show_trace(NULL, regs, &regs->esp);
8669 +}
8670 +
8671 +/*
8672 + * This gets run with %ebx containing the
8673 + * function to call, and %edx containing
8674 + * the "args".
8675 + */
8676 +extern void kernel_thread_helper(void);
8677 +__asm__(".section .text\n"
8678 + ".align 4\n"
8679 + "kernel_thread_helper:\n\t"
8680 + "movl %edx,%eax\n\t"
8681 + "pushl %edx\n\t"
8682 + "call *%ebx\n\t"
8683 + "pushl %eax\n\t"
8684 + "call do_exit\n"
8685 + ".previous");
8686 +
8687 +/*
8688 + * Create a kernel thread
8689 + */
8690 +int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
8691 +{
8692 + struct pt_regs regs;
8693 +
8694 + memset(&regs, 0, sizeof(regs));
8695 +
8696 + regs.ebx = (unsigned long) fn;
8697 + regs.edx = (unsigned long) arg;
8698 +
8699 + regs.xds = __USER_DS;
8700 + regs.xes = __USER_DS;
8701 + regs.orig_eax = -1;
8702 + regs.eip = (unsigned long) kernel_thread_helper;
8703 + regs.xcs = GET_KERNEL_CS();
8704 + regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
8705 +
8706 + /* Ok, create the new process.. */
8707 + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
8708 +}
8709 +EXPORT_SYMBOL(kernel_thread);
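A minimal usage sketch for kernel_thread() above (illustrative only, not from the patch; the worker function is hypothetical); the assembly helper routes the function's return value into do_exit():

	static int my_worker(void *data)
	{
		/* do the work; returning here ends up in do_exit() via the helper */
		return 0;
	}

	/* somewhere in init code: */
	int pid = kernel_thread(my_worker, NULL, CLONE_FS | CLONE_FILES | SIGCHLD);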
8710 +
8711 +/*
8712 + * Free current thread data structures etc..
8713 + */
8714 +void exit_thread(void)
8715 +{
8716 + /* The process may have allocated an io port bitmap... nuke it. */
8717 + if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
8718 + struct task_struct *tsk = current;
8719 + struct thread_struct *t = &tsk->thread;
8720 + struct physdev_set_iobitmap set_iobitmap;
8721 + memset(&set_iobitmap, 0, sizeof(set_iobitmap));
8722 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
8723 + &set_iobitmap));
8724 + kfree(t->io_bitmap_ptr);
8725 + t->io_bitmap_ptr = NULL;
8726 + clear_thread_flag(TIF_IO_BITMAP);
8727 + }
8728 +}
8729 +
8730 +void flush_thread(void)
8731 +{
8732 + struct task_struct *tsk = current;
8733 +
8734 + memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
8735 + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
8736 + clear_tsk_thread_flag(tsk, TIF_DEBUG);
8737 + /*
8738 + * Forget coprocessor state..
8739 + */
8740 + clear_fpu(tsk);
8741 + clear_used_math();
8742 +}
8743 +
8744 +void release_thread(struct task_struct *dead_task)
8745 +{
8746 + BUG_ON(dead_task->mm);
8747 + release_vm86_irqs(dead_task);
8748 +}
8749 +
8750 +/*
8751 + * This gets called before we allocate a new thread and copy
8752 + * the current task into it.
8753 + */
8754 +void prepare_to_copy(struct task_struct *tsk)
8755 +{
8756 + unlazy_fpu(tsk);
8757 +}
8758 +
8759 +int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
8760 + unsigned long unused,
8761 + struct task_struct * p, struct pt_regs * regs)
8762 +{
8763 + struct pt_regs * childregs;
8764 + struct task_struct *tsk;
8765 + int err;
8766 +
8767 + childregs = task_pt_regs(p);
8768 + *childregs = *regs;
8769 + childregs->eax = 0;
8770 + childregs->esp = esp;
8771 +
8772 + p->thread.esp = (unsigned long) childregs;
8773 + p->thread.esp0 = (unsigned long) (childregs+1);
8774 +
8775 + p->thread.eip = (unsigned long) ret_from_fork;
8776 +
8777 + savesegment(fs,p->thread.fs);
8778 + savesegment(gs,p->thread.gs);
8779 +
8780 + tsk = current;
8781 + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
8782 + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
8783 + if (!p->thread.io_bitmap_ptr) {
8784 + p->thread.io_bitmap_max = 0;
8785 + return -ENOMEM;
8786 + }
8787 + memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
8788 + IO_BITMAP_BYTES);
8789 + set_tsk_thread_flag(p, TIF_IO_BITMAP);
8790 + }
8791 +
8792 + /*
8793 + * Set a new TLS for the child thread?
8794 + */
8795 + if (clone_flags & CLONE_SETTLS) {
8796 + struct desc_struct *desc;
8797 + struct user_desc info;
8798 + int idx;
8799 +
8800 + err = -EFAULT;
8801 + if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
8802 + goto out;
8803 + err = -EINVAL;
8804 + if (LDT_empty(&info))
8805 + goto out;
8806 +
8807 + idx = info.entry_number;
8808 + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
8809 + goto out;
8810 +
8811 + desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
8812 + desc->a = LDT_entry_a(&info);
8813 + desc->b = LDT_entry_b(&info);
8814 + }
8815 +
8816 + p->thread.iopl = current->thread.iopl;
8817 +
8818 + err = 0;
8819 + out:
8820 + if (err && p->thread.io_bitmap_ptr) {
8821 + kfree(p->thread.io_bitmap_ptr);
8822 + p->thread.io_bitmap_max = 0;
8823 + }
8824 + return err;
8825 +}
8826 +
8827 +/*
8828 + * fill in the user structure for a core dump..
8829 + */
8830 +void dump_thread(struct pt_regs * regs, struct user * dump)
8831 +{
8832 + int i;
8833 +
8834 +/* changed the size calculations - should hopefully work better. lbt */
8835 + dump->magic = CMAGIC;
8836 + dump->start_code = 0;
8837 + dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
8838 + dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
8839 + dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
8840 + dump->u_dsize -= dump->u_tsize;
8841 + dump->u_ssize = 0;
8842 + for (i = 0; i < 8; i++)
8843 + dump->u_debugreg[i] = current->thread.debugreg[i];
8844 +
8845 + if (dump->start_stack < TASK_SIZE)
8846 + dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
8847 +
8848 + dump->regs.ebx = regs->ebx;
8849 + dump->regs.ecx = regs->ecx;
8850 + dump->regs.edx = regs->edx;
8851 + dump->regs.esi = regs->esi;
8852 + dump->regs.edi = regs->edi;
8853 + dump->regs.ebp = regs->ebp;
8854 + dump->regs.eax = regs->eax;
8855 + dump->regs.ds = regs->xds;
8856 + dump->regs.es = regs->xes;
8857 + savesegment(fs,dump->regs.fs);
8858 + savesegment(gs,dump->regs.gs);
8859 + dump->regs.orig_eax = regs->orig_eax;
8860 + dump->regs.eip = regs->eip;
8861 + dump->regs.cs = regs->xcs;
8862 + dump->regs.eflags = regs->eflags;
8863 + dump->regs.esp = regs->esp;
8864 + dump->regs.ss = regs->xss;
8865 +
8866 + dump->u_fpvalid = dump_fpu (regs, &dump->i387);
8867 +}
8868 +EXPORT_SYMBOL(dump_thread);
8869 +
8870 +/*
8871 + * Capture the user space registers if the task is not running (in user space)
8872 + */
8873 +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
8874 +{
8875 + struct pt_regs ptregs = *task_pt_regs(tsk);
8876 + ptregs.xcs &= 0xffff;
8877 + ptregs.xds &= 0xffff;
8878 + ptregs.xes &= 0xffff;
8879 + ptregs.xss &= 0xffff;
8880 +
8881 + elf_core_copy_regs(regs, &ptregs);
8882 +
8883 + return 1;
8884 +}
8885 +
8886 +static noinline void __switch_to_xtra(struct task_struct *next_p)
8887 +{
8888 + struct thread_struct *next;
8889 +
8890 + next = &next_p->thread;
8891 +
8892 + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
8893 + set_debugreg(next->debugreg[0], 0);
8894 + set_debugreg(next->debugreg[1], 1);
8895 + set_debugreg(next->debugreg[2], 2);
8896 + set_debugreg(next->debugreg[3], 3);
8897 + /* no 4 and 5 */
8898 + set_debugreg(next->debugreg[6], 6);
8899 + set_debugreg(next->debugreg[7], 7);
8900 + }
8901 +}
8902 +
8903 +/*
8904 + * This function selects if the context switch from prev to next
8905 + * has to tweak the TSC disable bit in the cr4.
8906 + */
8907 +static inline void disable_tsc(struct task_struct *prev_p,
8908 + struct task_struct *next_p)
8909 +{
8910 + struct thread_info *prev, *next;
8911 +
8912 + /*
8913 + * gcc should eliminate the ->thread_info dereference if
8914 + * has_secure_computing returns 0 at compile time (SECCOMP=n).
8915 + */
8916 + prev = task_thread_info(prev_p);
8917 + next = task_thread_info(next_p);
8918 +
8919 + if (has_secure_computing(prev) || has_secure_computing(next)) {
8920 + /* slow path here */
8921 + if (has_secure_computing(prev) &&
8922 + !has_secure_computing(next)) {
8923 + write_cr4(read_cr4() & ~X86_CR4_TSD);
8924 + } else if (!has_secure_computing(prev) &&
8925 + has_secure_computing(next))
8926 + write_cr4(read_cr4() | X86_CR4_TSD);
8927 + }
8928 +}
8929 +
8930 +/*
8931 + * switch_to(x,y) should switch tasks from x to y.
8932 + *
8933 + * We fsave/fwait so that an exception goes off at the right time
8934 + * (as a call from the fsave or fwait in effect) rather than to
8935 + * the wrong process. Lazy FP saving no longer makes any sense
8936 + * with modern CPUs, and this simplifies a lot of things (SMP
8937 + * and UP become the same).
8938 + *
8939 + * NOTE! We used to use the x86 hardware context switching. The
8940 + * reason for not using it any more becomes apparent when you
8941 + * try to recover gracefully from saved state that is no longer
8942 + * valid (stale segment register values in particular). With the
8943 + * hardware task-switch, there is no way to fix up bad state in
8944 + * a reasonable manner.
8945 + *
8946 + * The fact that Intel documents the hardware task-switching to
8947 + * be slow is a fairly red herring - this code is not noticeably
8948 + * faster. However, there _is_ some room for improvement here,
8949 + * so the performance issues may eventually be a valid point.
8950 + * More important, however, is the fact that this allows us much
8951 + * more flexibility.
8952 + *
8953 + * The return value (in %eax) will be the "prev" task after
8954 + * the task-switch, and shows up in ret_from_fork in entry.S,
8955 + * for example.
8956 + */
8957 +struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
8958 +{
8959 + struct thread_struct *prev = &prev_p->thread,
8960 + *next = &next_p->thread;
8961 + int cpu = smp_processor_id();
8962 +#ifndef CONFIG_X86_NO_TSS
8963 + struct tss_struct *tss = &per_cpu(init_tss, cpu);
8964 +#endif
8965 +#if CONFIG_XEN_COMPAT > 0x030002
8966 + struct physdev_set_iopl iopl_op;
8967 + struct physdev_set_iobitmap iobmp_op;
8968 +#else
8969 + struct physdev_op _pdo[2], *pdo = _pdo;
8970 +#define iopl_op pdo->u.set_iopl
8971 +#define iobmp_op pdo->u.set_iobitmap
8972 +#endif
8973 + multicall_entry_t _mcl[8], *mcl = _mcl;
8974 +
8975 + /* XEN NOTE: FS/GS saved in switch_mm(), not here. */
8976 +
8977 + /*
8978 + * This is basically '__unlazy_fpu', except that we queue a
8979 + * multicall to indicate FPU task switch, rather than
8980 + * synchronously trapping to Xen.
8981 + */
8982 + if (prev_p->thread_info->status & TS_USEDFPU) {
8983 + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
8984 + mcl->op = __HYPERVISOR_fpu_taskswitch;
8985 + mcl->args[0] = 1;
8986 + mcl++;
8987 + }
8988 +#if 0 /* lazy fpu sanity check */
8989 + else BUG_ON(!(read_cr0() & 8));
8990 +#endif
8991 +
8992 + /*
8993 + * Reload esp0.
8994 + * This is load_esp0(tss, next) with a multicall.
8995 + */
8996 + mcl->op = __HYPERVISOR_stack_switch;
8997 + mcl->args[0] = __KERNEL_DS;
8998 + mcl->args[1] = next->esp0;
8999 + mcl++;
9000 +
9001 + /*
9002 + * Load the per-thread Thread-Local Storage descriptor.
9003 + * This is load_TLS(next, cpu) with multicalls.
9004 + */
9005 +#define C(i) do { \
9006 + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
9007 + next->tls_array[i].b != prev->tls_array[i].b)) { \
9008 + mcl->op = __HYPERVISOR_update_descriptor; \
9009 + *(u64 *)&mcl->args[0] = virt_to_machine( \
9010 + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
9011 + *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \
9012 + mcl++; \
9013 + } \
9014 +} while (0)
9015 + C(0); C(1); C(2);
9016 +#undef C
9017 +
9018 + if (unlikely(prev->iopl != next->iopl)) {
9019 + iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
9020 +#if CONFIG_XEN_COMPAT > 0x030002
9021 + mcl->op = __HYPERVISOR_physdev_op;
9022 + mcl->args[0] = PHYSDEVOP_set_iopl;
9023 + mcl->args[1] = (unsigned long)&iopl_op;
9024 +#else
9025 + mcl->op = __HYPERVISOR_physdev_op_compat;
9026 + pdo->cmd = PHYSDEVOP_set_iopl;
9027 + mcl->args[0] = (unsigned long)pdo++;
9028 +#endif
9029 + mcl++;
9030 + }
9031 +
9032 + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
9033 + set_xen_guest_handle(iobmp_op.bitmap,
9034 + (char *)next->io_bitmap_ptr);
9035 + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
9036 +#if CONFIG_XEN_COMPAT > 0x030002
9037 + mcl->op = __HYPERVISOR_physdev_op;
9038 + mcl->args[0] = PHYSDEVOP_set_iobitmap;
9039 + mcl->args[1] = (unsigned long)&iobmp_op;
9040 +#else
9041 + mcl->op = __HYPERVISOR_physdev_op_compat;
9042 + pdo->cmd = PHYSDEVOP_set_iobitmap;
9043 + mcl->args[0] = (unsigned long)pdo++;
9044 +#endif
9045 + mcl++;
9046 + }
9047 +
9048 +#if CONFIG_XEN_COMPAT <= 0x030002
9049 + BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo));
9050 +#endif
9051 + BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
9052 + if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
9053 + BUG();
9054 +
9055 + /*
9056 + * Restore %fs and %gs if needed.
9057 + *
9058 + * Glibc normally makes %fs be zero, and %gs is one of
9059 + * the TLS segments.
9060 + */
9061 + if (unlikely(next->fs))
9062 + loadsegment(fs, next->fs);
9063 +
9064 + if (next->gs)
9065 + loadsegment(gs, next->gs);
9066 +
9067 + /*
9068 + * Now maybe handle debug registers
9069 + */
9070 + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
9071 + __switch_to_xtra(next_p);
9072 +
9073 + disable_tsc(prev_p, next_p);
9074 +
9075 + return prev_p;
9076 +}
9077 +
9078 +asmlinkage int sys_fork(struct pt_regs regs)
9079 +{
9080 + return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
9081 +}
9082 +
9083 +asmlinkage int sys_clone(struct pt_regs regs)
9084 +{
9085 + unsigned long clone_flags;
9086 + unsigned long newsp;
9087 + int __user *parent_tidptr, *child_tidptr;
9088 +
9089 + clone_flags = regs.ebx;
9090 + newsp = regs.ecx;
9091 + parent_tidptr = (int __user *)regs.edx;
9092 + child_tidptr = (int __user *)regs.edi;
9093 + if (!newsp)
9094 + newsp = regs.esp;
9095 + return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
9096 +}
9097 +
9098 +/*
9099 + * This is trivial, and on the face of it looks like it
9100 + * could equally well be done in user mode.
9101 + *
9102 + * Not so, for quite unobvious reasons - register pressure.
9103 + * In user mode vfork() cannot have a stack frame, and if
9104 + * done by calling the "clone()" system call directly, you
9105 + * do not have enough call-clobbered registers to hold all
9106 + * the information you need.
9107 + */
9108 +asmlinkage int sys_vfork(struct pt_regs regs)
9109 +{
9110 + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
9111 +}
9112 +
9113 +/*
9114 + * sys_execve() executes a new program.
9115 + */
9116 +asmlinkage int sys_execve(struct pt_regs regs)
9117 +{
9118 + int error;
9119 + char * filename;
9120 +
9121 + filename = getname((char __user *) regs.ebx);
9122 + error = PTR_ERR(filename);
9123 + if (IS_ERR(filename))
9124 + goto out;
9125 + error = do_execve(filename,
9126 + (char __user * __user *) regs.ecx,
9127 + (char __user * __user *) regs.edx,
9128 + &regs);
9129 + if (error == 0) {
9130 + task_lock(current);
9131 + current->ptrace &= ~PT_DTRACE;
9132 + task_unlock(current);
9133 + /* Make sure we don't return using sysenter.. */
9134 + set_thread_flag(TIF_IRET);
9135 + }
9136 + putname(filename);
9137 +out:
9138 + return error;
9139 +}
9140 +
9141 +#define top_esp (THREAD_SIZE - sizeof(unsigned long))
9142 +#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
9143 +
9144 +unsigned long get_wchan(struct task_struct *p)
9145 +{
9146 + unsigned long ebp, esp, eip;
9147 + unsigned long stack_page;
9148 + int count = 0;
9149 + if (!p || p == current || p->state == TASK_RUNNING)
9150 + return 0;
9151 + stack_page = (unsigned long)task_stack_page(p);
9152 + esp = p->thread.esp;
9153 + if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
9154 + return 0;
9155 + /* include/asm-i386/system.h:switch_to() pushes ebp last. */
9156 + ebp = *(unsigned long *) esp;
9157 + do {
9158 + if (ebp < stack_page || ebp > top_ebp+stack_page)
9159 + return 0;
9160 + eip = *(unsigned long *) (ebp+4);
9161 + if (!in_sched_functions(eip))
9162 + return eip;
9163 + ebp = *(unsigned long *) ebp;
9164 + } while (count++ < 16);
9165 + return 0;
9166 +}
9167 +
9168 +/*
9169 + * sys_alloc_thread_area: get a yet unused TLS descriptor index.
9170 + */
9171 +static int get_free_idx(void)
9172 +{
9173 + struct thread_struct *t = &current->thread;
9174 + int idx;
9175 +
9176 + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
9177 + if (desc_empty(t->tls_array + idx))
9178 + return idx + GDT_ENTRY_TLS_MIN;
9179 + return -ESRCH;
9180 +}
9181 +
9182 +/*
9183 + * Set a given TLS descriptor:
9184 + */
9185 +asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
9186 +{
9187 + struct thread_struct *t = &current->thread;
9188 + struct user_desc info;
9189 + struct desc_struct *desc;
9190 + int cpu, idx;
9191 +
9192 + if (copy_from_user(&info, u_info, sizeof(info)))
9193 + return -EFAULT;
9194 + idx = info.entry_number;
9195 +
9196 + /*
9197 + * index -1 means the kernel should try to find and
9198 + * allocate an empty descriptor:
9199 + */
9200 + if (idx == -1) {
9201 + idx = get_free_idx();
9202 + if (idx < 0)
9203 + return idx;
9204 + if (put_user(idx, &u_info->entry_number))
9205 + return -EFAULT;
9206 + }
9207 +
9208 + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
9209 + return -EINVAL;
9210 +
9211 + desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
9212 +
9213 + /*
9214 + * We must not get preempted while modifying the TLS.
9215 + */
9216 + cpu = get_cpu();
9217 +
9218 + if (LDT_empty(&info)) {
9219 + desc->a = 0;
9220 + desc->b = 0;
9221 + } else {
9222 + desc->a = LDT_entry_a(&info);
9223 + desc->b = LDT_entry_b(&info);
9224 + }
9225 + load_TLS(t, cpu);
9226 +
9227 + put_cpu();
9228 +
9229 + return 0;
9230 +}
9231 +
9232 +/*
9233 + * Get the current Thread-Local Storage area:
9234 + */
9235 +
9236 +#define GET_BASE(desc) ( \
9237 + (((desc)->a >> 16) & 0x0000ffff) | \
9238 + (((desc)->b << 16) & 0x00ff0000) | \
9239 + ( (desc)->b & 0xff000000) )
9240 +
9241 +#define GET_LIMIT(desc) ( \
9242 + ((desc)->a & 0x0ffff) | \
9243 + ((desc)->b & 0xf0000) )
9244 +
9245 +#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
9246 +#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
9247 +#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
9248 +#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
9249 +#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
9250 +#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
9251 +
9252 +asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
9253 +{
9254 + struct user_desc info;
9255 + struct desc_struct *desc;
9256 + int idx;
9257 +
9258 + if (get_user(idx, &u_info->entry_number))
9259 + return -EFAULT;
9260 + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
9261 + return -EINVAL;
9262 +
9263 + memset(&info, 0, sizeof(info));
9264 +
9265 + desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
9266 +
9267 + info.entry_number = idx;
9268 + info.base_addr = GET_BASE(desc);
9269 + info.limit = GET_LIMIT(desc);
9270 + info.seg_32bit = GET_32BIT(desc);
9271 + info.contents = GET_CONTENTS(desc);
9272 + info.read_exec_only = !GET_WRITABLE(desc);
9273 + info.limit_in_pages = GET_LIMIT_PAGES(desc);
9274 + info.seg_not_present = !GET_PRESENT(desc);
9275 + info.useable = GET_USEABLE(desc);
9276 +
9277 + if (copy_to_user(u_info, &info, sizeof(info)))
9278 + return -EFAULT;
9279 + return 0;
9280 +}
9281 +
9282 +unsigned long arch_align_stack(unsigned long sp)
9283 +{
9284 + if (randomize_va_space)
9285 + sp -= get_random_int() % 8192;
9286 + return sp & ~0xf;
9287 +}
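For reference: the GET_BASE()/GET_LIMIT() macros used by sys_get_thread_area() above merely reassemble the base and limit fields that an x86 segment descriptor scatters across its two 32-bit words. The following self-contained user-space sketch performs the same bit extraction; the struct name, helper names and the sample descriptor value are illustrative only and are not taken from the kernel sources.

#include <stdio.h>
#include <stdint.h>

struct desc { uint32_t a, b; };		/* low and high descriptor words */

static uint32_t get_base(const struct desc *d)
{
	return ((d->a >> 16) & 0x0000ffff) |	/* base bits 15..0  */
	       ((d->b << 16) & 0x00ff0000) |	/* base bits 23..16 */
	       ( d->b        & 0xff000000);	/* base bits 31..24 */
}

static uint32_t get_limit(const struct desc *d)
{
	/* raw 20-bit limit field; granularity scaling is ignored here */
	return (d->a & 0x0ffff) | (d->b & 0xf0000);
}

int main(void)
{
	/*
	 * Hand-built value encoding base 0x12345678 and limit 0xabcde;
	 * the access/flag bits are left clear, so this is only good for
	 * exercising the field extraction, not for loading into a GDT.
	 */
	struct desc d = { 0x5678bcde, 0x120a0034 };

	printf("base  = 0x%08x\n", (unsigned int)get_base(&d));
	printf("limit = 0x%05x\n", (unsigned int)get_limit(&d));
	return 0;
}

Expected output is "base  = 0x12345678" and "limit = 0xabcde", matching what GET_BASE() and GET_LIMIT() would report for the same descriptor words.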
9288 Index: head-2008-11-25/arch/x86/kernel/quirks-xen.c
9289 ===================================================================
9290 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
9291 +++ head-2008-11-25/arch/x86/kernel/quirks-xen.c 2008-01-28 12:24:19.000000000 +0100
9292 @@ -0,0 +1,47 @@
9293 +/*
9294 + * This file contains work-arounds for x86 and x86_64 platform bugs.
9295 + */
9296 +#include <linux/pci.h>
9297 +#include <linux/irq.h>
9298 +
9299 +#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
9300 +
9301 +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
9302 +{
9303 + u8 config, rev;
9304 + u32 word;
9305 +
9306 + /* BIOS may enable hardware IRQ balancing for
9307 + * E7520/E7320/E7525 (revision ID 0x9 and below)
9308 + * based platforms.
9309 + * Disable SW irqbalance/affinity on those platforms.
9310 + */
9311 + pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
9312 + if (rev > 0x9)
9313 + return;
9314 +
9315 + printk(KERN_INFO "Intel E7520/7320/7525 detected.");
9316 +
9317 + /* enable access to config space */
9318 + pci_read_config_byte(dev, 0xf4, &config);
9319 + pci_write_config_byte(dev, 0xf4, config|0x2);
9320 +
9321 + /* read xTPR register */
9322 + raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
9323 +
9324 + if (!(word & (1 << 13))) {
9325 + struct xen_platform_op op;
9326 + printk(KERN_INFO "Disabling irq balancing and affinity\n");
9327 + op.cmd = XENPF_platform_quirk;
9328 + op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
9329 + WARN_ON(HYPERVISOR_platform_op(&op));
9330 + }
9331 +
9332 + /* put back the original value for config space */
9333 + if (!(config & 0x2))
9334 + pci_write_config_byte(dev, 0xf4, config);
9335 +}
9336 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
9337 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
9338 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
9339 +#endif
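For reference: the quirk above keys off PCI_CLASS_REVISION, i.e. config-space offset 0x08, whose first byte is the device's Revision ID, and does nothing for steppings newer than 0x9. The sketch below reads that same byte through sysfs from user space; the device address 0000:00:00.0 is only a placeholder for wherever the E7520/E7320/E7525 host bridge actually sits.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* hypothetical device path; adjust to the host bridge in question */
	const char *cfg = "/sys/bus/pci/devices/0000:00:00.0/config";
	uint8_t rev;
	FILE *f = fopen(cfg, "rb");

	if (!f) {
		perror(cfg);
		return 1;
	}
	/* Revision ID lives at config-space offset 0x08 */
	if (fseek(f, 0x08, SEEK_SET) != 0 || fread(&rev, 1, 1, f) != 1) {
		fprintf(stderr, "could not read revision ID from %s\n", cfg);
		fclose(f);
		return 1;
	}
	fclose(f);

	printf("revision ID: 0x%02x (quirk applies only when <= 0x9)\n", rev);
	return 0;
}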
9340 Index: head-2008-11-25/arch/x86/kernel/setup_32-xen.c
9341 ===================================================================
9342 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
9343 +++ head-2008-11-25/arch/x86/kernel/setup_32-xen.c 2008-04-22 15:41:51.000000000 +0200
9344 @@ -0,0 +1,1919 @@
9345 +/*
9346 + * linux/arch/i386/kernel/setup.c
9347 + *
9348 + * Copyright (C) 1995 Linus Torvalds
9349 + *
9350 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
9351 + *
9352 + * Memory region support
9353 + * David Parsons <orc@pell.chi.il.us>, July-August 1999
9354 + *
9355 + * Added E820 sanitization routine (removes overlapping memory regions);
9356 + * Brian Moyle <bmoyle@mvista.com>, February 2001
9357 + *
9358 + * Moved CPU detection code to cpu/${cpu}.c
9359 + * Patrick Mochel <mochel@osdl.org>, March 2002
9360 + *
9361 + * Provisions for empty E820 memory regions (reported by certain BIOSes).
9362 + * Alex Achenbach <xela@slit.de>, December 2002.
9363 + *
9364 + */
9365 +
9366 +/*
9367 + * This file handles the architecture-dependent parts of initialization
9368 + */
9369 +
9370 +#include <linux/sched.h>
9371 +#include <linux/mm.h>
9372 +#include <linux/mmzone.h>
9373 +#include <linux/screen_info.h>
9374 +#include <linux/ioport.h>
9375 +#include <linux/acpi.h>
9376 +#include <linux/apm_bios.h>
9377 +#include <linux/initrd.h>
9378 +#include <linux/bootmem.h>
9379 +#include <linux/seq_file.h>
9380 +#include <linux/platform_device.h>
9381 +#include <linux/console.h>
9382 +#include <linux/mca.h>
9383 +#include <linux/root_dev.h>
9384 +#include <linux/highmem.h>
9385 +#include <linux/module.h>
9386 +#include <linux/efi.h>
9387 +#include <linux/init.h>
9388 +#include <linux/edd.h>
9389 +#include <linux/nodemask.h>
9390 +#include <linux/kernel.h>
9391 +#include <linux/percpu.h>
9392 +#include <linux/notifier.h>
9393 +#include <linux/kexec.h>
9394 +#include <linux/crash_dump.h>
9395 +#include <linux/dmi.h>
9396 +#include <linux/pfn.h>
9397 +
9398 +#include <video/edid.h>
9399 +
9400 +#include <asm/apic.h>
9401 +#include <asm/e820.h>
9402 +#include <asm/mpspec.h>
9403 +#include <asm/setup.h>
9404 +#include <asm/arch_hooks.h>
9405 +#include <asm/sections.h>
9406 +#include <asm/io_apic.h>
9407 +#include <asm/ist.h>
9408 +#include <asm/io.h>
9409 +#include <asm/hypervisor.h>
9410 +#include <xen/interface/physdev.h>
9411 +#include <xen/interface/memory.h>
9412 +#include <xen/features.h>
9413 +#include <xen/firmware.h>
9414 +#include <xen/xencons.h>
9415 +#include <setup_arch.h>
9416 +#include <bios_ebda.h>
9417 +
9418 +#ifdef CONFIG_XEN
9419 +#include <xen/interface/kexec.h>
9420 +#endif
9421 +
9422 +/* Forward Declaration. */
9423 +void __init find_max_pfn(void);
9424 +
9425 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
9426 +static struct notifier_block xen_panic_block = {
9427 + xen_panic_event, NULL, 0 /* try to go last */
9428 +};
9429 +
9430 +extern char hypercall_page[PAGE_SIZE];
9431 +EXPORT_SYMBOL(hypercall_page);
9432 +
9433 +int disable_pse __devinitdata = 0;
9434 +
9435 +/*
9436 + * Machine setup..
9437 + */
9438 +
9439 +#ifdef CONFIG_EFI
9440 +int efi_enabled = 0;
9441 +EXPORT_SYMBOL(efi_enabled);
9442 +#endif
9443 +
9444 +/* cpu data as detected by the assembly code in head.S */
9445 +struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
9446 +/* common cpu data for all cpus */
9447 +struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
9448 +EXPORT_SYMBOL(boot_cpu_data);
9449 +
9450 +unsigned long mmu_cr4_features;
9451 +
9452 +#ifdef CONFIG_ACPI
9453 + int acpi_disabled = 0;
9454 +#else
9455 + int acpi_disabled = 1;
9456 +#endif
9457 +EXPORT_SYMBOL(acpi_disabled);
9458 +
9459 +#ifdef CONFIG_ACPI
9460 +int __initdata acpi_force = 0;
9461 +extern acpi_interrupt_flags acpi_sci_flags;
9462 +#endif
9463 +
9464 +/* for MCA, but anyone else can use it if they want */
9465 +unsigned int machine_id;
9466 +#ifdef CONFIG_MCA
9467 +EXPORT_SYMBOL(machine_id);
9468 +#endif
9469 +unsigned int machine_submodel_id;
9470 +unsigned int BIOS_revision;
9471 +unsigned int mca_pentium_flag;
9472 +
9473 +/* For PCI or other memory-mapped resources */
9474 +unsigned long pci_mem_start = 0x10000000;
9475 +#ifdef CONFIG_PCI
9476 +EXPORT_SYMBOL(pci_mem_start);
9477 +#endif
9478 +
9479 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
9480 +int bootloader_type;
9481 +
9482 +/* user-defined highmem size */
9483 +static unsigned int highmem_pages = -1;
9484 +
9485 +/*
9486 + * Setup options
9487 + */
9488 +struct drive_info_struct { char dummy[32]; } drive_info;
9489 +#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \
9490 + defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
9491 +EXPORT_SYMBOL(drive_info);
9492 +#endif
9493 +struct screen_info screen_info;
9494 +EXPORT_SYMBOL(screen_info);
9495 +struct apm_info apm_info;
9496 +EXPORT_SYMBOL(apm_info);
9497 +struct sys_desc_table_struct {
9498 + unsigned short length;
9499 + unsigned char table[0];
9500 +};
9501 +struct edid_info edid_info;
9502 +EXPORT_SYMBOL_GPL(edid_info);
9503 +#ifndef CONFIG_XEN
9504 +#define copy_edid() (edid_info = EDID_INFO)
9505 +#endif
9506 +struct ist_info ist_info;
9507 +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
9508 + defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
9509 +EXPORT_SYMBOL(ist_info);
9510 +#endif
9511 +struct e820map e820;
9512 +#ifdef CONFIG_XEN
9513 +struct e820map machine_e820;
9514 +#endif
9515 +
9516 +extern void early_cpu_init(void);
9517 +extern void generic_apic_probe(char *);
9518 +extern int root_mountflags;
9519 +
9520 +unsigned long saved_videomode;
9521 +
9522 +#define RAMDISK_IMAGE_START_MASK 0x07FF
9523 +#define RAMDISK_PROMPT_FLAG 0x8000
9524 +#define RAMDISK_LOAD_FLAG 0x4000
9525 +
9526 +static char command_line[COMMAND_LINE_SIZE];
9527 +
9528 +unsigned char __initdata boot_params[PARAM_SIZE];
9529 +
9530 +static struct resource data_resource = {
9531 + .name = "Kernel data",
9532 + .start = 0,
9533 + .end = 0,
9534 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
9535 +};
9536 +
9537 +static struct resource code_resource = {
9538 + .name = "Kernel code",
9539 + .start = 0,
9540 + .end = 0,
9541 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
9542 +};
9543 +
9544 +static struct resource system_rom_resource = {
9545 + .name = "System ROM",
9546 + .start = 0xf0000,
9547 + .end = 0xfffff,
9548 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9549 +};
9550 +
9551 +static struct resource extension_rom_resource = {
9552 + .name = "Extension ROM",
9553 + .start = 0xe0000,
9554 + .end = 0xeffff,
9555 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9556 +};
9557 +
9558 +static struct resource adapter_rom_resources[] = { {
9559 + .name = "Adapter ROM",
9560 + .start = 0xc8000,
9561 + .end = 0,
9562 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9563 +}, {
9564 + .name = "Adapter ROM",
9565 + .start = 0,
9566 + .end = 0,
9567 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9568 +}, {
9569 + .name = "Adapter ROM",
9570 + .start = 0,
9571 + .end = 0,
9572 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9573 +}, {
9574 + .name = "Adapter ROM",
9575 + .start = 0,
9576 + .end = 0,
9577 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9578 +}, {
9579 + .name = "Adapter ROM",
9580 + .start = 0,
9581 + .end = 0,
9582 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9583 +}, {
9584 + .name = "Adapter ROM",
9585 + .start = 0,
9586 + .end = 0,
9587 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9588 +} };
9589 +
9590 +#define ADAPTER_ROM_RESOURCES \
9591 + (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
9592 +
9593 +static struct resource video_rom_resource = {
9594 + .name = "Video ROM",
9595 + .start = 0xc0000,
9596 + .end = 0xc7fff,
9597 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
9598 +};
9599 +
9600 +static struct resource video_ram_resource = {
9601 + .name = "Video RAM area",
9602 + .start = 0xa0000,
9603 + .end = 0xbffff,
9604 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
9605 +};
9606 +
9607 +static struct resource standard_io_resources[] = { {
9608 + .name = "dma1",
9609 + .start = 0x0000,
9610 + .end = 0x001f,
9611 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9612 +}, {
9613 + .name = "pic1",
9614 + .start = 0x0020,
9615 + .end = 0x0021,
9616 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9617 +}, {
9618 + .name = "timer0",
9619 + .start = 0x0040,
9620 + .end = 0x0043,
9621 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9622 +}, {
9623 + .name = "timer1",
9624 + .start = 0x0050,
9625 + .end = 0x0053,
9626 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9627 +}, {
9628 + .name = "keyboard",
9629 + .start = 0x0060,
9630 + .end = 0x006f,
9631 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9632 +}, {
9633 + .name = "dma page reg",
9634 + .start = 0x0080,
9635 + .end = 0x008f,
9636 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9637 +}, {
9638 + .name = "pic2",
9639 + .start = 0x00a0,
9640 + .end = 0x00a1,
9641 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9642 +}, {
9643 + .name = "dma2",
9644 + .start = 0x00c0,
9645 + .end = 0x00df,
9646 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9647 +}, {
9648 + .name = "fpu",
9649 + .start = 0x00f0,
9650 + .end = 0x00ff,
9651 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
9652 +} };
9653 +
9654 +#define STANDARD_IO_RESOURCES \
9655 + (sizeof standard_io_resources / sizeof standard_io_resources[0])
9656 +
9657 +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
9658 +
9659 +static int __init romchecksum(unsigned char *rom, unsigned long length)
9660 +{
9661 + unsigned char *p, sum = 0;
9662 +
9663 + for (p = rom; p < rom + length; p++)
9664 + sum += *p;
9665 + return sum == 0;
9666 +}
9667 +
9668 +static void __init probe_roms(void)
9669 +{
9670 + unsigned long start, length, upper;
9671 + unsigned char *rom;
9672 + int i;
9673 +
9674 +#ifdef CONFIG_XEN
9675 + /* Nothing to do if not running in dom0. */
9676 + if (!is_initial_xendomain())
9677 + return;
9678 +#endif
9679 +
9680 + /* video rom */
9681 + upper = adapter_rom_resources[0].start;
9682 + for (start = video_rom_resource.start; start < upper; start += 2048) {
9683 + rom = isa_bus_to_virt(start);
9684 + if (!romsignature(rom))
9685 + continue;
9686 +
9687 + video_rom_resource.start = start;
9688 +
9689 + /* 0 < length <= 0x7f * 512, historically */
9690 + length = rom[2] * 512;
9691 +
9692 + /* if checksum okay, trust length byte */
9693 + if (length && romchecksum(rom, length))
9694 + video_rom_resource.end = start + length - 1;
9695 +
9696 + request_resource(&iomem_resource, &video_rom_resource);
9697 + break;
9698 + }
9699 +
9700 + start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
9701 + if (start < upper)
9702 + start = upper;
9703 +
9704 + /* system rom */
9705 + request_resource(&iomem_resource, &system_rom_resource);
9706 + upper = system_rom_resource.start;
9707 +
9708 + /* check for extension rom (ignore length byte!) */
9709 + rom = isa_bus_to_virt(extension_rom_resource.start);
9710 + if (romsignature(rom)) {
9711 + length = extension_rom_resource.end - extension_rom_resource.start + 1;
9712 + if (romchecksum(rom, length)) {
9713 + request_resource(&iomem_resource, &extension_rom_resource);
9714 + upper = extension_rom_resource.start;
9715 + }
9716 + }
9717 +
9718 + /* check for adapter roms on 2k boundaries */
9719 + for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
9720 + rom = isa_bus_to_virt(start);
9721 + if (!romsignature(rom))
9722 + continue;
9723 +
9724 + /* 0 < length <= 0x7f * 512, historically */
9725 + length = rom[2] * 512;
9726 +
9727 + /* but accept any length that fits if checksum okay */
9728 + if (!length || start + length > upper || !romchecksum(rom, length))
9729 + continue;
9730 +
9731 + adapter_rom_resources[i].start = start;
9732 + adapter_rom_resources[i].end = start + length - 1;
9733 + request_resource(&iomem_resource, &adapter_rom_resources[i]);
9734 +
9735 + start = adapter_rom_resources[i++].end & ~2047UL;
9736 + }
9737 +}
9738 +
9739 +/*
9740 + * Point at the empty zero page to start with. We map the real shared_info
9741 + * page as soon as fixmap is up and running.
9742 + */
9743 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
9744 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
9745 +
9746 +unsigned long *phys_to_machine_mapping;
9747 +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
9748 +EXPORT_SYMBOL(phys_to_machine_mapping);
9749 +
9750 +/* Raw start-of-day parameters from the hypervisor. */
9751 +start_info_t *xen_start_info;
9752 +EXPORT_SYMBOL(xen_start_info);
9753 +
9754 +void __init add_memory_region(unsigned long long start,
9755 + unsigned long long size, int type)
9756 +{
9757 + int x;
9758 +
9759 + if (!efi_enabled) {
9760 + x = e820.nr_map;
9761 +
9762 + if (x == E820MAX) {
9763 + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
9764 + return;
9765 + }
9766 +
9767 + e820.map[x].addr = start;
9768 + e820.map[x].size = size;
9769 + e820.map[x].type = type;
9770 + e820.nr_map++;
9771 + }
9772 +} /* add_memory_region */
9773 +
9774 +static void __init limit_regions(unsigned long long size)
9775 +{
9776 + unsigned long long current_addr = 0;
9777 + int i;
9778 +
9779 + if (efi_enabled) {
9780 + efi_memory_desc_t *md;
9781 + void *p;
9782 +
9783 + for (p = memmap.map, i = 0; p < memmap.map_end;
9784 + p += memmap.desc_size, i++) {
9785 + md = p;
9786 + current_addr = md->phys_addr + (md->num_pages << 12);
9787 + if (md->type == EFI_CONVENTIONAL_MEMORY) {
9788 + if (current_addr >= size) {
9789 + md->num_pages -=
9790 + (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
9791 + memmap.nr_map = i + 1;
9792 + return;
9793 + }
9794 + }
9795 + }
9796 + }
9797 + for (i = 0; i < e820.nr_map; i++) {
9798 + current_addr = e820.map[i].addr + e820.map[i].size;
9799 + if (current_addr < size)
9800 + continue;
9801 +
9802 + if (e820.map[i].type != E820_RAM)
9803 + continue;
9804 +
9805 + if (e820.map[i].addr >= size) {
9806 + /*
9807 + * This region starts past the end of the
9808 + * requested size, skip it completely.
9809 + */
9810 + e820.nr_map = i;
9811 + } else {
9812 + e820.nr_map = i + 1;
9813 + e820.map[i].size -= current_addr - size;
9814 + }
9815 + return;
9816 + }
9817 +#ifdef CONFIG_XEN
9818 + if (i==e820.nr_map && current_addr < size) {
9819 + /*
9820 + * The e820 map finished before our requested size so
9821 + * extend the final entry to the requested address.
9822 + */
9823 + --i;
9824 + if (e820.map[i].type == E820_RAM)
9825 + e820.map[i].size -= current_addr - size;
9826 + else
9827 + add_memory_region(current_addr, size - current_addr, E820_RAM);
9828 + }
9829 +#endif
9830 +}
9831 +
9832 +#define E820_DEBUG 1
9833 +
9834 +static void __init print_memory_map(char *who)
9835 +{
9836 + int i;
9837 +
9838 + for (i = 0; i < e820.nr_map; i++) {
9839 + printk(" %s: %016Lx - %016Lx ", who,
9840 + e820.map[i].addr,
9841 + e820.map[i].addr + e820.map[i].size);
9842 + switch (e820.map[i].type) {
9843 + case E820_RAM: printk("(usable)\n");
9844 + break;
9845 + case E820_RESERVED:
9846 + printk("(reserved)\n");
9847 + break;
9848 + case E820_ACPI:
9849 + printk("(ACPI data)\n");
9850 + break;
9851 + case E820_NVS:
9852 + printk("(ACPI NVS)\n");
9853 + break;
9854 + default: printk("type %lu\n", e820.map[i].type);
9855 + break;
9856 + }
9857 + }
9858 +}
9859 +
9860 +/*
9861 + * Sanitize the BIOS e820 map.
9862 + *
9863 + * Some e820 responses include overlapping entries. The following
9864 + * replaces the original e820 map with a new one, removing overlaps.
9865 + *
9866 + */
9867 +struct change_member {
9868 + struct e820entry *pbios; /* pointer to original bios entry */
9869 + unsigned long long addr; /* address for this change point */
9870 +};
9871 +static struct change_member change_point_list[2*E820MAX] __initdata;
9872 +static struct change_member *change_point[2*E820MAX] __initdata;
9873 +static struct e820entry *overlap_list[E820MAX] __initdata;
9874 +static struct e820entry new_bios[E820MAX] __initdata;
9875 +
9876 +int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
9877 +{
9878 + struct change_member *change_tmp;
9879 + unsigned long current_type, last_type;
9880 + unsigned long long last_addr;
9881 + int chgidx, still_changing;
9882 + int overlap_entries;
9883 + int new_bios_entry;
9884 + int old_nr, new_nr, chg_nr;
9885 + int i;
9886 +
9887 + /*
9888 + Visually we're performing the following (1,2,3,4 = memory types)...
9889 +
9890 + Sample memory map (w/overlaps):
9891 + ____22__________________
9892 + ______________________4_
9893 + ____1111________________
9894 + _44_____________________
9895 + 11111111________________
9896 + ____________________33__
9897 + ___________44___________
9898 + __________33333_________
9899 + ______________22________
9900 + ___________________2222_
9901 + _________111111111______
9902 + _____________________11_
9903 + _________________4______
9904 +
9905 + Sanitized equivalent (no overlap):
9906 + 1_______________________
9907 + _44_____________________
9908 + ___1____________________
9909 + ____22__________________
9910 + ______11________________
9911 + _________1______________
9912 + __________3_____________
9913 + ___________44___________
9914 + _____________33_________
9915 + _______________2________
9916 + ________________1_______
9917 + _________________4______
9918 + ___________________2____
9919 + ____________________33__
9920 + ______________________4_
9921 + */
9922 +
9923 + /* if there's only one memory region, don't bother */
9924 + if (*pnr_map < 2)
9925 + return -1;
9926 +
9927 + old_nr = *pnr_map;
9928 +
9929 + /* bail out if we find any unreasonable addresses in bios map */
9930 + for (i=0; i<old_nr; i++)
9931 + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
9932 + return -1;
9933 +
9934 + /* create pointers for initial change-point information (for sorting) */
9935 + for (i=0; i < 2*old_nr; i++)
9936 + change_point[i] = &change_point_list[i];
9937 +
9938 + /* record all known change-points (starting and ending addresses),
9939 + omitting those that are for empty memory regions */
9940 + chgidx = 0;
9941 + for (i=0; i < old_nr; i++) {
9942 + if (biosmap[i].size != 0) {
9943 + change_point[chgidx]->addr = biosmap[i].addr;
9944 + change_point[chgidx++]->pbios = &biosmap[i];
9945 + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
9946 + change_point[chgidx++]->pbios = &biosmap[i];
9947 + }
9948 + }
9949 + chg_nr = chgidx; /* true number of change-points */
9950 +
9951 + /* sort change-point list by memory addresses (low -> high) */
9952 + still_changing = 1;
9953 + while (still_changing) {
9954 + still_changing = 0;
9955 + for (i=1; i < chg_nr; i++) {
9956 + /* if <current_addr> > <last_addr>, swap */
9957 + /* or, if current=<start_addr> & last=<end_addr>, swap */
9958 + if ((change_point[i]->addr < change_point[i-1]->addr) ||
9959 + ((change_point[i]->addr == change_point[i-1]->addr) &&
9960 + (change_point[i]->addr == change_point[i]->pbios->addr) &&
9961 + (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
9962 + )
9963 + {
9964 + change_tmp = change_point[i];
9965 + change_point[i] = change_point[i-1];
9966 + change_point[i-1] = change_tmp;
9967 + still_changing=1;
9968 + }
9969 + }
9970 + }
9971 +
9972 + /* create a new bios memory map, removing overlaps */
9973 + overlap_entries=0; /* number of entries in the overlap table */
9974 + new_bios_entry=0; /* index for creating new bios map entries */
9975 + last_type = 0; /* start with undefined memory type */
9976 + last_addr = 0; /* start with 0 as last starting address */
9977 + /* loop through change-points, determining effect on the new bios map */
9978 + for (chgidx=0; chgidx < chg_nr; chgidx++)
9979 + {
9980 + /* keep track of all overlapping bios entries */
9981 + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
9982 + {
9983 + /* add map entry to overlap list (> 1 entry implies an overlap) */
9984 + overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
9985 + }
9986 + else
9987 + {
9988 + /* remove entry from list (order independent, so swap with last) */
9989 + for (i=0; i<overlap_entries; i++)
9990 + {
9991 + if (overlap_list[i] == change_point[chgidx]->pbios)
9992 + overlap_list[i] = overlap_list[overlap_entries-1];
9993 + }
9994 + overlap_entries--;
9995 + }
9996 + /* if there are overlapping entries, decide which "type" to use */
9997 + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
9998 + current_type = 0;
9999 + for (i=0; i<overlap_entries; i++)
10000 + if (overlap_list[i]->type > current_type)
10001 + current_type = overlap_list[i]->type;
10002 + /* continue building up new bios map based on this information */
10003 + if (current_type != last_type) {
10004 + if (last_type != 0) {
10005 + new_bios[new_bios_entry].size =
10006 + change_point[chgidx]->addr - last_addr;
10007 + /* move forward only if the new size was non-zero */
10008 + if (new_bios[new_bios_entry].size != 0)
10009 + if (++new_bios_entry >= E820MAX)
10010 + break; /* no more space left for new bios entries */
10011 + }
10012 + if (current_type != 0) {
10013 + new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
10014 + new_bios[new_bios_entry].type = current_type;
10015 + last_addr=change_point[chgidx]->addr;
10016 + }
10017 + last_type = current_type;
10018 + }
10019 + }
10020 + new_nr = new_bios_entry; /* retain count for new bios entries */
10021 +
10022 + /* copy new bios mapping into original location */
10023 + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
10024 + *pnr_map = new_nr;
10025 +
10026 + return 0;
10027 +}
10028 +
10029 +/*
10030 + * Copy the BIOS e820 map into a safe place.
10031 + *
10032 + * Sanity-check it while we're at it..
10033 + *
10034 + * If we're lucky and live on a modern system, the setup code
10035 + * will have given us a memory map that we can use to properly
10036 + * set up memory. If we aren't, we'll fake a memory map.
10037 + *
10038 + * We check to see that the memory map contains at least 2 elements
10039 + * before we'll use it, because the detection code in setup.S may
10040 + * not be perfect and most every PC known to man has two memory
10041 + * regions: one from 0 to 640k, and one from 1mb up. (The IBM
10042 + * thinkpad 560x, for example, does not cooperate with the memory
10043 + * detection code.)
10044 + */
10045 +int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
10046 +{
10047 +#ifndef CONFIG_XEN
10048 + /* Only one memory region (or negative)? Ignore it */
10049 + if (nr_map < 2)
10050 + return -1;
10051 +#else
10052 + BUG_ON(nr_map < 1);
10053 +#endif
10054 +
10055 + do {
10056 + unsigned long long start = biosmap->addr;
10057 + unsigned long long size = biosmap->size;
10058 + unsigned long long end = start + size;
10059 + unsigned long type = biosmap->type;
10060 +
10061 + /* Overflow in 64 bits? Ignore the memory map. */
10062 + if (start > end)
10063 + return -1;
10064 +
10065 +#ifndef CONFIG_XEN
10066 + /*
10067 + * Some BIOSes claim RAM in the 640k - 1M region.
10068 + * Not right. Fix it up.
10069 + */
10070 + if (type == E820_RAM) {
10071 + if (start < 0x100000ULL && end > 0xA0000ULL) {
10072 + if (start < 0xA0000ULL)
10073 + add_memory_region(start, 0xA0000ULL-start, type);
10074 + if (end <= 0x100000ULL)
10075 + continue;
10076 + start = 0x100000ULL;
10077 + size = end - start;
10078 + }
10079 + }
10080 +#endif
10081 + add_memory_region(start, size, type);
10082 + } while (biosmap++,--nr_map);
10083 +
10084 +#ifdef CONFIG_XEN
10085 + if (is_initial_xendomain()) {
10086 + struct xen_memory_map memmap;
10087 +
10088 + memmap.nr_entries = E820MAX;
10089 + set_xen_guest_handle(memmap.buffer, machine_e820.map);
10090 +
10091 + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
10092 + BUG();
10093 + machine_e820.nr_map = memmap.nr_entries;
10094 + } else
10095 + machine_e820 = e820;
10096 +#endif
10097 +
10098 + return 0;
10099 +}
10100 +
10101 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
10102 +struct edd edd;
10103 +#ifdef CONFIG_EDD_MODULE
10104 +EXPORT_SYMBOL(edd);
10105 +#endif
10106 +#ifndef CONFIG_XEN
10107 +/**
10108 + * copy_edd() - Copy the BIOS EDD information
10109 + * from boot_params into a safe place.
10110 + *
10111 + */
10112 +static inline void copy_edd(void)
10113 +{
10114 + memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
10115 + memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
10116 + edd.mbr_signature_nr = EDD_MBR_SIG_NR;
10117 + edd.edd_info_nr = EDD_NR;
10118 +}
10119 +#endif
10120 +#else
10121 +static inline void copy_edd(void)
10122 +{
10123 +}
10124 +#endif
10125 +
10126 +static void __init parse_cmdline_early (char ** cmdline_p)
10127 +{
10128 + char c = ' ', *to = command_line, *from = saved_command_line;
10129 + int len = 0, max_cmdline;
10130 + int userdef = 0;
10131 +
10132 + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
10133 + max_cmdline = COMMAND_LINE_SIZE;
10134 + memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
10135 + /* Save unparsed command line copy for /proc/cmdline */
10136 + saved_command_line[max_cmdline-1] = '\0';
10137 +
10138 + for (;;) {
10139 + if (c != ' ')
10140 + goto next_char;
10141 + /*
10142 + * "mem=nopentium" disables the 4MB page tables.
10143 + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
10144 + * to <mem>, overriding the bios size.
10145 + * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
10146 + * <start> to <start>+<mem>, overriding the bios size.
10147 + *
10148 + * HPA tells me bootloaders need to parse mem=, so no new
10149 + * option should be mem= [also see Documentation/i386/boot.txt]
10150 + */
10151 + if (!memcmp(from, "mem=", 4)) {
10152 + if (to != command_line)
10153 + to--;
10154 + if (!memcmp(from+4, "nopentium", 9)) {
10155 + from += 9+4;
10156 + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
10157 + disable_pse = 1;
10158 + } else {
10159 + /* If the user specifies memory size, we
10160 + * limit the BIOS-provided memory map to
10161 + * that size. exactmap can be used to specify
10162 + * the exact map. mem=number can be used to
10163 + * trim the existing memory map.
10164 + */
10165 + unsigned long long mem_size;
10166 +
10167 + mem_size = memparse(from+4, &from);
10168 + limit_regions(mem_size);
10169 + userdef=1;
10170 + }
10171 + }
10172 +
10173 + else if (!memcmp(from, "memmap=", 7)) {
10174 + if (to != command_line)
10175 + to--;
10176 + if (!memcmp(from+7, "exactmap", 8)) {
10177 +#ifdef CONFIG_CRASH_DUMP
10178 + /* If we are doing a crash dump, we
10179 + * still need to know the real mem
10180 + * size before original memory map is
10181 + * reset.
10182 + */
10183 + find_max_pfn();
10184 + saved_max_pfn = max_pfn;
10185 +#endif
10186 + from += 8+7;
10187 + e820.nr_map = 0;
10188 + userdef = 1;
10189 + } else {
10190 + /* If the user specifies memory size, we
10191 + * limit the BIOS-provided memory map to
10192 + * that size. exactmap can be used to specify
10193 + * the exact map. mem=number can be used to
10194 + * trim the existing memory map.
10195 + */
10196 + unsigned long long start_at, mem_size;
10197 +
10198 + mem_size = memparse(from+7, &from);
10199 + if (*from == '@') {
10200 + start_at = memparse(from+1, &from);
10201 + add_memory_region(start_at, mem_size, E820_RAM);
10202 + } else if (*from == '#') {
10203 + start_at = memparse(from+1, &from);
10204 + add_memory_region(start_at, mem_size, E820_ACPI);
10205 + } else if (*from == '$') {
10206 + start_at = memparse(from+1, &from);
10207 + add_memory_region(start_at, mem_size, E820_RESERVED);
10208 + } else {
10209 + limit_regions(mem_size);
10210 + userdef=1;
10211 + }
10212 + }
10213 + }
10214 +
10215 + else if (!memcmp(from, "noexec=", 7))
10216 + noexec_setup(from + 7);
10217 +
10218 +
10219 +#ifdef CONFIG_X86_MPPARSE
10220 + /*
10221 + * If the BIOS enumerates physical processors before logical,
10222 + * maxcpus=N at enumeration-time can be used to disable HT.
10223 + */
10224 + else if (!memcmp(from, "maxcpus=", 8)) {
10225 + extern unsigned int maxcpus;
10226 +
10227 + maxcpus = simple_strtoul(from + 8, NULL, 0);
10228 + }
10229 +#endif
10230 +
10231 +#ifdef CONFIG_ACPI
10232 + /* "acpi=off" disables both ACPI table parsing and interpreter */
10233 + else if (!memcmp(from, "acpi=off", 8)) {
10234 + disable_acpi();
10235 + }
10236 +
10237 + /* acpi=force to over-ride black-list */
10238 + else if (!memcmp(from, "acpi=force", 10)) {
10239 + acpi_force = 1;
10240 + acpi_ht = 1;
10241 + acpi_disabled = 0;
10242 + }
10243 +
10244 + /* acpi=strict disables out-of-spec workarounds */
10245 + else if (!memcmp(from, "acpi=strict", 11)) {
10246 + acpi_strict = 1;
10247 + }
10248 +
10249 + /* Limit ACPI just to boot-time to enable HT */
10250 + else if (!memcmp(from, "acpi=ht", 7)) {
10251 + if (!acpi_force)
10252 + disable_acpi();
10253 + acpi_ht = 1;
10254 + }
10255 +
10256 + /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
10257 + else if (!memcmp(from, "pci=noacpi", 10)) {
10258 + acpi_disable_pci();
10259 + }
10260 + /* "acpi=noirq" disables ACPI interrupt routing */
10261 + else if (!memcmp(from, "acpi=noirq", 10)) {
10262 + acpi_noirq_set();
10263 + }
10264 +
10265 + else if (!memcmp(from, "acpi_sci=edge", 13))
10266 + acpi_sci_flags.trigger = 1;
10267 +
10268 + else if (!memcmp(from, "acpi_sci=level", 14))
10269 + acpi_sci_flags.trigger = 3;
10270 +
10271 + else if (!memcmp(from, "acpi_sci=high", 13))
10272 + acpi_sci_flags.polarity = 1;
10273 +
10274 + else if (!memcmp(from, "acpi_sci=low", 12))
10275 + acpi_sci_flags.polarity = 3;
10276 +
10277 +#ifdef CONFIG_X86_IO_APIC
10278 + else if (!memcmp(from, "acpi_skip_timer_override", 24))
10279 + acpi_skip_timer_override = 1;
10280 +
10281 + if (!memcmp(from, "disable_timer_pin_1", 19))
10282 + disable_timer_pin_1 = 1;
10283 + if (!memcmp(from, "enable_timer_pin_1", 18))
10284 + disable_timer_pin_1 = -1;
10285 +
10286 + /* disable IO-APIC */
10287 + else if (!memcmp(from, "noapic", 6))
10288 + disable_ioapic_setup();
10289 +#endif /* CONFIG_X86_IO_APIC */
10290 +#endif /* CONFIG_ACPI */
10291 +
10292 +#ifdef CONFIG_X86_LOCAL_APIC
10293 + /* enable local APIC */
10294 + else if (!memcmp(from, "lapic", 5))
10295 + lapic_enable();
10296 +
10297 + /* disable local APIC */
10298 + else if (!memcmp(from, "nolapic", 6))
10299 + lapic_disable();
10300 +#endif /* CONFIG_X86_LOCAL_APIC */
10301 +
10302 +#ifdef CONFIG_KEXEC
10303 + /* crashkernel=size@addr specifies the location to reserve for
10304 + * a crash kernel. By reserving this memory we guarantee
10305 + * that linux never sets it up as a DMA target.
10306 + * Useful for holding code to do something appropriate
10307 + * after a kernel panic.
10308 + */
10309 + else if (!memcmp(from, "crashkernel=", 12)) {
10310 +#ifndef CONFIG_XEN
10311 + unsigned long size, base;
10312 + size = memparse(from+12, &from);
10313 + if (*from == '@') {
10314 + base = memparse(from+1, &from);
10315 + /* FIXME: Do I want a sanity check
10316 + * to validate the memory range?
10317 + */
10318 + crashk_res.start = base;
10319 + crashk_res.end = base + size - 1;
10320 + }
10321 +#else
10322 + printk("Ignoring crashkernel command line, "
10323 + "parameter will be supplied by xen\n");
10324 +#endif
10325 + }
10326 +#endif
10327 +#ifdef CONFIG_PROC_VMCORE
10328 + /* elfcorehdr= specifies the location of elf core header
10329 + * stored by the crashed kernel.
10330 + */
10331 + else if (!memcmp(from, "elfcorehdr=", 11))
10332 + elfcorehdr_addr = memparse(from+11, &from);
10333 +#endif
10334 +
10335 + /*
10336 + * highmem=size forces highmem to be exactly 'size' bytes.
10337 + * This works even on boxes that have no highmem otherwise.
10338 + * This also works to reduce highmem size on bigger boxes.
10339 + */
10340 + else if (!memcmp(from, "highmem=", 8))
10341 + highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
10342 +
10343 + /*
10344 + * vmalloc=size forces the vmalloc area to be exactly 'size'
10345 + * bytes. This can be used to increase (or decrease) the
10346 + * vmalloc area - the default is 128m.
10347 + */
10348 + else if (!memcmp(from, "vmalloc=", 8))
10349 + __VMALLOC_RESERVE = memparse(from+8, &from);
10350 +
10351 + next_char:
10352 + c = *(from++);
10353 + if (!c)
10354 + break;
10355 + if (COMMAND_LINE_SIZE <= ++len)
10356 + break;
10357 + *(to++) = c;
10358 + }
10359 + *to = '\0';
10360 + *cmdline_p = command_line;
10361 + if (userdef) {
10362 + printk(KERN_INFO "user-defined physical RAM map:\n");
10363 + print_memory_map("user");
10364 + }
10365 +}
10366 +
10367 +/*
10368 + * Callback for efi_memory_walk.
10369 + */
10370 +static int __init
10371 +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
10372 +{
10373 + unsigned long *max_pfn = arg, pfn;
10374 +
10375 + if (start < end) {
10376 + pfn = PFN_UP(end -1);
10377 + if (pfn > *max_pfn)
10378 + *max_pfn = pfn;
10379 + }
10380 + return 0;
10381 +}
10382 +
10383 +static int __init
10384 +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
10385 +{
10386 + memory_present(0, start, end);
10387 + return 0;
10388 +}
10389 +
10390 +/*
10391 + * This function checks if any part of the range <start,end> is mapped
10392 + * with type.
10393 + */
10394 +int
10395 +e820_any_mapped(u64 start, u64 end, unsigned type)
10396 +{
10397 + int i;
10398 +
10399 +#ifndef CONFIG_XEN
10400 + for (i = 0; i < e820.nr_map; i++) {
10401 + const struct e820entry *ei = &e820.map[i];
10402 +#else
10403 + if (!is_initial_xendomain())
10404 + return 0;
10405 + for (i = 0; i < machine_e820.nr_map; ++i) {
10406 + const struct e820entry *ei = &machine_e820.map[i];
10407 +#endif
10408 +
10409 + if (type && ei->type != type)
10410 + continue;
10411 + if (ei->addr >= end || ei->addr + ei->size <= start)
10412 + continue;
10413 + return 1;
10414 + }
10415 + return 0;
10416 +}
10417 +EXPORT_SYMBOL_GPL(e820_any_mapped);
10418 +
10419 + /*
10420 + * This function checks if the entire range <start,end> is mapped with type.
10421 + *
10422 + * Note: this function only works correctly if the e820 table is sorted and
10423 + * non-overlapping, which is the case
10424 + */
10425 +int __init
10426 +e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
10427 +{
10428 + u64 start = s;
10429 + u64 end = e;
10430 + int i;
10431 +
10432 +#ifndef CONFIG_XEN
10433 + for (i = 0; i < e820.nr_map; i++) {
10434 + struct e820entry *ei = &e820.map[i];
10435 +#else
10436 + if (!is_initial_xendomain())
10437 + return 0;
10438 + for (i = 0; i < machine_e820.nr_map; ++i) {
10439 + const struct e820entry *ei = &machine_e820.map[i];
10440 +#endif
10441 + if (type && ei->type != type)
10442 + continue;
10443 + /* does the region (or part of it) overlap the current region? */
10444 + if (ei->addr >= end || ei->addr + ei->size <= start)
10445 + continue;
10446 + /* if the region is at the beginning of <start,end> we move
10447 + * start to the end of the region, since that part is already covered
10448 + */
10449 + if (ei->addr <= start)
10450 + start = ei->addr + ei->size;
10451 + /* if start is now at or beyond end, we're done, full
10452 + * coverage */
10453 + if (start >= end)
10454 + return 1; /* we're done */
10455 + }
10456 + return 0;
10457 +}
10458 +
10459 +/*
10460 + * Find the highest page frame number we have available
10461 + */
10462 +void __init find_max_pfn(void)
10463 +{
10464 + int i;
10465 +
10466 + max_pfn = 0;
10467 + if (efi_enabled) {
10468 + efi_memmap_walk(efi_find_max_pfn, &max_pfn);
10469 + efi_memmap_walk(efi_memory_present_wrapper, NULL);
10470 + return;
10471 + }
10472 +
10473 + for (i = 0; i < e820.nr_map; i++) {
10474 + unsigned long start, end;
10475 + /* RAM? */
10476 + if (e820.map[i].type != E820_RAM)
10477 + continue;
10478 + start = PFN_UP(e820.map[i].addr);
10479 + end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
10480 + if (start >= end)
10481 + continue;
10482 + if (end > max_pfn)
10483 + max_pfn = end;
10484 + memory_present(0, start, end);
10485 + }
10486 +}
10487 +
10488 +/*
10489 + * Determine low and high memory ranges:
10490 + */
10491 +unsigned long __init find_max_low_pfn(void)
10492 +{
10493 + unsigned long max_low_pfn;
10494 +
10495 + max_low_pfn = max_pfn;
10496 + if (max_low_pfn > MAXMEM_PFN) {
10497 + if (highmem_pages == -1)
10498 + highmem_pages = max_pfn - MAXMEM_PFN;
10499 + if (highmem_pages + MAXMEM_PFN < max_pfn)
10500 + max_pfn = MAXMEM_PFN + highmem_pages;
10501 + if (highmem_pages + MAXMEM_PFN > max_pfn) {
10502 + printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
10503 + highmem_pages = 0;
10504 + }
10505 + max_low_pfn = MAXMEM_PFN;
10506 +#ifndef CONFIG_HIGHMEM
10507 + /* Maximum memory usable is what is directly addressable */
10508 + printk(KERN_WARNING "Warning only %ldMB will be used.\n",
10509 + MAXMEM>>20);
10510 + if (max_pfn > MAX_NONPAE_PFN)
10511 + printk(KERN_WARNING "Use a PAE enabled kernel.\n");
10512 + else
10513 + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
10514 + max_pfn = MAXMEM_PFN;
10515 +#else /* !CONFIG_HIGHMEM */
10516 +#ifndef CONFIG_X86_PAE
10517 + if (max_pfn > MAX_NONPAE_PFN) {
10518 + max_pfn = MAX_NONPAE_PFN;
10519 + printk(KERN_WARNING "Warning only 4GB will be used.\n");
10520 + printk(KERN_WARNING "Use a PAE enabled kernel.\n");
10521 + }
10522 +#endif /* !CONFIG_X86_PAE */
10523 +#endif /* !CONFIG_HIGHMEM */
10524 + } else {
10525 + if (highmem_pages == -1)
10526 + highmem_pages = 0;
10527 +#ifdef CONFIG_HIGHMEM
10528 + if (highmem_pages >= max_pfn) {
10529 + printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
10530 + highmem_pages = 0;
10531 + }
10532 + if (highmem_pages) {
10533 + if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
10534 + printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
10535 + highmem_pages = 0;
10536 + }
10537 + max_low_pfn -= highmem_pages;
10538 + }
10539 +#else
10540 + if (highmem_pages)
10541 + printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
10542 +#endif
10543 + }
10544 + return max_low_pfn;
10545 +}
10546 +
10547 +/*
10548 + * Free all available memory for boot time allocation. Used
10549 + * as a callback function by efi_memory_walk()
10550 + */
10551 +
10552 +static int __init
10553 +free_available_memory(unsigned long start, unsigned long end, void *arg)
10554 +{
10555 + /* check max_low_pfn */
10556 + if (start >= (max_low_pfn << PAGE_SHIFT))
10557 + return 0;
10558 + if (end >= (max_low_pfn << PAGE_SHIFT))
10559 + end = max_low_pfn << PAGE_SHIFT;
10560 + if (start < end)
10561 + free_bootmem(start, end - start);
10562 +
10563 + return 0;
10564 +}
10565 +/*
10566 + * Register fully available low RAM pages with the bootmem allocator.
10567 + */
10568 +static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
10569 +{
10570 + int i;
10571 +
10572 + if (efi_enabled) {
10573 + efi_memmap_walk(free_available_memory, NULL);
10574 + return;
10575 + }
10576 + for (i = 0; i < e820.nr_map; i++) {
10577 + unsigned long curr_pfn, last_pfn, size;
10578 + /*
10579 + * Reserve usable low memory
10580 + */
10581 + if (e820.map[i].type != E820_RAM)
10582 + continue;
10583 + /*
10584 + * We are rounding up the start address of usable memory:
10585 + */
10586 + curr_pfn = PFN_UP(e820.map[i].addr);
10587 + if (curr_pfn >= max_low_pfn)
10588 + continue;
10589 + /*
10590 + * ... and at the end of the usable range downwards:
10591 + */
10592 + last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
10593 +
10594 +#ifdef CONFIG_XEN
10595 + /*
10596 + * Truncate to the number of actual pages currently
10597 + * present.
10598 + */
10599 + if (last_pfn > xen_start_info->nr_pages)
10600 + last_pfn = xen_start_info->nr_pages;
10601 +#endif
10602 +
10603 + if (last_pfn > max_low_pfn)
10604 + last_pfn = max_low_pfn;
10605 +
10606 + /*
10607 + * .. finally, did all the rounding and playing
10608 + * around just make the area go away?
10609 + */
10610 + if (last_pfn <= curr_pfn)
10611 + continue;
10612 +
10613 + size = last_pfn - curr_pfn;
10614 + free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
10615 + }
10616 +}
10617 +
10618 +#ifndef CONFIG_XEN
10619 +/*
10620 + * workaround for Dell systems that neglect to reserve EBDA
10621 + */
10622 +static void __init reserve_ebda_region(void)
10623 +{
10624 + unsigned int addr;
10625 + addr = get_bios_ebda();
10626 + if (addr)
10627 + reserve_bootmem(addr, PAGE_SIZE);
10628 +}
10629 +#endif
10630 +
10631 +#ifndef CONFIG_NEED_MULTIPLE_NODES
10632 +void __init setup_bootmem_allocator(void);
10633 +static unsigned long __init setup_memory(void)
10634 +{
10635 + /*
10636 + * partially used pages are not usable - thus
10637 + * we are rounding upwards:
10638 + */
10639 + min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
10640 + xen_start_info->nr_pt_frames;
10641 +
10642 + find_max_pfn();
10643 +
10644 + max_low_pfn = find_max_low_pfn();
10645 +
10646 +#ifdef CONFIG_HIGHMEM
10647 + highstart_pfn = highend_pfn = max_pfn;
10648 + if (max_pfn > max_low_pfn) {
10649 + highstart_pfn = max_low_pfn;
10650 + }
10651 + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
10652 + pages_to_mb(highend_pfn - highstart_pfn));
10653 +#endif
10654 + printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
10655 + pages_to_mb(max_low_pfn));
10656 +
10657 + setup_bootmem_allocator();
10658 +
10659 + return max_low_pfn;
10660 +}
10661 +
10662 +void __init zone_sizes_init(void)
10663 +{
10664 + unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
10665 + unsigned int max_dma, low;
10666 +
10667 + max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
10668 + low = max_low_pfn;
10669 +
10670 + if (low < max_dma)
10671 + zones_size[ZONE_DMA] = low;
10672 + else {
10673 + zones_size[ZONE_DMA] = max_dma;
10674 + zones_size[ZONE_NORMAL] = low - max_dma;
10675 +#ifdef CONFIG_HIGHMEM
10676 + zones_size[ZONE_HIGHMEM] = highend_pfn - low;
10677 +#endif
10678 + }
10679 + free_area_init(zones_size);
10680 +}
10681 +#else
10682 +extern unsigned long __init setup_memory(void);
10683 +extern void zone_sizes_init(void);
10684 +#endif /* !CONFIG_NEED_MULTIPLE_NODES */
10685 +
10686 +void __init setup_bootmem_allocator(void)
10687 +{
10688 + unsigned long bootmap_size;
10689 + /*
10690 + * Initialize the boot-time allocator (with low memory only):
10691 + */
10692 + bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
10693 +
10694 + register_bootmem_low_pages(max_low_pfn);
10695 +
10696 + /*
10697 + * Reserve the bootmem bitmap itself as well. We do this in two
10698 + * steps (first step was init_bootmem()) because this catches
10699 + * the (very unlikely) case of us accidentally initializing the
10700 + * bootmem allocator with an invalid RAM area.
10701 + */
10702 + reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
10703 + bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START));
10704 +
10705 +#ifndef CONFIG_XEN
10706 + /*
10707 + * reserve physical page 0 - it's a special BIOS page on many boxes,
10708 + * enabling clean reboots, SMP operation, laptop functions.
10709 + */
10710 + reserve_bootmem(0, PAGE_SIZE);
10711 +
10712 + /* reserve EBDA region, it's a 4K region */
10713 + reserve_ebda_region();
10714 +
10715 + /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
10716 + PCI prefetch into it (errata #56). Usually the page is reserved anyway,
10717 + unless you have no PS/2 mouse plugged in. */
10718 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
10719 + boot_cpu_data.x86 == 6)
10720 + reserve_bootmem(0xa0000 - 4096, 4096);
10721 +
10722 +#ifdef CONFIG_SMP
10723 + /*
10724 + * But first pinch a few for the stack/trampoline stuff
10725 + * FIXME: Don't need the extra page at 4K, but need to fix
10726 + * trampoline before removing it. (see the GDT stuff)
10727 + */
10728 + reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
10729 +#endif
10730 +#ifdef CONFIG_ACPI_SLEEP
10731 + /*
10732 + * Reserve low memory region for sleep support.
10733 + */
10734 + acpi_reserve_bootmem();
10735 +#endif
10736 +#endif /* !CONFIG_XEN */
10737 +
10738 +#ifdef CONFIG_BLK_DEV_INITRD
10739 + if (xen_start_info->mod_start) {
10740 + if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
10741 + /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/
10742 + initrd_start = INITRD_START + PAGE_OFFSET;
10743 + initrd_end = initrd_start+INITRD_SIZE;
10744 + initrd_below_start_ok = 1;
10745 + }
10746 + else {
10747 + printk(KERN_ERR "initrd extends beyond end of memory "
10748 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
10749 + INITRD_START + INITRD_SIZE,
10750 + max_low_pfn << PAGE_SHIFT);
10751 + initrd_start = 0;
10752 + }
10753 + }
10754 +#endif
10755 +#ifdef CONFIG_KEXEC
10756 +#ifdef CONFIG_XEN
10757 + xen_machine_kexec_setup_resources();
10758 +#else
10759 + if (crashk_res.start != crashk_res.end)
10760 + reserve_bootmem(crashk_res.start,
10761 + crashk_res.end - crashk_res.start + 1);
10762 +#endif
10763 +#endif
10764 +}
10765 +
10766 +/*
10767 + * The node 0 pgdat is initialized before all of these because
10768 + * it's needed for bootmem. node>0 pgdats have their virtual
10769 + * space allocated before the pagetables are in place to access
10770 + * them, so they can't be cleared then.
10771 + *
10772 + * This should all compile down to nothing when NUMA is off.
10773 + */
10774 +void __init remapped_pgdat_init(void)
10775 +{
10776 + int nid;
10777 +
10778 + for_each_online_node(nid) {
10779 + if (nid != 0)
10780 + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
10781 + }
10782 +}
10783 +
10784 +/*
10785 + * Request address space for all standard RAM and ROM resources
10786 + * and also for regions reported as reserved by the e820.
10787 + */
10788 +static void __init
10789 +legacy_init_iomem_resources(struct e820entry *e820, int nr_map,
10790 + struct resource *code_resource,
10791 + struct resource *data_resource)
10792 +{
10793 + int i;
10794 +
10795 + probe_roms();
10796 +
10797 + for (i = 0; i < nr_map; i++) {
10798 + struct resource *res;
10799 +#ifndef CONFIG_RESOURCES_64BIT
10800 + if (e820[i].addr + e820[i].size > 0x100000000ULL)
10801 + continue;
10802 +#endif
10803 + res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
10804 + switch (e820[i].type) {
10805 + case E820_RAM: res->name = "System RAM"; break;
10806 + case E820_ACPI: res->name = "ACPI Tables"; break;
10807 + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
10808 + default: res->name = "reserved";
10809 + }
10810 + res->start = e820[i].addr;
10811 + res->end = res->start + e820[i].size - 1;
10812 + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
10813 + if (request_resource(&iomem_resource, res)) {
10814 + kfree(res);
10815 + continue;
10816 + }
10817 + if (e820[i].type == E820_RAM) {
10818 + /*
10819 + * We don't know which RAM region contains kernel data,
10820 + * so we try it repeatedly and let the resource manager
10821 + * test it.
10822 + */
10823 +#ifndef CONFIG_XEN
10824 + request_resource(res, code_resource);
10825 + request_resource(res, data_resource);
10826 +#endif
10827 +#ifdef CONFIG_KEXEC
10828 + if (crashk_res.start != crashk_res.end)
10829 + request_resource(res, &crashk_res);
10830 +#ifdef CONFIG_XEN
10831 + xen_machine_kexec_register_resources(res);
10832 +#endif
10833 +#endif
10834 + }
10835 + }
10836 +}
10837 +
10838 +/*
10839 + * Locate an unused range of the physical address space below 4G which
10840 + * can be used for PCI mappings.
10841 + */
10842 +static void __init
10843 +e820_setup_gap(struct e820entry *e820, int nr_map)
10844 +{
10845 + unsigned long gapstart, gapsize, round;
10846 + unsigned long long last;
10847 + int i;
10848 +
10849 + /*
10850 + * Search for the biggest gap in the low 32 bits of the e820
10851 + * memory space.
10852 + */
10853 + last = 0x100000000ull;
10854 + gapstart = 0x10000000;
10855 + gapsize = 0x400000;
10856 + i = nr_map;
10857 + while (--i >= 0) {
10858 + unsigned long long start = e820[i].addr;
10859 + unsigned long long end = start + e820[i].size;
10860 +
10861 + /*
10862 + * Since "last" is at most 4GB, we know we'll
10863 + * fit in 32 bits if this condition is true
10864 + */
10865 + if (last > end) {
10866 + unsigned long gap = last - end;
10867 +
10868 + if (gap > gapsize) {
10869 + gapsize = gap;
10870 + gapstart = end;
10871 + }
10872 + }
10873 + if (start < last)
10874 + last = start;
10875 + }
10876 +
10877 + /*
10878 + * See how much we want to round up: start off with
10879 + * rounding to the next 1MB area.
10880 + */
10881 + round = 0x100000;
10882 + while ((gapsize >> 4) > round)
10883 + round += round;
10884 + /* Fun with two's complement */
10885 + pci_mem_start = (gapstart + round) & -round;
10886 +
10887 + printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
10888 + pci_mem_start, gapstart, gapsize);
10889 +}
10890 +
10891 +/*
10892 + * Request address space for all standard resources
10893 + *
10894 + * This is called just before pcibios_init(), which is also a
10895 + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
10896 + */
10897 +static int __init request_standard_resources(void)
10898 +{
10899 + int i;
10900 +
10901 + /* Nothing to do if not running in dom0. */
10902 + if (!is_initial_xendomain())
10903 + return 0;
10904 +
10905 + printk("Setting up standard PCI resources\n");
10906 +#ifdef CONFIG_XEN
10907 + legacy_init_iomem_resources(machine_e820.map, machine_e820.nr_map,
10908 + &code_resource, &data_resource);
10909 +#else
10910 + if (efi_enabled)
10911 + efi_initialize_iomem_resources(&code_resource, &data_resource);
10912 + else
10913 + legacy_init_iomem_resources(e820.map, e820.nr_map,
10914 + &code_resource, &data_resource);
10915 +#endif
10916 +
10917 + /* EFI systems may still have VGA */
10918 + request_resource(&iomem_resource, &video_ram_resource);
10919 +
10920 + /* request I/O space for devices used on all i[345]86 PCs */
10921 + for (i = 0; i < STANDARD_IO_RESOURCES; i++)
10922 + request_resource(&ioport_resource, &standard_io_resources[i]);
10923 + return 0;
10924 +}
10925 +
10926 +subsys_initcall(request_standard_resources);
10927 +
10928 +static void __init register_memory(void)
10929 +{
10930 +#ifdef CONFIG_XEN
10931 + if (is_initial_xendomain())
10932 + e820_setup_gap(machine_e820.map, machine_e820.nr_map);
10933 + else
10934 +#endif
10935 + e820_setup_gap(e820.map, e820.nr_map);
10936 +}
10937 +
10938 +#ifdef CONFIG_MCA
10939 +static void set_mca_bus(int x)
10940 +{
10941 + MCA_bus = x;
10942 +}
10943 +#else
10944 +static void set_mca_bus(int x) { }
10945 +#endif
10946 +
10947 +/*
10948 + * Determine if we were loaded by an EFI loader. If so, then we have also been
10949 + * passed the efi memmap, systab, etc., so we should use these data structures
10950 + * for initialization. Note, the efi init code path is determined by the
10951 + * global efi_enabled. This allows the same kernel image to be used on existing
10952 + * systems (with a traditional BIOS) as well as on EFI systems.
10953 + */
10954 +void __init setup_arch(char **cmdline_p)
10955 +{
10956 + int i, j, k, fpp;
10957 + struct physdev_set_iopl set_iopl;
10958 + unsigned long max_low_pfn;
10959 + unsigned long p2m_pages;
10960 +
10961 + /* Force a quick death if the kernel panics (not domain 0). */
10962 + extern int panic_timeout;
10963 + if (!panic_timeout && !is_initial_xendomain())
10964 + panic_timeout = 1;
10965 +
10966 + /* Register a call for panic conditions. */
10967 + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
10968 +
10969 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
10970 + VMASST_TYPE_4gb_segments));
10971 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
10972 + VMASST_TYPE_writable_pagetables));
10973 +
10974 + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
10975 + pre_setup_arch_hook();
10976 + early_cpu_init();
10977 +#ifdef CONFIG_SMP
10978 + prefill_possible_map();
10979 +#endif
10980 +
10981 + /*
10982 + * FIXME: This isn't an official loader_type right
10983 + * now but does currently work with elilo.
10984 + * If we were configured as an EFI kernel, check to make
10985 + * sure that we were loaded correctly from elilo and that
10986 + * the system table is valid. If not, then initialize normally.
10987 + */
10988 +#ifdef CONFIG_EFI
10989 + if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
10990 + efi_enabled = 1;
10991 +#endif
10992 +
10993 + /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
10994 + properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
10995 + */
10996 + ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
10997 + drive_info = DRIVE_INFO;
10998 + screen_info = SCREEN_INFO;
10999 + copy_edid();
11000 + apm_info.bios = APM_BIOS_INFO;
11001 + ist_info = IST_INFO;
11002 + saved_videomode = VIDEO_MODE;
11003 + if( SYS_DESC_TABLE.length != 0 ) {
11004 + set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
11005 + machine_id = SYS_DESC_TABLE.table[0];
11006 + machine_submodel_id = SYS_DESC_TABLE.table[1];
11007 + BIOS_revision = SYS_DESC_TABLE.table[2];
11008 + }
11009 + bootloader_type = LOADER_TYPE;
11010 +
11011 + if (is_initial_xendomain()) {
11012 + const struct dom0_vga_console_info *info =
11013 + (void *)((char *)xen_start_info +
11014 + xen_start_info->console.dom0.info_off);
11015 +
11016 + dom0_init_screen_info(info,
11017 + xen_start_info->console.dom0.info_size);
11018 + xen_start_info->console.domU.mfn = 0;
11019 + xen_start_info->console.domU.evtchn = 0;
11020 + } else
11021 + screen_info.orig_video_isVGA = 0;
11022 +
11023 +#ifdef CONFIG_BLK_DEV_RAM
11024 + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
11025 + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
11026 + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
11027 +#endif
11028 +
11029 + ARCH_SETUP
11030 + if (efi_enabled)
11031 + efi_init();
11032 + else {
11033 + printk(KERN_INFO "BIOS-provided physical RAM map:\n");
11034 + print_memory_map(machine_specific_memory_setup());
11035 + }
11036 +
11037 + copy_edd();
11038 +
11039 + if (!MOUNT_ROOT_RDONLY)
11040 + root_mountflags &= ~MS_RDONLY;
11041 + init_mm.start_code = (unsigned long) _text;
11042 + init_mm.end_code = (unsigned long) _etext;
11043 + init_mm.end_data = (unsigned long) _edata;
11044 + init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
11045 + xen_start_info->nr_pt_frames) << PAGE_SHIFT;
11046 +
11047 + code_resource.start = virt_to_phys(_text);
11048 + code_resource.end = virt_to_phys(_etext)-1;
11049 + data_resource.start = virt_to_phys(_etext);
11050 + data_resource.end = virt_to_phys(_edata)-1;
11051 +
11052 + parse_cmdline_early(cmdline_p);
11053 +
11054 +#ifdef CONFIG_EARLY_PRINTK
11055 + {
11056 + char *s = strstr(*cmdline_p, "earlyprintk=");
11057 + if (s) {
11058 + setup_early_printk(strchr(s, '=') + 1);
11059 + printk("early console enabled\n");
11060 + }
11061 + }
11062 +#endif
11063 +
11064 + max_low_pfn = setup_memory();
11065 +
11066 + /*
11067 + * NOTE: before this point _nobody_ is allowed to allocate
11068 + * any memory using the bootmem allocator. Although the
11069 + * allocator is now initialised, only the first 8MB of the kernel
11070 + * virtual address space has been mapped. All allocations before
11071 + * paging_init() completes must use the alloc_bootmem_low_pages()
11072 + * variant (which allocates DMA'able memory) and care must be taken
11073 + * not to exceed the 8MB limit.
11074 + */
11075 +
11076 +#ifdef CONFIG_SMP
11077 + smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
11078 +#endif
11079 + paging_init();
11080 + remapped_pgdat_init();
11081 + sparse_init();
11082 + zone_sizes_init();
11083 +
11084 +#ifdef CONFIG_X86_FIND_SMP_CONFIG
11085 + /*
11086 + * Find and reserve possible boot-time SMP configuration:
11087 + */
11088 + find_smp_config();
11089 +#endif
11090 +
11091 + p2m_pages = max_pfn;
11092 + if (xen_start_info->nr_pages > max_pfn) {
11093 + /*
11094 + * the max_pfn was shrunk (probably by mem= or highmem=
11095 + * kernel parameter); shrink reservation with the HV
11096 + */
11097 + struct xen_memory_reservation reservation = {
11098 + .address_bits = 0,
11099 + .extent_order = 0,
11100 + .domid = DOMID_SELF
11101 + };
11102 + unsigned int difference;
11103 + int ret;
11104 +
11105 + difference = xen_start_info->nr_pages - max_pfn;
11106 +
11107 + set_xen_guest_handle(reservation.extent_start,
11108 + ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
11109 + reservation.nr_extents = difference;
11110 + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
11111 + &reservation);
11112 + BUG_ON (ret != difference);
11113 + }
11114 + else if (max_pfn > xen_start_info->nr_pages)
11115 + p2m_pages = xen_start_info->nr_pages;
11116 +
11117 + /* Make sure we have a correctly sized P->M table. */
11118 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
11119 + phys_to_machine_mapping = alloc_bootmem_low_pages(
11120 + max_pfn * sizeof(unsigned long));
11121 + memset(phys_to_machine_mapping, ~0,
11122 + max_pfn * sizeof(unsigned long));
11123 + memcpy(phys_to_machine_mapping,
11124 + (unsigned long *)xen_start_info->mfn_list,
11125 + p2m_pages * sizeof(unsigned long));
11126 + free_bootmem(
11127 + __pa(xen_start_info->mfn_list),
11128 + PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
11129 + sizeof(unsigned long))));
11130 +
11131 + /*
11132 + * Initialise the frame-list-list: the list of frames holding the
11133 + * lists of frames that make up the p2m table. Used by save/restore.
11134 + */
11135 + pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
11136 +
11137 + fpp = PAGE_SIZE/sizeof(unsigned long);
11138 + for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
11139 + if ((j % fpp) == 0) {
11140 + k++;
11141 + BUG_ON(k>=16);
11142 + pfn_to_mfn_frame_list[k] =
11143 + alloc_bootmem_low_pages(PAGE_SIZE);
11144 + pfn_to_mfn_frame_list_list[k] =
11145 + virt_to_mfn(pfn_to_mfn_frame_list[k]);
11146 + j=0;
11147 + }
11148 + pfn_to_mfn_frame_list[k][j] =
11149 + virt_to_mfn(&phys_to_machine_mapping[i]);
11150 + }
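 + /*
 +  * With 4KB pages and 4-byte entries fpp is 1024, so the (at most 16)
 +  * frame-list pages recorded above can describe a p2m table covering
 +  * up to 16 * 1024 * 1024 PFNs, i.e. 64GB of pseudo-physical memory.
 +  */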
11151 + HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
11152 + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
11153 + virt_to_mfn(pfn_to_mfn_frame_list_list);
11154 + }
11155 +
11156 + /* Mark all ISA DMA channels in-use - using them wouldn't work. */
11157 + for (i = 0; i < MAX_DMA_CHANNELS; ++i)
11158 + if (i != 4 && request_dma(i, "xen") != 0)
11159 + BUG();
11160 +
11161 + /*
11162 + * NOTE: at this point the bootmem allocator is fully available.
11163 + */
11164 +
11165 + if (is_initial_xendomain())
11166 + dmi_scan_machine();
11167 +
11168 +#ifdef CONFIG_X86_GENERICARCH
11169 + generic_apic_probe(*cmdline_p);
11170 +#endif
11171 + if (efi_enabled)
11172 + efi_map_memmap();
11173 +
11174 + set_iopl.iopl = 1;
11175 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
11176 +
11177 +#ifdef CONFIG_ACPI
11178 + if (!is_initial_xendomain()) {
11179 + printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
11180 + acpi_disabled = 1;
11181 + acpi_ht = 0;
11182 + }
11183 +
11184 + /*
11185 + * Parse the ACPI tables for possible boot-time SMP configuration.
11186 + */
11187 + acpi_boot_table_init();
11188 +#endif
11189 +
11190 +#ifdef CONFIG_X86_IO_APIC
11191 + check_acpi_pci(); /* Checks more than just ACPI actually */
11192 +#endif
11193 +
11194 +#ifdef CONFIG_ACPI
11195 + acpi_boot_init();
11196 +
11197 +#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
11198 + if (def_to_bigsmp)
11199 + printk(KERN_WARNING "More than 8 CPUs detected and "
11200 + "CONFIG_X86_PC cannot handle it.\nUse "
11201 + "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
11202 +#endif
11203 +#endif
11204 +#ifdef CONFIG_X86_LOCAL_APIC
11205 + if (smp_found_config)
11206 + get_smp_config();
11207 +#endif
11208 +
11209 + register_memory();
11210 +
11211 + if (is_initial_xendomain()) {
11212 +#ifdef CONFIG_VT
11213 +#if defined(CONFIG_VGA_CONSOLE)
11214 + if (!efi_enabled ||
11215 + (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
11216 + conswitchp = &vga_con;
11217 +#elif defined(CONFIG_DUMMY_CONSOLE)
11218 + conswitchp = &dummy_con;
11219 +#endif
11220 +#endif
11221 + } else {
11222 +#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
11223 + conswitchp = &dummy_con;
11224 +#endif
11225 + }
11226 + tsc_init();
11227 +}
11228 +
11229 +static int
11230 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
11231 +{
11232 + HYPERVISOR_shutdown(SHUTDOWN_crash);
11233 + /* we're never actually going to get here... */
11234 + return NOTIFY_DONE;
11235 +}
11236 +
11237 +static __init int add_pcspkr(void)
11238 +{
11239 + struct platform_device *pd;
11240 + int ret;
11241 +
11242 + if (!is_initial_xendomain())
11243 + return 0;
11244 +
11245 + pd = platform_device_alloc("pcspkr", -1);
11246 + if (!pd)
11247 + return -ENOMEM;
11248 +
11249 + ret = platform_device_add(pd);
11250 + if (ret)
11251 + platform_device_put(pd);
11252 +
11253 + return ret;
11254 +}
11255 +device_initcall(add_pcspkr);
11256 +
11257 +/*
11258 + * Local Variables:
11259 + * mode:c
11260 + * c-file-style:"k&r"
11261 + * c-basic-offset:8
11262 + * End:
11263 + */
11264 Index: head-2008-11-25/arch/x86/kernel/smp_32-xen.c
11265 ===================================================================
11266 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
11267 +++ head-2008-11-25/arch/x86/kernel/smp_32-xen.c 2007-12-10 08:47:31.000000000 +0100
11268 @@ -0,0 +1,605 @@
11269 +/*
11270 + * Intel SMP support routines.
11271 + *
11272 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
11273 + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
11274 + *
11275 + * This code is released under the GNU General Public License version 2 or
11276 + * later.
11277 + */
11278 +
11279 +#include <linux/init.h>
11280 +
11281 +#include <linux/mm.h>
11282 +#include <linux/delay.h>
11283 +#include <linux/spinlock.h>
11284 +#include <linux/smp_lock.h>
11285 +#include <linux/kernel_stat.h>
11286 +#include <linux/mc146818rtc.h>
11287 +#include <linux/cache.h>
11288 +#include <linux/interrupt.h>
11289 +#include <linux/cpu.h>
11290 +#include <linux/module.h>
11291 +
11292 +#include <asm/mtrr.h>
11293 +#include <asm/tlbflush.h>
11294 +#if 0
11295 +#include <mach_apic.h>
11296 +#endif
11297 +#include <xen/evtchn.h>
11298 +
11299 +/*
11300 + * Some notes on x86 processor bugs affecting SMP operation:
11301 + *
11302 + * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
11303 + * The Linux implications for SMP are handled as follows:
11304 + *
11305 + * Pentium III / [Xeon]
11306 + * None of the E1AP-E3AP errata are visible to the user.
11307 + *
11308 + * E1AP. see PII A1AP
11309 + * E2AP. see PII A2AP
11310 + * E3AP. see PII A3AP
11311 + *
11312 + * Pentium II / [Xeon]
11313 + * None of the A1AP-A3AP errata are visible to the user.
11314 + *
11315 + * A1AP. see PPro 1AP
11316 + * A2AP. see PPro 2AP
11317 + * A3AP. see PPro 7AP
11318 + *
11319 + * Pentium Pro
11320 + * None of 1AP-9AP errata are visible to the normal user,
11321 + * except occasional delivery of 'spurious interrupt' as trap #15.
11322 + * This is very rare and a non-problem.
11323 + *
11324 + * 1AP. Linux maps APIC as non-cacheable
11325 + * 2AP. worked around in hardware
11326 + * 3AP. fixed in C0 and above steppings microcode update.
11327 + * Linux does not use excessive STARTUP_IPIs.
11328 + * 4AP. worked around in hardware
11329 + * 5AP. symmetric IO mode (normal Linux operation) not affected.
11330 + * 'noapic' mode has vector 0xf filled out properly.
11331 + * 6AP. 'noapic' mode might be affected - fixed in later steppings
11332 + * 7AP. We do not assume writes to the LVT deasserting IRQs
11333 + * 8AP. We do not enable low power mode (deep sleep) during MP bootup
11334 + * 9AP. We do not use mixed mode
11335 + *
11336 + * Pentium
11337 + * There is a marginal case where REP MOVS on 100MHz SMP
11338 + * machines with B stepping processors can fail. XXX should provide
11339 + * an L1cache=Writethrough or L1cache=off option.
11340 + *
11341 + * B stepping CPUs may hang. There are hardware workarounds
11342 + * for this. We warn about it in case your board doesn't have the
11343 + * workarounds. Basically that's so I can tell anyone with a B stepping
11344 + * CPU and SMP problems "tough".
11345 + *
11346 + * Specific items [From Pentium Processor Specification Update]
11347 + *
11348 + * 1AP. Linux doesn't use remote read
11349 + * 2AP. Linux doesn't trust APIC errors
11350 + * 3AP. We work around this
11351 + * 4AP. Linux never generated 3 interrupts of the same priority
11352 + * to cause a lost local interrupt.
11353 + * 5AP. Remote read is never used
11354 + * 6AP. not affected - worked around in hardware
11355 + * 7AP. not affected - worked around in hardware
11356 + * 8AP. worked around in hardware - we get explicit CS errors if not
11357 + * 9AP. only 'noapic' mode affected. Might generate spurious
11358 + * interrupts, we log only the first one and count the
11359 + * rest silently.
11360 + * 10AP. not affected - worked around in hardware
11361 + * 11AP. Linux reads the APIC between writes to avoid this, as per
11362 + * the documentation. Make sure you preserve this as it affects
11363 + * the C stepping chips too.
11364 + * 12AP. not affected - worked around in hardware
11365 + * 13AP. not affected - worked around in hardware
11366 + * 14AP. we always deassert INIT during bootup
11367 + * 15AP. not affected - worked around in hardware
11368 + * 16AP. not affected - worked around in hardware
11369 + * 17AP. not affected - worked around in hardware
11370 + * 18AP. not affected - worked around in hardware
11371 + * 19AP. not affected - worked around in BIOS
11372 + *
11373 + * If this sounds worrying, believe me these bugs are either ___RARE___,
11374 + * or are signal timing bugs worked around in hardware and there's
11375 + * about nothing of note with C stepping upwards.
11376 + */
11377 +
11378 +DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
11379 +
11380 +/*
11381 + * the following functions deal with sending IPIs between CPUs.
11382 + *
11383 + * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
11384 + */
11385 +
11386 +static inline int __prepare_ICR (unsigned int shortcut, int vector)
11387 +{
11388 + unsigned int icr = shortcut | APIC_DEST_LOGICAL;
11389 +
11390 + switch (vector) {
11391 + default:
11392 + icr |= APIC_DM_FIXED | vector;
11393 + break;
11394 + case NMI_VECTOR:
11395 + icr |= APIC_DM_NMI;
11396 + break;
11397 + }
11398 + return icr;
11399 +}
11400 +
11401 +static inline int __prepare_ICR2 (unsigned int mask)
11402 +{
11403 + return SET_APIC_DEST_FIELD(mask);
11404 +}
11405 +
11406 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
11407 +
11408 +static inline void __send_IPI_one(unsigned int cpu, int vector)
11409 +{
11410 + int irq = per_cpu(ipi_to_irq, cpu)[vector];
11411 + BUG_ON(irq < 0);
11412 + notify_remote_via_irq(irq);
11413 +}
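+/*
+ * Under Xen an "IPI" is simply an event-channel notification on the
+ * per-CPU ipi_to_irq binding, so no APIC ICR programming takes place
+ * in this path.
+ */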
11414 +
11415 +void __send_IPI_shortcut(unsigned int shortcut, int vector)
11416 +{
11417 + int cpu;
11418 +
11419 + switch (shortcut) {
11420 + case APIC_DEST_SELF:
11421 + __send_IPI_one(smp_processor_id(), vector);
11422 + break;
11423 + case APIC_DEST_ALLBUT:
11424 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
11425 + if (cpu == smp_processor_id())
11426 + continue;
11427 + if (cpu_isset(cpu, cpu_online_map)) {
11428 + __send_IPI_one(cpu, vector);
11429 + }
11430 + }
11431 + break;
11432 + default:
11433 + printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
11434 + vector);
11435 + break;
11436 + }
11437 +}
11438 +
11439 +void fastcall send_IPI_self(int vector)
11440 +{
11441 + __send_IPI_shortcut(APIC_DEST_SELF, vector);
11442 +}
11443 +
11444 +/*
11445 + * This is only used on smaller machines.
11446 + */
11447 +void send_IPI_mask_bitmask(cpumask_t mask, int vector)
11448 +{
11449 + unsigned long flags;
11450 + unsigned int cpu;
11451 +
11452 + local_irq_save(flags);
11453 + WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
11454 +
11455 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
11456 + if (cpu_isset(cpu, mask)) {
11457 + __send_IPI_one(cpu, vector);
11458 + }
11459 + }
11460 +
11461 + local_irq_restore(flags);
11462 +}
11463 +
11464 +void send_IPI_mask_sequence(cpumask_t mask, int vector)
11465 +{
11466 +
11467 + send_IPI_mask_bitmask(mask, vector);
11468 +}
11469 +
11470 +#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
11471 +
11472 +#if 0 /* XEN */
11473 +/*
11474 + * Smarter SMP flushing macros.
11475 + * c/o Linus Torvalds.
11476 + *
11477 + * These mean you can really definitely utterly forget about
11478 + * writing to user space from interrupts. (It's not allowed anyway).
11479 + *
11480 + * Optimizations Manfred Spraul <manfred@colorfullife.com>
11481 + */
11482 +
11483 +static cpumask_t flush_cpumask;
11484 +static struct mm_struct * flush_mm;
11485 +static unsigned long flush_va;
11486 +static DEFINE_SPINLOCK(tlbstate_lock);
11487 +#define FLUSH_ALL 0xffffffff
11488 +
11489 +/*
11490 + * We cannot call mmdrop() because we are in interrupt context,
11491 + * instead update mm->cpu_vm_mask.
11492 + *
11493 + * We need to reload %cr3 since the page tables may be going
11494 + * away from under us..
11495 + */
11496 +static inline void leave_mm (unsigned long cpu)
11497 +{
11498 + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
11499 + BUG();
11500 + cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
11501 + load_cr3(swapper_pg_dir);
11502 +}
11503 +
11504 +/*
11505 + *
11506 + * The flush IPI assumes that a thread switch happens in this order:
11507 + * [cpu0: the cpu that switches]
11508 + * 1) switch_mm() either 1a) or 1b)
11509 + * 1a) thread switch to a different mm
11510 + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
11511 + * Stop ipi delivery for the old mm. This is not synchronized with
11512 + * the other cpus, but smp_invalidate_interrupt ignores flush ipis
11513 + * for the wrong mm, and in the worst case we perform a superfluous
11514 + * tlb flush.
11515 + * 1a2) set cpu_tlbstate to TLBSTATE_OK
11516 + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
11517 + * was in lazy tlb mode.
11518 + * 1a3) update cpu_tlbstate[].active_mm
11519 + * Now cpu0 accepts tlb flushes for the new mm.
11520 + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
11521 + * Now the other cpus will send tlb flush ipis.
11522 + * 1a4) change cr3.
11523 + * 1b) thread switch without mm change
11524 + * cpu_tlbstate[].active_mm is correct, cpu0 already handles
11525 + * flush ipis.
11526 + * 1b1) set cpu_tlbstate to TLBSTATE_OK
11527 + * 1b2) test_and_set the cpu bit in cpu_vm_mask.
11528 + * Atomically set the bit [other cpus will start sending flush ipis],
11529 + * and test the bit.
11530 + * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
11531 + * 2) switch %%esp, ie current
11532 + *
11533 + * The interrupt must handle 2 special cases:
11534 + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
11535 + * - the cpu performs speculative tlb reads, i.e. even if the cpu only
11536 + * runs in kernel space, the cpu could load tlb entries for user space
11537 + * pages.
11538 + *
11539 + * The good news is that cpu_tlbstate is local to each cpu, no
11540 + * write/read ordering problems.
11541 + */
11542 +
11543 +/*
11544 + * TLB flush IPI:
11545 + *
11546 + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
11547 + * 2) Leave the mm if we are in the lazy tlb mode.
11548 + */
11549 +
11550 +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
11551 + struct pt_regs *regs)
11552 +{
11553 + unsigned long cpu;
11554 +
11555 + cpu = get_cpu();
11556 +
11557 + if (!cpu_isset(cpu, flush_cpumask))
11558 + goto out;
11559 + /*
11560 + * This was a BUG() but until someone can quote me the
11561 + * line from the intel manual that guarantees an IPI to
11562 + * multiple CPUs is retried _only_ on the erroring CPUs
11563 + * it's staying as a return
11564 + *
11565 + * BUG();
11566 + */
11567 +
11568 + if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
11569 + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
11570 + if (flush_va == FLUSH_ALL)
11571 + local_flush_tlb();
11572 + else
11573 + __flush_tlb_one(flush_va);
11574 + } else
11575 + leave_mm(cpu);
11576 + }
11577 + smp_mb__before_clear_bit();
11578 + cpu_clear(cpu, flush_cpumask);
11579 + smp_mb__after_clear_bit();
11580 +out:
11581 + put_cpu_no_resched();
11582 +
11583 + return IRQ_HANDLED;
11584 +}
11585 +
11586 +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
11587 + unsigned long va)
11588 +{
11589 + /*
11590 + * A couple of (to be removed) sanity checks:
11591 + *
11592 + * - current CPU must not be in mask
11593 + * - mask must exist :)
11594 + */
11595 + BUG_ON(cpus_empty(cpumask));
11596 + BUG_ON(cpu_isset(smp_processor_id(), cpumask));
11597 + BUG_ON(!mm);
11598 +
11599 + /* If a CPU which we ran on has gone down, OK. */
11600 + cpus_and(cpumask, cpumask, cpu_online_map);
11601 + if (cpus_empty(cpumask))
11602 + return;
11603 +
11604 + /*
11605 + * I'm not happy about this global shared spinlock in the
11606 + * MM hot path, but we'll see how contended it is.
11607 + * Temporarily this turns IRQs off, so that lockups are
11608 + * detected by the NMI watchdog.
11609 + */
11610 + spin_lock(&tlbstate_lock);
11611 +
11612 + flush_mm = mm;
11613 + flush_va = va;
11614 +#if NR_CPUS <= BITS_PER_LONG
11615 + atomic_set_mask(cpumask, &flush_cpumask);
11616 +#else
11617 + {
11618 + int k;
11619 + unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
11620 + unsigned long *cpu_mask = (unsigned long *)&cpumask;
11621 + for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
11622 + atomic_set_mask(cpu_mask[k], &flush_mask[k]);
11623 + }
11624 +#endif
11625 + /*
11626 + * We have to send the IPI only to
11627 + * CPUs affected.
11628 + */
11629 + send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
11630 +
11631 + while (!cpus_empty(flush_cpumask))
11632 + /* nothing. lockup detection does not belong here */
11633 + mb();
11634 +
11635 + flush_mm = NULL;
11636 + flush_va = 0;
11637 + spin_unlock(&tlbstate_lock);
11638 +}
11639 +
11640 +void flush_tlb_current_task(void)
11641 +{
11642 + struct mm_struct *mm = current->mm;
11643 + cpumask_t cpu_mask;
11644 +
11645 + preempt_disable();
11646 + cpu_mask = mm->cpu_vm_mask;
11647 + cpu_clear(smp_processor_id(), cpu_mask);
11648 +
11649 + local_flush_tlb();
11650 + if (!cpus_empty(cpu_mask))
11651 + flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
11652 + preempt_enable();
11653 +}
11654 +
11655 +void flush_tlb_mm (struct mm_struct * mm)
11656 +{
11657 + cpumask_t cpu_mask;
11658 +
11659 + preempt_disable();
11660 + cpu_mask = mm->cpu_vm_mask;
11661 + cpu_clear(smp_processor_id(), cpu_mask);
11662 +
11663 + if (current->active_mm == mm) {
11664 + if (current->mm)
11665 + local_flush_tlb();
11666 + else
11667 + leave_mm(smp_processor_id());
11668 + }
11669 + if (!cpus_empty(cpu_mask))
11670 + flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
11671 +
11672 + preempt_enable();
11673 +}
11674 +
11675 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
11676 +{
11677 + struct mm_struct *mm = vma->vm_mm;
11678 + cpumask_t cpu_mask;
11679 +
11680 + preempt_disable();
11681 + cpu_mask = mm->cpu_vm_mask;
11682 + cpu_clear(smp_processor_id(), cpu_mask);
11683 +
11684 + if (current->active_mm == mm) {
11685 + if(current->mm)
11686 + __flush_tlb_one(va);
11687 + else
11688 + leave_mm(smp_processor_id());
11689 + }
11690 +
11691 + if (!cpus_empty(cpu_mask))
11692 + flush_tlb_others(cpu_mask, mm, va);
11693 +
11694 + preempt_enable();
11695 +}
11696 +EXPORT_SYMBOL(flush_tlb_page);
11697 +
11698 +static void do_flush_tlb_all(void* info)
11699 +{
11700 + unsigned long cpu = smp_processor_id();
11701 +
11702 + __flush_tlb_all();
11703 + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
11704 + leave_mm(cpu);
11705 +}
11706 +
11707 +void flush_tlb_all(void)
11708 +{
11709 + on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
11710 +}
11711 +
11712 +#endif /* XEN */
11713 +
11714 +/*
11715 + * This function sends a 'reschedule' IPI to another CPU.
11716 + * It goes straight through and wastes no time serializing
11717 + * anything. Worst case is that we lose a reschedule ...
11718 + */
11719 +void smp_send_reschedule(int cpu)
11720 +{
11721 + WARN_ON(cpu_is_offline(cpu));
11722 + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
11723 +}
11724 +
11725 +/*
11726 + * Structure and data for smp_call_function(). This is designed to minimise
11727 + * static memory requirements. It also looks cleaner.
11728 + */
11729 +static DEFINE_SPINLOCK(call_lock);
11730 +
11731 +struct call_data_struct {
11732 + void (*func) (void *info);
11733 + void *info;
11734 + atomic_t started;
11735 + atomic_t finished;
11736 + int wait;
11737 +};
11738 +
11739 +void lock_ipi_call_lock(void)
11740 +{
11741 + spin_lock_irq(&call_lock);
11742 +}
11743 +
11744 +void unlock_ipi_call_lock(void)
11745 +{
11746 + spin_unlock_irq(&call_lock);
11747 +}
11748 +
11749 +static struct call_data_struct *call_data;
11750 +
11751 +/**
11752 + * smp_call_function(): Run a function on all other CPUs.
11753 + * @func: The function to run. This must be fast and non-blocking.
11754 + * @info: An arbitrary pointer to pass to the function.
11755 + * @nonatomic: currently unused.
11756 + * @wait: If true, wait (atomically) until function has completed on other CPUs.
11757 + *
11758 + * Returns 0 on success, else a negative status code. Does not return until
11759 + * remote CPUs are nearly ready to execute <<func>> or have already executed it.
11760 + *
11761 + * You must not call this function with disabled interrupts or from a
11762 + * hardware interrupt handler or from a bottom half handler.
11763 + */
11764 +int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
11765 + int wait)
11766 +{
11767 + struct call_data_struct data;
11768 + int cpus;
11769 +
11770 + /* Holding any lock stops cpus from going down. */
11771 + spin_lock(&call_lock);
11772 + cpus = num_online_cpus() - 1;
11773 + if (!cpus) {
11774 + spin_unlock(&call_lock);
11775 + return 0;
11776 + }
11777 +
11778 + /* Can deadlock when called with interrupts disabled */
11779 + WARN_ON(irqs_disabled());
11780 +
11781 + data.func = func;
11782 + data.info = info;
11783 + atomic_set(&data.started, 0);
11784 + data.wait = wait;
11785 + if (wait)
11786 + atomic_set(&data.finished, 0);
11787 +
11788 + call_data = &data;
11789 + mb();
11790 +
11791 + /* Send a message to all other CPUs and wait for them to respond */
11792 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
11793 +
11794 + /* Wait for response */
11795 + while (atomic_read(&data.started) != cpus)
11796 + cpu_relax();
11797 +
11798 + if (wait)
11799 + while (atomic_read(&data.finished) != cpus)
11800 + cpu_relax();
11801 + spin_unlock(&call_lock);
11802 +
11803 + return 0;
11804 +}
11805 +EXPORT_SYMBOL(smp_call_function);
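+/*
+ * Typical caller: smp_send_stop() below, which passes wait=0 since the
+ * stopped CPUs never return from stop_this_cpu().
+ */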
11806 +
11807 +static void stop_this_cpu (void * dummy)
11808 +{
11809 + /*
11810 + * Remove this CPU:
11811 + */
11812 + cpu_clear(smp_processor_id(), cpu_online_map);
11813 + local_irq_disable();
11814 + disable_all_local_evtchn();
11815 + if (cpu_data[smp_processor_id()].hlt_works_ok)
11816 + for(;;) halt();
11817 + for (;;);
11818 +}
11819 +
11820 +/*
11821 + * This function calls the 'stop' function on all other CPUs in the system.
11822 + */
11823 +
11824 +void smp_send_stop(void)
11825 +{
11826 + smp_call_function(stop_this_cpu, NULL, 1, 0);
11827 +
11828 + local_irq_disable();
11829 + disable_all_local_evtchn();
11830 + local_irq_enable();
11831 +}
11832 +
11833 +/*
11834 + * Reschedule callback. Nothing to do,
11835 + * all the work is done automatically when
11836 + * we return from the interrupt.
11837 + */
11838 +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
11839 + struct pt_regs *regs)
11840 +{
11841 +
11842 + return IRQ_HANDLED;
11843 +}
11844 +
11845 +#include <linux/kallsyms.h>
11846 +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
11847 + struct pt_regs *regs)
11848 +{
11849 + void (*func) (void *info) = call_data->func;
11850 + void *info = call_data->info;
11851 + int wait = call_data->wait;
11852 +
11853 + /*
11854 + * Notify initiating CPU that I've grabbed the data and am
11855 + * about to execute the function
11856 + */
11857 + mb();
11858 + atomic_inc(&call_data->started);
11859 + /*
11860 + * At this point the info structure may be out of scope unless wait==1
11861 + */
11862 + irq_enter();
11863 + (*func)(info);
11864 + irq_exit();
11865 +
11866 + if (wait) {
11867 + mb();
11868 + atomic_inc(&call_data->finished);
11869 + }
11870 +
11871 + return IRQ_HANDLED;
11872 +}
11873 +
11874 Index: head-2008-11-25/arch/x86/kernel/time_32-xen.c
11875 ===================================================================
11876 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
11877 +++ head-2008-11-25/arch/x86/kernel/time_32-xen.c 2008-09-01 12:07:31.000000000 +0200
11878 @@ -0,0 +1,1209 @@
11879 +/*
11880 + * linux/arch/i386/kernel/time.c
11881 + *
11882 + * Copyright (C) 1991, 1992, 1995 Linus Torvalds
11883 + *
11884 + * This file contains the PC-specific time handling details:
11885 + * reading the RTC at bootup, etc..
11886 + * 1994-07-02 Alan Modra
11887 + * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
11888 + * 1995-03-26 Markus Kuhn
11889 + * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
11890 + * precision CMOS clock update
11891 + * 1996-05-03 Ingo Molnar
11892 + * fixed time warps in do_[slow|fast]_gettimeoffset()
11893 + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
11894 + * "A Kernel Model for Precision Timekeeping" by Dave Mills
11895 + * 1998-09-05 (Various)
11896 + * More robust do_fast_gettimeoffset() algorithm implemented
11897 + * (works with APM, Cyrix 6x86MX and Centaur C6),
11898 + * monotonic gettimeofday() with fast_get_timeoffset(),
11899 + * drift-proof precision TSC calibration on boot
11900 + * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
11901 + * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
11902 + * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
11903 + * 1998-12-16 Andrea Arcangeli
11904 + * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
11905 + * because it was not accounting lost_ticks.
11906 + * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
11907 + * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
11908 + * serialize accesses to xtime/lost_ticks).
11909 + */
11910 +
11911 +#include <linux/errno.h>
11912 +#include <linux/sched.h>
11913 +#include <linux/kernel.h>
11914 +#include <linux/param.h>
11915 +#include <linux/string.h>
11916 +#include <linux/mm.h>
11917 +#include <linux/interrupt.h>
11918 +#include <linux/time.h>
11919 +#include <linux/delay.h>
11920 +#include <linux/init.h>
11921 +#include <linux/smp.h>
11922 +#include <linux/module.h>
11923 +#include <linux/sysdev.h>
11924 +#include <linux/bcd.h>
11925 +#include <linux/efi.h>
11926 +#include <linux/mca.h>
11927 +#include <linux/sysctl.h>
11928 +#include <linux/percpu.h>
11929 +#include <linux/kernel_stat.h>
11930 +#include <linux/posix-timers.h>
11931 +#include <linux/cpufreq.h>
11932 +
11933 +#include <asm/io.h>
11934 +#include <asm/smp.h>
11935 +#include <asm/irq.h>
11936 +#include <asm/msr.h>
11937 +#include <asm/delay.h>
11938 +#include <asm/mpspec.h>
11939 +#include <asm/uaccess.h>
11940 +#include <asm/processor.h>
11941 +#include <asm/timer.h>
11942 +#include <asm/sections.h>
11943 +
11944 +#include "mach_time.h"
11945 +
11946 +#include <linux/timex.h>
11947 +
11948 +#include <asm/hpet.h>
11949 +
11950 +#include <asm/arch_hooks.h>
11951 +
11952 +#include <xen/evtchn.h>
11953 +#include <xen/interface/vcpu.h>
11954 +
11955 +#if defined (__i386__)
11956 +#include <asm/i8259.h>
11957 +#endif
11958 +
11959 +int pit_latch_buggy; /* extern */
11960 +
11961 +#if defined(__x86_64__)
11962 +unsigned long vxtime_hz = PIT_TICK_RATE;
11963 +struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
11964 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
11965 +unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
11966 +struct timespec __xtime __section_xtime;
11967 +struct timezone __sys_tz __section_sys_tz;
11968 +#endif
11969 +
11970 +unsigned int cpu_khz; /* Detected as we calibrate the TSC */
11971 +EXPORT_SYMBOL(cpu_khz);
11972 +
11973 +extern unsigned long wall_jiffies;
11974 +
11975 +DEFINE_SPINLOCK(rtc_lock);
11976 +EXPORT_SYMBOL(rtc_lock);
11977 +
11978 +extern struct init_timer_opts timer_tsc_init;
11979 +extern struct timer_opts timer_tsc;
11980 +#define timer_none timer_tsc
11981 +
11982 +/* These are periodically updated in shared_info, and then copied here. */
11983 +struct shadow_time_info {
11984 + u64 tsc_timestamp; /* TSC at last update of time vals. */
11985 + u64 system_timestamp; /* Time, in nanosecs, since boot. */
11986 + u32 tsc_to_nsec_mul;
11987 + u32 tsc_to_usec_mul;
11988 + int tsc_shift;
11989 + u32 version;
11990 +};
11991 +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
11992 +static struct timespec shadow_tv;
11993 +static u32 shadow_tv_version;
11994 +
11995 +static struct timeval monotonic_tv;
11996 +static spinlock_t monotonic_lock = SPIN_LOCK_UNLOCKED;
11997 +
11998 +/* Keep track of last time we did processing/updating of jiffies and xtime. */
11999 +static u64 processed_system_time; /* System time (ns) at last processing. */
12000 +static DEFINE_PER_CPU(u64, processed_system_time);
12001 +
12002 +/* How much CPU time was spent blocked and how much was 'stolen'? */
12003 +static DEFINE_PER_CPU(u64, processed_stolen_time);
12004 +static DEFINE_PER_CPU(u64, processed_blocked_time);
12005 +
12006 +/* Current runstate of each CPU (updated automatically by the hypervisor). */
12007 +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
12008 +
12009 +/* Must be signed, as it's compared with s64 quantities which can be -ve. */
12010 +#define NS_PER_TICK (1000000000LL/HZ)
12011 +
12012 +static void __clock_was_set(void *unused)
12013 +{
12014 + clock_was_set();
12015 +}
12016 +static DECLARE_WORK(clock_was_set_work, __clock_was_set, NULL);
12017 +
12018 +/*
12019 + * GCC 4.3 can turn loops over an induction variable into division. We do
12020 + * not support arbitrary 64-bit division, and so must break the induction.
12021 + */
12022 +#define clobber_induction_variable(v) asm ( "" : "+r" (v) )
12023 +
12024 +static inline void __normalize_time(time_t *sec, s64 *nsec)
12025 +{
12026 + while (*nsec >= NSEC_PER_SEC) {
12027 + clobber_induction_variable(*nsec);
12028 + (*nsec) -= NSEC_PER_SEC;
12029 + (*sec)++;
12030 + }
12031 + while (*nsec < 0) {
12032 + clobber_induction_variable(*nsec);
12033 + (*nsec) += NSEC_PER_SEC;
12034 + (*sec)--;
12035 + }
12036 +}
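+/* e.g. sec = 5, nsec = 2300000000 normalizes to sec = 7, nsec = 300000000. */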
12037 +
12038 +/* Does this guest OS track Xen time, or set its wall clock independently? */
12039 +static int independent_wallclock = 0;
12040 +static int __init __independent_wallclock(char *str)
12041 +{
12042 + independent_wallclock = 1;
12043 + return 1;
12044 +}
12045 +__setup("independent_wallclock", __independent_wallclock);
12046 +
12047 +/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
12048 +static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
12049 +static int __init __permitted_clock_jitter(char *str)
12050 +{
12051 + permitted_clock_jitter = simple_strtoul(str, NULL, 0);
12052 + return 1;
12053 +}
12054 +__setup("permitted_clock_jitter=", __permitted_clock_jitter);
12055 +
12056 +#if 0
12057 +static void delay_tsc(unsigned long loops)
12058 +{
12059 + unsigned long bclock, now;
12060 +
12061 + rdtscl(bclock);
12062 + do {
12063 + rep_nop();
12064 + rdtscl(now);
12065 + } while ((now - bclock) < loops);
12066 +}
12067 +
12068 +struct timer_opts timer_tsc = {
12069 + .name = "tsc",
12070 + .delay = delay_tsc,
12071 +};
12072 +#endif
12073 +
12074 +/*
12075 + * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
12076 + * yielding a 64-bit result.
12077 + */
12078 +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
12079 +{
12080 + u64 product;
12081 +#ifdef __i386__
12082 + u32 tmp1, tmp2;
12083 +#endif
12084 +
12085 + if (shift < 0)
12086 + delta >>= -shift;
12087 + else
12088 + delta <<= shift;
12089 +
12090 +#ifdef __i386__
12091 + __asm__ (
12092 + "mul %5 ; "
12093 + "mov %4,%%eax ; "
12094 + "mov %%edx,%4 ; "
12095 + "mul %5 ; "
12096 + "xor %5,%5 ; "
12097 + "add %4,%%eax ; "
12098 + "adc %5,%%edx ; "
12099 + : "=A" (product), "=r" (tmp1), "=r" (tmp2)
12100 + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
12101 +#else
12102 + __asm__ (
12103 + "mul %%rdx ; shrd $32,%%rdx,%%rax"
12104 + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
12105 +#endif
12106 +
12107 + return product;
12108 +}
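+/*
+ * i.e. scale_delta(delta, frac, shift) evaluates
+ * ((delta << shift) * frac) >> 32 (shifting right for a negative shift)
+ * without needing a full 64x64->128-bit multiply on i386.
+ */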
12109 +
12110 +#if 0 /* defined (__i386__) */
12111 +int read_current_timer(unsigned long *timer_val)
12112 +{
12113 + rdtscl(*timer_val);
12114 + return 0;
12115 +}
12116 +#endif
12117 +
12118 +void init_cpu_khz(void)
12119 +{
12120 + u64 __cpu_khz = 1000000ULL << 32;
12121 + struct vcpu_time_info *info = &vcpu_info(0)->time;
12122 + do_div(__cpu_khz, info->tsc_to_system_mul);
12123 + if (info->tsc_shift < 0)
12124 + cpu_khz = __cpu_khz << -info->tsc_shift;
12125 + else
12126 + cpu_khz = __cpu_khz >> info->tsc_shift;
12127 +}
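+/*
+ * Xen converts TSC deltas to nanoseconds as
+ * ((delta * 2^tsc_shift) * tsc_to_system_mul) >> 32 (see scale_delta()
+ * below), so inverting that relation gives
+ * cpu_khz = (10^6 * 2^32 / tsc_to_system_mul) * 2^-tsc_shift.
+ */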
12128 +
12129 +static u64 get_nsec_offset(struct shadow_time_info *shadow)
12130 +{
12131 + u64 now, delta;
12132 + rdtscll(now);
12133 + delta = now - shadow->tsc_timestamp;
12134 + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
12135 +}
12136 +
12137 +static unsigned long get_usec_offset(struct shadow_time_info *shadow)
12138 +{
12139 + u64 now, delta;
12140 + rdtscll(now);
12141 + delta = now - shadow->tsc_timestamp;
12142 + return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
12143 +}
12144 +
12145 +static void __update_wallclock(time_t sec, long nsec)
12146 +{
12147 + long wtm_nsec, xtime_nsec;
12148 + time_t wtm_sec, xtime_sec;
12149 + u64 tmp, wc_nsec;
12150 +
12151 + /* Adjust wall-clock time base based on wall_jiffies ticks. */
12152 + wc_nsec = processed_system_time;
12153 + wc_nsec += sec * (u64)NSEC_PER_SEC;
12154 + wc_nsec += nsec;
12155 + wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
12156 +
12157 + /* Split wallclock base into seconds and nanoseconds. */
12158 + tmp = wc_nsec;
12159 + xtime_nsec = do_div(tmp, 1000000000);
12160 + xtime_sec = (time_t)tmp;
12161 +
12162 + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
12163 + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
12164 +
12165 + set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
12166 + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
12167 +
12168 + ntp_clear();
12169 +}
12170 +
12171 +static void update_wallclock(void)
12172 +{
12173 + shared_info_t *s = HYPERVISOR_shared_info;
12174 +
12175 + do {
12176 + shadow_tv_version = s->wc_version;
12177 + rmb();
12178 + shadow_tv.tv_sec = s->wc_sec;
12179 + shadow_tv.tv_nsec = s->wc_nsec;
12180 + rmb();
12181 + } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
12182 +
12183 + if (!independent_wallclock)
12184 + __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
12185 +}
12186 +
12187 +/*
12188 + * Reads a consistent set of time-base values from Xen, into a shadow data
12189 + * area.
12190 + */
12191 +static void get_time_values_from_xen(unsigned int cpu)
12192 +{
12193 + struct vcpu_time_info *src;
12194 + struct shadow_time_info *dst;
12195 + unsigned long flags;
12196 + u32 pre_version, post_version;
12197 +
12198 + src = &vcpu_info(cpu)->time;
12199 + dst = &per_cpu(shadow_time, cpu);
12200 +
12201 + local_irq_save(flags);
12202 +
12203 + do {
12204 + pre_version = dst->version = src->version;
12205 + rmb();
12206 + dst->tsc_timestamp = src->tsc_timestamp;
12207 + dst->system_timestamp = src->system_time;
12208 + dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
12209 + dst->tsc_shift = src->tsc_shift;
12210 + rmb();
12211 + post_version = src->version;
12212 + } while ((pre_version & 1) | (pre_version ^ post_version));
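+ /*
+  * An odd version, or a version change between the two reads, means
+  * Xen was updating the fields while we copied them; retry until a
+  * stable, even version is observed.
+  */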
12213 +
12214 + dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
12215 +
12216 + local_irq_restore(flags);
12217 +}
12218 +
12219 +static inline int time_values_up_to_date(unsigned int cpu)
12220 +{
12221 + struct vcpu_time_info *src;
12222 + struct shadow_time_info *dst;
12223 +
12224 + src = &vcpu_info(cpu)->time;
12225 + dst = &per_cpu(shadow_time, cpu);
12226 +
12227 + rmb();
12228 + return (dst->version == src->version);
12229 +}
12230 +
12231 +/*
12232 + * This is a special lock that is owned by the CPU and holds the index
12233 + * register we are working with. It is required for NMI access to the
12234 + * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
12235 + */
12236 +volatile unsigned long cmos_lock = 0;
12237 +EXPORT_SYMBOL(cmos_lock);
12238 +
12239 +/* Routines for accessing the CMOS RAM/RTC. */
12240 +unsigned char rtc_cmos_read(unsigned char addr)
12241 +{
12242 + unsigned char val;
12243 + lock_cmos_prefix(addr);
12244 + outb_p(addr, RTC_PORT(0));
12245 + val = inb_p(RTC_PORT(1));
12246 + lock_cmos_suffix(addr);
12247 + return val;
12248 +}
12249 +EXPORT_SYMBOL(rtc_cmos_read);
12250 +
12251 +void rtc_cmos_write(unsigned char val, unsigned char addr)
12252 +{
12253 + lock_cmos_prefix(addr);
12254 + outb_p(addr, RTC_PORT(0));
12255 + outb_p(val, RTC_PORT(1));
12256 + lock_cmos_suffix(addr);
12257 +}
12258 +EXPORT_SYMBOL(rtc_cmos_write);
12259 +
12260 +/*
12261 + * This version of gettimeofday has microsecond resolution
12262 + * and better than microsecond precision on fast x86 machines with TSC.
12263 + */
12264 +void do_gettimeofday(struct timeval *tv)
12265 +{
12266 + unsigned long seq;
12267 + unsigned long usec, sec;
12268 + unsigned long flags;
12269 + s64 nsec;
12270 + unsigned int cpu;
12271 + struct shadow_time_info *shadow;
12272 + u32 local_time_version;
12273 +
12274 + cpu = get_cpu();
12275 + shadow = &per_cpu(shadow_time, cpu);
12276 +
12277 + do {
12278 + unsigned long lost;
12279 +
12280 + local_time_version = shadow->version;
12281 + seq = read_seqbegin(&xtime_lock);
12282 +
12283 + usec = get_usec_offset(shadow);
12284 + lost = jiffies - wall_jiffies;
12285 +
12286 + if (unlikely(lost))
12287 + usec += lost * (USEC_PER_SEC / HZ);
12288 +
12289 + sec = xtime.tv_sec;
12290 + usec += (xtime.tv_nsec / NSEC_PER_USEC);
12291 +
12292 + nsec = shadow->system_timestamp - processed_system_time;
12293 + __normalize_time(&sec, &nsec);
12294 + usec += (long)nsec / NSEC_PER_USEC;
12295 +
12296 + if (unlikely(!time_values_up_to_date(cpu))) {
12297 + /*
12298 + * We may have blocked for a long time,
12299 + * rendering our calculations invalid
12300 + * (e.g. the time delta may have
12301 + * overflowed). Detect that and recalculate
12302 + * with fresh values.
12303 + */
12304 + get_time_values_from_xen(cpu);
12305 + continue;
12306 + }
12307 + } while (read_seqretry(&xtime_lock, seq) ||
12308 + (local_time_version != shadow->version));
12309 +
12310 + put_cpu();
12311 +
12312 + while (usec >= USEC_PER_SEC) {
12313 + usec -= USEC_PER_SEC;
12314 + sec++;
12315 + }
12316 +
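+ /*
+  * Clamp against the last value we handed out so that gettimeofday()
+  * never appears to step backwards, even across CPUs.
+  */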
12317 + spin_lock_irqsave(&monotonic_lock, flags);
12318 + if ((sec > monotonic_tv.tv_sec) ||
12319 + ((sec == monotonic_tv.tv_sec) && (usec > monotonic_tv.tv_usec)))
12320 + {
12321 + monotonic_tv.tv_sec = sec;
12322 + monotonic_tv.tv_usec = usec;
12323 + } else {
12324 + sec = monotonic_tv.tv_sec;
12325 + usec = monotonic_tv.tv_usec;
12326 + }
12327 + spin_unlock_irqrestore(&monotonic_lock, flags);
12328 +
12329 + tv->tv_sec = sec;
12330 + tv->tv_usec = usec;
12331 +}
12332 +
12333 +EXPORT_SYMBOL(do_gettimeofday);
12334 +
12335 +int do_settimeofday(struct timespec *tv)
12336 +{
12337 + time_t sec;
12338 + s64 nsec;
12339 + unsigned int cpu;
12340 + struct shadow_time_info *shadow;
12341 + struct xen_platform_op op;
12342 +
12343 + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
12344 + return -EINVAL;
12345 +
12346 + cpu = get_cpu();
12347 + shadow = &per_cpu(shadow_time, cpu);
12348 +
12349 + write_seqlock_irq(&xtime_lock);
12350 +
12351 + /*
12352 + * If we were blocked for a long time our time delta may have
12353 + * overflowed and our shadow time values would be stale; in that
12354 + * case, retry with fresh ones.
12355 + */
12356 + for (;;) {
12357 + nsec = tv->tv_nsec - get_nsec_offset(shadow);
12358 + if (time_values_up_to_date(cpu))
12359 + break;
12360 + get_time_values_from_xen(cpu);
12361 + }
12362 + sec = tv->tv_sec;
12363 + __normalize_time(&sec, &nsec);
12364 +
12365 + if (is_initial_xendomain() && !independent_wallclock) {
12366 + op.cmd = XENPF_settime;
12367 + op.u.settime.secs = sec;
12368 + op.u.settime.nsecs = nsec;
12369 + op.u.settime.system_time = shadow->system_timestamp;
12370 + WARN_ON(HYPERVISOR_platform_op(&op));
12371 + update_wallclock();
12372 + } else if (independent_wallclock) {
12373 + nsec -= shadow->system_timestamp;
12374 + __normalize_time(&sec, &nsec);
12375 + __update_wallclock(sec, nsec);
12376 + }
12377 +
12378 + /* Reset monotonic gettimeofday() timeval. */
12379 + spin_lock(&monotonic_lock);
12380 + monotonic_tv.tv_sec = 0;
12381 + monotonic_tv.tv_usec = 0;
12382 + spin_unlock(&monotonic_lock);
12383 +
12384 + write_sequnlock_irq(&xtime_lock);
12385 +
12386 + put_cpu();
12387 +
12388 + clock_was_set();
12389 + return 0;
12390 +}
12391 +
12392 +EXPORT_SYMBOL(do_settimeofday);
12393 +
12394 +static void sync_xen_wallclock(unsigned long dummy);
12395 +static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
12396 +static void sync_xen_wallclock(unsigned long dummy)
12397 +{
12398 + time_t sec;
12399 + s64 nsec;
12400 + struct xen_platform_op op;
12401 +
12402 + if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
12403 + return;
12404 +
12405 + write_seqlock_irq(&xtime_lock);
12406 +
12407 + sec = xtime.tv_sec;
12408 + nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
12409 + __normalize_time(&sec, &nsec);
12410 +
12411 + op.cmd = XENPF_settime;
12412 + op.u.settime.secs = sec;
12413 + op.u.settime.nsecs = nsec;
12414 + op.u.settime.system_time = processed_system_time;
12415 + WARN_ON(HYPERVISOR_platform_op(&op));
12416 +
12417 + update_wallclock();
12418 +
12419 + write_sequnlock_irq(&xtime_lock);
12420 +
12421 + /* Once per minute. */
12422 + mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
12423 +}
12424 +
12425 +static int set_rtc_mmss(unsigned long nowtime)
12426 +{
12427 + int retval;
12428 + unsigned long flags;
12429 +
12430 + if (independent_wallclock || !is_initial_xendomain())
12431 + return 0;
12432 +
12433 + /* gets recalled with irq locally disabled */
12434 + /* XXX - does irqsave resolve this? -johnstul */
12435 + spin_lock_irqsave(&rtc_lock, flags);
12436 + if (efi_enabled)
12437 + retval = efi_set_rtc_mmss(nowtime);
12438 + else
12439 + retval = mach_set_rtc_mmss(nowtime);
12440 + spin_unlock_irqrestore(&rtc_lock, flags);
12441 +
12442 + return retval;
12443 +}
12444 +
12445 +/* monotonic_clock(): returns # of nanoseconds passed since time_init()
12446 + * Note: This function is required to return accurate
12447 + * time even in the absence of multiple timer ticks.
12448 + */
12449 +unsigned long long monotonic_clock(void)
12450 +{
12451 + unsigned int cpu = get_cpu();
12452 + struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
12453 + u64 time;
12454 + u32 local_time_version;
12455 +
12456 + do {
12457 + local_time_version = shadow->version;
12458 + barrier();
12459 + time = shadow->system_timestamp + get_nsec_offset(shadow);
12460 + if (!time_values_up_to_date(cpu))
12461 + get_time_values_from_xen(cpu);
12462 + barrier();
12463 + } while (local_time_version != shadow->version);
12464 +
12465 + put_cpu();
12466 +
12467 + return time;
12468 +}
12469 +EXPORT_SYMBOL(monotonic_clock);
12470 +
12471 +#ifdef __x86_64__
12472 +unsigned long long sched_clock(void)
12473 +{
12474 + return monotonic_clock();
12475 +}
12476 +#endif
12477 +
12478 +#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
12479 +unsigned long profile_pc(struct pt_regs *regs)
12480 +{
12481 + unsigned long pc = instruction_pointer(regs);
12482 +
12483 +#ifdef __x86_64__
12484 + /* Assume the lock function has either no stack frame or only a single word.
12485 + This checks if the address on the stack looks like a kernel text address.
12486 + There is a small window for false hits, but in that case the tick
12487 + is just accounted to the spinlock function.
12488 + Better would be to write these functions in assembler again
12489 + and check exactly. */
12490 + if (!user_mode_vm(regs) && in_lock_functions(pc)) {
12491 + char *v = *(char **)regs->rsp;
12492 + if ((v >= _stext && v <= _etext) ||
12493 + (v >= _sinittext && v <= _einittext) ||
12494 + (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
12495 + return (unsigned long)v;
12496 + return ((unsigned long *)regs->rsp)[1];
12497 + }
12498 +#else
12499 + if (!user_mode_vm(regs) && in_lock_functions(pc))
12500 + return *(unsigned long *)(regs->ebp + 4);
12501 +#endif
12502 +
12503 + return pc;
12504 +}
12505 +EXPORT_SYMBOL(profile_pc);
12506 +#endif
12507 +
12508 +/*
12509 + * This is the same as the above, except we _also_ save the current
12510 + * Time Stamp Counter value at the time of the timer interrupt, so that
12511 + * we can later estimate the time of day more exactly.
12512 + */
12513 +irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
12514 +{
12515 + s64 delta, delta_cpu, stolen, blocked;
12516 + u64 sched_time;
12517 + unsigned int i, cpu = smp_processor_id();
12518 + struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
12519 + struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
12520 +
12521 + /*
12522 + * Here we are in the timer irq handler. We just have irqs locally
12523 + * disabled but we don't know if the timer_bh is running on the other
12524 + * CPU. We need to avoid an SMP race with it. NOTE: we don't need
12525 + * the irq version of write_lock because, as just said, we have irqs
12526 + * locally disabled. -arca
12527 + */
12528 + write_seqlock(&xtime_lock);
12529 +
12530 + do {
12531 + get_time_values_from_xen(cpu);
12532 +
12533 + /* Obtain a consistent snapshot of elapsed wallclock cycles. */
12534 + delta = delta_cpu =
12535 + shadow->system_timestamp + get_nsec_offset(shadow);
12536 + delta -= processed_system_time;
12537 + delta_cpu -= per_cpu(processed_system_time, cpu);
12538 +
12539 + /*
12540 + * Obtain a consistent snapshot of stolen/blocked cycles. We
12541 + * can use state_entry_time to detect if we get preempted here.
12542 + */
12543 + do {
12544 + sched_time = runstate->state_entry_time;
12545 + barrier();
12546 + stolen = runstate->time[RUNSTATE_runnable] +
12547 + runstate->time[RUNSTATE_offline] -
12548 + per_cpu(processed_stolen_time, cpu);
12549 + blocked = runstate->time[RUNSTATE_blocked] -
12550 + per_cpu(processed_blocked_time, cpu);
12551 + barrier();
12552 + } while (sched_time != runstate->state_entry_time);
12553 + } while (!time_values_up_to_date(cpu));
12554 +
12555 + if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
12556 + unlikely(delta_cpu < -(s64)permitted_clock_jitter))
12557 + && printk_ratelimit()) {
12558 + printk("Timer ISR/%u: Time went backwards: "
12559 + "delta=%lld delta_cpu=%lld shadow=%lld "
12560 + "off=%lld processed=%lld cpu_processed=%lld\n",
12561 + cpu, delta, delta_cpu, shadow->system_timestamp,
12562 + (s64)get_nsec_offset(shadow),
12563 + processed_system_time,
12564 + per_cpu(processed_system_time, cpu));
12565 + for (i = 0; i < num_online_cpus(); i++)
12566 + printk(" %d: %lld\n", i,
12567 + per_cpu(processed_system_time, i));
12568 + }
12569 +
12570 + /* System-wide jiffy work. */
12571 + while (delta >= NS_PER_TICK) {
12572 + delta -= NS_PER_TICK;
12573 + processed_system_time += NS_PER_TICK;
12574 + do_timer(regs);
12575 + }
12576 +
12577 + if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
12578 + update_wallclock();
12579 + if (keventd_up())
12580 + schedule_work(&clock_was_set_work);
12581 + }
12582 +
12583 + write_sequnlock(&xtime_lock);
12584 +
12585 + /*
12586 + * Account stolen ticks.
12587 + * HACK: Passing NULL to account_steal_time()
12588 + * ensures that the ticks are accounted as stolen.
12589 + */
12590 + if ((stolen > 0) && (delta_cpu > 0)) {
12591 + delta_cpu -= stolen;
12592 + if (unlikely(delta_cpu < 0))
12593 + stolen += delta_cpu; /* clamp local-time progress */
12594 + do_div(stolen, NS_PER_TICK);
12595 + per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
12596 + per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
12597 + account_steal_time(NULL, (cputime_t)stolen);
12598 + }
12599 +
12600 + /*
12601 + * Account blocked ticks.
12602 + * HACK: Passing idle_task to account_steal_time()
12603 + * ensures that the ticks are accounted as idle/wait.
12604 + */
12605 + if ((blocked > 0) && (delta_cpu > 0)) {
12606 + delta_cpu -= blocked;
12607 + if (unlikely(delta_cpu < 0))
12608 + blocked += delta_cpu; /* clamp local-time progress */
12609 + do_div(blocked, NS_PER_TICK);
12610 + per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
12611 + per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
12612 + account_steal_time(idle_task(cpu), (cputime_t)blocked);
12613 + }
12614 +
12615 + /* Account user/system ticks. */
12616 + if (delta_cpu > 0) {
12617 + do_div(delta_cpu, NS_PER_TICK);
12618 + per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
12619 + if (user_mode_vm(regs))
12620 + account_user_time(current, (cputime_t)delta_cpu);
12621 + else
12622 + account_system_time(current, HARDIRQ_OFFSET,
12623 + (cputime_t)delta_cpu);
12624 + }
12625 +
12626 + /* Offlined for more than a few seconds? Avoid lockup warnings. */
12627 + if (stolen > 5*HZ)
12628 + touch_softlockup_watchdog();
12629 +
12630 + /* Local timer processing (see update_process_times()). */
12631 + run_local_timers();
12632 + if (rcu_pending(cpu))
12633 + rcu_check_callbacks(cpu, user_mode_vm(regs));
12634 + scheduler_tick();
12635 + run_posix_cpu_timers(current);
12636 + profile_tick(CPU_PROFILING, regs);
12637 +
12638 + return IRQ_HANDLED;
12639 +}
12640 +
12641 +static void init_missing_ticks_accounting(unsigned int cpu)
12642 +{
12643 + struct vcpu_register_runstate_memory_area area;
12644 + struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
12645 + int rc;
12646 +
12647 + memset(runstate, 0, sizeof(*runstate));
12648 +
12649 + area.addr.v = runstate;
12650 + rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
12651 + WARN_ON(rc && rc != -ENOSYS);
12652 +
12653 + per_cpu(processed_blocked_time, cpu) =
12654 + runstate->time[RUNSTATE_blocked];
12655 + per_cpu(processed_stolen_time, cpu) =
12656 + runstate->time[RUNSTATE_runnable] +
12657 + runstate->time[RUNSTATE_offline];
12658 +}
12659 +
12660 +/* not static: needed by APM */
12661 +unsigned long get_cmos_time(void)
12662 +{
12663 + unsigned long retval;
12664 + unsigned long flags;
12665 +
12666 + spin_lock_irqsave(&rtc_lock, flags);
12667 +
12668 + if (efi_enabled)
12669 + retval = efi_get_time();
12670 + else
12671 + retval = mach_get_cmos_time();
12672 +
12673 + spin_unlock_irqrestore(&rtc_lock, flags);
12674 +
12675 + return retval;
12676 +}
12677 +EXPORT_SYMBOL(get_cmos_time);
12678 +
12679 +static void sync_cmos_clock(unsigned long dummy);
12680 +
12681 +static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
12682 +
12683 +static void sync_cmos_clock(unsigned long dummy)
12684 +{
12685 + struct timeval now, next;
12686 + int fail = 1;
12687 +
12688 + /*
12689 + * If we have an externally synchronized Linux clock, then update
12690 + * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
12691 + * called as close as possible to 500 ms before the new second starts.
12692 + * This code is run on a timer. If the clock is set, that timer
12693 + * may not expire at the correct time. Thus, we adjust...
12694 + */
12695 + if (!ntp_synced())
12696 + /*
12697 + * Not synced, exit, do not restart a timer (if one is
12698 + * running, let it run out).
12699 + */
12700 + return;
12701 +
12702 + do_gettimeofday(&now);
12703 + if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
12704 + now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
12705 + fail = set_rtc_mmss(now.tv_sec);
12706 +
12707 + next.tv_usec = USEC_AFTER - now.tv_usec;
12708 + if (next.tv_usec <= 0)
12709 + next.tv_usec += USEC_PER_SEC;
12710 +
12711 + if (!fail)
12712 + next.tv_sec = 659;
12713 + else
12714 + next.tv_sec = 0;
12715 +
12716 + if (next.tv_usec >= USEC_PER_SEC) {
12717 + next.tv_sec++;
12718 + next.tv_usec -= USEC_PER_SEC;
12719 + }
12720 + mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
12721 +}
12722 +
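The arithmetic above aims each RTC write roughly 500 ms into a second (that is, about 500 ms before the next second starts) and, after a successful write, waits about 11 minutes (659 s plus the sub-second adjustment) before trying again. A standalone sketch of that calculation follows; USEC_AFTER here is an assumed value used only for the illustration.

    #include <stdio.h>

    #define USEC_PER_SEC 1000000L
    #define USEC_AFTER   500000L    /* assumed: aim ~500 ms into the second */

    /* Compute the delay until the next CMOS sync attempt. */
    static void next_sync(long now_usec, int write_succeeded,
                          long *sec, long *usec)
    {
        *usec = USEC_AFTER - now_usec;
        if (*usec <= 0)
            *usec += USEC_PER_SEC;

        *sec = write_succeeded ? 659 : 0;    /* ~11 minutes, or retry soon */

        if (*usec >= USEC_PER_SEC) {
            (*sec)++;
            *usec -= USEC_PER_SEC;
        }
    }

    int main(void)
    {
        long sec, usec;

        next_sync(731000, 1, &sec, &usec);   /* current time is x.731000 s */
        printf("next attempt in %ld.%06ld s\n", sec, usec);
        return 0;
    }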
12723 +void notify_arch_cmos_timer(void)
12724 +{
12725 + mod_timer(&sync_cmos_timer, jiffies + 1);
12726 + mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
12727 +}
12728 +
12729 +static int timer_resume(struct sys_device *dev)
12730 +{
12731 + extern void time_resume(void);
12732 + time_resume();
12733 + return 0;
12734 +}
12735 +
12736 +static struct sysdev_class timer_sysclass = {
12737 + .resume = timer_resume,
12738 + set_kset_name("timer"),
12739 +};
12740 +
12741 +
12742 +/* XXX this driverfs stuff should probably go elsewhere later -john */
12743 +static struct sys_device device_timer = {
12744 + .id = 0,
12745 + .cls = &timer_sysclass,
12746 +};
12747 +
12748 +static int time_init_device(void)
12749 +{
12750 + int error = sysdev_class_register(&timer_sysclass);
12751 + if (!error)
12752 + error = sysdev_register(&device_timer);
12753 + return error;
12754 +}
12755 +
12756 +device_initcall(time_init_device);
12757 +
12758 +#ifdef CONFIG_HPET_TIMER
12759 +extern void (*late_time_init)(void);
12760 +/* Duplicate of time_init() below, with hpet_enable part added */
12761 +static void __init hpet_time_init(void)
12762 +{
12763 + xtime.tv_sec = get_cmos_time();
12764 + xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
12765 + set_normalized_timespec(&wall_to_monotonic,
12766 + -xtime.tv_sec, -xtime.tv_nsec);
12767 +
12768 + if ((hpet_enable() >= 0) && hpet_use_timer) {
12769 + printk("Using HPET for base-timer\n");
12770 + }
12771 +
12772 + time_init_hook();
12773 +}
12774 +#endif
12775 +
12776 +/* Dynamically-mapped IRQ. */
12777 +DEFINE_PER_CPU(int, timer_irq);
12778 +
12779 +extern void (*late_time_init)(void);
12780 +static void setup_cpu0_timer_irq(void)
12781 +{
12782 + per_cpu(timer_irq, 0) =
12783 + bind_virq_to_irqhandler(
12784 + VIRQ_TIMER,
12785 + 0,
12786 + timer_interrupt,
12787 + SA_INTERRUPT,
12788 + "timer0",
12789 + NULL);
12790 + BUG_ON(per_cpu(timer_irq, 0) < 0);
12791 +}
12792 +
12793 +static struct vcpu_set_periodic_timer xen_set_periodic_tick = {
12794 + .period_ns = NS_PER_TICK
12795 +};
12796 +
12797 +void __init time_init(void)
12798 +{
12799 +#ifdef CONFIG_HPET_TIMER
12800 + if (is_hpet_capable()) {
12801 + /*
12802 + * HPET initialization needs to do memory-mapped io. So, let
12803 + * us do a late initialization after mem_init().
12804 + */
12805 + late_time_init = hpet_time_init;
12806 + return;
12807 + }
12808 +#endif
12809 +
12810 + switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, 0,
12811 + &xen_set_periodic_tick)) {
12812 + case 0:
12813 +#if CONFIG_XEN_COMPAT <= 0x030004
12814 + case -ENOSYS:
12815 +#endif
12816 + break;
12817 + default:
12818 + BUG();
12819 + }
12820 +
12821 + get_time_values_from_xen(0);
12822 +
12823 + processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
12824 + per_cpu(processed_system_time, 0) = processed_system_time;
12825 + init_missing_ticks_accounting(0);
12826 +
12827 + update_wallclock();
12828 +
12829 + init_cpu_khz();
12830 + printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
12831 + cpu_khz / 1000, cpu_khz % 1000);
12832 +
12833 +#if defined(__x86_64__)
12834 + vxtime.mode = VXTIME_TSC;
12835 + vxtime.quot = (1000000L << 32) / vxtime_hz;
12836 + vxtime.tsc_quot = (1000L << 32) / cpu_khz;
12837 + sync_core();
12838 + rdtscll(vxtime.last_tsc);
12839 +#endif
12840 +
12841 + /* Cannot request_irq() until kmem is initialised. */
12842 + late_time_init = setup_cpu0_timer_irq;
12843 +}
12844 +
12845 +/* Convert jiffies to system time. */
12846 +u64 jiffies_to_st(unsigned long j)
12847 +{
12848 + unsigned long seq;
12849 + long delta;
12850 + u64 st;
12851 +
12852 + do {
12853 + seq = read_seqbegin(&xtime_lock);
12854 + delta = j - jiffies;
12855 + if (delta < 1) {
12856 + /* Triggers in some wrap-around cases, but that's okay:
12857 + * we just end up with a shorter timeout. */
12858 + st = processed_system_time + NS_PER_TICK;
12859 + } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
12860 + /* Very long timeout means there is no pending timer.
12861 + * We indicate this to Xen by passing zero timeout. */
12862 + st = 0;
12863 + } else {
12864 + st = processed_system_time + delta * (u64)NS_PER_TICK;
12865 + }
12866 + } while (read_seqretry(&xtime_lock, seq));
12867 +
12868 + return st;
12869 +}
12870 +EXPORT_SYMBOL(jiffies_to_st);
12871 +
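jiffies_to_st() maps a jiffies value onto Xen system time: a timeout that is already due is pushed one tick past processed_system_time, and an absurdly large delta (no pending timer) is encoded as 0. A simplified, lock-free sketch of the same mapping follows, with the shared state passed in explicitly; the names below are assumptions of the sketch, not symbols from the patch.

    #include <stdint.h>
    #include <stdio.h>

    #define BITS_PER_LONG (8 * (int)sizeof(long))

    /* Map a jiffies value onto an absolute "system time" in nanoseconds. */
    static uint64_t map_jiffies(unsigned long j, unsigned long jiffies_now,
                                uint64_t processed_ns, uint64_t ns_per_tick)
    {
        long delta = (long)(j - jiffies_now);

        if (delta < 1)
            return processed_ns + ns_per_tick;        /* already due */
        if (((unsigned long)delta >> (BITS_PER_LONG - 3)) != 0)
            return 0;                                 /* "no pending timer" */
        return processed_ns + (uint64_t)delta * ns_per_tick;
    }

    int main(void)
    {
        /* 10 ticks from now, 10 ms ticks, 5 s of time already processed. */
        printf("%llu\n", (unsigned long long)
               map_jiffies(1010, 1000, 5000000000ULL, 10000000ULL));
        return 0;
    }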
12872 +/*
12873 + * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
12874 + * These functions are based on implementations from arch/s390/kernel/time.c
12875 + */
12876 +static void stop_hz_timer(void)
12877 +{
12878 + struct vcpu_set_singleshot_timer singleshot;
12879 + unsigned int cpu = smp_processor_id();
12880 + unsigned long j;
12881 + int rc;
12882 +
12883 + cpu_set(cpu, nohz_cpu_mask);
12884 +
12885 +	/* See matching smp_mb in rcu_start_batch in rcupdate.c.  These mbs
12886 +	 * ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a
12887 +	 * value of rcp->cur that matches rdp->quiescbatch and allows us to
12888 +	 * stop the hz timer then the cpumasks created for subsequent values
12889 +	 * of cur in rcu_start_batch are guaranteed to pick up the updated
12890 +	 * nohz_cpu_mask and so will not depend on this cpu. */
12891 +
12892 + smp_mb();
12893 +
12894 + /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
12895 + if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
12896 + (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
12897 + cpu_clear(cpu, nohz_cpu_mask);
12898 + j = jiffies + 1;
12899 + }
12900 +
12901 + singleshot.timeout_abs_ns = jiffies_to_st(j) + NS_PER_TICK/2;
12902 + singleshot.flags = 0;
12903 + rc = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &singleshot);
12904 +#if CONFIG_XEN_COMPAT <= 0x030004
12905 + if (rc) {
12906 + BUG_ON(rc != -ENOSYS);
12907 + rc = HYPERVISOR_set_timer_op(singleshot.timeout_abs_ns);
12908 + }
12909 +#endif
12910 + BUG_ON(rc);
12911 +}
12912 +
12913 +static void start_hz_timer(void)
12914 +{
12915 + cpu_clear(smp_processor_id(), nohz_cpu_mask);
12916 +}
12917 +
12918 +void raw_safe_halt(void)
12919 +{
12920 + stop_hz_timer();
12921 + /* Blocking includes an implicit local_irq_enable(). */
12922 + HYPERVISOR_block();
12923 + start_hz_timer();
12924 +}
12925 +EXPORT_SYMBOL(raw_safe_halt);
12926 +
12927 +void halt(void)
12928 +{
12929 + if (irqs_disabled())
12930 + VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
12931 +}
12932 +EXPORT_SYMBOL(halt);
12933 +
12934 +/* No locking required. Interrupts are disabled on all CPUs. */
12935 +void time_resume(void)
12936 +{
12937 + unsigned int cpu;
12938 +
12939 + init_cpu_khz();
12940 +
12941 + for_each_online_cpu(cpu) {
12942 + switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
12943 + &xen_set_periodic_tick)) {
12944 + case 0:
12945 +#if CONFIG_XEN_COMPAT <= 0x030004
12946 + case -ENOSYS:
12947 +#endif
12948 + break;
12949 + default:
12950 + BUG();
12951 + }
12952 + get_time_values_from_xen(cpu);
12953 + per_cpu(processed_system_time, cpu) =
12954 + per_cpu(shadow_time, 0).system_timestamp;
12955 + init_missing_ticks_accounting(cpu);
12956 + }
12957 +
12958 + processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
12959 +
12960 + update_wallclock();
12961 +}
12962 +
12963 +#ifdef CONFIG_SMP
12964 +static char timer_name[NR_CPUS][15];
12965 +
12966 +int __cpuinit local_setup_timer(unsigned int cpu)
12967 +{
12968 + int seq, irq;
12969 +
12970 + BUG_ON(cpu == 0);
12971 +
12972 + switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
12973 + &xen_set_periodic_tick)) {
12974 + case 0:
12975 +#if CONFIG_XEN_COMPAT <= 0x030004
12976 + case -ENOSYS:
12977 +#endif
12978 + break;
12979 + default:
12980 + BUG();
12981 + }
12982 +
12983 + do {
12984 + seq = read_seqbegin(&xtime_lock);
12985 + /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
12986 + per_cpu(processed_system_time, cpu) =
12987 + per_cpu(shadow_time, 0).system_timestamp;
12988 + init_missing_ticks_accounting(cpu);
12989 + } while (read_seqretry(&xtime_lock, seq));
12990 +
12991 + sprintf(timer_name[cpu], "timer%u", cpu);
12992 + irq = bind_virq_to_irqhandler(VIRQ_TIMER,
12993 + cpu,
12994 + timer_interrupt,
12995 + SA_INTERRUPT,
12996 + timer_name[cpu],
12997 + NULL);
12998 + if (irq < 0)
12999 + return irq;
13000 + per_cpu(timer_irq, cpu) = irq;
13001 +
13002 + return 0;
13003 +}
13004 +
13005 +void __cpuexit local_teardown_timer(unsigned int cpu)
13006 +{
13007 + BUG_ON(cpu == 0);
13008 + unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
13009 +}
13010 +#endif
13011 +
13012 +#ifdef CONFIG_CPU_FREQ
13013 +static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
13014 + void *data)
13015 +{
13016 + struct cpufreq_freqs *freq = data;
13017 + struct xen_platform_op op;
13018 +
13019 + if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
13020 + return 0;
13021 +
13022 + if (val == CPUFREQ_PRECHANGE)
13023 + return 0;
13024 +
13025 + op.cmd = XENPF_change_freq;
13026 + op.u.change_freq.flags = 0;
13027 + op.u.change_freq.cpu = freq->cpu;
13028 + op.u.change_freq.freq = (u64)freq->new * 1000;
13029 + WARN_ON(HYPERVISOR_platform_op(&op));
13030 +
13031 + return 0;
13032 +}
13033 +
13034 +static struct notifier_block time_cpufreq_notifier_block = {
13035 + .notifier_call = time_cpufreq_notifier
13036 +};
13037 +
13038 +static int __init cpufreq_time_setup(void)
13039 +{
13040 +	if (cpufreq_register_notifier(&time_cpufreq_notifier_block,
13041 + CPUFREQ_TRANSITION_NOTIFIER)) {
13042 + printk(KERN_ERR "failed to set up cpufreq notifier\n");
13043 + return -ENODEV;
13044 + }
13045 + return 0;
13046 +}
13047 +
13048 +core_initcall(cpufreq_time_setup);
13049 +#endif
13050 +
13051 +/*
13052 + * /proc/sys/xen: This really belongs in another file. It can stay here for
13053 + * now however.
13054 + */
13055 +static ctl_table xen_subtable[] = {
13056 + {
13057 + .ctl_name = 1,
13058 + .procname = "independent_wallclock",
13059 + .data = &independent_wallclock,
13060 + .maxlen = sizeof(independent_wallclock),
13061 + .mode = 0644,
13062 + .proc_handler = proc_dointvec
13063 + },
13064 + {
13065 + .ctl_name = 2,
13066 + .procname = "permitted_clock_jitter",
13067 + .data = &permitted_clock_jitter,
13068 + .maxlen = sizeof(permitted_clock_jitter),
13069 + .mode = 0644,
13070 + .proc_handler = proc_doulongvec_minmax
13071 + },
13072 + { 0 }
13073 +};
13074 +static ctl_table xen_table[] = {
13075 + {
13076 + .ctl_name = 123,
13077 + .procname = "xen",
13078 + .mode = 0555,
13079 + .child = xen_subtable},
13080 + { 0 }
13081 +};
13082 +static int __init xen_sysctl_init(void)
13083 +{
13084 + (void)register_sysctl_table(xen_table, 0);
13085 + return 0;
13086 +}
13087 +__initcall(xen_sysctl_init);
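The table above exposes the two tunables as /proc/sys/xen/independent_wallclock and /proc/sys/xen/permitted_clock_jitter. From userland they behave like any other procfs integer; a minimal reader is sketched below (it assumes the patch is applied and procfs is mounted, and simply reports an error otherwise).

    #include <stdio.h>

    int main(void)
    {
        const char *path = "/proc/sys/xen/independent_wallclock";
        FILE *f = fopen(path, "r");
        int val;

        if (!f) {
            perror(path);    /* not a Xen domU, or the patch is not applied */
            return 1;
        }
        if (fscanf(f, "%d", &val) == 1)
            printf("independent_wallclock = %d\n", val);
        fclose(f);
        return 0;
    }

Writing works the same way, for example echo 1 > /proc/sys/xen/independent_wallclock as root, and is routed through proc_dointvec() above.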
13088 Index: head-2008-11-25/arch/x86/kernel/traps_32-xen.c
13089 ===================================================================
13090 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
13091 +++ head-2008-11-25/arch/x86/kernel/traps_32-xen.c 2008-04-02 12:34:02.000000000 +0200
13092 @@ -0,0 +1,1190 @@
13093 +/*
13094 + * linux/arch/i386/traps.c
13095 + *
13096 + * Copyright (C) 1991, 1992 Linus Torvalds
13097 + *
13098 + * Pentium III FXSR, SSE support
13099 + * Gareth Hughes <gareth@valinux.com>, May 2000
13100 + */
13101 +
13102 +/*
13103 + * 'Traps.c' handles hardware traps and faults after we have saved some
13104 + * state in 'asm.s'.
13105 + */
13106 +#include <linux/sched.h>
13107 +#include <linux/kernel.h>
13108 +#include <linux/string.h>
13109 +#include <linux/errno.h>
13110 +#include <linux/timer.h>
13111 +#include <linux/mm.h>
13112 +#include <linux/init.h>
13113 +#include <linux/delay.h>
13114 +#include <linux/spinlock.h>
13115 +#include <linux/interrupt.h>
13116 +#include <linux/highmem.h>
13117 +#include <linux/kallsyms.h>
13118 +#include <linux/ptrace.h>
13119 +#include <linux/utsname.h>
13120 +#include <linux/kprobes.h>
13121 +#include <linux/kexec.h>
13122 +#include <linux/unwind.h>
13123 +
13124 +#ifdef CONFIG_EISA
13125 +#include <linux/ioport.h>
13126 +#include <linux/eisa.h>
13127 +#endif
13128 +
13129 +#ifdef CONFIG_MCA
13130 +#include <linux/mca.h>
13131 +#endif
13132 +
13133 +#include <asm/processor.h>
13134 +#include <asm/system.h>
13135 +#include <asm/uaccess.h>
13136 +#include <asm/io.h>
13137 +#include <asm/atomic.h>
13138 +#include <asm/debugreg.h>
13139 +#include <asm/desc.h>
13140 +#include <asm/i387.h>
13141 +#include <asm/nmi.h>
13142 +#include <asm/unwind.h>
13143 +#include <asm/smp.h>
13144 +#include <asm/arch_hooks.h>
13145 +#include <asm/kdebug.h>
13146 +
13147 +#include <linux/module.h>
13148 +
13149 +#include "mach_traps.h"
13150 +
13151 +asmlinkage int system_call(void);
13152 +
13153 +struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
13154 + { 0, 0 }, { 0, 0 } };
13155 +
13156 +/* Do we ignore FPU interrupts ? */
13157 +char ignore_fpu_irq = 0;
13158 +
13159 +#ifndef CONFIG_X86_NO_IDT
13160 +/*
13161 + * The IDT has to be page-aligned to simplify the Pentium
13162 + * F0 0F bug workaround.. We have a special link segment
13163 + * for this.
13164 + */
13165 +struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
13166 +#endif
13167 +
13168 +asmlinkage void divide_error(void);
13169 +asmlinkage void debug(void);
13170 +asmlinkage void nmi(void);
13171 +asmlinkage void int3(void);
13172 +asmlinkage void overflow(void);
13173 +asmlinkage void bounds(void);
13174 +asmlinkage void invalid_op(void);
13175 +asmlinkage void device_not_available(void);
13176 +asmlinkage void coprocessor_segment_overrun(void);
13177 +asmlinkage void invalid_TSS(void);
13178 +asmlinkage void segment_not_present(void);
13179 +asmlinkage void stack_segment(void);
13180 +asmlinkage void general_protection(void);
13181 +asmlinkage void page_fault(void);
13182 +asmlinkage void coprocessor_error(void);
13183 +asmlinkage void simd_coprocessor_error(void);
13184 +asmlinkage void alignment_check(void);
13185 +#ifndef CONFIG_XEN
13186 +asmlinkage void spurious_interrupt_bug(void);
13187 +#else
13188 +asmlinkage void fixup_4gb_segment(void);
13189 +#endif
13190 +asmlinkage void machine_check(void);
13191 +
13192 +static int kstack_depth_to_print = 24;
13193 +#ifdef CONFIG_STACK_UNWIND
13194 +static int call_trace = 1;
13195 +#else
13196 +#define call_trace (-1)
13197 +#endif
13198 +ATOMIC_NOTIFIER_HEAD(i386die_chain);
13199 +
13200 +int register_die_notifier(struct notifier_block *nb)
13201 +{
13202 + vmalloc_sync_all();
13203 + return atomic_notifier_chain_register(&i386die_chain, nb);
13204 +}
13205 +EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
13206 +
13207 +int unregister_die_notifier(struct notifier_block *nb)
13208 +{
13209 + return atomic_notifier_chain_unregister(&i386die_chain, nb);
13210 +}
13211 +EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
13212 +
13213 +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
13214 +{
13215 + return p > (void *)tinfo &&
13216 + p < (void *)tinfo + THREAD_SIZE - 3;
13217 +}
13218 +
13219 +/*
13220 + * Print one address/symbol entry per line.
13221 + */
13222 +static inline void print_addr_and_symbol(unsigned long addr, char *log_lvl)
13223 +{
13224 + printk(" [<%08lx>] ", addr);
13225 +
13226 + print_symbol("%s\n", addr);
13227 +}
13228 +
13229 +static inline unsigned long print_context_stack(struct thread_info *tinfo,
13230 + unsigned long *stack, unsigned long ebp,
13231 + char *log_lvl)
13232 +{
13233 + unsigned long addr;
13234 +
13235 +#ifdef CONFIG_FRAME_POINTER
13236 + while (valid_stack_ptr(tinfo, (void *)ebp)) {
13237 + addr = *(unsigned long *)(ebp + 4);
13238 + print_addr_and_symbol(addr, log_lvl);
13239 + /*
13240 + * break out of recursive entries (such as
13241 + * end_of_stack_stop_unwind_function):
13242 + */
13243 + if (ebp == *(unsigned long *)ebp)
13244 + break;
13245 + ebp = *(unsigned long *)ebp;
13246 + }
13247 +#else
13248 + while (valid_stack_ptr(tinfo, stack)) {
13249 + addr = *stack++;
13250 + if (__kernel_text_address(addr))
13251 + print_addr_and_symbol(addr, log_lvl);
13252 + }
13253 +#endif
13254 + return ebp;
13255 +}
13256 +
13257 +static asmlinkage int
13258 +show_trace_unwind(struct unwind_frame_info *info, void *log_lvl)
13259 +{
13260 + int n = 0;
13261 +
13262 + while (unwind(info) == 0 && UNW_PC(info)) {
13263 + n++;
13264 + print_addr_and_symbol(UNW_PC(info), log_lvl);
13265 + if (arch_unw_user_mode(info))
13266 + break;
13267 + }
13268 + return n;
13269 +}
13270 +
13271 +static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
13272 + unsigned long *stack, char *log_lvl)
13273 +{
13274 + unsigned long ebp;
13275 +
13276 + if (!task)
13277 + task = current;
13278 +
13279 + if (call_trace >= 0) {
13280 + int unw_ret = 0;
13281 + struct unwind_frame_info info;
13282 +
13283 + if (regs) {
13284 + if (unwind_init_frame_info(&info, task, regs) == 0)
13285 + unw_ret = show_trace_unwind(&info, log_lvl);
13286 + } else if (task == current)
13287 + unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl);
13288 + else {
13289 + if (unwind_init_blocked(&info, task) == 0)
13290 + unw_ret = show_trace_unwind(&info, log_lvl);
13291 + }
13292 + if (unw_ret > 0) {
13293 + if (call_trace == 1 && !arch_unw_user_mode(&info)) {
13294 + print_symbol("DWARF2 unwinder stuck at %s\n",
13295 + UNW_PC(&info));
13296 + if (UNW_SP(&info) >= PAGE_OFFSET) {
13297 + printk("Leftover inexact backtrace:\n");
13298 + stack = (void *)UNW_SP(&info);
13299 + } else
13300 + printk("Full inexact backtrace again:\n");
13301 + } else if (call_trace >= 1)
13302 + return;
13303 + else
13304 + printk("Full inexact backtrace again:\n");
13305 + } else
13306 + printk("Inexact backtrace:\n");
13307 + }
13308 +
13309 + if (task == current) {
13310 + /* Grab ebp right from our regs */
13311 + asm ("movl %%ebp, %0" : "=r" (ebp) : );
13312 + } else {
13313 + /* ebp is the last reg pushed by switch_to */
13314 + ebp = *(unsigned long *) task->thread.esp;
13315 + }
13316 +
13317 + while (1) {
13318 + struct thread_info *context;
13319 + context = (struct thread_info *)
13320 + ((unsigned long)stack & (~(THREAD_SIZE - 1)));
13321 + ebp = print_context_stack(context, stack, ebp, log_lvl);
13322 + stack = (unsigned long*)context->previous_esp;
13323 + if (!stack)
13324 + break;
13325 + printk("%s =======================\n", log_lvl);
13326 + }
13327 +}
13328 +
13329 +void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack)
13330 +{
13331 + show_trace_log_lvl(task, regs, stack, "");
13332 +}
13333 +
13334 +static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
13335 + unsigned long *esp, char *log_lvl)
13336 +{
13337 + unsigned long *stack;
13338 + int i;
13339 +
13340 + if (esp == NULL) {
13341 + if (task)
13342 + esp = (unsigned long*)task->thread.esp;
13343 + else
13344 + esp = (unsigned long *)&esp;
13345 + }
13346 +
13347 + stack = esp;
13348 + for(i = 0; i < kstack_depth_to_print; i++) {
13349 + if (kstack_end(stack))
13350 + break;
13351 + if (i && ((i % 8) == 0))
13352 + printk("\n%s ", log_lvl);
13353 + printk("%08lx ", *stack++);
13354 + }
13355 + printk("\n%sCall Trace:\n", log_lvl);
13356 + show_trace_log_lvl(task, regs, esp, log_lvl);
13357 +}
13358 +
13359 +void show_stack(struct task_struct *task, unsigned long *esp)
13360 +{
13361 + printk(" ");
13362 + show_stack_log_lvl(task, NULL, esp, "");
13363 +}
13364 +
13365 +/*
13366 + * The architecture-independent dump_stack generator
13367 + */
13368 +void dump_stack(void)
13369 +{
13370 + unsigned long stack;
13371 +
13372 + show_trace(current, NULL, &stack);
13373 +}
13374 +
13375 +EXPORT_SYMBOL(dump_stack);
13376 +
13377 +void show_registers(struct pt_regs *regs)
13378 +{
13379 + int i;
13380 + int in_kernel = 1;
13381 + unsigned long esp;
13382 + unsigned short ss;
13383 +
13384 + esp = (unsigned long) (&regs->esp);
13385 + savesegment(ss, ss);
13386 + if (user_mode_vm(regs)) {
13387 + in_kernel = 0;
13388 + esp = regs->esp;
13389 + ss = regs->xss & 0xffff;
13390 + }
13391 + print_modules();
13392 + printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n"
13393 + "EFLAGS: %08lx (%s %.*s) \n",
13394 + smp_processor_id(), 0xffff & regs->xcs, regs->eip,
13395 + print_tainted(), regs->eflags, system_utsname.release,
13396 + (int)strcspn(system_utsname.version, " "),
13397 + system_utsname.version);
13398 + print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
13399 + printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
13400 + regs->eax, regs->ebx, regs->ecx, regs->edx);
13401 + printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
13402 + regs->esi, regs->edi, regs->ebp, esp);
13403 + printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n",
13404 + regs->xds & 0xffff, regs->xes & 0xffff, ss);
13405 + printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
13406 + TASK_COMM_LEN, current->comm, current->pid,
13407 + current_thread_info(), current, current->thread_info);
13408 + /*
13409 + * When in-kernel, we also print out the stack and code at the
13410 + * time of the fault..
13411 + */
13412 + if (in_kernel) {
13413 + u8 __user *eip;
13414 +
13415 + printk("\n" KERN_EMERG "Stack: ");
13416 + show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
13417 +
13418 + printk(KERN_EMERG "Code: ");
13419 +
13420 + eip = (u8 __user *)regs->eip - 43;
13421 + for (i = 0; i < 64; i++, eip++) {
13422 + unsigned char c;
13423 +
13424 + if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
13425 + printk(" Bad EIP value.");
13426 + break;
13427 + }
13428 + if (eip == (u8 __user *)regs->eip)
13429 + printk("<%02x> ", c);
13430 + else
13431 + printk("%02x ", c);
13432 + }
13433 + }
13434 + printk("\n");
13435 +}
13436 +
13437 +static void handle_BUG(struct pt_regs *regs)
13438 +{
13439 + unsigned long eip = regs->eip;
13440 + unsigned short ud2;
13441 +
13442 + if (eip < PAGE_OFFSET)
13443 + return;
13444 + if (__get_user(ud2, (unsigned short __user *)eip))
13445 + return;
13446 + if (ud2 != 0x0b0f)
13447 + return;
13448 +
13449 + printk(KERN_EMERG "------------[ cut here ]------------\n");
13450 +
13451 +#ifdef CONFIG_DEBUG_BUGVERBOSE
13452 + do {
13453 + unsigned short line;
13454 + char *file;
13455 + char c;
13456 +
13457 + if (__get_user(line, (unsigned short __user *)(eip + 2)))
13458 + break;
13459 + if (__get_user(file, (char * __user *)(eip + 4)) ||
13460 + (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
13461 + file = "<bad filename>";
13462 +
13463 + printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
13464 + return;
13465 + } while (0);
13466 +#endif
13467 + printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n");
13468 +}
13469 +
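handle_BUG() recognises a BUG() site by the two-byte ud2 opcode (0x0f 0x0b, read above as the halfword 0x0b0f) and, under CONFIG_DEBUG_BUGVERBOSE, expects a 16-bit line number followed by a file-name pointer directly after it. The sketch below decodes the same layout from a hand-built buffer; the buffer contents and file name are invented for the illustration.

    #include <stdio.h>
    #include <string.h>

    /* Decode a BUG() site laid out as: ud2 (0x0f 0x0b), u16 line, char *file. */
    static void decode_bug(const unsigned char *site)
    {
        unsigned short line;
        const char *file;

        /* The kernel reads these two bytes as the halfword 0x0b0f, since
         * x86 is little-endian; comparing bytes keeps the sketch portable. */
        if (site[0] != 0x0f || site[1] != 0x0b) {
            puts("not a BUG() site");
            return;
        }
        memcpy(&line, site + 2, sizeof(line));
        memcpy(&file, site + 4, sizeof(file));
        printf("kernel BUG at %s:%u!\n", file, (unsigned)line);
    }

    int main(void)
    {
        const char *filename = "drivers/example.c";   /* hypothetical path */
        unsigned char site[2 + sizeof(unsigned short) + sizeof(const char *)];
        unsigned short line = 42;

        site[0] = 0x0f;
        site[1] = 0x0b;
        memcpy(site + 2, &line, sizeof(line));
        memcpy(site + 4, &filename, sizeof(filename));
        decode_bug(site);
        return 0;
    }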
13470 +/* This is gone through when something in the kernel
13471 + * has done something bad and is about to be terminated.
13472 +*/
13473 +void die(const char * str, struct pt_regs * regs, long err)
13474 +{
13475 + static struct {
13476 + spinlock_t lock;
13477 + u32 lock_owner;
13478 + int lock_owner_depth;
13479 + } die = {
13480 + .lock = SPIN_LOCK_UNLOCKED,
13481 + .lock_owner = -1,
13482 + .lock_owner_depth = 0
13483 + };
13484 + static int die_counter;
13485 + unsigned long flags;
13486 +
13487 + oops_enter();
13488 +
13489 + if (die.lock_owner != raw_smp_processor_id()) {
13490 + console_verbose();
13491 + spin_lock_irqsave(&die.lock, flags);
13492 + die.lock_owner = smp_processor_id();
13493 + die.lock_owner_depth = 0;
13494 + bust_spinlocks(1);
13495 + }
13496 + else
13497 + local_save_flags(flags);
13498 +
13499 + if (++die.lock_owner_depth < 3) {
13500 + int nl = 0;
13501 + unsigned long esp;
13502 + unsigned short ss;
13503 +
13504 + handle_BUG(regs);
13505 + printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
13506 +#ifdef CONFIG_PREEMPT
13507 + printk(KERN_EMERG "PREEMPT ");
13508 + nl = 1;
13509 +#endif
13510 +#ifdef CONFIG_SMP
13511 + if (!nl)
13512 + printk(KERN_EMERG);
13513 + printk("SMP ");
13514 + nl = 1;
13515 +#endif
13516 +#ifdef CONFIG_DEBUG_PAGEALLOC
13517 + if (!nl)
13518 + printk(KERN_EMERG);
13519 + printk("DEBUG_PAGEALLOC");
13520 + nl = 1;
13521 +#endif
13522 + if (nl)
13523 + printk("\n");
13524 + if (notify_die(DIE_OOPS, str, regs, err,
13525 + current->thread.trap_no, SIGSEGV) !=
13526 + NOTIFY_STOP) {
13527 + show_registers(regs);
13528 + /* Executive summary in case the oops scrolled away */
13529 + esp = (unsigned long) (&regs->esp);
13530 + savesegment(ss, ss);
13531 + if (user_mode(regs)) {
13532 + esp = regs->esp;
13533 + ss = regs->xss & 0xffff;
13534 + }
13535 + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
13536 + print_symbol("%s", regs->eip);
13537 + printk(" SS:ESP %04x:%08lx\n", ss, esp);
13538 + }
13539 + else
13540 + regs = NULL;
13541 + } else
13542 + printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
13543 +
13544 + bust_spinlocks(0);
13545 + die.lock_owner = -1;
13546 + spin_unlock_irqrestore(&die.lock, flags);
13547 +
13548 + if (!regs)
13549 + return;
13550 +
13551 + if (kexec_should_crash(current))
13552 + crash_kexec(regs);
13553 +
13554 + if (in_interrupt())
13555 + panic("Fatal exception in interrupt");
13556 +
13557 + if (panic_on_oops)
13558 + panic("Fatal exception");
13559 +
13560 + oops_exit();
13561 + do_exit(SIGSEGV);
13562 +}
13563 +
13564 +static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
13565 +{
13566 + if (!user_mode_vm(regs))
13567 + die(str, regs, err);
13568 +}
13569 +
13570 +static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
13571 + struct pt_regs * regs, long error_code,
13572 + siginfo_t *info)
13573 +{
13574 + struct task_struct *tsk = current;
13575 + tsk->thread.error_code = error_code;
13576 + tsk->thread.trap_no = trapnr;
13577 +
13578 + if (regs->eflags & VM_MASK) {
13579 + if (vm86)
13580 + goto vm86_trap;
13581 + goto trap_signal;
13582 + }
13583 +
13584 + if (!user_mode(regs))
13585 + goto kernel_trap;
13586 +
13587 + trap_signal: {
13588 + if (info)
13589 + force_sig_info(signr, info, tsk);
13590 + else
13591 + force_sig(signr, tsk);
13592 + return;
13593 + }
13594 +
13595 + kernel_trap: {
13596 + if (!fixup_exception(regs))
13597 + die(str, regs, error_code);
13598 + return;
13599 + }
13600 +
13601 + vm86_trap: {
13602 + int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
13603 + if (ret) goto trap_signal;
13604 + return;
13605 + }
13606 +}
13607 +
13608 +#define DO_ERROR(trapnr, signr, str, name) \
13609 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
13610 +{ \
13611 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
13612 + == NOTIFY_STOP) \
13613 + return; \
13614 + do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
13615 +}
13616 +
13617 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
13618 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
13619 +{ \
13620 + siginfo_t info; \
13621 + info.si_signo = signr; \
13622 + info.si_errno = 0; \
13623 + info.si_code = sicode; \
13624 + info.si_addr = (void __user *)siaddr; \
13625 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
13626 + == NOTIFY_STOP) \
13627 + return; \
13628 + do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
13629 +}
13630 +
13631 +#define DO_VM86_ERROR(trapnr, signr, str, name) \
13632 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
13633 +{ \
13634 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
13635 + == NOTIFY_STOP) \
13636 + return; \
13637 + do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
13638 +}
13639 +
13640 +#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
13641 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
13642 +{ \
13643 + siginfo_t info; \
13644 + info.si_signo = signr; \
13645 + info.si_errno = 0; \
13646 + info.si_code = sicode; \
13647 + info.si_addr = (void __user *)siaddr; \
13648 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
13649 + == NOTIFY_STOP) \
13650 + return; \
13651 + do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
13652 +}
13653 +
13654 +DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
13655 +#ifndef CONFIG_KPROBES
13656 +DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
13657 +#endif
13658 +DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
13659 +DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
13660 +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip)
13661 +DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
13662 +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
13663 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
13664 +DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
13665 +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
13666 +DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
13667 +
13668 +fastcall void __kprobes do_general_protection(struct pt_regs * regs,
13669 + long error_code)
13670 +{
13671 + current->thread.error_code = error_code;
13672 + current->thread.trap_no = 13;
13673 +
13674 + if (regs->eflags & VM_MASK)
13675 + goto gp_in_vm86;
13676 +
13677 + if (!user_mode(regs))
13678 + goto gp_in_kernel;
13679 +
13680 + current->thread.error_code = error_code;
13681 + current->thread.trap_no = 13;
13682 + force_sig(SIGSEGV, current);
13683 + return;
13684 +
13685 +gp_in_vm86:
13686 + local_irq_enable();
13687 + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
13688 + return;
13689 +
13690 +gp_in_kernel:
13691 + if (!fixup_exception(regs)) {
13692 + if (notify_die(DIE_GPF, "general protection fault", regs,
13693 + error_code, 13, SIGSEGV) == NOTIFY_STOP)
13694 + return;
13695 + die("general protection fault", regs, error_code);
13696 + }
13697 +}
13698 +
13699 +static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
13700 +{
13701 + printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying "
13702 + "to continue\n");
13703 + printk(KERN_EMERG "You probably have a hardware problem with your RAM "
13704 + "chips\n");
13705 +
13706 + /* Clear and disable the memory parity error line. */
13707 + clear_mem_error(reason);
13708 +}
13709 +
13710 +static void io_check_error(unsigned char reason, struct pt_regs * regs)
13711 +{
13712 + printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
13713 + show_registers(regs);
13714 +
13715 + /* Re-enable the IOCK line, wait for a few seconds */
13716 + clear_io_check_error(reason);
13717 +}
13718 +
13719 +static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
13720 +{
13721 +#ifdef CONFIG_MCA
13722 + /* Might actually be able to figure out what the guilty party
13723 + * is. */
13724 + if( MCA_bus ) {
13725 + mca_handle_nmi();
13726 + return;
13727 + }
13728 +#endif
13729 + printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
13730 + reason, smp_processor_id());
13731 + printk("Dazed and confused, but trying to continue\n");
13732 + printk("Do you have a strange power saving mode enabled?\n");
13733 +}
13734 +
13735 +static DEFINE_SPINLOCK(nmi_print_lock);
13736 +
13737 +void die_nmi (struct pt_regs *regs, const char *msg)
13738 +{
13739 + if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
13740 + NOTIFY_STOP)
13741 + return;
13742 +
13743 + spin_lock(&nmi_print_lock);
13744 + /*
13745 +	 * We are in trouble anyway, let's at least try
13746 + * to get a message out.
13747 + */
13748 + bust_spinlocks(1);
13749 + printk(KERN_EMERG "%s", msg);
13750 + printk(" on CPU%d, eip %08lx, registers:\n",
13751 + smp_processor_id(), regs->eip);
13752 + show_registers(regs);
13753 + printk(KERN_EMERG "console shuts up ...\n");
13754 + console_silent();
13755 + spin_unlock(&nmi_print_lock);
13756 + bust_spinlocks(0);
13757 +
13758 +	/* If we are in the kernel we are probably nested up pretty badly
13759 +	 * and might as well get out now while we still can.
13760 + */
13761 + if (!user_mode_vm(regs)) {
13762 + current->thread.trap_no = 2;
13763 + crash_kexec(regs);
13764 + }
13765 +
13766 + do_exit(SIGSEGV);
13767 +}
13768 +
13769 +static void default_do_nmi(struct pt_regs * regs)
13770 +{
13771 + unsigned char reason = 0;
13772 +
13773 + /* Only the BSP gets external NMIs from the system. */
13774 + if (!smp_processor_id())
13775 + reason = get_nmi_reason();
13776 +
13777 + if (!(reason & 0xc0)) {
13778 + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
13779 + == NOTIFY_STOP)
13780 + return;
13781 +#ifdef CONFIG_X86_LOCAL_APIC
13782 + /*
13783 + * Ok, so this is none of the documented NMI sources,
13784 + * so it must be the NMI watchdog.
13785 + */
13786 + if (nmi_watchdog) {
13787 + nmi_watchdog_tick(regs);
13788 + return;
13789 + }
13790 +#endif
13791 + unknown_nmi_error(reason, regs);
13792 + return;
13793 + }
13794 + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
13795 + return;
13796 + if (reason & 0x80)
13797 + mem_parity_error(reason, regs);
13798 + if (reason & 0x40)
13799 + io_check_error(reason, regs);
13800 + /*
13801 + * Reassert NMI in case it became active meanwhile
13802 + * as it's edge-triggered.
13803 + */
13804 + reassert_nmi();
13805 +}
13806 +
13807 +static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
13808 +{
13809 + return 0;
13810 +}
13811 +
13812 +static nmi_callback_t nmi_callback = dummy_nmi_callback;
13813 +
13814 +fastcall void do_nmi(struct pt_regs * regs, long error_code)
13815 +{
13816 + int cpu;
13817 +
13818 + nmi_enter();
13819 +
13820 + cpu = smp_processor_id();
13821 +
13822 + ++nmi_count(cpu);
13823 +
13824 + if (!rcu_dereference(nmi_callback)(regs, cpu))
13825 + default_do_nmi(regs);
13826 +
13827 + nmi_exit();
13828 +}
13829 +
13830 +void set_nmi_callback(nmi_callback_t callback)
13831 +{
13832 + vmalloc_sync_all();
13833 + rcu_assign_pointer(nmi_callback, callback);
13834 +}
13835 +EXPORT_SYMBOL_GPL(set_nmi_callback);
13836 +
13837 +void unset_nmi_callback(void)
13838 +{
13839 + nmi_callback = dummy_nmi_callback;
13840 +}
13841 +EXPORT_SYMBOL_GPL(unset_nmi_callback);
13842 +
13843 +#ifdef CONFIG_KPROBES
13844 +fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
13845 +{
13846 + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
13847 + == NOTIFY_STOP)
13848 + return;
13849 + /* This is an interrupt gate, because kprobes wants interrupts
13850 + disabled. Normal trap handlers don't. */
13851 + restore_interrupts(regs);
13852 + do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
13853 +}
13854 +#endif
13855 +
13856 +/*
13857 + * Our handling of the processor debug registers is non-trivial.
13858 + * We do not clear them on entry and exit from the kernel. Therefore
13859 + * it is possible to get a watchpoint trap here from inside the kernel.
13860 + * However, the code in ./ptrace.c has ensured that the user can
13861 + * only set watchpoints on userspace addresses. Therefore the in-kernel
13862 + * watchpoint trap can only occur in code which is reading/writing
13863 + * from user space. Such code must not hold kernel locks (since it
13864 + * can equally take a page fault), therefore it is safe to call
13865 + * force_sig_info even though that claims and releases locks.
13866 + *
13867 + * Code in ./signal.c ensures that the debug control register
13868 + * is restored before we deliver any signal, and therefore that
13869 + * user code runs with the correct debug control register even though
13870 + * we clear it here.
13871 + *
13872 + * Being careful here means that we don't have to be as careful in a
13873 + * lot of more complicated places (task switching can be a bit lazy
13874 + * about restoring all the debug state, and ptrace doesn't have to
13875 + * find every occurrence of the TF bit that could be saved away even
13876 + * by user code)
13877 + */
13878 +fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
13879 +{
13880 + unsigned int condition;
13881 + struct task_struct *tsk = current;
13882 +
13883 + get_debugreg(condition, 6);
13884 +
13885 + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
13886 + SIGTRAP) == NOTIFY_STOP)
13887 + return;
13888 + /* It's safe to allow irq's after DR6 has been saved */
13889 + if (regs->eflags & X86_EFLAGS_IF)
13890 + local_irq_enable();
13891 +
13892 + /* Mask out spurious debug traps due to lazy DR7 setting */
13893 + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
13894 + if (!tsk->thread.debugreg[7])
13895 + goto clear_dr7;
13896 + }
13897 +
13898 + if (regs->eflags & VM_MASK)
13899 + goto debug_vm86;
13900 +
13901 + /* Save debug status register where ptrace can see it */
13902 + tsk->thread.debugreg[6] = condition;
13903 +
13904 + /*
13905 + * Single-stepping through TF: make sure we ignore any events in
13906 + * kernel space (but re-enable TF when returning to user mode).
13907 + */
13908 + if (condition & DR_STEP) {
13909 + /*
13910 + * We already checked v86 mode above, so we can
13911 + * check for kernel mode by just checking the CPL
13912 + * of CS.
13913 + */
13914 + if (!user_mode(regs))
13915 + goto clear_TF_reenable;
13916 + }
13917 +
13918 + /* Ok, finally something we can handle */
13919 + send_sigtrap(tsk, regs, error_code);
13920 +
13921 + /* Disable additional traps. They'll be re-enabled when
13922 + * the signal is delivered.
13923 + */
13924 +clear_dr7:
13925 + set_debugreg(0, 7);
13926 + return;
13927 +
13928 +debug_vm86:
13929 + handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
13930 + return;
13931 +
13932 +clear_TF_reenable:
13933 + set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
13934 + regs->eflags &= ~TF_MASK;
13935 + return;
13936 +}
13937 +
13938 +/*
13939 + * Note that we play around with the 'TS' bit in an attempt to get
13940 + * the correct behaviour even in the presence of the asynchronous
13941 + * IRQ13 behaviour
13942 + */
13943 +void math_error(void __user *eip)
13944 +{
13945 + struct task_struct * task;
13946 + siginfo_t info;
13947 + unsigned short cwd, swd;
13948 +
13949 + /*
13950 + * Save the info for the exception handler and clear the error.
13951 + */
13952 + task = current;
13953 + save_init_fpu(task);
13954 + task->thread.trap_no = 16;
13955 + task->thread.error_code = 0;
13956 + info.si_signo = SIGFPE;
13957 + info.si_errno = 0;
13958 + info.si_code = __SI_FAULT;
13959 + info.si_addr = eip;
13960 + /*
13961 + * (~cwd & swd) will mask out exceptions that are not set to unmasked
13962 + * status. 0x3f is the exception bits in these regs, 0x200 is the
13963 + * C1 reg you need in case of a stack fault, 0x040 is the stack
13964 + * fault bit. We should only be taking one exception at a time,
13965 + * so if this combination doesn't produce any single exception,
13966 +	 * then we have a bad program that isn't synchronizing its FPU usage
13967 + * and it will suffer the consequences since we won't be able to
13968 + * fully reproduce the context of the exception
13969 + */
13970 + cwd = get_fpu_cwd(task);
13971 + swd = get_fpu_swd(task);
13972 + switch (swd & ~cwd & 0x3f) {
13973 + case 0x000: /* No unmasked exception */
13974 + return;
13975 + default: /* Multiple exceptions */
13976 + break;
13977 + case 0x001: /* Invalid Op */
13978 + /*
13979 + * swd & 0x240 == 0x040: Stack Underflow
13980 + * swd & 0x240 == 0x240: Stack Overflow
13981 + * User must clear the SF bit (0x40) if set
13982 + */
13983 + info.si_code = FPE_FLTINV;
13984 + break;
13985 + case 0x002: /* Denormalize */
13986 + case 0x010: /* Underflow */
13987 + info.si_code = FPE_FLTUND;
13988 + break;
13989 + case 0x004: /* Zero Divide */
13990 + info.si_code = FPE_FLTDIV;
13991 + break;
13992 + case 0x008: /* Overflow */
13993 + info.si_code = FPE_FLTOVF;
13994 + break;
13995 + case 0x020: /* Precision */
13996 + info.si_code = FPE_FLTRES;
13997 + break;
13998 + }
13999 + force_sig_info(SIGFPE, &info, task);
14000 +}
14001 +
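The long comment in math_error() explains how swd & ~cwd & 0x3f isolates the one unmasked x87 exception that was actually raised. A small sketch of that decode follows; the FPE_* names here are local stand-ins defined for the example, not the kernel's siginfo constants.

    #include <stdio.h>

    /* Local stand-ins for the siginfo codes used by the patch. */
    enum { FPE_NONE, FPE_FLTINV, FPE_FLTUND, FPE_FLTDIV, FPE_FLTOVF, FPE_FLTRES };

    /* cwd: control word (mask bits), swd: status word (exception bits). */
    static int decode_x87(unsigned short cwd, unsigned short swd)
    {
        switch (swd & ~cwd & 0x3f) {
        case 0x001: return FPE_FLTINV;    /* invalid operation */
        case 0x002:                       /* denormal          */
        case 0x010: return FPE_FLTUND;    /* underflow         */
        case 0x004: return FPE_FLTDIV;    /* divide by zero    */
        case 0x008: return FPE_FLTOVF;    /* overflow          */
        case 0x020: return FPE_FLTRES;    /* precision         */
        default:    return FPE_NONE;      /* none, or several at once */
        }
    }

    int main(void)
    {
        /* Control word 0x037b unmasks only divide-by-zero (bit 2 clear);
         * status word 0x0004 says that exception was raised. */
        printf("%d\n", decode_x87(0x037b, 0x0004));
        return 0;
    }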
14002 +fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
14003 +{
14004 + ignore_fpu_irq = 1;
14005 + math_error((void __user *)regs->eip);
14006 +}
14007 +
14008 +static void simd_math_error(void __user *eip)
14009 +{
14010 + struct task_struct * task;
14011 + siginfo_t info;
14012 + unsigned short mxcsr;
14013 +
14014 + /*
14015 + * Save the info for the exception handler and clear the error.
14016 + */
14017 + task = current;
14018 + save_init_fpu(task);
14019 + task->thread.trap_no = 19;
14020 + task->thread.error_code = 0;
14021 + info.si_signo = SIGFPE;
14022 + info.si_errno = 0;
14023 + info.si_code = __SI_FAULT;
14024 + info.si_addr = eip;
14025 + /*
14026 + * The SIMD FPU exceptions are handled a little differently, as there
14027 + * is only a single status/control register. Thus, to determine which
14028 + * unmasked exception was caught we must mask the exception mask bits
14029 + * at 0x1f80, and then use these to mask the exception bits at 0x3f.
14030 + */
14031 + mxcsr = get_fpu_mxcsr(task);
14032 + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
14033 + case 0x000:
14034 + default:
14035 + break;
14036 + case 0x001: /* Invalid Op */
14037 + info.si_code = FPE_FLTINV;
14038 + break;
14039 + case 0x002: /* Denormalize */
14040 + case 0x010: /* Underflow */
14041 + info.si_code = FPE_FLTUND;
14042 + break;
14043 + case 0x004: /* Zero Divide */
14044 + info.si_code = FPE_FLTDIV;
14045 + break;
14046 + case 0x008: /* Overflow */
14047 + info.si_code = FPE_FLTOVF;
14048 + break;
14049 + case 0x020: /* Precision */
14050 + info.si_code = FPE_FLTRES;
14051 + break;
14052 + }
14053 + force_sig_info(SIGFPE, &info, task);
14054 +}
14055 +
14056 +fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
14057 + long error_code)
14058 +{
14059 + if (cpu_has_xmm) {
14060 + /* Handle SIMD FPU exceptions on PIII+ processors. */
14061 + ignore_fpu_irq = 1;
14062 + simd_math_error((void __user *)regs->eip);
14063 + } else {
14064 + /*
14065 + * Handle strange cache flush from user space exception
14066 + * in all other cases. This is undocumented behaviour.
14067 + */
14068 + if (regs->eflags & VM_MASK) {
14069 + handle_vm86_fault((struct kernel_vm86_regs *)regs,
14070 + error_code);
14071 + return;
14072 + }
14073 + current->thread.trap_no = 19;
14074 + current->thread.error_code = error_code;
14075 + die_if_kernel("cache flush denied", regs, error_code);
14076 + force_sig(SIGSEGV, current);
14077 + }
14078 +}
14079 +
14080 +#ifndef CONFIG_XEN
14081 +fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
14082 + long error_code)
14083 +{
14084 +#if 0
14085 + /* No need to warn about this any longer. */
14086 + printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
14087 +#endif
14088 +}
14089 +
14090 +fastcall void setup_x86_bogus_stack(unsigned char * stk)
14091 +{
14092 + unsigned long *switch16_ptr, *switch32_ptr;
14093 + struct pt_regs *regs;
14094 + unsigned long stack_top, stack_bot;
14095 + unsigned short iret_frame16_off;
14096 + int cpu = smp_processor_id();
14097 + /* reserve the space on 32bit stack for the magic switch16 pointer */
14098 + memmove(stk, stk + 8, sizeof(struct pt_regs));
14099 + switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
14100 + regs = (struct pt_regs *)stk;
14101 + /* now the switch32 on 16bit stack */
14102 + stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
14103 + stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
14104 + switch32_ptr = (unsigned long *)(stack_top - 8);
14105 + iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
14106 + /* copy iret frame on 16bit stack */
14107 + memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
14108 + /* fill in the switch pointers */
14109 + switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
14110 + switch16_ptr[1] = __ESPFIX_SS;
14111 + switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
14112 + 8 - CPU_16BIT_STACK_SIZE;
14113 + switch32_ptr[1] = __KERNEL_DS;
14114 +}
14115 +
14116 +fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
14117 +{
14118 + unsigned long *switch32_ptr;
14119 + unsigned char *stack16, *stack32;
14120 + unsigned long stack_top, stack_bot;
14121 + int len;
14122 + int cpu = smp_processor_id();
14123 + stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
14124 + stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
14125 + switch32_ptr = (unsigned long *)(stack_top - 8);
14126 + /* copy the data from 16bit stack to 32bit stack */
14127 + len = CPU_16BIT_STACK_SIZE - 8 - sp;
14128 + stack16 = (unsigned char *)(stack_bot + sp);
14129 + stack32 = (unsigned char *)
14130 + (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
14131 + memcpy(stack32, stack16, len);
14132 + return stack32;
14133 +}
14134 +#endif
14135 +
14136 +/*
14137 + * 'math_state_restore()' saves the current math information in the
14138 + * old math state array, and gets the new ones from the current task
14139 + *
14140 + * Careful.. There are problems with IBM-designed IRQ13 behaviour.
14141 + * Don't touch unless you *really* know how it works.
14142 + *
14143 + * Must be called with kernel preemption disabled (in this case,
14144 + * local interrupts are disabled at the call-site in entry.S).
14145 + */
14146 +asmlinkage void math_state_restore(struct pt_regs regs)
14147 +{
14148 + struct thread_info *thread = current_thread_info();
14149 + struct task_struct *tsk = thread->task;
14150 +
14151 + /* NB. 'clts' is done for us by Xen during virtual trap. */
14152 + if (!tsk_used_math(tsk))
14153 + init_fpu(tsk);
14154 + restore_fpu(tsk);
14155 + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
14156 +}
14157 +
14158 +#ifndef CONFIG_MATH_EMULATION
14159 +
14160 +asmlinkage void math_emulate(long arg)
14161 +{
14162 + printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
14163 + printk(KERN_EMERG "killing %s.\n",current->comm);
14164 + force_sig(SIGFPE,current);
14165 + schedule();
14166 +}
14167 +
14168 +#endif /* CONFIG_MATH_EMULATION */
14169 +
14170 +#ifdef CONFIG_X86_F00F_BUG
14171 +void __init trap_init_f00f_bug(void)
14172 +{
14173 + __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
14174 +
14175 + /*
14176 + * Update the IDT descriptor and reload the IDT so that
14177 + * it uses the read-only mapped virtual address.
14178 + */
14179 + idt_descr.address = fix_to_virt(FIX_F00F_IDT);
14180 + load_idt(&idt_descr);
14181 +}
14182 +#endif
14183 +
14184 +
14185 +/*
14186 + * NB. All these are "trap gates" (i.e. events_mask isn't set) except
14187 + * for those that specify <dpl>|4 in the second field.
14188 + */
14189 +static trap_info_t __cpuinitdata trap_table[] = {
14190 + { 0, 0, __KERNEL_CS, (unsigned long)divide_error },
14191 + { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
14192 + { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
14193 + { 4, 3, __KERNEL_CS, (unsigned long)overflow },
14194 + { 5, 0, __KERNEL_CS, (unsigned long)bounds },
14195 + { 6, 0, __KERNEL_CS, (unsigned long)invalid_op },
14196 + { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available },
14197 + { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
14198 + { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS },
14199 + { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present },
14200 + { 12, 0, __KERNEL_CS, (unsigned long)stack_segment },
14201 + { 13, 0, __KERNEL_CS, (unsigned long)general_protection },
14202 + { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault },
14203 + { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment },
14204 + { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error },
14205 + { 17, 0, __KERNEL_CS, (unsigned long)alignment_check },
14206 +#ifdef CONFIG_X86_MCE
14207 + { 18, 0, __KERNEL_CS, (unsigned long)machine_check },
14208 +#endif
14209 + { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
14210 + { SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)system_call },
14211 + { 0, 0, 0, 0 }
14212 +};
14213 +
14214 +void __init trap_init(void)
14215 +{
14216 + int ret;
14217 +
14218 + ret = HYPERVISOR_set_trap_table(trap_table);
14219 + if (ret)
14220 + printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
14221 +
14222 + if (cpu_has_fxsr) {
14223 + /*
14224 + * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
14225 + * Generates a compile-time "error: zero width for bit-field" if
14226 + * the alignment is wrong.
14227 + */
14228 + struct fxsrAlignAssert {
14229 + int _:!(offsetof(struct task_struct,
14230 + thread.i387.fxsave) & 15);
14231 + };
14232 +
14233 + printk(KERN_INFO "Enabling fast FPU save and restore... ");
14234 + set_in_cr4(X86_CR4_OSFXSR);
14235 + printk("done.\n");
14236 + }
14237 + if (cpu_has_xmm) {
14238 + printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
14239 + "support... ");
14240 + set_in_cr4(X86_CR4_OSXMMEXCPT);
14241 + printk("done.\n");
14242 + }
14243 +
14244 + /*
14245 + * Should be a barrier for any external CPU state.
14246 + */
14247 + cpu_init();
14248 +}
14249 +
14250 +void __cpuinit smp_trap_init(trap_info_t *trap_ctxt)
14251 +{
14252 + const trap_info_t *t = trap_table;
14253 +
14254 + for (t = trap_table; t->address; t++) {
14255 + trap_ctxt[t->vector].flags = t->flags;
14256 + trap_ctxt[t->vector].cs = t->cs;
14257 + trap_ctxt[t->vector].address = t->address;
14258 + }
14259 +}
14260 +
14261 +static int __init kstack_setup(char *s)
14262 +{
14263 + kstack_depth_to_print = simple_strtoul(s, NULL, 0);
14264 + return 1;
14265 +}
14266 +__setup("kstack=", kstack_setup);
14267 +
14268 +#ifdef CONFIG_STACK_UNWIND
14269 +static int __init call_trace_setup(char *s)
14270 +{
14271 + if (strcmp(s, "old") == 0)
14272 + call_trace = -1;
14273 + else if (strcmp(s, "both") == 0)
14274 + call_trace = 0;
14275 + else if (strcmp(s, "newfallback") == 0)
14276 + call_trace = 1;
14277 +	else if (strcmp(s, "new") == 0)
14278 + call_trace = 2;
14279 + return 1;
14280 +}
14281 +__setup("call_trace=", call_trace_setup);
14282 +#endif
14283 Index: head-2008-11-25/arch/x86/mach-xen/Makefile
14284 ===================================================================
14285 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
14286 +++ head-2008-11-25/arch/x86/mach-xen/Makefile 2007-06-12 13:12:48.000000000 +0200
14287 @@ -0,0 +1,5 @@
14288 +#
14289 +# Makefile for the linux kernel.
14290 +#
14291 +
14292 +obj-y := setup.o
14293 Index: head-2008-11-25/arch/x86/mach-xen/setup.c
14294 ===================================================================
14295 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
14296 +++ head-2008-11-25/arch/x86/mach-xen/setup.c 2008-04-02 12:34:02.000000000 +0200
14297 @@ -0,0 +1,158 @@
14298 +/*
14299 + * Machine specific setup for generic
14300 + */
14301 +
14302 +#include <linux/mm.h>
14303 +#include <linux/smp.h>
14304 +#include <linux/init.h>
14305 +#include <linux/interrupt.h>
14306 +#include <linux/module.h>
14307 +#include <asm/acpi.h>
14308 +#include <asm/arch_hooks.h>
14309 +#include <asm/e820.h>
14310 +#include <asm/setup.h>
14311 +#include <asm/fixmap.h>
14312 +
14313 +#include <xen/interface/callback.h>
14314 +#include <xen/interface/memory.h>
14315 +
14316 +#ifdef CONFIG_HOTPLUG_CPU
14317 +#define DEFAULT_SEND_IPI (1)
14318 +#else
14319 +#define DEFAULT_SEND_IPI (0)
14320 +#endif
14321 +
14322 +int no_broadcast=DEFAULT_SEND_IPI;
14323 +
14324 +static __init int no_ipi_broadcast(char *str)
14325 +{
14326 + get_option(&str, &no_broadcast);
14327 + printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
14328 + "IPI Broadcast");
14329 + return 1;
14330 +}
14331 +
14332 +__setup("no_ipi_broadcast", no_ipi_broadcast);
14333 +
14334 +static int __init print_ipi_mode(void)
14335 +{
14336 + printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
14337 + "Shortcut");
14338 + return 0;
14339 +}
14340 +
14341 +late_initcall(print_ipi_mode);
14342 +
14343 +/**
14344 + * machine_specific_memory_setup - Hook for machine specific memory setup.
14345 + *
14346 + * Description:
14347 + * This is included late in kernel/setup.c so that it can make
14348 + * use of all of the static functions.
14349 + **/
14350 +
14351 +char * __init machine_specific_memory_setup(void)
14352 +{
14353 + int rc;
14354 + struct xen_memory_map memmap;
14355 + /*
14356 + * This is rather large for a stack variable but this early in
14357 +	 * the boot process we know we have plenty of slack space.
14358 + */
14359 + struct e820entry map[E820MAX];
14360 +
14361 + memmap.nr_entries = E820MAX;
14362 + set_xen_guest_handle(memmap.buffer, map);
14363 +
14364 + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
14365 + if ( rc == -ENOSYS ) {
14366 + memmap.nr_entries = 1;
14367 + map[0].addr = 0ULL;
14368 + map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
14369 + /* 8MB slack (to balance backend allocations). */
14370 + map[0].size += 8ULL << 20;
14371 + map[0].type = E820_RAM;
14372 + rc = 0;
14373 + }
14374 + BUG_ON(rc);
14375 +
14376 + sanitize_e820_map(map, (char *)&memmap.nr_entries);
14377 +
14378 + BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
14379 +
14380 + return "Xen";
14381 +}
14382 +
14383 +
14384 +extern void hypervisor_callback(void);
14385 +extern void failsafe_callback(void);
14386 +extern void nmi(void);
14387 +
14388 +unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
14389 +EXPORT_SYMBOL(machine_to_phys_mapping);
14390 +unsigned int machine_to_phys_order;
14391 +EXPORT_SYMBOL(machine_to_phys_order);
14392 +
14393 +void __init pre_setup_arch_hook(void)
14394 +{
14395 + struct xen_machphys_mapping mapping;
14396 + unsigned long machine_to_phys_nr_ents;
14397 + struct xen_platform_parameters pp;
14398 +
14399 + init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base;
14400 +
14401 + setup_xen_features();
14402 +
14403 + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
14404 + set_fixaddr_top(pp.virt_start);
14405 +
14406 + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
14407 + machine_to_phys_mapping = (unsigned long *)mapping.v_start;
14408 + machine_to_phys_nr_ents = mapping.max_mfn + 1;
14409 + } else
14410 + machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
14411 + machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
14412 +
14413 + if (!xen_feature(XENFEAT_auto_translated_physmap))
14414 + phys_to_machine_mapping =
14415 + (unsigned long *)xen_start_info->mfn_list;
14416 +}
14417 +
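machine_to_phys_order is sized above as fls(machine_to_phys_nr_ents - 1), i.e. the number of bits needed to index the machine-to-physical table (ceil(log2(n)) for n > 1). A quick user-space check of that identity follows; the loop-based fls() is a portable stand-in assumed for the sketch, not the kernel's implementation.

    #include <stdio.h>

    /* Portable stand-in for the kernel's fls(): 1-based index of the
     * highest set bit, with fls(0) == 0. */
    static int fls_portable(unsigned long x)
    {
        int r = 0;

        while (x) {
            x >>= 1;
            r++;
        }
        return r;
    }

    int main(void)
    {
        unsigned long nr_ents[] = { 1, 2, 3, 4, 1024, 1025 };
        unsigned int i;

        for (i = 0; i < sizeof(nr_ents) / sizeof(nr_ents[0]); i++)
            printf("nr_ents=%lu -> order=%d\n",
                   nr_ents[i], fls_portable(nr_ents[i] - 1));
        return 0;
    }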
14418 +void __init machine_specific_arch_setup(void)
14419 +{
14420 + int ret;
14421 + static struct callback_register __initdata event = {
14422 + .type = CALLBACKTYPE_event,
14423 + .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
14424 + };
14425 + static struct callback_register __initdata failsafe = {
14426 + .type = CALLBACKTYPE_failsafe,
14427 + .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
14428 + };
14429 + static struct callback_register __initdata nmi_cb = {
14430 + .type = CALLBACKTYPE_nmi,
14431 + .address = { __KERNEL_CS, (unsigned long)nmi },
14432 + };
14433 +
14434 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
14435 + if (ret == 0)
14436 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
14437 +#if CONFIG_XEN_COMPAT <= 0x030002
14438 + if (ret == -ENOSYS)
14439 + ret = HYPERVISOR_set_callbacks(
14440 + event.address.cs, event.address.eip,
14441 + failsafe.address.cs, failsafe.address.eip);
14442 +#endif
14443 + BUG_ON(ret);
14444 +
14445 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
14446 +#if CONFIG_XEN_COMPAT <= 0x030002
14447 + if (ret == -ENOSYS) {
14448 + static struct xennmi_callback __initdata cb = {
14449 + .handler_address = (unsigned long)nmi
14450 + };
14451 +
14452 + HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
14453 + }
14454 +#endif
14455 +}
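
machine_specific_arch_setup() above uses a common compatibility idiom: try the newer CALLBACKOP_register interface first and fall back to the legacy call only when the hypervisor returns -ENOSYS. A hedged, self-contained model of that idiom with stubbed-out hypercalls (the stub names and return values are invented):

#include <errno.h>
#include <stdio.h>

/* Stubs standing in for hypercalls; the return values are illustrative. */
static int new_interface(void)    { return -ENOSYS; } /* pre-3.0.3 hypervisor */
static int legacy_interface(void) { return 0; }

int main(void)
{
	int ret = new_interface();

	if (ret == -ENOSYS)                      /* old hypervisor: fall back */
		ret = legacy_interface();
	printf("callback registration %s\n", ret ? "failed" : "succeeded");
	return ret ? 1 : 0;
}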
14456 Index: head-2008-11-25/arch/x86/lib/scrub.c
14457 ===================================================================
14458 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
14459 +++ head-2008-11-25/arch/x86/lib/scrub.c 2008-02-08 12:30:51.000000000 +0100
14460 @@ -0,0 +1,21 @@
14461 +#include <asm/cpufeature.h>
14462 +#include <asm/page.h>
14463 +#include <asm/processor.h>
14464 +
14465 +void scrub_pages(void *v, unsigned int count)
14466 +{
14467 + if (likely(cpu_has_xmm2)) {
14468 + unsigned long n = count * (PAGE_SIZE / sizeof(long) / 4);
14469 +
14470 + for (; n--; v += sizeof(long) * 4)
14471 + asm("movnti %1,(%0)\n\t"
14472 + "movnti %1,%c2(%0)\n\t"
14473 + "movnti %1,2*%c2(%0)\n\t"
14474 + "movnti %1,3*%c2(%0)\n\t"
14475 + : : "r" (v), "r" (0L), "i" (sizeof(long))
14476 + : "memory");
14477 + asm volatile("sfence" : : : "memory");
14478 + } else
14479 + for (; count--; v += PAGE_SIZE)
14480 + clear_page(v);
14481 +}
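
The SSE2 branch of scrub_pages() above stores four longs per loop iteration with movnti, so the iteration count covers exactly count pages. A quick arithmetic check in portable C, assuming a 4096-byte page:

#include <stdio.h>

#define PAGE_SIZE 4096UL                         /* assumed x86 page size */

int main(void)
{
	unsigned long count = 3;                 /* hypothetical page count */
	unsigned long iters = count * (PAGE_SIZE / sizeof(long) / 4);
	unsigned long bytes = iters * sizeof(long) * 4;

	/* Each iteration writes 4 longs, so bytes == count * PAGE_SIZE. */
	printf("%lu iterations scrub %lu bytes (%lu pages)\n",
	       iters, bytes, bytes / PAGE_SIZE);
	return 0;
}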
14482 Index: head-2008-11-25/arch/x86/mm/fault_32-xen.c
14483 ===================================================================
14484 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
14485 +++ head-2008-11-25/arch/x86/mm/fault_32-xen.c 2007-12-10 08:47:31.000000000 +0100
14486 @@ -0,0 +1,779 @@
14487 +/*
14488 + * linux/arch/i386/mm/fault.c
14489 + *
14490 + * Copyright (C) 1995 Linus Torvalds
14491 + */
14492 +
14493 +#include <linux/signal.h>
14494 +#include <linux/sched.h>
14495 +#include <linux/kernel.h>
14496 +#include <linux/errno.h>
14497 +#include <linux/string.h>
14498 +#include <linux/types.h>
14499 +#include <linux/ptrace.h>
14500 +#include <linux/mman.h>
14501 +#include <linux/mm.h>
14502 +#include <linux/smp.h>
14503 +#include <linux/smp_lock.h>
14504 +#include <linux/interrupt.h>
14505 +#include <linux/init.h>
14506 +#include <linux/tty.h>
14507 +#include <linux/vt_kern.h> /* For unblank_screen() */
14508 +#include <linux/highmem.h>
14509 +#include <linux/module.h>
14510 +#include <linux/kprobes.h>
14511 +
14512 +#include <asm/system.h>
14513 +#include <asm/uaccess.h>
14514 +#include <asm/desc.h>
14515 +#include <asm/kdebug.h>
14516 +
14517 +extern void die(const char *,struct pt_regs *,long);
14518 +
14519 +#ifdef CONFIG_KPROBES
14520 +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
14521 +int register_page_fault_notifier(struct notifier_block *nb)
14522 +{
14523 + vmalloc_sync_all();
14524 + return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
14525 +}
14526 +
14527 +int unregister_page_fault_notifier(struct notifier_block *nb)
14528 +{
14529 + return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
14530 +}
14531 +
14532 +static inline int notify_page_fault(enum die_val val, const char *str,
14533 + struct pt_regs *regs, long err, int trap, int sig)
14534 +{
14535 + struct die_args args = {
14536 + .regs = regs,
14537 + .str = str,
14538 + .err = err,
14539 + .trapnr = trap,
14540 + .signr = sig
14541 + };
14542 + return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
14543 +}
14544 +#else
14545 +static inline int notify_page_fault(enum die_val val, const char *str,
14546 + struct pt_regs *regs, long err, int trap, int sig)
14547 +{
14548 + return NOTIFY_DONE;
14549 +}
14550 +#endif
14551 +
14552 +
14553 +/*
14554 + * Unlock any spinlocks which will prevent us from getting the
14555 + * message out
14556 + */
14557 +void bust_spinlocks(int yes)
14558 +{
14559 + int loglevel_save = console_loglevel;
14560 +
14561 + if (yes) {
14562 + oops_in_progress = 1;
14563 + return;
14564 + }
14565 +#ifdef CONFIG_VT
14566 + unblank_screen();
14567 +#endif
14568 + oops_in_progress = 0;
14569 + /*
14570 + * OK, the message is on the console. Now we call printk()
14571 + * without oops_in_progress set so that printk will give klogd
14572 + * a poke. Hold onto your hats...
14573 + */
14574 + console_loglevel = 15; /* NMI oopser may have shut the console up */
14575 + printk(" ");
14576 + console_loglevel = loglevel_save;
14577 +}
14578 +
14579 +/*
14580 + * Return EIP plus the CS segment base. The segment limit is also
14581 + * adjusted, clamped to the kernel/user address space (whichever is
14582 + * appropriate), and returned in *eip_limit.
14583 + *
14584 + * The segment is checked, because it might have been changed by another
14585 + * task between the original faulting instruction and here.
14586 + *
14587 + * If CS is no longer a valid code segment, or if EIP is beyond the
14588 + * limit, or if it is a kernel address when CS is not a kernel segment,
14589 + * then the returned value will be greater than *eip_limit.
14590 + *
14591 + * This is slow, but is very rarely executed.
14592 + */
14593 +static inline unsigned long get_segment_eip(struct pt_regs *regs,
14594 + unsigned long *eip_limit)
14595 +{
14596 + unsigned long eip = regs->eip;
14597 + unsigned seg = regs->xcs & 0xffff;
14598 + u32 seg_ar, seg_limit, base, *desc;
14599 +
14600 + /* Unlikely, but must come before segment checks. */
14601 + if (unlikely(regs->eflags & VM_MASK)) {
14602 + base = seg << 4;
14603 + *eip_limit = base + 0xffff;
14604 + return base + (eip & 0xffff);
14605 + }
14606 +
14607 + /* The standard kernel/user address space limit. */
14608 + *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
14609 +
14610 + /* By far the most common cases. */
14611 + if (likely(seg == __USER_CS || seg == GET_KERNEL_CS()))
14612 + return eip;
14613 +
14614 + /* Check the segment exists, is within the current LDT/GDT size,
14615 + that kernel/user (ring 0..3) has the appropriate privilege,
14616 + that it's a code segment, and get the limit. */
14617 + __asm__ ("larl %3,%0; lsll %3,%1"
14618 + : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
14619 + if ((~seg_ar & 0x9800) || eip > seg_limit) {
14620 + *eip_limit = 0;
14621 + return 1; /* So that returned eip > *eip_limit. */
14622 + }
14623 +
14624 + /* Get the GDT/LDT descriptor base.
14625 + When you look for races in this code remember that
14626 + LDT and other horrors are only used in user space. */
14627 + if (seg & (1<<2)) {
14628 + /* Must lock the LDT while reading it. */
14629 + down(&current->mm->context.sem);
14630 + desc = current->mm->context.ldt;
14631 + desc = (void *)desc + (seg & ~7);
14632 + } else {
14633 + /* Must disable preemption while reading the GDT. */
14634 + desc = (u32 *)get_cpu_gdt_table(get_cpu());
14635 + desc = (void *)desc + (seg & ~7);
14636 + }
14637 +
14638 + /* Decode the code segment base from the descriptor */
14639 + base = get_desc_base((unsigned long *)desc);
14640 +
14641 + if (seg & (1<<2)) {
14642 + up(&current->mm->context.sem);
14643 + } else
14644 + put_cpu();
14645 +
14646 + /* Adjust EIP and segment limit, and clamp at the kernel limit.
14647 + It's legitimate for segments to wrap at 0xffffffff. */
14648 + seg_limit += base;
14649 + if (seg_limit < *eip_limit && seg_limit >= base)
14650 + *eip_limit = seg_limit;
14651 + return eip + base;
14652 +}
14653 +
14654 +/*
14655 + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
14656 + * Check that here and ignore it.
14657 + */
14658 +static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
14659 +{
14660 + unsigned long limit;
14661 + unsigned long instr = get_segment_eip (regs, &limit);
14662 + int scan_more = 1;
14663 + int prefetch = 0;
14664 + int i;
14665 +
14666 + for (i = 0; scan_more && i < 15; i++) {
14667 + unsigned char opcode;
14668 + unsigned char instr_hi;
14669 + unsigned char instr_lo;
14670 +
14671 + if (instr > limit)
14672 + break;
14673 + if (__get_user(opcode, (unsigned char __user *) instr))
14674 + break;
14675 +
14676 + instr_hi = opcode & 0xf0;
14677 + instr_lo = opcode & 0x0f;
14678 + instr++;
14679 +
14680 + switch (instr_hi) {
14681 + case 0x20:
14682 + case 0x30:
14683 + /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
14684 + scan_more = ((instr_lo & 7) == 0x6);
14685 + break;
14686 +
14687 + case 0x60:
14688 + /* 0x64 thru 0x67 are valid prefixes in all modes. */
14689 + scan_more = (instr_lo & 0xC) == 0x4;
14690 + break;
14691 + case 0xF0:
14692 + /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
14693 + scan_more = !instr_lo || (instr_lo>>1) == 1;
14694 + break;
14695 + case 0x00:
14696 + /* Prefetch instruction is 0x0F0D or 0x0F18 */
14697 + scan_more = 0;
14698 + if (instr > limit)
14699 + break;
14700 + if (__get_user(opcode, (unsigned char __user *) instr))
14701 + break;
14702 + prefetch = (instr_lo == 0xF) &&
14703 + (opcode == 0x0D || opcode == 0x18);
14704 + break;
14705 + default:
14706 + scan_more = 0;
14707 + break;
14708 + }
14709 + }
14710 + return prefetch;
14711 +}
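
__is_prefetch() above walks the instruction stream at the faulting EIP, skipping legal prefix bytes until it finds (or rules out) the two-byte prefetch opcodes 0x0F 0x0D / 0x0F 0x18. A self-contained model of the same scan over an in-memory buffer; the sample bytes are an illustrative prefetchnta encoding:

#include <stdio.h>

/* Simplified model: returns 1 if the byte sequence is (optional prefixes
 * followed by) a prefetch opcode 0x0F 0x0D or 0x0F 0x18. */
static int is_prefetch_sketch(const unsigned char *instr, int len)
{
	int i, scan_more = 1, prefetch = 0;

	for (i = 0; scan_more && i < len && i < 15; i++) {
		unsigned char op = instr[i];
		unsigned char hi = op & 0xf0, lo = op & 0x0f;

		switch (hi) {
		case 0x20: case 0x30:            /* 0x26/0x2E/0x36/0x3E overrides */
			scan_more = ((lo & 7) == 0x6);
			break;
		case 0x60:                       /* 0x64..0x67 prefixes */
			scan_more = ((lo & 0xC) == 0x4);
			break;
		case 0xF0:                       /* lock / rep prefixes */
			scan_more = !lo || (lo >> 1) == 1;
			break;
		case 0x00:                       /* possible two-byte opcode */
			scan_more = 0;
			if (lo == 0xF && i + 1 < len)
				prefetch = (instr[i + 1] == 0x0D ||
					    instr[i + 1] == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

int main(void)
{
	const unsigned char insn[] = { 0x0F, 0x18, 0x00 };  /* prefetchnta (%eax) */

	printf("prefetch detected: %d\n", is_prefetch_sketch(insn, 3));
	return 0;
}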
14712 +
14713 +static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
14714 + unsigned long error_code)
14715 +{
14716 + if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
14717 + boot_cpu_data.x86 >= 6)) {
14718 + /* Catch an obscure case of prefetch inside an NX page. */
14719 + if (nx_enabled && (error_code & 16))
14720 + return 0;
14721 + return __is_prefetch(regs, addr);
14722 + }
14723 + return 0;
14724 +}
14725 +
14726 +static noinline void force_sig_info_fault(int si_signo, int si_code,
14727 + unsigned long address, struct task_struct *tsk)
14728 +{
14729 + siginfo_t info;
14730 +
14731 + info.si_signo = si_signo;
14732 + info.si_errno = 0;
14733 + info.si_code = si_code;
14734 + info.si_addr = (void __user *)address;
14735 + force_sig_info(si_signo, &info, tsk);
14736 +}
14737 +
14738 +fastcall void do_invalid_op(struct pt_regs *, unsigned long);
14739 +
14740 +#ifdef CONFIG_X86_PAE
14741 +static void dump_fault_path(unsigned long address)
14742 +{
14743 + unsigned long *p, page;
14744 + unsigned long mfn;
14745 +
14746 + page = read_cr3();
14747 + p = (unsigned long *)__va(page);
14748 + p += (address >> 30) * 2;
14749 + printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
14750 + if (p[0] & _PAGE_PRESENT) {
14751 + mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
14752 + page = mfn_to_pfn(mfn) << PAGE_SHIFT;
14753 + p = (unsigned long *)__va(page);
14754 + address &= 0x3fffffff;
14755 + p += (address >> 21) * 2;
14756 + printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n",
14757 + page, p[1], p[0]);
14758 + mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
14759 +#ifdef CONFIG_HIGHPTE
14760 + if (mfn_to_pfn(mfn) >= highstart_pfn)
14761 + return;
14762 +#endif
14763 + if (p[0] & _PAGE_PRESENT) {
14764 + page = mfn_to_pfn(mfn) << PAGE_SHIFT;
14765 + p = (unsigned long *) __va(page);
14766 + address &= 0x001fffff;
14767 + p += (address >> 12) * 2;
14768 + printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
14769 + page, p[1], p[0]);
14770 + }
14771 + }
14772 +}
14773 +#else
14774 +static void dump_fault_path(unsigned long address)
14775 +{
14776 + unsigned long page;
14777 +
14778 + page = read_cr3();
14779 + page = ((unsigned long *) __va(page))[address >> 22];
14780 + if (oops_may_print())
14781 + printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
14782 + machine_to_phys(page));
14783 + /*
14784 + * We must not directly access the pte in the highpte
14785 + * case if the page table is located in highmem.
14786 + * And let's rather not kmap-atomic the pte, just in case
14787 + * it's allocated already.
14788 + */
14789 +#ifdef CONFIG_HIGHPTE
14790 + if ((page >> PAGE_SHIFT) >= highstart_pfn)
14791 + return;
14792 +#endif
14793 + if ((page & 1) && oops_may_print()) {
14794 + page &= PAGE_MASK;
14795 + address &= 0x003ff000;
14796 + page = machine_to_phys(page);
14797 + page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
14798 + printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
14799 + machine_to_phys(page));
14800 + }
14801 +}
14802 +#endif
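
For the non-PAE dump_fault_path() variant just above, the fault address is split 10/10/12: the top ten bits index the pde, the next ten the pte. A tiny sketch of that index arithmetic with an invented fault address:

#include <stdio.h>

int main(void)
{
	unsigned long address = 0xc0123456UL;            /* hypothetical fault address */
	unsigned long pde_idx = address >> 22;                 /* top 10 bits */
	unsigned long pte_idx = (address & 0x003ff000) >> 12;  /* next 10 bits */

	printf("pde index %lu, pte index %lu\n", pde_idx, pte_idx);
	return 0;
}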
14803 +
14804 +static int spurious_fault(struct pt_regs *regs,
14805 + unsigned long address,
14806 + unsigned long error_code)
14807 +{
14808 + pgd_t *pgd;
14809 + pud_t *pud;
14810 + pmd_t *pmd;
14811 + pte_t *pte;
14812 +
14813 + /* Reserved-bit violation or user access to kernel space? */
14814 + if (error_code & 0x0c)
14815 + return 0;
14816 +
14817 + pgd = init_mm.pgd + pgd_index(address);
14818 + if (!pgd_present(*pgd))
14819 + return 0;
14820 +
14821 + pud = pud_offset(pgd, address);
14822 + if (!pud_present(*pud))
14823 + return 0;
14824 +
14825 + pmd = pmd_offset(pud, address);
14826 + if (!pmd_present(*pmd))
14827 + return 0;
14828 +
14829 + pte = pte_offset_kernel(pmd, address);
14830 + if (!pte_present(*pte))
14831 + return 0;
14832 + if ((error_code & 0x02) && !pte_write(*pte))
14833 + return 0;
14834 +#ifdef CONFIG_X86_PAE
14835 + if ((error_code & 0x10) && (__pte_val(*pte) & _PAGE_NX))
14836 + return 0;
14837 +#endif
14838 +
14839 + return 1;
14840 +}
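
spurious_fault() above and do_page_fault() further below both key off the hardware page-fault error code. A hedged decoder for those bits (bit 0 protection, bit 1 write, bit 2 user, bit 3 reserved bit, bit 4 instruction fetch); the sample codes are invented:

#include <stdio.h>

static void decode_fault(unsigned long error_code)
{
	printf("%s %s access from %s mode%s%s\n",
	       (error_code & 1) ? "protection" : "not-present",
	       (error_code & 2) ? "write" : "read",
	       (error_code & 4) ? "user" : "kernel",
	       (error_code & 8) ? ", reserved bit set" : "",
	       (error_code & 16) ? ", instruction fetch" : "");
}

int main(void)
{
	decode_fault(0x06);    /* user-mode write to a not-present page */
	decode_fault(0x11);    /* kernel instruction fetch, protection fault */
	return 0;
}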
14841 +
14842 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
14843 +{
14844 + unsigned index = pgd_index(address);
14845 + pgd_t *pgd_k;
14846 + pud_t *pud, *pud_k;
14847 + pmd_t *pmd, *pmd_k;
14848 +
14849 + pgd += index;
14850 + pgd_k = init_mm.pgd + index;
14851 +
14852 + if (!pgd_present(*pgd_k))
14853 + return NULL;
14854 +
14855 + /*
14856 + * set_pgd(pgd, *pgd_k); here would be useless on PAE
14857 + * and redundant with the set_pmd() on non-PAE. As would
14858 + * set_pud.
14859 + */
14860 +
14861 + pud = pud_offset(pgd, address);
14862 + pud_k = pud_offset(pgd_k, address);
14863 + if (!pud_present(*pud_k))
14864 + return NULL;
14865 +
14866 + pmd = pmd_offset(pud, address);
14867 + pmd_k = pmd_offset(pud_k, address);
14868 + if (!pmd_present(*pmd_k))
14869 + return NULL;
14870 + if (!pmd_present(*pmd))
14871 +#if CONFIG_XEN_COMPAT > 0x030002
14872 + set_pmd(pmd, *pmd_k);
14873 +#else
14874 + /*
14875 + * When running on older Xen we must launder *pmd_k through
14876 + * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
14877 + */
14878 + set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
14879 +#endif
14880 + else
14881 + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
14882 + return pmd_k;
14883 +}
14884 +
14885 +/*
14886 + * Handle a fault on the vmalloc or module mapping area
14887 + *
14888 + * This assumes no large pages in there.
14889 + */
14890 +static inline int vmalloc_fault(unsigned long address)
14891 +{
14892 + unsigned long pgd_paddr;
14893 + pmd_t *pmd_k;
14894 + pte_t *pte_k;
14895 + /*
14896 + * Synchronize this task's top level page-table
14897 + * with the 'reference' page table.
14898 + *
14899 + * Do _not_ use "current" here. We might be inside
14900 + * an interrupt in the middle of a task switch..
14901 + */
14902 + pgd_paddr = read_cr3();
14903 + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
14904 + if (!pmd_k)
14905 + return -1;
14906 + pte_k = pte_offset_kernel(pmd_k, address);
14907 + if (!pte_present(*pte_k))
14908 + return -1;
14909 + return 0;
14910 +}
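
vmalloc_fault() only copies a single top-level entry because the faulting task's pgd and init_mm.pgd are indexed by the same value. A minimal sketch of that index computation, assuming the non-PAE layout (PGDIR_SHIFT of 22) and an invented vmalloc address:

#include <stdio.h>

#define PGDIR_SHIFT 22                           /* assumed non-PAE i386 layout */
#define pgd_index(addr) ((addr) >> PGDIR_SHIFT)

int main(void)
{
	unsigned long vmalloc_addr = 0xe0800000UL;   /* hypothetical address */

	/* vmalloc_sync_one() copies exactly this one slot from init_mm.pgd. */
	printf("pgd slot %lu\n", pgd_index(vmalloc_addr));
	return 0;
}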
14911 +
14912 +/*
14913 + * This routine handles page faults. It determines the address,
14914 + * and the problem, and then passes it off to one of the appropriate
14915 + * routines.
14916 + *
14917 + * error_code:
14918 + * bit 0 == 0 means no page found, 1 means protection fault
14919 + * bit 1 == 0 means read, 1 means write
14920 + * bit 2 == 0 means kernel, 1 means user-mode
14921 + * bit 3 == 1 means use of reserved bit detected
14922 + * bit 4 == 1 means fault was an instruction fetch
14923 + */
14924 +fastcall void __kprobes do_page_fault(struct pt_regs *regs,
14925 + unsigned long error_code)
14926 +{
14927 + struct task_struct *tsk;
14928 + struct mm_struct *mm;
14929 + struct vm_area_struct * vma;
14930 + unsigned long address;
14931 + int write, si_code;
14932 +
14933 + /* get the address */
14934 + address = read_cr2();
14935 +
14936 + /* Set the "privileged fault" bit to something sane. */
14937 + error_code &= ~4;
14938 + error_code |= (regs->xcs & 2) << 1;
14939 + if (regs->eflags & X86_EFLAGS_VM)
14940 + error_code |= 4;
14941 +
14942 + tsk = current;
14943 +
14944 + si_code = SEGV_MAPERR;
14945 +
14946 + /*
14947 + * We fault-in kernel-space virtual memory on-demand. The
14948 + * 'reference' page table is init_mm.pgd.
14949 + *
14950 + * NOTE! We MUST NOT take any locks for this case. We may
14951 + * be in an interrupt or a critical region, and should
14952 + * only copy the information from the master page table,
14953 + * nothing more.
14954 + *
14955 + * This verifies that the fault happens in kernel space
14956 + * (error_code & 4) == 0, and that the fault was not a
14957 + * protection error (error_code & 9) == 0.
14958 + */
14959 + if (unlikely(address >= TASK_SIZE)) {
14960 +#ifdef CONFIG_XEN
14961 + /* Faults in hypervisor area can never be patched up. */
14962 + if (address >= hypervisor_virt_start)
14963 + goto bad_area_nosemaphore;
14964 +#endif
14965 + if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
14966 + return;
14967 + /* Can take a spurious fault if mapping changes R/O -> R/W. */
14968 + if (spurious_fault(regs, address, error_code))
14969 + return;
14970 + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
14971 + SIGSEGV) == NOTIFY_STOP)
14972 + return;
14973 + /*
14974 + * Don't take the mm semaphore here. If we fixup a prefetch
14975 + * fault we could otherwise deadlock.
14976 + */
14977 + goto bad_area_nosemaphore;
14978 + }
14979 +
14980 + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
14981 + SIGSEGV) == NOTIFY_STOP)
14982 + return;
14983 +
14984 + /* It's safe to allow irq's after cr2 has been saved and the vmalloc
14985 + fault has been handled. */
14986 + if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
14987 + local_irq_enable();
14988 +
14989 + mm = tsk->mm;
14990 +
14991 + /*
14992 + * If we're in an interrupt, have no user context or are running in an
14993 + * atomic region then we must not take the fault..
14994 + */
14995 + if (in_atomic() || !mm)
14996 + goto bad_area_nosemaphore;
14997 +
14998 + /* When running in the kernel we expect faults to occur only to
14999 + * addresses in user space. All other faults represent errors in the
15000 + * kernel and should generate an OOPS. Unfortunately, in the case of an
15001 + * erroneous fault occurring in a code path which already holds mmap_sem
15002 + * we will deadlock attempting to validate the fault against the
15003 + * address space. Luckily the kernel only validly references user
15004 + * space from well defined areas of code, which are listed in the
15005 + * exceptions table.
15006 + *
15007 + * As the vast majority of faults will be valid we will only perform
15008 + * the source reference check when there is a possibility of a deadlock.
15009 + * Attempt to lock the address space, if we cannot we then validate the
15010 + * source. If this is invalid we can skip the address space check,
15011 + * thus avoiding the deadlock.
15012 + */
15013 + if (!down_read_trylock(&mm->mmap_sem)) {
15014 + if ((error_code & 4) == 0 &&
15015 + !search_exception_tables(regs->eip))
15016 + goto bad_area_nosemaphore;
15017 + down_read(&mm->mmap_sem);
15018 + }
15019 +
15020 + vma = find_vma(mm, address);
15021 + if (!vma)
15022 + goto bad_area;
15023 + if (vma->vm_start <= address)
15024 + goto good_area;
15025 + if (!(vma->vm_flags & VM_GROWSDOWN))
15026 + goto bad_area;
15027 + if (error_code & 4) {
15028 + /*
15029 + * Accessing the stack below %esp is always a bug.
15030 + * The large cushion allows instructions like enter
15031 + * and pusha to work. ("enter $65535,$31" pushes
15032 + * 32 pointers and then decrements %esp by 65535.)
15033 + */
15034 + if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
15035 + goto bad_area;
15036 + }
15037 + if (expand_stack(vma, address))
15038 + goto bad_area;
15039 +/*
15040 + * Ok, we have a good vm_area for this memory access, so
15041 + * we can handle it..
15042 + */
15043 +good_area:
15044 + si_code = SEGV_ACCERR;
15045 + write = 0;
15046 + switch (error_code & 3) {
15047 + default: /* 3: write, present */
15048 +#ifdef TEST_VERIFY_AREA
15049 + if (regs->cs == GET_KERNEL_CS())
15050 + printk("WP fault at %08lx\n", regs->eip);
15051 +#endif
15052 + /* fall through */
15053 + case 2: /* write, not present */
15054 + if (!(vma->vm_flags & VM_WRITE))
15055 + goto bad_area;
15056 + write++;
15057 + break;
15058 + case 1: /* read, present */
15059 + goto bad_area;
15060 + case 0: /* read, not present */
15061 + if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
15062 + goto bad_area;
15063 + }
15064 +
15065 + survive:
15066 + /*
15067 + * If for any reason at all we couldn't handle the fault,
15068 + * make sure we exit gracefully rather than endlessly redo
15069 + * the fault.
15070 + */
15071 + switch (handle_mm_fault(mm, vma, address, write)) {
15072 + case VM_FAULT_MINOR:
15073 + tsk->min_flt++;
15074 + break;
15075 + case VM_FAULT_MAJOR:
15076 + tsk->maj_flt++;
15077 + break;
15078 + case VM_FAULT_SIGBUS:
15079 + goto do_sigbus;
15080 + case VM_FAULT_OOM:
15081 + goto out_of_memory;
15082 + default:
15083 + BUG();
15084 + }
15085 +
15086 + /*
15087 + * Did it hit the DOS screen memory VA from vm86 mode?
15088 + */
15089 + if (regs->eflags & VM_MASK) {
15090 + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
15091 + if (bit < 32)
15092 + tsk->thread.screen_bitmap |= 1 << bit;
15093 + }
15094 + up_read(&mm->mmap_sem);
15095 + return;
15096 +
15097 +/*
15098 + * Something tried to access memory that isn't in our memory map..
15099 + * Fix it, but check if it's kernel or user first..
15100 + */
15101 +bad_area:
15102 + up_read(&mm->mmap_sem);
15103 +
15104 +bad_area_nosemaphore:
15105 + /* User mode accesses just cause a SIGSEGV */
15106 + if (error_code & 4) {
15107 + /*
15108 + * Valid to do another page fault here because this one came
15109 + * from user space.
15110 + */
15111 + if (is_prefetch(regs, address, error_code))
15112 + return;
15113 +
15114 + tsk->thread.cr2 = address;
15115 + /* Kernel addresses are always protection faults */
15116 + tsk->thread.error_code = error_code | (address >= TASK_SIZE);
15117 + tsk->thread.trap_no = 14;
15118 + force_sig_info_fault(SIGSEGV, si_code, address, tsk);
15119 + return;
15120 + }
15121 +
15122 +#ifdef CONFIG_X86_F00F_BUG
15123 + /*
15124 + * Pentium F0 0F C7 C8 bug workaround.
15125 + */
15126 + if (boot_cpu_data.f00f_bug) {
15127 + unsigned long nr;
15128 +
15129 + nr = (address - idt_descr.address) >> 3;
15130 +
15131 + if (nr == 6) {
15132 + do_invalid_op(regs, 0);
15133 + return;
15134 + }
15135 + }
15136 +#endif
15137 +
15138 +no_context:
15139 + /* Are we prepared to handle this kernel fault? */
15140 + if (fixup_exception(regs))
15141 + return;
15142 +
15143 + /*
15144 + * Valid to do another page fault here, because if this fault
15145 + * had been triggered by is_prefetch fixup_exception would have
15146 + * handled it.
15147 + */
15148 + if (is_prefetch(regs, address, error_code))
15149 + return;
15150 +
15151 +/*
15152 + * Oops. The kernel tried to access some bad page. We'll have to
15153 + * terminate things with extreme prejudice.
15154 + */
15155 +
15156 + bust_spinlocks(1);
15157 +
15158 + if (oops_may_print()) {
15159 + #ifdef CONFIG_X86_PAE
15160 + if (error_code & 16) {
15161 + pte_t *pte = lookup_address(address);
15162 +
15163 + if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
15164 + printk(KERN_CRIT "kernel tried to execute "
15165 + "NX-protected page - exploit attempt? "
15166 + "(uid: %d)\n", current->uid);
15167 + }
15168 + #endif
15169 + if (address < PAGE_SIZE)
15170 + printk(KERN_ALERT "BUG: unable to handle kernel NULL "
15171 + "pointer dereference");
15172 + else
15173 + printk(KERN_ALERT "BUG: unable to handle kernel paging"
15174 + " request");
15175 + printk(" at virtual address %08lx\n",address);
15176 + printk(KERN_ALERT " printing eip:\n");
15177 + printk("%08lx\n", regs->eip);
15178 + }
15179 + dump_fault_path(address);
15180 + tsk->thread.cr2 = address;
15181 + tsk->thread.trap_no = 14;
15182 + tsk->thread.error_code = error_code;
15183 + die("Oops", regs, error_code);
15184 + bust_spinlocks(0);
15185 + do_exit(SIGKILL);
15186 +
15187 +/*
15188 + * We ran out of memory, or some other thing happened to us that made
15189 + * us unable to handle the page fault gracefully.
15190 + */
15191 +out_of_memory:
15192 + up_read(&mm->mmap_sem);
15193 + if (tsk->pid == 1) {
15194 + yield();
15195 + down_read(&mm->mmap_sem);
15196 + goto survive;
15197 + }
15198 + printk("VM: killing process %s\n", tsk->comm);
15199 + if (error_code & 4)
15200 + do_exit(SIGKILL);
15201 + goto no_context;
15202 +
15203 +do_sigbus:
15204 + up_read(&mm->mmap_sem);
15205 +
15206 + /* Kernel mode? Handle exceptions or die */
15207 + if (!(error_code & 4))
15208 + goto no_context;
15209 +
15210 + /* User space => ok to do another page fault */
15211 + if (is_prefetch(regs, address, error_code))
15212 + return;
15213 +
15214 + tsk->thread.cr2 = address;
15215 + tsk->thread.error_code = error_code;
15216 + tsk->thread.trap_no = 14;
15217 + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
15218 +}
15219 +
15220 +#if !HAVE_SHARED_KERNEL_PMD
15221 +void vmalloc_sync_all(void)
15222 +{
15223 + /*
15224 + * Note that races in the updates of insync and start aren't
15225 + * problematic: insync can only get set bits added, and updates to
15226 + * start are only improving performance (without affecting correctness
15227 + * if undone).
15228 + * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
15229 + * This change works just fine with 2-level paging too.
15230 + */
15231 +#define sync_index(a) ((a) >> PMD_SHIFT)
15232 + static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
15233 + static unsigned long start = TASK_SIZE;
15234 + unsigned long address;
15235 +
15236 + BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
15237 + for (address = start;
15238 + address >= TASK_SIZE && address < hypervisor_virt_start;
15239 + address += 1UL << PMD_SHIFT) {
15240 + if (!test_bit(sync_index(address), insync)) {
15241 + unsigned long flags;
15242 + struct page *page;
15243 +
15244 + spin_lock_irqsave(&pgd_lock, flags);
15245 + /* XEN: failure path assumes non-empty pgd_list. */
15246 + if (unlikely(!pgd_list)) {
15247 + spin_unlock_irqrestore(&pgd_lock, flags);
15248 + return;
15249 + }
15250 + for (page = pgd_list; page; page =
15251 + (struct page *)page->index)
15252 + if (!vmalloc_sync_one(page_address(page),
15253 + address)) {
15254 + BUG_ON(page != pgd_list);
15255 + break;
15256 + }
15257 + spin_unlock_irqrestore(&pgd_lock, flags);
15258 + if (!page)
15259 + set_bit(sync_index(address), insync);
15260 + }
15261 + if (address == start && test_bit(sync_index(address), insync))
15262 + start = address + (1UL << PMD_SHIFT);
15263 + }
15264 +}
15265 +#endif
15266 Index: head-2008-11-25/arch/x86/mm/highmem_32-xen.c
15267 ===================================================================
15268 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
15269 +++ head-2008-11-25/arch/x86/mm/highmem_32-xen.c 2008-10-29 09:55:56.000000000 +0100
15270 @@ -0,0 +1,183 @@
15271 +#include <linux/highmem.h>
15272 +#include <linux/module.h>
15273 +
15274 +void *kmap(struct page *page)
15275 +{
15276 + might_sleep();
15277 + if (!PageHighMem(page))
15278 + return page_address(page);
15279 + return kmap_high(page);
15280 +}
15281 +
15282 +void kunmap(struct page *page)
15283 +{
15284 + if (in_interrupt())
15285 + BUG();
15286 + if (!PageHighMem(page))
15287 + return;
15288 + kunmap_high(page);
15289 +}
15290 +
15291 +/*
15292 + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
15293 + * no global lock is needed and because the kmap code must perform a global TLB
15294 + * invalidation when the kmap pool wraps.
15295 + *
15296 + * However when holding an atomic kmap it is not legal to sleep, so atomic
15297 + * kmaps are appropriate for short, tight code paths only.
15298 + */
15299 +static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
15300 +{
15301 + enum fixed_addresses idx;
15302 + unsigned long vaddr;
15303 +
15304 + /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
15305 + inc_preempt_count();
15306 + if (!PageHighMem(page))
15307 + return page_address(page);
15308 +
15309 + idx = type + KM_TYPE_NR*smp_processor_id();
15310 + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
15311 +#ifdef CONFIG_DEBUG_HIGHMEM
15312 + if (!pte_none(*(kmap_pte-idx)))
15313 + BUG();
15314 +#endif
15315 + set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
15316 +
15317 + return (void*) vaddr;
15318 +}
15319 +
15320 +void *kmap_atomic(struct page *page, enum km_type type)
15321 +{
15322 + return __kmap_atomic(page, type, kmap_prot);
15323 +}
15324 +
15325 +/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
15326 +void *kmap_atomic_pte(struct page *page, enum km_type type)
15327 +{
15328 + return __kmap_atomic(page, type,
15329 + test_bit(PG_pinned, &page->flags)
15330 + ? PAGE_KERNEL_RO : kmap_prot);
15331 +}
15332 +
15333 +void kunmap_atomic(void *kvaddr, enum km_type type)
15334 +{
15335 +#if defined(CONFIG_DEBUG_HIGHMEM) || defined(CONFIG_XEN)
15336 + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
15337 + enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
15338 +
15339 + if (vaddr < FIXADDR_START) { // FIXME
15340 + dec_preempt_count();
15341 + preempt_check_resched();
15342 + return;
15343 + }
15344 +#endif
15345 +
15346 +#if defined(CONFIG_DEBUG_HIGHMEM)
15347 + if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
15348 + BUG();
15349 +
15350 + /*
15351 + * force other mappings to Oops if they try to access
15352 + * this pte without first remapping it
15353 + */
15354 + pte_clear(&init_mm, vaddr, kmap_pte-idx);
15355 + __flush_tlb_one(vaddr);
15356 +#elif defined(CONFIG_XEN)
15357 + /*
15358 + * We must ensure there are no dangling pagetable references when
15359 + * returning memory to Xen (decrease_reservation).
15360 + * XXX TODO: We could make this faster by only zapping when
15361 + * kmap_flush_unused is called but that is trickier and more invasive.
15362 + */
15363 + pte_clear(&init_mm, vaddr, kmap_pte-idx);
15364 +#endif
15365 +
15366 + dec_preempt_count();
15367 + preempt_check_resched();
15368 +}
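
__kmap_atomic() above picks a per-CPU fixmap slot from the kmap type and the CPU number. A self-contained sketch of that slot arithmetic; FIXADDR_TOP, KM_TYPE_NR and FIX_KMAP_BEGIN are illustrative stand-ins for the real constants:

#include <stdio.h>

#define FIXADDR_TOP    0xfffff000UL              /* illustrative value */
#define PAGE_SHIFT     12
#define __fix_to_virt(x) (FIXADDR_TOP - ((unsigned long)(x) << PAGE_SHIFT))
#define KM_TYPE_NR     14                        /* illustrative slots per CPU */
#define FIX_KMAP_BEGIN 32                        /* illustrative fixmap index */

int main(void)
{
	int type = 2, cpu = 1;                   /* hypothetical kmap slot/CPU */
	unsigned long idx = type + KM_TYPE_NR * cpu;
	unsigned long vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);

	/* Each CPU owns its own window of KM_TYPE_NR page-sized slots. */
	printf("cpu %d, type %d -> fixmap vaddr 0x%lx\n", cpu, type, vaddr);
	return 0;
}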
15369 +
15370 +/* This is the same as kmap_atomic() but can map memory that doesn't
15371 + * have a struct page associated with it.
15372 + */
15373 +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
15374 +{
15375 + enum fixed_addresses idx;
15376 + unsigned long vaddr;
15377 +
15378 + inc_preempt_count();
15379 +
15380 + idx = type + KM_TYPE_NR*smp_processor_id();
15381 + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
15382 + set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
15383 + __flush_tlb_one(vaddr);
15384 +
15385 + return (void*) vaddr;
15386 +}
15387 +
15388 +struct page *kmap_atomic_to_page(void *ptr)
15389 +{
15390 + unsigned long idx, vaddr = (unsigned long)ptr;
15391 + pte_t *pte;
15392 +
15393 + if (vaddr < FIXADDR_START)
15394 + return virt_to_page(ptr);
15395 +
15396 + idx = virt_to_fix(vaddr);
15397 + pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
15398 + return pte_page(*pte);
15399 +}
15400 +
15401 +void clear_highpage(struct page *page)
15402 +{
15403 + void *kaddr;
15404 +
15405 + if (likely(xen_feature(XENFEAT_highmem_assist))
15406 + && PageHighMem(page)) {
15407 + struct mmuext_op meo;
15408 +
15409 + meo.cmd = MMUEXT_CLEAR_PAGE;
15410 + meo.arg1.mfn = pfn_to_mfn(page_to_pfn(page));
15411 + if (HYPERVISOR_mmuext_op(&meo, 1, NULL, DOMID_SELF) == 0)
15412 + return;
15413 + }
15414 +
15415 + kaddr = kmap_atomic(page, KM_USER0);
15416 + clear_page(kaddr);
15417 + kunmap_atomic(kaddr, KM_USER0);
15418 +}
15419 +
15420 +void copy_highpage(struct page *to, struct page *from)
15421 +{
15422 + void *vfrom, *vto;
15423 +
15424 + if (likely(xen_feature(XENFEAT_highmem_assist))
15425 + && (PageHighMem(from) || PageHighMem(to))) {
15426 + unsigned long from_pfn = page_to_pfn(from);
15427 + unsigned long to_pfn = page_to_pfn(to);
15428 + struct mmuext_op meo;
15429 +
15430 + meo.cmd = MMUEXT_COPY_PAGE;
15431 + meo.arg1.mfn = pfn_to_mfn(to_pfn);
15432 + meo.arg2.src_mfn = pfn_to_mfn(from_pfn);
15433 + if (mfn_to_pfn(meo.arg2.src_mfn) == from_pfn
15434 + && mfn_to_pfn(meo.arg1.mfn) == to_pfn
15435 + && HYPERVISOR_mmuext_op(&meo, 1, NULL, DOMID_SELF) == 0)
15436 + return;
15437 + }
15438 +
15439 + vfrom = kmap_atomic(from, KM_USER0);
15440 + vto = kmap_atomic(to, KM_USER1);
15441 + copy_page(vto, vfrom);
15442 + kunmap_atomic(vfrom, KM_USER0);
15443 + kunmap_atomic(vto, KM_USER1);
15444 +}
15445 +
15446 +EXPORT_SYMBOL(kmap);
15447 +EXPORT_SYMBOL(kunmap);
15448 +EXPORT_SYMBOL(kmap_atomic);
15449 +EXPORT_SYMBOL(kmap_atomic_pte);
15450 +EXPORT_SYMBOL(kunmap_atomic);
15451 +EXPORT_SYMBOL(kmap_atomic_to_page);
15452 +EXPORT_SYMBOL(clear_highpage);
15453 +EXPORT_SYMBOL(copy_highpage);
15454 Index: head-2008-11-25/arch/x86/mm/hypervisor.c
15455 ===================================================================
15456 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
15457 +++ head-2008-11-25/arch/x86/mm/hypervisor.c 2008-10-29 09:55:56.000000000 +0100
15458 @@ -0,0 +1,547 @@
15459 +/******************************************************************************
15460 + * mm/hypervisor.c
15461 + *
15462 + * Update page tables via the hypervisor.
15463 + *
15464 + * Copyright (c) 2002-2004, K A Fraser
15465 + *
15466 + * This program is free software; you can redistribute it and/or
15467 + * modify it under the terms of the GNU General Public License version 2
15468 + * as published by the Free Software Foundation; or, when distributed
15469 + * separately from the Linux kernel or incorporated into other
15470 + * software packages, subject to the following license:
15471 + *
15472 + * Permission is hereby granted, free of charge, to any person obtaining a copy
15473 + * of this source file (the "Software"), to deal in the Software without
15474 + * restriction, including without limitation the rights to use, copy, modify,
15475 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
15476 + * and to permit persons to whom the Software is furnished to do so, subject to
15477 + * the following conditions:
15478 + *
15479 + * The above copyright notice and this permission notice shall be included in
15480 + * all copies or substantial portions of the Software.
15481 + *
15482 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15483 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15484 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15485 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15486 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
15487 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
15488 + * IN THE SOFTWARE.
15489 + */
15490 +
15491 +#include <linux/sched.h>
15492 +#include <linux/mm.h>
15493 +#include <linux/vmalloc.h>
15494 +#include <asm/page.h>
15495 +#include <asm/pgtable.h>
15496 +#include <asm/hypervisor.h>
15497 +#include <xen/balloon.h>
15498 +#include <xen/features.h>
15499 +#include <xen/interface/memory.h>
15500 +#include <linux/module.h>
15501 +#include <linux/percpu.h>
15502 +#include <asm/tlbflush.h>
15503 +#include <linux/highmem.h>
15504 +
15505 +void xen_l1_entry_update(pte_t *ptr, pte_t val)
15506 +{
15507 + mmu_update_t u;
15508 +#ifdef CONFIG_HIGHPTE
15509 + u.ptr = ((unsigned long)ptr >= (unsigned long)high_memory) ?
15510 + arbitrary_virt_to_machine(ptr) : virt_to_machine(ptr);
15511 +#else
15512 + u.ptr = virt_to_machine(ptr);
15513 +#endif
15514 + u.val = __pte_val(val);
15515 + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
15516 +}
15517 +EXPORT_SYMBOL_GPL(xen_l1_entry_update);
15518 +
15519 +void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
15520 +{
15521 + mmu_update_t u;
15522 + u.ptr = virt_to_machine(ptr);
15523 + u.val = __pmd_val(val);
15524 + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
15525 +}
15526 +
15527 +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
15528 +void xen_l3_entry_update(pud_t *ptr, pud_t val)
15529 +{
15530 + mmu_update_t u;
15531 + u.ptr = virt_to_machine(ptr);
15532 + u.val = __pud_val(val);
15533 + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
15534 +}
15535 +#endif
15536 +
15537 +#ifdef CONFIG_X86_64
15538 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
15539 +{
15540 + mmu_update_t u;
15541 + u.ptr = virt_to_machine(ptr);
15542 + u.val = __pgd_val(val);
15543 + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
15544 +}
15545 +#endif /* CONFIG_X86_64 */
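
Each of the xen_lN_entry_update() helpers above fills one (ptr, val) pair: ptr is the machine address of the page-table entry to change and val its new contents. A hedged user-space model of that request layout (the structure mirrors the public mmu_update interface, but the addresses and flag bits here are invented):

#include <stdio.h>
#include <stdint.h>

/* One request per page-table entry, as in the public Xen mmu_update ABI. */
typedef struct { uint64_t ptr; uint64_t val; } mmu_update_sketch_t;

int main(void)
{
	mmu_update_sketch_t u;

	u.ptr = 0x12345000ULL;                   /* machine address of the PTE */
	u.val = 0x67890000ULL | 0x67;            /* new PTE: frame | flag bits */
	printf("mmu_update: ptr=0x%llx val=0x%llx\n",
	       (unsigned long long)u.ptr, (unsigned long long)u.val);
	return 0;
}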
15546 +
15547 +void xen_pt_switch(unsigned long ptr)
15548 +{
15549 + struct mmuext_op op;
15550 + op.cmd = MMUEXT_NEW_BASEPTR;
15551 + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
15552 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15553 +}
15554 +
15555 +void xen_new_user_pt(unsigned long ptr)
15556 +{
15557 + struct mmuext_op op;
15558 + op.cmd = MMUEXT_NEW_USER_BASEPTR;
15559 + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
15560 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15561 +}
15562 +
15563 +void xen_tlb_flush(void)
15564 +{
15565 + struct mmuext_op op;
15566 + op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
15567 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15568 +}
15569 +EXPORT_SYMBOL(xen_tlb_flush);
15570 +
15571 +void xen_invlpg(unsigned long ptr)
15572 +{
15573 + struct mmuext_op op;
15574 + op.cmd = MMUEXT_INVLPG_LOCAL;
15575 + op.arg1.linear_addr = ptr & PAGE_MASK;
15576 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15577 +}
15578 +EXPORT_SYMBOL(xen_invlpg);
15579 +
15580 +#ifdef CONFIG_SMP
15581 +
15582 +void xen_tlb_flush_all(void)
15583 +{
15584 + struct mmuext_op op;
15585 + op.cmd = MMUEXT_TLB_FLUSH_ALL;
15586 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15587 +}
15588 +
15589 +void xen_tlb_flush_mask(cpumask_t *mask)
15590 +{
15591 + struct mmuext_op op;
15592 + if ( cpus_empty(*mask) )
15593 + return;
15594 + op.cmd = MMUEXT_TLB_FLUSH_MULTI;
15595 + set_xen_guest_handle(op.arg2.vcpumask, mask->bits);
15596 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15597 +}
15598 +
15599 +void xen_invlpg_all(unsigned long ptr)
15600 +{
15601 + struct mmuext_op op;
15602 + op.cmd = MMUEXT_INVLPG_ALL;
15603 + op.arg1.linear_addr = ptr & PAGE_MASK;
15604 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15605 +}
15606 +
15607 +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr)
15608 +{
15609 + struct mmuext_op op;
15610 + if ( cpus_empty(*mask) )
15611 + return;
15612 + op.cmd = MMUEXT_INVLPG_MULTI;
15613 + op.arg1.linear_addr = ptr & PAGE_MASK;
15614 + set_xen_guest_handle(op.arg2.vcpumask, mask->bits);
15615 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15616 +}
15617 +
15618 +#endif /* CONFIG_SMP */
15619 +
15620 +void xen_pgd_pin(unsigned long ptr)
15621 +{
15622 + struct mmuext_op op;
15623 +#ifdef CONFIG_X86_64
15624 + op.cmd = MMUEXT_PIN_L4_TABLE;
15625 +#elif defined(CONFIG_X86_PAE)
15626 + op.cmd = MMUEXT_PIN_L3_TABLE;
15627 +#else
15628 + op.cmd = MMUEXT_PIN_L2_TABLE;
15629 +#endif
15630 + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
15631 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15632 +}
15633 +
15634 +void xen_pgd_unpin(unsigned long ptr)
15635 +{
15636 + struct mmuext_op op;
15637 + op.cmd = MMUEXT_UNPIN_TABLE;
15638 + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
15639 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15640 +}
15641 +
15642 +void xen_set_ldt(const void *ptr, unsigned int ents)
15643 +{
15644 + struct mmuext_op op;
15645 + op.cmd = MMUEXT_SET_LDT;
15646 + op.arg1.linear_addr = (unsigned long)ptr;
15647 + op.arg2.nr_ents = ents;
15648 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
15649 +}
15650 +
15651 +/* Protected by balloon_lock. */
15652 +#define MAX_CONTIG_ORDER 9 /* 2MB */
15653 +static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
15654 +static unsigned long limited_frames[1<<MAX_CONTIG_ORDER];
15655 +static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER];
15656 +
15657 +/* Ensure multi-page extents are contiguous in machine memory. */
15658 +int xen_create_contiguous_region(
15659 + unsigned long vstart, unsigned int order, unsigned int address_bits)
15660 +{
15661 + unsigned long *in_frames = discontig_frames, out_frame;
15662 + unsigned long frame, flags;
15663 + unsigned int i;
15664 + int rc, success;
15665 + struct xen_memory_exchange exchange = {
15666 + .in = {
15667 + .nr_extents = 1UL << order,
15668 + .extent_order = 0,
15669 + .domid = DOMID_SELF
15670 + },
15671 + .out = {
15672 + .nr_extents = 1,
15673 + .extent_order = order,
15674 + .address_bits = address_bits,
15675 + .domid = DOMID_SELF
15676 + }
15677 + };
15678 +
15679 + /*
15680 + * Currently an auto-translated guest will not perform I/O, nor will
15681 + * it require PAE page directories below 4GB. Therefore any calls to
15682 + * this function are redundant and can be ignored.
15683 + */
15684 + if (xen_feature(XENFEAT_auto_translated_physmap))
15685 + return 0;
15686 +
15687 + if (unlikely(order > MAX_CONTIG_ORDER))
15688 + return -ENOMEM;
15689 +
15690 + set_xen_guest_handle(exchange.in.extent_start, in_frames);
15691 + set_xen_guest_handle(exchange.out.extent_start, &out_frame);
15692 +
15693 + scrub_pages((void *)vstart, 1 << order);
15694 +
15695 + balloon_lock(flags);
15696 +
15697 + /* 1. Zap current PTEs, remembering MFNs. */
15698 + for (i = 0; i < (1U<<order); i++) {
15699 + in_frames[i] = pfn_to_mfn((__pa(vstart) >> PAGE_SHIFT) + i);
15700 + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
15701 + __pte_ma(0), 0);
15702 + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
15703 + INVALID_P2M_ENTRY);
15704 + }
15705 + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
15706 + BUG();
15707 +
15708 + /* 2. Get a new contiguous memory extent. */
15709 + out_frame = __pa(vstart) >> PAGE_SHIFT;
15710 + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
15711 + success = (exchange.nr_exchanged == (1UL << order));
15712 + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
15713 + BUG_ON(success && (rc != 0));
15714 +#if CONFIG_XEN_COMPAT <= 0x030002
15715 + if (unlikely(rc == -ENOSYS)) {
15716 + /* Compatibility when XENMEM_exchange is unsupported. */
15717 + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
15718 + &exchange.in) != (1UL << order))
15719 + BUG();
15720 + success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
15721 + &exchange.out) == 1);
15722 + if (!success) {
15723 + /* Couldn't get special memory: fall back to normal. */
15724 + for (i = 0; i < (1U<<order); i++)
15725 + in_frames[i] = (__pa(vstart)>>PAGE_SHIFT) + i;
15726 + if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
15727 + &exchange.in) != (1UL<<order))
15728 + BUG();
15729 + }
15730 + }
15731 +#endif
15732 +
15733 + /* 3. Map the new extent in place of old pages. */
15734 + for (i = 0; i < (1U<<order); i++) {
15735 + frame = success ? (out_frame + i) : in_frames[i];
15736 + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
15737 + pfn_pte_ma(frame, PAGE_KERNEL), 0);
15738 + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
15739 + }
15740 +
15741 + cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
15742 + ? UVMF_TLB_FLUSH|UVMF_ALL
15743 + : UVMF_INVLPG|UVMF_ALL;
15744 + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
15745 + BUG();
15746 +
15747 + balloon_unlock(flags);
15748 +
15749 + return success ? 0 : -ENOMEM;
15750 +}
15751 +EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
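
The exchange above trades 2^order single-page extents for one order-sized extent, so success is judged by nr_exchanged reaching 2^order. A small sketch of that sizing with an invented order:

#include <stdio.h>

int main(void)
{
	unsigned int order = 3;                      /* hypothetical 32 KB region */
	unsigned long in_extents  = 1UL << order;    /* hand back 8 single pages */
	unsigned long out_extents = 1;               /* receive 1 order-3 extent */

	printf("exchange: give %lu pages, expect %lu extent of order %u\n",
	       in_extents, out_extents, order);
	printf("success when nr_exchanged == %lu\n", in_extents);
	return 0;
}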
15752 +
15753 +void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
15754 +{
15755 + unsigned long *out_frames = discontig_frames, in_frame;
15756 + unsigned long frame, flags;
15757 + unsigned int i;
15758 + int rc, success;
15759 + struct xen_memory_exchange exchange = {
15760 + .in = {
15761 + .nr_extents = 1,
15762 + .extent_order = order,
15763 + .domid = DOMID_SELF
15764 + },
15765 + .out = {
15766 + .nr_extents = 1UL << order,
15767 + .extent_order = 0,
15768 + .domid = DOMID_SELF
15769 + }
15770 + };
15771 +
15772 + if (xen_feature(XENFEAT_auto_translated_physmap))
15773 + return;
15774 +
15775 + if (unlikely(order > MAX_CONTIG_ORDER))
15776 + return;
15777 +
15778 + set_xen_guest_handle(exchange.in.extent_start, &in_frame);
15779 + set_xen_guest_handle(exchange.out.extent_start, out_frames);
15780 +
15781 + scrub_pages((void *)vstart, 1 << order);
15782 +
15783 + balloon_lock(flags);
15784 +
15785 + /* 1. Find start MFN of contiguous extent. */
15786 + in_frame = pfn_to_mfn(__pa(vstart) >> PAGE_SHIFT);
15787 +
15788 + /* 2. Zap current PTEs. */
15789 + for (i = 0; i < (1U<<order); i++) {
15790 + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
15791 + __pte_ma(0), 0);
15792 + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
15793 + INVALID_P2M_ENTRY);
15794 + out_frames[i] = (__pa(vstart) >> PAGE_SHIFT) + i;
15795 + }
15796 + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
15797 + BUG();
15798 +
15799 + /* 3. Do the exchange for non-contiguous MFNs. */
15800 + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
15801 + success = (exchange.nr_exchanged == 1);
15802 + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
15803 + BUG_ON(success && (rc != 0));
15804 +#if CONFIG_XEN_COMPAT <= 0x030002
15805 + if (unlikely(rc == -ENOSYS)) {
15806 + /* Compatibility when XENMEM_exchange is unsupported. */
15807 + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
15808 + &exchange.in) != 1)
15809 + BUG();
15810 + if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
15811 + &exchange.out) != (1UL << order))
15812 + BUG();
15813 + success = 1;
15814 + }
15815 +#endif
15816 +
15817 + /* 4. Map new pages in place of old pages. */
15818 + for (i = 0; i < (1U<<order); i++) {
15819 + frame = success ? out_frames[i] : (in_frame + i);
15820 + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
15821 + pfn_pte_ma(frame, PAGE_KERNEL), 0);
15822 + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
15823 + }
15824 +
15825 + cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
15826 + ? UVMF_TLB_FLUSH|UVMF_ALL
15827 + : UVMF_INVLPG|UVMF_ALL;
15828 + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
15829 + BUG();
15830 +
15831 + balloon_unlock(flags);
15832 +}
15833 +EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
15834 +
15835 +int xen_limit_pages_to_max_mfn(
15836 + struct page *pages, unsigned int order, unsigned int address_bits)
15837 +{
15838 + unsigned long flags, frame;
15839 + unsigned long *in_frames = discontig_frames, *out_frames = limited_frames;
15840 + struct page *page;
15841 + unsigned int i, n, nr_mcl;
15842 + int rc, success;
15843 + DECLARE_BITMAP(limit_map, 1 << MAX_CONTIG_ORDER);
15844 +
15845 + struct xen_memory_exchange exchange = {
15846 + .in = {
15847 + .extent_order = 0,
15848 + .domid = DOMID_SELF
15849 + },
15850 + .out = {
15851 + .extent_order = 0,
15852 + .address_bits = address_bits,
15853 + .domid = DOMID_SELF
15854 + }
15855 + };
15856 +
15857 + if (xen_feature(XENFEAT_auto_translated_physmap))
15858 + return 0;
15859 +
15860 + if (unlikely(order > MAX_CONTIG_ORDER))
15861 + return -ENOMEM;
15862 +
15863 + bitmap_zero(limit_map, 1U << order);
15864 + set_xen_guest_handle(exchange.in.extent_start, in_frames);
15865 + set_xen_guest_handle(exchange.out.extent_start, out_frames);
15866 +
15867 + /* 0. Scrub the pages. */
15868 + for (i = 0, n = 0; i < 1U<<order ; i++) {
15869 + page = &pages[i];
15870 + if (!(pfn_to_mfn(page_to_pfn(page)) >> (address_bits - PAGE_SHIFT)))
15871 + continue;
15872 + __set_bit(i, limit_map);
15873 +
15874 + if (!PageHighMem(page))
15875 + scrub_pages(page_address(page), 1);
15876 +#ifdef CONFIG_XEN_SCRUB_PAGES
15877 + else {
15878 + scrub_pages(kmap(page), 1);
15879 + kunmap(page);
15880 + ++n;
15881 + }
15882 +#endif
15883 + }
15884 + if (bitmap_empty(limit_map, 1U << order))
15885 + return 0;
15886 +
15887 + if (n)
15888 + kmap_flush_unused();
15889 +
15890 + balloon_lock(flags);
15891 +
15892 + /* 1. Zap current PTEs (if any), remembering MFNs. */
15893 + for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
15894 + if(!test_bit(i, limit_map))
15895 + continue;
15896 + page = &pages[i];
15897 +
15898 + out_frames[n] = page_to_pfn(page);
15899 + in_frames[n] = pfn_to_mfn(out_frames[n]);
15900 +
15901 + if (!PageHighMem(page))
15902 + MULTI_update_va_mapping(cr_mcl + nr_mcl++,
15903 + (unsigned long)page_address(page),
15904 + __pte_ma(0), 0);
15905 +
15906 + set_phys_to_machine(out_frames[n], INVALID_P2M_ENTRY);
15907 + ++n;
15908 + }
15909 + if (nr_mcl && HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
15910 + BUG();
15911 +
15912 + /* 2. Get new memory below the required limit. */
15913 + exchange.in.nr_extents = n;
15914 + exchange.out.nr_extents = n;
15915 + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
15916 + success = (exchange.nr_exchanged == n);
15917 + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
15918 + BUG_ON(success && (rc != 0));
15919 +#if CONFIG_XEN_COMPAT <= 0x030002
15920 + if (unlikely(rc == -ENOSYS)) {
15921 + /* Compatibility when XENMEM_exchange is unsupported. */
15922 + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
15923 + &exchange.in) != n)
15924 + BUG();
15925 + if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
15926 + &exchange.out) != n)
15927 + BUG();
15928 + success = 1;
15929 + }
15930 +#endif
15931 +
15932 + /* 3. Map the new pages in place of old pages. */
15933 + for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
15934 + if(!test_bit(i, limit_map))
15935 + continue;
15936 + page = &pages[i];
15937 +
15938 + frame = success ? out_frames[n] : in_frames[n];
15939 +
15940 + if (!PageHighMem(page))
15941 + MULTI_update_va_mapping(cr_mcl + nr_mcl++,
15942 + (unsigned long)page_address(page),
15943 + pfn_pte_ma(frame, PAGE_KERNEL), 0);
15944 +
15945 + set_phys_to_machine(page_to_pfn(page), frame);
15946 + ++n;
15947 + }
15948 + if (nr_mcl) {
15949 + cr_mcl[nr_mcl - 1].args[MULTI_UVMFLAGS_INDEX] = order
15950 + ? UVMF_TLB_FLUSH|UVMF_ALL
15951 + : UVMF_INVLPG|UVMF_ALL;
15952 + if (HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
15953 + BUG();
15954 + }
15955 +
15956 + balloon_unlock(flags);
15957 +
15958 + return success ? 0 : -ENOMEM;
15959 +}
15960 +EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn);
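
xen_limit_pages_to_max_mfn() above only exchanges frames whose machine address would not fit under the requested address_bits. A hedged sketch of that test with an invented MFN and a 32-bit limit:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned int address_bits = 32;          /* e.g. a 32-bit DMA mask */
	unsigned long mfn = 0x123456UL;          /* hypothetical machine frame */

	/* Frames with bits set above (address_bits - PAGE_SHIFT) need replacing. */
	int needs_exchange = (mfn >> (address_bits - PAGE_SHIFT)) != 0;
	printf("mfn 0x%lx %s the %u-bit limit\n", mfn,
	       needs_exchange ? "exceeds" : "fits within", address_bits);
	return 0;
}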
15961 +
15962 +#ifdef __i386__
15963 +int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
15964 +{
15965 + __u32 *lp = (__u32 *)((char *)ldt + entry * 8);
15966 + maddr_t mach_lp = arbitrary_virt_to_machine(lp);
15967 + return HYPERVISOR_update_descriptor(
15968 + mach_lp, (u64)entry_a | ((u64)entry_b<<32));
15969 +}
15970 +#endif
15971 +
15972 +#define MAX_BATCHED_FULL_PTES 32
15973 +
15974 +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
15975 + unsigned long addr, unsigned long end, pgprot_t newprot)
15976 +{
15977 + int rc = 0, i = 0;
15978 + mmu_update_t u[MAX_BATCHED_FULL_PTES];
15979 + pte_t *pte;
15980 + spinlock_t *ptl;
15981 +
15982 + if (!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))
15983 + return 0;
15984 +
15985 + pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
15986 + do {
15987 + if (pte_present(*pte)) {
15988 + u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
15989 + | ((unsigned long)pte & ~PAGE_MASK)
15990 + | MMU_PT_UPDATE_PRESERVE_AD;
15991 + u[i].val = __pte_val(pte_modify(*pte, newprot));
15992 + if (++i == MAX_BATCHED_FULL_PTES) {
15993 + if ((rc = HYPERVISOR_mmu_update(
15994 + &u[0], i, NULL, DOMID_SELF)) != 0)
15995 + break;
15996 + i = 0;
15997 + }
15998 + }
15999 + } while (pte++, addr += PAGE_SIZE, addr != end);
16000 + if (i)
16001 + rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF);
16002 + pte_unmap_unlock(pte - 1, ptl);
16003 + BUG_ON(rc && rc != -ENOSYS);
16004 + return !rc;
16005 +}
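
xen_change_pte_range() above batches up to MAX_BATCHED_FULL_PTES updates per hypercall, flushing whenever the batch fills and once more for the partial tail. A self-contained model of that batching pattern; flush_batch() is a stub standing in for HYPERVISOR_mmu_update and the PTE count is invented:

#include <stdio.h>

#define MAX_BATCH 32

/* Stub standing in for the hypercall that applies a batch of updates. */
static int flush_batch(int n)
{
	printf("flushing %d updates\n", n);
	return 0;
}

int main(void)
{
	int i = 0, total = 70;                   /* hypothetical number of PTEs */

	for (int pte = 0; pte < total; pte++) {
		if (++i == MAX_BATCH) {          /* batch full: apply it now */
			flush_batch(i);
			i = 0;
		}
	}
	if (i)                                   /* apply the partial tail batch */
		flush_batch(i);
	return 0;
}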
16006 Index: head-2008-11-25/arch/x86/mm/init_32-xen.c
16007 ===================================================================
16008 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
16009 +++ head-2008-11-25/arch/x86/mm/init_32-xen.c 2008-10-29 09:55:56.000000000 +0100
16010 @@ -0,0 +1,840 @@
16011 +/*
16012 + * linux/arch/i386/mm/init.c
16013 + *
16014 + * Copyright (C) 1995 Linus Torvalds
16015 + *
16016 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
16017 + */
16018 +
16019 +#include <linux/module.h>
16020 +#include <linux/signal.h>
16021 +#include <linux/sched.h>
16022 +#include <linux/kernel.h>
16023 +#include <linux/errno.h>
16024 +#include <linux/string.h>
16025 +#include <linux/types.h>
16026 +#include <linux/ptrace.h>
16027 +#include <linux/mman.h>
16028 +#include <linux/mm.h>
16029 +#include <linux/hugetlb.h>
16030 +#include <linux/swap.h>
16031 +#include <linux/smp.h>
16032 +#include <linux/init.h>
16033 +#include <linux/highmem.h>
16034 +#include <linux/pagemap.h>
16035 +#include <linux/poison.h>
16036 +#include <linux/bootmem.h>
16037 +#include <linux/slab.h>
16038 +#include <linux/proc_fs.h>
16039 +#include <linux/efi.h>
16040 +#include <linux/memory_hotplug.h>
16041 +#include <linux/initrd.h>
16042 +#include <linux/cpumask.h>
16043 +#include <linux/dma-mapping.h>
16044 +#include <linux/scatterlist.h>
16045 +
16046 +#include <asm/processor.h>
16047 +#include <asm/system.h>
16048 +#include <asm/uaccess.h>
16049 +#include <asm/pgtable.h>
16050 +#include <asm/dma.h>
16051 +#include <asm/fixmap.h>
16052 +#include <asm/e820.h>
16053 +#include <asm/apic.h>
16054 +#include <asm/tlb.h>
16055 +#include <asm/tlbflush.h>
16056 +#include <asm/sections.h>
16057 +#include <asm/hypervisor.h>
16058 +#include <asm/swiotlb.h>
16059 +
16060 +unsigned int __VMALLOC_RESERVE = 128 << 20;
16061 +
16062 +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
16063 +unsigned long highstart_pfn, highend_pfn;
16064 +
16065 +static int noinline do_test_wp_bit(void);
16066 +
16067 +/*
16068 + * Creates a middle page table and puts a pointer to it in the
16069 + * given global directory entry. This only returns the gd entry
16070 + * in non-PAE compilation mode, since the middle layer is folded.
16071 + */
16072 +static pmd_t * __init one_md_table_init(pgd_t *pgd)
16073 +{
16074 + pud_t *pud;
16075 + pmd_t *pmd_table;
16076 +
16077 +#ifdef CONFIG_X86_PAE
16078 + pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
16079 + make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
16080 + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
16081 + pud = pud_offset(pgd, 0);
16082 + if (pmd_table != pmd_offset(pud, 0))
16083 + BUG();
16084 +#else
16085 + pud = pud_offset(pgd, 0);
16086 + pmd_table = pmd_offset(pud, 0);
16087 +#endif
16088 +
16089 + return pmd_table;
16090 +}
16091 +
16092 +/*
16093 + * Create a page table and place a pointer to it in a middle page
16094 + * directory entry.
16095 + */
16096 +static pte_t * __init one_page_table_init(pmd_t *pmd)
16097 +{
16098 + if (pmd_none(*pmd)) {
16099 + pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
16100 + make_lowmem_page_readonly(page_table,
16101 + XENFEAT_writable_page_tables);
16102 + set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
16103 + if (page_table != pte_offset_kernel(pmd, 0))
16104 + BUG();
16105 +
16106 + return page_table;
16107 + }
16108 +
16109 + return pte_offset_kernel(pmd, 0);
16110 +}
16111 +
16112 +/*
16113 + * This function initializes a certain range of kernel virtual memory
16114 + * with new bootmem page tables, wherever page tables are missing in
16115 + * the given range.
16116 + */
16117 +
16118 +/*
16119 + * NOTE: The pagetables are allocated contiguously in physical memory,
16120 + * so we can cache the place of the first one and move around without
16121 + * checking the pgd every time.
16122 + */
16123 +static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
16124 +{
16125 + pgd_t *pgd;
16126 + pud_t *pud;
16127 + pmd_t *pmd;
16128 + int pgd_idx, pmd_idx;
16129 + unsigned long vaddr;
16130 +
16131 + vaddr = start;
16132 + pgd_idx = pgd_index(vaddr);
16133 + pmd_idx = pmd_index(vaddr);
16134 + pgd = pgd_base + pgd_idx;
16135 +
16136 + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
16137 + if (pgd_none(*pgd))
16138 + one_md_table_init(pgd);
16139 + pud = pud_offset(pgd, vaddr);
16140 + pmd = pmd_offset(pud, vaddr);
16141 + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
16142 + if (vaddr < hypervisor_virt_start && pmd_none(*pmd))
16143 + one_page_table_init(pmd);
16144 +
16145 + vaddr += PMD_SIZE;
16146 + }
16147 + pmd_idx = 0;
16148 + }
16149 +}
16150 +
16151 +static inline int is_kernel_text(unsigned long addr)
16152 +{
16153 + if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
16154 + return 1;
16155 + return 0;
16156 +}
16157 +
16158 +/*
16159 + * This maps the physical memory to kernel virtual address space, a total
16160 + * of max_low_pfn pages, by creating page tables starting from address
16161 + * PAGE_OFFSET.
16162 + */
16163 +static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
16164 +{
16165 + unsigned long pfn;
16166 + pgd_t *pgd;
16167 + pmd_t *pmd;
16168 + pte_t *pte;
16169 + int pgd_idx, pmd_idx, pte_ofs;
16170 +
16171 + unsigned long max_ram_pfn = xen_start_info->nr_pages;
16172 + if (max_ram_pfn > max_low_pfn)
16173 + max_ram_pfn = max_low_pfn;
16174 +
16175 + pgd_idx = pgd_index(PAGE_OFFSET);
16176 + pgd = pgd_base + pgd_idx;
16177 + pfn = 0;
16178 + pmd_idx = pmd_index(PAGE_OFFSET);
16179 + pte_ofs = pte_index(PAGE_OFFSET);
16180 +
16181 + for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
16182 +#ifdef CONFIG_XEN
16183 + /*
16184 +		 * Native Linux does not have PAE paging enabled yet at
16185 +		 * this point.  When running as a Xen domain we are already
16186 +		 * in PAE mode, thus we can't simply hook in an empty
16187 +		 * pmd.  That would kill the mappings we are currently
16188 +		 * using ...
16189 + */
16190 + pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
16191 +#else
16192 + pmd = one_md_table_init(pgd);
16193 +#endif
16194 + if (pfn >= max_low_pfn)
16195 + continue;
16196 + pmd += pmd_idx;
16197 + for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
16198 + unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
16199 + if (address >= hypervisor_virt_start)
16200 + continue;
16201 +
16202 + /* Map with big pages if possible, otherwise create normal page tables. */
16203 + if (cpu_has_pse) {
16204 + unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
16205 +
16206 + if (is_kernel_text(address) || is_kernel_text(address2))
16207 + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
16208 + else
16209 + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
16210 + pfn += PTRS_PER_PTE;
16211 + } else {
16212 + pte = one_page_table_init(pmd);
16213 +
16214 + pte += pte_ofs;
16215 + for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
16216 + /* XEN: Only map initial RAM allocation. */
16217 + if ((pfn >= max_ram_pfn) || pte_present(*pte))
16218 + continue;
16219 + if (is_kernel_text(address))
16220 + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
16221 + else
16222 + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
16223 + }
16224 + pte_ofs = 0;
16225 + }
16226 + }
16227 + pmd_idx = 0;
16228 + }
16229 +}
16230 +
16231 +#ifndef CONFIG_XEN
16232 +
16233 +static inline int page_kills_ppro(unsigned long pagenr)
16234 +{
16235 + if (pagenr >= 0x70000 && pagenr <= 0x7003F)
16236 + return 1;
16237 + return 0;
16238 +}
16239 +
16240 +#else
16241 +
16242 +#define page_kills_ppro(p) 0
16243 +
16244 +#endif
16245 +
16246 +extern int is_available_memory(efi_memory_desc_t *);
16247 +
16248 +int page_is_ram(unsigned long pagenr)
16249 +{
16250 + int i;
16251 + unsigned long addr, end;
16252 +
16253 + if (efi_enabled) {
16254 + efi_memory_desc_t *md;
16255 + void *p;
16256 +
16257 + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
16258 + md = p;
16259 + if (!is_available_memory(md))
16260 + continue;
16261 + addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
16262 + end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
16263 +
16264 + if ((pagenr >= addr) && (pagenr < end))
16265 + return 1;
16266 + }
16267 + return 0;
16268 + }
16269 +
16270 + for (i = 0; i < e820.nr_map; i++) {
16271 +
16272 + if (e820.map[i].type != E820_RAM) /* not usable memory */
16273 + continue;
16274 + /*
16275 + * !!!FIXME!!! Some BIOSen report areas as RAM that
16276 +		 * are not, notably the 640k->1MB area. We need a sanity
16277 + * check here.
16278 + */
16279 + addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
16280 + end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
16281 + if ((pagenr >= addr) && (pagenr < end))
16282 + return 1;
16283 + }
16284 + return 0;
16285 +}
16286 +
16287 +#ifdef CONFIG_HIGHMEM
16288 +pte_t *kmap_pte;
16289 +pgprot_t kmap_prot;
16290 +
16291 +#define kmap_get_fixmap_pte(vaddr) \
16292 + pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
16293 +
16294 +static void __init kmap_init(void)
16295 +{
16296 + unsigned long kmap_vstart;
16297 +
16298 + /* cache the first kmap pte */
16299 + kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
16300 + kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
16301 +
16302 + kmap_prot = PAGE_KERNEL;
16303 +}
16304 +
16305 +static void __init permanent_kmaps_init(pgd_t *pgd_base)
16306 +{
16307 + pgd_t *pgd;
16308 + pud_t *pud;
16309 + pmd_t *pmd;
16310 + pte_t *pte;
16311 + unsigned long vaddr;
16312 +
16313 + vaddr = PKMAP_BASE;
16314 + page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
16315 +
16316 + pgd = swapper_pg_dir + pgd_index(vaddr);
16317 + pud = pud_offset(pgd, vaddr);
16318 + pmd = pmd_offset(pud, vaddr);
16319 + pte = pte_offset_kernel(pmd, vaddr);
16320 + pkmap_page_table = pte;
16321 +}
16322 +
16323 +static void __meminit free_new_highpage(struct page *page, int pfn)
16324 +{
16325 + init_page_count(page);
16326 + if (pfn < xen_start_info->nr_pages)
16327 + __free_page(page);
16328 + totalhigh_pages++;
16329 +}
16330 +
16331 +void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
16332 +{
16333 + if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
16334 + ClearPageReserved(page);
16335 + free_new_highpage(page, pfn);
16336 + } else
16337 + SetPageReserved(page);
16338 +}
16339 +
16340 +static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
16341 +{
16342 + free_new_highpage(page, pfn);
16343 + totalram_pages++;
16344 +#ifdef CONFIG_FLATMEM
16345 + max_mapnr = max(pfn, max_mapnr);
16346 +#endif
16347 + num_physpages++;
16348 + return 0;
16349 +}
16350 +
16351 +/*
16352 + * The NUMA case is not currently handled.
16353 + * We assume a single node, and that any memory
16354 + * that has been added dynamically and would be
16355 + * onlined here is in HIGHMEM.
16356 + */
16357 +void online_page(struct page *page)
16358 +{
16359 + ClearPageReserved(page);
16360 + add_one_highpage_hotplug(page, page_to_pfn(page));
16361 +}
16362 +
16363 +
16364 +#ifdef CONFIG_NUMA
16365 +extern void set_highmem_pages_init(int);
16366 +#else
16367 +static void __init set_highmem_pages_init(int bad_ppro)
16368 +{
16369 + int pfn;
16370 + for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
16371 + add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
16372 + totalram_pages += totalhigh_pages;
16373 +}
16374 +#endif /* CONFIG_NUMA */
16375 +
16376 +#else
16377 +#define kmap_init() do { } while (0)
16378 +#define permanent_kmaps_init(pgd_base) do { } while (0)
16379 +#define set_highmem_pages_init(bad_ppro) do { } while (0)
16380 +#endif /* CONFIG_HIGHMEM */
16381 +
16382 +unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
16383 +EXPORT_SYMBOL(__PAGE_KERNEL);
16384 +unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
16385 +
16386 +#ifdef CONFIG_NUMA
16387 +extern void __init remap_numa_kva(void);
16388 +#else
16389 +#define remap_numa_kva() do {} while (0)
16390 +#endif
16391 +
16392 +pgd_t *swapper_pg_dir;
16393 +
16394 +static void __init pagetable_init (void)
16395 +{
16396 + unsigned long vaddr;
16397 + pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
16398 +
16399 + /* Enable PSE if available */
16400 + if (cpu_has_pse) {
16401 + set_in_cr4(X86_CR4_PSE);
16402 + }
16403 +
16404 + /* Enable PGE if available */
16405 + if (cpu_has_pge) {
16406 + set_in_cr4(X86_CR4_PGE);
16407 + __PAGE_KERNEL |= _PAGE_GLOBAL;
16408 + __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
16409 + }
16410 +
16411 + kernel_physical_mapping_init(pgd_base);
16412 + remap_numa_kva();
16413 +
16414 + /*
16415 + * Fixed mappings, only the page table structure has to be
16416 + * created - mappings will be set by set_fixmap():
16417 + */
16418 + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
16419 + page_table_range_init(vaddr, hypervisor_virt_start, pgd_base);
16420 +
16421 + permanent_kmaps_init(pgd_base);
16422 +}
16423 +
16424 +#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP)
16425 +/*
16426 + * Swap suspend & friends need this for resume because things like the intel-agp
16427 + * driver might have split up a kernel 4MB mapping.
16428 + */
16429 +char __nosavedata swsusp_pg_dir[PAGE_SIZE]
16430 + __attribute__ ((aligned (PAGE_SIZE)));
16431 +
16432 +static inline void save_pg_dir(void)
16433 +{
16434 + memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
16435 +}
16436 +#else
16437 +static inline void save_pg_dir(void)
16438 +{
16439 +}
16440 +#endif
16441 +
16442 +void zap_low_mappings (void)
16443 +{
16444 + int i;
16445 +
16446 + save_pg_dir();
16447 +
16448 + /*
16449 + * Zap initial low-memory mappings.
16450 + *
16451 + * Note that "pgd_clear()" doesn't do it for
16452 + * us, because pgd_clear() is a no-op on i386.
16453 + */
16454 + for (i = 0; i < USER_PTRS_PER_PGD; i++)
16455 +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
16456 + set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
16457 +#else
16458 + set_pgd(swapper_pg_dir+i, __pgd(0));
16459 +#endif
16460 + flush_tlb_all();
16461 +}
16462 +
16463 +static int disable_nx __initdata = 0;
16464 +u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
16465 +EXPORT_SYMBOL(__supported_pte_mask);
16466 +
16467 +/*
16468 + * noexec = on|off
16469 + *
16470 + * Control non executable mappings.
16471 + *
16472 + * on Enable
16473 + * off Disable
16474 + */
16475 +void __init noexec_setup(const char *str)
16476 +{
16477 + if (!strncmp(str, "on",2) && cpu_has_nx) {
16478 + __supported_pte_mask |= _PAGE_NX;
16479 + disable_nx = 0;
16480 + } else if (!strncmp(str,"off",3)) {
16481 + disable_nx = 1;
16482 + __supported_pte_mask &= ~_PAGE_NX;
16483 + }
16484 +}
16485 +
16486 +int nx_enabled = 0;
16487 +#ifdef CONFIG_X86_PAE
16488 +
16489 +static void __init set_nx(void)
16490 +{
16491 + unsigned int v[4], l, h;
16492 +
16493 + if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
16494 + cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
16495 + if ((v[3] & (1 << 20)) && !disable_nx) {
16496 + rdmsr(MSR_EFER, l, h);
16497 + l |= EFER_NX;
16498 + wrmsr(MSR_EFER, l, h);
16499 + nx_enabled = 1;
16500 + __supported_pte_mask |= _PAGE_NX;
16501 + }
16502 + }
16503 +}
16504 +
16505 +/*
16506 + * Enables/disables executability of a given kernel page and
16507 + * returns the previous setting.
16508 + */
16509 +int __init set_kernel_exec(unsigned long vaddr, int enable)
16510 +{
16511 + pte_t *pte;
16512 + int ret = 1;
16513 +
16514 + if (!nx_enabled)
16515 + goto out;
16516 +
16517 + pte = lookup_address(vaddr);
16518 + BUG_ON(!pte);
16519 +
16520 + if (!pte_exec_kernel(*pte))
16521 + ret = 0;
16522 +
16523 + if (enable)
16524 + pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
16525 + else
16526 + pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
16527 + __flush_tlb_all();
16528 +out:
16529 + return ret;
16530 +}
16531 +
16532 +#endif
16533 +
16534 +/*
16535 + * paging_init() sets up the page tables - note that the first 8MB are
16536 + * already mapped by head.S.
16537 + *
16538 + * This routine also unmaps the page at virtual kernel address 0, so
16539 + * that we can trap those pesky NULL-reference errors in the kernel.
16540 + */
16541 +void __init paging_init(void)
16542 +{
16543 + int i;
16544 +
16545 +#ifdef CONFIG_X86_PAE
16546 + set_nx();
16547 + if (nx_enabled)
16548 + printk("NX (Execute Disable) protection: active\n");
16549 +#endif
16550 +
16551 + pagetable_init();
16552 +
16553 +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
16554 + /*
16555 + * We will bail out later - printk doesn't work right now so
16556 + * the user would just see a hanging kernel.
16557 +	 * When running as a Xen domain we are already in PAE mode at
16558 + * this point.
16559 + */
16560 + if (cpu_has_pae)
16561 + set_in_cr4(X86_CR4_PAE);
16562 +#endif
16563 + __flush_tlb_all();
16564 +
16565 + kmap_init();
16566 +
16567 + /* Switch to the real shared_info page, and clear the
16568 + * dummy page. */
16569 + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
16570 + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
16571 + memset(empty_zero_page, 0, sizeof(empty_zero_page));
16572 +
16573 +	/* Set up the mapping of the lowest 1 MB */
16574 + for (i = 0; i < NR_FIX_ISAMAPS; i++)
16575 + if (is_initial_xendomain())
16576 + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
16577 + else
16578 + __set_fixmap(FIX_ISAMAP_BEGIN - i,
16579 + virt_to_machine(empty_zero_page),
16580 + PAGE_KERNEL_RO);
16581 +}
16582 +
16583 +/*
16584 + * Test if the WP bit works in supervisor mode. It isn't supported on 386's
16585 + * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
16586 + * used to involve black magic jumps to work around some nasty CPU bugs,
16587 + * but fortunately the switch to using exceptions got rid of all that.
16588 + */
16589 +
16590 +static void __init test_wp_bit(void)
16591 +{
16592 + printk("Checking if this processor honours the WP bit even in supervisor mode... ");
16593 +
16594 + /* Any page-aligned address will do, the test is non-destructive */
16595 + __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
16596 + boot_cpu_data.wp_works_ok = do_test_wp_bit();
16597 + clear_fixmap(FIX_WP_TEST);
16598 +
16599 + if (!boot_cpu_data.wp_works_ok) {
16600 + printk("No.\n");
16601 +#ifdef CONFIG_X86_WP_WORKS_OK
16602 + panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
16603 +#endif
16604 + } else {
16605 + printk("Ok.\n");
16606 + }
16607 +}
16608 +
16609 +static void __init set_max_mapnr_init(void)
16610 +{
16611 +#ifdef CONFIG_HIGHMEM
16612 + num_physpages = highend_pfn;
16613 +#else
16614 + num_physpages = max_low_pfn;
16615 +#endif
16616 +#ifdef CONFIG_FLATMEM
16617 + max_mapnr = num_physpages;
16618 +#endif
16619 +}
16620 +
16621 +static struct kcore_list kcore_mem, kcore_vmalloc;
16622 +
16623 +void __init mem_init(void)
16624 +{
16625 + extern int ppro_with_ram_bug(void);
16626 + int codesize, reservedpages, datasize, initsize;
16627 + int tmp;
16628 + int bad_ppro;
16629 + unsigned long pfn;
16630 +
16631 +#if defined(CONFIG_SWIOTLB)
16632 + swiotlb_init();
16633 +#endif
16634 +
16635 +#ifdef CONFIG_FLATMEM
16636 + if (!mem_map)
16637 + BUG();
16638 +#endif
16639 +
16640 + bad_ppro = ppro_with_ram_bug();
16641 +
16642 +#ifdef CONFIG_HIGHMEM
16643 + /* check that fixmap and pkmap do not overlap */
16644 + if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
16645 + printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
16646 + printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
16647 + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
16648 + BUG();
16649 + }
16650 +#endif
16651 +
16652 + set_max_mapnr_init();
16653 +
16654 +#ifdef CONFIG_HIGHMEM
16655 + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
16656 +#else
16657 + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
16658 +#endif
16659 + printk("vmalloc area: %lx-%lx, maxmem %lx\n",
16660 + VMALLOC_START,VMALLOC_END,MAXMEM);
16661 + BUG_ON(VMALLOC_START > VMALLOC_END);
16662 +
16663 + /* this will put all low memory onto the freelists */
16664 + totalram_pages += free_all_bootmem();
16665 + /* XEN: init and count low-mem pages outside initial allocation. */
16666 + for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) {
16667 + ClearPageReserved(pfn_to_page(pfn));
16668 + init_page_count(pfn_to_page(pfn));
16669 + totalram_pages++;
16670 + }
16671 +
16672 + reservedpages = 0;
16673 + for (tmp = 0; tmp < max_low_pfn; tmp++)
16674 + /*
16675 + * Only count reserved RAM pages
16676 + */
16677 + if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
16678 + reservedpages++;
16679 +
16680 + set_highmem_pages_init(bad_ppro);
16681 +
16682 + codesize = (unsigned long) &_etext - (unsigned long) &_text;
16683 + datasize = (unsigned long) &_edata - (unsigned long) &_etext;
16684 + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
16685 +
16686 + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
16687 + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
16688 + VMALLOC_END-VMALLOC_START);
16689 +
16690 + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
16691 + (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
16692 + num_physpages << (PAGE_SHIFT-10),
16693 + codesize >> 10,
16694 + reservedpages << (PAGE_SHIFT-10),
16695 + datasize >> 10,
16696 + initsize >> 10,
16697 + (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
16698 + );
16699 +
16700 +#ifdef CONFIG_X86_PAE
16701 + if (!cpu_has_pae)
16702 + panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
16703 +#endif
16704 + if (boot_cpu_data.wp_works_ok < 0)
16705 + test_wp_bit();
16706 +
16707 + /*
16708 +	 * Subtle. SMP is doing its boot stuff late (because it has to
16709 + * fork idle threads) - but it also needs low mappings for the
16710 + * protected-mode entry to work. We zap these entries only after
16711 + * the WP-bit has been tested.
16712 + */
16713 +#ifndef CONFIG_SMP
16714 + zap_low_mappings();
16715 +#endif
16716 +
16717 + set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags);
16718 +}
16719 +
16720 +/*
16721 + * This is for the non-NUMA, single-node SMP system case.
16722 + * Specifically, in the case of x86, we will always add
16723 + * memory to highmem for now.
16724 + */
16725 +#ifdef CONFIG_MEMORY_HOTPLUG
16726 +#ifndef CONFIG_NEED_MULTIPLE_NODES
16727 +int arch_add_memory(int nid, u64 start, u64 size)
16728 +{
16729 + struct pglist_data *pgdata = &contig_page_data;
16730 + struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
16731 + unsigned long start_pfn = start >> PAGE_SHIFT;
16732 + unsigned long nr_pages = size >> PAGE_SHIFT;
16733 +
16734 + return __add_pages(zone, start_pfn, nr_pages);
16735 +}
16736 +
16737 +int remove_memory(u64 start, u64 size)
16738 +{
16739 + return -EINVAL;
16740 +}
16741 +#endif
16742 +#endif
16743 +
16744 +kmem_cache_t *pgd_cache;
16745 +kmem_cache_t *pmd_cache;
16746 +
16747 +void __init pgtable_cache_init(void)
16748 +{
16749 + if (PTRS_PER_PMD > 1) {
16750 + pmd_cache = kmem_cache_create("pmd",
16751 + PTRS_PER_PMD*sizeof(pmd_t),
16752 + PTRS_PER_PMD*sizeof(pmd_t),
16753 + 0,
16754 + pmd_ctor,
16755 + NULL);
16756 + if (!pmd_cache)
16757 + panic("pgtable_cache_init(): cannot create pmd cache");
16758 + }
16759 + pgd_cache = kmem_cache_create("pgd",
16760 +#ifndef CONFIG_XEN
16761 + PTRS_PER_PGD*sizeof(pgd_t),
16762 + PTRS_PER_PGD*sizeof(pgd_t),
16763 +#else
16764 + PAGE_SIZE,
16765 + PAGE_SIZE,
16766 +#endif
16767 + 0,
16768 + pgd_ctor,
16769 + PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
16770 + if (!pgd_cache)
16771 + panic("pgtable_cache_init(): Cannot create pgd cache");
16772 +}
16773 +
16774 +/*
16775 + * This function cannot be __init, since exceptions don't work in that
16776 + * section. Put this after the callers, so that it cannot be inlined.
16777 + */
16778 +static int noinline do_test_wp_bit(void)
16779 +{
16780 + char tmp_reg;
16781 + int flag;
16782 +
16783 + __asm__ __volatile__(
16784 + " movb %0,%1 \n"
16785 + "1: movb %1,%0 \n"
16786 + " xorl %2,%2 \n"
16787 + "2: \n"
16788 + ".section __ex_table,\"a\"\n"
16789 + " .align 4 \n"
16790 + " .long 1b,2b \n"
16791 + ".previous \n"
16792 + :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
16793 + "=q" (tmp_reg),
16794 + "=r" (flag)
16795 + :"2" (1)
16796 + :"memory");
16797 +
16798 + return flag;
16799 +}
16800 +
16801 +#ifdef CONFIG_DEBUG_RODATA
16802 +
16803 +void mark_rodata_ro(void)
16804 +{
16805 + unsigned long addr = (unsigned long)__start_rodata;
16806 +
16807 + for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
16808 + change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
16809 +
16810 + printk("Write protecting the kernel read-only data: %uk\n",
16811 + (__end_rodata - __start_rodata) >> 10);
16812 +
16813 + /*
16814 + * change_page_attr() requires a global_flush_tlb() call after it.
16815 + * We do this after the printk so that if something went wrong in the
16816 +	 * change, the printk at least gets out to give a better debug hint
16817 +	 * about who the culprit is.
16818 + */
16819 + global_flush_tlb();
16820 +}
16821 +#endif
16822 +
16823 +void free_init_pages(char *what, unsigned long begin, unsigned long end)
16824 +{
16825 + unsigned long addr;
16826 +
16827 + for (addr = begin; addr < end; addr += PAGE_SIZE) {
16828 + ClearPageReserved(virt_to_page(addr));
16829 + init_page_count(virt_to_page(addr));
16830 + memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
16831 + free_page(addr);
16832 + totalram_pages++;
16833 + }
16834 + printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
16835 +}
16836 +
16837 +void free_initmem(void)
16838 +{
16839 + free_init_pages("unused kernel memory",
16840 + (unsigned long)(&__init_begin),
16841 + (unsigned long)(&__init_end));
16842 +}
16843 +
16844 +#ifdef CONFIG_BLK_DEV_INITRD
16845 +void free_initrd_mem(unsigned long start, unsigned long end)
16846 +{
16847 + free_init_pages("initrd memory", start, end);
16848 +}
16849 +#endif
16850 +
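
For reference, the address-splitting arithmetic that page_table_range_init() and kernel_physical_mapping_init() above depend on can be reproduced in a small stand-alone program. This is only an illustrative sketch, not part of the patch; the constants assume the usual non-PAE i386 layout (4 KiB pages, 4 MiB pgd entries, PAGE_OFFSET at 0xC0000000) and the helper names are local to the example.

    /* Illustrative only: mimics pgd_index()/pte_index() for non-PAE i386. */
    #include <stdio.h>

    #define PAGE_SHIFT	12
    #define PGDIR_SHIFT	22			/* non-PAE: the pmd level is folded */
    #define PTRS_PER_PGD	1024
    #define PTRS_PER_PTE	1024
    #define PAGE_OFFSET	0xC0000000UL

    static unsigned long pgd_index(unsigned long vaddr)
    {
    	return (vaddr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
    }

    static unsigned long pte_index(unsigned long vaddr)
    {
    	return (vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
    }

    int main(void)
    {
    	/* Arbitrary lowmem address: PAGE_OFFSET + 5 MB + a small offset. */
    	unsigned long vaddr = PAGE_OFFSET + (5UL << 20) + 0x123;

    	printf("vaddr 0x%08lx -> pgd index %lu, pte index %lu\n",
    	       vaddr, pgd_index(vaddr), pte_index(vaddr));
    	return 0;
    }
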
16851 Index: head-2008-11-25/arch/x86/mm/ioremap_32-xen.c
16852 ===================================================================
16853 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
16854 +++ head-2008-11-25/arch/x86/mm/ioremap_32-xen.c 2008-04-02 12:34:02.000000000 +0200
16855 @@ -0,0 +1,443 @@
16856 +/*
16857 + * arch/i386/mm/ioremap.c
16858 + *
16859 + * Re-map IO memory to kernel address space so that we can access it.
16860 + * This is needed for high PCI addresses that aren't mapped in the
16861 + * 640k-1MB IO memory area on PC's
16862 + *
16863 + * (C) Copyright 1995 1996 Linus Torvalds
16864 + */
16865 +
16866 +#include <linux/vmalloc.h>
16867 +#include <linux/init.h>
16868 +#include <linux/slab.h>
16869 +#include <linux/module.h>
16870 +#include <asm/io.h>
16871 +#include <asm/fixmap.h>
16872 +#include <asm/cacheflush.h>
16873 +#include <asm/tlbflush.h>
16874 +#include <asm/pgtable.h>
16875 +#include <asm/pgalloc.h>
16876 +
16877 +#define ISA_START_ADDRESS 0x0
16878 +#define ISA_END_ADDRESS 0x100000
16879 +
16880 +static int direct_remap_area_pte_fn(pte_t *pte,
16881 + struct page *pmd_page,
16882 + unsigned long address,
16883 + void *data)
16884 +{
16885 + mmu_update_t **v = (mmu_update_t **)data;
16886 +
16887 + BUG_ON(!pte_none(*pte));
16888 +
16889 + (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
16890 + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
16891 + (*v)++;
16892 +
16893 + return 0;
16894 +}
16895 +
16896 +static int __direct_remap_pfn_range(struct mm_struct *mm,
16897 + unsigned long address,
16898 + unsigned long mfn,
16899 + unsigned long size,
16900 + pgprot_t prot,
16901 + domid_t domid)
16902 +{
16903 + int rc;
16904 + unsigned long i, start_address;
16905 + mmu_update_t *u, *v, *w;
16906 +
16907 + u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
16908 + if (u == NULL)
16909 + return -ENOMEM;
16910 +
16911 + start_address = address;
16912 +
16913 + flush_cache_all();
16914 +
16915 + for (i = 0; i < size; i += PAGE_SIZE) {
16916 + if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
16917 + /* Flush a full batch after filling in the PTE ptrs. */
16918 + rc = apply_to_page_range(mm, start_address,
16919 + address - start_address,
16920 + direct_remap_area_pte_fn, &w);
16921 + if (rc)
16922 + goto out;
16923 + rc = -EFAULT;
16924 + if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
16925 + goto out;
16926 + v = w = u;
16927 + start_address = address;
16928 + }
16929 +
16930 + /*
16931 + * Fill in the machine address: PTE ptr is done later by
16932 + * apply_to_page_range().
16933 + */
16934 + v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
16935 +
16936 + mfn++;
16937 + address += PAGE_SIZE;
16938 + v++;
16939 + }
16940 +
16941 + if (v != u) {
16942 + /* Final batch. */
16943 + rc = apply_to_page_range(mm, start_address,
16944 + address - start_address,
16945 + direct_remap_area_pte_fn, &w);
16946 + if (rc)
16947 + goto out;
16948 + rc = -EFAULT;
16949 + if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
16950 + goto out;
16951 + }
16952 +
16953 + rc = 0;
16954 +
16955 + out:
16956 + flush_tlb_all();
16957 +
16958 + free_page((unsigned long)u);
16959 +
16960 + return rc;
16961 +}
16962 +
16963 +int direct_remap_pfn_range(struct vm_area_struct *vma,
16964 + unsigned long address,
16965 + unsigned long mfn,
16966 + unsigned long size,
16967 + pgprot_t prot,
16968 + domid_t domid)
16969 +{
16970 + if (xen_feature(XENFEAT_auto_translated_physmap))
16971 + return remap_pfn_range(vma, address, mfn, size, prot);
16972 +
16973 + if (domid == DOMID_SELF)
16974 + return -EINVAL;
16975 +
16976 + vma->vm_flags |= VM_IO | VM_RESERVED;
16977 +
16978 + vma->vm_mm->context.has_foreign_mappings = 1;
16979 +
16980 + return __direct_remap_pfn_range(
16981 + vma->vm_mm, address, mfn, size, prot, domid);
16982 +}
16983 +EXPORT_SYMBOL(direct_remap_pfn_range);
16984 +
16985 +int direct_kernel_remap_pfn_range(unsigned long address,
16986 + unsigned long mfn,
16987 + unsigned long size,
16988 + pgprot_t prot,
16989 + domid_t domid)
16990 +{
16991 + return __direct_remap_pfn_range(
16992 + &init_mm, address, mfn, size, prot, domid);
16993 +}
16994 +EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
16995 +
16996 +static int lookup_pte_fn(
16997 + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
16998 +{
16999 + uint64_t *ptep = (uint64_t *)data;
17000 + if (ptep)
17001 + *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
17002 + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
17003 + return 0;
17004 +}
17005 +
17006 +int create_lookup_pte_addr(struct mm_struct *mm,
17007 + unsigned long address,
17008 + uint64_t *ptep)
17009 +{
17010 + return apply_to_page_range(mm, address, PAGE_SIZE,
17011 + lookup_pte_fn, ptep);
17012 +}
17013 +
17014 +EXPORT_SYMBOL(create_lookup_pte_addr);
17015 +
17016 +static int noop_fn(
17017 + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
17018 +{
17019 + return 0;
17020 +}
17021 +
17022 +int touch_pte_range(struct mm_struct *mm,
17023 + unsigned long address,
17024 + unsigned long size)
17025 +{
17026 + return apply_to_page_range(mm, address, size, noop_fn, NULL);
17027 +}
17028 +
17029 +EXPORT_SYMBOL(touch_pte_range);
17030 +
17031 +/*
17032 + * Does @address reside within a non-highmem page that is local to this virtual
17033 + * machine (i.e., not an I/O page, nor a memory page belonging to another VM)?
17034 + * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
17035 + * why this works.
17036 + */
17037 +static inline int is_local_lowmem(unsigned long address)
17038 +{
17039 + extern unsigned long max_low_pfn;
17040 + return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
17041 +}
17042 +
17043 +/*
17044 + * Generic mapping function (not visible outside):
17045 + */
17046 +
17047 +/*
17048 + * Remap an arbitrary physical address space into the kernel virtual
17049 + * address space. Needed when the kernel wants to access high addresses
17050 + * directly.
17051 + *
17052 + * NOTE! We need to allow non-page-aligned mappings too: we will obviously
17053 + * have to convert them into an offset in a page-aligned mapping, but the
17054 + * caller shouldn't need to know that small detail.
17055 + */
17056 +void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
17057 +{
17058 + void __iomem * addr;
17059 + struct vm_struct * area;
17060 + unsigned long offset, last_addr;
17061 + domid_t domid = DOMID_IO;
17062 +
17063 + /* Don't allow wraparound or zero size */
17064 + last_addr = phys_addr + size - 1;
17065 + if (!size || last_addr < phys_addr)
17066 + return NULL;
17067 +
17068 + /*
17069 +	 * Don't remap the low PCI/ISA area; it's always mapped.
17070 + */
17071 + if (is_initial_xendomain() &&
17072 + phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
17073 + return (void __iomem *) isa_bus_to_virt(phys_addr);
17074 +
17075 + /*
17076 + * Don't allow anybody to remap normal RAM that we're using..
17077 + */
17078 + if (is_local_lowmem(phys_addr)) {
17079 + char *t_addr, *t_end;
17080 + struct page *page;
17081 +
17082 + t_addr = bus_to_virt(phys_addr);
17083 + t_end = t_addr + (size - 1);
17084 +
17085 + for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
17086 + if(!PageReserved(page))
17087 + return NULL;
17088 +
17089 + domid = DOMID_SELF;
17090 + }
17091 +
17092 + /*
17093 + * Mappings have to be page-aligned
17094 + */
17095 + offset = phys_addr & ~PAGE_MASK;
17096 + phys_addr &= PAGE_MASK;
17097 + size = PAGE_ALIGN(last_addr+1) - phys_addr;
17098 +
17099 + /*
17100 + * Ok, go for it..
17101 + */
17102 + area = get_vm_area(size, VM_IOREMAP | (flags << 20));
17103 + if (!area)
17104 + return NULL;
17105 + area->phys_addr = phys_addr;
17106 + addr = (void __iomem *) area->addr;
17107 + flags |= _KERNPG_TABLE;
17108 + if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
17109 + phys_addr>>PAGE_SHIFT,
17110 + size, __pgprot(flags), domid)) {
17111 + vunmap((void __force *) addr);
17112 + return NULL;
17113 + }
17114 + return (void __iomem *) (offset + (char __iomem *)addr);
17115 +}
17116 +EXPORT_SYMBOL(__ioremap);
17117 +
17118 +/**
17119 + * ioremap_nocache - map bus memory into CPU space
17120 + * @offset: bus address of the memory
17121 + * @size: size of the resource to map
17122 + *
17123 + * ioremap_nocache performs a platform specific sequence of operations to
17124 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
17125 + * writew/writel functions and the other mmio helpers. The returned
17126 + * address is not guaranteed to be usable directly as a virtual
17127 + * address.
17128 + *
17129 + * This version of ioremap ensures that the memory is marked uncachable
17130 + * on the CPU as well as honouring existing caching rules from things like
17131 + * the PCI bus. Note that there are other caches and buffers on many
17132 + * busses. In particular driver authors should read up on PCI writes
17133 + *
17134 + * It's useful if some control registers are in such an area and
17135 + * write combining or read caching is not desirable:
17136 + *
17137 + * Must be freed with iounmap.
17138 + */
17139 +
17140 +void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
17141 +{
17142 + unsigned long last_addr;
17143 + void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
17144 + if (!p)
17145 + return p;
17146 +
17147 + /* Guaranteed to be > phys_addr, as per __ioremap() */
17148 + last_addr = phys_addr + size - 1;
17149 +
17150 + if (is_local_lowmem(last_addr)) {
17151 + struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
17152 + unsigned long npages;
17153 +
17154 + phys_addr &= PAGE_MASK;
17155 +
17156 + /* This might overflow and become zero.. */
17157 + last_addr = PAGE_ALIGN(last_addr);
17158 +
17159 + /* .. but that's ok, because modulo-2**n arithmetic will make
17160 + * the page-aligned "last - first" come out right.
17161 + */
17162 + npages = (last_addr - phys_addr) >> PAGE_SHIFT;
17163 +
17164 + if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
17165 + iounmap(p);
17166 + p = NULL;
17167 + }
17168 + global_flush_tlb();
17169 + }
17170 +
17171 + return p;
17172 +}
17173 +EXPORT_SYMBOL(ioremap_nocache);
17174 +
17175 +/**
17176 + * iounmap - Free a IO remapping
17177 + * @addr: virtual address from ioremap_*
17178 + *
17179 + * Caller must ensure there is only one unmapping for the same pointer.
17180 + */
17181 +void iounmap(volatile void __iomem *addr)
17182 +{
17183 + struct vm_struct *p, *o;
17184 +
17185 + if ((void __force *)addr <= high_memory)
17186 + return;
17187 +
17188 + /*
17189 + * __ioremap special-cases the PCI/ISA range by not instantiating a
17190 + * vm_area and by simply returning an address into the kernel mapping
17191 + * of ISA space. So handle that here.
17192 + */
17193 + if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
17194 + return;
17195 +
17196 + addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
17197 +
17198 + /* Use the vm area unlocked, assuming the caller
17199 + ensures there isn't another iounmap for the same address
17200 + in parallel. Reuse of the virtual address is prevented by
17201 + leaving it in the global lists until we're done with it.
17202 + cpa takes care of the direct mappings. */
17203 + read_lock(&vmlist_lock);
17204 + for (p = vmlist; p; p = p->next) {
17205 + if (p->addr == addr)
17206 + break;
17207 + }
17208 + read_unlock(&vmlist_lock);
17209 +
17210 + if (!p) {
17211 + printk("iounmap: bad address %p\n", addr);
17212 + dump_stack();
17213 + return;
17214 + }
17215 +
17216 + /* Reset the direct mapping. Can block */
17217 + if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
17218 + /* p->size includes the guard page, but cpa doesn't like that */
17219 + change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
17220 + (p->size - PAGE_SIZE) >> PAGE_SHIFT,
17221 + PAGE_KERNEL);
17222 + global_flush_tlb();
17223 + }
17224 +
17225 + /* Finally remove it */
17226 + o = remove_vm_area((void *)addr);
17227 + BUG_ON(p != o || o == NULL);
17228 + kfree(p);
17229 +}
17230 +EXPORT_SYMBOL(iounmap);
17231 +
17232 +void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
17233 +{
17234 + unsigned long offset, last_addr;
17235 + unsigned int nrpages;
17236 + enum fixed_addresses idx;
17237 +
17238 + /* Don't allow wraparound or zero size */
17239 + last_addr = phys_addr + size - 1;
17240 + if (!size || last_addr < phys_addr)
17241 + return NULL;
17242 +
17243 + /*
17244 +	 * Don't remap the low PCI/ISA area; it's always mapped.
17245 + */
17246 + if (is_initial_xendomain() &&
17247 + phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
17248 + return isa_bus_to_virt(phys_addr);
17249 +
17250 + /*
17251 + * Mappings have to be page-aligned
17252 + */
17253 + offset = phys_addr & ~PAGE_MASK;
17254 + phys_addr &= PAGE_MASK;
17255 + size = PAGE_ALIGN(last_addr) - phys_addr;
17256 +
17257 + /*
17258 + * Mappings have to fit in the FIX_BTMAP area.
17259 + */
17260 + nrpages = size >> PAGE_SHIFT;
17261 + if (nrpages > NR_FIX_BTMAPS)
17262 + return NULL;
17263 +
17264 + /*
17265 + * Ok, go for it..
17266 + */
17267 + idx = FIX_BTMAP_BEGIN;
17268 + while (nrpages > 0) {
17269 + set_fixmap(idx, phys_addr);
17270 + phys_addr += PAGE_SIZE;
17271 + --idx;
17272 + --nrpages;
17273 + }
17274 + return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
17275 +}
17276 +
17277 +void __init bt_iounmap(void *addr, unsigned long size)
17278 +{
17279 + unsigned long virt_addr;
17280 + unsigned long offset;
17281 + unsigned int nrpages;
17282 + enum fixed_addresses idx;
17283 +
17284 + virt_addr = (unsigned long)addr;
17285 + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
17286 + return;
17287 + if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
17288 + return;
17289 + offset = virt_addr & ~PAGE_MASK;
17290 + nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
17291 +
17292 + idx = FIX_BTMAP_BEGIN;
17293 + while (nrpages > 0) {
17294 + clear_fixmap(idx);
17295 + --idx;
17296 + --nrpages;
17297 + }
17298 +}
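
As a usage note for the ioremap_nocache()/iounmap() pair implemented above, the sketch below shows the calling pattern a driver of this era would typically follow. It is not part of the patch; the physical address, offsets and register meanings are made-up placeholders for illustration.

    /* Illustrative only: typical driver-side use of ioremap_nocache()/iounmap(). */
    #include <linux/kernel.h>
    #include <linux/types.h>
    #include <linux/errno.h>
    #include <asm/io.h>

    #define EXAMPLE_MMIO_PHYS	0xfeb00000UL	/* hypothetical device register window */
    #define EXAMPLE_MMIO_SIZE	0x1000UL

    static int example_map_registers(void)
    {
    	void __iomem *regs;
    	u32 id;

    	/* Map the register window uncached; NULL means the mapping failed. */
    	regs = ioremap_nocache(EXAMPLE_MMIO_PHYS, EXAMPLE_MMIO_SIZE);
    	if (!regs)
    		return -ENOMEM;

    	id = readl(regs);		/* read a (hypothetical) ID register */
    	writel(0x1, regs + 0x4);	/* poke a (hypothetical) control register */
    	printk(KERN_INFO "example: device id %08x\n", id);

    	iounmap(regs);			/* must balance the earlier ioremap */
    	return 0;
    }
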
17299 Index: head-2008-11-25/arch/x86/mm/pgtable_32-xen.c
17300 ===================================================================
17301 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
17302 +++ head-2008-11-25/arch/x86/mm/pgtable_32-xen.c 2007-10-09 11:48:25.000000000 +0200
17303 @@ -0,0 +1,725 @@
17304 +/*
17305 + * linux/arch/i386/mm/pgtable.c
17306 + */
17307 +
17308 +#include <linux/sched.h>
17309 +#include <linux/kernel.h>
17310 +#include <linux/errno.h>
17311 +#include <linux/mm.h>
17312 +#include <linux/swap.h>
17313 +#include <linux/smp.h>
17314 +#include <linux/highmem.h>
17315 +#include <linux/slab.h>
17316 +#include <linux/pagemap.h>
17317 +#include <linux/spinlock.h>
17318 +#include <linux/module.h>
17319 +
17320 +#include <asm/system.h>
17321 +#include <asm/pgtable.h>
17322 +#include <asm/pgalloc.h>
17323 +#include <asm/fixmap.h>
17324 +#include <asm/e820.h>
17325 +#include <asm/tlb.h>
17326 +#include <asm/tlbflush.h>
17327 +#include <asm/io.h>
17328 +#include <asm/mmu_context.h>
17329 +
17330 +#include <xen/features.h>
17331 +#include <asm/hypervisor.h>
17332 +
17333 +static void pgd_test_and_unpin(pgd_t *pgd);
17334 +
17335 +void show_mem(void)
17336 +{
17337 + int total = 0, reserved = 0;
17338 + int shared = 0, cached = 0;
17339 + int highmem = 0;
17340 + struct page *page;
17341 + pg_data_t *pgdat;
17342 + unsigned long i;
17343 + unsigned long flags;
17344 +
17345 + printk(KERN_INFO "Mem-info:\n");
17346 + show_free_areas();
17347 + printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
17348 + for_each_online_pgdat(pgdat) {
17349 + pgdat_resize_lock(pgdat, &flags);
17350 + for (i = 0; i < pgdat->node_spanned_pages; ++i) {
17351 + page = pgdat_page_nr(pgdat, i);
17352 + total++;
17353 + if (PageHighMem(page))
17354 + highmem++;
17355 + if (PageReserved(page))
17356 + reserved++;
17357 + else if (PageSwapCache(page))
17358 + cached++;
17359 + else if (page_count(page))
17360 + shared += page_count(page) - 1;
17361 + }
17362 + pgdat_resize_unlock(pgdat, &flags);
17363 + }
17364 + printk(KERN_INFO "%d pages of RAM\n", total);
17365 + printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
17366 + printk(KERN_INFO "%d reserved pages\n", reserved);
17367 + printk(KERN_INFO "%d pages shared\n", shared);
17368 + printk(KERN_INFO "%d pages swap cached\n", cached);
17369 +
17370 + printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
17371 + printk(KERN_INFO "%lu pages writeback\n",
17372 + global_page_state(NR_WRITEBACK));
17373 + printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
17374 + printk(KERN_INFO "%lu pages slab\n", global_page_state(NR_SLAB));
17375 + printk(KERN_INFO "%lu pages pagetables\n",
17376 + global_page_state(NR_PAGETABLE));
17377 +}
17378 +
17379 +/*
17380 + * Associate a large virtual page frame with a given physical page frame
17381 + * and protection flags for that frame. pfn is for the base of the page,
17382 + * vaddr is what the page gets mapped to - both must be properly aligned.
17383 + * The pmd must already be instantiated. Assumes PAE mode.
17384 + */
17385 +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
17386 +{
17387 + pgd_t *pgd;
17388 + pud_t *pud;
17389 + pmd_t *pmd;
17390 +
17391 + if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
17392 + printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
17393 + return; /* BUG(); */
17394 + }
17395 + if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
17396 + printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
17397 + return; /* BUG(); */
17398 + }
17399 + pgd = swapper_pg_dir + pgd_index(vaddr);
17400 + if (pgd_none(*pgd)) {
17401 + printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
17402 + return; /* BUG(); */
17403 + }
17404 + pud = pud_offset(pgd, vaddr);
17405 + pmd = pmd_offset(pud, vaddr);
17406 + set_pmd(pmd, pfn_pmd(pfn, flags));
17407 + /*
17408 + * It's enough to flush this one mapping.
17409 + * (PGE mappings get flushed as well)
17410 + */
17411 + __flush_tlb_one(vaddr);
17412 +}
17413 +
17414 +static int nr_fixmaps = 0;
17415 +unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
17416 +unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE);
17417 +EXPORT_SYMBOL(__FIXADDR_TOP);
17418 +
17419 +void __init set_fixaddr_top(unsigned long top)
17420 +{
17421 + BUG_ON(nr_fixmaps > 0);
17422 + hypervisor_virt_start = top;
17423 + __FIXADDR_TOP = hypervisor_virt_start - 2 * PAGE_SIZE;
17424 +}
17425 +
17426 +void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
17427 +{
17428 + unsigned long address = __fix_to_virt(idx);
17429 + pte_t pte;
17430 +
17431 + if (idx >= __end_of_fixed_addresses) {
17432 + BUG();
17433 + return;
17434 + }
17435 + switch (idx) {
17436 + case FIX_WP_TEST:
17437 + case FIX_VDSO:
17438 + pte = pfn_pte(phys >> PAGE_SHIFT, flags);
17439 + break;
17440 + default:
17441 + pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
17442 + break;
17443 + }
17444 + if (HYPERVISOR_update_va_mapping(address, pte,
17445 + UVMF_INVLPG|UVMF_ALL))
17446 + BUG();
17447 + nr_fixmaps++;
17448 +}
17449 +
17450 +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
17451 +{
17452 + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
17453 + if (pte)
17454 + make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
17455 + return pte;
17456 +}
17457 +
17458 +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
17459 +{
17460 + struct page *pte;
17461 +
17462 +#ifdef CONFIG_HIGHPTE
17463 + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
17464 +#else
17465 + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
17466 +#endif
17467 + if (pte) {
17468 + SetPageForeign(pte, pte_free);
17469 + init_page_count(pte);
17470 + }
17471 + return pte;
17472 +}
17473 +
17474 +void pte_free(struct page *pte)
17475 +{
17476 + unsigned long pfn = page_to_pfn(pte);
17477 +
17478 + if (!PageHighMem(pte)) {
17479 + unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT);
17480 +
17481 + if (!pte_write(*virt_to_ptep(va)))
17482 + if (HYPERVISOR_update_va_mapping(
17483 + va, pfn_pte(pfn, PAGE_KERNEL), 0))
17484 + BUG();
17485 + } else
17486 + clear_bit(PG_pinned, &pte->flags);
17487 +
17488 + ClearPageForeign(pte);
17489 + init_page_count(pte);
17490 +
17491 + __free_page(pte);
17492 +}
17493 +
17494 +void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
17495 +{
17496 + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
17497 +}
17498 +
17499 +/*
17500 + * List of all pgd's needed for non-PAE so it can invalidate entries
17501 + * in both cached and uncached pgd's; not needed for PAE since the
17502 + * kernel pmd is shared. If PAE were not to share the pmd a similar
17503 + * tactic would be needed. This is essentially codepath-based locking
17504 + * against pageattr.c; it is the unique case in which a valid change
17505 + * of kernel pagetables can't be lazily synchronized by vmalloc faults.
17506 + * vmalloc faults work because attached pagetables are never freed.
17507 + * The locking scheme was chosen on the basis of manfred's
17508 + * recommendations and having no core impact whatsoever.
17509 + * -- wli
17510 + */
17511 +DEFINE_SPINLOCK(pgd_lock);
17512 +struct page *pgd_list;
17513 +
17514 +static inline void pgd_list_add(pgd_t *pgd)
17515 +{
17516 + struct page *page = virt_to_page(pgd);
17517 + page->index = (unsigned long)pgd_list;
17518 + if (pgd_list)
17519 + set_page_private(pgd_list, (unsigned long)&page->index);
17520 + pgd_list = page;
17521 + set_page_private(page, (unsigned long)&pgd_list);
17522 +}
17523 +
17524 +static inline void pgd_list_del(pgd_t *pgd)
17525 +{
17526 + struct page *next, **pprev, *page = virt_to_page(pgd);
17527 + next = (struct page *)page->index;
17528 + pprev = (struct page **)page_private(page);
17529 + *pprev = next;
17530 + if (next)
17531 + set_page_private(next, (unsigned long)pprev);
17532 +}
17533 +
17534 +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
17535 +{
17536 + unsigned long flags;
17537 +
17538 + if (PTRS_PER_PMD > 1) {
17539 + if (HAVE_SHARED_KERNEL_PMD)
17540 + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
17541 + swapper_pg_dir + USER_PTRS_PER_PGD,
17542 + KERNEL_PGD_PTRS);
17543 + } else {
17544 + spin_lock_irqsave(&pgd_lock, flags);
17545 + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
17546 + swapper_pg_dir + USER_PTRS_PER_PGD,
17547 + KERNEL_PGD_PTRS);
17548 + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
17549 + pgd_list_add(pgd);
17550 + spin_unlock_irqrestore(&pgd_lock, flags);
17551 + }
17552 +}
17553 +
17554 +/* never called when PTRS_PER_PMD > 1 */
17555 +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
17556 +{
17557 + unsigned long flags; /* can be called from interrupt context */
17558 +
17559 + spin_lock_irqsave(&pgd_lock, flags);
17560 + pgd_list_del(pgd);
17561 + spin_unlock_irqrestore(&pgd_lock, flags);
17562 +
17563 + pgd_test_and_unpin(pgd);
17564 +}
17565 +
17566 +pgd_t *pgd_alloc(struct mm_struct *mm)
17567 +{
17568 + int i;
17569 + pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
17570 + pmd_t **pmd;
17571 + unsigned long flags;
17572 +
17573 + pgd_test_and_unpin(pgd);
17574 +
17575 + if (PTRS_PER_PMD == 1 || !pgd)
17576 + return pgd;
17577 +
17578 + if (HAVE_SHARED_KERNEL_PMD) {
17579 + for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
17580 + pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
17581 + if (!pmd)
17582 + goto out_oom;
17583 + set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
17584 + }
17585 + return pgd;
17586 + }
17587 +
17588 + /*
17589 + * We can race save/restore (if we sleep during a GFP_KERNEL memory
17590 + * allocation). We therefore store virtual addresses of pmds as they
17591 + * do not change across save/restore, and poke the machine addresses
17592 + * into the pgdir under the pgd_lock.
17593 + */
17594 + pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
17595 + if (!pmd) {
17596 + kmem_cache_free(pgd_cache, pgd);
17597 + return NULL;
17598 + }
17599 +
17600 + /* Allocate pmds, remember virtual addresses. */
17601 + for (i = 0; i < PTRS_PER_PGD; ++i) {
17602 + pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
17603 + if (!pmd[i])
17604 + goto out_oom;
17605 + }
17606 +
17607 + spin_lock_irqsave(&pgd_lock, flags);
17608 +
17609 + /* Protect against save/restore: move below 4GB under pgd_lock. */
17610 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
17611 + int rc = xen_create_contiguous_region(
17612 + (unsigned long)pgd, 0, 32);
17613 + if (rc) {
17614 + spin_unlock_irqrestore(&pgd_lock, flags);
17615 + goto out_oom;
17616 + }
17617 + }
17618 +
17619 + /* Copy kernel pmd contents and write-protect the new pmds. */
17620 + for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
17621 + unsigned long v = (unsigned long)i << PGDIR_SHIFT;
17622 + pgd_t *kpgd = pgd_offset_k(v);
17623 + pud_t *kpud = pud_offset(kpgd, v);
17624 + pmd_t *kpmd = pmd_offset(kpud, v);
17625 + memcpy(pmd[i], kpmd, PAGE_SIZE);
17626 + make_lowmem_page_readonly(
17627 + pmd[i], XENFEAT_writable_page_tables);
17628 + }
17629 +
17630 +	/* It is safe to poke machine addresses of pmds under the pgd_lock. */
17631 + for (i = 0; i < PTRS_PER_PGD; i++)
17632 + set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i])));
17633 +
17634 + /* Ensure this pgd gets picked up and pinned on save/restore. */
17635 + pgd_list_add(pgd);
17636 +
17637 + spin_unlock_irqrestore(&pgd_lock, flags);
17638 +
17639 + kfree(pmd);
17640 +
17641 + return pgd;
17642 +
17643 +out_oom:
17644 + if (HAVE_SHARED_KERNEL_PMD) {
17645 + for (i--; i >= 0; i--)
17646 + kmem_cache_free(pmd_cache,
17647 + (void *)__va(pgd_val(pgd[i])-1));
17648 + } else {
17649 + for (i--; i >= 0; i--)
17650 + kmem_cache_free(pmd_cache, pmd[i]);
17651 + kfree(pmd);
17652 + }
17653 + kmem_cache_free(pgd_cache, pgd);
17654 + return NULL;
17655 +}
17656 +
17657 +void pgd_free(pgd_t *pgd)
17658 +{
17659 + int i;
17660 +
17661 + /*
17662 + * After this the pgd should not be pinned for the duration of this
17663 + * function's execution. We should never sleep and thus never race:
17664 + * 1. User pmds will not become write-protected under our feet due
17665 + * to a concurrent mm_pin_all().
17666 + * 2. The machine addresses in PGD entries will not become invalid
17667 + * due to a concurrent save/restore.
17668 + */
17669 + pgd_test_and_unpin(pgd);
17670 +
17671 + /* in the PAE case user pgd entries are overwritten before usage */
17672 + if (PTRS_PER_PMD > 1) {
17673 + for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
17674 + pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
17675 + kmem_cache_free(pmd_cache, pmd);
17676 + }
17677 +
17678 + if (!HAVE_SHARED_KERNEL_PMD) {
17679 + unsigned long flags;
17680 + spin_lock_irqsave(&pgd_lock, flags);
17681 + pgd_list_del(pgd);
17682 + spin_unlock_irqrestore(&pgd_lock, flags);
17683 +
17684 + for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
17685 + pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
17686 + make_lowmem_page_writable(
17687 + pmd, XENFEAT_writable_page_tables);
17688 + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
17689 + kmem_cache_free(pmd_cache, pmd);
17690 + }
17691 +
17692 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
17693 + xen_destroy_contiguous_region(
17694 + (unsigned long)pgd, 0);
17695 + }
17696 + }
17697 +
17698 + /* in the non-PAE case, free_pgtables() clears user pgd entries */
17699 + kmem_cache_free(pgd_cache, pgd);
17700 +}
17701 +
17702 +void make_lowmem_page_readonly(void *va, unsigned int feature)
17703 +{
17704 + pte_t *pte;
17705 + int rc;
17706 +
17707 + if (xen_feature(feature))
17708 + return;
17709 +
17710 + pte = virt_to_ptep(va);
17711 + rc = HYPERVISOR_update_va_mapping(
17712 + (unsigned long)va, pte_wrprotect(*pte), 0);
17713 + BUG_ON(rc);
17714 +}
17715 +
17716 +void make_lowmem_page_writable(void *va, unsigned int feature)
17717 +{
17718 + pte_t *pte;
17719 + int rc;
17720 +
17721 + if (xen_feature(feature))
17722 + return;
17723 +
17724 + pte = virt_to_ptep(va);
17725 + rc = HYPERVISOR_update_va_mapping(
17726 + (unsigned long)va, pte_mkwrite(*pte), 0);
17727 + BUG_ON(rc);
17728 +}
17729 +
17730 +void make_page_readonly(void *va, unsigned int feature)
17731 +{
17732 + pte_t *pte;
17733 + int rc;
17734 +
17735 + if (xen_feature(feature))
17736 + return;
17737 +
17738 + pte = virt_to_ptep(va);
17739 + rc = HYPERVISOR_update_va_mapping(
17740 + (unsigned long)va, pte_wrprotect(*pte), 0);
17741 + if (rc) /* fallback? */
17742 + xen_l1_entry_update(pte, pte_wrprotect(*pte));
17743 + if ((unsigned long)va >= (unsigned long)high_memory) {
17744 + unsigned long pfn = pte_pfn(*pte);
17745 +#ifdef CONFIG_HIGHMEM
17746 + if (pfn >= highstart_pfn)
17747 + kmap_flush_unused(); /* flush stale writable kmaps */
17748 + else
17749 +#endif
17750 + make_lowmem_page_readonly(
17751 + phys_to_virt(pfn << PAGE_SHIFT), feature);
17752 + }
17753 +}
17754 +
17755 +void make_page_writable(void *va, unsigned int feature)
17756 +{
17757 + pte_t *pte;
17758 + int rc;
17759 +
17760 + if (xen_feature(feature))
17761 + return;
17762 +
17763 + pte = virt_to_ptep(va);
17764 + rc = HYPERVISOR_update_va_mapping(
17765 + (unsigned long)va, pte_mkwrite(*pte), 0);
17766 + if (rc) /* fallback? */
17767 + xen_l1_entry_update(pte, pte_mkwrite(*pte));
17768 + if ((unsigned long)va >= (unsigned long)high_memory) {
17769 + unsigned long pfn = pte_pfn(*pte);
17770 +#ifdef CONFIG_HIGHMEM
17771 + if (pfn < highstart_pfn)
17772 +#endif
17773 + make_lowmem_page_writable(
17774 + phys_to_virt(pfn << PAGE_SHIFT), feature);
17775 + }
17776 +}
17777 +
17778 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
17779 +{
17780 + if (xen_feature(feature))
17781 + return;
17782 +
17783 + while (nr-- != 0) {
17784 + make_page_readonly(va, feature);
17785 + va = (void *)((unsigned long)va + PAGE_SIZE);
17786 + }
17787 +}
17788 +
17789 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
17790 +{
17791 + if (xen_feature(feature))
17792 + return;
17793 +
17794 + while (nr-- != 0) {
17795 + make_page_writable(va, feature);
17796 + va = (void *)((unsigned long)va + PAGE_SIZE);
17797 + }
17798 +}
17799 +
17800 +static void _pin_lock(struct mm_struct *mm, int lock) {
17801 + if (lock)
17802 + spin_lock(&mm->page_table_lock);
17803 +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
17804 + /* While mm->page_table_lock protects us against insertions and
17805 + * removals of higher level page table pages, it doesn't protect
17806 +	 * against updates of PTEs. Such updates, however, require the
17807 +	 * pte pages to be in a consistent state (unpinned+writable or
17808 +	 * pinned+readonly). The pinning and attribute changes, however,
17809 +	 * cannot be done atomically, which is why such updates must be
17810 +	 * prevented from happening concurrently.
17811 +	 * Note that no pte lock can ever be acquired elsewhere nesting
17812 +	 * with an already acquired one in the same mm, or with the mm's
17813 +	 * page_table_lock already held, as that would break in the
17814 +	 * non-split case (where all of these actually resolve to the
17815 +	 * one page_table_lock). Thus acquiring all of them here is not
17816 +	 * going to result in deadlocks, and the order of acquires
17817 + * doesn't matter.
17818 + */
17819 + {
17820 + pgd_t *pgd = mm->pgd;
17821 + unsigned g;
17822 +
17823 + for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
17824 + pud_t *pud;
17825 + unsigned u;
17826 +
17827 + if (pgd_none(*pgd))
17828 + continue;
17829 + pud = pud_offset(pgd, 0);
17830 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
17831 + pmd_t *pmd;
17832 + unsigned m;
17833 +
17834 + if (pud_none(*pud))
17835 + continue;
17836 + pmd = pmd_offset(pud, 0);
17837 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
17838 + spinlock_t *ptl;
17839 +
17840 + if (pmd_none(*pmd))
17841 + continue;
17842 + ptl = pte_lockptr(0, pmd);
17843 + if (lock)
17844 + spin_lock(ptl);
17845 + else
17846 + spin_unlock(ptl);
17847 + }
17848 + }
17849 + }
17850 + }
17851 +#endif
17852 + if (!lock)
17853 + spin_unlock(&mm->page_table_lock);
17854 +}
17855 +#define pin_lock(mm) _pin_lock(mm, 1)
17856 +#define pin_unlock(mm) _pin_lock(mm, 0)
17857 +
17858 +#define PIN_BATCH 4
17859 +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
17860 +
17861 +static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
17862 + unsigned int cpu, unsigned seq)
17863 +{
17864 + unsigned long pfn = page_to_pfn(page);
17865 +
17866 + if (PageHighMem(page)) {
17867 + if (pgprot_val(flags) & _PAGE_RW)
17868 + clear_bit(PG_pinned, &page->flags);
17869 + else
17870 + set_bit(PG_pinned, &page->flags);
17871 + } else {
17872 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
17873 + (unsigned long)__va(pfn << PAGE_SHIFT),
17874 + pfn_pte(pfn, flags), 0);
17875 + if (unlikely(++seq == PIN_BATCH)) {
17876 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
17877 + PIN_BATCH, NULL)))
17878 + BUG();
17879 + seq = 0;
17880 + }
17881 + }
17882 +
17883 + return seq;
17884 +}
17885 +
17886 +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
17887 +{
17888 + pgd_t *pgd = pgd_base;
17889 + pud_t *pud;
17890 + pmd_t *pmd;
17891 + int g, u, m;
17892 + unsigned int cpu, seq;
17893 +
17894 + if (xen_feature(XENFEAT_auto_translated_physmap))
17895 + return;
17896 +
17897 + cpu = get_cpu();
17898 +
17899 + for (g = 0, seq = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
17900 + if (pgd_none(*pgd))
17901 + continue;
17902 + pud = pud_offset(pgd, 0);
17903 + if (PTRS_PER_PUD > 1) /* not folded */
17904 + seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
17905 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
17906 + if (pud_none(*pud))
17907 + continue;
17908 + pmd = pmd_offset(pud, 0);
17909 + if (PTRS_PER_PMD > 1) /* not folded */
17910 + seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
17911 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
17912 + if (pmd_none(*pmd))
17913 + continue;
17914 + seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
17915 + }
17916 + }
17917 + }
17918 +
17919 + if (likely(seq != 0)) {
17920 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
17921 + (unsigned long)pgd_base,
17922 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
17923 + UVMF_TLB_FLUSH);
17924 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
17925 + seq + 1, NULL)))
17926 + BUG();
17927 + } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
17928 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
17929 + UVMF_TLB_FLUSH))
17930 + BUG();
17931 +
17932 + put_cpu();
17933 +}
17934 +
17935 +static void __pgd_pin(pgd_t *pgd)
17936 +{
17937 + pgd_walk(pgd, PAGE_KERNEL_RO);
17938 + kmap_flush_unused();
17939 + xen_pgd_pin(__pa(pgd));
17940 + set_bit(PG_pinned, &virt_to_page(pgd)->flags);
17941 +}
17942 +
17943 +static void __pgd_unpin(pgd_t *pgd)
17944 +{
17945 + xen_pgd_unpin(__pa(pgd));
17946 + pgd_walk(pgd, PAGE_KERNEL);
17947 + clear_bit(PG_pinned, &virt_to_page(pgd)->flags);
17948 +}
17949 +
17950 +static void pgd_test_and_unpin(pgd_t *pgd)
17951 +{
17952 + if (test_bit(PG_pinned, &virt_to_page(pgd)->flags))
17953 + __pgd_unpin(pgd);
17954 +}
17955 +
17956 +void mm_pin(struct mm_struct *mm)
17957 +{
17958 + if (xen_feature(XENFEAT_writable_page_tables))
17959 + return;
17960 + pin_lock(mm);
17961 + __pgd_pin(mm->pgd);
17962 + pin_unlock(mm);
17963 +}
17964 +
17965 +void mm_unpin(struct mm_struct *mm)
17966 +{
17967 + if (xen_feature(XENFEAT_writable_page_tables))
17968 + return;
17969 + pin_lock(mm);
17970 + __pgd_unpin(mm->pgd);
17971 + pin_unlock(mm);
17972 +}
17973 +
17974 +void mm_pin_all(void)
17975 +{
17976 + struct page *page;
17977 + unsigned long flags;
17978 +
17979 + if (xen_feature(XENFEAT_writable_page_tables))
17980 + return;
17981 +
17982 + /*
17983 + * Allow uninterrupted access to the pgd_list. Also protects
17984 + * __pgd_pin() by disabling preemption.
17985 + * All other CPUs must be at a safe point (e.g., in stop_machine
17986 + * or offlined entirely).
17987 + */
17988 + spin_lock_irqsave(&pgd_lock, flags);
17989 + for (page = pgd_list; page; page = (struct page *)page->index) {
17990 + if (!test_bit(PG_pinned, &page->flags))
17991 + __pgd_pin((pgd_t *)page_address(page));
17992 + }
17993 + spin_unlock_irqrestore(&pgd_lock, flags);
17994 +}
17995 +
17996 +void _arch_dup_mmap(struct mm_struct *mm)
17997 +{
17998 + if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags))
17999 + mm_pin(mm);
18000 +}
18001 +
18002 +void _arch_exit_mmap(struct mm_struct *mm)
18003 +{
18004 + struct task_struct *tsk = current;
18005 +
18006 + task_lock(tsk);
18007 +
18008 + /*
18009 + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
18010 + * *much* faster this way, as no tlb flushes means bigger wrpt batches.
18011 + */
18012 + if (tsk->active_mm == mm) {
18013 + tsk->active_mm = &init_mm;
18014 + atomic_inc(&init_mm.mm_count);
18015 +
18016 + switch_mm(mm, &init_mm, tsk);
18017 +
18018 + atomic_dec(&mm->mm_count);
18019 + BUG_ON(atomic_read(&mm->mm_count) == 0);
18020 + }
18021 +
18022 + task_unlock(tsk);
18023 +
18024 + if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) &&
18025 + (atomic_read(&mm->mm_count) == 1) &&
18026 + !mm->context.has_foreign_mappings)
18027 + mm_unpin(mm);
18028 +}
18029 Index: head-2008-11-25/arch/x86/oprofile/xenoprof.c
18030 ===================================================================
18031 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
18032 +++ head-2008-11-25/arch/x86/oprofile/xenoprof.c 2008-01-28 12:24:19.000000000 +0100
18033 @@ -0,0 +1,179 @@
18034 +/**
18035 + * @file xenoprof.c
18036 + *
18037 + * @remark Copyright 2002 OProfile authors
18038 + * @remark Read the file COPYING
18039 + *
18040 + * @author John Levon <levon@movementarian.org>
18041 + *
18042 + * Modified by Aravind Menon and Jose Renato Santos for Xen
18043 + * These modifications are:
18044 + * Copyright (C) 2005 Hewlett-Packard Co.
18045 + *
18046 + * x86-specific part
18047 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
18048 + * VA Linux Systems Japan K.K.
18049 + */
18050 +
18051 +#include <linux/init.h>
18052 +#include <linux/oprofile.h>
18053 +#include <linux/sched.h>
18054 +#include <asm/pgtable.h>
18055 +
18056 +#include <xen/driver_util.h>
18057 +#include <xen/interface/xen.h>
18058 +#include <xen/interface/xenoprof.h>
18059 +#include <xen/xenoprof.h>
18060 +#include "op_counter.h"
18061 +
18062 +static unsigned int num_events = 0;
18063 +
18064 +void __init xenoprof_arch_init_counter(struct xenoprof_init *init)
18065 +{
18066 + num_events = init->num_events;
18067 + /* just in case - make sure we do not overflow event list
18068 + (i.e. counter_config list) */
18069 + if (num_events > OP_MAX_COUNTER) {
18070 + num_events = OP_MAX_COUNTER;
18071 + init->num_events = num_events;
18072 + }
18073 +}
18074 +
18075 +void xenoprof_arch_counter(void)
18076 +{
18077 + int i;
18078 + struct xenoprof_counter counter;
18079 +
18080 + for (i=0; i<num_events; i++) {
18081 + counter.ind = i;
18082 + counter.count = (uint64_t)counter_config[i].count;
18083 + counter.enabled = (uint32_t)counter_config[i].enabled;
18084 + counter.event = (uint32_t)counter_config[i].event;
18085 + counter.kernel = (uint32_t)counter_config[i].kernel;
18086 + counter.user = (uint32_t)counter_config[i].user;
18087 + counter.unit_mask = (uint64_t)counter_config[i].unit_mask;
18088 + WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_counter,
18089 + &counter));
18090 + }
18091 +}
18092 +
18093 +void xenoprof_arch_start(void)
18094 +{
18095 + /* nothing */
18096 +}
18097 +
18098 +void xenoprof_arch_stop(void)
18099 +{
18100 + /* nothing */
18101 +}
18102 +
18103 +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer * sbuf)
18104 +{
18105 + if (sbuf->buffer) {
18106 + vunmap(sbuf->buffer);
18107 + sbuf->buffer = NULL;
18108 + }
18109 +}
18110 +
18111 +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer * get_buffer,
18112 + struct xenoprof_shared_buffer * sbuf)
18113 +{
18114 + int npages, ret;
18115 + struct vm_struct *area;
18116 +
18117 + sbuf->buffer = NULL;
18118 + if ( (ret = HYPERVISOR_xenoprof_op(XENOPROF_get_buffer, get_buffer)) )
18119 + return ret;
18120 +
18121 + npages = (get_buffer->bufsize * get_buffer->nbuf - 1) / PAGE_SIZE + 1;
18122 +
18123 + area = alloc_vm_area(npages * PAGE_SIZE);
18124 + if (area == NULL)
18125 + return -ENOMEM;
18126 +
18127 + if ( (ret = direct_kernel_remap_pfn_range(
18128 + (unsigned long)area->addr,
18129 + get_buffer->buf_gmaddr >> PAGE_SHIFT,
18130 + npages * PAGE_SIZE, __pgprot(_KERNPG_TABLE),
18131 + DOMID_SELF)) ) {
18132 + vunmap(area->addr);
18133 + return ret;
18134 + }
18135 +
18136 + sbuf->buffer = area->addr;
18137 + return ret;
18138 +}
18139 +
18140 +int xenoprof_arch_set_passive(struct xenoprof_passive * pdomain,
18141 + struct xenoprof_shared_buffer * sbuf)
18142 +{
18143 + int ret;
18144 + int npages;
18145 + struct vm_struct *area;
18146 + pgprot_t prot = __pgprot(_KERNPG_TABLE);
18147 +
18148 + sbuf->buffer = NULL;
18149 + ret = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, pdomain);
18150 + if (ret)
18151 + goto out;
18152 +
18153 + npages = (pdomain->bufsize * pdomain->nbuf - 1) / PAGE_SIZE + 1;
18154 +
18155 + area = alloc_vm_area(npages * PAGE_SIZE);
18156 + if (area == NULL) {
18157 + ret = -ENOMEM;
18158 + goto out;
18159 + }
18160 +
18161 + ret = direct_kernel_remap_pfn_range(
18162 + (unsigned long)area->addr,
18163 + pdomain->buf_gmaddr >> PAGE_SHIFT,
18164 + npages * PAGE_SIZE, prot, DOMID_SELF);
18165 + if (ret) {
18166 + vunmap(area->addr);
18167 + goto out;
18168 + }
18169 + sbuf->buffer = area->addr;
18170 +
18171 +out:
18172 + return ret;
18173 +}
18174 +
18175 +struct op_counter_config counter_config[OP_MAX_COUNTER];
18176 +
18177 +int xenoprof_create_files(struct super_block * sb, struct dentry * root)
18178 +{
18179 + unsigned int i;
18180 +
18181 + for (i = 0; i < num_events; ++i) {
18182 + struct dentry * dir;
18183 + char buf[2];
18184 +
18185 + snprintf(buf, 2, "%d", i);
18186 + dir = oprofilefs_mkdir(sb, root, buf);
18187 + oprofilefs_create_ulong(sb, dir, "enabled",
18188 + &counter_config[i].enabled);
18189 + oprofilefs_create_ulong(sb, dir, "event",
18190 + &counter_config[i].event);
18191 + oprofilefs_create_ulong(sb, dir, "count",
18192 + &counter_config[i].count);
18193 + oprofilefs_create_ulong(sb, dir, "unit_mask",
18194 + &counter_config[i].unit_mask);
18195 + oprofilefs_create_ulong(sb, dir, "kernel",
18196 + &counter_config[i].kernel);
18197 + oprofilefs_create_ulong(sb, dir, "user",
18198 + &counter_config[i].user);
18199 + }
18200 +
18201 + return 0;
18202 +}
18203 +
18204 +int __init oprofile_arch_init(struct oprofile_operations * ops)
18205 +{
18206 + return xenoprofile_init(ops);
18207 +}
18208 +
18209 +void oprofile_arch_exit(void)
18210 +{
18211 + xenoprofile_exit();
18212 +}
18213 Index: head-2008-11-25/arch/x86/pci/irq-xen.c
18214 ===================================================================
18215 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
18216 +++ head-2008-11-25/arch/x86/pci/irq-xen.c 2008-03-06 08:54:32.000000000 +0100
18217 @@ -0,0 +1,1211 @@
18218 +/*
18219 + * Low-Level PCI Support for PC -- Routing of Interrupts
18220 + *
18221 + * (c) 1999--2000 Martin Mares <mj@ucw.cz>
18222 + */
18223 +
18224 +#include <linux/types.h>
18225 +#include <linux/kernel.h>
18226 +#include <linux/pci.h>
18227 +#include <linux/init.h>
18228 +#include <linux/slab.h>
18229 +#include <linux/interrupt.h>
18230 +#include <linux/dmi.h>
18231 +#include <asm/io.h>
18232 +#include <asm/smp.h>
18233 +#include <asm/io_apic.h>
18234 +#include <linux/irq.h>
18235 +#include <linux/acpi.h>
18236 +
18237 +#include "pci.h"
18238 +
18239 +#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
18240 +#define PIRQ_VERSION 0x0100
18241 +
18242 +static int broken_hp_bios_irq9;
18243 +static int acer_tm360_irqrouting;
18244 +
18245 +static struct irq_routing_table *pirq_table;
18246 +
18247 +static int pirq_enable_irq(struct pci_dev *dev);
18248 +
18249 +/*
18250 + * Never use: 0, 1, 2 (timer, keyboard, and cascade)
18251 + * Avoid using: 13, 14 and 15 (FP error and IDE).
18252 + * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse)
18253 + */
18254 +unsigned int pcibios_irq_mask = 0xfff8;
18255 +
18256 +static int pirq_penalty[16] = {
18257 + 1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
18258 + 0, 0, 0, 0, 1000, 100000, 100000, 100000
18259 +};
18260 +
18261 +struct irq_router {
18262 + char *name;
18263 + u16 vendor, device;
18264 + int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
18265 + int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
18266 +};
18267 +
18268 +struct irq_router_handler {
18269 + u16 vendor;
18270 + int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
18271 +};
18272 +
18273 +int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL;
18274 +void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
18275 +
18276 +/*
18277 + * Check passed address for the PCI IRQ Routing Table signature
18278 + * and perform checksum verification.
18279 + */
18280 +
18281 +static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
18282 +{
18283 + struct irq_routing_table *rt;
18284 + int i;
18285 + u8 sum;
18286 +
18287 + rt = (struct irq_routing_table *) addr;
18288 + if (rt->signature != PIRQ_SIGNATURE ||
18289 + rt->version != PIRQ_VERSION ||
18290 + rt->size % 16 ||
18291 + rt->size < sizeof(struct irq_routing_table))
18292 + return NULL;
18293 + sum = 0;
18294 + for (i=0; i < rt->size; i++)
18295 + sum += addr[i];
18296 + if (!sum) {
18297 + DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
18298 + return rt;
18299 + }
18300 + return NULL;
18301 +}
18302 +
18303 +
18304 +
18305 +/*
18306 + * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
18307 + */
18308 +
18309 +static struct irq_routing_table * __init pirq_find_routing_table(void)
18310 +{
18311 + u8 *addr;
18312 + struct irq_routing_table *rt;
18313 +
18314 +#ifdef CONFIG_XEN
18315 + if (!is_initial_xendomain())
18316 + return NULL;
18317 +#endif
18318 + if (pirq_table_addr) {
18319 + rt = pirq_check_routing_table((u8 *) isa_bus_to_virt(pirq_table_addr));
18320 + if (rt)
18321 + return rt;
18322 + printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
18323 + }
18324 + for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
18325 + rt = pirq_check_routing_table(addr);
18326 + if (rt)
18327 + return rt;
18328 + }
18329 + return NULL;
18330 +}
18331 +
18332 +/*
18333 + * If we have a IRQ routing table, use it to search for peer host
18334 + * bridges. It's a gross hack, but since there are no other known
18335 + * ways how to get a list of buses, we have to go this way.
18336 + */
18337 +
18338 +static void __init pirq_peer_trick(void)
18339 +{
18340 + struct irq_routing_table *rt = pirq_table;
18341 + u8 busmap[256];
18342 + int i;
18343 + struct irq_info *e;
18344 +
18345 + memset(busmap, 0, sizeof(busmap));
18346 + for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
18347 + e = &rt->slots[i];
18348 +#ifdef DEBUG
18349 + {
18350 + int j;
18351 + DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
18352 + for(j=0; j<4; j++)
18353 + DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
18354 + DBG("\n");
18355 + }
18356 +#endif
18357 + busmap[e->bus] = 1;
18358 + }
18359 + for(i = 1; i < 256; i++) {
18360 + if (!busmap[i] || pci_find_bus(0, i))
18361 + continue;
18362 + if (pci_scan_bus(i, &pci_root_ops, NULL))
18363 + printk(KERN_INFO "PCI: Discovered primary peer bus %02x [IRQ]\n", i);
18364 + }
18365 + pcibios_last_bus = -1;
18366 +}
18367 +
18368 +/*
18369 + * Code for querying and setting of IRQ routes on various interrupt routers.
18370 + */
18371 +
18372 +void eisa_set_level_irq(unsigned int irq)
18373 +{
18374 + unsigned char mask = 1 << (irq & 7);
18375 + unsigned int port = 0x4d0 + (irq >> 3);
18376 + unsigned char val;
18377 + static u16 eisa_irq_mask;
18378 +
18379 + if (irq >= 16 || (1 << irq) & eisa_irq_mask)
18380 + return;
18381 +
18382 + eisa_irq_mask |= (1 << irq);
18383 + printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
18384 + val = inb(port);
18385 + if (!(val & mask)) {
18386 + DBG(KERN_DEBUG " -> edge");
18387 + outb(val | mask, port);
18388 + }
18389 +}
18390 +
18391 +/*
18392 + * Common IRQ routing practice: nybbles in config space,
18393 + * offset by some magic constant.
18394 + */
18395 +static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr)
18396 +{
18397 + u8 x;
18398 + unsigned reg = offset + (nr >> 1);
18399 +
18400 + pci_read_config_byte(router, reg, &x);
18401 + return (nr & 1) ? (x >> 4) : (x & 0xf);
18402 +}
18403 +
18404 +static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
18405 +{
18406 + u8 x;
18407 + unsigned reg = offset + (nr >> 1);
18408 +
18409 + pci_read_config_byte(router, reg, &x);
18410 + x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
18411 + pci_write_config_byte(router, reg, x);
18412 +}
18413 +
18414 +/*
18415 + * ALI pirq entries are damn ugly, and completely undocumented.
18416 + * This has been figured out from pirq tables, and it's not a pretty
18417 + * picture.
18418 + */
18419 +static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18420 +{
18421 + static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
18422 +
18423 + return irqmap[read_config_nybble(router, 0x48, pirq-1)];
18424 +}
18425 +
18426 +static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18427 +{
18428 + static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
18429 + unsigned int val = irqmap[irq];
18430 +
18431 + if (val) {
18432 + write_config_nybble(router, 0x48, pirq-1, val);
18433 + return 1;
18434 + }
18435 + return 0;
18436 +}
18437 +
18438 +/*
18439 + * The Intel PIIX4 pirq rules are fairly simple: "pirq" is
18440 + * just a pointer to the config space.
18441 + */
18442 +static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18443 +{
18444 + u8 x;
18445 +
18446 + pci_read_config_byte(router, pirq, &x);
18447 + return (x < 16) ? x : 0;
18448 +}
18449 +
18450 +static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18451 +{
18452 + pci_write_config_byte(router, pirq, irq);
18453 + return 1;
18454 +}
18455 +
18456 +/*
18457 + * The VIA pirq rules are nibble-based, like ALI,
18458 + * but without the ugly irq number munging.
18459 + * However, PIRQD is in the upper instead of lower 4 bits.
18460 + */
18461 +static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18462 +{
18463 + return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq);
18464 +}
18465 +
18466 +static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18467 +{
18468 + write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq);
18469 + return 1;
18470 +}
18471 +
18472 +/*
18473 + * The VIA pirq rules are nibble-based, like ALI,
18474 + * but without the ugly irq number munging.
18475 + * However, for 82C586, nibble map is different .
18476 + */
18477 +static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18478 +{
18479 + static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18480 + return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
18481 +}
18482 +
18483 +static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18484 +{
18485 + static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18486 + write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
18487 + return 1;
18488 +}
18489 +
18490 +/*
18491 + * ITE 8330G pirq rules are nibble-based
18492 + * FIXME: pirqmap may be { 1, 0, 3, 2 },
18493 + * 2+3 are both mapped to irq 9 on my system
18494 + */
18495 +static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18496 +{
18497 + static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
18498 + return read_config_nybble(router,0x43, pirqmap[pirq-1]);
18499 +}
18500 +
18501 +static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18502 +{
18503 + static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
18504 + write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
18505 + return 1;
18506 +}
18507 +
18508 +/*
18509 + * OPTI: high four bits are nibble pointer..
18510 + * I wonder what the low bits do?
18511 + */
18512 +static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18513 +{
18514 + return read_config_nybble(router, 0xb8, pirq >> 4);
18515 +}
18516 +
18517 +static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18518 +{
18519 + write_config_nybble(router, 0xb8, pirq >> 4, irq);
18520 + return 1;
18521 +}
18522 +
18523 +/*
18524 + * Cyrix: nibble offset 0x5C
18525 + * 0x5C bits 7:4 is INTB bits 3:0 is INTA
18526 + * 0x5D bits 7:4 is INTD bits 3:0 is INTC
18527 + */
18528 +static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18529 +{
18530 + return read_config_nybble(router, 0x5C, (pirq-1)^1);
18531 +}
18532 +
18533 +static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18534 +{
18535 + write_config_nybble(router, 0x5C, (pirq-1)^1, irq);
18536 + return 1;
18537 +}
18538 +
18539 +/*
18540 + * PIRQ routing for SiS 85C503 router used in several SiS chipsets.
18541 + * We have to deal with the following issues here:
18542 + * - vendors have different ideas about the meaning of link values
18543 + * - some onboard devices (integrated in the chipset) have special
18544 + * links and are thus routed differently (i.e. not via PCI INTA-INTD)
18545 + * - different revision of the router have a different layout for
18546 + * the routing registers, particularly for the onchip devices
18547 + *
18548 + * For all routing registers the common thing is we have one byte
18549 + * per routeable link which is defined as:
18550 + * bit 7 IRQ mapping enabled (0) or disabled (1)
18551 + * bits [6:4] reserved (sometimes used for onchip devices)
18552 + * bits [3:0] IRQ to map to
18553 + * allowed: 3-7, 9-12, 14-15
18554 + * reserved: 0, 1, 2, 8, 13
18555 + *
18556 + * The config-space registers located at 0x41/0x42/0x43/0x44 are
18557 + * always used to route the normal PCI INT A/B/C/D respectively.
18558 + * Apparently there are systems implementing PCI routing table using
18559 + * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
18560 + * We try our best to handle both link mappings.
18561 + *
18562 + * Currently (2003-05-21) it appears most SiS chipsets follow the
18563 + * definition of routing registers from the SiS-5595 southbridge.
18564 + * According to the SiS 5595 datasheets the revision id's of the
18565 + * router (ISA-bridge) should be 0x01 or 0xb0.
18566 + *
18567 + * Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1.
18568 + * Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets.
18569 + * They seem to work with the current routing code. However there is
18570 + * some concern because of the two USB-OHCI HCs (original SiS 5595
18571 + * had only one). YMMV.
18572 + *
18573 + * Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1:
18574 + *
18575 + * 0x61: IDEIRQ:
18576 + * bits [6:5] must be written 01
18577 + * bit 4 channel-select primary (0), secondary (1)
18578 + *
18579 + * 0x62: USBIRQ:
18580 + * bit 6 OHCI function disabled (0), enabled (1)
18581 + *
18582 + * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
18583 + *
18584 + * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
18585 + *
18586 + * We support USBIRQ (in addition to INTA-INTD) and keep the
18587 + * IDE, ACPI and DAQ routing untouched as set by the BIOS.
18588 + *
18589 + * Currently the only reported exception is the new SiS 65x chipset
18590 + * which includes the SiS 69x southbridge. Here we have the 85C503
18591 + * router revision 0x04 and there are changes in the register layout
18592 + * mostly related to the different USB HCs with USB 2.0 support.
18593 + *
18594 + * Onchip routing for router rev-id 0x04 (try-and-error observation)
18595 + *
18596 + * 0x60/0x61/0x62/0x63: 1xEHCI and 3xOHCI (companion) USB-HCs
18597 + * bit 6-4 are probably unused, not like 5595
18598 + */
18599 +
18600 +#define PIRQ_SIS_IRQ_MASK 0x0f
18601 +#define PIRQ_SIS_IRQ_DISABLE 0x80
18602 +#define PIRQ_SIS_USB_ENABLE 0x40
18603 +
18604 +static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18605 +{
18606 + u8 x;
18607 + int reg;
18608 +
18609 + reg = pirq;
18610 + if (reg >= 0x01 && reg <= 0x04)
18611 + reg += 0x40;
18612 + pci_read_config_byte(router, reg, &x);
18613 + return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK);
18614 +}
18615 +
18616 +static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18617 +{
18618 + u8 x;
18619 + int reg;
18620 +
18621 + reg = pirq;
18622 + if (reg >= 0x01 && reg <= 0x04)
18623 + reg += 0x40;
18624 + pci_read_config_byte(router, reg, &x);
18625 + x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
18626 + x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE;
18627 + pci_write_config_byte(router, reg, x);
18628 + return 1;
18629 +}
18630 +
18631 +
18632 +/*
18633 + * VLSI: nibble offset 0x74 - educated guess due to routing table and
18634 + * config space of VLSI 82C534 PCI-bridge/router (1004:0102)
18635 + * Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard
18636 + * devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6
18637 + * for the busbridge to the docking station.
18638 + */
18639 +
18640 +static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18641 +{
18642 + if (pirq > 8) {
18643 + printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
18644 + return 0;
18645 + }
18646 + return read_config_nybble(router, 0x74, pirq-1);
18647 +}
18648 +
18649 +static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18650 +{
18651 + if (pirq > 8) {
18652 + printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
18653 + return 0;
18654 + }
18655 + write_config_nybble(router, 0x74, pirq-1, irq);
18656 + return 1;
18657 +}
18658 +
18659 +/*
18660 + * ServerWorks: PCI interrupts mapped to system IRQ lines through Index
18661 + * and Redirect I/O registers (0x0c00 and 0x0c01). The Index register
18662 + * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a. The Redirect
18663 + * register is a straight binary coding of desired PIC IRQ (low nibble).
18664 + *
18665 + * The 'link' value in the PIRQ table is already in the correct format
18666 + * for the Index register. There are some special index values:
18667 + * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1,
18668 + * and 0x03 for SMBus.
18669 + */
18670 +static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18671 +{
18672 + outb_p(pirq, 0xc00);
18673 + return inb(0xc01) & 0xf;
18674 +}
18675 +
18676 +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18677 +{
18678 + outb_p(pirq, 0xc00);
18679 + outb_p(irq, 0xc01);
18680 + return 1;
18681 +}
18682 +
18683 +/* Support for AMD756 PCI IRQ Routing
18684 + * Jhon H. Caicedo <jhcaiced@osso.org.co>
18685 + * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced)
18686 + * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced)
18687 + * The AMD756 pirq rules are nibble-based
18688 + * offset 0x56 0-3 PIRQA 4-7 PIRQB
18689 + * offset 0x57 0-3 PIRQC 4-7 PIRQD
18690 + */
18691 +static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18692 +{
18693 + u8 irq;
18694 + irq = 0;
18695 + if (pirq <= 4)
18696 + {
18697 + irq = read_config_nybble(router, 0x56, pirq - 1);
18698 + }
18699 + printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
18700 + dev->vendor, dev->device, pirq, irq);
18701 + return irq;
18702 +}
18703 +
18704 +static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18705 +{
18706 + printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
18707 + dev->vendor, dev->device, pirq, irq);
18708 + if (pirq <= 4)
18709 + {
18710 + write_config_nybble(router, 0x56, pirq - 1, irq);
18711 + }
18712 + return 1;
18713 +}
18714 +
18715 +#ifdef CONFIG_PCI_BIOS
18716 +
18717 +static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18718 +{
18719 + struct pci_dev *bridge;
18720 + int pin = pci_get_interrupt_pin(dev, &bridge);
18721 + return pcibios_set_irq_routing(bridge, pin, irq);
18722 +}
18723 +
18724 +#endif
18725 +
18726 +static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18727 +{
18728 + static struct pci_device_id __initdata pirq_440gx[] = {
18729 + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
18730 + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
18731 + { },
18732 + };
18733 +
18734 + /* 440GX has a proprietary PIRQ router -- don't use it */
18735 + if (pci_dev_present(pirq_440gx))
18736 + return 0;
18737 +
18738 + switch(device)
18739 + {
18740 + case PCI_DEVICE_ID_INTEL_82371FB_0:
18741 + case PCI_DEVICE_ID_INTEL_82371SB_0:
18742 + case PCI_DEVICE_ID_INTEL_82371AB_0:
18743 + case PCI_DEVICE_ID_INTEL_82371MX:
18744 + case PCI_DEVICE_ID_INTEL_82443MX_0:
18745 + case PCI_DEVICE_ID_INTEL_82801AA_0:
18746 + case PCI_DEVICE_ID_INTEL_82801AB_0:
18747 + case PCI_DEVICE_ID_INTEL_82801BA_0:
18748 + case PCI_DEVICE_ID_INTEL_82801BA_10:
18749 + case PCI_DEVICE_ID_INTEL_82801CA_0:
18750 + case PCI_DEVICE_ID_INTEL_82801CA_12:
18751 + case PCI_DEVICE_ID_INTEL_82801DB_0:
18752 + case PCI_DEVICE_ID_INTEL_82801E_0:
18753 + case PCI_DEVICE_ID_INTEL_82801EB_0:
18754 + case PCI_DEVICE_ID_INTEL_ESB_1:
18755 + case PCI_DEVICE_ID_INTEL_ICH6_0:
18756 + case PCI_DEVICE_ID_INTEL_ICH6_1:
18757 + case PCI_DEVICE_ID_INTEL_ICH7_0:
18758 + case PCI_DEVICE_ID_INTEL_ICH7_1:
18759 + case PCI_DEVICE_ID_INTEL_ICH7_30:
18760 + case PCI_DEVICE_ID_INTEL_ICH7_31:
18761 + case PCI_DEVICE_ID_INTEL_ESB2_0:
18762 + case PCI_DEVICE_ID_INTEL_ICH8_0:
18763 + case PCI_DEVICE_ID_INTEL_ICH8_1:
18764 + case PCI_DEVICE_ID_INTEL_ICH8_2:
18765 + case PCI_DEVICE_ID_INTEL_ICH8_3:
18766 + case PCI_DEVICE_ID_INTEL_ICH8_4:
18767 + case PCI_DEVICE_ID_INTEL_ICH9_0:
18768 + case PCI_DEVICE_ID_INTEL_ICH9_1:
18769 + case PCI_DEVICE_ID_INTEL_ICH9_2:
18770 + case PCI_DEVICE_ID_INTEL_ICH9_3:
18771 + case PCI_DEVICE_ID_INTEL_ICH9_4:
18772 + case PCI_DEVICE_ID_INTEL_ICH9_5:
18773 + r->name = "PIIX/ICH";
18774 + r->get = pirq_piix_get;
18775 + r->set = pirq_piix_set;
18776 + return 1;
18777 + }
18778 + return 0;
18779 +}
18780 +
18781 +static __init int via_router_probe(struct irq_router *r,
18782 + struct pci_dev *router, u16 device)
18783 +{
18784 + /* FIXME: We should move some of the quirk fixup stuff here */
18785 +
18786 + /*
18787 + * work arounds for some buggy BIOSes
18788 + */
18789 + if (device == PCI_DEVICE_ID_VIA_82C586_0) {
18790 + switch(router->device) {
18791 + case PCI_DEVICE_ID_VIA_82C686:
18792 + /*
18793 + * Asus k7m bios wrongly reports 82C686A
18794 + * as 586-compatible
18795 + */
18796 + device = PCI_DEVICE_ID_VIA_82C686;
18797 + break;
18798 + case PCI_DEVICE_ID_VIA_8235:
18799 + /**
18800 + * Asus a7v-x bios wrongly reports 8235
18801 + * as 586-compatible
18802 + */
18803 + device = PCI_DEVICE_ID_VIA_8235;
18804 + break;
18805 + }
18806 + }
18807 +
18808 + switch(device) {
18809 + case PCI_DEVICE_ID_VIA_82C586_0:
18810 + r->name = "VIA";
18811 + r->get = pirq_via586_get;
18812 + r->set = pirq_via586_set;
18813 + return 1;
18814 + case PCI_DEVICE_ID_VIA_82C596:
18815 + case PCI_DEVICE_ID_VIA_82C686:
18816 + case PCI_DEVICE_ID_VIA_8231:
18817 + case PCI_DEVICE_ID_VIA_8233A:
18818 + case PCI_DEVICE_ID_VIA_8235:
18819 + case PCI_DEVICE_ID_VIA_8237:
18820 + /* FIXME: add new ones for 8233/5 */
18821 + r->name = "VIA";
18822 + r->get = pirq_via_get;
18823 + r->set = pirq_via_set;
18824 + return 1;
18825 + }
18826 + return 0;
18827 +}
18828 +
18829 +static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18830 +{
18831 + switch(device)
18832 + {
18833 + case PCI_DEVICE_ID_VLSI_82C534:
18834 + r->name = "VLSI 82C534";
18835 + r->get = pirq_vlsi_get;
18836 + r->set = pirq_vlsi_set;
18837 + return 1;
18838 + }
18839 + return 0;
18840 +}
18841 +
18842 +
18843 +static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18844 +{
18845 + switch(device)
18846 + {
18847 + case PCI_DEVICE_ID_SERVERWORKS_OSB4:
18848 + case PCI_DEVICE_ID_SERVERWORKS_CSB5:
18849 + r->name = "ServerWorks";
18850 + r->get = pirq_serverworks_get;
18851 + r->set = pirq_serverworks_set;
18852 + return 1;
18853 + }
18854 + return 0;
18855 +}
18856 +
18857 +static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18858 +{
18859 + if (device != PCI_DEVICE_ID_SI_503)
18860 + return 0;
18861 +
18862 + r->name = "SIS";
18863 + r->get = pirq_sis_get;
18864 + r->set = pirq_sis_set;
18865 + return 1;
18866 +}
18867 +
18868 +static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18869 +{
18870 + switch(device)
18871 + {
18872 + case PCI_DEVICE_ID_CYRIX_5520:
18873 + r->name = "NatSemi";
18874 + r->get = pirq_cyrix_get;
18875 + r->set = pirq_cyrix_set;
18876 + return 1;
18877 + }
18878 + return 0;
18879 +}
18880 +
18881 +static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18882 +{
18883 + switch(device)
18884 + {
18885 + case PCI_DEVICE_ID_OPTI_82C700:
18886 + r->name = "OPTI";
18887 + r->get = pirq_opti_get;
18888 + r->set = pirq_opti_set;
18889 + return 1;
18890 + }
18891 + return 0;
18892 +}
18893 +
18894 +static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18895 +{
18896 + switch(device)
18897 + {
18898 + case PCI_DEVICE_ID_ITE_IT8330G_0:
18899 + r->name = "ITE";
18900 + r->get = pirq_ite_get;
18901 + r->set = pirq_ite_set;
18902 + return 1;
18903 + }
18904 + return 0;
18905 +}
18906 +
18907 +static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18908 +{
18909 + switch(device)
18910 + {
18911 + case PCI_DEVICE_ID_AL_M1533:
18912 + case PCI_DEVICE_ID_AL_M1563:
18913 + printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
18914 + r->name = "ALI";
18915 + r->get = pirq_ali_get;
18916 + r->set = pirq_ali_set;
18917 + return 1;
18918 + }
18919 + return 0;
18920 +}
18921 +
18922 +static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
18923 +{
18924 + switch(device)
18925 + {
18926 + case PCI_DEVICE_ID_AMD_VIPER_740B:
18927 + r->name = "AMD756";
18928 + break;
18929 + case PCI_DEVICE_ID_AMD_VIPER_7413:
18930 + r->name = "AMD766";
18931 + break;
18932 + case PCI_DEVICE_ID_AMD_VIPER_7443:
18933 + r->name = "AMD768";
18934 + break;
18935 + default:
18936 + return 0;
18937 + }
18938 + r->get = pirq_amd756_get;
18939 + r->set = pirq_amd756_set;
18940 + return 1;
18941 +}
18942 +
18943 +static __initdata struct irq_router_handler pirq_routers[] = {
18944 + { PCI_VENDOR_ID_INTEL, intel_router_probe },
18945 + { PCI_VENDOR_ID_AL, ali_router_probe },
18946 + { PCI_VENDOR_ID_ITE, ite_router_probe },
18947 + { PCI_VENDOR_ID_VIA, via_router_probe },
18948 + { PCI_VENDOR_ID_OPTI, opti_router_probe },
18949 + { PCI_VENDOR_ID_SI, sis_router_probe },
18950 + { PCI_VENDOR_ID_CYRIX, cyrix_router_probe },
18951 + { PCI_VENDOR_ID_VLSI, vlsi_router_probe },
18952 + { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe },
18953 + { PCI_VENDOR_ID_AMD, amd_router_probe },
18954 + /* Someone with docs needs to add the ATI Radeon IGP */
18955 + { 0, NULL }
18956 +};
18957 +static struct irq_router pirq_router;
18958 +static struct pci_dev *pirq_router_dev;
18959 +
18960 +
18961 +/*
18962 + * FIXME: should we have an option to say "generic for
18963 + * chipset" ?
18964 + */
18965 +
18966 +static void __init pirq_find_router(struct irq_router *r)
18967 +{
18968 + struct irq_routing_table *rt = pirq_table;
18969 + struct irq_router_handler *h;
18970 +
18971 +#ifdef CONFIG_PCI_BIOS
18972 + if (!rt->signature) {
18973 + printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n");
18974 + r->set = pirq_bios_set;
18975 + r->name = "BIOS";
18976 + return;
18977 + }
18978 +#endif
18979 +
18980 + /* Default unless a driver reloads it */
18981 + r->name = "default";
18982 + r->get = NULL;
18983 + r->set = NULL;
18984 +
18985 + DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
18986 + rt->rtr_vendor, rt->rtr_device);
18987 +
18988 + pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn);
18989 + if (!pirq_router_dev) {
18990 + DBG(KERN_DEBUG "PCI: Interrupt router not found at "
18991 + "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
18992 + return;
18993 + }
18994 +
18995 + for( h = pirq_routers; h->vendor; h++) {
18996 + /* First look for a router match */
18997 + if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
18998 + break;
18999 + /* Fall back to a device match */
19000 + if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
19001 + break;
19002 + }
19003 + printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
19004 + pirq_router.name,
19005 + pirq_router_dev->vendor,
19006 + pirq_router_dev->device,
19007 + pci_name(pirq_router_dev));
19008 +}
19009 +
19010 +static struct irq_info *pirq_get_info(struct pci_dev *dev)
19011 +{
19012 + struct irq_routing_table *rt = pirq_table;
19013 + int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
19014 + struct irq_info *info;
19015 +
19016 + for (info = rt->slots; entries--; info++)
19017 + if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
19018 + return info;
19019 + return NULL;
19020 +}
19021 +
19022 +static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
19023 +{
19024 + u8 pin;
19025 + struct irq_info *info;
19026 + int i, pirq, newirq;
19027 + int irq = 0;
19028 + u32 mask;
19029 + struct irq_router *r = &pirq_router;
19030 + struct pci_dev *dev2 = NULL;
19031 + char *msg = NULL;
19032 +
19033 + /* Find IRQ pin */
19034 + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
19035 + if (!pin) {
19036 + DBG(KERN_DEBUG " -> no interrupt pin\n");
19037 + return 0;
19038 + }
19039 + pin = pin - 1;
19040 +
19041 + /* Find IRQ routing entry */
19042 +
19043 + if (!pirq_table)
19044 + return 0;
19045 +
19046 + DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
19047 + info = pirq_get_info(dev);
19048 + if (!info) {
19049 + DBG(" -> not found in routing table\n" KERN_DEBUG);
19050 + return 0;
19051 + }
19052 + pirq = info->irq[pin].link;
19053 + mask = info->irq[pin].bitmap;
19054 + if (!pirq) {
19055 + DBG(" -> not routed\n" KERN_DEBUG);
19056 + return 0;
19057 + }
19058 + DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
19059 + mask &= pcibios_irq_mask;
19060 +
19061 + /* Work around broken HP Pavilion Notebooks which assign USB to
19062 + IRQ 9 even though it is actually wired to IRQ 11 */
19063 +
19064 + if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) {
19065 + dev->irq = 11;
19066 + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11);
19067 + r->set(pirq_router_dev, dev, pirq, 11);
19068 + }
19069 +
19070 + /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
19071 + if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
19072 + pirq = 0x68;
19073 + mask = 0x400;
19074 + dev->irq = r->get(pirq_router_dev, dev, pirq);
19075 + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
19076 + }
19077 +
19078 + /*
19079 + * Find the best IRQ to assign: use the one
19080 + * reported by the device if possible.
19081 + */
19082 + newirq = dev->irq;
19083 + if (newirq && !((1 << newirq) & mask)) {
19084 + if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
19085 + else printk("\n" KERN_WARNING
19086 + "PCI: IRQ %i for device %s doesn't match PIRQ mask "
19087 + "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
19088 + pci_name(dev));
19089 + }
19090 + if (!newirq && assign) {
19091 + for (i = 0; i < 16; i++) {
19092 + if (!(mask & (1 << i)))
19093 + continue;
19094 + if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
19095 + newirq = i;
19096 + }
19097 + }
19098 + DBG(" -> newirq=%d", newirq);
19099 +
19100 + /* Check if it is hardcoded */
19101 + if ((pirq & 0xf0) == 0xf0) {
19102 + irq = pirq & 0xf;
19103 + DBG(" -> hardcoded IRQ %d\n", irq);
19104 + msg = "Hardcoded";
19105 + } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
19106 + ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
19107 + DBG(" -> got IRQ %d\n", irq);
19108 + msg = "Found";
19109 + eisa_set_level_irq(irq);
19110 + } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
19111 + DBG(" -> assigning IRQ %d", newirq);
19112 + if (r->set(pirq_router_dev, dev, pirq, newirq)) {
19113 + eisa_set_level_irq(newirq);
19114 + DBG(" ... OK\n");
19115 + msg = "Assigned";
19116 + irq = newirq;
19117 + }
19118 + }
19119 +
19120 + if (!irq) {
19121 + DBG(" ... failed\n");
19122 + if (newirq && mask == (1 << newirq)) {
19123 + msg = "Guessed";
19124 + irq = newirq;
19125 + } else
19126 + return 0;
19127 + }
19128 + printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
19129 +
19130 + /* Update IRQ for all devices with the same pirq value */
19131 + while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
19132 + pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
19133 + if (!pin)
19134 + continue;
19135 + pin--;
19136 + info = pirq_get_info(dev2);
19137 + if (!info)
19138 + continue;
19139 + if (info->irq[pin].link == pirq) {
19140 + /* We refuse to override the dev->irq information. Give a warning! */
19141 + if ( dev2->irq && dev2->irq != irq && \
19142 + (!(pci_probe & PCI_USE_PIRQ_MASK) || \
19143 + ((1 << dev2->irq) & mask)) ) {
19144 +#ifndef CONFIG_PCI_MSI
19145 + printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
19146 + pci_name(dev2), dev2->irq, irq);
19147 +#endif
19148 + continue;
19149 + }
19150 + dev2->irq = irq;
19151 + pirq_penalty[irq]++;
19152 + if (dev != dev2)
19153 + printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
19154 + }
19155 + }
19156 + return 1;
19157 +}
19158 +
19159 +static void __init pcibios_fixup_irqs(void)
19160 +{
19161 + struct pci_dev *dev = NULL;
19162 + u8 pin;
19163 +
19164 + DBG(KERN_DEBUG "PCI: IRQ fixup\n");
19165 + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
19166 + /*
19167 + * If the BIOS has set an out of range IRQ number, just ignore it.
19168 + * Also keep track of which IRQ's are already in use.
19169 + */
19170 + if (dev->irq >= 16) {
19171 + DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
19172 + dev->irq = 0;
19173 + }
19174 + /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
19175 + if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
19176 + pirq_penalty[dev->irq] = 0;
19177 + pirq_penalty[dev->irq]++;
19178 + }
19179 +
19180 + dev = NULL;
19181 + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
19182 + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
19183 +#ifdef CONFIG_X86_IO_APIC
19184 + /*
19185 + * Recalculate IRQ numbers if we use the I/O APIC.
19186 + */
19187 + if (io_apic_assign_pci_irqs)
19188 + {
19189 + int irq;
19190 +
19191 + if (pin) {
19192 + pin--; /* interrupt pins are numbered starting from 1 */
19193 + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
19194 + /*
19195 + * Busses behind bridges are typically not listed in the MP-table.
19196 + * In this case we have to look up the IRQ based on the parent bus,
19197 + * parent slot, and pin number. The SMP code detects such bridged
19198 + * busses itself so we should get into this branch reliably.
19199 + */
19200 + if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
19201 + struct pci_dev * bridge = dev->bus->self;
19202 +
19203 + pin = (pin + PCI_SLOT(dev->devfn)) % 4;
19204 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
19205 + PCI_SLOT(bridge->devfn), pin);
19206 + if (irq >= 0)
19207 + printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
19208 + pci_name(bridge), 'A' + pin, irq);
19209 + }
19210 + if (irq >= 0) {
19211 + if (use_pci_vector() &&
19212 + !platform_legacy_irq(irq))
19213 + irq = IO_APIC_VECTOR(irq);
19214 +
19215 + printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
19216 + pci_name(dev), 'A' + pin, irq);
19217 + dev->irq = irq;
19218 + }
19219 + }
19220 + }
19221 +#endif
19222 + /*
19223 + * Still no IRQ? Try to lookup one...
19224 + */
19225 + if (pin && !dev->irq)
19226 + pcibios_lookup_irq(dev, 0);
19227 + }
19228 +}
19229 +
19230 +/*
19231 + * Work around broken HP Pavilion Notebooks which assign USB to
19232 + * IRQ 9 even though it is actually wired to IRQ 11
19233 + */
19234 +static int __init fix_broken_hp_bios_irq9(struct dmi_system_id *d)
19235 +{
19236 + if (!broken_hp_bios_irq9) {
19237 + broken_hp_bios_irq9 = 1;
19238 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
19239 + }
19240 + return 0;
19241 +}
19242 +
19243 +/*
19244 + * Work around broken Acer TravelMate 360 Notebooks which assign
19245 + * Cardbus to IRQ 11 even though it is actually wired to IRQ 10
19246 + */
19247 +static int __init fix_acer_tm360_irqrouting(struct dmi_system_id *d)
19248 +{
19249 + if (!acer_tm360_irqrouting) {
19250 + acer_tm360_irqrouting = 1;
19251 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
19252 + }
19253 + return 0;
19254 +}
19255 +
19256 +static struct dmi_system_id __initdata pciirq_dmi_table[] = {
19257 + {
19258 + .callback = fix_broken_hp_bios_irq9,
19259 + .ident = "HP Pavilion N5400 Series Laptop",
19260 + .matches = {
19261 + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
19262 + DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
19263 + DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
19264 + DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
19265 + },
19266 + },
19267 + {
19268 + .callback = fix_acer_tm360_irqrouting,
19269 + .ident = "Acer TravelMate 36x Laptop",
19270 + .matches = {
19271 + DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
19272 + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
19273 + },
19274 + },
19275 + { }
19276 +};
19277 +
19278 +static int __init pcibios_irq_init(void)
19279 +{
19280 + DBG(KERN_DEBUG "PCI: IRQ init\n");
19281 +
19282 + if (pcibios_enable_irq || raw_pci_ops == NULL)
19283 + return 0;
19284 +
19285 + dmi_check_system(pciirq_dmi_table);
19286 +
19287 + pirq_table = pirq_find_routing_table();
19288 +
19289 +#ifdef CONFIG_PCI_BIOS
19290 + if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN))
19291 + pirq_table = pcibios_get_irq_routing_table();
19292 +#endif
19293 + if (pirq_table) {
19294 + pirq_peer_trick();
19295 + pirq_find_router(&pirq_router);
19296 + if (pirq_table->exclusive_irqs) {
19297 + int i;
19298 + for (i=0; i<16; i++)
19299 + if (!(pirq_table->exclusive_irqs & (1 << i)))
19300 + pirq_penalty[i] += 100;
19301 + }
19302 + /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
19303 + if (io_apic_assign_pci_irqs)
19304 + pirq_table = NULL;
19305 + }
19306 +
19307 + pcibios_enable_irq = pirq_enable_irq;
19308 +
19309 + pcibios_fixup_irqs();
19310 + return 0;
19311 +}
19312 +
19313 +subsys_initcall(pcibios_irq_init);
19314 +
19315 +
19316 +static void pirq_penalize_isa_irq(int irq, int active)
19317 +{
19318 + /*
19319 + * If any ISAPnP device reports an IRQ in its list of possible
19320 + * IRQ's, we try to avoid assigning it to PCI devices.
19321 + */
19322 + if (irq < 16) {
19323 + if (active)
19324 + pirq_penalty[irq] += 1000;
19325 + else
19326 + pirq_penalty[irq] += 100;
19327 + }
19328 +}
19329 +
19330 +void pcibios_penalize_isa_irq(int irq, int active)
19331 +{
19332 +#ifdef CONFIG_ACPI
19333 + if (!acpi_noirq)
19334 + acpi_penalize_isa_irq(irq, active);
19335 + else
19336 +#endif
19337 + pirq_penalize_isa_irq(irq, active);
19338 +}
19339 +
19340 +static int pirq_enable_irq(struct pci_dev *dev)
19341 +{
19342 + u8 pin;
19343 + struct pci_dev *temp_dev;
19344 +
19345 + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
19346 + if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
19347 + char *msg = "";
19348 +
19349 + pin--; /* interrupt pins are numbered starting from 1 */
19350 +
19351 + if (io_apic_assign_pci_irqs) {
19352 + int irq;
19353 +
19354 + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
19355 + /*
19356 + * Busses behind bridges are typically not listed in the MP-table.
19357 + * In this case we have to look up the IRQ based on the parent bus,
19358 + * parent slot, and pin number. The SMP code detects such bridged
19359 + * busses itself so we should get into this branch reliably.
19360 + */
19361 + temp_dev = dev;
19362 + while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
19363 + struct pci_dev * bridge = dev->bus->self;
19364 +
19365 + pin = (pin + PCI_SLOT(dev->devfn)) % 4;
19366 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
19367 + PCI_SLOT(bridge->devfn), pin);
19368 + if (irq >= 0)
19369 + printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
19370 + pci_name(bridge), 'A' + pin, irq);
19371 + dev = bridge;
19372 + }
19373 + dev = temp_dev;
19374 + if (irq >= 0) {
19375 +#ifdef CONFIG_PCI_MSI
19376 + if (!platform_legacy_irq(irq))
19377 + irq = IO_APIC_VECTOR(irq);
19378 +#endif
19379 + printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
19380 + pci_name(dev), 'A' + pin, irq);
19381 + dev->irq = irq;
19382 + return 0;
19383 + } else
19384 + msg = " Probably buggy MP table.";
19385 + } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
19386 + msg = "";
19387 + else
19388 + msg = " Please try using pci=biosirq.";
19389 +
19390 + /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
19391 + if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
19392 + return 0;
19393 +
19394 + printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
19395 + 'A' + pin, pci_name(dev), msg);
19396 + }
19397 + return 0;
19398 +}
19399 +
19400 +int pci_vector_resources(int last, int nr_released)
19401 +{
19402 + int count = nr_released;
19403 +
19404 + int next = last;
19405 + int offset = (last % 8);
19406 +
19407 + while (next < FIRST_SYSTEM_VECTOR) {
19408 + next += 8;
19409 +#ifdef CONFIG_X86_64
19410 + if (next == IA32_SYSCALL_VECTOR)
19411 + continue;
19412 +#else
19413 + if (next == SYSCALL_VECTOR)
19414 + continue;
19415 +#endif
19416 + count++;
19417 + if (next >= FIRST_SYSTEM_VECTOR) {
19418 + if (offset%8) {
19419 + next = FIRST_DEVICE_VECTOR + offset;
19420 + offset++;
19421 + continue;
19422 + }
19423 + count--;
19424 + }
19425 + }
19426 +
19427 + return count;
19428 +}
19429 Index: head-2008-11-25/arch/x86/pci/pcifront.c
19430 ===================================================================
19431 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
19432 +++ head-2008-11-25/arch/x86/pci/pcifront.c 2007-06-12 13:12:49.000000000 +0200
19433 @@ -0,0 +1,55 @@
19434 +/*
19435 + * PCI Frontend Stub - puts some "dummy" functions in to the Linux x86 PCI core
19436 + * to support the Xen PCI Frontend's operation
19437 + *
19438 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
19439 + */
19440 +#include <linux/module.h>
19441 +#include <linux/init.h>
19442 +#include <linux/pci.h>
19443 +#include <asm/acpi.h>
19444 +#include "pci.h"
19445 +
19446 +static int pcifront_enable_irq(struct pci_dev *dev)
19447 +{
19448 + u8 irq;
19449 + pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);
19450 + dev->irq = irq;
19451 +
19452 + return 0;
19453 +}
19454 +
19455 +extern u8 pci_cache_line_size;
19456 +
19457 +static int __init pcifront_x86_stub_init(void)
19458 +{
19459 + struct cpuinfo_x86 *c = &boot_cpu_data;
19460 +
19461 + /* Only install our method if we haven't found real hardware already */
19462 + if (raw_pci_ops)
19463 + return 0;
19464 +
19465 + printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
19466 +
19467 + /* Copied from arch/i386/pci/common.c */
19468 + pci_cache_line_size = 32 >> 2;
19469 + if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
19470 + pci_cache_line_size = 64 >> 2; /* K7 & K8 */
19471 + else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
19472 + pci_cache_line_size = 128 >> 2; /* P4 */
19473 +
19474 + /* On x86, we need to disable the normal IRQ routing table and
19475 + * just ask the backend
19476 + */
19477 + pcibios_enable_irq = pcifront_enable_irq;
19478 + pcibios_disable_irq = NULL;
19479 +
19480 +#ifdef CONFIG_ACPI
19481 + /* Keep ACPI out of the picture */
19482 + acpi_noirq = 1;
19483 +#endif
19484 +
19485 + return 0;
19486 +}
19487 +
19488 +arch_initcall(pcifront_x86_stub_init);
19489 Index: head-2008-11-25/arch/x86/ia32/ia32entry-xen.S
19490 ===================================================================
19491 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
19492 +++ head-2008-11-25/arch/x86/ia32/ia32entry-xen.S 2008-04-02 12:34:02.000000000 +0200
19493 @@ -0,0 +1,666 @@
19494 +/*
19495 + * Compatibility mode system call entry point for x86-64.
19496 + *
19497 + * Copyright 2000-2002 Andi Kleen, SuSE Labs.
19498 + */
19499 +
19500 +#include <asm/dwarf2.h>
19501 +#include <asm/calling.h>
19502 +#include <asm/asm-offsets.h>
19503 +#include <asm/current.h>
19504 +#include <asm/errno.h>
19505 +#include <asm/ia32_unistd.h>
19506 +#include <asm/thread_info.h>
19507 +#include <asm/segment.h>
19508 +#include <asm/vsyscall32.h>
19509 +#include <asm/irqflags.h>
19510 +#include <linux/linkage.h>
19511 +
19512 +#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
19513 +
19514 + .macro IA32_ARG_FIXUP noebp=0
19515 + movl %edi,%r8d
19516 + .if \noebp
19517 + .else
19518 + movl %ebp,%r9d
19519 + .endif
19520 + xchg %ecx,%esi
19521 + movl %ebx,%edi
19522 + movl %edx,%edx /* zero extension */
19523 + .endm
19524 +
19525 + /* clobbers %eax */
19526 + .macro CLEAR_RREGS
19527 + xorl %eax,%eax
19528 + movq %rax,R11(%rsp)
19529 + movq %rax,R10(%rsp)
19530 + movq %rax,R9(%rsp)
19531 + movq %rax,R8(%rsp)
19532 + .endm
19533 +
19534 + .macro LOAD_ARGS32 offset
19535 + movl \offset(%rsp),%r11d
19536 + movl \offset+8(%rsp),%r10d
19537 + movl \offset+16(%rsp),%r9d
19538 + movl \offset+24(%rsp),%r8d
19539 + movl \offset+40(%rsp),%ecx
19540 + movl \offset+48(%rsp),%edx
19541 + movl \offset+56(%rsp),%esi
19542 + movl \offset+64(%rsp),%edi
19543 + movl \offset+72(%rsp),%eax
19544 + .endm
19545 +
19546 + .macro CFI_STARTPROC32 simple
19547 + CFI_STARTPROC \simple
19548 + CFI_UNDEFINED r8
19549 + CFI_UNDEFINED r9
19550 + CFI_UNDEFINED r10
19551 + CFI_UNDEFINED r11
19552 + CFI_UNDEFINED r12
19553 + CFI_UNDEFINED r13
19554 + CFI_UNDEFINED r14
19555 + CFI_UNDEFINED r15
19556 + .endm
19557 +
19558 +/*
19559 + * 32bit SYSENTER instruction entry.
19560 + *
19561 + * Arguments:
19562 + * %eax System call number.
19563 + * %ebx Arg1
19564 + * %ecx Arg2
19565 + * %edx Arg3
19566 + * %esi Arg4
19567 + * %edi Arg5
19568 + * %ebp user stack
19569 + * 0(%ebp) Arg6
19570 + *
19571 + * Interrupts on.
19572 + *
19573 + * This is purely a fast path. For anything complicated we use the int 0x80
19574 + * path below. Set up a complete hardware stack frame to share code
19575 + * with the int 0x80 path.
19576 + */
19577 +ENTRY(ia32_sysenter_target)
19578 + CFI_STARTPROC32 simple
19579 + CFI_DEF_CFA rsp,SS+8-RIP+16
19580 + /*CFI_REL_OFFSET ss,SS-RIP+16*/
19581 + CFI_REL_OFFSET rsp,RSP-RIP+16
19582 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
19583 + /*CFI_REL_OFFSET cs,CS-RIP+16*/
19584 + CFI_REL_OFFSET rip,RIP-RIP+16
19585 + CFI_REL_OFFSET r11,8
19586 + CFI_REL_OFFSET rcx,0
19587 + movq 8(%rsp),%r11
19588 + CFI_RESTORE r11
19589 + popq %rcx
19590 + CFI_ADJUST_CFA_OFFSET -8
19591 + CFI_RESTORE rcx
19592 + movl %ebp,%ebp /* zero extension */
19593 + movl %eax,%eax
19594 + movl $__USER32_DS,40(%rsp)
19595 + movq %rbp,32(%rsp)
19596 + movl $__USER32_CS,16(%rsp)
19597 + movl $VSYSCALL32_SYSEXIT,8(%rsp)
19598 + movq %rax,(%rsp)
19599 + cld
19600 + SAVE_ARGS 0,0,0
19601 + /* no need to do an access_ok check here because rbp has been
19602 + 32bit zero extended */
19603 +1: movl (%rbp),%r9d
19604 + .section __ex_table,"a"
19605 + .quad 1b,ia32_badarg
19606 + .previous
19607 + GET_THREAD_INFO(%r10)
19608 + orl $TS_COMPAT,threadinfo_status(%r10)
19609 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
19610 + jnz sysenter_tracesys
19611 +sysenter_do_call:
19612 + cmpl $(IA32_NR_syscalls-1),%eax
19613 + ja ia32_badsys
19614 + IA32_ARG_FIXUP 1
19615 + call *ia32_sys_call_table(,%rax,8)
19616 + movq %rax,RAX-ARGOFFSET(%rsp)
19617 + jmp int_ret_from_sys_call
19618 +
19619 +sysenter_tracesys:
19620 + SAVE_REST
19621 + CLEAR_RREGS
19622 + movq $-ENOSYS,RAX(%rsp) /* really needed? */
19623 + movq %rsp,%rdi /* &pt_regs -> arg1 */
19624 + call syscall_trace_enter
19625 + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
19626 + RESTORE_REST
19627 + movl %ebp, %ebp
19628 + /* no need to do an access_ok check here because rbp has been
19629 + 32bit zero extended */
19630 +1: movl (%rbp),%r9d
19631 + .section __ex_table,"a"
19632 + .quad 1b,ia32_badarg
19633 + .previous
19634 + jmp sysenter_do_call
19635 + CFI_ENDPROC
19636 +ENDPROC(ia32_sysenter_target)
19637 +
19638 +/*
19639 + * 32bit SYSCALL instruction entry.
19640 + *
19641 + * Arguments:
19642 + * %eax System call number.
19643 + * %ebx Arg1
19644 + * %ecx return EIP
19645 + * %edx Arg3
19646 + * %esi Arg4
19647 + * %edi Arg5
19648 + * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
19649 + * %esp user stack
19650 + * 0(%esp) Arg6
19651 + *
19652 + * Interrupts on.
19653 + *
19654 + * This is purely a fast path. For anything complicated we use the int 0x80
19655 + * path below. Set up a complete hardware stack frame to share code
19656 + * with the int 0x80 path.
19657 + */
19658 +ENTRY(ia32_cstar_target)
19659 + CFI_STARTPROC32 simple
19660 + CFI_DEF_CFA rsp,SS+8-RIP+16
19661 + /*CFI_REL_OFFSET ss,SS-RIP+16*/
19662 + CFI_REL_OFFSET rsp,RSP-RIP+16
19663 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
19664 + /*CFI_REL_OFFSET cs,CS-RIP+16*/
19665 + CFI_REL_OFFSET rip,RIP-RIP+16
19666 + movl %eax,%eax /* zero extension */
19667 + movl RSP-RIP+16(%rsp),%r8d
19668 + SAVE_ARGS -8,1,1
19669 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
19670 + movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
19671 + movl %ebp,%ecx
19672 + movl $__USER32_CS,CS-ARGOFFSET(%rsp)
19673 + movl $__USER32_DS,SS-ARGOFFSET(%rsp)
19674 + /* no need to do an access_ok check here because r8 has been
19675 + 32bit zero extended */
19676 + /* hardware stack frame is complete now */
19677 +1: movl (%r8),%r9d
19678 + .section __ex_table,"a"
19679 + .quad 1b,ia32_badarg
19680 + .previous
19681 + GET_THREAD_INFO(%r10)
19682 + orl $TS_COMPAT,threadinfo_status(%r10)
19683 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
19684 + jnz cstar_tracesys
19685 +cstar_do_call:
19686 + cmpl $IA32_NR_syscalls-1,%eax
19687 + ja ia32_badsys
19688 + IA32_ARG_FIXUP 1
19689 + call *ia32_sys_call_table(,%rax,8)
19690 + movq %rax,RAX-ARGOFFSET(%rsp)
19691 + jmp int_ret_from_sys_call
19692 +
19693 +cstar_tracesys:
19694 + SAVE_REST
19695 + CLEAR_RREGS
19696 + movq $-ENOSYS,RAX(%rsp) /* really needed? */
19697 + movq %rsp,%rdi /* &pt_regs -> arg1 */
19698 + call syscall_trace_enter
19699 + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
19700 + RESTORE_REST
19701 + movl RSP-ARGOFFSET(%rsp), %r8d
19702 + /* no need to do an access_ok check here because r8 has been
19703 + 32bit zero extended */
19704 +1: movl (%r8),%r9d
19705 + .section __ex_table,"a"
19706 + .quad 1b,ia32_badarg
19707 + .previous
19708 + jmp cstar_do_call
19709 +END(ia32_cstar_target)
19710 +
19711 +ia32_badarg:
19712 + movq $-EFAULT,%rax
19713 + jmp ia32_sysret
19714 + CFI_ENDPROC
19715 +
19716 +/*
19717 + * Emulated IA32 system calls via int 0x80.
19718 + *
19719 + * Arguments:
19720 + * %eax System call number.
19721 + * %ebx Arg1
19722 + * %ecx Arg2
19723 + * %edx Arg3
19724 + * %esi Arg4
19725 + * %edi Arg5
19726 + * %ebp Arg6 [note: not saved in the stack frame, should not be touched]
19727 + *
19728 + * Notes:
19729 + * Uses the same stack frame as the x86-64 version.
19730 + * All registers except %eax must be saved (but ptrace may violate that)
19731 + * Arguments are zero extended. For system calls that want sign extension and
19732 + * take long arguments a wrapper is needed. Most calls can just be called
19733 + * directly.
19734 + * Assumes it is only called from user space and entered with interrupts on.
19735 + */
19736 +
19737 +ENTRY(ia32_syscall)
19738 + CFI_STARTPROC simple
19739 + CFI_DEF_CFA rsp,SS+8-RIP+16
19740 + /*CFI_REL_OFFSET ss,SS-RIP+16*/
19741 + CFI_REL_OFFSET rsp,RSP-RIP+16
19742 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
19743 + /*CFI_REL_OFFSET cs,CS-RIP+16*/
19744 + CFI_REL_OFFSET rip,RIP-RIP+16
19745 + CFI_REL_OFFSET r11,8
19746 + CFI_REL_OFFSET rcx,0
19747 + movq 8(%rsp),%r11
19748 + CFI_RESTORE r11
19749 + popq %rcx
19750 + CFI_ADJUST_CFA_OFFSET -8
19751 + CFI_RESTORE rcx
19752 + movl %eax,%eax
19753 + movq %rax,(%rsp)
19754 + cld
19755 + /* note the registers are not zero extended to the sf.
19756 + this could be a problem. */
19757 + SAVE_ARGS 0,0,1
19758 + GET_THREAD_INFO(%r10)
19759 + orl $TS_COMPAT,threadinfo_status(%r10)
19760 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
19761 + jnz ia32_tracesys
19762 +ia32_do_syscall:
19763 + cmpl $(IA32_NR_syscalls-1),%eax
19764 + ja ia32_badsys
19765 + IA32_ARG_FIXUP
19766 + call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
19767 +ia32_sysret:
19768 + movq %rax,RAX-ARGOFFSET(%rsp)
19769 + jmp int_ret_from_sys_call
19770 +
19771 +ia32_tracesys:
19772 + SAVE_REST
19773 + movq $-ENOSYS,RAX(%rsp) /* really needed? */
19774 + movq %rsp,%rdi /* &pt_regs -> arg1 */
19775 + call syscall_trace_enter
19776 + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
19777 + RESTORE_REST
19778 + jmp ia32_do_syscall
19779 +END(ia32_syscall)
19780 +
19781 +ia32_badsys:
19782 + movq $0,ORIG_RAX-ARGOFFSET(%rsp)
19783 + movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
19784 + jmp int_ret_from_sys_call
19785 +
19786 +quiet_ni_syscall:
19787 + movq $-ENOSYS,%rax
19788 + ret
19789 + CFI_ENDPROC
19790 +
19791 + .macro PTREGSCALL label, func, arg
19792 + .globl \label
19793 +\label:
19794 + leaq \func(%rip),%rax
19795 + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
19796 + jmp ia32_ptregs_common
19797 + .endm
19798 +
19799 + CFI_STARTPROC32
19800 +
19801 + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
19802 + PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
19803 + PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
19804 + PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
19805 + PTREGSCALL stub32_execve, sys32_execve, %rcx
19806 + PTREGSCALL stub32_fork, sys_fork, %rdi
19807 + PTREGSCALL stub32_clone, sys32_clone, %rdx
19808 + PTREGSCALL stub32_vfork, sys_vfork, %rdi
19809 + PTREGSCALL stub32_iopl, sys_iopl, %rsi
19810 + PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
19811 +
19812 +ENTRY(ia32_ptregs_common)
19813 + popq %r11
19814 + CFI_ENDPROC
19815 + CFI_STARTPROC32 simple
19816 + CFI_DEF_CFA rsp,SS+8-ARGOFFSET
19817 + CFI_REL_OFFSET rax,RAX-ARGOFFSET
19818 + CFI_REL_OFFSET rcx,RCX-ARGOFFSET
19819 + CFI_REL_OFFSET rdx,RDX-ARGOFFSET
19820 + CFI_REL_OFFSET rsi,RSI-ARGOFFSET
19821 + CFI_REL_OFFSET rdi,RDI-ARGOFFSET
19822 + CFI_REL_OFFSET rip,RIP-ARGOFFSET
19823 +/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/
19824 +/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
19825 + CFI_REL_OFFSET rsp,RSP-ARGOFFSET
19826 +/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
19827 + SAVE_REST
19828 + call *%rax
19829 + RESTORE_REST
19830 + jmp ia32_sysret /* misbalances the return cache */
19831 + CFI_ENDPROC
19832 +END(ia32_ptregs_common)
19833 +
19834 + .section .rodata,"a"
19835 + .align 8
19836 +ia32_sys_call_table:
19837 + .quad sys_restart_syscall
19838 + .quad sys_exit
19839 + .quad stub32_fork
19840 + .quad sys_read
19841 + .quad sys_write
19842 + .quad compat_sys_open /* 5 */
19843 + .quad sys_close
19844 + .quad sys32_waitpid
19845 + .quad sys_creat
19846 + .quad sys_link
19847 + .quad sys_unlink /* 10 */
19848 + .quad stub32_execve
19849 + .quad sys_chdir
19850 + .quad compat_sys_time
19851 + .quad sys_mknod
19852 + .quad sys_chmod /* 15 */
19853 + .quad sys_lchown16
19854 + .quad quiet_ni_syscall /* old break syscall holder */
19855 + .quad sys_stat
19856 + .quad sys32_lseek
19857 + .quad sys_getpid /* 20 */
19858 + .quad compat_sys_mount /* mount */
19859 + .quad sys_oldumount /* old_umount */
19860 + .quad sys_setuid16
19861 + .quad sys_getuid16
19862 + .quad compat_sys_stime /* stime */ /* 25 */
19863 + .quad sys32_ptrace /* ptrace */
19864 + .quad sys_alarm
19865 + .quad sys_fstat /* (old)fstat */
19866 + .quad sys_pause
19867 + .quad compat_sys_utime /* 30 */
19868 + .quad quiet_ni_syscall /* old stty syscall holder */
19869 + .quad quiet_ni_syscall /* old gtty syscall holder */
19870 + .quad sys_access
19871 + .quad sys_nice
19872 + .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */
19873 + .quad sys_sync
19874 + .quad sys32_kill
19875 + .quad sys_rename
19876 + .quad sys_mkdir
19877 + .quad sys_rmdir /* 40 */
19878 + .quad sys_dup
19879 + .quad sys32_pipe
19880 + .quad compat_sys_times
19881 + .quad quiet_ni_syscall /* old prof syscall holder */
19882 + .quad sys_brk /* 45 */
19883 + .quad sys_setgid16
19884 + .quad sys_getgid16
19885 + .quad sys_signal
19886 + .quad sys_geteuid16
19887 + .quad sys_getegid16 /* 50 */
19888 + .quad sys_acct
19889 + .quad sys_umount /* new_umount */
19890 + .quad quiet_ni_syscall /* old lock syscall holder */
19891 + .quad compat_sys_ioctl
19892 + .quad compat_sys_fcntl64 /* 55 */
19893 + .quad quiet_ni_syscall /* old mpx syscall holder */
19894 + .quad sys_setpgid
19895 + .quad quiet_ni_syscall /* old ulimit syscall holder */
19896 + .quad sys32_olduname
19897 + .quad sys_umask /* 60 */
19898 + .quad sys_chroot
19899 + .quad sys32_ustat
19900 + .quad sys_dup2
19901 + .quad sys_getppid
19902 + .quad sys_getpgrp /* 65 */
19903 + .quad sys_setsid
19904 + .quad sys32_sigaction
19905 + .quad sys_sgetmask
19906 + .quad sys_ssetmask
19907 + .quad sys_setreuid16 /* 70 */
19908 + .quad sys_setregid16
19909 + .quad stub32_sigsuspend
19910 + .quad compat_sys_sigpending
19911 + .quad sys_sethostname
19912 + .quad compat_sys_setrlimit /* 75 */
19913 + .quad compat_sys_old_getrlimit /* old_getrlimit */
19914 + .quad compat_sys_getrusage
19915 + .quad sys32_gettimeofday
19916 + .quad sys32_settimeofday
19917 + .quad sys_getgroups16 /* 80 */
19918 + .quad sys_setgroups16
19919 + .quad sys32_old_select
19920 + .quad sys_symlink
19921 + .quad sys_lstat
19922 + .quad sys_readlink /* 85 */
19923 +#ifdef CONFIG_IA32_AOUT
19924 + .quad sys_uselib
19925 +#else
19926 + .quad quiet_ni_syscall
19927 +#endif
19928 + .quad sys_swapon
19929 + .quad sys_reboot
19930 + .quad compat_sys_old_readdir
19931 + .quad sys32_mmap /* 90 */
19932 + .quad sys_munmap
19933 + .quad sys_truncate
19934 + .quad sys_ftruncate
19935 + .quad sys_fchmod
19936 + .quad sys_fchown16 /* 95 */
19937 + .quad sys_getpriority
19938 + .quad sys_setpriority
19939 + .quad quiet_ni_syscall /* old profil syscall holder */
19940 + .quad compat_sys_statfs
19941 + .quad compat_sys_fstatfs /* 100 */
19942 + .quad sys_ioperm
19943 + .quad compat_sys_socketcall
19944 + .quad sys_syslog
19945 + .quad compat_sys_setitimer
19946 + .quad compat_sys_getitimer /* 105 */
19947 + .quad compat_sys_newstat
19948 + .quad compat_sys_newlstat
19949 + .quad compat_sys_newfstat
19950 + .quad sys32_uname
19951 + .quad stub32_iopl /* 110 */
19952 + .quad sys_vhangup
19953 + .quad quiet_ni_syscall /* old "idle" system call */
19954 + .quad sys32_vm86_warning /* vm86old */
19955 + .quad compat_sys_wait4
19956 + .quad sys_swapoff /* 115 */
19957 + .quad sys32_sysinfo
19958 + .quad sys32_ipc
19959 + .quad sys_fsync
19960 + .quad stub32_sigreturn
19961 + .quad stub32_clone /* 120 */
19962 + .quad sys_setdomainname
19963 + .quad sys_uname
19964 + .quad sys_modify_ldt
19965 + .quad compat_sys_adjtimex
19966 + .quad sys32_mprotect /* 125 */
19967 + .quad compat_sys_sigprocmask
19968 + .quad quiet_ni_syscall /* create_module */
19969 + .quad sys_init_module
19970 + .quad sys_delete_module
19971 + .quad quiet_ni_syscall /* 130 get_kernel_syms */
19972 + .quad sys_quotactl
19973 + .quad sys_getpgid
19974 + .quad sys_fchdir
19975 + .quad quiet_ni_syscall /* bdflush */
19976 + .quad sys_sysfs /* 135 */
19977 + .quad sys_personality
19978 + .quad quiet_ni_syscall /* for afs_syscall */
19979 + .quad sys_setfsuid16
19980 + .quad sys_setfsgid16
19981 + .quad sys_llseek /* 140 */
19982 + .quad compat_sys_getdents
19983 + .quad compat_sys_select
19984 + .quad sys_flock
19985 + .quad sys_msync
19986 + .quad compat_sys_readv /* 145 */
19987 + .quad compat_sys_writev
19988 + .quad sys_getsid
19989 + .quad sys_fdatasync
19990 + .quad sys32_sysctl /* sysctl */
19991 + .quad sys_mlock /* 150 */
19992 + .quad sys_munlock
19993 + .quad sys_mlockall
19994 + .quad sys_munlockall
19995 + .quad sys_sched_setparam
19996 + .quad sys_sched_getparam /* 155 */
19997 + .quad sys_sched_setscheduler
19998 + .quad sys_sched_getscheduler
19999 + .quad sys_sched_yield
20000 + .quad sys_sched_get_priority_max
20001 + .quad sys_sched_get_priority_min /* 160 */
20002 + .quad sys_sched_rr_get_interval
20003 + .quad compat_sys_nanosleep
20004 + .quad sys_mremap
20005 + .quad sys_setresuid16
20006 + .quad sys_getresuid16 /* 165 */
20007 + .quad sys32_vm86_warning /* vm86 */
20008 + .quad quiet_ni_syscall /* query_module */
20009 + .quad sys_poll
20010 + .quad compat_sys_nfsservctl
20011 + .quad sys_setresgid16 /* 170 */
20012 + .quad sys_getresgid16
20013 + .quad sys_prctl
20014 + .quad stub32_rt_sigreturn
20015 + .quad sys32_rt_sigaction
20016 + .quad sys32_rt_sigprocmask /* 175 */
20017 + .quad sys32_rt_sigpending
20018 + .quad compat_sys_rt_sigtimedwait
20019 + .quad sys32_rt_sigqueueinfo
20020 + .quad stub32_rt_sigsuspend
20021 + .quad sys32_pread /* 180 */
20022 + .quad sys32_pwrite
20023 + .quad sys_chown16
20024 + .quad sys_getcwd
20025 + .quad sys_capget
20026 + .quad sys_capset
20027 + .quad stub32_sigaltstack
20028 + .quad sys32_sendfile
20029 + .quad quiet_ni_syscall /* streams1 */
20030 + .quad quiet_ni_syscall /* streams2 */
20031 + .quad stub32_vfork /* 190 */
20032 + .quad compat_sys_getrlimit
20033 + .quad sys32_mmap2
20034 + .quad sys32_truncate64
20035 + .quad sys32_ftruncate64
20036 + .quad sys32_stat64 /* 195 */
20037 + .quad sys32_lstat64
20038 + .quad sys32_fstat64
20039 + .quad sys_lchown
20040 + .quad sys_getuid
20041 + .quad sys_getgid /* 200 */
20042 + .quad sys_geteuid
20043 + .quad sys_getegid
20044 + .quad sys_setreuid
20045 + .quad sys_setregid
20046 + .quad sys_getgroups /* 205 */
20047 + .quad sys_setgroups
20048 + .quad sys_fchown
20049 + .quad sys_setresuid
20050 + .quad sys_getresuid
20051 + .quad sys_setresgid /* 210 */
20052 + .quad sys_getresgid
20053 + .quad sys_chown
20054 + .quad sys_setuid
20055 + .quad sys_setgid
20056 + .quad sys_setfsuid /* 215 */
20057 + .quad sys_setfsgid
20058 + .quad sys_pivot_root
20059 + .quad sys_mincore
20060 + .quad sys_madvise
20061 + .quad compat_sys_getdents64 /* 220 getdents64 */
20062 + .quad compat_sys_fcntl64
20063 + .quad quiet_ni_syscall /* tux */
20064 + .quad quiet_ni_syscall /* security */
20065 + .quad sys_gettid
20066 + .quad sys_readahead /* 225 */
20067 + .quad sys_setxattr
20068 + .quad sys_lsetxattr
20069 + .quad sys_fsetxattr
20070 + .quad sys_getxattr
20071 + .quad sys_lgetxattr /* 230 */
20072 + .quad sys_fgetxattr
20073 + .quad sys_listxattr
20074 + .quad sys_llistxattr
20075 + .quad sys_flistxattr
20076 + .quad sys_removexattr /* 235 */
20077 + .quad sys_lremovexattr
20078 + .quad sys_fremovexattr
20079 + .quad sys_tkill
20080 + .quad sys_sendfile64
20081 + .quad compat_sys_futex /* 240 */
20082 + .quad compat_sys_sched_setaffinity
20083 + .quad compat_sys_sched_getaffinity
20084 + .quad sys32_set_thread_area
20085 + .quad sys32_get_thread_area
20086 + .quad compat_sys_io_setup /* 245 */
20087 + .quad sys_io_destroy
20088 + .quad compat_sys_io_getevents
20089 + .quad compat_sys_io_submit
20090 + .quad sys_io_cancel
20091 + .quad sys_fadvise64 /* 250 */
20092 + .quad quiet_ni_syscall /* free_huge_pages */
20093 + .quad sys_exit_group
20094 + .quad sys32_lookup_dcookie
20095 + .quad sys_epoll_create
20096 + .quad sys_epoll_ctl /* 255 */
20097 + .quad sys_epoll_wait
20098 + .quad sys_remap_file_pages
20099 + .quad sys_set_tid_address
20100 + .quad compat_sys_timer_create
20101 + .quad compat_sys_timer_settime /* 260 */
20102 + .quad compat_sys_timer_gettime
20103 + .quad sys_timer_getoverrun
20104 + .quad sys_timer_delete
20105 + .quad compat_sys_clock_settime
20106 + .quad compat_sys_clock_gettime /* 265 */
20107 + .quad compat_sys_clock_getres
20108 + .quad compat_sys_clock_nanosleep
20109 + .quad compat_sys_statfs64
20110 + .quad compat_sys_fstatfs64
20111 + .quad sys_tgkill /* 270 */
20112 + .quad compat_sys_utimes
20113 + .quad sys32_fadvise64_64
20114 + .quad quiet_ni_syscall /* sys_vserver */
20115 + .quad sys_mbind
20116 + .quad compat_sys_get_mempolicy /* 275 */
20117 + .quad sys_set_mempolicy
20118 + .quad compat_sys_mq_open
20119 + .quad sys_mq_unlink
20120 + .quad compat_sys_mq_timedsend
20121 + .quad compat_sys_mq_timedreceive /* 280 */
20122 + .quad compat_sys_mq_notify
20123 + .quad compat_sys_mq_getsetattr
20124 + .quad compat_sys_kexec_load /* reserved for kexec */
20125 + .quad compat_sys_waitid
20126 + .quad quiet_ni_syscall /* 285: sys_altroot */
20127 + .quad sys_add_key
20128 + .quad sys_request_key
20129 + .quad sys_keyctl
20130 + .quad sys_ioprio_set
20131 + .quad sys_ioprio_get /* 290 */
20132 + .quad sys_inotify_init
20133 + .quad sys_inotify_add_watch
20134 + .quad sys_inotify_rm_watch
20135 + .quad sys_migrate_pages
20136 + .quad compat_sys_openat /* 295 */
20137 + .quad sys_mkdirat
20138 + .quad sys_mknodat
20139 + .quad sys_fchownat
20140 + .quad compat_sys_futimesat
20141 + .quad sys32_fstatat /* 300 */
20142 + .quad sys_unlinkat
20143 + .quad sys_renameat
20144 + .quad sys_linkat
20145 + .quad sys_symlinkat
20146 + .quad sys_readlinkat /* 305 */
20147 + .quad sys_fchmodat
20148 + .quad sys_faccessat
20149 + .quad quiet_ni_syscall /* pselect6 for now */
20150 + .quad quiet_ni_syscall /* ppoll for now */
20151 + .quad sys_unshare /* 310 */
20152 + .quad compat_sys_set_robust_list
20153 + .quad compat_sys_get_robust_list
20154 + .quad sys_splice
20155 + .quad sys_sync_file_range
20156 + .quad sys_tee
20157 + .quad compat_sys_vmsplice
20158 + .quad compat_sys_move_pages
20159 +ia32_syscall_end:
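The dispatch used by all three entry paths above ("cmpl $(IA32_NR_syscalls-1),%eax; ja ia32_badsys; call *ia32_sys_call_table(,%rax,8)") is a bounds-checked indirect call through the 8-byte-per-entry table that follows. A minimal stand-alone C sketch of the same pattern, for illustration only (it is not part of the patch; the handler names and table size here are invented):

/* Bounds-checked table dispatch, mirroring
 * "cmpl $(IA32_NR_syscalls-1),%eax; ja ia32_badsys;
 *  call *ia32_sys_call_table(,%rax,8)".
 * Handler names and table contents are invented for the example. */
#include <stdio.h>
#include <errno.h>

typedef long (*syscall_fn)(long, long, long);

static long demo_exit(long a, long b, long c)  { return 0; }
static long demo_read(long a, long b, long c)  { return b; }
static long demo_write(long a, long b, long c) { return c; }

static syscall_fn demo_call_table[] = { demo_exit, demo_read, demo_write };
#define DEMO_NR_syscalls (sizeof(demo_call_table) / sizeof(demo_call_table[0]))

static long demo_dispatch(unsigned long nr, long a, long b, long c)
{
	if (nr > DEMO_NR_syscalls - 1)		/* "ja ia32_badsys" */
		return -ENOSYS;
	return demo_call_table[nr](a, b, c);	/* "call *table(,%rax,8)" */
}

int main(void)
{
	printf("%ld %ld\n", demo_dispatch(2, 1, 2, 3), demo_dispatch(99, 0, 0, 0));
	return 0;
}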
20160 Index: head-2008-11-25/arch/x86/kernel/acpi/sleep_64-xen.c
20161 ===================================================================
20162 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
20163 +++ head-2008-11-25/arch/x86/kernel/acpi/sleep_64-xen.c 2008-04-15 09:29:41.000000000 +0200
20164 @@ -0,0 +1,146 @@
20165 +/*
20166 + * acpi.c - Architecture-Specific Low-Level ACPI Support
20167 + *
20168 + * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
20169 + * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
20170 + * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
20171 + * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
20172 + * Copyright (C) 2003 Pavel Machek, SuSE Labs
20173 + *
20174 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20175 + *
20176 + * This program is free software; you can redistribute it and/or modify
20177 + * it under the terms of the GNU General Public License as published by
20178 + * the Free Software Foundation; either version 2 of the License, or
20179 + * (at your option) any later version.
20180 + *
20181 + * This program is distributed in the hope that it will be useful,
20182 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
20183 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20184 + * GNU General Public License for more details.
20185 + *
20186 + * You should have received a copy of the GNU General Public License
20187 + * along with this program; if not, write to the Free Software
20188 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20189 + *
20190 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20191 + */
20192 +
20193 +#include <linux/kernel.h>
20194 +#include <linux/init.h>
20195 +#include <linux/types.h>
20196 +#include <linux/stddef.h>
20197 +#include <linux/slab.h>
20198 +#include <linux/pci.h>
20199 +#include <linux/bootmem.h>
20200 +#include <linux/acpi.h>
20201 +#include <linux/cpumask.h>
20202 +
20203 +#include <asm/mpspec.h>
20204 +#include <asm/io.h>
20205 +#include <asm/apic.h>
20206 +#include <asm/apicdef.h>
20207 +#include <asm/page.h>
20208 +#include <asm/pgtable.h>
20209 +#include <asm/pgalloc.h>
20210 +#include <asm/io_apic.h>
20211 +#include <asm/proto.h>
20212 +#include <asm/tlbflush.h>
20213 +
20214 +/* --------------------------------------------------------------------------
20215 + Low-Level Sleep Support
20216 + -------------------------------------------------------------------------- */
20217 +
20218 +#ifdef CONFIG_ACPI_SLEEP
20219 +
20220 +#ifndef CONFIG_ACPI_PV_SLEEP
20221 +/* address in low memory of the wakeup routine. */
20222 +unsigned long acpi_wakeup_address = 0;
20223 +unsigned long acpi_video_flags;
20224 +extern char wakeup_start, wakeup_end;
20225 +
20226 +extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
20227 +
20228 +static pgd_t low_ptr;
20229 +
20230 +static void init_low_mapping(void)
20231 +{
20232 + pgd_t *slot0 = pgd_offset(current->mm, 0UL);
20233 + low_ptr = *slot0;
20234 + set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET));
20235 + WARN_ON(num_online_cpus() != 1);
20236 + local_flush_tlb();
20237 +}
20238 +#endif
20239 +
20240 +/**
20241 + * acpi_save_state_mem - save kernel state
20242 + *
20243 + * Create an identity mapped page table and copy the wakeup routine to
20244 + * low memory.
20245 + */
20246 +int acpi_save_state_mem(void)
20247 +{
20248 +#ifndef CONFIG_ACPI_PV_SLEEP
20249 + init_low_mapping();
20250 +
20251 + memcpy((void *)acpi_wakeup_address, &wakeup_start,
20252 + &wakeup_end - &wakeup_start);
20253 + acpi_copy_wakeup_routine(acpi_wakeup_address);
20254 +#endif
20255 + return 0;
20256 +}
20257 +
20258 +/*
20259 + * acpi_restore_state
20260 + */
20261 +void acpi_restore_state_mem(void)
20262 +{
20263 +#ifndef CONFIG_ACPI_PV_SLEEP
20264 + set_pgd(pgd_offset(current->mm, 0UL), low_ptr);
20265 + local_flush_tlb();
20266 +#endif
20267 +}
20268 +
20269 +/**
20270 + * acpi_reserve_bootmem - do _very_ early ACPI initialisation
20271 + *
20272 + * We allocate a page in low memory for the wakeup
20273 + * routine for when we come back from a sleep state. The
20274 + * runtime allocator allows specification of <16M pages, but not
20275 + * <1M pages.
20276 + */
20277 +void __init acpi_reserve_bootmem(void)
20278 +{
20279 +#ifndef CONFIG_ACPI_PV_SLEEP
20280 + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
20281 + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE)
20282 + printk(KERN_CRIT
20283 + "ACPI: Wakeup code way too big, will crash on attempt to suspend\n");
20284 +#endif
20285 +}
20286 +
20287 +#ifndef CONFIG_ACPI_PV_SLEEP
20288 +static int __init acpi_sleep_setup(char *str)
20289 +{
20290 + while ((str != NULL) && (*str != '\0')) {
20291 + if (strncmp(str, "s3_bios", 7) == 0)
20292 + acpi_video_flags = 1;
20293 + if (strncmp(str, "s3_mode", 7) == 0)
20294 + acpi_video_flags |= 2;
20295 + str = strchr(str, ',');
20296 + if (str != NULL)
20297 + str += strspn(str, ", \t");
20298 + }
20299 +
20300 + return 1;
20301 +}
20302 +
20303 +__setup("acpi_sleep=", acpi_sleep_setup);
20304 +#endif /* CONFIG_ACPI_PV_SLEEP */
20305 +
20306 +#endif /*CONFIG_ACPI_SLEEP */
20307 +
20308 +void acpi_pci_link_exit(void)
20309 +{
20310 +}
20311 Index: head-2008-11-25/arch/x86/kernel/apic_64-xen.c
20312 ===================================================================
20313 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
20314 +++ head-2008-11-25/arch/x86/kernel/apic_64-xen.c 2007-06-12 13:13:01.000000000 +0200
20315 @@ -0,0 +1,197 @@
20316 +/*
20317 + * Local APIC handling, local APIC timers
20318 + *
20319 + * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
20320 + *
20321 + * Fixes
20322 + * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
20323 + * thanks to Eric Gilmore
20324 + * and Rolf G. Tews
20325 + * for testing these extensively.
20326 + * Maciej W. Rozycki : Various updates and fixes.
20327 + * Mikael Pettersson : Power Management for UP-APIC.
20328 + * Pavel Machek and
20329 + * Mikael Pettersson : PM converted to driver model.
20330 + */
20331 +
20332 +#include <linux/init.h>
20333 +
20334 +#include <linux/mm.h>
20335 +#include <linux/delay.h>
20336 +#include <linux/bootmem.h>
20337 +#include <linux/smp_lock.h>
20338 +#include <linux/interrupt.h>
20339 +#include <linux/mc146818rtc.h>
20340 +#include <linux/kernel_stat.h>
20341 +#include <linux/sysdev.h>
20342 +#include <linux/module.h>
20343 +
20344 +#include <asm/atomic.h>
20345 +#include <asm/smp.h>
20346 +#include <asm/mtrr.h>
20347 +#include <asm/mpspec.h>
20348 +#include <asm/desc.h>
20349 +#include <asm/arch_hooks.h>
20350 +#include <asm/hpet.h>
20351 +#include <asm/idle.h>
20352 +
20353 +int apic_verbosity;
20354 +
20355 +/*
20356 + * 'what should we do if we get a hw irq event on an illegal vector'.
20357 + * each architecture has to answer this themselves.
20358 + */
20359 +void ack_bad_irq(unsigned int irq)
20360 +{
20361 + printk("unexpected IRQ trap at vector %02x\n", irq);
20362 + /*
20363 + * Currently unexpected vectors happen only on SMP and APIC.
20364 + * We _must_ ack these because every local APIC has only N
20365 + * irq slots per priority level, and a 'hanging, unacked' IRQ
20366 + * holds up an irq slot - in excessive cases (when multiple
20367 + * unexpected vectors occur) that might lock up the APIC
20368 + * completely.
20369 + * But don't ack when the APIC is disabled. -AK
20370 + */
20371 + if (!disable_apic)
20372 + ack_APIC_irq();
20373 +}
20374 +
20375 +int setup_profiling_timer(unsigned int multiplier)
20376 +{
20377 + return -EINVAL;
20378 +}
20379 +
20380 +void smp_local_timer_interrupt(struct pt_regs *regs)
20381 +{
20382 + profile_tick(CPU_PROFILING, regs);
20383 +#ifndef CONFIG_XEN
20384 +#ifdef CONFIG_SMP
20385 + update_process_times(user_mode(regs));
20386 +#endif
20387 +#endif
20388 + /*
20389 + * We take the 'long' return path, and there every subsystem
20390 + * grabs the appropriate locks (kernel lock/ irq lock).
20391 + *
20392 + * we might want to decouple profiling from the 'long path',
20393 + * and do the profiling totally in assembly.
20394 + *
20395 + * Currently this isn't too much of an issue (performance wise),
20396 + * we can take more than 100K local irqs per second on a 100 MHz P5.
20397 + */
20398 +}
20399 +
20400 +/*
20401 + * Local APIC timer interrupt. This is the most natural way for doing
20402 + * local interrupts, but local timer interrupts can be emulated by
20403 + * broadcast interrupts too. [in case the hw doesn't support APIC timers]
20404 + *
20405 + * [ if a single-CPU system runs an SMP kernel then we call the local
20406 + * interrupt as well. Thus we cannot inline the local irq ... ]
20407 + */
20408 +void smp_apic_timer_interrupt(struct pt_regs *regs)
20409 +{
20410 + /*
20411 + * the NMI deadlock-detector uses this.
20412 + */
20413 + add_pda(apic_timer_irqs, 1);
20414 +
20415 + /*
20416 + * NOTE! We'd better ACK the irq immediately,
20417 + * because timer handling can be slow.
20418 + */
20419 + ack_APIC_irq();
20420 + /*
20421 + * update_process_times() expects us to have done irq_enter().
20422 + * Besides, if we don't timer interrupts ignore the global
20423 + * interrupt lock, which is the WrongThing (tm) to do.
20424 + */
20425 + exit_idle();
20426 + irq_enter();
20427 + smp_local_timer_interrupt(regs);
20428 + irq_exit();
20429 +}
20430 +
20431 +/*
20432 + * This interrupt should _never_ happen with our APIC/SMP architecture
20433 + */
20434 +asmlinkage void smp_spurious_interrupt(void)
20435 +{
20436 + unsigned int v;
20437 + exit_idle();
20438 + irq_enter();
20439 + /*
20440 + * Check if this really is a spurious interrupt and ACK it
20441 + * if it is a vectored one. Just in case...
20442 + * Spurious interrupts should not be ACKed.
20443 + */
20444 + v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
20445 + if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
20446 + ack_APIC_irq();
20447 +
20448 +#if 0
20449 + static unsigned long last_warning;
20450 + static unsigned long skipped;
20451 +
20452 + /* see sw-dev-man vol 3, chapter 7.4.13.5 */
20453 + if (time_before(last_warning+30*HZ,jiffies)) {
20454 + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n",
20455 + smp_processor_id(), skipped);
20456 + last_warning = jiffies;
20457 + skipped = 0;
20458 + } else {
20459 + skipped++;
20460 + }
20461 +#endif
20462 + irq_exit();
20463 +}
20464 +
20465 +/*
20466 + * This interrupt should never happen with our APIC/SMP architecture
20467 + */
20468 +
20469 +asmlinkage void smp_error_interrupt(void)
20470 +{
20471 + unsigned int v, v1;
20472 +
20473 + exit_idle();
20474 + irq_enter();
20475 + /* First tickle the hardware, only then report what went on. -- REW */
20476 + v = apic_read(APIC_ESR);
20477 + apic_write(APIC_ESR, 0);
20478 + v1 = apic_read(APIC_ESR);
20479 + ack_APIC_irq();
20480 + atomic_inc(&irq_err_count);
20481 +
20482 + /* Here is what the APIC error bits mean:
20483 + 0: Send CS error
20484 + 1: Receive CS error
20485 + 2: Send accept error
20486 + 3: Receive accept error
20487 + 4: Reserved
20488 + 5: Send illegal vector
20489 + 6: Received illegal vector
20490 + 7: Illegal register address
20491 + */
20492 + printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
20493 + smp_processor_id(), v , v1);
20494 + irq_exit();
20495 +}
20496 +
20497 +int disable_apic;
20498 +
20499 +/*
20500 + * This initializes the IO-APIC and APIC hardware if this is
20501 + * a UP kernel.
20502 + */
20503 +int __init APIC_init_uniprocessor (void)
20504 +{
20505 +#ifdef CONFIG_X86_IO_APIC
20506 + if (smp_found_config)
20507 + if (!skip_ioapic_setup && nr_ioapics)
20508 + setup_IO_APIC();
20509 +#endif
20510 +
20511 + return 1;
20512 +}
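The lookup in smp_spurious_interrupt() above, "apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1))", relies on the 256-bit ISR being spread over eight 32-bit registers spaced 0x10 bytes apart, so a vector's register offset is (vector / 32) * 0x10, which is what (vector & ~0x1f) >> 1 computes. A small stand-alone C sketch of that arithmetic (illustrative only; the sample vector value is invented):

/* Offset/bit arithmetic for the sparse APIC ISR layout used above. */
#include <stdio.h>

int main(void)
{
	unsigned int vector = 0xff;			/* example spurious vector */
	unsigned int offset = (vector & ~0x1fU) >> 1;	/* byte offset from APIC_ISR */
	unsigned int bit    = vector & 0x1fU;		/* bit within that register */

	printf("vector 0x%02x -> APIC_ISR+0x%02x, bit %u\n", vector, offset, bit);
	/* same result written the long way: */
	printf("check: 0x%02x, %u\n", (vector / 32) * 0x10, vector % 32);
	return 0;
}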
20513 Index: head-2008-11-25/arch/x86/kernel/e820_64-xen.c
20514 ===================================================================
20515 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
20516 +++ head-2008-11-25/arch/x86/kernel/e820_64-xen.c 2008-04-22 19:56:27.000000000 +0200
20517 @@ -0,0 +1,798 @@
20518 +/*
20519 + * Handle the memory map.
20520 + * The functions here do the job until bootmem takes over.
20521 + *
20522 + * Getting sanitize_e820_map() in sync with i386 version by applying change:
20523 + * - Provisions for empty E820 memory regions (reported by certain BIOSes).
20524 + * Alex Achenbach <xela@slit.de>, December 2002.
20525 + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
20526 + *
20527 + */
20528 +#include <linux/kernel.h>
20529 +#include <linux/types.h>
20530 +#include <linux/init.h>
20531 +#include <linux/bootmem.h>
20532 +#include <linux/ioport.h>
20533 +#include <linux/string.h>
20534 +#include <linux/kexec.h>
20535 +#include <linux/module.h>
20536 +
20537 +#include <asm/pgtable.h>
20538 +#include <asm/page.h>
20539 +#include <asm/e820.h>
20540 +#include <asm/proto.h>
20541 +#include <asm/bootsetup.h>
20542 +#include <asm/sections.h>
20543 +#include <xen/interface/memory.h>
20544 +
20545 +/*
20546 + * PFN of last memory page.
20547 + */
20548 +unsigned long end_pfn;
20549 +EXPORT_SYMBOL(end_pfn);
20550 +
20551 +/*
20552 + * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
20553 + * The direct mapping extends to end_pfn_map, so that we can directly access
20554 + * apertures, ACPI and other tables without having to play with fixmaps.
20555 + */
20556 +unsigned long end_pfn_map;
20557 +
20558 +/*
20559 + * Last pfn which the user wants to use.
20560 + */
20561 +unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;
20562 +
20563 +extern struct resource code_resource, data_resource;
20564 +
20565 +#ifdef CONFIG_XEN
20566 +extern struct e820map machine_e820;
20567 +#endif
20568 +
20569 +/* Check for some hardcoded bad areas that early boot is not allowed to touch */
20570 +static inline int bad_addr(unsigned long *addrp, unsigned long size)
20571 +{
20572 + unsigned long addr = *addrp, last = addr + size;
20573 +
20574 +#ifndef CONFIG_XEN
20575 + /* various gunk below that needed for SMP startup */
20576 + if (addr < 0x8000) {
20577 + *addrp = 0x8000;
20578 + return 1;
20579 + }
20580 +
20581 + /* direct mapping tables of the kernel */
20582 + if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
20583 + *addrp = table_end << PAGE_SHIFT;
20584 + return 1;
20585 + }
20586 +
20587 + /* initrd */
20588 +#ifdef CONFIG_BLK_DEV_INITRD
20589 + if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
20590 + addr < INITRD_START+INITRD_SIZE) {
20591 + *addrp = INITRD_START + INITRD_SIZE;
20592 + return 1;
20593 + }
20594 +#endif
20595 + /* kernel code + 640k memory hole (later should not be needed, but
20596 + be paranoid for now) */
20597 + if (last >= 640*1024 && addr < 1024*1024) {
20598 + *addrp = 1024*1024;
20599 + return 1;
20600 + }
20601 + if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) {
20602 + *addrp = __pa_symbol(&_end);
20603 + return 1;
20604 + }
20605 +
20606 + if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
20607 + *addrp = ebda_addr + ebda_size;
20608 + return 1;
20609 + }
20610 +
20611 + /* XXX ramdisk image here? */
20612 +#else
20613 + if (last < (table_end<<PAGE_SHIFT)) {
20614 + *addrp = table_end << PAGE_SHIFT;
20615 + return 1;
20616 + }
20617 +#endif
20618 + return 0;
20619 +}
20620 +
20621 +/*
20622 + * This function checks if any part of the range <start,end> is mapped
20623 + * with type.
20624 + */
20625 +int e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
20626 +{
20627 + int i;
20628 +
20629 +#ifndef CONFIG_XEN
20630 + for (i = 0; i < e820.nr_map; i++) {
20631 + struct e820entry *ei = &e820.map[i];
20632 +#else
20633 + if (!is_initial_xendomain())
20634 + return 0;
20635 + for (i = 0; i < machine_e820.nr_map; i++) {
20636 + const struct e820entry *ei = &machine_e820.map[i];
20637 +#endif
20638 +
20639 + if (type && ei->type != type)
20640 + continue;
20641 + if (ei->addr >= end || ei->addr + ei->size <= start)
20642 + continue;
20643 + return 1;
20644 + }
20645 + return 0;
20646 +}
20647 +EXPORT_SYMBOL_GPL(e820_any_mapped);
20648 +
20649 +/*
20650 + * This function checks if the entire range <start,end> is mapped with type.
20651 + *
20652 + * Note: this function only works correct if the e820 table is sorted and
20653 + * not-overlapping, which is the case
20654 + */
20655 +int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
20656 +{
20657 + int i;
20658 +
20659 +#ifndef CONFIG_XEN
20660 + for (i = 0; i < e820.nr_map; i++) {
20661 + struct e820entry *ei = &e820.map[i];
20662 +#else
20663 + if (!is_initial_xendomain())
20664 + return 0;
20665 + for (i = 0; i < machine_e820.nr_map; i++) {
20666 + const struct e820entry *ei = &machine_e820.map[i];
20667 +#endif
20668 +
20669 + if (type && ei->type != type)
20670 + continue;
20671 + /* is the region (part) in overlap with the current region ?*/
20672 + if (ei->addr >= end || ei->addr + ei->size <= start)
20673 + continue;
20674 +
20675 + /* if the region is at the beginning of <start,end> we move
20676 + * start to the end of the region since it's ok until there
20677 + */
20678 + if (ei->addr <= start)
20679 + start = ei->addr + ei->size;
20680 + /* if start is now at or beyond end, we're done, full coverage */
20681 + if (start >= end)
20682 + return 1; /* we're done */
20683 + }
20684 + return 0;
20685 +}
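The loop in e820_all_mapped() above works by advancing "start" past every entry that begins at or below it; full coverage is proven once "start" reaches "end". A stand-alone C sketch of that coverage test under the same sorted, non-overlapping assumption (illustrative only; the sample map and the omission of the type filter are simplifications, not part of the patch):

/* Coverage test in the style of e820_all_mapped(): advance "start"
 * through a sorted, non-overlapping range list.  Sample map invented. */
#include <stdio.h>

struct range { unsigned long addr, size; };

static int all_mapped(const struct range *map, int n,
		      unsigned long start, unsigned long end)
{
	int i;

	for (i = 0; i < n; i++) {
		if (map[i].addr >= end || map[i].addr + map[i].size <= start)
			continue;		/* no overlap with [start,end) */
		if (map[i].addr <= start)
			start = map[i].addr + map[i].size;
		if (start >= end)
			return 1;		/* fully covered */
	}
	return 0;
}

int main(void)
{
	static const struct range map[] = {
		{ 0x0,      0xa0000    },	/* 0 - 640k  */
		{ 0x100000, 0x3ff00000 },	/* 1MB - 1GB */
	};

	printf("%d\n", all_mapped(map, 2, 0x100000, 0x200000));	/* 1 */
	printf("%d\n", all_mapped(map, 2, 0x90000,  0x110000));	/* 0: 640k-1M hole */
	return 0;
}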
20686 +
20687 +/*
20688 + * Find a free area in a specific range.
20689 + */
20690 +unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
20691 +{
20692 + int i;
20693 + for (i = 0; i < e820.nr_map; i++) {
20694 + struct e820entry *ei = &e820.map[i];
20695 + unsigned long addr = ei->addr, last;
20696 + if (ei->type != E820_RAM)
20697 + continue;
20698 + if (addr < start)
20699 + addr = start;
20700 + if (addr > ei->addr + ei->size)
20701 + continue;
20702 + while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
20703 + ;
20704 + last = addr + size;
20705 + if (last > ei->addr + ei->size)
20706 + continue;
20707 + if (last > end)
20708 + continue;
20709 + return addr;
20710 + }
20711 + return -1UL;
20712 +}
20713 +
20714 +/*
20715 + * Free bootmem based on the e820 table for a node.
20716 + */
20717 +void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
20718 +{
20719 + int i;
20720 + for (i = 0; i < e820.nr_map; i++) {
20721 + struct e820entry *ei = &e820.map[i];
20722 + unsigned long last, addr;
20723 +
20724 + if (ei->type != E820_RAM ||
20725 + ei->addr+ei->size <= start ||
20726 + ei->addr >= end)
20727 + continue;
20728 +
20729 + addr = round_up(ei->addr, PAGE_SIZE);
20730 + if (addr < start)
20731 + addr = start;
20732 +
20733 + last = round_down(ei->addr + ei->size, PAGE_SIZE);
20734 + if (last >= end)
20735 + last = end;
20736 +
20737 + if (last > addr && last-addr >= PAGE_SIZE)
20738 + free_bootmem_node(pgdat, addr, last-addr);
20739 + }
20740 +}
20741 +
20742 +/*
20743 + * Find the highest page frame number we have available
20744 + */
20745 +unsigned long __init e820_end_of_ram(void)
20746 +{
20747 + int i;
20748 + unsigned long end_pfn = 0;
20749 +
20750 + for (i = 0; i < e820.nr_map; i++) {
20751 + struct e820entry *ei = &e820.map[i];
20752 + unsigned long start, end;
20753 +
20754 + start = round_up(ei->addr, PAGE_SIZE);
20755 + end = round_down(ei->addr + ei->size, PAGE_SIZE);
20756 + if (start >= end)
20757 + continue;
20758 + if (ei->type == E820_RAM) {
20759 + if (end > end_pfn<<PAGE_SHIFT)
20760 + end_pfn = end>>PAGE_SHIFT;
20761 + } else {
20762 + if (end > end_pfn_map<<PAGE_SHIFT)
20763 + end_pfn_map = end>>PAGE_SHIFT;
20764 + }
20765 + }
20766 +
20767 + if (end_pfn > end_pfn_map)
20768 + end_pfn_map = end_pfn;
20769 + if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
20770 + end_pfn_map = MAXMEM>>PAGE_SHIFT;
20771 + if (end_pfn > end_user_pfn)
20772 + end_pfn = end_user_pfn;
20773 + if (end_pfn > end_pfn_map)
20774 + end_pfn = end_pfn_map;
20775 +
20776 + return end_pfn;
20777 +}
20778 +
20779 +/*
20780 + * Compute how much memory is missing in a range.
20781 + * Unlike the other functions in this file the arguments are in page numbers.
20782 + */
20783 +unsigned long __init
20784 +e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
20785 +{
20786 + unsigned long ram = 0;
20787 + unsigned long start = start_pfn << PAGE_SHIFT;
20788 + unsigned long end = end_pfn << PAGE_SHIFT;
20789 + int i;
20790 + for (i = 0; i < e820.nr_map; i++) {
20791 + struct e820entry *ei = &e820.map[i];
20792 + unsigned long last, addr;
20793 +
20794 + if (ei->type != E820_RAM ||
20795 + ei->addr+ei->size <= start ||
20796 + ei->addr >= end)
20797 + continue;
20798 +
20799 + addr = round_up(ei->addr, PAGE_SIZE);
20800 + if (addr < start)
20801 + addr = start;
20802 +
20803 + last = round_down(ei->addr + ei->size, PAGE_SIZE);
20804 + if (last >= end)
20805 + last = end;
20806 +
20807 + if (last > addr)
20808 + ram += last - addr;
20809 + }
20810 + return ((end - start) - ram) >> PAGE_SHIFT;
20811 +}
20812 +
20813 +/*
20814 + * Mark e820 reserved areas as busy for the resource manager.
20815 + */
20816 +void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
20817 +{
20818 + int i;
20819 + for (i = 0; i < nr_map; i++) {
20820 + struct resource *res;
20821 + res = alloc_bootmem_low(sizeof(struct resource));
20822 + switch (e820[i].type) {
20823 + case E820_RAM: res->name = "System RAM"; break;
20824 + case E820_ACPI: res->name = "ACPI Tables"; break;
20825 + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
20826 + default: res->name = "reserved";
20827 + }
20828 + res->start = e820[i].addr;
20829 + res->end = res->start + e820[i].size - 1;
20830 + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
20831 + request_resource(&iomem_resource, res);
20832 + if (e820[i].type == E820_RAM) {
20833 + /*
20834 + * We don't know which RAM region contains kernel data,
20835 + * so we try it repeatedly and let the resource manager
20836 + * test it.
20837 + */
20838 +#ifndef CONFIG_XEN
20839 + request_resource(res, &code_resource);
20840 + request_resource(res, &data_resource);
20841 +#endif
20842 +#ifdef CONFIG_KEXEC
20843 + if (crashk_res.start != crashk_res.end)
20844 + request_resource(res, &crashk_res);
20845 +#ifdef CONFIG_XEN
20846 + xen_machine_kexec_register_resources(res);
20847 +#endif
20848 +#endif
20849 + }
20850 + }
20851 +}
20852 +
20853 +/*
20854 + * Add a memory region to the kernel e820 map.
20855 + */
20856 +void __init add_memory_region(unsigned long start, unsigned long size, int type)
20857 +{
20858 + int x = e820.nr_map;
20859 +
20860 + if (x == E820MAX) {
20861 + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
20862 + return;
20863 + }
20864 +
20865 + e820.map[x].addr = start;
20866 + e820.map[x].size = size;
20867 + e820.map[x].type = type;
20868 + e820.nr_map++;
20869 +}
20870 +
20871 +void __init e820_print_map(char *who)
20872 +{
20873 + int i;
20874 +
20875 + for (i = 0; i < e820.nr_map; i++) {
20876 + printk(" %s: %016Lx - %016Lx ", who,
20877 + (unsigned long long) e820.map[i].addr,
20878 + (unsigned long long) (e820.map[i].addr + e820.map[i].size));
20879 + switch (e820.map[i].type) {
20880 + case E820_RAM: printk("(usable)\n");
20881 + break;
20882 + case E820_RESERVED:
20883 + printk("(reserved)\n");
20884 + break;
20885 + case E820_ACPI:
20886 + printk("(ACPI data)\n");
20887 + break;
20888 + case E820_NVS:
20889 + printk("(ACPI NVS)\n");
20890 + break;
20891 + default: printk("type %u\n", e820.map[i].type);
20892 + break;
20893 + }
20894 + }
20895 +}
20896 +
20897 +/*
20898 + * Sanitize the BIOS e820 map.
20899 + *
20900 + * Some e820 responses include overlapping entries. The following
20901 + * replaces the original e820 map with a new one, removing overlaps.
20902 + *
20903 + */
20904 +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
20905 +{
20906 + struct change_member {
20907 + struct e820entry *pbios; /* pointer to original bios entry */
20908 + unsigned long long addr; /* address for this change point */
20909 + };
20910 + static struct change_member change_point_list[2*E820MAX] __initdata;
20911 + static struct change_member *change_point[2*E820MAX] __initdata;
20912 + static struct e820entry *overlap_list[E820MAX] __initdata;
20913 + static struct e820entry new_bios[E820MAX] __initdata;
20914 + struct change_member *change_tmp;
20915 + unsigned long current_type, last_type;
20916 + unsigned long long last_addr;
20917 + int chgidx, still_changing;
20918 + int overlap_entries;
20919 + int new_bios_entry;
20920 + int old_nr, new_nr, chg_nr;
20921 + int i;
20922 +
20923 + /*
20924 + Visually we're performing the following (1,2,3,4 = memory types)...
20925 +
20926 + Sample memory map (w/overlaps):
20927 + ____22__________________
20928 + ______________________4_
20929 + ____1111________________
20930 + _44_____________________
20931 + 11111111________________
20932 + ____________________33__
20933 + ___________44___________
20934 + __________33333_________
20935 + ______________22________
20936 + ___________________2222_
20937 + _________111111111______
20938 + _____________________11_
20939 + _________________4______
20940 +
20941 + Sanitized equivalent (no overlap):
20942 + 1_______________________
20943 + _44_____________________
20944 + ___1____________________
20945 + ____22__________________
20946 + ______11________________
20947 + _________1______________
20948 + __________3_____________
20949 + ___________44___________
20950 + _____________33_________
20951 + _______________2________
20952 + ________________1_______
20953 + _________________4______
20954 + ___________________2____
20955 + ____________________33__
20956 + ______________________4_
20957 + */
20958 +
20959 + /* if there's only one memory region, don't bother */
20960 + if (*pnr_map < 2)
20961 + return -1;
20962 +
20963 + old_nr = *pnr_map;
20964 +
20965 + /* bail out if we find any unreasonable addresses in bios map */
20966 + for (i=0; i<old_nr; i++)
20967 + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
20968 + return -1;
20969 +
20970 + /* create pointers for initial change-point information (for sorting) */
20971 + for (i=0; i < 2*old_nr; i++)
20972 + change_point[i] = &change_point_list[i];
20973 +
20974 + /* record all known change-points (starting and ending addresses),
20975 + omitting those that are for empty memory regions */
20976 + chgidx = 0;
20977 + for (i=0; i < old_nr; i++) {
20978 + if (biosmap[i].size != 0) {
20979 + change_point[chgidx]->addr = biosmap[i].addr;
20980 + change_point[chgidx++]->pbios = &biosmap[i];
20981 + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
20982 + change_point[chgidx++]->pbios = &biosmap[i];
20983 + }
20984 + }
20985 + chg_nr = chgidx;
20986 +
20987 + /* sort change-point list by memory addresses (low -> high) */
20988 + still_changing = 1;
20989 + while (still_changing) {
20990 + still_changing = 0;
20991 + for (i=1; i < chg_nr; i++) {
20992 + /* if <current_addr> > <last_addr>, swap */
20993 + /* or, if current=<start_addr> & last=<end_addr>, swap */
20994 + if ((change_point[i]->addr < change_point[i-1]->addr) ||
20995 + ((change_point[i]->addr == change_point[i-1]->addr) &&
20996 + (change_point[i]->addr == change_point[i]->pbios->addr) &&
20997 + (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
20998 + )
20999 + {
21000 + change_tmp = change_point[i];
21001 + change_point[i] = change_point[i-1];
21002 + change_point[i-1] = change_tmp;
21003 + still_changing=1;
21004 + }
21005 + }
21006 + }
21007 +
21008 + /* create a new bios memory map, removing overlaps */
21009 + overlap_entries=0; /* number of entries in the overlap table */
21010 + new_bios_entry=0; /* index for creating new bios map entries */
21011 + last_type = 0; /* start with undefined memory type */
21012 + last_addr = 0; /* start with 0 as last starting address */
21013 + /* loop through change-points, determining affect on the new bios map */
21014 + for (chgidx=0; chgidx < chg_nr; chgidx++)
21015 + {
21016 + /* keep track of all overlapping bios entries */
21017 + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
21018 + {
21019 + /* add map entry to overlap list (> 1 entry implies an overlap) */
21020 + overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
21021 + }
21022 + else
21023 + {
21024 + /* remove entry from list (order independent, so swap with last) */
21025 + for (i=0; i<overlap_entries; i++)
21026 + {
21027 + if (overlap_list[i] == change_point[chgidx]->pbios)
21028 + overlap_list[i] = overlap_list[overlap_entries-1];
21029 + }
21030 + overlap_entries--;
21031 + }
21032 + /* if there are overlapping entries, decide which "type" to use */
21033 + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
21034 + current_type = 0;
21035 + for (i=0; i<overlap_entries; i++)
21036 + if (overlap_list[i]->type > current_type)
21037 + current_type = overlap_list[i]->type;
21038 + /* continue building up new bios map based on this information */
21039 + if (current_type != last_type) {
21040 + if (last_type != 0) {
21041 + new_bios[new_bios_entry].size =
21042 + change_point[chgidx]->addr - last_addr;
21043 + /* move forward only if the new size was non-zero */
21044 + if (new_bios[new_bios_entry].size != 0)
21045 + if (++new_bios_entry >= E820MAX)
21046 + break; /* no more space left for new bios entries */
21047 + }
21048 + if (current_type != 0) {
21049 + new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
21050 + new_bios[new_bios_entry].type = current_type;
21051 + last_addr=change_point[chgidx]->addr;
21052 + }
21053 + last_type = current_type;
21054 + }
21055 + }
21056 + new_nr = new_bios_entry; /* retain count for new bios entries */
21057 +
21058 + /* copy new bios mapping into original location */
21059 + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
21060 + *pnr_map = new_nr;
21061 +
21062 + return 0;
21063 +}
21064 +
21065 +/*
21066 + * Copy the BIOS e820 map into a safe place.
21067 + *
21068 + * Sanity-check it while we're at it..
21069 + *
21070 + * If we're lucky and live on a modern system, the setup code
21071 + * will have given us a memory map that we can use to properly
21072 + * set up memory. If we aren't, we'll fake a memory map.
21073 + *
21074 + * We check to see that the memory map contains at least 2 elements
21075 + * before we'll use it, because the detection code in setup.S may
21076 + * not be perfect and most every PC known to man has two memory
21077 + * regions: one from 0 to 640k, and one from 1mb up. (The IBM
21078 + * thinkpad 560x, for example, does not cooperate with the memory
21079 + * detection code.)
21080 + */
21081 +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
21082 +{
21083 +#ifndef CONFIG_XEN
21084 + /* Only one memory region (or negative)? Ignore it */
21085 + if (nr_map < 2)
21086 + return -1;
21087 +#else
21088 + BUG_ON(nr_map < 1);
21089 +#endif
21090 +
21091 + do {
21092 + unsigned long start = biosmap->addr;
21093 + unsigned long size = biosmap->size;
21094 + unsigned long end = start + size;
21095 + unsigned long type = biosmap->type;
21096 +
21097 + /* Overflow in 64 bits? Ignore the memory map. */
21098 + if (start > end)
21099 + return -1;
21100 +
21101 +#ifndef CONFIG_XEN
21102 + /*
21103 + * Some BIOSes claim RAM in the 640k - 1M region.
21104 + * Not right. Fix it up.
21105 + *
21106 + * This should be removed on Hammer which is supposed to not
21107 + * have non e820 covered ISA mappings there, but I had some strange
21108 + * problems so it stays for now. -AK
21109 + */
21110 + if (type == E820_RAM) {
21111 + if (start < 0x100000ULL && end > 0xA0000ULL) {
21112 + if (start < 0xA0000ULL)
21113 + add_memory_region(start, 0xA0000ULL-start, type);
21114 + if (end <= 0x100000ULL)
21115 + continue;
21116 + start = 0x100000ULL;
21117 + size = end - start;
21118 + }
21119 + }
21120 +#endif
21121 +
21122 + add_memory_region(start, size, type);
21123 + } while (biosmap++,--nr_map);
21124 +
21125 +#ifdef CONFIG_XEN
21126 + if (is_initial_xendomain()) {
21127 + struct xen_memory_map memmap;
21128 +
21129 + memmap.nr_entries = E820MAX;
21130 + set_xen_guest_handle(memmap.buffer, machine_e820.map);
21131 +
21132 + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
21133 + BUG();
21134 + machine_e820.nr_map = memmap.nr_entries;
21135 + } else
21136 + machine_e820 = e820;
21137 +#endif
21138 +
21139 + return 0;
21140 +}
21141 +
21142 +#ifndef CONFIG_XEN
21143 +void __init setup_memory_region(void)
21144 +{
21145 + char *who = "BIOS-e820";
21146 +
21147 + /*
21148 + * Try to copy the BIOS-supplied E820-map.
21149 + *
21150 + * Otherwise fake a memory map; one section from 0k->640k,
21151 + * the next section from 1mb->appropriate_mem_k
21152 + */
21153 + sanitize_e820_map(E820_MAP, &E820_MAP_NR);
21154 + if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
21155 + unsigned long mem_size;
21156 +
21157 + /* compare results from other methods and take the greater */
21158 + if (ALT_MEM_K < EXT_MEM_K) {
21159 + mem_size = EXT_MEM_K;
21160 + who = "BIOS-88";
21161 + } else {
21162 + mem_size = ALT_MEM_K;
21163 + who = "BIOS-e801";
21164 + }
21165 +
21166 + e820.nr_map = 0;
21167 + add_memory_region(0, LOWMEMSIZE(), E820_RAM);
21168 + add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
21169 + }
21170 + printk(KERN_INFO "BIOS-provided physical RAM map:\n");
21171 + e820_print_map(who);
21172 +}
21173 +
21174 +#else /* CONFIG_XEN */
21175 +
21176 +void __init setup_memory_region(void)
21177 +{
21178 + int rc;
21179 + struct xen_memory_map memmap;
21180 + /*
21181 + * This is rather large for a stack variable but this early in
21182 + * the boot process we know we have plenty slack space.
21183 + */
21184 + struct e820entry map[E820MAX];
21185 +
21186 + memmap.nr_entries = E820MAX;
21187 + set_xen_guest_handle(memmap.buffer, map);
21188 +
21189 + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
21190 + if ( rc == -ENOSYS ) {
21191 + memmap.nr_entries = 1;
21192 + map[0].addr = 0ULL;
21193 + map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
21194 + /* 8MB slack (to balance backend allocations). */
21195 + map[0].size += 8 << 20;
21196 + map[0].type = E820_RAM;
21197 + rc = 0;
21198 + }
21199 + BUG_ON(rc);
21200 +
21201 + sanitize_e820_map(map, (char *)&memmap.nr_entries);
21202 +
21203 + BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
21204 +
21205 + printk(KERN_INFO "BIOS-provided physical RAM map:\n");
21206 + e820_print_map("Xen");
21207 +}
21208 +#endif
21209 +
21210 +void __init parse_memopt(char *p, char **from)
21211 +{
21212 + int i;
21213 + unsigned long current_end;
21214 + unsigned long end;
21215 +
21216 + end_user_pfn = memparse(p, from);
21217 + end_user_pfn >>= PAGE_SHIFT;
21218 +
21219 + end = end_user_pfn<<PAGE_SHIFT;
21220 + i = e820.nr_map-1;
21221 + current_end = e820.map[i].addr + e820.map[i].size;
21222 +
21223 + if (current_end < end) {
21224 + /*
21225 + * The e820 map ends before our requested size so
21226 + * extend the final entry to the requested address.
21227 + */
21228 + if (e820.map[i].type == E820_RAM)
21229 + e820.map[i].size = end - e820.map[i].addr;
21230 + else
21231 + add_memory_region(current_end, end - current_end, E820_RAM);
21232 + }
21233 +}
21234 +
21235 +void __init parse_memmapopt(char *p, char **from)
21236 +{
21237 + unsigned long long start_at, mem_size;
21238 +
21239 + mem_size = memparse(p, from);
21240 + p = *from;
21241 + if (*p == '@') {
21242 + start_at = memparse(p+1, from);
21243 + add_memory_region(start_at, mem_size, E820_RAM);
21244 + } else if (*p == '#') {
21245 + start_at = memparse(p+1, from);
21246 + add_memory_region(start_at, mem_size, E820_ACPI);
21247 + } else if (*p == '$') {
21248 + start_at = memparse(p+1, from);
21249 + add_memory_region(start_at, mem_size, E820_RESERVED);
21250 + } else {
21251 + end_user_pfn = (mem_size >> PAGE_SHIFT);
21252 + }
21253 + p = *from;
21254 +}
21255 +
21256 +unsigned long pci_mem_start = 0xaeedbabe;
21257 +EXPORT_SYMBOL(pci_mem_start);
21258 +
21259 +/*
21260 + * Search for the biggest gap in the low 32 bits of the e820
21261 + * memory space. We pass this space to PCI to assign MMIO resources
21262 + * for hotplug or unconfigured devices in.
21263 + * Hopefully the BIOS let enough space left.
21264 + */
21265 +__init void e820_setup_gap(struct e820entry *e820, int nr_map)
21266 +{
21267 + unsigned long gapstart, gapsize, round;
21268 + unsigned long last;
21269 + int i;
21270 + int found = 0;
21271 +
21272 + last = 0x100000000ull;
21273 + gapstart = 0x10000000;
21274 + gapsize = 0x400000;
21275 + i = nr_map;
21276 + while (--i >= 0) {
21277 + unsigned long long start = e820[i].addr;
21278 + unsigned long long end = start + e820[i].size;
21279 +
21280 + /*
21281 + * Since "last" is at most 4GB, we know we'll
21282 + * fit in 32 bits if this condition is true
21283 + */
21284 + if (last > end) {
21285 + unsigned long gap = last - end;
21286 +
21287 + if (gap > gapsize) {
21288 + gapsize = gap;
21289 + gapstart = end;
21290 + found = 1;
21291 + }
21292 + }
21293 + if (start < last)
21294 + last = start;
21295 + }
21296 +
21297 + if (!found) {
21298 + gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
21299 + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
21300 + KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
21301 + }
21302 +
21303 + /*
21304 + * See how much we want to round up: start off with
21305 + * rounding to the next 1MB area.
21306 + */
21307 + round = 0x100000;
21308 + while ((gapsize >> 4) > round)
21309 + round += round;
21310 + /* Fun with two's complement */
21311 + pci_mem_start = (gapstart + round) & -round;
21312 +
21313 + printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
21314 + pci_mem_start, gapstart, gapsize);
21315 +}
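The "fun with two's complement" at the end of e820_setup_gap() picks "round" as the smallest power of two (at least 1MB) not smaller than gapsize/16, then aligns gapstart up with (gapstart + round) & -round, using -round == ~(round - 1). A stand-alone C sketch of that rounding (illustrative only; the gap values are invented):

/* Rounding as done at the end of e820_setup_gap() above. */
#include <stdio.h>

int main(void)
{
	unsigned long gapstart = 0xcff00000UL;	/* invented example gap */
	unsigned long gapsize  = 0x30000000UL;
	unsigned long round    = 0x100000UL;	/* start at 1MB */

	while ((gapsize >> 4) > round)
		round += round;			/* double until >= gapsize/16 */

	printf("round         = 0x%lx\n", round);
	printf("pci_mem_start = 0x%lx\n", (gapstart + round) & -round);
	return 0;
}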
21316 Index: head-2008-11-25/arch/x86/kernel/early_printk-xen.c
21317 ===================================================================
21318 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
21319 +++ head-2008-11-25/arch/x86/kernel/early_printk-xen.c 2007-06-12 13:13:01.000000000 +0200
21320 @@ -0,0 +1,302 @@
21321 +#include <linux/console.h>
21322 +#include <linux/kernel.h>
21323 +#include <linux/init.h>
21324 +#include <linux/string.h>
21325 +#include <linux/screen_info.h>
21326 +#include <asm/io.h>
21327 +#include <asm/processor.h>
21328 +#include <asm/fcntl.h>
21329 +
21330 +/* Simple VGA output */
21331 +
21332 +#ifdef __i386__
21333 +#include <asm/setup.h>
21334 +#define VGABASE (__ISA_IO_base + 0xb8000)
21335 +#else
21336 +#include <asm/bootsetup.h>
21337 +#define VGABASE ((void __iomem *)0xffffffff800b8000UL)
21338 +#endif
21339 +
21340 +#ifndef CONFIG_XEN
21341 +static int max_ypos = 25, max_xpos = 80;
21342 +static int current_ypos = 25, current_xpos = 0;
21343 +
21344 +static void early_vga_write(struct console *con, const char *str, unsigned n)
21345 +{
21346 + char c;
21347 + int i, k, j;
21348 +
21349 + while ((c = *str++) != '\0' && n-- > 0) {
21350 + if (current_ypos >= max_ypos) {
21351 + /* scroll 1 line up */
21352 + for (k = 1, j = 0; k < max_ypos; k++, j++) {
21353 + for (i = 0; i < max_xpos; i++) {
21354 + writew(readw(VGABASE+2*(max_xpos*k+i)),
21355 + VGABASE + 2*(max_xpos*j + i));
21356 + }
21357 + }
21358 + for (i = 0; i < max_xpos; i++)
21359 + writew(0x720, VGABASE + 2*(max_xpos*j + i));
21360 + current_ypos = max_ypos-1;
21361 + }
21362 + if (c == '\n') {
21363 + current_xpos = 0;
21364 + current_ypos++;
21365 + } else if (c != '\r') {
21366 + writew(((0x7 << 8) | (unsigned short) c),
21367 + VGABASE + 2*(max_xpos*current_ypos +
21368 + current_xpos++));
21369 + if (current_xpos >= max_xpos) {
21370 + current_xpos = 0;
21371 + current_ypos++;
21372 + }
21373 + }
21374 + }
21375 +}
21376 +
21377 +static struct console early_vga_console = {
21378 + .name = "earlyvga",
21379 + .write = early_vga_write,
21380 + .flags = CON_PRINTBUFFER,
21381 + .index = -1,
21382 +};
21383 +
21384 +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
21385 +
21386 +static int early_serial_base = 0x3f8; /* ttyS0 */
21387 +
21388 +#define XMTRDY 0x20
21389 +
21390 +#define DLAB 0x80
21391 +
21392 +#define TXR 0 /* Transmit register (WRITE) */
21393 +#define RXR 0 /* Receive register (READ) */
21394 +#define IER 1 /* Interrupt Enable */
21395 +#define IIR 2 /* Interrupt ID */
21396 +#define FCR 2 /* FIFO control */
21397 +#define LCR 3 /* Line control */
21398 +#define MCR 4 /* Modem control */
21399 +#define LSR 5 /* Line Status */
21400 +#define MSR 6 /* Modem Status */
21401 +#define DLL 0 /* Divisor Latch Low */
21402 +#define DLH 1 /* Divisor latch High */
21403 +
21404 +static int early_serial_putc(unsigned char ch)
21405 +{
21406 + unsigned timeout = 0xffff;
21407 + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
21408 + cpu_relax();
21409 + outb(ch, early_serial_base + TXR);
21410 + return timeout ? 0 : -1;
21411 +}
21412 +
21413 +static void early_serial_write(struct console *con, const char *s, unsigned n)
21414 +{
21415 + while (*s && n-- > 0) {
21416 + early_serial_putc(*s);
21417 + if (*s == '\n')
21418 + early_serial_putc('\r');
21419 + s++;
21420 + }
21421 +}
21422 +
21423 +#define DEFAULT_BAUD 9600
21424 +
21425 +static __init void early_serial_init(char *s)
21426 +{
21427 + unsigned char c;
21428 + unsigned divisor;
21429 + unsigned baud = DEFAULT_BAUD;
21430 + char *e;
21431 +
21432 + if (*s == ',')
21433 + ++s;
21434 +
21435 + if (*s) {
21436 + unsigned port;
21437 + if (!strncmp(s,"0x",2)) {
21438 + early_serial_base = simple_strtoul(s, &e, 16);
21439 + } else {
21440 + static int bases[] = { 0x3f8, 0x2f8 };
21441 +
21442 + if (!strncmp(s,"ttyS",4))
21443 + s += 4;
21444 + port = simple_strtoul(s, &e, 10);
21445 + if (port > 1 || s == e)
21446 + port = 0;
21447 + early_serial_base = bases[port];
21448 + }
21449 + s += strcspn(s, ",");
21450 + if (*s == ',')
21451 + s++;
21452 + }
21453 +
21454 + outb(0x3, early_serial_base + LCR); /* 8n1 */
21455 + outb(0, early_serial_base + IER); /* no interrupt */
21456 + outb(0, early_serial_base + FCR); /* no fifo */
21457 + outb(0x3, early_serial_base + MCR); /* DTR + RTS */
21458 +
21459 + if (*s) {
21460 + baud = simple_strtoul(s, &e, 0);
21461 + if (baud == 0 || s == e)
21462 + baud = DEFAULT_BAUD;
21463 + }
21464 +
21465 + divisor = 115200 / baud;
21466 + c = inb(early_serial_base + LCR);
21467 + outb(c | DLAB, early_serial_base + LCR);
21468 + outb(divisor & 0xff, early_serial_base + DLL);
21469 + outb((divisor >> 8) & 0xff, early_serial_base + DLH);
21470 + outb(c & ~DLAB, early_serial_base + LCR);
21471 +}
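For reference, the divisor programmed into DLL/DLH above is simply 115200 / baud; a minimal user-space sketch of that arithmetic (the printed values are illustrative, nothing here touches real hardware):

#include <stdio.h>

/* A 16550-style UART divides a 115200 Hz base rate by the divisor written
 * to DLL (low byte) and DLH (high byte) while DLAB is set, exactly as
 * early_serial_init() does above. */
int main(void)
{
	unsigned bauds[] = { 9600, 38400, 115200 };

	for (unsigned i = 0; i < 3; i++) {
		unsigned divisor = 115200 / bauds[i];

		printf("baud %6u -> divisor %3u (DLL=0x%02x, DLH=0x%02x)\n",
		       bauds[i], divisor, divisor & 0xff, (divisor >> 8) & 0xff);
	}
	return 0;
}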
21472 +
21473 +#else /* CONFIG_XEN */
21474 +
21475 +static void
21476 +early_serial_write(struct console *con, const char *s, unsigned count)
21477 +{
21478 + int n;
21479 +
21480 + while (count > 0) {
21481 + n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s);
21482 + if (n <= 0)
21483 + break;
21484 + count -= n;
21485 + s += n;
21486 + }
21487 +}
21488 +
21489 +static __init void early_serial_init(char *s)
21490 +{
21491 +}
21492 +
21493 +/*
21494 + * No early VGA console on Xen, as we do not have convenient ISA-space
21495 + * mappings. Someone should fix this for domain 0. For now, use fake serial.
21496 + */
21497 +#define early_vga_console early_serial_console
21498 +
21499 +#endif
21500 +
21501 +static struct console early_serial_console = {
21502 + .name = "earlyser",
21503 + .write = early_serial_write,
21504 + .flags = CON_PRINTBUFFER,
21505 + .index = -1,
21506 +};
21507 +
21508 +/* Console interface to a host file on AMD's SimNow! */
21509 +
21510 +static int simnow_fd;
21511 +
21512 +enum {
21513 + MAGIC1 = 0xBACCD00A,
21514 + MAGIC2 = 0xCA110000,
21515 + XOPEN = 5,
21516 + XWRITE = 4,
21517 +};
21518 +
21519 +static noinline long simnow(long cmd, long a, long b, long c)
21520 +{
21521 + long ret;
21522 + asm volatile("cpuid" :
21523 + "=a" (ret) :
21524 + "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
21525 + return ret;
21526 +}
21527 +
21528 +void __init simnow_init(char *str)
21529 +{
21530 + char *fn = "klog";
21531 + if (*str == '=')
21532 + fn = ++str;
21533 + /* error ignored */
21534 + simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
21535 +}
21536 +
21537 +static void simnow_write(struct console *con, const char *s, unsigned n)
21538 +{
21539 + simnow(XWRITE, simnow_fd, (unsigned long)s, n);
21540 +}
21541 +
21542 +static struct console simnow_console = {
21543 + .name = "simnow",
21544 + .write = simnow_write,
21545 + .flags = CON_PRINTBUFFER,
21546 + .index = -1,
21547 +};
21548 +
21549 +/* Direct interface for emergencies */
21550 +struct console *early_console = &early_vga_console;
21551 +static int early_console_initialized = 0;
21552 +
21553 +void early_printk(const char *fmt, ...)
21554 +{
21555 + char buf[512];
21556 + int n;
21557 + va_list ap;
21558 +
21559 + va_start(ap,fmt);
21560 + n = vscnprintf(buf,512,fmt,ap);
21561 + early_console->write(early_console,buf,n);
21562 + va_end(ap);
21563 +}
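The function above follows a simple pattern: format into a fixed 512-byte buffer, then hand the bytes to whichever console is currently selected. A stand-alone sketch of the same shape (the mini_console type and names are invented for this example; they are not kernel APIs):

#include <stdarg.h>
#include <stdio.h>

/* Invented miniature console abstraction mirroring the shape of the code above. */
struct mini_console {
	const char *name;
	void (*write)(struct mini_console *con, const char *buf, unsigned n);
};

static void stdout_write(struct mini_console *con, const char *buf, unsigned n)
{
	fwrite(buf, 1, n, stdout);
}

static struct mini_console demo_console = { "demo", stdout_write };
static struct mini_console *cur_console = &demo_console;

static void mini_early_printk(const char *fmt, ...)
{
	char buf[512];
	va_list ap;
	int n;

	va_start(ap, fmt);
	n = vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);
	if (n < 0)
		return;
	if (n > (int)sizeof(buf) - 1)
		n = sizeof(buf) - 1;	/* user-space vsnprintf reports the untruncated length */
	cur_console->write(cur_console, buf, n);
}

int main(void)
{
	mini_early_printk("hello from the %s console\n", cur_console->name);
	return 0;
}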
21564 +
21565 +static int __initdata keep_early;
21566 +
21567 +int __init setup_early_printk(char *opt)
21568 +{
21569 + char *space;
21570 + char buf[256];
21571 +
21572 + if (early_console_initialized)
21573 + return 1;
21574 +
21575 + strlcpy(buf,opt,sizeof(buf));
21576 + space = strchr(buf, ' ');
21577 + if (space)
21578 + *space = 0;
21579 +
21580 + if (strstr(buf,"keep"))
21581 + keep_early = 1;
21582 +
21583 + if (!strncmp(buf, "serial", 6)) {
21584 + early_serial_init(buf + 6);
21585 + early_console = &early_serial_console;
21586 + } else if (!strncmp(buf, "ttyS", 4)) {
21587 + early_serial_init(buf);
21588 + early_console = &early_serial_console;
21589 + } else if (!strncmp(buf, "vga", 3)
21590 +#ifndef CONFIG_XEN
21591 + && SCREEN_INFO.orig_video_isVGA == 1) {
21592 + max_xpos = SCREEN_INFO.orig_video_cols;
21593 + max_ypos = SCREEN_INFO.orig_video_lines;
21594 + current_ypos = SCREEN_INFO.orig_y;
21595 +#else
21596 + || !strncmp(buf, "xen", 3)) {
21597 +#endif
21598 + early_console = &early_vga_console;
21599 + } else if (!strncmp(buf, "simnow", 6)) {
21600 + simnow_init(buf + 6);
21601 + early_console = &simnow_console;
21602 + keep_early = 1;
21603 + }
21604 + early_console_initialized = 1;
21605 + register_console(early_console);
21606 + return 0;
21607 +}
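The dispatcher above selects a console purely by option prefix ("serial", "ttyS", "vga", "xen" in the CONFIG_XEN build, "simnow"), with "keep" recognized anywhere in the option string. A rough user-space sketch of that strncmp-based dispatch, for illustration only (the real semantics, e.g. the SCREEN_INFO check for "vga", live in the code above):

#include <stdio.h>
#include <string.h>

static const char *classify_earlyprintk(const char *opt)
{
	if (!strncmp(opt, "serial", 6) || !strncmp(opt, "ttyS", 4))
		return "serial";
	if (!strncmp(opt, "vga", 3) || !strncmp(opt, "xen", 3))
		return "vga/xen";
	if (!strncmp(opt, "simnow", 6))
		return "simnow";
	return "unknown";
}

int main(void)
{
	const char *samples[] = { "serial,ttyS0,9600,keep", "ttyS1,115200",
				  "vga", "xen", "simnow=klog" };

	for (int i = 0; i < 5; i++)
		printf("earlyprintk=%-22s -> %-7s keep=%d\n", samples[i],
		       classify_earlyprintk(samples[i]),
		       strstr(samples[i], "keep") != NULL);
	return 0;
}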
21608 +
21609 +void __init disable_early_printk(void)
21610 +{
21611 + if (!early_console_initialized || !early_console)
21612 + return;
21613 + if (!keep_early) {
21614 + printk("disabling early console\n");
21615 + unregister_console(early_console);
21616 + early_console_initialized = 0;
21617 + } else {
21618 + printk("keeping early console\n");
21619 + }
21620 +}
21621 +
21622 +__setup("earlyprintk=", setup_early_printk);
21623 Index: head-2008-11-25/arch/x86/kernel/entry_64-xen.S
21624 ===================================================================
21625 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
21626 +++ head-2008-11-25/arch/x86/kernel/entry_64-xen.S 2008-10-29 09:55:56.000000000 +0100
21627 @@ -0,0 +1,1322 @@
21628 +/*
21629 + * linux/arch/x86_64/entry.S
21630 + *
21631 + * Copyright (C) 1991, 1992 Linus Torvalds
21632 + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
21633 + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
21634 + *
21635 + * $Id$
21636 + *
21637 + * Jun Nakajima <jun.nakajima@intel.com>
21638 + * Asit Mallick <asit.k.mallick@intel.com>
21639 + * Modified for Xen
21640 + */
21641 +
21642 +/*
21643 + * entry.S contains the system-call and fault low-level handling routines.
21644 + *
21645 + * NOTE: This code handles signal-recognition, which happens every time
21646 + * after an interrupt and after each system call.
21647 + *
21648 + * Normal syscalls and interrupts don't save a full stack frame; this is
21649 + * only done for syscall tracing, signals or fork/exec et al.
21650 + *
21651 + * A note on terminology:
21652 + * - top of stack: Architecture defined interrupt frame from SS to RIP
21653 + * at the top of the kernel process stack.
21654 + * - partial stack frame: partially saved registers up to R11.
21655 + * - full stack frame: like the partial stack frame, but with all registers saved.
21656 + *
21657 + * TODO:
21658 + * - schedule it carefully for the final hardware.
21659 + */
21660 +
21661 +#define ASSEMBLY 1
21662 +#include <linux/linkage.h>
21663 +#include <asm/segment.h>
21664 +#include <asm/smp.h>
21665 +#include <asm/cache.h>
21666 +#include <asm/errno.h>
21667 +#include <asm/dwarf2.h>
21668 +#include <asm/calling.h>
21669 +#include <asm/asm-offsets.h>
21670 +#include <asm/msr.h>
21671 +#include <asm/unistd.h>
21672 +#include <asm/thread_info.h>
21673 +#include <asm/hw_irq.h>
21674 +#include <asm/page.h>
21675 +#include <asm/irqflags.h>
21676 +#include <asm/errno.h>
21677 +#include <xen/interface/arch-x86_64.h>
21678 +#include <xen/interface/features.h>
21679 +
21680 +#include "xen_entry.S"
21681 +
21682 + .code64
21683 +
21684 +#ifndef CONFIG_PREEMPT
21685 +#define retint_kernel retint_restore_args
21686 +#endif
21687 +
21688 +
21689 +.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
21690 +#ifdef CONFIG_TRACE_IRQFLAGS
21691 + bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
21692 + jnc 1f
21693 + TRACE_IRQS_ON
21694 +1:
21695 +#endif
21696 +.endm
21697 +
21698 +NMI_MASK = 0x80000000
21699 +
21700 +/*
21701 + * C code is not supposed to know about undefined top of stack. Every time
21702 + * a C function with a pt_regs argument is called from the SYSCALL based
21703 + * fast path FIXUP_TOP_OF_STACK is needed.
21704 + * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
21705 + * manipulation.
21706 + */
21707 +
21708 + /* %rsp:at FRAMEEND */
21709 + .macro FIXUP_TOP_OF_STACK tmp
21710 + movq $__USER_CS,CS(%rsp)
21711 + movq $-1,RCX(%rsp)
21712 + .endm
21713 +
21714 + .macro RESTORE_TOP_OF_STACK tmp,offset=0
21715 + .endm
21716 +
21717 + .macro FAKE_STACK_FRAME child_rip
21718 + /* push in order ss, rsp, eflags, cs, rip */
21719 + xorl %eax, %eax
21720 + pushq %rax /* ss */
21721 + CFI_ADJUST_CFA_OFFSET 8
21722 + /*CFI_REL_OFFSET ss,0*/
21723 + pushq %rax /* rsp */
21724 + CFI_ADJUST_CFA_OFFSET 8
21725 + CFI_REL_OFFSET rsp,0
21726 + pushq $(1<<9) /* eflags - interrupts on */
21727 + CFI_ADJUST_CFA_OFFSET 8
21728 + /*CFI_REL_OFFSET rflags,0*/
21729 + pushq $__KERNEL_CS /* cs */
21730 + CFI_ADJUST_CFA_OFFSET 8
21731 + /*CFI_REL_OFFSET cs,0*/
21732 + pushq \child_rip /* rip */
21733 + CFI_ADJUST_CFA_OFFSET 8
21734 + CFI_REL_OFFSET rip,0
21735 + pushq %rax /* orig rax */
21736 + CFI_ADJUST_CFA_OFFSET 8
21737 + .endm
21738 +
21739 + .macro UNFAKE_STACK_FRAME
21740 + addq $8*6, %rsp
21741 + CFI_ADJUST_CFA_OFFSET -(6*8)
21742 + .endm
21743 +
21744 + .macro CFI_DEFAULT_STACK start=1,adj=0
21745 + .if \start
21746 + CFI_STARTPROC simple
21747 + CFI_DEF_CFA rsp,SS+8 - \adj*ARGOFFSET
21748 + .else
21749 + CFI_DEF_CFA_OFFSET SS+8 - \adj*ARGOFFSET
21750 + .endif
21751 + .if \adj == 0
21752 + CFI_REL_OFFSET r15,R15
21753 + CFI_REL_OFFSET r14,R14
21754 + CFI_REL_OFFSET r13,R13
21755 + CFI_REL_OFFSET r12,R12
21756 + CFI_REL_OFFSET rbp,RBP
21757 + CFI_REL_OFFSET rbx,RBX
21758 + .endif
21759 + CFI_REL_OFFSET r11,R11 - \adj*ARGOFFSET
21760 + CFI_REL_OFFSET r10,R10 - \adj*ARGOFFSET
21761 + CFI_REL_OFFSET r9,R9 - \adj*ARGOFFSET
21762 + CFI_REL_OFFSET r8,R8 - \adj*ARGOFFSET
21763 + CFI_REL_OFFSET rax,RAX - \adj*ARGOFFSET
21764 + CFI_REL_OFFSET rcx,RCX - \adj*ARGOFFSET
21765 + CFI_REL_OFFSET rdx,RDX - \adj*ARGOFFSET
21766 + CFI_REL_OFFSET rsi,RSI - \adj*ARGOFFSET
21767 + CFI_REL_OFFSET rdi,RDI - \adj*ARGOFFSET
21768 + CFI_REL_OFFSET rip,RIP - \adj*ARGOFFSET
21769 + /*CFI_REL_OFFSET cs,CS - \adj*ARGOFFSET*/
21770 + /*CFI_REL_OFFSET rflags,EFLAGS - \adj*ARGOFFSET*/
21771 + CFI_REL_OFFSET rsp,RSP - \adj*ARGOFFSET
21772 + /*CFI_REL_OFFSET ss,SS - \adj*ARGOFFSET*/
21773 + .endm
21774 +
21775 + /*
21776 + * Must be consistent with the definition in arch-x86/xen-x86_64.h:
21777 + * struct iret_context {
21778 + * u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
21779 + * };
21780 + * with rax, r11, and rcx being taken care of in the hypercall stub.
21781 + */
21782 + .macro HYPERVISOR_IRET flag
21783 + testb $3,1*8(%rsp)
21784 + jnz 2f
21785 + testl $NMI_MASK,2*8(%rsp)
21786 + jnz 2f
21787 +
21788 + cmpb $0,(xen_features+XENFEAT_supervisor_mode_kernel)(%rip)
21789 + jne 1f
21790 +
21791 + /* Direct iret to kernel space. Correct CS and SS. */
21792 + orl $3,1*8(%rsp)
21793 + orl $3,4*8(%rsp)
21794 +1: iretq
21795 +
21796 +2: /* Slow iret via hypervisor. */
21797 + andl $~NMI_MASK, 2*8(%rsp)
21798 + pushq $\flag
21799 + jmp hypercall_page + (__HYPERVISOR_iret * 32)
21800 + .endm
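Restating the layout quoted in the comment as a C struct makes the indexing above easier to follow: at the testb/orl instructions %rsp points at rip (so cs sits at 1*8, rflags at 2*8 and ss at 4*8), the flag value pushed before the hypercall supplies the 'flags' slot just below rip, and rax/r11/rcx are handled in the hypercall stub, as the comment says. A small sketch (the struct is copied from the comment; the offsetof arithmetic is only for illustration):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Quoted from the comment above (arch-x86/xen-x86_64.h). */
struct iret_context {
	uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
};

int main(void)
{
	size_t rip = offsetof(struct iret_context, rip);

	/* Matches the 1*8, 2*8 and 4*8 displacements used by HYPERVISOR_IRET. */
	printf("cs     at rip+%zu\n", offsetof(struct iret_context, cs) - rip);
	printf("rflags at rip+%zu\n", offsetof(struct iret_context, rflags) - rip);
	printf("ss     at rip+%zu\n", offsetof(struct iret_context, ss) - rip);
	return 0;
}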
21801 +
21802 +/*
21803 + * A newly forked process directly context switches into this.
21804 + */
21805 +/* rdi: prev */
21806 +ENTRY(ret_from_fork)
21807 + CFI_DEFAULT_STACK
21808 + push kernel_eflags(%rip)
21809 + CFI_ADJUST_CFA_OFFSET 4
21810 + popf # reset kernel eflags
21811 + CFI_ADJUST_CFA_OFFSET -4
21812 + call schedule_tail
21813 + GET_THREAD_INFO(%rcx)
21814 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
21815 + jnz rff_trace
21816 +rff_action:
21817 + RESTORE_REST
21818 + testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
21819 + je int_ret_from_sys_call
21820 + testl $_TIF_IA32,threadinfo_flags(%rcx)
21821 + jnz int_ret_from_sys_call
21822 + RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
21823 + jmp ret_from_sys_call
21824 +rff_trace:
21825 + movq %rsp,%rdi
21826 + call syscall_trace_leave
21827 + GET_THREAD_INFO(%rcx)
21828 + jmp rff_action
21829 + CFI_ENDPROC
21830 +END(ret_from_fork)
21831 +
21832 +/*
21833 + * initial frame state for interrupts and exceptions
21834 + */
21835 + .macro _frame ref
21836 + CFI_STARTPROC simple
21837 + CFI_DEF_CFA rsp,SS+8-\ref
21838 + /*CFI_REL_OFFSET ss,SS-\ref*/
21839 + CFI_REL_OFFSET rsp,RSP-\ref
21840 + /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
21841 + /*CFI_REL_OFFSET cs,CS-\ref*/
21842 + CFI_REL_OFFSET rip,RIP-\ref
21843 + .endm
21844 +
21845 +/*
21846 + * System call entry. Up to 6 arguments in registers are supported.
21847 + *
21848 + * SYSCALL does not save anything on the stack and does not change the
21849 + * stack pointer.
21850 + */
21851 +
21852 +/*
21853 + * Register setup:
21854 + * rax system call number
21855 + * rdi arg0
21856 + * rcx return address for syscall/sysret, C arg3
21857 + * rsi arg1
21858 + * rdx arg2
21859 + * r10 arg3 (--> moved to rcx for C)
21860 + * r8 arg4
21861 + * r9 arg5
21862 + * r11 eflags for syscall/sysret, temporary for C
21863 + * r12-r15,rbp,rbx saved by C code, not touched.
21864 + *
21865 + * Interrupts are enabled on entry.
21866 + * Only called from user space.
21867 + *
21868 + * XXX if we had a free scratch register we could save the RSP into the stack frame
21869 + * and report it properly in ps. Unfortunately we don't have one.
21870 + *
21871 + * When the user can change the frame, always force IRET. That is because
21872 + * IRET deals with non-canonical addresses better. SYSRET has trouble
21873 + * with them due to bugs in both AMD and Intel CPUs.
21874 + */
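As a user-space illustration of the register convention tabulated above: this is the generic x86-64 SYSCALL ABI rather than anything Xen-specific, write(2) is used only as an example, and a fourth argument would go in %r10, which is why system_call below moves %r10 into %rcx before calling the C handler.

#include <unistd.h>

/* Raw 3-argument syscall: rax = number, rdi/rsi/rdx = args; the SYSCALL
 * instruction clobbers rcx (return address) and r11 (saved rflags). */
static long raw_syscall3(long nr, long a0, long a1, long a2)
{
	long ret;

	__asm__ volatile ("syscall"
			  : "=a" (ret)
			  : "0" (nr), "D" (a0), "S" (a1), "d" (a2)
			  : "rcx", "r11", "memory");
	return ret;
}

int main(void)
{
	static const char msg[] = "hello via raw syscall\n";

	/* 1 is __NR_write on x86-64. */
	return raw_syscall3(1, STDOUT_FILENO, (long)msg, sizeof(msg) - 1) < 0;
}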
21875 +
21876 +ENTRY(system_call)
21877 + _frame (RIP-0x10)
21878 + SAVE_ARGS -8,0
21879 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
21880 + GET_THREAD_INFO(%rcx)
21881 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
21882 + CFI_REMEMBER_STATE
21883 + jnz tracesys
21884 + cmpq $__NR_syscall_max,%rax
21885 + ja badsys
21886 + movq %r10,%rcx
21887 + call *sys_call_table(,%rax,8) # XXX: rip relative
21888 + movq %rax,RAX-ARGOFFSET(%rsp)
21889 +/*
21890 + * Syscall return path ending with SYSRET (fast path)
21891 + * Has incomplete stack frame and undefined top of stack.
21892 + */
21893 + .globl ret_from_sys_call
21894 +ret_from_sys_call:
21895 + movl $_TIF_ALLWORK_MASK,%edi
21896 + /* edi: flagmask */
21897 +sysret_check:
21898 + GET_THREAD_INFO(%rcx)
21899 + XEN_BLOCK_EVENTS(%rsi)
21900 + TRACE_IRQS_OFF
21901 + movl threadinfo_flags(%rcx),%edx
21902 + andl %edi,%edx
21903 + CFI_REMEMBER_STATE
21904 + jnz sysret_careful
21905 + /*
21906 + * sysretq will re-enable interrupts:
21907 + */
21908 + TRACE_IRQS_ON
21909 + XEN_UNBLOCK_EVENTS(%rsi)
21910 + RESTORE_ARGS 0,8,0
21911 + HYPERVISOR_IRET VGCF_IN_SYSCALL
21912 +
21913 + /* Handle reschedules */
21914 + /* edx: work, edi: workmask */
21915 +sysret_careful:
21916 + CFI_RESTORE_STATE
21917 + bt $TIF_NEED_RESCHED,%edx
21918 + jnc sysret_signal
21919 + TRACE_IRQS_ON
21920 + XEN_UNBLOCK_EVENTS(%rsi)
21921 + pushq %rdi
21922 + CFI_ADJUST_CFA_OFFSET 8
21923 + call schedule
21924 + popq %rdi
21925 + CFI_ADJUST_CFA_OFFSET -8
21926 + jmp sysret_check
21927 +
21928 + /* Handle a signal */
21929 +sysret_signal:
21930 + TRACE_IRQS_ON
21931 +/* sti */
21932 + XEN_UNBLOCK_EVENTS(%rsi)
21933 + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
21934 + jz 1f
21935 +
21936 + /* Really a signal */
21937 + /* edx: work flags (arg3) */
21938 + leaq do_notify_resume(%rip),%rax
21939 + leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
21940 + xorl %esi,%esi # oldset -> arg2
21941 + call ptregscall_common
21942 +1: movl $_TIF_NEED_RESCHED,%edi
21943 + /* Use IRET because user could have changed frame. This
21944 + works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
21945 + XEN_BLOCK_EVENTS(%rsi)
21946 + TRACE_IRQS_OFF
21947 + jmp int_with_check
21948 +
21949 +badsys:
21950 + movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
21951 + jmp ret_from_sys_call
21952 +
21953 + /* Do syscall tracing */
21954 +tracesys:
21955 + CFI_RESTORE_STATE
21956 + SAVE_REST
21957 + movq $-ENOSYS,RAX(%rsp)
21958 + FIXUP_TOP_OF_STACK %rdi
21959 + movq %rsp,%rdi
21960 + call syscall_trace_enter
21961 + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
21962 + RESTORE_REST
21963 + cmpq $__NR_syscall_max,%rax
21964 + ja 1f
21965 + movq %r10,%rcx /* fixup for C */
21966 + call *sys_call_table(,%rax,8)
21967 +1: movq %rax,RAX-ARGOFFSET(%rsp)
21968 + /* Use IRET because user could have changed frame */
21969 + jmp int_ret_from_sys_call
21970 + CFI_ENDPROC
21971 +END(system_call)
21972 +
21973 +/*
21974 + * Syscall return path ending with IRET.
21975 + * Has correct top of stack, but partial stack frame.
21976 + */
21977 +ENTRY(int_ret_from_sys_call)
21978 + CFI_STARTPROC simple
21979 + CFI_DEF_CFA rsp,SS+8-ARGOFFSET
21980 + /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/
21981 + CFI_REL_OFFSET rsp,RSP-ARGOFFSET
21982 + /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
21983 + /*CFI_REL_OFFSET cs,CS-ARGOFFSET*/
21984 + CFI_REL_OFFSET rip,RIP-ARGOFFSET
21985 + CFI_REL_OFFSET rdx,RDX-ARGOFFSET
21986 + CFI_REL_OFFSET rcx,RCX-ARGOFFSET
21987 + CFI_REL_OFFSET rax,RAX-ARGOFFSET
21988 + CFI_REL_OFFSET rdi,RDI-ARGOFFSET
21989 + CFI_REL_OFFSET rsi,RSI-ARGOFFSET
21990 + CFI_REL_OFFSET r8,R8-ARGOFFSET
21991 + CFI_REL_OFFSET r9,R9-ARGOFFSET
21992 + CFI_REL_OFFSET r10,R10-ARGOFFSET
21993 + CFI_REL_OFFSET r11,R11-ARGOFFSET
21994 + XEN_BLOCK_EVENTS(%rsi)
21995 + TRACE_IRQS_OFF
21996 + testb $3,CS-ARGOFFSET(%rsp)
21997 + jnz 1f
21998 + /* Need to set the proper %ss (not NULL) for ring 3 iretq */
21999 + movl $__KERNEL_DS,SS-ARGOFFSET(%rsp)
22000 +	jmp retint_restore_args		# return from ring-3 kernel
22001 +1:
22002 + movl $_TIF_ALLWORK_MASK,%edi
22003 + /* edi: mask to check */
22004 +int_with_check:
22005 + GET_THREAD_INFO(%rcx)
22006 + movl threadinfo_flags(%rcx),%edx
22007 + andl %edi,%edx
22008 + jnz int_careful
22009 + andl $~TS_COMPAT,threadinfo_status(%rcx)
22010 + jmp retint_restore_args
22011 +
22012 + /* Either reschedule or signal or syscall exit tracking needed. */
22013 + /* First do a reschedule test. */
22014 + /* edx: work, edi: workmask */
22015 +int_careful:
22016 + bt $TIF_NEED_RESCHED,%edx
22017 + jnc int_very_careful
22018 + TRACE_IRQS_ON
22019 +/* sti */
22020 + XEN_UNBLOCK_EVENTS(%rsi)
22021 + pushq %rdi
22022 + CFI_ADJUST_CFA_OFFSET 8
22023 + call schedule
22024 + popq %rdi
22025 + CFI_ADJUST_CFA_OFFSET -8
22026 + XEN_BLOCK_EVENTS(%rsi)
22027 + TRACE_IRQS_OFF
22028 + jmp int_with_check
22029 +
22030 + /* handle signals and tracing -- both require a full stack frame */
22031 +int_very_careful:
22032 + TRACE_IRQS_ON
22033 +/* sti */
22034 + XEN_UNBLOCK_EVENTS(%rsi)
22035 + SAVE_REST
22036 + /* Check for syscall exit trace */
22037 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
22038 + jz int_signal
22039 + pushq %rdi
22040 + CFI_ADJUST_CFA_OFFSET 8
22041 + leaq 8(%rsp),%rdi # &ptregs -> arg1
22042 + call syscall_trace_leave
22043 + popq %rdi
22044 + CFI_ADJUST_CFA_OFFSET -8
22045 + andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
22046 + XEN_BLOCK_EVENTS(%rsi)
22047 + TRACE_IRQS_OFF
22048 + jmp int_restore_rest
22049 +
22050 +int_signal:
22051 + testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
22052 + jz 1f
22053 + movq %rsp,%rdi # &ptregs -> arg1
22054 + xorl %esi,%esi # oldset -> arg2
22055 + call do_notify_resume
22056 +1: movl $_TIF_NEED_RESCHED,%edi
22057 +int_restore_rest:
22058 + RESTORE_REST
22059 + XEN_BLOCK_EVENTS(%rsi)
22060 + TRACE_IRQS_OFF
22061 + jmp int_with_check
22062 + CFI_ENDPROC
22063 +END(int_ret_from_sys_call)
22064 +
22065 +/*
22066 + * Certain special system calls need to save a complete stack frame.
22067 + */
22068 +
22069 + .macro PTREGSCALL label,func,arg
22070 + .globl \label
22071 +\label:
22072 + leaq \func(%rip),%rax
22073 + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
22074 + jmp ptregscall_common
22075 +END(\label)
22076 + .endm
22077 +
22078 + CFI_STARTPROC
22079 +
22080 + PTREGSCALL stub_clone, sys_clone, %r8
22081 + PTREGSCALL stub_fork, sys_fork, %rdi
22082 + PTREGSCALL stub_vfork, sys_vfork, %rdi
22083 + PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
22084 + PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
22085 + PTREGSCALL stub_iopl, sys_iopl, %rsi
22086 +
22087 +ENTRY(ptregscall_common)
22088 + popq %r11
22089 + CFI_ADJUST_CFA_OFFSET -8
22090 + CFI_REGISTER rip, r11
22091 + SAVE_REST
22092 + movq %r11, %r15
22093 + CFI_REGISTER rip, r15
22094 + FIXUP_TOP_OF_STACK %r11
22095 + call *%rax
22096 + RESTORE_TOP_OF_STACK %r11
22097 + movq %r15, %r11
22098 + CFI_REGISTER rip, r11
22099 + RESTORE_REST
22100 + pushq %r11
22101 + CFI_ADJUST_CFA_OFFSET 8
22102 + CFI_REL_OFFSET rip, 0
22103 + ret
22104 + CFI_ENDPROC
22105 +END(ptregscall_common)
22106 +
22107 +ENTRY(stub_execve)
22108 + CFI_STARTPROC
22109 + popq %r11
22110 + CFI_ADJUST_CFA_OFFSET -8
22111 + CFI_REGISTER rip, r11
22112 + SAVE_REST
22113 + FIXUP_TOP_OF_STACK %r11
22114 + call sys_execve
22115 + RESTORE_TOP_OF_STACK %r11
22116 + movq %rax,RAX(%rsp)
22117 + RESTORE_REST
22118 + jmp int_ret_from_sys_call
22119 + CFI_ENDPROC
22120 +END(stub_execve)
22121 +
22122 +/*
22123 + * sigreturn is special because it needs to restore all registers on return.
22124 + * This cannot be done with SYSRET, so use the IRET return path instead.
22125 + */
22126 +ENTRY(stub_rt_sigreturn)
22127 + CFI_STARTPROC
22128 + addq $8, %rsp
22129 + CFI_ADJUST_CFA_OFFSET -8
22130 + SAVE_REST
22131 + movq %rsp,%rdi
22132 + FIXUP_TOP_OF_STACK %r11
22133 + call sys_rt_sigreturn
22134 + movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
22135 + RESTORE_REST
22136 + jmp int_ret_from_sys_call
22137 + CFI_ENDPROC
22138 +END(stub_rt_sigreturn)
22139 +
22140 +/* initial frame state for interrupts (and exceptions without error code) */
22141 +#define INTR_FRAME _frame (RIP-0x10); \
22142 + CFI_REL_OFFSET rcx,0; \
22143 + CFI_REL_OFFSET r11,8
22144 +
22145 +/* initial frame state for exceptions with error code (and interrupts with
22146 + vector already pushed) */
22147 +#define XCPT_FRAME _frame (RIP-0x18); \
22148 + CFI_REL_OFFSET rcx,0; \
22149 + CFI_REL_OFFSET r11,8
22150 +
22151 +/*
22152 + * Interrupt exit.
22153 + *
22154 + */
22155 +
22156 +retint_check:
22157 + CFI_DEFAULT_STACK adj=1
22158 + movl threadinfo_flags(%rcx),%edx
22159 + andl %edi,%edx
22160 + CFI_REMEMBER_STATE
22161 + jnz retint_careful
22162 +retint_restore_args:
22163 + movl EFLAGS-REST_SKIP(%rsp), %eax
22164 + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
22165 + XEN_GET_VCPU_INFO(%rsi)
22166 + andb evtchn_upcall_mask(%rsi),%al
22167 + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
22168 + jnz restore_all_enable_events # != 0 => enable event delivery
22169 + XEN_PUT_VCPU_INFO(%rsi)
22170 +
22171 + RESTORE_ARGS 0,8,0
22172 + HYPERVISOR_IRET 0
22173 +
22174 + /* edi: workmask, edx: work */
22175 +retint_careful:
22176 + CFI_RESTORE_STATE
22177 + bt $TIF_NEED_RESCHED,%edx
22178 + jnc retint_signal
22179 + TRACE_IRQS_ON
22180 + XEN_UNBLOCK_EVENTS(%rsi)
22181 +/* sti */
22182 + pushq %rdi
22183 + CFI_ADJUST_CFA_OFFSET 8
22184 + call schedule
22185 + popq %rdi
22186 + CFI_ADJUST_CFA_OFFSET -8
22187 + GET_THREAD_INFO(%rcx)
22188 + XEN_BLOCK_EVENTS(%rsi)
22189 +/* cli */
22190 + TRACE_IRQS_OFF
22191 + jmp retint_check
22192 +
22193 +retint_signal:
22194 + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
22195 + jz retint_restore_args
22196 + TRACE_IRQS_ON
22197 + XEN_UNBLOCK_EVENTS(%rsi)
22198 + SAVE_REST
22199 + movq $-1,ORIG_RAX(%rsp)
22200 + xorl %esi,%esi # oldset
22201 + movq %rsp,%rdi # &pt_regs
22202 + call do_notify_resume
22203 + RESTORE_REST
22204 + XEN_BLOCK_EVENTS(%rsi)
22205 + TRACE_IRQS_OFF
22206 + movl $_TIF_NEED_RESCHED,%edi
22207 + GET_THREAD_INFO(%rcx)
22208 + jmp retint_check
22209 +
22210 +#ifdef CONFIG_PREEMPT
22211 + /* Returning to kernel space. Check if we need preemption */
22212 + /* rcx: threadinfo. interrupts off. */
22213 + .p2align
22214 +retint_kernel:
22215 + cmpl $0,threadinfo_preempt_count(%rcx)
22216 + jnz retint_restore_args
22217 + bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
22218 + jnc retint_restore_args
22219 + bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
22220 + jnc retint_restore_args
22221 + call preempt_schedule_irq
22222 + jmp retint_kernel /* check again */
22223 +#endif
22224 +
22225 + CFI_ENDPROC
22226 +END(retint_check)
22227 +
22228 +#ifndef CONFIG_XEN
22229 +/*
22230 + * APIC interrupts.
22231 + */
22232 + .macro apicinterrupt num,func
22233 + INTR_FRAME
22234 + pushq $~(\num)
22235 + CFI_ADJUST_CFA_OFFSET 8
22236 + interrupt \func
22237 + jmp error_entry
22238 + CFI_ENDPROC
22239 + .endm
22240 +
22241 +ENTRY(thermal_interrupt)
22242 + apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
22243 +END(thermal_interrupt)
22244 +
22245 +ENTRY(threshold_interrupt)
22246 + apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
22247 +END(threshold_interrupt)
22248 +
22249 +#ifdef CONFIG_SMP
22250 +ENTRY(reschedule_interrupt)
22251 + apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
22252 +END(reschedule_interrupt)
22253 +
22254 + .macro INVALIDATE_ENTRY num
22255 +ENTRY(invalidate_interrupt\num)
22256 + apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
22257 +END(invalidate_interrupt\num)
22258 + .endm
22259 +
22260 + INVALIDATE_ENTRY 0
22261 + INVALIDATE_ENTRY 1
22262 + INVALIDATE_ENTRY 2
22263 + INVALIDATE_ENTRY 3
22264 + INVALIDATE_ENTRY 4
22265 + INVALIDATE_ENTRY 5
22266 + INVALIDATE_ENTRY 6
22267 + INVALIDATE_ENTRY 7
22268 +
22269 +ENTRY(call_function_interrupt)
22270 + apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
22271 +END(call_function_interrupt)
22272 +#endif
22273 +
22274 +#ifdef CONFIG_X86_LOCAL_APIC
22275 +ENTRY(apic_timer_interrupt)
22276 + apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
22277 +END(apic_timer_interrupt)
22278 +
22279 +ENTRY(error_interrupt)
22280 + apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
22281 +END(error_interrupt)
22282 +
22283 +ENTRY(spurious_interrupt)
22284 + apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
22285 +END(spurious_interrupt)
22286 +#endif
22287 +#endif /* !CONFIG_XEN */
22288 +
22289 +/*
22290 + * Exception entry points.
22291 + */
22292 + .macro zeroentry sym
22293 + INTR_FRAME
22294 + movq (%rsp),%rcx
22295 + CFI_RESTORE rcx
22296 + movq 8(%rsp),%r11
22297 + CFI_RESTORE r11
22298 + addq $0x10,%rsp /* skip rcx and r11 */
22299 + CFI_ADJUST_CFA_OFFSET -0x10
22300 + pushq $0 /* push error code/oldrax */
22301 + CFI_ADJUST_CFA_OFFSET 8
22302 + pushq %rax /* push real oldrax to the rdi slot */
22303 + CFI_ADJUST_CFA_OFFSET 8
22304 + CFI_REL_OFFSET rax,0
22305 + leaq \sym(%rip),%rax
22306 + jmp error_entry
22307 + CFI_ENDPROC
22308 + .endm
22309 +
22310 + .macro errorentry sym
22311 + XCPT_FRAME
22312 + movq (%rsp),%rcx
22313 + CFI_RESTORE rcx
22314 + movq 8(%rsp),%r11
22315 + CFI_RESTORE r11
22316 + addq $0x10,%rsp /* rsp points to the error code */
22317 + CFI_ADJUST_CFA_OFFSET -0x10
22318 + pushq %rax
22319 + CFI_ADJUST_CFA_OFFSET 8
22320 + CFI_REL_OFFSET rax,0
22321 + leaq \sym(%rip),%rax
22322 + jmp error_entry
22323 + CFI_ENDPROC
22324 + .endm
22325 +
22326 +#if 0 /* not XEN */
22327 + /* error code is on the stack already */
22328 + /* handle NMI like exceptions that can happen everywhere */
22329 + .macro paranoidentry sym, ist=0, irqtrace=1
22330 + movq (%rsp),%rcx
22331 + movq 8(%rsp),%r11
22332 + addq $0x10,%rsp /* skip rcx and r11 */
22333 + SAVE_ALL
22334 + cld
22335 +#if 0 /* not XEN */
22336 + movl $1,%ebx
22337 + movl $MSR_GS_BASE,%ecx
22338 + rdmsr
22339 + testl %edx,%edx
22340 + js 1f
22341 + swapgs
22342 + xorl %ebx,%ebx
22343 +1:
22344 +#endif
22345 + .if \ist
22346 + movq %gs:pda_data_offset, %rbp
22347 + .endif
22348 + movq %rsp,%rdi
22349 + movq ORIG_RAX(%rsp),%rsi
22350 + movq $-1,ORIG_RAX(%rsp)
22351 + .if \ist
22352 + subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
22353 + .endif
22354 + call \sym
22355 + .if \ist
22356 + addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
22357 + .endif
22358 +/* cli */
22359 + XEN_BLOCK_EVENTS(%rsi)
22360 + .if \irqtrace
22361 + TRACE_IRQS_OFF
22362 + .endif
22363 + .endm
22364 +
22365 + /*
22366 + * "Paranoid" exit path from exception stack.
22367 + * Paranoid because this is used by NMIs and cannot take
22368 + * any kernel state for granted.
22369 + * We don't do kernel preemption checks here, because only
22370 + * NMI should be common and it does not enable IRQs and
22371 + * cannot get reschedule ticks.
22372 + *
22373 + * "trace" is 0 for the NMI handler only, because irq-tracing
22374 + * is fundamentally NMI-unsafe. (we cannot change the soft and
22375 + * hard flags at once, atomically)
22376 + */
22377 + .macro paranoidexit trace=1
22378 + /* ebx: no swapgs flag */
22379 +paranoid_exit\trace:
22380 + testl %ebx,%ebx /* swapgs needed? */
22381 + jnz paranoid_restore\trace
22382 + testl $3,CS(%rsp)
22383 + jnz paranoid_userspace\trace
22384 +paranoid_swapgs\trace:
22385 + TRACE_IRQS_IRETQ 0
22386 + swapgs
22387 +paranoid_restore\trace:
22388 + RESTORE_ALL 8
22389 + iretq
22390 +paranoid_userspace\trace:
22391 + GET_THREAD_INFO(%rcx)
22392 + movl threadinfo_flags(%rcx),%ebx
22393 + andl $_TIF_WORK_MASK,%ebx
22394 + jz paranoid_swapgs\trace
22395 + movq %rsp,%rdi /* &pt_regs */
22396 + call sync_regs
22397 + movq %rax,%rsp /* switch stack for scheduling */
22398 + testl $_TIF_NEED_RESCHED,%ebx
22399 + jnz paranoid_schedule\trace
22400 + movl %ebx,%edx /* arg3: thread flags */
22401 + .if \trace
22402 + TRACE_IRQS_ON
22403 + .endif
22404 + sti
22405 + xorl %esi,%esi /* arg2: oldset */
22406 + movq %rsp,%rdi /* arg1: &pt_regs */
22407 + call do_notify_resume
22408 + cli
22409 + .if \trace
22410 + TRACE_IRQS_OFF
22411 + .endif
22412 + jmp paranoid_userspace\trace
22413 +paranoid_schedule\trace:
22414 + .if \trace
22415 + TRACE_IRQS_ON
22416 + .endif
22417 + sti
22418 + call schedule
22419 + cli
22420 + .if \trace
22421 + TRACE_IRQS_OFF
22422 + .endif
22423 + jmp paranoid_userspace\trace
22424 + CFI_ENDPROC
22425 + .endm
22426 +#endif
22427 +
22428 +/*
22429 + * Exception entry point. This expects an error code/orig_rax on the stack
22430 + * and the exception handler in %rax.
22431 + */
22432 +ENTRY(error_entry)
22433 + _frame RDI
22434 + CFI_REL_OFFSET rax,0
22435 + /* rdi slot contains rax, oldrax contains error code */
22436 + cld
22437 + subq $14*8,%rsp
22438 + CFI_ADJUST_CFA_OFFSET (14*8)
22439 + movq %rsi,13*8(%rsp)
22440 + CFI_REL_OFFSET rsi,RSI
22441 + movq 14*8(%rsp),%rsi /* load rax from rdi slot */
22442 + CFI_REGISTER rax,rsi
22443 + movq %rdx,12*8(%rsp)
22444 + CFI_REL_OFFSET rdx,RDX
22445 + movq %rcx,11*8(%rsp)
22446 + CFI_REL_OFFSET rcx,RCX
22447 + movq %rsi,10*8(%rsp) /* store rax */
22448 + CFI_REL_OFFSET rax,RAX
22449 + movq %r8, 9*8(%rsp)
22450 + CFI_REL_OFFSET r8,R8
22451 + movq %r9, 8*8(%rsp)
22452 + CFI_REL_OFFSET r9,R9
22453 + movq %r10,7*8(%rsp)
22454 + CFI_REL_OFFSET r10,R10
22455 + movq %r11,6*8(%rsp)
22456 + CFI_REL_OFFSET r11,R11
22457 + movq %rbx,5*8(%rsp)
22458 + CFI_REL_OFFSET rbx,RBX
22459 + movq %rbp,4*8(%rsp)
22460 + CFI_REL_OFFSET rbp,RBP
22461 + movq %r12,3*8(%rsp)
22462 + CFI_REL_OFFSET r12,R12
22463 + movq %r13,2*8(%rsp)
22464 + CFI_REL_OFFSET r13,R13
22465 + movq %r14,1*8(%rsp)
22466 + CFI_REL_OFFSET r14,R14
22467 + movq %r15,(%rsp)
22468 + CFI_REL_OFFSET r15,R15
22469 +#if 0
22470 + cmpl $__KERNEL_CS,CS(%rsp)
22471 + CFI_REMEMBER_STATE
22472 + je error_kernelspace
22473 +#endif
22474 +error_call_handler:
22475 + movq %rdi, RDI(%rsp)
22476 + CFI_REL_OFFSET rdi,RDI
22477 + movq %rsp,%rdi
22478 + movq ORIG_RAX(%rsp),%rsi # get error code
22479 + movq $-1,ORIG_RAX(%rsp)
22480 + call *%rax
22481 +error_exit:
22482 + RESTORE_REST
22483 +/* cli */
22484 + XEN_BLOCK_EVENTS(%rsi)
22485 + TRACE_IRQS_OFF
22486 + GET_THREAD_INFO(%rcx)
22487 + testb $3,CS-ARGOFFSET(%rsp)
22488 + jz retint_kernel
22489 + movl threadinfo_flags(%rcx),%edx
22490 + movl $_TIF_WORK_MASK,%edi
22491 + andl %edi,%edx
22492 + jnz retint_careful
22493 + /*
22494 + * The iret might restore flags:
22495 + */
22496 + TRACE_IRQS_IRETQ
22497 + jmp retint_restore_args
22498 +
22499 +#if 0
22500 + /*
22501 + * We need to re-write the logic here because we don't do iretq
22502 + * to return to user mode. It's still possible that we get a trap/fault
22503 + * in the kernel (when accessing buffers pointed to by system calls,
22504 + * for example).
22505 + *
22506 + */
22507 + CFI_RESTORE_STATE
22508 +error_kernelspace:
22509 + incl %ebx
22510 + /* There are two places in the kernel that can potentially fault with
22511 + usergs. Handle them here. The exception handlers after
22512 + iret run with kernel gs again, so don't set the user space flag.
22513 +	   B-stepping K8s sometimes report a truncated RIP for IRET
22514 + exceptions returning to compat mode. Check for these here too. */
22515 + leaq iret_label(%rip),%rbp
22516 + cmpq %rbp,RIP(%rsp)
22517 + je error_swapgs
22518 + movl %ebp,%ebp /* zero extend */
22519 + cmpq %rbp,RIP(%rsp)
22520 + je error_swapgs
22521 + cmpq $gs_change,RIP(%rsp)
22522 + je error_swapgs
22523 + jmp error_sti
22524 +#endif
22525 + CFI_ENDPROC
22526 +END(error_entry)
22527 +
22528 +ENTRY(hypervisor_callback)
22529 + zeroentry do_hypervisor_callback
22530 +END(hypervisor_callback)
22531 +
22532 +/*
22533 + * Copied from arch/xen/i386/kernel/entry.S
22534 + */
22535 +# A note on the "critical region" in our callback handler.
22536 +# We want to avoid stacking callback handlers due to events occurring
22537 +# during handling of the last event. To do this, we keep events disabled
22538 +# until we've done all processing. HOWEVER, we must enable events before
22539 +# popping the stack frame (can't be done atomically) and so it would still
22540 +# be possible to get enough handler activations to overflow the stack.
22541 +# Although unlikely, bugs of that kind are hard to track down, so we'd
22542 +# like to avoid the possibility.
22543 +# So, on entry to the handler we detect whether we interrupted an
22544 +# existing activation in its critical region -- if so, we pop the current
22545 +# activation and restart the handler using the previous one.
22546 +ENTRY(do_hypervisor_callback)   # do_hypervisor_callback(struct pt_regs *)
22547 + CFI_STARTPROC
22548 +# Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
22549 +# see the correct pointer to the pt_regs.
22550 + movq %rdi, %rsp # we don't return, adjust the stack frame
22551 + CFI_ENDPROC
22552 + CFI_DEFAULT_STACK
22553 +11: incl %gs:pda_irqcount
22554 + movq %rsp,%rbp
22555 + CFI_DEF_CFA_REGISTER rbp
22556 + cmovzq %gs:pda_irqstackptr,%rsp
22557 + pushq %rbp # backlink for old unwinder
22558 + call evtchn_do_upcall
22559 + popq %rsp
22560 + CFI_DEF_CFA_REGISTER rsp
22561 + decl %gs:pda_irqcount
22562 + jmp error_exit
22563 + CFI_ENDPROC
22564 +END(do_hypervisor_callback)
22565 +
22566 +#ifdef CONFIG_X86_LOCAL_APIC
22567 +KPROBE_ENTRY(nmi)
22568 + zeroentry do_nmi_callback
22569 +ENTRY(do_nmi_callback)
22570 + CFI_STARTPROC
22571 + addq $8, %rsp
22572 + CFI_ENDPROC
22573 + CFI_DEFAULT_STACK
22574 + call do_nmi
22575 + orl $NMI_MASK,EFLAGS(%rsp)
22576 + RESTORE_REST
22577 + XEN_BLOCK_EVENTS(%rsi)
22578 + TRACE_IRQS_OFF
22579 + GET_THREAD_INFO(%rcx)
22580 + jmp retint_restore_args
22581 + CFI_ENDPROC
22582 + .previous .text
22583 +END(nmi)
22584 +#endif
22585 +
22586 + ALIGN
22587 +restore_all_enable_events:
22588 + CFI_DEFAULT_STACK adj=1
22589 + TRACE_IRQS_ON
22590 + XEN_UNBLOCK_EVENTS(%rsi) # %rsi is already set up...
22591 +
22592 +scrit: /**** START OF CRITICAL REGION ****/
22593 + XEN_TEST_PENDING(%rsi)
22594 + CFI_REMEMBER_STATE
22595 + jnz 14f # process more events if necessary...
22596 + XEN_PUT_VCPU_INFO(%rsi)
22597 + RESTORE_ARGS 0,8,0
22598 + HYPERVISOR_IRET 0
22599 +
22600 + CFI_RESTORE_STATE
22601 +14: XEN_LOCKED_BLOCK_EVENTS(%rsi)
22602 + XEN_PUT_VCPU_INFO(%rsi)
22603 + SAVE_REST
22604 + movq %rsp,%rdi # set the argument again
22605 + jmp 11b
22606 + CFI_ENDPROC
22607 +ecrit: /**** END OF CRITICAL REGION ****/
22608 +# At this point, unlike on x86-32, we don't do the fixup: it would only
22609 +# complicate the code, and the stack frame is more complex on x86-64.
22610 +# When the kernel is interrupted in the critical section, it simply
22611 +# does an IRET, and everything is restored at that point, i.e. execution
22612 +# resumes at the interrupted instruction with the same context.
22613 +
22614 +# Hypervisor uses this for application faults while it executes.
22615 +# We get here for two reasons:
22616 +# 1. Fault while reloading DS, ES, FS or GS
22617 +# 2. Fault while executing IRET
22618 +# Category 1 we do not need to fix up as Xen has already reloaded all segment
22619 +# registers that could be reloaded and zeroed the others.
22620 +# Category 2 we fix up by killing the current process. We cannot use the
22621 +# normal Linux return path in this case because if we use the IRET hypercall
22622 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
22623 +# We distinguish between categories by comparing each saved segment register
22624 +# with its current contents: any discrepancy means we are in category 1.
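The classification described above, restated as a C sketch. The frame layout is inferred from the comparisons in the code (0x10..0x28(%rsp) checked against %ds/%es/%fs/%gs), so the struct below is hypothetical and purely illustrative:

#include <stdint.h>

/* Hypothetical view of the failsafe frame: rcx, r11, then the saved segment
 * selectors, followed by the usual iret frame (rip, cs, rflags, rsp, ss). */
struct failsafe_frame {
	uint64_t rcx, r11;
	uint64_t ds, es, fs, gs;
};

static int failsafe_category(const struct failsafe_frame *f,
			     uint16_t cur_ds, uint16_t cur_es,
			     uint16_t cur_fs, uint16_t cur_gs)
{
	if (cur_ds != (uint16_t)f->ds || cur_es != (uint16_t)f->es ||
	    cur_fs != (uint16_t)f->fs || cur_gs != (uint16_t)f->gs)
		return 1;	/* a segment reload faulted; Xen zeroed it, retry the IRET */
	return 2;		/* everything matches: the IRET itself faulted, kill the task */
}

int main(void)
{
	struct failsafe_frame f = { 0, 0, 0x2b, 0x2b, 0, 0 };

	/* Example: the live %es no longer matches the saved one -> category 1. */
	return failsafe_category(&f, 0x2b, 0x00, 0, 0) != 1;
}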
22625 +ENTRY(failsafe_callback)
22626 + _frame (RIP-0x30)
22627 + CFI_REL_OFFSET rcx, 0
22628 + CFI_REL_OFFSET r11, 8
22629 + movw %ds,%cx
22630 + cmpw %cx,0x10(%rsp)
22631 + CFI_REMEMBER_STATE
22632 + jne 1f
22633 + movw %es,%cx
22634 + cmpw %cx,0x18(%rsp)
22635 + jne 1f
22636 + movw %fs,%cx
22637 + cmpw %cx,0x20(%rsp)
22638 + jne 1f
22639 + movw %gs,%cx
22640 + cmpw %cx,0x28(%rsp)
22641 + jne 1f
22642 + /* All segments match their saved values => Category 2 (Bad IRET). */
22643 + movq (%rsp),%rcx
22644 + CFI_RESTORE rcx
22645 + movq 8(%rsp),%r11
22646 + CFI_RESTORE r11
22647 + addq $0x30,%rsp
22648 + CFI_ADJUST_CFA_OFFSET -0x30
22649 + movq $11,%rdi /* SIGSEGV */
22650 + jmp do_exit
22651 + CFI_RESTORE_STATE
22652 +1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
22653 + movq (%rsp),%rcx
22654 + CFI_RESTORE rcx
22655 + movq 8(%rsp),%r11
22656 + CFI_RESTORE r11
22657 + addq $0x30,%rsp
22658 + CFI_ADJUST_CFA_OFFSET -0x30
22659 + pushq $0
22660 + CFI_ADJUST_CFA_OFFSET 8
22661 + SAVE_ALL
22662 + jmp error_exit
22663 + CFI_ENDPROC
22664 +#if 0
22665 + .section __ex_table,"a"
22666 + .align 8
22667 + .quad gs_change,bad_gs
22668 + .previous
22669 + .section .fixup,"ax"
22670 + /* running with kernelgs */
22671 +bad_gs:
22672 +/* swapgs */ /* switch back to user gs */
22673 + xorl %eax,%eax
22674 + movl %eax,%gs
22675 + jmp 2b
22676 + .previous
22677 +#endif
22678 +
22679 +/*
22680 + * Create a kernel thread.
22681 + *
22682 + * C extern interface:
22683 + * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
22684 + *
22685 + * asm input arguments:
22686 + * rdi: fn, rsi: arg, rdx: flags
22687 + */
22688 +ENTRY(kernel_thread)
22689 + CFI_STARTPROC
22690 + FAKE_STACK_FRAME $child_rip
22691 + SAVE_ALL
22692 +
22693 + # rdi: flags, rsi: usp, rdx: will be &pt_regs
22694 + movq %rdx,%rdi
22695 + orq kernel_thread_flags(%rip),%rdi
22696 + movq $-1, %rsi
22697 + movq %rsp, %rdx
22698 +
22699 + xorl %r8d,%r8d
22700 + xorl %r9d,%r9d
22701 +
22702 + # clone now
22703 + call do_fork
22704 + movq %rax,RAX(%rsp)
22705 + xorl %edi,%edi
22706 +
22707 + /*
22708 +	 * It isn't worth checking for a reschedule here,
22709 +	 * so internally to the x86_64 port you can rely on kernel_thread()
22710 +	 * not rescheduling the child before returning; this avoids the need
22711 +	 * for hacks, for example to fork off the per-CPU idle tasks.
22712 + * [Hopefully no generic code relies on the reschedule -AK]
22713 + */
22714 + RESTORE_ALL
22715 + UNFAKE_STACK_FRAME
22716 + ret
22717 + CFI_ENDPROC
22718 +ENDPROC(kernel_thread)
22719 +
22720 +child_rip:
22721 + pushq $0 # fake return address
22722 + CFI_STARTPROC
22723 + /*
22724 + * Here we are in the child and the registers are set as they were
22725 + * at kernel_thread() invocation in the parent.
22726 + */
22727 + movq %rdi, %rax
22728 + movq %rsi, %rdi
22729 + call *%rax
22730 + # exit
22731 + xorl %edi, %edi
22732 + call do_exit
22733 + CFI_ENDPROC
22734 +ENDPROC(child_rip)
22735 +
22736 +/*
22737 + * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
22738 + *
22739 + * C extern interface:
22740 + * extern long execve(char *name, char **argv, char **envp)
22741 + *
22742 + * asm input arguments:
22743 + * rdi: name, rsi: argv, rdx: envp
22744 + *
22745 + * We want to fall back into:
22746 + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
22747 + *
22748 + * do_sys_execve asm fallback arguments:
22749 + * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
22750 + */
22751 +ENTRY(execve)
22752 + CFI_STARTPROC
22753 + FAKE_STACK_FRAME $0
22754 + SAVE_ALL
22755 + call sys_execve
22756 + movq %rax, RAX(%rsp)
22757 + RESTORE_REST
22758 + testq %rax,%rax
22759 + jne 1f
22760 + jmp int_ret_from_sys_call
22761 +1: RESTORE_ARGS
22762 + UNFAKE_STACK_FRAME
22763 + ret
22764 + CFI_ENDPROC
22765 +ENDPROC(execve)
22766 +
22767 +KPROBE_ENTRY(page_fault)
22768 + errorentry do_page_fault
22769 +END(page_fault)
22770 + .previous .text
22771 +
22772 +ENTRY(coprocessor_error)
22773 + zeroentry do_coprocessor_error
22774 +END(coprocessor_error)
22775 +
22776 +ENTRY(simd_coprocessor_error)
22777 + zeroentry do_simd_coprocessor_error
22778 +END(simd_coprocessor_error)
22779 +
22780 +ENTRY(device_not_available)
22781 + zeroentry math_state_restore
22782 +END(device_not_available)
22783 +
22784 + /* runs on exception stack */
22785 +KPROBE_ENTRY(debug)
22786 +/* INTR_FRAME
22787 + pushq $0
22788 + CFI_ADJUST_CFA_OFFSET 8 */
22789 + zeroentry do_debug
22790 +/* paranoidexit
22791 + CFI_ENDPROC */
22792 +END(debug)
22793 + .previous .text
22794 +
22795 +#if 0
22796 + /* runs on exception stack */
22797 +KPROBE_ENTRY(nmi)
22798 + INTR_FRAME
22799 + pushq $-1
22800 + CFI_ADJUST_CFA_OFFSET 8
22801 + paranoidentry do_nmi, 0, 0
22802 +#ifdef CONFIG_TRACE_IRQFLAGS
22803 + paranoidexit 0
22804 +#else
22805 + jmp paranoid_exit1
22806 + CFI_ENDPROC
22807 +#endif
22808 +END(nmi)
22809 + .previous .text
22810 +#endif
22811 +
22812 +KPROBE_ENTRY(int3)
22813 +/* INTR_FRAME
22814 + pushq $0
22815 + CFI_ADJUST_CFA_OFFSET 8 */
22816 + zeroentry do_int3
22817 +/* jmp paranoid_exit1
22818 + CFI_ENDPROC */
22819 +END(int3)
22820 + .previous .text
22821 +
22822 +ENTRY(overflow)
22823 + zeroentry do_overflow
22824 +END(overflow)
22825 +
22826 +ENTRY(bounds)
22827 + zeroentry do_bounds
22828 +END(bounds)
22829 +
22830 +ENTRY(invalid_op)
22831 + zeroentry do_invalid_op
22832 +END(invalid_op)
22833 +
22834 +ENTRY(coprocessor_segment_overrun)
22835 + zeroentry do_coprocessor_segment_overrun
22836 +END(coprocessor_segment_overrun)
22837 +
22838 +ENTRY(reserved)
22839 + zeroentry do_reserved
22840 +END(reserved)
22841 +
22842 +#if 0
22843 + /* runs on exception stack */
22844 +ENTRY(double_fault)
22845 + XCPT_FRAME
22846 + paranoidentry do_double_fault
22847 + jmp paranoid_exit1
22848 + CFI_ENDPROC
22849 +END(double_fault)
22850 +#endif
22851 +
22852 +ENTRY(invalid_TSS)
22853 + errorentry do_invalid_TSS
22854 +END(invalid_TSS)
22855 +
22856 +ENTRY(segment_not_present)
22857 + errorentry do_segment_not_present
22858 +END(segment_not_present)
22859 +
22860 + /* runs on exception stack */
22861 +ENTRY(stack_segment)
22862 +/* XCPT_FRAME
22863 + paranoidentry do_stack_segment */
22864 + errorentry do_stack_segment
22865 +/* jmp paranoid_exit1
22866 + CFI_ENDPROC */
22867 +END(stack_segment)
22868 +
22869 +KPROBE_ENTRY(general_protection)
22870 + errorentry do_general_protection
22871 +END(general_protection)
22872 + .previous .text
22873 +
22874 +ENTRY(alignment_check)
22875 + errorentry do_alignment_check
22876 +END(alignment_check)
22877 +
22878 +ENTRY(divide_error)
22879 + zeroentry do_divide_error
22880 +END(divide_error)
22881 +
22882 +ENTRY(spurious_interrupt_bug)
22883 + zeroentry do_spurious_interrupt_bug
22884 +END(spurious_interrupt_bug)
22885 +
22886 +#ifdef CONFIG_X86_MCE
22887 + /* runs on exception stack */
22888 +ENTRY(machine_check)
22889 + INTR_FRAME
22890 + pushq $0
22891 + CFI_ADJUST_CFA_OFFSET 8
22892 + paranoidentry do_machine_check
22893 + jmp paranoid_exit1
22894 + CFI_ENDPROC
22895 +END(machine_check)
22896 +#endif
22897 +
22898 +/* Call softirq on interrupt stack. Interrupts are off. */
22899 +ENTRY(call_softirq)
22900 + CFI_STARTPROC
22901 + push %rbp
22902 + CFI_ADJUST_CFA_OFFSET 8
22903 + CFI_REL_OFFSET rbp,0
22904 + mov %rsp,%rbp
22905 + CFI_DEF_CFA_REGISTER rbp
22906 + incl %gs:pda_irqcount
22907 + cmove %gs:pda_irqstackptr,%rsp
22908 + push %rbp # backlink for old unwinder
22909 + call __do_softirq
22910 + leaveq
22911 + CFI_DEF_CFA_REGISTER rsp
22912 + CFI_ADJUST_CFA_OFFSET -8
22913 + decl %gs:pda_irqcount
22914 + ret
22915 + CFI_ENDPROC
22916 +ENDPROC(call_softirq)
22917 +
22918 +#ifdef CONFIG_STACK_UNWIND
22919 +ENTRY(arch_unwind_init_running)
22920 + CFI_STARTPROC
22921 + movq %r15, R15(%rdi)
22922 + movq %r14, R14(%rdi)
22923 + xchgq %rsi, %rdx
22924 + movq %r13, R13(%rdi)
22925 + movq %r12, R12(%rdi)
22926 + xorl %eax, %eax
22927 + movq %rbp, RBP(%rdi)
22928 + movq %rbx, RBX(%rdi)
22929 + movq (%rsp), %rcx
22930 + movq %rax, R11(%rdi)
22931 + movq %rax, R10(%rdi)
22932 + movq %rax, R9(%rdi)
22933 + movq %rax, R8(%rdi)
22934 + movq %rax, RAX(%rdi)
22935 + movq %rax, RCX(%rdi)
22936 + movq %rax, RDX(%rdi)
22937 + movq %rax, RSI(%rdi)
22938 + movq %rax, RDI(%rdi)
22939 + movq %rax, ORIG_RAX(%rdi)
22940 + movq %rcx, RIP(%rdi)
22941 + leaq 8(%rsp), %rcx
22942 + movq $__KERNEL_CS, CS(%rdi)
22943 + movq %rax, EFLAGS(%rdi)
22944 + movq %rcx, RSP(%rdi)
22945 + movq $__KERNEL_DS, SS(%rdi)
22946 + jmpq *%rdx
22947 + CFI_ENDPROC
22948 +ENDPROC(arch_unwind_init_running)
22949 +#endif
22950 Index: head-2008-11-25/arch/x86/kernel/genapic_64-xen.c
22951 ===================================================================
22952 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
22953 +++ head-2008-11-25/arch/x86/kernel/genapic_64-xen.c 2007-06-12 13:13:01.000000000 +0200
22954 @@ -0,0 +1,143 @@
22955 +/*
22956 + * Copyright 2004 James Cleverdon, IBM.
22957 + * Subject to the GNU Public License, v.2
22958 + *
22959 + * Generic APIC sub-arch probe layer.
22960 + *
22961 + * Hacked for x86-64 by James Cleverdon from i386 architecture code by
22962 + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
22963 + * James Cleverdon.
22964 + */
22965 +#include <linux/threads.h>
22966 +#include <linux/cpumask.h>
22967 +#include <linux/string.h>
22968 +#include <linux/kernel.h>
22969 +#include <linux/ctype.h>
22970 +#include <linux/init.h>
22971 +#include <linux/module.h>
22972 +
22973 +#include <asm/smp.h>
22974 +#include <asm/ipi.h>
22975 +
22976 +#if defined(CONFIG_ACPI)
22977 +#include <acpi/acpi_bus.h>
22978 +#endif
22979 +
22980 +/* which logical CPU number maps to which CPU (physical APIC ID) */
22981 +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
22982 +EXPORT_SYMBOL(x86_cpu_to_apicid);
22983 +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
22984 +
22985 +extern struct genapic apic_cluster;
22986 +extern struct genapic apic_flat;
22987 +extern struct genapic apic_physflat;
22988 +
22989 +#ifndef CONFIG_XEN
22990 +struct genapic *genapic = &apic_flat;
22991 +#else
22992 +extern struct genapic apic_xen;
22993 +struct genapic *genapic = &apic_xen;
22994 +#endif
22995 +
22996 +
22997 +/*
22998 + * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
22999 + */
23000 +void __init clustered_apic_check(void)
23001 +{
23002 +#ifndef CONFIG_XEN
23003 + long i;
23004 + u8 clusters, max_cluster;
23005 + u8 id;
23006 + u8 cluster_cnt[NUM_APIC_CLUSTERS];
23007 + int max_apic = 0;
23008 +
23009 +#if defined(CONFIG_ACPI)
23010 + /*
23011 + * Some x86_64 machines use physical APIC mode regardless of how many
23012 + * procs/clusters are present (x86_64 ES7000 is an example).
23013 + */
23014 + if (acpi_fadt.revision > FADT2_REVISION_ID)
23015 + if (acpi_fadt.force_apic_physical_destination_mode) {
23016 + genapic = &apic_cluster;
23017 + goto print;
23018 + }
23019 +#endif
23020 +
23021 + memset(cluster_cnt, 0, sizeof(cluster_cnt));
23022 + for (i = 0; i < NR_CPUS; i++) {
23023 + id = bios_cpu_apicid[i];
23024 + if (id == BAD_APICID)
23025 + continue;
23026 + if (id > max_apic)
23027 + max_apic = id;
23028 + cluster_cnt[APIC_CLUSTERID(id)]++;
23029 + }
23030 +
23031 + /* Don't use clustered mode on AMD platforms. */
23032 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
23033 + genapic = &apic_physflat;
23034 +#ifndef CONFIG_HOTPLUG_CPU
23035 + /* In the CPU hotplug case we cannot use broadcast mode
23036 + because that opens a race when a CPU is removed.
23037 + Stay at physflat mode in this case.
23038 + It is bad to do this unconditionally though. Once
23039 + we have ACPI platform support for CPU hotplug
23040 +		   we should detect hotplug capability from ACPI tables and
23041 + only do this when really needed. -AK */
23042 + if (max_apic <= 8)
23043 + genapic = &apic_flat;
23044 +#endif
23045 + goto print;
23046 + }
23047 +
23048 + clusters = 0;
23049 + max_cluster = 0;
23050 +
23051 + for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
23052 + if (cluster_cnt[i] > 0) {
23053 + ++clusters;
23054 + if (cluster_cnt[i] > max_cluster)
23055 + max_cluster = cluster_cnt[i];
23056 + }
23057 + }
23058 +
23059 + /*
23060 + * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
23061 + * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
23062 + * else physical mode.
23063 + * (We don't use lowest priority delivery + HW APIC IRQ steering, so
23064 + * can ignore the clustered logical case and go straight to physical.)
23065 + */
23066 + if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
23067 +#ifdef CONFIG_HOTPLUG_CPU
23068 + /* Don't use APIC shortcuts in CPU hotplug to avoid races */
23069 + genapic = &apic_physflat;
23070 +#else
23071 + genapic = &apic_flat;
23072 +#endif
23073 + } else
23074 + genapic = &apic_cluster;
23075 +
23076 +print:
23077 +#else
23078 + /* hardcode to xen apic functions */
23079 + genapic = &apic_xen;
23080 +#endif
23081 + printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
23082 +}
23083 +
23084 +/* Same for both flat and clustered. */
23085 +
23086 +#ifdef CONFIG_XEN
23087 +extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
23088 +#endif
23089 +
23090 +void send_IPI_self(int vector)
23091 +{
23092 +#ifndef CONFIG_XEN
23093 + __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
23094 +#else
23095 + xen_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
23096 +#endif
23097 +}
23098 Index: head-2008-11-25/arch/x86/kernel/genapic_xen_64.c
23099 ===================================================================
23100 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
23101 +++ head-2008-11-25/arch/x86/kernel/genapic_xen_64.c 2007-06-12 13:13:01.000000000 +0200
23102 @@ -0,0 +1,161 @@
23103 +/*
23104 + * Copyright 2004 James Cleverdon, IBM.
23105 + * Subject to the GNU Public License, v.2
23106 + *
23107 + * Xen APIC subarch code. Maximum 8 CPUs, logical delivery.
23108 + *
23109 + * Hacked for x86-64 by James Cleverdon from i386 architecture code by
23110 + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
23111 + * James Cleverdon.
23112 + *
23113 + * Hacked to pieces for Xen by Chris Wright.
23114 + */
23115 +#include <linux/threads.h>
23116 +#include <linux/cpumask.h>
23117 +#include <linux/string.h>
23118 +#include <linux/kernel.h>
23119 +#include <linux/ctype.h>
23120 +#include <linux/init.h>
23121 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
23122 +#include <asm/smp.h>
23123 +#include <asm/ipi.h>
23124 +#else
23125 +#include <asm/apic.h>
23126 +#include <asm/apicdef.h>
23127 +#include <asm/genapic.h>
23128 +#endif
23129 +#include <xen/evtchn.h>
23130 +
23131 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
23132 +
23133 +static inline void __send_IPI_one(unsigned int cpu, int vector)
23134 +{
23135 + int irq = per_cpu(ipi_to_irq, cpu)[vector];
23136 + BUG_ON(irq < 0);
23137 + notify_remote_via_irq(irq);
23138 +}
23139 +
23140 +void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
23141 +{
23142 + int cpu;
23143 +
23144 + switch (shortcut) {
23145 + case APIC_DEST_SELF:
23146 + __send_IPI_one(smp_processor_id(), vector);
23147 + break;
23148 + case APIC_DEST_ALLBUT:
23149 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
23150 + if (cpu == smp_processor_id())
23151 + continue;
23152 + if (cpu_isset(cpu, cpu_online_map)) {
23153 + __send_IPI_one(cpu, vector);
23154 + }
23155 + }
23156 + break;
23157 + case APIC_DEST_ALLINC:
23158 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
23159 + if (cpu_isset(cpu, cpu_online_map)) {
23160 + __send_IPI_one(cpu, vector);
23161 + }
23162 + }
23163 + break;
23164 + default:
23165 + printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
23166 + vector);
23167 + break;
23168 + }
23169 +}
23170 +
23171 +static cpumask_t xen_target_cpus(void)
23172 +{
23173 + return cpu_online_map;
23174 +}
23175 +
23176 +/*
23177 + * Set up the logical destination ID.
23178 + * Do nothing, not called now.
23179 + */
23180 +static void xen_init_apic_ldr(void)
23181 +{
23182 + Dprintk("%s\n", __FUNCTION__);
23183 + return;
23184 +}
23185 +
23186 +static void xen_send_IPI_allbutself(int vector)
23187 +{
23188 + /*
23189 +	 * If there are no other CPUs in the system then
23190 +	 * we get an APIC send error if we try to broadcast.
23191 +	 * Thus we have to avoid sending IPIs in this case.
23192 + */
23193 + Dprintk("%s\n", __FUNCTION__);
23194 + if (num_online_cpus() > 1)
23195 + xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
23196 +}
23197 +
23198 +static void xen_send_IPI_all(int vector)
23199 +{
23200 + Dprintk("%s\n", __FUNCTION__);
23201 + xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
23202 +}
23203 +
23204 +static void xen_send_IPI_mask(cpumask_t cpumask, int vector)
23205 +{
23206 + unsigned long mask = cpus_addr(cpumask)[0];
23207 + unsigned int cpu;
23208 + unsigned long flags;
23209 +
23210 + Dprintk("%s\n", __FUNCTION__);
23211 + local_irq_save(flags);
23212 + WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
23213 +
23214 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
23215 + if (cpu_isset(cpu, cpumask)) {
23216 + __send_IPI_one(cpu, vector);
23217 + }
23218 + }
23219 + local_irq_restore(flags);
23220 +}
23221 +
23222 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
23223 +static int xen_apic_id_registered(void)
23224 +{
23225 + /* better be set */
23226 + Dprintk("%s\n", __FUNCTION__);
23227 + return physid_isset(smp_processor_id(), phys_cpu_present_map);
23228 +}
23229 +#endif
23230 +
23231 +static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
23232 +{
23233 + Dprintk("%s\n", __FUNCTION__);
23234 + return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
23235 +}
23236 +
23237 +static unsigned int phys_pkg_id(int index_msb)
23238 +{
23239 + u32 ebx;
23240 +
23241 + Dprintk("%s\n", __FUNCTION__);
23242 + ebx = cpuid_ebx(1);
23243 + return ((ebx >> 24) & 0xFF) >> index_msb;
23244 +}
23245 +
23246 +struct genapic apic_xen = {
23247 + .name = "xen",
23248 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
23249 + .int_delivery_mode = dest_LowestPrio,
23250 +#endif
23251 + .int_dest_mode = (APIC_DEST_LOGICAL != 0),
23252 + .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
23253 + .target_cpus = xen_target_cpus,
23254 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
23255 + .apic_id_registered = xen_apic_id_registered,
23256 +#endif
23257 + .init_apic_ldr = xen_init_apic_ldr,
23258 + .send_IPI_all = xen_send_IPI_all,
23259 + .send_IPI_allbutself = xen_send_IPI_allbutself,
23260 + .send_IPI_mask = xen_send_IPI_mask,
23261 + .cpu_mask_to_apicid = xen_cpu_mask_to_apicid,
23262 + .phys_pkg_id = phys_pkg_id,
23263 +};
23264 Index: head-2008-11-25/arch/x86/kernel/head_64-xen.S
23265 ===================================================================
23266 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
23267 +++ head-2008-11-25/arch/x86/kernel/head_64-xen.S 2007-08-06 15:10:49.000000000 +0200
23268 @@ -0,0 +1,214 @@
23269 +/*
23270 + * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
23271 + *
23272 + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
23273 + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
23274 + * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
23275 + * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
23276 + *
23277 + * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $
23278 + *
23279 + * Jun Nakajima <jun.nakajima@intel.com>
23280 + * Modified for Xen
23281 + */
23282 +
23283 +
23284 +#include <linux/linkage.h>
23285 +#include <linux/threads.h>
23286 +#include <linux/init.h>
23287 +#include <linux/elfnote.h>
23288 +#include <asm/desc.h>
23289 +#include <asm/segment.h>
23290 +#include <asm/page.h>
23291 +#include <asm/msr.h>
23292 +#include <asm/cache.h>
23293 +#include <asm/dwarf2.h>
23294 +#include <xen/interface/elfnote.h>
23295 +
23296 + .section .bootstrap.text, "ax", @progbits
23297 + .code64
23298 + .globl startup_64
23299 +startup_64:
23300 + movq $(init_thread_union+THREAD_SIZE-8),%rsp
23301 +
23302 + /* rsi is pointer to startup info structure.
23303 + pass it to C */
23304 + movq %rsi,%rdi
23305 + pushq $0 # fake return address
23306 + jmp x86_64_start_kernel
23307 +
23308 +#ifdef CONFIG_ACPI_SLEEP
23309 +.org 0xf00
23310 + .globl pGDT32
23311 +pGDT32:
23312 + .word gdt_end-cpu_gdt_table-1
23313 + .long cpu_gdt_table-__START_KERNEL_map
23314 +#endif
23315 +ENTRY(stext)
23316 +ENTRY(_stext)
23317 +
23318 + $page = 0
23319 +#define NEXT_PAGE(name) \
23320 + $page = $page + 1; \
23321 + .org $page * 0x1000; \
23322 + phys_##name = $page * 0x1000 + __PHYSICAL_START; \
23323 +ENTRY(name)
23324 +
23325 +NEXT_PAGE(init_level4_pgt)
23326 + /* This gets initialized in x86_64_start_kernel */
23327 + .fill 512,8,0
23328 +NEXT_PAGE(init_level4_user_pgt)
23329 + /*
23330 + * We update two pgd entries to make kernel and user pgd consistent
23331 + * at pgd_populate(). It can be used for kernel modules. So we place
23332 + * this page here for those cases to avoid memory corruption.
23333 + * We also use this page to establish the initial mapping for the
23334 + * vsyscall area.
23335 + */
23336 + .fill 512,8,0
23337 +
23338 +NEXT_PAGE(level3_kernel_pgt)
23339 + .fill 512,8,0
23340 +
23341 + /*
23342 + * This is used for vsyscall area mapping as we have a different
23343 + * level4 page table for user.
23344 + */
23345 +NEXT_PAGE(level3_user_pgt)
23346 + .fill 512,8,0
23347 +
23348 +NEXT_PAGE(level2_kernel_pgt)
23349 + .fill 512,8,0
23350 +
23351 +NEXT_PAGE(hypercall_page)
23352 + CFI_STARTPROC
23353 + .rept 0x1000 / 0x20
23354 + .skip 1 /* push %rcx */
23355 + CFI_ADJUST_CFA_OFFSET 8
23356 + CFI_REL_OFFSET rcx,0
23357 + .skip 2 /* push %r11 */
23358 + CFI_ADJUST_CFA_OFFSET 8
23359 + CFI_REL_OFFSET rcx,0
23360 + .skip 5 /* mov $#,%eax */
23361 + .skip 2 /* syscall */
23362 + .skip 2 /* pop %r11 */
23363 + CFI_ADJUST_CFA_OFFSET -8
23364 + CFI_RESTORE r11
23365 + .skip 1 /* pop %rcx */
23366 + CFI_ADJUST_CFA_OFFSET -8
23367 + CFI_RESTORE rcx
23368 + .align 0x20,0 /* ret */
23369 + .endr
23370 + CFI_ENDPROC
23371 +
23372 +#undef NEXT_PAGE
23373 +
23374 + .data
23375 +/* Just dummy symbol to allow compilation. Not used in sleep path */
23376 +#ifdef CONFIG_ACPI_SLEEP
23377 + .align PAGE_SIZE
23378 +ENTRY(wakeup_level4_pgt)
23379 + .fill 512,8,0
23380 +#endif
23381 +
23382 + .data
23383 +
23384 + .align 16
23385 + .globl cpu_gdt_descr
23386 +cpu_gdt_descr:
23387 + .word gdt_end-cpu_gdt_table-1
23388 +gdt:
23389 + .quad cpu_gdt_table
23390 +#ifdef CONFIG_SMP
23391 + .rept NR_CPUS-1
23392 + .word 0
23393 + .quad 0
23394 + .endr
23395 +#endif
23396 +
23397 +/* We need valid kernel segments for data and code in long mode too
23398 + * IRET will check the segment types kkeil 2000/10/28
23399 + * Also sysret mandates a special GDT layout
23400 + */
23401 +
23402 + .section .data.page_aligned, "aw"
23403 + .align PAGE_SIZE
23404 +
23405 +/* The TLS descriptors are currently at a different place compared to i386.
23406 + Hopefully nobody expects them at a fixed place (Wine?) */
23407 +
23408 +ENTRY(cpu_gdt_table)
23409 + .quad 0x0000000000000000 /* NULL descriptor */
23410 + .quad 0x0 /* unused */
23411 + .quad 0x00af9a000000ffff /* __KERNEL_CS */
23412 + .quad 0x00cf92000000ffff /* __KERNEL_DS */
23413 + .quad 0x00cffa000000ffff /* __USER32_CS */
23414 + .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */
23415 + .quad 0x00affa000000ffff /* __USER_CS */
23416 + .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
23417 + .quad 0,0 /* TSS */
23418 + .quad 0,0 /* LDT */
23419 + .quad 0,0,0 /* three TLS descriptors */
23420 + .quad 0 /* unused */
23421 +gdt_end:
23422 + /* asm/segment.h:GDT_ENTRIES must match this */
23423 + /* This should be a multiple of the cache line size */
23424 + /* GDTs of other CPUs are now dynamically allocated */
23425 +
23426 + /* zero the remaining page */
23427 + .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
23428 +
23429 + .section .bss.page_aligned, "aw", @nobits
23430 + .align PAGE_SIZE
23431 +ENTRY(empty_zero_page)
23432 + .skip PAGE_SIZE
23433 +
23434 +#if CONFIG_XEN_COMPAT <= 0x030002
23435 +/*
23436 + * __xen_guest information
23437 + */
23438 +.macro utoh value
23439 + .if (\value) < 0 || (\value) >= 0x10
23440 + utoh (((\value)>>4)&0x0fffffffffffffff)
23441 + .endif
23442 + .if ((\value) & 0xf) < 10
23443 + .byte '0' + ((\value) & 0xf)
23444 + .else
23445 + .byte 'A' + ((\value) & 0xf) - 10
23446 + .endif
23447 +.endm
23448 +
23449 +.section __xen_guest
23450 + .ascii "GUEST_OS=linux,GUEST_VER=2.6"
23451 + .ascii ",XEN_VER=xen-3.0"
23452 + .ascii ",VIRT_BASE=0x"
23453 + utoh __START_KERNEL_map
23454 + .ascii ",ELF_PADDR_OFFSET=0x"
23455 + utoh __START_KERNEL_map
23456 + .ascii ",VIRT_ENTRY=0x"
23457 + utoh (__START_KERNEL_map + __PHYSICAL_START)
23458 + .ascii ",HYPERCALL_PAGE=0x"
23459 + utoh (phys_hypercall_page >> PAGE_SHIFT)
23460 + .ascii ",FEATURES=writable_page_tables"
23461 + .ascii "|writable_descriptor_tables"
23462 + .ascii "|auto_translated_physmap"
23463 + .ascii "|supervisor_mode_kernel"
23464 + .ascii ",LOADER=generic"
23465 + .byte 0
23466 +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */
23467 +
23468 + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux")
23469 + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6")
23470 + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0")
23471 + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .quad, __START_KERNEL_map)
23472 +#if CONFIG_XEN_COMPAT <= 0x030002
23473 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad, __START_KERNEL_map)
23474 +#else
23475 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad, 0)
23476 +#endif
23477 + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad, startup_64)
23478 + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad, hypercall_page)
23479 + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad, _PAGE_PRESENT,_PAGE_PRESENT)
23480 + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
23481 + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic")
23482 + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1)
23483 Index: head-2008-11-25/arch/x86/kernel/head64-xen.c
23484 ===================================================================
23485 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
23486 +++ head-2008-11-25/arch/x86/kernel/head64-xen.c 2007-06-12 13:13:01.000000000 +0200
23487 @@ -0,0 +1,162 @@
23488 +/*
23489 + * linux/arch/x86_64/kernel/head64.c -- prepare to run common code
23490 + *
23491 + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
23492 + *
23493 + * Jun Nakajima <jun.nakajima@intel.com>
23494 + * Modified for Xen.
23495 + */
23496 +
23497 +#include <linux/init.h>
23498 +#include <linux/linkage.h>
23499 +#include <linux/types.h>
23500 +#include <linux/kernel.h>
23501 +#include <linux/string.h>
23502 +#include <linux/percpu.h>
23503 +#include <linux/module.h>
23504 +
23505 +#include <asm/processor.h>
23506 +#include <asm/proto.h>
23507 +#include <asm/smp.h>
23508 +#include <asm/bootsetup.h>
23509 +#include <asm/setup.h>
23510 +#include <asm/desc.h>
23511 +#include <asm/pgtable.h>
23512 +#include <asm/sections.h>
23513 +
23514 +unsigned long start_pfn;
23515 +
23516 +/* Don't add a printk in there. printk relies on the PDA which is not initialized
23517 + yet. */
23518 +#if 0
23519 +static void __init clear_bss(void)
23520 +{
23521 + memset(__bss_start, 0,
23522 + (unsigned long) __bss_stop - (unsigned long) __bss_start);
23523 +}
23524 +#endif
23525 +
23526 +#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
23527 +#define OLD_CL_MAGIC_ADDR 0x90020
23528 +#define OLD_CL_MAGIC 0xA33F
23529 +#define OLD_CL_BASE_ADDR 0x90000
23530 +#define OLD_CL_OFFSET 0x90022
23531 +
23532 +extern char saved_command_line[];
23533 +
23534 +static void __init copy_bootdata(char *real_mode_data)
23535 +{
23536 +#ifndef CONFIG_XEN
23537 + int new_data;
23538 + char * command_line;
23539 +
23540 + memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
23541 + new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
23542 + if (!new_data) {
23543 + if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
23544 + printk("so old bootloader that it does not support commandline?!\n");
23545 + return;
23546 + }
23547 + new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
23548 + printk("old bootloader convention, maybe loadlin?\n");
23549 + }
23550 + command_line = (char *) ((u64)(new_data));
23551 + memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
23552 +#else
23553 + int max_cmdline;
23554 +
23555 + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
23556 + max_cmdline = COMMAND_LINE_SIZE;
23557 + memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
23558 + saved_command_line[max_cmdline-1] = '\0';
23559 +#endif
23560 + printk("Bootdata ok (command line is %s)\n", saved_command_line);
23561 +}
23562 +
23563 +static void __init setup_boot_cpu_data(void)
23564 +{
23565 + unsigned int dummy, eax;
23566 +
23567 + /* get vendor info */
23568 + cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
23569 + (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
23570 + (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
23571 + (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
23572 +
23573 + /* get cpu type */
23574 + cpuid(1, &eax, &dummy, &dummy,
23575 + (unsigned int *) &boot_cpu_data.x86_capability);
23576 + boot_cpu_data.x86 = (eax >> 8) & 0xf;
23577 + boot_cpu_data.x86_model = (eax >> 4) & 0xf;
23578 + boot_cpu_data.x86_mask = eax & 0xf;
23579 +}
23580 +
23581 +#include <xen/interface/memory.h>
23582 +unsigned long *machine_to_phys_mapping;
23583 +EXPORT_SYMBOL(machine_to_phys_mapping);
23584 +unsigned int machine_to_phys_order;
23585 +EXPORT_SYMBOL(machine_to_phys_order);
23586 +
23587 +void __init x86_64_start_kernel(char * real_mode_data)
23588 +{
23589 + struct xen_machphys_mapping mapping;
23590 + unsigned long machine_to_phys_nr_ents;
23591 + char *s;
23592 + int i;
23593 +
23594 + setup_xen_features();
23595 +
23596 + xen_start_info = (struct start_info *)real_mode_data;
23597 + if (!xen_feature(XENFEAT_auto_translated_physmap))
23598 + phys_to_machine_mapping =
23599 + (unsigned long *)xen_start_info->mfn_list;
23600 + start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
23601 + xen_start_info->nr_pt_frames;
23602 +
23603 + machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
23604 + machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
23605 + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
23606 + machine_to_phys_mapping = (unsigned long *)mapping.v_start;
23607 + machine_to_phys_nr_ents = mapping.max_mfn + 1;
23608 + }
23609 + while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
23610 + machine_to_phys_order++;
23611 +
23612 +#if 0
23613 + for (i = 0; i < 256; i++)
23614 + set_intr_gate(i, early_idt_handler);
23615 + asm volatile("lidt %0" :: "m" (idt_descr));
23616 +#endif
23617 +
23618 + /*
23619 + * This must be called really, really early:
23620 + */
23621 + lockdep_init();
23622 +
23623 + for (i = 0; i < NR_CPUS; i++)
23624 + cpu_pda(i) = &boot_cpu_pda[i];
23625 +
23626 + pda_init(0);
23627 + copy_bootdata(real_mode_data);
23628 +#ifdef CONFIG_SMP
23629 + cpu_set(0, cpu_online_map);
23630 +#endif
23631 + s = strstr(saved_command_line, "earlyprintk=");
23632 + if (s != NULL)
23633 + setup_early_printk(strchr(s, '=') + 1);
23634 +#ifdef CONFIG_NUMA
23635 + s = strstr(saved_command_line, "numa=");
23636 + if (s != NULL)
23637 + numa_setup(s+5);
23638 +#endif
23639 +#ifdef CONFIG_X86_IO_APIC
23640 + if (strstr(saved_command_line, "disableapic"))
23641 + disable_apic = 1;
23642 +#endif
23643 + /* You need early console to see that */
23644 + if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
23645 + panic("Kernel too big for kernel mapping\n");
23646 +
23647 + setup_boot_cpu_data();
23648 + start_kernel();
23649 +}
23650 Index: head-2008-11-25/arch/x86/kernel/io_apic_64-xen.c
23651 ===================================================================
23652 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
23653 +++ head-2008-11-25/arch/x86/kernel/io_apic_64-xen.c 2008-11-25 12:22:34.000000000 +0100
23654 @@ -0,0 +1,2268 @@
23655 +/*
23656 + * Intel IO-APIC support for multi-Pentium hosts.
23657 + *
23658 + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
23659 + *
23660 + * Many thanks to Stig Venaas for trying out countless experimental
23661 + * patches and reporting/debugging problems patiently!
23662 + *
23663 + * (c) 1999, Multiple IO-APIC support, developed by
23664 + * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
23665 + * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
23666 + * further tested and cleaned up by Zach Brown <zab@redhat.com>
23667 + * and Ingo Molnar <mingo@redhat.com>
23668 + *
23669 + * Fixes
23670 + * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
23671 + * thanks to Eric Gilmore
23672 + * and Rolf G. Tews
23673 + * for testing these extensively
23674 + * Paul Diefenbaugh : Added full ACPI support
23675 + */
23676 +
23677 +#include <linux/mm.h>
23678 +#include <linux/interrupt.h>
23679 +#include <linux/init.h>
23680 +#include <linux/delay.h>
23681 +#include <linux/sched.h>
23682 +#include <linux/smp_lock.h>
23683 +#include <linux/mc146818rtc.h>
23684 +#include <linux/acpi.h>
23685 +#include <linux/sysdev.h>
23686 +#ifdef CONFIG_ACPI
23687 +#include <acpi/acpi_bus.h>
23688 +#endif
23689 +
23690 +#include <asm/io.h>
23691 +#include <asm/smp.h>
23692 +#include <asm/desc.h>
23693 +#include <asm/proto.h>
23694 +#include <asm/mach_apic.h>
23695 +#include <asm/acpi.h>
23696 +#include <asm/dma.h>
23697 +#include <asm/nmi.h>
23698 +
23699 +#define __apicdebuginit __init
23700 +
23701 +int sis_apic_bug; /* not actually supported, dummy for compile */
23702 +
23703 +static int no_timer_check;
23704 +
23705 +int disable_timer_pin_1 __initdata;
23706 +
23707 +#ifndef CONFIG_XEN
23708 +int timer_over_8254 __initdata = 0;
23709 +
23710 +/* Where if anywhere is the i8259 connect in external int mode */
23711 +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
23712 +#endif
23713 +
23714 +static DEFINE_SPINLOCK(ioapic_lock);
23715 +static DEFINE_SPINLOCK(vector_lock);
23716 +
23717 +/*
23718 + * # of IRQ routing registers
23719 + */
23720 +int nr_ioapic_registers[MAX_IO_APICS];
23721 +
23722 +/*
23723 + * Rough estimation of how many shared IRQs there are, can
23724 + * be changed anytime.
23725 + */
23726 +#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
23727 +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
23728 +
23729 +/*
23730 + * This is performance-critical, we want to do it O(1)
23731 + *
23732 + * the indexing order of this array favors 1:1 mappings
23733 + * between pins and IRQs.
23734 + */
23735 +
23736 +static struct irq_pin_list {
23737 + short apic, pin, next;
23738 +} irq_2_pin[PIN_MAP_SIZE];
23739 +
23740 +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
23741 +#ifdef CONFIG_PCI_MSI
23742 +#define vector_to_irq(vector) \
23743 + (platform_legacy_irq(vector) ? vector : vector_irq[vector])
23744 +#else
23745 +#define vector_to_irq(vector) (vector)
23746 +#endif
23747 +
23748 +#ifdef CONFIG_XEN
23749 +
23750 +#include <xen/interface/xen.h>
23751 +#include <xen/interface/physdev.h>
23752 +#include <xen/evtchn.h>
23753 +
23754 +/* Fake i8259 */
23755 +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
23756 +#define disable_8259A_irq(_irq) ((void)0)
23757 +#define i8259A_irq_pending(_irq) (0)
23758 +
23759 +unsigned long io_apic_irqs;
23760 +
23761 +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
23762 +{
23763 + struct physdev_apic apic_op;
23764 + int ret;
23765 +
23766 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
23767 + apic_op.reg = reg;
23768 + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
23769 + if (ret)
23770 + return ret;
23771 + return apic_op.value;
23772 +}
23773 +
23774 +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
23775 +{
23776 + struct physdev_apic apic_op;
23777 +
23778 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
23779 + apic_op.reg = reg;
23780 + apic_op.value = value;
23781 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
23782 +}
23783 +
23784 +#define io_apic_read(a,r) xen_io_apic_read(a,r)
23785 +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
23786 +
23787 +#define clear_IO_APIC() ((void)0)
23788 +
23789 +#else
23790 +
23791 +#ifdef CONFIG_SMP
23792 +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
23793 +{
23794 + unsigned long flags;
23795 + unsigned int dest;
23796 + cpumask_t tmp;
23797 +
23798 + cpus_and(tmp, mask, cpu_online_map);
23799 + if (cpus_empty(tmp))
23800 + tmp = TARGET_CPUS;
23801 +
23802 + cpus_and(mask, tmp, CPU_MASK_ALL);
23803 +
23804 + dest = cpu_mask_to_apicid(mask);
23805 +
23806 + /*
23807 + * Only the high 8 bits are valid.
23808 + */
23809 + dest = SET_APIC_LOGICAL_ID(dest);
23810 +
23811 + spin_lock_irqsave(&ioapic_lock, flags);
23812 + __DO_ACTION(1, = dest, )
23813 + set_irq_info(irq, mask);
23814 + spin_unlock_irqrestore(&ioapic_lock, flags);
23815 +}
23816 +#endif
23817 +
23818 +#endif /* !CONFIG_XEN */
23819 +
23820 +/*
23821 + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
23822 + * shared ISA-space IRQs, so we have to support them. We are super
23823 + * fast in the common case, and fast for shared ISA-space IRQs.
23824 + */
23825 +static void add_pin_to_irq(unsigned int irq, int apic, int pin)
23826 +{
23827 + static int first_free_entry = NR_IRQS;
23828 + struct irq_pin_list *entry = irq_2_pin + irq;
23829 +
23830 + BUG_ON(irq >= NR_IRQS);
23831 + while (entry->next)
23832 + entry = irq_2_pin + entry->next;
23833 +
23834 + if (entry->pin != -1) {
23835 + entry->next = first_free_entry;
23836 + entry = irq_2_pin + entry->next;
23837 + if (++first_free_entry >= PIN_MAP_SIZE)
23838 + panic("io_apic.c: ran out of irq_2_pin entries!");
23839 + }
23840 + entry->apic = apic;
23841 + entry->pin = pin;
23842 +}
23843 +
23844 +#ifndef CONFIG_XEN
23845 +#define __DO_ACTION(R, ACTION, FINAL) \
23846 + \
23847 +{ \
23848 + int pin; \
23849 + struct irq_pin_list *entry = irq_2_pin + irq; \
23850 + \
23851 + BUG_ON(irq >= NR_IRQS); \
23852 + for (;;) { \
23853 + unsigned int reg; \
23854 + pin = entry->pin; \
23855 + if (pin == -1) \
23856 + break; \
23857 + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
23858 + reg ACTION; \
23859 + io_apic_modify(entry->apic, reg); \
23860 + if (!entry->next) \
23861 + break; \
23862 + entry = irq_2_pin + entry->next; \
23863 + } \
23864 + FINAL; \
23865 +}
23866 +
23867 +#define DO_ACTION(name,R,ACTION, FINAL) \
23868 + \
23869 + static void name##_IO_APIC_irq (unsigned int irq) \
23870 + __DO_ACTION(R, ACTION, FINAL)
23871 +
23872 +DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
23873 + /* mask = 1 */
23874 +DO_ACTION( __unmask, 0, &= 0xfffeffff, )
23875 + /* mask = 0 */
23876 +
23877 +static void mask_IO_APIC_irq (unsigned int irq)
23878 +{
23879 + unsigned long flags;
23880 +
23881 + spin_lock_irqsave(&ioapic_lock, flags);
23882 + __mask_IO_APIC_irq(irq);
23883 + spin_unlock_irqrestore(&ioapic_lock, flags);
23884 +}
23885 +
23886 +static void unmask_IO_APIC_irq (unsigned int irq)
23887 +{
23888 + unsigned long flags;
23889 +
23890 + spin_lock_irqsave(&ioapic_lock, flags);
23891 + __unmask_IO_APIC_irq(irq);
23892 + spin_unlock_irqrestore(&ioapic_lock, flags);
23893 +}
23894 +
23895 +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
23896 +{
23897 + struct IO_APIC_route_entry entry;
23898 + unsigned long flags;
23899 +
23900 + /* Check delivery_mode to be sure we're not clearing an SMI pin */
23901 + spin_lock_irqsave(&ioapic_lock, flags);
23902 + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
23903 + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
23904 + spin_unlock_irqrestore(&ioapic_lock, flags);
23905 + if (entry.delivery_mode == dest_SMI)
23906 + return;
23907 + /*
23908 + * Disable it in the IO-APIC irq-routing table:
23909 + */
23910 + memset(&entry, 0, sizeof(entry));
23911 + entry.mask = 1;
23912 + spin_lock_irqsave(&ioapic_lock, flags);
23913 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
23914 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
23915 + spin_unlock_irqrestore(&ioapic_lock, flags);
23916 +}
23917 +
23918 +static void clear_IO_APIC (void)
23919 +{
23920 + int apic, pin;
23921 +
23922 + for (apic = 0; apic < nr_ioapics; apic++)
23923 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
23924 + clear_IO_APIC_pin(apic, pin);
23925 +}
23926 +
23927 +#endif /* !CONFIG_XEN */
23928 +
23929 +static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
23930 +
23931 +/*
23932 + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
23933 + * specific CPU-side IRQs.
23934 + */
23935 +
23936 +#define MAX_PIRQS 8
23937 +static int pirq_entries [MAX_PIRQS];
23938 +static int pirqs_enabled;
23939 +int skip_ioapic_setup;
23940 +int ioapic_force;
23941 +
23942 +/* dummy parsing: see setup.c */
23943 +
23944 +static int __init disable_ioapic_setup(char *str)
23945 +{
23946 + skip_ioapic_setup = 1;
23947 + return 1;
23948 +}
23949 +
23950 +static int __init enable_ioapic_setup(char *str)
23951 +{
23952 + ioapic_force = 1;
23953 + skip_ioapic_setup = 0;
23954 + return 1;
23955 +}
23956 +
23957 +__setup("noapic", disable_ioapic_setup);
23958 +__setup("apic", enable_ioapic_setup);
23959 +
23960 +#ifndef CONFIG_XEN
23961 +static int __init setup_disable_8254_timer(char *s)
23962 +{
23963 + timer_over_8254 = -1;
23964 + return 1;
23965 +}
23966 +static int __init setup_enable_8254_timer(char *s)
23967 +{
23968 + timer_over_8254 = 2;
23969 + return 1;
23970 +}
23971 +
23972 +__setup("disable_8254_timer", setup_disable_8254_timer);
23973 +__setup("enable_8254_timer", setup_enable_8254_timer);
23974 +#endif /* !CONFIG_XEN */
23975 +
23976 +#include <asm/pci-direct.h>
23977 +#include <linux/pci_ids.h>
23978 +#include <linux/pci.h>
23979 +
23980 +
23981 +#ifdef CONFIG_ACPI
23982 +
23983 +static int nvidia_hpet_detected __initdata;
23984 +
23985 +static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
23986 +{
23987 + nvidia_hpet_detected = 1;
23988 + return 0;
23989 +}
23990 +#endif
23991 +
23992 +/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
23993 + off. Check for an Nvidia or VIA PCI bridge and turn it off.
23994 + Use pci direct infrastructure because this runs before the PCI subsystem.
23995 +
23996 + Can be overwritten with "apic"
23997 +
23998 + And another hack to disable the IOMMU on VIA chipsets.
23999 +
24000 + ... and others. Really should move this somewhere else.
24001 +
24002 + Kludge-O-Rama. */
24003 +void __init check_ioapic(void)
24004 +{
24005 + int num,slot,func;
24006 + /* Poor man's PCI discovery */
24007 + for (num = 0; num < 32; num++) {
24008 + for (slot = 0; slot < 32; slot++) {
24009 + for (func = 0; func < 8; func++) {
24010 + u32 class;
24011 + u32 vendor;
24012 + u8 type;
24013 + class = read_pci_config(num,slot,func,
24014 + PCI_CLASS_REVISION);
24015 + if (class == 0xffffffff)
24016 + break;
24017 +
24018 + if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
24019 + continue;
24020 +
24021 + vendor = read_pci_config(num, slot, func,
24022 + PCI_VENDOR_ID);
24023 + vendor &= 0xffff;
24024 + switch (vendor) {
24025 + case PCI_VENDOR_ID_VIA:
24026 +#ifdef CONFIG_IOMMU
24027 + if ((end_pfn > MAX_DMA32_PFN ||
24028 + force_iommu) &&
24029 + !iommu_aperture_allowed) {
24030 + printk(KERN_INFO
24031 + "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n");
24032 + iommu_aperture_disabled = 1;
24033 + }
24034 +#endif
24035 + return;
24036 + case PCI_VENDOR_ID_NVIDIA:
24037 +#ifdef CONFIG_ACPI
24038 + /*
24039 + * All timer overrides on Nvidia are
24040 + * wrong unless HPET is enabled.
24041 + */
24042 + nvidia_hpet_detected = 0;
24043 + acpi_table_parse(ACPI_HPET,
24044 + nvidia_hpet_check);
24045 + if (nvidia_hpet_detected == 0) {
24046 + acpi_skip_timer_override = 1;
24047 + printk(KERN_INFO "Nvidia board "
24048 + "detected. Ignoring ACPI "
24049 + "timer override.\n");
24050 + }
24051 +#endif
24052 + /* RED-PEN skip them on mptables too? */
24053 + return;
24054 + case PCI_VENDOR_ID_ATI:
24055 +
24056 + /* This should be actually default, but
24057 + for 2.6.16 let's do it for ATI only where
24058 + it's really needed. */
24059 +#ifndef CONFIG_XEN
24060 + if (timer_over_8254 == 1) {
24061 + timer_over_8254 = 0;
24062 + printk(KERN_INFO
24063 + "ATI board detected. Disabling timer routing over 8254.\n");
24064 + }
24065 +#endif
24066 + return;
24067 + }
24068 +
24069 +
24070 + /* No multi-function device? */
24071 + type = read_pci_config_byte(num,slot,func,
24072 + PCI_HEADER_TYPE);
24073 + if (!(type & 0x80))
24074 + break;
24075 + }
24076 + }
24077 + }
24078 +}
24079 +
24080 +static int __init ioapic_pirq_setup(char *str)
24081 +{
24082 + int i, max;
24083 + int ints[MAX_PIRQS+1];
24084 +
24085 + get_options(str, ARRAY_SIZE(ints), ints);
24086 +
24087 + for (i = 0; i < MAX_PIRQS; i++)
24088 + pirq_entries[i] = -1;
24089 +
24090 + pirqs_enabled = 1;
24091 + apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
24092 + max = MAX_PIRQS;
24093 + if (ints[0] < MAX_PIRQS)
24094 + max = ints[0];
24095 +
24096 + for (i = 0; i < max; i++) {
24097 + apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
24098 + /*
24099 + * PIRQs are mapped upside down, usually.
24100 + */
24101 + pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
24102 + }
24103 + return 1;
24104 +}
24105 +
24106 +__setup("pirq=", ioapic_pirq_setup);
24107 +
24108 +/*
24109 + * Find the IRQ entry number of a certain pin.
24110 + */
24111 +static int find_irq_entry(int apic, int pin, int type)
24112 +{
24113 + int i;
24114 +
24115 + for (i = 0; i < mp_irq_entries; i++)
24116 + if (mp_irqs[i].mpc_irqtype == type &&
24117 + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
24118 + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
24119 + mp_irqs[i].mpc_dstirq == pin)
24120 + return i;
24121 +
24122 + return -1;
24123 +}
24124 +
24125 +#ifndef CONFIG_XEN
24126 +/*
24127 + * Find the pin to which IRQ[irq] (ISA) is connected
24128 + */
24129 +static int __init find_isa_irq_pin(int irq, int type)
24130 +{
24131 + int i;
24132 +
24133 + for (i = 0; i < mp_irq_entries; i++) {
24134 + int lbus = mp_irqs[i].mpc_srcbus;
24135 +
24136 + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
24137 + mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
24138 + mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
24139 + (mp_irqs[i].mpc_irqtype == type) &&
24140 + (mp_irqs[i].mpc_srcbusirq == irq))
24141 +
24142 + return mp_irqs[i].mpc_dstirq;
24143 + }
24144 + return -1;
24145 +}
24146 +
24147 +static int __init find_isa_irq_apic(int irq, int type)
24148 +{
24149 + int i;
24150 +
24151 + for (i = 0; i < mp_irq_entries; i++) {
24152 + int lbus = mp_irqs[i].mpc_srcbus;
24153 +
24154 + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
24155 + mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
24156 + mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
24157 + (mp_irqs[i].mpc_irqtype == type) &&
24158 + (mp_irqs[i].mpc_srcbusirq == irq))
24159 + break;
24160 + }
24161 + if (i < mp_irq_entries) {
24162 + int apic;
24163 + for(apic = 0; apic < nr_ioapics; apic++) {
24164 + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
24165 + return apic;
24166 + }
24167 + }
24168 +
24169 + return -1;
24170 +}
24171 +#endif
24172 +
24173 +/*
24174 + * Find a specific PCI IRQ entry.
24175 + * Not an __init, possibly needed by modules
24176 + */
24177 +static int pin_2_irq(int idx, int apic, int pin);
24178 +
24179 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
24180 +{
24181 + int apic, i, best_guess = -1;
24182 +
24183 + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
24184 + bus, slot, pin);
24185 + if (mp_bus_id_to_pci_bus[bus] == -1) {
24186 + apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
24187 + return -1;
24188 + }
24189 + for (i = 0; i < mp_irq_entries; i++) {
24190 + int lbus = mp_irqs[i].mpc_srcbus;
24191 +
24192 + for (apic = 0; apic < nr_ioapics; apic++)
24193 + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
24194 + mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
24195 + break;
24196 +
24197 + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
24198 + !mp_irqs[i].mpc_irqtype &&
24199 + (bus == lbus) &&
24200 + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
24201 + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
24202 +
24203 + if (!(apic || IO_APIC_IRQ(irq)))
24204 + continue;
24205 +
24206 + if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
24207 + return irq;
24208 + /*
24209 + * Use the first all-but-pin matching entry as a
24210 + * best-guess fuzzy result for broken mptables.
24211 + */
24212 + if (best_guess < 0)
24213 + best_guess = irq;
24214 + }
24215 + }
24216 + BUG_ON(best_guess >= NR_IRQS);
24217 + return best_guess;
24218 +}
24219 +
24220 +/*
24221 + * EISA Edge/Level control register, ELCR
24222 + */
24223 +static int EISA_ELCR(unsigned int irq)
24224 +{
24225 + if (irq < 16) {
24226 + unsigned int port = 0x4d0 + (irq >> 3);
24227 + return (inb(port) >> (irq & 7)) & 1;
24228 + }
24229 + apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
24230 + return 0;
24231 +}
24232 +
24233 +/* EISA interrupts are always polarity zero and can be edge or level
24234 + * trigger depending on the ELCR value. If an interrupt is listed as
24235 + * EISA conforming in the MP table, that means its trigger type must
24236 + * be read in from the ELCR */
24237 +
24238 +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
24239 +#define default_EISA_polarity(idx) (0)
24240 +
24241 +/* ISA interrupts are always polarity zero edge triggered,
24242 + * when listed as conforming in the MP table. */
24243 +
24244 +#define default_ISA_trigger(idx) (0)
24245 +#define default_ISA_polarity(idx) (0)
24246 +
24247 +/* PCI interrupts are always polarity one level triggered,
24248 + * when listed as conforming in the MP table. */
24249 +
24250 +#define default_PCI_trigger(idx) (1)
24251 +#define default_PCI_polarity(idx) (1)
24252 +
24253 +/* MCA interrupts are always polarity zero level triggered,
24254 + * when listed as conforming in the MP table. */
24255 +
24256 +#define default_MCA_trigger(idx) (1)
24257 +#define default_MCA_polarity(idx) (0)
24258 +
24259 +static int __init MPBIOS_polarity(int idx)
24260 +{
24261 + int bus = mp_irqs[idx].mpc_srcbus;
24262 + int polarity;
24263 +
24264 + /*
24265 + * Determine IRQ line polarity (high active or low active):
24266 + */
24267 + switch (mp_irqs[idx].mpc_irqflag & 3)
24268 + {
24269 + case 0: /* conforms, ie. bus-type dependent polarity */
24270 + {
24271 + switch (mp_bus_id_to_type[bus])
24272 + {
24273 + case MP_BUS_ISA: /* ISA pin */
24274 + {
24275 + polarity = default_ISA_polarity(idx);
24276 + break;
24277 + }
24278 + case MP_BUS_EISA: /* EISA pin */
24279 + {
24280 + polarity = default_EISA_polarity(idx);
24281 + break;
24282 + }
24283 + case MP_BUS_PCI: /* PCI pin */
24284 + {
24285 + polarity = default_PCI_polarity(idx);
24286 + break;
24287 + }
24288 + case MP_BUS_MCA: /* MCA pin */
24289 + {
24290 + polarity = default_MCA_polarity(idx);
24291 + break;
24292 + }
24293 + default:
24294 + {
24295 + printk(KERN_WARNING "broken BIOS!!\n");
24296 + polarity = 1;
24297 + break;
24298 + }
24299 + }
24300 + break;
24301 + }
24302 + case 1: /* high active */
24303 + {
24304 + polarity = 0;
24305 + break;
24306 + }
24307 + case 2: /* reserved */
24308 + {
24309 + printk(KERN_WARNING "broken BIOS!!\n");
24310 + polarity = 1;
24311 + break;
24312 + }
24313 + case 3: /* low active */
24314 + {
24315 + polarity = 1;
24316 + break;
24317 + }
24318 + default: /* invalid */
24319 + {
24320 + printk(KERN_WARNING "broken BIOS!!\n");
24321 + polarity = 1;
24322 + break;
24323 + }
24324 + }
24325 + return polarity;
24326 +}
24327 +
24328 +static int MPBIOS_trigger(int idx)
24329 +{
24330 + int bus = mp_irqs[idx].mpc_srcbus;
24331 + int trigger;
24332 +
24333 + /*
24334 + * Determine IRQ trigger mode (edge or level sensitive):
24335 + */
24336 + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
24337 + {
24338 + case 0: /* conforms, ie. bus-type dependent */
24339 + {
24340 + switch (mp_bus_id_to_type[bus])
24341 + {
24342 + case MP_BUS_ISA: /* ISA pin */
24343 + {
24344 + trigger = default_ISA_trigger(idx);
24345 + break;
24346 + }
24347 + case MP_BUS_EISA: /* EISA pin */
24348 + {
24349 + trigger = default_EISA_trigger(idx);
24350 + break;
24351 + }
24352 + case MP_BUS_PCI: /* PCI pin */
24353 + {
24354 + trigger = default_PCI_trigger(idx);
24355 + break;
24356 + }
24357 + case MP_BUS_MCA: /* MCA pin */
24358 + {
24359 + trigger = default_MCA_trigger(idx);
24360 + break;
24361 + }
24362 + default:
24363 + {
24364 + printk(KERN_WARNING "broken BIOS!!\n");
24365 + trigger = 1;
24366 + break;
24367 + }
24368 + }
24369 + break;
24370 + }
24371 + case 1: /* edge */
24372 + {
24373 + trigger = 0;
24374 + break;
24375 + }
24376 + case 2: /* reserved */
24377 + {
24378 + printk(KERN_WARNING "broken BIOS!!\n");
24379 + trigger = 1;
24380 + break;
24381 + }
24382 + case 3: /* level */
24383 + {
24384 + trigger = 1;
24385 + break;
24386 + }
24387 + default: /* invalid */
24388 + {
24389 + printk(KERN_WARNING "broken BIOS!!\n");
24390 + trigger = 0;
24391 + break;
24392 + }
24393 + }
24394 + return trigger;
24395 +}
24396 +
24397 +static inline int irq_polarity(int idx)
24398 +{
24399 + return MPBIOS_polarity(idx);
24400 +}
24401 +
24402 +static inline int irq_trigger(int idx)
24403 +{
24404 + return MPBIOS_trigger(idx);
24405 +}
24406 +
24407 +static int next_irq = 16;
24408 +
24409 +/*
24410 + * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ
24411 + * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
24412 + * from ACPI, which can reach 800 in large boxen.
24413 + *
24414 + * Compact the sparse GSI space into a sequential IRQ series and reuse
24415 + * vectors if possible.
24416 + */
24417 +int gsi_irq_sharing(int gsi)
24418 +{
24419 + int i, tries, vector;
24420 +
24421 + BUG_ON(gsi >= NR_IRQ_VECTORS);
24422 +
24423 + if (platform_legacy_irq(gsi))
24424 + return gsi;
24425 +
24426 + if (gsi_2_irq[gsi] != 0xFF)
24427 + return (int)gsi_2_irq[gsi];
24428 +
24429 + tries = NR_IRQS;
24430 + try_again:
24431 + vector = assign_irq_vector(gsi);
24432 +
24433 + /*
24434 + * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
24435 + * use of vector and if found, return that IRQ. However, we never want
24436 + * to share legacy IRQs, which usually have a different trigger mode
24437 + * than PCI.
24438 + */
24439 + for (i = 0; i < NR_IRQS; i++)
24440 + if (IO_APIC_VECTOR(i) == vector)
24441 + break;
24442 + if (platform_legacy_irq(i)) {
24443 + if (--tries >= 0) {
24444 + IO_APIC_VECTOR(i) = 0;
24445 + goto try_again;
24446 + }
24447 + panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
24448 + }
24449 + if (i < NR_IRQS) {
24450 + gsi_2_irq[gsi] = i;
24451 + printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
24452 + gsi, vector, i);
24453 + return i;
24454 + }
24455 +
24456 + i = next_irq++;
24457 + BUG_ON(i >= NR_IRQS);
24458 + gsi_2_irq[gsi] = i;
24459 + IO_APIC_VECTOR(i) = vector;
24460 + printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
24461 + gsi, vector, i);
24462 + return i;
24463 +}
24464 +
24465 +static int pin_2_irq(int idx, int apic, int pin)
24466 +{
24467 + int irq, i;
24468 + int bus = mp_irqs[idx].mpc_srcbus;
24469 +
24470 + /*
24471 + * Debugging check, we are in big trouble if this message pops up!
24472 + */
24473 + if (mp_irqs[idx].mpc_dstirq != pin)
24474 + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
24475 +
24476 + switch (mp_bus_id_to_type[bus])
24477 + {
24478 + case MP_BUS_ISA: /* ISA pin */
24479 + case MP_BUS_EISA:
24480 + case MP_BUS_MCA:
24481 + {
24482 + irq = mp_irqs[idx].mpc_srcbusirq;
24483 + break;
24484 + }
24485 + case MP_BUS_PCI: /* PCI pin */
24486 + {
24487 + /*
24488 + * PCI IRQs are mapped in order
24489 + */
24490 + i = irq = 0;
24491 + while (i < apic)
24492 + irq += nr_ioapic_registers[i++];
24493 + irq += pin;
24494 + irq = gsi_irq_sharing(irq);
24495 + break;
24496 + }
24497 + default:
24498 + {
24499 + printk(KERN_ERR "unknown bus type %d.\n",bus);
24500 + irq = 0;
24501 + break;
24502 + }
24503 + }
24504 + BUG_ON(irq >= NR_IRQS);
24505 +
24506 + /*
24507 + * PCI IRQ command line redirection. Yes, limits are hardcoded.
24508 + */
24509 + if ((pin >= 16) && (pin <= 23)) {
24510 + if (pirq_entries[pin-16] != -1) {
24511 + if (!pirq_entries[pin-16]) {
24512 + apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
24513 + } else {
24514 + irq = pirq_entries[pin-16];
24515 + apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
24516 + pin-16, irq);
24517 + }
24518 + }
24519 + }
24520 + BUG_ON(irq >= NR_IRQS);
24521 + return irq;
24522 +}
24523 +
24524 +static inline int IO_APIC_irq_trigger(int irq)
24525 +{
24526 + int apic, idx, pin;
24527 +
24528 + for (apic = 0; apic < nr_ioapics; apic++) {
24529 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
24530 + idx = find_irq_entry(apic,pin,mp_INT);
24531 + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
24532 + return irq_trigger(idx);
24533 + }
24534 + }
24535 + /*
24536 + * nonexistent IRQs are edge default
24537 + */
24538 + return 0;
24539 +}
24540 +
24541 +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
24542 +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
24543 +
24544 +int assign_irq_vector(int irq)
24545 +{
24546 + unsigned long flags;
24547 + int vector;
24548 + struct physdev_irq irq_op;
24549 +
24550 + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
24551 +
24552 + if (irq < PIRQ_BASE || irq - PIRQ_BASE > NR_PIRQS)
24553 + return -EINVAL;
24554 +
24555 + spin_lock_irqsave(&vector_lock, flags);
24556 +
24557 + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
24558 + spin_unlock_irqrestore(&vector_lock, flags);
24559 + return IO_APIC_VECTOR(irq);
24560 + }
24561 +
24562 + irq_op.irq = irq;
24563 + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
24564 + spin_unlock_irqrestore(&vector_lock, flags);
24565 + return -ENOSPC;
24566 + }
24567 +
24568 + vector = irq_op.vector;
24569 + vector_irq[vector] = irq;
24570 + if (irq != AUTO_ASSIGN)
24571 + IO_APIC_VECTOR(irq) = vector;
24572 +
24573 + spin_unlock_irqrestore(&vector_lock, flags);
24574 +
24575 + return vector;
24576 +}
24577 +
24578 +extern void (*interrupt[NR_IRQS])(void);
24579 +#ifndef CONFIG_XEN
24580 +static struct hw_interrupt_type ioapic_level_type;
24581 +static struct hw_interrupt_type ioapic_edge_type;
24582 +
24583 +#define IOAPIC_AUTO -1
24584 +#define IOAPIC_EDGE 0
24585 +#define IOAPIC_LEVEL 1
24586 +
24587 +static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
24588 +{
24589 + unsigned idx;
24590 +
24591 + idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
24592 +
24593 + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
24594 + trigger == IOAPIC_LEVEL)
24595 + irq_desc[idx].chip = &ioapic_level_type;
24596 + else
24597 + irq_desc[idx].chip = &ioapic_edge_type;
24598 + set_intr_gate(vector, interrupt[idx]);
24599 +}
24600 +#else
24601 +#define ioapic_register_intr(irq, vector, trigger) evtchn_register_pirq(irq)
24602 +#endif /* !CONFIG_XEN */
24603 +
24604 +static void __init setup_IO_APIC_irqs(void)
24605 +{
24606 + struct IO_APIC_route_entry entry;
24607 + int apic, pin, idx, irq, first_notcon = 1, vector;
24608 + unsigned long flags;
24609 +
24610 + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
24611 +
24612 + for (apic = 0; apic < nr_ioapics; apic++) {
24613 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
24614 +
24615 + /*
24616 + * add it to the IO-APIC irq-routing table:
24617 + */
24618 + memset(&entry,0,sizeof(entry));
24619 +
24620 + entry.delivery_mode = INT_DELIVERY_MODE;
24621 + entry.dest_mode = INT_DEST_MODE;
24622 + entry.mask = 0; /* enable IRQ */
24623 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
24624 +
24625 + idx = find_irq_entry(apic,pin,mp_INT);
24626 + if (idx == -1) {
24627 + if (first_notcon) {
24628 + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
24629 + first_notcon = 0;
24630 + } else
24631 + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
24632 + continue;
24633 + }
24634 +
24635 + entry.trigger = irq_trigger(idx);
24636 + entry.polarity = irq_polarity(idx);
24637 +
24638 + if (irq_trigger(idx)) {
24639 + entry.trigger = 1;
24640 + entry.mask = 1;
24641 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
24642 + }
24643 +
24644 + irq = pin_2_irq(idx, apic, pin);
24645 + add_pin_to_irq(irq, apic, pin);
24646 +
24647 + if (/* !apic && */ !IO_APIC_IRQ(irq))
24648 + continue;
24649 +
24650 + if (IO_APIC_IRQ(irq)) {
24651 + vector = assign_irq_vector(irq);
24652 + entry.vector = vector;
24653 +
24654 + ioapic_register_intr(irq, vector, IOAPIC_AUTO);
24655 + if (!apic && (irq < 16))
24656 + disable_8259A_irq(irq);
24657 + }
24658 + spin_lock_irqsave(&ioapic_lock, flags);
24659 + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
24660 + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
24661 + set_native_irq_info(irq, TARGET_CPUS);
24662 + spin_unlock_irqrestore(&ioapic_lock, flags);
24663 + }
24664 + }
24665 +
24666 + if (!first_notcon)
24667 + apic_printk(APIC_VERBOSE," not connected.\n");
24668 +}
24669 +
24670 +#ifndef CONFIG_XEN
24671 +/*
24672 + * Set up the 8259A-master output pin as broadcast to all
24673 + * CPUs.
24674 + */
24675 +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
24676 +{
24677 + struct IO_APIC_route_entry entry;
24678 + unsigned long flags;
24679 +
24680 + memset(&entry,0,sizeof(entry));
24681 +
24682 + disable_8259A_irq(0);
24683 +
24684 + /* mask LVT0 */
24685 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
24686 +
24687 + /*
24688 + * We use logical delivery to get the timer IRQ
24689 + * to the first CPU.
24690 + */
24691 + entry.dest_mode = INT_DEST_MODE;
24692 + entry.mask = 0; /* unmask IRQ now */
24693 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
24694 + entry.delivery_mode = INT_DELIVERY_MODE;
24695 + entry.polarity = 0;
24696 + entry.trigger = 0;
24697 + entry.vector = vector;
24698 +
24699 + /*
24700 + * The timer IRQ doesn't have to know that behind the
24701 + * scene we have a 8259A-master in AEOI mode ...
24702 + */
24703 + irq_desc[0].chip = &ioapic_edge_type;
24704 +
24705 + /*
24706 + * Add it to the IO-APIC irq-routing table:
24707 + */
24708 + spin_lock_irqsave(&ioapic_lock, flags);
24709 + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
24710 + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
24711 + spin_unlock_irqrestore(&ioapic_lock, flags);
24712 +
24713 + enable_8259A_irq(0);
24714 +}
24715 +
24716 +void __init UNEXPECTED_IO_APIC(void)
24717 +{
24718 +}
24719 +
24720 +void __apicdebuginit print_IO_APIC(void)
24721 +{
24722 + int apic, i;
24723 + union IO_APIC_reg_00 reg_00;
24724 + union IO_APIC_reg_01 reg_01;
24725 + union IO_APIC_reg_02 reg_02;
24726 + unsigned long flags;
24727 +
24728 + if (apic_verbosity == APIC_QUIET)
24729 + return;
24730 +
24731 + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
24732 + for (i = 0; i < nr_ioapics; i++)
24733 + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
24734 + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
24735 +
24736 + /*
24737 + * We are a bit conservative about what we expect. We have to
24738 + * know about every hardware change ASAP.
24739 + */
24740 + printk(KERN_INFO "testing the IO APIC.......................\n");
24741 +
24742 + for (apic = 0; apic < nr_ioapics; apic++) {
24743 +
24744 + spin_lock_irqsave(&ioapic_lock, flags);
24745 + reg_00.raw = io_apic_read(apic, 0);
24746 + reg_01.raw = io_apic_read(apic, 1);
24747 + if (reg_01.bits.version >= 0x10)
24748 + reg_02.raw = io_apic_read(apic, 2);
24749 + spin_unlock_irqrestore(&ioapic_lock, flags);
24750 +
24751 + printk("\n");
24752 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
24753 + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
24754 + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
24755 + if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
24756 + UNEXPECTED_IO_APIC();
24757 +
24758 + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
24759 + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
24760 + if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
24761 + (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
24762 + (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
24763 + (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
24764 + (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
24765 + (reg_01.bits.entries != 0x2E) &&
24766 + (reg_01.bits.entries != 0x3F) &&
24767 + (reg_01.bits.entries != 0x03)
24768 + )
24769 + UNEXPECTED_IO_APIC();
24770 +
24771 + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
24772 + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
24773 + if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
24774 + (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
24775 + (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
24776 + (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
24777 + (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
24778 + (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
24779 + )
24780 + UNEXPECTED_IO_APIC();
24781 + if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
24782 + UNEXPECTED_IO_APIC();
24783 +
24784 + if (reg_01.bits.version >= 0x10) {
24785 + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
24786 + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
24787 + if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
24788 + UNEXPECTED_IO_APIC();
24789 + }
24790 +
24791 + printk(KERN_DEBUG ".... IRQ redirection table:\n");
24792 +
24793 + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
24794 + " Stat Dest Deli Vect: \n");
24795 +
24796 + for (i = 0; i <= reg_01.bits.entries; i++) {
24797 + struct IO_APIC_route_entry entry;
24798 +
24799 + spin_lock_irqsave(&ioapic_lock, flags);
24800 + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
24801 + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
24802 + spin_unlock_irqrestore(&ioapic_lock, flags);
24803 +
24804 + printk(KERN_DEBUG " %02x %03X %02X ",
24805 + i,
24806 + entry.dest.logical.logical_dest,
24807 + entry.dest.physical.physical_dest
24808 + );
24809 +
24810 + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
24811 + entry.mask,
24812 + entry.trigger,
24813 + entry.irr,
24814 + entry.polarity,
24815 + entry.delivery_status,
24816 + entry.dest_mode,
24817 + entry.delivery_mode,
24818 + entry.vector
24819 + );
24820 + }
24821 + }
24822 + if (use_pci_vector())
24823 + printk(KERN_INFO "Using vector-based indexing\n");
24824 + printk(KERN_DEBUG "IRQ to pin mappings:\n");
24825 + for (i = 0; i < NR_IRQS; i++) {
24826 + struct irq_pin_list *entry = irq_2_pin + i;
24827 + if (entry->pin < 0)
24828 + continue;
24829 + if (use_pci_vector() && !platform_legacy_irq(i))
24830 + printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
24831 + else
24832 + printk(KERN_DEBUG "IRQ%d ", i);
24833 + for (;;) {
24834 + printk("-> %d:%d", entry->apic, entry->pin);
24835 + if (!entry->next)
24836 + break;
24837 + entry = irq_2_pin + entry->next;
24838 + }
24839 + printk("\n");
24840 + }
24841 +
24842 + printk(KERN_INFO ".................................... done.\n");
24843 +
24844 + return;
24845 +}
24846 +
24847 +static __apicdebuginit void print_APIC_bitfield (int base)
24848 +{
24849 + unsigned int v;
24850 + int i, j;
24851 +
24852 + if (apic_verbosity == APIC_QUIET)
24853 + return;
24854 +
24855 + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
24856 + for (i = 0; i < 8; i++) {
24857 + v = apic_read(base + i*0x10);
24858 + for (j = 0; j < 32; j++) {
24859 + if (v & (1<<j))
24860 + printk("1");
24861 + else
24862 + printk("0");
24863 + }
24864 + printk("\n");
24865 + }
24866 +}
24867 +
24868 +void __apicdebuginit print_local_APIC(void * dummy)
24869 +{
24870 + unsigned int v, ver, maxlvt;
24871 +
24872 + if (apic_verbosity == APIC_QUIET)
24873 + return;
24874 +
24875 + printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
24876 + smp_processor_id(), hard_smp_processor_id());
24877 + v = apic_read(APIC_ID);
24878 + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
24879 + v = apic_read(APIC_LVR);
24880 + printk(KERN_INFO "... APIC VERSION: %08x\n", v);
24881 + ver = GET_APIC_VERSION(v);
24882 + maxlvt = get_maxlvt();
24883 +
24884 + v = apic_read(APIC_TASKPRI);
24885 + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
24886 +
24887 + v = apic_read(APIC_ARBPRI);
24888 + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
24889 + v & APIC_ARBPRI_MASK);
24890 + v = apic_read(APIC_PROCPRI);
24891 + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
24892 +
24893 + v = apic_read(APIC_EOI);
24894 + printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
24895 + v = apic_read(APIC_RRR);
24896 + printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
24897 + v = apic_read(APIC_LDR);
24898 + printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
24899 + v = apic_read(APIC_DFR);
24900 + printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
24901 + v = apic_read(APIC_SPIV);
24902 + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
24903 +
24904 + printk(KERN_DEBUG "... APIC ISR field:\n");
24905 + print_APIC_bitfield(APIC_ISR);
24906 + printk(KERN_DEBUG "... APIC TMR field:\n");
24907 + print_APIC_bitfield(APIC_TMR);
24908 + printk(KERN_DEBUG "... APIC IRR field:\n");
24909 + print_APIC_bitfield(APIC_IRR);
24910 +
24911 + v = apic_read(APIC_ESR);
24912 + printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
24913 +
24914 + v = apic_read(APIC_ICR);
24915 + printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
24916 + v = apic_read(APIC_ICR2);
24917 + printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
24918 +
24919 + v = apic_read(APIC_LVTT);
24920 + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
24921 +
24922 + if (maxlvt > 3) { /* PC is LVT#4. */
24923 + v = apic_read(APIC_LVTPC);
24924 + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
24925 + }
24926 + v = apic_read(APIC_LVT0);
24927 + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
24928 + v = apic_read(APIC_LVT1);
24929 + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
24930 +
24931 + if (maxlvt > 2) { /* ERR is LVT#3. */
24932 + v = apic_read(APIC_LVTERR);
24933 + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
24934 + }
24935 +
24936 + v = apic_read(APIC_TMICT);
24937 + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
24938 + v = apic_read(APIC_TMCCT);
24939 + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
24940 + v = apic_read(APIC_TDCR);
24941 + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
24942 + printk("\n");
24943 +}
24944 +
24945 +void print_all_local_APICs (void)
24946 +{
24947 + on_each_cpu(print_local_APIC, NULL, 1, 1);
24948 +}
24949 +
24950 +void __apicdebuginit print_PIC(void)
24951 +{
24952 + unsigned int v;
24953 + unsigned long flags;
24954 +
24955 + if (apic_verbosity == APIC_QUIET)
24956 + return;
24957 +
24958 + printk(KERN_DEBUG "\nprinting PIC contents\n");
24959 +
24960 + spin_lock_irqsave(&i8259A_lock, flags);
24961 +
24962 + v = inb(0xa1) << 8 | inb(0x21);
24963 + printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
24964 +
24965 + v = inb(0xa0) << 8 | inb(0x20);
24966 + printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
24967 +
24968 + outb(0x0b,0xa0);
24969 + outb(0x0b,0x20);
24970 + v = inb(0xa0) << 8 | inb(0x20);
24971 + outb(0x0a,0xa0);
24972 + outb(0x0a,0x20);
24973 +
24974 + spin_unlock_irqrestore(&i8259A_lock, flags);
24975 +
24976 + printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
24977 +
24978 + v = inb(0x4d1) << 8 | inb(0x4d0);
24979 + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
24980 +}
24981 +#endif /* !CONFIG_XEN */
24982 +
24983 +static void __init enable_IO_APIC(void)
24984 +{
24985 + union IO_APIC_reg_01 reg_01;
24986 +#ifndef CONFIG_XEN
24987 + int i8259_apic, i8259_pin;
24988 +#endif
24989 + int i, apic;
24990 + unsigned long flags;
24991 +
24992 + for (i = 0; i < PIN_MAP_SIZE; i++) {
24993 + irq_2_pin[i].pin = -1;
24994 + irq_2_pin[i].next = 0;
24995 + }
24996 + if (!pirqs_enabled)
24997 + for (i = 0; i < MAX_PIRQS; i++)
24998 + pirq_entries[i] = -1;
24999 +
25000 + /*
25001 + * The number of IO-APIC IRQ registers (== #pins):
25002 + */
25003 + for (apic = 0; apic < nr_ioapics; apic++) {
25004 + spin_lock_irqsave(&ioapic_lock, flags);
25005 + reg_01.raw = io_apic_read(apic, 1);
25006 + spin_unlock_irqrestore(&ioapic_lock, flags);
25007 + nr_ioapic_registers[apic] = reg_01.bits.entries+1;
25008 + }
25009 +#ifndef CONFIG_XEN
25010 + for(apic = 0; apic < nr_ioapics; apic++) {
25011 + int pin;
25012 + /* See if any of the pins is in ExtINT mode */
25013 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
25014 + struct IO_APIC_route_entry entry;
25015 + spin_lock_irqsave(&ioapic_lock, flags);
25016 + *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
25017 + *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
25018 + spin_unlock_irqrestore(&ioapic_lock, flags);
25019 +
25020 +
25021 + /* If the interrupt line is enabled and in ExtInt mode
25022 + * I have found the pin where the i8259 is connected.
25023 + */
25024 + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
25025 + ioapic_i8259.apic = apic;
25026 + ioapic_i8259.pin = pin;
25027 + goto found_i8259;
25028 + }
25029 + }
25030 + }
25031 + found_i8259:
25032 + /* Look to see what if the MP table has reported the ExtINT */
25033 + i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
25034 + i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
25035 + /* Trust the MP table if nothing is setup in the hardware */
25036 + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
25037 + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
25038 + ioapic_i8259.pin = i8259_pin;
25039 + ioapic_i8259.apic = i8259_apic;
25040 + }
25041 + /* Complain if the MP table and the hardware disagree */
25042 + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
25043 + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
25044 + {
25045 + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
25046 + }
25047 +#endif
25048 +
25049 + /*
25050 + * Do not trust the IO-APIC being empty at bootup
25051 + */
25052 + clear_IO_APIC();
25053 +}
25054 +
25055 +/*
25056 + * Not an __init, needed by the reboot code
25057 + */
25058 +void disable_IO_APIC(void)
25059 +{
25060 + /*
25061 + * Clear the IO-APIC before rebooting:
25062 + */
25063 + clear_IO_APIC();
25064 +
25065 +#ifndef CONFIG_XEN
25066 + /*
25067 + * If the i8259 is routed through an IOAPIC
25068 + * Put that IOAPIC in virtual wire mode
25069 + * so legacy interrupts can be delivered.
25070 + */
25071 + if (ioapic_i8259.pin != -1) {
25072 + struct IO_APIC_route_entry entry;
25073 + unsigned long flags;
25074 +
25075 + memset(&entry, 0, sizeof(entry));
25076 + entry.mask = 0; /* Enabled */
25077 + entry.trigger = 0; /* Edge */
25078 + entry.irr = 0;
25079 + entry.polarity = 0; /* High */
25080 + entry.delivery_status = 0;
25081 + entry.dest_mode = 0; /* Physical */
25082 + entry.delivery_mode = dest_ExtINT; /* ExtInt */
25083 + entry.vector = 0;
25084 + entry.dest.physical.physical_dest =
25085 + GET_APIC_ID(apic_read(APIC_ID));
25086 +
25087 + /*
25088 + * Add it to the IO-APIC irq-routing table:
25089 + */
25090 + spin_lock_irqsave(&ioapic_lock, flags);
25091 + io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
25092 + *(((int *)&entry)+1));
25093 + io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
25094 + *(((int *)&entry)+0));
25095 + spin_unlock_irqrestore(&ioapic_lock, flags);
25096 + }
25097 +
25098 + disconnect_bsp_APIC(ioapic_i8259.pin != -1);
25099 +#endif
25100 +}
25101 +
25102 +/*
25103 + * function to set the IO-APIC physical IDs based on the
25104 + * values stored in the MPC table.
25105 + *
25106 + * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
25107 + */
25108 +
25109 +#ifndef CONFIG_XEN
25110 +static void __init setup_ioapic_ids_from_mpc (void)
25111 +{
25112 + union IO_APIC_reg_00 reg_00;
25113 + int apic;
25114 + int i;
25115 + unsigned char old_id;
25116 + unsigned long flags;
25117 +
25118 + /*
25119 + * Set the IOAPIC ID to the value stored in the MPC table.
25120 + */
25121 + for (apic = 0; apic < nr_ioapics; apic++) {
25122 +
25123 + /* Read the register 0 value */
25124 + spin_lock_irqsave(&ioapic_lock, flags);
25125 + reg_00.raw = io_apic_read(apic, 0);
25126 + spin_unlock_irqrestore(&ioapic_lock, flags);
25127 +
25128 + old_id = mp_ioapics[apic].mpc_apicid;
25129 +
25130 +
25131 + printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
25132 +
25133 +
25134 + /*
25135 + * We need to adjust the IRQ routing table
25136 + * if the ID changed.
25137 + */
25138 + if (old_id != mp_ioapics[apic].mpc_apicid)
25139 + for (i = 0; i < mp_irq_entries; i++)
25140 + if (mp_irqs[i].mpc_dstapic == old_id)
25141 + mp_irqs[i].mpc_dstapic
25142 + = mp_ioapics[apic].mpc_apicid;
25143 +
25144 + /*
25145 + * Read the right value from the MPC table and
25146 + * write it into the ID register.
25147 + */
25148 + apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
25149 + mp_ioapics[apic].mpc_apicid);
25150 +
25151 + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
25152 + spin_lock_irqsave(&ioapic_lock, flags);
25153 + io_apic_write(apic, 0, reg_00.raw);
25154 + spin_unlock_irqrestore(&ioapic_lock, flags);
25155 +
25156 + /*
25157 + * Sanity check
25158 + */
25159 + spin_lock_irqsave(&ioapic_lock, flags);
25160 + reg_00.raw = io_apic_read(apic, 0);
25161 + spin_unlock_irqrestore(&ioapic_lock, flags);
25162 + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
25163 + printk("could not set ID!\n");
25164 + else
25165 + apic_printk(APIC_VERBOSE," ok.\n");
25166 + }
25167 +}
25168 +#else
25169 +static void __init setup_ioapic_ids_from_mpc(void) { }
25170 +#endif
25171 +
25172 +/*
25173 + * There is a nasty bug in some older SMP boards, their mptable lies
25174 + * about the timer IRQ. We do the following to work around the situation:
25175 + *
25176 + * - timer IRQ defaults to IO-APIC IRQ
25177 + * - if this function detects that timer IRQs are defunct, then we fall
25178 + * back to ISA timer IRQs
25179 + */
25180 +#ifndef CONFIG_XEN
25181 +static int __init timer_irq_works(void)
25182 +{
25183 + unsigned long t1 = jiffies;
25184 +
25185 + local_irq_enable();
25186 + /* Let ten ticks pass... */
25187 + mdelay((10 * 1000) / HZ);
25188 +
25189 + /*
25190 + * Expect a few ticks at least, to be sure some possible
25191 + * glue logic does not lock up after the first one or two
25192 + * ticks in a non-ExtINT mode. Also the local APIC
25193 + * might have cached one ExtINT interrupt. Finally, at
25194 + * least one tick may be lost due to delays.
25195 + */
25196 +
25197 + /* jiffies wrap? */
25198 + if (jiffies - t1 > 4)
25199 + return 1;
25200 + return 0;
25201 +}
25202 +
25203 +/*
25204 + * In the SMP+IOAPIC case it might happen that there is an unspecified
25205 + * number of pending IRQ events left unhandled. These cases are very rare,
25206 + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
25207 + * better to do it this way as thus we do not have to be aware of
25208 + * 'pending' interrupts in the IRQ path, except at this point.
25209 + */
25210 +/*
25211 + * Edge-triggered interrupts need to resend any interrupt
25212 + * that was delayed, but this is now handled in the device-
25213 + * independent code.
25214 + */
25215 +
25216 +/*
25217 + * Starting up an edge-triggered IO-APIC interrupt is
25218 + * nasty - we need to make sure that we get the edge.
25219 + * If it is already asserted for some reason, we need
25220 + * to return 1 to indicate that it was pending.
25221 + *
25222 + * This is not complete - we should be able to fake
25223 + * an edge even if it isn't on the 8259A...
25224 + */
25225 +
25226 +static unsigned int startup_edge_ioapic_irq(unsigned int irq)
25227 +{
25228 + int was_pending = 0;
25229 + unsigned long flags;
25230 +
25231 + spin_lock_irqsave(&ioapic_lock, flags);
25232 + if (irq < 16) {
25233 + disable_8259A_irq(irq);
25234 + if (i8259A_irq_pending(irq))
25235 + was_pending = 1;
25236 + }
25237 + __unmask_IO_APIC_irq(irq);
25238 + spin_unlock_irqrestore(&ioapic_lock, flags);
25239 +
25240 + return was_pending;
25241 +}
25242 +
25243 +/*
25244 + * Once we have recorded IRQ_PENDING already, we can mask the
25245 + * interrupt for real. This prevents IRQ storms from unhandled
25246 + * devices.
25247 + */
25248 +static void ack_edge_ioapic_irq(unsigned int irq)
25249 +{
25250 + move_irq(irq);
25251 + if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
25252 + == (IRQ_PENDING | IRQ_DISABLED))
25253 + mask_IO_APIC_irq(irq);
25254 + ack_APIC_irq();
25255 +}
25256 +
25257 +/*
25258 + * Level triggered interrupts can just be masked,
25259 + * and shutting down and starting up the interrupt
25260 + * is the same as enabling and disabling them -- except
25261 + * with a startup need to return a "was pending" value.
25262 + *
25263 + * Level triggered interrupts are special because we
25264 + * do not touch any IO-APIC register while handling
25265 + * them. We ack the APIC in the end-IRQ handler, not
25266 + * in the start-IRQ-handler. Protection against reentrance
25267 + * from the same interrupt is still provided, both by the
25268 + * generic IRQ layer and by the fact that an unacked local
25269 + * APIC does not accept IRQs.
25270 + */
25271 +static unsigned int startup_level_ioapic_irq (unsigned int irq)
25272 +{
25273 + unmask_IO_APIC_irq(irq);
25274 +
25275 + return 0; /* don't check for pending */
25276 +}
25277 +
25278 +static void end_level_ioapic_irq (unsigned int irq)
25279 +{
25280 + move_irq(irq);
25281 + ack_APIC_irq();
25282 +}
25283 +
25284 +#ifdef CONFIG_PCI_MSI
25285 +static unsigned int startup_edge_ioapic_vector(unsigned int vector)
25286 +{
25287 + int irq = vector_to_irq(vector);
25288 +
25289 + return startup_edge_ioapic_irq(irq);
25290 +}
25291 +
25292 +static void ack_edge_ioapic_vector(unsigned int vector)
25293 +{
25294 + int irq = vector_to_irq(vector);
25295 +
25296 + move_native_irq(vector);
25297 + ack_edge_ioapic_irq(irq);
25298 +}
25299 +
25300 +static unsigned int startup_level_ioapic_vector (unsigned int vector)
25301 +{
25302 + int irq = vector_to_irq(vector);
25303 +
25304 + return startup_level_ioapic_irq (irq);
25305 +}
25306 +
25307 +static void end_level_ioapic_vector (unsigned int vector)
25308 +{
25309 + int irq = vector_to_irq(vector);
25310 +
25311 + move_native_irq(vector);
25312 + end_level_ioapic_irq(irq);
25313 +}
25314 +
25315 +static void mask_IO_APIC_vector (unsigned int vector)
25316 +{
25317 + int irq = vector_to_irq(vector);
25318 +
25319 + mask_IO_APIC_irq(irq);
25320 +}
25321 +
25322 +static void unmask_IO_APIC_vector (unsigned int vector)
25323 +{
25324 + int irq = vector_to_irq(vector);
25325 +
25326 + unmask_IO_APIC_irq(irq);
25327 +}
25328 +
25329 +#ifdef CONFIG_SMP
25330 +static void set_ioapic_affinity_vector (unsigned int vector,
25331 + cpumask_t cpu_mask)
25332 +{
25333 + int irq = vector_to_irq(vector);
25334 +
25335 + set_native_irq_info(vector, cpu_mask);
25336 + set_ioapic_affinity_irq(irq, cpu_mask);
25337 +}
25338 +#endif // CONFIG_SMP
25339 +#endif // CONFIG_PCI_MSI
25340 +
25341 +static int ioapic_retrigger(unsigned int irq)
25342 +{
25343 + send_IPI_self(IO_APIC_VECTOR(irq));
25344 +
25345 + return 1;
25346 +}
25347 +
25348 +/*
25349 + * Level and edge triggered IO-APIC interrupts need different handling,
25350 + * so we use two separate IRQ descriptors. Edge triggered IRQs can be
25351 + * handled with the level-triggered descriptor, but that one has slightly
25352 + * more overhead. Level-triggered interrupts cannot be handled with the
25353 + * edge-triggered handler, without risking IRQ storms and other ugly
25354 + * races.
25355 + */
25356 +
25357 +static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
25358 + .typename = "IO-APIC-edge",
25359 + .startup = startup_edge_ioapic,
25360 + .shutdown = shutdown_edge_ioapic,
25361 + .enable = enable_edge_ioapic,
25362 + .disable = disable_edge_ioapic,
25363 + .ack = ack_edge_ioapic,
25364 + .end = end_edge_ioapic,
25365 +#ifdef CONFIG_SMP
25366 + .set_affinity = set_ioapic_affinity,
25367 +#endif
25368 + .retrigger = ioapic_retrigger,
25369 +};
25370 +
25371 +static struct hw_interrupt_type ioapic_level_type __read_mostly = {
25372 + .typename = "IO-APIC-level",
25373 + .startup = startup_level_ioapic,
25374 + .shutdown = shutdown_level_ioapic,
25375 + .enable = enable_level_ioapic,
25376 + .disable = disable_level_ioapic,
25377 + .ack = mask_and_ack_level_ioapic,
25378 + .end = end_level_ioapic,
25379 +#ifdef CONFIG_SMP
25380 + .set_affinity = set_ioapic_affinity,
25381 +#endif
25382 + .retrigger = ioapic_retrigger,
25383 +};
25384 +#endif /* !CONFIG_XEN */
25385 +
25386 +static inline void init_IO_APIC_traps(void)
25387 +{
25388 + int irq;
25389 +
25390 + /*
25391 + * NOTE! The local APIC isn't very good at handling
25392 + * multiple interrupts at the same interrupt level.
25393 + * As the interrupt level is determined by taking the
25394 + * vector number and shifting that right by 4, we
25395 + * want to spread these out a bit so that they don't
25396 + * all fall in the same interrupt level.
25397 + *
25398 + * Also, we've got to be careful not to trash gate
25399 + * 0x80, because int 0x80 is hm, kind of importantish. ;)
25400 + */
25401 + for (irq = 0; irq < NR_IRQS ; irq++) {
25402 + int tmp = irq;
25403 + if (use_pci_vector()) {
25404 + if (!platform_legacy_irq(tmp))
25405 + if ((tmp = vector_to_irq(tmp)) == -1)
25406 + continue;
25407 + }
25408 + if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
25409 + /*
25410 + * Hmm.. We don't have an entry for this,
25411 + * so default to an old-fashioned 8259
25412 + * interrupt if we can..
25413 + */
25414 + if (irq < 16)
25415 + make_8259A_irq(irq);
25416 +#ifndef CONFIG_XEN
25417 + else
25418 + /* Strange. Oh, well.. */
25419 + irq_desc[irq].chip = &no_irq_type;
25420 +#endif
25421 + }
25422 + }
25423 +}
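Editor's note: the comment above about spreading vectors refers to the local APIC priority scheme, where the priority class of an interrupt is simply the upper nibble of its vector (the "shift right by 4" the comment mentions). A minimal stand-alone sketch of that mapping, illustrative only and not part of the patch:

#include <stdio.h>

int main(void)
{
        unsigned int vectors[] = { 0x31, 0x3e, 0x41 };
        int i;

        for (i = 0; i < 3; i++)
                printf("vector 0x%02X -> priority level %u\n",
                       vectors[i], vectors[i] >> 4);    /* prints 3, 3, 4 */
        return 0;
}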
25424 +
25425 +#ifndef CONFIG_XEN
25426 +static void enable_lapic_irq (unsigned int irq)
25427 +{
25428 + unsigned long v;
25429 +
25430 + v = apic_read(APIC_LVT0);
25431 + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
25432 +}
25433 +
25434 +static void disable_lapic_irq (unsigned int irq)
25435 +{
25436 + unsigned long v;
25437 +
25438 + v = apic_read(APIC_LVT0);
25439 + apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
25440 +}
25441 +
25442 +static void ack_lapic_irq (unsigned int irq)
25443 +{
25444 + ack_APIC_irq();
25445 +}
25446 +
25447 +static void end_lapic_irq (unsigned int i) { /* nothing */ }
25448 +
25449 +static struct hw_interrupt_type lapic_irq_type __read_mostly = {
25450 + .typename = "local-APIC-edge",
25451 + .startup = NULL, /* startup_irq() not used for IRQ0 */
25452 + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
25453 + .enable = enable_lapic_irq,
25454 + .disable = disable_lapic_irq,
25455 + .ack = ack_lapic_irq,
25456 + .end = end_lapic_irq,
25457 +};
25458 +
25459 +static void setup_nmi (void)
25460 +{
25461 + /*
25462 + * Dirty trick to enable the NMI watchdog ...
25463 + * We put the 8259A master into AEOI mode and
25464 + * unmask on all local APICs LVT0 as NMI.
25465 + *
25466 + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
25467 + * is from Maciej W. Rozycki - so we do not have to EOI from
25468 + * the NMI handler or the timer interrupt.
25469 + */
25470 + printk(KERN_INFO "activating NMI Watchdog ...");
25471 +
25472 + enable_NMI_through_LVT0(NULL);
25473 +
25474 + printk(" done.\n");
25475 +}
25476 +
25477 +/*
25478 + * This looks a bit hackish but it's about the only way of sending
25479 + * a few INTA cycles to 8259As and any associated glue logic. ICR does
25480 + * not support the ExtINT mode, unfortunately. We need to send these
25481 + * cycles as some i82489DX-based boards have glue logic that keeps the
25482 + * 8259A interrupt line asserted until INTA. --macro
25483 + */
25484 +static inline void unlock_ExtINT_logic(void)
25485 +{
25486 + int apic, pin, i;
25487 + struct IO_APIC_route_entry entry0, entry1;
25488 + unsigned char save_control, save_freq_select;
25489 + unsigned long flags;
25490 +
25491 + pin = find_isa_irq_pin(8, mp_INT);
25492 + apic = find_isa_irq_apic(8, mp_INT);
25493 + if (pin == -1)
25494 + return;
25495 +
25496 + spin_lock_irqsave(&ioapic_lock, flags);
25497 + *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
25498 + *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
25499 + spin_unlock_irqrestore(&ioapic_lock, flags);
25500 + clear_IO_APIC_pin(apic, pin);
25501 +
25502 + memset(&entry1, 0, sizeof(entry1));
25503 +
25504 + entry1.dest_mode = 0; /* physical delivery */
25505 + entry1.mask = 0; /* unmask IRQ now */
25506 + entry1.dest.physical.physical_dest = hard_smp_processor_id();
25507 + entry1.delivery_mode = dest_ExtINT;
25508 + entry1.polarity = entry0.polarity;
25509 + entry1.trigger = 0;
25510 + entry1.vector = 0;
25511 +
25512 + spin_lock_irqsave(&ioapic_lock, flags);
25513 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
25514 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
25515 + spin_unlock_irqrestore(&ioapic_lock, flags);
25516 +
25517 + save_control = CMOS_READ(RTC_CONTROL);
25518 + save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
25519 + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
25520 + RTC_FREQ_SELECT);
25521 + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
25522 +
25523 + i = 100;
25524 + while (i-- > 0) {
25525 + mdelay(10);
25526 + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
25527 + i -= 10;
25528 + }
25529 +
25530 + CMOS_WRITE(save_control, RTC_CONTROL);
25531 + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
25532 + clear_IO_APIC_pin(apic, pin);
25533 +
25534 + spin_lock_irqsave(&ioapic_lock, flags);
25535 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
25536 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
25537 + spin_unlock_irqrestore(&ioapic_lock, flags);
25538 +}
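Editor's note: a small illustrative aside, not part of the patch. The CMOS_WRITE above selects RTC periodic rate 6, which by the standard MC146818 rate formula gives 32768 >> (6 - 1) = 1024 Hz, so the 10 ms polling loop is essentially guaranteed to see RTC_PF set several times while the temporary ExtINT entry is installed:

#include <stdio.h>

int main(void)
{
        int rate = 6;                           /* rate-select value written above */
        int hz = 32768 >> (rate - 1);           /* MC146818 periodic-rate formula */

        printf("RTC periodic interrupt rate: %d Hz\n", hz);     /* 1024 */
        return 0;
}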
25539 +
25540 +int timer_uses_ioapic_pin_0;
25541 +
25542 +/*
25543 + * This code may look a bit paranoid, but it's supposed to cooperate with
25544 + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
25545 + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
25546 + * fanatically on his truly buggy board.
25547 + *
25548 + * FIXME: really need to revamp this for modern platforms only.
25549 + */
25550 +static inline void check_timer(void)
25551 +{
25552 + int apic1, pin1, apic2, pin2;
25553 + int vector;
25554 +
25555 + /*
25556 + * get/set the timer IRQ vector:
25557 + */
25558 + disable_8259A_irq(0);
25559 + vector = assign_irq_vector(0);
25560 + set_intr_gate(vector, interrupt[0]);
25561 +
25562 + /*
25563 + * Subtle, code in do_timer_interrupt() expects an AEOI
25564 + * mode for the 8259A whenever interrupts are routed
25565 + * through I/O APICs. Also IRQ0 has to be enabled in
25566 + * the 8259A which implies the virtual wire has to be
25567 + * disabled in the local APIC.
25568 + */
25569 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
25570 + init_8259A(1);
25571 + if (timer_over_8254 > 0)
25572 + enable_8259A_irq(0);
25573 +
25574 + pin1 = find_isa_irq_pin(0, mp_INT);
25575 + apic1 = find_isa_irq_apic(0, mp_INT);
25576 + pin2 = ioapic_i8259.pin;
25577 + apic2 = ioapic_i8259.apic;
25578 +
25579 + if (pin1 == 0)
25580 + timer_uses_ioapic_pin_0 = 1;
25581 +
25582 + apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
25583 + vector, apic1, pin1, apic2, pin2);
25584 +
25585 + if (pin1 != -1) {
25586 + /*
25587 + * Ok, does IRQ0 through the IOAPIC work?
25588 + */
25589 + unmask_IO_APIC_irq(0);
25590 + if (!no_timer_check && timer_irq_works()) {
25591 + nmi_watchdog_default();
25592 + if (nmi_watchdog == NMI_IO_APIC) {
25593 + disable_8259A_irq(0);
25594 + setup_nmi();
25595 + enable_8259A_irq(0);
25596 + }
25597 + if (disable_timer_pin_1 > 0)
25598 + clear_IO_APIC_pin(0, pin1);
25599 + return;
25600 + }
25601 + clear_IO_APIC_pin(apic1, pin1);
25602 + apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
25603 + "connected to IO-APIC\n");
25604 + }
25605 +
25606 + apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
25607 + "through the 8259A ... ");
25608 + if (pin2 != -1) {
25609 + apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
25610 + apic2, pin2);
25611 + /*
25612 + * legacy devices should be connected to IO APIC #0
25613 + */
25614 + setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
25615 + if (timer_irq_works()) {
25616 + apic_printk(APIC_VERBOSE," works.\n");
25617 + nmi_watchdog_default();
25618 + if (nmi_watchdog == NMI_IO_APIC) {
25619 + setup_nmi();
25620 + }
25621 + return;
25622 + }
25623 + /*
25624 + * Cleanup, just in case ...
25625 + */
25626 + clear_IO_APIC_pin(apic2, pin2);
25627 + }
25628 + apic_printk(APIC_VERBOSE," failed.\n");
25629 +
25630 + if (nmi_watchdog == NMI_IO_APIC) {
25631 + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
25632 + nmi_watchdog = 0;
25633 + }
25634 +
25635 + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
25636 +
25637 + disable_8259A_irq(0);
25638 + irq_desc[0].chip = &lapic_irq_type;
25639 + apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
25640 + enable_8259A_irq(0);
25641 +
25642 + if (timer_irq_works()) {
25643 + apic_printk(APIC_VERBOSE," works.\n");
25644 + return;
25645 + }
25646 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
25647 + apic_printk(APIC_VERBOSE," failed.\n");
25648 +
25649 + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
25650 +
25651 + init_8259A(0);
25652 + make_8259A_irq(0);
25653 + apic_write(APIC_LVT0, APIC_DM_EXTINT);
25654 +
25655 + unlock_ExtINT_logic();
25656 +
25657 + if (timer_irq_works()) {
25658 + apic_printk(APIC_VERBOSE," works.\n");
25659 + return;
25660 + }
25661 + apic_printk(APIC_VERBOSE," failed :(.\n");
25662 + panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
25663 +}
25664 +#else
25665 +#define check_timer() ((void)0)
25666 +int timer_uses_ioapic_pin_0 = 0;
25667 +#endif /* !CONFIG_XEN */
25668 +
25669 +static int __init notimercheck(char *s)
25670 +{
25671 + no_timer_check = 1;
25672 + return 1;
25673 +}
25674 +__setup("no_timer_check", notimercheck);
25675 +
25676 +/*
25677 + *
25678 + * IRQs that are handled by the PIC in the MPS IOAPIC case.
25679 + * - IRQ2 is the cascade IRQ, and cannot be an IO-APIC IRQ.
25680 + * Linux doesn't really care, as it's not actually used
25681 + * for any interrupt handling anyway.
25682 + */
25683 +#define PIC_IRQS (1<<2)
25684 +
25685 +void __init setup_IO_APIC(void)
25686 +{
25687 + enable_IO_APIC();
25688 +
25689 + if (acpi_ioapic)
25690 + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
25691 + else
25692 + io_apic_irqs = ~PIC_IRQS;
25693 +
25694 + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
25695 +
25696 + /*
25697 + * Set up the IO-APIC IRQ routing table.
25698 + */
25699 + if (!acpi_ioapic)
25700 + setup_ioapic_ids_from_mpc();
25701 +#ifndef CONFIG_XEN
25702 + sync_Arb_IDs();
25703 +#endif /* !CONFIG_XEN */
25704 + setup_IO_APIC_irqs();
25705 + init_IO_APIC_traps();
25706 + check_timer();
25707 + if (!acpi_ioapic)
25708 + print_IO_APIC();
25709 +}
25710 +
25711 +struct sysfs_ioapic_data {
25712 + struct sys_device dev;
25713 + struct IO_APIC_route_entry entry[0];
25714 +};
25715 +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
25716 +
25717 +static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
25718 +{
25719 + struct IO_APIC_route_entry *entry;
25720 + struct sysfs_ioapic_data *data;
25721 + unsigned long flags;
25722 + int i;
25723 +
25724 + data = container_of(dev, struct sysfs_ioapic_data, dev);
25725 + entry = data->entry;
25726 + spin_lock_irqsave(&ioapic_lock, flags);
25727 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
25728 + *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
25729 + *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
25730 + }
25731 + spin_unlock_irqrestore(&ioapic_lock, flags);
25732 +
25733 + return 0;
25734 +}
25735 +
25736 +static int ioapic_resume(struct sys_device *dev)
25737 +{
25738 + struct IO_APIC_route_entry *entry;
25739 + struct sysfs_ioapic_data *data;
25740 + unsigned long flags;
25741 + union IO_APIC_reg_00 reg_00;
25742 + int i;
25743 +
25744 + data = container_of(dev, struct sysfs_ioapic_data, dev);
25745 + entry = data->entry;
25746 +
25747 + spin_lock_irqsave(&ioapic_lock, flags);
25748 + reg_00.raw = io_apic_read(dev->id, 0);
25749 + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
25750 + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
25751 + io_apic_write(dev->id, 0, reg_00.raw);
25752 + }
25753 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
25754 + io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
25755 + io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
25756 + }
25757 + spin_unlock_irqrestore(&ioapic_lock, flags);
25758 +
25759 + return 0;
25760 +}
25761 +
25762 +static struct sysdev_class ioapic_sysdev_class = {
25763 + set_kset_name("ioapic"),
25764 +#ifndef CONFIG_XEN
25765 + .suspend = ioapic_suspend,
25766 + .resume = ioapic_resume,
25767 +#endif
25768 +};
25769 +
25770 +static int __init ioapic_init_sysfs(void)
25771 +{
25772 + struct sys_device * dev;
25773 + int i, size, error = 0;
25774 +
25775 + error = sysdev_class_register(&ioapic_sysdev_class);
25776 + if (error)
25777 + return error;
25778 +
25779 + for (i = 0; i < nr_ioapics; i++ ) {
25780 + size = sizeof(struct sys_device) + nr_ioapic_registers[i]
25781 + * sizeof(struct IO_APIC_route_entry);
25782 + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
25783 + if (!mp_ioapic_data[i]) {
25784 + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
25785 + continue;
25786 + }
25787 + memset(mp_ioapic_data[i], 0, size);
25788 + dev = &mp_ioapic_data[i]->dev;
25789 + dev->id = i;
25790 + dev->cls = &ioapic_sysdev_class;
25791 + error = sysdev_register(dev);
25792 + if (error) {
25793 + kfree(mp_ioapic_data[i]);
25794 + mp_ioapic_data[i] = NULL;
25795 + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
25796 + continue;
25797 + }
25798 + }
25799 +
25800 + return 0;
25801 +}
25802 +
25803 +device_initcall(ioapic_init_sysfs);
25804 +
25805 +/* --------------------------------------------------------------------------
25806 + ACPI-based IOAPIC Configuration
25807 + -------------------------------------------------------------------------- */
25808 +
25809 +#ifdef CONFIG_ACPI
25810 +
25811 +#define IO_APIC_MAX_ID 0xFE
25812 +
25813 +int __init io_apic_get_version (int ioapic)
25814 +{
25815 + union IO_APIC_reg_01 reg_01;
25816 + unsigned long flags;
25817 +
25818 + spin_lock_irqsave(&ioapic_lock, flags);
25819 + reg_01.raw = io_apic_read(ioapic, 1);
25820 + spin_unlock_irqrestore(&ioapic_lock, flags);
25821 +
25822 + return reg_01.bits.version;
25823 +}
25824 +
25825 +
25826 +int __init io_apic_get_redir_entries (int ioapic)
25827 +{
25828 + union IO_APIC_reg_01 reg_01;
25829 + unsigned long flags;
25830 +
25831 + spin_lock_irqsave(&ioapic_lock, flags);
25832 + reg_01.raw = io_apic_read(ioapic, 1);
25833 + spin_unlock_irqrestore(&ioapic_lock, flags);
25834 +
25835 + return reg_01.bits.entries;
25836 +}
25837 +
25838 +
25839 +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
25840 +{
25841 + struct IO_APIC_route_entry entry;
25842 + unsigned long flags;
25843 +
25844 + if (!IO_APIC_IRQ(irq)) {
25845 + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
25846 + ioapic);
25847 + return -EINVAL;
25848 + }
25849 +
25850 + /*
25851 + * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
25852 + * Note that we mask (disable) IRQs now -- these get enabled when the
25853 + * corresponding device driver registers for this IRQ.
25854 + */
25855 +
25856 + memset(&entry,0,sizeof(entry));
25857 +
25858 + entry.delivery_mode = INT_DELIVERY_MODE;
25859 + entry.dest_mode = INT_DEST_MODE;
25860 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
25861 + entry.trigger = edge_level;
25862 + entry.polarity = active_high_low;
25863 + entry.mask = 1; /* Disabled (masked) */
25864 +
25865 + irq = gsi_irq_sharing(irq);
25866 + /*
25867 + * IRQs < 16 are already in the irq_2_pin[] map
25868 + */
25869 + if (irq >= 16)
25870 + add_pin_to_irq(irq, ioapic, pin);
25871 +
25872 + entry.vector = assign_irq_vector(irq);
25873 +
25874 + apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
25875 + "IRQ %d Mode:%i Active:%i)\n", ioapic,
25876 + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
25877 + edge_level, active_high_low);
25878 +
25879 + ioapic_register_intr(irq, entry.vector, edge_level);
25880 +
25881 + if (!ioapic && (irq < 16))
25882 + disable_8259A_irq(irq);
25883 +
25884 + spin_lock_irqsave(&ioapic_lock, flags);
25885 + io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
25886 + io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
25887 + set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
25888 + spin_unlock_irqrestore(&ioapic_lock, flags);
25889 +
25890 + return 0;
25891 +}
25892 +
25893 +#endif /* CONFIG_ACPI */
25894 +
25895 +
25896 +#ifndef CONFIG_XEN
25897 +/*
25898 + * This function currently is only a helper for the i386 smp boot process where
25899 + * we need to reprogram the ioredtbls to cater for the cpus which have come online,
25900 + * so the mask in all cases should simply be TARGET_CPUS.
25901 + */
25902 +#ifdef CONFIG_SMP
25903 +void __init setup_ioapic_dest(void)
25904 +{
25905 + int pin, ioapic, irq, irq_entry;
25906 +
25907 + if (skip_ioapic_setup == 1)
25908 + return;
25909 +
25910 + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
25911 + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
25912 + irq_entry = find_irq_entry(ioapic, pin, mp_INT);
25913 + if (irq_entry == -1)
25914 + continue;
25915 + irq = pin_2_irq(irq_entry, ioapic, pin);
25916 + set_ioapic_affinity_irq(irq, TARGET_CPUS);
25917 + }
25918 +
25919 + }
25920 +}
25921 +#endif
25922 +#endif /* !CONFIG_XEN */
25923 Index: head-2008-11-25/arch/x86/kernel/ioport_64-xen.c
25924 ===================================================================
25925 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
25926 +++ head-2008-11-25/arch/x86/kernel/ioport_64-xen.c 2008-01-28 12:24:19.000000000 +0100
25927 @@ -0,0 +1,100 @@
25928 +/*
25929 + * linux/arch/x86_64/kernel/ioport.c
25930 + *
25931 + * This contains the io-permission bitmap code - written by obz, with changes
25932 + * by Linus.
25933 + */
25934 +
25935 +#include <linux/sched.h>
25936 +#include <linux/kernel.h>
25937 +#include <linux/capability.h>
25938 +#include <linux/errno.h>
25939 +#include <linux/types.h>
25940 +#include <linux/ioport.h>
25941 +#include <linux/mm.h>
25942 +#include <linux/smp.h>
25943 +#include <linux/smp_lock.h>
25944 +#include <linux/stddef.h>
25945 +#include <linux/slab.h>
25946 +#include <linux/thread_info.h>
25947 +#include <xen/interface/physdev.h>
25948 +
25949 +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
25950 +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
25951 +{
25952 + int i;
25953 +
25954 + if (new_value)
25955 + for (i = base; i < base + extent; i++)
25956 + __set_bit(i, bitmap);
25957 + else
25958 + for (i = base; i < base + extent; i++)
25959 + clear_bit(i, bitmap);
25960 +}
25961 +
25962 +/*
25963 + * This changes the I/O permission bitmap in the current task.
25964 + */
25965 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
25966 +{
25967 + struct thread_struct * t = &current->thread;
25968 + unsigned long *bitmap;
25969 + struct physdev_set_iobitmap set_iobitmap;
25970 +
25971 + if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
25972 + return -EINVAL;
25973 + if (turn_on && !capable(CAP_SYS_RAWIO))
25974 + return -EPERM;
25975 +
25976 + /*
25977 + * If it's the first ioperm() call in this thread's lifetime, set the
25978 + * IO bitmap up. ioperm() is much less timing critical than clone(),
25979 + * this is why we delay this operation until now:
25980 + */
25981 + if (!t->io_bitmap_ptr) {
25982 + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
25983 + if (!bitmap)
25984 + return -ENOMEM;
25985 +
25986 + memset(bitmap, 0xff, IO_BITMAP_BYTES);
25987 + t->io_bitmap_ptr = bitmap;
25988 +
25989 + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
25990 + set_iobitmap.nr_ports = IO_BITMAP_BITS;
25991 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
25992 + &set_iobitmap));
25993 + }
25994 +
25995 + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
25996 +
25997 + return 0;
25998 +}
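Editor's note: a minimal user-space sketch, not part of the patch, of how the delayed bitmap allocation above is normally triggered. Port 0x378 (legacy parallel port) is only an illustrative choice, and the program needs CAP_SYS_RAWIO:

#include <stdio.h>
#include <sys/io.h>             /* ioperm(), outb() -- glibc, x86 only */

int main(void)
{
        if (ioperm(0x378, 4, 1)) {      /* first call allocates the I/O bitmap */
                perror("ioperm");
                return 1;
        }
        outb(0x00, 0x378);              /* direct port access is now permitted */
        ioperm(0x378, 4, 0);            /* revoke it again */
        return 0;
}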
25999 +
26000 +/*
26001 + * sys_iopl has to be used when you want to access the IO ports
26002 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
26003 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
26004 + *
26005 + */
26006 +
26007 +asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs)
26008 +{
26009 + unsigned int old_iopl = current->thread.iopl;
26010 + struct physdev_set_iopl set_iopl;
26011 +
26012 + if (new_iopl > 3)
26013 + return -EINVAL;
26014 +
26015 + /* Need "raw I/O" privileges for direct port access. */
26016 + if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO))
26017 + return -EPERM;
26018 +
26019 + /* Change our version of the privilege levels. */
26020 + current->thread.iopl = new_iopl;
26021 +
26022 + /* Force the change at ring 0. */
26023 + set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl;
26024 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
26025 +
26026 + return 0;
26027 +}
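Editor's note: the corresponding user-space view of sys_iopl(), illustrative only and not part of the patch. Unlike ioperm() it opens all 65536 ports at once, and on Xen the new level is propagated to the hypervisor through PHYSDEVOP_set_iopl as shown above:

#include <stdio.h>
#include <sys/io.h>             /* iopl() -- glibc, x86 only */

int main(void)
{
        if (iopl(3)) {                  /* needs CAP_SYS_RAWIO */
                perror("iopl");
                return 1;
        }
        /* arbitrary port I/O is allowed from here on */
        iopl(0);                        /* drop the privilege again */
        return 0;
}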
26028 Index: head-2008-11-25/arch/x86/kernel/irq_64-xen.c
26029 ===================================================================
26030 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
26031 +++ head-2008-11-25/arch/x86/kernel/irq_64-xen.c 2008-10-29 09:55:56.000000000 +0100
26032 @@ -0,0 +1,197 @@
26033 +/*
26034 + * linux/arch/x86_64/kernel/irq.c
26035 + *
26036 + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
26037 + *
26038 + * This file contains the lowest level x86_64-specific interrupt
26039 + * entry and irq statistics code. All the remaining irq logic is
26040 + * done by the generic kernel/irq/ code and in the
26041 + * x86_64-specific irq controller code. (e.g. i8259.c and
26042 + * io_apic.c.)
26043 + */
26044 +
26045 +#include <linux/kernel_stat.h>
26046 +#include <linux/interrupt.h>
26047 +#include <linux/seq_file.h>
26048 +#include <linux/module.h>
26049 +#include <linux/delay.h>
26050 +#include <asm/uaccess.h>
26051 +#include <asm/io_apic.h>
26052 +#include <asm/idle.h>
26053 +
26054 +atomic_t irq_err_count;
26055 +#ifdef CONFIG_X86_IO_APIC
26056 +#ifdef APIC_MISMATCH_DEBUG
26057 +atomic_t irq_mis_count;
26058 +#endif
26059 +#endif
26060 +
26061 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
26062 +/*
26063 + * Probabilistic stack overflow check:
26064 + *
26065 + * Only check the stack in process context, because everything else
26066 + * runs on the big interrupt stacks. Checking reliably is too expensive,
26067 + * so we just check from interrupts.
26068 + */
26069 +static inline void stack_overflow_check(struct pt_regs *regs)
26070 +{
26071 + u64 curbase = (u64) current->thread_info;
26072 + static unsigned long warned = -60*HZ;
26073 +
26074 + if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
26075 + regs->rsp < curbase + sizeof(struct thread_info) + 128 &&
26076 + time_after(jiffies, warned + 60*HZ)) {
26077 + printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
26078 + current->comm, curbase, regs->rsp);
26079 + show_stack(NULL,NULL);
26080 + warned = jiffies;
26081 + }
26082 +}
26083 +#endif
26084 +
26085 +/*
26086 + * Generic, controller-independent functions:
26087 + */
26088 +
26089 +int show_interrupts(struct seq_file *p, void *v)
26090 +{
26091 + int i = *(loff_t *) v, j;
26092 + struct irqaction * action;
26093 + unsigned long flags;
26094 +
26095 + if (i == 0) {
26096 + seq_printf(p, " ");
26097 + for_each_online_cpu(j)
26098 + seq_printf(p, "CPU%-8d",j);
26099 + seq_putc(p, '\n');
26100 + }
26101 +
26102 + if (i < NR_IRQS) {
26103 + spin_lock_irqsave(&irq_desc[i].lock, flags);
26104 + action = irq_desc[i].action;
26105 + if (!action)
26106 + goto skip;
26107 + seq_printf(p, "%3d: ",i);
26108 +#ifndef CONFIG_SMP
26109 + seq_printf(p, "%10u ", kstat_irqs(i));
26110 +#else
26111 + for_each_online_cpu(j)
26112 + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
26113 +#endif
26114 + seq_printf(p, " %14s", irq_desc[i].chip->typename);
26115 +
26116 + seq_printf(p, " %s", action->name);
26117 + for (action=action->next; action; action = action->next)
26118 + seq_printf(p, ", %s", action->name);
26119 + seq_putc(p, '\n');
26120 +skip:
26121 + spin_unlock_irqrestore(&irq_desc[i].lock, flags);
26122 + } else if (i == NR_IRQS) {
26123 + seq_printf(p, "NMI: ");
26124 + for_each_online_cpu(j)
26125 + seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
26126 + seq_putc(p, '\n');
26127 +#ifdef CONFIG_X86_LOCAL_APIC
26128 + seq_printf(p, "LOC: ");
26129 + for_each_online_cpu(j)
26130 + seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
26131 + seq_putc(p, '\n');
26132 +#endif
26133 + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
26134 +#ifdef CONFIG_X86_IO_APIC
26135 +#ifdef APIC_MISMATCH_DEBUG
26136 + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
26137 +#endif
26138 +#endif
26139 + }
26140 + return 0;
26141 +}
26142 +
26143 +/*
26144 + * do_IRQ handles all normal device IRQ's (the special
26145 + * SMP cross-CPU interrupts have their own specific
26146 + * handlers).
26147 + */
26148 +asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
26149 +{
26150 + /* high bit used in ret_from_ code */
26151 + unsigned irq = ~regs->orig_rax;
26152 +
26153 + if (unlikely(irq >= NR_IRQS)) {
26154 + printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
26155 + __FUNCTION__, irq);
26156 + BUG();
26157 + }
26158 +
26159 + /*exit_idle();*/
26160 + /*irq_enter();*/
26161 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
26162 + stack_overflow_check(regs);
26163 +#endif
26164 + __do_IRQ(irq, regs);
26165 + /*irq_exit();*/
26166 +
26167 + return 1;
26168 +}
26169 +
26170 +#ifdef CONFIG_HOTPLUG_CPU
26171 +void fixup_irqs(cpumask_t map)
26172 +{
26173 + unsigned int irq;
26174 + static int warned;
26175 +
26176 + for (irq = 0; irq < NR_IRQS; irq++) {
26177 + cpumask_t mask;
26178 + if (irq == 2)
26179 + continue;
26180 +
26181 + cpus_and(mask, irq_desc[irq].affinity, map);
26182 + if (any_online_cpu(mask) == NR_CPUS) {
26183 + /*printk("Breaking affinity for irq %i\n", irq);*/
26184 + mask = map;
26185 + }
26186 + if (irq_desc[irq].chip->set_affinity)
26187 + irq_desc[irq].chip->set_affinity(irq, mask);
26188 + else if (irq_desc[irq].action && !(warned++))
26189 + printk("Cannot set affinity for irq %i\n", irq);
26190 + }
26191 +
26192 + /* That doesn't seem sufficient. Give it 1ms. */
26193 + local_irq_enable();
26194 + mdelay(1);
26195 + local_irq_disable();
26196 +}
26197 +#endif
26198 +
26199 +extern void call_softirq(void);
26200 +
26201 +asmlinkage void do_softirq(void)
26202 +{
26203 + __u32 pending;
26204 + unsigned long flags;
26205 +
26206 + if (in_interrupt())
26207 + return;
26208 +
26209 + local_irq_save(flags);
26210 + pending = local_softirq_pending();
26211 + /* Switch to interrupt stack */
26212 + if (pending) {
26213 + call_softirq();
26214 + WARN_ON_ONCE(softirq_count());
26215 + }
26216 + local_irq_restore(flags);
26217 +}
26218 +EXPORT_SYMBOL(do_softirq);
26219 +
26220 +#ifndef CONFIG_X86_LOCAL_APIC
26221 +/*
26222 + * 'what should we do if we get a hw irq event on an illegal vector'.
26223 + * Each architecture has to answer this itself.
26224 + */
26225 +void ack_bad_irq(unsigned int irq)
26226 +{
26227 + printk("unexpected IRQ trap at vector %02x\n", irq);
26228 +}
26229 +#endif
26230 Index: head-2008-11-25/arch/x86/kernel/ldt_64-xen.c
26231 ===================================================================
26232 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
26233 +++ head-2008-11-25/arch/x86/kernel/ldt_64-xen.c 2007-06-12 13:13:01.000000000 +0200
26234 @@ -0,0 +1,282 @@
26235 +/*
26236 + * linux/arch/x86_64/kernel/ldt.c
26237 + *
26238 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
26239 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
26240 + * Copyright (C) 2002 Andi Kleen
26241 + *
26242 + * This handles calls from both 32bit and 64bit mode.
26243 + */
26244 +
26245 +#include <linux/errno.h>
26246 +#include <linux/sched.h>
26247 +#include <linux/string.h>
26248 +#include <linux/mm.h>
26249 +#include <linux/smp.h>
26250 +#include <linux/smp_lock.h>
26251 +#include <linux/vmalloc.h>
26252 +#include <linux/slab.h>
26253 +
26254 +#include <asm/uaccess.h>
26255 +#include <asm/system.h>
26256 +#include <asm/ldt.h>
26257 +#include <asm/desc.h>
26258 +#include <asm/proto.h>
26259 +#include <asm/pgalloc.h>
26260 +
26261 +#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
26262 +static void flush_ldt(void *null)
26263 +{
26264 + if (current->active_mm)
26265 + load_LDT(&current->active_mm->context);
26266 +}
26267 +#endif
26268 +
26269 +static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
26270 +{
26271 + void *oldldt;
26272 + void *newldt;
26273 + unsigned oldsize;
26274 +
26275 + if (mincount <= (unsigned)pc->size)
26276 + return 0;
26277 + oldsize = pc->size;
26278 + mincount = (mincount+511)&(~511);
26279 + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
26280 + newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
26281 + else
26282 + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
26283 +
26284 + if (!newldt)
26285 + return -ENOMEM;
26286 +
26287 + if (oldsize)
26288 + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
26289 + oldldt = pc->ldt;
26290 + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
26291 + wmb();
26292 + pc->ldt = newldt;
26293 + wmb();
26294 + pc->size = mincount;
26295 + wmb();
26296 + if (reload) {
26297 +#ifdef CONFIG_SMP
26298 + cpumask_t mask;
26299 +
26300 + preempt_disable();
26301 +#endif
26302 + make_pages_readonly(
26303 + pc->ldt,
26304 + (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
26305 + XENFEAT_writable_descriptor_tables);
26306 + load_LDT(pc);
26307 +#ifdef CONFIG_SMP
26308 + mask = cpumask_of_cpu(smp_processor_id());
26309 + if (!cpus_equal(current->mm->cpu_vm_mask, mask))
26310 + smp_call_function(flush_ldt, NULL, 1, 1);
26311 + preempt_enable();
26312 +#endif
26313 + }
26314 + if (oldsize) {
26315 + make_pages_writable(
26316 + oldldt,
26317 + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
26318 + XENFEAT_writable_descriptor_tables);
26319 + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
26320 + vfree(oldldt);
26321 + else
26322 + kfree(oldldt);
26323 + }
26324 + return 0;
26325 +}
26326 +
26327 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
26328 +{
26329 + int err = alloc_ldt(new, old->size, 0);
26330 + if (err < 0)
26331 + return err;
26332 + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
26333 + make_pages_readonly(
26334 + new->ldt,
26335 + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
26336 + XENFEAT_writable_descriptor_tables);
26337 + return 0;
26338 +}
26339 +
26340 +/*
26341 + * we do not have to muck with descriptors here, that is
26342 + * done in switch_mm() as needed.
26343 + */
26344 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
26345 +{
26346 + struct mm_struct * old_mm;
26347 + int retval = 0;
26348 +
26349 + memset(&mm->context, 0, sizeof(mm->context));
26350 + init_MUTEX(&mm->context.sem);
26351 + old_mm = current->mm;
26352 + if (old_mm && old_mm->context.size > 0) {
26353 + down(&old_mm->context.sem);
26354 + retval = copy_ldt(&mm->context, &old_mm->context);
26355 + up(&old_mm->context.sem);
26356 + }
26357 + if (retval == 0) {
26358 + spin_lock(&mm_unpinned_lock);
26359 + list_add(&mm->context.unpinned, &mm_unpinned);
26360 + spin_unlock(&mm_unpinned_lock);
26361 + }
26362 + return retval;
26363 +}
26364 +
26365 +/*
26366 + *
26367 + * Don't touch the LDT register - we're already in the next thread.
26368 + */
26369 +void destroy_context(struct mm_struct *mm)
26370 +{
26371 + if (mm->context.size) {
26372 + if (mm == current->active_mm)
26373 + clear_LDT();
26374 + make_pages_writable(
26375 + mm->context.ldt,
26376 + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
26377 + XENFEAT_writable_descriptor_tables);
26378 + if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
26379 + vfree(mm->context.ldt);
26380 + else
26381 + kfree(mm->context.ldt);
26382 + mm->context.size = 0;
26383 + }
26384 + if (!mm->context.pinned) {
26385 + spin_lock(&mm_unpinned_lock);
26386 + list_del(&mm->context.unpinned);
26387 + spin_unlock(&mm_unpinned_lock);
26388 + }
26389 +}
26390 +
26391 +static int read_ldt(void __user * ptr, unsigned long bytecount)
26392 +{
26393 + int err;
26394 + unsigned long size;
26395 + struct mm_struct * mm = current->mm;
26396 +
26397 + if (!mm->context.size)
26398 + return 0;
26399 + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
26400 + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
26401 +
26402 + down(&mm->context.sem);
26403 + size = mm->context.size*LDT_ENTRY_SIZE;
26404 + if (size > bytecount)
26405 + size = bytecount;
26406 +
26407 + err = 0;
26408 + if (copy_to_user(ptr, mm->context.ldt, size))
26409 + err = -EFAULT;
26410 + up(&mm->context.sem);
26411 + if (err < 0)
26412 + goto error_return;
26413 + if (size != bytecount) {
26414 + /* zero-fill the rest */
26415 + if (clear_user(ptr+size, bytecount-size) != 0) {
26416 + err = -EFAULT;
26417 + goto error_return;
26418 + }
26419 + }
26420 + return bytecount;
26421 +error_return:
26422 + return err;
26423 +}
26424 +
26425 +static int read_default_ldt(void __user * ptr, unsigned long bytecount)
26426 +{
26427 + /* Arbitrary number */
26428 + /* x86-64 default LDT is all zeros */
26429 + if (bytecount > 128)
26430 + bytecount = 128;
26431 + if (clear_user(ptr, bytecount))
26432 + return -EFAULT;
26433 + return bytecount;
26434 +}
26435 +
26436 +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
26437 +{
26438 + struct task_struct *me = current;
26439 + struct mm_struct * mm = me->mm;
26440 + __u32 entry_1, entry_2, *lp;
26441 + unsigned long mach_lp;
26442 + int error;
26443 + struct user_desc ldt_info;
26444 +
26445 + error = -EINVAL;
26446 +
26447 + if (bytecount != sizeof(ldt_info))
26448 + goto out;
26449 + error = -EFAULT;
26450 + if (copy_from_user(&ldt_info, ptr, bytecount))
26451 + goto out;
26452 +
26453 + error = -EINVAL;
26454 + if (ldt_info.entry_number >= LDT_ENTRIES)
26455 + goto out;
26456 + if (ldt_info.contents == 3) {
26457 + if (oldmode)
26458 + goto out;
26459 + if (ldt_info.seg_not_present == 0)
26460 + goto out;
26461 + }
26462 +
26463 + down(&mm->context.sem);
26464 + if (ldt_info.entry_number >= (unsigned)mm->context.size) {
26465 + error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
26466 + if (error < 0)
26467 + goto out_unlock;
26468 + }
26469 +
26470 + lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
26471 + mach_lp = arbitrary_virt_to_machine(lp);
26472 +
26473 + /* Allow LDTs to be cleared by the user. */
26474 + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
26475 + if (oldmode || LDT_empty(&ldt_info)) {
26476 + entry_1 = 0;
26477 + entry_2 = 0;
26478 + goto install;
26479 + }
26480 + }
26481 +
26482 + entry_1 = LDT_entry_a(&ldt_info);
26483 + entry_2 = LDT_entry_b(&ldt_info);
26484 + if (oldmode)
26485 + entry_2 &= ~(1 << 20);
26486 +
26487 + /* Install the new entry ... */
26488 +install:
26489 + error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
26490 +
26491 +out_unlock:
26492 + up(&mm->context.sem);
26493 +out:
26494 + return error;
26495 +}
26496 +
26497 +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
26498 +{
26499 + int ret = -ENOSYS;
26500 +
26501 + switch (func) {
26502 + case 0:
26503 + ret = read_ldt(ptr, bytecount);
26504 + break;
26505 + case 1:
26506 + ret = write_ldt(ptr, bytecount, 1);
26507 + break;
26508 + case 2:
26509 + ret = read_default_ldt(ptr, bytecount);
26510 + break;
26511 + case 0x11:
26512 + ret = write_ldt(ptr, bytecount, 0);
26513 + break;
26514 + }
26515 + return ret;
26516 +}
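Editor's note: a hedged user-space sketch, not part of the patch, of the func == 0 read path above, issued through the raw syscall because glibc provides no modify_ldt() wrapper:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/ldt.h>            /* LDT_ENTRY_SIZE */

int main(void)
{
        unsigned char buf[LDT_ENTRY_SIZE * 16];
        long n = syscall(SYS_modify_ldt, 0, buf, sizeof(buf));

        /* read_ldt() returns the number of bytes copied; 0 for an empty LDT */
        printf("read %ld bytes of LDT\n", n);
        return 0;
}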
26517 Index: head-2008-11-25/arch/x86/kernel/mpparse_64-xen.c
26518 ===================================================================
26519 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
26520 +++ head-2008-11-25/arch/x86/kernel/mpparse_64-xen.c 2007-06-12 13:13:01.000000000 +0200
26521 @@ -0,0 +1,1011 @@
26522 +/*
26523 + * Intel Multiprocessor Specification 1.1 and 1.4
26524 + * compliant MP-table parsing routines.
26525 + *
26526 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
26527 + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
26528 + *
26529 + * Fixes
26530 + * Erich Boleyn : MP v1.4 and additional changes.
26531 + * Alan Cox : Added EBDA scanning
26532 + * Ingo Molnar : various cleanups and rewrites
26533 + * Maciej W. Rozycki: Bits for default MP configurations
26534 + * Paul Diefenbaugh: Added full ACPI support
26535 + */
26536 +
26537 +#include <linux/mm.h>
26538 +#include <linux/init.h>
26539 +#include <linux/delay.h>
26540 +#include <linux/bootmem.h>
26541 +#include <linux/smp_lock.h>
26542 +#include <linux/kernel_stat.h>
26543 +#include <linux/mc146818rtc.h>
26544 +#include <linux/acpi.h>
26545 +#include <linux/module.h>
26546 +
26547 +#include <asm/smp.h>
26548 +#include <asm/mtrr.h>
26549 +#include <asm/mpspec.h>
26550 +#include <asm/pgalloc.h>
26551 +#include <asm/io_apic.h>
26552 +#include <asm/proto.h>
26553 +#include <asm/acpi.h>
26554 +
26555 +/* Have we found an MP table */
26556 +int smp_found_config;
26557 +unsigned int __initdata maxcpus = NR_CPUS;
26558 +
26559 +int acpi_found_madt;
26560 +
26561 +/*
26562 + * Various Linux-internal data structures created from the
26563 + * MP-table.
26564 + */
26565 +unsigned char apic_version [MAX_APICS];
26566 +unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
26567 +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
26568 +
26569 +static int mp_current_pci_id = 0;
26570 +/* I/O APIC entries */
26571 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
26572 +
26573 +/* # of MP IRQ source entries */
26574 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
26575 +
26576 +/* MP IRQ source entries */
26577 +int mp_irq_entries;
26578 +
26579 +int nr_ioapics;
26580 +int pic_mode;
26581 +unsigned long mp_lapic_addr = 0;
26582 +
26583 +
26584 +
26585 +/* Processor that is doing the boot up */
26586 +unsigned int boot_cpu_id = -1U;
26587 +/* Internal processor count */
26588 +unsigned int num_processors __initdata = 0;
26589 +
26590 +unsigned disabled_cpus __initdata;
26591 +
26592 +/* Bitmask of physically existing CPUs */
26593 +physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
26594 +
26595 +/* ACPI MADT entry parsing functions */
26596 +#ifdef CONFIG_ACPI
26597 +extern struct acpi_boot_flags acpi_boot;
26598 +#ifdef CONFIG_X86_LOCAL_APIC
26599 +extern int acpi_parse_lapic (acpi_table_entry_header *header);
26600 +extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
26601 +extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
26602 +#endif /*CONFIG_X86_LOCAL_APIC*/
26603 +#ifdef CONFIG_X86_IO_APIC
26604 +extern int acpi_parse_ioapic (acpi_table_entry_header *header);
26605 +#endif /*CONFIG_X86_IO_APIC*/
26606 +#endif /*CONFIG_ACPI*/
26607 +
26608 +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
26609 +
26610 +
26611 +/*
26612 + * Intel MP BIOS table parsing routines:
26613 + */
26614 +
26615 +/*
26616 + * Checksum an MP configuration block.
26617 + */
26618 +
26619 +static int __init mpf_checksum(unsigned char *mp, int len)
26620 +{
26621 + int sum = 0;
26622 +
26623 + while (len--)
26624 + sum += *mp++;
26625 +
26626 + return sum & 0xFF;
26627 +}
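Editor's note: mpf_checksum() relies on the MP specification rule that all bytes of a valid configuration table sum to zero modulo 256. A tiny stand-alone sketch with made-up data, not part of the patch:

#include <stdio.h>

static int checksum(const unsigned char *p, int len)
{
        int sum = 0;

        while (len--)
                sum += *p++;
        return sum & 0xFF;              /* 0 means the table is consistent */
}

int main(void)
{
        unsigned char tbl[4] = { 0x12, 0x34, 0x56, 0x00 };

        tbl[3] = (unsigned char)(0x100 - ((0x12 + 0x34 + 0x56) & 0xFF));
        printf("checksum = %d\n", checksum(tbl, 4));    /* prints 0 */
        return 0;
}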
26628 +
26629 +#ifndef CONFIG_XEN
26630 +static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
26631 +{
26632 + int cpu;
26633 + unsigned char ver;
26634 + cpumask_t tmp_map;
26635 +
26636 + if (!(m->mpc_cpuflag & CPU_ENABLED)) {
26637 + disabled_cpus++;
26638 + return;
26639 + }
26640 +
26641 + printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
26642 + m->mpc_apicid,
26643 + (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
26644 + (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
26645 + m->mpc_apicver);
26646 +
26647 + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
26648 + Dprintk(" Bootup CPU\n");
26649 + boot_cpu_id = m->mpc_apicid;
26650 + }
26651 + if (num_processors >= NR_CPUS) {
26652 + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
26653 + " Processor ignored.\n", NR_CPUS);
26654 + return;
26655 + }
26656 +
26657 + num_processors++;
26658 + cpus_complement(tmp_map, cpu_present_map);
26659 + cpu = first_cpu(tmp_map);
26660 +
26661 +#if MAX_APICS < 255
26662 + if ((int)m->mpc_apicid > MAX_APICS) {
26663 + printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
26664 + m->mpc_apicid, MAX_APICS);
26665 + return;
26666 + }
26667 +#endif
26668 + ver = m->mpc_apicver;
26669 +
26670 + physid_set(m->mpc_apicid, phys_cpu_present_map);
26671 + /*
26672 + * Validate version
26673 + */
26674 + if (ver == 0x0) {
26675 + printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
26676 + ver = 0x10;
26677 + }
26678 + apic_version[m->mpc_apicid] = ver;
26679 + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
26680 + /*
26681 + * bios_cpu_apicid is required to have processors listed
26682 + * in same order as logical cpu numbers. Hence the first
26683 + * entry is BSP, and so on.
26684 + */
26685 + cpu = 0;
26686 + }
26687 + bios_cpu_apicid[cpu] = m->mpc_apicid;
26688 + x86_cpu_to_apicid[cpu] = m->mpc_apicid;
26689 +
26690 + cpu_set(cpu, cpu_possible_map);
26691 + cpu_set(cpu, cpu_present_map);
26692 +}
26693 +#else
26694 +static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
26695 +{
26696 + num_processors++;
26697 +}
26698 +#endif /* CONFIG_XEN */
26699 +
26700 +static void __init MP_bus_info (struct mpc_config_bus *m)
26701 +{
26702 + char str[7];
26703 +
26704 + memcpy(str, m->mpc_bustype, 6);
26705 + str[6] = 0;
26706 + Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
26707 +
26708 + if (strncmp(str, "ISA", 3) == 0) {
26709 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
26710 + } else if (strncmp(str, "EISA", 4) == 0) {
26711 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
26712 + } else if (strncmp(str, "PCI", 3) == 0) {
26713 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
26714 + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
26715 + mp_current_pci_id++;
26716 + } else if (strncmp(str, "MCA", 3) == 0) {
26717 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
26718 + } else {
26719 + printk(KERN_ERR "Unknown bustype %s\n", str);
26720 + }
26721 +}
26722 +
26723 +static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
26724 +{
26725 + if (!(m->mpc_flags & MPC_APIC_USABLE))
26726 + return;
26727 +
26728 + printk("I/O APIC #%d Version %d at 0x%X.\n",
26729 + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
26730 + if (nr_ioapics >= MAX_IO_APICS) {
26731 + printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
26732 + MAX_IO_APICS, nr_ioapics);
26733 + panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
26734 + }
26735 + if (!m->mpc_apicaddr) {
26736 + printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
26737 + " found in MP table, skipping!\n");
26738 + return;
26739 + }
26740 + mp_ioapics[nr_ioapics] = *m;
26741 + nr_ioapics++;
26742 +}
26743 +
26744 +static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
26745 +{
26746 + mp_irqs [mp_irq_entries] = *m;
26747 + Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
26748 + " IRQ %02x, APIC ID %x, APIC INT %02x\n",
26749 + m->mpc_irqtype, m->mpc_irqflag & 3,
26750 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
26751 + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
26752 + if (++mp_irq_entries >= MAX_IRQ_SOURCES)
26753 + panic("Max # of irq sources exceeded!!\n");
26754 +}
26755 +
26756 +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
26757 +{
26758 + Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
26759 + " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
26760 + m->mpc_irqtype, m->mpc_irqflag & 3,
26761 + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
26762 + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
26763 + /*
26764 + * Well it seems all SMP boards in existence
26765 + * use ExtINT/LVT1 == LINT0 and
26766 + * NMI/LVT2 == LINT1 - the following check
26767 + * will show us if this assumptions is false.
26768 + * Until then we do not have to add baggage.
26769 + */
26770 + if ((m->mpc_irqtype == mp_ExtINT) &&
26771 + (m->mpc_destapiclint != 0))
26772 + BUG();
26773 + if ((m->mpc_irqtype == mp_NMI) &&
26774 + (m->mpc_destapiclint != 1))
26775 + BUG();
26776 +}
26777 +
26778 +/*
26779 + * Read/parse the MPC
26780 + */
26781 +
26782 +static int __init smp_read_mpc(struct mp_config_table *mpc)
26783 +{
26784 + char str[16];
26785 + int count=sizeof(*mpc);
26786 + unsigned char *mpt=((unsigned char *)mpc)+count;
26787 +
26788 + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
26789 + printk("SMP mptable: bad signature [%c%c%c%c]!\n",
26790 + mpc->mpc_signature[0],
26791 + mpc->mpc_signature[1],
26792 + mpc->mpc_signature[2],
26793 + mpc->mpc_signature[3]);
26794 + return 0;
26795 + }
26796 + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
26797 + printk("SMP mptable: checksum error!\n");
26798 + return 0;
26799 + }
26800 + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
26801 + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
26802 + mpc->mpc_spec);
26803 + return 0;
26804 + }
26805 + if (!mpc->mpc_lapic) {
26806 + printk(KERN_ERR "SMP mptable: null local APIC address!\n");
26807 + return 0;
26808 + }
26809 + memcpy(str,mpc->mpc_oem,8);
26810 + str[8]=0;
26811 + printk(KERN_INFO "OEM ID: %s ",str);
26812 +
26813 + memcpy(str,mpc->mpc_productid,12);
26814 + str[12]=0;
26815 + printk("Product ID: %s ",str);
26816 +
26817 + printk("APIC at: 0x%X\n",mpc->mpc_lapic);
26818 +
26819 + /* save the local APIC address, it might be non-default */
26820 + if (!acpi_lapic)
26821 + mp_lapic_addr = mpc->mpc_lapic;
26822 +
26823 + /*
26824 + * Now process the configuration blocks.
26825 + */
26826 + while (count < mpc->mpc_length) {
26827 + switch(*mpt) {
26828 + case MP_PROCESSOR:
26829 + {
26830 + struct mpc_config_processor *m=
26831 + (struct mpc_config_processor *)mpt;
26832 + if (!acpi_lapic)
26833 + MP_processor_info(m);
26834 + mpt += sizeof(*m);
26835 + count += sizeof(*m);
26836 + break;
26837 + }
26838 + case MP_BUS:
26839 + {
26840 + struct mpc_config_bus *m=
26841 + (struct mpc_config_bus *)mpt;
26842 + MP_bus_info(m);
26843 + mpt += sizeof(*m);
26844 + count += sizeof(*m);
26845 + break;
26846 + }
26847 + case MP_IOAPIC:
26848 + {
26849 + struct mpc_config_ioapic *m=
26850 + (struct mpc_config_ioapic *)mpt;
26851 + MP_ioapic_info(m);
26852 + mpt+=sizeof(*m);
26853 + count+=sizeof(*m);
26854 + break;
26855 + }
26856 + case MP_INTSRC:
26857 + {
26858 + struct mpc_config_intsrc *m=
26859 + (struct mpc_config_intsrc *)mpt;
26860 +
26861 + MP_intsrc_info(m);
26862 + mpt+=sizeof(*m);
26863 + count+=sizeof(*m);
26864 + break;
26865 + }
26866 + case MP_LINTSRC:
26867 + {
26868 + struct mpc_config_lintsrc *m=
26869 + (struct mpc_config_lintsrc *)mpt;
26870 + MP_lintsrc_info(m);
26871 + mpt+=sizeof(*m);
26872 + count+=sizeof(*m);
26873 + break;
26874 + }
26875 + }
26876 + }
26877 + clustered_apic_check();
26878 + if (!num_processors)
26879 + printk(KERN_ERR "SMP mptable: no processors registered!\n");
26880 + return num_processors;
26881 +}
26882 +
26883 +static int __init ELCR_trigger(unsigned int irq)
26884 +{
26885 + unsigned int port;
26886 +
26887 + port = 0x4d0 + (irq >> 3);
26888 + return (inb(port) >> (irq & 7)) & 1;
26889 +}
26890 +
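ELCR_trigger() above reads one bit per ISA IRQ out of the two ELCR registers at I/O ports 0x4d0 and 0x4d1: the port index is irq >> 3 and the bit is irq & 7, with a set bit meaning level-triggered. A small user-space sketch of the same bit selection, operating on a saved copy of the registers instead of inb() (the register contents are made up for illustration):

#include <stdio.h>

/* Same port/bit selection as ELCR_trigger(), but over a saved copy of the
 * two 8-bit ELCR registers rather than the real hardware ports. */
static int elcr_trigger(const unsigned char elcr[2], unsigned int irq)
{
	return (elcr[irq >> 3] >> (irq & 7)) & 1;	/* 1 = level, 0 = edge */
}

int main(void)
{
	/* Hypothetical register contents: IRQ 5 and IRQ 10 level triggered. */
	unsigned char elcr[2] = { 1 << 5, 1 << 2 };
	unsigned int irq;

	for (irq = 0; irq < 16; irq++)
		printf("IRQ%2u: %s\n", irq, elcr_trigger(elcr, irq) ? "level" : "edge");
	return 0;
}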
26891 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
26892 +{
26893 + struct mpc_config_intsrc intsrc;
26894 + int i;
26895 + int ELCR_fallback = 0;
26896 +
26897 + intsrc.mpc_type = MP_INTSRC;
26898 + intsrc.mpc_irqflag = 0; /* conforming */
26899 + intsrc.mpc_srcbus = 0;
26900 + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
26901 +
26902 + intsrc.mpc_irqtype = mp_INT;
26903 +
26904 + /*
26905 + * If true, we have an ISA/PCI system with no IRQ entries
26906 + * in the MP table. To prevent the PCI interrupts from being set up
26907 + * incorrectly, we try to use the ELCR. The sanity check to see if
26908 + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
26909 + * never be level sensitive, so we simply see if the ELCR agrees.
26910 + * If it does, we assume it's valid.
26911 + */
26912 + if (mpc_default_type == 5) {
26913 + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
26914 +
26915 + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
26916 + printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
26917 + else {
26918 + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
26919 + ELCR_fallback = 1;
26920 + }
26921 + }
26922 +
26923 + for (i = 0; i < 16; i++) {
26924 + switch (mpc_default_type) {
26925 + case 2:
26926 + if (i == 0 || i == 13)
26927 + continue; /* IRQ0 & IRQ13 not connected */
26928 + /* fall through */
26929 + default:
26930 + if (i == 2)
26931 + continue; /* IRQ2 is never connected */
26932 + }
26933 +
26934 + if (ELCR_fallback) {
26935 + /*
26936 + * If the ELCR indicates a level-sensitive interrupt, we
26937 + * copy that information over to the MP table in the
26938 + * irqflag field (level sensitive, active high polarity).
26939 + */
26940 + if (ELCR_trigger(i))
26941 + intsrc.mpc_irqflag = 13;
26942 + else
26943 + intsrc.mpc_irqflag = 0;
26944 + }
26945 +
26946 + intsrc.mpc_srcbusirq = i;
26947 + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
26948 + MP_intsrc_info(&intsrc);
26949 + }
26950 +
26951 + intsrc.mpc_irqtype = mp_ExtINT;
26952 + intsrc.mpc_srcbusirq = 0;
26953 + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
26954 + MP_intsrc_info(&intsrc);
26955 +}
26956 +
26957 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
26958 +{
26959 + struct mpc_config_processor processor;
26960 + struct mpc_config_bus bus;
26961 + struct mpc_config_ioapic ioapic;
26962 + struct mpc_config_lintsrc lintsrc;
26963 + int linttypes[2] = { mp_ExtINT, mp_NMI };
26964 + int i;
26965 +
26966 + /*
26967 + * local APIC has default address
26968 + */
26969 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
26970 +
26971 + /*
26972 + * 2 CPUs, numbered 0 & 1.
26973 + */
26974 + processor.mpc_type = MP_PROCESSOR;
26975 + /* Either an integrated APIC or a discrete 82489DX. */
26976 + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
26977 + processor.mpc_cpuflag = CPU_ENABLED;
26978 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
26979 + (boot_cpu_data.x86_model << 4) |
26980 + boot_cpu_data.x86_mask;
26981 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
26982 + processor.mpc_reserved[0] = 0;
26983 + processor.mpc_reserved[1] = 0;
26984 + for (i = 0; i < 2; i++) {
26985 + processor.mpc_apicid = i;
26986 + MP_processor_info(&processor);
26987 + }
26988 +
26989 + bus.mpc_type = MP_BUS;
26990 + bus.mpc_busid = 0;
26991 + switch (mpc_default_type) {
26992 + default:
26993 + printk(KERN_ERR "???\nUnknown standard configuration %d\n",
26994 + mpc_default_type);
26995 + /* fall through */
26996 + case 1:
26997 + case 5:
26998 + memcpy(bus.mpc_bustype, "ISA ", 6);
26999 + break;
27000 + case 2:
27001 + case 6:
27002 + case 3:
27003 + memcpy(bus.mpc_bustype, "EISA ", 6);
27004 + break;
27005 + case 4:
27006 + case 7:
27007 + memcpy(bus.mpc_bustype, "MCA ", 6);
27008 + }
27009 + MP_bus_info(&bus);
27010 + if (mpc_default_type > 4) {
27011 + bus.mpc_busid = 1;
27012 + memcpy(bus.mpc_bustype, "PCI ", 6);
27013 + MP_bus_info(&bus);
27014 + }
27015 +
27016 + ioapic.mpc_type = MP_IOAPIC;
27017 + ioapic.mpc_apicid = 2;
27018 + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
27019 + ioapic.mpc_flags = MPC_APIC_USABLE;
27020 + ioapic.mpc_apicaddr = 0xFEC00000;
27021 + MP_ioapic_info(&ioapic);
27022 +
27023 + /*
27024 + * We set up most of the low 16 IO-APIC pins according to MPS rules.
27025 + */
27026 + construct_default_ioirq_mptable(mpc_default_type);
27027 +
27028 + lintsrc.mpc_type = MP_LINTSRC;
27029 + lintsrc.mpc_irqflag = 0; /* conforming */
27030 + lintsrc.mpc_srcbusid = 0;
27031 + lintsrc.mpc_srcbusirq = 0;
27032 + lintsrc.mpc_destapic = MP_APIC_ALL;
27033 + for (i = 0; i < 2; i++) {
27034 + lintsrc.mpc_irqtype = linttypes[i];
27035 + lintsrc.mpc_destapiclint = i;
27036 + MP_lintsrc_info(&lintsrc);
27037 + }
27038 +}
27039 +
27040 +static struct intel_mp_floating *mpf_found;
27041 +
27042 +/*
27043 + * Scan the memory blocks for an SMP configuration block.
27044 + */
27045 +void __init get_smp_config (void)
27046 +{
27047 + struct intel_mp_floating *mpf = mpf_found;
27048 +
27049 + /*
27050 + * ACPI supports both logical (e.g. Hyper-Threading) and physical
27051 +	 * processors, whereas MPS only supports physical.
27052 + */
27053 + if (acpi_lapic && acpi_ioapic) {
27054 + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
27055 + return;
27056 + }
27057 + else if (acpi_lapic)
27058 + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
27059 +
27060 + printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
27061 + if (mpf->mpf_feature2 & (1<<7)) {
27062 + printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
27063 + pic_mode = 1;
27064 + } else {
27065 + printk(KERN_INFO " Virtual Wire compatibility mode.\n");
27066 + pic_mode = 0;
27067 + }
27068 +
27069 + /*
27070 + * Now see if we need to read further.
27071 + */
27072 + if (mpf->mpf_feature1 != 0) {
27073 +
27074 + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
27075 + construct_default_ISA_mptable(mpf->mpf_feature1);
27076 +
27077 + } else if (mpf->mpf_physptr) {
27078 +
27079 + /*
27080 + * Read the physical hardware table. Anything here will
27081 + * override the defaults.
27082 + */
27083 + if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
27084 + smp_found_config = 0;
27085 + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
27086 + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
27087 + return;
27088 + }
27089 + /*
27090 + * If there are no explicit MP IRQ entries, then we are
27091 + * broken. We set up most of the low 16 IO-APIC pins to
27092 + * ISA defaults and hope it will work.
27093 + */
27094 + if (!mp_irq_entries) {
27095 + struct mpc_config_bus bus;
27096 +
27097 + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
27098 +
27099 + bus.mpc_type = MP_BUS;
27100 + bus.mpc_busid = 0;
27101 + memcpy(bus.mpc_bustype, "ISA ", 6);
27102 + MP_bus_info(&bus);
27103 +
27104 + construct_default_ioirq_mptable(0);
27105 + }
27106 +
27107 + } else
27108 + BUG();
27109 +
27110 + printk(KERN_INFO "Processors: %d\n", num_processors);
27111 + /*
27112 + * Only use the first configuration found.
27113 + */
27114 +}
27115 +
27116 +static int __init smp_scan_config (unsigned long base, unsigned long length)
27117 +{
27118 + extern void __bad_mpf_size(void);
27119 + unsigned int *bp = isa_bus_to_virt(base);
27120 + struct intel_mp_floating *mpf;
27121 +
27122 + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
27123 + if (sizeof(*mpf) != 16)
27124 + __bad_mpf_size();
27125 +
27126 + while (length > 0) {
27127 + mpf = (struct intel_mp_floating *)bp;
27128 + if ((*bp == SMP_MAGIC_IDENT) &&
27129 + (mpf->mpf_length == 1) &&
27130 + !mpf_checksum((unsigned char *)bp, 16) &&
27131 + ((mpf->mpf_specification == 1)
27132 + || (mpf->mpf_specification == 4)) ) {
27133 +
27134 + smp_found_config = 1;
27135 + mpf_found = mpf;
27136 + return 1;
27137 + }
27138 + bp += 4;
27139 + length -= 16;
27140 + }
27141 + return 0;
27142 +}
27143 +
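smp_scan_config() above walks a memory range in 16-byte steps looking for the MP floating pointer structure: the "_MP_" signature, a length of one 16-byte unit, a spec revision of 1 or 4, and a byte checksum of zero. A stand-alone sketch of that scan over synthetic data (the offsets used for the length, revision and checksum bytes follow the MP 1.4 layout, but the buffer and values are purely illustrative):

#include <stdio.h>
#include <string.h>

/* Sum of all bytes modulo 256; zero means the checksum is valid,
 * matching the sense of the mpf_checksum() check above. */
static unsigned char byte_sum(const unsigned char *p, int len)
{
	unsigned char sum = 0;

	while (len--)
		sum += *p++;
	return sum;
}

int main(void)
{
	unsigned char area[64] = { 0 };
	unsigned char *mpf = area + 32;
	int off;

	memcpy(mpf, "_MP_", 4);				/* signature */
	mpf[8] = 1;					/* length: 1 (16-byte units) */
	mpf[9] = 4;					/* specification: MP 1.4 */
	mpf[10] = (unsigned char)-byte_sum(mpf, 16);	/* fix up checksum byte */

	for (off = 0; off + 16 <= (int)sizeof(area); off += 16)
		if (!memcmp(area + off, "_MP_", 4) && !byte_sum(area + off, 16))
			printf("floating pointer found at offset %d\n", off);
	return 0;
}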
27144 +void __init find_intel_smp (void)
27145 +{
27146 + unsigned int address;
27147 +
27148 + /*
27149 +	 * FIXME: Linux assumes you have 640K of base RAM..
27150 + * this continues the error...
27151 + *
27152 + * 1) Scan the bottom 1K for a signature
27153 + * 2) Scan the top 1K of base RAM
27154 +	 * 3) Scan the 64K of BIOS
27155 + */
27156 + if (smp_scan_config(0x0,0x400) ||
27157 + smp_scan_config(639*0x400,0x400) ||
27158 + smp_scan_config(0xF0000,0x10000))
27159 + return;
27160 + /*
27161 + * If it is an SMP machine we should know now, unless the
27162 + * configuration is in an EISA/MCA bus machine with an
27163 +	 * extended BIOS data area.
27164 + *
27165 +	 * There is a real-mode segmented pointer pointing to the
27166 +	 * 4K EBDA area at 0x40E; calculate and scan it here.
27167 + *
27168 + * NOTE! There are Linux loaders that will corrupt the EBDA
27169 + * area, and as such this kind of SMP config may be less
27170 + * trustworthy, simply because the SMP table may have been
27171 + * stomped on during early boot. These loaders are buggy and
27172 + * should be fixed.
27173 + */
27174 +
27175 + address = *(unsigned short *)phys_to_virt(0x40E);
27176 + address <<= 4;
27177 + if (smp_scan_config(address, 0x1000))
27178 + return;
27179 +
27180 + /* If we have come this far, we did not find an MP table */
27181 + printk(KERN_INFO "No mptable found.\n");
27182 +}
27183 +
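The EBDA lookup above reads a 16-bit real-mode segment from physical address 0x40E and shifts it left by four to get the physical base of the extended BIOS data area. A tiny sketch of that conversion with a made-up segment value:

#include <stdio.h>

int main(void)
{
	unsigned short ebda_segment = 0x9FC0;	/* hypothetical BIOS value */
	unsigned long ebda_phys = (unsigned long)ebda_segment << 4;

	printf("EBDA at physical 0x%lx, scan 0x1000 bytes from there\n", ebda_phys);
	return 0;
}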
27184 +/*
27185 + * - Intel MP Configuration Table
27186 + */
27187 +void __init find_smp_config (void)
27188 +{
27189 +#ifdef CONFIG_X86_LOCAL_APIC
27190 + find_intel_smp();
27191 +#endif
27192 +}
27193 +
27194 +
27195 +/* --------------------------------------------------------------------------
27196 + ACPI-based MP Configuration
27197 + -------------------------------------------------------------------------- */
27198 +
27199 +#ifdef CONFIG_ACPI
27200 +
27201 +void __init mp_register_lapic_address (
27202 + u64 address)
27203 +{
27204 +#ifndef CONFIG_XEN
27205 + mp_lapic_addr = (unsigned long) address;
27206 +
27207 + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
27208 +
27209 + if (boot_cpu_id == -1U)
27210 + boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
27211 +
27212 + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
27213 +#endif
27214 +}
27215 +
27216 +
27217 +void __cpuinit mp_register_lapic (
27218 + u8 id,
27219 + u8 enabled)
27220 +{
27221 + struct mpc_config_processor processor;
27222 + int boot_cpu = 0;
27223 +
27224 + if (id >= MAX_APICS) {
27225 + printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
27226 + id, MAX_APICS);
27227 + return;
27228 + }
27229 +
27230 + if (id == boot_cpu_physical_apicid)
27231 + boot_cpu = 1;
27232 +
27233 +#ifndef CONFIG_XEN
27234 + processor.mpc_type = MP_PROCESSOR;
27235 + processor.mpc_apicid = id;
27236 + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
27237 + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
27238 + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
27239 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
27240 + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
27241 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
27242 + processor.mpc_reserved[0] = 0;
27243 + processor.mpc_reserved[1] = 0;
27244 +#endif
27245 +
27246 + MP_processor_info(&processor);
27247 +}
27248 +
27249 +#ifdef CONFIG_X86_IO_APIC
27250 +
27251 +#define MP_ISA_BUS 0
27252 +#define MP_MAX_IOAPIC_PIN 127
27253 +
27254 +static struct mp_ioapic_routing {
27255 + int apic_id;
27256 + int gsi_start;
27257 + int gsi_end;
27258 + u32 pin_programmed[4];
27259 +} mp_ioapic_routing[MAX_IO_APICS];
27260 +
27261 +
27262 +static int mp_find_ioapic (
27263 + int gsi)
27264 +{
27265 + int i = 0;
27266 +
27267 + /* Find the IOAPIC that manages this GSI. */
27268 + for (i = 0; i < nr_ioapics; i++) {
27269 + if ((gsi >= mp_ioapic_routing[i].gsi_start)
27270 + && (gsi <= mp_ioapic_routing[i].gsi_end))
27271 + return i;
27272 + }
27273 +
27274 + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
27275 +
27276 + return -1;
27277 +}
27278 +
27279 +
27280 +void __init mp_register_ioapic (
27281 + u8 id,
27282 + u32 address,
27283 + u32 gsi_base)
27284 +{
27285 + int idx = 0;
27286 +
27287 + if (nr_ioapics >= MAX_IO_APICS) {
27288 + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
27289 + "(found %d)\n", MAX_IO_APICS, nr_ioapics);
27290 + panic("Recompile kernel with bigger MAX_IO_APICS!\n");
27291 + }
27292 + if (!address) {
27293 + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
27294 + " found in MADT table, skipping!\n");
27295 + return;
27296 + }
27297 +
27298 + idx = nr_ioapics++;
27299 +
27300 + mp_ioapics[idx].mpc_type = MP_IOAPIC;
27301 + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
27302 + mp_ioapics[idx].mpc_apicaddr = address;
27303 +
27304 +#ifndef CONFIG_XEN
27305 + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
27306 +#endif
27307 + mp_ioapics[idx].mpc_apicid = id;
27308 + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
27309 +
27310 + /*
27311 + * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
27312 + * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
27313 + */
27314 + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
27315 + mp_ioapic_routing[idx].gsi_start = gsi_base;
27316 + mp_ioapic_routing[idx].gsi_end = gsi_base +
27317 + io_apic_get_redir_entries(idx);
27318 +
27319 + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
27320 + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
27321 + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
27322 + mp_ioapic_routing[idx].gsi_start,
27323 + mp_ioapic_routing[idx].gsi_end);
27324 +
27325 + return;
27326 +}
27327 +
27328 +
27329 +void __init mp_override_legacy_irq (
27330 + u8 bus_irq,
27331 + u8 polarity,
27332 + u8 trigger,
27333 + u32 gsi)
27334 +{
27335 + struct mpc_config_intsrc intsrc;
27336 + int ioapic = -1;
27337 + int pin = -1;
27338 +
27339 + /*
27340 + * Convert 'gsi' to 'ioapic.pin'.
27341 + */
27342 + ioapic = mp_find_ioapic(gsi);
27343 + if (ioapic < 0)
27344 + return;
27345 + pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
27346 +
27347 + /*
27348 + * TBD: This check is for faulty timer entries, where the override
27349 + * erroneously sets the trigger to level, resulting in a HUGE
27350 + * increase of timer interrupts!
27351 + */
27352 + if ((bus_irq == 0) && (trigger == 3))
27353 + trigger = 1;
27354 +
27355 + intsrc.mpc_type = MP_INTSRC;
27356 + intsrc.mpc_irqtype = mp_INT;
27357 + intsrc.mpc_irqflag = (trigger << 2) | polarity;
27358 + intsrc.mpc_srcbus = MP_ISA_BUS;
27359 + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
27360 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
27361 + intsrc.mpc_dstirq = pin; /* INTIN# */
27362 +
27363 + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
27364 + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
27365 + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
27366 + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
27367 +
27368 + mp_irqs[mp_irq_entries] = intsrc;
27369 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
27370 + panic("Max # of irq sources exceeded!\n");
27371 +
27372 + return;
27373 +}
27374 +
27375 +
27376 +void __init mp_config_acpi_legacy_irqs (void)
27377 +{
27378 + struct mpc_config_intsrc intsrc;
27379 + int i = 0;
27380 + int ioapic = -1;
27381 +
27382 + /*
27383 + * Fabricate the legacy ISA bus (bus #31).
27384 + */
27385 + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
27386 + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
27387 +
27388 + /*
27389 + * Locate the IOAPIC that manages the ISA IRQs (0-15).
27390 + */
27391 + ioapic = mp_find_ioapic(0);
27392 + if (ioapic < 0)
27393 + return;
27394 +
27395 + intsrc.mpc_type = MP_INTSRC;
27396 + intsrc.mpc_irqflag = 0; /* Conforming */
27397 + intsrc.mpc_srcbus = MP_ISA_BUS;
27398 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
27399 +
27400 + /*
27401 +	 * Use the default configuration for IRQs 0-15, unless
27402 + * overridden by (MADT) interrupt source override entries.
27403 + */
27404 + for (i = 0; i < 16; i++) {
27405 + int idx;
27406 +
27407 + for (idx = 0; idx < mp_irq_entries; idx++) {
27408 + struct mpc_config_intsrc *irq = mp_irqs + idx;
27409 +
27410 + /* Do we already have a mapping for this ISA IRQ? */
27411 + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
27412 + break;
27413 +
27414 +			/* Do we already have a mapping for this IOAPIC pin? */
27415 + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
27416 + (irq->mpc_dstirq == i))
27417 + break;
27418 + }
27419 +
27420 + if (idx != mp_irq_entries) {
27421 + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
27422 + continue; /* IRQ already used */
27423 + }
27424 +
27425 + intsrc.mpc_irqtype = mp_INT;
27426 + intsrc.mpc_srcbusirq = i; /* Identity mapped */
27427 + intsrc.mpc_dstirq = i;
27428 +
27429 + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
27430 + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
27431 + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
27432 + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
27433 + intsrc.mpc_dstirq);
27434 +
27435 + mp_irqs[mp_irq_entries] = intsrc;
27436 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
27437 + panic("Max # of irq sources exceeded!\n");
27438 + }
27439 +
27440 + return;
27441 +}
27442 +
27443 +#define MAX_GSI_NUM 4096
27444 +
27445 +int mp_register_gsi(u32 gsi, int triggering, int polarity)
27446 +{
27447 + int ioapic = -1;
27448 + int ioapic_pin = 0;
27449 + int idx, bit = 0;
27450 + static int pci_irq = 16;
27451 + /*
27452 + * Mapping between Global System Interrupts, which
27453 + * represent all possible interrupts, to the IRQs
27454 + * assigned to actual devices.
27455 + */
27456 + static int gsi_to_irq[MAX_GSI_NUM];
27457 +
27458 + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
27459 + return gsi;
27460 +
27461 + /* Don't set up the ACPI SCI because it's already set up */
27462 + if (acpi_fadt.sci_int == gsi)
27463 + return gsi;
27464 +
27465 + ioapic = mp_find_ioapic(gsi);
27466 + if (ioapic < 0) {
27467 + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
27468 + return gsi;
27469 + }
27470 +
27471 + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
27472 +
27473 + /*
27474 + * Avoid pin reprogramming. PRTs typically include entries
27475 + * with redundant pin->gsi mappings (but unique PCI devices);
27476 + * we only program the IOAPIC on the first.
27477 + */
27478 + bit = ioapic_pin % 32;
27479 + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
27480 + if (idx > 3) {
27481 + printk(KERN_ERR "Invalid reference to IOAPIC pin "
27482 + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
27483 + ioapic_pin);
27484 + return gsi;
27485 + }
27486 + if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
27487 + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
27488 + mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
27489 + return gsi_to_irq[gsi];
27490 + }
27491 +
27492 + mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
27493 +
27494 + if (triggering == ACPI_LEVEL_SENSITIVE) {
27495 + /*
27496 + * For PCI devices assign IRQs in order, avoiding gaps
27497 + * due to unused I/O APIC pins.
27498 + */
27499 + int irq = gsi;
27500 + if (gsi < MAX_GSI_NUM) {
27501 + /*
27502 + * Retain the VIA chipset work-around (gsi > 15), but
27503 +			 * avoid a problem where the 8254 timer (IRQ0) is set up
27504 + * via an override (so it's not on pin 0 of the ioapic),
27505 + * and at the same time, the pin 0 interrupt is a PCI
27506 + * type. The gsi > 15 test could cause these two pins
27507 + * to be shared as IRQ0, and they are not shareable.
27508 + * So test for this condition, and if necessary, avoid
27509 + * the pin collision.
27510 + */
27511 + if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
27512 + gsi = pci_irq++;
27513 + /*
27514 + * Don't assign IRQ used by ACPI SCI
27515 + */
27516 + if (gsi == acpi_fadt.sci_int)
27517 + gsi = pci_irq++;
27518 + gsi_to_irq[irq] = gsi;
27519 + } else {
27520 + printk(KERN_ERR "GSI %u is too high\n", gsi);
27521 + return gsi;
27522 + }
27523 + }
27524 +
27525 + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
27526 + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
27527 + polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
27528 + return gsi;
27529 +}
27530 +
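mp_register_gsi() above tracks up to 128 IO-APIC pins in the four 32-bit pin_programmed words, effectively using idx = pin / 32 and bit = pin % 32, so each pin is only routed once. A stand-alone sketch of that bookkeeping (claim_pin and the sample pin numbers are illustrative):

#include <stdio.h>

#define MAX_PIN 127

static unsigned int pin_programmed[4];

/* Returns 1 the first time a pin is claimed, 0 when it was already
 * programmed, -1 when the pin number is out of range. */
static int claim_pin(int pin)
{
	int idx = pin / 32, bit = pin % 32;

	if (pin < 0 || pin > MAX_PIN)
		return -1;
	if (pin_programmed[idx] & (1u << bit))
		return 0;
	pin_programmed[idx] |= 1u << bit;
	return 1;
}

int main(void)
{
	printf("%d %d %d\n", claim_pin(5), claim_pin(5), claim_pin(100));
	return 0;
}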
27531 +#endif /*CONFIG_X86_IO_APIC*/
27532 +#endif /*CONFIG_ACPI*/
27533 Index: head-2008-11-25/arch/x86/kernel/process_64-xen.c
27534 ===================================================================
27535 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
27536 +++ head-2008-11-25/arch/x86/kernel/process_64-xen.c 2008-08-07 12:44:36.000000000 +0200
27537 @@ -0,0 +1,848 @@
27538 +/*
27539 + * linux/arch/x86-64/kernel/process.c
27540 + *
27541 + * Copyright (C) 1995 Linus Torvalds
27542 + *
27543 + * Pentium III FXSR, SSE support
27544 + * Gareth Hughes <gareth@valinux.com>, May 2000
27545 + *
27546 + * X86-64 port
27547 + * Andi Kleen.
27548 + *
27549 + * CPU hotplug support - ashok.raj@intel.com
27550 + *
27551 + * Jun Nakajima <jun.nakajima@intel.com>
27552 + * Modified for Xen
27553 + */
27554 +
27555 +/*
27556 + * This file handles the architecture-dependent parts of process handling..
27557 + */
27558 +
27559 +#include <stdarg.h>
27560 +
27561 +#include <linux/cpu.h>
27562 +#include <linux/errno.h>
27563 +#include <linux/sched.h>
27564 +#include <linux/kernel.h>
27565 +#include <linux/mm.h>
27566 +#include <linux/elfcore.h>
27567 +#include <linux/smp.h>
27568 +#include <linux/slab.h>
27569 +#include <linux/user.h>
27570 +#include <linux/module.h>
27571 +#include <linux/a.out.h>
27572 +#include <linux/interrupt.h>
27573 +#include <linux/delay.h>
27574 +#include <linux/ptrace.h>
27575 +#include <linux/utsname.h>
27576 +#include <linux/random.h>
27577 +#include <linux/notifier.h>
27578 +#include <linux/kprobes.h>
27579 +
27580 +#include <asm/uaccess.h>
27581 +#include <asm/pgtable.h>
27582 +#include <asm/system.h>
27583 +#include <asm/io.h>
27584 +#include <asm/processor.h>
27585 +#include <asm/i387.h>
27586 +#include <asm/mmu_context.h>
27587 +#include <asm/pda.h>
27588 +#include <asm/prctl.h>
27589 +#include <asm/kdebug.h>
27590 +#include <xen/interface/platform.h>
27591 +#include <xen/interface/physdev.h>
27592 +#include <xen/interface/vcpu.h>
27593 +#include <asm/desc.h>
27594 +#include <asm/proto.h>
27595 +#include <asm/hardirq.h>
27596 +#include <asm/ia32.h>
27597 +#include <asm/idle.h>
27598 +
27599 +#include <xen/cpu_hotplug.h>
27600 +
27601 +asmlinkage extern void ret_from_fork(void);
27602 +
27603 +unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
27604 +
27605 +unsigned long boot_option_idle_override = 0;
27606 +EXPORT_SYMBOL(boot_option_idle_override);
27607 +
27608 +/*
27609 + * Power management idle function, if any.
27610 + */
27611 +void (*pm_idle)(void);
27612 +EXPORT_SYMBOL(pm_idle);
27613 +static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
27614 +
27615 +static ATOMIC_NOTIFIER_HEAD(idle_notifier);
27616 +
27617 +void idle_notifier_register(struct notifier_block *n)
27618 +{
27619 + atomic_notifier_chain_register(&idle_notifier, n);
27620 +}
27621 +EXPORT_SYMBOL_GPL(idle_notifier_register);
27622 +
27623 +void idle_notifier_unregister(struct notifier_block *n)
27624 +{
27625 + atomic_notifier_chain_unregister(&idle_notifier, n);
27626 +}
27627 +EXPORT_SYMBOL(idle_notifier_unregister);
27628 +
27629 +enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
27630 +static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
27631 +
27632 +void enter_idle(void)
27633 +{
27634 + __get_cpu_var(idle_state) = CPU_IDLE;
27635 + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
27636 +}
27637 +
27638 +static void __exit_idle(void)
27639 +{
27640 + __get_cpu_var(idle_state) = CPU_NOT_IDLE;
27641 + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
27642 +}
27643 +
27644 +/* Called from interrupts to signify idle end */
27645 +void exit_idle(void)
27646 +{
27647 + if (current->pid | read_pda(irqcount))
27648 + return;
27649 + __exit_idle();
27650 +}
27651 +
27652 +/*
27653 + * On SMP it's slightly faster (but much more power-consuming!)
27654 + * to poll the ->need_resched flag instead of waiting for the
27655 + * cross-CPU IPI to arrive. Use this option with caution.
27656 + */
27657 +static void poll_idle (void)
27658 +{
27659 + local_irq_enable();
27660 +
27661 + asm volatile(
27662 + "2:"
27663 + "testl %0,%1;"
27664 + "rep; nop;"
27665 + "je 2b;"
27666 + : :
27667 + "i" (_TIF_NEED_RESCHED),
27668 + "m" (current_thread_info()->flags));
27669 +}
27670 +
27671 +static void xen_idle(void)
27672 +{
27673 + local_irq_disable();
27674 +
27675 + if (need_resched())
27676 + local_irq_enable();
27677 + else {
27678 + current_thread_info()->status &= ~TS_POLLING;
27679 + smp_mb__after_clear_bit();
27680 + safe_halt();
27681 + current_thread_info()->status |= TS_POLLING;
27682 + }
27683 +}
27684 +
27685 +#ifdef CONFIG_HOTPLUG_CPU
27686 +static inline void play_dead(void)
27687 +{
27688 + idle_task_exit();
27689 + local_irq_disable();
27690 + cpu_clear(smp_processor_id(), cpu_initialized);
27691 + preempt_enable_no_resched();
27692 + VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
27693 + cpu_bringup();
27694 +}
27695 +#else
27696 +static inline void play_dead(void)
27697 +{
27698 + BUG();
27699 +}
27700 +#endif /* CONFIG_HOTPLUG_CPU */
27701 +
27702 +/*
27703 + * The idle thread. There's no useful work to be
27704 + * done, so just try to conserve power and have a
27705 + * low exit latency (ie sit in a loop waiting for
27706 + * somebody to say that they'd like to reschedule)
27707 + */
27708 +void cpu_idle (void)
27709 +{
27710 + current_thread_info()->status |= TS_POLLING;
27711 + /* endless idle loop with no priority at all */
27712 + while (1) {
27713 + while (!need_resched()) {
27714 + void (*idle)(void);
27715 +
27716 + if (__get_cpu_var(cpu_idle_state))
27717 + __get_cpu_var(cpu_idle_state) = 0;
27718 + rmb();
27719 + idle = xen_idle; /* no alternatives */
27720 + if (cpu_is_offline(smp_processor_id()))
27721 + play_dead();
27722 + enter_idle();
27723 + idle();
27724 + __exit_idle();
27725 + }
27726 +
27727 + preempt_enable_no_resched();
27728 + schedule();
27729 + preempt_disable();
27730 + }
27731 +}
27732 +
27733 +void cpu_idle_wait(void)
27734 +{
27735 + unsigned int cpu, this_cpu = get_cpu();
27736 + cpumask_t map;
27737 +
27738 + set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
27739 + put_cpu();
27740 +
27741 + cpus_clear(map);
27742 + for_each_online_cpu(cpu) {
27743 + per_cpu(cpu_idle_state, cpu) = 1;
27744 + cpu_set(cpu, map);
27745 + }
27746 +
27747 + __get_cpu_var(cpu_idle_state) = 0;
27748 +
27749 + wmb();
27750 + do {
27751 + ssleep(1);
27752 + for_each_online_cpu(cpu) {
27753 + if (cpu_isset(cpu, map) &&
27754 + !per_cpu(cpu_idle_state, cpu))
27755 + cpu_clear(cpu, map);
27756 + }
27757 + cpus_and(map, map, cpu_online_map);
27758 + } while (!cpus_empty(map));
27759 +}
27760 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
27761 +
27762 +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
27763 +{
27764 +}
27765 +
27766 +static int __init idle_setup (char *str)
27767 +{
27768 + if (!strncmp(str, "poll", 4)) {
27769 + printk("using polling idle threads.\n");
27770 + pm_idle = poll_idle;
27771 + }
27772 +
27773 + boot_option_idle_override = 1;
27774 + return 1;
27775 +}
27776 +
27777 +__setup("idle=", idle_setup);
27778 +
27779 +/* Prints also some state that isn't saved in the pt_regs */
27780 +void __show_regs(struct pt_regs * regs)
27781 +{
27782 + unsigned long fs, gs, shadowgs;
27783 + unsigned int fsindex,gsindex;
27784 + unsigned int ds,cs,es;
27785 +
27786 + printk("\n");
27787 + print_modules();
27788 + printk("Pid: %d, comm: %.20s %s %s %.*s\n",
27789 + current->pid, current->comm, print_tainted(),
27790 + system_utsname.release,
27791 + (int)strcspn(system_utsname.version, " "),
27792 + system_utsname.version);
27793 + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
27794 + printk_address(regs->rip);
27795 + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
27796 + regs->eflags);
27797 + printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
27798 + regs->rax, regs->rbx, regs->rcx);
27799 + printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
27800 + regs->rdx, regs->rsi, regs->rdi);
27801 + printk("RBP: %016lx R08: %016lx R09: %016lx\n",
27802 + regs->rbp, regs->r8, regs->r9);
27803 + printk("R10: %016lx R11: %016lx R12: %016lx\n",
27804 + regs->r10, regs->r11, regs->r12);
27805 + printk("R13: %016lx R14: %016lx R15: %016lx\n",
27806 + regs->r13, regs->r14, regs->r15);
27807 +
27808 + asm("mov %%ds,%0" : "=r" (ds));
27809 + asm("mov %%cs,%0" : "=r" (cs));
27810 + asm("mov %%es,%0" : "=r" (es));
27811 + asm("mov %%fs,%0" : "=r" (fsindex));
27812 + asm("mov %%gs,%0" : "=r" (gsindex));
27813 +
27814 + rdmsrl(MSR_FS_BASE, fs);
27815 + rdmsrl(MSR_GS_BASE, gs);
27816 + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
27817 +
27818 + printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
27819 + fs,fsindex,gs,gsindex,shadowgs);
27820 + printk("CS: %04x DS: %04x ES: %04x\n", cs, ds, es);
27821 +
27822 +}
27823 +
27824 +void show_regs(struct pt_regs *regs)
27825 +{
27826 + printk("CPU %d:", smp_processor_id());
27827 + __show_regs(regs);
27828 + show_trace(NULL, regs, (void *)(regs + 1));
27829 +}
27830 +
27831 +/*
27832 + * Free current thread data structures etc..
27833 + */
27834 +void exit_thread(void)
27835 +{
27836 + struct task_struct *me = current;
27837 + struct thread_struct *t = &me->thread;
27838 +
27839 + if (me->thread.io_bitmap_ptr) {
27840 +#ifndef CONFIG_X86_NO_TSS
27841 + struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
27842 +#endif
27843 +#ifdef CONFIG_XEN
27844 + struct physdev_set_iobitmap iobmp_op;
27845 + memset(&iobmp_op, 0, sizeof(iobmp_op));
27846 +#endif
27847 +
27848 + kfree(t->io_bitmap_ptr);
27849 + t->io_bitmap_ptr = NULL;
27850 + /*
27851 + * Careful, clear this in the TSS too:
27852 + */
27853 +#ifndef CONFIG_X86_NO_TSS
27854 + memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
27855 + put_cpu();
27856 +#endif
27857 +#ifdef CONFIG_XEN
27858 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
27859 + &iobmp_op));
27860 +#endif
27861 + t->io_bitmap_max = 0;
27862 + }
27863 +}
27864 +
27865 +void load_gs_index(unsigned gs)
27866 +{
27867 + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
27868 +}
27869 +
27870 +void flush_thread(void)
27871 +{
27872 + struct task_struct *tsk = current;
27873 + struct thread_info *t = current_thread_info();
27874 +
27875 + if (t->flags & _TIF_ABI_PENDING) {
27876 + t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
27877 + if (t->flags & _TIF_IA32)
27878 + current_thread_info()->status |= TS_COMPAT;
27879 + }
27880 +
27881 + tsk->thread.debugreg0 = 0;
27882 + tsk->thread.debugreg1 = 0;
27883 + tsk->thread.debugreg2 = 0;
27884 + tsk->thread.debugreg3 = 0;
27885 + tsk->thread.debugreg6 = 0;
27886 + tsk->thread.debugreg7 = 0;
27887 + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
27888 + /*
27889 + * Forget coprocessor state..
27890 + */
27891 + clear_fpu(tsk);
27892 + clear_used_math();
27893 +}
27894 +
27895 +void release_thread(struct task_struct *dead_task)
27896 +{
27897 + if (dead_task->mm) {
27898 + if (dead_task->mm->context.size) {
27899 + printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
27900 + dead_task->comm,
27901 + dead_task->mm->context.ldt,
27902 + dead_task->mm->context.size);
27903 + BUG();
27904 + }
27905 + }
27906 +}
27907 +
27908 +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
27909 +{
27910 + struct user_desc ud = {
27911 + .base_addr = addr,
27912 + .limit = 0xfffff,
27913 + .seg_32bit = 1,
27914 + .limit_in_pages = 1,
27915 + .useable = 1,
27916 + };
27917 + struct n_desc_struct *desc = (void *)t->thread.tls_array;
27918 + desc += tls;
27919 + desc->a = LDT_entry_a(&ud);
27920 + desc->b = LDT_entry_b(&ud);
27921 +}
27922 +
27923 +static inline u32 read_32bit_tls(struct task_struct *t, int tls)
27924 +{
27925 + struct desc_struct *desc = (void *)t->thread.tls_array;
27926 + desc += tls;
27927 + return desc->base0 |
27928 + (((u32)desc->base1) << 16) |
27929 + (((u32)desc->base2) << 24);
27930 +}
27931 +
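set_32bit_tls() and read_32bit_tls() above split and reassemble a 32-bit base address across the base0/base1/base2 fields of an x86 descriptor (bits 0-15, 16-23 and 24-31). A minimal sketch of that reassembly using a fake descriptor type (the struct below is illustrative, not the kernel's desc_struct):

#include <stdio.h>
#include <stdint.h>

struct fake_desc {		/* illustrative layout of just the base fields */
	uint16_t base0;		/* base bits 0-15  */
	uint8_t  base1;		/* base bits 16-23 */
	uint8_t  base2;		/* base bits 24-31 */
};

static uint32_t desc_base(const struct fake_desc *d)
{
	return d->base0 | ((uint32_t)d->base1 << 16) | ((uint32_t)d->base2 << 24);
}

int main(void)
{
	uint32_t base = 0x12345678;
	struct fake_desc d = { base & 0xffff, (base >> 16) & 0xff, base >> 24 };

	printf("reassembled base = 0x%x\n", (unsigned)desc_base(&d));
	return 0;
}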
27932 +/*
27933 + * This gets called before we allocate a new thread and copy
27934 + * the current task into it.
27935 + */
27936 +void prepare_to_copy(struct task_struct *tsk)
27937 +{
27938 + unlazy_fpu(tsk);
27939 +}
27940 +
27941 +int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
27942 + unsigned long unused,
27943 + struct task_struct * p, struct pt_regs * regs)
27944 +{
27945 + int err;
27946 + struct pt_regs * childregs;
27947 + struct task_struct *me = current;
27948 +
27949 + childregs = ((struct pt_regs *)
27950 + (THREAD_SIZE + task_stack_page(p))) - 1;
27951 + *childregs = *regs;
27952 +
27953 + childregs->rax = 0;
27954 + childregs->rsp = rsp;
27955 + if (rsp == ~0UL)
27956 + childregs->rsp = (unsigned long)childregs;
27957 +
27958 + p->thread.rsp = (unsigned long) childregs;
27959 + p->thread.rsp0 = (unsigned long) (childregs+1);
27960 + p->thread.userrsp = me->thread.userrsp;
27961 +
27962 + set_tsk_thread_flag(p, TIF_FORK);
27963 +
27964 + p->thread.fs = me->thread.fs;
27965 + p->thread.gs = me->thread.gs;
27966 +
27967 + asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
27968 + asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
27969 + asm("mov %%es,%0" : "=m" (p->thread.es));
27970 + asm("mov %%ds,%0" : "=m" (p->thread.ds));
27971 +
27972 + if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
27973 + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
27974 + if (!p->thread.io_bitmap_ptr) {
27975 + p->thread.io_bitmap_max = 0;
27976 + return -ENOMEM;
27977 + }
27978 + memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
27979 + IO_BITMAP_BYTES);
27980 + }
27981 +
27982 + /*
27983 + * Set a new TLS for the child thread?
27984 + */
27985 + if (clone_flags & CLONE_SETTLS) {
27986 +#ifdef CONFIG_IA32_EMULATION
27987 + if (test_thread_flag(TIF_IA32))
27988 + err = ia32_child_tls(p, childregs);
27989 + else
27990 +#endif
27991 + err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
27992 + if (err)
27993 + goto out;
27994 + }
27995 + p->thread.iopl = current->thread.iopl;
27996 +
27997 + err = 0;
27998 +out:
27999 + if (err && p->thread.io_bitmap_ptr) {
28000 + kfree(p->thread.io_bitmap_ptr);
28001 + p->thread.io_bitmap_max = 0;
28002 + }
28003 + return err;
28004 +}
28005 +
28006 +static inline void __save_init_fpu( struct task_struct *tsk )
28007 +{
28008 + asm volatile( "rex64 ; fxsave %0 ; fnclex"
28009 + : "=m" (tsk->thread.i387.fxsave));
28010 + tsk->thread_info->status &= ~TS_USEDFPU;
28011 +}
28012 +
28013 +/*
28014 + * switch_to(x,y) should switch tasks from x to y.
28015 + *
28016 + * This could still be optimized:
28017 + * - fold all the options into a flag word and test it with a single test.
28018 + * - could test fs/gs bitsliced
28019 + *
28020 + * Kprobes not supported here. Set the probe on schedule instead.
28021 + */
28022 +__kprobes struct task_struct *
28023 +__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
28024 +{
28025 + struct thread_struct *prev = &prev_p->thread,
28026 + *next = &next_p->thread;
28027 + int cpu = smp_processor_id();
28028 +#ifndef CONFIG_X86_NO_TSS
28029 + struct tss_struct *tss = &per_cpu(init_tss, cpu);
28030 +#endif
28031 +#if CONFIG_XEN_COMPAT > 0x030002
28032 + struct physdev_set_iopl iopl_op;
28033 + struct physdev_set_iobitmap iobmp_op;
28034 +#else
28035 + struct physdev_op _pdo[2], *pdo = _pdo;
28036 +#define iopl_op pdo->u.set_iopl
28037 +#define iobmp_op pdo->u.set_iobitmap
28038 +#endif
28039 + multicall_entry_t _mcl[8], *mcl = _mcl;
28040 +
28041 + /*
28042 + * This is basically '__unlazy_fpu', except that we queue a
28043 + * multicall to indicate FPU task switch, rather than
28044 + * synchronously trapping to Xen.
28045 + * The AMD workaround requires it to be after DS reload, or
28046 + * after DS has been cleared, which we do in __prepare_arch_switch.
28047 + */
28048 + if (prev_p->thread_info->status & TS_USEDFPU) {
28049 + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
28050 + mcl->op = __HYPERVISOR_fpu_taskswitch;
28051 + mcl->args[0] = 1;
28052 + mcl++;
28053 + }
28054 +
28055 + /*
28056 + * Reload esp0, LDT and the page table pointer:
28057 + */
28058 + mcl->op = __HYPERVISOR_stack_switch;
28059 + mcl->args[0] = __KERNEL_DS;
28060 + mcl->args[1] = next->rsp0;
28061 + mcl++;
28062 +
28063 + /*
28064 + * Load the per-thread Thread-Local Storage descriptor.
28065 + * This is load_TLS(next, cpu) with multicalls.
28066 + */
28067 +#define C(i) do { \
28068 + if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \
28069 + mcl->op = __HYPERVISOR_update_descriptor; \
28070 + mcl->args[0] = virt_to_machine( \
28071 + &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \
28072 + mcl->args[1] = next->tls_array[i]; \
28073 + mcl++; \
28074 + } \
28075 +} while (0)
28076 + C(0); C(1); C(2);
28077 +#undef C
28078 +
28079 + if (unlikely(prev->iopl != next->iopl)) {
28080 + iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
28081 +#if CONFIG_XEN_COMPAT > 0x030002
28082 + mcl->op = __HYPERVISOR_physdev_op;
28083 + mcl->args[0] = PHYSDEVOP_set_iopl;
28084 + mcl->args[1] = (unsigned long)&iopl_op;
28085 +#else
28086 + mcl->op = __HYPERVISOR_physdev_op_compat;
28087 + pdo->cmd = PHYSDEVOP_set_iopl;
28088 + mcl->args[0] = (unsigned long)pdo++;
28089 +#endif
28090 + mcl++;
28091 + }
28092 +
28093 + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
28094 + set_xen_guest_handle(iobmp_op.bitmap,
28095 + (char *)next->io_bitmap_ptr);
28096 + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
28097 +#if CONFIG_XEN_COMPAT > 0x030002
28098 + mcl->op = __HYPERVISOR_physdev_op;
28099 + mcl->args[0] = PHYSDEVOP_set_iobitmap;
28100 + mcl->args[1] = (unsigned long)&iobmp_op;
28101 +#else
28102 + mcl->op = __HYPERVISOR_physdev_op_compat;
28103 + pdo->cmd = PHYSDEVOP_set_iobitmap;
28104 + mcl->args[0] = (unsigned long)pdo++;
28105 +#endif
28106 + mcl++;
28107 + }
28108 +
28109 +#if CONFIG_XEN_COMPAT <= 0x030002
28110 + BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo));
28111 +#endif
28112 + BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
28113 + if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
28114 + BUG();
28115 +
28116 + /*
28117 + * Switch DS and ES.
28118 + * This won't pick up thread selector changes, but I guess that is ok.
28119 + */
28120 + if (unlikely(next->es))
28121 + loadsegment(es, next->es);
28122 +
28123 + if (unlikely(next->ds))
28124 + loadsegment(ds, next->ds);
28125 +
28126 + /*
28127 + * Switch FS and GS.
28128 + */
28129 + if (unlikely(next->fsindex))
28130 + loadsegment(fs, next->fsindex);
28131 +
28132 + if (next->fs)
28133 + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs));
28134 +
28135 + if (unlikely(next->gsindex))
28136 + load_gs_index(next->gsindex);
28137 +
28138 + if (next->gs)
28139 + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs));
28140 +
28141 + /*
28142 + * Switch the PDA context.
28143 + */
28144 + prev->userrsp = read_pda(oldrsp);
28145 + write_pda(oldrsp, next->userrsp);
28146 + write_pda(pcurrent, next_p);
28147 + write_pda(kernelstack,
28148 + task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
28149 +
28150 + /*
28151 + * Now maybe reload the debug registers
28152 + */
28153 + if (unlikely(next->debugreg7)) {
28154 + set_debugreg(next->debugreg0, 0);
28155 + set_debugreg(next->debugreg1, 1);
28156 + set_debugreg(next->debugreg2, 2);
28157 + set_debugreg(next->debugreg3, 3);
28158 + /* no 4 and 5 */
28159 + set_debugreg(next->debugreg6, 6);
28160 + set_debugreg(next->debugreg7, 7);
28161 + }
28162 +
28163 + return prev_p;
28164 +}
28165 +
28166 +/*
28167 + * sys_execve() executes a new program.
28168 + */
28169 +asmlinkage
28170 +long sys_execve(char __user *name, char __user * __user *argv,
28171 + char __user * __user *envp, struct pt_regs regs)
28172 +{
28173 + long error;
28174 + char * filename;
28175 +
28176 + filename = getname(name);
28177 + error = PTR_ERR(filename);
28178 + if (IS_ERR(filename))
28179 + return error;
28180 + error = do_execve(filename, argv, envp, &regs);
28181 + if (error == 0) {
28182 + task_lock(current);
28183 + current->ptrace &= ~PT_DTRACE;
28184 + task_unlock(current);
28185 + }
28186 + putname(filename);
28187 + return error;
28188 +}
28189 +
28190 +void set_personality_64bit(void)
28191 +{
28192 + /* inherit personality from parent */
28193 +
28194 + /* Make sure to be in 64bit mode */
28195 + clear_thread_flag(TIF_IA32);
28196 +
28197 + /* TBD: overwrites user setup. Should have two bits.
28198 +	   But 64-bit processes have always behaved this way,
28199 +	   so it's not too bad. The main problem is just that
28200 +	   32-bit children are affected again. */
28201 + current->personality &= ~READ_IMPLIES_EXEC;
28202 +}
28203 +
28204 +asmlinkage long sys_fork(struct pt_regs *regs)
28205 +{
28206 + return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
28207 +}
28208 +
28209 +asmlinkage long
28210 +sys_clone(unsigned long clone_flags, unsigned long newsp,
28211 + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
28212 +{
28213 + if (!newsp)
28214 + newsp = regs->rsp;
28215 + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
28216 +}
28217 +
28218 +/*
28219 + * This is trivial, and on the face of it looks like it
28220 + * could equally well be done in user mode.
28221 + *
28222 + * Not so, for quite unobvious reasons - register pressure.
28223 + * In user mode vfork() cannot have a stack frame, and if
28224 + * done by calling the "clone()" system call directly, you
28225 + * do not have enough call-clobbered registers to hold all
28226 + * the information you need.
28227 + */
28228 +asmlinkage long sys_vfork(struct pt_regs *regs)
28229 +{
28230 + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
28231 + NULL, NULL);
28232 +}
28233 +
28234 +unsigned long get_wchan(struct task_struct *p)
28235 +{
28236 + unsigned long stack;
28237 + u64 fp,rip;
28238 + int count = 0;
28239 +
28240 + if (!p || p == current || p->state==TASK_RUNNING)
28241 + return 0;
28242 + stack = (unsigned long)task_stack_page(p);
28243 + if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
28244 + return 0;
28245 + fp = *(u64 *)(p->thread.rsp);
28246 + do {
28247 + if (fp < (unsigned long)stack ||
28248 + fp > (unsigned long)stack+THREAD_SIZE)
28249 + return 0;
28250 + rip = *(u64 *)(fp+8);
28251 + if (!in_sched_functions(rip))
28252 + return rip;
28253 + fp = *(u64 *)fp;
28254 + } while (count++ < 16);
28255 + return 0;
28256 +}
28257 +
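get_wchan() above walks the saved frame-pointer chain: the return address lives at fp + 8 and the previous frame pointer at *fp, bounded by the task's stack page and a 16-frame limit. A rough user-space sketch of that walk over a synthetic stack (all addresses and "rip" values here are made up):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t stack[8] = { 0 };
	uint64_t fp;
	int count = 0;

	/* Frame 1 at &stack[4]: saved fp points to frame 0, rip = 0x1111. */
	stack[4] = (uint64_t)(uintptr_t)&stack[0];
	stack[5] = 0x1111;
	/* Frame 0 at &stack[0]: end of chain (saved fp = 0), rip = 0x2222. */
	stack[0] = 0;
	stack[1] = 0x2222;

	fp = (uint64_t)(uintptr_t)&stack[4];
	while (fp && count++ < 16) {
		uint64_t rip = *(uint64_t *)(uintptr_t)(fp + 8);

		printf("frame %d: rip = 0x%llx\n", count, (unsigned long long)rip);
		fp = *(uint64_t *)(uintptr_t)fp;	/* follow saved frame pointer */
	}
	return 0;
}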
28258 +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
28259 +{
28260 + int ret = 0;
28261 + int doit = task == current;
28262 + int cpu;
28263 +
28264 + switch (code) {
28265 + case ARCH_SET_GS:
28266 + if (addr >= TASK_SIZE_OF(task))
28267 + return -EPERM;
28268 + cpu = get_cpu();
28269 + /* handle small bases via the GDT because that's faster to
28270 + switch. */
28271 + if (addr <= 0xffffffff) {
28272 + set_32bit_tls(task, GS_TLS, addr);
28273 + if (doit) {
28274 + load_TLS(&task->thread, cpu);
28275 + load_gs_index(GS_TLS_SEL);
28276 + }
28277 + task->thread.gsindex = GS_TLS_SEL;
28278 + task->thread.gs = 0;
28279 + } else {
28280 + task->thread.gsindex = 0;
28281 + task->thread.gs = addr;
28282 + if (doit) {
28283 + load_gs_index(0);
28284 + ret = HYPERVISOR_set_segment_base(
28285 + SEGBASE_GS_USER, addr);
28286 + }
28287 + }
28288 + put_cpu();
28289 + break;
28290 + case ARCH_SET_FS:
28291 + /* Not strictly needed for fs, but do it for symmetry
28292 + with gs */
28293 + if (addr >= TASK_SIZE_OF(task))
28294 + return -EPERM;
28295 + cpu = get_cpu();
28296 + /* handle small bases via the GDT because that's faster to
28297 + switch. */
28298 + if (addr <= 0xffffffff) {
28299 + set_32bit_tls(task, FS_TLS, addr);
28300 + if (doit) {
28301 + load_TLS(&task->thread, cpu);
28302 + asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
28303 + }
28304 + task->thread.fsindex = FS_TLS_SEL;
28305 + task->thread.fs = 0;
28306 + } else {
28307 + task->thread.fsindex = 0;
28308 + task->thread.fs = addr;
28309 + if (doit) {
28310 + /* set the selector to 0 to not confuse
28311 + __switch_to */
28312 + asm volatile("movl %0,%%fs" :: "r" (0));
28313 + ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
28314 + addr);
28315 + }
28316 + }
28317 + put_cpu();
28318 + break;
28319 + case ARCH_GET_FS: {
28320 + unsigned long base;
28321 + if (task->thread.fsindex == FS_TLS_SEL)
28322 + base = read_32bit_tls(task, FS_TLS);
28323 + else if (doit)
28324 + rdmsrl(MSR_FS_BASE, base);
28325 + else
28326 + base = task->thread.fs;
28327 + ret = put_user(base, (unsigned long __user *)addr);
28328 + break;
28329 + }
28330 + case ARCH_GET_GS: {
28331 + unsigned long base;
28332 + unsigned gsindex;
28333 + if (task->thread.gsindex == GS_TLS_SEL)
28334 + base = read_32bit_tls(task, GS_TLS);
28335 + else if (doit) {
28336 + asm("movl %%gs,%0" : "=r" (gsindex));
28337 + if (gsindex)
28338 + rdmsrl(MSR_KERNEL_GS_BASE, base);
28339 + else
28340 + base = task->thread.gs;
28341 + }
28342 + else
28343 + base = task->thread.gs;
28344 + ret = put_user(base, (unsigned long __user *)addr);
28345 + break;
28346 + }
28347 +
28348 + default:
28349 + ret = -EINVAL;
28350 + break;
28351 + }
28352 +
28353 + return ret;
28354 +}
28355 +
28356 +long sys_arch_prctl(int code, unsigned long addr)
28357 +{
28358 + return do_arch_prctl(current, code, addr);
28359 +}
28360 +
28361 +/*
28362 + * Capture the user space registers if the task is not running (in user space)
28363 + */
28364 +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
28365 +{
28366 + struct pt_regs *pp, ptregs;
28367 +
28368 + pp = task_pt_regs(tsk);
28369 +
28370 + ptregs = *pp;
28371 + ptregs.cs &= 0xffff;
28372 + ptregs.ss &= 0xffff;
28373 +
28374 + elf_core_copy_regs(regs, &ptregs);
28375 +
28376 + boot_option_idle_override = 1;
28377 + return 1;
28378 +}
28379 +
28380 +unsigned long arch_align_stack(unsigned long sp)
28381 +{
28382 + if (randomize_va_space)
28383 + sp -= get_random_int() % 8192;
28384 + return sp & ~0xf;
28385 +}
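arch_align_stack() above randomizes the start of the user stack by subtracting an offset below 8 KiB and then clearing the low four bits to keep 16-byte alignment. A tiny sketch of the same arithmetic with a fixed stand-in for get_random_int():

#include <stdio.h>

static unsigned long align_stack(unsigned long sp, unsigned long rnd)
{
	sp -= rnd % 8192;	/* random shift below 8 KiB */
	return sp & ~0xfUL;	/* keep 16-byte alignment */
}

int main(void)
{
	/* The starting sp and "random" value are illustrative only. */
	printf("0x%lx\n", align_stack(0x7ffffffff000UL, 12345));
	return 0;
}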
28386 Index: head-2008-11-25/arch/x86/kernel/setup_64-xen.c
28387 ===================================================================
28388 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
28389 +++ head-2008-11-25/arch/x86/kernel/setup_64-xen.c 2008-04-22 15:41:51.000000000 +0200
28390 @@ -0,0 +1,1652 @@
28391 +/*
28392 + * linux/arch/x86-64/kernel/setup.c
28393 + *
28394 + * Copyright (C) 1995 Linus Torvalds
28395 + *
28396 + * Nov 2001 Dave Jones <davej@suse.de>
28397 + * Forked from i386 setup code.
28398 + */
28399 +
28400 +/*
28401 + * This file handles the architecture-dependent parts of initialization
28402 + */
28403 +
28404 +#include <linux/errno.h>
28405 +#include <linux/sched.h>
28406 +#include <linux/kernel.h>
28407 +#include <linux/mm.h>
28408 +#include <linux/stddef.h>
28409 +#include <linux/unistd.h>
28410 +#include <linux/ptrace.h>
28411 +#include <linux/slab.h>
28412 +#include <linux/user.h>
28413 +#include <linux/a.out.h>
28414 +#include <linux/screen_info.h>
28415 +#include <linux/ioport.h>
28416 +#include <linux/delay.h>
28417 +#include <linux/init.h>
28418 +#include <linux/initrd.h>
28419 +#include <linux/highmem.h>
28420 +#include <linux/bootmem.h>
28421 +#include <linux/module.h>
28422 +#include <asm/processor.h>
28423 +#include <linux/console.h>
28424 +#include <linux/seq_file.h>
28425 +#include <linux/crash_dump.h>
28426 +#include <linux/root_dev.h>
28427 +#include <linux/pci.h>
28428 +#include <linux/acpi.h>
28429 +#include <linux/kallsyms.h>
28430 +#include <linux/edd.h>
28431 +#include <linux/mmzone.h>
28432 +#include <linux/kexec.h>
28433 +#include <linux/cpufreq.h>
28434 +#include <linux/dmi.h>
28435 +#include <linux/dma-mapping.h>
28436 +#include <linux/ctype.h>
28437 +
28438 +#include <asm/mtrr.h>
28439 +#include <asm/uaccess.h>
28440 +#include <asm/system.h>
28441 +#include <asm/io.h>
28442 +#include <asm/smp.h>
28443 +#include <asm/msr.h>
28444 +#include <asm/desc.h>
28445 +#include <video/edid.h>
28446 +#include <asm/e820.h>
28447 +#include <asm/dma.h>
28448 +#include <asm/mpspec.h>
28449 +#include <asm/mmu_context.h>
28450 +#include <asm/bootsetup.h>
28451 +#include <asm/proto.h>
28452 +#include <asm/setup.h>
28453 +#include <asm/mach_apic.h>
28454 +#include <asm/numa.h>
28455 +#include <asm/sections.h>
28456 +#include <asm/dmi.h>
28457 +#ifdef CONFIG_XEN
28458 +#include <linux/percpu.h>
28459 +#include <xen/interface/physdev.h>
28460 +#include "setup_arch_pre.h"
28461 +#include <asm/hypervisor.h>
28462 +#include <xen/interface/nmi.h>
28463 +#include <xen/features.h>
28464 +#include <xen/firmware.h>
28465 +#include <xen/xencons.h>
28466 +#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
28467 +#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
28468 +#include <asm/mach-xen/setup_arch_post.h>
28469 +#include <xen/interface/memory.h>
28470 +
28471 +#ifdef CONFIG_XEN
28472 +#include <xen/interface/kexec.h>
28473 +#endif
28474 +
28475 +extern unsigned long start_pfn;
28476 +extern struct edid_info edid_info;
28477 +
28478 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
28479 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
28480 +
28481 +extern char hypercall_page[PAGE_SIZE];
28482 +EXPORT_SYMBOL(hypercall_page);
28483 +
28484 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
28485 +static struct notifier_block xen_panic_block = {
28486 + xen_panic_event, NULL, 0 /* try to go last */
28487 +};
28488 +
28489 +unsigned long *phys_to_machine_mapping;
28490 +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
28491 +
28492 +EXPORT_SYMBOL(phys_to_machine_mapping);
28493 +
28494 +DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
28495 +DEFINE_PER_CPU(int, nr_multicall_ents);
28496 +
28497 +/* Raw start-of-day parameters from the hypervisor. */
28498 +start_info_t *xen_start_info;
28499 +EXPORT_SYMBOL(xen_start_info);
28500 +#endif
28501 +
28502 +/*
28503 + * Machine setup..
28504 + */
28505 +
28506 +struct cpuinfo_x86 boot_cpu_data __read_mostly;
28507 +EXPORT_SYMBOL(boot_cpu_data);
28508 +
28509 +unsigned long mmu_cr4_features;
28510 +
28511 +int acpi_disabled;
28512 +EXPORT_SYMBOL(acpi_disabled);
28513 +#ifdef CONFIG_ACPI
28514 +extern int __initdata acpi_ht;
28515 +extern acpi_interrupt_flags acpi_sci_flags;
28516 +int __initdata acpi_force = 0;
28517 +#endif
28518 +
28519 +int acpi_numa __initdata;
28520 +
28521 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
28522 +int bootloader_type;
28523 +
28524 +unsigned long saved_video_mode;
28525 +
28526 +/*
28527 + * Early DMI memory
28528 + */
28529 +int dmi_alloc_index;
28530 +char dmi_alloc_data[DMI_MAX_DATA];
28531 +
28532 +/*
28533 + * Setup options
28534 + */
28535 +struct screen_info screen_info;
28536 +EXPORT_SYMBOL(screen_info);
28537 +struct sys_desc_table_struct {
28538 + unsigned short length;
28539 + unsigned char table[0];
28540 +};
28541 +
28542 +struct edid_info edid_info;
28543 +EXPORT_SYMBOL_GPL(edid_info);
28544 +struct e820map e820;
28545 +#ifdef CONFIG_XEN
28546 +struct e820map machine_e820;
28547 +#endif
28548 +
28549 +extern int root_mountflags;
28550 +
28551 +char command_line[COMMAND_LINE_SIZE];
28552 +
28553 +struct resource standard_io_resources[] = {
28554 + { .name = "dma1", .start = 0x00, .end = 0x1f,
28555 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28556 + { .name = "pic1", .start = 0x20, .end = 0x21,
28557 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28558 + { .name = "timer0", .start = 0x40, .end = 0x43,
28559 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28560 + { .name = "timer1", .start = 0x50, .end = 0x53,
28561 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28562 + { .name = "keyboard", .start = 0x60, .end = 0x6f,
28563 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28564 + { .name = "dma page reg", .start = 0x80, .end = 0x8f,
28565 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28566 + { .name = "pic2", .start = 0xa0, .end = 0xa1,
28567 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28568 + { .name = "dma2", .start = 0xc0, .end = 0xdf,
28569 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
28570 + { .name = "fpu", .start = 0xf0, .end = 0xff,
28571 + .flags = IORESOURCE_BUSY | IORESOURCE_IO }
28572 +};
28573 +
28574 +#define STANDARD_IO_RESOURCES \
28575 + (sizeof standard_io_resources / sizeof standard_io_resources[0])
28576 +
28577 +#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
28578 +
28579 +struct resource data_resource = {
28580 + .name = "Kernel data",
28581 + .start = 0,
28582 + .end = 0,
28583 + .flags = IORESOURCE_RAM,
28584 +};
28585 +struct resource code_resource = {
28586 + .name = "Kernel code",
28587 + .start = 0,
28588 + .end = 0,
28589 + .flags = IORESOURCE_RAM,
28590 +};
28591 +
28592 +#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
28593 +
28594 +static struct resource system_rom_resource = {
28595 + .name = "System ROM",
28596 + .start = 0xf0000,
28597 + .end = 0xfffff,
28598 + .flags = IORESOURCE_ROM,
28599 +};
28600 +
28601 +static struct resource extension_rom_resource = {
28602 + .name = "Extension ROM",
28603 + .start = 0xe0000,
28604 + .end = 0xeffff,
28605 + .flags = IORESOURCE_ROM,
28606 +};
28607 +
28608 +static struct resource adapter_rom_resources[] = {
28609 + { .name = "Adapter ROM", .start = 0xc8000, .end = 0,
28610 + .flags = IORESOURCE_ROM },
28611 + { .name = "Adapter ROM", .start = 0, .end = 0,
28612 + .flags = IORESOURCE_ROM },
28613 + { .name = "Adapter ROM", .start = 0, .end = 0,
28614 + .flags = IORESOURCE_ROM },
28615 + { .name = "Adapter ROM", .start = 0, .end = 0,
28616 + .flags = IORESOURCE_ROM },
28617 + { .name = "Adapter ROM", .start = 0, .end = 0,
28618 + .flags = IORESOURCE_ROM },
28619 + { .name = "Adapter ROM", .start = 0, .end = 0,
28620 + .flags = IORESOURCE_ROM }
28621 +};
28622 +
28623 +#define ADAPTER_ROM_RESOURCES \
28624 + (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
28625 +
28626 +static struct resource video_rom_resource = {
28627 + .name = "Video ROM",
28628 + .start = 0xc0000,
28629 + .end = 0xc7fff,
28630 + .flags = IORESOURCE_ROM,
28631 +};
28632 +
28633 +static struct resource video_ram_resource = {
28634 + .name = "Video RAM area",
28635 + .start = 0xa0000,
28636 + .end = 0xbffff,
28637 + .flags = IORESOURCE_RAM,
28638 +};
28639 +
28640 +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
28641 +
28642 +static int __init romchecksum(unsigned char *rom, unsigned long length)
28643 +{
28644 + unsigned char *p, sum = 0;
28645 +
28646 + for (p = rom; p < rom + length; p++)
28647 + sum += *p;
28648 + return sum == 0;
28649 +}
28650 +
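probe_roms() below accepts an option ROM when it starts with the signature word 0xaa55 (the bytes 0x55 0xAA on little-endian x86), takes its length from byte 2 in 512-byte units, and requires all bytes of the image to sum to zero modulo 256, which is what romchecksum() above verifies. A self-contained sketch over a synthetic ROM image (the buffer contents and checksum fix-up are illustrative):

#include <stdio.h>
#include <string.h>

/* Combined signature + checksum test in the spirit of romsignature()
 * and romchecksum() above. */
static int rom_ok(const unsigned char *rom, unsigned long len)
{
	unsigned char sum = 0;
	unsigned long i;

	if (rom[0] != 0x55 || rom[1] != 0xAA)
		return 0;			/* missing signature */
	for (i = 0; i < len; i++)
		sum += rom[i];
	return sum == 0;			/* checksum must be zero */
}

int main(void)
{
	unsigned char rom[512];
	unsigned char sum = 0;
	int i;

	memset(rom, 0, sizeof(rom));
	rom[0] = 0x55;
	rom[1] = 0xAA;
	rom[2] = 1;				/* length = 1 * 512 bytes */
	for (i = 0; i < 511; i++)
		sum += rom[i];
	rom[511] = (unsigned char)-sum;		/* fix up checksum byte */

	printf("rom valid: %d\n", rom_ok(rom, rom[2] * 512));
	return 0;
}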
28651 +static void __init probe_roms(void)
28652 +{
28653 + unsigned long start, length, upper;
28654 + unsigned char *rom;
28655 + int i;
28656 +
28657 +#ifdef CONFIG_XEN
28658 + /* Nothing to do if not running in dom0. */
28659 + if (!is_initial_xendomain())
28660 + return;
28661 +#endif
28662 +
28663 + /* video rom */
28664 + upper = adapter_rom_resources[0].start;
28665 + for (start = video_rom_resource.start; start < upper; start += 2048) {
28666 + rom = isa_bus_to_virt(start);
28667 + if (!romsignature(rom))
28668 + continue;
28669 +
28670 + video_rom_resource.start = start;
28671 +
28672 + /* 0 < length <= 0x7f * 512, historically */
28673 + length = rom[2] * 512;
28674 +
28675 + /* if checksum okay, trust length byte */
28676 + if (length && romchecksum(rom, length))
28677 + video_rom_resource.end = start + length - 1;
28678 +
28679 + request_resource(&iomem_resource, &video_rom_resource);
28680 + break;
28681 + }
28682 +
28683 + start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
28684 + if (start < upper)
28685 + start = upper;
28686 +
28687 + /* system rom */
28688 + request_resource(&iomem_resource, &system_rom_resource);
28689 + upper = system_rom_resource.start;
28690 +
28691 + /* check for extension rom (ignore length byte!) */
28692 + rom = isa_bus_to_virt(extension_rom_resource.start);
28693 + if (romsignature(rom)) {
28694 + length = extension_rom_resource.end - extension_rom_resource.start + 1;
28695 + if (romchecksum(rom, length)) {
28696 + request_resource(&iomem_resource, &extension_rom_resource);
28697 + upper = extension_rom_resource.start;
28698 + }
28699 + }
28700 +
28701 + /* check for adapter roms on 2k boundaries */
28702 + for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
28703 + rom = isa_bus_to_virt(start);
28704 + if (!romsignature(rom))
28705 + continue;
28706 +
28707 + /* 0 < length <= 0x7f * 512, historically */
28708 + length = rom[2] * 512;
28709 +
28710 + /* but accept any length that fits if checksum okay */
28711 + if (!length || start + length > upper || !romchecksum(rom, length))
28712 + continue;
28713 +
28714 + adapter_rom_resources[i].start = start;
28715 + adapter_rom_resources[i].end = start + length - 1;
28716 + request_resource(&iomem_resource, &adapter_rom_resources[i]);
28717 +
28718 + start = adapter_rom_resources[i++].end & ~2047UL;
28719 + }
28720 +}
28721 +
28722 +/* Check for full argument with no trailing characters */
28723 +static int fullarg(char *p, char *arg)
28724 +{
28725 + int l = strlen(arg);
28726 + return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l]));
28727 +}
28728 +
28729 +static __init void parse_cmdline_early (char ** cmdline_p)
28730 +{
28731 + char c = ' ', *to = command_line, *from = COMMAND_LINE;
28732 + int len = 0;
28733 + int userdef = 0;
28734 +
28735 + for (;;) {
28736 + if (c != ' ')
28737 + goto next_char;
28738 +
28739 +#ifdef CONFIG_SMP
28740 + /*
28741 + * If the BIOS enumerates physical processors before logical,
28742 + * maxcpus=N at enumeration-time can be used to disable HT.
28743 + */
28744 + else if (!memcmp(from, "maxcpus=", 8)) {
28745 + extern unsigned int maxcpus;
28746 +
28747 + maxcpus = simple_strtoul(from + 8, NULL, 0);
28748 + }
28749 +#endif
28750 +#ifdef CONFIG_ACPI
28751 + /* "acpi=off" disables both ACPI table parsing and interpreter init */
28752 + if (fullarg(from,"acpi=off"))
28753 + disable_acpi();
28754 +
28755 + if (fullarg(from, "acpi=force")) {
28756 + /* add later when we do DMI horrors: */
28757 + acpi_force = 1;
28758 + acpi_disabled = 0;
28759 + }
28760 +
28761 + /* acpi=ht just means: do ACPI MADT parsing
28762 + at bootup, but don't enable the full ACPI interpreter */
28763 + if (fullarg(from, "acpi=ht")) {
28764 + if (!acpi_force)
28765 + disable_acpi();
28766 + acpi_ht = 1;
28767 + }
28768 + else if (fullarg(from, "pci=noacpi"))
28769 + acpi_disable_pci();
28770 + else if (fullarg(from, "acpi=noirq"))
28771 + acpi_noirq_set();
28772 +
28773 + else if (fullarg(from, "acpi_sci=edge"))
28774 + acpi_sci_flags.trigger = 1;
28775 + else if (fullarg(from, "acpi_sci=level"))
28776 + acpi_sci_flags.trigger = 3;
28777 + else if (fullarg(from, "acpi_sci=high"))
28778 + acpi_sci_flags.polarity = 1;
28779 + else if (fullarg(from, "acpi_sci=low"))
28780 + acpi_sci_flags.polarity = 3;
28781 +
28782 + /* acpi=strict disables out-of-spec workarounds */
28783 + else if (fullarg(from, "acpi=strict")) {
28784 + acpi_strict = 1;
28785 + }
28786 +#ifdef CONFIG_X86_IO_APIC
28787 + else if (fullarg(from, "acpi_skip_timer_override"))
28788 + acpi_skip_timer_override = 1;
28789 +#endif
28790 +#endif
28791 +
28792 +#ifndef CONFIG_XEN
28793 + if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) {
28794 + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
28795 + disable_apic = 1;
28796 + }
28797 +
28798 + if (fullarg(from, "noapic"))
28799 + skip_ioapic_setup = 1;
28800 +
28801 + if (fullarg(from,"apic")) {
28802 + skip_ioapic_setup = 0;
28803 + ioapic_force = 1;
28804 + }
28805 +#endif
28806 +
28807 + if (!memcmp(from, "mem=", 4))
28808 + parse_memopt(from+4, &from);
28809 +
28810 + if (!memcmp(from, "memmap=", 7)) {
28811 + /* exactmap option is for user-defined memory */
28812 + if (!memcmp(from+7, "exactmap", 8)) {
28813 +#ifdef CONFIG_CRASH_DUMP
28814 + /* If we are doing a crash dump, we
28815 + * still need to know the real memory
28816 + * size before the original memory map is
28817 + * reset.
28818 + */
28819 + saved_max_pfn = e820_end_of_ram();
28820 +#endif
28821 + from += 8+7;
28822 + end_pfn_map = 0;
28823 + e820.nr_map = 0;
28824 + userdef = 1;
28825 + }
28826 + else {
28827 + parse_memmapopt(from+7, &from);
28828 + userdef = 1;
28829 + }
28830 + }
28831 +
28832 +#ifdef CONFIG_NUMA
28833 + if (!memcmp(from, "numa=", 5))
28834 + numa_setup(from+5);
28835 +#endif
28836 +
28837 + if (!memcmp(from,"iommu=",6)) {
28838 + iommu_setup(from+6);
28839 + }
28840 +
28841 + if (fullarg(from,"oops=panic"))
28842 + panic_on_oops = 1;
28843 +
28844 + if (!memcmp(from, "noexec=", 7))
28845 + nonx_setup(from + 7);
28846 +
28847 +#ifdef CONFIG_KEXEC
28848 + /* crashkernel=size@addr specifies the location to reserve for
28849 + * a crash kernel. By reserving this memory we guarantee
28850 + * that Linux never sets it up as a DMA target.
28851 + * Useful for holding code to do something appropriate
28852 + * after a kernel panic.
28853 + */
28854 + else if (!memcmp(from, "crashkernel=", 12)) {
28855 +#ifndef CONFIG_XEN
28856 + unsigned long size, base;
28857 + size = memparse(from+12, &from);
28858 + if (*from == '@') {
28859 + base = memparse(from+1, &from);
28860 + /* FIXME: Do I want a sanity check
28861 + * to validate the memory range?
28862 + */
28863 + crashk_res.start = base;
28864 + crashk_res.end = base + size - 1;
28865 + }
28866 +#else
28867 + printk("Ignoring crashkernel command line, "
28868 + "parameter will be supplied by xen\n");
28869 +#endif
28870 + }
28871 +#endif
28872 +
28873 +#ifdef CONFIG_PROC_VMCORE
28874 + /* elfcorehdr= specifies the location of the ELF core header
28875 + * stored by the crashed kernel. This option is passed
28876 + * by the kexec loader to the capture kernel.
28877 + */
28878 + else if(!memcmp(from, "elfcorehdr=", 11))
28879 + elfcorehdr_addr = memparse(from+11, &from);
28880 +#endif
28881 +
28882 +#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
28883 + else if (!memcmp(from, "additional_cpus=", 16))
28884 + setup_additional_cpus(from+16);
28885 +#endif
28886 +
28887 + next_char:
28888 + c = *(from++);
28889 + if (!c)
28890 + break;
28891 + if (COMMAND_LINE_SIZE <= ++len)
28892 + break;
28893 + *(to++) = c;
28894 + }
28895 + if (userdef) {
28896 + printk(KERN_INFO "user-defined physical RAM map:\n");
28897 + e820_print_map("user");
28898 + }
28899 + *to = '\0';
28900 + *cmdline_p = command_line;
28901 +}
28902 +
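Several of the options handled above (mem=, memmap=, crashkernel=) lean on memparse(), which reads a number with an optional K/M/G suffix and advances the parse cursor. A rough user-space approximation of how a crashkernel=size@addr value is split; the strtoull-based parse_size() merely stands in for the kernel's memparse(), and the "64M@16M" string is a made-up example:

    #include <stdio.h>
    #include <stdlib.h>

    /* Parse a number with an optional K/M/G suffix, advancing *cur. */
    static unsigned long long parse_size(const char *s, const char **cur)
    {
        char *end;
        unsigned long long v = strtoull(s, &end, 0);

        switch (*end) {
        case 'G': case 'g': v <<= 10;   /* fall through */
        case 'M': case 'm': v <<= 10;   /* fall through */
        case 'K': case 'k': v <<= 10; end++; break;
        }
        *cur = end;
        return v;
    }

    int main(void)
    {
        const char *arg = "64M@16M";    /* hypothetical crashkernel= value */
        const char *p;
        unsigned long long size = parse_size(arg, &p), base = 0;

        if (*p == '@')                  /* optional @addr part */
            base = parse_size(p + 1, &p);
        printf("size=%llu base=%llu\n", size, base);
        return 0;
    }
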
28903 +#ifndef CONFIG_NUMA
28904 +static void __init
28905 +contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
28906 +{
28907 + unsigned long bootmap_size, bootmap;
28908 +
28909 + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
28910 + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
28911 + if (bootmap == -1L)
28912 + panic("Cannot find bootmem map of size %ld\n",bootmap_size);
28913 + bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
28914 +#ifdef CONFIG_XEN
28915 + e820_bootmem_free(NODE_DATA(0), 0, xen_start_info->nr_pages<<PAGE_SHIFT);
28916 +#else
28917 + e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
28918 +#endif
28919 + reserve_bootmem(bootmap, bootmap_size);
28920 +}
28921 +#endif
28922 +
28923 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
28924 +struct edd edd;
28925 +#ifdef CONFIG_EDD_MODULE
28926 +EXPORT_SYMBOL(edd);
28927 +#endif
28928 +#ifndef CONFIG_XEN
28929 +/**
28930 + * copy_edd() - Copy the BIOS EDD information
28931 + * from boot_params into a safe place.
28932 + *
28933 + */
28934 +static inline void copy_edd(void)
28935 +{
28936 + memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
28937 + memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
28938 + edd.mbr_signature_nr = EDD_MBR_SIG_NR;
28939 + edd.edd_info_nr = EDD_NR;
28940 +}
28941 +#endif
28942 +#else
28943 +static inline void copy_edd(void)
28944 +{
28945 +}
28946 +#endif
28947 +
28948 +#ifndef CONFIG_XEN
28949 +#define EBDA_ADDR_POINTER 0x40E
28950 +
28951 +unsigned __initdata ebda_addr;
28952 +unsigned __initdata ebda_size;
28953 +
28954 +static void discover_ebda(void)
28955 +{
28956 + /*
28957 + * there is a real-mode segmented pointer pointing to the
28958 + * 4K EBDA area at 0x40E
28959 + */
28960 + ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER;
28961 + ebda_addr <<= 4;
28962 +
28963 + ebda_size = *(unsigned short *)(unsigned long)ebda_addr;
28964 +
28965 + /* Round EBDA up to pages */
28966 + if (ebda_size == 0)
28967 + ebda_size = 1;
28968 + ebda_size <<= 10;
28969 + ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
28970 + if (ebda_size > 64*1024)
28971 + ebda_size = 64*1024;
28972 +}
28973 +#else
28974 +#define discover_ebda() ((void)0)
28975 +#endif
28976 +
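discover_ebda() above turns the real-mode segment word stored at 0x40E into a physical address (segment << 4), reads the EBDA size in KiB from the first word of that area, rounds the region up to whole pages and caps it at 64 KiB. The same arithmetic in isolation, using made-up input values and an assumed 4096-byte page size:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    int main(void)
    {
        unsigned long ebda_seg  = 0x9fc0;        /* hypothetical word at 0x40E */
        unsigned long ebda_kib  = 1;             /* hypothetical first word of the EBDA */
        unsigned long ebda_addr = ebda_seg << 4; /* 0x9fc00 */
        unsigned long ebda_size = (ebda_kib ? ebda_kib : 1) << 10;

        /* round up to whole pages, counting the offset within the first page */
        ebda_size = (ebda_size + (ebda_addr & ~PAGE_MASK) + PAGE_SIZE - 1) & PAGE_MASK;
        if (ebda_size > 64 * 1024)
            ebda_size = 64 * 1024;

        printf("EBDA at %#lx, reserving %lu bytes\n", ebda_addr, ebda_size);
        return 0;
    }
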
28977 +void __init setup_arch(char **cmdline_p)
28978 +{
28979 +#ifdef CONFIG_XEN
28980 + /* Register a call for panic conditions. */
28981 + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
28982 +
28983 + ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
28984 + screen_info = SCREEN_INFO;
28985 +
28986 + if (is_initial_xendomain()) {
28987 + const struct dom0_vga_console_info *info =
28988 + (void *)((char *)xen_start_info +
28989 + xen_start_info->console.dom0.info_off);
28990 +
28991 + dom0_init_screen_info(info,
28992 + xen_start_info->console.dom0.info_size);
28993 + xen_start_info->console.domU.mfn = 0;
28994 + xen_start_info->console.domU.evtchn = 0;
28995 + } else
28996 + screen_info.orig_video_isVGA = 0;
28997 +
28998 + copy_edid();
28999 +
29000 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
29001 + VMASST_TYPE_writable_pagetables));
29002 +
29003 + ARCH_SETUP
29004 +#else
29005 + ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
29006 + screen_info = SCREEN_INFO;
29007 + edid_info = EDID_INFO;
29008 +#endif /* !CONFIG_XEN */
29009 + saved_video_mode = SAVED_VIDEO_MODE;
29010 + bootloader_type = LOADER_TYPE;
29011 +
29012 +#ifdef CONFIG_BLK_DEV_RAM
29013 + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
29014 + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
29015 + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
29016 +#endif
29017 + setup_memory_region();
29018 + copy_edd();
29019 +
29020 + if (!MOUNT_ROOT_RDONLY)
29021 + root_mountflags &= ~MS_RDONLY;
29022 + init_mm.start_code = (unsigned long) &_text;
29023 + init_mm.end_code = (unsigned long) &_etext;
29024 + init_mm.end_data = (unsigned long) &_edata;
29025 + init_mm.brk = (unsigned long) &_end;
29026 +
29027 + code_resource.start = virt_to_phys(&_text);
29028 + code_resource.end = virt_to_phys(&_etext)-1;
29029 + data_resource.start = virt_to_phys(&_etext);
29030 + data_resource.end = virt_to_phys(&_edata)-1;
29031 +
29032 + parse_cmdline_early(cmdline_p);
29033 +
29034 + early_identify_cpu(&boot_cpu_data);
29035 +
29036 + /*
29037 + * partially used pages are not usable - thus
29038 + * we are rounding upwards:
29039 + */
29040 + end_pfn = e820_end_of_ram();
29041 + num_physpages = end_pfn; /* for pfn_valid */
29042 +
29043 + check_efer();
29044 +
29045 + discover_ebda();
29046 +
29047 + init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
29048 +
29049 + if (is_initial_xendomain())
29050 + dmi_scan_machine();
29051 +
29052 +#ifdef CONFIG_ACPI_NUMA
29053 + /*
29054 + * Parse SRAT to discover nodes.
29055 + */
29056 + acpi_numa_init();
29057 +#endif
29058 +
29059 +#ifdef CONFIG_NUMA
29060 + numa_initmem_init(0, end_pfn);
29061 +#else
29062 + contig_initmem_init(0, end_pfn);
29063 +#endif
29064 +
29065 +#ifdef CONFIG_XEN
29066 + /*
29067 + * Reserve kernel, physmap, start info, initial page tables, and
29068 + * direct mapping.
29069 + */
29070 + reserve_bootmem_generic(__pa_symbol(&_text),
29071 + (table_end << PAGE_SHIFT) - __pa_symbol(&_text));
29072 +#else
29073 + /* Reserve direct mapping */
29074 + reserve_bootmem_generic(table_start << PAGE_SHIFT,
29075 + (table_end - table_start) << PAGE_SHIFT);
29076 +
29077 + /* reserve kernel */
29078 + reserve_bootmem_generic(__pa_symbol(&_text),
29079 + __pa_symbol(&_end) - __pa_symbol(&_text));
29080 +
29081 + /*
29082 + * reserve physical page 0 - it's a special BIOS page on many boxes,
29083 + * enabling clean reboots, SMP operation, laptop functions.
29084 + */
29085 + reserve_bootmem_generic(0, PAGE_SIZE);
29086 +
29087 + /* reserve ebda region */
29088 + if (ebda_addr)
29089 + reserve_bootmem_generic(ebda_addr, ebda_size);
29090 +
29091 +#ifdef CONFIG_SMP
29092 + /*
29093 + * But first pinch a few for the stack/trampoline stuff
29094 + * FIXME: Don't need the extra page at 4K, but need to fix
29095 + * trampoline before removing it. (see the GDT stuff)
29096 + */
29097 + reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
29098 +
29099 + /* Reserve SMP trampoline */
29100 + reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
29101 +#endif
29102 +#endif
29103 +
29104 +#ifdef CONFIG_ACPI_SLEEP
29105 + /*
29106 + * Reserve low memory region for sleep support.
29107 + */
29108 + acpi_reserve_bootmem();
29109 +#endif
29110 +#ifdef CONFIG_XEN
29111 +#ifdef CONFIG_BLK_DEV_INITRD
29112 + if (xen_start_info->mod_start) {
29113 + if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
29114 + /*reserve_bootmem_generic(INITRD_START, INITRD_SIZE);*/
29115 + initrd_start = INITRD_START + PAGE_OFFSET;
29116 + initrd_end = initrd_start+INITRD_SIZE;
29117 + initrd_below_start_ok = 1;
29118 + } else {
29119 + printk(KERN_ERR "initrd extends beyond end of memory "
29120 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
29121 + (unsigned long)(INITRD_START + INITRD_SIZE),
29122 + (unsigned long)(end_pfn << PAGE_SHIFT));
29123 + initrd_start = 0;
29124 + }
29125 + }
29126 +#endif
29127 +#else /* CONFIG_XEN */
29128 +#ifdef CONFIG_BLK_DEV_INITRD
29129 + if (LOADER_TYPE && INITRD_START) {
29130 + if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
29131 + reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
29132 + initrd_start =
29133 + INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
29134 + initrd_end = initrd_start+INITRD_SIZE;
29135 + }
29136 + else {
29137 + printk(KERN_ERR "initrd extends beyond end of memory "
29138 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
29139 + (unsigned long)(INITRD_START + INITRD_SIZE),
29140 + (unsigned long)(end_pfn << PAGE_SHIFT));
29141 + initrd_start = 0;
29142 + }
29143 + }
29144 +#endif
29145 +#endif /* !CONFIG_XEN */
29146 +#ifdef CONFIG_KEXEC
29147 +#ifdef CONFIG_XEN
29148 + xen_machine_kexec_setup_resources();
29149 +#else
29150 + if (crashk_res.start != crashk_res.end) {
29151 + reserve_bootmem_generic(crashk_res.start,
29152 + crashk_res.end - crashk_res.start + 1);
29153 + }
29154 +#endif
29155 +#endif
29156 +
29157 + paging_init();
29158 +#ifdef CONFIG_X86_LOCAL_APIC
29159 + /*
29160 + * Find and reserve possible boot-time SMP configuration:
29161 + */
29162 + find_smp_config();
29163 +#endif
29164 +#ifdef CONFIG_XEN
29165 + {
29166 + int i, j, k, fpp;
29167 + unsigned long p2m_pages;
29168 +
29169 + p2m_pages = end_pfn;
29170 + if (xen_start_info->nr_pages > end_pfn) {
29171 + /*
29172 + * the end_pfn was shrunk (probably by mem= or highmem=
29173 + * kernel parameter); shrink reservation with the HV
29174 + */
29175 + struct xen_memory_reservation reservation = {
29176 + .address_bits = 0,
29177 + .extent_order = 0,
29178 + .domid = DOMID_SELF
29179 + };
29180 + unsigned int difference;
29181 + int ret;
29182 +
29183 + difference = xen_start_info->nr_pages - end_pfn;
29184 +
29185 + set_xen_guest_handle(reservation.extent_start,
29186 + ((unsigned long *)xen_start_info->mfn_list) + end_pfn);
29187 + reservation.nr_extents = difference;
29188 + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
29189 + &reservation);
29190 + BUG_ON (ret != difference);
29191 + }
29192 + else if (end_pfn > xen_start_info->nr_pages)
29193 + p2m_pages = xen_start_info->nr_pages;
29194 +
29195 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
29196 + /* Make sure we have a large enough P->M table. */
29197 + phys_to_machine_mapping = alloc_bootmem_pages(
29198 + end_pfn * sizeof(unsigned long));
29199 + memset(phys_to_machine_mapping, ~0,
29200 + end_pfn * sizeof(unsigned long));
29201 + memcpy(phys_to_machine_mapping,
29202 + (unsigned long *)xen_start_info->mfn_list,
29203 + p2m_pages * sizeof(unsigned long));
29204 + free_bootmem(
29205 + __pa(xen_start_info->mfn_list),
29206 + PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
29207 + sizeof(unsigned long))));
29208 +
29209 + /*
29210 + * Initialise the list of frames that points to the list of
29211 + * frames making up the p2m table. Used by
29212 + * save/restore.
29213 + */
29214 + pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
29215 +
29216 + fpp = PAGE_SIZE/sizeof(unsigned long);
29217 + for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
29218 + if ((j % fpp) == 0) {
29219 + k++;
29220 + BUG_ON(k>=fpp);
29221 + pfn_to_mfn_frame_list[k] =
29222 + alloc_bootmem_pages(PAGE_SIZE);
29223 + pfn_to_mfn_frame_list_list[k] =
29224 + virt_to_mfn(pfn_to_mfn_frame_list[k]);
29225 + j=0;
29226 + }
29227 + pfn_to_mfn_frame_list[k][j] =
29228 + virt_to_mfn(&phys_to_machine_mapping[i]);
29229 + }
29230 + HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
29231 + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
29232 + virt_to_mfn(pfn_to_mfn_frame_list_list);
29233 + }
29234 +
29235 + /* Mark all ISA DMA channels in-use - using them wouldn't work. */
29236 + for (i = 0; i < MAX_DMA_CHANNELS; ++i)
29237 + if (i != 4 && request_dma(i, "xen") != 0)
29238 + BUG();
29239 + }
29240 +
29241 + if (!is_initial_xendomain()) {
29242 + acpi_disabled = 1;
29243 +#ifdef CONFIG_ACPI
29244 + acpi_ht = 0;
29245 +#endif
29246 + }
29247 +#endif
29248 +
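The p2m bookkeeping set up in the CONFIG_XEN block above is a two-level structure: each page of phys_to_machine_mapping covers fpp = PAGE_SIZE/sizeof(unsigned long) pfns, each pfn_to_mfn_frame_list page holds the machine frames of fpp such p2m pages, and the single frame-list-list page holds the frames of those list pages. A small sketch of just the index arithmetic (8-byte entries and 4096-byte pages assumed; this is one reading of the loop above, not code from the patch):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define FPP (PAGE_SIZE / sizeof(unsigned long))   /* 512 on x86-64 */

    int main(void)
    {
        unsigned long pfn = 1000000;              /* hypothetical guest pfn      */

        unsigned long p2m_page  = pfn / FPP;      /* which p2m table page        */
        unsigned long list_page = p2m_page / FPP; /* which frame-list page       */
        unsigned long list_slot = p2m_page % FPP; /* slot within that list page  */

        printf("pfn %lu -> p2m page %lu (frame-list page %lu, slot %lu)\n",
               pfn, p2m_page, list_page, list_slot);
        return 0;
    }
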
29249 +#ifndef CONFIG_XEN
29250 + check_ioapic();
29251 +#endif
29252 +
29253 + zap_low_mappings(0);
29254 +
29255 + /*
29256 + * Set this early so we don't allocate cpu0
29257 + * if the MADT list doesn't list the BSP first.
29258 + * mpparse.c/MP_processor_info() allocates logical cpu numbers.
29259 + */
29260 + cpu_set(0, cpu_present_map);
29261 +#ifdef CONFIG_ACPI
29262 + /*
29263 + * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
29264 + * Call this early for SRAT node setup.
29265 + */
29266 + acpi_boot_table_init();
29267 +
29268 + /*
29269 + * Read APIC and some other early information from ACPI tables.
29270 + */
29271 + acpi_boot_init();
29272 +#endif
29273 +
29274 + init_cpu_to_node();
29275 +
29276 +#ifdef CONFIG_X86_LOCAL_APIC
29277 + /*
29278 + * get boot-time SMP configuration:
29279 + */
29280 + if (smp_found_config)
29281 + get_smp_config();
29282 +#ifndef CONFIG_XEN
29283 + init_apic_mappings();
29284 +#endif
29285 +#endif
29286 +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
29287 + prefill_possible_map();
29288 +#endif
29289 +
29290 + /*
29291 + * Request address space for all standard RAM and ROM resources
29292 + * and also for regions reported as reserved by the e820.
29293 + */
29294 + probe_roms();
29295 +#ifdef CONFIG_XEN
29296 + if (is_initial_xendomain())
29297 + e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
29298 +#else
29299 + e820_reserve_resources(e820.map, e820.nr_map);
29300 +#endif
29301 +
29302 + request_resource(&iomem_resource, &video_ram_resource);
29303 +
29304 + {
29305 + unsigned i;
29306 + /* request I/O space for devices used on all i[345]86 PCs */
29307 + for (i = 0; i < STANDARD_IO_RESOURCES; i++)
29308 + request_resource(&ioport_resource, &standard_io_resources[i]);
29309 + }
29310 +
29311 +#ifdef CONFIG_XEN
29312 + if (is_initial_xendomain())
29313 + e820_setup_gap(machine_e820.map, machine_e820.nr_map);
29314 +#else
29315 + e820_setup_gap(e820.map, e820.nr_map);
29316 +#endif
29317 +
29318 +#ifdef CONFIG_XEN
29319 + {
29320 + struct physdev_set_iopl set_iopl;
29321 +
29322 + set_iopl.iopl = 1;
29323 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
29324 +
29325 + if (is_initial_xendomain()) {
29326 +#ifdef CONFIG_VT
29327 +#if defined(CONFIG_VGA_CONSOLE)
29328 + conswitchp = &vga_con;
29329 +#elif defined(CONFIG_DUMMY_CONSOLE)
29330 + conswitchp = &dummy_con;
29331 +#endif
29332 +#endif
29333 + } else {
29334 +#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
29335 + conswitchp = &dummy_con;
29336 +#endif
29337 + }
29338 + }
29339 +#else /* CONFIG_XEN */
29340 +
29341 +#ifdef CONFIG_VT
29342 +#if defined(CONFIG_VGA_CONSOLE)
29343 + conswitchp = &vga_con;
29344 +#elif defined(CONFIG_DUMMY_CONSOLE)
29345 + conswitchp = &dummy_con;
29346 +#endif
29347 +#endif
29348 +
29349 +#endif /* !CONFIG_XEN */
29350 +}
29351 +
29352 +#ifdef CONFIG_XEN
29353 +static int
29354 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
29355 +{
29356 + HYPERVISOR_shutdown(SHUTDOWN_crash);
29357 + /* we're never actually going to get here... */
29358 + return NOTIFY_DONE;
29359 +}
29360 +#endif /* CONFIG_XEN */
29361 +
29362 +
29363 +static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
29364 +{
29365 + unsigned int *v;
29366 +
29367 + if (c->extended_cpuid_level < 0x80000004)
29368 + return 0;
29369 +
29370 + v = (unsigned int *) c->x86_model_id;
29371 + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
29372 + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
29373 + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
29374 + c->x86_model_id[48] = 0;
29375 + return 1;
29376 +}
29377 +
29378 +
29379 +static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
29380 +{
29381 + unsigned int n, dummy, eax, ebx, ecx, edx;
29382 +
29383 + n = c->extended_cpuid_level;
29384 +
29385 + if (n >= 0x80000005) {
29386 + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
29387 + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
29388 + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
29389 + c->x86_cache_size=(ecx>>24)+(edx>>24);
29390 + /* On K8 L1 TLB is inclusive, so don't count it */
29391 + c->x86_tlbsize = 0;
29392 + }
29393 +
29394 + if (n >= 0x80000006) {
29395 + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
29396 + ecx = cpuid_ecx(0x80000006);
29397 + c->x86_cache_size = ecx >> 16;
29398 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
29399 +
29400 + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
29401 + c->x86_cache_size, ecx & 0xFF);
29402 + }
29403 +
29404 + if (n >= 0x80000007)
29405 + cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
29406 + if (n >= 0x80000008) {
29407 + cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
29408 + c->x86_virt_bits = (eax >> 8) & 0xff;
29409 + c->x86_phys_bits = eax & 0xff;
29410 + }
29411 +}
29412 +
29413 +#ifdef CONFIG_NUMA
29414 +static int nearby_node(int apicid)
29415 +{
29416 + int i;
29417 + for (i = apicid - 1; i >= 0; i--) {
29418 + int node = apicid_to_node[i];
29419 + if (node != NUMA_NO_NODE && node_online(node))
29420 + return node;
29421 + }
29422 + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
29423 + int node = apicid_to_node[i];
29424 + if (node != NUMA_NO_NODE && node_online(node))
29425 + return node;
29426 + }
29427 + return first_node(node_online_map); /* Shouldn't happen */
29428 +}
29429 +#endif
29430 +
29431 +/*
29432 + * On an AMD dual-core setup the lower bits of the APIC id distinguish the cores.
29433 + * Assumes the number of cores is a power of two.
29434 + */
29435 +static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
29436 +{
29437 +#ifdef CONFIG_SMP
29438 + unsigned bits;
29439 +#ifdef CONFIG_NUMA
29440 + int cpu = smp_processor_id();
29441 + int node = 0;
29442 + unsigned apicid = hard_smp_processor_id();
29443 +#endif
29444 + unsigned ecx = cpuid_ecx(0x80000008);
29445 +
29446 + c->x86_max_cores = (ecx & 0xff) + 1;
29447 +
29448 + /* CPU telling us the core id bits shift? */
29449 + bits = (ecx >> 12) & 0xF;
29450 +
29451 + /* Otherwise recompute */
29452 + if (bits == 0) {
29453 + while ((1 << bits) < c->x86_max_cores)
29454 + bits++;
29455 + }
29456 +
29457 + /* Low order bits define the core id (index of core in socket) */
29458 + c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
29459 + /* Convert the APIC ID into the socket ID */
29460 + c->phys_proc_id = phys_pkg_id(bits);
29461 +
29462 +#ifdef CONFIG_NUMA
29463 + node = c->phys_proc_id;
29464 + if (apicid_to_node[apicid] != NUMA_NO_NODE)
29465 + node = apicid_to_node[apicid];
29466 + if (!node_online(node)) {
29467 + /* Two possibilities here:
29468 + - The CPU is missing memory and no node was created.
29469 + In that case try picking one from a nearby CPU
29470 + - The APIC IDs differ from the HyperTransport node IDs
29471 + which the K8 northbridge parsing fills in.
29472 + Assume they are all increased by a constant offset,
29473 + but in the same order as the HT nodeids.
29474 + If that doesn't result in a usable node fall back to the
29475 + path for the previous case. */
29476 + int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits);
29477 + if (ht_nodeid >= 0 &&
29478 + apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
29479 + node = apicid_to_node[ht_nodeid];
29480 + /* Pick a nearby node */
29481 + if (!node_online(node))
29482 + node = nearby_node(apicid);
29483 + }
29484 + numa_set_node(cpu, node);
29485 +
29486 + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
29487 +#endif
29488 +#endif
29489 +}
29490 +
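amd_detect_cmp() above takes the core count and an optional core-id bit width from CPUID 0x80000008 ECX, falls back to rounding the core count up to a power of two when the width field is zero, and then splits the initial APIC id into a core id (low bits) and a socket id (remaining bits). A standalone sketch of that bit slicing with made-up register values:

    #include <stdio.h>

    int main(void)
    {
        unsigned ecx    = 0x00000003;   /* hypothetical CPUID 0x80000008 ECX */
        unsigned apicid = 0x05;         /* hypothetical initial APIC id      */

        unsigned max_cores = (ecx & 0xff) + 1;   /* 4 cores per package      */
        unsigned bits = (ecx >> 12) & 0xf;       /* core-id bit width hint   */

        if (bits == 0)                           /* fall back: ceil(log2)    */
            while ((1u << bits) < max_cores)
                bits++;

        unsigned core_id = apicid & ((1u << bits) - 1); /* low bits: core    */
        unsigned pkg_id  = apicid >> bits;              /* rest: socket      */

        printf("cores=%u bits=%u core_id=%u pkg_id=%u\n",
               max_cores, bits, core_id, pkg_id);
        return 0;
    }
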
29491 +static void __init init_amd(struct cpuinfo_x86 *c)
29492 +{
29493 + unsigned level;
29494 +
29495 +#ifdef CONFIG_SMP
29496 + unsigned long value;
29497 +
29498 + /*
29499 + * Disable TLB flush filter by setting HWCR.FFDIS on K8
29500 + * bit 6 of msr C001_0015
29501 + *
29502 + * Errata 63 for SH-B3 steppings
29503 + * Errata 122 for all steppings (F+ have it disabled by default)
29504 + */
29505 + if (c->x86 == 15) {
29506 + rdmsrl(MSR_K8_HWCR, value);
29507 + value |= 1 << 6;
29508 + wrmsrl(MSR_K8_HWCR, value);
29509 + }
29510 +#endif
29511 +
29512 + /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
29513 + 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
29514 + clear_bit(0*32+31, &c->x86_capability);
29515 +
29516 + /* On C+ stepping K8 rep microcode works well for copy/memset */
29517 + level = cpuid_eax(1);
29518 + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
29519 + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
29520 +
29521 + /* Enable workaround for FXSAVE leak */
29522 + if (c->x86 >= 6)
29523 + set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
29524 +
29525 + level = get_model_name(c);
29526 + if (!level) {
29527 + switch (c->x86) {
29528 + case 15:
29529 + /* Should distinguish models here, but this is only
29530 + a fallback anyway. */
29531 + strcpy(c->x86_model_id, "Hammer");
29532 + break;
29533 + }
29534 + }
29535 + display_cacheinfo(c);
29536 +
29537 + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
29538 + if (c->x86_power & (1<<8))
29539 + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
29540 +
29541 + /* Multi core CPU? */
29542 + if (c->extended_cpuid_level >= 0x80000008)
29543 + amd_detect_cmp(c);
29544 +
29545 + /* Fix cpuid4 emulation for more */
29546 + num_cache_leaves = 3;
29547 +}
29548 +
29549 +static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
29550 +{
29551 +#ifdef CONFIG_SMP
29552 + u32 eax, ebx, ecx, edx;
29553 + int index_msb, core_bits;
29554 +
29555 + cpuid(1, &eax, &ebx, &ecx, &edx);
29556 +
29557 +
29558 + if (!cpu_has(c, X86_FEATURE_HT))
29559 + return;
29560 + if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
29561 + goto out;
29562 +
29563 + smp_num_siblings = (ebx & 0xff0000) >> 16;
29564 +
29565 + if (smp_num_siblings == 1) {
29566 + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
29567 + } else if (smp_num_siblings > 1 ) {
29568 +
29569 + if (smp_num_siblings > NR_CPUS) {
29570 + printk(KERN_WARNING "CPU: Unsupported number of siblings %d\n", smp_num_siblings);
29571 + smp_num_siblings = 1;
29572 + return;
29573 + }
29574 +
29575 + index_msb = get_count_order(smp_num_siblings);
29576 + c->phys_proc_id = phys_pkg_id(index_msb);
29577 +
29578 + smp_num_siblings = smp_num_siblings / c->x86_max_cores;
29579 +
29580 + index_msb = get_count_order(smp_num_siblings) ;
29581 +
29582 + core_bits = get_count_order(c->x86_max_cores);
29583 +
29584 + c->cpu_core_id = phys_pkg_id(index_msb) &
29585 + ((1 << core_bits) - 1);
29586 + }
29587 +out:
29588 + if ((c->x86_max_cores * smp_num_siblings) > 1) {
29589 + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
29590 + printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id);
29591 + }
29592 +
29593 +#endif
29594 +}
29595 +
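detect_ht() above uses get_count_order() — the number of bits needed to hold a count — to decide how many low APIC-id bits belong to the thread and core fields before slicing out phys_proc_id and cpu_core_id. A tiny illustration with made-up counts (8 threads and 4 cores per package, so 2 threads per core):

    #include <stdio.h>

    /* Smallest n with (1 << n) >= x, roughly what get_count_order() returns. */
    static unsigned count_order(unsigned x)
    {
        unsigned n = 0;
        while ((1u << n) < x)
            n++;
        return n;
    }

    int main(void)
    {
        unsigned siblings = 8, cores = 4;    /* hypothetical topology        */
        unsigned apicid   = 0x0b;            /* hypothetical initial APIC id */

        unsigned pkg_shift = count_order(siblings);            /* 3 */
        unsigned thr_bits  = count_order(siblings / cores);    /* 1 */
        unsigned core_bits = count_order(cores);               /* 2 */

        printf("package %u, core %u\n",
               apicid >> pkg_shift,
               (apicid >> thr_bits) & ((1u << core_bits) - 1));
        return 0;
    }
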
29596 +/*
29597 + * find out the number of processor cores on the die
29598 + */
29599 +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
29600 +{
29601 + unsigned int eax, t;
29602 +
29603 + if (c->cpuid_level < 4)
29604 + return 1;
29605 +
29606 + cpuid_count(4, 0, &eax, &t, &t, &t);
29607 +
29608 + if (eax & 0x1f)
29609 + return ((eax >> 26) + 1);
29610 + else
29611 + return 1;
29612 +}
29613 +
29614 +static void srat_detect_node(void)
29615 +{
29616 +#ifdef CONFIG_NUMA
29617 + unsigned node;
29618 + int cpu = smp_processor_id();
29619 + int apicid = hard_smp_processor_id();
29620 +
29621 + /* For now, don't do the funky fallback heuristics the AMD
29622 + version employs. */
29623 + node = apicid_to_node[apicid];
29624 + if (node == NUMA_NO_NODE)
29625 + node = first_node(node_online_map);
29626 + numa_set_node(cpu, node);
29627 +
29628 + if (acpi_numa > 0)
29629 + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
29630 +#endif
29631 +}
29632 +
29633 +static void __cpuinit init_intel(struct cpuinfo_x86 *c)
29634 +{
29635 + /* Cache sizes */
29636 + unsigned n;
29637 +
29638 + init_intel_cacheinfo(c);
29639 + if (c->cpuid_level > 9 ) {
29640 + unsigned eax = cpuid_eax(10);
29641 + /* Check for version and the number of counters */
29642 + if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
29643 + set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
29644 + }
29645 +
29646 + n = c->extended_cpuid_level;
29647 + if (n >= 0x80000008) {
29648 + unsigned eax = cpuid_eax(0x80000008);
29649 + c->x86_virt_bits = (eax >> 8) & 0xff;
29650 + c->x86_phys_bits = eax & 0xff;
29651 + /* CPUID workaround for Intel 0F34 CPU */
29652 + if (c->x86_vendor == X86_VENDOR_INTEL &&
29653 + c->x86 == 0xF && c->x86_model == 0x3 &&
29654 + c->x86_mask == 0x4)
29655 + c->x86_phys_bits = 36;
29656 + }
29657 +
29658 + if (c->x86 == 15)
29659 + c->x86_cache_alignment = c->x86_clflush_size * 2;
29660 + if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
29661 + (c->x86 == 0x6 && c->x86_model >= 0x0e))
29662 + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
29663 + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
29664 + c->x86_max_cores = intel_num_cpu_cores(c);
29665 +
29666 + srat_detect_node();
29667 +}
29668 +
29669 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
29670 +{
29671 + char *v = c->x86_vendor_id;
29672 +
29673 + if (!strcmp(v, "AuthenticAMD"))
29674 + c->x86_vendor = X86_VENDOR_AMD;
29675 + else if (!strcmp(v, "GenuineIntel"))
29676 + c->x86_vendor = X86_VENDOR_INTEL;
29677 + else
29678 + c->x86_vendor = X86_VENDOR_UNKNOWN;
29679 +}
29680 +
29681 +struct cpu_model_info {
29682 + int vendor;
29683 + int family;
29684 + char *model_names[16];
29685 +};
29686 +
29687 +/* Do some early cpuid on the boot CPU to get some parameters that are
29688 + needed before check_bugs. Everything advanced is in identify_cpu
29689 + below. */
29690 +void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
29691 +{
29692 + u32 tfms;
29693 +
29694 + c->loops_per_jiffy = loops_per_jiffy;
29695 + c->x86_cache_size = -1;
29696 + c->x86_vendor = X86_VENDOR_UNKNOWN;
29697 + c->x86_model = c->x86_mask = 0; /* So far unknown... */
29698 + c->x86_vendor_id[0] = '\0'; /* Unset */
29699 + c->x86_model_id[0] = '\0'; /* Unset */
29700 + c->x86_clflush_size = 64;
29701 + c->x86_cache_alignment = c->x86_clflush_size;
29702 + c->x86_max_cores = 1;
29703 + c->extended_cpuid_level = 0;
29704 + memset(&c->x86_capability, 0, sizeof c->x86_capability);
29705 +
29706 + /* Get vendor name */
29707 + cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
29708 + (unsigned int *)&c->x86_vendor_id[0],
29709 + (unsigned int *)&c->x86_vendor_id[8],
29710 + (unsigned int *)&c->x86_vendor_id[4]);
29711 +
29712 + get_cpu_vendor(c);
29713 +
29714 + /* Initialize the standard set of capabilities */
29715 + /* Note that the vendor-specific code below might override */
29716 +
29717 + /* Intel-defined flags: level 0x00000001 */
29718 + if (c->cpuid_level >= 0x00000001) {
29719 + __u32 misc;
29720 + cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
29721 + &c->x86_capability[0]);
29722 + c->x86 = (tfms >> 8) & 0xf;
29723 + c->x86_model = (tfms >> 4) & 0xf;
29724 + c->x86_mask = tfms & 0xf;
29725 + if (c->x86 == 0xf)
29726 + c->x86 += (tfms >> 20) & 0xff;
29727 + if (c->x86 >= 0x6)
29728 + c->x86_model += ((tfms >> 16) & 0xF) << 4;
29729 + if (c->x86_capability[0] & (1<<19))
29730 + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
29731 + } else {
29732 + /* Have CPUID level 0 only - unheard of */
29733 + c->x86 = 4;
29734 + }
29735 +
29736 +#ifdef CONFIG_SMP
29737 + c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
29738 +#endif
29739 +}
29740 +
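early_identify_cpu() above decodes the CPUID leaf 1 signature in EAX: bits 3-0 are the stepping, 7-4 the base model, 11-8 the base family, with the extended family added when the base family is 0xF and the extended model prepended for families of 6 and above. The same decode run on a made-up signature value:

    #include <stdio.h>

    int main(void)
    {
        unsigned tfms = 0x00020f12;      /* hypothetical CPUID.1:EAX value */

        unsigned family   = (tfms >> 8) & 0xf;
        unsigned model    = (tfms >> 4) & 0xf;
        unsigned stepping = tfms & 0xf;

        if (family == 0xf)               /* extended family is additive       */
            family += (tfms >> 20) & 0xff;
        if (family >= 0x6)               /* extended model is the high nibble */
            model += ((tfms >> 16) & 0xf) << 4;

        printf("family %#x, model %#x, stepping %#x\n",
               family, model, stepping);
        return 0;
    }
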
29741 +/*
29742 + * This does the hard work of actually picking apart the CPU stuff...
29743 + */
29744 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
29745 +{
29746 + int i;
29747 + u32 xlvl;
29748 +
29749 + early_identify_cpu(c);
29750 +
29751 + /* AMD-defined flags: level 0x80000001 */
29752 + xlvl = cpuid_eax(0x80000000);
29753 + c->extended_cpuid_level = xlvl;
29754 + if ((xlvl & 0xffff0000) == 0x80000000) {
29755 + if (xlvl >= 0x80000001) {
29756 + c->x86_capability[1] = cpuid_edx(0x80000001);
29757 + c->x86_capability[6] = cpuid_ecx(0x80000001);
29758 + }
29759 + if (xlvl >= 0x80000004)
29760 + get_model_name(c); /* Default name */
29761 + }
29762 +
29763 + /* Transmeta-defined flags: level 0x80860001 */
29764 + xlvl = cpuid_eax(0x80860000);
29765 + if ((xlvl & 0xffff0000) == 0x80860000) {
29766 + /* For now, don't set x86_cpuid_level here, to avoid confusion. */
29767 + if (xlvl >= 0x80860001)
29768 + c->x86_capability[2] = cpuid_edx(0x80860001);
29769 + }
29770 +
29771 + c->apicid = phys_pkg_id(0);
29772 +
29773 + /*
29774 + * Vendor-specific initialization. In this section we
29775 + * canonicalize the feature flags, meaning if there are
29776 + * features a certain CPU supports which CPUID doesn't
29777 + * tell us, CPUID claiming incorrect flags, or other bugs,
29778 + * we handle them here.
29779 + *
29780 + * At the end of this section, c->x86_capability better
29781 + * indicate the features this CPU genuinely supports!
29782 + */
29783 + switch (c->x86_vendor) {
29784 + case X86_VENDOR_AMD:
29785 + init_amd(c);
29786 + break;
29787 +
29788 + case X86_VENDOR_INTEL:
29789 + init_intel(c);
29790 + break;
29791 +
29792 + case X86_VENDOR_UNKNOWN:
29793 + default:
29794 + display_cacheinfo(c);
29795 + break;
29796 + }
29797 +
29798 + select_idle_routine(c);
29799 + detect_ht(c);
29800 +
29801 + /*
29802 + * On SMP, boot_cpu_data holds the common feature set between
29803 + * all CPUs; so make sure that we indicate which features are
29804 + * common between the CPUs. The first time this routine gets
29805 + * executed, c == &boot_cpu_data.
29806 + */
29807 + if (c != &boot_cpu_data) {
29808 + /* AND the already accumulated flags with these */
29809 + for (i = 0 ; i < NCAPINTS ; i++)
29810 + boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
29811 + }
29812 +
29813 +#ifdef CONFIG_X86_MCE
29814 + mcheck_init(c);
29815 +#endif
29816 + if (c == &boot_cpu_data)
29817 + mtrr_bp_init();
29818 + else
29819 + mtrr_ap_init();
29820 +#ifdef CONFIG_NUMA
29821 + numa_add_cpu(smp_processor_id());
29822 +#endif
29823 +}
29824 +
29825 +
29826 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
29827 +{
29828 + if (c->x86_model_id[0])
29829 + printk("%s", c->x86_model_id);
29830 +
29831 + if (c->x86_mask || c->cpuid_level >= 0)
29832 + printk(" stepping %02x\n", c->x86_mask);
29833 + else
29834 + printk("\n");
29835 +}
29836 +
29837 +/*
29838 + * Get CPU information for use by the procfs.
29839 + */
29840 +
29841 +static int show_cpuinfo(struct seq_file *m, void *v)
29842 +{
29843 + struct cpuinfo_x86 *c = v;
29844 +
29845 + /*
29846 + * These flag bits must match the definitions in <asm/cpufeature.h>.
29847 + * NULL means this bit is undefined or reserved; either way it doesn't
29848 + * have meaning as far as Linux is concerned. Note that it's important
29849 + * to realize there is a difference between this table and CPUID -- if
29850 + * applications want to get the raw CPUID data, they should access
29851 + * /dev/cpu/<cpu_nr>/cpuid instead.
29852 + */
29853 + static char *x86_cap_flags[] = {
29854 + /* Intel-defined */
29855 + "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
29856 + "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
29857 + "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
29858 + "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL,
29859 +
29860 + /* AMD-defined */
29861 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29862 + NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
29863 + NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
29864 + NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow",
29865 +
29866 + /* Transmeta-defined */
29867 + "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
29868 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29869 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29870 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29871 +
29872 + /* Other (Linux-defined) */
29873 + "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL,
29874 + "constant_tsc", NULL, NULL,
29875 + "up", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29876 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29877 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29878 +
29879 + /* Intel-defined (#2) */
29880 + "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
29881 + "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
29882 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29883 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29884 +
29885 + /* VIA/Cyrix/Centaur-defined */
29886 + NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
29887 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29888 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29889 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29890 +
29891 + /* AMD-defined (#2) */
29892 + "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL,
29893 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29894 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29895 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29896 + };
29897 + static char *x86_power_flags[] = {
29898 + "ts", /* temperature sensor */
29899 + "fid", /* frequency id control */
29900 + "vid", /* voltage id control */
29901 + "ttp", /* thermal trip */
29902 + "tm",
29903 + "stc",
29904 + NULL,
29905 + /* nothing */ /* constant_tsc - moved to flags */
29906 + };
29907 +
29908 +
29909 +#ifdef CONFIG_SMP
29910 + if (!cpu_online(c-cpu_data))
29911 + return 0;
29912 +#endif
29913 +
29914 + seq_printf(m,"processor\t: %u\n"
29915 + "vendor_id\t: %s\n"
29916 + "cpu family\t: %d\n"
29917 + "model\t\t: %d\n"
29918 + "model name\t: %s\n",
29919 + (unsigned)(c-cpu_data),
29920 + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
29921 + c->x86,
29922 + (int)c->x86_model,
29923 + c->x86_model_id[0] ? c->x86_model_id : "unknown");
29924 +
29925 + if (c->x86_mask || c->cpuid_level >= 0)
29926 + seq_printf(m, "stepping\t: %d\n", c->x86_mask);
29927 + else
29928 + seq_printf(m, "stepping\t: unknown\n");
29929 +
29930 + if (cpu_has(c,X86_FEATURE_TSC)) {
29931 + unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
29932 + if (!freq)
29933 + freq = cpu_khz;
29934 + seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
29935 + freq / 1000, (freq % 1000));
29936 + }
29937 +
29938 + /* Cache size */
29939 + if (c->x86_cache_size >= 0)
29940 + seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
29941 +
29942 +#ifdef CONFIG_SMP
29943 + if (smp_num_siblings * c->x86_max_cores > 1) {
29944 + int cpu = c - cpu_data;
29945 + seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
29946 + seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
29947 + seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
29948 + seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
29949 + }
29950 +#endif
29951 +
29952 + seq_printf(m,
29953 + "fpu\t\t: yes\n"
29954 + "fpu_exception\t: yes\n"
29955 + "cpuid level\t: %d\n"
29956 + "wp\t\t: yes\n"
29957 + "flags\t\t:",
29958 + c->cpuid_level);
29959 +
29960 + {
29961 + int i;
29962 + for ( i = 0 ; i < 32*NCAPINTS ; i++ )
29963 + if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
29964 + seq_printf(m, " %s", x86_cap_flags[i]);
29965 + }
29966 +
29967 + seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
29968 + c->loops_per_jiffy/(500000/HZ),
29969 + (c->loops_per_jiffy/(5000/HZ)) % 100);
29970 +
29971 + if (c->x86_tlbsize > 0)
29972 + seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
29973 + seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
29974 + seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
29975 +
29976 + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
29977 + c->x86_phys_bits, c->x86_virt_bits);
29978 +
29979 + seq_printf(m, "power management:");
29980 + {
29981 + unsigned i;
29982 + for (i = 0; i < 32; i++)
29983 + if (c->x86_power & (1 << i)) {
29984 + if (i < ARRAY_SIZE(x86_power_flags) &&
29985 + x86_power_flags[i])
29986 + seq_printf(m, "%s%s",
29987 + x86_power_flags[i][0]?" ":"",
29988 + x86_power_flags[i]);
29989 + else
29990 + seq_printf(m, " [%d]", i);
29991 + }
29992 + }
29993 +
29994 + seq_printf(m, "\n\n");
29995 +
29996 + return 0;
29997 +}
29998 +
29999 +static void *c_start(struct seq_file *m, loff_t *pos)
30000 +{
30001 + return *pos < NR_CPUS ? cpu_data + *pos : NULL;
30002 +}
30003 +
30004 +static void *c_next(struct seq_file *m, void *v, loff_t *pos)
30005 +{
30006 + ++*pos;
30007 + return c_start(m, pos);
30008 +}
30009 +
30010 +static void c_stop(struct seq_file *m, void *v)
30011 +{
30012 +}
30013 +
30014 +struct seq_operations cpuinfo_op = {
30015 + .start =c_start,
30016 + .next = c_next,
30017 + .stop = c_stop,
30018 + .show = show_cpuinfo,
30019 +};
30020 +
30021 +#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE)
30022 +#include <linux/platform_device.h>
30023 +static __init int add_pcspkr(void)
30024 +{
30025 + struct platform_device *pd;
30026 + int ret;
30027 +
30028 + if (!is_initial_xendomain())
30029 + return 0;
30030 +
30031 + pd = platform_device_alloc("pcspkr", -1);
30032 + if (!pd)
30033 + return -ENOMEM;
30034 +
30035 + ret = platform_device_add(pd);
30036 + if (ret)
30037 + platform_device_put(pd);
30038 +
30039 + return ret;
30040 +}
30041 +device_initcall(add_pcspkr);
30042 +#endif
30043 Index: head-2008-11-25/arch/x86/kernel/setup64-xen.c
30044 ===================================================================
30045 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
30046 +++ head-2008-11-25/arch/x86/kernel/setup64-xen.c 2008-01-28 12:24:19.000000000 +0100
30047 @@ -0,0 +1,367 @@
30048 +/*
30049 + * X86-64 specific CPU setup.
30050 + * Copyright (C) 1995 Linus Torvalds
30051 + * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
30052 + * See setup.c for older changelog.
30053 + *
30054 + * Jun Nakajima <jun.nakajima@intel.com>
30055 + * Modified for Xen
30056 + *
30057 + */
30058 +#include <linux/init.h>
30059 +#include <linux/kernel.h>
30060 +#include <linux/sched.h>
30061 +#include <linux/string.h>
30062 +#include <linux/bootmem.h>
30063 +#include <linux/bitops.h>
30064 +#include <linux/module.h>
30065 +#include <asm/bootsetup.h>
30066 +#include <asm/pda.h>
30067 +#include <asm/pgtable.h>
30068 +#include <asm/processor.h>
30069 +#include <asm/desc.h>
30070 +#include <asm/atomic.h>
30071 +#include <asm/mmu_context.h>
30072 +#include <asm/smp.h>
30073 +#include <asm/i387.h>
30074 +#include <asm/percpu.h>
30075 +#include <asm/proto.h>
30076 +#include <asm/sections.h>
30077 +#ifdef CONFIG_XEN
30078 +#include <asm/hypervisor.h>
30079 +#endif
30080 +
30081 +char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
30082 +
30083 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
30084 +
30085 +struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
30086 +EXPORT_SYMBOL(_cpu_pda);
30087 +struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
30088 +
30089 +#ifndef CONFIG_X86_NO_IDT
30090 +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
30091 +#endif
30092 +
30093 +char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
30094 +
30095 +unsigned long __supported_pte_mask __read_mostly = ~0UL;
30096 +EXPORT_SYMBOL(__supported_pte_mask);
30097 +static int do_not_nx __cpuinitdata = 0;
30098 +
30099 +/* noexec=on|off
30100 +Control non-executable mappings for 64-bit processes.
30101 +
30102 +on Enable (default)
30103 +off Disable
30104 +*/
30105 +int __init nonx_setup(char *str)
30106 +{
30107 + if (!strncmp(str, "on", 2)) {
30108 + __supported_pte_mask |= _PAGE_NX;
30109 + do_not_nx = 0;
30110 + } else if (!strncmp(str, "off", 3)) {
30111 + do_not_nx = 1;
30112 + __supported_pte_mask &= ~_PAGE_NX;
30113 + }
30114 + return 1;
30115 +}
30116 +__setup("noexec=", nonx_setup); /* parsed early actually */
30117 +
30118 +int force_personality32 = 0;
30119 +
30120 +/* noexec32=on|off
30121 +Control non-executable heap for 32-bit processes.
30122 +To control the stack too, use noexec=off
30123 +
30124 +on PROT_READ does not imply PROT_EXEC for 32bit processes
30125 +off PROT_READ implies PROT_EXEC (default)
30126 +*/
30127 +static int __init nonx32_setup(char *str)
30128 +{
30129 + if (!strcmp(str, "on"))
30130 + force_personality32 &= ~READ_IMPLIES_EXEC;
30131 + else if (!strcmp(str, "off"))
30132 + force_personality32 |= READ_IMPLIES_EXEC;
30133 + return 1;
30134 +}
30135 +__setup("noexec32=", nonx32_setup);
30136 +
30137 +/*
30138 + * Great future plan:
30139 + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
30140 + * Always point %gs to its beginning
30141 + */
30142 +void __init setup_per_cpu_areas(void)
30143 +{
30144 + int i;
30145 + unsigned long size;
30146 +
30147 +#ifdef CONFIG_HOTPLUG_CPU
30148 + prefill_possible_map();
30149 +#endif
30150 +
30151 + /* Copy section for each CPU (we discard the original) */
30152 + size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
30153 +#ifdef CONFIG_MODULES
30154 + if (size < PERCPU_ENOUGH_ROOM)
30155 + size = PERCPU_ENOUGH_ROOM;
30156 +#endif
30157 +
30158 + for_each_cpu_mask (i, cpu_possible_map) {
30159 + char *ptr;
30160 +
30161 + if (!NODE_DATA(cpu_to_node(i))) {
30162 + printk("cpu with no node %d, num_online_nodes %d\n",
30163 + i, num_online_nodes());
30164 + ptr = alloc_bootmem(size);
30165 + } else {
30166 + ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
30167 + }
30168 + if (!ptr)
30169 + panic("Cannot allocate cpu data for CPU %d\n", i);
30170 + cpu_pda(i)->data_offset = ptr - __per_cpu_start;
30171 + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
30172 + }
30173 +}
30174 +
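setup_per_cpu_areas() above gives every possible CPU a private copy of the original per-CPU data section and records where that copy lives, relative to the original, in cpu_pda(i)->data_offset; a per-CPU access then just adds that offset to the variable's link-time address. A schematic of that offset trick (not the kernel's actual macros; the section and variable here are stand-ins):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define NR_CPUS 4

    static char percpu_section[64];      /* stand-in for .data.percpu        */
    static long data_offset[NR_CPUS];    /* like cpu_pda(i)->data_offset     */

    /* Resolve a per-CPU variable: link-time address plus the CPU's offset. */
    static void *per_cpu_ptr_sketch(void *var, int cpu)
    {
        return (char *)var + data_offset[cpu];
    }

    int main(void)
    {
        /* One private copy of the section per CPU, as in the loop above.
         * (The cross-object pointer difference mirrors the kernel trick;
         * strict ISO C would frown on it.) */
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
            char *copy = malloc(sizeof(percpu_section));
            memcpy(copy, percpu_section, sizeof(percpu_section));
            data_offset[cpu] = copy - percpu_section;
        }

        int *var = (int *)&percpu_section[16];   /* a "per-CPU variable" slot */
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            *(int *)per_cpu_ptr_sketch(var, cpu) = 100 + cpu;
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            printf("cpu%d sees %d\n", cpu, *(int *)per_cpu_ptr_sketch(var, cpu));
        return 0;
    }
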
30175 +#ifdef CONFIG_XEN
30176 +static void switch_pt(void)
30177 +{
30178 + xen_pt_switch(__pa_symbol(init_level4_pgt));
30179 + xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
30180 +}
30181 +
30182 +static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
30183 +{
30184 + unsigned long frames[16];
30185 + unsigned long va;
30186 + int f;
30187 +
30188 + for (va = gdt_descr->address, f = 0;
30189 + va < gdt_descr->address + gdt_descr->size;
30190 + va += PAGE_SIZE, f++) {
30191 + frames[f] = virt_to_mfn(va);
30192 + make_page_readonly(
30193 + (void *)va, XENFEAT_writable_descriptor_tables);
30194 + }
30195 + if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) /
30196 + sizeof (struct desc_struct)))
30197 + BUG();
30198 +}
30199 +#else
30200 +static void switch_pt(void)
30201 +{
30202 + asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
30203 +}
30204 +
30205 +static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
30206 +{
30207 + asm volatile("lgdt %0" :: "m" (*gdt_descr));
30208 + asm volatile("lidt %0" :: "m" (idt_descr));
30209 +}
30210 +#endif
30211 +
30212 +void pda_init(int cpu)
30213 +{
30214 + struct x8664_pda *pda = cpu_pda(cpu);
30215 +
30216 + /* Set up data that may be needed in __get_free_pages early */
30217 + asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
30218 +#ifndef CONFIG_XEN
30219 + wrmsrl(MSR_GS_BASE, pda);
30220 +#else
30221 + if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
30222 + (unsigned long)pda))
30223 + BUG();
30224 +#endif
30225 + pda->cpunumber = cpu;
30226 + pda->irqcount = -1;
30227 + pda->kernelstack =
30228 + (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
30229 + pda->active_mm = &init_mm;
30230 + pda->mmu_state = 0;
30231 +
30232 + if (cpu == 0) {
30233 +#ifdef CONFIG_XEN
30234 + xen_init_pt();
30235 +#endif
30236 + /* others are initialized in smpboot.c */
30237 + pda->pcurrent = &init_task;
30238 + pda->irqstackptr = boot_cpu_stack;
30239 + } else {
30240 + pda->irqstackptr = (char *)
30241 + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
30242 + if (!pda->irqstackptr)
30243 + panic("cannot allocate irqstack for cpu %d", cpu);
30244 + }
30245 +
30246 + switch_pt();
30247 +
30248 + pda->irqstackptr += IRQSTACKSIZE-64;
30249 +}
30250 +
30251 +#ifndef CONFIG_X86_NO_TSS
30252 +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
30253 +__attribute__((section(".bss.page_aligned")));
30254 +#endif
30255 +
30256 +/* May not be marked __init: used by software suspend */
30257 +void syscall_init(void)
30258 +{
30259 +#ifndef CONFIG_XEN
30260 + /*
30261 + * LSTAR and STAR live in a somewhat strange symbiosis.
30262 + * They both write to the same internal register. STAR allows setting CS/DS,
30263 + * but only a 32-bit target. LSTAR sets the 64-bit rip.
30264 + */
30265 + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
30266 + wrmsrl(MSR_LSTAR, system_call);
30267 +
30268 + /* Flags to clear on syscall */
30269 + wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
30270 +#endif
30271 +#ifdef CONFIG_IA32_EMULATION
30272 + syscall32_cpu_init ();
30273 +#endif
30274 +}
30275 +
30276 +void __cpuinit check_efer(void)
30277 +{
30278 + unsigned long efer;
30279 +
30280 + rdmsrl(MSR_EFER, efer);
30281 + if (!(efer & EFER_NX) || do_not_nx) {
30282 + __supported_pte_mask &= ~_PAGE_NX;
30283 + }
30284 +}
30285 +
30286 +unsigned long kernel_eflags;
30287 +
30288 +/*
30289 + * cpu_init() initializes state that is per-CPU. Some data is already
30290 + * initialized (naturally) in the bootstrap process, such as the GDT
30291 + * and IDT. We reload them nevertheless, this function acts as a
30292 + * 'CPU state barrier', nothing should get across.
30293 + * A lot of state is already set up in PDA init.
30294 + */
30295 +void __cpuinit cpu_init (void)
30296 +{
30297 + int cpu = stack_smp_processor_id();
30298 +#ifndef CONFIG_X86_NO_TSS
30299 + struct tss_struct *t = &per_cpu(init_tss, cpu);
30300 + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
30301 + unsigned long v;
30302 + char *estacks = NULL;
30303 + unsigned i;
30304 +#endif
30305 + struct task_struct *me;
30306 +
30307 + /* CPU 0 is initialised in head64.c */
30308 + if (cpu != 0) {
30309 + pda_init(cpu);
30310 + zap_low_mappings(cpu);
30311 + }
30312 +#ifndef CONFIG_X86_NO_TSS
30313 + else
30314 + estacks = boot_exception_stacks;
30315 +#endif
30316 +
30317 + me = current;
30318 +
30319 + if (cpu_test_and_set(cpu, cpu_initialized))
30320 + panic("CPU#%d already initialized!\n", cpu);
30321 +
30322 + printk("Initializing CPU#%d\n", cpu);
30323 +
30324 + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
30325 +
30326 + /*
30327 + * Initialize the per-CPU GDT with the boot GDT,
30328 + * and set up the GDT descriptor:
30329 + */
30330 +#ifndef CONFIG_XEN
30331 + if (cpu)
30332 + memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
30333 +#endif
30334 +
30335 + cpu_gdt_descr[cpu].size = GDT_SIZE;
30336 + cpu_gdt_init(&cpu_gdt_descr[cpu]);
30337 +
30338 + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
30339 + syscall_init();
30340 +
30341 + wrmsrl(MSR_FS_BASE, 0);
30342 + wrmsrl(MSR_KERNEL_GS_BASE, 0);
30343 + barrier();
30344 +
30345 + check_efer();
30346 +
30347 +#ifndef CONFIG_X86_NO_TSS
30348 + /*
30349 + * set up and load the per-CPU TSS
30350 + */
30351 + for (v = 0; v < N_EXCEPTION_STACKS; v++) {
30352 + if (cpu) {
30353 + static const unsigned int order[N_EXCEPTION_STACKS] = {
30354 + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
30355 + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
30356 + };
30357 +
30358 + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
30359 + if (!estacks)
30360 + panic("Cannot allocate exception stack %ld %d\n",
30361 + v, cpu);
30362 + }
30363 + switch (v + 1) {
30364 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
30365 + case DEBUG_STACK:
30366 + cpu_pda(cpu)->debugstack = (unsigned long)estacks;
30367 + estacks += DEBUG_STKSZ;
30368 + break;
30369 +#endif
30370 + default:
30371 + estacks += EXCEPTION_STKSZ;
30372 + break;
30373 + }
30374 + orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
30375 + }
30376 +
30377 + t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
30378 + /*
30379 + * <= is required because the CPU will access up to
30380 + * 8 bits beyond the end of the IO permission bitmap.
30381 + */
30382 + for (i = 0; i <= IO_BITMAP_LONGS; i++)
30383 + t->io_bitmap[i] = ~0UL;
30384 +#endif
30385 +
30386 + atomic_inc(&init_mm.mm_count);
30387 + me->active_mm = &init_mm;
30388 + if (me->mm)
30389 + BUG();
30390 + enter_lazy_tlb(&init_mm, me);
30391 +
30392 +#ifndef CONFIG_X86_NO_TSS
30393 + set_tss_desc(cpu, t);
30394 +#endif
30395 +#ifndef CONFIG_XEN
30396 + load_TR_desc();
30397 +#endif
30398 + load_LDT(&init_mm.context);
30399 +
30400 + /*
30401 + * Clear all 6 debug registers:
30402 + */
30403 +
30404 + set_debugreg(0UL, 0);
30405 + set_debugreg(0UL, 1);
30406 + set_debugreg(0UL, 2);
30407 + set_debugreg(0UL, 3);
30408 + set_debugreg(0UL, 6);
30409 + set_debugreg(0UL, 7);
30410 +
30411 + fpu_init();
30412 +
30413 + raw_local_save_flags(kernel_eflags);
30414 +}
30415 Index: head-2008-11-25/arch/x86/kernel/smp_64-xen.c
30416 ===================================================================
30417 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
30418 +++ head-2008-11-25/arch/x86/kernel/smp_64-xen.c 2008-04-02 12:34:02.000000000 +0200
30419 @@ -0,0 +1,575 @@
30420 +/*
30421 + * Intel SMP support routines.
30422 + *
30423 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
30424 + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
30425 + * (c) 2002,2003 Andi Kleen, SuSE Labs.
30426 + *
30427 + * This code is released under the GNU General Public License version 2 or
30428 + * later.
30429 + */
30430 +
30431 +#include <linux/init.h>
30432 +
30433 +#include <linux/mm.h>
30434 +#include <linux/delay.h>
30435 +#include <linux/spinlock.h>
30436 +#include <linux/smp_lock.h>
30437 +#include <linux/smp.h>
30438 +#include <linux/kernel_stat.h>
30439 +#include <linux/mc146818rtc.h>
30440 +#include <linux/interrupt.h>
30441 +
30442 +#include <asm/mtrr.h>
30443 +#include <asm/pgalloc.h>
30444 +#include <asm/tlbflush.h>
30445 +#include <asm/mach_apic.h>
30446 +#include <asm/mmu_context.h>
30447 +#include <asm/proto.h>
30448 +#include <asm/apicdef.h>
30449 +#include <asm/idle.h>
30450 +#ifdef CONFIG_XEN
30451 +#include <xen/evtchn.h>
30452 +#endif
30453 +
30454 +#ifndef CONFIG_XEN
30455 +/*
30456 + * Smarter SMP flushing macros.
30457 + * c/o Linus Torvalds.
30458 + *
30459 + * These mean you can really definitely utterly forget about
30460 + * writing to user space from interrupts. (It's not allowed anyway).
30461 + *
30462 + * Optimizations Manfred Spraul <manfred@colorfullife.com>
30463 + *
30464 + * More scalable flush, from Andi Kleen
30465 + *
30466 + * To avoid global state use 8 different call vectors.
30467 + * Each CPU uses a specific vector to trigger flushes on other
30468 + * CPUs. Depending on the received vector the target CPUs look into
30469 + * the right per cpu variable for the flush data.
30470 + *
30471 + * With more than 8 CPUs they are hashed to the 8 available
30472 + * vectors. The limited global vector space forces us to this right now.
30473 + * In future when interrupts are split into per CPU domains this could be
30474 + * fixed, at the cost of triggering multiple IPIs in some cases.
30475 + */
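As a rough worked example of the hashing described above (illustrative only, not part of the patch; the CPU number is made up, the identifiers are the ones used further down in this file):

/* With NUM_INVALIDATE_TLB_VECTORS == 8, CPU 11 hashes onto slot 3 and
 * therefore shares flush_state slot 3 and its IPI vector with CPU 3. */
int sender = 11 % NUM_INVALIDATE_TLB_VECTORS;		/* == 3 */
int vector = INVALIDATE_TLB_VECTOR_START + sender;	/* vector this sender raises */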
30476 +
30477 +union smp_flush_state {
30478 + struct {
30479 + cpumask_t flush_cpumask;
30480 + struct mm_struct *flush_mm;
30481 + unsigned long flush_va;
30482 +#define FLUSH_ALL -1ULL
30483 + spinlock_t tlbstate_lock;
30484 + };
30485 + char pad[SMP_CACHE_BYTES];
30486 +} ____cacheline_aligned;
30487 +
30488 +/* State is put into the per CPU data section, but padded
30489 + to a full cache line because other CPUs can access it and we don't
30490 + want false sharing in the per cpu data segment. */
30491 +static DEFINE_PER_CPU(union smp_flush_state, flush_state);
30492 +
30493 +/*
30494 + * We cannot call mmdrop() because we are in interrupt context,
30495 + * instead update mm->cpu_vm_mask.
30496 + */
30497 +static inline void leave_mm(unsigned long cpu)
30498 +{
30499 + if (read_pda(mmu_state) == TLBSTATE_OK)
30500 + BUG();
30501 + cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
30502 + load_cr3(swapper_pg_dir);
30503 +}
30504 +
30505 +/*
30506 + *
30507 + * The flush IPI assumes that a thread switch happens in this order:
30508 + * [cpu0: the cpu that switches]
30509 + * 1) switch_mm() either 1a) or 1b)
30510 + * 1a) thread switch to a different mm
30511 + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
30512 + * Stop ipi delivery for the old mm. This is not synchronized with
30513 + * 	the other cpus, but smp_invalidate_interrupt ignores flush ipis
30514 + * for the wrong mm, and in the worst case we perform a superfluous
30515 + * tlb flush.
30516 + * 1a2) set cpu mmu_state to TLBSTATE_OK
30517 + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
30518 + * was in lazy tlb mode.
30519 + * 1a3) update cpu active_mm
30520 + * Now cpu0 accepts tlb flushes for the new mm.
30521 + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
30522 + * Now the other cpus will send tlb flush ipis.
30523 + * 1a4) change cr3.
30524 + * 1b) thread switch without mm change
30525 + * cpu active_mm is correct, cpu0 already handles
30526 + * flush ipis.
30527 + * 1b1) set cpu mmu_state to TLBSTATE_OK
30528 + * 1b2) test_and_set the cpu bit in cpu_vm_mask.
30529 + * Atomically set the bit [other cpus will start sending flush ipis],
30530 + * and test the bit.
30531 + * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
30532 + * 2) switch %%esp, ie current
30533 + *
30534 + * The interrupt must handle 2 special cases:
30535 + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
30536 + * - the cpu performs speculative tlb reads, i.e. even if the cpu only
30537 + * runs in kernel space, the cpu could load tlb entries for user space
30538 + * pages.
30539 + *
30540 + * The good news is that cpu mmu_state is local to each cpu, no
30541 + * write/read ordering problems.
30542 + */
30543 +
30544 +/*
30545 + * TLB flush IPI:
30546 + *
30547 + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
30548 + * 2) Leave the mm if we are in the lazy tlb mode.
30549 + *
30550 + * Interrupts are disabled.
30551 + */
30552 +
30553 +asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
30554 +{
30555 + int cpu;
30556 + int sender;
30557 + union smp_flush_state *f;
30558 +
30559 + cpu = smp_processor_id();
30560 + /*
30561 + * orig_rax contains the negated interrupt vector.
30562 + * Use that to determine where the sender put the data.
30563 + */
30564 + sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
30565 + f = &per_cpu(flush_state, sender);
30566 +
30567 + if (!cpu_isset(cpu, f->flush_cpumask))
30568 + goto out;
30569 + /*
30570 + * This was a BUG() but until someone can quote me the
30571 + * line from the intel manual that guarantees an IPI to
30572 + * multiple CPUs is retried _only_ on the erroring CPUs
30573 + * 	its staying as a return
30574 + *
30575 + * BUG();
30576 + */
30577 +
30578 + if (f->flush_mm == read_pda(active_mm)) {
30579 + if (read_pda(mmu_state) == TLBSTATE_OK) {
30580 + if (f->flush_va == FLUSH_ALL)
30581 + local_flush_tlb();
30582 + else
30583 + __flush_tlb_one(f->flush_va);
30584 + } else
30585 + leave_mm(cpu);
30586 + }
30587 +out:
30588 + ack_APIC_irq();
30589 + cpu_clear(cpu, f->flush_cpumask);
30590 +}
30591 +
30592 +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
30593 + unsigned long va)
30594 +{
30595 + int sender;
30596 + union smp_flush_state *f;
30597 +
30598 + /* Caller has disabled preemption */
30599 + sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
30600 + f = &per_cpu(flush_state, sender);
30601 +
30602 + /* Could avoid this lock when
30603 + num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
30604 + probably not worth checking this for a cache-hot lock. */
30605 + spin_lock(&f->tlbstate_lock);
30606 +
30607 + f->flush_mm = mm;
30608 + f->flush_va = va;
30609 + cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
30610 +
30611 + /*
30612 + * We have to send the IPI only to
30613 + * CPUs affected.
30614 + */
30615 + send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
30616 +
30617 + while (!cpus_empty(f->flush_cpumask))
30618 + cpu_relax();
30619 +
30620 + f->flush_mm = NULL;
30621 + f->flush_va = 0;
30622 + spin_unlock(&f->tlbstate_lock);
30623 +}
30624 +
30625 +int __cpuinit init_smp_flush(void)
30626 +{
30627 + int i;
30628 + for_each_cpu_mask(i, cpu_possible_map) {
30629 + spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
30630 + }
30631 + return 0;
30632 +}
30633 +
30634 +core_initcall(init_smp_flush);
30635 +
30636 +void flush_tlb_current_task(void)
30637 +{
30638 + struct mm_struct *mm = current->mm;
30639 + cpumask_t cpu_mask;
30640 +
30641 + preempt_disable();
30642 + cpu_mask = mm->cpu_vm_mask;
30643 + cpu_clear(smp_processor_id(), cpu_mask);
30644 +
30645 + local_flush_tlb();
30646 + if (!cpus_empty(cpu_mask))
30647 + flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
30648 + preempt_enable();
30649 +}
30650 +EXPORT_SYMBOL(flush_tlb_current_task);
30651 +
30652 +void flush_tlb_mm (struct mm_struct * mm)
30653 +{
30654 + cpumask_t cpu_mask;
30655 +
30656 + preempt_disable();
30657 + cpu_mask = mm->cpu_vm_mask;
30658 + cpu_clear(smp_processor_id(), cpu_mask);
30659 +
30660 + if (current->active_mm == mm) {
30661 + if (current->mm)
30662 + local_flush_tlb();
30663 + else
30664 + leave_mm(smp_processor_id());
30665 + }
30666 + if (!cpus_empty(cpu_mask))
30667 + flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
30668 +
30669 + preempt_enable();
30670 +}
30671 +EXPORT_SYMBOL(flush_tlb_mm);
30672 +
30673 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
30674 +{
30675 + struct mm_struct *mm = vma->vm_mm;
30676 + cpumask_t cpu_mask;
30677 +
30678 + preempt_disable();
30679 + cpu_mask = mm->cpu_vm_mask;
30680 + cpu_clear(smp_processor_id(), cpu_mask);
30681 +
30682 + if (current->active_mm == mm) {
30683 +		if (current->mm)
30684 + __flush_tlb_one(va);
30685 + else
30686 + leave_mm(smp_processor_id());
30687 + }
30688 +
30689 + if (!cpus_empty(cpu_mask))
30690 + flush_tlb_others(cpu_mask, mm, va);
30691 +
30692 + preempt_enable();
30693 +}
30694 +EXPORT_SYMBOL(flush_tlb_page);
30695 +
30696 +static void do_flush_tlb_all(void* info)
30697 +{
30698 + unsigned long cpu = smp_processor_id();
30699 +
30700 + __flush_tlb_all();
30701 + if (read_pda(mmu_state) == TLBSTATE_LAZY)
30702 + leave_mm(cpu);
30703 +}
30704 +
30705 +void flush_tlb_all(void)
30706 +{
30707 + on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
30708 +}
30709 +#endif /* Xen */
30710 +
30711 +/*
30712 + * this function sends a 'reschedule' IPI to another CPU.
30713 + * it goes straight through and wastes no time serializing
30714 + * anything. Worst case is that we lose a reschedule ...
30715 + */
30716 +
30717 +void smp_send_reschedule(int cpu)
30718 +{
30719 + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
30720 +}
30721 +
30722 +/*
30723 + * Structure and data for smp_call_function(). This is designed to minimise
30724 + * static memory requirements. It also looks cleaner.
30725 + */
30726 +static DEFINE_SPINLOCK(call_lock);
30727 +
30728 +struct call_data_struct {
30729 + void (*func) (void *info);
30730 + void *info;
30731 + atomic_t started;
30732 + atomic_t finished;
30733 + int wait;
30734 +};
30735 +
30736 +static struct call_data_struct * call_data;
30737 +
30738 +void lock_ipi_call_lock(void)
30739 +{
30740 + spin_lock_irq(&call_lock);
30741 +}
30742 +
30743 +void unlock_ipi_call_lock(void)
30744 +{
30745 + spin_unlock_irq(&call_lock);
30746 +}
30747 +
30748 +/*
30749 + * this function sends a 'generic call function' IPI to one other CPU
30750 + * in the system.
30751 + *
30752 + * cpu is a standard Linux logical CPU number.
30753 + */
30754 +static void
30755 +__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
30756 + int nonatomic, int wait)
30757 +{
30758 + struct call_data_struct data;
30759 + int cpus = 1;
30760 +
30761 + data.func = func;
30762 + data.info = info;
30763 + atomic_set(&data.started, 0);
30764 + data.wait = wait;
30765 + if (wait)
30766 + atomic_set(&data.finished, 0);
30767 +
30768 + call_data = &data;
30769 + wmb();
30770 +	/* Send a message to the target CPU and wait for it to respond */
30771 + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
30772 +
30773 + /* Wait for response */
30774 + while (atomic_read(&data.started) != cpus)
30775 + cpu_relax();
30776 +
30777 + if (!wait)
30778 + return;
30779 +
30780 + while (atomic_read(&data.finished) != cpus)
30781 + cpu_relax();
30782 +}
30783 +
30784 +/*
30785 + * smp_call_function_single - Run a function on another CPU
30786 + * @func: The function to run. This must be fast and non-blocking.
30787 + * @info: An arbitrary pointer to pass to the function.
30788 + * @nonatomic: Currently unused.
30789 + * @wait: If true, wait until function has completed on other CPUs.
30790 + *
30791 + * Returns 0 on success, else a negative status code.
30792 + *
30793 + * Does not return until the remote CPU is nearly ready to execute <func>
30794 + * or has already executed it.
30795 + */
30796 +
30797 +int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
30798 + int nonatomic, int wait)
30799 +{
30800 + /* prevent preemption and reschedule on another processor */
30801 + int me = get_cpu();
30802 + if (cpu == me) {
30803 + WARN_ON(1);
30804 + put_cpu();
30805 + return -EBUSY;
30806 + }
30807 + spin_lock_bh(&call_lock);
30808 + __smp_call_function_single(cpu, func, info, nonatomic, wait);
30809 + spin_unlock_bh(&call_lock);
30810 + put_cpu();
30811 + return 0;
30812 +}
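A minimal usage sketch of smp_call_function_single() as defined above (illustrative only; the callback and CPU number are hypothetical). The callback runs on the target CPU from the IPI path, so it must not sleep:

static void example_drain(void *unused)
{
	/* runs on the chosen CPU, invoked from the call-function IPI */
}

static void example_run_on_cpu2(void)
{
	int err = smp_call_function_single(2, example_drain, NULL, 0, 1);
	if (err)
		printk(KERN_WARNING "CPU 2 did not run example_drain: %d\n", err);
}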
30813 +
30814 +/*
30815 + * this function sends a 'generic call function' IPI to all other CPUs
30816 + * in the system.
30817 + */
30818 +static void __smp_call_function (void (*func) (void *info), void *info,
30819 + int nonatomic, int wait)
30820 +{
30821 + struct call_data_struct data;
30822 + int cpus = num_online_cpus()-1;
30823 +
30824 + if (!cpus)
30825 + return;
30826 +
30827 + data.func = func;
30828 + data.info = info;
30829 + atomic_set(&data.started, 0);
30830 + data.wait = wait;
30831 + if (wait)
30832 + atomic_set(&data.finished, 0);
30833 +
30834 + call_data = &data;
30835 + wmb();
30836 + /* Send a message to all other CPUs and wait for them to respond */
30837 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
30838 +
30839 + /* Wait for response */
30840 + while (atomic_read(&data.started) != cpus)
30841 + cpu_relax();
30842 +
30843 + if (!wait)
30844 + return;
30845 +
30846 + while (atomic_read(&data.finished) != cpus)
30847 + cpu_relax();
30848 +}
30849 +
30850 +/*
30851 + * smp_call_function - run a function on all other CPUs.
30852 + * @func: The function to run. This must be fast and non-blocking.
30853 + * @info: An arbitrary pointer to pass to the function.
30854 + * @nonatomic: currently unused.
30855 + * @wait: If true, wait (atomically) until function has completed on other
30856 + * CPUs.
30857 + *
30858 + * Returns 0 on success, else a negative status code. Does not return until
30859 + * remote CPUs are nearly ready to execute func or have already executed it.
30860 + *
30861 + * You must not call this function with disabled interrupts or from a
30862 + * hardware interrupt handler or from a bottom half handler.
30863 + * Actually there are a few legal cases, like panic.
30864 + */
30865 +int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
30866 + int wait)
30867 +{
30868 + spin_lock(&call_lock);
30869 +	__smp_call_function(func, info, nonatomic, wait);
30870 + spin_unlock(&call_lock);
30871 + return 0;
30872 +}
30873 +EXPORT_SYMBOL(smp_call_function);
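And a corresponding sketch for the all-CPU variant (illustrative only; the counter and callback are hypothetical). As the comment above requires, it is called from process context with interrupts enabled:

static atomic_t example_hits = ATOMIC_INIT(0);

static void example_hit(void *unused)
{
	atomic_inc(&example_hits);		/* runs once on every other online CPU */
}

static void example_hit_all_cpus(void)
{
	smp_call_function(example_hit, NULL, 0, 1);	/* wait for all remote CPUs to finish */
	example_hit(NULL);				/* cover the local CPU as well */
}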
30874 +
30875 +void smp_stop_cpu(void)
30876 +{
30877 + unsigned long flags;
30878 + /*
30879 + * Remove this CPU:
30880 + */
30881 + cpu_clear(smp_processor_id(), cpu_online_map);
30882 + local_irq_save(flags);
30883 + disable_all_local_evtchn();
30884 + local_irq_restore(flags);
30885 +}
30886 +
30887 +static void smp_really_stop_cpu(void *dummy)
30888 +{
30889 + smp_stop_cpu();
30890 + for (;;)
30891 + halt();
30892 +}
30893 +
30894 +void smp_send_stop(void)
30895 +{
30896 + int nolock = 0;
30897 +#ifndef CONFIG_XEN
30898 + if (reboot_force)
30899 + return;
30900 +#endif
30901 + /* Don't deadlock on the call lock in panic */
30902 + if (!spin_trylock(&call_lock)) {
30903 + /* ignore locking because we have panicked anyways */
30904 + nolock = 1;
30905 + }
30906 + __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
30907 + if (!nolock)
30908 + spin_unlock(&call_lock);
30909 +
30910 + local_irq_disable();
30911 + disable_all_local_evtchn();
30912 + local_irq_enable();
30913 +}
30914 +
30915 +/*
30916 + * Reschedule call back. Nothing to do,
30917 + * all the work is done automatically when
30918 + * we return from the interrupt.
30919 + */
30920 +#ifndef CONFIG_XEN
30921 +asmlinkage void smp_reschedule_interrupt(void)
30922 +#else
30923 +asmlinkage irqreturn_t smp_reschedule_interrupt(void)
30924 +#endif
30925 +{
30926 +#ifndef CONFIG_XEN
30927 + ack_APIC_irq();
30928 +#else
30929 + return IRQ_HANDLED;
30930 +#endif
30931 +}
30932 +
30933 +#ifndef CONFIG_XEN
30934 +asmlinkage void smp_call_function_interrupt(void)
30935 +#else
30936 +asmlinkage irqreturn_t smp_call_function_interrupt(void)
30937 +#endif
30938 +{
30939 + void (*func) (void *info) = call_data->func;
30940 + void *info = call_data->info;
30941 + int wait = call_data->wait;
30942 +
30943 +#ifndef CONFIG_XEN
30944 + ack_APIC_irq();
30945 +#endif
30946 + /*
30947 + * Notify initiating CPU that I've grabbed the data and am
30948 + * about to execute the function
30949 + */
30950 + mb();
30951 + atomic_inc(&call_data->started);
30952 + /*
30953 + * At this point the info structure may be out of scope unless wait==1
30954 + */
30955 + exit_idle();
30956 + irq_enter();
30957 + (*func)(info);
30958 + irq_exit();
30959 + if (wait) {
30960 + mb();
30961 + atomic_inc(&call_data->finished);
30962 + }
30963 +#ifdef CONFIG_XEN
30964 + return IRQ_HANDLED;
30965 +#endif
30966 +}
30967 +
30968 +int safe_smp_processor_id(void)
30969 +{
30970 +#ifdef CONFIG_XEN
30971 + return smp_processor_id();
30972 +#else
30973 + unsigned apicid, i;
30974 +
30975 + if (disable_apic)
30976 + return 0;
30977 +
30978 + apicid = hard_smp_processor_id();
30979 + if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid)
30980 + return apicid;
30981 +
30982 + for (i = 0; i < NR_CPUS; ++i) {
30983 + if (x86_cpu_to_apicid[i] == apicid)
30984 + return i;
30985 + }
30986 +
30987 + /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI,
30988 + * or called too early. Either way, we must be CPU 0. */
30989 + if (x86_cpu_to_apicid[0] == BAD_APICID)
30990 + return 0;
30991 +
30992 + return 0; /* Should not happen */
30993 +#endif
30994 +}
30995 Index: head-2008-11-25/arch/x86/kernel/traps_64-xen.c
30996 ===================================================================
30997 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
30998 +++ head-2008-11-25/arch/x86/kernel/traps_64-xen.c 2008-04-02 12:34:02.000000000 +0200
30999 @@ -0,0 +1,1173 @@
31000 +/*
31001 + * linux/arch/x86-64/traps.c
31002 + *
31003 + * Copyright (C) 1991, 1992 Linus Torvalds
31004 + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
31005 + *
31006 + * Pentium III FXSR, SSE support
31007 + * Gareth Hughes <gareth@valinux.com>, May 2000
31008 + */
31009 +
31010 +/*
31011 + * 'Traps.c' handles hardware traps and faults after we have saved some
31012 + * state in 'entry.S'.
31013 + */
31014 +#include <linux/sched.h>
31015 +#include <linux/kernel.h>
31016 +#include <linux/string.h>
31017 +#include <linux/errno.h>
31018 +#include <linux/ptrace.h>
31019 +#include <linux/timer.h>
31020 +#include <linux/mm.h>
31021 +#include <linux/init.h>
31022 +#include <linux/delay.h>
31023 +#include <linux/spinlock.h>
31024 +#include <linux/interrupt.h>
31025 +#include <linux/module.h>
31026 +#include <linux/moduleparam.h>
31027 +#include <linux/nmi.h>
31028 +#include <linux/kprobes.h>
31029 +#include <linux/kexec.h>
31030 +#include <linux/unwind.h>
31031 +
31032 +#include <asm/system.h>
31033 +#include <asm/uaccess.h>
31034 +#include <asm/io.h>
31035 +#include <asm/atomic.h>
31036 +#include <asm/debugreg.h>
31037 +#include <asm/desc.h>
31038 +#include <asm/i387.h>
31039 +#include <asm/kdebug.h>
31040 +#include <asm/processor.h>
31041 +#include <asm/unwind.h>
31042 +#include <asm/smp.h>
31043 +#include <asm/pgalloc.h>
31044 +#include <asm/pda.h>
31045 +#include <asm/proto.h>
31046 +#include <asm/nmi.h>
31047 +
31048 +asmlinkage void divide_error(void);
31049 +asmlinkage void debug(void);
31050 +asmlinkage void nmi(void);
31051 +asmlinkage void int3(void);
31052 +asmlinkage void overflow(void);
31053 +asmlinkage void bounds(void);
31054 +asmlinkage void invalid_op(void);
31055 +asmlinkage void device_not_available(void);
31056 +asmlinkage void double_fault(void);
31057 +asmlinkage void coprocessor_segment_overrun(void);
31058 +asmlinkage void invalid_TSS(void);
31059 +asmlinkage void segment_not_present(void);
31060 +asmlinkage void stack_segment(void);
31061 +asmlinkage void general_protection(void);
31062 +asmlinkage void page_fault(void);
31063 +asmlinkage void coprocessor_error(void);
31064 +asmlinkage void simd_coprocessor_error(void);
31065 +asmlinkage void reserved(void);
31066 +asmlinkage void alignment_check(void);
31067 +asmlinkage void machine_check(void);
31068 +asmlinkage void spurious_interrupt_bug(void);
31069 +
31070 +ATOMIC_NOTIFIER_HEAD(die_chain);
31071 +EXPORT_SYMBOL(die_chain);
31072 +
31073 +int register_die_notifier(struct notifier_block *nb)
31074 +{
31075 + vmalloc_sync_all();
31076 + return atomic_notifier_chain_register(&die_chain, nb);
31077 +}
31078 +EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
31079 +
31080 +int unregister_die_notifier(struct notifier_block *nb)
31081 +{
31082 + return atomic_notifier_chain_unregister(&die_chain, nb);
31083 +}
31084 +EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
31085 +
31086 +static inline void conditional_sti(struct pt_regs *regs)
31087 +{
31088 + if (regs->eflags & X86_EFLAGS_IF)
31089 + local_irq_enable();
31090 +}
31091 +
31092 +static inline void preempt_conditional_sti(struct pt_regs *regs)
31093 +{
31094 + preempt_disable();
31095 + if (regs->eflags & X86_EFLAGS_IF)
31096 + local_irq_enable();
31097 +}
31098 +
31099 +static inline void preempt_conditional_cli(struct pt_regs *regs)
31100 +{
31101 + if (regs->eflags & X86_EFLAGS_IF)
31102 + local_irq_disable();
31103 + /* Make sure to not schedule here because we could be running
31104 + on an exception stack. */
31105 + preempt_enable_no_resched();
31106 +}
31107 +
31108 +static int kstack_depth_to_print = 12;
31109 +#ifdef CONFIG_STACK_UNWIND
31110 +static int call_trace = 1;
31111 +#else
31112 +#define call_trace (-1)
31113 +#endif
31114 +
31115 +#ifdef CONFIG_KALLSYMS
31116 +# include <linux/kallsyms.h>
31117 +void printk_address(unsigned long address)
31118 +{
31119 + unsigned long offset = 0, symsize;
31120 + const char *symname;
31121 + char *modname;
31122 + char *delim = ":";
31123 + char namebuf[128];
31124 +
31125 + symname = kallsyms_lookup(address, &symsize, &offset,
31126 + &modname, namebuf);
31127 + if (!symname) {
31128 + printk(" [<%016lx>]\n", address);
31129 + return;
31130 + }
31131 + if (!modname)
31132 + modname = delim = "";
31133 + printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
31134 + address, delim, modname, delim, symname, offset, symsize);
31135 +}
31136 +#else
31137 +void printk_address(unsigned long address)
31138 +{
31139 + printk(" [<%016lx>]\n", address);
31140 +}
31141 +#endif
31142 +
31143 +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
31144 + unsigned *usedp, const char **idp)
31145 +{
31146 +#ifndef CONFIG_X86_NO_TSS
31147 + static char ids[][8] = {
31148 + [DEBUG_STACK - 1] = "#DB",
31149 + [NMI_STACK - 1] = "NMI",
31150 + [DOUBLEFAULT_STACK - 1] = "#DF",
31151 + [STACKFAULT_STACK - 1] = "#SS",
31152 + [MCE_STACK - 1] = "#MC",
31153 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
31154 + [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
31155 +#endif
31156 + };
31157 + unsigned k;
31158 +
31159 + /*
31160 + * Iterate over all exception stacks, and figure out whether
31161 + * 'stack' is in one of them:
31162 + */
31163 + for (k = 0; k < N_EXCEPTION_STACKS; k++) {
31164 + unsigned long end;
31165 +
31166 + /*
31167 + * set 'end' to the end of the exception stack.
31168 + */
31169 + switch (k + 1) {
31170 + /*
31171 +		 * TODO: this block is not needed, I think, because
31172 + * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK]
31173 + * properly too.
31174 + */
31175 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
31176 + case DEBUG_STACK:
31177 + end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
31178 + break;
31179 +#endif
31180 + default:
31181 + end = per_cpu(orig_ist, cpu).ist[k];
31182 + break;
31183 + }
31184 + /*
31185 + * Is 'stack' above this exception frame's end?
31186 + * If yes then skip to the next frame.
31187 + */
31188 + if (stack >= end)
31189 + continue;
31190 + /*
31191 + * Is 'stack' above this exception frame's start address?
31192 + * If yes then we found the right frame.
31193 + */
31194 + if (stack >= end - EXCEPTION_STKSZ) {
31195 + /*
31196 + * Make sure we only iterate through an exception
31197 + * stack once. If it comes up for the second time
31198 + * then there's something wrong going on - just
31199 + * break out and return NULL:
31200 + */
31201 + if (*usedp & (1U << k))
31202 + break;
31203 + *usedp |= 1U << k;
31204 + *idp = ids[k];
31205 + return (unsigned long *)end;
31206 + }
31207 + /*
31208 + * If this is a debug stack, and if it has a larger size than
31209 + * the usual exception stacks, then 'stack' might still
31210 + * be within the lower portion of the debug stack:
31211 + */
31212 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
31213 + if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
31214 + unsigned j = N_EXCEPTION_STACKS - 1;
31215 +
31216 + /*
31217 + * Black magic. A large debug stack is composed of
31218 + * multiple exception stack entries, which we
31219 +			 * iterate through now. Don't look:
31220 + */
31221 + do {
31222 + ++j;
31223 + end -= EXCEPTION_STKSZ;
31224 + ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
31225 + } while (stack < end - EXCEPTION_STKSZ);
31226 + if (*usedp & (1U << j))
31227 + break;
31228 + *usedp |= 1U << j;
31229 + *idp = ids[j];
31230 + return (unsigned long *)end;
31231 + }
31232 +#endif
31233 + }
31234 +#endif
31235 + return NULL;
31236 +}
31237 +
31238 +static int show_trace_unwind(struct unwind_frame_info *info, void *context)
31239 +{
31240 + int n = 0;
31241 +
31242 + while (unwind(info) == 0 && UNW_PC(info)) {
31243 + n++;
31244 + printk_address(UNW_PC(info));
31245 + if (arch_unw_user_mode(info))
31246 + break;
31247 + }
31248 + return n;
31249 +}
31250 +
31251 +/*
31252 + * x86-64 can have up to three kernel stacks:
31253 + * process stack
31254 + * interrupt stack
31255 + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
31256 + */
31257 +
31258 +void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack)
31259 +{
31260 + const unsigned cpu = safe_smp_processor_id();
31261 + unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
31262 + unsigned used = 0;
31263 +
31264 + printk("\nCall Trace:\n");
31265 +
31266 + if (!tsk)
31267 + tsk = current;
31268 +
31269 + if (call_trace >= 0) {
31270 + int unw_ret = 0;
31271 + struct unwind_frame_info info;
31272 +
31273 + if (regs) {
31274 + if (unwind_init_frame_info(&info, tsk, regs) == 0)
31275 + unw_ret = show_trace_unwind(&info, NULL);
31276 + } else if (tsk == current)
31277 + unw_ret = unwind_init_running(&info, show_trace_unwind, NULL);
31278 + else {
31279 + if (unwind_init_blocked(&info, tsk) == 0)
31280 + unw_ret = show_trace_unwind(&info, NULL);
31281 + }
31282 + if (unw_ret > 0) {
31283 + if (call_trace == 1 && !arch_unw_user_mode(&info)) {
31284 + print_symbol("DWARF2 unwinder stuck at %s\n",
31285 + UNW_PC(&info));
31286 + if ((long)UNW_SP(&info) < 0) {
31287 + printk("Leftover inexact backtrace:\n");
31288 + stack = (unsigned long *)UNW_SP(&info);
31289 + } else
31290 + printk("Full inexact backtrace again:\n");
31291 + } else if (call_trace >= 1)
31292 + return;
31293 + else
31294 + printk("Full inexact backtrace again:\n");
31295 + } else
31296 + printk("Inexact backtrace:\n");
31297 + }
31298 +
31299 + /*
31300 + * Print function call entries within a stack. 'cond' is the
31301 + * "end of stackframe" condition, that the 'stack++'
31302 + * iteration will eventually trigger.
31303 + */
31304 +#define HANDLE_STACK(cond) \
31305 + do while (cond) { \
31306 + unsigned long addr = *stack++; \
31307 + if (kernel_text_address(addr)) { \
31308 + /* \
31309 + * If the address is either in the text segment of the \
31310 + * kernel, or in the region which contains vmalloc'ed \
31311 + * memory, it *may* be the address of a calling \
31312 + * routine; if so, print it so that someone tracing \
31313 + * down the cause of the crash will be able to figure \
31314 + * out the call path that was taken. \
31315 + */ \
31316 + printk_address(addr); \
31317 + } \
31318 + } while (0)
31319 +
31320 + /*
31321 + * Print function call entries in all stacks, starting at the
31322 +	 * current stack address. If the stacks consist of nested
31323 +	 * exceptions, each nested stack is walked and printed in turn.
31324 + */
31325 + for ( ; ; ) {
31326 + const char *id;
31327 + unsigned long *estack_end;
31328 + estack_end = in_exception_stack(cpu, (unsigned long)stack,
31329 + &used, &id);
31330 +
31331 + if (estack_end) {
31332 + printk(" <%s>", id);
31333 + HANDLE_STACK (stack < estack_end);
31334 + printk(" <EOE>");
31335 + /*
31336 + * We link to the next stack via the
31337 + * second-to-last pointer (index -2 to end) in the
31338 + * exception stack:
31339 + */
31340 + stack = (unsigned long *) estack_end[-2];
31341 + continue;
31342 + }
31343 + if (irqstack_end) {
31344 + unsigned long *irqstack;
31345 + irqstack = irqstack_end -
31346 + (IRQSTACKSIZE - 64) / sizeof(*irqstack);
31347 +
31348 + if (stack >= irqstack && stack < irqstack_end) {
31349 + printk(" <IRQ>");
31350 + HANDLE_STACK (stack < irqstack_end);
31351 + /*
31352 + * We link to the next stack (which would be
31353 + * the process stack normally) the last
31354 + * pointer (index -1 to end) in the IRQ stack:
31355 + */
31356 + stack = (unsigned long *) (irqstack_end[-1]);
31357 + irqstack_end = NULL;
31358 + printk(" <EOI>");
31359 + continue;
31360 + }
31361 + }
31362 + break;
31363 + }
31364 +
31365 + /*
31366 + * This prints the process stack:
31367 + */
31368 + HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
31369 +#undef HANDLE_STACK
31370 +
31371 + printk("\n");
31372 +}
31373 +
31374 +static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp)
31375 +{
31376 + unsigned long *stack;
31377 + int i;
31378 + const int cpu = safe_smp_processor_id();
31379 + unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
31380 + unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
31381 +
31382 + // debugging aid: "show_stack(NULL, NULL);" prints the
31383 + // back trace for this cpu.
31384 +
31385 + if (rsp == NULL) {
31386 + if (tsk)
31387 + rsp = (unsigned long *)tsk->thread.rsp;
31388 + else
31389 + rsp = (unsigned long *)&rsp;
31390 + }
31391 +
31392 + stack = rsp;
31393 +	for (i = 0; i < kstack_depth_to_print; i++) {
31394 + if (stack >= irqstack && stack <= irqstack_end) {
31395 + if (stack == irqstack_end) {
31396 + stack = (unsigned long *) (irqstack_end[-1]);
31397 + printk(" <EOI> ");
31398 + }
31399 + } else {
31400 + if (((long) stack & (THREAD_SIZE-1)) == 0)
31401 + break;
31402 + }
31403 + if (i && ((i % 4) == 0))
31404 + printk("\n");
31405 + printk(" %016lx", *stack++);
31406 + touch_nmi_watchdog();
31407 + }
31408 + show_trace(tsk, regs, rsp);
31409 +}
31410 +
31411 +void show_stack(struct task_struct *tsk, unsigned long * rsp)
31412 +{
31413 + _show_stack(tsk, NULL, rsp);
31414 +}
31415 +
31416 +/*
31417 + * The architecture-independent dump_stack generator
31418 + */
31419 +void dump_stack(void)
31420 +{
31421 + unsigned long dummy;
31422 + show_trace(NULL, NULL, &dummy);
31423 +}
31424 +
31425 +EXPORT_SYMBOL(dump_stack);
31426 +
31427 +void show_registers(struct pt_regs *regs)
31428 +{
31429 + int i;
31430 + int in_kernel = !user_mode(regs);
31431 + unsigned long rsp;
31432 + const int cpu = safe_smp_processor_id();
31433 + struct task_struct *cur = cpu_pda(cpu)->pcurrent;
31434 +
31435 + rsp = regs->rsp;
31436 +
31437 + printk("CPU %d ", cpu);
31438 + __show_regs(regs);
31439 + printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
31440 + cur->comm, cur->pid, task_thread_info(cur), cur);
31441 +
31442 + /*
31443 + * When in-kernel, we also print out the stack and code at the
31444 + * time of the fault..
31445 + */
31446 + if (in_kernel) {
31447 +
31448 + printk("Stack: ");
31449 + _show_stack(NULL, regs, (unsigned long*)rsp);
31450 +
31451 + printk("\nCode: ");
31452 + if (regs->rip < PAGE_OFFSET)
31453 + goto bad;
31454 +
31455 + for (i=0; i<20; i++) {
31456 + unsigned char c;
31457 + if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
31458 +bad:
31459 + printk(" Bad RIP value.");
31460 + break;
31461 + }
31462 + printk("%02x ", c);
31463 + }
31464 + }
31465 + printk("\n");
31466 +}
31467 +
31468 +void handle_BUG(struct pt_regs *regs)
31469 +{
31470 + struct bug_frame f;
31471 + long len;
31472 + const char *prefix = "";
31473 +
31474 + if (user_mode(regs))
31475 + return;
31476 + if (__copy_from_user(&f, (const void __user *) regs->rip,
31477 + sizeof(struct bug_frame)))
31478 + return;
31479 + if (f.filename >= 0 ||
31480 + f.ud2[0] != 0x0f || f.ud2[1] != 0x0b)
31481 + return;
31482 + len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1;
31483 + if (len < 0 || len >= PATH_MAX)
31484 + f.filename = (int)(long)"unmapped filename";
31485 + else if (len > 50) {
31486 + f.filename += len - 50;
31487 + prefix = "...";
31488 + }
31489 + printk("----------- [cut here ] --------- [please bite here ] ---------\n");
31490 + printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line);
31491 +}
31492 +
31493 +#ifdef CONFIG_BUG
31494 +void out_of_line_bug(void)
31495 +{
31496 + BUG();
31497 +}
31498 +EXPORT_SYMBOL(out_of_line_bug);
31499 +#endif
31500 +
31501 +static DEFINE_SPINLOCK(die_lock);
31502 +static int die_owner = -1;
31503 +static unsigned int die_nest_count;
31504 +
31505 +unsigned __kprobes long oops_begin(void)
31506 +{
31507 + int cpu = safe_smp_processor_id();
31508 + unsigned long flags;
31509 +
31510 + /* racy, but better than risking deadlock. */
31511 + local_irq_save(flags);
31512 + if (!spin_trylock(&die_lock)) {
31513 + if (cpu == die_owner)
31514 + /* nested oops. should stop eventually */;
31515 + else
31516 + spin_lock(&die_lock);
31517 + }
31518 + die_nest_count++;
31519 + die_owner = cpu;
31520 + console_verbose();
31521 + bust_spinlocks(1);
31522 + return flags;
31523 +}
31524 +
31525 +void __kprobes oops_end(unsigned long flags)
31526 +{
31527 + die_owner = -1;
31528 + bust_spinlocks(0);
31529 + die_nest_count--;
31530 + if (die_nest_count)
31531 + /* We still own the lock */
31532 + local_irq_restore(flags);
31533 + else
31534 + /* Nest count reaches zero, release the lock. */
31535 + spin_unlock_irqrestore(&die_lock, flags);
31536 + if (panic_on_oops)
31537 + panic("Fatal exception");
31538 +}
31539 +
31540 +void __kprobes __die(const char * str, struct pt_regs * regs, long err)
31541 +{
31542 + static int die_counter;
31543 +	printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
31544 +#ifdef CONFIG_PREEMPT
31545 + printk("PREEMPT ");
31546 +#endif
31547 +#ifdef CONFIG_SMP
31548 + printk("SMP ");
31549 +#endif
31550 +#ifdef CONFIG_DEBUG_PAGEALLOC
31551 + printk("DEBUG_PAGEALLOC");
31552 +#endif
31553 + printk("\n");
31554 + notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
31555 + show_registers(regs);
31556 + /* Executive summary in case the oops scrolled away */
31557 + printk(KERN_ALERT "RIP ");
31558 + printk_address(regs->rip);
31559 + printk(" RSP <%016lx>\n", regs->rsp);
31560 + if (kexec_should_crash(current))
31561 + crash_kexec(regs);
31562 +}
31563 +
31564 +void die(const char * str, struct pt_regs * regs, long err)
31565 +{
31566 + unsigned long flags = oops_begin();
31567 +
31568 + handle_BUG(regs);
31569 + __die(str, regs, err);
31570 + oops_end(flags);
31571 + do_exit(SIGSEGV);
31572 +}
31573 +
31574 +#ifdef CONFIG_X86_LOCAL_APIC
31575 +void __kprobes die_nmi(char *str, struct pt_regs *regs)
31576 +{
31577 + unsigned long flags = oops_begin();
31578 +
31579 + /*
31580 +	 * We are in trouble anyway, let's at least try
31581 + * to get a message out.
31582 + */
31583 + printk(str, safe_smp_processor_id());
31584 + show_registers(regs);
31585 + if (kexec_should_crash(current))
31586 + crash_kexec(regs);
31587 + if (panic_on_timeout || panic_on_oops)
31588 + panic("nmi watchdog");
31589 + printk("console shuts up ...\n");
31590 + oops_end(flags);
31591 + nmi_exit();
31592 + local_irq_enable();
31593 + do_exit(SIGSEGV);
31594 +}
31595 +#endif
31596 +
31597 +static void __kprobes do_trap(int trapnr, int signr, char *str,
31598 + struct pt_regs * regs, long error_code,
31599 + siginfo_t *info)
31600 +{
31601 + struct task_struct *tsk = current;
31602 +
31603 + tsk->thread.error_code = error_code;
31604 + tsk->thread.trap_no = trapnr;
31605 +
31606 + if (user_mode(regs)) {
31607 + if (exception_trace && unhandled_signal(tsk, signr))
31608 + printk(KERN_INFO
31609 + "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
31610 + tsk->comm, tsk->pid, str,
31611 + regs->rip, regs->rsp, error_code);
31612 +
31613 + if (info)
31614 + force_sig_info(signr, info, tsk);
31615 + else
31616 + force_sig(signr, tsk);
31617 + return;
31618 + }
31619 +
31620 +
31621 + /* kernel trap */
31622 + {
31623 + const struct exception_table_entry *fixup;
31624 + fixup = search_exception_tables(regs->rip);
31625 + if (fixup)
31626 + regs->rip = fixup->fixup;
31627 + else
31628 + die(str, regs, error_code);
31629 + return;
31630 + }
31631 +}
31632 +
31633 +#define DO_ERROR(trapnr, signr, str, name) \
31634 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
31635 +{ \
31636 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
31637 + == NOTIFY_STOP) \
31638 + return; \
31639 + conditional_sti(regs); \
31640 + do_trap(trapnr, signr, str, regs, error_code, NULL); \
31641 +}
31642 +
31643 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
31644 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
31645 +{ \
31646 + siginfo_t info; \
31647 + info.si_signo = signr; \
31648 + info.si_errno = 0; \
31649 + info.si_code = sicode; \
31650 + info.si_addr = (void __user *)siaddr; \
31651 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
31652 + == NOTIFY_STOP) \
31653 + return; \
31654 + conditional_sti(regs); \
31655 + do_trap(trapnr, signr, str, regs, error_code, &info); \
31656 +}
31657 +
31658 +DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
31659 +DO_ERROR( 4, SIGSEGV, "overflow", overflow)
31660 +DO_ERROR( 5, SIGSEGV, "bounds", bounds)
31661 +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
31662 +DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
31663 +DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
31664 +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
31665 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
31666 +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
31667 +DO_ERROR(18, SIGSEGV, "reserved", reserved)
31668 +
31669 +/* Runs on IST stack */
31670 +asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
31671 +{
31672 + if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
31673 + 12, SIGBUS) == NOTIFY_STOP)
31674 + return;
31675 + preempt_conditional_sti(regs);
31676 + do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
31677 + preempt_conditional_cli(regs);
31678 +}
31679 +
31680 +asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
31681 +{
31682 + static const char str[] = "double fault";
31683 + struct task_struct *tsk = current;
31684 +
31685 +	/* Return not checked because a double fault cannot be ignored */
31686 + notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
31687 +
31688 + tsk->thread.error_code = error_code;
31689 + tsk->thread.trap_no = 8;
31690 +
31691 + /* This is always a kernel trap and never fixable (and thus must
31692 + never return). */
31693 + for (;;)
31694 + die(str, regs, error_code);
31695 +}
31696 +
31697 +asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
31698 + long error_code)
31699 +{
31700 + struct task_struct *tsk = current;
31701 +
31702 + conditional_sti(regs);
31703 +
31704 + tsk->thread.error_code = error_code;
31705 + tsk->thread.trap_no = 13;
31706 +
31707 + if (user_mode(regs)) {
31708 + if (exception_trace && unhandled_signal(tsk, SIGSEGV))
31709 + printk(KERN_INFO
31710 + "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
31711 + tsk->comm, tsk->pid,
31712 + regs->rip, regs->rsp, error_code);
31713 +
31714 + force_sig(SIGSEGV, tsk);
31715 + return;
31716 + }
31717 +
31718 + /* kernel gp */
31719 + {
31720 + const struct exception_table_entry *fixup;
31721 + fixup = search_exception_tables(regs->rip);
31722 + if (fixup) {
31723 + regs->rip = fixup->fixup;
31724 + return;
31725 + }
31726 + if (notify_die(DIE_GPF, "general protection fault", regs,
31727 + error_code, 13, SIGSEGV) == NOTIFY_STOP)
31728 + return;
31729 + die("general protection fault", regs, error_code);
31730 + }
31731 +}
31732 +
31733 +static __kprobes void
31734 +mem_parity_error(unsigned char reason, struct pt_regs * regs)
31735 +{
31736 + printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
31737 + printk("You probably have a hardware problem with your RAM chips\n");
31738 +
31739 +#if 0 /* XEN */
31740 + /* Clear and disable the memory parity error line. */
31741 + reason = (reason & 0xf) | 4;
31742 + outb(reason, 0x61);
31743 +#endif /* XEN */
31744 +}
31745 +
31746 +static __kprobes void
31747 +io_check_error(unsigned char reason, struct pt_regs * regs)
31748 +{
31749 + printk("NMI: IOCK error (debug interrupt?)\n");
31750 + show_registers(regs);
31751 +
31752 +#if 0 /* XEN */
31753 + /* Re-enable the IOCK line, wait for a few seconds */
31754 + reason = (reason & 0xf) | 8;
31755 + outb(reason, 0x61);
31756 + mdelay(2000);
31757 + reason &= ~8;
31758 + outb(reason, 0x61);
31759 +#endif /* XEN */
31760 +}
31761 +
31762 +static __kprobes void
31763 +unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
31764 +{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
31765 + printk("Dazed and confused, but trying to continue\n");
31766 + printk("Do you have a strange power saving mode enabled?\n");
31767 +}
31768 +
31769 +/* Runs on IST stack. This code must keep interrupts off all the time.
31770 + Nested NMIs are prevented by the CPU. */
31771 +asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
31772 +{
31773 + unsigned char reason = 0;
31774 + int cpu;
31775 +
31776 + cpu = smp_processor_id();
31777 +
31778 + /* Only the BSP gets external NMIs from the system. */
31779 + if (!cpu)
31780 + reason = get_nmi_reason();
31781 +
31782 + if (!(reason & 0xc0)) {
31783 + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
31784 + == NOTIFY_STOP)
31785 + return;
31786 +#ifdef CONFIG_X86_LOCAL_APIC
31787 + /*
31788 + * Ok, so this is none of the documented NMI sources,
31789 + * so it must be the NMI watchdog.
31790 + */
31791 + if (nmi_watchdog > 0) {
31792 + nmi_watchdog_tick(regs,reason);
31793 + return;
31794 + }
31795 +#endif
31796 + unknown_nmi_error(reason, regs);
31797 + return;
31798 + }
31799 + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
31800 + return;
31801 +
31802 + /* AK: following checks seem to be broken on modern chipsets. FIXME */
31803 +
31804 + if (reason & 0x80)
31805 + mem_parity_error(reason, regs);
31806 + if (reason & 0x40)
31807 + io_check_error(reason, regs);
31808 +}
31809 +
31810 +/* runs on IST stack. */
31811 +asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
31812 +{
31813 + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
31814 + return;
31815 + }
31816 + preempt_conditional_sti(regs);
31817 + do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
31818 + preempt_conditional_cli(regs);
31819 +}
31820 +
31821 +/* Help handler running on IST stack to switch back to user stack
31822 + for scheduling or signal handling. The actual stack switch is done in
31823 + entry.S */
31824 +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
31825 +{
31826 + struct pt_regs *regs = eregs;
31827 + /* Did already sync */
31828 + if (eregs == (struct pt_regs *)eregs->rsp)
31829 + ;
31830 + /* Exception from user space */
31831 + else if (user_mode(eregs))
31832 + regs = task_pt_regs(current);
31833 + /* Exception from kernel and interrupts are enabled. Move to
31834 + kernel process stack. */
31835 + else if (eregs->eflags & X86_EFLAGS_IF)
31836 + regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
31837 + if (eregs != regs)
31838 + *regs = *eregs;
31839 + return regs;
31840 +}
31841 +
31842 +/* runs on IST stack. */
31843 +asmlinkage void __kprobes do_debug(struct pt_regs * regs,
31844 + unsigned long error_code)
31845 +{
31846 + unsigned long condition;
31847 + struct task_struct *tsk = current;
31848 + siginfo_t info;
31849 +
31850 + get_debugreg(condition, 6);
31851 +
31852 + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
31853 + SIGTRAP) == NOTIFY_STOP)
31854 + return;
31855 +
31856 + preempt_conditional_sti(regs);
31857 +
31858 + /* Mask out spurious debug traps due to lazy DR7 setting */
31859 + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
31860 + if (!tsk->thread.debugreg7) {
31861 + goto clear_dr7;
31862 + }
31863 + }
31864 +
31865 + tsk->thread.debugreg6 = condition;
31866 +
31867 + /* Mask out spurious TF errors due to lazy TF clearing */
31868 + if (condition & DR_STEP) {
31869 + /*
31870 + * The TF error should be masked out only if the current
31871 + * process is not traced and if the TRAP flag has been set
31872 + * previously by a tracing process (condition detected by
31873 + * the PT_DTRACE flag); remember that the i386 TRAP flag
31874 + * can be modified by the process itself in user mode,
31875 + * allowing programs to debug themselves without the ptrace()
31876 + * interface.
31877 + */
31878 + if (!user_mode(regs))
31879 + goto clear_TF_reenable;
31880 + /*
31881 + * Was the TF flag set by a debugger? If so, clear it now,
31882 + * so that register information is correct.
31883 + */
31884 + if (tsk->ptrace & PT_DTRACE) {
31885 + regs->eflags &= ~TF_MASK;
31886 + tsk->ptrace &= ~PT_DTRACE;
31887 + }
31888 + }
31889 +
31890 + /* Ok, finally something we can handle */
31891 + tsk->thread.trap_no = 1;
31892 + tsk->thread.error_code = error_code;
31893 + info.si_signo = SIGTRAP;
31894 + info.si_errno = 0;
31895 + info.si_code = TRAP_BRKPT;
31896 + info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
31897 + force_sig_info(SIGTRAP, &info, tsk);
31898 +
31899 +clear_dr7:
31900 + set_debugreg(0UL, 7);
31901 + preempt_conditional_cli(regs);
31902 + return;
31903 +
31904 +clear_TF_reenable:
31905 + set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
31906 + regs->eflags &= ~TF_MASK;
31907 + preempt_conditional_cli(regs);
31908 +}
31909 +
31910 +static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
31911 +{
31912 + const struct exception_table_entry *fixup;
31913 + fixup = search_exception_tables(regs->rip);
31914 + if (fixup) {
31915 + regs->rip = fixup->fixup;
31916 + return 1;
31917 + }
31918 + notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
31919 + /* Illegal floating point operation in the kernel */
31920 + current->thread.trap_no = trapnr;
31921 + die(str, regs, 0);
31922 + return 0;
31923 +}
31924 +
31925 +/*
31926 + * Note that we play around with the 'TS' bit in an attempt to get
31927 + * the correct behaviour even in the presence of the asynchronous
31928 + * IRQ13 behaviour
31929 + */
31930 +asmlinkage void do_coprocessor_error(struct pt_regs *regs)
31931 +{
31932 + void __user *rip = (void __user *)(regs->rip);
31933 + struct task_struct * task;
31934 + siginfo_t info;
31935 + unsigned short cwd, swd;
31936 +
31937 + conditional_sti(regs);
31938 + if (!user_mode(regs) &&
31939 + kernel_math_error(regs, "kernel x87 math error", 16))
31940 + return;
31941 +
31942 + /*
31943 + * Save the info for the exception handler and clear the error.
31944 + */
31945 + task = current;
31946 + save_init_fpu(task);
31947 + task->thread.trap_no = 16;
31948 + task->thread.error_code = 0;
31949 + info.si_signo = SIGFPE;
31950 + info.si_errno = 0;
31951 + info.si_code = __SI_FAULT;
31952 + info.si_addr = rip;
31953 + /*
31954 + * (~cwd & swd) will mask out exceptions that are not set to unmasked
31955 + * status. 0x3f is the exception bits in these regs, 0x200 is the
31956 + * C1 reg you need in case of a stack fault, 0x040 is the stack
31957 + * fault bit. We should only be taking one exception at a time,
31958 + * so if this combination doesn't produce any single exception,
31959 + * then we have a bad program that isn't synchronizing its FPU usage
31960 + * and it will suffer the consequences since we won't be able to
31961 + * fully reproduce the context of the exception
31962 + */
31963 + cwd = get_fpu_cwd(task);
31964 + swd = get_fpu_swd(task);
31965 + switch (swd & ~cwd & 0x3f) {
31966 + case 0x000:
31967 + default:
31968 + break;
31969 + case 0x001: /* Invalid Op */
31970 + /*
31971 + * swd & 0x240 == 0x040: Stack Underflow
31972 + * swd & 0x240 == 0x240: Stack Overflow
31973 + * User must clear the SF bit (0x40) if set
31974 + */
31975 + info.si_code = FPE_FLTINV;
31976 + break;
31977 + case 0x002: /* Denormalize */
31978 + case 0x010: /* Underflow */
31979 + info.si_code = FPE_FLTUND;
31980 + break;
31981 + case 0x004: /* Zero Divide */
31982 + info.si_code = FPE_FLTDIV;
31983 + break;
31984 + case 0x008: /* Overflow */
31985 + info.si_code = FPE_FLTOVF;
31986 + break;
31987 + case 0x020: /* Precision */
31988 + info.si_code = FPE_FLTRES;
31989 + break;
31990 + }
31991 + force_sig_info(SIGFPE, &info, task);
31992 +}
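A worked example of the (~cwd & swd) masking described in the comment above, with made-up register values (illustrative only, not part of the patch):

/* The default x87 control word is 0x037f; clearing bit 2 (ZM) unmasks
 * divide-by-zero, and an actual zero divide sets bit 2 (ZE) in the
 * status word. */
unsigned short cwd = 0x037b;	/* 0x037f with ZM cleared */
unsigned short swd = 0x0004;	/* ZE set                 */
/* swd & ~cwd & 0x3f == 0x004, so the switch above selects FPE_FLTDIV. */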
31993 +
31994 +asmlinkage void bad_intr(void)
31995 +{
31996 + printk("bad interrupt");
31997 +}
31998 +
31999 +asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
32000 +{
32001 + void __user *rip = (void __user *)(regs->rip);
32002 + struct task_struct * task;
32003 + siginfo_t info;
32004 + unsigned short mxcsr;
32005 +
32006 + conditional_sti(regs);
32007 + if (!user_mode(regs) &&
32008 + kernel_math_error(regs, "kernel simd math error", 19))
32009 + return;
32010 +
32011 + /*
32012 + * Save the info for the exception handler and clear the error.
32013 + */
32014 + task = current;
32015 + save_init_fpu(task);
32016 + task->thread.trap_no = 19;
32017 + task->thread.error_code = 0;
32018 + info.si_signo = SIGFPE;
32019 + info.si_errno = 0;
32020 + info.si_code = __SI_FAULT;
32021 + info.si_addr = rip;
32022 + /*
32023 + * The SIMD FPU exceptions are handled a little differently, as there
32024 + * is only a single status/control register. Thus, to determine which
32025 + * unmasked exception was caught we must mask the exception mask bits
32026 + * at 0x1f80, and then use these to mask the exception bits at 0x3f.
32027 + */
32028 + mxcsr = get_fpu_mxcsr(task);
32029 + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
32030 + case 0x000:
32031 + default:
32032 + break;
32033 + case 0x001: /* Invalid Op */
32034 + info.si_code = FPE_FLTINV;
32035 + break;
32036 + case 0x002: /* Denormalize */
32037 + case 0x010: /* Underflow */
32038 + info.si_code = FPE_FLTUND;
32039 + break;
32040 + case 0x004: /* Zero Divide */
32041 + info.si_code = FPE_FLTDIV;
32042 + break;
32043 + case 0x008: /* Overflow */
32044 + info.si_code = FPE_FLTOVF;
32045 + break;
32046 + case 0x020: /* Precision */
32047 + info.si_code = FPE_FLTRES;
32048 + break;
32049 + }
32050 + force_sig_info(SIGFPE, &info, task);
32051 +}
32052 +
32053 +asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
32054 +{
32055 +}
32056 +
32057 +#if 0
32058 +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
32059 +{
32060 +}
32061 +#endif
32062 +
32063 +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
32064 +{
32065 +}
32066 +
32067 +/*
32068 + * 'math_state_restore()' saves the current math information in the
32069 + * old math state array, and gets the new ones from the current task
32070 + *
32071 + * Careful.. There are problems with IBM-designed IRQ13 behaviour.
32072 + * Don't touch unless you *really* know how it works.
32073 + */
32074 +asmlinkage void math_state_restore(void)
32075 +{
32076 + struct task_struct *me = current;
32077 + /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
32078 +
32079 + if (!used_math())
32080 + init_fpu(me);
32081 + restore_fpu_checking(&me->thread.i387.fxsave);
32082 + task_thread_info(me)->status |= TS_USEDFPU;
32083 +}
32084 +
32085 +
32086 +/*
32087 + * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
32088 + * specify <dpl>|4 in the second field.
32089 + */
32090 +static trap_info_t __cpuinitdata trap_table[] = {
32091 + { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error },
32092 + { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
32093 + { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
32094 + { 4, 3|4, __KERNEL_CS, (unsigned long)overflow },
32095 + { 5, 0|4, __KERNEL_CS, (unsigned long)bounds },
32096 + { 6, 0|4, __KERNEL_CS, (unsigned long)invalid_op },
32097 + { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available },
32098 + { 9, 0|4, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun},
32099 + { 10, 0|4, __KERNEL_CS, (unsigned long)invalid_TSS },
32100 + { 11, 0|4, __KERNEL_CS, (unsigned long)segment_not_present },
32101 + { 12, 0|4, __KERNEL_CS, (unsigned long)stack_segment },
32102 + { 13, 0|4, __KERNEL_CS, (unsigned long)general_protection },
32103 + { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault },
32104 + { 15, 0|4, __KERNEL_CS, (unsigned long)spurious_interrupt_bug },
32105 + { 16, 0|4, __KERNEL_CS, (unsigned long)coprocessor_error },
32106 + { 17, 0|4, __KERNEL_CS, (unsigned long)alignment_check },
32107 +#ifdef CONFIG_X86_MCE
32108 + { 18, 0|4, __KERNEL_CS, (unsigned long)machine_check },
32109 +#endif
32110 + { 19, 0|4, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
32111 +#ifdef CONFIG_IA32_EMULATION
32112 + { IA32_SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)ia32_syscall},
32113 +#endif
32114 + { 0, 0, 0, 0 }
32115 +};
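As a reading aid for the <dpl>|4 convention noted above the table, a sketch decoding one entry (illustrative only; it relies solely on the trap_info_t fields this file already uses):

/* Decode of the int3 entry { 3, 3|4, __KERNEL_CS, (unsigned long)int3 }: */
const trap_info_t *t_int3 = &trap_table[2];
unsigned int dpl    = t_int3->flags & 3;	/* 3: may be raised from user mode            */
unsigned int masked = t_int3->flags & 4;	/* set: event delivery stays masked on entry, */
						/* i.e. the vector acts as an interrupt gate */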
32116 +
32117 +void __init trap_init(void)
32118 +{
32119 + int ret;
32120 +
32121 + ret = HYPERVISOR_set_trap_table(trap_table);
32122 + if (ret)
32123 + printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
32124 +
32125 + /*
32126 + * Should be a barrier for any external CPU state.
32127 + */
32128 + cpu_init();
32129 +}
32130 +
32131 +void __cpuinit smp_trap_init(trap_info_t *trap_ctxt)
32132 +{
32133 + const trap_info_t *t = trap_table;
32134 +
32135 + for (t = trap_table; t->address; t++) {
32136 + trap_ctxt[t->vector].flags = t->flags;
32137 + trap_ctxt[t->vector].cs = t->cs;
32138 + trap_ctxt[t->vector].address = t->address;
32139 + }
32140 +}
32141 +
32142 +
32143 +/* Actual parsing is done early in setup.c. */
32144 +static int __init oops_dummy(char *s)
32145 +{
32146 + panic_on_oops = 1;
32147 + return 1;
32148 +}
32149 +__setup("oops=", oops_dummy);
32150 +
32151 +static int __init kstack_setup(char *s)
32152 +{
32153 +	kstack_depth_to_print = simple_strtoul(s, NULL, 0);
32154 + return 1;
32155 +}
32156 +__setup("kstack=", kstack_setup);
32157 +
32158 +#ifdef CONFIG_STACK_UNWIND
32159 +static int __init call_trace_setup(char *s)
32160 +{
32161 + if (strcmp(s, "old") == 0)
32162 + call_trace = -1;
32163 + else if (strcmp(s, "both") == 0)
32164 + call_trace = 0;
32165 + else if (strcmp(s, "newfallback") == 0)
32166 + call_trace = 1;
32167 + else if (strcmp(s, "new") == 0)
32168 + call_trace = 2;
32169 + return 1;
32170 +}
32171 +__setup("call_trace=", call_trace_setup);
32172 +#endif
32173 Index: head-2008-11-25/arch/x86/kernel/vsyscall_64-xen.c
32174 ===================================================================
32175 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
32176 +++ head-2008-11-25/arch/x86/kernel/vsyscall_64-xen.c 2007-06-18 08:38:13.000000000 +0200
32177 @@ -0,0 +1,227 @@
32178 +/*
32179 + * linux/arch/x86_64/kernel/vsyscall.c
32180 + *
32181 + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
32182 + * Copyright 2003 Andi Kleen, SuSE Labs.
32183 + *
32184 + * Thanks to hpa@transmeta.com for some useful hints.
32185 + * Special thanks to Ingo Molnar for his early experience with
32186 + * a different vsyscall implementation for Linux/IA32 and for the name.
32187 + *
32188 + * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
32189 + * at virtual address -10Mbyte+1024bytes etc... There are at max 4
32190 + * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
32191 + * jumping out of line if necessary. We cannot add more with this
32192 + * mechanism because older kernels won't return -ENOSYS.
32193 + * If we want more than four we need a vDSO.
32194 + *
32195 + * Note: the concept clashes with user mode linux. If you use UML and
32196 + * want per guest time just set the kernel.vsyscall64 sysctl to 0.
32197 + */
32198 +
32199 +#include <linux/time.h>
32200 +#include <linux/init.h>
32201 +#include <linux/kernel.h>
32202 +#include <linux/timer.h>
32203 +#include <linux/seqlock.h>
32204 +#include <linux/jiffies.h>
32205 +#include <linux/sysctl.h>
32206 +
32207 +#include <asm/vsyscall.h>
32208 +#include <asm/pgtable.h>
32209 +#include <asm/page.h>
32210 +#include <asm/fixmap.h>
32211 +#include <asm/errno.h>
32212 +#include <asm/io.h>
32213 +
32214 +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
32215 +
32216 +int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
32217 +seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
32218 +
32219 +#include <asm/unistd.h>
32220 +
32221 +static __always_inline void timeval_normalize(struct timeval * tv)
32222 +{
32223 + time_t __sec;
32224 +
32225 + __sec = tv->tv_usec / 1000000;
32226 + if (__sec) {
32227 + tv->tv_usec %= 1000000;
32228 + tv->tv_sec += __sec;
32229 + }
32230 +}
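A quick usage sketch of the helper above, with illustrative values:

struct timeval tv = { .tv_sec = 1, .tv_usec = 2500000 };
timeval_normalize(&tv);		/* tv is now { .tv_sec = 3, .tv_usec = 500000 } */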
32231 +
32232 +static __always_inline void do_vgettimeofday(struct timeval * tv)
32233 +{
32234 + long sequence, t;
32235 + unsigned long sec, usec;
32236 +
32237 + do {
32238 + sequence = read_seqbegin(&__xtime_lock);
32239 +
32240 + sec = __xtime.tv_sec;
32241 + usec = (__xtime.tv_nsec / 1000) +
32242 + (__jiffies - __wall_jiffies) * (1000000 / HZ);
32243 +
32244 + if (__vxtime.mode != VXTIME_HPET) {
32245 + t = get_cycles_sync();
32246 + if (t < __vxtime.last_tsc)
32247 + t = __vxtime.last_tsc;
32248 + usec += ((t - __vxtime.last_tsc) *
32249 + __vxtime.tsc_quot) >> 32;
32250 + /* See comment in x86_64 do_gettimeofday. */
32251 + } else {
32252 + usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
32253 + __vxtime.last) * __vxtime.quot) >> 32;
32254 + }
32255 + } while (read_seqretry(&__xtime_lock, sequence));
32256 +
32257 + tv->tv_sec = sec + usec / 1000000;
32258 + tv->tv_usec = usec % 1000000;
32259 +}
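
The read_seqbegin()/read_seqretry() loop above is the standard seqlock reader pattern: retry whenever a writer was active (odd sequence count) or the count changed while the values were being copied. A simplified userspace analogue with C11 atomics is sketched below; writer_update() and reader_snapshot() are hypothetical names, and a production seqlock additionally needs memory barriers around the plain data accesses:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned seq;            /* even: idle, odd: writer active */
static unsigned long shared_sec, shared_usec;

static void writer_update(unsigned long sec, unsigned long usec)
{
        atomic_fetch_add(&seq, 1);      /* becomes odd: readers will retry */
        shared_sec = sec;
        shared_usec = usec;
        atomic_fetch_add(&seq, 1);      /* even again: snapshot is stable */
}

static void reader_snapshot(unsigned long *sec, unsigned long *usec)
{
        unsigned start;

        do {
                start = atomic_load(&seq);
                *sec = shared_sec;
                *usec = shared_usec;
        } while ((start & 1) || atomic_load(&seq) != start);
}

int main(void)
{
        unsigned long s, u;

        writer_update(1000, 250000);
        reader_snapshot(&s, &u);
        printf("%lu.%06lu\n", s, u);
        return 0;
}
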
32260 +
32261 +/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
32262 +static __always_inline void do_get_tz(struct timezone * tz)
32263 +{
32264 + *tz = __sys_tz;
32265 +}
32266 +
32267 +static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
32268 +{
32269 + int ret;
32270 + asm volatile("vsysc2: syscall"
32271 + : "=a" (ret)
32272 + : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
32273 + return ret;
32274 +}
32275 +
32276 +static __always_inline long time_syscall(long *t)
32277 +{
32278 + long secs;
32279 + asm volatile("vsysc1: syscall"
32280 + : "=a" (secs)
32281 + : "0" (__NR_time),"D" (t) : __syscall_clobber);
32282 + return secs;
32283 +}
32284 +
32285 +int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
32286 +{
32287 + if (!__sysctl_vsyscall)
32288 + return gettimeofday(tv,tz);
32289 + if (tv)
32290 + do_vgettimeofday(tv);
32291 + if (tz)
32292 + do_get_tz(tz);
32293 + return 0;
32294 +}
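
When the fast path is disabled, the function falls back to the vsysc2 stub, i.e. a plain syscall instruction. The same fallback can be demonstrated from userspace on x86-64; in this sketch raw_gettimeofday() is a hypothetical helper, and the clobber list reflects the x86-64 syscall ABI (rcx and r11 are trashed by the instruction):

#include <stdio.h>
#include <sys/time.h>
#include <sys/syscall.h>

static long raw_gettimeofday(struct timeval *tv, struct timezone *tz)
{
        long ret;

        __asm__ volatile("syscall"
                         : "=a" (ret)
                         : "0" ((long)SYS_gettimeofday), "D" (tv), "S" (tz)
                         : "rcx", "r11", "memory");
        return ret;
}

int main(void)
{
        struct timeval tv;

        if (raw_gettimeofday(&tv, NULL) == 0)
                printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);
        return 0;
}
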
32295 +
32296 +/* This will break when the xtime seconds get inaccurate, but that is
32297 + * unlikely */
32298 +time_t __vsyscall(1) vtime(time_t *t)
32299 +{
32300 + if (!__sysctl_vsyscall)
32301 + return time_syscall(t);
32302 + else if (t)
32303 + *t = __xtime.tv_sec;
32304 + return __xtime.tv_sec;
32305 +}
32306 +
32307 +long __vsyscall(2) venosys_0(void)
32308 +{
32309 + return -ENOSYS;
32310 +}
32311 +
32312 +long __vsyscall(3) venosys_1(void)
32313 +{
32314 + return -ENOSYS;
32315 +}
32316 +
32317 +#ifdef CONFIG_SYSCTL
32318 +
32319 +#define SYSCALL 0x050f
32320 +#define NOP2 0x9090
32321 +
32322 +/*
32323 + * NOP out syscall in vsyscall page when not needed.
32324 + */
32325 +static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
32326 + void __user *buffer, size_t *lenp, loff_t *ppos)
32327 +{
32328 + extern u16 vsysc1, vsysc2;
32329 + u16 *map1, *map2;
32330 + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
32331 + if (!write)
32332 + return ret;
32333 + /* gcc has some trouble with __va(__pa()), so just do it this
32334 + way. */
32335 + map1 = ioremap(__pa_symbol(&vsysc1), 2);
32336 + if (!map1)
32337 + return -ENOMEM;
32338 + map2 = ioremap(__pa_symbol(&vsysc2), 2);
32339 + if (!map2) {
32340 + ret = -ENOMEM;
32341 + goto out;
32342 + }
32343 + if (!sysctl_vsyscall) {
32344 + *map1 = SYSCALL;
32345 + *map2 = SYSCALL;
32346 + } else {
32347 + *map1 = NOP2;
32348 + *map2 = NOP2;
32349 + }
32350 + iounmap(map2);
32351 +out:
32352 + iounmap(map1);
32353 + return ret;
32354 +}
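
The SYSCALL and NOP2 constants used above are 16-bit little-endian views of the two bytes being patched: 0x050f stores as 0f 05 (the syscall opcode) and 0x9090 as two nops. A tiny host-side check (assumes a little-endian x86 machine; illustrative only):

#include <stdio.h>
#include <string.h>

int main(void)
{
        unsigned short site = 0x050f;           /* SYSCALL above */
        unsigned char bytes[2];

        memcpy(bytes, &site, 2);
        printf("0x050f -> %02x %02x (syscall)\n", bytes[0], bytes[1]);

        site = 0x9090;                          /* NOP2 above */
        memcpy(bytes, &site, 2);
        printf("0x9090 -> %02x %02x (nop; nop)\n", bytes[0], bytes[1]);
        return 0;
}
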
32355 +
32356 +static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
32357 + void __user *oldval, size_t __user *oldlenp,
32358 + void __user *newval, size_t newlen,
32359 + void **context)
32360 +{
32361 + return -ENOSYS;
32362 +}
32363 +
32364 +static ctl_table kernel_table2[] = {
32365 + { .ctl_name = 99, .procname = "vsyscall64",
32366 + .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
32367 + .strategy = vsyscall_sysctl_nostrat,
32368 + .proc_handler = vsyscall_sysctl_change },
32369 + { 0, }
32370 +};
32371 +
32372 +static ctl_table kernel_root_table2[] = {
32373 + { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
32374 + .child = kernel_table2 },
32375 + { 0 },
32376 +};
32377 +
32378 +#endif
32379 +
32380 +static void __init map_vsyscall(void)
32381 +{
32382 + extern char __vsyscall_0;
32383 + unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
32384 +
32385 + __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
32386 +}
32387 +
32388 +static int __init vsyscall_init(void)
32389 +{
32390 + BUG_ON(((unsigned long) &vgettimeofday !=
32391 + VSYSCALL_ADDR(__NR_vgettimeofday)));
32392 + BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
32393 + BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
32394 + map_vsyscall();
32395 +#ifdef CONFIG_XEN
32396 + sysctl_vsyscall = 0; /* disable vgettimeofday() */
32397 +#endif
32398 +#ifdef CONFIG_SYSCTL
32399 + register_sysctl_table(kernel_root_table2, 0);
32400 +#endif
32401 + return 0;
32402 +}
32403 +
32404 +__initcall(vsyscall_init);
32405 Index: head-2008-11-25/arch/x86/kernel/xen_entry_64.S
32406 ===================================================================
32407 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
32408 +++ head-2008-11-25/arch/x86/kernel/xen_entry_64.S 2008-04-02 12:34:02.000000000 +0200
32409 @@ -0,0 +1,36 @@
32410 +/*
32411 + * Copied from arch/xen/i386/kernel/entry.S
32412 + */
32413 +/* Offsets into shared_info_t. */
32414 +#define evtchn_upcall_pending /* 0 */
32415 +#define evtchn_upcall_mask 1
32416 +
32417 +#define sizeof_vcpu_shift 6
32418 +
32419 +#ifdef CONFIG_SMP
32420 +//#define preempt_disable(reg) incl threadinfo_preempt_count(reg)
32421 +//#define preempt_enable(reg) decl threadinfo_preempt_count(reg)
32422 +#define preempt_disable(reg)
32423 +#define preempt_enable(reg)
32424 +#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \
32425 + movq %gs:pda_cpunumber,reg ; \
32426 + shl $32, reg ; \
32427 + shr $32-sizeof_vcpu_shift,reg ; \
32428 + addq HYPERVISOR_shared_info,reg
32429 +#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \
32430 +#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
32431 +#else
32432 +#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
32433 +#define XEN_PUT_VCPU_INFO(reg)
32434 +#define XEN_PUT_VCPU_INFO_fixup
32435 +#endif
32436 +
32437 +#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg)
32438 +#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
32439 +#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
32440 + XEN_LOCKED_BLOCK_EVENTS(reg) ; \
32441 + XEN_PUT_VCPU_INFO(reg)
32442 +#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
32443 + XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \
32444 + XEN_PUT_VCPU_INFO(reg)
32445 +#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg)
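
For reference, the shl $32 / shr $(32 - sizeof_vcpu_shift) pair in XEN_GET_VCPU_INFO truncates the CPU number to 32 bits and multiplies it by 1 << sizeof_vcpu_shift (64 bytes per vcpu_info), giving the byte offset of that vcpu's slot inside shared_info. The same arithmetic as standalone C (sketch; vcpu_info_offset() is a hypothetical name):

#include <stdio.h>

#define SIZEOF_VCPU_SHIFT 6             /* one vcpu_info slot is 64 bytes */

static unsigned long vcpu_info_offset(unsigned long cpu)
{
        cpu <<= 32;                     /* discard any high bits */
        cpu >>= 32 - SIZEOF_VCPU_SHIFT; /* == (original cpu & 0xffffffff) << 6 */
        return cpu;
}

int main(void)
{
        for (unsigned long cpu = 0; cpu < 4; cpu++)
                printf("cpu %lu -> offset %lu\n", cpu, vcpu_info_offset(cpu));
        return 0;
}
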
32446 Index: head-2008-11-25/arch/x86/mm/fault_64-xen.c
32447 ===================================================================
32448 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
32449 +++ head-2008-11-25/arch/x86/mm/fault_64-xen.c 2007-11-02 17:34:23.000000000 +0100
32450 @@ -0,0 +1,724 @@
32451 +/*
32452 + * linux/arch/x86-64/mm/fault.c
32453 + *
32454 + * Copyright (C) 1995 Linus Torvalds
32455 + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
32456 + */
32457 +
32458 +#include <linux/signal.h>
32459 +#include <linux/sched.h>
32460 +#include <linux/kernel.h>
32461 +#include <linux/errno.h>
32462 +#include <linux/string.h>
32463 +#include <linux/types.h>
32464 +#include <linux/ptrace.h>
32465 +#include <linux/mman.h>
32466 +#include <linux/mm.h>
32467 +#include <linux/smp.h>
32468 +#include <linux/smp_lock.h>
32469 +#include <linux/interrupt.h>
32470 +#include <linux/init.h>
32471 +#include <linux/tty.h>
32472 +#include <linux/vt_kern.h> /* For unblank_screen() */
32473 +#include <linux/compiler.h>
32474 +#include <linux/module.h>
32475 +#include <linux/kprobes.h>
32476 +
32477 +#include <asm/system.h>
32478 +#include <asm/uaccess.h>
32479 +#include <asm/pgalloc.h>
32480 +#include <asm/smp.h>
32481 +#include <asm/tlbflush.h>
32482 +#include <asm/proto.h>
32483 +#include <asm/kdebug.h>
32484 +#include <asm-generic/sections.h>
32485 +
32486 +/* Page fault error code bits */
32487 +#define PF_PROT (1<<0) /* or no page found */
32488 +#define PF_WRITE (1<<1)
32489 +#define PF_USER (1<<2)
32490 +#define PF_RSVD (1<<3)
32491 +#define PF_INSTR (1<<4)
32492 +
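
The PF_* bits above encode what the hardware reported about the fault; the handler's later tests simply mask them. A small standalone decoder (illustrative only):

#include <stdio.h>

#define PF_PROT  (1 << 0)       /* 0: not-present, 1: protection */
#define PF_WRITE (1 << 1)
#define PF_USER  (1 << 2)
#define PF_RSVD  (1 << 3)
#define PF_INSTR (1 << 4)

static void decode(unsigned long err)
{
        printf("%#lx: %s, %s, %s%s%s\n", err,
               err & PF_PROT  ? "protection" : "not-present",
               err & PF_WRITE ? "write"      : "read",
               err & PF_USER  ? "user"       : "kernel",
               err & PF_RSVD  ? ", reserved-bit" : "",
               err & PF_INSTR ? ", instruction-fetch" : "");
}

int main(void)
{
        decode(0x6);    /* user write to a not-present page */
        decode(0x11);   /* kernel instruction fetch hitting a protection fault */
        return 0;
}
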
32493 +#ifdef CONFIG_KPROBES
32494 +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
32495 +
32496 +/* Hook to register for page fault notifications */
32497 +int register_page_fault_notifier(struct notifier_block *nb)
32498 +{
32499 + vmalloc_sync_all();
32500 + return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
32501 +}
32502 +
32503 +int unregister_page_fault_notifier(struct notifier_block *nb)
32504 +{
32505 + return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
32506 +}
32507 +
32508 +static inline int notify_page_fault(enum die_val val, const char *str,
32509 + struct pt_regs *regs, long err, int trap, int sig)
32510 +{
32511 + struct die_args args = {
32512 + .regs = regs,
32513 + .str = str,
32514 + .err = err,
32515 + .trapnr = trap,
32516 + .signr = sig
32517 + };
32518 + return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
32519 +}
32520 +#else
32521 +static inline int notify_page_fault(enum die_val val, const char *str,
32522 + struct pt_regs *regs, long err, int trap, int sig)
32523 +{
32524 + return NOTIFY_DONE;
32525 +}
32526 +#endif
32527 +
32528 +void bust_spinlocks(int yes)
32529 +{
32530 + int loglevel_save = console_loglevel;
32531 + if (yes) {
32532 + oops_in_progress = 1;
32533 + } else {
32534 +#ifdef CONFIG_VT
32535 + unblank_screen();
32536 +#endif
32537 + oops_in_progress = 0;
32538 + /*
32539 + * OK, the message is on the console. Now we call printk()
32540 + * without oops_in_progress set so that printk will give klogd
32541 + * a poke. Hold onto your hats...
32542 + */
32543 + console_loglevel = 15; /* NMI oopser may have shut the console up */
32544 + printk(" ");
32545 + console_loglevel = loglevel_save;
32546 + }
32547 +}
32548 +
32549 +/* Sometimes the CPU reports invalid exceptions on prefetch.
32550 + Check that here and ignore.
32551 + Opcode checker based on code by Richard Brunner */
32552 +static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
32553 + unsigned long error_code)
32554 +{
32555 + unsigned char *instr;
32556 + int scan_more = 1;
32557 + int prefetch = 0;
32558 + unsigned char *max_instr;
32559 +
32560 + /* If it was an exec fault, ignore */
32561 + if (error_code & PF_INSTR)
32562 + return 0;
32563 +
32564 + instr = (unsigned char *)convert_rip_to_linear(current, regs);
32565 + max_instr = instr + 15;
32566 +
32567 + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
32568 + return 0;
32569 +
32570 + while (scan_more && instr < max_instr) {
32571 + unsigned char opcode;
32572 + unsigned char instr_hi;
32573 + unsigned char instr_lo;
32574 +
32575 + if (__get_user(opcode, instr))
32576 + break;
32577 +
32578 + instr_hi = opcode & 0xf0;
32579 + instr_lo = opcode & 0x0f;
32580 + instr++;
32581 +
32582 + switch (instr_hi) {
32583 + case 0x20:
32584 + case 0x30:
32585 + /* Values 0x26,0x2E,0x36,0x3E are valid x86
32586 + prefixes. In long mode, the CPU will signal
32587 + invalid opcode if some of these prefixes are
32588 + present so we will never get here anyway */
32589 + scan_more = ((instr_lo & 7) == 0x6);
32590 + break;
32591 +
32592 + case 0x40:
32593 + /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
32594 + Need to figure out under what instruction mode the
32595 + instruction was issued ... */
32596 + /* Could check the LDT for lm, but for now it's good
32597 + enough to assume that long mode only uses well known
32598 + segments or kernel. */
32599 + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
32600 + break;
32601 +
32602 + case 0x60:
32603 + /* 0x64 thru 0x67 are valid prefixes in all modes. */
32604 + scan_more = (instr_lo & 0xC) == 0x4;
32605 + break;
32606 + case 0xF0:
32607 + /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
32608 + scan_more = !instr_lo || (instr_lo>>1) == 1;
32609 + break;
32610 + case 0x00:
32611 + /* Prefetch instruction is 0x0F0D or 0x0F18 */
32612 + scan_more = 0;
32613 + if (__get_user(opcode, instr))
32614 + break;
32615 + prefetch = (instr_lo == 0xF) &&
32616 + (opcode == 0x0D || opcode == 0x18);
32617 + break;
32618 + default:
32619 + scan_more = 0;
32620 + break;
32621 + }
32622 + }
32623 + return prefetch;
32624 +}
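
A simplified standalone rendering of that opcode walk is sketched below; looks_like_prefetch() is a hypothetical helper that keeps the prefix-skipping idea but drops the user-mode and code-segment checks the real handler performs:

#include <stdio.h>

static int looks_like_prefetch(const unsigned char *instr, int len)
{
        for (int i = 0; i + 1 < len && i < 15; i++) {
                unsigned char op = instr[i];

                /* skip the prefixes tolerated above: segment overrides
                 * 0x26/0x2E/0x36/0x3E, REX 0x40-0x4F, 0x64-0x67, lock/rep */
                if ((op & 0xf0) == 0x20 || (op & 0xf0) == 0x30) {
                        if ((op & 7) == 6)
                                continue;
                        return 0;
                }
                if ((op & 0xf0) == 0x40)
                        continue;
                if (op >= 0x64 && op <= 0x67)
                        continue;
                if (op == 0xF0 || op == 0xF2 || op == 0xF3)
                        continue;

                /* PREFETCH is 0F 0D, PREFETCHh is 0F 18 */
                return op == 0x0F && (instr[i + 1] == 0x0D || instr[i + 1] == 0x18);
        }
        return 0;
}

int main(void)
{
        unsigned char prefetchnta[] = { 0x0F, 0x18, 0x00 };     /* prefetchnta (%rax) */
        unsigned char mov[]         = { 0x48, 0x89, 0xC3 };     /* mov %rax,%rbx */

        printf("%d %d\n", looks_like_prefetch(prefetchnta, 3),
               looks_like_prefetch(mov, 3));                    /* prints: 1 0 */
        return 0;
}
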
32625 +
32626 +static int bad_address(void *p)
32627 +{
32628 + unsigned long dummy;
32629 + return __get_user(dummy, (unsigned long *)p);
32630 +}
32631 +
32632 +void dump_pagetable(unsigned long address)
32633 +{
32634 + pgd_t *pgd;
32635 + pud_t *pud;
32636 + pmd_t *pmd;
32637 + pte_t *pte;
32638 +
32639 + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
32640 + pgd += pgd_index(address);
32641 + if (bad_address(pgd)) goto bad;
32642 + printk("PGD %lx ", pgd_val(*pgd));
32643 + if (!pgd_present(*pgd)) goto ret;
32644 +
32645 + pud = pud_offset(pgd, address);
32646 + if (bad_address(pud)) goto bad;
32647 + printk("PUD %lx ", pud_val(*pud));
32648 + if (!pud_present(*pud)) goto ret;
32649 +
32650 + pmd = pmd_offset(pud, address);
32651 + if (bad_address(pmd)) goto bad;
32652 + printk("PMD %lx ", pmd_val(*pmd));
32653 + if (!pmd_present(*pmd)) goto ret;
32654 +
32655 + pte = pte_offset_kernel(pmd, address);
32656 + if (bad_address(pte)) goto bad;
32657 + printk("PTE %lx", pte_val(*pte));
32658 +ret:
32659 + printk("\n");
32660 + return;
32661 +bad:
32662 + printk("BAD\n");
32663 +}
32664 +
32665 +static const char errata93_warning[] =
32666 +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
32667 +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
32668 +KERN_ERR "******* Please consider a BIOS update.\n"
32669 +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
32670 +
32671 +/* Workaround for K8 erratum #93 & buggy BIOS.
32672 + BIOS SMM functions are required to use a specific workaround
32673 + to avoid corruption of the 64bit RIP register on C stepping K8.
32674 + A lot of BIOS that didn't get tested properly miss this.
32675 + The OS sees this as a page fault with the upper 32bits of RIP cleared.
32676 + Try to work around it here.
32677 + Note we only handle faults in kernel here. */
32678 +
32679 +static int is_errata93(struct pt_regs *regs, unsigned long address)
32680 +{
32681 + static int warned;
32682 + if (address != regs->rip)
32683 + return 0;
32684 + if ((address >> 32) != 0)
32685 + return 0;
32686 + address |= 0xffffffffUL << 32;
32687 + if ((address >= (u64)_stext && address <= (u64)_etext) ||
32688 + (address >= MODULES_VADDR && address <= MODULES_END)) {
32689 + if (!warned) {
32690 + printk(errata93_warning);
32691 + warned = 1;
32692 + }
32693 + regs->rip = address;
32694 + return 1;
32695 + }
32696 + return 0;
32697 +}
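
The fixup itself is just a sign-extension of the truncated RIP: OR the lost upper 32 bits back in, then check whether the result lands in kernel text or the modules range. A one-liner illustration (sketch only):

#include <stdio.h>

int main(void)
{
        unsigned long truncated = 0x80123456UL;                 /* upper half lost */
        unsigned long repaired  = truncated | (0xffffffffUL << 32);

        printf("%#lx -> %#lx\n", truncated, repaired);          /* 0xffffffff80123456 */
        return 0;
}
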
32698 +
32699 +int unhandled_signal(struct task_struct *tsk, int sig)
32700 +{
32701 + if (tsk->pid == 1)
32702 + return 1;
32703 + if (tsk->ptrace & PT_PTRACED)
32704 + return 0;
32705 + return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
32706 + (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
32707 +}
32708 +
32709 +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
32710 + unsigned long error_code)
32711 +{
32712 + unsigned long flags = oops_begin();
32713 + struct task_struct *tsk;
32714 +
32715 + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
32716 + current->comm, address);
32717 + dump_pagetable(address);
32718 + tsk = current;
32719 + tsk->thread.cr2 = address;
32720 + tsk->thread.trap_no = 14;
32721 + tsk->thread.error_code = error_code;
32722 + __die("Bad pagetable", regs, error_code);
32723 + oops_end(flags);
32724 + do_exit(SIGKILL);
32725 +}
32726 +
32727 +/*
32728 + * Handle a fault on the vmalloc area
32729 + *
32730 + * This assumes no large pages in there.
32731 + */
32732 +static int vmalloc_fault(unsigned long address)
32733 +{
32734 + pgd_t *pgd, *pgd_ref;
32735 + pud_t *pud, *pud_ref;
32736 + pmd_t *pmd, *pmd_ref;
32737 + pte_t *pte, *pte_ref;
32738 +
32739 + /* Copy kernel mappings over when needed. This can also
32740 + happen within a race in page table update. In the latter
32741 + case just flush. */
32742 +
32743 + /* On Xen the line below does not always work. Needs investigating! */
32744 + /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
32745 + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
32746 + pgd += pgd_index(address);
32747 + pgd_ref = pgd_offset_k(address);
32748 + if (pgd_none(*pgd_ref))
32749 + return -1;
32750 + if (pgd_none(*pgd))
32751 + set_pgd(pgd, *pgd_ref);
32752 + else
32753 + BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
32754 +
32755 + /* Below here mismatches are bugs because these lower tables
32756 + are shared */
32757 +
32758 + pud = pud_offset(pgd, address);
32759 + pud_ref = pud_offset(pgd_ref, address);
32760 + if (pud_none(*pud_ref))
32761 + return -1;
32762 + if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
32763 + BUG();
32764 + pmd = pmd_offset(pud, address);
32765 + pmd_ref = pmd_offset(pud_ref, address);
32766 + if (pmd_none(*pmd_ref))
32767 + return -1;
32768 + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
32769 + BUG();
32770 + pte_ref = pte_offset_kernel(pmd_ref, address);
32771 + if (!pte_present(*pte_ref))
32772 + return -1;
32773 + pte = pte_offset_kernel(pmd, address);
32774 + /* Don't use pte_page here, because the mappings can point
32775 + outside mem_map, and the NUMA hash lookup cannot handle
32776 + that. */
32777 + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
32778 + BUG();
32779 + return 0;
32780 +}
32781 +
32782 +int page_fault_trace = 0;
32783 +int exception_trace = 1;
32784 +
32785 +
32786 +#define MEM_VERBOSE 1
32787 +
32788 +#ifdef MEM_VERBOSE
32789 +#define MEM_LOG(_f, _a...) \
32790 + printk("fault.c:[%d]-> " _f "\n", \
32791 + __LINE__ , ## _a )
32792 +#else
32793 +#define MEM_LOG(_f, _a...) ((void)0)
32794 +#endif
32795 +
32796 +static int spurious_fault(struct pt_regs *regs,
32797 + unsigned long address,
32798 + unsigned long error_code)
32799 +{
32800 + pgd_t *pgd;
32801 + pud_t *pud;
32802 + pmd_t *pmd;
32803 + pte_t *pte;
32804 +
32805 +#ifdef CONFIG_XEN
32806 + /* Faults in hypervisor area are never spurious. */
32807 + if ((address >= HYPERVISOR_VIRT_START) &&
32808 + (address < HYPERVISOR_VIRT_END))
32809 + return 0;
32810 +#endif
32811 +
32812 + /* Reserved-bit violation or user access to kernel space? */
32813 + if (error_code & (PF_RSVD|PF_USER))
32814 + return 0;
32815 +
32816 + pgd = init_mm.pgd + pgd_index(address);
32817 + if (!pgd_present(*pgd))
32818 + return 0;
32819 +
32820 + pud = pud_offset(pgd, address);
32821 + if (!pud_present(*pud))
32822 + return 0;
32823 +
32824 + pmd = pmd_offset(pud, address);
32825 + if (!pmd_present(*pmd))
32826 + return 0;
32827 +
32828 + pte = pte_offset_kernel(pmd, address);
32829 + if (!pte_present(*pte))
32830 + return 0;
32831 + if ((error_code & PF_WRITE) && !pte_write(*pte))
32832 + return 0;
32833 + if ((error_code & PF_INSTR) && (__pte_val(*pte) & _PAGE_NX))
32834 + return 0;
32835 +
32836 + return 1;
32837 +}
32838 +
32839 +/*
32840 + * This routine handles page faults. It determines the address,
32841 + * and the problem, and then passes it off to one of the appropriate
32842 + * routines.
32843 + */
32844 +asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
32845 + unsigned long error_code)
32846 +{
32847 + struct task_struct *tsk;
32848 + struct mm_struct *mm;
32849 + struct vm_area_struct * vma;
32850 + unsigned long address;
32851 + const struct exception_table_entry *fixup;
32852 + int write;
32853 + unsigned long flags;
32854 + siginfo_t info;
32855 +
32856 + if (!user_mode(regs))
32857 + error_code &= ~PF_USER; /* means kernel */
32858 +
32859 + tsk = current;
32860 + mm = tsk->mm;
32861 + prefetchw(&mm->mmap_sem);
32862 +
32863 + /* get the address */
32864 + address = current_vcpu_info()->arch.cr2;
32865 +
32866 + info.si_code = SEGV_MAPERR;
32867 +
32868 +
32869 + /*
32870 + * We fault-in kernel-space virtual memory on-demand. The
32871 + * 'reference' page table is init_mm.pgd.
32872 + *
32873 + * NOTE! We MUST NOT take any locks for this case. We may
32874 + * be in an interrupt or a critical region, and should
32875 + * only copy the information from the master page table,
32876 + * nothing more.
32877 + *
32878 + * This verifies that the fault happens in kernel space
32879 + * (error_code & 4) == 0, and that the fault was not a
32880 + * protection error (error_code & 9) == 0.
32881 + */
32882 + if (unlikely(address >= TASK_SIZE64)) {
32883 + /*
32884 + * Don't check for the module range here: its PML4
32885 + * is always initialized because it's shared with the main
32886 + * kernel text. Only vmalloc may need PML4 syncups.
32887 + */
32888 + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
32889 + ((address >= VMALLOC_START && address < VMALLOC_END))) {
32890 + if (vmalloc_fault(address) >= 0)
32891 + return;
32892 + }
32893 + /* Can take a spurious fault if mapping changes R/O -> R/W. */
32894 + if (spurious_fault(regs, address, error_code))
32895 + return;
32896 + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
32897 + SIGSEGV) == NOTIFY_STOP)
32898 + return;
32899 + /*
32900 + * Don't take the mm semaphore here. If we fixup a prefetch
32901 + * fault we could otherwise deadlock.
32902 + */
32903 + goto bad_area_nosemaphore;
32904 + }
32905 +
32906 + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
32907 + SIGSEGV) == NOTIFY_STOP)
32908 + return;
32909 +
32910 + if (likely(regs->eflags & X86_EFLAGS_IF))
32911 + local_irq_enable();
32912 +
32913 + if (unlikely(page_fault_trace))
32914 + printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
32915 + regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
32916 +
32917 + if (unlikely(error_code & PF_RSVD))
32918 + pgtable_bad(address, regs, error_code);
32919 +
32920 + /*
32921 + * If we're in an interrupt or have no user
32922 + * context, we must not take the fault..
32923 + */
32924 + if (unlikely(in_atomic() || !mm))
32925 + goto bad_area_nosemaphore;
32926 +
32927 + again:
32928 + /* When running in the kernel we expect faults to occur only to
32929 + * addresses in user space. All other faults represent errors in the
32930 + * kernel and should generate an OOPS. Unfortunately, in the case of an
32931 + * erroneous fault occurring in a code path which already holds mmap_sem
32932 + * we will deadlock attempting to validate the fault against the
32933 + * address space. Luckily the kernel only validly references user
32934 + * space from well defined areas of code, which are listed in the
32935 + * exceptions table.
32936 + *
32937 + * As the vast majority of faults will be valid we will only perform
32938 + * the source reference check when there is a possibility of a deadlock.
32939 + * Attempt to lock the address space, if we cannot we then validate the
32940 + * source. If this is invalid we can skip the address space check,
32941 + * thus avoiding the deadlock.
32942 + */
32943 + if (!down_read_trylock(&mm->mmap_sem)) {
32944 + if ((error_code & PF_USER) == 0 &&
32945 + !search_exception_tables(regs->rip))
32946 + goto bad_area_nosemaphore;
32947 + down_read(&mm->mmap_sem);
32948 + }
32949 +
32950 + vma = find_vma(mm, address);
32951 + if (!vma)
32952 + goto bad_area;
32953 + if (likely(vma->vm_start <= address))
32954 + goto good_area;
32955 + if (!(vma->vm_flags & VM_GROWSDOWN))
32956 + goto bad_area;
32957 + if (error_code & 4) {
32958 + /* Allow userspace just enough access below the stack pointer
32959 + * to let the 'enter' instruction work.
32960 + */
32961 + if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
32962 + goto bad_area;
32963 + }
32964 + if (expand_stack(vma, address))
32965 + goto bad_area;
32966 +/*
32967 + * Ok, we have a good vm_area for this memory access, so
32968 + * we can handle it..
32969 + */
32970 +good_area:
32971 + info.si_code = SEGV_ACCERR;
32972 + write = 0;
32973 + switch (error_code & (PF_PROT|PF_WRITE)) {
32974 + default: /* 3: write, present */
32975 + /* fall through */
32976 + case PF_WRITE: /* write, not present */
32977 + if (!(vma->vm_flags & VM_WRITE))
32978 + goto bad_area;
32979 + write++;
32980 + break;
32981 + case PF_PROT: /* read, present */
32982 + goto bad_area;
32983 + case 0: /* read, not present */
32984 + if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
32985 + goto bad_area;
32986 + }
32987 +
32988 + /*
32989 + * If for any reason at all we couldn't handle the fault,
32990 + * make sure we exit gracefully rather than endlessly redo
32991 + * the fault.
32992 + */
32993 + switch (handle_mm_fault(mm, vma, address, write)) {
32994 + case VM_FAULT_MINOR:
32995 + tsk->min_flt++;
32996 + break;
32997 + case VM_FAULT_MAJOR:
32998 + tsk->maj_flt++;
32999 + break;
33000 + case VM_FAULT_SIGBUS:
33001 + goto do_sigbus;
33002 + default:
33003 + goto out_of_memory;
33004 + }
33005 +
33006 + up_read(&mm->mmap_sem);
33007 + return;
33008 +
33009 +/*
33010 + * Something tried to access memory that isn't in our memory map..
33011 + * Fix it, but check if it's kernel or user first..
33012 + */
33013 +bad_area:
33014 + up_read(&mm->mmap_sem);
33015 +
33016 +bad_area_nosemaphore:
33017 + /* User mode accesses just cause a SIGSEGV */
33018 + if (error_code & PF_USER) {
33019 + if (is_prefetch(regs, address, error_code))
33020 + return;
33021 +
33022 + /* Work around K8 erratum #100: K8 in compat mode
33023 + occasionally jumps to illegal addresses >4GB. We
33024 + catch this here in the page fault handler because
33025 + these addresses are not reachable. Just detect this
33026 + case and return. Any code segment in LDT is
33027 + compatibility mode. */
33028 + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
33029 + (address >> 32))
33030 + return;
33031 +
33032 + if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
33033 + printk(
33034 + "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
33035 + tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
33036 + tsk->comm, tsk->pid, address, regs->rip,
33037 + regs->rsp, error_code);
33038 + }
33039 +
33040 + tsk->thread.cr2 = address;
33041 + /* Kernel addresses are always protection faults */
33042 + tsk->thread.error_code = error_code | (address >= TASK_SIZE);
33043 + tsk->thread.trap_no = 14;
33044 + info.si_signo = SIGSEGV;
33045 + info.si_errno = 0;
33046 + /* info.si_code has been set above */
33047 + info.si_addr = (void __user *)address;
33048 + force_sig_info(SIGSEGV, &info, tsk);
33049 + return;
33050 + }
33051 +
33052 +no_context:
33053 +
33054 + /* Are we prepared to handle this kernel fault? */
33055 + fixup = search_exception_tables(regs->rip);
33056 + if (fixup) {
33057 + regs->rip = fixup->fixup;
33058 + return;
33059 + }
33060 +
33061 + /*
33062 + * Hall of shame of CPU/BIOS bugs.
33063 + */
33064 +
33065 + if (is_prefetch(regs, address, error_code))
33066 + return;
33067 +
33068 + if (is_errata93(regs, address))
33069 + return;
33070 +
33071 +/*
33072 + * Oops. The kernel tried to access some bad page. We'll have to
33073 + * terminate things with extreme prejudice.
33074 + */
33075 +
33076 + flags = oops_begin();
33077 +
33078 + if (address < PAGE_SIZE)
33079 + printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
33080 + else
33081 + printk(KERN_ALERT "Unable to handle kernel paging request");
33082 + printk(" at %016lx RIP: \n" KERN_ALERT,address);
33083 + printk_address(regs->rip);
33084 + dump_pagetable(address);
33085 + tsk->thread.cr2 = address;
33086 + tsk->thread.trap_no = 14;
33087 + tsk->thread.error_code = error_code;
33088 + __die("Oops", regs, error_code);
33089 + /* Executive summary in case the body of the oops scrolled away */
33090 + printk(KERN_EMERG "CR2: %016lx\n", address);
33091 + oops_end(flags);
33092 + do_exit(SIGKILL);
33093 +
33094 +/*
33095 + * We ran out of memory, or some other thing happened to us that made
33096 + * us unable to handle the page fault gracefully.
33097 + */
33098 +out_of_memory:
33099 + up_read(&mm->mmap_sem);
33100 + if (current->pid == 1) {
33101 + yield();
33102 + goto again;
33103 + }
33104 + printk("VM: killing process %s\n", tsk->comm);
33105 + if (error_code & 4)
33106 + do_exit(SIGKILL);
33107 + goto no_context;
33108 +
33109 +do_sigbus:
33110 + up_read(&mm->mmap_sem);
33111 +
33112 + /* Kernel mode? Handle exceptions or die */
33113 + if (!(error_code & PF_USER))
33114 + goto no_context;
33115 +
33116 + tsk->thread.cr2 = address;
33117 + tsk->thread.error_code = error_code;
33118 + tsk->thread.trap_no = 14;
33119 + info.si_signo = SIGBUS;
33120 + info.si_errno = 0;
33121 + info.si_code = BUS_ADRERR;
33122 + info.si_addr = (void __user *)address;
33123 + force_sig_info(SIGBUS, &info, tsk);
33124 + return;
33125 +}
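
The down_read_trylock() dance near the top of this function (take the lock opportunistically, and only when that fails validate regs->rip against the exception tables before blocking) is a general trylock-then-validate pattern. A loose userspace analogue with POSIX rwlocks is sketched below; validate() is a hypothetical stand-in for search_exception_tables(), and the program needs -pthread to build:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

static int validate(void)
{
        return 1;       /* "is the caller on the approved list?" */
}

static int lock_for_fault(void)
{
        if (pthread_rwlock_tryrdlock(&mmap_sem) != 0) {
                if (!validate())
                        return -1;      /* bail out, like bad_area_nosemaphore */
                pthread_rwlock_rdlock(&mmap_sem);
        }
        return 0;
}

int main(void)
{
        if (lock_for_fault() == 0) {
                puts("got mmap_sem for reading");
                pthread_rwlock_unlock(&mmap_sem);
        }
        return 0;
}
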
33126 +
33127 +DEFINE_SPINLOCK(pgd_lock);
33128 +struct page *pgd_list;
33129 +
33130 +void vmalloc_sync_all(void)
33131 +{
33132 + /* Note that races in the updates of insync and start aren't
33133 + problematic:
33134 + insync can only get set bits added, and updates to start are only
33135 + improving performance (without affecting correctness if undone). */
33136 + static DECLARE_BITMAP(insync, PTRS_PER_PGD);
33137 + static unsigned long start = VMALLOC_START & PGDIR_MASK;
33138 + unsigned long address;
33139 +
33140 + for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
33141 + if (!test_bit(pgd_index(address), insync)) {
33142 + const pgd_t *pgd_ref = pgd_offset_k(address);
33143 + struct page *page;
33144 +
33145 + if (pgd_none(*pgd_ref))
33146 + continue;
33147 + spin_lock(&pgd_lock);
33148 + for (page = pgd_list; page;
33149 + page = (struct page *)page->index) {
33150 + pgd_t *pgd;
33151 + pgd = (pgd_t *)page_address(page) + pgd_index(address);
33152 + if (pgd_none(*pgd))
33153 + set_pgd(pgd, *pgd_ref);
33154 + else
33155 + BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
33156 + }
33157 + spin_unlock(&pgd_lock);
33158 + set_bit(pgd_index(address), insync);
33159 + }
33160 + if (address == start)
33161 + start = address + PGDIR_SIZE;
33162 + }
33163 + /* Check that there is no need to do the same for the modules area. */
33164 + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
33165 + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
33166 + (__START_KERNEL & PGDIR_MASK)));
33167 +}
33168 +
33169 +static int __init enable_pagefaulttrace(char *str)
33170 +{
33171 + page_fault_trace = 1;
33172 + return 1;
33173 +}
33174 +__setup("pagefaulttrace", enable_pagefaulttrace);
33175 Index: head-2008-11-25/arch/x86/mm/init_64-xen.c
33176 ===================================================================
33177 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
33178 +++ head-2008-11-25/arch/x86/mm/init_64-xen.c 2008-10-29 09:55:56.000000000 +0100
33179 @@ -0,0 +1,1206 @@
33180 +/*
33181 + * linux/arch/x86_64/mm/init.c
33182 + *
33183 + * Copyright (C) 1995 Linus Torvalds
33184 + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
33185 + * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
33186 + *
33187 + * Jun Nakajima <jun.nakajima@intel.com>
33188 + * Modified for Xen.
33189 + */
33190 +
33191 +#include <linux/signal.h>
33192 +#include <linux/sched.h>
33193 +#include <linux/kernel.h>
33194 +#include <linux/errno.h>
33195 +#include <linux/string.h>
33196 +#include <linux/types.h>
33197 +#include <linux/ptrace.h>
33198 +#include <linux/mman.h>
33199 +#include <linux/mm.h>
33200 +#include <linux/swap.h>
33201 +#include <linux/smp.h>
33202 +#include <linux/init.h>
33203 +#include <linux/pagemap.h>
33204 +#include <linux/bootmem.h>
33205 +#include <linux/proc_fs.h>
33206 +#include <linux/pci.h>
33207 +#include <linux/poison.h>
33208 +#include <linux/dma-mapping.h>
33209 +#include <linux/module.h>
33210 +#include <linux/memory_hotplug.h>
33211 +
33212 +#include <asm/processor.h>
33213 +#include <asm/system.h>
33214 +#include <asm/uaccess.h>
33215 +#include <asm/pgtable.h>
33216 +#include <asm/pgalloc.h>
33217 +#include <asm/dma.h>
33218 +#include <asm/fixmap.h>
33219 +#include <asm/e820.h>
33220 +#include <asm/apic.h>
33221 +#include <asm/tlb.h>
33222 +#include <asm/mmu_context.h>
33223 +#include <asm/proto.h>
33224 +#include <asm/smp.h>
33225 +#include <asm/sections.h>
33226 +
33227 +#include <xen/features.h>
33228 +
33229 +#ifndef Dprintk
33230 +#define Dprintk(x...)
33231 +#endif
33232 +
33233 +struct dma_mapping_ops* dma_ops;
33234 +EXPORT_SYMBOL(dma_ops);
33235 +
33236 +#if CONFIG_XEN_COMPAT <= 0x030002
33237 +unsigned int __kernel_page_user;
33238 +EXPORT_SYMBOL(__kernel_page_user);
33239 +#endif
33240 +
33241 +int after_bootmem;
33242 +
33243 +static unsigned long dma_reserve __initdata;
33244 +
33245 +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
33246 +extern unsigned long start_pfn;
33247 +
33248 +/*
33249 + * Use this until direct mapping is established, i.e. before __va() is
33250 + * available in init_memory_mapping().
33251 + */
33252 +
33253 +#define addr_to_page(addr, page) \
33254 + (addr) &= PHYSICAL_PAGE_MASK; \
33255 + (page) = ((unsigned long *) ((unsigned long) \
33256 + (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
33257 + __START_KERNEL_map)))
33258 +
33259 +static void __meminit early_make_page_readonly(void *va, unsigned int feature)
33260 +{
33261 + unsigned long addr, _va = (unsigned long)va;
33262 + pte_t pte, *ptep;
33263 + unsigned long *page = (unsigned long *) init_level4_pgt;
33264 +
33265 + BUG_ON(after_bootmem);
33266 +
33267 + if (xen_feature(feature))
33268 + return;
33269 +
33270 + addr = (unsigned long) page[pgd_index(_va)];
33271 + addr_to_page(addr, page);
33272 +
33273 + addr = page[pud_index(_va)];
33274 + addr_to_page(addr, page);
33275 +
33276 + addr = page[pmd_index(_va)];
33277 + addr_to_page(addr, page);
33278 +
33279 + ptep = (pte_t *) &page[pte_index(_va)];
33280 +
33281 + pte.pte = ptep->pte & ~_PAGE_RW;
33282 + if (HYPERVISOR_update_va_mapping(_va, pte, 0))
33283 + BUG();
33284 +}
33285 +
33286 +static void __make_page_readonly(void *va)
33287 +{
33288 + pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
33289 + unsigned long addr = (unsigned long) va;
33290 +
33291 + pgd = pgd_offset_k(addr);
33292 + pud = pud_offset(pgd, addr);
33293 + pmd = pmd_offset(pud, addr);
33294 + ptep = pte_offset_kernel(pmd, addr);
33295 +
33296 + pte.pte = ptep->pte & ~_PAGE_RW;
33297 + if (HYPERVISOR_update_va_mapping(addr, pte, 0))
33298 + xen_l1_entry_update(ptep, pte); /* fallback */
33299 +
33300 + if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
33301 + __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT));
33302 +}
33303 +
33304 +static void __make_page_writable(void *va)
33305 +{
33306 + pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
33307 + unsigned long addr = (unsigned long) va;
33308 +
33309 + pgd = pgd_offset_k(addr);
33310 + pud = pud_offset(pgd, addr);
33311 + pmd = pmd_offset(pud, addr);
33312 + ptep = pte_offset_kernel(pmd, addr);
33313 +
33314 + pte.pte = ptep->pte | _PAGE_RW;
33315 + if (HYPERVISOR_update_va_mapping(addr, pte, 0))
33316 + xen_l1_entry_update(ptep, pte); /* fallback */
33317 +
33318 + if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
33319 + __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT));
33320 +}
33321 +
33322 +void make_page_readonly(void *va, unsigned int feature)
33323 +{
33324 + if (!xen_feature(feature))
33325 + __make_page_readonly(va);
33326 +}
33327 +
33328 +void make_page_writable(void *va, unsigned int feature)
33329 +{
33330 + if (!xen_feature(feature))
33331 + __make_page_writable(va);
33332 +}
33333 +
33334 +void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
33335 +{
33336 + if (xen_feature(feature))
33337 + return;
33338 +
33339 + while (nr-- != 0) {
33340 + __make_page_readonly(va);
33341 + va = (void*)((unsigned long)va + PAGE_SIZE);
33342 + }
33343 +}
33344 +
33345 +void make_pages_writable(void *va, unsigned nr, unsigned int feature)
33346 +{
33347 + if (xen_feature(feature))
33348 + return;
33349 +
33350 + while (nr-- != 0) {
33351 + __make_page_writable(va);
33352 + va = (void*)((unsigned long)va + PAGE_SIZE);
33353 + }
33354 +}
33355 +
33356 +/*
33357 + * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
33358 + * physical space so we can cache the place of the first one and move
33359 + * around without checking the pgd every time.
33360 + */
33361 +
33362 +void show_mem(void)
33363 +{
33364 + long i, total = 0, reserved = 0;
33365 + long shared = 0, cached = 0;
33366 + pg_data_t *pgdat;
33367 + struct page *page;
33368 +
33369 + printk(KERN_INFO "Mem-info:\n");
33370 + show_free_areas();
33371 + printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
33372 +
33373 + for_each_online_pgdat(pgdat) {
33374 + for (i = 0; i < pgdat->node_spanned_pages; ++i) {
33375 + page = pfn_to_page(pgdat->node_start_pfn + i);
33376 + total++;
33377 + if (PageReserved(page))
33378 + reserved++;
33379 + else if (PageSwapCache(page))
33380 + cached++;
33381 + else if (page_count(page))
33382 + shared += page_count(page) - 1;
33383 + }
33384 + }
33385 + printk(KERN_INFO "%lu pages of RAM\n", total);
33386 + printk(KERN_INFO "%lu reserved pages\n",reserved);
33387 + printk(KERN_INFO "%lu pages shared\n",shared);
33388 + printk(KERN_INFO "%lu pages swap cached\n",cached);
33389 +}
33390 +
33391 +
33392 +static __init void *spp_getpage(void)
33393 +{
33394 + void *ptr;
33395 + if (after_bootmem)
33396 + ptr = (void *) get_zeroed_page(GFP_ATOMIC);
33397 + else if (start_pfn < table_end) {
33398 + ptr = __va(start_pfn << PAGE_SHIFT);
33399 + start_pfn++;
33400 + memset(ptr, 0, PAGE_SIZE);
33401 + } else
33402 + ptr = alloc_bootmem_pages(PAGE_SIZE);
33403 + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
33404 + panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
33405 +
33406 + Dprintk("spp_getpage %p\n", ptr);
33407 + return ptr;
33408 +}
33409 +
33410 +#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
33411 +#define pud_offset_u(address) (level3_user_pgt + pud_index(address))
33412 +
33413 +static __init void set_pte_phys(unsigned long vaddr,
33414 + unsigned long phys, pgprot_t prot, int user_mode)
33415 +{
33416 + pgd_t *pgd;
33417 + pud_t *pud;
33418 + pmd_t *pmd;
33419 + pte_t *pte, new_pte;
33420 +
33421 + Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
33422 +
33423 + pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
33424 + if (pgd_none(*pgd)) {
33425 + printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
33426 + return;
33427 + }
33428 + pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
33429 + if (pud_none(*pud)) {
33430 + pmd = (pmd_t *) spp_getpage();
33431 + make_page_readonly(pmd, XENFEAT_writable_page_tables);
33432 + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
33433 + if (pmd != pmd_offset(pud, 0)) {
33434 + printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
33435 + return;
33436 + }
33437 + }
33438 + pmd = pmd_offset(pud, vaddr);
33439 + if (pmd_none(*pmd)) {
33440 + pte = (pte_t *) spp_getpage();
33441 + make_page_readonly(pte, XENFEAT_writable_page_tables);
33442 + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
33443 + if (pte != pte_offset_kernel(pmd, 0)) {
33444 + printk("PAGETABLE BUG #02!\n");
33445 + return;
33446 + }
33447 + }
33448 + if (pgprot_val(prot))
33449 + new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
33450 + else
33451 + new_pte = __pte(0);
33452 +
33453 + pte = pte_offset_kernel(pmd, vaddr);
33454 + if (!pte_none(*pte) && __pte_val(new_pte) &&
33455 + __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
33456 + pte_ERROR(*pte);
33457 + set_pte(pte, new_pte);
33458 +
33459 + /*
33460 + * It's enough to flush this one mapping.
33461 + * (PGE mappings get flushed as well)
33462 + */
33463 + __flush_tlb_one(vaddr);
33464 +}
33465 +
33466 +static __init void set_pte_phys_ma(unsigned long vaddr,
33467 + unsigned long phys, pgprot_t prot)
33468 +{
33469 + pgd_t *pgd;
33470 + pud_t *pud;
33471 + pmd_t *pmd;
33472 + pte_t *pte, new_pte;
33473 +
33474 + Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
33475 +
33476 + pgd = pgd_offset_k(vaddr);
33477 + if (pgd_none(*pgd)) {
33478 + printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
33479 + return;
33480 + }
33481 + pud = pud_offset(pgd, vaddr);
33482 + if (pud_none(*pud)) {
33483 +
33484 + pmd = (pmd_t *) spp_getpage();
33485 + make_page_readonly(pmd, XENFEAT_writable_page_tables);
33486 + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
33487 + if (pmd != pmd_offset(pud, 0)) {
33488 + printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
33489 + return;
33490 + }
33491 + }
33492 + pmd = pmd_offset(pud, vaddr);
33493 + if (pmd_none(*pmd)) {
33494 + pte = (pte_t *) spp_getpage();
33495 + make_page_readonly(pte, XENFEAT_writable_page_tables);
33496 + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
33497 + if (pte != pte_offset_kernel(pmd, 0)) {
33498 + printk("PAGETABLE BUG #02!\n");
33499 + return;
33500 + }
33501 + }
33502 + new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
33503 +
33504 + pte = pte_offset_kernel(pmd, vaddr);
33505 + if (!pte_none(*pte) && __pte_val(new_pte) &&
33506 +#ifdef CONFIG_ACPI
33507 + /* __acpi_map_table() fails to properly call clear_fixmap() */
33508 + (vaddr < __fix_to_virt(FIX_ACPI_END) ||
33509 + vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
33510 +#endif
33511 + __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
33512 + pte_ERROR(*pte);
33513 + set_pte(pte, new_pte);
33514 +
33515 + /*
33516 + * It's enough to flush this one mapping.
33517 + * (PGE mappings get flushed as well)
33518 + */
33519 + __flush_tlb_one(vaddr);
33520 +}
33521 +
33522 +/* NOTE: this is meant to be run only at boot */
33523 +void __init
33524 +__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
33525 +{
33526 + unsigned long address = __fix_to_virt(idx);
33527 +
33528 + if (idx >= __end_of_fixed_addresses) {
33529 + printk("Invalid __set_fixmap\n");
33530 + return;
33531 + }
33532 + switch (idx) {
33533 + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
33534 + set_pte_phys(address, phys, prot, 0);
33535 + set_pte_phys(address, phys, prot, 1);
33536 + break;
33537 + default:
33538 + set_pte_phys_ma(address, phys, prot);
33539 + break;
33540 + }
33541 +}
33542 +
33543 +unsigned long __initdata table_start, table_end;
33544 +
33545 +static __meminit void *alloc_static_page(unsigned long *phys)
33546 +{
33547 + unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
33548 +
33549 + if (after_bootmem) {
33550 + void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
33551 +
33552 + *phys = __pa(adr);
33553 + return adr;
33554 + }
33555 +
33556 + *phys = start_pfn << PAGE_SHIFT;
33557 + start_pfn++;
33558 + memset((void *)va, 0, PAGE_SIZE);
33559 + return (void *)va;
33560 +}
33561 +
33562 +#define PTE_SIZE PAGE_SIZE
33563 +
33564 +static inline int make_readonly(unsigned long paddr)
33565 +{
33566 + extern char __vsyscall_0;
33567 + int readonly = 0;
33568 +
33569 + /* Make new page tables read-only. */
33570 + if (!xen_feature(XENFEAT_writable_page_tables)
33571 + && (paddr >= (table_start << PAGE_SHIFT))
33572 + && (paddr < (table_end << PAGE_SHIFT)))
33573 + readonly = 1;
33574 + /* Make old page tables read-only. */
33575 + if (!xen_feature(XENFEAT_writable_page_tables)
33576 + && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
33577 + && (paddr < (start_pfn << PAGE_SHIFT)))
33578 + readonly = 1;
33579 +
33580 + /*
33581 + * No need for writable mapping of kernel image. This also ensures that
33582 + * page and descriptor tables embedded inside don't have writable
33583 + * mappings. Exclude the vsyscall area here, allowing alternative
33584 + * instruction patching to work.
33585 + */
33586 + if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end))
33587 + && !(paddr >= __pa_symbol(&__vsyscall_0)
33588 + && paddr < __pa_symbol(&__vsyscall_0) + PAGE_SIZE))
33589 + readonly = 1;
33590 +
33591 + return readonly;
33592 +}
33593 +
33594 +#ifndef CONFIG_XEN
33595 +/* Must run before zap_low_mappings */
33596 +__init void *early_ioremap(unsigned long addr, unsigned long size)
33597 +{
33598 + unsigned long map = round_down(addr, LARGE_PAGE_SIZE);
33599 +
33600 + /* actually usually some more */
33601 + if (size >= LARGE_PAGE_SIZE) {
33602 + printk("SMBIOS area too long %lu\n", size);
33603 + return NULL;
33604 + }
33605 + set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
33606 + map += LARGE_PAGE_SIZE;
33607 + set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
33608 + __flush_tlb();
33609 + return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
33610 +}
33611 +
33612 +/* To avoid virtual aliases later */
33613 +__init void early_iounmap(void *addr, unsigned long size)
33614 +{
33615 + if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
33616 + printk("early_iounmap: bad address %p\n", addr);
33617 + set_pmd(temp_mappings[0].pmd, __pmd(0));
33618 + set_pmd(temp_mappings[1].pmd, __pmd(0));
33619 + __flush_tlb();
33620 +}
33621 +#endif
33622 +
33623 +static void __meminit
33624 +phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
33625 +{
33626 + int i, k;
33627 +
33628 + for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
33629 + unsigned long pte_phys;
33630 + pte_t *pte, *pte_save;
33631 +
33632 + if (address >= end)
33633 + break;
33634 + pte = alloc_static_page(&pte_phys);
33635 + pte_save = pte;
33636 + for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
33637 + unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;
33638 +
33639 + if (address >= (after_bootmem
33640 + ? end
33641 + : xen_start_info->nr_pages << PAGE_SHIFT))
33642 + pteval = 0;
33643 + else if (make_readonly(address))
33644 + pteval &= ~_PAGE_RW;
33645 + set_pte(pte, __pte(pteval & __supported_pte_mask));
33646 + }
33647 + if (!after_bootmem) {
33648 + early_make_page_readonly(pte_save, XENFEAT_writable_page_tables);
33649 + *pmd = __pmd(pte_phys | _KERNPG_TABLE);
33650 + } else {
33651 + make_page_readonly(pte_save, XENFEAT_writable_page_tables);
33652 + set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
33653 + }
33654 + }
33655 +}
33656 +
33657 +static void __meminit
33658 +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
33659 +{
33660 + pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
33661 +
33662 + if (pmd_none(*pmd)) {
33663 + spin_lock(&init_mm.page_table_lock);
33664 + phys_pmd_init(pmd, address, end);
33665 + spin_unlock(&init_mm.page_table_lock);
33666 + __flush_tlb_all();
33667 + }
33668 +}
33669 +
33670 +static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
33671 +{
33672 + long i = pud_index(address);
33673 +
33674 + pud = pud + i;
33675 +
33676 + if (after_bootmem && pud_val(*pud)) {
33677 + phys_pmd_update(pud, address, end);
33678 + return;
33679 + }
33680 +
33681 + for (; i < PTRS_PER_PUD; pud++, i++) {
33682 + unsigned long paddr, pmd_phys;
33683 + pmd_t *pmd;
33684 +
33685 + paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
33686 + if (paddr >= end)
33687 + break;
33688 +
33689 + pmd = alloc_static_page(&pmd_phys);
33690 +
33691 + spin_lock(&init_mm.page_table_lock);
33692 + *pud = __pud(pmd_phys | _KERNPG_TABLE);
33693 + phys_pmd_init(pmd, paddr, end);
33694 + spin_unlock(&init_mm.page_table_lock);
33695 +
33696 + early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
33697 + }
33698 + __flush_tlb();
33699 +}
33700 +
33701 +void __init xen_init_pt(void)
33702 +{
33703 + unsigned long addr, *page;
33704 +
33705 + /* Find the initial pte page that was built for us. */
33706 + page = (unsigned long *)xen_start_info->pt_base;
33707 + addr = page[pgd_index(__START_KERNEL_map)];
33708 + addr_to_page(addr, page);
33709 + addr = page[pud_index(__START_KERNEL_map)];
33710 + addr_to_page(addr, page);
33711 +
33712 +#if CONFIG_XEN_COMPAT <= 0x030002
33713 + /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER
33714 + in kernel PTEs. We check that here. */
33715 + if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) {
33716 + unsigned long *pg;
33717 + pte_t pte;
33718 +
33719 + /* Mess with the initial mapping of page 0. It's not needed. */
33720 + BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map);
33721 + addr = page[pmd_index(__START_KERNEL_map)];
33722 + addr_to_page(addr, pg);
33723 + pte.pte = pg[pte_index(__START_KERNEL_map)];
33724 + BUG_ON(!(pte.pte & _PAGE_PRESENT));
33725 +
33726 + /* If _PAGE_USER isn't set, we obviously do not need it. */
33727 + if (pte.pte & _PAGE_USER) {
33728 + /* _PAGE_USER is needed, but is it set implicitly? */
33729 + pte.pte &= ~_PAGE_USER;
33730 + if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map,
33731 + pte, 0) != 0) ||
33732 + !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER))
33733 + /* We need to explicitly specify _PAGE_USER. */
33734 + __kernel_page_user = _PAGE_USER;
33735 + }
33736 + }
33737 +#endif
33738 +
33739 + /* Construct mapping of initial pte page in our own directories. */
33740 + init_level4_pgt[pgd_index(__START_KERNEL_map)] =
33741 + __pgd(__pa_symbol(level3_kernel_pgt) | _PAGE_TABLE);
33742 + level3_kernel_pgt[pud_index(__START_KERNEL_map)] =
33743 + __pud(__pa_symbol(level2_kernel_pgt) | _PAGE_TABLE);
33744 + memcpy(level2_kernel_pgt, page, PAGE_SIZE);
33745 +
33746 + __user_pgd(init_level4_pgt)[pgd_index(VSYSCALL_START)] =
33747 + __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
33748 +
33749 + early_make_page_readonly(init_level4_pgt,
33750 + XENFEAT_writable_page_tables);
33751 + early_make_page_readonly(__user_pgd(init_level4_pgt),
33752 + XENFEAT_writable_page_tables);
33753 + early_make_page_readonly(level3_kernel_pgt,
33754 + XENFEAT_writable_page_tables);
33755 + early_make_page_readonly(level3_user_pgt,
33756 + XENFEAT_writable_page_tables);
33757 + early_make_page_readonly(level2_kernel_pgt,
33758 + XENFEAT_writable_page_tables);
33759 +
33760 + if (!xen_feature(XENFEAT_writable_page_tables)) {
33761 + xen_pgd_pin(__pa_symbol(init_level4_pgt));
33762 + xen_pgd_pin(__pa_symbol(__user_pgd(init_level4_pgt)));
33763 + }
33764 +}
33765 +
33766 +static void __init extend_init_mapping(unsigned long tables_space)
33767 +{
33768 + unsigned long va = __START_KERNEL_map;
33769 + unsigned long phys, addr, *pte_page;
33770 + pmd_t *pmd;
33771 + pte_t *pte, new_pte;
33772 + unsigned long *page = (unsigned long *)init_level4_pgt;
33773 +
33774 + addr = page[pgd_index(va)];
33775 + addr_to_page(addr, page);
33776 + addr = page[pud_index(va)];
33777 + addr_to_page(addr, page);
33778 +
33779 + /* Kill mapping of low 1MB. */
33780 + while (va < (unsigned long)&_text) {
33781 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
33782 + BUG();
33783 + va += PAGE_SIZE;
33784 + }
33785 +
33786 + /* Ensure init mappings cover kernel text/data and initial tables. */
33787 + while (va < (__START_KERNEL_map
33788 + + (start_pfn << PAGE_SHIFT)
33789 + + tables_space)) {
33790 + pmd = (pmd_t *)&page[pmd_index(va)];
33791 + if (pmd_none(*pmd)) {
33792 + pte_page = alloc_static_page(&phys);
33793 + early_make_page_readonly(
33794 + pte_page, XENFEAT_writable_page_tables);
33795 + set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
33796 + } else {
33797 + addr = page[pmd_index(va)];
33798 + addr_to_page(addr, pte_page);
33799 + }
33800 + pte = (pte_t *)&pte_page[pte_index(va)];
33801 + if (pte_none(*pte)) {
33802 + new_pte = pfn_pte(
33803 + (va - __START_KERNEL_map) >> PAGE_SHIFT,
33804 + __pgprot(_KERNPG_TABLE));
33805 + xen_l1_entry_update(pte, new_pte);
33806 + }
33807 + va += PAGE_SIZE;
33808 + }
33809 +
33810 + /* Finally, blow away any spurious initial mappings. */
33811 + while (1) {
33812 + pmd = (pmd_t *)&page[pmd_index(va)];
33813 + if (pmd_none(*pmd))
33814 + break;
33815 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
33816 + BUG();
33817 + va += PAGE_SIZE;
33818 + }
33819 +}
33820 +
33821 +static void __init find_early_table_space(unsigned long end)
33822 +{
33823 + unsigned long puds, pmds, ptes, tables;
33824 +
33825 + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
33826 + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
33827 + ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
33828 +
33829 + tables = round_up(puds * 8, PAGE_SIZE) +
33830 + round_up(pmds * 8, PAGE_SIZE) +
33831 + round_up(ptes * 8, PAGE_SIZE);
33832 +
33833 + extend_init_mapping(tables);
33834 +
33835 + table_start = start_pfn;
33836 + table_end = table_start + (tables>>PAGE_SHIFT);
33837 +
33838 + early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
33839 + end, table_start << PAGE_SHIFT,
33840 + (table_start << PAGE_SHIFT) + tables);
33841 +}
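
The sizing arithmetic above reserves one 8-byte entry per PUD, PMD and PTE needed to map end bytes of memory, rounding each level up to whole pages. The same computation in standalone form (constants assume 4 KiB pages and the usual 4-level shifts; illustrative only):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define PUD_SHIFT  30
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static unsigned long round_up_to_page(unsigned long x)
{
        return (x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

int main(void)
{
        unsigned long end  = 1UL << 32;                         /* map 4 GiB */
        unsigned long puds = (end + (1UL << PUD_SHIFT) - 1) >> PUD_SHIFT;
        unsigned long pmds = (end + (1UL << PMD_SHIFT) - 1) >> PMD_SHIFT;
        unsigned long ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
        unsigned long tables = round_up_to_page(puds * 8) +
                               round_up_to_page(pmds * 8) +
                               round_up_to_page(ptes * 8);

        printf("%lu PUDs, %lu PMDs, %lu PTEs -> %lu bytes of tables\n",
               puds, pmds, ptes, tables);
        return 0;
}
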
33842 +
33843 +static void xen_finish_init_mapping(void)
33844 +{
33845 + unsigned long i, start, end;
33846 +
33847 + /* Re-vector virtual addresses pointing into the initial
33848 + mapping to the just-established permanent ones. */
33849 + xen_start_info = __va(__pa(xen_start_info));
33850 + xen_start_info->pt_base = (unsigned long)
33851 + __va(__pa(xen_start_info->pt_base));
33852 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
33853 + phys_to_machine_mapping =
33854 + __va(__pa(xen_start_info->mfn_list));
33855 + xen_start_info->mfn_list = (unsigned long)
33856 + phys_to_machine_mapping;
33857 + }
33858 + if (xen_start_info->mod_start)
33859 + xen_start_info->mod_start = (unsigned long)
33860 + __va(__pa(xen_start_info->mod_start));
33861 +
33862 + /* Destroy the Xen-created mappings beyond the kernel image as
33863 + * well as the temporary mappings created above. Prevents
33864 + * overlap with modules area (if init mapping is very big).
33865 + */
33866 + start = PAGE_ALIGN((unsigned long)_end);
33867 + end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
33868 + for (; start < end; start += PAGE_SIZE)
33869 + if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
33870 + BUG();
33871 +
33872 + /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
33873 + table_end = ~0UL;
33874 +
33875 + /*
33876 + * Prefetch pte's for the bt_ioremap() area. It gets used before the
33877 + * boot-time allocator is online, so allocate-on-demand would fail.
33878 + */
33879 + for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
33880 + __set_fixmap(i, 0, __pgprot(0));
33881 +
33882 + /* Switch to the real shared_info page, and clear the dummy page. */
33883 + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
33884 + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
33885 + memset(empty_zero_page, 0, sizeof(empty_zero_page));
33886 +
33887 + /* Set up mapping of lowest 1MB of physical memory. */
33888 + for (i = 0; i < NR_FIX_ISAMAPS; i++)
33889 + if (is_initial_xendomain())
33890 + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
33891 + else
33892 + __set_fixmap(FIX_ISAMAP_BEGIN - i,
33893 + virt_to_mfn(empty_zero_page)
33894 + << PAGE_SHIFT,
33895 + PAGE_KERNEL_RO);
33896 +
33897 + /* Disable the 'start_pfn' allocator. */
33898 + table_end = start_pfn;
33899 +}
33900 +
33901 +/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
33902 + This runs before bootmem is initialized and gets pages directly from the
33903 + physical memory. To access them they are temporarily mapped. */
33904 +void __meminit init_memory_mapping(unsigned long start, unsigned long end)
33905 +{
33906 + unsigned long next;
33907 +
33908 + Dprintk("init_memory_mapping\n");
33909 +
33910 + /*
33911 + * Find space for the kernel direct mapping tables.
33912 + * Later we should allocate these tables in the local node of the memory
33913 + * mapped. Unfortunately this is done currently before the nodes are
33914 + * discovered.
33915 + */
33916 + if (!after_bootmem)
33917 + find_early_table_space(end);
33918 +
33919 + start = (unsigned long)__va(start);
33920 + end = (unsigned long)__va(end);
33921 +
33922 + for (; start < end; start = next) {
33923 + unsigned long pud_phys;
33924 + pgd_t *pgd = pgd_offset_k(start);
33925 + pud_t *pud;
33926 +
33927 + if (after_bootmem)
33928 + pud = pud_offset(pgd, start & PGDIR_MASK);
33929 + else
33930 + pud = alloc_static_page(&pud_phys);
33931 + next = start + PGDIR_SIZE;
33932 + if (next > end)
33933 + next = end;
33934 + phys_pud_init(pud, __pa(start), __pa(next));
33935 + if (!after_bootmem) {
33936 + early_make_page_readonly(pud, XENFEAT_writable_page_tables);
33937 + set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
33938 + }
33939 + }
33940 +
33941 + if (!after_bootmem) {
33942 + BUG_ON(start_pfn != table_end);
33943 + xen_finish_init_mapping();
33944 + }
33945 +
33946 + __flush_tlb_all();
33947 +}
33948 +
33949 +void __cpuinit zap_low_mappings(int cpu)
33950 +{
33951 + /* this is not required for Xen */
33952 +#if 0
33953 + swap_low_mappings();
33954 +#endif
33955 +}
33956 +
33957 +/* Compute zone sizes for the DMA and DMA32 zones in a node. */
33958 +__init void
33959 +size_zones(unsigned long *z, unsigned long *h,
33960 + unsigned long start_pfn, unsigned long end_pfn)
33961 +{
33962 + int i;
33963 + unsigned long w;
33964 +
33965 + for (i = 0; i < MAX_NR_ZONES; i++)
33966 + z[i] = 0;
33967 +
33968 + if (start_pfn < MAX_DMA_PFN)
33969 + z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
33970 + if (start_pfn < MAX_DMA32_PFN) {
33971 + unsigned long dma32_pfn = MAX_DMA32_PFN;
33972 + if (dma32_pfn > end_pfn)
33973 + dma32_pfn = end_pfn;
33974 + z[ZONE_DMA32] = dma32_pfn - start_pfn;
33975 + }
33976 + z[ZONE_NORMAL] = end_pfn - start_pfn;
33977 +
33978 + /* Remove lower zones from higher ones. */
33979 + w = 0;
33980 + for (i = 0; i < MAX_NR_ZONES; i++) {
33981 + if (z[i])
33982 + z[i] -= w;
33983 + w += z[i];
33984 + }
33985 +
33986 + /* Compute holes */
33987 + w = start_pfn;
33988 + for (i = 0; i < MAX_NR_ZONES; i++) {
33989 + unsigned long s = w;
33990 + w += z[i];
33991 + h[i] = e820_hole_size(s, w);
33992 + }
33993 +
33994 +	/* Add the space needed for mem_map to the holes too. */
33995 + for (i = 0; i < MAX_NR_ZONES; i++)
33996 + h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
33997 +
33998 + /* The 16MB DMA zone has the kernel and other misc mappings.
33999 + Account them too */
34000 + if (h[ZONE_DMA]) {
34001 + h[ZONE_DMA] += dma_reserve;
34002 + if (h[ZONE_DMA] >= z[ZONE_DMA]) {
34003 + printk(KERN_WARNING
34004 + "Kernel too large and filling up ZONE_DMA?\n");
34005 + h[ZONE_DMA] = z[ZONE_DMA];
34006 + }
34007 + }
34008 +}
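/*
 * Worked example (editor's illustration, not part of the patch), assuming a
 * single node with start_pfn = 0 and end_pfn = 0x200000 (8GB of 4K pages):
 * the cumulative sizes computed above are
 *	z[ZONE_DMA]    = MAX_DMA_PFN   = 0x1000   (16MB)
 *	z[ZONE_DMA32]  = MAX_DMA32_PFN = 0x100000 (4GB)
 *	z[ZONE_NORMAL] = end_pfn       = 0x200000 (8GB)
 * and the "remove lower zones from higher ones" loop turns them into
 * non-overlapping spans:
 *	z[ZONE_DMA]    = 0x1000
 *	z[ZONE_DMA32]  = 0x100000 - 0x1000   = 0xff000
 *	z[ZONE_NORMAL] = 0x200000 - 0x100000 = 0x100000
 * The hole pass then subtracts the e820 gaps and the per-zone mem_map cost
 * from each span, and dma_reserve is charged to ZONE_DMA.
 */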
34009 +
34010 +#ifndef CONFIG_NUMA
34011 +void __init paging_init(void)
34012 +{
34013 + unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
34014 +
34015 + memory_present(0, 0, end_pfn);
34016 + sparse_init();
34017 + size_zones(zones, holes, 0, end_pfn);
34018 + free_area_init_node(0, NODE_DATA(0), zones,
34019 + __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
34020 +
34021 + init_mm.context.pinned = 1;
34022 +}
34023 +#endif
34024 +
34025 +/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
34026 + from the CPU leading to inconsistent cache lines. address and size
34027 + must be aligned to 2MB boundaries.
34028 + Does nothing when the mapping doesn't exist. */
34029 +void __init clear_kernel_mapping(unsigned long address, unsigned long size)
34030 +{
34031 + unsigned long end = address + size;
34032 +
34033 + BUG_ON(address & ~LARGE_PAGE_MASK);
34034 + BUG_ON(size & ~LARGE_PAGE_MASK);
34035 +
34036 + for (; address < end; address += LARGE_PAGE_SIZE) {
34037 + pgd_t *pgd = pgd_offset_k(address);
34038 + pud_t *pud;
34039 + pmd_t *pmd;
34040 + if (pgd_none(*pgd))
34041 + continue;
34042 + pud = pud_offset(pgd, address);
34043 + if (pud_none(*pud))
34044 + continue;
34045 + pmd = pmd_offset(pud, address);
34046 + if (!pmd || pmd_none(*pmd))
34047 + continue;
34048 + if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) {
34049 + /* Could handle this, but it should not happen currently. */
34050 + printk(KERN_ERR
34051 + "clear_kernel_mapping: mapping has been split. will leak memory\n");
34052 + pmd_ERROR(*pmd);
34053 + }
34054 + set_pmd(pmd, __pmd(0));
34055 + }
34056 + __flush_tlb_all();
34057 +}
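/*
 * Minimal usage sketch (editor's illustration, not part of the patch): a
 * caller such as the GART/aperture setup hands clear_kernel_mapping() a
 * 2MB-aligned virtual range in the direct mapping.  The base and size
 * below are hypothetical; the only requirement is 2MB alignment of both.
 */
static void __init example_unmap_aperture(void)
{
	unsigned long aper_base = 0x80000000UL;	/* hypothetical, 2MB aligned */
	unsigned long aper_size = 64UL << 20;	/* 64MB, multiple of 2MB */

	/* Drop the kernel linear mapping of the aperture so it cannot be
	 * cached while the uncached GART alias is in use. */
	clear_kernel_mapping((unsigned long)__va(aper_base), aper_size);
}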
34058 +
34059 +/*
34060 + * Memory hotplug specific functions
34061 + */
34062 +void online_page(struct page *page)
34063 +{
34064 + ClearPageReserved(page);
34065 + init_page_count(page);
34066 + __free_page(page);
34067 + totalram_pages++;
34068 + num_physpages++;
34069 +}
34070 +
34071 +#ifdef CONFIG_MEMORY_HOTPLUG
34072 +/*
34073 + * XXX: memory_add_physaddr_to_nid() is meant to find the node id for a
34074 + *	physical address via the sysfs probe interface. When ACPI notifies a
34075 + *	hot-add event, the node id can be determined by searching the DSDT,
34076 + *	but the probe interface carries no node id, so return node 0 for now.
34077 + */
34078 +#ifdef CONFIG_NUMA
34079 +int memory_add_physaddr_to_nid(u64 start)
34080 +{
34081 + return 0;
34082 +}
34083 +#endif
34084 +
34085 +/*
34086 + * Memory is always added to the NORMAL zone. This means you will never get
34087 + * additional DMA/DMA32 memory.
34088 + */
34089 +int arch_add_memory(int nid, u64 start, u64 size)
34090 +{
34091 + struct pglist_data *pgdat = NODE_DATA(nid);
34092 + struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
34093 + unsigned long start_pfn = start >> PAGE_SHIFT;
34094 + unsigned long nr_pages = size >> PAGE_SHIFT;
34095 + int ret;
34096 +
34097 + ret = __add_pages(zone, start_pfn, nr_pages);
34098 + if (ret)
34099 + goto error;
34100 +
34101 + init_memory_mapping(start, (start + size -1));
34102 +
34103 + return ret;
34104 +error:
34105 + printk("%s: Problem encountered in __add_pages!\n", __func__);
34106 + return ret;
34107 +}
34108 +EXPORT_SYMBOL_GPL(arch_add_memory);
34109 +
34110 +int remove_memory(u64 start, u64 size)
34111 +{
34112 + return -EINVAL;
34113 +}
34114 +EXPORT_SYMBOL_GPL(remove_memory);
34115 +
34116 +#else /* CONFIG_MEMORY_HOTPLUG */
34117 +/*
34118 + * Memory hot-add without sparsemem: the mem_maps have been allocated in
34119 + * advance, so just online the pages.
34120 + */
34121 +int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
34122 +{
34123 + int err = -EIO;
34124 + unsigned long pfn;
34125 + unsigned long total = 0, mem = 0;
34126 + for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
34127 + if (pfn_valid(pfn)) {
34128 + online_page(pfn_to_page(pfn));
34129 + err = 0;
34130 + mem++;
34131 + }
34132 + total++;
34133 + }
34134 + if (!err) {
34135 + z->spanned_pages += total;
34136 + z->present_pages += mem;
34137 + z->zone_pgdat->node_spanned_pages += total;
34138 + z->zone_pgdat->node_present_pages += mem;
34139 + }
34140 + return err;
34141 +}
34142 +#endif /* CONFIG_MEMORY_HOTPLUG */
34143 +
34144 +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
34145 + kcore_vsyscall;
34146 +
34147 +void __init mem_init(void)
34148 +{
34149 + long codesize, reservedpages, datasize, initsize;
34150 + unsigned long pfn;
34151 +
34152 + pci_iommu_alloc();
34153 +
34154 + /* How many end-of-memory variables you have, grandma! */
34155 + max_low_pfn = end_pfn;
34156 + max_pfn = end_pfn;
34157 + num_physpages = end_pfn;
34158 + high_memory = (void *) __va(end_pfn * PAGE_SIZE);
34159 +
34160 + /* clear the zero-page */
34161 + memset(empty_zero_page, 0, PAGE_SIZE);
34162 +
34163 + reservedpages = 0;
34164 +
34165 + /* this will put all low memory onto the freelists */
34166 +#ifdef CONFIG_NUMA
34167 + totalram_pages = numa_free_all_bootmem();
34168 +#else
34169 + totalram_pages = free_all_bootmem();
34170 +#endif
34171 + /* XEN: init and count pages outside initial allocation. */
34172 + for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
34173 + ClearPageReserved(pfn_to_page(pfn));
34174 + init_page_count(pfn_to_page(pfn));
34175 + totalram_pages++;
34176 + }
34177 + reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
34178 +
34179 + after_bootmem = 1;
34180 +
34181 + codesize = (unsigned long) &_etext - (unsigned long) &_text;
34182 + datasize = (unsigned long) &_edata - (unsigned long) &_etext;
34183 + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
34184 +
34185 + /* Register memory areas for /proc/kcore */
34186 + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
34187 + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
34188 + VMALLOC_END-VMALLOC_START);
34189 + kclist_add(&kcore_kernel, &_stext, _end - _stext);
34190 + kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
34191 + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
34192 + VSYSCALL_END - VSYSCALL_START);
34193 +
34194 + printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
34195 + (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
34196 + end_pfn << (PAGE_SHIFT-10),
34197 + codesize >> 10,
34198 + reservedpages << (PAGE_SHIFT-10),
34199 + datasize >> 10,
34200 + initsize >> 10);
34201 +
34202 +#ifndef CONFIG_XEN
34203 +#ifdef CONFIG_SMP
34204 + /*
34205 + * Sync boot_level4_pgt mappings with the init_level4_pgt
34206 + * except for the low identity mappings which are already zapped
34207 + * in init_level4_pgt. This sync-up is essential for AP's bringup
34208 + */
34209 + memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
34210 +#endif
34211 +#endif
34212 +}
34213 +
34214 +void free_init_pages(char *what, unsigned long begin, unsigned long end)
34215 +{
34216 + unsigned long addr;
34217 +
34218 + if (begin >= end)
34219 + return;
34220 +
34221 + printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
34222 + for (addr = begin; addr < end; addr += PAGE_SIZE) {
34223 + ClearPageReserved(virt_to_page(addr));
34224 + init_page_count(virt_to_page(addr));
34225 + memset((void *)(addr & ~(PAGE_SIZE-1)),
34226 + POISON_FREE_INITMEM, PAGE_SIZE);
34227 + if (addr >= __START_KERNEL_map) {
34228 + /* make_readonly() reports all kernel addresses. */
34229 + __make_page_writable(__va(__pa(addr)));
34230 + if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
34231 + pgd_t *pgd = pgd_offset_k(addr);
34232 + pud_t *pud = pud_offset(pgd, addr);
34233 + pmd_t *pmd = pmd_offset(pud, addr);
34234 + pte_t *pte = pte_offset_kernel(pmd, addr);
34235 +
34236 + xen_l1_entry_update(pte, __pte(0)); /* fallback */
34237 + }
34238 + }
34239 + free_page(addr);
34240 + totalram_pages++;
34241 + }
34242 +}
34243 +
34244 +void free_initmem(void)
34245 +{
34246 + memset(__initdata_begin, POISON_FREE_INITDATA,
34247 + __initdata_end - __initdata_begin);
34248 + free_init_pages("unused kernel memory",
34249 + (unsigned long)(&__init_begin),
34250 + (unsigned long)(&__init_end));
34251 +}
34252 +
34253 +#ifdef CONFIG_DEBUG_RODATA
34254 +
34255 +void mark_rodata_ro(void)
34256 +{
34257 + unsigned long addr = (unsigned long)__start_rodata;
34258 +
34259 + for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
34260 + change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
34261 +
34262 + printk ("Write protecting the kernel read-only data: %luk\n",
34263 + (__end_rodata - __start_rodata) >> 10);
34264 +
34265 + /*
34266 + * change_page_attr_addr() requires a global_flush_tlb() call after it.
34267 + * We do this after the printk so that if something went wrong in the
34268 + * change, the printk gets out at least to give a better debug hint
34269 + * of who is the culprit.
34270 + */
34271 + global_flush_tlb();
34272 +}
34273 +#endif
34274 +
34275 +#ifdef CONFIG_BLK_DEV_INITRD
34276 +void free_initrd_mem(unsigned long start, unsigned long end)
34277 +{
34278 + free_init_pages("initrd memory", start, end);
34279 +}
34280 +#endif
34281 +
34282 +void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
34283 +{
34284 + /* Should check here against the e820 map to avoid double free */
34285 +#ifdef CONFIG_NUMA
34286 + int nid = phys_to_nid(phys);
34287 + reserve_bootmem_node(NODE_DATA(nid), phys, len);
34288 +#else
34289 + reserve_bootmem(phys, len);
34290 +#endif
34291 + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
34292 + dma_reserve += len / PAGE_SIZE;
34293 +}
34294 +
34295 +int kern_addr_valid(unsigned long addr)
34296 +{
34297 + unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
34298 + pgd_t *pgd;
34299 + pud_t *pud;
34300 + pmd_t *pmd;
34301 + pte_t *pte;
34302 +
34303 + if (above != 0 && above != -1UL)
34304 + return 0;
34305 +
34306 + pgd = pgd_offset_k(addr);
34307 + if (pgd_none(*pgd))
34308 + return 0;
34309 +
34310 + pud = pud_offset(pgd, addr);
34311 + if (pud_none(*pud))
34312 + return 0;
34313 +
34314 + pmd = pmd_offset(pud, addr);
34315 + if (pmd_none(*pmd))
34316 + return 0;
34317 + if (pmd_large(*pmd))
34318 + return pfn_valid(pmd_pfn(*pmd));
34319 +
34320 + pte = pte_offset_kernel(pmd, addr);
34321 + if (pte_none(*pte))
34322 + return 0;
34323 + return pfn_valid(pte_pfn(*pte));
34324 +}
34325 +
34326 +#ifdef CONFIG_SYSCTL
34327 +#include <linux/sysctl.h>
34328 +
34329 +extern int exception_trace, page_fault_trace;
34330 +
34331 +static ctl_table debug_table2[] = {
34332 + { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
34333 + proc_dointvec },
34334 + { 0, }
34335 +};
34336 +
34337 +static ctl_table debug_root_table2[] = {
34338 + { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
34339 + .child = debug_table2 },
34340 + { 0 },
34341 +};
34342 +
34343 +static __init int x8664_sysctl_init(void)
34344 +{
34345 + register_sysctl_table(debug_root_table2, 1);
34346 + return 0;
34347 +}
34348 +__initcall(x8664_sysctl_init);
34349 +#endif
34350 +
34351 +/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
34352 + covers the 64bit vsyscall page now. 32bit has a real VMA now and does
34353 + not need special handling anymore. */
34354 +
34355 +static struct vm_area_struct gate_vma = {
34356 + .vm_start = VSYSCALL_START,
34357 + .vm_end = VSYSCALL_END,
34358 + .vm_page_prot = PAGE_READONLY
34359 +};
34360 +
34361 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
34362 +{
34363 +#ifdef CONFIG_IA32_EMULATION
34364 + if (test_tsk_thread_flag(tsk, TIF_IA32))
34365 + return NULL;
34366 +#endif
34367 + return &gate_vma;
34368 +}
34369 +
34370 +int in_gate_area(struct task_struct *task, unsigned long addr)
34371 +{
34372 + struct vm_area_struct *vma = get_gate_vma(task);
34373 + if (!vma)
34374 + return 0;
34375 + return (addr >= vma->vm_start) && (addr < vma->vm_end);
34376 +}
34377 +
34378 +/* Use this when you have no reliable task/vma, typically from interrupt
34379 + * context. It is less reliable than using the task's vma and may give
34380 + * false positives.
34381 + */
34382 +int in_gate_area_no_task(unsigned long addr)
34383 +{
34384 + return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
34385 +}
34386 Index: head-2008-11-25/arch/x86/mm/pageattr_64-xen.c
34387 ===================================================================
34388 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
34389 +++ head-2008-11-25/arch/x86/mm/pageattr_64-xen.c 2008-07-21 11:00:32.000000000 +0200
34390 @@ -0,0 +1,502 @@
34391 +/*
34392 + * Copyright 2002 Andi Kleen, SuSE Labs.
34393 + * Thanks to Ben LaHaise for precious feedback.
34394 + */
34395 +
34396 +#include <linux/mm.h>
34397 +#include <linux/sched.h>
34398 +#include <linux/highmem.h>
34399 +#include <linux/module.h>
34400 +#include <linux/slab.h>
34401 +#include <asm/uaccess.h>
34402 +#include <asm/processor.h>
34403 +#include <asm/tlbflush.h>
34404 +#include <asm/io.h>
34405 +
34406 +#ifdef CONFIG_XEN
34407 +#include <asm/pgalloc.h>
34408 +#include <asm/mmu_context.h>
34409 +
34410 +LIST_HEAD(mm_unpinned);
34411 +DEFINE_SPINLOCK(mm_unpinned_lock);
34412 +
34413 +static void _pin_lock(struct mm_struct *mm, int lock) {
34414 + if (lock)
34415 + spin_lock(&mm->page_table_lock);
34416 +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
34417 + /* While mm->page_table_lock protects us against insertions and
34418 + * removals of higher level page table pages, it doesn't protect
34419 + * against updates of pte-s. Such updates, however, require the
34420 + * pte pages to be in consistent state (unpinned+writable or
34421 + * pinned+readonly). The pinning and attribute changes, however
34422 + * cannot be done atomically, which is why such updates must be
34423 + * prevented from happening concurrently.
34424 + * Note that no pte lock can ever elsewhere be acquired nesting
34425 + * with an already acquired one in the same mm, or with the mm's
34426 + * page_table_lock already acquired, as that would break in the
34427 + * non-split case (where all these are actually resolving to the
34428 + * one page_table_lock). Thus acquiring all of them here is not
34429 +	 * going to result in deadlocks, and the order of acquires
34430 + * doesn't matter.
34431 + */
34432 + {
34433 + pgd_t *pgd = mm->pgd;
34434 + unsigned g;
34435 +
34436 + for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
34437 + pud_t *pud;
34438 + unsigned u;
34439 +
34440 + if (pgd_none(*pgd))
34441 + continue;
34442 + pud = pud_offset(pgd, 0);
34443 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
34444 + pmd_t *pmd;
34445 + unsigned m;
34446 +
34447 + if (pud_none(*pud))
34448 + continue;
34449 + pmd = pmd_offset(pud, 0);
34450 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
34451 + spinlock_t *ptl;
34452 +
34453 + if (pmd_none(*pmd))
34454 + continue;
34455 + ptl = pte_lockptr(0, pmd);
34456 + if (lock)
34457 + spin_lock(ptl);
34458 + else
34459 + spin_unlock(ptl);
34460 + }
34461 + }
34462 + }
34463 + }
34464 +#endif
34465 + if (!lock)
34466 + spin_unlock(&mm->page_table_lock);
34467 +}
34468 +#define pin_lock(mm) _pin_lock(mm, 1)
34469 +#define pin_unlock(mm) _pin_lock(mm, 0)
34470 +
34471 +#define PIN_BATCH 8
34472 +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
34473 +
34474 +static inline unsigned int mm_walk_set_prot(void *pt, pgprot_t flags,
34475 + unsigned int cpu, unsigned int seq)
34476 +{
34477 + struct page *page = virt_to_page(pt);
34478 + unsigned long pfn = page_to_pfn(page);
34479 +
34480 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
34481 + (unsigned long)__va(pfn << PAGE_SHIFT),
34482 + pfn_pte(pfn, flags), 0);
34483 + if (unlikely(++seq == PIN_BATCH)) {
34484 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
34485 + PIN_BATCH, NULL)))
34486 + BUG();
34487 + seq = 0;
34488 + }
34489 +
34490 + return seq;
34491 +}
34492 +
34493 +static void mm_walk(struct mm_struct *mm, pgprot_t flags)
34494 +{
34495 + pgd_t *pgd;
34496 + pud_t *pud;
34497 + pmd_t *pmd;
34498 + pte_t *pte;
34499 + int g,u,m;
34500 + unsigned int cpu, seq;
34501 + multicall_entry_t *mcl;
34502 +
34503 + pgd = mm->pgd;
34504 + cpu = get_cpu();
34505 +
34506 + /*
34507 + * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
34508 + * be the 'current' task's pagetables (e.g., current may be 32-bit,
34509 + * but the pagetables may be for a 64-bit task).
34510 + * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
34511 + * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
34512 + */
34513 + for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
34514 + if (pgd_none(*pgd))
34515 + continue;
34516 + pud = pud_offset(pgd, 0);
34517 + if (PTRS_PER_PUD > 1) /* not folded */
34518 + seq = mm_walk_set_prot(pud,flags,cpu,seq);
34519 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
34520 + if (pud_none(*pud))
34521 + continue;
34522 + pmd = pmd_offset(pud, 0);
34523 + if (PTRS_PER_PMD > 1) /* not folded */
34524 + seq = mm_walk_set_prot(pmd,flags,cpu,seq);
34525 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
34526 + if (pmd_none(*pmd))
34527 + continue;
34528 + pte = pte_offset_kernel(pmd,0);
34529 + seq = mm_walk_set_prot(pte,flags,cpu,seq);
34530 + }
34531 + }
34532 + }
34533 +
34534 + mcl = per_cpu(pb_mcl, cpu);
34535 + if (unlikely(seq > PIN_BATCH - 2)) {
34536 + if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
34537 + BUG();
34538 + seq = 0;
34539 + }
34540 + MULTI_update_va_mapping(mcl + seq,
34541 + (unsigned long)__user_pgd(mm->pgd),
34542 + pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, flags),
34543 + 0);
34544 + MULTI_update_va_mapping(mcl + seq + 1,
34545 + (unsigned long)mm->pgd,
34546 + pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, flags),
34547 + UVMF_TLB_FLUSH);
34548 + if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
34549 + BUG();
34550 +
34551 + put_cpu();
34552 +}
34553 +
34554 +void mm_pin(struct mm_struct *mm)
34555 +{
34556 + if (xen_feature(XENFEAT_writable_page_tables))
34557 + return;
34558 +
34559 + pin_lock(mm);
34560 +
34561 + mm_walk(mm, PAGE_KERNEL_RO);
34562 + xen_pgd_pin(__pa(mm->pgd)); /* kernel */
34563 + xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */
34564 + mm->context.pinned = 1;
34565 + spin_lock(&mm_unpinned_lock);
34566 + list_del(&mm->context.unpinned);
34567 + spin_unlock(&mm_unpinned_lock);
34568 +
34569 + pin_unlock(mm);
34570 +}
34571 +
34572 +void mm_unpin(struct mm_struct *mm)
34573 +{
34574 + if (xen_feature(XENFEAT_writable_page_tables))
34575 + return;
34576 +
34577 + pin_lock(mm);
34578 +
34579 + xen_pgd_unpin(__pa(mm->pgd));
34580 + xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
34581 + mm_walk(mm, PAGE_KERNEL);
34582 + mm->context.pinned = 0;
34583 + spin_lock(&mm_unpinned_lock);
34584 + list_add(&mm->context.unpinned, &mm_unpinned);
34585 + spin_unlock(&mm_unpinned_lock);
34586 +
34587 + pin_unlock(mm);
34588 +}
34589 +
34590 +void mm_pin_all(void)
34591 +{
34592 + if (xen_feature(XENFEAT_writable_page_tables))
34593 + return;
34594 +
34595 + /*
34596 + * Allow uninterrupted access to the mm_unpinned list. We don't
34597 + * actually take the mm_unpinned_lock as it is taken inside mm_pin().
34598 + * All other CPUs must be at a safe point (e.g., in stop_machine
34599 + * or offlined entirely).
34600 + */
34601 + preempt_disable();
34602 + while (!list_empty(&mm_unpinned))
34603 + mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
34604 + context.unpinned));
34605 + preempt_enable();
34606 +}
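/*
 * Illustrative sketch (not part of the patch) of the calling convention the
 * comment above describes: mm_pin_all() is invoked while every other CPU is
 * quiesced, e.g. from a stop_machine callback on the suspend path.  The
 * callback name is hypothetical.
 */
static int example_pin_everything(void *unused)
{
	/* Runs with all other CPUs stopped, so the mm_unpinned list is stable. */
	mm_pin_all();
	return 0;
}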
34607 +
34608 +void _arch_dup_mmap(struct mm_struct *mm)
34609 +{
34610 + if (!mm->context.pinned)
34611 + mm_pin(mm);
34612 +}
34613 +
34614 +void _arch_exit_mmap(struct mm_struct *mm)
34615 +{
34616 + struct task_struct *tsk = current;
34617 +
34618 + task_lock(tsk);
34619 +
34620 + /*
34621 + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
34622 + * *much* faster this way, as no tlb flushes means bigger wrpt batches.
34623 + */
34624 + if (tsk->active_mm == mm) {
34625 + tsk->active_mm = &init_mm;
34626 + atomic_inc(&init_mm.mm_count);
34627 +
34628 + switch_mm(mm, &init_mm, tsk);
34629 +
34630 + atomic_dec(&mm->mm_count);
34631 + BUG_ON(atomic_read(&mm->mm_count) == 0);
34632 + }
34633 +
34634 + task_unlock(tsk);
34635 +
34636 + if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) &&
34637 + !mm->context.has_foreign_mappings )
34638 + mm_unpin(mm);
34639 +}
34640 +
34641 +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
34642 +{
34643 + struct page *pte;
34644 +
34645 + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
34646 + if (pte) {
34647 + SetPageForeign(pte, pte_free);
34648 + init_page_count(pte);
34649 + }
34650 + return pte;
34651 +}
34652 +
34653 +void pte_free(struct page *pte)
34654 +{
34655 + unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
34656 +
34657 + if (!pte_write(*virt_to_ptep(va)))
34658 + if (HYPERVISOR_update_va_mapping(
34659 + va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0))
34660 + BUG();
34661 +
34662 + ClearPageForeign(pte);
34663 + init_page_count(pte);
34664 +
34665 + __free_page(pte);
34666 +}
34667 +#endif /* CONFIG_XEN */
34668 +
34669 +pte_t *lookup_address(unsigned long address)
34670 +{
34671 + pgd_t *pgd = pgd_offset_k(address);
34672 + pud_t *pud;
34673 + pmd_t *pmd;
34674 + pte_t *pte;
34675 + if (pgd_none(*pgd))
34676 + return NULL;
34677 + pud = pud_offset(pgd, address);
34678 + if (!pud_present(*pud))
34679 + return NULL;
34680 + pmd = pmd_offset(pud, address);
34681 + if (!pmd_present(*pmd))
34682 + return NULL;
34683 + if (pmd_large(*pmd))
34684 + return (pte_t *)pmd;
34685 + pte = pte_offset_kernel(pmd, address);
34686 + if (pte && !pte_present(*pte))
34687 + pte = NULL;
34688 + return pte;
34689 +}
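/*
 * Illustrative sketch (not part of the patch): lookup_address() hands back
 * the pte (or the pmd cast to a pte_t * for a 2MB mapping) currently backing
 * a kernel virtual address, or NULL if there is none.  The helper name is
 * hypothetical.
 */
static int example_addr_is_readonly(unsigned long addr)
{
	pte_t *pte = lookup_address(addr);

	return pte && !pte_write(*pte);
}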
34690 +
34691 +static struct page *split_large_page(unsigned long address, pgprot_t prot,
34692 + pgprot_t ref_prot)
34693 +{
34694 + int i;
34695 + unsigned long addr;
34696 + struct page *base = alloc_pages(GFP_KERNEL, 0);
34697 + pte_t *pbase;
34698 + if (!base)
34699 + return NULL;
34700 + /*
34701 + * page_private is used to track the number of entries in
34702 +	 * the page table page that have non-standard attributes.
34703 + */
34704 + SetPagePrivate(base);
34705 + page_private(base) = 0;
34706 +
34707 + address = __pa(address);
34708 + addr = address & LARGE_PAGE_MASK;
34709 + pbase = (pte_t *)page_address(base);
34710 + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
34711 + pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
34712 + addr == address ? prot : ref_prot);
34713 + }
34714 + return base;
34715 +}
34716 +
34717 +
34718 +static void flush_kernel_map(void *address)
34719 +{
34720 + if (0 && address && cpu_has_clflush) {
34721 + /* is this worth it? */
34722 + int i;
34723 + for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
34724 + asm volatile("clflush (%0)" :: "r" (address + i));
34725 + } else
34726 + asm volatile("wbinvd":::"memory");
34727 + if (address)
34728 + __flush_tlb_one(address);
34729 + else
34730 + __flush_tlb_all();
34731 +}
34732 +
34733 +
34734 +static inline void flush_map(unsigned long address)
34735 +{
34736 + on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
34737 +}
34738 +
34739 +static struct page *deferred_pages; /* protected by init_mm.mmap_sem */
34740 +
34741 +static inline void save_page(struct page *fpage)
34742 +{
34743 + fpage->lru.next = (struct list_head *)deferred_pages;
34744 + deferred_pages = fpage;
34745 +}
34746 +
34747 +/*
34748 + * No more special protections in this 2/4MB area - revert to a
34749 + * large page again.
34750 + */
34751 +static void revert_page(unsigned long address, pgprot_t ref_prot)
34752 +{
34753 + pgd_t *pgd;
34754 + pud_t *pud;
34755 + pmd_t *pmd;
34756 + pte_t large_pte;
34757 +
34758 + pgd = pgd_offset_k(address);
34759 + BUG_ON(pgd_none(*pgd));
34760 + pud = pud_offset(pgd,address);
34761 + BUG_ON(pud_none(*pud));
34762 + pmd = pmd_offset(pud, address);
34763 + BUG_ON(__pmd_val(*pmd) & _PAGE_PSE);
34764 + pgprot_val(ref_prot) |= _PAGE_PSE;
34765 + large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
34766 + set_pte((pte_t *)pmd, large_pte);
34767 +}
34768 +
34769 +static int
34770 +__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
34771 + pgprot_t ref_prot)
34772 +{
34773 + pte_t *kpte;
34774 + struct page *kpte_page;
34775 + unsigned kpte_flags;
34776 + pgprot_t ref_prot2;
34777 + kpte = lookup_address(address);
34778 + if (!kpte) return 0;
34779 + kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
34780 + kpte_flags = pte_val(*kpte);
34781 + if (pgprot_val(prot) != pgprot_val(ref_prot)) {
34782 + if ((kpte_flags & _PAGE_PSE) == 0) {
34783 + set_pte(kpte, pfn_pte(pfn, prot));
34784 + } else {
34785 + /*
34786 + * split_large_page will take the reference for this
34787 + * change_page_attr on the split page.
34788 + */
34789 +
34790 + struct page *split;
34791 + ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
34792 +
34793 + split = split_large_page(address, prot, ref_prot2);
34794 + if (!split)
34795 + return -ENOMEM;
34796 + set_pte(kpte,mk_pte(split, ref_prot2));
34797 + kpte_page = split;
34798 + }
34799 + page_private(kpte_page)++;
34800 + } else if ((kpte_flags & _PAGE_PSE) == 0) {
34801 + set_pte(kpte, pfn_pte(pfn, ref_prot));
34802 + BUG_ON(page_private(kpte_page) == 0);
34803 + page_private(kpte_page)--;
34804 + } else
34805 + BUG();
34806 +
34807 + /* on x86-64 the direct mapping set at boot is not using 4k pages */
34808 + /*
34809 + * ..., but the XEN guest kernels (currently) do:
34810 + * If the pte was reserved, it means it was created at boot
34811 + * time (not via split_large_page) and in turn we must not
34812 + * replace it with a large page.
34813 + */
34814 +#ifndef CONFIG_XEN
34815 + BUG_ON(PageReserved(kpte_page));
34816 +#else
34817 + if (PageReserved(kpte_page))
34818 + return 0;
34819 +#endif
34820 +
34821 + if (page_private(kpte_page) == 0) {
34822 + save_page(kpte_page);
34823 + revert_page(address, ref_prot);
34824 + }
34825 + return 0;
34826 +}
34827 +
34828 +/*
34829 + * Change the page attributes of a page in the linear mapping.
34830 + *
34831 + * This should be used when a page is mapped with a different caching policy
34832 + * than write-back somewhere - some CPUs do not like it when mappings with
34833 + * different caching policies exist. This changes the page attributes of the
34834 + * kernel linear mapping too.
34835 + *
34836 + * The caller needs to ensure that there are no conflicting mappings elsewhere.
34837 + * This function only deals with the kernel linear map.
34838 + *
34839 + * Caller must call global_flush_tlb() after this.
34840 + */
34841 +int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
34842 +{
34843 + int err = 0;
34844 + int i;
34845 +
34846 + down_write(&init_mm.mmap_sem);
34847 + for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
34848 + unsigned long pfn = __pa(address) >> PAGE_SHIFT;
34849 +
34850 + err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
34851 + if (err)
34852 + break;
34853 + /* Handle kernel mapping too which aliases part of the
34854 + * lowmem */
34855 + if (__pa(address) < KERNEL_TEXT_SIZE) {
34856 + unsigned long addr2;
34857 + pgprot_t prot2 = prot;
34858 + addr2 = __START_KERNEL_map + __pa(address);
34859 + pgprot_val(prot2) &= ~_PAGE_NX;
34860 + err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
34861 + }
34862 + }
34863 + up_write(&init_mm.mmap_sem);
34864 + return err;
34865 +}
34866 +
34867 +/* Don't call this for MMIO areas that may not have a mem_map entry */
34868 +int change_page_attr(struct page *page, int numpages, pgprot_t prot)
34869 +{
34870 + unsigned long addr = (unsigned long)page_address(page);
34871 + return change_page_attr_addr(addr, numpages, prot);
34872 +}
34873 +
34874 +void global_flush_tlb(void)
34875 +{
34876 + struct page *dpage;
34877 +
34878 + down_read(&init_mm.mmap_sem);
34879 + dpage = xchg(&deferred_pages, NULL);
34880 + up_read(&init_mm.mmap_sem);
34881 +
34882 + flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0);
34883 + while (dpage) {
34884 + struct page *tmp = dpage;
34885 + dpage = (struct page *)dpage->lru.next;
34886 + ClearPagePrivate(tmp);
34887 + __free_page(tmp);
34888 + }
34889 +}
34890 +
34891 +EXPORT_SYMBOL(change_page_attr);
34892 +EXPORT_SYMBOL(global_flush_tlb);
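/*
 * Minimal usage sketch (editor's illustration, not part of the patch): a
 * driver that needs an uncached view of a kernel page changes the
 * linear-mapping attribute and then performs the single deferred flush that
 * the comment on change_page_attr_addr() requires.  The helper name is
 * hypothetical.
 */
static int example_make_page_uncached(struct page *pg)
{
	int rc = change_page_attr(pg, 1, PAGE_KERNEL_NOCACHE);

	if (!rc)
		global_flush_tlb();	/* commit the deferred attribute change */
	return rc;
}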
34893 Index: head-2008-11-25/drivers/pci/msi-xen.c
34894 ===================================================================
34895 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
34896 +++ head-2008-11-25/drivers/pci/msi-xen.c 2008-10-13 13:43:45.000000000 +0200
34897 @@ -0,0 +1,809 @@
34898 +/*
34899 + * File: msi.c
34900 + * Purpose: PCI Message Signaled Interrupt (MSI)
34901 + *
34902 + * Copyright (C) 2003-2004 Intel
34903 + * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)
34904 + */
34905 +
34906 +#include <linux/mm.h>
34907 +#include <linux/irq.h>
34908 +#include <linux/interrupt.h>
34909 +#include <linux/init.h>
34910 +#include <linux/ioport.h>
34911 +#include <linux/smp_lock.h>
34912 +#include <linux/pci.h>
34913 +#include <linux/proc_fs.h>
34914 +
34915 +#include <xen/evtchn.h>
34916 +
34917 +#include <asm/errno.h>
34918 +#include <asm/io.h>
34919 +#include <asm/smp.h>
34920 +
34921 +#include "pci.h"
34922 +#include "msi.h"
34923 +
34924 +static int pci_msi_enable = 1;
34925 +
34926 +static struct msi_ops *msi_ops;
34927 +
34928 +int msi_register(struct msi_ops *ops)
34929 +{
34930 + msi_ops = ops;
34931 + return 0;
34932 +}
34933 +
34934 +static LIST_HEAD(msi_dev_head);
34935 +DEFINE_SPINLOCK(msi_dev_lock);
34936 +
34937 +struct msi_dev_list {
34938 + struct pci_dev *dev;
34939 + struct list_head list;
34940 + spinlock_t pirq_list_lock;
34941 + struct list_head pirq_list_head;
34942 +};
34943 +
34944 +struct msi_pirq_entry {
34945 + struct list_head list;
34946 + int pirq;
34947 + int entry_nr;
34948 +};
34949 +
34950 +static struct msi_dev_list *get_msi_dev_pirq_list(struct pci_dev *dev)
34951 +{
34952 + struct msi_dev_list *msi_dev_list, *ret = NULL;
34953 + unsigned long flags;
34954 +
34955 + spin_lock_irqsave(&msi_dev_lock, flags);
34956 +
34957 + list_for_each_entry(msi_dev_list, &msi_dev_head, list)
34958 + if ( msi_dev_list->dev == dev )
34959 + ret = msi_dev_list;
34960 +
34961 + if ( ret ) {
34962 + spin_unlock_irqrestore(&msi_dev_lock, flags);
34963 + return ret;
34964 + }
34965 +
34966 +	/* No msi_dev has been allocated for this device yet. */
34967 + ret = kzalloc(sizeof(struct msi_dev_list), GFP_ATOMIC);
34968 +
34969 + /* Failed to allocate msi_dev structure */
34970 + if ( !ret ) {
34971 + spin_unlock_irqrestore(&msi_dev_lock, flags);
34972 + return NULL;
34973 + }
34974 +
34975 + ret->dev = dev;
34976 + spin_lock_init(&ret->pirq_list_lock);
34977 + INIT_LIST_HEAD(&ret->pirq_list_head);
34978 + list_add_tail(&ret->list, &msi_dev_head);
34979 + spin_unlock_irqrestore(&msi_dev_lock, flags);
34980 + return ret;
34981 +}
34982 +
34983 +static int attach_pirq_entry(int pirq, int entry_nr,
34984 + struct msi_dev_list *msi_dev_entry)
34985 +{
34986 + struct msi_pirq_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
34987 + unsigned long flags;
34988 +
34989 + if (!entry)
34990 + return -ENOMEM;
34991 + entry->pirq = pirq;
34992 + entry->entry_nr = entry_nr;
34993 + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
34994 + list_add_tail(&entry->list, &msi_dev_entry->pirq_list_head);
34995 + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
34996 + return 0;
34997 +}
34998 +
34999 +static void detach_pirq_entry(int entry_nr,
35000 + struct msi_dev_list *msi_dev_entry)
35001 +{
35002 + unsigned long flags;
35003 + struct msi_pirq_entry *pirq_entry;
35004 +
35005 + list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) {
35006 + if (pirq_entry->entry_nr == entry_nr) {
35007 + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
35008 + list_del(&pirq_entry->list);
35009 + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
35010 + kfree(pirq_entry);
35011 + return;
35012 + }
35013 + }
35014 +}
35015 +
35016 +/*
35017 + * pciback will provide the device's owner
35018 + */
35019 +static int (*get_owner)(struct pci_dev *dev);
35020 +
35021 +int register_msi_get_owner(int (*func)(struct pci_dev *dev))
35022 +{
35023 + if (get_owner) {
35024 +		printk(KERN_WARNING "msi_get_owner is already registered\n");
35025 + return -EEXIST;
35026 + }
35027 + get_owner = func;
35028 + return 0;
35029 +}
35030 +
35031 +int unregister_msi_get_owner(int (*func)(struct pci_dev *dev))
35032 +{
35033 + if (get_owner != func)
35034 + return -EINVAL;
35035 + get_owner = NULL;
35036 + return 0;
35037 +}
35038 +
35039 +static int msi_get_dev_owner(struct pci_dev *dev)
35040 +{
35041 + int owner;
35042 +
35043 + BUG_ON(!is_initial_xendomain());
35044 + if (get_owner && (owner = get_owner(dev)) >= 0) {
35045 +		printk(KERN_INFO "owner of dev %x is %x\n",
35046 + dev->devfn, owner);
35047 + return owner;
35048 + }
35049 +
35050 + return DOMID_SELF;
35051 +}
35052 +
35053 +static int msi_unmap_pirq(struct pci_dev *dev, int pirq)
35054 +{
35055 + struct physdev_unmap_pirq unmap;
35056 + int rc;
35057 +
35058 + unmap.domid = msi_get_dev_owner(dev);
35059 +	/* See the comments in msi_map_pirq_to_vector(): the input parameter pirq
35060 +	 * means an irq number only if the device belongs to dom0 itself.
35061 + */
35062 + unmap.pirq = (unmap.domid != DOMID_SELF)
35063 + ? pirq : evtchn_get_xen_pirq(pirq);
35064 +
35065 + if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap)))
35066 + printk(KERN_WARNING "unmap irq %x failed\n", pirq);
35067 +
35068 + if (rc < 0)
35069 + return rc;
35070 +
35071 + if (unmap.domid == DOMID_SELF)
35072 + evtchn_map_pirq(pirq, 0);
35073 +
35074 + return 0;
35075 +}
35076 +
35077 +static u64 find_table_base(struct pci_dev *dev, int pos)
35078 +{
35079 + u8 bar;
35080 + u32 reg;
35081 + unsigned long flags;
35082 +
35083 + pci_read_config_dword(dev, msix_table_offset_reg(pos), &reg);
35084 + bar = reg & PCI_MSIX_FLAGS_BIRMASK;
35085 +
35086 + flags = pci_resource_flags(dev, bar);
35087 + if (flags & (IORESOURCE_DISABLED | IORESOURCE_UNSET | IORESOURCE_BUSY))
35088 + return 0;
35089 +
35090 + return pci_resource_start(dev, bar);
35091 +}
35092 +
35093 +/*
35094 + * Protected by msi_lock
35095 + */
35096 +static int msi_map_pirq_to_vector(struct pci_dev *dev, int pirq,
35097 + int entry_nr, u64 table_base)
35098 +{
35099 + struct physdev_map_pirq map_irq;
35100 + int rc;
35101 + domid_t domid = DOMID_SELF;
35102 +
35103 + domid = msi_get_dev_owner(dev);
35104 +
35105 + map_irq.domid = domid;
35106 + map_irq.type = MAP_PIRQ_TYPE_MSI;
35107 + map_irq.index = -1;
35108 + map_irq.pirq = pirq < 0 ? -1 : evtchn_get_xen_pirq(pirq);
35109 + map_irq.bus = dev->bus->number;
35110 + map_irq.devfn = dev->devfn;
35111 + map_irq.entry_nr = entry_nr;
35112 + map_irq.table_base = table_base;
35113 +
35114 + if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq)))
35115 + printk(KERN_WARNING "map irq failed\n");
35116 +
35117 + if (rc < 0)
35118 + return rc;
35119 + /* This happens when MSI support is not enabled in Xen. */
35120 + if (rc == 0 && map_irq.pirq < 0)
35121 + return -ENOSYS;
35122 +
35123 + BUG_ON(map_irq.pirq <= 0);
35124 +
35125 + /* If mapping of this particular MSI is on behalf of another domain,
35126 + * we do not need to get an irq in dom0. This also implies:
35127 + * dev->irq in dom0 will be 'Xen pirq' if this device belongs to
35128 +	 * dev->irq in dom0 will be a 'Xen pirq' if this device belongs
35129 +	 * to another domain, and will be a 'Linux irq' if it belongs to dom0.
35130 + return ((domid != DOMID_SELF) ?
35131 + map_irq.pirq : evtchn_map_pirq(pirq, map_irq.pirq));
35132 +}
35133 +
35134 +static int msi_map_vector(struct pci_dev *dev, int entry_nr, u64 table_base)
35135 +{
35136 + return msi_map_pirq_to_vector(dev, -1, entry_nr, table_base);
35137 +}
35138 +
35139 +static int msi_init(void)
35140 +{
35141 + static int status = 0;
35142 +
35143 + if (pci_msi_quirk) {
35144 + pci_msi_enable = 0;
35145 + printk(KERN_WARNING "PCI: MSI quirk detected. MSI disabled.\n");
35146 + status = -EINVAL;
35147 + }
35148 +
35149 + return status;
35150 +}
35151 +
35152 +void pci_scan_msi_device(struct pci_dev *dev) { }
35153 +
35154 +void disable_msi_mode(struct pci_dev *dev, int pos, int type)
35155 +{
35156 + u16 control;
35157 +
35158 + pci_read_config_word(dev, msi_control_reg(pos), &control);
35159 + if (type == PCI_CAP_ID_MSI) {
35160 +		/* Clear the MSI enable bit */
35161 + msi_disable(control);
35162 + pci_write_config_word(dev, msi_control_reg(pos), control);
35163 + dev->msi_enabled = 0;
35164 + } else {
35165 + msix_disable(control);
35166 + pci_write_config_word(dev, msi_control_reg(pos), control);
35167 + dev->msix_enabled = 0;
35168 + }
35169 + if (pci_find_capability(dev, PCI_CAP_ID_EXP)) {
35170 + /* PCI Express Endpoint device detected */
35171 + pci_intx(dev, 1); /* enable intx */
35172 + }
35173 +}
35174 +
35175 +static void enable_msi_mode(struct pci_dev *dev, int pos, int type)
35176 +{
35177 + u16 control;
35178 +
35179 + pci_read_config_word(dev, msi_control_reg(pos), &control);
35180 + if (type == PCI_CAP_ID_MSI) {
35181 + /* Set enabled bits to single MSI & enable MSI_enable bit */
35182 + msi_enable(control, 1);
35183 + pci_write_config_word(dev, msi_control_reg(pos), control);
35184 + dev->msi_enabled = 1;
35185 + } else {
35186 + msix_enable(control);
35187 + pci_write_config_word(dev, msi_control_reg(pos), control);
35188 + dev->msix_enabled = 1;
35189 + }
35190 + if (pci_find_capability(dev, PCI_CAP_ID_EXP)) {
35191 + /* PCI Express Endpoint device detected */
35192 + pci_intx(dev, 0); /* disable intx */
35193 + }
35194 +}
35195 +
35196 +#ifdef CONFIG_PM
35197 +int pci_save_msi_state(struct pci_dev *dev)
35198 +{
35199 + int pos;
35200 +
35201 + pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35202 + if (pos <= 0 || dev->no_msi)
35203 + return 0;
35204 +
35205 + if (!dev->msi_enabled)
35206 + return 0;
35207 +
35208 + /* Restore dev->irq to its default pin-assertion vector */
35209 + msi_unmap_pirq(dev, dev->irq);
35210 + /* Disable MSI mode */
35211 + disable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
35212 + /* Set the flags for use of restore */
35213 + dev->msi_enabled = 1;
35214 + return 0;
35215 +}
35216 +
35217 +void pci_restore_msi_state(struct pci_dev *dev)
35218 +{
35219 + int pos, pirq;
35220 +
35221 + pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35222 + if (pos <= 0)
35223 + return;
35224 +
35225 + if (!dev->msi_enabled)
35226 + return;
35227 +
35228 + pirq = msi_map_pirq_to_vector(dev, dev->irq, 0, 0);
35229 + if (pirq < 0)
35230 + return;
35231 + enable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
35232 +}
35233 +
35234 +int pci_save_msix_state(struct pci_dev *dev)
35235 +{
35236 + int pos;
35237 + unsigned long flags;
35238 + struct msi_dev_list *msi_dev_entry;
35239 + struct msi_pirq_entry *pirq_entry, *tmp;
35240 +
35241 + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35242 + if (pos <= 0 || dev->no_msi)
35243 + return 0;
35244 +
35245 + /* save the capability */
35246 + if (!dev->msix_enabled)
35247 + return 0;
35248 +
35249 + msi_dev_entry = get_msi_dev_pirq_list(dev);
35250 +
35251 + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
35252 + list_for_each_entry_safe(pirq_entry, tmp,
35253 + &msi_dev_entry->pirq_list_head, list)
35254 + msi_unmap_pirq(dev, pirq_entry->pirq);
35255 + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
35256 +
35257 + disable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
35258 + /* Set the flags for use of restore */
35259 + dev->msix_enabled = 1;
35260 +
35261 + return 0;
35262 +}
35263 +
35264 +void pci_restore_msix_state(struct pci_dev *dev)
35265 +{
35266 + int pos;
35267 + unsigned long flags;
35268 + u64 table_base;
35269 + struct msi_dev_list *msi_dev_entry;
35270 + struct msi_pirq_entry *pirq_entry, *tmp;
35271 +
35272 + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35273 + if (pos <= 0)
35274 + return;
35275 +
35276 + if (!dev->msix_enabled)
35277 + return;
35278 +
35279 + msi_dev_entry = get_msi_dev_pirq_list(dev);
35280 + table_base = find_table_base(dev, pos);
35281 + if (!table_base)
35282 + return;
35283 +
35284 + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
35285 + list_for_each_entry_safe(pirq_entry, tmp,
35286 + &msi_dev_entry->pirq_list_head, list) {
35287 + int rc = msi_map_pirq_to_vector(dev, pirq_entry->pirq,
35288 + pirq_entry->entry_nr, table_base);
35289 + if (rc < 0)
35290 + printk(KERN_WARNING
35291 + "%s: re-mapping irq #%d (pirq%d) failed: %d\n",
35292 + pci_name(dev), pirq_entry->entry_nr,
35293 + pirq_entry->pirq, rc);
35294 + }
35295 + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
35296 +
35297 + enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
35298 +}
35299 +#endif
35300 +
35301 +/**
35302 + * msi_capability_init - configure device's MSI capability structure
35303 + * @dev: pointer to the pci_dev data structure of MSI device function
35304 + *
35305 + * Set up the MSI capability structure of the device function with a
35306 + * single MSI vector, regardless of whether the device function is
35307 + * capable of handling multiple messages. A return of zero indicates
35308 + * successful setup of entry zero with the new MSI vector; non-zero otherwise.
35309 + **/
35310 +static int msi_capability_init(struct pci_dev *dev)
35311 +{
35312 + int pos, pirq;
35313 + u16 control;
35314 +
35315 + pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35316 + pci_read_config_word(dev, msi_control_reg(pos), &control);
35317 +
35318 + pirq = msi_map_vector(dev, 0, 0);
35319 + if (pirq < 0)
35320 + return -EBUSY;
35321 +
35322 + dev->irq = pirq;
35323 + /* Set MSI enabled bits */
35324 + enable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
35325 + dev->msi_enabled = 1;
35326 +
35327 + return 0;
35328 +}
35329 +
35330 +/**
35331 + * msix_capability_init - configure device's MSI-X capability
35332 + * @dev: pointer to the pci_dev data structure of MSI-X device function
35333 + * @entries: pointer to an array of struct msix_entry entries
35334 + * @nvec: number of @entries
35335 + *
35336 + * Set up the MSI-X capability structure of the device function with the
35337 + * requested MSI-X entries. A return of zero indicates the successful setup
35338 + * of the requested MSI-X entries with allocated vectors; non-zero otherwise.
35339 + **/
35340 +static int msix_capability_init(struct pci_dev *dev,
35341 + struct msix_entry *entries, int nvec)
35342 +{
35343 + u64 table_base;
35344 + int pirq, i, j, mapped, pos;
35345 + struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev);
35346 + struct msi_pirq_entry *pirq_entry;
35347 +
35348 + if (!msi_dev_entry)
35349 + return -ENOMEM;
35350 +
35351 + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35352 + table_base = find_table_base(dev, pos);
35353 + if (!table_base)
35354 + return -ENODEV;
35355 +
35356 + /* MSI-X Table Initialization */
35357 + for (i = 0; i < nvec; i++) {
35358 + mapped = 0;
35359 + list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) {
35360 + if (pirq_entry->entry_nr == entries[i].entry) {
35361 +			printk(KERN_WARNING "msix entry %d for dev %02x:%02x:%01x was "
35362 +				"not freed before being acquired again\n", entries[i].entry,
35363 + dev->bus->number, PCI_SLOT(dev->devfn),
35364 + PCI_FUNC(dev->devfn));
35365 + (entries + i)->vector = pirq_entry->pirq;
35366 + mapped = 1;
35367 + break;
35368 + }
35369 + }
35370 + if (mapped)
35371 + continue;
35372 + pirq = msi_map_vector(dev, entries[i].entry, table_base);
35373 + if (pirq < 0)
35374 + break;
35375 + attach_pirq_entry(pirq, entries[i].entry, msi_dev_entry);
35376 + (entries + i)->vector = pirq;
35377 + }
35378 +
35379 + if (i != nvec) {
35380 + for (j = --i; j >= 0; j--) {
35381 + msi_unmap_pirq(dev, entries[j].vector);
35382 + detach_pirq_entry(entries[j].entry, msi_dev_entry);
35383 + entries[j].vector = 0;
35384 + }
35385 + return -EBUSY;
35386 + }
35387 +
35388 + enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
35389 + dev->msix_enabled = 1;
35390 +
35391 + return 0;
35392 +}
35393 +
35394 +/**
35395 + * pci_enable_msi - configure device's MSI capability structure
35396 + * @dev: pointer to the pci_dev data structure of MSI device function
35397 + *
35398 + * Set up the MSI capability structure of the device function with a
35399 + * single MSI vector when its driver requests MSI mode to be enabled on
35400 + * the hardware device function. A return of zero indicates successful
35401 + * setup of entry zero with the new MSI vector; a non-zero return
35402 + * indicates failure.
35403 + **/
35404 +extern int pci_frontend_enable_msi(struct pci_dev *dev);
35405 +int pci_enable_msi(struct pci_dev* dev)
35406 +{
35407 + struct pci_bus *bus;
35408 + int pos, temp, status = -EINVAL;
35409 +
35410 + if (!pci_msi_enable || !dev)
35411 + return status;
35412 +
35413 + if (dev->no_msi)
35414 + return status;
35415 +
35416 + for (bus = dev->bus; bus; bus = bus->parent)
35417 + if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI)
35418 + return -EINVAL;
35419 +
35420 + status = msi_init();
35421 + if (status < 0)
35422 + return status;
35423 +
35424 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
35425 + if (!is_initial_xendomain())
35426 + {
35427 + int ret;
35428 +
35429 + temp = dev->irq;
35430 + ret = pci_frontend_enable_msi(dev);
35431 + if (ret)
35432 + return ret;
35433 +
35434 + dev->irq = evtchn_map_pirq(-1, dev->irq);
35435 + dev->irq_old = temp;
35436 +
35437 + return ret;
35438 + }
35439 +#endif
35440 +
35441 + temp = dev->irq;
35442 +
35443 + pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35444 + if (!pos)
35445 + return -EINVAL;
35446 +
35447 +	/* Check whether the driver already requested MSI-X vectors */
35448 + if (dev->msix_enabled) {
35449 + printk(KERN_INFO "PCI: %s: Can't enable MSI. "
35450 + "Device already has MSI-X vectors assigned\n",
35451 + pci_name(dev));
35452 + dev->irq = temp;
35453 + return -EINVAL;
35454 + }
35455 +
35456 + status = msi_capability_init(dev);
35457 + if ( !status )
35458 + dev->irq_old = temp;
35459 + else
35460 + dev->irq = temp;
35461 +
35462 + return status;
35463 +}
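/*
 * Minimal driver-side sketch (editor's illustration, not part of the patch):
 * enable MSI where available and fall back to the pin-based interrupt
 * otherwise.  The "foo" helper name is hypothetical.
 */
static int foo_setup_irq(struct pci_dev *pdev)
{
	int rc = pci_enable_msi(pdev);

	if (rc)
		dev_info(&pdev->dev, "MSI unavailable, using INTx\n");
	/* In either case pdev->irq now holds the vector/irq to hand to
	 * request_irq(); pci_disable_msi(pdev) undoes the setup. */
	return 0;
}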
35464 +
35465 +extern void pci_frontend_disable_msi(struct pci_dev* dev);
35466 +void pci_disable_msi(struct pci_dev* dev)
35467 +{
35468 + int pos;
35469 + int pirq;
35470 +
35471 + if (!pci_msi_enable)
35472 + return;
35473 + if (!dev)
35474 + return;
35475 +
35476 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
35477 + if (!is_initial_xendomain()) {
35478 + evtchn_map_pirq(dev->irq, 0);
35479 + pci_frontend_disable_msi(dev);
35480 + dev->irq = dev->irq_old;
35481 + return;
35482 + }
35483 +#endif
35484 +
35485 + pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
35486 + if (!pos)
35487 + return;
35488 +
35489 + pirq = dev->irq;
35490 + /* Restore dev->irq to its default pin-assertion vector */
35491 + dev->irq = dev->irq_old;
35492 + msi_unmap_pirq(dev, pirq);
35493 +
35494 + /* Disable MSI mode */
35495 + disable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
35496 +}
35497 +
35498 +/**
35499 + * pci_enable_msix - configure device's MSI-X capability structure
35500 + * @dev: pointer to the pci_dev data structure of MSI-X device function
35501 + * @entries: pointer to an array of MSI-X entries
35502 + * @nvec: number of MSI-X vectors requested for allocation by device driver
35503 + *
35504 + * Set up the MSI-X capability structure of the device function with the
35505 + * number of requested vectors when its driver requests MSI-X mode to be
35506 + * enabled on the hardware device function. A return of zero indicates
35507 + * successful configuration of the MSI-X capability structure with newly
35508 + * allocated MSI-X vectors. A return of < 0 indicates a failure, while a
35509 + * return of > 0 indicates that the driver requested more vectors than are
35510 + * available; the driver should use the returned value to re-send its
35511 + * request.
35512 + **/
35513 +extern int pci_frontend_enable_msix(struct pci_dev *dev,
35514 + struct msix_entry *entries, int nvec);
35515 +int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec)
35516 +{
35517 + struct pci_bus *bus;
35518 + int status, pos, nr_entries;
35519 + int i, j, temp;
35520 + u16 control;
35521 +
35522 + if (!pci_msi_enable || !dev || !entries)
35523 + return -EINVAL;
35524 +
35525 + if (dev->no_msi)
35526 + return -EINVAL;
35527 +
35528 + for (bus = dev->bus; bus; bus = bus->parent)
35529 + if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI)
35530 + return -EINVAL;
35531 +
35532 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
35533 + if (!is_initial_xendomain()) {
35534 + struct msi_dev_list *msi_dev_entry;
35535 + struct msi_pirq_entry *pirq_entry;
35536 + int ret, irq;
35537 +
35538 + ret = pci_frontend_enable_msix(dev, entries, nvec);
35539 + if (ret) {
35540 +			printk("pci_frontend_enable_msix failed: %x\n", ret);
35541 + return ret;
35542 + }
35543 +
35544 + msi_dev_entry = get_msi_dev_pirq_list(dev);
35545 + for (i = 0; i < nvec; i++) {
35546 + int mapped = 0;
35547 +
35548 + list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) {
35549 + if (pirq_entry->entry_nr == entries[i].entry) {
35550 + irq = pirq_entry->pirq;
35551 + BUG_ON(entries[i].vector != evtchn_get_xen_pirq(irq));
35552 + entries[i].vector = irq;
35553 + mapped = 1;
35554 + break;
35555 + }
35556 + }
35557 + if (mapped)
35558 + continue;
35559 + irq = evtchn_map_pirq(-1, entries[i].vector);
35560 + attach_pirq_entry(irq, entries[i].entry, msi_dev_entry);
35561 + entries[i].vector = irq;
35562 + }
35563 + return 0;
35564 + }
35565 +#endif
35566 +
35567 + status = msi_init();
35568 + if (status < 0)
35569 + return status;
35570 +
35571 + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35572 + if (!pos)
35573 + return -EINVAL;
35574 +
35575 + pci_read_config_word(dev, msi_control_reg(pos), &control);
35576 + nr_entries = multi_msix_capable(control);
35577 + if (nvec > nr_entries)
35578 + return -EINVAL;
35579 +
35580 + /* Check for any invalid entries */
35581 + for (i = 0; i < nvec; i++) {
35582 + if (entries[i].entry >= nr_entries)
35583 + return -EINVAL; /* invalid entry */
35584 + for (j = i + 1; j < nvec; j++) {
35585 + if (entries[i].entry == entries[j].entry)
35586 + return -EINVAL; /* duplicate entry */
35587 + }
35588 + }
35589 +
35590 + temp = dev->irq;
35591 +	/* Check whether the driver already requested an MSI vector */
35592 + if (dev->msi_enabled) {
35593 + printk(KERN_INFO "PCI: %s: Can't enable MSI-X. "
35594 + "Device already has an MSI vector assigned\n",
35595 + pci_name(dev));
35596 + dev->irq = temp;
35597 + return -EINVAL;
35598 + }
35599 +
35600 + status = msix_capability_init(dev, entries, nvec);
35601 +
35602 + if ( !status )
35603 + dev->irq_old = temp;
35604 + else
35605 + dev->irq = temp;
35606 +
35607 + return status;
35608 +}
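/*
 * Minimal driver-side sketch (editor's illustration, not part of the patch):
 * request a hypothetical four MSI-X vectors following the contract in the
 * kernel-doc above.  The "foo" helper name is a placeholder.
 */
static int foo_setup_msix(struct pci_dev *pdev)
{
	struct msix_entry entries[4];
	int i, rc;

	for (i = 0; i < 4; i++)
		entries[i].entry = i;		/* MSI-X table slots 0..3 */

	rc = pci_enable_msix(pdev, entries, 4);
	if (rc == 0) {
		/* entries[i].vector now holds the irq for each slot */
		return 0;
	}
	/* Per the documentation above: rc < 0 is a hard failure, rc > 0 means
	 * fewer vectors are available and the driver may retry with rc. */
	return rc;
}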
35609 +
35610 +extern void pci_frontend_disable_msix(struct pci_dev* dev);
35611 +void pci_disable_msix(struct pci_dev* dev)
35612 +{
35613 + int pos;
35614 + u16 control;
35615 +
35616 +
35617 + if (!pci_msi_enable)
35618 + return;
35619 + if (!dev)
35620 + return;
35621 +
35622 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
35623 + if (!is_initial_xendomain()) {
35624 + struct msi_dev_list *msi_dev_entry;
35625 + struct msi_pirq_entry *pirq_entry, *tmp;
35626 +
35627 + pci_frontend_disable_msix(dev);
35628 +
35629 + msi_dev_entry = get_msi_dev_pirq_list(dev);
35630 + list_for_each_entry_safe(pirq_entry, tmp,
35631 + &msi_dev_entry->pirq_list_head, list) {
35632 + evtchn_map_pirq(pirq_entry->pirq, 0);
35633 + list_del(&pirq_entry->list);
35634 + kfree(pirq_entry);
35635 + }
35636 +
35637 + dev->irq = dev->irq_old;
35638 + return;
35639 + }
35640 +#endif
35641 +
35642 + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
35643 + if (!pos)
35644 + return;
35645 +
35646 + pci_read_config_word(dev, msi_control_reg(pos), &control);
35647 + if (!(control & PCI_MSIX_FLAGS_ENABLE))
35648 + return;
35649 +
35650 + msi_remove_pci_irq_vectors(dev);
35651 +
35652 + /* Disable MSI mode */
35653 + disable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
35654 +}
35655 +
35656 +/**
35657 + * msi_remove_pci_irq_vectors - reclaim MSI(X) vectors to unused state
35658 + * @dev: pointer to the pci_dev data structure of MSI(X) device function
35659 + *
35660 + * Called during hotplug remove, when the device function is
35661 + * hot-removed. All previously assigned MSI/MSI-X vectors, if
35662 + * allocated for this device function, are reclaimed to the unused
35663 + * state and may be reused later on.
35664 + **/
35665 +void msi_remove_pci_irq_vectors(struct pci_dev* dev)
35666 +{
35667 + unsigned long flags;
35668 + struct msi_dev_list *msi_dev_entry;
35669 + struct msi_pirq_entry *pirq_entry, *tmp;
35670 +
35671 + if (!pci_msi_enable || !dev)
35672 + return;
35673 +
35674 + msi_dev_entry = get_msi_dev_pirq_list(dev);
35675 +
35676 + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
35677 + if (!list_empty(&msi_dev_entry->pirq_list_head))
35678 + {
35679 + printk(KERN_WARNING "msix pirqs for dev %02x:%02x:%01x are not freed \
35680 +		printk(KERN_WARNING "msix pirqs for dev %02x:%02x:%01x were not freed "
35681 +			"before being acquired again\n", dev->bus->number, PCI_SLOT(dev->devfn),
35682 + list_for_each_entry_safe(pirq_entry, tmp,
35683 + &msi_dev_entry->pirq_list_head, list) {
35684 + msi_unmap_pirq(dev, pirq_entry->pirq);
35685 + list_del(&pirq_entry->list);
35686 + kfree(pirq_entry);
35687 + }
35688 + }
35689 + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
35690 + dev->irq = dev->irq_old;
35691 +}
35692 +
35693 +void pci_no_msi(void)
35694 +{
35695 + pci_msi_enable = 0;
35696 +}
35697 +
35698 +EXPORT_SYMBOL(pci_enable_msi);
35699 +EXPORT_SYMBOL(pci_disable_msi);
35700 +EXPORT_SYMBOL(pci_enable_msix);
35701 +EXPORT_SYMBOL(pci_disable_msix);
35702 +#ifdef CONFIG_XEN
35703 +EXPORT_SYMBOL(register_msi_get_owner);
35704 +EXPORT_SYMBOL(unregister_msi_get_owner);
35705 +#endif
35706 +
35707 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/agp.h
35708 ===================================================================
35709 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
35710 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/agp.h 2007-06-22 09:08:06.000000000 +0200
35711 @@ -0,0 +1,44 @@
35712 +#ifndef AGP_H
35713 +#define AGP_H 1
35714 +
35715 +#include <asm/pgtable.h>
35716 +#include <asm/cacheflush.h>
35717 +#include <asm/system.h>
35718 +
35719 +/*
35720 + * Functions to keep the agpgart mappings coherent with the MMU.
35721 + * The GART gives the CPU a physical alias of pages in memory. The alias region is
35722 + * mapped uncacheable. Make sure there are no conflicting mappings
35723 + * with different cachability attributes for the same page. This avoids
35724 + * data corruption on some CPUs.
35725 + */
35726 +
35727 +/* Caller's responsibility to call global_flush_tlb() for
35728 + * performance reasons */
35729 +#define map_page_into_agp(page) ( \
35730 + xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \
35731 + ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE))
35732 +#define unmap_page_from_agp(page) ( \
35733 + xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \
35734 + /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
35735 + change_page_attr(page, 1, PAGE_KERNEL))
35736 +#define flush_agp_mappings() global_flush_tlb()
35737 +
35738 +/* Could use CLFLUSH here if the cpu supports it. But then it would
35739 + need to be called for each cacheline of the whole page so it may not be
35740 + worth it. Would need a page for it. */
35741 +#define flush_agp_cache() wbinvd()
35742 +
35743 +/* Convert a physical address to an address suitable for the GART. */
35744 +#define phys_to_gart(x) phys_to_machine(x)
35745 +#define gart_to_phys(x) machine_to_phys(x)
35746 +
35747 +/* GATT allocation. Returns/accepts GATT kernel virtual address. */
35748 +#define alloc_gatt_pages(order) ({ \
35749 + char *_t; dma_addr_t _d; \
35750 + _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL); \
35751 + _t; })
35752 +#define free_gatt_pages(table, order) \
35753 + dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table))
35754 +
35755 +#endif
35756 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/desc_32.h
35757 ===================================================================
35758 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
35759 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/desc_32.h 2008-01-28 12:24:19.000000000 +0100
35760 @@ -0,0 +1,166 @@
35761 +#ifndef __ARCH_DESC_H
35762 +#define __ARCH_DESC_H
35763 +
35764 +#include <asm/ldt.h>
35765 +#include <asm/segment.h>
35766 +
35767 +#define CPU_16BIT_STACK_SIZE 1024
35768 +
35769 +#ifndef __ASSEMBLY__
35770 +
35771 +#include <linux/preempt.h>
35772 +#include <linux/smp.h>
35773 +
35774 +#include <asm/mmu.h>
35775 +
35776 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
35777 +
35778 +DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
35779 +
35780 +struct Xgt_desc_struct {
35781 + unsigned short size;
35782 + unsigned long address __attribute__((packed));
35783 + unsigned short pad;
35784 +} __attribute__ ((packed));
35785 +
35786 +extern struct Xgt_desc_struct idt_descr;
35787 +DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
35788 +
35789 +
35790 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
35791 +{
35792 + return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
35793 +}
35794 +
35795 +#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
35796 +#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
35797 +
35798 +#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
35799 +#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
35800 +#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
35801 +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
35802 +
35803 +#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
35804 +#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
35805 +#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
35806 +#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
35807 +
35808 +/*
35809 + * This is the ldt that every process will get unless we need
35810 + * something other than this.
35811 + */
35812 +extern struct desc_struct default_ldt[];
35813 +extern void set_intr_gate(unsigned int irq, void * addr);
35814 +
35815 +#define _set_tssldt_desc(n,addr,limit,type) \
35816 +__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
35817 + "movw %w1,2(%2)\n\t" \
35818 + "rorl $16,%1\n\t" \
35819 + "movb %b1,4(%2)\n\t" \
35820 + "movb %4,5(%2)\n\t" \
35821 + "movb $0,6(%2)\n\t" \
35822 + "movb %h1,7(%2)\n\t" \
35823 + "rorl $16,%1" \
35824 + : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
35825 +
35826 +#ifndef CONFIG_X86_NO_TSS
35827 +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
35828 +{
35829 + _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
35830 + offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
35831 +}
35832 +
35833 +#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
35834 +#endif
35835 +
35836 +static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
35837 +{
35838 + _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
35839 +}
35840 +
35841 +#define LDT_entry_a(info) \
35842 + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
35843 +
35844 +#define LDT_entry_b(info) \
35845 + (((info)->base_addr & 0xff000000) | \
35846 + (((info)->base_addr & 0x00ff0000) >> 16) | \
35847 + ((info)->limit & 0xf0000) | \
35848 + (((info)->read_exec_only ^ 1) << 9) | \
35849 + ((info)->contents << 10) | \
35850 + (((info)->seg_not_present ^ 1) << 15) | \
35851 + ((info)->seg_32bit << 22) | \
35852 + ((info)->limit_in_pages << 23) | \
35853 + ((info)->useable << 20) | \
35854 + 0x7000)
35855 +
35856 +#define LDT_empty(info) (\
35857 + (info)->base_addr == 0 && \
35858 + (info)->limit == 0 && \
35859 + (info)->contents == 0 && \
35860 + (info)->read_exec_only == 1 && \
35861 + (info)->seg_32bit == 0 && \
35862 + (info)->limit_in_pages == 0 && \
35863 + (info)->seg_not_present == 1 && \
35864 + (info)->useable == 0 )
35865 +
35866 +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
35867 +
35868 +#if TLS_SIZE != 24
35869 +# error update this code.
35870 +#endif
35871 +
35872 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
35873 +{
35874 +#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \
35875 + *(u64 *)&t->tls_array[i])) \
35876 + BUG();
35877 + C(0); C(1); C(2);
35878 +#undef C
35879 +}
35880 +
35881 +static inline void clear_LDT(void)
35882 +{
35883 + int cpu = get_cpu();
35884 +
35885 + /*
35886 + * NB. We load the default_ldt for lcall7/27 handling on demand, as
35887 + * it slows down context switching. No one uses it anyway.
35888 + */
35889 + cpu = cpu; /* XXX avoid compiler warning */
35890 + xen_set_ldt(NULL, 0);
35891 + put_cpu();
35892 +}
35893 +
35894 +/*
35895 + * load one particular LDT into the current CPU
35896 + */
35897 +static inline void load_LDT_nolock(mm_context_t *pc, int cpu)
35898 +{
35899 + void *segments = pc->ldt;
35900 + int count = pc->size;
35901 +
35902 + if (likely(!count))
35903 + segments = NULL;
35904 +
35905 + xen_set_ldt(segments, count);
35906 +}
35907 +
35908 +static inline void load_LDT(mm_context_t *pc)
35909 +{
35910 + int cpu = get_cpu();
35911 + load_LDT_nolock(pc, cpu);
35912 + put_cpu();
35913 +}
35914 +
35915 +static inline unsigned long get_desc_base(unsigned long *desc)
35916 +{
35917 + unsigned long base;
35918 + base = ((desc[0] >> 16) & 0x0000ffff) |
35919 + ((desc[1] << 16) & 0x00ff0000) |
35920 + (desc[1] & 0xff000000);
35921 + return base;
35922 +}
35923 +
35924 +#endif /* !__ASSEMBLY__ */
35925 +
35926 +#endif
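The LDT_entry_a()/LDT_entry_b() macros and get_desc_base() above pack and unpack the base and limit fields that an x86 segment descriptor scatters across two 32-bit words. A standalone sketch that mirrors the same shifts (access-rights bits left out for clarity; not the kernel code itself) and verifies the round trip:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same bit layout as LDT_entry_a()/LDT_entry_b() in desc_32.h above. */
static void pack_desc(uint32_t base, uint32_t limit, uint32_t desc[2])
{
        desc[0] = ((base & 0x0000ffff) << 16) | (limit & 0x0ffff);
        desc[1] = (base & 0xff000000) |
                  ((base & 0x00ff0000) >> 16) |
                  (limit & 0xf0000);
}

/* Mirror of get_desc_base(): reassemble the scattered base field. */
static uint32_t desc_base(const uint32_t desc[2])
{
        return ((desc[0] >> 16) & 0x0000ffff) |
               ((desc[1] << 16) & 0x00ff0000) |
               (desc[1] & 0xff000000);
}

int main(void)
{
        uint32_t desc[2];

        pack_desc(0x12345678, 0xabcde, desc);
        printf("a=%08x b=%08x base=%08x\n", desc[0], desc[1], desc_base(desc));
        assert(desc_base(desc) == 0x12345678);
        return 0;
}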
35927 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_32.h
35928 ===================================================================
35929 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
35930 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2008-04-02 12:34:02.000000000 +0200
35931 @@ -0,0 +1,151 @@
35932 +#ifndef _ASM_I386_DMA_MAPPING_H
35933 +#define _ASM_I386_DMA_MAPPING_H
35934 +
35935 +/*
35936 + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
35937 + * documentation.
35938 + */
35939 +
35940 +#include <linux/mm.h>
35941 +#include <asm/cache.h>
35942 +#include <asm/io.h>
35943 +#include <asm/scatterlist.h>
35944 +#include <asm/swiotlb.h>
35945 +
35946 +static inline int
35947 +address_needs_mapping(struct device *hwdev, dma_addr_t addr)
35948 +{
35949 + dma_addr_t mask = 0xffffffff;
35950 + /* If the device has a mask, use it, otherwise default to 32 bits */
35951 + if (hwdev && hwdev->dma_mask)
35952 + mask = *hwdev->dma_mask;
35953 + return (addr & ~mask) != 0;
35954 +}
35955 +
35956 +extern int range_straddles_page_boundary(paddr_t p, size_t size);
35957 +
35958 +#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
35959 +#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
35960 +
35961 +void *dma_alloc_coherent(struct device *dev, size_t size,
35962 + dma_addr_t *dma_handle, gfp_t flag);
35963 +
35964 +void dma_free_coherent(struct device *dev, size_t size,
35965 + void *vaddr, dma_addr_t dma_handle);
35966 +
35967 +extern dma_addr_t
35968 +dma_map_single(struct device *dev, void *ptr, size_t size,
35969 + enum dma_data_direction direction);
35970 +
35971 +extern void
35972 +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
35973 + enum dma_data_direction direction);
35974 +
35975 +extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
35976 + int nents, enum dma_data_direction direction);
35977 +extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg,
35978 + int nents, enum dma_data_direction direction);
35979 +
35980 +#ifdef CONFIG_HIGHMEM
35981 +extern dma_addr_t
35982 +dma_map_page(struct device *dev, struct page *page, unsigned long offset,
35983 + size_t size, enum dma_data_direction direction);
35984 +
35985 +extern void
35986 +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
35987 + enum dma_data_direction direction);
35988 +#else
35989 +#define dma_map_page(dev, page, offset, size, dir) \
35990 + dma_map_single(dev, page_address(page) + (offset), (size), (dir))
35991 +#define dma_unmap_page dma_unmap_single
35992 +#endif
35993 +
35994 +extern void
35995 +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
35996 + enum dma_data_direction direction);
35997 +
35998 +extern void
35999 +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
36000 + enum dma_data_direction direction);
36001 +
36002 +static inline void
36003 +dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
36004 + unsigned long offset, size_t size,
36005 + enum dma_data_direction direction)
36006 +{
36007 + dma_sync_single_for_cpu(dev, dma_handle+offset, size, direction);
36008 +}
36009 +
36010 +static inline void
36011 +dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
36012 + unsigned long offset, size_t size,
36013 + enum dma_data_direction direction)
36014 +{
36015 + dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
36016 +}
36017 +
36018 +static inline void
36019 +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
36020 + enum dma_data_direction direction)
36021 +{
36022 + if (swiotlb)
36023 + swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
36024 + flush_write_buffers();
36025 +}
36026 +
36027 +static inline void
36028 +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
36029 + enum dma_data_direction direction)
36030 +{
36031 + if (swiotlb)
36032 + swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
36033 + flush_write_buffers();
36034 +}
36035 +
36036 +extern int
36037 +dma_mapping_error(dma_addr_t dma_addr);
36038 +
36039 +extern int
36040 +dma_supported(struct device *dev, u64 mask);
36041 +
36042 +static inline int
36043 +dma_set_mask(struct device *dev, u64 mask)
36044 +{
36045 + if(!dev->dma_mask || !dma_supported(dev, mask))
36046 + return -EIO;
36047 +
36048 + *dev->dma_mask = mask;
36049 +
36050 + return 0;
36051 +}
36052 +
36053 +static inline int
36054 +dma_get_cache_alignment(void)
36055 +{
36056 + /* no easy way to get cache size on all x86, so return the
36057 + * maximum possible, to be safe */
36058 + return (1 << INTERNODE_CACHE_SHIFT);
36059 +}
36060 +
36061 +#define dma_is_consistent(d) (1)
36062 +
36063 +static inline void
36064 +dma_cache_sync(void *vaddr, size_t size,
36065 + enum dma_data_direction direction)
36066 +{
36067 + flush_write_buffers();
36068 +}
36069 +
36070 +#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
36071 +extern int
36072 +dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
36073 + dma_addr_t device_addr, size_t size, int flags);
36074 +
36075 +extern void
36076 +dma_release_declared_memory(struct device *dev);
36077 +
36078 +extern void *
36079 +dma_mark_declared_memory_occupied(struct device *dev,
36080 + dma_addr_t device_addr, size_t size);
36081 +
36082 +#endif
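address_needs_mapping() above decides whether a bus address is reachable by a device by testing it against the device's DMA mask, defaulting to 32 bits when none is set. A standalone sketch of the same test, using a hypothetical fake_device struct rather than the kernel's struct device:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t dma_addr_t;

struct fake_device {                    /* hypothetical stand-in for struct device */
        dma_addr_t *dma_mask;           /* NULL means "no mask set" */
};

/* Same test as address_needs_mapping(): an address needs bounce-buffer
 * help iff it has bits set above the device mask. */
static bool needs_mapping(const struct fake_device *dev, dma_addr_t addr)
{
        dma_addr_t mask = 0xffffffff;   /* default: 32-bit device */

        if (dev && dev->dma_mask)
                mask = *dev->dma_mask;
        return (addr & ~mask) != 0;
}

int main(void)
{
        dma_addr_t mask64 = ~0ULL;
        struct fake_device dev32 = { .dma_mask = NULL };
        struct fake_device dev64 = { .dma_mask = &mask64 };

        printf("%d\n", needs_mapping(&dev32, 0x1ffffffffULL)); /* 1: above 4 GiB */
        printf("%d\n", needs_mapping(&dev64, 0x1ffffffffULL)); /* 0: fits mask */
        return 0;
}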
36083 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_32.h
36084 ===================================================================
36085 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
36086 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_32.h 2007-06-12 13:14:02.000000000 +0200
36087 @@ -0,0 +1,155 @@
36088 +/*
36089 + * fixmap.h: compile-time virtual memory allocation
36090 + *
36091 + * This file is subject to the terms and conditions of the GNU General Public
36092 + * License. See the file "COPYING" in the main directory of this archive
36093 + * for more details.
36094 + *
36095 + * Copyright (C) 1998 Ingo Molnar
36096 + *
36097 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
36098 + */
36099 +
36100 +#ifndef _ASM_FIXMAP_H
36101 +#define _ASM_FIXMAP_H
36102 +
36103 +
36104 +/* used by vmalloc.c, vsyscall.lds.S.
36105 + *
36106 + * Leave one empty page between vmalloc'ed areas and
36107 + * the start of the fixmap.
36108 + */
36109 +extern unsigned long __FIXADDR_TOP;
36110 +
36111 +#ifndef __ASSEMBLY__
36112 +#include <linux/kernel.h>
36113 +#include <asm/acpi.h>
36114 +#include <asm/apicdef.h>
36115 +#include <asm/page.h>
36116 +#ifdef CONFIG_HIGHMEM
36117 +#include <linux/threads.h>
36118 +#include <asm/kmap_types.h>
36119 +#endif
36120 +
36121 +/*
36122 + * Here we define all the compile-time 'special' virtual
36123 + * addresses. The point is to have a constant address at
36124 + * compile time, but to set the physical address only
36125 + * in the boot process. We allocate these special addresses
36126 + * from the end of virtual memory (0xfffff000) backwards.
36127 + * Also this lets us do fail-safe vmalloc(): we
36128 + * can guarantee that these special addresses and
36129 + * vmalloc()-ed addresses never overlap.
36130 + *
36131 + * these 'compile-time allocated' memory buffers are
36132 + * fixed-size 4k pages (or larger if used with an increment
36133 + * higher than 1); use fixmap_set(idx,phys) to associate
36134 + * physical memory with fixmap indices.
36135 + *
36136 + * TLB entries of such buffers will not be flushed across
36137 + * task switches.
36138 + */
36139 +enum fixed_addresses {
36140 + FIX_HOLE,
36141 + FIX_VDSO,
36142 +#ifdef CONFIG_X86_LOCAL_APIC
36143 + FIX_APIC_BASE, /* local (CPU) APIC -- required for SMP or not */
36144 +#endif
36145 +#ifdef CONFIG_X86_IO_APIC
36146 + FIX_IO_APIC_BASE_0,
36147 + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
36148 +#endif
36149 +#ifdef CONFIG_X86_VISWS_APIC
36150 + FIX_CO_CPU, /* Cobalt timer */
36151 + FIX_CO_APIC, /* Cobalt APIC Redirection Table */
36152 + FIX_LI_PCIA, /* Lithium PCI Bridge A */
36153 + FIX_LI_PCIB, /* Lithium PCI Bridge B */
36154 +#endif
36155 +#ifdef CONFIG_X86_F00F_BUG
36156 + FIX_F00F_IDT, /* Virtual mapping for IDT */
36157 +#endif
36158 +#ifdef CONFIG_X86_CYCLONE_TIMER
36159 + FIX_CYCLONE_TIMER, /* cyclone timer register */
36160 +#endif
36161 +#ifdef CONFIG_HIGHMEM
36162 + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
36163 + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
36164 +#endif
36165 +#ifdef CONFIG_ACPI
36166 + FIX_ACPI_BEGIN,
36167 + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
36168 +#endif
36169 +#ifdef CONFIG_PCI_MMCONFIG
36170 + FIX_PCIE_MCFG,
36171 +#endif
36172 + FIX_SHARED_INFO,
36173 +#define NR_FIX_ISAMAPS 256
36174 + FIX_ISAMAP_END,
36175 + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
36176 + __end_of_permanent_fixed_addresses,
36177 + /* temporary boot-time mappings, used before ioremap() is functional */
36178 +#define NR_FIX_BTMAPS 16
36179 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
36180 + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
36181 + FIX_WP_TEST,
36182 + __end_of_fixed_addresses
36183 +};
36184 +
36185 +extern void set_fixaddr_top(unsigned long top);
36186 +
36187 +extern void __set_fixmap(enum fixed_addresses idx,
36188 + maddr_t phys, pgprot_t flags);
36189 +
36190 +#define set_fixmap(idx, phys) \
36191 + __set_fixmap(idx, phys, PAGE_KERNEL)
36192 +/*
36193 + * Some hardware wants to get fixmapped without caching.
36194 + */
36195 +#define set_fixmap_nocache(idx, phys) \
36196 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
36197 +
36198 +#define clear_fixmap(idx) \
36199 + __set_fixmap(idx, 0, __pgprot(0))
36200 +
36201 +#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
36202 +
36203 +#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
36204 +#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
36205 +#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
36206 +#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
36207 +
36208 +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
36209 +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
36210 +
36211 +extern void __this_fixmap_does_not_exist(void);
36212 +
36213 +/*
36214 + * 'index to address' translation. If anyone tries to use the idx
36215 + * directly without translation, we catch the bug with a NULL-dereference
36216 + * kernel oops. Illegal ranges of incoming indices are caught too.
36217 + */
36218 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
36219 +{
36220 + /*
36221 + * this branch gets completely eliminated after inlining,
36222 + * except when someone tries to use fixaddr indices in an
36223 + * illegal way. (such as mixing up address types or using
36224 + * out-of-range indices).
36225 + *
36226 + * If it doesn't get removed, the linker will complain
36227 + * loudly with a reasonably clear error message..
36228 + */
36229 + if (idx >= __end_of_fixed_addresses)
36230 + __this_fixmap_does_not_exist();
36231 +
36232 + return __fix_to_virt(idx);
36233 +}
36234 +
36235 +static inline unsigned long virt_to_fix(const unsigned long vaddr)
36236 +{
36237 + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
36238 + return __virt_to_fix(vaddr);
36239 +}
36240 +
36241 +#endif /* !__ASSEMBLY__ */
36242 +#endif
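__fix_to_virt() above turns a fixmap index into a virtual address by counting pages downward from FIXADDR_TOP, and __virt_to_fix() reverses that. A standalone sketch of the same arithmetic; FIXADDR_TOP = 0xfffff000 and PAGE_SHIFT = 12 are illustrative constants taken from the comment above, whereas the real __FIXADDR_TOP is a variable the kernel may lower at boot via set_fixaddr_top():

#include <stdio.h>

#define PAGE_SHIFT   12
#define PAGE_SIZE    (1UL << PAGE_SHIFT)
#define PAGE_MASK    (~(PAGE_SIZE - 1))

/* Assumed value for the sketch only. */
#define FIXADDR_TOP  0xfffff000UL

/* Mirrors __fix_to_virt()/__virt_to_fix(): index 0 is the page at
 * FIXADDR_TOP, index 1 the page below it, and so on downwards. */
#define fix_to_virt(idx)  (FIXADDR_TOP - ((unsigned long)(idx) << PAGE_SHIFT))
#define virt_to_fix(va)   ((FIXADDR_TOP - ((va) & PAGE_MASK)) >> PAGE_SHIFT)

int main(void)
{
        unsigned int idx;

        for (idx = 0; idx < 4; idx++) {
                unsigned long va = fix_to_virt(idx);

                printf("idx %u -> va %#lx -> idx %lu\n",
                       idx, va, virt_to_fix(va));
        }
        return 0;
}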
36243 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/gnttab_dma.h
36244 ===================================================================
36245 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
36246 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/gnttab_dma.h 2007-08-06 15:10:49.000000000 +0200
36247 @@ -0,0 +1,41 @@
36248 +/*
36249 + * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
36250 + * Copyright (c) 2007 Isaku Yamahata <yamahata at valinux co jp>
36251 + * VA Linux Systems Japan K.K.
36252 + *
36253 + * This program is free software; you can redistribute it and/or modify
36254 + * it under the terms of the GNU General Public License as published by
36255 + * the Free Software Foundation; either version 2 of the License, or
36256 + * (at your option) any later version.
36257 + *
36258 + * This program is distributed in the hope that it will be useful,
36259 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
36260 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
36261 + * GNU General Public License for more details.
36262 + *
36263 + * You should have received a copy of the GNU General Public License
36264 + * along with this program; if not, write to the Free Software
36265 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
36266 + */
36267 +
36268 +#ifndef _ASM_I386_GNTTAB_DMA_H
36269 +#define _ASM_I386_GNTTAB_DMA_H
36270 +
36271 +static inline int gnttab_dma_local_pfn(struct page *page)
36272 +{
36273 + /* Has it become a local MFN? */
36274 + return pfn_valid(mfn_to_local_pfn(pfn_to_mfn(page_to_pfn(page))));
36275 +}
36276 +
36277 +static inline maddr_t gnttab_dma_map_page(struct page *page)
36278 +{
36279 + __gnttab_dma_map_page(page);
36280 + return ((maddr_t)pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT);
36281 +}
36282 +
36283 +static inline void gnttab_dma_unmap_page(maddr_t maddr)
36284 +{
36285 + __gnttab_dma_unmap_page(virt_to_page(bus_to_virt(maddr)));
36286 +}
36287 +
36288 +#endif /* _ASM_I386_GNTTAB_DMA_H */
36289 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/highmem.h
36290 ===================================================================
36291 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
36292 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/highmem.h 2008-10-29 09:55:56.000000000 +0100
36293 @@ -0,0 +1,97 @@
36294 +/*
36295 + * highmem.h: virtual kernel memory mappings for high memory
36296 + *
36297 + * Used in CONFIG_HIGHMEM systems for memory pages which
36298 + * are not addressable by direct kernel virtual addresses.
36299 + *
36300 + * Copyright (C) 1999 Gerhard Wichert, Siemens AG
36301 + * Gerhard.Wichert@pdb.siemens.de
36302 + *
36303 + *
36304 + * Redesigned the x86 32-bit VM architecture to deal with
36305 + * up to 16 Terabyte physical memory. With current x86 CPUs
36306 + * we now support up to 64 Gigabytes physical RAM.
36307 + *
36308 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
36309 + */
36310 +
36311 +#ifndef _ASM_HIGHMEM_H
36312 +#define _ASM_HIGHMEM_H
36313 +
36314 +#ifdef __KERNEL__
36315 +
36316 +#include <linux/interrupt.h>
36317 +#include <linux/threads.h>
36318 +#include <asm/kmap_types.h>
36319 +#include <asm/tlbflush.h>
36320 +
36321 +/* declarations for highmem.c */
36322 +extern unsigned long highstart_pfn, highend_pfn;
36323 +
36324 +extern pte_t *kmap_pte;
36325 +extern pgprot_t kmap_prot;
36326 +extern pte_t *pkmap_page_table;
36327 +
36328 +/*
36329 + * Right now we initialize only a single pte table. It can be extended
36330 + * easily; subsequent pte tables have to be allocated in one physical
36331 + * chunk of RAM.
36332 + */
36333 +#ifdef CONFIG_X86_PAE
36334 +#define LAST_PKMAP 512
36335 +#else
36336 +#define LAST_PKMAP 1024
36337 +#endif
36338 +/*
36339 + * Ordering is:
36340 + *
36341 + * FIXADDR_TOP
36342 + * fixed_addresses
36343 + * FIXADDR_START
36344 + * temp fixed addresses
36345 + * FIXADDR_BOOT_START
36346 + * Persistent kmap area
36347 + * PKMAP_BASE
36348 + * VMALLOC_END
36349 + * Vmalloc area
36350 + * VMALLOC_START
36351 + * high_memory
36352 + */
36353 +#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
36354 +#define LAST_PKMAP_MASK (LAST_PKMAP-1)
36355 +#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
36356 +#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
36357 +
36358 +extern void * FASTCALL(kmap_high(struct page *page));
36359 +extern void FASTCALL(kunmap_high(struct page *page));
36360 +
36361 +void *kmap(struct page *page);
36362 +void kunmap(struct page *page);
36363 +void *kmap_atomic(struct page *page, enum km_type type);
36364 +void *kmap_atomic_pte(struct page *page, enum km_type type);
36365 +void kunmap_atomic(void *kvaddr, enum km_type type);
36366 +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
36367 +struct page *kmap_atomic_to_page(void *ptr);
36368 +
36369 +#define flush_cache_kmaps() do { } while (0)
36370 +
36371 +void clear_highpage(struct page *);
36372 +static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
36373 +{
36374 + clear_highpage(page);
36375 +}
36376 +#define __HAVE_ARCH_CLEAR_HIGHPAGE
36377 +#define __HAVE_ARCH_CLEAR_USER_HIGHPAGE
36378 +
36379 +void copy_highpage(struct page *to, struct page *from);
36380 +static inline void copy_user_highpage(struct page *to, struct page *from,
36381 + unsigned long vaddr)
36382 +{
36383 + copy_highpage(to, from);
36384 +}
36385 +#define __HAVE_ARCH_COPY_HIGHPAGE
36386 +#define __HAVE_ARCH_COPY_USER_HIGHPAGE
36387 +
36388 +#endif /* __KERNEL__ */
36389 +
36390 +#endif /* _ASM_HIGHMEM_H */
36391 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_32.h
36392 ===================================================================
36393 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
36394 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_32.h 2008-11-25 12:22:34.000000000 +0100
36395 @@ -0,0 +1,409 @@
36396 +/******************************************************************************
36397 + * hypercall.h
36398 + *
36399 + * Linux-specific hypervisor handling.
36400 + *
36401 + * Copyright (c) 2002-2004, K A Fraser
36402 + *
36403 + * This program is free software; you can redistribute it and/or
36404 + * modify it under the terms of the GNU General Public License version 2
36405 + * as published by the Free Software Foundation; or, when distributed
36406 + * separately from the Linux kernel or incorporated into other
36407 + * software packages, subject to the following license:
36408 + *
36409 + * Permission is hereby granted, free of charge, to any person obtaining a copy
36410 + * of this source file (the "Software"), to deal in the Software without
36411 + * restriction, including without limitation the rights to use, copy, modify,
36412 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
36413 + * and to permit persons to whom the Software is furnished to do so, subject to
36414 + * the following conditions:
36415 + *
36416 + * The above copyright notice and this permission notice shall be included in
36417 + * all copies or substantial portions of the Software.
36418 + *
36419 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
36420 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36421 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36422 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36423 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
36424 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
36425 + * IN THE SOFTWARE.
36426 + */
36427 +
36428 +#ifndef __HYPERCALL_H__
36429 +#define __HYPERCALL_H__
36430 +
36431 +#include <linux/string.h> /* memcpy() */
36432 +#include <linux/stringify.h>
36433 +
36434 +#ifndef __HYPERVISOR_H__
36435 +# error "please don't include this file directly"
36436 +#endif
36437 +
36438 +#ifdef CONFIG_XEN
36439 +#define HYPERCALL_STR(name) \
36440 + "call hypercall_page + ("__stringify(__HYPERVISOR_##name)" * 32)"
36441 +#else
36442 +#define HYPERCALL_STR(name) \
36443 + "mov hypercall_stubs,%%eax; " \
36444 + "add $("__stringify(__HYPERVISOR_##name)" * 32),%%eax; "\
36445 + "call *%%eax"
36446 +#endif
36447 +
36448 +#define _hypercall0(type, name) \
36449 +({ \
36450 + type __res; \
36451 + asm volatile ( \
36452 + HYPERCALL_STR(name) \
36453 + : "=a" (__res) \
36454 + : \
36455 + : "memory" ); \
36456 + __res; \
36457 +})
36458 +
36459 +#define _hypercall1(type, name, a1) \
36460 +({ \
36461 + type __res; \
36462 + long __ign1; \
36463 + asm volatile ( \
36464 + HYPERCALL_STR(name) \
36465 + : "=a" (__res), "=b" (__ign1) \
36466 + : "1" ((long)(a1)) \
36467 + : "memory" ); \
36468 + __res; \
36469 +})
36470 +
36471 +#define _hypercall2(type, name, a1, a2) \
36472 +({ \
36473 + type __res; \
36474 + long __ign1, __ign2; \
36475 + asm volatile ( \
36476 + HYPERCALL_STR(name) \
36477 + : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \
36478 + : "1" ((long)(a1)), "2" ((long)(a2)) \
36479 + : "memory" ); \
36480 + __res; \
36481 +})
36482 +
36483 +#define _hypercall3(type, name, a1, a2, a3) \
36484 +({ \
36485 + type __res; \
36486 + long __ign1, __ign2, __ign3; \
36487 + asm volatile ( \
36488 + HYPERCALL_STR(name) \
36489 + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
36490 + "=d" (__ign3) \
36491 + : "1" ((long)(a1)), "2" ((long)(a2)), \
36492 + "3" ((long)(a3)) \
36493 + : "memory" ); \
36494 + __res; \
36495 +})
36496 +
36497 +#define _hypercall4(type, name, a1, a2, a3, a4) \
36498 +({ \
36499 + type __res; \
36500 + long __ign1, __ign2, __ign3, __ign4; \
36501 + asm volatile ( \
36502 + HYPERCALL_STR(name) \
36503 + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
36504 + "=d" (__ign3), "=S" (__ign4) \
36505 + : "1" ((long)(a1)), "2" ((long)(a2)), \
36506 + "3" ((long)(a3)), "4" ((long)(a4)) \
36507 + : "memory" ); \
36508 + __res; \
36509 +})
36510 +
36511 +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
36512 +({ \
36513 + type __res; \
36514 + long __ign1, __ign2, __ign3, __ign4, __ign5; \
36515 + asm volatile ( \
36516 + HYPERCALL_STR(name) \
36517 + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
36518 + "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \
36519 + : "1" ((long)(a1)), "2" ((long)(a2)), \
36520 + "3" ((long)(a3)), "4" ((long)(a4)), \
36521 + "5" ((long)(a5)) \
36522 + : "memory" ); \
36523 + __res; \
36524 +})
36525 +
36526 +static inline int __must_check
36527 +HYPERVISOR_set_trap_table(
36528 + const trap_info_t *table)
36529 +{
36530 + return _hypercall1(int, set_trap_table, table);
36531 +}
36532 +
36533 +static inline int __must_check
36534 +HYPERVISOR_mmu_update(
36535 + mmu_update_t *req, unsigned int count, unsigned int *success_count,
36536 + domid_t domid)
36537 +{
36538 + return _hypercall4(int, mmu_update, req, count, success_count, domid);
36539 +}
36540 +
36541 +static inline int __must_check
36542 +HYPERVISOR_mmuext_op(
36543 + struct mmuext_op *op, unsigned int count, unsigned int *success_count,
36544 + domid_t domid)
36545 +{
36546 + return _hypercall4(int, mmuext_op, op, count, success_count, domid);
36547 +}
36548 +
36549 +static inline int __must_check
36550 +HYPERVISOR_set_gdt(
36551 + unsigned long *frame_list, unsigned int entries)
36552 +{
36553 + return _hypercall2(int, set_gdt, frame_list, entries);
36554 +}
36555 +
36556 +static inline int __must_check
36557 +HYPERVISOR_stack_switch(
36558 + unsigned long ss, unsigned long esp)
36559 +{
36560 + return _hypercall2(int, stack_switch, ss, esp);
36561 +}
36562 +
36563 +static inline int __must_check
36564 +HYPERVISOR_set_callbacks(
36565 + unsigned long event_selector, unsigned long event_address,
36566 + unsigned long failsafe_selector, unsigned long failsafe_address)
36567 +{
36568 + return _hypercall4(int, set_callbacks,
36569 + event_selector, event_address,
36570 + failsafe_selector, failsafe_address);
36571 +}
36572 +
36573 +static inline int
36574 +HYPERVISOR_fpu_taskswitch(
36575 + int set)
36576 +{
36577 + return _hypercall1(int, fpu_taskswitch, set);
36578 +}
36579 +
36580 +static inline int __must_check
36581 +HYPERVISOR_sched_op_compat(
36582 + int cmd, unsigned long arg)
36583 +{
36584 + return _hypercall2(int, sched_op_compat, cmd, arg);
36585 +}
36586 +
36587 +static inline int __must_check
36588 +HYPERVISOR_sched_op(
36589 + int cmd, void *arg)
36590 +{
36591 + return _hypercall2(int, sched_op, cmd, arg);
36592 +}
36593 +
36594 +static inline long __must_check
36595 +HYPERVISOR_set_timer_op(
36596 + u64 timeout)
36597 +{
36598 + unsigned long timeout_hi = (unsigned long)(timeout>>32);
36599 + unsigned long timeout_lo = (unsigned long)timeout;
36600 + return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
36601 +}
36602 +
36603 +static inline int __must_check
36604 +HYPERVISOR_platform_op(
36605 + struct xen_platform_op *platform_op)
36606 +{
36607 + platform_op->interface_version = XENPF_INTERFACE_VERSION;
36608 + return _hypercall1(int, platform_op, platform_op);
36609 +}
36610 +
36611 +static inline int __must_check
36612 +HYPERVISOR_set_debugreg(
36613 + unsigned int reg, unsigned long value)
36614 +{
36615 + return _hypercall2(int, set_debugreg, reg, value);
36616 +}
36617 +
36618 +static inline unsigned long __must_check
36619 +HYPERVISOR_get_debugreg(
36620 + unsigned int reg)
36621 +{
36622 + return _hypercall1(unsigned long, get_debugreg, reg);
36623 +}
36624 +
36625 +static inline int __must_check
36626 +HYPERVISOR_update_descriptor(
36627 + u64 ma, u64 desc)
36628 +{
36629 + return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
36630 +}
36631 +
36632 +static inline int __must_check
36633 +HYPERVISOR_memory_op(
36634 + unsigned int cmd, void *arg)
36635 +{
36636 + return _hypercall2(int, memory_op, cmd, arg);
36637 +}
36638 +
36639 +static inline int __must_check
36640 +HYPERVISOR_multicall(
36641 + multicall_entry_t *call_list, unsigned int nr_calls)
36642 +{
36643 + return _hypercall2(int, multicall, call_list, nr_calls);
36644 +}
36645 +
36646 +static inline int __must_check
36647 +HYPERVISOR_update_va_mapping(
36648 + unsigned long va, pte_t new_val, unsigned long flags)
36649 +{
36650 + unsigned long pte_hi = 0;
36651 +#ifdef CONFIG_X86_PAE
36652 + pte_hi = new_val.pte_high;
36653 +#endif
36654 + return _hypercall4(int, update_va_mapping, va,
36655 + new_val.pte_low, pte_hi, flags);
36656 +}
36657 +
36658 +static inline int __must_check
36659 +HYPERVISOR_event_channel_op(
36660 + int cmd, void *arg)
36661 +{
36662 + int rc = _hypercall2(int, event_channel_op, cmd, arg);
36663 +
36664 +#if CONFIG_XEN_COMPAT <= 0x030002
36665 + if (unlikely(rc == -ENOSYS)) {
36666 + struct evtchn_op op;
36667 + op.cmd = cmd;
36668 + memcpy(&op.u, arg, sizeof(op.u));
36669 + rc = _hypercall1(int, event_channel_op_compat, &op);
36670 + memcpy(arg, &op.u, sizeof(op.u));
36671 + }
36672 +#endif
36673 +
36674 + return rc;
36675 +}
36676 +
36677 +static inline int __must_check
36678 +HYPERVISOR_xen_version(
36679 + int cmd, void *arg)
36680 +{
36681 + return _hypercall2(int, xen_version, cmd, arg);
36682 +}
36683 +
36684 +static inline int __must_check
36685 +HYPERVISOR_console_io(
36686 + int cmd, unsigned int count, char *str)
36687 +{
36688 + return _hypercall3(int, console_io, cmd, count, str);
36689 +}
36690 +
36691 +static inline int __must_check
36692 +HYPERVISOR_physdev_op(
36693 + int cmd, void *arg)
36694 +{
36695 + int rc = _hypercall2(int, physdev_op, cmd, arg);
36696 +
36697 +#if CONFIG_XEN_COMPAT <= 0x030002
36698 + if (unlikely(rc == -ENOSYS)) {
36699 + struct physdev_op op;
36700 + op.cmd = cmd;
36701 + memcpy(&op.u, arg, sizeof(op.u));
36702 + rc = _hypercall1(int, physdev_op_compat, &op);
36703 + memcpy(arg, &op.u, sizeof(op.u));
36704 + }
36705 +#endif
36706 +
36707 + return rc;
36708 +}
36709 +
36710 +static inline int __must_check
36711 +HYPERVISOR_grant_table_op(
36712 + unsigned int cmd, void *uop, unsigned int count)
36713 +{
36714 + return _hypercall3(int, grant_table_op, cmd, uop, count);
36715 +}
36716 +
36717 +static inline int __must_check
36718 +HYPERVISOR_update_va_mapping_otherdomain(
36719 + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
36720 +{
36721 + unsigned long pte_hi = 0;
36722 +#ifdef CONFIG_X86_PAE
36723 + pte_hi = new_val.pte_high;
36724 +#endif
36725 + return _hypercall5(int, update_va_mapping_otherdomain, va,
36726 + new_val.pte_low, pte_hi, flags, domid);
36727 +}
36728 +
36729 +static inline int __must_check
36730 +HYPERVISOR_vm_assist(
36731 + unsigned int cmd, unsigned int type)
36732 +{
36733 + return _hypercall2(int, vm_assist, cmd, type);
36734 +}
36735 +
36736 +static inline int __must_check
36737 +HYPERVISOR_vcpu_op(
36738 + int cmd, unsigned int vcpuid, void *extra_args)
36739 +{
36740 + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
36741 +}
36742 +
36743 +static inline int __must_check
36744 +HYPERVISOR_suspend(
36745 + unsigned long srec)
36746 +{
36747 + struct sched_shutdown sched_shutdown = {
36748 + .reason = SHUTDOWN_suspend
36749 + };
36750 +
36751 + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
36752 + &sched_shutdown, srec);
36753 +
36754 +#if CONFIG_XEN_COMPAT <= 0x030002
36755 + if (rc == -ENOSYS)
36756 + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
36757 + SHUTDOWN_suspend, srec);
36758 +#endif
36759 +
36760 + return rc;
36761 +}
36762 +
36763 +#if CONFIG_XEN_COMPAT <= 0x030002
36764 +static inline int
36765 +HYPERVISOR_nmi_op(
36766 + unsigned long op, void *arg)
36767 +{
36768 + return _hypercall2(int, nmi_op, op, arg);
36769 +}
36770 +#endif
36771 +
36772 +#ifndef CONFIG_XEN
36773 +static inline unsigned long __must_check
36774 +HYPERVISOR_hvm_op(
36775 + int op, void *arg)
36776 +{
36777 + return _hypercall2(unsigned long, hvm_op, op, arg);
36778 +}
36779 +#endif
36780 +
36781 +static inline int __must_check
36782 +HYPERVISOR_callback_op(
36783 + int cmd, const void *arg)
36784 +{
36785 + return _hypercall2(int, callback_op, cmd, arg);
36786 +}
36787 +
36788 +static inline int __must_check
36789 +HYPERVISOR_xenoprof_op(
36790 + int op, void *arg)
36791 +{
36792 + return _hypercall2(int, xenoprof_op, op, arg);
36793 +}
36794 +
36795 +static inline int __must_check
36796 +HYPERVISOR_kexec_op(
36797 + unsigned long op, void *args)
36798 +{
36799 + return _hypercall2(int, kexec_op, op, args);
36800 +}
36801 +
36802 +
36803 +
36804 +#endif /* __HYPERCALL_H__ */
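On 32-bit x86 the hypercall arguments travel in 32-bit registers, which is why HYPERVISOR_set_timer_op() and HYPERVISOR_update_descriptor() above split each u64 into a low and a high word before handing it to _hypercall2()/_hypercall4(). A standalone sketch of that split and of the reassembly the receiving side conceptually performs; the hypercall machinery itself is not reproduced here:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Split a 64-bit value the way HYPERVISOR_set_timer_op() does before
 * passing it to _hypercall2(): low word first, high word second. */
static void split_u64(uint64_t val, uint32_t *lo, uint32_t *hi)
{
        *lo = (uint32_t)val;
        *hi = (uint32_t)(val >> 32);
}

/* What the receiver conceptually does with the two words. */
static uint64_t join_u64(uint32_t lo, uint32_t hi)
{
        return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
        uint64_t timeout = 0x0123456789abcdefULL;
        uint32_t lo, hi;

        split_u64(timeout, &lo, &hi);
        printf("lo=%08x hi=%08x\n", lo, hi);
        assert(join_u64(lo, hi) == timeout);
        return 0;
}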
36805 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/hypervisor.h
36806 ===================================================================
36807 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
36808 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/hypervisor.h 2008-02-20 09:32:49.000000000 +0100
36809 @@ -0,0 +1,259 @@
36810 +/******************************************************************************
36811 + * hypervisor.h
36812 + *
36813 + * Linux-specific hypervisor handling.
36814 + *
36815 + * Copyright (c) 2002-2004, K A Fraser
36816 + *
36817 + * This program is free software; you can redistribute it and/or
36818 + * modify it under the terms of the GNU General Public License version 2
36819 + * as published by the Free Software Foundation; or, when distributed
36820 + * separately from the Linux kernel or incorporated into other
36821 + * software packages, subject to the following license:
36822 + *
36823 + * Permission is hereby granted, free of charge, to any person obtaining a copy
36824 + * of this source file (the "Software"), to deal in the Software without
36825 + * restriction, including without limitation the rights to use, copy, modify,
36826 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
36827 + * and to permit persons to whom the Software is furnished to do so, subject to
36828 + * the following conditions:
36829 + *
36830 + * The above copyright notice and this permission notice shall be included in
36831 + * all copies or substantial portions of the Software.
36832 + *
36833 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
36834 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36835 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36836 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36837 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
36838 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
36839 + * IN THE SOFTWARE.
36840 + */
36841 +
36842 +#ifndef __HYPERVISOR_H__
36843 +#define __HYPERVISOR_H__
36844 +
36845 +#include <linux/types.h>
36846 +#include <linux/kernel.h>
36847 +#include <linux/version.h>
36848 +#include <linux/errno.h>
36849 +#include <xen/interface/xen.h>
36850 +#include <xen/interface/platform.h>
36851 +#include <xen/interface/event_channel.h>
36852 +#include <xen/interface/physdev.h>
36853 +#include <xen/interface/sched.h>
36854 +#include <xen/interface/nmi.h>
36855 +#include <asm/ptrace.h>
36856 +#include <asm/page.h>
36857 +#if defined(__i386__)
36858 +# ifdef CONFIG_X86_PAE
36859 +# include <asm-generic/pgtable-nopud.h>
36860 +# else
36861 +# include <asm-generic/pgtable-nopmd.h>
36862 +# endif
36863 +#elif defined(__x86_64__) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
36864 +# include <asm-generic/pgtable-nopud.h>
36865 +#endif
36866 +
36867 +extern shared_info_t *HYPERVISOR_shared_info;
36868 +
36869 +#define vcpu_info(cpu) (HYPERVISOR_shared_info->vcpu_info + (cpu))
36870 +#ifdef CONFIG_SMP
36871 +#define current_vcpu_info() vcpu_info(smp_processor_id())
36872 +#else
36873 +#define current_vcpu_info() vcpu_info(0)
36874 +#endif
36875 +
36876 +#ifdef CONFIG_X86_32
36877 +extern unsigned long hypervisor_virt_start;
36878 +#endif
36879 +
36880 +/* arch/xen/i386/kernel/setup.c */
36881 +extern start_info_t *xen_start_info;
36882 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
36883 +#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
36884 +#else
36885 +#define is_initial_xendomain() 0
36886 +#endif
36887 +
36888 +/* arch/xen/kernel/evtchn.c */
36889 +/* Force a proper event-channel callback from Xen. */
36890 +void force_evtchn_callback(void);
36891 +
36892 +/* arch/xen/kernel/process.c */
36893 +void xen_cpu_idle (void);
36894 +
36895 +/* arch/xen/i386/kernel/hypervisor.c */
36896 +void do_hypervisor_callback(struct pt_regs *regs);
36897 +
36898 +/* arch/xen/i386/mm/hypervisor.c */
36899 +/*
36900 + * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should be already
36901 + * be MACHINE addresses.
36902 + */
36903 +
36904 +void xen_pt_switch(unsigned long ptr);
36905 +void xen_new_user_pt(unsigned long ptr); /* x86_64 only */
36906 +void xen_load_gs(unsigned int selector); /* x86_64 only */
36907 +void xen_tlb_flush(void);
36908 +void xen_invlpg(unsigned long ptr);
36909 +
36910 +void xen_l1_entry_update(pte_t *ptr, pte_t val);
36911 +void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
36912 +void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */
36913 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */
36914 +void xen_pgd_pin(unsigned long ptr);
36915 +void xen_pgd_unpin(unsigned long ptr);
36916 +
36917 +void xen_set_ldt(const void *ptr, unsigned int ents);
36918 +
36919 +#ifdef CONFIG_SMP
36920 +#include <linux/cpumask.h>
36921 +void xen_tlb_flush_all(void);
36922 +void xen_invlpg_all(unsigned long ptr);
36923 +void xen_tlb_flush_mask(cpumask_t *mask);
36924 +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr);
36925 +#endif
36926 +
36927 +/* Returns zero on success else negative errno. */
36928 +int xen_create_contiguous_region(
36929 + unsigned long vstart, unsigned int order, unsigned int address_bits);
36930 +void xen_destroy_contiguous_region(
36931 + unsigned long vstart, unsigned int order);
36932 +
36933 +struct page;
36934 +
36935 +int xen_limit_pages_to_max_mfn(
36936 + struct page *pages, unsigned int order, unsigned int address_bits);
36937 +
36938 +/* Turn jiffies into Xen system time. */
36939 +u64 jiffies_to_st(unsigned long jiffies);
36940 +
36941 +#ifdef CONFIG_XEN_SCRUB_PAGES
36942 +void scrub_pages(void *, unsigned int);
36943 +#else
36944 +#define scrub_pages(_p,_n) ((void)0)
36945 +#endif
36946 +
36947 +#include <xen/hypercall.h>
36948 +
36949 +#if defined(CONFIG_X86_64)
36950 +#define MULTI_UVMFLAGS_INDEX 2
36951 +#define MULTI_UVMDOMID_INDEX 3
36952 +#else
36953 +#define MULTI_UVMFLAGS_INDEX 3
36954 +#define MULTI_UVMDOMID_INDEX 4
36955 +#endif
36956 +
36957 +#ifdef CONFIG_XEN
36958 +#define is_running_on_xen() 1
36959 +#else
36960 +extern char *hypercall_stubs;
36961 +#define is_running_on_xen() (!!hypercall_stubs)
36962 +#endif
36963 +
36964 +static inline int
36965 +HYPERVISOR_yield(
36966 + void)
36967 +{
36968 + int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
36969 +
36970 +#if CONFIG_XEN_COMPAT <= 0x030002
36971 + if (rc == -ENOSYS)
36972 + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
36973 +#endif
36974 +
36975 + return rc;
36976 +}
36977 +
36978 +static inline int
36979 +HYPERVISOR_block(
36980 + void)
36981 +{
36982 + int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
36983 +
36984 +#if CONFIG_XEN_COMPAT <= 0x030002
36985 + if (rc == -ENOSYS)
36986 + rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0);
36987 +#endif
36988 +
36989 + return rc;
36990 +}
36991 +
36992 +static inline void /*__noreturn*/
36993 +HYPERVISOR_shutdown(
36994 + unsigned int reason)
36995 +{
36996 + struct sched_shutdown sched_shutdown = {
36997 + .reason = reason
36998 + };
36999 +
37000 + VOID(HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown));
37001 +#if CONFIG_XEN_COMPAT <= 0x030002
37002 + VOID(HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason));
37003 +#endif
37004 + /* Don't recurse needlessly. */
37005 + BUG_ON(reason != SHUTDOWN_crash);
37006 + for(;;);
37007 +}
37008 +
37009 +static inline int __must_check
37010 +HYPERVISOR_poll(
37011 + evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
37012 +{
37013 + int rc;
37014 + struct sched_poll sched_poll = {
37015 + .nr_ports = nr_ports,
37016 + .timeout = jiffies_to_st(timeout)
37017 + };
37018 + set_xen_guest_handle(sched_poll.ports, ports);
37019 +
37020 + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
37021 +#if CONFIG_XEN_COMPAT <= 0x030002
37022 + if (rc == -ENOSYS)
37023 + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
37024 +#endif
37025 +
37026 + return rc;
37027 +}
37028 +
37029 +#ifdef CONFIG_XEN
37030 +
37031 +static inline void
37032 +MULTI_update_va_mapping(
37033 + multicall_entry_t *mcl, unsigned long va,
37034 + pte_t new_val, unsigned long flags)
37035 +{
37036 + mcl->op = __HYPERVISOR_update_va_mapping;
37037 + mcl->args[0] = va;
37038 +#if defined(CONFIG_X86_64)
37039 + mcl->args[1] = new_val.pte;
37040 +#elif defined(CONFIG_X86_PAE)
37041 + mcl->args[1] = new_val.pte_low;
37042 + mcl->args[2] = new_val.pte_high;
37043 +#else
37044 + mcl->args[1] = new_val.pte_low;
37045 + mcl->args[2] = 0;
37046 +#endif
37047 + mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
37048 +}
37049 +
37050 +static inline void
37051 +MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd,
37052 + void *uop, unsigned int count)
37053 +{
37054 + mcl->op = __HYPERVISOR_grant_table_op;
37055 + mcl->args[0] = cmd;
37056 + mcl->args[1] = (unsigned long)uop;
37057 + mcl->args[2] = count;
37058 +}
37059 +
37060 +#else /* !defined(CONFIG_XEN) */
37061 +
37062 +/* Multicalls not supported for HVM guests. */
37063 +#define MULTI_update_va_mapping(a,b,c,d) ((void)0)
37064 +#define MULTI_grant_table_op(a,b,c,d) ((void)0)
37065 +
37066 +#endif
37067 +
37068 +#endif /* __HYPERVISOR_H__ */
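Several wrappers above (HYPERVISOR_yield(), HYPERVISOR_block(), HYPERVISOR_poll(), HYPERVISOR_suspend()) follow the same shape: issue the current hypercall and, when built with CONFIG_XEN_COMPAT <= 0x030002, retry the older *_compat variant if the hypervisor answers -ENOSYS. A standalone sketch of that fallback pattern; the two functions and the SCHEDOP_yield value below are placeholders standing in for the real hypercalls:

#include <errno.h>
#include <stdio.h>

#define SCHEDOP_yield 0                 /* placeholder value for the sketch */

/* Stubs standing in for the hypercalls: pretend the hypervisor is too
 * old to know the new scheduler op. */
static int new_sched_op(int cmd, void *arg)
{
        (void)cmd; (void)arg;
        return -ENOSYS;
}

static int compat_sched_op(int cmd, unsigned long arg)
{
        (void)cmd; (void)arg;
        return 0;
}

/* Same shape as HYPERVISOR_yield() above: prefer the new interface,
 * fall back to the compat one only on -ENOSYS. */
static int do_yield(void)
{
        int rc = new_sched_op(SCHEDOP_yield, NULL);

        if (rc == -ENOSYS)
                rc = compat_sched_op(SCHEDOP_yield, 0);
        return rc;
}

int main(void)
{
        printf("yield -> %d\n", do_yield());
        return 0;
}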
37069 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/io_32.h
37070 ===================================================================
37071 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
37072 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/io_32.h 2007-08-16 18:07:01.000000000 +0200
37073 @@ -0,0 +1,389 @@
37074 +#ifndef _ASM_IO_H
37075 +#define _ASM_IO_H
37076 +
37077 +#include <linux/string.h>
37078 +#include <linux/compiler.h>
37079 +
37080 +/*
37081 + * This file contains the definitions for the x86 IO instructions
37082 + * inb/inw/inl/outb/outw/outl and the "string versions" of the same
37083 + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
37084 + * versions of the single-IO instructions (inb_p/inw_p/..).
37085 + *
37086 + * This file is not meant to be obfuscating: it's just complicated
37087 + * to (a) handle it all in a way that makes gcc able to optimize it
37088 + * as well as possible and (b) trying to avoid writing the same thing
37089 + * over and over again with slight variations and possibly making a
37090 + * mistake somewhere.
37091 + */
37092 +
37093 +/*
37094 + * Thanks to James van Artsdalen for a better timing-fix than
37095 + * the two short jumps: using outb's to a nonexistent port seems
37096 + * to guarantee better timings even on fast machines.
37097 + *
37098 + * On the other hand, I'd like to be sure of a non-existent port:
37099 + * I feel a bit unsafe about using 0x80 (should be safe, though)
37100 + *
37101 + * Linus
37102 + */
37103 +
37104 + /*
37105 + * Bit simplified and optimized by Jan Hubicka
37106 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
37107 + *
37108 + * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
37109 + * isa_read[wl] and isa_write[wl] fixed
37110 + * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
37111 + */
37112 +
37113 +#define IO_SPACE_LIMIT 0xffff
37114 +
37115 +#define XQUAD_PORTIO_BASE 0xfe400000
37116 +#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
37117 +
37118 +#ifdef __KERNEL__
37119 +
37120 +#include <asm-generic/iomap.h>
37121 +
37122 +#include <linux/vmalloc.h>
37123 +#include <asm/fixmap.h>
37124 +
37125 +/*
37126 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
37127 + * access
37128 + */
37129 +#define xlate_dev_mem_ptr(p) __va(p)
37130 +
37131 +/*
37132 + * Convert a virtual cached pointer to an uncached pointer
37133 + */
37134 +#define xlate_dev_kmem_ptr(p) p
37135 +
37136 +/**
37137 + * virt_to_phys - map virtual addresses to physical
37138 + * @address: address to remap
37139 + *
37140 + * The returned physical address is the physical (CPU) mapping for
37141 + * the memory address given. It is only valid to use this function on
37142 + * addresses directly mapped or allocated via kmalloc.
37143 + *
37144 + * This function does not give bus mappings for DMA transfers. In
37145 + * almost all conceivable cases a device driver should not be using
37146 + * this function
37147 + */
37148 +
37149 +static inline unsigned long virt_to_phys(volatile void * address)
37150 +{
37151 + return __pa(address);
37152 +}
37153 +
37154 +/**
37155 + * phys_to_virt - map physical address to virtual
37156 + * @address: address to remap
37157 + *
37158 + * The returned virtual address is a current CPU mapping for
37159 + * the memory address given. It is only valid to use this function on
37160 + * addresses that have a kernel mapping
37161 + *
37162 + * This function does not handle bus mappings for DMA transfers. In
37163 + * almost all conceivable cases a device driver should not be using
37164 + * this function
37165 + */
37166 +
37167 +static inline void * phys_to_virt(unsigned long address)
37168 +{
37169 + return __va(address);
37170 +}
37171 +
37172 +/*
37173 + * Change "struct page" to physical address.
37174 + */
37175 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
37176 +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
37177 +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
37178 +
37179 +#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \
37180 + (unsigned long) bio_offset((bio)))
37181 +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
37182 + (unsigned long) (bv)->bv_offset)
37183 +
37184 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
37185 + (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
37186 + ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
37187 + bvec_to_pseudophys((vec2))))
37188 +
37189 +extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
37190 +
37191 +/**
37192 + * ioremap - map bus memory into CPU space
37193 + * @offset: bus address of the memory
37194 + * @size: size of the resource to map
37195 + *
37196 + * ioremap performs a platform specific sequence of operations to
37197 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
37198 + * writew/writel functions and the other mmio helpers. The returned
37199 + * address is not guaranteed to be usable directly as a virtual
37200 + * address.
37201 + */
37202 +
37203 +static inline void __iomem * ioremap(unsigned long offset, unsigned long size)
37204 +{
37205 + return __ioremap(offset, size, 0);
37206 +}
37207 +
37208 +extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size);
37209 +extern void iounmap(volatile void __iomem *addr);
37210 +
37211 +/*
37212 + * bt_ioremap() and bt_iounmap() are for temporary early boot-time
37213 + * mappings, before the real ioremap() is functional.
37214 + * A boot-time mapping is currently limited to at most 16 pages.
37215 + */
37216 +extern void *bt_ioremap(unsigned long offset, unsigned long size);
37217 +extern void bt_iounmap(void *addr, unsigned long size);
37218 +
37219 +/* Use early IO mappings for DMI because it's initialized early */
37220 +#define dmi_ioremap bt_ioremap
37221 +#define dmi_iounmap bt_iounmap
37222 +#define dmi_alloc alloc_bootmem
37223 +
37224 +/*
37225 + * ISA I/O bus memory addresses are 1:1 with the physical address.
37226 + */
37227 +#define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); })
37228 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
37229 +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
37230 +
37231 +/*
37232 + * However PCI ones are not necessarily 1:1 and therefore these interfaces
37233 + * are forbidden in portable PCI drivers.
37234 + *
37235 + * Allow them on x86 for legacy drivers, though.
37236 + */
37237 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
37238 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
37239 +
37240 +/*
37241 + * readX/writeX() are used to access memory mapped devices. On some
37242 + * architectures the memory mapped IO stuff needs to be accessed
37243 + * differently. On the x86 architecture, we just read/write the
37244 + * memory location directly.
37245 + */
37246 +
37247 +static inline unsigned char readb(const volatile void __iomem *addr)
37248 +{
37249 + return *(volatile unsigned char __force *) addr;
37250 +}
37251 +static inline unsigned short readw(const volatile void __iomem *addr)
37252 +{
37253 + return *(volatile unsigned short __force *) addr;
37254 +}
37255 +static inline unsigned int readl(const volatile void __iomem *addr)
37256 +{
37257 + return *(volatile unsigned int __force *) addr;
37258 +}
37259 +#define readb_relaxed(addr) readb(addr)
37260 +#define readw_relaxed(addr) readw(addr)
37261 +#define readl_relaxed(addr) readl(addr)
37262 +#define __raw_readb readb
37263 +#define __raw_readw readw
37264 +#define __raw_readl readl
37265 +
37266 +static inline void writeb(unsigned char b, volatile void __iomem *addr)
37267 +{
37268 + *(volatile unsigned char __force *) addr = b;
37269 +}
37270 +static inline void writew(unsigned short b, volatile void __iomem *addr)
37271 +{
37272 + *(volatile unsigned short __force *) addr = b;
37273 +}
37274 +static inline void writel(unsigned int b, volatile void __iomem *addr)
37275 +{
37276 + *(volatile unsigned int __force *) addr = b;
37277 +}
37278 +#define __raw_writeb writeb
37279 +#define __raw_writew writew
37280 +#define __raw_writel writel
37281 +
37282 +#define mmiowb()
37283 +
37284 +static inline void memset_io(volatile void __iomem *addr, unsigned char val, int count)
37285 +{
37286 + memset((void __force *) addr, val, count);
37287 +}
37288 +static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
37289 +{
37290 + __memcpy(dst, (void __force *) src, count);
37291 +}
37292 +static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int count)
37293 +{
37294 + __memcpy((void __force *) dst, src, count);
37295 +}
37296 +
37297 +/*
37298 + * ISA space is 'always mapped' on a typical x86 system, no need to
37299 + * explicitly ioremap() it. The fact that the ISA IO space is mapped
37300 + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
37301 + * are physical addresses. The following constant pointer can be
37302 + * used as the IO-area pointer (it can be iounmapped as well, so the
37303 + * analogy with PCI is quite large):
37304 + */
37305 +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
37306 +
37307 +/*
37308 + * Again, i386 does not require mem IO specific function.
37309 + */
37310 +
37311 +#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(b),(c),(d))
37312 +
37313 +/**
37314 + * check_signature - find BIOS signatures
37315 + * @io_addr: mmio address to check
37316 + * @signature: signature block
37317 + * @length: length of signature
37318 + *
37319 + * Perform a signature comparison with the mmio address io_addr. This
37320 + * address should have been obtained by ioremap.
37321 + * Returns 1 on a match.
37322 + */
37323 +
37324 +static inline int check_signature(volatile void __iomem * io_addr,
37325 + const unsigned char *signature, int length)
37326 +{
37327 + int retval = 0;
37328 + do {
37329 + if (readb(io_addr) != *signature)
37330 + goto out;
37331 + io_addr++;
37332 + signature++;
37333 + length--;
37334 + } while (length);
37335 + retval = 1;
37336 +out:
37337 + return retval;
37338 +}
37339 +
37340 +/*
37341 + * Cache management
37342 + *
37343 + * This is needed for two cases:
37344 + * 1. Out of order aware processors
37345 + * 2. Accidentally out of order processors (PPro errata #51)
37346 + */
37347 +
37348 +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
37349 +
37350 +static inline void flush_write_buffers(void)
37351 +{
37352 + __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
37353 +}
37354 +
37355 +#define dma_cache_inv(_start,_size) flush_write_buffers()
37356 +#define dma_cache_wback(_start,_size) flush_write_buffers()
37357 +#define dma_cache_wback_inv(_start,_size) flush_write_buffers()
37358 +
37359 +#else
37360 +
37361 +/* Nothing to do */
37362 +
37363 +#define dma_cache_inv(_start,_size) do { } while (0)
37364 +#define dma_cache_wback(_start,_size) do { } while (0)
37365 +#define dma_cache_wback_inv(_start,_size) do { } while (0)
37366 +#define flush_write_buffers()
37367 +
37368 +#endif
37369 +
37370 +#endif /* __KERNEL__ */
37371 +
37372 +#ifdef SLOW_IO_BY_JUMPING
37373 +#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:"
37374 +#else
37375 +#define __SLOW_DOWN_IO "outb %%al,$0x80;"
37376 +#endif
37377 +
37378 +static inline void slow_down_io(void) {
37379 + __asm__ __volatile__(
37380 + __SLOW_DOWN_IO
37381 +#ifdef REALLY_SLOW_IO
37382 + __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
37383 +#endif
37384 + : : );
37385 +}
37386 +
37387 +#ifdef CONFIG_X86_NUMAQ
37388 +extern void *xquad_portio; /* Where the IO area was mapped */
37389 +#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
37390 +#define __BUILDIO(bwl,bw,type) \
37391 +static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \
37392 + if (xquad_portio) \
37393 + write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \
37394 + else \
37395 + out##bwl##_local(value, port); \
37396 +} \
37397 +static inline void out##bwl(unsigned type value, int port) { \
37398 + out##bwl##_quad(value, port, 0); \
37399 +} \
37400 +static inline unsigned type in##bwl##_quad(int port, int quad) { \
37401 + if (xquad_portio) \
37402 + return read##bwl(XQUAD_PORT_ADDR(port, quad)); \
37403 + else \
37404 + return in##bwl##_local(port); \
37405 +} \
37406 +static inline unsigned type in##bwl(int port) { \
37407 + return in##bwl##_quad(port, 0); \
37408 +}
37409 +#else
37410 +#define __BUILDIO(bwl,bw,type) \
37411 +static inline void out##bwl(unsigned type value, int port) { \
37412 + out##bwl##_local(value, port); \
37413 +} \
37414 +static inline unsigned type in##bwl(int port) { \
37415 + return in##bwl##_local(port); \
37416 +}
37417 +#endif
37418 +
37419 +
37420 +#define BUILDIO(bwl,bw,type) \
37421 +static inline void out##bwl##_local(unsigned type value, int port) { \
37422 + __asm__ __volatile__("out" #bwl " %" #bw "0, %w1" : : "a"(value), "Nd"(port)); \
37423 +} \
37424 +static inline unsigned type in##bwl##_local(int port) { \
37425 + unsigned type value; \
37426 + __asm__ __volatile__("in" #bwl " %w1, %" #bw "0" : "=a"(value) : "Nd"(port)); \
37427 + return value; \
37428 +} \
37429 +static inline void out##bwl##_local_p(unsigned type value, int port) { \
37430 + out##bwl##_local(value, port); \
37431 + slow_down_io(); \
37432 +} \
37433 +static inline unsigned type in##bwl##_local_p(int port) { \
37434 + unsigned type value = in##bwl##_local(port); \
37435 + slow_down_io(); \
37436 + return value; \
37437 +} \
37438 +__BUILDIO(bwl,bw,type) \
37439 +static inline void out##bwl##_p(unsigned type value, int port) { \
37440 + out##bwl(value, port); \
37441 + slow_down_io(); \
37442 +} \
37443 +static inline unsigned type in##bwl##_p(int port) { \
37444 + unsigned type value = in##bwl(port); \
37445 + slow_down_io(); \
37446 + return value; \
37447 +} \
37448 +static inline void outs##bwl(int port, const void *addr, unsigned long count) { \
37449 + __asm__ __volatile__("rep; outs" #bwl : "+S"(addr), "+c"(count) : "d"(port)); \
37450 +} \
37451 +static inline void ins##bwl(int port, void *addr, unsigned long count) { \
37452 + __asm__ __volatile__("rep; ins" #bwl : "+D"(addr), "+c"(count) : "d"(port)); \
37453 +}
37454 +
37455 +BUILDIO(b,b,char)
37456 +BUILDIO(w,w,short)
37457 +BUILDIO(l,,int)
37458 +
37459 +/* We will be supplying our own /dev/mem implementation */
37460 +#define ARCH_HAS_DEV_MEM
37461 +
37462 +#endif
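
A minimal user-space sketch of the token-pasting pattern used by BUILDIO/__BUILDIO above, which generates the whole in{b,w,l}/out{b,w,l} accessor family from one macro. A flat array stands in for the real x86 port space and for the "in"/"out" instructions; the sim_ports name and the demo main() are illustrative assumptions, not kernel code.

#include <stdio.h>

static unsigned int sim_ports[65536];          /* fake port space (assumption) */

#define BUILDIO(bwl, type)                                              \
static inline void out##bwl(unsigned type value, int port) {            \
        sim_ports[port] = value;               /* stands in for "out" */\
}                                                                       \
static inline unsigned type in##bwl(int port) {                         \
        return (unsigned type)sim_ports[port]; /* stands in for "in"  */\
}

BUILDIO(b, char)    /* generates outb()/inb() */
BUILDIO(w, short)   /* generates outw()/inw() */
BUILDIO(l, int)     /* generates outl()/inl() */

int main(void)
{
        outb(0xAB, 0x80);                      /* write a byte to "port" 0x80 */
        printf("inb(0x80) = 0x%x\n", inb(0x80));
        return 0;
}
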
37463 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_32.h
37464 ===================================================================
37465 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
37466 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_32.h 2007-06-12 13:14:02.000000000 +0200
37467 @@ -0,0 +1,127 @@
37468 +/*
37469 + * include/asm-i386/irqflags.h
37470 + *
37471 + * IRQ flags handling
37472 + *
37473 + * This file gets included from lowlevel asm headers too, to provide
37474 + * wrapped versions of the local_irq_*() APIs, based on the
37475 + * raw_local_irq_*() functions from the lowlevel headers.
37476 + */
37477 +#ifndef _ASM_IRQFLAGS_H
37478 +#define _ASM_IRQFLAGS_H
37479 +
37480 +#ifndef __ASSEMBLY__
37481 +
37482 +/*
37483 + * The use of 'barrier' in the following reflects their use as local-lock
37484 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ the following
37485 + * critical operations are executed. All critical operations must complete
37486 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
37487 + * includes these barriers, for example.
37488 + */
37489 +
37490 +#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
37491 +
37492 +#define raw_local_save_flags(flags) \
37493 + do { (flags) = __raw_local_save_flags(); } while (0)
37494 +
37495 +#define raw_local_irq_restore(x) \
37496 +do { \
37497 + vcpu_info_t *_vcpu; \
37498 + barrier(); \
37499 + _vcpu = current_vcpu_info(); \
37500 + if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
37501 + barrier(); /* unmask then check (avoid races) */ \
37502 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
37503 + force_evtchn_callback(); \
37504 + } \
37505 +} while (0)
37506 +
37507 +#define raw_local_irq_disable() \
37508 +do { \
37509 + current_vcpu_info()->evtchn_upcall_mask = 1; \
37510 + barrier(); \
37511 +} while (0)
37512 +
37513 +#define raw_local_irq_enable() \
37514 +do { \
37515 + vcpu_info_t *_vcpu; \
37516 + barrier(); \
37517 + _vcpu = current_vcpu_info(); \
37518 + _vcpu->evtchn_upcall_mask = 0; \
37519 + barrier(); /* unmask then check (avoid races) */ \
37520 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
37521 + force_evtchn_callback(); \
37522 +} while (0)
37523 +
37524 +/*
37525 + * Used in the idle loop; sti takes one instruction cycle
37526 + * to complete:
37527 + */
37528 +void raw_safe_halt(void);
37529 +
37530 +/*
37531 + * Used when interrupts are already enabled or to
37532 + * shutdown the processor:
37533 + */
37534 +void halt(void);
37535 +
37536 +static inline int raw_irqs_disabled_flags(unsigned long flags)
37537 +{
37538 + return (flags != 0);
37539 +}
37540 +
37541 +#define raw_irqs_disabled() \
37542 +({ \
37543 + unsigned long flags = __raw_local_save_flags(); \
37544 + \
37545 + raw_irqs_disabled_flags(flags); \
37546 +})
37547 +
37548 +/*
37549 + * For spinlocks, etc:
37550 + */
37551 +#define __raw_local_irq_save() \
37552 +({ \
37553 + unsigned long flags = __raw_local_save_flags(); \
37554 + \
37555 + raw_local_irq_disable(); \
37556 + \
37557 + flags; \
37558 +})
37559 +
37560 +#define raw_local_irq_save(flags) \
37561 + do { (flags) = __raw_local_irq_save(); } while (0)
37562 +
37563 +#endif /* __ASSEMBLY__ */
37564 +
37565 +/*
37566 + * Do the CPU's IRQ-state tracing from assembly code. We call a
37567 + * C function, so save all the C-clobbered registers:
37568 + */
37569 +#ifdef CONFIG_TRACE_IRQFLAGS
37570 +
37571 +# define TRACE_IRQS_ON \
37572 + pushl %eax; \
37573 + pushl %ecx; \
37574 + pushl %edx; \
37575 + call trace_hardirqs_on; \
37576 + popl %edx; \
37577 + popl %ecx; \
37578 + popl %eax;
37579 +
37580 +# define TRACE_IRQS_OFF \
37581 + pushl %eax; \
37582 + pushl %ecx; \
37583 + pushl %edx; \
37584 + call trace_hardirqs_off; \
37585 + popl %edx; \
37586 + popl %ecx; \
37587 + popl %eax;
37588 +
37589 +#else
37590 +# define TRACE_IRQS_ON
37591 +# define TRACE_IRQS_OFF
37592 +#endif
37593 +
37594 +#endif
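
A minimal user-space sketch of the "virtual IRQ flags" scheme defined above: interrupts are masked by writing a byte in shared memory rather than by executing cli/sti, and re-enabling must re-check for events that arrived while masked. The struct and the pending-callback stub are illustrative stand-ins for vcpu_info_t and force_evtchn_callback(); they are assumptions, not kernel code.

#include <stdio.h>

struct fake_vcpu_info {
        unsigned char evtchn_upcall_pending;   /* event waiting to be delivered */
        unsigned char evtchn_upcall_mask;      /* 1 = "interrupts disabled"     */
} vcpu;

static void deliver_pending_events(void)       /* models force_evtchn_callback() */
{
        printf("delivering pending events\n");
        vcpu.evtchn_upcall_pending = 0;
}

static unsigned long irq_save(void)            /* models __raw_local_irq_save() */
{
        unsigned long flags = vcpu.evtchn_upcall_mask;
        vcpu.evtchn_upcall_mask = 1;           /* disable */
        return flags;
}

static void irq_restore(unsigned long flags)   /* models raw_local_irq_restore() */
{
        vcpu.evtchn_upcall_mask = (unsigned char)flags;
        if (flags == 0 && vcpu.evtchn_upcall_pending)
                deliver_pending_events();      /* unmask, then check (avoid races) */
}

int main(void)
{
        unsigned long flags = irq_save();      /* critical section begins          */
        vcpu.evtchn_upcall_pending = 1;        /* an "event" arrives while masked  */
        irq_restore(flags);                    /* re-enable: pending event is run  */
        return 0;
}
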
37595 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_32.h
37596 ===================================================================
37597 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
37598 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_32.h 2008-04-02 12:34:02.000000000 +0200
37599 @@ -0,0 +1,193 @@
37600 +#ifndef _I386_MADDR_H
37601 +#define _I386_MADDR_H
37602 +
37603 +#include <xen/features.h>
37604 +#include <xen/interface/xen.h>
37605 +
37606 +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
37607 +#define INVALID_P2M_ENTRY (~0UL)
37608 +#define FOREIGN_FRAME_BIT (1UL<<31)
37609 +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
37610 +
37611 +/* Definitions for machine and pseudophysical addresses. */
37612 +#ifdef CONFIG_X86_PAE
37613 +typedef unsigned long long paddr_t;
37614 +typedef unsigned long long maddr_t;
37615 +#else
37616 +typedef unsigned long paddr_t;
37617 +typedef unsigned long maddr_t;
37618 +#endif
37619 +
37620 +#ifdef CONFIG_XEN
37621 +
37622 +extern unsigned long *phys_to_machine_mapping;
37623 +extern unsigned long max_mapnr;
37624 +
37625 +#undef machine_to_phys_mapping
37626 +extern unsigned long *machine_to_phys_mapping;
37627 +extern unsigned int machine_to_phys_order;
37628 +
37629 +static inline unsigned long pfn_to_mfn(unsigned long pfn)
37630 +{
37631 + if (xen_feature(XENFEAT_auto_translated_physmap))
37632 + return pfn;
37633 + BUG_ON(max_mapnr && pfn >= max_mapnr);
37634 + return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
37635 +}
37636 +
37637 +static inline int phys_to_machine_mapping_valid(unsigned long pfn)
37638 +{
37639 + if (xen_feature(XENFEAT_auto_translated_physmap))
37640 + return 1;
37641 + BUG_ON(max_mapnr && pfn >= max_mapnr);
37642 + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
37643 +}
37644 +
37645 +static inline unsigned long mfn_to_pfn(unsigned long mfn)
37646 +{
37647 + unsigned long pfn;
37648 +
37649 + if (xen_feature(XENFEAT_auto_translated_physmap))
37650 + return mfn;
37651 +
37652 + if (unlikely((mfn >> machine_to_phys_order) != 0))
37653 + return max_mapnr;
37654 +
37655 + /* The array access can fail (e.g., device space beyond end of RAM). */
37656 + asm (
37657 + "1: movl %1,%0\n"
37658 + "2:\n"
37659 + ".section .fixup,\"ax\"\n"
37660 + "3: movl %2,%0\n"
37661 + " jmp 2b\n"
37662 + ".previous\n"
37663 + ".section __ex_table,\"a\"\n"
37664 + " .align 4\n"
37665 + " .long 1b,3b\n"
37666 + ".previous"
37667 + : "=r" (pfn)
37668 + : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
37669 +
37670 + return pfn;
37671 +}
37672 +
37673 +/*
37674 + * We detect special mappings in one of two ways:
37675 + * 1. If the MFN is an I/O page then Xen will set the m2p entry
37676 + * to be outside our maximum possible pseudophys range.
37677 + * 2. If the MFN belongs to a different domain then we will certainly
37678 + * not have MFN in our p2m table. Conversely, if the page is ours,
37679 + * then we'll have p2m(m2p(MFN))==MFN.
37680 + * If we detect a special mapping then it doesn't have a 'struct page'.
37681 + * We force !pfn_valid() by returning an out-of-range pointer.
37682 + *
37683 + * NB. These checks require that, for any MFN that is not in our reservation,
37684 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
37685 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
37686 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
37687 + *
37688 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
37689 + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
37690 + * require. In all the cases we care about, the FOREIGN_FRAME bit is
37691 + * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
37692 + */
37693 +static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
37694 +{
37695 + unsigned long pfn = mfn_to_pfn(mfn);
37696 + if ((pfn < max_mapnr)
37697 + && !xen_feature(XENFEAT_auto_translated_physmap)
37698 + && (phys_to_machine_mapping[pfn] != mfn))
37699 + return max_mapnr; /* force !pfn_valid() */
37700 + return pfn;
37701 +}
37702 +
37703 +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
37704 +{
37705 + BUG_ON(max_mapnr && pfn >= max_mapnr);
37706 + if (xen_feature(XENFEAT_auto_translated_physmap)) {
37707 + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
37708 + return;
37709 + }
37710 + phys_to_machine_mapping[pfn] = mfn;
37711 +}
37712 +
37713 +static inline maddr_t phys_to_machine(paddr_t phys)
37714 +{
37715 + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
37716 + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
37717 + return machine;
37718 +}
37719 +
37720 +static inline paddr_t machine_to_phys(maddr_t machine)
37721 +{
37722 + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
37723 + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
37724 + return phys;
37725 +}
37726 +
37727 +#ifdef CONFIG_X86_PAE
37728 +static inline paddr_t pte_phys_to_machine(paddr_t phys)
37729 +{
37730 + /*
37731 + * In PAE mode, the NX bit needs to be dealt with in the value
37732 + * passed to pfn_to_mfn(). On x86_64, we need to mask it off,
37733 + * but for i386 the conversion to ulong for the argument will
37734 + * clip it off.
37735 + */
37736 + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
37737 + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK);
37738 + return machine;
37739 +}
37740 +
37741 +static inline paddr_t pte_machine_to_phys(maddr_t machine)
37742 +{
37743 + /*
37744 + * In PAE mode, the NX bit needs to be dealt with in the value
37745 + * passed to mfn_to_pfn(). On x86_64, we need to mask it off,
37746 + * but for i386 the conversion to ulong for the argument will
37747 + * clip it off.
37748 + */
37749 + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
37750 + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
37751 + return phys;
37752 +}
37753 +#endif
37754 +
37755 +#ifdef CONFIG_X86_PAE
37756 +#define __pte_ma(x) ((pte_t) { (x), (maddr_t)(x) >> 32 } )
37757 +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
37758 +{
37759 + pte_t pte;
37760 +
37761 + pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
37762 + (pgprot_val(pgprot) >> 32);
37763 + pte.pte_high &= (__supported_pte_mask >> 32);
37764 + pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
37765 + __supported_pte_mask;
37766 + return pte;
37767 +}
37768 +#else
37769 +#define __pte_ma(x) ((pte_t) { (x) } )
37770 +#define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
37771 +#endif
37772 +
37773 +#else /* !CONFIG_XEN */
37774 +
37775 +#define pfn_to_mfn(pfn) (pfn)
37776 +#define mfn_to_pfn(mfn) (mfn)
37777 +#define mfn_to_local_pfn(mfn) (mfn)
37778 +#define set_phys_to_machine(pfn, mfn) ((void)0)
37779 +#define phys_to_machine_mapping_valid(pfn) (1)
37780 +#define phys_to_machine(phys) ((maddr_t)(phys))
37781 +#define machine_to_phys(mach) ((paddr_t)(mach))
37782 +#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot)
37783 +#define __pte_ma(x) __pte(x)
37784 +
37785 +#endif /* !CONFIG_XEN */
37786 +
37787 +/* VIRT <-> MACHINE conversion */
37788 +#define virt_to_machine(v) (phys_to_machine(__pa(v)))
37789 +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
37790 +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
37791 +
37792 +#endif /* _I386_MADDR_H */
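
A minimal user-space sketch of the pseudo-physical to machine translation declared above: phys_to_machine() translates the page frame through the p2m table and keeps the in-page offset. The four-entry table and the constants are illustrative assumptions standing in for phys_to_machine_mapping.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

static unsigned long p2m[] = { 0x100, 0x2c7, 0x009, 0x513 };   /* pfn -> mfn */

static unsigned long pfn_to_mfn(unsigned long pfn)
{
        return p2m[pfn];               /* the real code also masks FOREIGN_FRAME_BIT */
}

static unsigned long phys_to_machine(unsigned long phys)
{
        unsigned long machine = pfn_to_mfn(phys >> PAGE_SHIFT);
        return (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);  /* keep in-page offset */
}

int main(void)
{
        unsigned long phys = (2UL << PAGE_SHIFT) | 0x123;      /* pfn 2, offset 0x123 */
        printf("phys 0x%lx -> machine 0x%lx\n", phys, phys_to_machine(phys));
        /* expected: machine frame 0x009 -> 0x9123 */
        return 0;
}
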
37793 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_32.h
37794 ===================================================================
37795 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
37796 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_32.h 2007-06-12 13:14:02.000000000 +0200
37797 @@ -0,0 +1,108 @@
37798 +#ifndef __I386_SCHED_H
37799 +#define __I386_SCHED_H
37800 +
37801 +#include <asm/desc.h>
37802 +#include <asm/atomic.h>
37803 +#include <asm/pgalloc.h>
37804 +#include <asm/tlbflush.h>
37805 +
37806 +/*
37807 + * Used for LDT copy/destruction.
37808 + */
37809 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
37810 +void destroy_context(struct mm_struct *mm);
37811 +
37812 +
37813 +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
37814 +{
37815 +#if 0 /* XEN: no lazy tlb */
37816 + unsigned cpu = smp_processor_id();
37817 + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
37818 + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
37819 +#endif
37820 +}
37821 +
37822 +#define prepare_arch_switch(next) __prepare_arch_switch()
37823 +
37824 +static inline void __prepare_arch_switch(void)
37825 +{
37826 + /*
37827 + * Save away %fs and %gs. No need to save %es and %ds, as those
37828 + * are always kernel segments while inside the kernel. Must
37829 + * happen before reload of cr3/ldt (i.e., not in __switch_to).
37830 + */
37831 + asm volatile ( "mov %%fs,%0 ; mov %%gs,%1"
37832 + : "=m" (current->thread.fs),
37833 + "=m" (current->thread.gs));
37834 + asm volatile ( "movl %0,%%fs ; movl %0,%%gs"
37835 + : : "r" (0) );
37836 +}
37837 +
37838 +extern void mm_pin(struct mm_struct *mm);
37839 +extern void mm_unpin(struct mm_struct *mm);
37840 +void mm_pin_all(void);
37841 +
37842 +static inline void switch_mm(struct mm_struct *prev,
37843 + struct mm_struct *next,
37844 + struct task_struct *tsk)
37845 +{
37846 + int cpu = smp_processor_id();
37847 + struct mmuext_op _op[2], *op = _op;
37848 +
37849 + if (likely(prev != next)) {
37850 + BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
37851 + !test_bit(PG_pinned, &virt_to_page(next->pgd)->flags));
37852 +
37853 + /* stop flush ipis for the previous mm */
37854 + cpu_clear(cpu, prev->cpu_vm_mask);
37855 +#if 0 /* XEN: no lazy tlb */
37856 + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
37857 + per_cpu(cpu_tlbstate, cpu).active_mm = next;
37858 +#endif
37859 + cpu_set(cpu, next->cpu_vm_mask);
37860 +
37861 + /* Re-load page tables: load_cr3(next->pgd) */
37862 + op->cmd = MMUEXT_NEW_BASEPTR;
37863 + op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
37864 + op++;
37865 +
37866 + /*
37867 + * load the LDT, if the LDT is different:
37868 + */
37869 + if (unlikely(prev->context.ldt != next->context.ldt)) {
37870 + /* load_LDT_nolock(&next->context, cpu) */
37871 + op->cmd = MMUEXT_SET_LDT;
37872 + op->arg1.linear_addr = (unsigned long)next->context.ldt;
37873 + op->arg2.nr_ents = next->context.size;
37874 + op++;
37875 + }
37876 +
37877 + BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
37878 + }
37879 +#if 0 /* XEN: no lazy tlb */
37880 + else {
37881 + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
37882 + BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
37883 +
37884 + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
37885 + /* We were in lazy tlb mode and leave_mm disabled
37886 + * tlb flush IPI delivery. We must reload %cr3.
37887 + */
37888 + load_cr3(next->pgd);
37889 + load_LDT_nolock(&next->context, cpu);
37890 + }
37891 + }
37892 +#endif
37893 +}
37894 +
37895 +#define deactivate_mm(tsk, mm) \
37896 + asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
37897 +
37898 +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
37899 +{
37900 + if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
37901 + mm_pin(next);
37902 + switch_mm(prev, next, NULL);
37903 +}
37904 +
37905 +#endif
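
A minimal user-space sketch of the op batching done in switch_mm() above: rather than issuing one hypercall per MMU change, the code fills a small array of operations (base pointer switch, optional LDT load) and submits them in a single call. The command numbers, struct layout and "hypervisor" stub are illustrative stand-ins for mmuext_op and HYPERVISOR_mmuext_op().

#include <stdio.h>

enum { NEW_BASEPTR = 1, SET_LDT = 2 };          /* models MMUEXT_* commands */

struct mmu_op {
        int cmd;
        unsigned long arg;
};

static int submit_mmu_ops(struct mmu_op *ops, int count)   /* models HYPERVISOR_mmuext_op() */
{
        for (int i = 0; i < count; i++)
                printf("op %d: cmd=%d arg=0x%lx\n", i, ops[i].cmd, ops[i].arg);
        return 0;
}

int main(void)
{
        struct mmu_op batch[2], *op = batch;
        int ldt_changed = 1;                    /* pretend the next mm has a new LDT */

        op->cmd = NEW_BASEPTR;                  /* re-load page tables (cr3 switch)  */
        op->arg = 0x9000;                       /* would be pfn_to_mfn(__pa(next->pgd)) */
        op++;

        if (ldt_changed) {
                op->cmd = SET_LDT;              /* load the new LDT in the same batch */
                op->arg = 0xb000;
                op++;
        }

        return submit_mmu_ops(batch, (int)(op - batch));    /* one submission for both */
}
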
37906 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pci_32.h
37907 ===================================================================
37908 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
37909 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pci_32.h 2007-09-14 11:14:51.000000000 +0200
37910 @@ -0,0 +1,148 @@
37911 +#ifndef __i386_PCI_H
37912 +#define __i386_PCI_H
37913 +
37914 +
37915 +#ifdef __KERNEL__
37916 +#include <linux/mm.h> /* for struct page */
37917 +
37918 +/* Can be used to override the logic in pci_scan_bus for skipping
37919 + already-configured bus numbers - to be used for buggy BIOSes
37920 + or architectures with incomplete PCI setup by the loader */
37921 +
37922 +#ifdef CONFIG_PCI
37923 +extern unsigned int pcibios_assign_all_busses(void);
37924 +#else
37925 +#define pcibios_assign_all_busses() 0
37926 +#endif
37927 +
37928 +#include <asm/hypervisor.h>
37929 +#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain())
37930 +
37931 +extern unsigned long pci_mem_start;
37932 +#define PCIBIOS_MIN_IO 0x1000
37933 +#define PCIBIOS_MIN_MEM (pci_mem_start)
37934 +
37935 +#define PCIBIOS_MIN_CARDBUS_IO 0x4000
37936 +
37937 +void pcibios_config_init(void);
37938 +struct pci_bus * pcibios_scan_root(int bus);
37939 +
37940 +void pcibios_set_master(struct pci_dev *dev);
37941 +void pcibios_penalize_isa_irq(int irq, int active);
37942 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
37943 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
37944 +
37945 +/* Dynamic DMA mapping stuff.
37946 + * i386 has everything mapped statically.
37947 + */
37948 +
37949 +#include <linux/types.h>
37950 +#include <linux/slab.h>
37951 +#include <asm/scatterlist.h>
37952 +#include <linux/string.h>
37953 +#include <asm/io.h>
37954 +
37955 +struct pci_dev;
37956 +
37957 +#ifdef CONFIG_SWIOTLB
37958 +
37959 +
37960 +/* On Xen we use SWIOTLB instead of blk-specific bounce buffers. */
37961 +#define PCI_DMA_BUS_IS_PHYS (0)
37962 +
37963 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
37964 + dma_addr_t ADDR_NAME;
37965 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
37966 + __u32 LEN_NAME;
37967 +#define pci_unmap_addr(PTR, ADDR_NAME) \
37968 + ((PTR)->ADDR_NAME)
37969 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
37970 + (((PTR)->ADDR_NAME) = (VAL))
37971 +#define pci_unmap_len(PTR, LEN_NAME) \
37972 + ((PTR)->LEN_NAME)
37973 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
37974 + (((PTR)->LEN_NAME) = (VAL))
37975 +
37976 +#else
37977 +
37978 +/* The PCI address space does equal the physical memory
37979 + * address space. The networking and block device layers use
37980 + * this boolean for bounce buffer decisions.
37981 + */
37982 +#define PCI_DMA_BUS_IS_PHYS (1)
37983 +
37984 +/* pci_unmap_{page,single} is a nop so... */
37985 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
37986 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
37987 +#define pci_unmap_addr(PTR, ADDR_NAME) (0)
37988 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
37989 +#define pci_unmap_len(PTR, LEN_NAME) (0)
37990 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
37991 +
37992 +#endif
37993 +
37994 +/* This is always fine. */
37995 +#define pci_dac_dma_supported(pci_dev, mask) (1)
37996 +
37997 +static inline dma64_addr_t
37998 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
37999 +{
38000 + return ((dma64_addr_t) page_to_phys(page) +
38001 + (dma64_addr_t) offset);
38002 +}
38003 +
38004 +static inline struct page *
38005 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
38006 +{
38007 + return pfn_to_page(dma_addr >> PAGE_SHIFT);
38008 +}
38009 +
38010 +static inline unsigned long
38011 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
38012 +{
38013 + return (dma_addr & ~PAGE_MASK);
38014 +}
38015 +
38016 +static inline void
38017 +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
38018 +{
38019 +}
38020 +
38021 +static inline void
38022 +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
38023 +{
38024 + flush_write_buffers();
38025 +}
38026 +
38027 +#define HAVE_PCI_MMAP
38028 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
38029 + enum pci_mmap_state mmap_state, int write_combine);
38030 +
38031 +
38032 +static inline void pcibios_add_platform_entries(struct pci_dev *dev)
38033 +{
38034 +}
38035 +
38036 +#ifdef CONFIG_PCI
38037 +static inline void pci_dma_burst_advice(struct pci_dev *pdev,
38038 + enum pci_dma_burst_strategy *strat,
38039 + unsigned long *strategy_parameter)
38040 +{
38041 + *strat = PCI_DMA_BURST_INFINITY;
38042 + *strategy_parameter = ~0UL;
38043 +}
38044 +#endif
38045 +
38046 +#endif /* __KERNEL__ */
38047 +
38048 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
38049 +#include <xen/pcifront.h>
38050 +#endif /* CONFIG_XEN_PCIDEV_FRONTEND */
38051 +
38052 +/* implement the pci_ DMA API in terms of the generic device dma_ one */
38053 +#include <asm-generic/pci-dma-compat.h>
38054 +
38055 +/* generic pci stuff */
38056 +#include <asm-generic/pci.h>
38057 +
38058 +#endif /* __i386_PCI_H */
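
A minimal sketch of the DECLARE_PCI_UNMAP_* pattern used above: when unmapping is a real operation (the SWIOTLB case), the macros declare storage for the DMA handle; when unmapping is a no-op, they expand to nothing and the driver pays no space cost. The struct, field names and values below are illustrative assumptions.

#include <stdio.h>

#define NEED_UNMAP_STATE 1                      /* flip to 0 to model the no-op case */

#if NEED_UNMAP_STATE
#define DECLARE_UNMAP_ADDR(name)       unsigned long long name;
#define unmap_addr_set(ptr, name, val) ((ptr)->name = (val))
#define unmap_addr(ptr, name)          ((ptr)->name)
#else
#define DECLARE_UNMAP_ADDR(name)
#define unmap_addr_set(ptr, name, val) do { } while (0)
#define unmap_addr(ptr, name)          (0)
#endif

struct rx_buffer {
        void *data;
        DECLARE_UNMAP_ADDR(dma)                 /* present only when needed */
};

int main(void)
{
        struct rx_buffer buf = { 0 };
        unmap_addr_set(&buf, dma, 0x12345678ULL);
        printf("stored dma handle: 0x%llx\n", (unsigned long long)unmap_addr(&buf, dma));
        return 0;
}
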
38059 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_32.h
38060 ===================================================================
38061 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
38062 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_32.h 2008-07-21 11:00:33.000000000 +0200
38063 @@ -0,0 +1,59 @@
38064 +#ifndef _I386_PGALLOC_H
38065 +#define _I386_PGALLOC_H
38066 +
38067 +#include <asm/fixmap.h>
38068 +#include <linux/threads.h>
38069 +#include <linux/mm.h> /* for struct page */
38070 +#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
38071 +
38072 +#define pmd_populate_kernel(mm, pmd, pte) \
38073 + set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
38074 +
38075 +#define pmd_populate(mm, pmd, pte) \
38076 +do { \
38077 + unsigned long pfn = page_to_pfn(pte); \
38078 + if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) { \
38079 + if (!PageHighMem(pte)) \
38080 + BUG_ON(HYPERVISOR_update_va_mapping( \
38081 + (unsigned long)__va(pfn << PAGE_SHIFT), \
38082 + pfn_pte(pfn, PAGE_KERNEL_RO), 0)); \
38083 + else if (!test_and_set_bit(PG_pinned, &pte->flags)) \
38084 + kmap_flush_unused(); \
38085 + set_pmd(pmd, \
38086 + __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT))); \
38087 + } else \
38088 + *(pmd) = __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT)); \
38089 +} while (0)
38090 +
38091 +/*
38092 + * Allocate and free page tables.
38093 + */
38094 +extern pgd_t *pgd_alloc(struct mm_struct *);
38095 +extern void pgd_free(pgd_t *pgd);
38096 +
38097 +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
38098 +extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
38099 +
38100 +static inline void pte_free_kernel(pte_t *pte)
38101 +{
38102 + make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
38103 + free_page((unsigned long)pte);
38104 +}
38105 +
38106 +extern void pte_free(struct page *pte);
38107 +
38108 +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
38109 +
38110 +#ifdef CONFIG_X86_PAE
38111 +/*
38112 + * In the PAE case we free the pmds as part of the pgd.
38113 + */
38114 +#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); })
38115 +#define pmd_free(x) do { } while (0)
38116 +#define __pmd_free_tlb(tlb,x) do { } while (0)
38117 +#define pud_populate(mm, pmd, pte) BUG()
38118 +#endif
38119 +
38120 +#define check_pgt_cache() do { } while (0)
38121 +
38122 +#endif /* _I386_PGALLOC_H */
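
A minimal user-space sketch of the rule that pmd_populate() enforces above: once an address space is pinned, any page table page hooked into it must first be downgraded to read-only, and only then linked into the pmd. The flag values, arrays and helper names are illustrative stand-ins for the hypervisor mechanisms (HYPERVISOR_update_va_mapping and set_pmd).

#include <stdio.h>

#define NPAGES 8
enum { RW = 0, RO = 1 };

static int page_prot[NPAGES];                   /* models the kernel mapping of each page */
static int pmd[NPAGES];                         /* fake pmd: slot -> pte page number      */

static void pmd_populate(int pinned, int slot, int pte_page)
{
        if (pinned)
                page_prot[pte_page] = RO;       /* models the PAGE_KERNEL_RO remapping */
        pmd[slot] = pte_page;                   /* models set_pmd(pmd, __pmd(...))     */
}

int main(void)
{
        pmd_populate(0, 0, 3);                  /* unpinned mm: pte page 3 stays writable  */
        pmd_populate(1, 1, 5);                  /* pinned mm: pte page 5 becomes read-only */
        printf("page 3: %s, page 5: %s\n",
               page_prot[3] == RO ? "RO" : "RW",
               page_prot[5] == RO ? "RO" : "RW");
        return 0;
}
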
38123 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h
38124 ===================================================================
38125 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
38126 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h 2007-06-12 13:14:02.000000000 +0200
38127 @@ -0,0 +1,24 @@
38128 +#ifndef _I386_PGTABLE_3LEVEL_DEFS_H
38129 +#define _I386_PGTABLE_3LEVEL_DEFS_H
38130 +
38131 +#define HAVE_SHARED_KERNEL_PMD 0
38132 +
38133 +/*
38134 + * PGDIR_SHIFT determines what a top-level page table entry can map
38135 + */
38136 +#define PGDIR_SHIFT 30
38137 +#define PTRS_PER_PGD 4
38138 +
38139 +/*
38140 + * PMD_SHIFT determines the size of the area a middle-level
38141 + * page table can map
38142 + */
38143 +#define PMD_SHIFT 21
38144 +#define PTRS_PER_PMD 512
38145 +
38146 +/*
38147 + * entries per page directory level
38148 + */
38149 +#define PTRS_PER_PTE 512
38150 +
38151 +#endif /* _I386_PGTABLE_3LEVEL_DEFS_H */
38152 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level.h
38153 ===================================================================
38154 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
38155 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable-3level.h 2008-04-02 12:34:02.000000000 +0200
38156 @@ -0,0 +1,211 @@
38157 +#ifndef _I386_PGTABLE_3LEVEL_H
38158 +#define _I386_PGTABLE_3LEVEL_H
38159 +
38160 +#include <asm-generic/pgtable-nopud.h>
38161 +
38162 +/*
38163 + * Intel Physical Address Extension (PAE) Mode - three-level page
38164 + * tables on PPro+ CPUs.
38165 + *
38166 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
38167 + */
38168 +
38169 +#define pte_ERROR(e) \
38170 + printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", __FILE__, __LINE__, \
38171 + &(e), __pte_val(e), pte_pfn(e))
38172 +#define pmd_ERROR(e) \
38173 + printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
38174 + &(e), __pmd_val(e), (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
38175 +#define pgd_ERROR(e) \
38176 + printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
38177 + &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
38178 +
38179 +#define pud_none(pud) 0
38180 +#define pud_bad(pud) 0
38181 +#define pud_present(pud) 1
38182 +
38183 +/*
38184 + * Is the pte executable?
38185 + */
38186 +static inline int pte_x(pte_t pte)
38187 +{
38188 + return !(__pte_val(pte) & _PAGE_NX);
38189 +}
38190 +
38191 +/*
38192 + * All present user-pages with !NX bit are user-executable:
38193 + */
38194 +static inline int pte_exec(pte_t pte)
38195 +{
38196 + return pte_user(pte) && pte_x(pte);
38197 +}
38198 +/*
38199 + * All present pages with !NX bit are kernel-executable:
38200 + */
38201 +static inline int pte_exec_kernel(pte_t pte)
38202 +{
38203 + return pte_x(pte);
38204 +}
38205 +
38206 +/* Rules for using set_pte: the pte being assigned *must* be
38207 + * either not present or in a state where the hardware will
38208 + * not attempt to update the pte. In places where this is
38209 + * not possible, use pte_get_and_clear to obtain the old pte
38210 + * value and then use set_pte to update it. -ben
38211 + */
38212 +#define __HAVE_ARCH_SET_PTE_ATOMIC
38213 +
38214 +static inline void set_pte(pte_t *ptep, pte_t pte)
38215 +{
38216 + ptep->pte_high = pte.pte_high;
38217 + smp_wmb();
38218 + ptep->pte_low = pte.pte_low;
38219 +}
38220 +#define set_pte_atomic(pteptr,pteval) \
38221 + set_64bit((unsigned long long *)(pteptr),__pte_val(pteval))
38222 +
38223 +#define set_pte_at(_mm,addr,ptep,pteval) do { \
38224 + if (((_mm) != current->mm && (_mm) != &init_mm) || \
38225 + HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
38226 + set_pte((ptep), (pteval)); \
38227 +} while (0)
38228 +
38229 +#define set_pte_at_sync(_mm,addr,ptep,pteval) do { \
38230 + if (((_mm) != current->mm && (_mm) != &init_mm) || \
38231 + HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
38232 + set_pte((ptep), (pteval)); \
38233 + xen_invlpg((addr)); \
38234 + } \
38235 +} while (0)
38236 +
38237 +#define set_pmd(pmdptr,pmdval) \
38238 + xen_l2_entry_update((pmdptr), (pmdval))
38239 +#define set_pud(pudptr,pudval) \
38240 + xen_l3_entry_update((pudptr), (pudval))
38241 +
38242 +/*
38243 + * Pentium-II erratum A13: in PAE mode we explicitly have to flush
38244 + * the TLB via cr3 if the top-level pgd is changed...
38245 + * We do not let the generic code free and clear pgd entries due to
38246 + * this erratum.
38247 + */
38248 +static inline void pud_clear (pud_t * pud) { }
38249 +
38250 +#define pud_page(pud) \
38251 +((struct page *) __va(pud_val(pud) & PAGE_MASK))
38252 +
38253 +#define pud_page_kernel(pud) \
38254 +((unsigned long) __va(pud_val(pud) & PAGE_MASK))
38255 +
38256 +
38257 +/* Find an entry in the second-level page table.. */
38258 +#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
38259 + pmd_index(address))
38260 +
38261 +static inline int pte_none(pte_t pte)
38262 +{
38263 + return !(pte.pte_low | pte.pte_high);
38264 +}
38265 +
38266 +/*
38267 + * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
38268 + * entry, so clear the bottom half first and enforce ordering with a compiler
38269 + * barrier.
38270 + */
38271 +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
38272 +{
38273 + if ((mm != current->mm && mm != &init_mm)
38274 + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
38275 + ptep->pte_low = 0;
38276 + smp_wmb();
38277 + ptep->pte_high = 0;
38278 + }
38279 +}
38280 +
38281 +#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
38282 +
38283 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
38284 +{
38285 + pte_t pte = *ptep;
38286 + if (!pte_none(pte)) {
38287 + if ((mm != &init_mm) ||
38288 + HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
38289 + uint64_t val = __pte_val(pte);
38290 + if (__cmpxchg64(ptep, val, 0) != val) {
38291 + /* xchg acts as a barrier before the setting of the high bits */
38292 + pte.pte_low = xchg(&ptep->pte_low, 0);
38293 + pte.pte_high = ptep->pte_high;
38294 + ptep->pte_high = 0;
38295 + }
38296 + }
38297 + }
38298 + return pte;
38299 +}
38300 +
38301 +#define ptep_clear_flush(vma, addr, ptep) \
38302 +({ \
38303 + pte_t *__ptep = (ptep); \
38304 + pte_t __res = *__ptep; \
38305 + if (!pte_none(__res) && \
38306 + ((vma)->vm_mm != current->mm || \
38307 + HYPERVISOR_update_va_mapping(addr, __pte(0), \
38308 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
38309 + UVMF_INVLPG|UVMF_MULTI))) { \
38310 + __ptep->pte_low = 0; \
38311 + smp_wmb(); \
38312 + __ptep->pte_high = 0; \
38313 + flush_tlb_page(vma, addr); \
38314 + } \
38315 + __res; \
38316 +})
38317 +
38318 +static inline int pte_same(pte_t a, pte_t b)
38319 +{
38320 + return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
38321 +}
38322 +
38323 +#define pte_page(x) pfn_to_page(pte_pfn(x))
38324 +
38325 +#define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \
38326 + ((_pte).pte_high << (32-PAGE_SHIFT)))
38327 +#define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \
38328 + __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
38329 +#define pte_pfn(_pte) ((_pte).pte_low & _PAGE_IO ? max_mapnr : \
38330 + (_pte).pte_low & _PAGE_PRESENT ? \
38331 + mfn_to_local_pfn(__pte_mfn(_pte)) : \
38332 + __pte_mfn(_pte))
38333 +
38334 +extern unsigned long long __supported_pte_mask;
38335 +
38336 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
38337 +{
38338 + return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
38339 + pgprot_val(pgprot)) & __supported_pte_mask);
38340 +}
38341 +
38342 +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
38343 +{
38344 + return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
38345 + pgprot_val(pgprot)) & __supported_pte_mask);
38346 +}
38347 +
38348 +/*
38349 + * Bits 0, 6 and 7 are taken in the low part of the pte,
38350 + * put the 32 bits of offset into the high part.
38351 + */
38352 +#define pte_to_pgoff(pte) ((pte).pte_high)
38353 +#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
38354 +#define PTE_FILE_MAX_BITS 32
38355 +
38356 +/* Encode and de-code a swap entry */
38357 +#define __swp_type(x) (((x).val) & 0x1f)
38358 +#define __swp_offset(x) ((x).val >> 5)
38359 +#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
38360 +#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
38361 +#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val })
38362 +
38363 +#define __pmd_free_tlb(tlb, x) do { } while (0)
38364 +
38365 +void vmalloc_sync_all(void);
38366 +
38367 +#endif /* _I386_PGTABLE_3LEVEL_H */
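
A minimal user-space sketch of the two-word PAE pte update ordering used by set_pte()/pte_clear() above: the low word carries the present bit, so a set writes high-then-low and a clear writes low-then-high, with a barrier in between. The struct and the GCC-style barrier() macro are illustrative stand-ins (on UP x86 the write barrier is only a compiler barrier).

#include <stdio.h>

#define barrier() __asm__ __volatile__("" ::: "memory")     /* compiler barrier */

typedef struct { unsigned int pte_low, pte_high; } pte64_t;

static void set_pte(pte64_t *ptep, pte64_t pte)
{
        ptep->pte_high = pte.pte_high;          /* upper half first: not yet present */
        barrier();
        ptep->pte_low = pte.pte_low;            /* present bit becomes visible last  */
}

static void clear_pte(pte64_t *ptep)
{
        ptep->pte_low = 0;                      /* drop the present bit first */
        barrier();
        ptep->pte_high = 0;
}

int main(void)
{
        pte64_t slot = { 0, 0 };
        pte64_t val = { 0x1067 /* low: flags incl. present */, 0x1 /* high: pfn bits */ };

        set_pte(&slot, val);
        printf("pte = %08x:%08x\n", slot.pte_high, slot.pte_low);
        clear_pte(&slot);
        printf("pte = %08x:%08x\n", slot.pte_high, slot.pte_low);
        return 0;
}
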
38368 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_32.h
38369 ===================================================================
38370 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
38371 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_32.h 2008-07-21 11:00:33.000000000 +0200
38372 @@ -0,0 +1,537 @@
38373 +#ifndef _I386_PGTABLE_H
38374 +#define _I386_PGTABLE_H
38375 +
38376 +#include <asm/hypervisor.h>
38377 +
38378 +/*
38379 + * The Linux memory management assumes a three-level page table setup. On
38380 + * the i386, we use that, but "fold" the mid level into the top-level page
38381 + * table, so that we physically have the same two-level page table as the
38382 + * i386 mmu expects.
38383 + *
38384 + * This file contains the functions and defines necessary to modify and use
38385 + * the i386 page table tree.
38386 + */
38387 +#ifndef __ASSEMBLY__
38388 +#include <asm/processor.h>
38389 +#include <asm/fixmap.h>
38390 +#include <linux/threads.h>
38391 +
38392 +#ifndef _I386_BITOPS_H
38393 +#include <asm/bitops.h>
38394 +#endif
38395 +
38396 +#include <linux/slab.h>
38397 +#include <linux/list.h>
38398 +#include <linux/spinlock.h>
38399 +
38400 +/* Is this pagetable pinned? */
38401 +#define PG_pinned PG_arch_1
38402 +
38403 +struct mm_struct;
38404 +struct vm_area_struct;
38405 +
38406 +/*
38407 + * ZERO_PAGE is a global shared page that is always zero: used
38408 + * for zero-mapped memory areas etc..
38409 + */
38410 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
38411 +extern unsigned long empty_zero_page[1024];
38412 +extern pgd_t *swapper_pg_dir;
38413 +extern kmem_cache_t *pgd_cache;
38414 +extern kmem_cache_t *pmd_cache;
38415 +extern spinlock_t pgd_lock;
38416 +extern struct page *pgd_list;
38417 +
38418 +void pmd_ctor(void *, kmem_cache_t *, unsigned long);
38419 +void pgd_ctor(void *, kmem_cache_t *, unsigned long);
38420 +void pgd_dtor(void *, kmem_cache_t *, unsigned long);
38421 +void pgtable_cache_init(void);
38422 +void paging_init(void);
38423 +
38424 +/*
38425 + * The Linux x86 paging architecture is 'compile-time dual-mode', it
38426 + * implements both the traditional 2-level x86 page tables and the
38427 + * newer 3-level PAE-mode page tables.
38428 + */
38429 +#ifdef CONFIG_X86_PAE
38430 +# include <asm/pgtable-3level-defs.h>
38431 +# define PMD_SIZE (1UL << PMD_SHIFT)
38432 +# define PMD_MASK (~(PMD_SIZE-1))
38433 +#else
38434 +# include <asm/pgtable-2level-defs.h>
38435 +#endif
38436 +
38437 +#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
38438 +#define PGDIR_MASK (~(PGDIR_SIZE-1))
38439 +
38440 +#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
38441 +#define FIRST_USER_ADDRESS 0
38442 +
38443 +#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
38444 +#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
38445 +
38446 +#define TWOLEVEL_PGDIR_SHIFT 22
38447 +#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
38448 +#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
38449 +
38450 +/* Just any arbitrary offset to the start of the vmalloc VM area: the
38451 + * current 8MB value just means that there will be an 8MB "hole" after the
38452 + * physical memory until the kernel virtual memory starts. That means that
38453 + * any out-of-bounds memory accesses will hopefully be caught.
38454 + * The vmalloc() routines leave a hole of 4kB between each vmalloced
38455 + * area for the same reason. ;)
38456 + */
38457 +#define VMALLOC_OFFSET (8*1024*1024)
38458 +#define VMALLOC_START (((unsigned long) high_memory + vmalloc_earlyreserve + \
38459 + 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
38460 +#ifdef CONFIG_HIGHMEM
38461 +# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
38462 +#else
38463 +# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
38464 +#endif
38465 +
38466 +/*
38467 + * _PAGE_PSE set in the page directory entry just means that
38468 + * the page directory entry points directly to a 4MB-aligned block of
38469 + * memory.
38470 + */
38471 +#define _PAGE_BIT_PRESENT 0
38472 +#define _PAGE_BIT_RW 1
38473 +#define _PAGE_BIT_USER 2
38474 +#define _PAGE_BIT_PWT 3
38475 +#define _PAGE_BIT_PCD 4
38476 +#define _PAGE_BIT_ACCESSED 5
38477 +#define _PAGE_BIT_DIRTY 6
38478 +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */
38479 +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
38480 +/*#define _PAGE_BIT_UNUSED1 9*/ /* available for programmer */
38481 +#define _PAGE_BIT_UNUSED2 10
38482 +#define _PAGE_BIT_UNUSED3 11
38483 +#define _PAGE_BIT_NX 63
38484 +
38485 +#define _PAGE_PRESENT 0x001
38486 +#define _PAGE_RW 0x002
38487 +#define _PAGE_USER 0x004
38488 +#define _PAGE_PWT 0x008
38489 +#define _PAGE_PCD 0x010
38490 +#define _PAGE_ACCESSED 0x020
38491 +#define _PAGE_DIRTY 0x040
38492 +#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. */
38493 +#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */
38494 +/*#define _PAGE_UNUSED1 0x200*/ /* available for programmer */
38495 +#define _PAGE_UNUSED2 0x400
38496 +#define _PAGE_UNUSED3 0x800
38497 +
38498 +/* If _PAGE_PRESENT is clear, we use these: */
38499 +#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
38500 +#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE;
38501 + pte_present gives true */
38502 +#ifdef CONFIG_X86_PAE
38503 +#define _PAGE_NX (1ULL<<_PAGE_BIT_NX)
38504 +#else
38505 +#define _PAGE_NX 0
38506 +#endif
38507 +
38508 +/* Mapped page is I/O or foreign and has no associated page struct. */
38509 +#define _PAGE_IO 0x200
38510 +
38511 +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
38512 +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
38513 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
38514 +
38515 +#define PAGE_NONE \
38516 + __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
38517 +#define PAGE_SHARED \
38518 + __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
38519 +
38520 +#define PAGE_SHARED_EXEC \
38521 + __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
38522 +#define PAGE_COPY_NOEXEC \
38523 + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
38524 +#define PAGE_COPY_EXEC \
38525 + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
38526 +#define PAGE_COPY \
38527 + PAGE_COPY_NOEXEC
38528 +#define PAGE_READONLY \
38529 + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
38530 +#define PAGE_READONLY_EXEC \
38531 + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
38532 +
38533 +#define _PAGE_KERNEL \
38534 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
38535 +#define _PAGE_KERNEL_EXEC \
38536 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
38537 +
38538 +extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
38539 +#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
38540 +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
38541 +#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
38542 +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
38543 +
38544 +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
38545 +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
38546 +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
38547 +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
38548 +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
38549 +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
38550 +
38551 +/*
38552 + * The i386 can't do page protection for execute, and treats it
38553 + * the same as read. Also, write permissions imply read permissions.
38554 + * This is the closest we can get.
38555 + */
38556 +#define __P000 PAGE_NONE
38557 +#define __P001 PAGE_READONLY
38558 +#define __P010 PAGE_COPY
38559 +#define __P011 PAGE_COPY
38560 +#define __P100 PAGE_READONLY_EXEC
38561 +#define __P101 PAGE_READONLY_EXEC
38562 +#define __P110 PAGE_COPY_EXEC
38563 +#define __P111 PAGE_COPY_EXEC
38564 +
38565 +#define __S000 PAGE_NONE
38566 +#define __S001 PAGE_READONLY
38567 +#define __S010 PAGE_SHARED
38568 +#define __S011 PAGE_SHARED
38569 +#define __S100 PAGE_READONLY_EXEC
38570 +#define __S101 PAGE_READONLY_EXEC
38571 +#define __S110 PAGE_SHARED_EXEC
38572 +#define __S111 PAGE_SHARED_EXEC
38573 +
38574 +/*
38575 + * Define this if things work differently on an i386 and an i486:
38576 + * it will (on an i486) warn about kernel memory accesses that are
38577 + * done without an 'access_ok(VERIFY_WRITE,..)'
38578 + */
38579 +#undef TEST_ACCESS_OK
38580 +
38581 +/* The boot page tables (all created as a single array) */
38582 +extern unsigned long pg0[];
38583 +
38584 +#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
38585 +
38586 +/* To avoid harmful races, pmd_none(x) should check only the lower word when PAE is enabled */
38587 +#define pmd_none(x) (!(unsigned long)__pmd_val(x))
38588 +#if CONFIG_XEN_COMPAT <= 0x030002
38589 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since writable
38590 + page tables (wr.p.t.) can temporarily clear it. */
38591 +#define pmd_present(x) (__pmd_val(x))
38592 +#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
38593 +#else
38594 +#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
38595 +#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
38596 +#endif
38597 +
38598 +
38599 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
38600 +
38601 +/*
38602 + * The following only work if pte_present() is true.
38603 + * Undefined behaviour if not..
38604 + */
38605 +static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
38606 +static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
38607 +static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; }
38608 +static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; }
38609 +static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; }
38610 +static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; }
38611 +
38612 +/*
38613 + * The following only works if pte_present() is not true.
38614 + */
38615 +static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; }
38616 +
38617 +static inline pte_t pte_rdprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; }
38618 +static inline pte_t pte_exprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; }
38619 +static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
38620 +static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
38621 +static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; }
38622 +static inline pte_t pte_mkread(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; }
38623 +static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; }
38624 +static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; }
38625 +static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
38626 +static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
38627 +static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; }
38628 +
38629 +#ifdef CONFIG_X86_PAE
38630 +# include <asm/pgtable-3level.h>
38631 +#else
38632 +# include <asm/pgtable-2level.h>
38633 +#endif
38634 +
38635 +#define ptep_test_and_clear_dirty(vma, addr, ptep) \
38636 +({ \
38637 + pte_t __pte = *(ptep); \
38638 + int __ret = pte_dirty(__pte); \
38639 + if (__ret) { \
38640 + __pte = pte_mkclean(__pte); \
38641 + if ((vma)->vm_mm != current->mm || \
38642 + HYPERVISOR_update_va_mapping(addr, __pte, 0)) \
38643 + (ptep)->pte_low = __pte.pte_low; \
38644 + } \
38645 + __ret; \
38646 +})
38647 +
38648 +#define ptep_test_and_clear_young(vma, addr, ptep) \
38649 +({ \
38650 + pte_t __pte = *(ptep); \
38651 + int __ret = pte_young(__pte); \
38652 + if (__ret) \
38653 + __pte = pte_mkold(__pte); \
38654 + if ((vma)->vm_mm != current->mm || \
38655 + HYPERVISOR_update_va_mapping(addr, __pte, 0)) \
38656 + (ptep)->pte_low = __pte.pte_low; \
38657 + __ret; \
38658 +})
38659 +
38660 +#define ptep_get_and_clear_full(mm, addr, ptep, full) \
38661 + ((full) ? ({ \
38662 + pte_t __res = *(ptep); \
38663 + if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) \
38664 + xen_l1_entry_update(ptep, __pte(0)); \
38665 + else \
38666 + *(ptep) = __pte(0); \
38667 + __res; \
38668 + }) : \
38669 + ptep_get_and_clear(mm, addr, ptep))
38670 +
38671 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
38672 +{
38673 + pte_t pte = *ptep;
38674 + if (pte_write(pte))
38675 + set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
38676 +}
38677 +
38678 +/*
38679 + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
38680 + *
38681 + * dst - pointer to pgd range anywhere on a pgd page
38682 + * src - ""
38683 + * count - the number of pgds to copy.
38684 + *
38685 + * dst and src can be on the same page, but the range must not overlap,
38686 + * and must not cross a page boundary.
38687 + */
38688 +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
38689 +{
38690 + memcpy(dst, src, count * sizeof(pgd_t));
38691 +}
38692 +
38693 +/*
38694 + * Macro to mark a page protection value as "uncacheable". On processors which do not support
38695 + * it, this is a no-op.
38696 + */
38697 +#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \
38698 + ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))
38699 +
38700 +/*
38701 + * Conversion functions: convert a page and protection to a page entry,
38702 + * and a page entry and page directory to the page they refer to.
38703 + */
38704 +
38705 +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
38706 +
38707 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
38708 +{
38709 + /*
38710 + * Since this might change the present bit (which controls whether
38711 + * a pte_t object has undergone p2m translation), we must use
38712 + * pte_val() on the input pte and __pte() for the return value.
38713 + */
38714 + paddr_t pteval = pte_val(pte);
38715 +
38716 + pteval &= _PAGE_CHG_MASK;
38717 + pteval |= pgprot_val(newprot);
38718 +#ifdef CONFIG_X86_PAE
38719 + pteval &= __supported_pte_mask;
38720 +#endif
38721 + return __pte(pteval);
38722 +}
38723 +
38724 +#define pmd_large(pmd) \
38725 +((__pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
38726 +
38727 +/*
38728 + * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
38729 + *
38730 + * this macro returns the index of the entry in the pgd page which would
38731 + * control the given virtual address
38732 + */
38733 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
38734 +#define pgd_index_k(addr) pgd_index(addr)
38735 +
38736 +/*
38737 + * pgd_offset() returns a (pgd_t *)
38738 + * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
38739 + */
38740 +#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
38741 +
38742 +/*
38743 + * a shortcut which implies the use of the kernel's pgd, instead
38744 + * of a process's
38745 + */
38746 +#define pgd_offset_k(address) pgd_offset(&init_mm, address)
38747 +
38748 +/*
38749 + * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
38750 + *
38751 + * this macro returns the index of the entry in the pmd page which would
38752 + * control the given virtual address
38753 + */
38754 +#define pmd_index(address) \
38755 + (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
38756 +
38757 +/*
38758 + * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
38759 + *
38760 + * this macro returns the index of the entry in the pte page which would
38761 + * control the given virtual address
38762 + */
38763 +#define pte_index(address) \
38764 + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
38765 +#define pte_offset_kernel(dir, address) \
38766 + ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address))
38767 +
38768 +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
38769 +
38770 +#define pmd_page_kernel(pmd) \
38771 + ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
38772 +
38773 +/*
38774 + * Helper function that returns the kernel pagetable entry controlling
38775 + * the virtual address 'address'. NULL means no pagetable entry present.
38776 + * NOTE: the return type is pte_t but if the pmd is PSE then we return it
38777 + * as a pte too.
38778 + */
38779 +extern pte_t *lookup_address(unsigned long address);
38780 +
38781 +/*
38782 + * Make a given kernel text page executable/non-executable.
38783 + * Returns the previous executability setting of that page (which
38784 + * is used to restore the previous state). Used by the SMP bootup code.
38785 + * NOTE: this is an __init function for security reasons.
38786 + */
38787 +#ifdef CONFIG_X86_PAE
38788 + extern int set_kernel_exec(unsigned long vaddr, int enable);
38789 +#else
38790 + static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
38791 +#endif
38792 +
38793 +extern void noexec_setup(const char *str);
38794 +
38795 +#if defined(CONFIG_HIGHPTE)
38796 +#define pte_offset_map(dir, address) \
38797 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \
38798 + pte_index(address))
38799 +#define pte_offset_map_nested(dir, address) \
38800 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + \
38801 + pte_index(address))
38802 +#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
38803 +#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
38804 +#else
38805 +#define pte_offset_map(dir, address) \
38806 + ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
38807 +#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
38808 +#define pte_unmap(pte) do { } while (0)
38809 +#define pte_unmap_nested(pte) do { } while (0)
38810 +#endif
38811 +
38812 +#define __HAVE_ARCH_PTEP_ESTABLISH
38813 +#define ptep_establish(vma, address, ptep, pteval) \
38814 + do { \
38815 + if ( likely((vma)->vm_mm == current->mm) ) { \
38816 + BUG_ON(HYPERVISOR_update_va_mapping(address, \
38817 + pteval, \
38818 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
38819 + UVMF_INVLPG|UVMF_MULTI)); \
38820 + } else { \
38821 + xen_l1_entry_update(ptep, pteval); \
38822 + flush_tlb_page(vma, address); \
38823 + } \
38824 + } while (0)
38825 +
38826 +/*
38827 + * The i386 doesn't have any external MMU info: the kernel page
38828 + * tables contain all the necessary information.
38829 + *
38830 + * Also, we only update the dirty/accessed state if we set
38831 + * the dirty bit by hand in the kernel, since the hardware
38832 + * will do the accessed bit for us, and we don't want to
38833 + * race with other CPU's that might be updating the dirty
38834 + * bit at the same time.
38835 + */
38836 +#define update_mmu_cache(vma,address,pte) do { } while (0)
38837 +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
38838 +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
38839 + do { \
38840 + if (dirty) \
38841 + ptep_establish(vma, address, ptep, entry); \
38842 + } while (0)
38843 +
38844 +#include <xen/features.h>
38845 +void make_lowmem_page_readonly(void *va, unsigned int feature);
38846 +void make_lowmem_page_writable(void *va, unsigned int feature);
38847 +void make_page_readonly(void *va, unsigned int feature);
38848 +void make_page_writable(void *va, unsigned int feature);
38849 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
38850 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
38851 +
38852 +#define virt_to_ptep(va) \
38853 +({ \
38854 + pte_t *__ptep = lookup_address((unsigned long)(va)); \
38855 + BUG_ON(!__ptep || !pte_present(*__ptep)); \
38856 + __ptep; \
38857 +})
38858 +
38859 +#define arbitrary_virt_to_machine(va) \
38860 + (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
38861 + | ((unsigned long)(va) & (PAGE_SIZE - 1)))
38862 +
38863 +#endif /* !__ASSEMBLY__ */
38864 +
38865 +#ifdef CONFIG_FLATMEM
38866 +#define kern_addr_valid(addr) (1)
38867 +#endif /* CONFIG_FLATMEM */
38868 +
38869 +int direct_remap_pfn_range(struct vm_area_struct *vma,
38870 + unsigned long address,
38871 + unsigned long mfn,
38872 + unsigned long size,
38873 + pgprot_t prot,
38874 + domid_t domid);
38875 +int direct_kernel_remap_pfn_range(unsigned long address,
38876 + unsigned long mfn,
38877 + unsigned long size,
38878 + pgprot_t prot,
38879 + domid_t domid);
38880 +int create_lookup_pte_addr(struct mm_struct *mm,
38881 + unsigned long address,
38882 + uint64_t *ptep);
38883 +int touch_pte_range(struct mm_struct *mm,
38884 + unsigned long address,
38885 + unsigned long size);
38886 +
38887 +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
38888 + unsigned long addr, unsigned long end, pgprot_t newprot);
38889 +
38890 +#define arch_change_pte_range(mm, pmd, addr, end, newprot) \
38891 + xen_change_pte_range(mm, pmd, addr, end, newprot)
38892 +
38893 +#define io_remap_pfn_range(vma,from,pfn,size,prot) \
38894 +direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
38895 +
38896 +#define MK_IOSPACE_PFN(space, pfn) (pfn)
38897 +#define GET_IOSPACE(pfn) 0
38898 +#define GET_PFN(pfn) (pfn)
38899 +
38900 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
38901 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
38902 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
38903 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
38904 +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
38905 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
38906 +#define __HAVE_ARCH_PTE_SAME
38907 +#include <asm-generic/pgtable.h>
38908 +
38909 +#endif /* _I386_PGTABLE_H */
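A minimal usage sketch, not part of the hunk above: arbitrary_virt_to_machine() combines the MFN taken from the PTE with the in-page offset, which is the form of address a Xen hypercall expects instead of a pseudo-physical one. my_buffer_maddr() is a hypothetical helper; maddr_t is assumed to be provided by the Xen page/maddr headers.

static maddr_t my_buffer_maddr(void *kbuf)
{
	/* kbuf must be a directly mapped kernel address with a present PTE,
	 * or the BUG_ON() inside virt_to_ptep() will trigger. */
	return arbitrary_virt_to_machine(kbuf);
}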
38910 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/processor_32.h
38911 ===================================================================
38912 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
38913 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/processor_32.h 2008-01-28 12:24:19.000000000 +0100
38914 @@ -0,0 +1,743 @@
38915 +/*
38916 + * include/asm-i386/processor.h
38917 + *
38918 + * Copyright (C) 1994 Linus Torvalds
38919 + */
38920 +
38921 +#ifndef __ASM_I386_PROCESSOR_H
38922 +#define __ASM_I386_PROCESSOR_H
38923 +
38924 +#include <asm/vm86.h>
38925 +#include <asm/math_emu.h>
38926 +#include <asm/segment.h>
38927 +#include <asm/page.h>
38928 +#include <asm/types.h>
38929 +#include <asm/sigcontext.h>
38930 +#include <asm/cpufeature.h>
38931 +#include <asm/msr.h>
38932 +#include <asm/system.h>
38933 +#include <linux/cache.h>
38934 +#include <linux/threads.h>
38935 +#include <asm/percpu.h>
38936 +#include <linux/cpumask.h>
38937 +#include <xen/interface/physdev.h>
38938 +
38939 +/* flag for disabling the tsc */
38940 +extern int tsc_disable;
38941 +
38942 +struct desc_struct {
38943 + unsigned long a,b;
38944 +};
38945 +
38946 +#define desc_empty(desc) \
38947 + (!((desc)->a | (desc)->b))
38948 +
38949 +#define desc_equal(desc1, desc2) \
38950 + (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
38951 +/*
38952 + * Default implementation of macro that returns current
38953 + * instruction pointer ("program counter").
38954 + */
38955 +#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
38956 +
38957 +/*
38958 + * CPU type and hardware bug flags. Kept separately for each CPU.
38959 + * Members of this structure are referenced in head.S, so think twice
38960 + * before touching them. [mj]
38961 + */
38962 +
38963 +struct cpuinfo_x86 {
38964 + __u8 x86; /* CPU family */
38965 + __u8 x86_vendor; /* CPU vendor */
38966 + __u8 x86_model;
38967 + __u8 x86_mask;
38968 + char wp_works_ok; /* It doesn't on 386's */
38969 + char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
38970 + char hard_math;
38971 + char rfu;
38972 + int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
38973 + unsigned long x86_capability[NCAPINTS];
38974 + char x86_vendor_id[16];
38975 + char x86_model_id[64];
38976 + int x86_cache_size; /* in KB - valid for CPUS which support this
38977 + call */
38978 + int x86_cache_alignment; /* In bytes */
38979 + char fdiv_bug;
38980 + char f00f_bug;
38981 + char coma_bug;
38982 + char pad0;
38983 + int x86_power;
38984 + unsigned long loops_per_jiffy;
38985 +#ifdef CONFIG_SMP
38986 + cpumask_t llc_shared_map; /* cpus sharing the last level cache */
38987 +#endif
38988 + unsigned char x86_max_cores; /* cpuid returned max cores value */
38989 + unsigned char apicid;
38990 +#ifdef CONFIG_SMP
38991 + unsigned char booted_cores; /* number of cores as seen by OS */
38992 + __u8 phys_proc_id; /* Physical processor id. */
38993 + __u8 cpu_core_id; /* Core id */
38994 +#endif
38995 +} __attribute__((__aligned__(SMP_CACHE_BYTES)));
38996 +
38997 +#define X86_VENDOR_INTEL 0
38998 +#define X86_VENDOR_CYRIX 1
38999 +#define X86_VENDOR_AMD 2
39000 +#define X86_VENDOR_UMC 3
39001 +#define X86_VENDOR_NEXGEN 4
39002 +#define X86_VENDOR_CENTAUR 5
39003 +#define X86_VENDOR_RISE 6
39004 +#define X86_VENDOR_TRANSMETA 7
39005 +#define X86_VENDOR_NSC 8
39006 +#define X86_VENDOR_NUM 9
39007 +#define X86_VENDOR_UNKNOWN 0xff
39008 +
39009 +/*
39010 + * capabilities of CPUs
39011 + */
39012 +
39013 +extern struct cpuinfo_x86 boot_cpu_data;
39014 +extern struct cpuinfo_x86 new_cpu_data;
39015 +#ifndef CONFIG_X86_NO_TSS
39016 +extern struct tss_struct doublefault_tss;
39017 +DECLARE_PER_CPU(struct tss_struct, init_tss);
39018 +#endif
39019 +
39020 +#ifdef CONFIG_SMP
39021 +extern struct cpuinfo_x86 cpu_data[];
39022 +#define current_cpu_data cpu_data[smp_processor_id()]
39023 +#else
39024 +#define cpu_data (&boot_cpu_data)
39025 +#define current_cpu_data boot_cpu_data
39026 +#endif
39027 +
39028 +extern int cpu_llc_id[NR_CPUS];
39029 +extern char ignore_fpu_irq;
39030 +
39031 +extern void identify_cpu(struct cpuinfo_x86 *);
39032 +extern void print_cpu_info(struct cpuinfo_x86 *);
39033 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
39034 +extern unsigned short num_cache_leaves;
39035 +
39036 +#ifdef CONFIG_X86_HT
39037 +extern void detect_ht(struct cpuinfo_x86 *c);
39038 +#else
39039 +static inline void detect_ht(struct cpuinfo_x86 *c) {}
39040 +#endif
39041 +
39042 +/*
39043 + * EFLAGS bits
39044 + */
39045 +#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
39046 +#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
39047 +#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
39048 +#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
39049 +#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
39050 +#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
39051 +#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
39052 +#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
39053 +#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
39054 +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
39055 +#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
39056 +#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
39057 +#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
39058 +#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
39059 +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
39060 +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
39061 +#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
39062 +
39063 +/*
39064 + * Generic CPUID function
39065 + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
39066 + * resulting in stale register contents being returned.
39067 + */
39068 +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
39069 +{
39070 + __asm__(XEN_CPUID
39071 + : "=a" (*eax),
39072 + "=b" (*ebx),
39073 + "=c" (*ecx),
39074 + "=d" (*edx)
39075 + : "0" (op), "c"(0));
39076 +}
39077 +
39078 +/* Some CPUID calls want 'count' to be placed in ecx */
39079 +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
39080 + int *edx)
39081 +{
39082 + __asm__(XEN_CPUID
39083 + : "=a" (*eax),
39084 + "=b" (*ebx),
39085 + "=c" (*ecx),
39086 + "=d" (*edx)
39087 + : "0" (op), "c" (count));
39088 +}
39089 +
39090 +/*
39091 + * CPUID functions returning a single datum
39092 + */
39093 +static inline unsigned int cpuid_eax(unsigned int op)
39094 +{
39095 + unsigned int eax;
39096 +
39097 + __asm__(XEN_CPUID
39098 + : "=a" (eax)
39099 + : "0" (op)
39100 + : "bx", "cx", "dx");
39101 + return eax;
39102 +}
39103 +static inline unsigned int cpuid_ebx(unsigned int op)
39104 +{
39105 + unsigned int eax, ebx;
39106 +
39107 + __asm__(XEN_CPUID
39108 + : "=a" (eax), "=b" (ebx)
39109 + : "0" (op)
39110 + : "cx", "dx" );
39111 + return ebx;
39112 +}
39113 +static inline unsigned int cpuid_ecx(unsigned int op)
39114 +{
39115 + unsigned int eax, ecx;
39116 +
39117 + __asm__(XEN_CPUID
39118 + : "=a" (eax), "=c" (ecx)
39119 + : "0" (op)
39120 + : "bx", "dx" );
39121 + return ecx;
39122 +}
39123 +static inline unsigned int cpuid_edx(unsigned int op)
39124 +{
39125 + unsigned int eax, edx;
39126 +
39127 + __asm__(XEN_CPUID
39128 + : "=a" (eax), "=d" (edx)
39129 + : "0" (op)
39130 + : "bx", "cx");
39131 + return edx;
39132 +}
39133 +
39134 +#define load_cr3(pgdir) write_cr3(__pa(pgdir))
39135 +
39136 +/*
39137 + * Intel CPU features in CR4
39138 + */
39139 +#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
39140 +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
39141 +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
39142 +#define X86_CR4_DE 0x0008 /* enable debugging extensions */
39143 +#define X86_CR4_PSE 0x0010 /* enable page size extensions */
39144 +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
39145 +#define X86_CR4_MCE 0x0040 /* Machine check enable */
39146 +#define X86_CR4_PGE 0x0080 /* enable global pages */
39147 +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
39148 +#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
39149 +#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
39150 +
39151 +/*
39152 + * Save the cr4 feature set we're using (ie
39153 + * Pentium 4MB enable and PPro Global page
39154 + * enable), so that any CPUs that boot up
39155 + * after us can get the correct flags.
39156 + */
39157 +extern unsigned long mmu_cr4_features;
39158 +
39159 +static inline void set_in_cr4 (unsigned long mask)
39160 +{
39161 + unsigned cr4;
39162 + mmu_cr4_features |= mask;
39163 + cr4 = read_cr4();
39164 + cr4 |= mask;
39165 + write_cr4(cr4);
39166 +}
39167 +
39168 +static inline void clear_in_cr4 (unsigned long mask)
39169 +{
39170 + unsigned cr4;
39171 + mmu_cr4_features &= ~mask;
39172 + cr4 = read_cr4();
39173 + cr4 &= ~mask;
39174 + write_cr4(cr4);
39175 +}
39176 +
39177 +/*
39178 + * NSC/Cyrix CPU configuration register indexes
39179 + */
39180 +
39181 +#define CX86_PCR0 0x20
39182 +#define CX86_GCR 0xb8
39183 +#define CX86_CCR0 0xc0
39184 +#define CX86_CCR1 0xc1
39185 +#define CX86_CCR2 0xc2
39186 +#define CX86_CCR3 0xc3
39187 +#define CX86_CCR4 0xe8
39188 +#define CX86_CCR5 0xe9
39189 +#define CX86_CCR6 0xea
39190 +#define CX86_CCR7 0xeb
39191 +#define CX86_PCR1 0xf0
39192 +#define CX86_DIR0 0xfe
39193 +#define CX86_DIR1 0xff
39194 +#define CX86_ARR_BASE 0xc4
39195 +#define CX86_RCR_BASE 0xdc
39196 +
39197 +/*
39198 + * NSC/Cyrix CPU indexed register access macros
39199 + */
39200 +
39201 +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
39202 +
39203 +#define setCx86(reg, data) do { \
39204 + outb((reg), 0x22); \
39205 + outb((data), 0x23); \
39206 +} while (0)
39207 +
39208 +/* Stop speculative execution */
39209 +static inline void sync_core(void)
39210 +{
39211 + int tmp;
39212 + asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
39213 +}
39214 +
39215 +static inline void __monitor(const void *eax, unsigned long ecx,
39216 + unsigned long edx)
39217 +{
39218 + /* "monitor %eax,%ecx,%edx;" */
39219 + asm volatile(
39220 + ".byte 0x0f,0x01,0xc8;"
39221 + : :"a" (eax), "c" (ecx), "d"(edx));
39222 +}
39223 +
39224 +static inline void __mwait(unsigned long eax, unsigned long ecx)
39225 +{
39226 + /* "mwait %eax,%ecx;" */
39227 + asm volatile(
39228 + ".byte 0x0f,0x01,0xc9;"
39229 + : :"a" (eax), "c" (ecx));
39230 +}
39231 +
39232 +/* from system description table in BIOS. Mostly for MCA use, but
39233 +others may find it useful. */
39234 +extern unsigned int machine_id;
39235 +extern unsigned int machine_submodel_id;
39236 +extern unsigned int BIOS_revision;
39237 +extern unsigned int mca_pentium_flag;
39238 +
39239 +/* Boot loader type from the setup header */
39240 +extern int bootloader_type;
39241 +
39242 +/*
39243 + * User space process size: 3GB (default).
39244 + */
39245 +#define TASK_SIZE (PAGE_OFFSET)
39246 +
39247 +/* This decides where the kernel will search for a free chunk of vm
39248 + * space during mmap's.
39249 + */
39250 +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
39251 +
39252 +#define HAVE_ARCH_PICK_MMAP_LAYOUT
39253 +
39254 +/*
39255 + * Size of io_bitmap.
39256 + */
39257 +#define IO_BITMAP_BITS 65536
39258 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
39259 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
39260 +#ifndef CONFIG_X86_NO_TSS
39261 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
39262 +#endif
39263 +#define INVALID_IO_BITMAP_OFFSET 0x8000
39264 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
39265 +
39266 +struct i387_fsave_struct {
39267 + long cwd;
39268 + long swd;
39269 + long twd;
39270 + long fip;
39271 + long fcs;
39272 + long foo;
39273 + long fos;
39274 + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
39275 + long status; /* software status information */
39276 +};
39277 +
39278 +struct i387_fxsave_struct {
39279 + unsigned short cwd;
39280 + unsigned short swd;
39281 + unsigned short twd;
39282 + unsigned short fop;
39283 + long fip;
39284 + long fcs;
39285 + long foo;
39286 + long fos;
39287 + long mxcsr;
39288 + long mxcsr_mask;
39289 + long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
39290 + long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
39291 + long padding[56];
39292 +} __attribute__ ((aligned (16)));
39293 +
39294 +struct i387_soft_struct {
39295 + long cwd;
39296 + long swd;
39297 + long twd;
39298 + long fip;
39299 + long fcs;
39300 + long foo;
39301 + long fos;
39302 + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
39303 + unsigned char ftop, changed, lookahead, no_update, rm, alimit;
39304 + struct info *info;
39305 + unsigned long entry_eip;
39306 +};
39307 +
39308 +union i387_union {
39309 + struct i387_fsave_struct fsave;
39310 + struct i387_fxsave_struct fxsave;
39311 + struct i387_soft_struct soft;
39312 +};
39313 +
39314 +typedef struct {
39315 + unsigned long seg;
39316 +} mm_segment_t;
39317 +
39318 +struct thread_struct;
39319 +
39320 +#ifndef CONFIG_X86_NO_TSS
39321 +struct tss_struct {
39322 + unsigned short back_link,__blh;
39323 + unsigned long esp0;
39324 + unsigned short ss0,__ss0h;
39325 + unsigned long esp1;
39326 + unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
39327 + unsigned long esp2;
39328 + unsigned short ss2,__ss2h;
39329 + unsigned long __cr3;
39330 + unsigned long eip;
39331 + unsigned long eflags;
39332 + unsigned long eax,ecx,edx,ebx;
39333 + unsigned long esp;
39334 + unsigned long ebp;
39335 + unsigned long esi;
39336 + unsigned long edi;
39337 + unsigned short es, __esh;
39338 + unsigned short cs, __csh;
39339 + unsigned short ss, __ssh;
39340 + unsigned short ds, __dsh;
39341 + unsigned short fs, __fsh;
39342 + unsigned short gs, __gsh;
39343 + unsigned short ldt, __ldth;
39344 + unsigned short trace, io_bitmap_base;
39345 + /*
39346 + * The extra 1 is there because the CPU will access an
39347 + * additional byte beyond the end of the IO permission
39348 + * bitmap. The extra byte must be all 1 bits, and must
39349 + * be within the limit.
39350 + */
39351 + unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
39352 + /*
39353 + * Cache the current maximum and the last task that used the bitmap:
39354 + */
39355 + unsigned long io_bitmap_max;
39356 + struct thread_struct *io_bitmap_owner;
39357 + /*
39358 + * pads the TSS to be cacheline-aligned (size is 0x100)
39359 + */
39360 + unsigned long __cacheline_filler[35];
39361 + /*
39362 + * .. and then another 0x100 bytes for emergency kernel stack
39363 + */
39364 + unsigned long stack[64];
39365 +} __attribute__((packed));
39366 +#endif
39367 +
39368 +#define ARCH_MIN_TASKALIGN 16
39369 +
39370 +struct thread_struct {
39371 +/* cached TLS descriptors. */
39372 + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
39373 + unsigned long esp0;
39374 + unsigned long sysenter_cs;
39375 + unsigned long eip;
39376 + unsigned long esp;
39377 + unsigned long fs;
39378 + unsigned long gs;
39379 +/* Hardware debugging registers */
39380 + unsigned long debugreg[8]; /* %%db0-7 debug registers */
39381 +/* fault info */
39382 + unsigned long cr2, trap_no, error_code;
39383 +/* floating point info */
39384 + union i387_union i387;
39385 +/* virtual 86 mode info */
39386 + struct vm86_struct __user * vm86_info;
39387 + unsigned long screen_bitmap;
39388 + unsigned long v86flags, v86mask, saved_esp0;
39389 + unsigned int saved_fs, saved_gs;
39390 +/* IO permissions */
39391 + unsigned long *io_bitmap_ptr;
39392 + unsigned long iopl;
39393 +/* max allowed port in the bitmap, in bytes: */
39394 + unsigned long io_bitmap_max;
39395 +};
39396 +
39397 +#define INIT_THREAD { \
39398 + .vm86_info = NULL, \
39399 + .sysenter_cs = __KERNEL_CS, \
39400 + .io_bitmap_ptr = NULL, \
39401 +}
39402 +
39403 +#ifndef CONFIG_X86_NO_TSS
39404 +/*
39405 + * Note that the .io_bitmap member must be extra-big. This is because
39406 + * the CPU will access an additional byte beyond the end of the IO
39407 + * permission bitmap. The extra byte must be all 1 bits, and must
39408 + * be within the limit.
39409 + */
39410 +#define INIT_TSS { \
39411 + .esp0 = sizeof(init_stack) + (long)&init_stack, \
39412 + .ss0 = __KERNEL_DS, \
39413 + .ss1 = __KERNEL_CS, \
39414 + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
39415 + .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
39416 +}
39417 +
39418 +static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
39419 +{
39420 + tss->esp0 = thread->esp0;
39421 + /* This can only happen when SEP is enabled, no need to test "SEP"arately */
39422 + if (unlikely(tss->ss1 != thread->sysenter_cs)) {
39423 + tss->ss1 = thread->sysenter_cs;
39424 + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
39425 + }
39426 +}
39427 +#define load_esp0(tss, thread) \
39428 + __load_esp0(tss, thread)
39429 +#else
39430 +#define load_esp0(tss, thread) do { \
39431 + if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
39432 + BUG(); \
39433 +} while (0)
39434 +#endif
39435 +
39436 +#define start_thread(regs, new_eip, new_esp) do { \
39437 + __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \
39438 + set_fs(USER_DS); \
39439 + regs->xds = __USER_DS; \
39440 + regs->xes = __USER_DS; \
39441 + regs->xss = __USER_DS; \
39442 + regs->xcs = __USER_CS; \
39443 + regs->eip = new_eip; \
39444 + regs->esp = new_esp; \
39445 +} while (0)
39446 +
39447 +/*
39448 + * These special macros can be used to get or set a debugging register
39449 + */
39450 +#define get_debugreg(var, register) \
39451 + (var) = HYPERVISOR_get_debugreg((register))
39452 +#define set_debugreg(value, register) \
39453 + WARN_ON(HYPERVISOR_set_debugreg((register), (value)))
39454 +
39455 +/*
39456 + * Set IOPL bits in EFLAGS from given mask
39457 + */
39458 +static inline void set_iopl_mask(unsigned mask)
39459 +{
39460 + struct physdev_set_iopl set_iopl;
39461 +
39462 + /* Force the change at ring 0. */
39463 + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
39464 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
39465 +}
39466 +
39467 +/* Forward declaration, a strange C thing */
39468 +struct task_struct;
39469 +struct mm_struct;
39470 +
39471 +/* Free all resources held by a thread. */
39472 +extern void release_thread(struct task_struct *);
39473 +
39474 +/* Prepare to copy thread state - unlazy all lazy status */
39475 +extern void prepare_to_copy(struct task_struct *tsk);
39476 +
39477 +/*
39478 + * create a kernel thread without removing it from tasklists
39479 + */
39480 +extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
39481 +
39482 +extern unsigned long thread_saved_pc(struct task_struct *tsk);
39483 +void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack);
39484 +
39485 +unsigned long get_wchan(struct task_struct *p);
39486 +
39487 +#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
39488 +#define KSTK_TOP(info) \
39489 +({ \
39490 + unsigned long *__ptr = (unsigned long *)(info); \
39491 + (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
39492 +})
39493 +
39494 +/*
39495 + * The below -8 is to reserve 8 bytes on top of the ring0 stack.
39496 + * This is necessary to guarantee that the entire "struct pt_regs"
39497 + * is accessible even if the CPU hasn't stored the SS/ESP registers
39498 + * on the stack (interrupt gate does not save these registers
39499 + * when switching to the same priv ring).
39500 + * Therefore beware: accessing the xss/esp fields of the
39501 + * "struct pt_regs" is possible, but they may contain
39502 + * completely wrong values.
39503 + */
39504 +#define task_pt_regs(task) \
39505 +({ \
39506 + struct pt_regs *__regs__; \
39507 + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
39508 + __regs__ - 1; \
39509 +})
39510 +
39511 +#define KSTK_EIP(task) (task_pt_regs(task)->eip)
39512 +#define KSTK_ESP(task) (task_pt_regs(task)->esp)
39513 +
39514 +
39515 +struct microcode_header {
39516 + unsigned int hdrver;
39517 + unsigned int rev;
39518 + unsigned int date;
39519 + unsigned int sig;
39520 + unsigned int cksum;
39521 + unsigned int ldrver;
39522 + unsigned int pf;
39523 + unsigned int datasize;
39524 + unsigned int totalsize;
39525 + unsigned int reserved[3];
39526 +};
39527 +
39528 +struct microcode {
39529 + struct microcode_header hdr;
39530 + unsigned int bits[0];
39531 +};
39532 +
39533 +typedef struct microcode microcode_t;
39534 +typedef struct microcode_header microcode_header_t;
39535 +
39536 +/* microcode format is extended from prescott processors */
39537 +struct extended_signature {
39538 + unsigned int sig;
39539 + unsigned int pf;
39540 + unsigned int cksum;
39541 +};
39542 +
39543 +struct extended_sigtable {
39544 + unsigned int count;
39545 + unsigned int cksum;
39546 + unsigned int reserved[3];
39547 + struct extended_signature sigs[0];
39548 +};
39549 +
39550 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
39551 +static inline void rep_nop(void)
39552 +{
39553 + __asm__ __volatile__("rep;nop": : :"memory");
39554 +}
39555 +
39556 +#define cpu_relax() rep_nop()
39557 +
39558 +/* generic versions from gas */
39559 +#define GENERIC_NOP1 ".byte 0x90\n"
39560 +#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
39561 +#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
39562 +#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
39563 +#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
39564 +#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
39565 +#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
39566 +#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
39567 +
39568 +/* Opteron nops */
39569 +#define K8_NOP1 GENERIC_NOP1
39570 +#define K8_NOP2 ".byte 0x66,0x90\n"
39571 +#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
39572 +#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
39573 +#define K8_NOP5 K8_NOP3 K8_NOP2
39574 +#define K8_NOP6 K8_NOP3 K8_NOP3
39575 +#define K8_NOP7 K8_NOP4 K8_NOP3
39576 +#define K8_NOP8 K8_NOP4 K8_NOP4
39577 +
39578 +/* K7 nops */
39579 +/* uses eax dependencies (arbitrary choice) */
39580 +#define K7_NOP1 GENERIC_NOP1
39581 +#define K7_NOP2 ".byte 0x8b,0xc0\n"
39582 +#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
39583 +#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
39584 +#define K7_NOP5 K7_NOP4 ASM_NOP1
39585 +#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
39586 +#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
39587 +#define K7_NOP8 K7_NOP7 ASM_NOP1
39588 +
39589 +#ifdef CONFIG_MK8
39590 +#define ASM_NOP1 K8_NOP1
39591 +#define ASM_NOP2 K8_NOP2
39592 +#define ASM_NOP3 K8_NOP3
39593 +#define ASM_NOP4 K8_NOP4
39594 +#define ASM_NOP5 K8_NOP5
39595 +#define ASM_NOP6 K8_NOP6
39596 +#define ASM_NOP7 K8_NOP7
39597 +#define ASM_NOP8 K8_NOP8
39598 +#elif defined(CONFIG_MK7)
39599 +#define ASM_NOP1 K7_NOP1
39600 +#define ASM_NOP2 K7_NOP2
39601 +#define ASM_NOP3 K7_NOP3
39602 +#define ASM_NOP4 K7_NOP4
39603 +#define ASM_NOP5 K7_NOP5
39604 +#define ASM_NOP6 K7_NOP6
39605 +#define ASM_NOP7 K7_NOP7
39606 +#define ASM_NOP8 K7_NOP8
39607 +#else
39608 +#define ASM_NOP1 GENERIC_NOP1
39609 +#define ASM_NOP2 GENERIC_NOP2
39610 +#define ASM_NOP3 GENERIC_NOP3
39611 +#define ASM_NOP4 GENERIC_NOP4
39612 +#define ASM_NOP5 GENERIC_NOP5
39613 +#define ASM_NOP6 GENERIC_NOP6
39614 +#define ASM_NOP7 GENERIC_NOP7
39615 +#define ASM_NOP8 GENERIC_NOP8
39616 +#endif
39617 +
39618 +#define ASM_NOP_MAX 8
39619 +
39620 +/* Prefetch instructions for Pentium III and AMD Athlon */
39621 +/* It's not worth caring about 3dnow! prefetches for the K6
39622 + because they are microcoded there and very slow.
39623 + However, we don't do prefetches for pre-XP Athlons currently;
39624 + that should be fixed. */
39625 +#define ARCH_HAS_PREFETCH
39626 +static inline void prefetch(const void *x)
39627 +{
39628 + alternative_input(ASM_NOP4,
39629 + "prefetchnta (%1)",
39630 + X86_FEATURE_XMM,
39631 + "r" (x));
39632 +}
39633 +
39634 +#define ARCH_HAS_PREFETCH
39635 +#define ARCH_HAS_PREFETCHW
39636 +#define ARCH_HAS_SPINLOCK_PREFETCH
39637 +
39638 +/* 3dnow! prefetch to get an exclusive cache line. Useful for
39639 + spinlocks to avoid one state transition in the cache coherency protocol. */
39640 +static inline void prefetchw(const void *x)
39641 +{
39642 + alternative_input(ASM_NOP4,
39643 + "prefetchw (%1)",
39644 + X86_FEATURE_3DNOW,
39645 + "r" (x));
39646 +}
39647 +#define spin_lock_prefetch(x) prefetchw(x)
39648 +
39649 +extern void select_idle_routine(const struct cpuinfo_x86 *c);
39650 +
39651 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
39652 +
39653 +extern unsigned long boot_option_idle_override;
39654 +extern void enable_sep_cpu(void);
39655 +extern int sysenter_setup(void);
39656 +
39657 +#endif /* __ASM_I386_PROCESSOR_H */
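A minimal usage sketch, not part of the hunk above, of the XEN_CPUID wrappers declared in this header: leaf 0 returns the vendor string in EBX, EDX, ECX in that order. my_read_vendor() is a hypothetical helper and memcpy() is assumed to be available via <linux/string.h>.

static void my_read_vendor(char vendor[13])
{
	unsigned int eax, ebx, ecx, edx;

	cpuid(0, &eax, &ebx, &ecx, &edx);	/* leaf 0: vendor string */
	memcpy(vendor + 0, &ebx, 4);
	memcpy(vendor + 4, &edx, 4);
	memcpy(vendor + 8, &ecx, 4);
	vendor[12] = '\0';
}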
39658 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/segment_32.h
39659 ===================================================================
39660 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
39661 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/segment_32.h 2007-06-12 13:14:02.000000000 +0200
39662 @@ -0,0 +1,117 @@
39663 +#ifndef _ASM_SEGMENT_H
39664 +#define _ASM_SEGMENT_H
39665 +
39666 +/*
39667 + * The layout of the per-CPU GDT under Linux:
39668 + *
39669 + * 0 - null
39670 + * 1 - reserved
39671 + * 2 - reserved
39672 + * 3 - reserved
39673 + *
39674 + * 4 - unused <==== new cacheline
39675 + * 5 - unused
39676 + *
39677 + * ------- start of TLS (Thread-Local Storage) segments:
39678 + *
39679 + * 6 - TLS segment #1 [ glibc's TLS segment ]
39680 + * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
39681 + * 8 - TLS segment #3
39682 + * 9 - reserved
39683 + * 10 - reserved
39684 + * 11 - reserved
39685 + *
39686 + * ------- start of kernel segments:
39687 + *
39688 + * 12 - kernel code segment <==== new cacheline
39689 + * 13 - kernel data segment
39690 + * 14 - default user CS
39691 + * 15 - default user DS
39692 + * 16 - TSS
39693 + * 17 - LDT
39694 + * 18 - PNPBIOS support (16->32 gate)
39695 + * 19 - PNPBIOS support
39696 + * 20 - PNPBIOS support
39697 + * 21 - PNPBIOS support
39698 + * 22 - PNPBIOS support
39699 + * 23 - APM BIOS support
39700 + * 24 - APM BIOS support
39701 + * 25 - APM BIOS support
39702 + *
39703 + * 26 - ESPFIX small SS
39704 + * 27 - unused
39705 + * 28 - unused
39706 + * 29 - unused
39707 + * 30 - unused
39708 + * 31 - TSS for double fault handler
39709 + */
39710 +#define GDT_ENTRY_TLS_ENTRIES 3
39711 +#define GDT_ENTRY_TLS_MIN 6
39712 +#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
39713 +
39714 +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
39715 +
39716 +#define GDT_ENTRY_DEFAULT_USER_CS 14
39717 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
39718 +
39719 +#define GDT_ENTRY_DEFAULT_USER_DS 15
39720 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
39721 +
39722 +#define GDT_ENTRY_KERNEL_BASE 12
39723 +
39724 +#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
39725 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
39726 +#define GET_KERNEL_CS() (__KERNEL_CS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
39727 +
39728 +#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
39729 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
39730 +#define GET_KERNEL_DS() (__KERNEL_DS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
39731 +
39732 +#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
39733 +#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
39734 +
39735 +#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
39736 +#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
39737 +
39738 +#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
39739 +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
39740 +
39741 +#define GDT_ENTRY_DOUBLEFAULT_TSS 31
39742 +
39743 +/*
39744 + * The GDT has 32 entries
39745 + */
39746 +#define GDT_ENTRIES 32
39747 +
39748 +#define GDT_SIZE (GDT_ENTRIES * 8)
39749 +
39750 +/* Simple and small GDT entries for booting only */
39751 +
39752 +#define GDT_ENTRY_BOOT_CS 2
39753 +#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
39754 +
39755 +#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
39756 +#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
39757 +
39758 +/* The PnP BIOS entries in the GDT */
39759 +#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
39760 +#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
39761 +#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
39762 +#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
39763 +#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
39764 +
39765 +/* The PnP BIOS selectors */
39766 +#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
39767 +#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
39768 +#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
39769 +#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
39770 +#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
39771 +
39772 +/*
39773 + * The interrupt descriptor table has room for 256 idt's,
39774 + * the global descriptor table is dependent on the number
39775 + * of tasks we can have..
39776 + */
39777 +#define IDT_ENTRIES 256
39778 +
39779 +#endif
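For reference, the selector values implied by the layout above: a selector is the GDT index times 8 with the requested privilege level in the low two bits, so __KERNEL_CS = 12*8 = 0x60, __KERNEL_DS = 13*8 = 0x68, __USER_CS = 14*8 + 3 = 0x73 and __USER_DS = 15*8 + 3 = 0x7b. GET_KERNEL_CS()/GET_KERNEL_DS() additionally OR in RPL 1 when the kernel runs in ring 1 under Xen, i.e. whenever XENFEAT_supervisor_mode_kernel is not set.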
39780 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/smp_32.h
39781 ===================================================================
39782 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
39783 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/smp_32.h 2007-06-12 13:14:02.000000000 +0200
39784 @@ -0,0 +1,103 @@
39785 +#ifndef __ASM_SMP_H
39786 +#define __ASM_SMP_H
39787 +
39788 +/*
39789 + * We need the APIC definitions automatically as part of 'smp.h'
39790 + */
39791 +#ifndef __ASSEMBLY__
39792 +#include <linux/kernel.h>
39793 +#include <linux/threads.h>
39794 +#include <linux/cpumask.h>
39795 +#endif
39796 +
39797 +#ifdef CONFIG_X86_LOCAL_APIC
39798 +#ifndef __ASSEMBLY__
39799 +#include <asm/fixmap.h>
39800 +#include <asm/bitops.h>
39801 +#include <asm/mpspec.h>
39802 +#ifdef CONFIG_X86_IO_APIC
39803 +#include <asm/io_apic.h>
39804 +#endif
39805 +#include <asm/apic.h>
39806 +#endif
39807 +#endif
39808 +
39809 +#define BAD_APICID 0xFFu
39810 +#ifdef CONFIG_SMP
39811 +#ifndef __ASSEMBLY__
39812 +
39813 +/*
39814 + * Private routines/data
39815 + */
39816 +
39817 +extern void smp_alloc_memory(void);
39818 +extern int pic_mode;
39819 +extern int smp_num_siblings;
39820 +extern cpumask_t cpu_sibling_map[];
39821 +extern cpumask_t cpu_core_map[];
39822 +
39823 +extern void (*mtrr_hook) (void);
39824 +extern void zap_low_mappings (void);
39825 +extern void lock_ipi_call_lock(void);
39826 +extern void unlock_ipi_call_lock(void);
39827 +
39828 +#define MAX_APICID 256
39829 +extern u8 x86_cpu_to_apicid[];
39830 +
39831 +#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
39832 +
39833 +#ifdef CONFIG_HOTPLUG_CPU
39834 +extern void cpu_exit_clear(void);
39835 +extern void cpu_uninit(void);
39836 +#endif
39837 +
39838 +/*
39839 + * This function is needed by all SMP systems. It must _always_ be valid
39840 + * from the initial startup. We map APIC_BASE very early in page_setup(),
39841 + * so this is correct in the x86 case.
39842 + */
39843 +#define raw_smp_processor_id() (current_thread_info()->cpu)
39844 +
39845 +extern cpumask_t cpu_possible_map;
39846 +#define cpu_callin_map cpu_possible_map
39847 +
39848 +/* We don't mark CPUs online until __cpu_up(), so we need another measure */
39849 +static inline int num_booting_cpus(void)
39850 +{
39851 + return cpus_weight(cpu_possible_map);
39852 +}
39853 +
39854 +#ifdef CONFIG_X86_LOCAL_APIC
39855 +
39856 +#ifdef APIC_DEFINITION
39857 +extern int hard_smp_processor_id(void);
39858 +#else
39859 +#include <mach_apicdef.h>
39860 +static inline int hard_smp_processor_id(void)
39861 +{
39862 + /* we don't want to mark this access volatile - bad code generation */
39863 + return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
39864 +}
39865 +#endif
39866 +
39867 +static __inline int logical_smp_processor_id(void)
39868 +{
39869 + /* we don't want to mark this access volatile - bad code generation */
39870 + return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
39871 +}
39872 +
39873 +#endif
39874 +
39875 +extern int __cpu_disable(void);
39876 +extern void __cpu_die(unsigned int cpu);
39877 +extern void prefill_possible_map(void);
39878 +#endif /* !__ASSEMBLY__ */
39879 +
39880 +#else /* CONFIG_SMP */
39881 +
39882 +#define cpu_physical_id(cpu) boot_cpu_physical_apicid
39883 +
39884 +#define NO_PROC_ID 0xFF /* No processor magic marker */
39885 +
39886 +#endif
39887 +#endif
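A minimal usage sketch, not part of the hunk above, assuming CONFIG_SMP: raw_smp_processor_id() reads the CPU number cached in thread_info, and num_booting_cpus() counts the possible map. my_report_cpu() is a hypothetical helper; printk()/KERN_INFO come from <linux/kernel.h>, which this header already includes.

static void my_report_cpu(void)
{
	printk(KERN_INFO "running on cpu %d of %d booting cpus\n",
	       raw_smp_processor_id(), num_booting_cpus());
}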
39888 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/swiotlb_32.h
39889 ===================================================================
39890 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
39891 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/swiotlb_32.h 2007-06-12 13:14:02.000000000 +0200
39892 @@ -0,0 +1,43 @@
39893 +#ifndef _ASM_SWIOTLB_H
39894 +#define _ASM_SWIOTLB_H 1
39895 +
39896 +/* SWIOTLB interface */
39897 +
39898 +extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, size_t size,
39899 + int dir);
39900 +extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
39901 + size_t size, int dir);
39902 +extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
39903 + dma_addr_t dev_addr,
39904 + size_t size, int dir);
39905 +extern void swiotlb_sync_single_for_device(struct device *hwdev,
39906 + dma_addr_t dev_addr,
39907 + size_t size, int dir);
39908 +extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
39909 + struct scatterlist *sg, int nelems,
39910 + int dir);
39911 +extern void swiotlb_sync_sg_for_device(struct device *hwdev,
39912 + struct scatterlist *sg, int nelems,
39913 + int dir);
39914 +extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
39915 + int nents, int direction);
39916 +extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
39917 + int nents, int direction);
39918 +extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr);
39919 +#ifdef CONFIG_HIGHMEM
39920 +extern dma_addr_t swiotlb_map_page(struct device *hwdev, struct page *page,
39921 + unsigned long offset, size_t size,
39922 + enum dma_data_direction direction);
39923 +extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
39924 + size_t size, enum dma_data_direction direction);
39925 +#endif
39926 +extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
39927 +extern void swiotlb_init(void);
39928 +
39929 +#ifdef CONFIG_SWIOTLB
39930 +extern int swiotlb;
39931 +#else
39932 +#define swiotlb 0
39933 +#endif
39934 +
39935 +#endif
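A minimal usage sketch, not part of the hunk above, of the streaming interface declared here. my_map_for_device_read() is a hypothetical helper; DMA_TO_DEVICE is assumed to come from <linux/dma-mapping.h>, and hwdev/buf/len from the caller.

static dma_addr_t my_map_for_device_read(struct device *hwdev, void *buf,
					 size_t len)
{
	/* May copy 'buf' into a bounce buffer if it is not device-addressable;
	 * must be paired with swiotlb_unmap_single() when the DMA completes. */
	return swiotlb_map_single(hwdev, buf, len, DMA_TO_DEVICE);
}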
39936 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/synch_bitops.h
39937 ===================================================================
39938 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
39939 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/synch_bitops.h 2008-04-02 12:34:02.000000000 +0200
39940 @@ -0,0 +1,126 @@
39941 +#ifndef __XEN_SYNCH_BITOPS_H__
39942 +#define __XEN_SYNCH_BITOPS_H__
39943 +
39944 +/*
39945 + * Copyright 1992, Linus Torvalds.
39946 + * Heavily modified to provide guaranteed strong synchronisation
39947 + * when communicating with Xen or other guest OSes running on other CPUs.
39948 + */
39949 +
39950 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
39951 +#include <xen/platform-compat.h>
39952 +#endif
39953 +
39954 +#define ADDR (*(volatile long *) addr)
39955 +
39956 +static __inline__ void synch_set_bit(int nr, volatile void * addr)
39957 +{
39958 + __asm__ __volatile__ (
39959 + "lock btsl %1,%0"
39960 + : "+m" (ADDR) : "Ir" (nr) : "memory" );
39961 +}
39962 +
39963 +static __inline__ void synch_clear_bit(int nr, volatile void * addr)
39964 +{
39965 + __asm__ __volatile__ (
39966 + "lock btrl %1,%0"
39967 + : "+m" (ADDR) : "Ir" (nr) : "memory" );
39968 +}
39969 +
39970 +static __inline__ void synch_change_bit(int nr, volatile void * addr)
39971 +{
39972 + __asm__ __volatile__ (
39973 + "lock btcl %1,%0"
39974 + : "+m" (ADDR) : "Ir" (nr) : "memory" );
39975 +}
39976 +
39977 +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
39978 +{
39979 + int oldbit;
39980 + __asm__ __volatile__ (
39981 + "lock btsl %2,%1\n\tsbbl %0,%0"
39982 + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
39983 + return oldbit;
39984 +}
39985 +
39986 +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
39987 +{
39988 + int oldbit;
39989 + __asm__ __volatile__ (
39990 + "lock btrl %2,%1\n\tsbbl %0,%0"
39991 + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
39992 + return oldbit;
39993 +}
39994 +
39995 +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
39996 +{
39997 + int oldbit;
39998 +
39999 + __asm__ __volatile__ (
40000 + "lock btcl %2,%1\n\tsbbl %0,%0"
40001 + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
40002 + return oldbit;
40003 +}
40004 +
40005 +struct __synch_xchg_dummy { unsigned long a[100]; };
40006 +#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x))
40007 +
40008 +#define synch_cmpxchg(ptr, old, new) \
40009 +((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\
40010 + (unsigned long)(old), \
40011 + (unsigned long)(new), \
40012 + sizeof(*(ptr))))
40013 +
40014 +static inline unsigned long __synch_cmpxchg(volatile void *ptr,
40015 + unsigned long old,
40016 + unsigned long new, int size)
40017 +{
40018 + unsigned long prev;
40019 + switch (size) {
40020 + case 1:
40021 + __asm__ __volatile__("lock; cmpxchgb %b1,%2"
40022 + : "=a"(prev)
40023 + : "q"(new), "m"(*__synch_xg(ptr)),
40024 + "0"(old)
40025 + : "memory");
40026 + return prev;
40027 + case 2:
40028 + __asm__ __volatile__("lock; cmpxchgw %w1,%2"
40029 + : "=a"(prev)
40030 + : "r"(new), "m"(*__synch_xg(ptr)),
40031 + "0"(old)
40032 + : "memory");
40033 + return prev;
40034 +#ifdef CONFIG_X86_64
40035 + case 4:
40036 + __asm__ __volatile__("lock; cmpxchgl %k1,%2"
40037 + : "=a"(prev)
40038 + : "r"(new), "m"(*__synch_xg(ptr)),
40039 + "0"(old)
40040 + : "memory");
40041 + return prev;
40042 + case 8:
40043 + __asm__ __volatile__("lock; cmpxchgq %1,%2"
40044 + : "=a"(prev)
40045 + : "r"(new), "m"(*__synch_xg(ptr)),
40046 + "0"(old)
40047 + : "memory");
40048 + return prev;
40049 +#else
40050 + case 4:
40051 + __asm__ __volatile__("lock; cmpxchgl %1,%2"
40052 + : "=a"(prev)
40053 + : "r"(new), "m"(*__synch_xg(ptr)),
40054 + "0"(old)
40055 + : "memory");
40056 + return prev;
40057 +#endif
40058 + }
40059 + return old;
40060 +}
40061 +
40062 +#define synch_test_bit test_bit
40063 +
40064 +#define synch_cmpxchg_subword synch_cmpxchg
40065 +
40066 +#endif /* __XEN_SYNCH_BITOPS_H__ */
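A minimal usage sketch, not part of the hunk above: the synch_* bitops use a locked read-modify-write even on UP kernels, which is what makes them safe on words shared with the hypervisor or another domain. my_claim_slot() and shared_flags are hypothetical names.

static int my_claim_slot(volatile void *shared_flags, int slot)
{
	/* Non-zero return means the other side had already set the bit. */
	return synch_test_and_set_bit(slot, shared_flags);
}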
40067 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/system_32.h
40068 ===================================================================
40069 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40070 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/system_32.h 2007-06-12 13:14:02.000000000 +0200
40071 @@ -0,0 +1,488 @@
40072 +#ifndef __ASM_SYSTEM_H
40073 +#define __ASM_SYSTEM_H
40074 +
40075 +#include <linux/kernel.h>
40076 +#include <asm/segment.h>
40077 +#include <asm/cpufeature.h>
40078 +#include <linux/bitops.h> /* for LOCK_PREFIX */
40079 +#include <asm/synch_bitops.h>
40080 +#include <asm/hypervisor.h>
40081 +
40082 +#ifdef __KERNEL__
40083 +
40084 +struct task_struct; /* one of the stranger aspects of C forward declarations.. */
40085 +extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
40086 +
40087 +/*
40088 + * Saving eflags is important. It switches not only IOPL between tasks,
40089 + * it also protects other tasks from NT leaking through sysenter etc.
40090 + */
40091 +#define switch_to(prev,next,last) do { \
40092 + unsigned long esi,edi; \
40093 + asm volatile("pushfl\n\t" /* Save flags */ \
40094 + "pushl %%ebp\n\t" \
40095 + "movl %%esp,%0\n\t" /* save ESP */ \
40096 + "movl %5,%%esp\n\t" /* restore ESP */ \
40097 + "movl $1f,%1\n\t" /* save EIP */ \
40098 + "pushl %6\n\t" /* restore EIP */ \
40099 + "jmp __switch_to\n" \
40100 + "1:\t" \
40101 + "popl %%ebp\n\t" \
40102 + "popfl" \
40103 + :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \
40104 + "=a" (last),"=S" (esi),"=D" (edi) \
40105 + :"m" (next->thread.esp),"m" (next->thread.eip), \
40106 + "2" (prev), "d" (next)); \
40107 +} while (0)
40108 +
40109 +#define _set_base(addr,base) do { unsigned long __pr; \
40110 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
40111 + "rorl $16,%%edx\n\t" \
40112 + "movb %%dl,%2\n\t" \
40113 + "movb %%dh,%3" \
40114 + :"=&d" (__pr) \
40115 + :"m" (*((addr)+2)), \
40116 + "m" (*((addr)+4)), \
40117 + "m" (*((addr)+7)), \
40118 + "0" (base) \
40119 + ); } while(0)
40120 +
40121 +#define _set_limit(addr,limit) do { unsigned long __lr; \
40122 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
40123 + "rorl $16,%%edx\n\t" \
40124 + "movb %2,%%dh\n\t" \
40125 + "andb $0xf0,%%dh\n\t" \
40126 + "orb %%dh,%%dl\n\t" \
40127 + "movb %%dl,%2" \
40128 + :"=&d" (__lr) \
40129 + :"m" (*(addr)), \
40130 + "m" (*((addr)+6)), \
40131 + "0" (limit) \
40132 + ); } while(0)
40133 +
40134 +#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
40135 +#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
40136 +
40137 +/*
40138 + * Load a segment. Fall back on loading the zero
40139 + * segment if something goes wrong..
40140 + */
40141 +#define loadsegment(seg,value) \
40142 + asm volatile("\n" \
40143 + "1:\t" \
40144 + "mov %0,%%" #seg "\n" \
40145 + "2:\n" \
40146 + ".section .fixup,\"ax\"\n" \
40147 + "3:\t" \
40148 + "pushl $0\n\t" \
40149 + "popl %%" #seg "\n\t" \
40150 + "jmp 2b\n" \
40151 + ".previous\n" \
40152 + ".section __ex_table,\"a\"\n\t" \
40153 + ".align 4\n\t" \
40154 + ".long 1b,3b\n" \
40155 + ".previous" \
40156 + : :"rm" (value))
40157 +
40158 +/*
40159 + * Save a segment register away
40160 + */
40161 +#define savesegment(seg, value) \
40162 + asm volatile("mov %%" #seg ",%0":"=rm" (value))
40163 +
40164 +#define read_cr0() ({ \
40165 + unsigned int __dummy; \
40166 + __asm__ __volatile__( \
40167 + "movl %%cr0,%0\n\t" \
40168 + :"=r" (__dummy)); \
40169 + __dummy; \
40170 +})
40171 +#define write_cr0(x) \
40172 + __asm__ __volatile__("movl %0,%%cr0": :"r" (x))
40173 +
40174 +#define read_cr2() (current_vcpu_info()->arch.cr2)
40175 +#define write_cr2(x) \
40176 + __asm__ __volatile__("movl %0,%%cr2": :"r" (x))
40177 +
40178 +#define read_cr3() ({ \
40179 + unsigned int __dummy; \
40180 + __asm__ ( \
40181 + "movl %%cr3,%0\n\t" \
40182 + :"=r" (__dummy)); \
40183 + __dummy = xen_cr3_to_pfn(__dummy); \
40184 + mfn_to_pfn(__dummy) << PAGE_SHIFT; \
40185 +})
40186 +#define write_cr3(x) ({ \
40187 + unsigned int __dummy = pfn_to_mfn((x) >> PAGE_SHIFT); \
40188 + __dummy = xen_pfn_to_cr3(__dummy); \
40189 + __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy)); \
40190 +})
40191 +#define read_cr4() ({ \
40192 + unsigned int __dummy; \
40193 + __asm__( \
40194 + "movl %%cr4,%0\n\t" \
40195 + :"=r" (__dummy)); \
40196 + __dummy; \
40197 +})
40198 +#define read_cr4_safe() ({ \
40199 + unsigned int __dummy; \
40200 + /* This could fault if %cr4 does not exist */ \
40201 + __asm__("1: movl %%cr4, %0 \n" \
40202 + "2: \n" \
40203 + ".section __ex_table,\"a\" \n" \
40204 + ".long 1b,2b \n" \
40205 + ".previous \n" \
40206 + : "=r" (__dummy): "0" (0)); \
40207 + __dummy; \
40208 +})
40209 +
40210 +#define write_cr4(x) \
40211 + __asm__ __volatile__("movl %0,%%cr4": :"r" (x))
40212 +
40213 +/*
40214 + * Clear and set 'TS' bit respectively
40215 + */
40216 +#define clts() (HYPERVISOR_fpu_taskswitch(0))
40217 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
40218 +
40219 +#endif /* __KERNEL__ */
40220 +
40221 +#define wbinvd() \
40222 + __asm__ __volatile__ ("wbinvd": : :"memory")
40223 +
40224 +static inline unsigned long get_limit(unsigned long segment)
40225 +{
40226 + unsigned long __limit;
40227 + __asm__("lsll %1,%0"
40228 + :"=r" (__limit):"r" (segment));
40229 + return __limit+1;
40230 +}
40231 +
40232 +#define nop() __asm__ __volatile__ ("nop")
40233 +
40234 +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
40235 +
40236 +#define tas(ptr) (xchg((ptr),1))
40237 +
40238 +struct __xchg_dummy { unsigned long a[100]; };
40239 +#define __xg(x) ((struct __xchg_dummy *)(x))
40240 +
40241 +
40242 +#ifdef CONFIG_X86_CMPXCHG64
40243 +
40244 +/*
40245 + * The semantics of CMPXCHG8B are a bit strange; this is why
40246 + * there is a loop and the loading of %%eax and %%edx has to
40247 + * be inside. This inlines well in most cases, the cached
40248 + * cost is around ~38 cycles. (in the future we might want
40249 + * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
40250 + * might have an implicit FPU-save as a cost, so it's not
40251 + * clear which path to go.)
40252 + *
40253 + * cmpxchg8b must be used with the lock prefix here to allow
40254 + * the instruction to be executed atomically, see page 3-102
40255 + * of the instruction set reference 24319102.pdf. We need
40256 + * the reader side to see the coherent 64bit value.
40257 + */
40258 +static inline void __set_64bit (unsigned long long * ptr,
40259 + unsigned int low, unsigned int high)
40260 +{
40261 + __asm__ __volatile__ (
40262 + "\n1:\t"
40263 + "movl (%0), %%eax\n\t"
40264 + "movl 4(%0), %%edx\n\t"
40265 + "lock cmpxchg8b (%0)\n\t"
40266 + "jnz 1b"
40267 + : /* no outputs */
40268 + : "D"(ptr),
40269 + "b"(low),
40270 + "c"(high)
40271 + : "ax","dx","memory");
40272 +}
40273 +
40274 +static inline void __set_64bit_constant (unsigned long long *ptr,
40275 + unsigned long long value)
40276 +{
40277 + __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL));
40278 +}
40279 +#define ll_low(x) *(((unsigned int*)&(x))+0)
40280 +#define ll_high(x) *(((unsigned int*)&(x))+1)
40281 +
40282 +static inline void __set_64bit_var (unsigned long long *ptr,
40283 + unsigned long long value)
40284 +{
40285 + __set_64bit(ptr,ll_low(value), ll_high(value));
40286 +}
40287 +
40288 +#define set_64bit(ptr,value) \
40289 +(__builtin_constant_p(value) ? \
40290 + __set_64bit_constant(ptr, value) : \
40291 + __set_64bit_var(ptr, value) )
40292 +
40293 +#define _set_64bit(ptr,value) \
40294 +(__builtin_constant_p(value) ? \
40295 + __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \
40296 + __set_64bit(ptr, ll_low(value), ll_high(value)) )
40297 +
40298 +#endif
40299 +
40300 +/*
40301 + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
40302 + * Note 2: xchg has a side effect, so the volatile attribute is necessary,
40303 + * but generally the primitive is invalid; *ptr is an output argument. --ANK
40304 + */
40305 +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
40306 +{
40307 + switch (size) {
40308 + case 1:
40309 + __asm__ __volatile__("xchgb %b0,%1"
40310 + :"=q" (x)
40311 + :"m" (*__xg(ptr)), "0" (x)
40312 + :"memory");
40313 + break;
40314 + case 2:
40315 + __asm__ __volatile__("xchgw %w0,%1"
40316 + :"=r" (x)
40317 + :"m" (*__xg(ptr)), "0" (x)
40318 + :"memory");
40319 + break;
40320 + case 4:
40321 + __asm__ __volatile__("xchgl %0,%1"
40322 + :"=r" (x)
40323 + :"m" (*__xg(ptr)), "0" (x)
40324 + :"memory");
40325 + break;
40326 + }
40327 + return x;
40328 +}
40329 +
40330 +/*
40331 + * Atomic compare and exchange. Compare OLD with MEM, if identical,
40332 + * store NEW in MEM. Return the initial value in MEM. Success is
40333 + * indicated by comparing RETURN with OLD.
40334 + */
40335 +
40336 +#ifdef CONFIG_X86_CMPXCHG
40337 +#define __HAVE_ARCH_CMPXCHG 1
40338 +#define cmpxchg(ptr,o,n)\
40339 + ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
40340 + (unsigned long)(n),sizeof(*(ptr))))
40341 +#endif
40342 +
40343 +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
40344 + unsigned long new, int size)
40345 +{
40346 + unsigned long prev;
40347 + switch (size) {
40348 + case 1:
40349 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
40350 + : "=a"(prev)
40351 + : "q"(new), "m"(*__xg(ptr)), "0"(old)
40352 + : "memory");
40353 + return prev;
40354 + case 2:
40355 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
40356 + : "=a"(prev)
40357 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
40358 + : "memory");
40359 + return prev;
40360 + case 4:
40361 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
40362 + : "=a"(prev)
40363 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
40364 + : "memory");
40365 + return prev;
40366 + }
40367 + return old;
40368 +}
40369 +
40370 +#ifndef CONFIG_X86_CMPXCHG
40371 +/*
40372 + * Building a kernel capable of running on an 80386. It may be necessary to
40373 + * simulate the cmpxchg on the 80386 CPU. For that purpose we define
40374 + * a function for each of the sizes we support.
40375 + */
40376 +
40377 +extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
40378 +extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
40379 +extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
40380 +
40381 +static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
40382 + unsigned long new, int size)
40383 +{
40384 + switch (size) {
40385 + case 1:
40386 + return cmpxchg_386_u8(ptr, old, new);
40387 + case 2:
40388 + return cmpxchg_386_u16(ptr, old, new);
40389 + case 4:
40390 + return cmpxchg_386_u32(ptr, old, new);
40391 + }
40392 + return old;
40393 +}
40394 +
40395 +#define cmpxchg(ptr,o,n) \
40396 +({ \
40397 + __typeof__(*(ptr)) __ret; \
40398 + if (likely(boot_cpu_data.x86 > 3)) \
40399 + __ret = __cmpxchg((ptr), (unsigned long)(o), \
40400 + (unsigned long)(n), sizeof(*(ptr))); \
40401 + else \
40402 + __ret = cmpxchg_386((ptr), (unsigned long)(o), \
40403 + (unsigned long)(n), sizeof(*(ptr))); \
40404 + __ret; \
40405 +})
40406 +#endif
40407 +
40408 +#ifdef CONFIG_X86_CMPXCHG64
40409 +
40410 +static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old,
40411 + unsigned long long new)
40412 +{
40413 + unsigned long long prev;
40414 + __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
40415 + : "=A"(prev)
40416 + : "b"((unsigned long)new),
40417 + "c"((unsigned long)(new >> 32)),
40418 + "m"(*__xg(ptr)),
40419 + "0"(old)
40420 + : "memory");
40421 + return prev;
40422 +}
40423 +
40424 +#define cmpxchg64(ptr,o,n)\
40425 + ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
40426 + (unsigned long long)(n)))
40427 +
40428 +#endif
40429 +
40430 +/*
40431 + * Force strict CPU ordering.
40432 + * And yes, this is required on UP too when we're talking
40433 + * to devices.
40434 + *
40435 + * For now, "wmb()" doesn't actually do anything, as all
40436 + * Intel CPUs follow what Intel calls a *Processor Order*,
40437 + * in which all writes are seen in the program order even
40438 + * outside the CPU.
40439 + *
40440 + * I expect future Intel CPUs to have a weaker ordering,
40441 + * but I'd also expect them to finally get their act together
40442 + * and add some real memory barriers if so.
40443 + *
40444 + * Some non-Intel clones support out-of-order stores. wmb() ceases to be a
40445 + * nop for these.
40446 + */
40447 +
40448 +
40449 +/*
40450 + * Actually only lfence would be needed for mb() because all stores done
40451 + * by the kernel should be already ordered. But keep a full barrier for now.
40452 + */
40453 +
40454 +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
40455 +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
40456 +
40457 +/**
40458 + * read_barrier_depends - Flush all pending reads that subsequent reads
40459 + * depend on.
40460 + *
40461 + * No data-dependent reads from memory-like regions are ever reordered
40462 + * over this barrier. All reads preceding this primitive are guaranteed
40463 + * to access memory (but not necessarily other CPUs' caches) before any
40464 + * reads following this primitive that depend on the data returned by
40465 + * any of the preceding reads. This primitive is much lighter weight than
40466 + * rmb() on most CPUs, and is never heavier weight than is
40467 + * rmb().
40468 + *
40469 + * These ordering constraints are respected by both the local CPU
40470 + * and the compiler.
40471 + *
40472 + * Ordering is not guaranteed by anything other than these primitives,
40473 + * not even by data dependencies. See the documentation for
40474 + * memory_barrier() for examples and URLs to more information.
40475 + *
40476 + * For example, the following code would force ordering (the initial
40477 + * value of "a" is zero, "b" is one, and "p" is "&a"):
40478 + *
40479 + * <programlisting>
40480 + * CPU 0 CPU 1
40481 + *
40482 + * b = 2;
40483 + * memory_barrier();
40484 + * p = &b; q = p;
40485 + * read_barrier_depends();
40486 + * d = *q;
40487 + * </programlisting>
40488 + *
40489 + * because the read of "*q" depends on the read of "p" and these
40490 + * two reads are separated by a read_barrier_depends(). However,
40491 + * the following code, with the same initial values for "a" and "b":
40492 + *
40493 + * <programlisting>
40494 + * CPU 0 CPU 1
40495 + *
40496 + * a = 2;
40497 + * memory_barrier();
40498 + * b = 3; y = b;
40499 + * read_barrier_depends();
40500 + * x = a;
40501 + * </programlisting>
40502 + *
40503 + * does not enforce ordering, since there is no data dependency between
40504 + * the read of "a" and the read of "b". Therefore, on some CPUs, such
40505 + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
40506 + * in cases like this where there are no data dependencies.
40507 + **/
40508 +
40509 +#define read_barrier_depends() do { } while(0)
40510 +
40511 +#ifdef CONFIG_X86_OOSTORE
40512 +/* Actually there are no OOO store capable CPUs for now that do SSE,
40513 + but make it a possibility already. */
40514 +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
40515 +#else
40516 +#define wmb() __asm__ __volatile__ ("": : :"memory")
40517 +#endif
40518 +
40519 +#ifdef CONFIG_SMP
40520 +#define smp_mb() mb()
40521 +#define smp_rmb() rmb()
40522 +#define smp_wmb() wmb()
40523 +#define smp_read_barrier_depends() read_barrier_depends()
40524 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
40525 +#else
40526 +#define smp_mb() barrier()
40527 +#define smp_rmb() barrier()
40528 +#define smp_wmb() barrier()
40529 +#define smp_read_barrier_depends() do { } while(0)
40530 +#define set_mb(var, value) do { var = value; barrier(); } while (0)
40531 +#endif
40532 +
40533 +#include <linux/irqflags.h>
40534 +
40535 +/*
40536 + * disable hlt during certain critical i/o operations
40537 + */
40538 +#define HAVE_DISABLE_HLT
40539 +void disable_hlt(void);
40540 +void enable_hlt(void);
40541 +
40542 +extern int es7000_plat;
40543 +void cpu_idle_wait(void);
40544 +
40545 +/*
40546 + * On SMP systems, when the scheduler does migration-cost autodetection,
40547 + * it needs a way to flush as much of the CPU's caches as possible:
40548 + */
40549 +static inline void sched_cacheflush(void)
40550 +{
40551 + wbinvd();
40552 +}
40553 +
40554 +extern unsigned long arch_align_stack(unsigned long sp);
40555 +extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
40556 +
40557 +void default_idle(void);
40558 +
40559 +#endif
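A minimal usage sketch, not part of the hunk above: the classic compare-and-swap retry loop built on the cmpxchg() defined in this header. my_add_return() is a hypothetical helper.

static unsigned long my_add_return(volatile unsigned long *p, unsigned long delta)
{
	unsigned long old, new;

	do {
		old = *p;
		new = old + delta;
	} while (cmpxchg(p, old, new) != old);	/* retry if someone raced us */

	return new;
}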
40560 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_32.h
40561 ===================================================================
40562 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40563 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_32.h 2007-11-26 16:59:25.000000000 +0100
40564 @@ -0,0 +1,101 @@
40565 +#ifndef _I386_TLBFLUSH_H
40566 +#define _I386_TLBFLUSH_H
40567 +
40568 +#include <linux/mm.h>
40569 +#include <asm/processor.h>
40570 +
40571 +#define __flush_tlb() xen_tlb_flush()
40572 +#define __flush_tlb_global() xen_tlb_flush()
40573 +#define __flush_tlb_all() xen_tlb_flush()
40574 +
40575 +extern unsigned long pgkern_mask;
40576 +
40577 +#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
40578 +
40579 +#define __flush_tlb_single(addr) xen_invlpg(addr)
40580 +
40581 +#define __flush_tlb_one(addr) __flush_tlb_single(addr)
40582 +
40583 +/*
40584 + * TLB flushing:
40585 + *
40586 + * - flush_tlb() flushes the current mm struct TLBs
40587 + * - flush_tlb_all() flushes all processes TLBs
40588 + * - flush_tlb_mm(mm) flushes the specified mm context TLBs
40589 + * - flush_tlb_page(vma, vmaddr) flushes one page
40590 + * - flush_tlb_range(vma, start, end) flushes a range of pages
40591 + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
40592 + * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
40593 + *
40594 + * ..but the i386 has somewhat limited tlb flushing capabilities,
40595 + * and page-granular flushes are available only on i486 and up.
40596 + */
40597 +
40598 +#ifndef CONFIG_SMP
40599 +
40600 +#define flush_tlb() __flush_tlb()
40601 +#define flush_tlb_all() __flush_tlb_all()
40602 +#define local_flush_tlb() __flush_tlb()
40603 +
40604 +static inline void flush_tlb_mm(struct mm_struct *mm)
40605 +{
40606 + if (mm == current->active_mm)
40607 + __flush_tlb();
40608 +}
40609 +
40610 +static inline void flush_tlb_page(struct vm_area_struct *vma,
40611 + unsigned long addr)
40612 +{
40613 + if (vma->vm_mm == current->active_mm)
40614 + __flush_tlb_one(addr);
40615 +}
40616 +
40617 +static inline void flush_tlb_range(struct vm_area_struct *vma,
40618 + unsigned long start, unsigned long end)
40619 +{
40620 + if (vma->vm_mm == current->active_mm)
40621 + __flush_tlb();
40622 +}
40623 +
40624 +#else
40625 +
40626 +#include <asm/smp.h>
40627 +
40628 +#define local_flush_tlb() \
40629 + __flush_tlb()
40630 +
40631 +#define flush_tlb_all xen_tlb_flush_all
40632 +#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
40633 +#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
40634 +#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
40635 +
40636 +#define flush_tlb() flush_tlb_current_task()
40637 +
40638 +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
40639 +{
40640 + flush_tlb_mm(vma->vm_mm);
40641 +}
40642 +
40643 +#define TLBSTATE_OK 1
40644 +#define TLBSTATE_LAZY 2
40645 +
40646 +struct tlb_state
40647 +{
40648 + struct mm_struct *active_mm;
40649 + int state;
40650 + char __cacheline_padding[L1_CACHE_BYTES-8];
40651 +};
40652 +DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
40653 +
40654 +
40655 +#endif
40656 +
40657 +#define flush_tlb_kernel_range(start, end) flush_tlb_all()
40658 +
40659 +static inline void flush_tlb_pgtables(struct mm_struct *mm,
40660 + unsigned long start, unsigned long end)
40661 +{
40662 + /* i386 does not keep any page table caches in TLB */
40663 +}
40664 +
40665 +#endif /* _I386_TLBFLUSH_H */
40666 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/vga.h
40667 ===================================================================
40668 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40669 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/vga.h 2007-06-12 13:14:02.000000000 +0200
40670 @@ -0,0 +1,20 @@
40671 +/*
40672 + * Access to VGA videoram
40673 + *
40674 + * (c) 1998 Martin Mares <mj@ucw.cz>
40675 + */
40676 +
40677 +#ifndef _LINUX_ASM_VGA_H_
40678 +#define _LINUX_ASM_VGA_H_
40679 +
40680 +/*
40681 + * On the PC, we can just recalculate addresses and then
40682 + * access the videoram directly without any black magic.
40683 + */
40684 +
40685 +#define VGA_MAP_MEM(x,s) (unsigned long)isa_bus_to_virt(x)
40686 +
40687 +#define vga_readb(x) (*(x))
40688 +#define vga_writeb(x,y) (*(y) = (x))
40689 +
40690 +#endif
40691 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/xenoprof.h
40692 ===================================================================
40693 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40694 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/xenoprof.h 2007-06-12 13:14:02.000000000 +0200
40695 @@ -0,0 +1,48 @@
40696 +/******************************************************************************
40697 + * asm-i386/mach-xen/asm/xenoprof.h
40698 + *
40699 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
40700 + * VA Linux Systems Japan K.K.
40701 + *
40702 + * This program is free software; you can redistribute it and/or modify
40703 + * it under the terms of the GNU General Public License as published by
40704 + * the Free Software Foundation; either version 2 of the License, or
40705 + * (at your option) any later version.
40706 + *
40707 + * This program is distributed in the hope that it will be useful,
40708 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
40709 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
40710 + * GNU General Public License for more details.
40711 + *
40712 + * You should have received a copy of the GNU General Public License
40713 + * along with this program; if not, write to the Free Software
40714 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
40715 + *
40716 + */
40717 +#ifndef __ASM_XENOPROF_H__
40718 +#define __ASM_XENOPROF_H__
40719 +#ifdef CONFIG_XEN
40720 +
40721 +struct super_block;
40722 +struct dentry;
40723 +int xenoprof_create_files(struct super_block * sb, struct dentry * root);
40724 +#define HAVE_XENOPROF_CREATE_FILES
40725 +
40726 +struct xenoprof_init;
40727 +void xenoprof_arch_init_counter(struct xenoprof_init *init);
40728 +void xenoprof_arch_counter(void);
40729 +void xenoprof_arch_start(void);
40730 +void xenoprof_arch_stop(void);
40731 +
40732 +struct xenoprof_arch_shared_buffer {
40733 + /* nothing */
40734 +};
40735 +struct xenoprof_shared_buffer;
40736 +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf);
40737 +struct xenoprof_get_buffer;
40738 +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf);
40739 +struct xenoprof_passive;
40740 +int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf);
40741 +
40742 +#endif /* CONFIG_XEN */
40743 +#endif /* __ASM_XENOPROF_H__ */
40744 Index: head-2008-11-25/include/asm-x86/mach-xen/irq_vectors.h
40745 ===================================================================
40746 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40747 +++ head-2008-11-25/include/asm-x86/mach-xen/irq_vectors.h 2008-09-25 13:55:32.000000000 +0200
40748 @@ -0,0 +1,125 @@
40749 +/*
40750 + * This file should contain #defines for all of the interrupt vector
40751 + * numbers used by this architecture.
40752 + *
40753 + * In addition, there are some standard defines:
40754 + *
40755 + * FIRST_EXTERNAL_VECTOR:
40756 + * The first free place for external interrupts
40757 + *
40758 + * SYSCALL_VECTOR:
40759 + * The IRQ vector through which a syscall makes the user-to-kernel
40760 + * transition.
40761 + *
40762 + * TIMER_IRQ:
40763 + * The IRQ number the timer interrupt comes in at.
40764 + *
40765 + * NR_IRQS:
40766 + * The total number of interrupt vectors (including all the
40767 + * architecture specific interrupts) needed.
40768 + *
40769 + */
40770 +#ifndef _ASM_IRQ_VECTORS_H
40771 +#define _ASM_IRQ_VECTORS_H
40772 +
40773 +/*
40774 + * IDT vectors usable for external interrupt sources start
40775 + * at 0x20:
40776 + */
40777 +#define FIRST_EXTERNAL_VECTOR 0x20
40778 +
40779 +#define SYSCALL_VECTOR 0x80
40780 +
40781 +/*
40782 + * Vectors 0x20-0x2f are used for ISA interrupts.
40783 + */
40784 +
40785 +#if 0
40786 +/*
40787 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
40788 + *
40789 + * some of the following vectors are 'rare'; they are merged
40790 + * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
40791 + * TLB, reschedule and local APIC vectors are performance-critical.
40792 + *
40793 + * Vectors 0xf0-0xfa are free (reserved for future Linux use).
40794 + */
40795 +#define SPURIOUS_APIC_VECTOR 0xff
40796 +#define ERROR_APIC_VECTOR 0xfe
40797 +#define INVALIDATE_TLB_VECTOR 0xfd
40798 +#define RESCHEDULE_VECTOR 0xfc
40799 +#define CALL_FUNCTION_VECTOR 0xfb
40800 +
40801 +#define THERMAL_APIC_VECTOR 0xf0
40802 +/*
40803 + * Local APIC timer IRQ vector is on a different priority level,
40804 + * to work around the 'lost local interrupt if more than 2 IRQ
40805 + * sources per level' errata.
40806 + */
40807 +#define LOCAL_TIMER_VECTOR 0xef
40808 +#endif
40809 +
40810 +#define SPURIOUS_APIC_VECTOR 0xff
40811 +#define ERROR_APIC_VECTOR 0xfe
40812 +
40813 +/*
40814 + * First APIC vector available to drivers: (vectors 0x30-0xee)
40815 + * we start at 0x31 to spread out vectors evenly between priority
40816 + * levels. (0x80 is the syscall vector)
40817 + */
40818 +#define FIRST_DEVICE_VECTOR 0x31
40819 +#define FIRST_SYSTEM_VECTOR 0xef
40820 +
40821 +/*
40822 + * 16 8259A IRQs, 208 potential APIC interrupt sources.
40823 + * Right now the APIC is used mostly for SMP.
40824 + * 256 vectors is an architectural limit. (we can have
40825 + * more than 256 devices theoretically, but they will
40826 + * have to use shared interrupts)
40827 + * Since vectors 0x00-0x1f are used/reserved for the CPU,
40828 + * the usable vector space is 0x20-0xff (224 vectors)
40829 + */
40830 +
40831 +#define RESCHEDULE_VECTOR 0
40832 +#define CALL_FUNCTION_VECTOR 1
40833 +#define NR_IPIS 2
40834 +
40835 +/*
40836 + * The maximum number of vectors supported by i386 processors
40837 + * is limited to 256. For processors other than i386, NR_VECTORS
40838 + * should be changed accordingly.
40839 + */
40840 +#define NR_VECTORS 256
40841 +
40842 +#define FPU_IRQ 13
40843 +
40844 +#define FIRST_VM86_IRQ 3
40845 +#define LAST_VM86_IRQ 15
40846 +#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
40847 +
40848 +/*
40849 + * The flat IRQ space is divided into two regions:
40850 + * 1. A one-to-one mapping of real physical IRQs. This space is only used
40851 + * if we have physical device-access privilege. This region is at the
40852 + * start of the IRQ space so that existing device drivers do not need
40853 + * to be modified to translate physical IRQ numbers into our IRQ space.
40854 + * 2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
40855 + * are bound using the provided bind/unbind functions.
40856 + */
40857 +
40858 +#define PIRQ_BASE 0
40859 +#if !defined(MAX_IO_APICS)
40860 +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
40861 +#elif NR_CPUS < MAX_IO_APICS
40862 +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
40863 +#else
40864 +# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
40865 +#endif
40866 +
40867 +#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
40868 +#define NR_DYNIRQS 256
40869 +
40870 +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
40871 +#define NR_IRQ_VECTORS NR_IRQS
40872 +
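As a worked illustration of the sizing above (not part of the patch): with hypothetical build-time values NR_CPUS = 8 and MAX_IO_APICS left undefined, NR_PIRQS = 256 + 32 * 8 = 512, DYNIRQ_BASE = 512 and NR_IRQS = 768. A minimal userspace sketch of the same arithmetic:

#include <stdio.h>

#define NR_VECTORS	256
#define NR_CPUS		8		/* hypothetical configuration value */
#define NR_PIRQS	(NR_VECTORS + 32 * NR_CPUS)
#define PIRQ_BASE	0
#define DYNIRQ_BASE	(PIRQ_BASE + NR_PIRQS)
#define NR_DYNIRQS	256
#define NR_IRQS		(NR_PIRQS + NR_DYNIRQS)

int main(void)
{
	/* prints 512, 512, 768 for the values assumed above */
	printf("NR_PIRQS=%d DYNIRQ_BASE=%d NR_IRQS=%d\n",
	       NR_PIRQS, DYNIRQ_BASE, NR_IRQS);
	return 0;
}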
40873 +#endif /* _ASM_IRQ_VECTORS_H */
40874 Index: head-2008-11-25/include/asm-x86/mach-xen/mach_traps.h
40875 ===================================================================
40876 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40877 +++ head-2008-11-25/include/asm-x86/mach-xen/mach_traps.h 2007-06-12 13:14:02.000000000 +0200
40878 @@ -0,0 +1,33 @@
40879 +/*
40880 + * include/asm-xen/asm-i386/mach-xen/mach_traps.h
40881 + *
40882 + * Machine specific NMI handling for Xen
40883 + */
40884 +#ifndef _MACH_TRAPS_H
40885 +#define _MACH_TRAPS_H
40886 +
40887 +#include <linux/bitops.h>
40888 +#include <xen/interface/nmi.h>
40889 +
40890 +static inline void clear_mem_error(unsigned char reason) {}
40891 +static inline void clear_io_check_error(unsigned char reason) {}
40892 +
40893 +static inline unsigned char get_nmi_reason(void)
40894 +{
40895 + shared_info_t *s = HYPERVISOR_shared_info;
40896 + unsigned char reason = 0;
40897 +
40898 + /* construct a value which looks like it came from
40899 + * port 0x61.
40900 + */
40901 + if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
40902 + reason |= 0x40;
40903 + if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
40904 + reason |= 0x80;
40905 +
40906 + return reason;
40907 +}
40908 +
40909 +static inline void reassert_nmi(void) {}
40910 +
40911 +#endif /* !_MACH_TRAPS_H */
40912 Index: head-2008-11-25/include/asm-x86/mach-xen/setup_arch.h
40913 ===================================================================
40914 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40915 +++ head-2008-11-25/include/asm-x86/mach-xen/setup_arch.h 2007-06-12 13:14:02.000000000 +0200
40916 @@ -0,0 +1,5 @@
40917 +/* Hook to call BIOS initialisation function */
40918 +
40919 +#define ARCH_SETUP machine_specific_arch_setup();
40920 +
40921 +void __init machine_specific_arch_setup(void);
40922 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/desc_64.h
40923 ===================================================================
40924 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
40925 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/desc_64.h 2008-01-28 12:24:19.000000000 +0100
40926 @@ -0,0 +1,265 @@
40927 +/* Written 2000 by Andi Kleen */
40928 +#ifndef __ARCH_DESC_H
40929 +#define __ARCH_DESC_H
40930 +
40931 +#include <linux/threads.h>
40932 +#include <asm/ldt.h>
40933 +
40934 +#ifndef __ASSEMBLY__
40935 +
40936 +#include <linux/string.h>
40937 +#include <linux/smp.h>
40938 +
40939 +#include <asm/segment.h>
40940 +#include <asm/mmu.h>
40941 +
40942 +// 8 byte segment descriptor
40943 +struct desc_struct {
40944 + u16 limit0;
40945 + u16 base0;
40946 + unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
40947 + unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
40948 +} __attribute__((packed));
40949 +
40950 +struct n_desc_struct {
40951 + unsigned int a,b;
40952 +};
40953 +
40954 +enum {
40955 + GATE_INTERRUPT = 0xE,
40956 + GATE_TRAP = 0xF,
40957 + GATE_CALL = 0xC,
40958 +};
40959 +
40960 +// 16byte gate
40961 +struct gate_struct {
40962 + u16 offset_low;
40963 + u16 segment;
40964 + unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
40965 + u16 offset_middle;
40966 + u32 offset_high;
40967 + u32 zero1;
40968 +} __attribute__((packed));
40969 +
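A small userspace sketch, not part of the patch, that replays the packed gate layout above with <stdint.h> types standing in for u16/u32 and confirms the descriptor really occupies 16 bytes:

#include <stdio.h>
#include <stdint.h>

/* field names copied from the header; only the integer types differ */
struct gate_struct_sketch {
	uint16_t offset_low;
	uint16_t segment;
	unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
	uint16_t offset_middle;
	uint32_t offset_high;
	uint32_t zero1;
} __attribute__((packed));

int main(void)
{
	/* expected output: 16 */
	printf("sizeof(struct gate_struct_sketch) = %zu\n",
	       sizeof(struct gate_struct_sketch));
	return 0;
}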
40970 +#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF)
40971 +#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF)
40972 +#define PTR_HIGH(x) ((unsigned long)(x) >> 32)
40973 +
40974 +enum {
40975 + DESC_TSS = 0x9,
40976 + DESC_LDT = 0x2,
40977 +};
40978 +
40979 +// LDT or TSS descriptor in the GDT. 16 bytes.
40980 +struct ldttss_desc {
40981 + u16 limit0;
40982 + u16 base0;
40983 + unsigned base1 : 8, type : 5, dpl : 2, p : 1;
40984 + unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
40985 + u32 base3;
40986 + u32 zero1;
40987 +} __attribute__((packed));
40988 +
40989 +struct desc_ptr {
40990 + unsigned short size;
40991 + unsigned long address;
40992 +} __attribute__((packed)) ;
40993 +
40994 +extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
40995 +
40996 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
40997 +
40998 +#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
40999 +#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
41000 +
41001 +static inline void clear_LDT(void)
41002 +{
41003 + int cpu = get_cpu();
41004 +
41005 + /*
41006 + * NB. We load the default_ldt for lcall7/27 handling on demand, as
41007 + * it slows down context switching. No one uses it anyway.
41008 + */
41009 + cpu = cpu; /* XXX avoid compiler warning */
41010 + xen_set_ldt(NULL, 0);
41011 + put_cpu();
41012 +}
41013 +
41014 +/*
41015 + * This is the ldt that every process will get unless we need
41016 + * something other than this.
41017 + */
41018 +extern struct desc_struct default_ldt[];
41019 +#ifndef CONFIG_X86_NO_IDT
41020 +extern struct gate_struct idt_table[];
41021 +#endif
41022 +extern struct desc_ptr cpu_gdt_descr[];
41023 +
41024 +/* the cpu gdt accessor */
41025 +#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
41026 +
41027 +static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)
41028 +{
41029 + struct gate_struct s;
41030 + s.offset_low = PTR_LOW(func);
41031 + s.segment = __KERNEL_CS;
41032 + s.ist = ist;
41033 + s.p = 1;
41034 + s.dpl = dpl;
41035 + s.zero0 = 0;
41036 + s.zero1 = 0;
41037 + s.type = type;
41038 + s.offset_middle = PTR_MIDDLE(func);
41039 + s.offset_high = PTR_HIGH(func);
41040 + /* does not need to be atomic because it is only done once at setup time */
41041 + memcpy(adr, &s, 16);
41042 +}
41043 +
41044 +#ifndef CONFIG_X86_NO_IDT
41045 +static inline void set_intr_gate(int nr, void *func)
41046 +{
41047 + BUG_ON((unsigned)nr > 0xFF);
41048 + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
41049 +}
41050 +
41051 +static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
41052 +{
41053 + BUG_ON((unsigned)nr > 0xFF);
41054 + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
41055 +}
41056 +
41057 +static inline void set_system_gate(int nr, void *func)
41058 +{
41059 + BUG_ON((unsigned)nr > 0xFF);
41060 + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
41061 +}
41062 +
41063 +static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
41064 +{
41065 + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
41066 +}
41067 +#endif
41068 +
41069 +static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type,
41070 + unsigned size)
41071 +{
41072 + struct ldttss_desc d;
41073 + memset(&d,0,sizeof(d));
41074 + d.limit0 = size & 0xFFFF;
41075 + d.base0 = PTR_LOW(tss);
41076 + d.base1 = PTR_MIDDLE(tss) & 0xFF;
41077 + d.type = type;
41078 + d.p = 1;
41079 + d.limit1 = (size >> 16) & 0xF;
41080 + d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF;
41081 + d.base3 = PTR_HIGH(tss);
41082 + memcpy(ptr, &d, 16);
41083 +}
41084 +
41085 +#ifndef CONFIG_X86_NO_TSS
41086 +static inline void set_tss_desc(unsigned cpu, void *addr)
41087 +{
41088 + /*
41089 + * sizeof(unsigned long) coming from an extra "long" at the end
41090 + * of the iobitmap. See tss_struct definition in processor.h
41091 + *
41092 + * The -1 is because seg base+limit should point to the address
41093 + * of the last valid byte
41094 + */
41095 + set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS],
41096 + (unsigned long)addr, DESC_TSS,
41097 + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
41098 +}
41099 +#endif
41100 +
41101 +static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
41102 +{
41103 + set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
41104 + DESC_LDT, size * 8 - 1);
41105 +}
41106 +
41107 +static inline void set_seg_base(unsigned cpu, int entry, void *base)
41108 +{
41109 + struct desc_struct *d = &cpu_gdt(cpu)[entry];
41110 + u32 addr = (u32)(u64)base;
41111 + BUG_ON((u64)base >> 32);
41112 + d->base0 = addr & 0xffff;
41113 + d->base1 = (addr >> 16) & 0xff;
41114 + d->base2 = (addr >> 24) & 0xff;
41115 +}
41116 +
41117 +#define LDT_entry_a(info) \
41118 + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
41119 +/* Don't allow setting of the lm bit. It is useless anyway because
41120 + 64bit system calls require __USER_CS. */
41121 +#define LDT_entry_b(info) \
41122 + (((info)->base_addr & 0xff000000) | \
41123 + (((info)->base_addr & 0x00ff0000) >> 16) | \
41124 + ((info)->limit & 0xf0000) | \
41125 + (((info)->read_exec_only ^ 1) << 9) | \
41126 + ((info)->contents << 10) | \
41127 + (((info)->seg_not_present ^ 1) << 15) | \
41128 + ((info)->seg_32bit << 22) | \
41129 + ((info)->limit_in_pages << 23) | \
41130 + ((info)->useable << 20) | \
41131 + /* ((info)->lm << 21) | */ \
41132 + 0x7000)
41133 +
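A hedged userspace sketch of the LDT_entry_a()/LDT_entry_b() packing above; the struct mirrors the relevant fields of struct user_desc from <asm/ldt.h>, and the chosen values (a flat, 32-bit, 4 GiB data segment) are made up purely for illustration:

#include <stdio.h>
#include <stdint.h>

struct user_desc_sketch {
	unsigned int base_addr;
	unsigned int limit;
	unsigned int seg_32bit:1;
	unsigned int contents:2;
	unsigned int read_exec_only:1;
	unsigned int limit_in_pages:1;
	unsigned int seg_not_present:1;
	unsigned int useable:1;
	unsigned int lm:1;
};

int main(void)
{
	struct user_desc_sketch d = {
		.base_addr = 0, .limit = 0xfffff,
		.seg_32bit = 1, .limit_in_pages = 1, .useable = 1,
	};
	/* same bit arithmetic as LDT_entry_a()/LDT_entry_b() above */
	uint32_t a = ((d.base_addr & 0x0000ffff) << 16) | (d.limit & 0x0ffff);
	uint32_t b = (d.base_addr & 0xff000000)
	           | ((d.base_addr & 0x00ff0000) >> 16)
	           | (d.limit & 0xf0000)
	           | ((d.read_exec_only ^ 1) << 9)
	           | (d.contents << 10)
	           | ((d.seg_not_present ^ 1) << 15)
	           | (d.seg_32bit << 22)
	           | (d.limit_in_pages << 23)
	           | (d.useable << 20)
	           | 0x7000;
	printf("entry a=%#010x b=%#010x\n", (unsigned)a, (unsigned)b);
	return 0;
}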
41134 +#define LDT_empty(info) (\
41135 + (info)->base_addr == 0 && \
41136 + (info)->limit == 0 && \
41137 + (info)->contents == 0 && \
41138 + (info)->read_exec_only == 1 && \
41139 + (info)->seg_32bit == 0 && \
41140 + (info)->limit_in_pages == 0 && \
41141 + (info)->seg_not_present == 1 && \
41142 + (info)->useable == 0 && \
41143 + (info)->lm == 0)
41144 +
41145 +#if TLS_SIZE != 24
41146 +# error update this code.
41147 +#endif
41148 +
41149 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
41150 +{
41151 +#if 0
41152 + u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
41153 + gdt[0] = t->tls_array[0];
41154 + gdt[1] = t->tls_array[1];
41155 + gdt[2] = t->tls_array[2];
41156 +#endif
41157 +#define C(i) \
41158 + if (HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), \
41159 + t->tls_array[i])) \
41160 + BUG();
41161 +
41162 + C(0); C(1); C(2);
41163 +#undef C
41164 +}
41165 +
41166 +/*
41167 + * load one particular LDT into the current CPU
41168 + */
41169 +static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
41170 +{
41171 + void *segments = pc->ldt;
41172 + int count = pc->size;
41173 +
41174 + if (likely(!count))
41175 + segments = NULL;
41176 +
41177 + xen_set_ldt(segments, count);
41178 +}
41179 +
41180 +static inline void load_LDT(mm_context_t *pc)
41181 +{
41182 + int cpu = get_cpu();
41183 + load_LDT_nolock(pc, cpu);
41184 + put_cpu();
41185 +}
41186 +
41187 +extern struct desc_ptr idt_descr;
41188 +
41189 +#endif /* !__ASSEMBLY__ */
41190 +
41191 +#endif
41192 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_64.h
41193 ===================================================================
41194 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
41195 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/dma-mapping_64.h 2007-06-12 13:14:13.000000000 +0200
41196 @@ -0,0 +1,207 @@
41197 +#ifndef _X8664_DMA_MAPPING_H
41198 +#define _X8664_DMA_MAPPING_H 1
41199 +
41200 +/*
41201 + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
41202 + * documentation.
41203 + */
41204 +
41205 +
41206 +#include <asm/scatterlist.h>
41207 +#include <asm/io.h>
41208 +#include <asm/swiotlb.h>
41209 +
41210 +struct dma_mapping_ops {
41211 + int (*mapping_error)(dma_addr_t dma_addr);
41212 + void* (*alloc_coherent)(struct device *dev, size_t size,
41213 + dma_addr_t *dma_handle, gfp_t gfp);
41214 + void (*free_coherent)(struct device *dev, size_t size,
41215 + void *vaddr, dma_addr_t dma_handle);
41216 + dma_addr_t (*map_single)(struct device *hwdev, void *ptr,
41217 + size_t size, int direction);
41218 + /* like map_single, but doesn't check the device mask */
41219 + dma_addr_t (*map_simple)(struct device *hwdev, char *ptr,
41220 + size_t size, int direction);
41221 + void (*unmap_single)(struct device *dev, dma_addr_t addr,
41222 + size_t size, int direction);
41223 + void (*sync_single_for_cpu)(struct device *hwdev,
41224 + dma_addr_t dma_handle, size_t size,
41225 + int direction);
41226 + void (*sync_single_for_device)(struct device *hwdev,
41227 + dma_addr_t dma_handle, size_t size,
41228 + int direction);
41229 + void (*sync_single_range_for_cpu)(struct device *hwdev,
41230 + dma_addr_t dma_handle, unsigned long offset,
41231 + size_t size, int direction);
41232 + void (*sync_single_range_for_device)(struct device *hwdev,
41233 + dma_addr_t dma_handle, unsigned long offset,
41234 + size_t size, int direction);
41235 + void (*sync_sg_for_cpu)(struct device *hwdev,
41236 + struct scatterlist *sg, int nelems,
41237 + int direction);
41238 + void (*sync_sg_for_device)(struct device *hwdev,
41239 + struct scatterlist *sg, int nelems,
41240 + int direction);
41241 + int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
41242 + int nents, int direction);
41243 + void (*unmap_sg)(struct device *hwdev,
41244 + struct scatterlist *sg, int nents,
41245 + int direction);
41246 + int (*dma_supported)(struct device *hwdev, u64 mask);
41247 + int is_phys;
41248 +};
41249 +
41250 +extern dma_addr_t bad_dma_address;
41251 +extern struct dma_mapping_ops* dma_ops;
41252 +extern int iommu_merge;
41253 +
41254 +static inline int valid_dma_direction(int dma_direction)
41255 +{
41256 + return ((dma_direction == DMA_BIDIRECTIONAL) ||
41257 + (dma_direction == DMA_TO_DEVICE) ||
41258 + (dma_direction == DMA_FROM_DEVICE));
41259 +}
41260 +
41261 +#if 0
41262 +static inline int dma_mapping_error(dma_addr_t dma_addr)
41263 +{
41264 + if (dma_ops->mapping_error)
41265 + return dma_ops->mapping_error(dma_addr);
41266 +
41267 + return (dma_addr == bad_dma_address);
41268 +}
41269 +
41270 +extern void *dma_alloc_coherent(struct device *dev, size_t size,
41271 + dma_addr_t *dma_handle, gfp_t gfp);
41272 +extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
41273 + dma_addr_t dma_handle);
41274 +
41275 +static inline dma_addr_t
41276 +dma_map_single(struct device *hwdev, void *ptr, size_t size,
41277 + int direction)
41278 +{
41279 + BUG_ON(!valid_dma_direction(direction));
41280 + return dma_ops->map_single(hwdev, ptr, size, direction);
41281 +}
41282 +
41283 +static inline void
41284 +dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
41285 + int direction)
41286 +{
41287 + BUG_ON(!valid_dma_direction(direction));
41288 + dma_ops->unmap_single(dev, addr, size, direction);
41289 +}
41290 +
41291 +#define dma_map_page(dev,page,offset,size,dir) \
41292 + dma_map_single((dev), page_address(page)+(offset), (size), (dir))
41293 +
41294 +#define dma_unmap_page dma_unmap_single
41295 +
41296 +static inline void
41297 +dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
41298 + size_t size, int direction)
41299 +{
41300 + BUG_ON(!valid_dma_direction(direction));
41301 + if (dma_ops->sync_single_for_cpu)
41302 + dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
41303 + direction);
41304 + flush_write_buffers();
41305 +}
41306 +
41307 +static inline void
41308 +dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
41309 + size_t size, int direction)
41310 +{
41311 + BUG_ON(!valid_dma_direction(direction));
41312 + if (dma_ops->sync_single_for_device)
41313 + dma_ops->sync_single_for_device(hwdev, dma_handle, size,
41314 + direction);
41315 + flush_write_buffers();
41316 +}
41317 +
41318 +static inline void
41319 +dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
41320 + unsigned long offset, size_t size, int direction)
41321 +{
41322 + BUG_ON(!valid_dma_direction(direction));
41323 + if (dma_ops->sync_single_range_for_cpu) {
41324 + dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction);
41325 + }
41326 +
41327 + flush_write_buffers();
41328 +}
41329 +
41330 +static inline void
41331 +dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
41332 + unsigned long offset, size_t size, int direction)
41333 +{
41334 + BUG_ON(!valid_dma_direction(direction));
41335 + if (dma_ops->sync_single_range_for_device)
41336 + dma_ops->sync_single_range_for_device(hwdev, dma_handle,
41337 + offset, size, direction);
41338 +
41339 + flush_write_buffers();
41340 +}
41341 +
41342 +static inline void
41343 +dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
41344 + int nelems, int direction)
41345 +{
41346 + BUG_ON(!valid_dma_direction(direction));
41347 + if (dma_ops->sync_sg_for_cpu)
41348 + dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
41349 + flush_write_buffers();
41350 +}
41351 +
41352 +static inline void
41353 +dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
41354 + int nelems, int direction)
41355 +{
41356 + BUG_ON(!valid_dma_direction(direction));
41357 + if (dma_ops->sync_sg_for_device) {
41358 + dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
41359 + }
41360 +
41361 + flush_write_buffers();
41362 +}
41363 +
41364 +static inline int
41365 +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction)
41366 +{
41367 + BUG_ON(!valid_dma_direction(direction));
41368 + return dma_ops->map_sg(hwdev, sg, nents, direction);
41369 +}
41370 +
41371 +static inline void
41372 +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
41373 + int direction)
41374 +{
41375 + BUG_ON(!valid_dma_direction(direction));
41376 + dma_ops->unmap_sg(hwdev, sg, nents, direction);
41377 +}
41378 +
41379 +extern int dma_supported(struct device *hwdev, u64 mask);
41380 +
41381 +/* same for gart, swiotlb, and nommu */
41382 +static inline int dma_get_cache_alignment(void)
41383 +{
41384 + return boot_cpu_data.x86_clflush_size;
41385 +}
41386 +
41387 +#define dma_is_consistent(h) 1
41388 +
41389 +extern int dma_set_mask(struct device *dev, u64 mask);
41390 +
41391 +static inline void
41392 +dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir)
41393 +{
41394 + flush_write_buffers();
41395 +}
41396 +
41397 +extern struct device fallback_dev;
41398 +extern int panic_on_overflow;
41399 +#endif
41400 +
41401 +#endif /* _X8664_DMA_MAPPING_H */
41402 +
41403 +#include <asm-i386/mach-xen/asm/dma-mapping.h>
41404 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_64.h
41405 ===================================================================
41406 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
41407 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/fixmap_64.h 2007-06-12 13:14:13.000000000 +0200
41408 @@ -0,0 +1,112 @@
41409 +/*
41410 + * fixmap.h: compile-time virtual memory allocation
41411 + *
41412 + * This file is subject to the terms and conditions of the GNU General Public
41413 + * License. See the file "COPYING" in the main directory of this archive
41414 + * for more details.
41415 + *
41416 + * Copyright (C) 1998 Ingo Molnar
41417 + */
41418 +
41419 +#ifndef _ASM_FIXMAP_H
41420 +#define _ASM_FIXMAP_H
41421 +
41422 +#include <linux/kernel.h>
41423 +#include <asm/apicdef.h>
41424 +#include <asm/page.h>
41425 +#include <asm/vsyscall.h>
41426 +#include <asm/vsyscall32.h>
41427 +#include <asm/acpi.h>
41428 +
41429 +/*
41430 + * Here we define all the compile-time 'special' virtual
41431 + * addresses. The point is to have a constant address at
41432 + * compile time, but to set the physical address only
41433 + * in the boot process.
41434 + *
41435 + * these 'compile-time allocated' memory buffers are
41436 + * fixed-size 4k pages. (or larger if used with an increment
41437 + * highger than 1) use fixmap_set(idx,phys) to associate
41438 + * physical memory with fixmap indices.
41439 + *
41440 + * TLB entries of such buffers will not be flushed across
41441 + * task switches.
41442 + */
41443 +
41444 +enum fixed_addresses {
41445 + VSYSCALL_LAST_PAGE,
41446 + VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
41447 + VSYSCALL_HPET,
41448 + FIX_HPET_BASE,
41449 +#ifdef CONFIG_X86_LOCAL_APIC
41450 + FIX_APIC_BASE, /* local (CPU) APIC -- required for SMP or not */
41451 +#endif
41452 +#ifdef CONFIG_X86_IO_APIC
41453 + FIX_IO_APIC_BASE_0,
41454 + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
41455 +#endif
41456 +#ifdef CONFIG_ACPI
41457 + FIX_ACPI_BEGIN,
41458 + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
41459 +#endif
41460 + FIX_SHARED_INFO,
41461 +#define NR_FIX_ISAMAPS 256
41462 + FIX_ISAMAP_END,
41463 + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
41464 + __end_of_permanent_fixed_addresses,
41465 + /* temporary boot-time mappings, used before ioremap() is functional */
41466 +#define NR_FIX_BTMAPS 16
41467 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
41468 + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
41469 + __end_of_fixed_addresses
41470 +};
41471 +
41472 +extern void __set_fixmap (enum fixed_addresses idx,
41473 + unsigned long phys, pgprot_t flags);
41474 +
41475 +#define set_fixmap(idx, phys) \
41476 + __set_fixmap(idx, phys, PAGE_KERNEL)
41477 +/*
41478 + * Some hardware wants to get fixmapped without caching.
41479 + */
41480 +#define set_fixmap_nocache(idx, phys) \
41481 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
41482 +
41483 +#define clear_fixmap(idx) \
41484 + __set_fixmap(idx, 0, __pgprot(0))
41485 +
41486 +#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
41487 +#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
41488 +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
41489 +
41490 +/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
41491 +#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
41492 +#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
41493 +
41494 +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
41495 +
41496 +extern void __this_fixmap_does_not_exist(void);
41497 +
41498 +/*
41499 + * 'index to address' translation. If anyone tries to use the idx
41500 + * directly without translation, we catch the bug with a NULL-dereference
41501 + * kernel oops. Illegal ranges of incoming indices are caught too.
41502 + */
41503 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
41504 +{
41505 + /*
41506 + * this branch gets completely eliminated after inlining,
41507 + * except when someone tries to use fixaddr indices in an
41508 + * illegal way. (such as mixing up address types or using
41509 + * out-of-range indices).
41510 + *
41511 + * If it doesn't get removed, the linker will complain
41512 + * loudly with a reasonably clear error message..
41513 + */
41514 + if (idx >= __end_of_fixed_addresses)
41515 + __this_fixmap_does_not_exist();
41516 +
41517 + return __fix_to_virt(idx);
41518 +}
41519 +
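A worked example of the __fix_to_virt() arithmetic above, runnable in userspace; SKETCH_FIXADDR_TOP and SKETCH_PAGE_SHIFT are made-up stand-ins for the real FIXADDR_TOP and PAGE_SHIFT, which come from the headers included above:

#include <stdio.h>

#define SKETCH_PAGE_SHIFT	12
#define SKETCH_FIXADDR_TOP	0xffffffffff600000UL	/* hypothetical value */
#define sketch_fix_to_virt(x) \
	(SKETCH_FIXADDR_TOP - ((unsigned long)(x) << SKETCH_PAGE_SHIFT))

int main(void)
{
	/* higher indices sit at lower addresses, one page apart */
	for (unsigned int idx = 0; idx < 4; idx++)
		printf("fix_to_virt(%u) = %#lx\n", idx, sketch_fix_to_virt(idx));
	return 0;
}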
41520 +#endif
41521 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_64.h
41522 ===================================================================
41523 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
41524 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/hypercall_64.h 2008-11-25 12:22:34.000000000 +0100
41525 @@ -0,0 +1,408 @@
41526 +/******************************************************************************
41527 + * hypercall.h
41528 + *
41529 + * Linux-specific hypervisor handling.
41530 + *
41531 + * Copyright (c) 2002-2004, K A Fraser
41532 + *
41533 + * 64-bit updates:
41534 + * Benjamin Liu <benjamin.liu@intel.com>
41535 + * Jun Nakajima <jun.nakajima@intel.com>
41536 + *
41537 + * This program is free software; you can redistribute it and/or
41538 + * modify it under the terms of the GNU General Public License version 2
41539 + * as published by the Free Software Foundation; or, when distributed
41540 + * separately from the Linux kernel or incorporated into other
41541 + * software packages, subject to the following license:
41542 + *
41543 + * Permission is hereby granted, free of charge, to any person obtaining a copy
41544 + * of this source file (the "Software"), to deal in the Software without
41545 + * restriction, including without limitation the rights to use, copy, modify,
41546 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
41547 + * and to permit persons to whom the Software is furnished to do so, subject to
41548 + * the following conditions:
41549 + *
41550 + * The above copyright notice and this permission notice shall be included in
41551 + * all copies or substantial portions of the Software.
41552 + *
41553 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41554 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
41555 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41556 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
41557 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
41558 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
41559 + * IN THE SOFTWARE.
41560 + */
41561 +
41562 +#ifndef __HYPERCALL_H__
41563 +#define __HYPERCALL_H__
41564 +
41565 +#include <linux/string.h> /* memcpy() */
41566 +#include <linux/stringify.h>
41567 +
41568 +#ifndef __HYPERVISOR_H__
41569 +# error "please don't include this file directly"
41570 +#endif
41571 +
41572 +#ifdef CONFIG_XEN
41573 +#define HYPERCALL_STR(name) \
41574 + "call hypercall_page + ("__stringify(__HYPERVISOR_##name)" * 32)"
41575 +#else
41576 +#define HYPERCALL_STR(name) \
41577 + "mov $("__stringify(__HYPERVISOR_##name)" * 32),%%eax; "\
41578 + "add hypercall_stubs(%%rip),%%rax; " \
41579 + "call *%%rax"
41580 +#endif
41581 +
41582 +#define _hypercall0(type, name) \
41583 +({ \
41584 + type __res; \
41585 + asm volatile ( \
41586 + HYPERCALL_STR(name) \
41587 + : "=a" (__res) \
41588 + : \
41589 + : "memory" ); \
41590 + __res; \
41591 +})
41592 +
41593 +#define _hypercall1(type, name, a1) \
41594 +({ \
41595 + type __res; \
41596 + long __ign1; \
41597 + asm volatile ( \
41598 + HYPERCALL_STR(name) \
41599 + : "=a" (__res), "=D" (__ign1) \
41600 + : "1" ((long)(a1)) \
41601 + : "memory" ); \
41602 + __res; \
41603 +})
41604 +
41605 +#define _hypercall2(type, name, a1, a2) \
41606 +({ \
41607 + type __res; \
41608 + long __ign1, __ign2; \
41609 + asm volatile ( \
41610 + HYPERCALL_STR(name) \
41611 + : "=a" (__res), "=D" (__ign1), "=S" (__ign2) \
41612 + : "1" ((long)(a1)), "2" ((long)(a2)) \
41613 + : "memory" ); \
41614 + __res; \
41615 +})
41616 +
41617 +#define _hypercall3(type, name, a1, a2, a3) \
41618 +({ \
41619 + type __res; \
41620 + long __ign1, __ign2, __ign3; \
41621 + asm volatile ( \
41622 + HYPERCALL_STR(name) \
41623 + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
41624 + "=d" (__ign3) \
41625 + : "1" ((long)(a1)), "2" ((long)(a2)), \
41626 + "3" ((long)(a3)) \
41627 + : "memory" ); \
41628 + __res; \
41629 +})
41630 +
41631 +#define _hypercall4(type, name, a1, a2, a3, a4) \
41632 +({ \
41633 + type __res; \
41634 + long __ign1, __ign2, __ign3; \
41635 + register long __arg4 asm("r10") = (long)(a4); \
41636 + asm volatile ( \
41637 + HYPERCALL_STR(name) \
41638 + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
41639 + "=d" (__ign3), "+r" (__arg4) \
41640 + : "1" ((long)(a1)), "2" ((long)(a2)), \
41641 + "3" ((long)(a3)) \
41642 + : "memory" ); \
41643 + __res; \
41644 +})
41645 +
41646 +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
41647 +({ \
41648 + type __res; \
41649 + long __ign1, __ign2, __ign3; \
41650 + register long __arg4 asm("r10") = (long)(a4); \
41651 + register long __arg5 asm("r8") = (long)(a5); \
41652 + asm volatile ( \
41653 + HYPERCALL_STR(name) \
41654 + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
41655 + "=d" (__ign3), "+r" (__arg4), "+r" (__arg5) \
41656 + : "1" ((long)(a1)), "2" ((long)(a2)), \
41657 + "3" ((long)(a3)) \
41658 + : "memory" ); \
41659 + __res; \
41660 +})
41661 +
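A hedged, Linux/x86-64-only userspace sketch of the register-constraint pattern the _hypercallN macros rely on: arguments are pinned to %rdi/%rsi/%rdx (with r10 and r8 for the fourth and fifth), the result comes back in %rax, and memory is clobbered. The hypercall page is replaced by a local stub so the sketch runs without a hypervisor; the stub and its behaviour are invented for the example and, like the real hypervisor entry, it preserves every register it does not return in:

#include <stdio.h>

/* stand-in for "call hypercall_page + (__HYPERVISOR_xxx * 32)" */
__asm__(
	".text\n"
	"stub:\n"
	"	lea (%rdi,%rsi,1), %rax\n"	/* rax = a1 + a2 */
	"	add %rdx, %rax\n"		/* rax += a3     */
	"	ret\n");

#define fake_hypercall3(a1, a2, a3)					\
({									\
	long __res, __ign1, __ign2, __ign3;				\
	asm volatile (							\
		"call stub"						\
		: "=a" (__res), "=D" (__ign1), "=S" (__ign2),		\
		  "=d" (__ign3)						\
		: "1" ((long)(a1)), "2" ((long)(a2)), "3" ((long)(a3))	\
		: "memory" );						\
	__res;								\
})

int main(void)
{
	printf("%ld\n", fake_hypercall3(1, 2, 3));	/* prints 6 */
	return 0;
}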
41662 +static inline int __must_check
41663 +HYPERVISOR_set_trap_table(
41664 + const trap_info_t *table)
41665 +{
41666 + return _hypercall1(int, set_trap_table, table);
41667 +}
41668 +
41669 +static inline int __must_check
41670 +HYPERVISOR_mmu_update(
41671 + mmu_update_t *req, unsigned int count, unsigned int *success_count,
41672 + domid_t domid)
41673 +{
41674 + return _hypercall4(int, mmu_update, req, count, success_count, domid);
41675 +}
41676 +
41677 +static inline int __must_check
41678 +HYPERVISOR_mmuext_op(
41679 + struct mmuext_op *op, unsigned int count, unsigned int *success_count,
41680 + domid_t domid)
41681 +{
41682 + return _hypercall4(int, mmuext_op, op, count, success_count, domid);
41683 +}
41684 +
41685 +static inline int __must_check
41686 +HYPERVISOR_set_gdt(
41687 + unsigned long *frame_list, unsigned int entries)
41688 +{
41689 + return _hypercall2(int, set_gdt, frame_list, entries);
41690 +}
41691 +
41692 +static inline int __must_check
41693 +HYPERVISOR_stack_switch(
41694 + unsigned long ss, unsigned long esp)
41695 +{
41696 + return _hypercall2(int, stack_switch, ss, esp);
41697 +}
41698 +
41699 +static inline int __must_check
41700 +HYPERVISOR_set_callbacks(
41701 + unsigned long event_address, unsigned long failsafe_address,
41702 + unsigned long syscall_address)
41703 +{
41704 + return _hypercall3(int, set_callbacks,
41705 + event_address, failsafe_address, syscall_address);
41706 +}
41707 +
41708 +static inline int
41709 +HYPERVISOR_fpu_taskswitch(
41710 + int set)
41711 +{
41712 + return _hypercall1(int, fpu_taskswitch, set);
41713 +}
41714 +
41715 +static inline int __must_check
41716 +HYPERVISOR_sched_op_compat(
41717 + int cmd, unsigned long arg)
41718 +{
41719 + return _hypercall2(int, sched_op_compat, cmd, arg);
41720 +}
41721 +
41722 +static inline int __must_check
41723 +HYPERVISOR_sched_op(
41724 + int cmd, void *arg)
41725 +{
41726 + return _hypercall2(int, sched_op, cmd, arg);
41727 +}
41728 +
41729 +static inline long __must_check
41730 +HYPERVISOR_set_timer_op(
41731 + u64 timeout)
41732 +{
41733 + return _hypercall1(long, set_timer_op, timeout);
41734 +}
41735 +
41736 +static inline int __must_check
41737 +HYPERVISOR_platform_op(
41738 + struct xen_platform_op *platform_op)
41739 +{
41740 + platform_op->interface_version = XENPF_INTERFACE_VERSION;
41741 + return _hypercall1(int, platform_op, platform_op);
41742 +}
41743 +
41744 +static inline int __must_check
41745 +HYPERVISOR_set_debugreg(
41746 + unsigned int reg, unsigned long value)
41747 +{
41748 + return _hypercall2(int, set_debugreg, reg, value);
41749 +}
41750 +
41751 +static inline unsigned long __must_check
41752 +HYPERVISOR_get_debugreg(
41753 + unsigned int reg)
41754 +{
41755 + return _hypercall1(unsigned long, get_debugreg, reg);
41756 +}
41757 +
41758 +static inline int __must_check
41759 +HYPERVISOR_update_descriptor(
41760 + unsigned long ma, unsigned long word)
41761 +{
41762 + return _hypercall2(int, update_descriptor, ma, word);
41763 +}
41764 +
41765 +static inline int __must_check
41766 +HYPERVISOR_memory_op(
41767 + unsigned int cmd, void *arg)
41768 +{
41769 + return _hypercall2(int, memory_op, cmd, arg);
41770 +}
41771 +
41772 +static inline int __must_check
41773 +HYPERVISOR_multicall(
41774 + multicall_entry_t *call_list, unsigned int nr_calls)
41775 +{
41776 + return _hypercall2(int, multicall, call_list, nr_calls);
41777 +}
41778 +
41779 +static inline int __must_check
41780 +HYPERVISOR_update_va_mapping(
41781 + unsigned long va, pte_t new_val, unsigned long flags)
41782 +{
41783 + return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
41784 +}
41785 +
41786 +static inline int __must_check
41787 +HYPERVISOR_event_channel_op(
41788 + int cmd, void *arg)
41789 +{
41790 + int rc = _hypercall2(int, event_channel_op, cmd, arg);
41791 +
41792 +#if CONFIG_XEN_COMPAT <= 0x030002
41793 + if (unlikely(rc == -ENOSYS)) {
41794 + struct evtchn_op op;
41795 + op.cmd = cmd;
41796 + memcpy(&op.u, arg, sizeof(op.u));
41797 + rc = _hypercall1(int, event_channel_op_compat, &op);
41798 + memcpy(arg, &op.u, sizeof(op.u));
41799 + }
41800 +#endif
41801 +
41802 + return rc;
41803 +}
41804 +
41805 +static inline int __must_check
41806 +HYPERVISOR_xen_version(
41807 + int cmd, void *arg)
41808 +{
41809 + return _hypercall2(int, xen_version, cmd, arg);
41810 +}
41811 +
41812 +static inline int __must_check
41813 +HYPERVISOR_console_io(
41814 + int cmd, unsigned int count, char *str)
41815 +{
41816 + return _hypercall3(int, console_io, cmd, count, str);
41817 +}
41818 +
41819 +static inline int __must_check
41820 +HYPERVISOR_physdev_op(
41821 + int cmd, void *arg)
41822 +{
41823 + int rc = _hypercall2(int, physdev_op, cmd, arg);
41824 +
41825 +#if CONFIG_XEN_COMPAT <= 0x030002
41826 + if (unlikely(rc == -ENOSYS)) {
41827 + struct physdev_op op;
41828 + op.cmd = cmd;
41829 + memcpy(&op.u, arg, sizeof(op.u));
41830 + rc = _hypercall1(int, physdev_op_compat, &op);
41831 + memcpy(arg, &op.u, sizeof(op.u));
41832 + }
41833 +#endif
41834 +
41835 + return rc;
41836 +}
41837 +
41838 +static inline int __must_check
41839 +HYPERVISOR_grant_table_op(
41840 + unsigned int cmd, void *uop, unsigned int count)
41841 +{
41842 + return _hypercall3(int, grant_table_op, cmd, uop, count);
41843 +}
41844 +
41845 +static inline int __must_check
41846 +HYPERVISOR_update_va_mapping_otherdomain(
41847 + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
41848 +{
41849 + return _hypercall4(int, update_va_mapping_otherdomain, va,
41850 + new_val.pte, flags, domid);
41851 +}
41852 +
41853 +static inline int __must_check
41854 +HYPERVISOR_vm_assist(
41855 + unsigned int cmd, unsigned int type)
41856 +{
41857 + return _hypercall2(int, vm_assist, cmd, type);
41858 +}
41859 +
41860 +static inline int __must_check
41861 +HYPERVISOR_vcpu_op(
41862 + int cmd, unsigned int vcpuid, void *extra_args)
41863 +{
41864 + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
41865 +}
41866 +
41867 +static inline int __must_check
41868 +HYPERVISOR_set_segment_base(
41869 + int reg, unsigned long value)
41870 +{
41871 + return _hypercall2(int, set_segment_base, reg, value);
41872 +}
41873 +
41874 +static inline int __must_check
41875 +HYPERVISOR_suspend(
41876 + unsigned long srec)
41877 +{
41878 + struct sched_shutdown sched_shutdown = {
41879 + .reason = SHUTDOWN_suspend
41880 + };
41881 +
41882 + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
41883 + &sched_shutdown, srec);
41884 +
41885 +#if CONFIG_XEN_COMPAT <= 0x030002
41886 + if (rc == -ENOSYS)
41887 + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
41888 + SHUTDOWN_suspend, srec);
41889 +#endif
41890 +
41891 + return rc;
41892 +}
41893 +
41894 +#if CONFIG_XEN_COMPAT <= 0x030002
41895 +static inline int
41896 +HYPERVISOR_nmi_op(
41897 + unsigned long op, void *arg)
41898 +{
41899 + return _hypercall2(int, nmi_op, op, arg);
41900 +}
41901 +#endif
41902 +
41903 +#ifndef CONFIG_XEN
41904 +static inline unsigned long __must_check
41905 +HYPERVISOR_hvm_op(
41906 + int op, void *arg)
41907 +{
41908 + return _hypercall2(unsigned long, hvm_op, op, arg);
41909 +}
41910 +#endif
41911 +
41912 +static inline int __must_check
41913 +HYPERVISOR_callback_op(
41914 + int cmd, const void *arg)
41915 +{
41916 + return _hypercall2(int, callback_op, cmd, arg);
41917 +}
41918 +
41919 +static inline int __must_check
41920 +HYPERVISOR_xenoprof_op(
41921 + int op, void *arg)
41922 +{
41923 + return _hypercall2(int, xenoprof_op, op, arg);
41924 +}
41925 +
41926 +static inline int __must_check
41927 +HYPERVISOR_kexec_op(
41928 + unsigned long op, void *args)
41929 +{
41930 + return _hypercall2(int, kexec_op, op, args);
41931 +}
41932 +
41933 +#endif /* __HYPERCALL_H__ */
41934 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/io_64.h
41935 ===================================================================
41936 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
41937 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/io_64.h 2007-08-16 18:07:01.000000000 +0200
41938 @@ -0,0 +1,329 @@
41939 +#ifndef _ASM_IO_H
41940 +#define _ASM_IO_H
41941 +
41942 +#include <asm/fixmap.h>
41943 +
41944 +/*
41945 + * This file contains the definitions for the x86 IO instructions
41946 + * inb/inw/inl/outb/outw/outl and the "string versions" of the same
41947 + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
41948 + * versions of the single-IO instructions (inb_p/inw_p/..).
41949 + *
41950 + * This file is not meant to be obfuscating: it's just complicated
41951 + * to (a) handle it all in a way that makes gcc able to optimize it
41952 + * as well as possible and (b) to avoid writing the same thing
41953 + * over and over again with slight variations and possibly making a
41954 + * mistake somewhere.
41955 + */
41956 +
41957 +/*
41958 + * Thanks to James van Artsdalen for a better timing-fix than
41959 + * the two short jumps: using outb's to a nonexistent port seems
41960 + * to guarantee better timings even on fast machines.
41961 + *
41962 + * On the other hand, I'd like to be sure of a non-existent port:
41963 + * I feel a bit unsafe about using 0x80 (should be safe, though)
41964 + *
41965 + * Linus
41966 + */
41967 +
41968 + /*
41969 + * Bit simplified and optimized by Jan Hubicka
41970 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
41971 + *
41972 + * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
41973 + * isa_read[wl] and isa_write[wl] fixed
41974 + * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
41975 + */
41976 +
41977 +#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
41978 +
41979 +#ifdef REALLY_SLOW_IO
41980 +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
41981 +#else
41982 +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
41983 +#endif
41984 +
41985 +/*
41986 + * Talk about misusing macros..
41987 + */
41988 +#define __OUT1(s,x) \
41989 +static inline void out##s(unsigned x value, unsigned short port) {
41990 +
41991 +#define __OUT2(s,s1,s2) \
41992 +__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
41993 +
41994 +#define __OUT(s,s1,x) \
41995 +__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
41996 +__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \
41997 +
41998 +#define __IN1(s) \
41999 +static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
42000 +
42001 +#define __IN2(s,s1,s2) \
42002 +__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
42003 +
42004 +#define __IN(s,s1,i...) \
42005 +__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
42006 +__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
42007 +
42008 +#define __INS(s) \
42009 +static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
42010 +{ __asm__ __volatile__ ("rep ; ins" #s \
42011 +: "=D" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
42012 +
42013 +#define __OUTS(s) \
42014 +static inline void outs##s(unsigned short port, const void * addr, unsigned long count) \
42015 +{ __asm__ __volatile__ ("rep ; outs" #s \
42016 +: "=S" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
42017 +
42018 +#define RETURN_TYPE unsigned char
42019 +__IN(b,"")
42020 +#undef RETURN_TYPE
42021 +#define RETURN_TYPE unsigned short
42022 +__IN(w,"")
42023 +#undef RETURN_TYPE
42024 +#define RETURN_TYPE unsigned int
42025 +__IN(l,"")
42026 +#undef RETURN_TYPE
42027 +
42028 +__OUT(b,"b",char)
42029 +__OUT(w,"w",short)
42030 +__OUT(l,,int)
42031 +
42032 +__INS(b)
42033 +__INS(w)
42034 +__INS(l)
42035 +
42036 +__OUTS(b)
42037 +__OUTS(w)
42038 +__OUTS(l)
42039 +
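What __IN(b,"") and __OUT(b,"b",char) boil down to, shown as a hedged standalone userspace sketch. It is Linux/x86 only: <sys/io.h> and ioperm(2) are used so the program can actually execute (which needs root), and port 0x80 is chosen because it is the harmless diagnostic port also used by __SLOW_DOWN_IO above:

#include <stdio.h>
#include <sys/io.h>

/* same asm template and constraints that __OUT(b,"b",char) generates */
static inline void my_outb(unsigned char value, unsigned short port)
{
	__asm__ __volatile__("outb %b0,%w1" : : "a"(value), "Nd"(port));
}

/* same asm template and constraints that __IN(b,"") generates */
static inline unsigned char my_inb(unsigned short port)
{
	unsigned char v;
	__asm__ __volatile__("inb %w1,%0" : "=a"(v) : "Nd"(port));
	return v;
}

int main(void)
{
	if (ioperm(0x80, 1, 1)) {
		perror("ioperm (need root)");
		return 1;
	}
	my_outb(0xaa, 0x80);
	printf("port 0x80 reads back 0x%02x\n", my_inb(0x80));
	return 0;
}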
42040 +#define IO_SPACE_LIMIT 0xffff
42041 +
42042 +#if defined(__KERNEL__) && __x86_64__
42043 +
42044 +#include <linux/vmalloc.h>
42045 +
42046 +#ifndef __i386__
42047 +/*
42048 + * Change virtual addresses to physical addresses and vice versa.
42049 + * These are pretty trivial.
42050 + */
42051 +static inline unsigned long virt_to_phys(volatile void * address)
42052 +{
42053 + return __pa(address);
42054 +}
42055 +
42056 +static inline void * phys_to_virt(unsigned long address)
42057 +{
42058 + return __va(address);
42059 +}
42060 +
42061 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
42062 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
42063 +#endif
42064 +
42065 +/*
42066 + * Change "struct page" to physical address.
42067 + */
42068 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
42069 +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
42070 +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
42071 +
42072 +#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \
42073 + (unsigned long) bio_offset((bio)))
42074 +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
42075 + (unsigned long) (bv)->bv_offset)
42076 +
42077 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
42078 + (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
42079 + ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
42080 + bvec_to_pseudophys((vec2))))
42081 +
42082 +#include <asm-generic/iomap.h>
42083 +
42084 +extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags);
42085 +
42086 +static inline void __iomem * ioremap (unsigned long offset, unsigned long size)
42087 +{
42088 + return __ioremap(offset, size, 0);
42089 +}
42090 +
42091 +extern void *bt_ioremap(unsigned long addr, unsigned long size);
42092 +extern void bt_iounmap(void *addr, unsigned long size);
42093 +#define early_ioremap bt_ioremap
42094 +#define early_iounmap bt_iounmap
42095 +
42096 +/*
42097 + * This one maps high-address device memory and turns off caching for that area.
42098 + * It's useful if some control registers are in such an area and write combining
42099 + * or read caching is not desirable:
42100 + */
42101 +extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
42102 +extern void iounmap(volatile void __iomem *addr);
42103 +
42104 +/*
42105 + * ISA I/O bus memory addresses are 1:1 with the physical address.
42106 + */
42107 +
42108 +#define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); })
42109 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
42110 +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
42111 +
42112 +/*
42113 + * However PCI ones are not necessarily 1:1 and therefore these interfaces
42114 + * are forbidden in portable PCI drivers.
42115 + *
42116 + * Allow them on x86 for legacy drivers, though.
42117 + */
42118 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
42119 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
42120 +
42121 +/*
42122 + * readX/writeX() are used to access memory mapped devices. On some
42123 + * architectures the memory mapped IO stuff needs to be accessed
42124 + * differently. On the x86 architecture, we just read/write the
42125 + * memory location directly.
42126 + */
42127 +
42128 +static inline __u8 __readb(const volatile void __iomem *addr)
42129 +{
42130 + return *(__force volatile __u8 *)addr;
42131 +}
42132 +static inline __u16 __readw(const volatile void __iomem *addr)
42133 +{
42134 + return *(__force volatile __u16 *)addr;
42135 +}
42136 +static __always_inline __u32 __readl(const volatile void __iomem *addr)
42137 +{
42138 + return *(__force volatile __u32 *)addr;
42139 +}
42140 +static inline __u64 __readq(const volatile void __iomem *addr)
42141 +{
42142 + return *(__force volatile __u64 *)addr;
42143 +}
42144 +#define readb(x) __readb(x)
42145 +#define readw(x) __readw(x)
42146 +#define readl(x) __readl(x)
42147 +#define readq(x) __readq(x)
42148 +#define readb_relaxed(a) readb(a)
42149 +#define readw_relaxed(a) readw(a)
42150 +#define readl_relaxed(a) readl(a)
42151 +#define readq_relaxed(a) readq(a)
42152 +#define __raw_readb readb
42153 +#define __raw_readw readw
42154 +#define __raw_readl readl
42155 +#define __raw_readq readq
42156 +
42157 +#define mmiowb()
42158 +
42159 +static inline void __writel(__u32 b, volatile void __iomem *addr)
42160 +{
42161 + *(__force volatile __u32 *)addr = b;
42162 +}
42163 +static inline void __writeq(__u64 b, volatile void __iomem *addr)
42164 +{
42165 + *(__force volatile __u64 *)addr = b;
42166 +}
42167 +static inline void __writeb(__u8 b, volatile void __iomem *addr)
42168 +{
42169 + *(__force volatile __u8 *)addr = b;
42170 +}
42171 +static inline void __writew(__u16 b, volatile void __iomem *addr)
42172 +{
42173 + *(__force volatile __u16 *)addr = b;
42174 +}
42175 +#define writeq(val,addr) __writeq((val),(addr))
42176 +#define writel(val,addr) __writel((val),(addr))
42177 +#define writew(val,addr) __writew((val),(addr))
42178 +#define writeb(val,addr) __writeb((val),(addr))
42179 +#define __raw_writeb writeb
42180 +#define __raw_writew writew
42181 +#define __raw_writel writel
42182 +#define __raw_writeq writeq
42183 +
42184 +void __memcpy_fromio(void*,unsigned long,unsigned);
42185 +void __memcpy_toio(unsigned long,const void*,unsigned);
42186 +
42187 +static inline void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned len)
42188 +{
42189 + __memcpy_fromio(to,(unsigned long)from,len);
42190 +}
42191 +static inline void memcpy_toio(volatile void __iomem *to, const void *from, unsigned len)
42192 +{
42193 + __memcpy_toio((unsigned long)to,from,len);
42194 +}
42195 +
42196 +void memset_io(volatile void __iomem *a, int b, size_t c);
42197 +
42198 +/*
42199 + * ISA space is 'always mapped' on a typical x86 system, no need to
42200 + * explicitly ioremap() it. The fact that the ISA IO space is mapped
42201 + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
42202 + * are physical addresses. The following constant pointer can be
42203 + * used as the IO-area pointer (it can be iounmapped as well, so the
42204 + * analogy with PCI is quite close):
42205 + */
42206 +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
42207 +
42208 +/*
42209 + * Again, x86-64 does not require memory-IO-specific functions.
42210 + */
42211 +
42212 +#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void *)(b),(c),(d))
42213 +
42214 +/**
42215 + * check_signature - find BIOS signatures
42216 + * @io_addr: mmio address to check
42217 + * @signature: signature block
42218 + * @length: length of signature
42219 + *
42220 + * Perform a signature comparison with the mmio address io_addr. This
42221 + * address should have been obtained by ioremap.
42222 + * Returns 1 on a match.
42223 + */
42224 +
42225 +static inline int check_signature(void __iomem *io_addr,
42226 + const unsigned char *signature, int length)
42227 +{
42228 + int retval = 0;
42229 + do {
42230 + if (readb(io_addr) != *signature)
42231 + goto out;
42232 + io_addr++;
42233 + signature++;
42234 + length--;
42235 + } while (length);
42236 + retval = 1;
42237 +out:
42238 + return retval;
42239 +}
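/*
 * Usage sketch for check_signature(); the signature string and the ROM
 * window address below are invented for illustration.
 */
static inline int foo_find_bios_signature(void)
{
        static const unsigned char sig[] = "FOO!";      /* hypothetical signature */
        void __iomem *p = ioremap(0xe0000, 0x1000);     /* hypothetical BIOS ROM window */
        int found = 0;

        if (p) {
                found = check_signature(p, sig, sizeof(sig) - 1);
                iounmap(p);
        }
        return found;
}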
42240 +
42241 +/* Nothing to do */
42242 +
42243 +#define dma_cache_inv(_start,_size) do { } while (0)
42244 +#define dma_cache_wback(_start,_size) do { } while (0)
42245 +#define dma_cache_wback_inv(_start,_size) do { } while (0)
42246 +
42247 +#define flush_write_buffers()
42248 +
42249 +extern int iommu_bio_merge;
42250 +#define BIO_VMERGE_BOUNDARY iommu_bio_merge
42251 +
42252 +/*
42253 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
42254 + * access
42255 + */
42256 +#define xlate_dev_mem_ptr(p) __va(p)
42257 +
42258 +/*
42259 + * Convert a virtual cached pointer to an uncached pointer
42260 + */
42261 +#define xlate_dev_kmem_ptr(p) p
42262 +
42263 +#endif /* __KERNEL__ */
42264 +
42265 +#define ARCH_HAS_DEV_MEM
42266 +
42267 +#endif
42268 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_64.h
42269 ===================================================================
42270 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
42271 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/irqflags_64.h 2007-06-12 13:14:13.000000000 +0200
42272 @@ -0,0 +1,139 @@
42273 +/*
42274 + * include/asm-x86_64/irqflags.h
42275 + *
42276 + * IRQ flags handling
42277 + *
42278 + * This file gets included from lowlevel asm headers too, to provide
42279 + * wrapped versions of the local_irq_*() APIs, based on the
42280 + * raw_local_irq_*() functions from the lowlevel headers.
42281 + */
42282 +#ifndef _ASM_IRQFLAGS_H
42283 +#define _ASM_IRQFLAGS_H
42284 +
42285 +#ifndef __ASSEMBLY__
42286 +/*
42287 + * Interrupt control:
42288 + */
42289 +
42290 +/*
42291 + * The use of 'barrier' in the following reflects their use as local-lock
42292 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
42293 + * critical operations are executed. All critical operations must complete
42294 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
42295 + * includes these barriers, for example.
42296 + */
42297 +
42298 +#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
42299 +
42300 +#define raw_local_save_flags(flags) \
42301 + do { (flags) = __raw_local_save_flags(); } while (0)
42302 +
42303 +#define raw_local_irq_restore(x) \
42304 +do { \
42305 + vcpu_info_t *_vcpu; \
42306 + barrier(); \
42307 + _vcpu = current_vcpu_info(); \
42308 + if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
42309 + barrier(); /* unmask then check (avoid races) */ \
42310 + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
42311 + force_evtchn_callback(); \
42312 + } \
42313 +} while (0)
42314 +
42315 +#ifdef CONFIG_X86_VSMP
42316 +
42317 +/*
42318 + * Interrupt control for the VSMP architecture:
42319 + */
42320 +
42321 +static inline void raw_local_irq_disable(void)
42322 +{
42323 + unsigned long flags = __raw_local_save_flags();
42324 +
42325 + raw_local_irq_restore((flags & ~(1 << 9)) | (1 << 18));
42326 +}
42327 +
42328 +static inline void raw_local_irq_enable(void)
42329 +{
42330 + unsigned long flags = __raw_local_save_flags();
42331 +
42332 + raw_local_irq_restore((flags | (1 << 9)) & ~(1 << 18));
42333 +}
42334 +
42335 +static inline int raw_irqs_disabled_flags(unsigned long flags)
42336 +{
42337 + return !(flags & (1<<9)) || (flags & (1 << 18));
42338 +}
42339 +
42340 +#else /* CONFIG_X86_VSMP */
42341 +
42342 +#define raw_local_irq_disable() \
42343 +do { \
42344 + current_vcpu_info()->evtchn_upcall_mask = 1; \
42345 + barrier(); \
42346 +} while (0)
42347 +
42348 +#define raw_local_irq_enable() \
42349 +do { \
42350 + vcpu_info_t *_vcpu; \
42351 + barrier(); \
42352 + _vcpu = current_vcpu_info(); \
42353 + _vcpu->evtchn_upcall_mask = 0; \
42354 + barrier(); /* unmask then check (avoid races) */ \
42355 + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
42356 + force_evtchn_callback(); \
42357 +} while (0)
42358 +
42359 +static inline int raw_irqs_disabled_flags(unsigned long flags)
42360 +{
42361 + return (flags != 0);
42362 +}
42363 +
42364 +#endif
42365 +
42366 +/*
42367 + * For spinlocks, etc.:
42368 + */
42369 +
42370 +#define __raw_local_irq_save() \
42371 +({ \
42372 + unsigned long flags = __raw_local_save_flags(); \
42373 + \
42374 + raw_local_irq_disable(); \
42375 + \
42376 + flags; \
42377 +})
42378 +
42379 +#define raw_local_irq_save(flags) \
42380 + do { (flags) = __raw_local_irq_save(); } while (0)
42381 +
42382 +#define raw_irqs_disabled() \
42383 +({ \
42384 + unsigned long flags = __raw_local_save_flags(); \
42385 + \
42386 + raw_irqs_disabled_flags(flags); \
42387 +})
42388 +
42389 +/*
42390 + * Used in the idle loop; sti takes one instruction cycle
42391 + * to complete:
42392 + */
42393 +void raw_safe_halt(void);
42394 +
42395 +/*
42396 + * Used when interrupts are already enabled or to
42397 + * shutdown the processor:
42398 + */
42399 +void halt(void);
42400 +
42401 +#else /* __ASSEMBLY__: */
42402 +# ifdef CONFIG_TRACE_IRQFLAGS
42403 +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk
42404 +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk
42405 +# else
42406 +# define TRACE_IRQS_ON
42407 +# define TRACE_IRQS_OFF
42408 +# endif
42409 +#endif
42410 +
42411 +#endif
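For illustration, a minimal sketch of the kind of critical section these helpers are meant for; under Xen, saving and restoring flags manipulates evtchn_upcall_mask rather than the hardware interrupt flag. The counter is made up.

        static unsigned long foo_events;        /* hypothetical counter */

        static void foo_count_event(void)
        {
                unsigned long flags;

                raw_local_irq_save(flags);      /* mask event-channel upcalls */
                foo_events++;
                raw_local_irq_restore(flags);   /* unmask; may call force_evtchn_callback() */
        }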
42412 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_64.h
42413 ===================================================================
42414 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
42415 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/maddr_64.h 2007-06-12 13:14:13.000000000 +0200
42416 @@ -0,0 +1,161 @@
42417 +#ifndef _X86_64_MADDR_H
42418 +#define _X86_64_MADDR_H
42419 +
42420 +#include <xen/features.h>
42421 +#include <xen/interface/xen.h>
42422 +
42423 +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
42424 +#define INVALID_P2M_ENTRY (~0UL)
42425 +#define FOREIGN_FRAME_BIT (1UL<<63)
42426 +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
42427 +
42428 +/* Definitions for machine and pseudophysical addresses. */
42429 +typedef unsigned long paddr_t;
42430 +typedef unsigned long maddr_t;
42431 +
42432 +#ifdef CONFIG_XEN
42433 +
42434 +extern unsigned long *phys_to_machine_mapping;
42435 +
42436 +#undef machine_to_phys_mapping
42437 +extern unsigned long *machine_to_phys_mapping;
42438 +extern unsigned int machine_to_phys_order;
42439 +
42440 +static inline unsigned long pfn_to_mfn(unsigned long pfn)
42441 +{
42442 + if (xen_feature(XENFEAT_auto_translated_physmap))
42443 + return pfn;
42444 + BUG_ON(end_pfn && pfn >= end_pfn);
42445 + return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
42446 +}
42447 +
42448 +static inline int phys_to_machine_mapping_valid(unsigned long pfn)
42449 +{
42450 + if (xen_feature(XENFEAT_auto_translated_physmap))
42451 + return 1;
42452 + BUG_ON(end_pfn && pfn >= end_pfn);
42453 + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
42454 +}
42455 +
42456 +static inline unsigned long mfn_to_pfn(unsigned long mfn)
42457 +{
42458 + unsigned long pfn;
42459 +
42460 + if (xen_feature(XENFEAT_auto_translated_physmap))
42461 + return mfn;
42462 +
42463 + if (unlikely((mfn >> machine_to_phys_order) != 0))
42464 + return end_pfn;
42465 +
42466 + /* The array access can fail (e.g., device space beyond end of RAM). */
42467 + asm (
42468 + "1: movq %1,%0\n"
42469 + "2:\n"
42470 + ".section .fixup,\"ax\"\n"
42471 + "3: movq %2,%0\n"
42472 + " jmp 2b\n"
42473 + ".previous\n"
42474 + ".section __ex_table,\"a\"\n"
42475 + " .align 8\n"
42476 + " .quad 1b,3b\n"
42477 + ".previous"
42478 + : "=r" (pfn)
42479 + : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) );
42480 +
42481 + return pfn;
42482 +}
42483 +
42484 +/*
42485 + * We detect special mappings in one of two ways:
42486 + * 1. If the MFN is an I/O page then Xen will set the m2p entry
42487 + * to be outside our maximum possible pseudophys range.
42488 + * 2. If the MFN belongs to a different domain then we will certainly
42489 + * not have MFN in our p2m table. Conversely, if the page is ours,
42490 + * then we'll have p2m(m2p(MFN))==MFN.
42491 + * If we detect a special mapping then it doesn't have a 'struct page'.
42492 + * We force !pfn_valid() by returning an out-of-range pointer.
42493 + *
42494 + * NB. These checks require that, for any MFN that is not in our reservation,
42495 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
42496 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
42497 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
42498 + *
42499 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
42500 + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
42501 + * require. In all the cases we care about, the FOREIGN_FRAME bit is
42502 + * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
42503 + */
42504 +static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
42505 +{
42506 + unsigned long pfn = mfn_to_pfn(mfn);
42507 + if ((pfn < end_pfn)
42508 + && !xen_feature(XENFEAT_auto_translated_physmap)
42509 + && (phys_to_machine_mapping[pfn] != mfn))
42510 + return end_pfn; /* force !pfn_valid() */
42511 + return pfn;
42512 +}
42513 +
42514 +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
42515 +{
42516 + BUG_ON(end_pfn && pfn >= end_pfn);
42517 + if (xen_feature(XENFEAT_auto_translated_physmap)) {
42518 + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
42519 + return;
42520 + }
42521 + phys_to_machine_mapping[pfn] = mfn;
42522 +}
42523 +
42524 +static inline maddr_t phys_to_machine(paddr_t phys)
42525 +{
42526 + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
42527 + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
42528 + return machine;
42529 +}
42530 +
42531 +static inline paddr_t machine_to_phys(maddr_t machine)
42532 +{
42533 + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
42534 + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
42535 + return phys;
42536 +}
42537 +
42538 +static inline paddr_t pte_phys_to_machine(paddr_t phys)
42539 +{
42540 + maddr_t machine;
42541 + machine = pfn_to_mfn((phys & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
42542 + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK);
42543 + return machine;
42544 +}
42545 +
42546 +static inline paddr_t pte_machine_to_phys(maddr_t machine)
42547 +{
42548 + paddr_t phys;
42549 + phys = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
42550 + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
42551 + return phys;
42552 +}
42553 +
42554 +#define __pte_ma(x) ((pte_t) { (x) } )
42555 +#define pfn_pte_ma(pfn, prot) __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
42556 +
42557 +#else /* !CONFIG_XEN */
42558 +
42559 +#define pfn_to_mfn(pfn) (pfn)
42560 +#define mfn_to_pfn(mfn) (mfn)
42561 +#define mfn_to_local_pfn(mfn) (mfn)
42562 +#define set_phys_to_machine(pfn, mfn) ((void)0)
42563 +#define phys_to_machine_mapping_valid(pfn) (1)
42564 +#define phys_to_machine(phys) ((maddr_t)(phys))
42565 +#define machine_to_phys(mach) ((paddr_t)(mach))
42566 +#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot)
42567 +#define __pte_ma(x) __pte(x)
42568 +
42569 +#endif /* !CONFIG_XEN */
42570 +
42571 +/* VIRT <-> MACHINE conversion */
42572 +#define virt_to_machine(v) (phys_to_machine(__pa(v)))
42573 +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
42574 +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
42575 +
42576 +#endif /* _X86_64_MADDR_H */
42577 +
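A worked example of the conversions above, with invented frame numbers: suppose the p2m table maps pfn 0x1234 to mfn 0xabcd and the page is local.

        /*
         * paddr_t phys    = (0x1234UL << PAGE_SHIFT) | 0x56;  -> 0x1234056
         * maddr_t machine = phys_to_machine(phys);            -> 0xabcd056
         * paddr_t back    = machine_to_phys(machine);         -> 0x1234056 again,
         *                                                        since m2p(0xabcd) == 0x1234
         */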
42578 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_64.h
42579 ===================================================================
42580 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
42581 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/mmu_context_64.h 2007-06-12 13:14:13.000000000 +0200
42582 @@ -0,0 +1,136 @@
42583 +#ifndef __X86_64_MMU_CONTEXT_H
42584 +#define __X86_64_MMU_CONTEXT_H
42585 +
42586 +#include <asm/desc.h>
42587 +#include <asm/atomic.h>
42588 +#include <asm/pgalloc.h>
42589 +#include <asm/page.h>
42590 +#include <asm/pda.h>
42591 +#include <asm/pgtable.h>
42592 +#include <asm/tlbflush.h>
42593 +
42594 +/*
42595 + * possibly do the LDT unload here?
42596 + */
42597 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
42598 +void destroy_context(struct mm_struct *mm);
42599 +
42600 +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
42601 +{
42602 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
42603 + if (read_pda(mmu_state) == TLBSTATE_OK)
42604 + write_pda(mmu_state, TLBSTATE_LAZY);
42605 +#endif
42606 +}
42607 +
42608 +#define prepare_arch_switch(next) __prepare_arch_switch()
42609 +
42610 +static inline void __prepare_arch_switch(void)
42611 +{
42612 + /*
42613 + * Save away %es, %ds, %fs and %gs. Must happen before reload
42614 + * of cr3/ldt (i.e., not in __switch_to).
42615 + */
42616 + __asm__ __volatile__ (
42617 + "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
42618 + : "=m" (current->thread.es),
42619 + "=m" (current->thread.ds),
42620 + "=m" (current->thread.fsindex),
42621 + "=m" (current->thread.gsindex) );
42622 +
42623 + if (current->thread.ds)
42624 + __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
42625 +
42626 + if (current->thread.es)
42627 + __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
42628 +
42629 + if (current->thread.fsindex) {
42630 + __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
42631 + current->thread.fs = 0;
42632 + }
42633 +
42634 + if (current->thread.gsindex) {
42635 + load_gs_index(0);
42636 + current->thread.gs = 0;
42637 + }
42638 +}
42639 +
42640 +extern void mm_pin(struct mm_struct *mm);
42641 +extern void mm_unpin(struct mm_struct *mm);
42642 +void mm_pin_all(void);
42643 +
42644 +static inline void load_cr3(pgd_t *pgd)
42645 +{
42646 + asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
42647 + "memory");
42648 +}
42649 +
42650 +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
42651 + struct task_struct *tsk)
42652 +{
42653 + unsigned cpu = smp_processor_id();
42654 + struct mmuext_op _op[3], *op = _op;
42655 +
42656 + if (likely(prev != next)) {
42657 + BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
42658 + !next->context.pinned);
42659 +
42660 + /* stop flush ipis for the previous mm */
42661 + cpu_clear(cpu, prev->cpu_vm_mask);
42662 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
42663 + write_pda(mmu_state, TLBSTATE_OK);
42664 + write_pda(active_mm, next);
42665 +#endif
42666 + cpu_set(cpu, next->cpu_vm_mask);
42667 +
42668 + /* load_cr3(next->pgd) */
42669 + op->cmd = MMUEXT_NEW_BASEPTR;
42670 + op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
42671 + op++;
42672 +
42673 + /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
42674 + op->cmd = MMUEXT_NEW_USER_BASEPTR;
42675 + op->arg1.mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT);
42676 + op++;
42677 +
42678 + if (unlikely(next->context.ldt != prev->context.ldt)) {
42679 + /* load_LDT_nolock(&next->context, cpu) */
42680 + op->cmd = MMUEXT_SET_LDT;
42681 + op->arg1.linear_addr = (unsigned long)next->context.ldt;
42682 + op->arg2.nr_ents = next->context.size;
42683 + op++;
42684 + }
42685 +
42686 + BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
42687 + }
42688 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
42689 + else {
42690 + write_pda(mmu_state, TLBSTATE_OK);
42691 + if (read_pda(active_mm) != next)
42692 + out_of_line_bug();
42693 + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
42694 + /* We were in lazy tlb mode and leave_mm disabled
42695 + * tlb flush IPI delivery. We must reload CR3
42696 + * to make sure we don't use freed page tables.
42697 + */
42698 + load_cr3(next->pgd);
42699 + xen_new_user_pt(__pa(__user_pgd(next->pgd)));
42700 + load_LDT_nolock(&next->context, cpu);
42701 + }
42702 + }
42703 +#endif
42704 +}
42705 +
42706 +#define deactivate_mm(tsk,mm) do { \
42707 + load_gs_index(0); \
42708 + asm volatile("movl %0,%%fs"::"r"(0)); \
42709 +} while(0)
42710 +
42711 +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
42712 +{
42713 + if (!next->context.pinned)
42714 + mm_pin(next);
42715 + switch_mm(prev, next, NULL);
42716 +}
42717 +
42718 +#endif
42719 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/page_64.h
42720 ===================================================================
42721 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
42722 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/page_64.h 2008-04-02 12:34:02.000000000 +0200
42723 @@ -0,0 +1,212 @@
42724 +#ifndef _X86_64_PAGE_H
42725 +#define _X86_64_PAGE_H
42726 +
42727 +/* #include <linux/string.h> */
42728 +#ifndef __ASSEMBLY__
42729 +#include <linux/kernel.h>
42730 +#include <linux/types.h>
42731 +#include <asm/bug.h>
42732 +#endif
42733 +#include <xen/interface/xen.h>
42734 +
42735 +/*
42736 + * Need to repeat this here in order to not include pgtable.h (which in turn
42737 + * depends on definitions made here), but to be able to use the symbolic names
42738 + * below. The preprocessor will warn if the two definitions aren't identical.
42739 + */
42740 +#define _PAGE_PRESENT 0x001
42741 +#define _PAGE_IO 0x200
42742 +
42743 +/* PAGE_SHIFT determines the page size */
42744 +#define PAGE_SHIFT 12
42745 +#ifdef __ASSEMBLY__
42746 +#define PAGE_SIZE (0x1 << PAGE_SHIFT)
42747 +#else
42748 +#define PAGE_SIZE (1UL << PAGE_SHIFT)
42749 +#endif
42750 +#define PAGE_MASK (~(PAGE_SIZE-1))
42751 +
42752 +/* See Documentation/x86_64/mm.txt for a description of the memory map. */
42753 +#define __PHYSICAL_MASK_SHIFT 46
42754 +#define __PHYSICAL_MASK ((1UL << __PHYSICAL_MASK_SHIFT) - 1)
42755 +#define __VIRTUAL_MASK_SHIFT 48
42756 +#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
42757 +
42758 +#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
42759 +
42760 +#define THREAD_ORDER 1
42761 +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
42762 +#define CURRENT_MASK (~(THREAD_SIZE-1))
42763 +
42764 +#define EXCEPTION_STACK_ORDER 0
42765 +#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
42766 +
42767 +#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
42768 +#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
42769 +
42770 +#define IRQSTACK_ORDER 2
42771 +#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
42772 +
42773 +#define STACKFAULT_STACK 1
42774 +#define DOUBLEFAULT_STACK 2
42775 +#define NMI_STACK 3
42776 +#define DEBUG_STACK 4
42777 +#define MCE_STACK 5
42778 +#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
42779 +
42780 +#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
42781 +#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
42782 +
42783 +#define HPAGE_SHIFT PMD_SHIFT
42784 +#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
42785 +#define HPAGE_MASK (~(HPAGE_SIZE - 1))
42786 +#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
42787 +
42788 +#ifdef __KERNEL__
42789 +#ifndef __ASSEMBLY__
42790 +
42791 +extern unsigned long end_pfn;
42792 +
42793 +#include <asm/maddr.h>
42794 +
42795 +void clear_page(void *);
42796 +void copy_page(void *, void *);
42797 +
42798 +#define clear_user_page(page, vaddr, pg) clear_page(page)
42799 +#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
42800 +
42801 +#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
42802 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
42803 +
42804 +/*
42805 + * These are used to make use of C type-checking..
42806 + */
42807 +typedef struct { unsigned long pte; } pte_t;
42808 +typedef struct { unsigned long pmd; } pmd_t;
42809 +typedef struct { unsigned long pud; } pud_t;
42810 +typedef struct { unsigned long pgd; } pgd_t;
42811 +#define PTE_MASK PHYSICAL_PAGE_MASK
42812 +
42813 +typedef struct { unsigned long pgprot; } pgprot_t;
42814 +
42815 +#define __pte_val(x) ((x).pte)
42816 +#define pte_val(x) ((__pte_val(x) & (_PAGE_PRESENT|_PAGE_IO)) \
42817 + == _PAGE_PRESENT ? \
42818 + pte_machine_to_phys(__pte_val(x)) : \
42819 + __pte_val(x))
42820 +
42821 +#define __pmd_val(x) ((x).pmd)
42822 +static inline unsigned long pmd_val(pmd_t x)
42823 +{
42824 + unsigned long ret = __pmd_val(x);
42825 +#if CONFIG_XEN_COMPAT <= 0x030002
42826 + if (ret) ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
42827 +#else
42828 + if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
42829 +#endif
42830 + return ret;
42831 +}
42832 +
42833 +#define __pud_val(x) ((x).pud)
42834 +static inline unsigned long pud_val(pud_t x)
42835 +{
42836 + unsigned long ret = __pud_val(x);
42837 + if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
42838 + return ret;
42839 +}
42840 +
42841 +#define __pgd_val(x) ((x).pgd)
42842 +static inline unsigned long pgd_val(pgd_t x)
42843 +{
42844 + unsigned long ret = __pgd_val(x);
42845 + if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
42846 + return ret;
42847 +}
42848 +
42849 +#define pgprot_val(x) ((x).pgprot)
42850 +
42851 +static inline pte_t __pte(unsigned long x)
42852 +{
42853 + if ((x & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
42854 + x = pte_phys_to_machine(x);
42855 + return ((pte_t) { (x) });
42856 +}
42857 +
42858 +static inline pmd_t __pmd(unsigned long x)
42859 +{
42860 + if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
42861 + return ((pmd_t) { (x) });
42862 +}
42863 +
42864 +static inline pud_t __pud(unsigned long x)
42865 +{
42866 + if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
42867 + return ((pud_t) { (x) });
42868 +}
42869 +
42870 +static inline pgd_t __pgd(unsigned long x)
42871 +{
42872 + if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
42873 + return ((pgd_t) { (x) });
42874 +}
42875 +
42876 +#define __pgprot(x) ((pgprot_t) { (x) } )
42877 +
42878 +#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START)
42879 +#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
42880 +#define __START_KERNEL_map 0xffffffff80000000UL
42881 +#define __PAGE_OFFSET 0xffff880000000000UL
42882 +
42883 +#else
42884 +#define __PHYSICAL_START CONFIG_PHYSICAL_START
42885 +#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
42886 +#define __START_KERNEL_map 0xffffffff80000000
42887 +#define __PAGE_OFFSET 0xffff880000000000
42888 +#endif /* !__ASSEMBLY__ */
42889 +
42890 +#if CONFIG_XEN_COMPAT <= 0x030002
42891 +#undef LOAD_OFFSET
42892 +#define LOAD_OFFSET 0
42893 +#endif
42894 +
42895 +/* to align the pointer to the (next) page boundary */
42896 +#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
42897 +
42898 +#define KERNEL_TEXT_SIZE (40UL*1024*1024)
42899 +#define KERNEL_TEXT_START 0xffffffff80000000UL
42900 +
42901 +#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
42902 +
42903 +/* Note: __pa(&symbol_visible_to_c) should always be replaced with __pa_symbol.
42904 + Otherwise you risk miscompilation. */
42905 +#define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET)
42906 +/* __pa_symbol should be used for C visible symbols.
42907 + This seems to be the official gcc blessed way to do such arithmetic. */
42908 +#define __pa_symbol(x) \
42909 + ({unsigned long v; \
42910 + asm("" : "=r" (v) : "0" (x)); \
42911 + __pa(v); })
42912 +
42913 +#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
42914 +#define __boot_va(x) __va(x)
42915 +#define __boot_pa(x) __pa(x)
42916 +#ifdef CONFIG_FLATMEM
42917 +#define pfn_valid(pfn) ((pfn) < end_pfn)
42918 +#endif
42919 +
42920 +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
42921 +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
42922 +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
42923 +
42924 +#define VM_DATA_DEFAULT_FLAGS \
42925 + (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
42926 + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
42927 +
42928 +#define __HAVE_ARCH_GATE_AREA 1
42929 +
42930 +#include <asm-generic/memory_model.h>
42931 +#include <asm-generic/page.h>
42932 +
42933 +#endif /* __KERNEL__ */
42934 +
42935 +#endif /* _X86_64_PAGE_H */
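For illustration, how the __pa()/__va() arithmetic above works out; the addresses are examples only.

        /*
         * __pa(0xffff880001234000UL) == 0x1234000   (direct mapping, PAGE_OFFSET based)
         * __pa(0xffffffff80201000UL) == 0x201000    (kernel text, __START_KERNEL_map based)
         * __va(0x1234000UL)          == (void *)0xffff880001234000UL
         *
         * __va() always returns a direct-mapping address, so __va(__pa(x)) == x
         * only holds for direct-mapping pointers, not for kernel-text symbols.
         */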
42936 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pci_64.h
42937 ===================================================================
42938 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
42939 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pci_64.h 2007-09-14 11:14:51.000000000 +0200
42940 @@ -0,0 +1,168 @@
42941 +#ifndef __x8664_PCI_H
42942 +#define __x8664_PCI_H
42943 +
42944 +#include <asm/io.h>
42945 +
42946 +#ifdef __KERNEL__
42947 +
42948 +#include <linux/mm.h> /* for struct page */
42949 +
42950 +/* Can be used to override the logic in pci_scan_bus for skipping
42951 + already-configured bus numbers - to be used for buggy BIOSes
42952 + or architectures with incomplete PCI setup by the loader */
42953 +
42954 +#ifdef CONFIG_PCI
42955 +extern unsigned int pcibios_assign_all_busses(void);
42956 +#else
42957 +#define pcibios_assign_all_busses() 0
42958 +#endif
42959 +
42960 +#include <asm/hypervisor.h>
42961 +#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain())
42962 +
42963 +extern unsigned long pci_mem_start;
42964 +#define PCIBIOS_MIN_IO 0x1000
42965 +#define PCIBIOS_MIN_MEM (pci_mem_start)
42966 +
42967 +#define PCIBIOS_MIN_CARDBUS_IO 0x4000
42968 +
42969 +void pcibios_config_init(void);
42970 +struct pci_bus * pcibios_scan_root(int bus);
42971 +extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value);
42972 +extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value);
42973 +
42974 +void pcibios_set_master(struct pci_dev *dev);
42975 +void pcibios_penalize_isa_irq(int irq, int active);
42976 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
42977 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
42978 +
42979 +#include <linux/types.h>
42980 +#include <linux/slab.h>
42981 +#include <asm/scatterlist.h>
42982 +#include <linux/string.h>
42983 +#include <asm/page.h>
42984 +
42985 +extern void pci_iommu_alloc(void);
42986 +extern int iommu_setup(char *opt);
42987 +
42988 +/* The PCI address space does equal the physical memory
42989 + * address space. The networking and block device layers use
42990 + * this boolean for bounce buffer decisions
42991 + *
42992 + * On AMD64 it mostly equals, but we set it to zero if a hardware
42993 + * IOMMU (gart) or software IOMMU (swiotlb) is available.
42994 + */
42995 +#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
42996 +
42997 +#if defined(CONFIG_IOMMU) || defined(CONFIG_CALGARY_IOMMU)
42998 +
42999 +/*
43000 + * x86-64 always supports DAC, but sometimes it is useful to force
43001 + * devices through the IOMMU to get automatic sg list merging.
43002 + * Optional right now.
43003 + */
43004 +extern int iommu_sac_force;
43005 +#define pci_dac_dma_supported(pci_dev, mask) (!iommu_sac_force)
43006 +
43007 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
43008 + dma_addr_t ADDR_NAME;
43009 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
43010 + __u32 LEN_NAME;
43011 +#define pci_unmap_addr(PTR, ADDR_NAME) \
43012 + ((PTR)->ADDR_NAME)
43013 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
43014 + (((PTR)->ADDR_NAME) = (VAL))
43015 +#define pci_unmap_len(PTR, LEN_NAME) \
43016 + ((PTR)->LEN_NAME)
43017 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
43018 + (((PTR)->LEN_NAME) = (VAL))
43019 +
43020 +#elif defined(CONFIG_SWIOTLB)
43021 +
43022 +#define pci_dac_dma_supported(pci_dev, mask) 1
43023 +
43024 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
43025 + dma_addr_t ADDR_NAME;
43026 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
43027 + __u32 LEN_NAME;
43028 +#define pci_unmap_addr(PTR, ADDR_NAME) \
43029 + ((PTR)->ADDR_NAME)
43030 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
43031 + (((PTR)->ADDR_NAME) = (VAL))
43032 +#define pci_unmap_len(PTR, LEN_NAME) \
43033 + ((PTR)->LEN_NAME)
43034 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
43035 + (((PTR)->LEN_NAME) = (VAL))
43036 +
43037 +#else
43038 +/* No IOMMU */
43039 +
43040 +#define pci_dac_dma_supported(pci_dev, mask) 1
43041 +
43042 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
43043 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
43044 +#define pci_unmap_addr(PTR, ADDR_NAME) (0)
43045 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
43046 +#define pci_unmap_len(PTR, LEN_NAME) (0)
43047 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
43048 +
43049 +#endif
43050 +
43051 +#include <asm-generic/pci-dma-compat.h>
43052 +
43053 +static inline dma64_addr_t
43054 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
43055 +{
43056 + return ((dma64_addr_t) page_to_phys(page) +
43057 + (dma64_addr_t) offset);
43058 +}
43059 +
43060 +static inline struct page *
43061 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
43062 +{
43063 + return virt_to_page(__va(dma_addr));
43064 +}
43065 +
43066 +static inline unsigned long
43067 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
43068 +{
43069 + return (dma_addr & ~PAGE_MASK);
43070 +}
43071 +
43072 +static inline void
43073 +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
43074 +{
43075 +}
43076 +
43077 +static inline void
43078 +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
43079 +{
43080 + flush_write_buffers();
43081 +}
43082 +
43083 +#ifdef CONFIG_PCI
43084 +static inline void pci_dma_burst_advice(struct pci_dev *pdev,
43085 + enum pci_dma_burst_strategy *strat,
43086 + unsigned long *strategy_parameter)
43087 +{
43088 + *strat = PCI_DMA_BURST_INFINITY;
43089 + *strategy_parameter = ~0UL;
43090 +}
43091 +#endif
43092 +
43093 +#define HAVE_PCI_MMAP
43094 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
43095 + enum pci_mmap_state mmap_state, int write_combine);
43096 +
43097 +static inline void pcibios_add_platform_entries(struct pci_dev *dev)
43098 +{
43099 +}
43100 +
43101 +#endif /* __KERNEL__ */
43102 +
43103 +/* generic pci stuff */
43104 +#ifdef CONFIG_PCI
43105 +#include <asm-generic/pci.h>
43106 +#endif
43107 +
43108 +#endif /* __x8664_PCI_H */
43109 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_64.h
43110 ===================================================================
43111 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
43112 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgalloc_64.h 2007-06-18 08:38:13.000000000 +0200
43113 @@ -0,0 +1,204 @@
43114 +#ifndef _X86_64_PGALLOC_H
43115 +#define _X86_64_PGALLOC_H
43116 +
43117 +#include <asm/fixmap.h>
43118 +#include <asm/pda.h>
43119 +#include <linux/threads.h>
43120 +#include <linux/mm.h>
43121 +#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
43122 +
43123 +#include <xen/features.h>
43124 +void make_page_readonly(void *va, unsigned int feature);
43125 +void make_page_writable(void *va, unsigned int feature);
43126 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
43127 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
43128 +
43129 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
43130 +
43131 +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
43132 +{
43133 + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
43134 +}
43135 +
43136 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
43137 +{
43138 + if (unlikely((mm)->context.pinned)) {
43139 + BUG_ON(HYPERVISOR_update_va_mapping(
43140 + (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
43141 + pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
43142 + set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
43143 + } else {
43144 + *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
43145 + }
43146 +}
43147 +
43148 +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
43149 +{
43150 + if (unlikely((mm)->context.pinned)) {
43151 + BUG_ON(HYPERVISOR_update_va_mapping(
43152 + (unsigned long)pmd,
43153 + pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
43154 + PAGE_KERNEL_RO), 0));
43155 + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
43156 + } else {
43157 + *(pud) = __pud(_PAGE_TABLE | __pa(pmd));
43158 + }
43159 +}
43160 +
43161 +/*
43162 + * We need to use the batch mode here, but pgd_populate() won't
43163 + * be called frequently.
43164 + */
43165 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
43166 +{
43167 + if (unlikely((mm)->context.pinned)) {
43168 + BUG_ON(HYPERVISOR_update_va_mapping(
43169 + (unsigned long)pud,
43170 + pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
43171 + PAGE_KERNEL_RO), 0));
43172 + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
43173 + set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
43174 + } else {
43175 + *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
43176 + *(__user_pgd(pgd)) = *(pgd);
43177 + }
43178 +}
43179 +
43180 +extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr);
43181 +extern void pte_free(struct page *pte);
43182 +
43183 +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
43184 +{
43185 + struct page *pg;
43186 +
43187 + pg = pte_alloc_one(mm, addr);
43188 + return pg ? page_address(pg) : NULL;
43189 +}
43190 +
43191 +static inline void pmd_free(pmd_t *pmd)
43192 +{
43193 + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
43194 + pte_free(virt_to_page(pmd));
43195 +}
43196 +
43197 +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
43198 +{
43199 + struct page *pg;
43200 +
43201 + pg = pte_alloc_one(mm, addr);
43202 + return pg ? page_address(pg) : NULL;
43203 +}
43204 +
43205 +static inline void pud_free(pud_t *pud)
43206 +{
43207 + BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
43208 + pte_free(virt_to_page(pud));
43209 +}
43210 +
43211 +static inline void pgd_list_add(pgd_t *pgd)
43212 +{
43213 + struct page *page = virt_to_page(pgd);
43214 +
43215 + spin_lock(&pgd_lock);
43216 + page->index = (pgoff_t)pgd_list;
43217 + if (pgd_list)
43218 + pgd_list->private = (unsigned long)&page->index;
43219 + pgd_list = page;
43220 + page->private = (unsigned long)&pgd_list;
43221 + spin_unlock(&pgd_lock);
43222 +}
43223 +
43224 +static inline void pgd_list_del(pgd_t *pgd)
43225 +{
43226 + struct page *next, **pprev, *page = virt_to_page(pgd);
43227 +
43228 + spin_lock(&pgd_lock);
43229 + next = (struct page *)page->index;
43230 + pprev = (struct page **)page->private;
43231 + *pprev = next;
43232 + if (next)
43233 + next->private = (unsigned long)pprev;
43234 + spin_unlock(&pgd_lock);
43235 +}
43236 +
43237 +static inline pgd_t *pgd_alloc(struct mm_struct *mm)
43238 +{
43239 + /*
43240 + * We allocate two contiguous pages for kernel and user.
43241 + */
43242 + unsigned boundary;
43243 + pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1);
43244 + if (!pgd)
43245 + return NULL;
43246 + pgd_list_add(pgd);
43247 + /*
43248 + * Copy kernel pointers in from init.
43249 + * Could keep a freelist or slab cache of those because the kernel
43250 + * part never changes.
43251 + */
43252 + boundary = pgd_index(__PAGE_OFFSET);
43253 + memset(pgd, 0, boundary * sizeof(pgd_t));
43254 + memcpy(pgd + boundary,
43255 + init_level4_pgt + boundary,
43256 + (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
43257 +
43258 + memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
43259 + /*
43260 + * Set level3_user_pgt for vsyscall area
43261 + */
43262 + __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
43263 + __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
43264 + return pgd;
43265 +}
43266 +
43267 +static inline void pgd_free(pgd_t *pgd)
43268 +{
43269 + pte_t *ptep = virt_to_ptep(pgd);
43270 +
43271 + if (!pte_write(*ptep)) {
43272 + xen_pgd_unpin(__pa(pgd));
43273 + BUG_ON(HYPERVISOR_update_va_mapping(
43274 + (unsigned long)pgd,
43275 + pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
43276 + 0));
43277 + }
43278 +
43279 + ptep = virt_to_ptep(__user_pgd(pgd));
43280 +
43281 + if (!pte_write(*ptep)) {
43282 + xen_pgd_unpin(__pa(__user_pgd(pgd)));
43283 + BUG_ON(HYPERVISOR_update_va_mapping(
43284 + (unsigned long)__user_pgd(pgd),
43285 + pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT,
43286 + PAGE_KERNEL),
43287 + 0));
43288 + }
43289 +
43290 + pgd_list_del(pgd);
43291 + free_pages((unsigned long)pgd, 1);
43292 +}
43293 +
43294 +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
43295 +{
43296 + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
43297 + if (pte)
43298 + make_page_readonly(pte, XENFEAT_writable_page_tables);
43299 +
43300 + return pte;
43301 +}
43302 +
43303 +/* Should really implement gc for free page table pages. This could be
43304 + done with a reference count in struct page. */
43305 +
43306 +static inline void pte_free_kernel(pte_t *pte)
43307 +{
43308 + BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
43309 + make_page_writable(pte, XENFEAT_writable_page_tables);
43310 + free_page((unsigned long)pte);
43311 +}
43312 +
43313 +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
43314 +#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
43315 +#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
43316 +
43317 +#endif /* _X86_64_PGALLOC_H */
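A brief sketch of the layout pgd_alloc() above sets up, derived from the order-1 allocation and __user_pgd(); illustrative only.

        /*
         * pgd             -> page 0: kernel pgd (made read-only when the mm is pinned)
         * __user_pgd(pgd) -> page 1: user pgd   (pgd + PTRS_PER_PGD entries
         *                            == pgd + 512 * sizeof(pgd_t) == pgd + PAGE_SIZE)
         *
         * This is why pgd_free() unpins and restores write access on both
         * pages before handing the pair back with free_pages(pgd, 1).
         */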
43318 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_64.h
43319 ===================================================================
43320 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
43321 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/pgtable_64.h 2008-07-21 11:00:33.000000000 +0200
43322 @@ -0,0 +1,583 @@
43323 +#ifndef _X86_64_PGTABLE_H
43324 +#define _X86_64_PGTABLE_H
43325 +
43326 +/*
43327 + * This file contains the functions and defines necessary to modify and use
43328 + * the x86-64 page table tree.
43329 + */
43330 +#include <asm/processor.h>
43331 +#include <asm/fixmap.h>
43332 +#include <asm/bitops.h>
43333 +#include <linux/threads.h>
43334 +#include <linux/sched.h>
43335 +#include <asm/pda.h>
43336 +#ifdef CONFIG_XEN
43337 +#include <asm/hypervisor.h>
43338 +
43339 +extern pud_t level3_user_pgt[512];
43340 +
43341 +extern void xen_init_pt(void);
43342 +
43343 +extern pte_t *lookup_address(unsigned long address);
43344 +
43345 +#define virt_to_ptep(va) \
43346 +({ \
43347 + pte_t *__ptep = lookup_address((unsigned long)(va)); \
43348 + BUG_ON(!__ptep || !pte_present(*__ptep)); \
43349 + __ptep; \
43350 +})
43351 +
43352 +#define arbitrary_virt_to_machine(va) \
43353 + (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
43354 + | ((unsigned long)(va) & (PAGE_SIZE - 1)))
43355 +#endif
43356 +
43357 +extern pud_t level3_kernel_pgt[512];
43358 +extern pud_t level3_physmem_pgt[512];
43359 +extern pud_t level3_ident_pgt[512];
43360 +extern pmd_t level2_kernel_pgt[512];
43361 +extern pgd_t init_level4_pgt[];
43362 +extern pgd_t boot_level4_pgt[];
43363 +extern unsigned long __supported_pte_mask;
43364 +
43365 +#define swapper_pg_dir init_level4_pgt
43366 +
43367 +extern int nonx_setup(char *str);
43368 +extern void paging_init(void);
43369 +extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
43370 +
43371 +extern unsigned long pgkern_mask;
43372 +
43373 +/*
43374 + * ZERO_PAGE is a global shared page that is always zero: used
43375 + * for zero-mapped memory areas etc..
43376 + */
43377 +extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
43378 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
43379 +
43380 +/*
43381 + * PGDIR_SHIFT determines what a top-level page table entry can map
43382 + */
43383 +#define PGDIR_SHIFT 39
43384 +#define PTRS_PER_PGD 512
43385 +
43386 +/*
43387 + * 3rd level page
43388 + */
43389 +#define PUD_SHIFT 30
43390 +#define PTRS_PER_PUD 512
43391 +
43392 +/*
43393 + * PMD_SHIFT determines the size of the area a middle-level
43394 + * page table can map
43395 + */
43396 +#define PMD_SHIFT 21
43397 +#define PTRS_PER_PMD 512
43398 +
43399 +/*
43400 + * entries per page directory level
43401 + */
43402 +#define PTRS_PER_PTE 512
43403 +
43404 +#define pte_ERROR(e) \
43405 + printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
43406 + &(e), __pte_val(e), pte_pfn(e))
43407 +#define pmd_ERROR(e) \
43408 + printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
43409 + &(e), __pmd_val(e), pmd_pfn(e))
43410 +#define pud_ERROR(e) \
43411 + printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
43412 + &(e), __pud_val(e), (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
43413 +#define pgd_ERROR(e) \
43414 + printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
43415 + &(e), __pgd_val(e), (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
43416 +
43417 +#define pgd_none(x) (!__pgd_val(x))
43418 +#define pud_none(x) (!__pud_val(x))
43419 +
43420 +static inline void set_pte(pte_t *dst, pte_t val)
43421 +{
43422 + *dst = val;
43423 +}
43424 +
43425 +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
43426 +#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
43427 +#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
43428 +
43429 +static inline void pud_clear (pud_t * pud)
43430 +{
43431 + set_pud(pud, __pud(0));
43432 +}
43433 +
43434 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
43435 +
43436 +static inline void pgd_clear (pgd_t * pgd)
43437 +{
43438 + set_pgd(pgd, __pgd(0));
43439 + set_pgd(__user_pgd(pgd), __pgd(0));
43440 +}
43441 +
43442 +#define pud_page(pud) \
43443 + ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
43444 +
43445 +#define pte_same(a, b) ((a).pte == (b).pte)
43446 +
43447 +#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
43448 +
43449 +#define PMD_SIZE (1UL << PMD_SHIFT)
43450 +#define PMD_MASK (~(PMD_SIZE-1))
43451 +#define PUD_SIZE (1UL << PUD_SHIFT)
43452 +#define PUD_MASK (~(PUD_SIZE-1))
43453 +#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
43454 +#define PGDIR_MASK (~(PGDIR_SIZE-1))
43455 +
43456 +#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
43457 +#define FIRST_USER_ADDRESS 0
43458 +
43459 +#ifndef __ASSEMBLY__
43460 +#define MAXMEM 0x3fffffffffffUL
43461 +#define VMALLOC_START 0xffffc20000000000UL
43462 +#define VMALLOC_END 0xffffe1ffffffffffUL
43463 +#define MODULES_VADDR 0xffffffff88000000UL
43464 +#define MODULES_END 0xfffffffffff00000UL
43465 +#define MODULES_LEN (MODULES_END - MODULES_VADDR)
43466 +
43467 +#define _PAGE_BIT_PRESENT 0
43468 +#define _PAGE_BIT_RW 1
43469 +#define _PAGE_BIT_USER 2
43470 +#define _PAGE_BIT_PWT 3
43471 +#define _PAGE_BIT_PCD 4
43472 +#define _PAGE_BIT_ACCESSED 5
43473 +#define _PAGE_BIT_DIRTY 6
43474 +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
43475 +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
43476 +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
43477 +
43478 +#define _PAGE_PRESENT 0x001
43479 +#define _PAGE_RW 0x002
43480 +#define _PAGE_USER 0x004
43481 +#define _PAGE_PWT 0x008
43482 +#define _PAGE_PCD 0x010
43483 +#define _PAGE_ACCESSED 0x020
43484 +#define _PAGE_DIRTY 0x040
43485 +#define _PAGE_PSE 0x080 /* 2MB page */
43486 +#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
43487 +#define _PAGE_GLOBAL 0x100 /* Global TLB entry */
43488 +
43489 +#define _PAGE_PROTNONE 0x080 /* If not present */
43490 +#define _PAGE_NX (1UL<<_PAGE_BIT_NX)
43491 +
43492 +/* Mapped page is I/O or foreign and has no associated page struct. */
43493 +#define _PAGE_IO 0x200
43494 +
43495 +#if CONFIG_XEN_COMPAT <= 0x030002
43496 +extern unsigned int __kernel_page_user;
43497 +#else
43498 +#define __kernel_page_user 0
43499 +#endif
43500 +
43501 +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
43502 +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
43503 +
43504 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
43505 +
43506 +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
43507 +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
43508 +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
43509 +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
43510 +#define PAGE_COPY PAGE_COPY_NOEXEC
43511 +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
43512 +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
43513 +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
43514 +#define __PAGE_KERNEL \
43515 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
43516 +#define __PAGE_KERNEL_EXEC \
43517 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
43518 +#define __PAGE_KERNEL_NOCACHE \
43519 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
43520 +#define __PAGE_KERNEL_RO \
43521 + (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
43522 +#define __PAGE_KERNEL_VSYSCALL \
43523 + (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
43524 +#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
43525 + (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
43526 +#define __PAGE_KERNEL_LARGE \
43527 + (__PAGE_KERNEL | _PAGE_PSE)
43528 +#define __PAGE_KERNEL_LARGE_EXEC \
43529 + (__PAGE_KERNEL_EXEC | _PAGE_PSE)
43530 +
43531 +/*
43532 + * We don't support GLOBAL pages in xenolinux64
43533 + */
43534 +#define MAKE_GLOBAL(x) __pgprot((x))
43535 +
43536 +#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
43537 +#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
43538 +#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
43539 +#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
43540 +#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
43541 +#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
43542 +#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
43543 +#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
43544 +
43545 +/* xwr */
43546 +#define __P000 PAGE_NONE
43547 +#define __P001 PAGE_READONLY
43548 +#define __P010 PAGE_COPY
43549 +#define __P011 PAGE_COPY
43550 +#define __P100 PAGE_READONLY_EXEC
43551 +#define __P101 PAGE_READONLY_EXEC
43552 +#define __P110 PAGE_COPY_EXEC
43553 +#define __P111 PAGE_COPY_EXEC
43554 +
43555 +#define __S000 PAGE_NONE
43556 +#define __S001 PAGE_READONLY
43557 +#define __S010 PAGE_SHARED
43558 +#define __S011 PAGE_SHARED
43559 +#define __S100 PAGE_READONLY_EXEC
43560 +#define __S101 PAGE_READONLY_EXEC
43561 +#define __S110 PAGE_SHARED_EXEC
43562 +#define __S111 PAGE_SHARED_EXEC
43563 +
43564 +static inline unsigned long pgd_bad(pgd_t pgd)
43565 +{
43566 + unsigned long val = __pgd_val(pgd);
43567 + val &= ~PTE_MASK;
43568 + val &= ~(_PAGE_USER | _PAGE_DIRTY);
43569 + return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
43570 +}
43571 +
43572 +static inline unsigned long pud_bad(pud_t pud)
43573 +{
43574 + unsigned long val = __pud_val(pud);
43575 + val &= ~PTE_MASK;
43576 + val &= ~(_PAGE_USER | _PAGE_DIRTY);
43577 + return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
43578 +}
43579 +
43580 +#define set_pte_at(_mm,addr,ptep,pteval) do { \
43581 + if (((_mm) != current->mm && (_mm) != &init_mm) || \
43582 + HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
43583 + set_pte((ptep), (pteval)); \
43584 +} while (0)
43585 +
43586 +#define pte_none(x) (!(x).pte)
43587 +#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
43588 +#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
43589 +
43590 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
43591 +
43592 +#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
43593 +#define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
43594 + __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
43595 +#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? end_pfn : \
43596 + (_pte).pte & _PAGE_PRESENT ? \
43597 + mfn_to_local_pfn(__pte_mfn(_pte)) : \
43598 + __pte_mfn(_pte))
43599 +
43600 +#define pte_page(x) pfn_to_page(pte_pfn(x))
43601 +
43602 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
43603 +{
43604 + unsigned long pte = page_nr << PAGE_SHIFT;
43605 + pte |= pgprot_val(pgprot);
43606 + pte &= __supported_pte_mask;
43607 + return __pte(pte);
43608 +}
43609 +
43610 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
43611 +{
43612 + pte_t pte = *ptep;
43613 + if (!pte_none(pte)) {
43614 + if ((mm != &init_mm) ||
43615 + HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
43616 + pte = __pte_ma(xchg(&ptep->pte, 0));
43617 + }
43618 + return pte;
43619 +}
43620 +
43621 +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
43622 +{
43623 + if (full) {
43624 + pte_t pte = *ptep;
43625 + if (mm->context.pinned)
43626 + xen_l1_entry_update(ptep, __pte(0));
43627 + else
43628 + *ptep = __pte(0);
43629 + return pte;
43630 + }
43631 + return ptep_get_and_clear(mm, addr, ptep);
43632 +}
43633 +
43634 +#define ptep_clear_flush(vma, addr, ptep) \
43635 +({ \
43636 + pte_t *__ptep = (ptep); \
43637 + pte_t __res = *__ptep; \
43638 + if (!pte_none(__res) && \
43639 + ((vma)->vm_mm != current->mm || \
43640 + HYPERVISOR_update_va_mapping(addr, __pte(0), \
43641 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
43642 + UVMF_INVLPG|UVMF_MULTI))) { \
43643 + __ptep->pte = 0; \
43644 + flush_tlb_page(vma, addr); \
43645 + } \
43646 + __res; \
43647 +})
43648 +
43649 +/*
43650 + * The following only work if pte_present() is true.
43651 + * Undefined behaviour if not..
43652 + */
43653 +#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
43654 +static inline int pte_user(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
43655 +static inline int pte_read(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
43656 +static inline int pte_exec(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
43657 +static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
43658 +static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
43659 +static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
43660 +static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
43661 +static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
43662 +
43663 +static inline pte_t pte_rdprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; }
43664 +static inline pte_t pte_exprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; }
43665 +static inline pte_t pte_mkclean(pte_t pte) { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
43666 +static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
43667 +static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; }
43668 +static inline pte_t pte_mkread(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
43669 +static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
43670 +static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
43671 +static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
43672 +static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; }
43673 +static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; }
43674 +
43675 +#define ptep_test_and_clear_dirty(vma, addr, ptep) \
43676 +({ \
43677 + pte_t __pte = *(ptep); \
43678 + int __ret = pte_dirty(__pte); \
43679 + if (__ret) \
43680 + set_pte_at((vma)->vm_mm, addr, ptep, pte_mkclean(__pte)); \
43681 + __ret; \
43682 +})
43683 +
43684 +#define ptep_test_and_clear_young(vma, addr, ptep) \
43685 +({ \
43686 + pte_t __pte = *(ptep); \
43687 + int __ret = pte_young(__pte); \
43688 + if (__ret) \
43689 + set_pte_at((vma)->vm_mm, addr, ptep, pte_mkold(__pte)); \
43690 + __ret; \
43691 +})
43692 +
43693 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
43694 +{
43695 + pte_t pte = *ptep;
43696 + if (pte_write(pte))
43697 + set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
43698 +}
43699 +
43700 +/*
43701 + * Macro to mark a page protection value as "uncacheable".
43702 + */
43703 +#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
43704 +
43705 +static inline int pmd_large(pmd_t pte) {
43706 + return (__pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE;
43707 +}
43708 +
43709 +
43710 +/*
43711 + * Conversion functions: convert a page and protection to a page entry,
43712 + * and a page entry and page directory to the page they refer to.
43713 + */
43714 +
43715 +/*
43716 + * Level 4 access.
43717 + * Never use these in the common code.
43718 + */
43719 +#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
43720 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
43721 +#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
43722 +#define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
43723 +#define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
43724 +#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
43725 +
43726 +/* PUD - Level3 access */
43727 +/* to find an entry in a page-table-directory. */
43728 +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
43729 +#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
43730 +#define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
43731 +
43732 +/* PMD - Level 2 access */
43733 +#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
43734 +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
43735 +
43736 +#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
43737 +#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
43738 + pmd_index(address))
43739 +#define pmd_none(x) (!__pmd_val(x))
43740 +#if CONFIG_XEN_COMPAT <= 0x030002
43741 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since writable
43742 + page tables (wr.p.t.) can temporarily clear it. */
43743 +#define pmd_present(x) (__pmd_val(x))
43744 +#else
43745 +#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
43746 +#endif
43747 +#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
43748 +#define pmd_bad(x) ((__pmd_val(x) & ~(PTE_MASK | _PAGE_USER | _PAGE_PRESENT)) \
43749 + != (_KERNPG_TABLE & ~(_PAGE_USER | _PAGE_PRESENT)))
43750 +#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
43751 +#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
43752 +
43753 +#define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
43754 +#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
43755 +#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
43756 +
43757 +/* PTE - Level 1 access. */
43758 +
43759 +/* page, protection -> pte */
43760 +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
43761 +#define mk_pte_huge(entry) (__pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
43762 +
43763 +/* physical address -> PTE */
43764 +static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot)
43765 +{
43766 + unsigned long pteval;
43767 + pteval = physpage | pgprot_val(pgprot);
43768 + return __pte(pteval);
43769 +}
43770 +
43771 +/* Change flags of a PTE */
43772 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
43773 +{
43774 + /*
43775 + * Since this might change the present bit (which controls whether
43776 + * a pte_t object has undergone p2m translation), we must use
43777 + * pte_val() on the input pte and __pte() for the return value.
43778 + */
43779 + unsigned long pteval = pte_val(pte);
43780 +
43781 + pteval &= _PAGE_CHG_MASK;
43782 + pteval |= pgprot_val(newprot);
43783 + pteval &= __supported_pte_mask;
43784 + return __pte(pteval);
43785 +}
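/*
 * Sketch of a typical pte_modify() caller (the protection value is chosen
 * for illustration): downgrade a pte to read-only while keeping the bits
 * preserved by _PAGE_CHG_MASK (accessed/dirty/_PAGE_IO).
 */
static inline pte_t foo_pte_make_readonly(pte_t pte)
{
        return pte_modify(pte, PAGE_READONLY);
}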
43786 +
43787 +#define pte_index(address) \
43788 + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
43789 +#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \
43790 + pte_index(address))
43791 +
43792 +/* x86-64 always has all page tables mapped. */
43793 +#define pte_offset_map(dir,address) pte_offset_kernel(dir,address)
43794 +#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address)
43795 +#define pte_unmap(pte) /* NOP */
43796 +#define pte_unmap_nested(pte) /* NOP */
43797 +
43798 +#define update_mmu_cache(vma,address,pte) do { } while (0)
43799 +
43800 +/*
43801 + * Rules for using ptep_establish: the pte MUST be a user pte, and
43802 + * must be a present->present transition.
43803 + */
43804 +#define __HAVE_ARCH_PTEP_ESTABLISH
43805 +#define ptep_establish(vma, address, ptep, pteval) \
43806 + do { \
43807 + if ( likely((vma)->vm_mm == current->mm) ) { \
43808 + BUG_ON(HYPERVISOR_update_va_mapping(address, \
43809 + pteval, \
43810 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
43811 + UVMF_INVLPG|UVMF_MULTI)); \
43812 + } else { \
43813 + xen_l1_entry_update(ptep, pteval); \
43814 + flush_tlb_page(vma, address); \
43815 + } \
43816 + } while (0)
43817 +
43818 +/* We only update the dirty/accessed state if we set
43819 + * the dirty bit by hand in the kernel, since the hardware
43820 + * will do the accessed bit for us, and we don't want to
43821 + * race with other CPUs that might be updating the dirty
43822 + * bit at the same time. */
43823 +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
43824 +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
43825 + do { \
43826 + if (dirty) \
43827 + ptep_establish(vma, address, ptep, entry); \
43828 + } while (0)
43829 +
43830 +/* Encode and de-code a swap entry */
43831 +#define __swp_type(x) (((x).val >> 1) & 0x3f)
43832 +#define __swp_offset(x) ((x).val >> 8)
43833 +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
43834 +#define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
43835 +#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
43836 +
43837 +extern spinlock_t pgd_lock;
43838 +extern struct page *pgd_list;
43839 +void vmalloc_sync_all(void);
43840 +
43841 +#endif /* !__ASSEMBLY__ */
43842 +
43843 +extern int kern_addr_valid(unsigned long addr);
43844 +
43845 +#define DOMID_LOCAL (0xFFFFU)
43846 +
43847 +struct vm_area_struct;
43848 +
43849 +int direct_remap_pfn_range(struct vm_area_struct *vma,
43850 + unsigned long address,
43851 + unsigned long mfn,
43852 + unsigned long size,
43853 + pgprot_t prot,
43854 + domid_t domid);
43855 +
43856 +int direct_kernel_remap_pfn_range(unsigned long address,
43857 + unsigned long mfn,
43858 + unsigned long size,
43859 + pgprot_t prot,
43860 + domid_t domid);
43861 +
43862 +int create_lookup_pte_addr(struct mm_struct *mm,
43863 + unsigned long address,
43864 + uint64_t *ptep);
43865 +
43866 +int touch_pte_range(struct mm_struct *mm,
43867 + unsigned long address,
43868 + unsigned long size);
43869 +
43870 +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
43871 + unsigned long addr, unsigned long end, pgprot_t newprot);
43872 +
43873 +#define arch_change_pte_range(mm, pmd, addr, end, newprot) \
43874 + xen_change_pte_range(mm, pmd, addr, end, newprot)
43875 +
43876 +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
43877 + direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
43878 +
43879 +#define MK_IOSPACE_PFN(space, pfn) (pfn)
43880 +#define GET_IOSPACE(pfn) 0
43881 +#define GET_PFN(pfn) (pfn)
43882 +
43883 +#define HAVE_ARCH_UNMAPPED_AREA
43884 +
43885 +#define pgtable_cache_init() do { } while (0)
43886 +#define check_pgt_cache() do { } while (0)
43887 +
43888 +#define PAGE_AGP PAGE_KERNEL_NOCACHE
43889 +#define HAVE_PAGE_AGP 1
43890 +
43891 +/* fs/proc/kcore.c */
43892 +#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
43893 +#define kc_offset_to_vaddr(o) \
43894 + (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
43895 +
43896 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
43897 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
43898 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
43899 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
43900 +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
43901 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
43902 +#define __HAVE_ARCH_PTE_SAME
43903 +#include <asm-generic/pgtable.h>
43904 +
43905 +#endif /* _X86_64_PGTABLE_H */
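
The level-by-level accessors above compose into a software page-table walk. As a
minimal sketch (not part of the patch; the helper name lookup_kernel_pte is made up
for illustration), a kernel-virtual address can be resolved to its PTE using only
the macros defined in this header:

static pte_t *lookup_kernel_pte(unsigned long address)
{
	pgd_t *pgd = pgd_offset_k(address);	/* init_level4_pgt + pgd_index() */
	pud_t *pud;
	pmd_t *pmd;

	if (!pgd_present(*pgd))
		return NULL;
	pud = pud_offset(pgd, address);		/* level 3 */
	if (!pud_present(*pud))
		return NULL;
	pmd = pmd_offset(pud, address);		/* level 2 */
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		return NULL;
	return pte_offset_kernel(pmd, address);	/* level 1 entry */
}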
43906 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/processor_64.h
43907 ===================================================================
43908 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
43909 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/processor_64.h 2008-03-06 08:54:32.000000000 +0100
43910 @@ -0,0 +1,502 @@
43911 +/*
43912 + * include/asm-x86_64/processor.h
43913 + *
43914 + * Copyright (C) 1994 Linus Torvalds
43915 + */
43916 +
43917 +#ifndef __ASM_X86_64_PROCESSOR_H
43918 +#define __ASM_X86_64_PROCESSOR_H
43919 +
43920 +#include <asm/segment.h>
43921 +#include <asm/page.h>
43922 +#include <asm/types.h>
43923 +#include <asm/sigcontext.h>
43924 +#include <asm/cpufeature.h>
43925 +#include <linux/threads.h>
43926 +#include <asm/msr.h>
43927 +#include <asm/current.h>
43928 +#include <asm/system.h>
43929 +#include <asm/mmsegment.h>
43930 +#include <asm/percpu.h>
43931 +#include <linux/personality.h>
43932 +#include <linux/cpumask.h>
43933 +
43934 +#define TF_MASK 0x00000100
43935 +#define IF_MASK 0x00000200
43936 +#define IOPL_MASK 0x00003000
43937 +#define NT_MASK 0x00004000
43938 +#define VM_MASK 0x00020000
43939 +#define AC_MASK 0x00040000
43940 +#define VIF_MASK 0x00080000 /* virtual interrupt flag */
43941 +#define VIP_MASK 0x00100000 /* virtual interrupt pending */
43942 +#define ID_MASK 0x00200000
43943 +
43944 +#define desc_empty(desc) \
43945 + (!((desc)->a | (desc)->b))
43946 +
43947 +#define desc_equal(desc1, desc2) \
43948 + (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
43949 +
43950 +/*
43951 + * Default implementation of macro that returns current
43952 + * instruction pointer ("program counter").
43953 + */
43954 +#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
43955 +
43956 +/*
43957 + * CPU type and hardware bug flags. Kept separately for each CPU.
43958 + */
43959 +
43960 +struct cpuinfo_x86 {
43961 + __u8 x86; /* CPU family */
43962 + __u8 x86_vendor; /* CPU vendor */
43963 + __u8 x86_model;
43964 + __u8 x86_mask;
43965 + int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
43966 + __u32 x86_capability[NCAPINTS];
43967 + char x86_vendor_id[16];
43968 + char x86_model_id[64];
43969 + int x86_cache_size; /* in KB */
43970 + int x86_clflush_size;
43971 + int x86_cache_alignment;
43972 + int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined */
43973 + __u8 x86_virt_bits, x86_phys_bits;
43974 + __u8 x86_max_cores; /* cpuid returned max cores value */
43975 + __u32 x86_power;
43976 + __u32 extended_cpuid_level; /* Max extended CPUID function supported */
43977 + unsigned long loops_per_jiffy;
43978 +#ifdef CONFIG_SMP
43979 + cpumask_t llc_shared_map; /* cpus sharing the last level cache */
43980 +#endif
43981 + __u8 apicid;
43982 +#ifdef CONFIG_SMP
43983 + __u8 booted_cores; /* number of cores as seen by OS */
43984 + __u8 phys_proc_id; /* Physical Processor id. */
43985 + __u8 cpu_core_id; /* Core id. */
43986 +#endif
43987 +} ____cacheline_aligned;
43988 +
43989 +#define X86_VENDOR_INTEL 0
43990 +#define X86_VENDOR_CYRIX 1
43991 +#define X86_VENDOR_AMD 2
43992 +#define X86_VENDOR_UMC 3
43993 +#define X86_VENDOR_NEXGEN 4
43994 +#define X86_VENDOR_CENTAUR 5
43995 +#define X86_VENDOR_RISE 6
43996 +#define X86_VENDOR_TRANSMETA 7
43997 +#define X86_VENDOR_NUM 8
43998 +#define X86_VENDOR_UNKNOWN 0xff
43999 +
44000 +#ifdef CONFIG_SMP
44001 +extern struct cpuinfo_x86 cpu_data[];
44002 +#define current_cpu_data cpu_data[smp_processor_id()]
44003 +#else
44004 +#define cpu_data (&boot_cpu_data)
44005 +#define current_cpu_data boot_cpu_data
44006 +#endif
44007 +
44008 +extern char ignore_irq13;
44009 +
44010 +extern void identify_cpu(struct cpuinfo_x86 *);
44011 +extern void print_cpu_info(struct cpuinfo_x86 *);
44012 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
44013 +extern unsigned short num_cache_leaves;
44014 +
44015 +/*
44016 + * EFLAGS bits
44017 + */
44018 +#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
44019 +#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
44020 +#define X86_EFLAGS_AF 0x00000010 /* Auxiliary Carry Flag */
44021 +#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
44022 +#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
44023 +#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
44024 +#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
44025 +#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
44026 +#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
44027 +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
44028 +#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
44029 +#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
44030 +#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
44031 +#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
44032 +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
44033 +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
44034 +#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
44035 +
44036 +/*
44037 + * Intel CPU features in CR4
44038 + */
44039 +#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
44040 +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
44041 +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
44042 +#define X86_CR4_DE 0x0008 /* enable debugging extensions */
44043 +#define X86_CR4_PSE 0x0010 /* enable page size extensions */
44044 +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
44045 +#define X86_CR4_MCE 0x0040 /* Machine check enable */
44046 +#define X86_CR4_PGE 0x0080 /* enable global pages */
44047 +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
44048 +#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
44049 +#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
44050 +
44051 +/*
44052 + * Save the cr4 feature set we're using (ie
44053 + * Pentium 4MB enable and PPro Global page
44054 + * enable), so that any CPUs that boot up
44055 + * after us can get the correct flags.
44056 + */
44057 +extern unsigned long mmu_cr4_features;
44058 +
44059 +static inline void set_in_cr4 (unsigned long mask)
44060 +{
44061 + mmu_cr4_features |= mask;
44062 + __asm__("movq %%cr4,%%rax\n\t"
44063 + "orq %0,%%rax\n\t"
44064 + "movq %%rax,%%cr4\n"
44065 + : : "irg" (mask)
44066 + :"ax");
44067 +}
44068 +
44069 +static inline void clear_in_cr4 (unsigned long mask)
44070 +{
44071 + mmu_cr4_features &= ~mask;
44072 + __asm__("movq %%cr4,%%rax\n\t"
44073 + "andq %0,%%rax\n\t"
44074 + "movq %%rax,%%cr4\n"
44075 + : : "irg" (~mask)
44076 + :"ax");
44077 +}
44078 +
44079 +
44080 +/*
44081 + * User space process size: 47 bits minus one guard page.
44082 + */
44083 +#define TASK_SIZE64 (0x800000000000UL - 4096)
44084 +
44085 +/* This decides where the kernel will search for a free chunk of vm
44086 + * space during mmap's.
44087 + */
44088 +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
44089 +
44090 +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
44091 +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
44092 +
44093 +#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3)
44094 +
44095 +/*
44096 + * Size of io_bitmap.
44097 + */
44098 +#define IO_BITMAP_BITS 65536
44099 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
44100 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
44101 +#ifndef CONFIG_X86_NO_TSS
44102 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
44103 +#endif
44104 +#define INVALID_IO_BITMAP_OFFSET 0x8000
44105 +
44106 +struct i387_fxsave_struct {
44107 + u16 cwd;
44108 + u16 swd;
44109 + u16 twd;
44110 + u16 fop;
44111 + u64 rip;
44112 + u64 rdp;
44113 + u32 mxcsr;
44114 + u32 mxcsr_mask;
44115 + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
44116 + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 128 bytes */
44117 + u32 padding[24];
44118 +} __attribute__ ((aligned (16)));
44119 +
44120 +union i387_union {
44121 + struct i387_fxsave_struct fxsave;
44122 +};
44123 +
44124 +#ifndef CONFIG_X86_NO_TSS
44125 +struct tss_struct {
44126 + u32 reserved1;
44127 + u64 rsp0;
44128 + u64 rsp1;
44129 + u64 rsp2;
44130 + u64 reserved2;
44131 + u64 ist[7];
44132 + u32 reserved3;
44133 + u32 reserved4;
44134 + u16 reserved5;
44135 + u16 io_bitmap_base;
44136 + /*
44137 + * The extra 1 is there because the CPU will access an
44138 + * additional byte beyond the end of the IO permission
44139 + * bitmap. The extra byte must be all 1 bits, and must
44140 + * be within the limit. Thus we have:
44141 + *
44142 + * 128 bytes, the bitmap itself, for ports 0..0x3ff
44143 + * 8 bytes, for an extra "long" of ~0UL
44144 + */
44145 + unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
44146 +} __attribute__((packed)) ____cacheline_aligned;
44147 +
44148 +DECLARE_PER_CPU(struct tss_struct,init_tss);
44149 +#endif
44150 +
44151 +
44152 +extern struct cpuinfo_x86 boot_cpu_data;
44153 +#ifndef CONFIG_X86_NO_TSS
44154 +/* Save the original ist values for checking stack pointers during debugging */
44155 +struct orig_ist {
44156 + unsigned long ist[7];
44157 +};
44158 +DECLARE_PER_CPU(struct orig_ist, orig_ist);
44159 +#endif
44160 +
44161 +#ifdef CONFIG_X86_VSMP
44162 +#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
44163 +#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
44164 +#else
44165 +#define ARCH_MIN_TASKALIGN 16
44166 +#define ARCH_MIN_MMSTRUCT_ALIGN 0
44167 +#endif
44168 +
44169 +struct thread_struct {
44170 + unsigned long rsp0;
44171 + unsigned long rsp;
44172 + unsigned long userrsp; /* Copy from PDA */
44173 + unsigned long fs;
44174 + unsigned long gs;
44175 + unsigned short es, ds, fsindex, gsindex;
44176 +/* Hardware debugging registers */
44177 + unsigned long debugreg0;
44178 + unsigned long debugreg1;
44179 + unsigned long debugreg2;
44180 + unsigned long debugreg3;
44181 + unsigned long debugreg6;
44182 + unsigned long debugreg7;
44183 +/* fault info */
44184 + unsigned long cr2, trap_no, error_code;
44185 +/* floating point info */
44186 + union i387_union i387 __attribute__((aligned(16)));
44187 +/* IO permissions. The bitmap could be moved into the GDT, which would make
44188 + the switch faster for a limited number of ioperm-using tasks. -AK */
44189 + int ioperm;
44190 + unsigned long *io_bitmap_ptr;
44191 + unsigned io_bitmap_max;
44192 +/* cached TLS descriptors. */
44193 + u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
44194 + unsigned int iopl;
44195 +} __attribute__((aligned(16)));
44196 +
44197 +#define INIT_THREAD { \
44198 + .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
44199 +}
44200 +
44201 +#ifndef CONFIG_X86_NO_TSS
44202 +#define INIT_TSS { \
44203 + .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
44204 +}
44205 +#endif
44206 +
44207 +#define INIT_MMAP \
44208 +{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
44209 +
44210 +#define start_thread(regs,new_rip,new_rsp) do { \
44211 + asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
44212 + load_gs_index(0); \
44213 + (regs)->rip = (new_rip); \
44214 + (regs)->rsp = (new_rsp); \
44215 + write_pda(oldrsp, (new_rsp)); \
44216 + (regs)->cs = __USER_CS; \
44217 + (regs)->ss = __USER_DS; \
44218 + (regs)->eflags = 0x200; \
44219 + set_fs(USER_DS); \
44220 +} while(0)
44221 +
44222 +#define get_debugreg(var, register) \
44223 + var = HYPERVISOR_get_debugreg(register)
44224 +#define set_debugreg(value, register) do { \
44225 + if (HYPERVISOR_set_debugreg(register, value)) \
44226 + BUG(); \
44227 +} while (0)
44228 +
44229 +struct task_struct;
44230 +struct mm_struct;
44231 +
44232 +/* Free all resources held by a thread. */
44233 +extern void release_thread(struct task_struct *);
44234 +
44235 +/* Prepare to copy thread state - unlazy all lazy status */
44236 +extern void prepare_to_copy(struct task_struct *tsk);
44237 +
44238 +/*
44239 + * create a kernel thread without removing it from tasklists
44240 + */
44241 +extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
44242 +
44243 +/*
44244 + * Return saved PC of a blocked thread.
44245 + * What is this good for? It will always be the scheduler or ret_from_fork.
44246 + */
44247 +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
44248 +
44249 +extern unsigned long get_wchan(struct task_struct *p);
44250 +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
44251 +#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
44252 +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
44253 +
44254 +
44255 +struct microcode_header {
44256 + unsigned int hdrver;
44257 + unsigned int rev;
44258 + unsigned int date;
44259 + unsigned int sig;
44260 + unsigned int cksum;
44261 + unsigned int ldrver;
44262 + unsigned int pf;
44263 + unsigned int datasize;
44264 + unsigned int totalsize;
44265 + unsigned int reserved[3];
44266 +};
44267 +
44268 +struct microcode {
44269 + struct microcode_header hdr;
44270 + unsigned int bits[0];
44271 +};
44272 +
44273 +typedef struct microcode microcode_t;
44274 +typedef struct microcode_header microcode_header_t;
44275 +
44276 +/* microcode format is extended from prescott processors */
44277 +struct extended_signature {
44278 + unsigned int sig;
44279 + unsigned int pf;
44280 + unsigned int cksum;
44281 +};
44282 +
44283 +struct extended_sigtable {
44284 + unsigned int count;
44285 + unsigned int cksum;
44286 + unsigned int reserved[3];
44287 + struct extended_signature sigs[0];
44288 +};
44289 +
44290 +
44291 +#define ASM_NOP1 K8_NOP1
44292 +#define ASM_NOP2 K8_NOP2
44293 +#define ASM_NOP3 K8_NOP3
44294 +#define ASM_NOP4 K8_NOP4
44295 +#define ASM_NOP5 K8_NOP5
44296 +#define ASM_NOP6 K8_NOP6
44297 +#define ASM_NOP7 K8_NOP7
44298 +#define ASM_NOP8 K8_NOP8
44299 +
44300 +/* Opteron nops */
44301 +#define K8_NOP1 ".byte 0x90\n"
44302 +#define K8_NOP2 ".byte 0x66,0x90\n"
44303 +#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
44304 +#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
44305 +#define K8_NOP5 K8_NOP3 K8_NOP2
44306 +#define K8_NOP6 K8_NOP3 K8_NOP3
44307 +#define K8_NOP7 K8_NOP4 K8_NOP3
44308 +#define K8_NOP8 K8_NOP4 K8_NOP4
44309 +
44310 +#define ASM_NOP_MAX 8
44311 +
44312 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
44313 +static inline void rep_nop(void)
44314 +{
44315 + __asm__ __volatile__("rep;nop": : :"memory");
44316 +}
44317 +
44318 +/* Stop speculative execution */
44319 +static inline void sync_core(void)
44320 +{
44321 + int tmp;
44322 + asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
44323 +}
44324 +
44325 +#define cpu_has_fpu 1
44326 +
44327 +#define ARCH_HAS_PREFETCH
44328 +static inline void prefetch(void *x)
44329 +{
44330 + asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
44331 +}
44332 +
44333 +#define ARCH_HAS_PREFETCHW 1
44334 +static inline void prefetchw(void *x)
44335 +{
44336 + alternative_input("prefetcht0 (%1)",
44337 + "prefetchw (%1)",
44338 + X86_FEATURE_3DNOW,
44339 + "r" (x));
44340 +}
44341 +
44342 +#define ARCH_HAS_SPINLOCK_PREFETCH 1
44343 +
44344 +#define spin_lock_prefetch(x) prefetchw(x)
44345 +
44346 +#define cpu_relax() rep_nop()
44347 +
44348 +/*
44349 + * NSC/Cyrix CPU configuration register indexes
44350 + */
44351 +#define CX86_CCR0 0xc0
44352 +#define CX86_CCR1 0xc1
44353 +#define CX86_CCR2 0xc2
44354 +#define CX86_CCR3 0xc3
44355 +#define CX86_CCR4 0xe8
44356 +#define CX86_CCR5 0xe9
44357 +#define CX86_CCR6 0xea
44358 +#define CX86_CCR7 0xeb
44359 +#define CX86_DIR0 0xfe
44360 +#define CX86_DIR1 0xff
44361 +#define CX86_ARR_BASE 0xc4
44362 +#define CX86_RCR_BASE 0xdc
44363 +
44364 +/*
44365 + * NSC/Cyrix CPU indexed register access macros
44366 + */
44367 +
44368 +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
44369 +
44370 +#define setCx86(reg, data) do { \
44371 + outb((reg), 0x22); \
44372 + outb((data), 0x23); \
44373 +} while (0)
44374 +
44375 +static inline void serialize_cpu(void)
44376 +{
44377 + __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
44378 +}
44379 +
44380 +static inline void __monitor(const void *eax, unsigned long ecx,
44381 + unsigned long edx)
44382 +{
44383 + /* "monitor %eax,%ecx,%edx;" */
44384 + asm volatile(
44385 + ".byte 0x0f,0x01,0xc8;"
44386 + : :"a" (eax), "c" (ecx), "d"(edx));
44387 +}
44388 +
44389 +static inline void __mwait(unsigned long eax, unsigned long ecx)
44390 +{
44391 + /* "mwait %eax,%ecx;" */
44392 + asm volatile(
44393 + ".byte 0x0f,0x01,0xc9;"
44394 + : :"a" (eax), "c" (ecx));
44395 +}
44396 +
44397 +#define stack_current() \
44398 +({ \
44399 + struct thread_info *ti; \
44400 + asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
44401 + ti->task; \
44402 +})
44403 +
44404 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
44405 +
44406 +extern unsigned long boot_option_idle_override;
44407 +/* Boot loader type from the setup header */
44408 +extern int bootloader_type;
44409 +
44410 +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
44411 +
44412 +#endif /* __ASM_X86_64_PROCESSOR_H */
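
__monitor() and __mwait() above emit the raw MONITOR/MWAIT opcodes. A minimal sketch
of the usual arm-then-wait idiom (illustrative only; wait_for_flag is not part of the
patch, real callers must first check CPUID for MWAIT support, and under Xen the
instruction may not be available to the guest at all):

static void wait_for_flag(volatile int *flag)
{
	while (!*flag) {
		__monitor((const void *)flag, 0, 0);	/* arm the monitor on this cache line */
		if (*flag)				/* re-check: a store may have landed already */
			break;
		__mwait(0, 0);				/* doze until the monitored line is written */
	}
}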
44413 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/smp_64.h
44414 ===================================================================
44415 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
44416 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/smp_64.h 2007-06-12 13:14:13.000000000 +0200
44417 @@ -0,0 +1,150 @@
44418 +#ifndef __ASM_SMP_H
44419 +#define __ASM_SMP_H
44420 +
44421 +/*
44422 + * We need the APIC definitions automatically as part of 'smp.h'
44423 + */
44424 +#ifndef __ASSEMBLY__
44425 +#include <linux/threads.h>
44426 +#include <linux/cpumask.h>
44427 +#include <linux/bitops.h>
44428 +extern int disable_apic;
44429 +#endif
44430 +
44431 +#ifdef CONFIG_X86_LOCAL_APIC
44432 +#ifndef __ASSEMBLY__
44433 +#include <asm/fixmap.h>
44434 +#include <asm/mpspec.h>
44435 +#ifdef CONFIG_X86_IO_APIC
44436 +#include <asm/io_apic.h>
44437 +#endif
44438 +#include <asm/apic.h>
44439 +#include <asm/thread_info.h>
44440 +#endif
44441 +#endif
44442 +
44443 +#ifdef CONFIG_SMP
44444 +#ifndef ASSEMBLY
44445 +
44446 +#include <asm/pda.h>
44447 +
44448 +struct pt_regs;
44449 +
44450 +extern cpumask_t cpu_present_mask;
44451 +extern cpumask_t cpu_possible_map;
44452 +extern cpumask_t cpu_online_map;
44453 +extern cpumask_t cpu_initialized;
44454 +
44455 +/*
44456 + * Private routines/data
44457 + */
44458 +
44459 +extern void smp_alloc_memory(void);
44460 +extern volatile unsigned long smp_invalidate_needed;
44461 +extern int pic_mode;
44462 +extern void lock_ipi_call_lock(void);
44463 +extern void unlock_ipi_call_lock(void);
44464 +extern int smp_num_siblings;
44465 +extern void smp_send_reschedule(int cpu);
44466 +void smp_stop_cpu(void);
44467 +extern int smp_call_function_single(int cpuid, void (*func) (void *info),
44468 + void *info, int retry, int wait);
44469 +
44470 +extern cpumask_t cpu_sibling_map[NR_CPUS];
44471 +extern cpumask_t cpu_core_map[NR_CPUS];
44472 +extern u8 cpu_llc_id[NR_CPUS];
44473 +
44474 +#define SMP_TRAMPOLINE_BASE 0x6000
44475 +
44476 +/*
44477 + * On x86 all CPUs are mapped 1:1 to the APIC space.
44478 + * This simplifies scheduling and IPI sending and
44479 + * compresses data structures.
44480 + */
44481 +
44482 +static inline int num_booting_cpus(void)
44483 +{
44484 + return cpus_weight(cpu_possible_map);
44485 +}
44486 +
44487 +#define raw_smp_processor_id() read_pda(cpunumber)
44488 +
44489 +#ifdef CONFIG_X86_LOCAL_APIC
44490 +static inline int hard_smp_processor_id(void)
44491 +{
44492 + /* we don't want to mark this access volatile - bad code generation */
44493 + return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
44494 +}
44495 +#endif
44496 +
44497 +extern int safe_smp_processor_id(void);
44498 +extern int __cpu_disable(void);
44499 +extern void __cpu_die(unsigned int cpu);
44500 +extern void prefill_possible_map(void);
44501 +extern unsigned num_processors;
44502 +extern unsigned disabled_cpus;
44503 +
44504 +#endif /* !ASSEMBLY */
44505 +
44506 +#define NO_PROC_ID 0xFF /* No processor magic marker */
44507 +
44508 +#endif
44509 +
44510 +#ifndef ASSEMBLY
44511 +/*
44512 + * Some lowlevel functions might want to know about
44513 + * the real APIC ID <-> CPU # mapping.
44514 + */
44515 +extern u8 x86_cpu_to_apicid[NR_CPUS]; /* physical ID */
44516 +extern u8 x86_cpu_to_log_apicid[NR_CPUS];
44517 +extern u8 bios_cpu_apicid[];
44518 +
44519 +#ifdef CONFIG_X86_LOCAL_APIC
44520 +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
44521 +{
44522 + return cpus_addr(cpumask)[0];
44523 +}
44524 +
44525 +static inline int cpu_present_to_apicid(int mps_cpu)
44526 +{
44527 + if (mps_cpu < NR_CPUS)
44528 + return (int)bios_cpu_apicid[mps_cpu];
44529 + else
44530 + return BAD_APICID;
44531 +}
44532 +#endif
44533 +
44534 +#endif /* !ASSEMBLY */
44535 +
44536 +#ifndef CONFIG_SMP
44537 +#define stack_smp_processor_id() 0
44538 +#define safe_smp_processor_id() 0
44539 +#define cpu_logical_map(x) (x)
44540 +#else
44541 +#include <asm/thread_info.h>
44542 +#define stack_smp_processor_id() \
44543 +({ \
44544 + struct thread_info *ti; \
44545 + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
44546 + ti->cpu; \
44547 +})
44548 +#endif
44549 +
44550 +#ifndef __ASSEMBLY__
44551 +#ifdef CONFIG_X86_LOCAL_APIC
44552 +static __inline int logical_smp_processor_id(void)
44553 +{
44554 + /* we don't want to mark this access volatile - bad code generation */
44555 + return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
44556 +}
44557 +#endif
44558 +#endif
44559 +
44560 +#ifdef CONFIG_SMP
44561 +#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
44562 +#else
44563 +#define cpu_physical_id(cpu) boot_cpu_id
44564 +#endif
44565 +
44566 +#endif
44567 +
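smp_call_function_single(), declared above, runs a function on one chosen CPU. A
minimal usage sketch (illustrative only; report_cpu and poke_cpu1 are made-up names,
and <linux/kernel.h>/<linux/smp.h> are assumed to be included):

static void report_cpu(void *info)
{
	printk(KERN_INFO "running on CPU %d (APIC id %u)\n",
	       raw_smp_processor_id(),
	       (unsigned int)cpu_physical_id(raw_smp_processor_id()));
}

static void poke_cpu1(void)
{
	/* retry = 0, wait = 1: block until CPU 1 has run report_cpu() */
	if (smp_call_function_single(1, report_cpu, NULL, 0, 1) != 0)
		printk(KERN_WARNING "could not reach CPU 1\n");
}
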
44568 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/system_64.h
44569 ===================================================================
44570 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
44571 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/system_64.h 2007-11-26 16:59:25.000000000 +0100
44572 @@ -0,0 +1,256 @@
44573 +#ifndef __ASM_SYSTEM_H
44574 +#define __ASM_SYSTEM_H
44575 +
44576 +#include <linux/kernel.h>
44577 +#include <asm/segment.h>
44578 +#include <asm/alternative.h>
44579 +
44580 +#include <asm/synch_bitops.h>
44581 +#include <asm/hypervisor.h>
44582 +#include <xen/interface/arch-x86_64.h>
44583 +
44584 +#ifdef __KERNEL__
44585 +
44586 +#define __STR(x) #x
44587 +#define STR(x) __STR(x)
44588 +
44589 +#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
44590 +#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
44591 +
44592 +/* frame pointer must be last for get_wchan */
44593 +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
44594 +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\n\t"
44595 +
44596 +#define __EXTRA_CLOBBER \
44597 + ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
44598 +
44599 +#define switch_to(prev,next,last) \
44600 + asm volatile(SAVE_CONTEXT \
44601 + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
44602 + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
44603 + "call __switch_to\n\t" \
44604 + ".globl thread_return\n" \
44605 + "thread_return:\n\t" \
44606 + "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
44607 + "movq %P[thread_info](%%rsi),%%r8\n\t" \
44608 + LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
44609 + "movq %%rax,%%rdi\n\t" \
44610 + "jc ret_from_fork\n\t" \
44611 + RESTORE_CONTEXT \
44612 + : "=a" (last) \
44613 + : [next] "S" (next), [prev] "D" (prev), \
44614 + [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
44615 + [ti_flags] "i" (offsetof(struct thread_info, flags)),\
44616 + [tif_fork] "i" (TIF_FORK), \
44617 + [thread_info] "i" (offsetof(struct task_struct, thread_info)), \
44618 + [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
44619 + : "memory", "cc" __EXTRA_CLOBBER)
44620 +
44621 +extern void load_gs_index(unsigned);
44622 +
44623 +/*
44624 + * Load a segment. Fall back on loading the zero
44625 + * segment if something goes wrong..
44626 + */
44627 +#define loadsegment(seg,value) \
44628 + asm volatile("\n" \
44629 + "1:\t" \
44630 + "movl %k0,%%" #seg "\n" \
44631 + "2:\n" \
44632 + ".section .fixup,\"ax\"\n" \
44633 + "3:\t" \
44634 + "movl %1,%%" #seg "\n\t" \
44635 + "jmp 2b\n" \
44636 + ".previous\n" \
44637 + ".section __ex_table,\"a\"\n\t" \
44638 + ".align 8\n\t" \
44639 + ".quad 1b,3b\n" \
44640 + ".previous" \
44641 + : :"r" (value), "r" (0))
44642 +
44643 +/*
44644 + * Clear and set 'TS' bit respectively
44645 + */
44646 +#define clts() (HYPERVISOR_fpu_taskswitch(0))
44647 +
44648 +static inline unsigned long read_cr0(void)
44649 +{
44650 + unsigned long cr0;
44651 + asm volatile("movq %%cr0,%0" : "=r" (cr0));
44652 + return cr0;
44653 +}
44654 +
44655 +static inline void write_cr0(unsigned long val)
44656 +{
44657 + asm volatile("movq %0,%%cr0" :: "r" (val));
44658 +}
44659 +
44660 +#define read_cr3() ({ \
44661 + unsigned long __dummy; \
44662 + asm("movq %%cr3,%0" : "=r" (__dummy)); \
44663 + machine_to_phys(__dummy); \
44664 +})
44665 +
44666 +static inline unsigned long read_cr4(void)
44667 +{
44668 + unsigned long cr4;
44669 + asm("movq %%cr4,%0" : "=r" (cr4));
44670 + return cr4;
44671 +}
44672 +
44673 +static inline void write_cr4(unsigned long val)
44674 +{
44675 + asm volatile("movq %0,%%cr4" :: "r" (val));
44676 +}
44677 +
44678 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
44679 +
44680 +#define wbinvd() \
44681 + __asm__ __volatile__ ("wbinvd": : :"memory");
44682 +
44683 +/*
44684 + * On SMP systems, when the scheduler does migration-cost autodetection,
44685 + * it needs a way to flush as much of the CPU's caches as possible.
44686 + */
44687 +static inline void sched_cacheflush(void)
44688 +{
44689 + wbinvd();
44690 +}
44691 +
44692 +#endif /* __KERNEL__ */
44693 +
44694 +#define nop() __asm__ __volatile__ ("nop")
44695 +
44696 +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
44697 +
44698 +#define tas(ptr) (xchg((ptr),1))
44699 +
44700 +#define __xg(x) ((volatile long *)(x))
44701 +
44702 +static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
44703 +{
44704 + *ptr = val;
44705 +}
44706 +
44707 +#define _set_64bit set_64bit
44708 +
44709 +/*
44710 + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
44711 + * Note 2: xchg has side effect, so that attribute volatile is necessary,
44712 + * but generally the primitive is invalid, *ptr is output argument. --ANK
44713 + */
44714 +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
44715 +{
44716 + switch (size) {
44717 + case 1:
44718 + __asm__ __volatile__("xchgb %b0,%1"
44719 + :"=q" (x)
44720 + :"m" (*__xg(ptr)), "0" (x)
44721 + :"memory");
44722 + break;
44723 + case 2:
44724 + __asm__ __volatile__("xchgw %w0,%1"
44725 + :"=r" (x)
44726 + :"m" (*__xg(ptr)), "0" (x)
44727 + :"memory");
44728 + break;
44729 + case 4:
44730 + __asm__ __volatile__("xchgl %k0,%1"
44731 + :"=r" (x)
44732 + :"m" (*__xg(ptr)), "0" (x)
44733 + :"memory");
44734 + break;
44735 + case 8:
44736 + __asm__ __volatile__("xchgq %0,%1"
44737 + :"=r" (x)
44738 + :"m" (*__xg(ptr)), "0" (x)
44739 + :"memory");
44740 + break;
44741 + }
44742 + return x;
44743 +}
44744 +
44745 +/*
44746 + * Atomic compare and exchange. Compare OLD with MEM, if identical,
44747 + * store NEW in MEM. Return the initial value in MEM. Success is
44748 + * indicated by comparing RETURN with OLD.
44749 + */
44750 +
44751 +#define __HAVE_ARCH_CMPXCHG 1
44752 +
44753 +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
44754 + unsigned long new, int size)
44755 +{
44756 + unsigned long prev;
44757 + switch (size) {
44758 + case 1:
44759 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
44760 + : "=a"(prev)
44761 + : "q"(new), "m"(*__xg(ptr)), "0"(old)
44762 + : "memory");
44763 + return prev;
44764 + case 2:
44765 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
44766 + : "=a"(prev)
44767 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
44768 + : "memory");
44769 + return prev;
44770 + case 4:
44771 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2"
44772 + : "=a"(prev)
44773 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
44774 + : "memory");
44775 + return prev;
44776 + case 8:
44777 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2"
44778 + : "=a"(prev)
44779 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
44780 + : "memory");
44781 + return prev;
44782 + }
44783 + return old;
44784 +}
44785 +
44786 +#define cmpxchg(ptr,o,n)\
44787 + ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
44788 + (unsigned long)(n),sizeof(*(ptr))))
44789 +
44790 +#ifdef CONFIG_SMP
44791 +#define smp_mb() mb()
44792 +#define smp_rmb() rmb()
44793 +#define smp_wmb() wmb()
44794 +#define smp_read_barrier_depends() do {} while(0)
44795 +#else
44796 +#define smp_mb() barrier()
44797 +#define smp_rmb() barrier()
44798 +#define smp_wmb() barrier()
44799 +#define smp_read_barrier_depends() do {} while(0)
44800 +#endif
44801 +
44802 +
44803 +/*
44804 + * Force strict CPU ordering.
44805 + * And yes, this is required on UP too when we're talking
44806 + * to devices.
44807 + */
44808 +#define mb() asm volatile("mfence":::"memory")
44809 +#define rmb() asm volatile("lfence":::"memory")
44810 +
44811 +#ifdef CONFIG_UNORDERED_IO
44812 +#define wmb() asm volatile("sfence" ::: "memory")
44813 +#else
44814 +#define wmb() asm volatile("" ::: "memory")
44815 +#endif
44816 +#define read_barrier_depends() do {} while(0)
44817 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
44818 +
44819 +#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
44820 +
44821 +#include <linux/irqflags.h>
44822 +
44823 +void cpu_idle_wait(void);
44824 +
44825 +extern unsigned long arch_align_stack(unsigned long sp);
44826 +extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
44827 +
44828 +#endif
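
cmpxchg() above is the usual building block for small lock-free updates: read the old
value, compute a new one, and retry if another CPU changed the word in between. A
minimal sketch (illustrative only; saturating_inc is not part of the patch):

static unsigned long saturating_inc(unsigned long *counter, unsigned long max)
{
	unsigned long old, new;

	do {
		old = *counter;
		if (old >= max)
			return old;			/* already saturated */
		new = old + 1;
		/* we lost a race if *counter changed since the read above */
	} while (cmpxchg(counter, old, new) != old);

	return new;
}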
44829 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_64.h
44830 ===================================================================
44831 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
44832 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/tlbflush_64.h 2007-11-26 16:59:25.000000000 +0100
44833 @@ -0,0 +1,103 @@
44834 +#ifndef _X8664_TLBFLUSH_H
44835 +#define _X8664_TLBFLUSH_H
44836 +
44837 +#include <linux/mm.h>
44838 +#include <asm/processor.h>
44839 +
44840 +#define __flush_tlb() xen_tlb_flush()
44841 +
44842 +/*
44843 + * Global pages have to be flushed a bit differently. Not a real
44844 + * performance problem because this does not happen often.
44845 + */
44846 +#define __flush_tlb_global() xen_tlb_flush()
44847 +
44848 +
44849 +extern unsigned long pgkern_mask;
44850 +
44851 +#define __flush_tlb_all() __flush_tlb_global()
44852 +
44853 +#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
44854 +
44855 +
44856 +/*
44857 + * TLB flushing:
44858 + *
44859 + * - flush_tlb() flushes the current mm struct TLBs
44860 + * - flush_tlb_all() flushes all processes TLBs
44861 + * - flush_tlb_mm(mm) flushes the specified mm context TLB's
44862 + * - flush_tlb_page(vma, vmaddr) flushes one page
44863 + * - flush_tlb_range(vma, start, end) flushes a range of pages
44864 + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
44865 + * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
44866 + *
44867 + * x86-64 can only flush individual pages or full VMs. For a range flush
44868 + * we always do the full VM. It might be worth testing whether, for a
44869 + * small range, a few INVLPGs in a row are a win.
44870 + */
44871 +
44872 +#ifndef CONFIG_SMP
44873 +
44874 +#define flush_tlb() __flush_tlb()
44875 +#define flush_tlb_all() __flush_tlb_all()
44876 +#define local_flush_tlb() __flush_tlb()
44877 +
44878 +static inline void flush_tlb_mm(struct mm_struct *mm)
44879 +{
44880 + if (mm == current->active_mm)
44881 + __flush_tlb();
44882 +}
44883 +
44884 +static inline void flush_tlb_page(struct vm_area_struct *vma,
44885 + unsigned long addr)
44886 +{
44887 + if (vma->vm_mm == current->active_mm)
44888 + __flush_tlb_one(addr);
44889 +}
44890 +
44891 +static inline void flush_tlb_range(struct vm_area_struct *vma,
44892 + unsigned long start, unsigned long end)
44893 +{
44894 + if (vma->vm_mm == current->active_mm)
44895 + __flush_tlb();
44896 +}
44897 +
44898 +#else
44899 +
44900 +#include <asm/smp.h>
44901 +
44902 +#define local_flush_tlb() \
44903 + __flush_tlb()
44904 +
44905 +#define flush_tlb_all xen_tlb_flush_all
44906 +#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
44907 +#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
44908 +#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
44909 +
44910 +#define flush_tlb() flush_tlb_current_task()
44911 +
44912 +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
44913 +{
44914 + flush_tlb_mm(vma->vm_mm);
44915 +}
44916 +
44917 +#define TLBSTATE_OK 1
44918 +#define TLBSTATE_LAZY 2
44919 +
44920 +/* Roughly an IPI every 20MB with 4k pages for freeing page table
44921 + ranges. Cost is about 42k of memory for each CPU. */
44922 +#define ARCH_FREE_PTE_NR 5350
44923 +
44924 +#endif
44925 +
44926 +#define flush_tlb_kernel_range(start, end) flush_tlb_all()
44927 +
44928 +static inline void flush_tlb_pgtables(struct mm_struct *mm,
44929 + unsigned long start, unsigned long end)
44930 +{
44931 + /* x86_64 does not keep any page table caches in a software TLB.
44932 + The CPUs do in their hardware TLBs, but they are handled
44933 + by the normal TLB flushing algorithms. */
44934 +}
44935 +
44936 +#endif /* _X8664_TLBFLUSH_H */
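
The comment block above maps each flush primitive to the scope it invalidates; the
rule is to flush exactly what was modified. A minimal sketch for the single-page case
(illustrative only; it assumes the ptep_set_wrprotect() helper made available through
pgtable.h in this patch):

/* Write-protect one user page, then drop only its stale TLB entry. */
static void wrprotect_one_page(struct vm_area_struct *vma,
			       unsigned long address, pte_t *ptep)
{
	ptep_set_wrprotect(vma->vm_mm, address, ptep);	/* clear _PAGE_RW */
	flush_tlb_page(vma, address);			/* per-page flush from this header */
}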
44937 Index: head-2008-11-25/include/asm-x86/mach-xen/asm/xor_64.h
44938 ===================================================================
44939 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
44940 +++ head-2008-11-25/include/asm-x86/mach-xen/asm/xor_64.h 2007-06-12 13:14:13.000000000 +0200
44941 @@ -0,0 +1,328 @@
44942 +/*
44943 + * x86-64 changes / gcc fixes from Andi Kleen.
44944 + * Copyright 2002 Andi Kleen, SuSE Labs.
44945 + *
44946 + * This hasn't been optimized for the hammer yet, but there are likely
44947 + * no advantages to be gained from x86-64 here anyway.
44948 + */
44949 +
44950 +typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
44951 +
44952 +/* Doesn't use gcc to save the XMM registers, because there is no easy way to
44953 + tell it to do a clts before the register saving. */
44954 +#define XMMS_SAVE do { \
44955 + preempt_disable(); \
44956 + if (!(current_thread_info()->status & TS_USEDFPU)) \
44957 + clts(); \
44958 + __asm__ __volatile__ ( \
44959 + "movups %%xmm0,(%1) ;\n\t" \
44960 + "movups %%xmm1,0x10(%1) ;\n\t" \
44961 + "movups %%xmm2,0x20(%1) ;\n\t" \
44962 + "movups %%xmm3,0x30(%1) ;\n\t" \
44963 + : "=&r" (cr0) \
44964 + : "r" (xmm_save) \
44965 + : "memory"); \
44966 +} while(0)
44967 +
44968 +#define XMMS_RESTORE do { \
44969 + asm volatile ( \
44970 + "sfence ;\n\t" \
44971 + "movups (%1),%%xmm0 ;\n\t" \
44972 + "movups 0x10(%1),%%xmm1 ;\n\t" \
44973 + "movups 0x20(%1),%%xmm2 ;\n\t" \
44974 + "movups 0x30(%1),%%xmm3 ;\n\t" \
44975 + : \
44976 + : "r" (cr0), "r" (xmm_save) \
44977 + : "memory"); \
44978 + if (!(current_thread_info()->status & TS_USEDFPU)) \
44979 + stts(); \
44980 + preempt_enable(); \
44981 +} while(0)
44982 +
44983 +#define OFFS(x) "16*("#x")"
44984 +#define PF_OFFS(x) "256+16*("#x")"
44985 +#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
44986 +#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
44987 +#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
44988 +#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
44989 +#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
44990 +#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
44991 +#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
44992 +#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
44993 +#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
44994 +#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
44995 +#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
44996 +#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
44997 +#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
44998 +
44999 +
45000 +static void
45001 +xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
45002 +{
45003 + unsigned int lines = bytes >> 8;
45004 + unsigned long cr0;
45005 + xmm_store_t xmm_save[4];
45006 +
45007 + XMMS_SAVE;
45008 +
45009 + asm volatile (
45010 +#undef BLOCK
45011 +#define BLOCK(i) \
45012 + LD(i,0) \
45013 + LD(i+1,1) \
45014 + PF1(i) \
45015 + PF1(i+2) \
45016 + LD(i+2,2) \
45017 + LD(i+3,3) \
45018 + PF0(i+4) \
45019 + PF0(i+6) \
45020 + XO1(i,0) \
45021 + XO1(i+1,1) \
45022 + XO1(i+2,2) \
45023 + XO1(i+3,3) \
45024 + ST(i,0) \
45025 + ST(i+1,1) \
45026 + ST(i+2,2) \
45027 + ST(i+3,3) \
45028 +
45029 +
45030 + PF0(0)
45031 + PF0(2)
45032 +
45033 + " .align 32 ;\n"
45034 + " 1: ;\n"
45035 +
45036 + BLOCK(0)
45037 + BLOCK(4)
45038 + BLOCK(8)
45039 + BLOCK(12)
45040 +
45041 + " addq %[inc], %[p1] ;\n"
45042 + " addq %[inc], %[p2] ;\n"
45043 + " decl %[cnt] ; jnz 1b"
45044 + : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
45045 + : [inc] "r" (256UL)
45046 + : "memory");
45047 +
45048 + XMMS_RESTORE;
45049 +}
45050 +
45051 +static void
45052 +xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
45053 + unsigned long *p3)
45054 +{
45055 + unsigned int lines = bytes >> 8;
45056 + xmm_store_t xmm_save[4];
45057 + unsigned long cr0;
45058 +
45059 + XMMS_SAVE;
45060 +
45061 + __asm__ __volatile__ (
45062 +#undef BLOCK
45063 +#define BLOCK(i) \
45064 + PF1(i) \
45065 + PF1(i+2) \
45066 + LD(i,0) \
45067 + LD(i+1,1) \
45068 + LD(i+2,2) \
45069 + LD(i+3,3) \
45070 + PF2(i) \
45071 + PF2(i+2) \
45072 + PF0(i+4) \
45073 + PF0(i+6) \
45074 + XO1(i,0) \
45075 + XO1(i+1,1) \
45076 + XO1(i+2,2) \
45077 + XO1(i+3,3) \
45078 + XO2(i,0) \
45079 + XO2(i+1,1) \
45080 + XO2(i+2,2) \
45081 + XO2(i+3,3) \
45082 + ST(i,0) \
45083 + ST(i+1,1) \
45084 + ST(i+2,2) \
45085 + ST(i+3,3) \
45086 +
45087 +
45088 + PF0(0)
45089 + PF0(2)
45090 +
45091 + " .align 32 ;\n"
45092 + " 1: ;\n"
45093 +
45094 + BLOCK(0)
45095 + BLOCK(4)
45096 + BLOCK(8)
45097 + BLOCK(12)
45098 +
45099 + " addq %[inc], %[p1] ;\n"
45100 + " addq %[inc], %[p2] ;\n"
45101 + " addq %[inc], %[p3] ;\n"
45102 + " decl %[cnt] ; jnz 1b"
45103 + : [cnt] "+r" (lines),
45104 + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
45105 + : [inc] "r" (256UL)
45106 + : "memory");
45107 + XMMS_RESTORE;
45108 +}
45109 +
45110 +static void
45111 +xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
45112 + unsigned long *p3, unsigned long *p4)
45113 +{
45114 + unsigned int lines = bytes >> 8;
45115 + xmm_store_t xmm_save[4];
45116 + unsigned long cr0;
45117 +
45118 + XMMS_SAVE;
45119 +
45120 + __asm__ __volatile__ (
45121 +#undef BLOCK
45122 +#define BLOCK(i) \
45123 + PF1(i) \
45124 + PF1(i+2) \
45125 + LD(i,0) \
45126 + LD(i+1,1) \
45127 + LD(i+2,2) \
45128 + LD(i+3,3) \
45129 + PF2(i) \
45130 + PF2(i+2) \
45131 + XO1(i,0) \
45132 + XO1(i+1,1) \
45133 + XO1(i+2,2) \
45134 + XO1(i+3,3) \
45135 + PF3(i) \
45136 + PF3(i+2) \
45137 + PF0(i+4) \
45138 + PF0(i+6) \
45139 + XO2(i,0) \
45140 + XO2(i+1,1) \
45141 + XO2(i+2,2) \
45142 + XO2(i+3,3) \
45143 + XO3(i,0) \
45144 + XO3(i+1,1) \
45145 + XO3(i+2,2) \
45146 + XO3(i+3,3) \
45147 + ST(i,0) \
45148 + ST(i+1,1) \
45149 + ST(i+2,2) \
45150 + ST(i+3,3) \
45151 +
45152 +
45153 + PF0(0)
45154 + PF0(2)
45155 +
45156 + " .align 32 ;\n"
45157 + " 1: ;\n"
45158 +
45159 + BLOCK(0)
45160 + BLOCK(4)
45161 + BLOCK(8)
45162 + BLOCK(12)
45163 +
45164 + " addq %[inc], %[p1] ;\n"
45165 + " addq %[inc], %[p2] ;\n"
45166 + " addq %[inc], %[p3] ;\n"
45167 + " addq %[inc], %[p4] ;\n"
45168 + " decl %[cnt] ; jnz 1b"
45169 + : [cnt] "+c" (lines),
45170 + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
45171 + : [inc] "r" (256UL)
45172 + : "memory" );
45173 +
45174 + XMMS_RESTORE;
45175 +}
45176 +
45177 +static void
45178 +xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
45179 + unsigned long *p3, unsigned long *p4, unsigned long *p5)
45180 +{
45181 + unsigned int lines = bytes >> 8;
45182 + xmm_store_t xmm_save[4];
45183 + unsigned long cr0;
45184 +
45185 + XMMS_SAVE;
45186 +
45187 + __asm__ __volatile__ (
45188 +#undef BLOCK
45189 +#define BLOCK(i) \
45190 + PF1(i) \
45191 + PF1(i+2) \
45192 + LD(i,0) \
45193 + LD(i+1,1) \
45194 + LD(i+2,2) \
45195 + LD(i+3,3) \
45196 + PF2(i) \
45197 + PF2(i+2) \
45198 + XO1(i,0) \
45199 + XO1(i+1,1) \
45200 + XO1(i+2,2) \
45201 + XO1(i+3,3) \
45202 + PF3(i) \
45203 + PF3(i+2) \
45204 + XO2(i,0) \
45205 + XO2(i+1,1) \
45206 + XO2(i+2,2) \
45207 + XO2(i+3,3) \
45208 + PF4(i) \
45209 + PF4(i+2) \
45210 + PF0(i+4) \
45211 + PF0(i+6) \
45212 + XO3(i,0) \
45213 + XO3(i+1,1) \
45214 + XO3(i+2,2) \
45215 + XO3(i+3,3) \
45216 + XO4(i,0) \
45217 + XO4(i+1,1) \
45218 + XO4(i+2,2) \
45219 + XO4(i+3,3) \
45220 + ST(i,0) \
45221 + ST(i+1,1) \
45222 + ST(i+2,2) \
45223 + ST(i+3,3) \
45224 +
45225 +
45226 + PF0(0)
45227 + PF0(2)
45228 +
45229 + " .align 32 ;\n"
45230 + " 1: ;\n"
45231 +
45232 + BLOCK(0)
45233 + BLOCK(4)
45234 + BLOCK(8)
45235 + BLOCK(12)
45236 +
45237 + " addq %[inc], %[p1] ;\n"
45238 + " addq %[inc], %[p2] ;\n"
45239 + " addq %[inc], %[p3] ;\n"
45240 + " addq %[inc], %[p4] ;\n"
45241 + " addq %[inc], %[p5] ;\n"
45242 + " decl %[cnt] ; jnz 1b"
45243 + : [cnt] "+c" (lines),
45244 + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
45245 + [p5] "+r" (p5)
45246 + : [inc] "r" (256UL)
45247 + : "memory");
45248 +
45249 + XMMS_RESTORE;
45250 +}
45251 +
45252 +static struct xor_block_template xor_block_sse = {
45253 + .name = "generic_sse",
45254 + .do_2 = xor_sse_2,
45255 + .do_3 = xor_sse_3,
45256 + .do_4 = xor_sse_4,
45257 + .do_5 = xor_sse_5,
45258 +};
45259 +
45260 +#undef XOR_TRY_TEMPLATES
45261 +#define XOR_TRY_TEMPLATES \
45262 + do { \
45263 + xor_speed(&xor_block_sse); \
45264 + } while (0)
45265 +
45266 +/* We force the use of the SSE xor block because it can write around L2.
45267 + We may also be able to load into the L1 only depending on how the cpu
45268 + deals with a load to a line that is being prefetched. */
45269 +#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
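
xor_block_sse above is normally driven through the generic xor layer (xor_speed() via
XOR_TRY_TEMPLATES), but its methods are plain functions: do_2() XORs the second buffer
into the first, 256 bytes per loop iteration. A minimal sketch (illustrative only;
both buffers must be 16-byte aligned for movaps and the length a multiple of 256
bytes):

/* dst ^= src over one 4 KiB page: 4096 bytes = 16 iterations of the SSE block. */
static void xor_one_page(unsigned long *dst, unsigned long *src)
{
	xor_block_sse.do_2(4096, dst, src);
}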
45270 Index: head-2008-11-25/include/asm-x86/mach-xen/mach_time.h
45271 ===================================================================
45272 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45273 +++ head-2008-11-25/include/asm-x86/mach-xen/mach_time.h 2007-06-12 13:14:13.000000000 +0200
45274 @@ -0,0 +1,111 @@
45275 +/*
45276 + * include/asm-i386/mach-default/mach_time.h
45277 + *
45278 + * Machine specific set RTC function for generic.
45279 + * Split out from time.c by Osamu Tomita <tomita@cinet.co.jp>
45280 + */
45281 +#ifndef _MACH_TIME_H
45282 +#define _MACH_TIME_H
45283 +
45284 +#include <asm-i386/mc146818rtc.h>
45285 +
45286 +/* timing window for the set_rtc_mmss() call: 500 ms into the second */
45287 +/* used in arch/i386/time.c::do_timer_interrupt() */
45288 +#define USEC_AFTER 500000
45289 +#define USEC_BEFORE 500000
45290 +
45291 +/*
45292 + * In order to set the CMOS clock precisely, set_rtc_mmss has to be
45293 + * called 500 ms after the second nowtime has started, because when
45294 + * nowtime is written into the registers of the CMOS clock, it will
45295 + * jump to the next second precisely 500 ms later. Check the Motorola
45296 + * MC146818A or Dallas DS12887 data sheet for details.
45297 + *
45298 + * BUG: This routine does not handle hour overflow properly; it just
45299 + * sets the minutes. Usually you'll only notice that after reboot!
45300 + */
45301 +static inline int mach_set_rtc_mmss(unsigned long nowtime)
45302 +{
45303 + int retval = 0;
45304 + int real_seconds, real_minutes, cmos_minutes;
45305 + unsigned char save_control, save_freq_select;
45306 +
45307 + save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */
45308 + CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
45309 +
45310 + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */
45311 + CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
45312 +
45313 + cmos_minutes = CMOS_READ(RTC_MINUTES);
45314 + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
45315 + BCD_TO_BIN(cmos_minutes);
45316 +
45317 + /*
45318 + * since we're only adjusting minutes and seconds,
45319 + * don't interfere with hour overflow. This avoids
45320 + * messing with unknown time zones but requires your
45321 + * RTC not to be off by more than 15 minutes
45322 + */
45323 + real_seconds = nowtime % 60;
45324 + real_minutes = nowtime / 60;
45325 + if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
45326 + real_minutes += 30; /* correct for half hour time zone */
45327 + real_minutes %= 60;
45328 +
45329 + if (abs(real_minutes - cmos_minutes) < 30) {
45330 + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
45331 + BIN_TO_BCD(real_seconds);
45332 + BIN_TO_BCD(real_minutes);
45333 + }
45334 + CMOS_WRITE(real_seconds,RTC_SECONDS);
45335 + CMOS_WRITE(real_minutes,RTC_MINUTES);
45336 + } else {
45337 + printk(KERN_WARNING
45338 + "set_rtc_mmss: can't update from %d to %d\n",
45339 + cmos_minutes, real_minutes);
45340 + retval = -1;
45341 + }
45342 +
45343 + /* The following flags have to be released exactly in this order,
45344 + * otherwise the DS12887 (popular MC146818A clone with integrated
45345 + * battery and quartz) will not reset the oscillator and will not
45346 + * update precisely 500 ms later. You won't find this mentioned in
45347 + * the Dallas Semiconductor data sheets, but who believes data
45348 + * sheets anyway ... -- Markus Kuhn
45349 + */
45350 + CMOS_WRITE(save_control, RTC_CONTROL);
45351 + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
45352 +
45353 + return retval;
45354 +}
45355 +
45356 +static inline unsigned long mach_get_cmos_time(void)
45357 +{
45358 + unsigned int year, mon, day, hour, min, sec;
45359 +
45360 + do {
45361 + sec = CMOS_READ(RTC_SECONDS);
45362 + min = CMOS_READ(RTC_MINUTES);
45363 + hour = CMOS_READ(RTC_HOURS);
45364 + day = CMOS_READ(RTC_DAY_OF_MONTH);
45365 + mon = CMOS_READ(RTC_MONTH);
45366 + year = CMOS_READ(RTC_YEAR);
45367 + } while (sec != CMOS_READ(RTC_SECONDS));
45368 +
45369 + if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
45370 + BCD_TO_BIN(sec);
45371 + BCD_TO_BIN(min);
45372 + BCD_TO_BIN(hour);
45373 + BCD_TO_BIN(day);
45374 + BCD_TO_BIN(mon);
45375 + BCD_TO_BIN(year);
45376 + }
45377 +
45378 + year += 1900;
45379 + if (year < 1970)
45380 + year += 100;
45381 +
45382 + return mktime(year, mon, day, hour, min, sec);
45383 +}
45384 +
45385 +#endif /* !_MACH_TIME_H */
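
mach_set_rtc_mmss() above only behaves correctly when called close to 500 ms after the
start of a second, which is what USEC_AFTER/USEC_BEFORE delimit. A minimal sketch of
that gating (illustrative only; maybe_sync_cmos and the 5 ms slack are made up, the
real timer code derives its slack from the tick length):

/* usec_in_sec is the microsecond offset within the current second. */
static void maybe_sync_cmos(unsigned long now_sec, unsigned long usec_in_sec)
{
	if (usec_in_sec >= USEC_AFTER - 5000 && usec_in_sec <= USEC_BEFORE + 5000)
		mach_set_rtc_mmss(now_sec);	/* we are ~500 ms into the second */
}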
45386 Index: head-2008-11-25/include/asm-x86/mach-xen/mach_timer.h
45387 ===================================================================
45388 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45389 +++ head-2008-11-25/include/asm-x86/mach-xen/mach_timer.h 2007-06-12 13:14:13.000000000 +0200
45390 @@ -0,0 +1,50 @@
45391 +/*
45392 + * include/asm-i386/mach-default/mach_timer.h
45393 + *
45394 + * Machine specific calibrate_tsc() for generic.
45395 + * Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
45396 + */
45397 +/* ------ Calibrate the TSC -------
45398 + * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
45399 + * Too much 64-bit arithmetic here to do this cleanly in C, and for
45400 + * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
45401 + * output busy loop as low as possible. We avoid reading the CTC registers
45402 + * directly because of the awkward 8-bit access mechanism of the 82C54
45403 + * device.
45404 + */
45405 +#ifndef _MACH_TIMER_H
45406 +#define _MACH_TIMER_H
45407 +
45408 +#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */
45409 +#define CALIBRATE_LATCH \
45410 + ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
45411 +
45412 +static inline void mach_prepare_counter(void)
45413 +{
45414 + /* Set the Gate high, disable speaker */
45415 + outb((inb(0x61) & ~0x02) | 0x01, 0x61);
45416 +
45417 + /*
45418 + * Now let's take care of CTC channel 2
45419 + *
45420 + * Set the Gate high, program CTC channel 2 for mode 0,
45421 + * (interrupt on terminal count mode), binary count,
45422 + * load 5 * LATCH count, (LSB and MSB) to begin countdown.
45423 + *
45424 + * Some devices need a delay here.
45425 + */
45426 + outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */
45427 + outb_p(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */
45428 + outb_p(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */
45429 +}
45430 +
45431 +static inline void mach_countup(unsigned long *count_p)
45432 +{
45433 + unsigned long count = 0;
45434 + do {
45435 + count++;
45436 + } while ((inb_p(0x61) & 0x20) == 0);
45437 + *count_p = count;
45438 +}
45439 +
45440 +#endif /* !_MACH_TIMER_H */
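
mach_prepare_counter() and mach_countup() above implement the PIT-gated 30 ms busy
loop used for TSC calibration. A minimal sketch of the measurement around it
(illustrative only; calibrate_tsc_khz is a made-up name and rdtscll() from
<asm/msr.h> is assumed):

static unsigned long calibrate_tsc_khz(void)
{
	unsigned long long start, end;
	unsigned long count;

	mach_prepare_counter();		/* gate PIT channel 2 for CALIBRATE_TIME_MSEC */
	rdtscll(start);
	mach_countup(&count);		/* spin until the 30 ms latch expires */
	rdtscll(end);

	/* TSC ticks per millisecond == TSC frequency in kHz */
	return (unsigned long)((end - start) / CALIBRATE_TIME_MSEC);
}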
45441 Index: head-2008-11-25/include/asm-x86/mach-xen/setup_arch_post.h
45442 ===================================================================
45443 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45444 +++ head-2008-11-25/include/asm-x86/mach-xen/setup_arch_post.h 2007-06-12 13:14:13.000000000 +0200
45445 @@ -0,0 +1,63 @@
45446 +/**
45447 + * machine_specific_* - Hooks for machine specific setup.
45448 + *
45449 + * Description:
45450 + * This is included late in kernel/setup.c so that it can make
45451 + * use of all of the static functions.
45452 + **/
45453 +
45454 +#include <xen/interface/callback.h>
45455 +
45456 +extern void hypervisor_callback(void);
45457 +extern void failsafe_callback(void);
45458 +extern void nmi(void);
45459 +
45460 +static void __init machine_specific_arch_setup(void)
45461 +{
45462 + int ret;
45463 + static struct callback_register __initdata event = {
45464 + .type = CALLBACKTYPE_event,
45465 + .address = (unsigned long) hypervisor_callback,
45466 + };
45467 + static struct callback_register __initdata failsafe = {
45468 + .type = CALLBACKTYPE_failsafe,
45469 + .address = (unsigned long)failsafe_callback,
45470 + };
45471 + static struct callback_register __initdata syscall = {
45472 + .type = CALLBACKTYPE_syscall,
45473 + .address = (unsigned long)system_call,
45474 + };
45475 +#ifdef CONFIG_X86_LOCAL_APIC
45476 + static struct callback_register __initdata nmi_cb = {
45477 + .type = CALLBACKTYPE_nmi,
45478 + .address = (unsigned long)nmi,
45479 + };
45480 +#endif
45481 +
45482 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
45483 + if (ret == 0)
45484 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
45485 + if (ret == 0)
45486 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
45487 +#if CONFIG_XEN_COMPAT <= 0x030002
45488 + if (ret == -ENOSYS)
45489 + ret = HYPERVISOR_set_callbacks(
45490 + event.address,
45491 + failsafe.address,
45492 + syscall.address);
45493 +#endif
45494 + BUG_ON(ret);
45495 +
45496 +#ifdef CONFIG_X86_LOCAL_APIC
45497 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
45498 +#if CONFIG_XEN_COMPAT <= 0x030002
45499 + if (ret == -ENOSYS) {
45500 + static struct xennmi_callback __initdata cb = {
45501 + .handler_address = (unsigned long)nmi
45502 + };
45503 +
45504 + HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
45505 + }
45506 +#endif
45507 +#endif
45508 +}
45509 Index: head-2008-11-25/include/asm-x86/mach-xen/setup_arch_pre.h
45510 ===================================================================
45511 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45512 +++ head-2008-11-25/include/asm-x86/mach-xen/setup_arch_pre.h 2007-06-12 13:14:13.000000000 +0200
45513 @@ -0,0 +1,5 @@
45514 +/* Hook to call BIOS initialisation function */
45515 +
45516 +#define ARCH_SETUP machine_specific_arch_setup();
45517 +
45518 +static void __init machine_specific_arch_setup(void);
45519 Index: head-2008-11-25/include/xen/blkif.h
45520 ===================================================================
45521 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45522 +++ head-2008-11-25/include/xen/blkif.h 2008-07-21 11:00:33.000000000 +0200
45523 @@ -0,0 +1,123 @@
45524 +/*
45525 + * Permission is hereby granted, free of charge, to any person obtaining a copy
45526 + * of this software and associated documentation files (the "Software"), to
45527 + * deal in the Software without restriction, including without limitation the
45528 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
45529 + * sell copies of the Software, and to permit persons to whom the Software is
45530 + * furnished to do so, subject to the following conditions:
45531 + *
45532 + * The above copyright notice and this permission notice shall be included in
45533 + * all copies or substantial portions of the Software.
45534 + *
45535 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45536 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
45537 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45538 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
45539 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
45540 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
45541 + * DEALINGS IN THE SOFTWARE.
45542 + */
45543 +
45544 +#ifndef __XEN_BLKIF_H__
45545 +#define __XEN_BLKIF_H__
45546 +
45547 +#include <xen/interface/io/ring.h>
45548 +#include <xen/interface/io/blkif.h>
45549 +#include <xen/interface/io/protocols.h>
45550 +
45551 +/* Not a real protocol. Used to generate ring structs which contain
45552 + * the elements common to all protocols only. This way we get a
45553 + * compiler-checkable way to use common struct elements, so we can
45554 + * avoid using switch(protocol) in a number of places. */
45555 +struct blkif_common_request {
45556 + char dummy;
45557 +};
45558 +struct blkif_common_response {
45559 + char dummy;
45560 +};
45561 +
45562 +/* i386 protocol version */
45563 +#pragma pack(push, 4)
45564 +struct blkif_x86_32_request {
45565 + uint8_t operation; /* BLKIF_OP_??? */
45566 + uint8_t nr_segments; /* number of segments */
45567 + blkif_vdev_t handle; /* only for read/write requests */
45568 + uint64_t id; /* private guest value, echoed in resp */
45569 + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
45570 + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
45571 +};
45572 +struct blkif_x86_32_response {
45573 + uint64_t id; /* copied from request */
45574 + uint8_t operation; /* copied from request */
45575 + int16_t status; /* BLKIF_RSP_??? */
45576 +};
45577 +typedef struct blkif_x86_32_request blkif_x86_32_request_t;
45578 +typedef struct blkif_x86_32_response blkif_x86_32_response_t;
45579 +#pragma pack(pop)
45580 +
45581 +/* x86_64 protocol version */
45582 +struct blkif_x86_64_request {
45583 + uint8_t operation; /* BLKIF_OP_??? */
45584 + uint8_t nr_segments; /* number of segments */
45585 + blkif_vdev_t handle; /* only for read/write requests */
45586 + uint64_t __attribute__((__aligned__(8))) id;
45587 + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
45588 + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
45589 +};
45590 +struct blkif_x86_64_response {
45591 + uint64_t __attribute__((__aligned__(8))) id;
45592 + uint8_t operation; /* copied from request */
45593 + int16_t status; /* BLKIF_RSP_??? */
45594 +};
45595 +typedef struct blkif_x86_64_request blkif_x86_64_request_t;
45596 +typedef struct blkif_x86_64_response blkif_x86_64_response_t;
45597 +
45598 +DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response);
45599 +DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response);
45600 +DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response);
45601 +
45602 +union blkif_back_rings {
45603 + blkif_back_ring_t native;
45604 + blkif_common_back_ring_t common;
45605 + blkif_x86_32_back_ring_t x86_32;
45606 + blkif_x86_64_back_ring_t x86_64;
45607 +};
45608 +typedef union blkif_back_rings blkif_back_rings_t;
45609 +
45610 +enum blkif_protocol {
45611 + BLKIF_PROTOCOL_NATIVE = 1,
45612 + BLKIF_PROTOCOL_X86_32 = 2,
45613 + BLKIF_PROTOCOL_X86_64 = 3,
45614 +};
45615 +
45616 +static inline void blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src)
45617 +{
45618 + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
45619 + dst->operation = src->operation;
45620 + dst->nr_segments = src->nr_segments;
45621 + dst->handle = src->handle;
45622 + dst->id = src->id;
45623 + dst->sector_number = src->sector_number;
45624 + barrier();
45625 + if (n > dst->nr_segments)
45626 + n = dst->nr_segments;
45627 + for (i = 0; i < n; i++)
45628 + dst->seg[i] = src->seg[i];
45629 +}
45630 +
45631 +static inline void blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src)
45632 +{
45633 + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
45634 + dst->operation = src->operation;
45635 + dst->nr_segments = src->nr_segments;
45636 + dst->handle = src->handle;
45637 + dst->id = src->id;
45638 + dst->sector_number = src->sector_number;
45639 + barrier();
45640 + if (n > dst->nr_segments)
45641 + n = dst->nr_segments;
45642 + for (i = 0; i < n; i++)
45643 + dst->seg[i] = src->seg[i];
45644 +}
45645 +
45646 +#endif /* __XEN_BLKIF_H__ */
45647 Index: head-2008-11-25/include/xen/compat_ioctl.h
45648 ===================================================================
45649 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45650 +++ head-2008-11-25/include/xen/compat_ioctl.h 2007-07-10 09:42:30.000000000 +0200
45651 @@ -0,0 +1,45 @@
45652 +/*
45653 + * This program is free software; you can redistribute it and/or
45654 + * modify it under the terms of the GNU General Public License as
45655 + * published by the Free Software Foundation; either version 2 of the
45656 + * License, or (at your option) any later version.
45657 + *
45658 + * This program is distributed in the hope that it will be useful,
45659 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
45660 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
45661 + * GNU General Public License for more details.
45662 + *
45663 + * You should have received a copy of the GNU General Public License
45664 + * along with this program; if not, write to the Free Software
45665 + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
45666 + *
45667 + * Copyright IBM Corp. 2007
45668 + *
45669 + * Authors: Jimi Xenidis <jimix@watson.ibm.com>
45670 + * Hollis Blanchard <hollisb@us.ibm.com>
45671 + */
45672 +
45673 +#ifndef __LINUX_XEN_COMPAT_H__
45674 +#define __LINUX_XEN_COMPAT_H__
45675 +
45676 +#include <linux/compat.h>
45677 +
45678 +extern int privcmd_ioctl_32(int fd, unsigned int cmd, unsigned long arg);
45679 +struct privcmd_mmap_32 {
45680 + int num;
45681 + domid_t dom;
45682 + compat_uptr_t entry;
45683 +};
45684 +
45685 +struct privcmd_mmapbatch_32 {
45686 + int num; /* number of pages to populate */
45687 + domid_t dom; /* target domain */
45688 + __u64 addr; /* virtual address */
45689 + compat_uptr_t arr; /* array of mfns - top nibble set on err */
45690 +};
45691 +#define IOCTL_PRIVCMD_MMAP_32 \
45692 + _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap_32))
45693 +#define IOCTL_PRIVCMD_MMAPBATCH_32 \
45694 + _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch_32))
45695 +
45696 +#endif /* __LINUX_XEN_COMPAT_H__ */
45697 Index: head-2008-11-25/include/xen/cpu_hotplug.h
45698 ===================================================================
45699 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45700 +++ head-2008-11-25/include/xen/cpu_hotplug.h 2007-08-16 18:07:01.000000000 +0200
45701 @@ -0,0 +1,41 @@
45702 +#ifndef __XEN_CPU_HOTPLUG_H__
45703 +#define __XEN_CPU_HOTPLUG_H__
45704 +
45705 +#include <linux/kernel.h>
45706 +#include <linux/cpumask.h>
45707 +
45708 +#if defined(CONFIG_X86) && defined(CONFIG_SMP)
45709 +extern cpumask_t cpu_initialized_map;
45710 +#endif
45711 +
45712 +#if defined(CONFIG_HOTPLUG_CPU)
45713 +
45714 +int cpu_up_check(unsigned int cpu);
45715 +void init_xenbus_allowed_cpumask(void);
45716 +int smp_suspend(void);
45717 +void smp_resume(void);
45718 +
45719 +void cpu_bringup(void);
45720 +
45721 +#else /* !defined(CONFIG_HOTPLUG_CPU) */
45722 +
45723 +#define cpu_up_check(cpu) (0)
45724 +#define init_xenbus_allowed_cpumask() ((void)0)
45725 +
45726 +static inline int smp_suspend(void)
45727 +{
45728 + if (num_online_cpus() > 1) {
45729 + printk(KERN_WARNING "Can't suspend SMP guests "
45730 + "without CONFIG_HOTPLUG_CPU\n");
45731 + return -EOPNOTSUPP;
45732 + }
45733 + return 0;
45734 +}
45735 +
45736 +static inline void smp_resume(void)
45737 +{
45738 +}
45739 +
45740 +#endif /* !defined(CONFIG_HOTPLUG_CPU) */
45741 +
45742 +#endif /* __XEN_CPU_HOTPLUG_H__ */
45743 Index: head-2008-11-25/include/xen/driver_util.h
45744 ===================================================================
45745 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45746 +++ head-2008-11-25/include/xen/driver_util.h 2007-06-12 13:14:19.000000000 +0200
45747 @@ -0,0 +1,14 @@
45748 +
45749 +#ifndef __ASM_XEN_DRIVER_UTIL_H__
45750 +#define __ASM_XEN_DRIVER_UTIL_H__
45751 +
45752 +#include <linux/vmalloc.h>
45753 +#include <linux/device.h>
45754 +
45755 +/* Allocate/destroy a 'vmalloc' VM area. */
45756 +extern struct vm_struct *alloc_vm_area(unsigned long size);
45757 +extern void free_vm_area(struct vm_struct *area);
45758 +
45759 +extern struct class *get_xen_class(void);
45760 +
45761 +#endif /* __ASM_XEN_DRIVER_UTIL_H__ */
45762 Index: head-2008-11-25/include/xen/evtchn.h
45763 ===================================================================
45764 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45765 +++ head-2008-11-25/include/xen/evtchn.h 2008-09-15 13:40:15.000000000 +0200
45766 @@ -0,0 +1,160 @@
45767 +/******************************************************************************
45768 + * evtchn.h
45769 + *
45770 + * Communication via Xen event channels.
45771 + * Also definitions for the device that demuxes notifications to userspace.
45772 + *
45773 + * Copyright (c) 2004-2005, K A Fraser
45774 + *
45775 + * This program is free software; you can redistribute it and/or
45776 + * modify it under the terms of the GNU General Public License version 2
45777 + * as published by the Free Software Foundation; or, when distributed
45778 + * separately from the Linux kernel or incorporated into other
45779 + * software packages, subject to the following license:
45780 + *
45781 + * Permission is hereby granted, free of charge, to any person obtaining a copy
45782 + * of this source file (the "Software"), to deal in the Software without
45783 + * restriction, including without limitation the rights to use, copy, modify,
45784 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
45785 + * and to permit persons to whom the Software is furnished to do so, subject to
45786 + * the following conditions:
45787 + *
45788 + * The above copyright notice and this permission notice shall be included in
45789 + * all copies or substantial portions of the Software.
45790 + *
45791 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45792 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
45793 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45794 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
45795 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
45796 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
45797 + * IN THE SOFTWARE.
45798 + */
45799 +
45800 +#ifndef __ASM_EVTCHN_H__
45801 +#define __ASM_EVTCHN_H__
45802 +
45803 +#include <linux/interrupt.h>
45804 +#include <asm/hypervisor.h>
45805 +#include <asm/ptrace.h>
45806 +#include <asm/synch_bitops.h>
45807 +#include <xen/interface/event_channel.h>
45808 +#include <linux/smp.h>
45809 +
45810 +/*
45811 + * LOW-LEVEL DEFINITIONS
45812 + */
45813 +
45814 +/*
45815 + * Dynamically bind an event source to an IRQ-like callback handler.
45816 + * On some platforms this may not be implemented via the Linux IRQ subsystem.
45817 + * The IRQ argument passed to the callback handler is the same as returned
45818 + * from the bind call. It may not correspond to a Linux IRQ number.
45819 + * Returns IRQ or negative errno.
45820 + */
45821 +int bind_caller_port_to_irqhandler(
45822 + unsigned int caller_port,
45823 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
45824 + unsigned long irqflags,
45825 + const char *devname,
45826 + void *dev_id);
45827 +int bind_listening_port_to_irqhandler(
45828 + unsigned int remote_domain,
45829 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
45830 + unsigned long irqflags,
45831 + const char *devname,
45832 + void *dev_id);
45833 +int bind_interdomain_evtchn_to_irqhandler(
45834 + unsigned int remote_domain,
45835 + unsigned int remote_port,
45836 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
45837 + unsigned long irqflags,
45838 + const char *devname,
45839 + void *dev_id);
45840 +int bind_virq_to_irqhandler(
45841 + unsigned int virq,
45842 + unsigned int cpu,
45843 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
45844 + unsigned long irqflags,
45845 + const char *devname,
45846 + void *dev_id);
45847 +int bind_ipi_to_irqhandler(
45848 + unsigned int ipi,
45849 + unsigned int cpu,
45850 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
45851 + unsigned long irqflags,
45852 + const char *devname,
45853 + void *dev_id);
45854 +
45855 +/*
45856 + * Common unbind function for all event sources. Takes IRQ to unbind from.
45857 + * Automatically closes the underlying event channel (except for bindings
45858 + * made with bind_caller_port_to_irqhandler()).
45859 + */
45860 +void unbind_from_irqhandler(unsigned int irq, void *dev_id);
45861 +
45862 +void irq_resume(void);
45863 +
45864 +/* Entry point for notifications into Linux subsystems. */
45865 +asmlinkage void evtchn_do_upcall(struct pt_regs *regs);
45866 +
45867 +/* Entry point for notifications into the userland character device. */
45868 +void evtchn_device_upcall(int port);
45869 +
45870 +/* Mark a PIRQ as unavailable for dynamic allocation. */
45871 +void evtchn_register_pirq(int irq);
45872 +/* Map a Xen-supplied PIRQ to a dynamically allocated one. */
45873 +int evtchn_map_pirq(int irq, int xen_pirq);
45874 +/* Look up a Xen-supplied PIRQ for a dynamically allocated one. */
45875 +int evtchn_get_xen_pirq(int irq);
45876 +
45877 +void mask_evtchn(int port);
45878 +void disable_all_local_evtchn(void);
45879 +void unmask_evtchn(int port);
45880 +
45881 +#ifdef CONFIG_SMP
45882 +void rebind_evtchn_to_cpu(int port, unsigned int cpu);
45883 +#else
45884 +#define rebind_evtchn_to_cpu(port, cpu) ((void)0)
45885 +#endif
45886 +
45887 +static inline int test_and_set_evtchn_mask(int port)
45888 +{
45889 + shared_info_t *s = HYPERVISOR_shared_info;
45890 + return synch_test_and_set_bit(port, s->evtchn_mask);
45891 +}
45892 +
45893 +static inline void clear_evtchn(int port)
45894 +{
45895 + shared_info_t *s = HYPERVISOR_shared_info;
45896 + synch_clear_bit(port, s->evtchn_pending);
45897 +}
45898 +
45899 +static inline void notify_remote_via_evtchn(int port)
45900 +{
45901 + struct evtchn_send send = { .port = port };
45902 + VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send));
45903 +}
45904 +
45905 +/*
45906 + * Use these to access the event channel underlying the IRQ handle returned
45907 + * by bind_*_to_irqhandler().
45908 + */
45909 +void notify_remote_via_irq(int irq);
45910 +int irq_to_evtchn_port(int irq);
45911 +
45912 +#define PIRQ_SET_MAPPING 0x0
45913 +#define PIRQ_CLEAR_MAPPING 0x1
45914 +#define PIRQ_GET_MAPPING 0x3
45915 +int pirq_mapstatus(int pirq, int action);
45916 +int set_pirq_hw_action(int pirq, int (*action)(int pirq, int action));
45917 +int clear_pirq_hw_action(int pirq);
45918 +
45919 +#define PIRQ_STARTUP 1
45920 +#define PIRQ_SHUTDOWN 2
45921 +#define PIRQ_ENABLE 3
45922 +#define PIRQ_DISABLE 4
45923 +#define PIRQ_END 5
45924 +#define PIRQ_ACK 6
45925 +
45926 +#endif /* __ASM_EVTCHN_H__ */
45927 Index: head-2008-11-25/include/xen/firmware.h
45928 ===================================================================
45929 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45930 +++ head-2008-11-25/include/xen/firmware.h 2007-07-02 08:16:19.000000000 +0200
45931 @@ -0,0 +1,10 @@
45932 +#ifndef __XEN_FIRMWARE_H__
45933 +#define __XEN_FIRMWARE_H__
45934 +
45935 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
45936 +void copy_edd(void);
45937 +#endif
45938 +
45939 +void copy_edid(void);
45940 +
45941 +#endif /* __XEN_FIRMWARE_H__ */
45942 Index: head-2008-11-25/include/xen/gnttab.h
45943 ===================================================================
45944 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
45945 +++ head-2008-11-25/include/xen/gnttab.h 2008-11-04 11:13:10.000000000 +0100
45946 @@ -0,0 +1,164 @@
45947 +/******************************************************************************
45948 + * gnttab.h
45949 + *
45950 + * Two sets of functionality:
45951 + * 1. Granting foreign access to our memory reservation.
45952 + * 2. Accessing others' memory reservations via grant references.
45953 + * (i.e., mechanisms for both sender and recipient of grant references)
45954 + *
45955 + * Copyright (c) 2004-2005, K A Fraser
45956 + * Copyright (c) 2005, Christopher Clark
45957 + *
45958 + * This program is free software; you can redistribute it and/or
45959 + * modify it under the terms of the GNU General Public License version 2
45960 + * as published by the Free Software Foundation; or, when distributed
45961 + * separately from the Linux kernel or incorporated into other
45962 + * software packages, subject to the following license:
45963 + *
45964 + * Permission is hereby granted, free of charge, to any person obtaining a copy
45965 + * of this source file (the "Software"), to deal in the Software without
45966 + * restriction, including without limitation the rights to use, copy, modify,
45967 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
45968 + * and to permit persons to whom the Software is furnished to do so, subject to
45969 + * the following conditions:
45970 + *
45971 + * The above copyright notice and this permission notice shall be included in
45972 + * all copies or substantial portions of the Software.
45973 + *
45974 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45975 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
45976 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45977 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
45978 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
45979 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
45980 + * IN THE SOFTWARE.
45981 + */
45982 +
45983 +#ifndef __ASM_GNTTAB_H__
45984 +#define __ASM_GNTTAB_H__
45985 +
45986 +#include <asm/hypervisor.h>
45987 +#include <asm/maddr.h> /* maddr_t */
45988 +#include <linux/mm.h>
45989 +#include <xen/interface/grant_table.h>
45990 +#include <xen/features.h>
45991 +
45992 +struct gnttab_free_callback {
45993 + struct gnttab_free_callback *next;
45994 + void (*fn)(void *);
45995 + void *arg;
45996 + u16 count;
45997 + u8 queued;
45998 +};
45999 +
46000 +int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
46001 + int flags);
46002 +
46003 +/*
46004 + * End access through the given grant reference, iff the grant entry is no
46005 + * longer in use. Return 1 if the grant entry was freed, 0 if it is still in
46006 + * use.
46007 + */
46008 +int gnttab_end_foreign_access_ref(grant_ref_t ref);
46009 +
46010 +/*
46011 + * Eventually end access through the given grant reference, and once that
46012 + * access has been ended, free the given page too. Access will be ended
46013 + * immediately iff the grant entry is not in use, otherwise it will happen
46014 + * some time later. page may be 0, in which case no freeing will occur.
46015 + */
46016 +void gnttab_end_foreign_access(grant_ref_t ref, unsigned long page);
46017 +
46018 +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
46019 +
46020 +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
46021 +unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
46022 +
46023 +int gnttab_query_foreign_access(grant_ref_t ref);
46024 +
46025 +/*
46026 + * operations on reserved batches of grant references
46027 + */
46028 +int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
46029 +
46030 +void gnttab_free_grant_reference(grant_ref_t ref);
46031 +
46032 +void gnttab_free_grant_references(grant_ref_t head);
46033 +
46034 +int gnttab_empty_grant_references(const grant_ref_t *pprivate_head);
46035 +
46036 +int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
46037 +
46038 +void gnttab_release_grant_reference(grant_ref_t *private_head,
46039 + grant_ref_t release);
46040 +
46041 +void gnttab_request_free_callback(struct gnttab_free_callback *callback,
46042 + void (*fn)(void *), void *arg, u16 count);
46043 +void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
46044 +
46045 +void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
46046 + unsigned long frame, int flags);
46047 +
46048 +void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
46049 + unsigned long pfn);
46050 +
46051 +int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep);
46052 +void __gnttab_dma_map_page(struct page *page);
46053 +static inline void __gnttab_dma_unmap_page(struct page *page)
46054 +{
46055 +}
46056 +
46057 +void gnttab_reset_grant_page(struct page *page);
46058 +
46059 +int gnttab_suspend(void);
46060 +int gnttab_resume(void);
46061 +
46062 +void *arch_gnttab_alloc_shared(unsigned long *frames);
46063 +
46064 +static inline void
46065 +gnttab_set_map_op(struct gnttab_map_grant_ref *map, maddr_t addr,
46066 + uint32_t flags, grant_ref_t ref, domid_t domid)
46067 +{
46068 + if (flags & GNTMAP_contains_pte)
46069 + map->host_addr = addr;
46070 + else if (xen_feature(XENFEAT_auto_translated_physmap))
46071 + map->host_addr = __pa(addr);
46072 + else
46073 + map->host_addr = addr;
46074 +
46075 + map->flags = flags;
46076 + map->ref = ref;
46077 + map->dom = domid;
46078 +}
46079 +
46080 +static inline void
46081 +gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, maddr_t addr,
46082 + uint32_t flags, grant_handle_t handle)
46083 +{
46084 + if (flags & GNTMAP_contains_pte)
46085 + unmap->host_addr = addr;
46086 + else if (xen_feature(XENFEAT_auto_translated_physmap))
46087 + unmap->host_addr = __pa(addr);
46088 + else
46089 + unmap->host_addr = addr;
46090 +
46091 + unmap->handle = handle;
46092 + unmap->dev_bus_addr = 0;
46093 +}
46094 +
46095 +static inline void
46096 +gnttab_set_replace_op(struct gnttab_unmap_and_replace *unmap, maddr_t addr,
46097 + maddr_t new_addr, grant_handle_t handle)
46098 +{
46099 + if (xen_feature(XENFEAT_auto_translated_physmap)) {
46100 + unmap->host_addr = __pa(addr);
46101 + unmap->new_addr = __pa(new_addr);
46102 + } else {
46103 + unmap->host_addr = addr;
46104 + unmap->new_addr = new_addr;
46105 + }
46106 +
46107 + unmap->handle = handle;
46108 +}
46109 +
46110 +#endif /* __ASM_GNTTAB_H__ */
46111 Index: head-2008-11-25/include/xen/hvm.h
46112 ===================================================================
46113 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46114 +++ head-2008-11-25/include/xen/hvm.h 2007-06-12 13:14:19.000000000 +0200
46115 @@ -0,0 +1,23 @@
46116 +/* Simple wrappers around HVM functions */
46117 +#ifndef XEN_HVM_H__
46118 +#define XEN_HVM_H__
46119 +
46120 +#include <xen/interface/hvm/params.h>
46121 +
46122 +static inline unsigned long hvm_get_parameter(int idx)
46123 +{
46124 + struct xen_hvm_param xhv;
46125 + int r;
46126 +
46127 + xhv.domid = DOMID_SELF;
46128 + xhv.index = idx;
46129 + r = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv);
46130 + if (r < 0) {
46131 + printk(KERN_ERR "cannot get hvm parameter %d: %d.\n",
46132 + idx, r);
46133 + return 0;
46134 + }
46135 + return xhv.value;
46136 +}
46137 +
46138 +#endif /* XEN_HVM_H__ */
46139 Index: head-2008-11-25/include/xen/hypercall.h
46140 ===================================================================
46141 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46142 +++ head-2008-11-25/include/xen/hypercall.h 2008-01-28 12:24:19.000000000 +0100
46143 @@ -0,0 +1,30 @@
46144 +#ifndef __XEN_HYPERCALL_H__
46145 +#define __XEN_HYPERCALL_H__
46146 +
46147 +#include <asm/hypercall.h>
46148 +
46149 +static inline int __must_check
46150 +HYPERVISOR_multicall_check(
46151 + multicall_entry_t *call_list, unsigned int nr_calls,
46152 + const unsigned long *rc_list)
46153 +{
46154 + int rc = HYPERVISOR_multicall(call_list, nr_calls);
46155 +
46156 + if (unlikely(rc < 0))
46157 + return rc;
46158 + BUG_ON(rc);
46159 + BUG_ON((int)nr_calls < 0);
46160 +
46161 + for ( ; nr_calls > 0; --nr_calls, ++call_list)
46162 + if (unlikely(call_list->result != (rc_list ? *rc_list++ : 0)))
46163 + return nr_calls;
46164 +
46165 + return 0;
46166 +}
46167 +
46168 +/* A construct to ignore the return value of hypercall wrappers in a few
46169 + * exceptional cases (simply casting the function result to void doesn't
46170 + * avoid the compiler warning): */
46171 +#define VOID(expr) ((void)((expr)?:0))
46172 +
46173 +#endif /* __XEN_HYPERCALL_H__ */
46174 Index: head-2008-11-25/include/xen/hypervisor_sysfs.h
46175 ===================================================================
46176 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46177 +++ head-2008-11-25/include/xen/hypervisor_sysfs.h 2007-06-22 09:08:06.000000000 +0200
46178 @@ -0,0 +1,30 @@
46179 +/*
46180 + * copyright (c) 2006 IBM Corporation
46181 + * Authored by: Mike D. Day <ncmike@us.ibm.com>
46182 + *
46183 + * This program is free software; you can redistribute it and/or modify
46184 + * it under the terms of the GNU General Public License version 2 as
46185 + * published by the Free Software Foundation.
46186 + */
46187 +
46188 +#ifndef _HYP_SYSFS_H_
46189 +#define _HYP_SYSFS_H_
46190 +
46191 +#include <linux/kobject.h>
46192 +#include <linux/sysfs.h>
46193 +
46194 +#define HYPERVISOR_ATTR_RO(_name) \
46195 +static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name)
46196 +
46197 +#define HYPERVISOR_ATTR_RW(_name) \
46198 +static struct hyp_sysfs_attr _name##_attr = \
46199 + __ATTR(_name, 0644, _name##_show, _name##_store)
46200 +
46201 +struct hyp_sysfs_attr {
46202 + struct attribute attr;
46203 + ssize_t (*show)(struct hyp_sysfs_attr *, char *);
46204 + ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
46205 + void *hyp_attr_data;
46206 +};
46207 +
46208 +#endif /* _HYP_SYSFS_H_ */
46209 Index: head-2008-11-25/include/xen/pcifront.h
46210 ===================================================================
46211 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46212 +++ head-2008-11-25/include/xen/pcifront.h 2007-06-18 08:38:13.000000000 +0200
46213 @@ -0,0 +1,83 @@
46214 +/*
46215 + * PCI Frontend - arch-dependent declarations
46216 + *
46217 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
46218 + */
46219 +#ifndef __XEN_ASM_PCIFRONT_H__
46220 +#define __XEN_ASM_PCIFRONT_H__
46221 +
46222 +#include <linux/spinlock.h>
46223 +
46224 +#ifdef __KERNEL__
46225 +
46226 +#ifndef __ia64__
46227 +
46228 +struct pcifront_device;
46229 +struct pci_bus;
46230 +
46231 +struct pcifront_sd {
46232 + int domain;
46233 + struct pcifront_device *pdev;
46234 +};
46235 +
46236 +static inline struct pcifront_device *
46237 +pcifront_get_pdev(struct pcifront_sd *sd)
46238 +{
46239 + return sd->pdev;
46240 +}
46241 +
46242 +static inline void pcifront_init_sd(struct pcifront_sd *sd,
46243 + unsigned int domain, unsigned int bus,
46244 + struct pcifront_device *pdev)
46245 +{
46246 + sd->domain = domain;
46247 + sd->pdev = pdev;
46248 +}
46249 +
46250 +#if defined(CONFIG_PCI_DOMAINS)
46251 +static inline int pci_domain_nr(struct pci_bus *bus)
46252 +{
46253 + struct pcifront_sd *sd = bus->sysdata;
46254 + return sd->domain;
46255 +}
46256 +static inline int pci_proc_domain(struct pci_bus *bus)
46257 +{
46258 + return pci_domain_nr(bus);
46259 +}
46260 +#endif /* CONFIG_PCI_DOMAINS */
46261 +
46262 +static inline void pcifront_setup_root_resources(struct pci_bus *bus,
46263 + struct pcifront_sd *sd)
46264 +{
46265 +}
46266 +
46267 +#else /* __ia64__ */
46268 +
46269 +#include <linux/acpi.h>
46270 +#include <asm/pci.h>
46271 +#define pcifront_sd pci_controller
46272 +
46273 +extern void xen_add_resource(struct pci_controller *, unsigned int,
46274 + unsigned int, struct acpi_resource *);
46275 +extern void xen_pcibios_setup_root_windows(struct pci_bus *,
46276 + struct pci_controller *);
46277 +
46278 +static inline struct pcifront_device *
46279 +pcifront_get_pdev(struct pcifront_sd *sd)
46280 +{
46281 + return (struct pcifront_device *)sd->platform_data;
46282 +}
46283 +
46284 +static inline void pcifront_setup_root_resources(struct pci_bus *bus,
46285 + struct pcifront_sd *sd)
46286 +{
46287 + xen_pcibios_setup_root_windows(bus, sd);
46288 +}
46289 +
46290 +#endif /* __ia64__ */
46291 +
46292 +extern struct rw_semaphore pci_bus_sem;
46293 +
46294 +#endif /* __KERNEL__ */
46295 +
46296 +#endif /* __XEN_ASM_PCIFRONT_H__ */
46297 Index: head-2008-11-25/include/xen/public/evtchn.h
46298 ===================================================================
46299 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46300 +++ head-2008-11-25/include/xen/public/evtchn.h 2007-06-12 13:14:19.000000000 +0200
46301 @@ -0,0 +1,88 @@
46302 +/******************************************************************************
46303 + * evtchn.h
46304 + *
46305 + * Interface to /dev/xen/evtchn.
46306 + *
46307 + * Copyright (c) 2003-2005, K A Fraser
46308 + *
46309 + * This program is free software; you can redistribute it and/or
46310 + * modify it under the terms of the GNU General Public License version 2
46311 + * as published by the Free Software Foundation; or, when distributed
46312 + * separately from the Linux kernel or incorporated into other
46313 + * software packages, subject to the following license:
46314 + *
46315 + * Permission is hereby granted, free of charge, to any person obtaining a copy
46316 + * of this source file (the "Software"), to deal in the Software without
46317 + * restriction, including without limitation the rights to use, copy, modify,
46318 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
46319 + * and to permit persons to whom the Software is furnished to do so, subject to
46320 + * the following conditions:
46321 + *
46322 + * The above copyright notice and this permission notice shall be included in
46323 + * all copies or substantial portions of the Software.
46324 + *
46325 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46326 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46327 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
46328 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46329 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
46330 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
46331 + * IN THE SOFTWARE.
46332 + */
46333 +
46334 +#ifndef __LINUX_PUBLIC_EVTCHN_H__
46335 +#define __LINUX_PUBLIC_EVTCHN_H__
46336 +
46337 +/*
46338 + * Bind a fresh port to VIRQ @virq.
46339 + * Return allocated port.
46340 + */
46341 +#define IOCTL_EVTCHN_BIND_VIRQ \
46342 + _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
46343 +struct ioctl_evtchn_bind_virq {
46344 + unsigned int virq;
46345 +};
46346 +
46347 +/*
46348 + * Bind a fresh port to remote <@remote_domain, @remote_port>.
46349 + * Return allocated port.
46350 + */
46351 +#define IOCTL_EVTCHN_BIND_INTERDOMAIN \
46352 + _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
46353 +struct ioctl_evtchn_bind_interdomain {
46354 + unsigned int remote_domain, remote_port;
46355 +};
46356 +
46357 +/*
46358 + * Allocate a fresh port for binding to @remote_domain.
46359 + * Return allocated port.
46360 + */
46361 +#define IOCTL_EVTCHN_BIND_UNBOUND_PORT \
46362 + _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
46363 +struct ioctl_evtchn_bind_unbound_port {
46364 + unsigned int remote_domain;
46365 +};
46366 +
46367 +/*
46368 + * Unbind previously allocated @port.
46369 + */
46370 +#define IOCTL_EVTCHN_UNBIND \
46371 + _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
46372 +struct ioctl_evtchn_unbind {
46373 + unsigned int port;
46374 +};
46375 +
46376 +/*
46377 + * Send event to previously allocated @port.
46378 + */
46379 +#define IOCTL_EVTCHN_NOTIFY \
46380 + _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
46381 +struct ioctl_evtchn_notify {
46382 + unsigned int port;
46383 +};
46384 +
46385 +/* Clear and reinitialise the event buffer. Clear error condition. */
46386 +#define IOCTL_EVTCHN_RESET \
46387 + _IOC(_IOC_NONE, 'E', 5, 0)
46388 +
46389 +#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
46390 Index: head-2008-11-25/include/xen/public/gntdev.h
46391 ===================================================================
46392 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46393 +++ head-2008-11-25/include/xen/public/gntdev.h 2008-04-02 12:34:02.000000000 +0200
46394 @@ -0,0 +1,119 @@
46395 +/******************************************************************************
46396 + * gntdev.h
46397 + *
46398 + * Interface to /dev/xen/gntdev.
46399 + *
46400 + * Copyright (c) 2007, D G Murray
46401 + *
46402 + * This program is free software; you can redistribute it and/or
46403 + * modify it under the terms of the GNU General Public License version 2
46404 + * as published by the Free Software Foundation; or, when distributed
46405 + * separately from the Linux kernel or incorporated into other
46406 + * software packages, subject to the following license:
46407 + *
46408 + * Permission is hereby granted, free of charge, to any person obtaining a copy
46409 + * of this source file (the "Software"), to deal in the Software without
46410 + * restriction, including without limitation the rights to use, copy, modify,
46411 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
46412 + * and to permit persons to whom the Software is furnished to do so, subject to
46413 + * the following conditions:
46414 + *
46415 + * The above copyright notice and this permission notice shall be included in
46416 + * all copies or substantial portions of the Software.
46417 + *
46418 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46419 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46420 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
46421 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46422 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
46423 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
46424 + * IN THE SOFTWARE.
46425 + */
46426 +
46427 +#ifndef __LINUX_PUBLIC_GNTDEV_H__
46428 +#define __LINUX_PUBLIC_GNTDEV_H__
46429 +
46430 +struct ioctl_gntdev_grant_ref {
46431 + /* The domain ID of the grant to be mapped. */
46432 + uint32_t domid;
46433 + /* The grant reference of the grant to be mapped. */
46434 + uint32_t ref;
46435 +};
46436 +
46437 +/*
46438 + * Inserts the grant references into the mapping table of an instance
46439 + * of gntdev. N.B. This does not perform the mapping, which is deferred
46440 + * until mmap() is called with @index as the offset.
46441 + */
46442 +#define IOCTL_GNTDEV_MAP_GRANT_REF \
46443 +_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
46444 +struct ioctl_gntdev_map_grant_ref {
46445 + /* IN parameters */
46446 + /* The number of grants to be mapped. */
46447 + uint32_t count;
46448 + uint32_t pad;
46449 + /* OUT parameters */
46450 + /* The offset to be used on a subsequent call to mmap(). */
46451 + uint64_t index;
46452 + /* Variable IN parameter. */
46453 + /* Array of grant references, of size @count. */
46454 + struct ioctl_gntdev_grant_ref refs[1];
46455 +};
46456 +
46457 +/*
46458 + * Removes the grant references from the mapping table of an instance of
46459 + * gntdev. N.B. munmap() must be called on the relevant virtual address(es)
46460 + * before this ioctl is called, or an error will result.
46461 + */
46462 +#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
46463 +_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
46464 +struct ioctl_gntdev_unmap_grant_ref {
46465 + /* IN parameters */
46466 + /* The offset was returned by the corresponding map operation. */
46467 + uint64_t index;
46468 + /* The number of pages to be unmapped. */
46469 + uint32_t count;
46470 + uint32_t pad;
46471 +};
46472 +
46473 +/*
46474 + * Returns the offset in the driver's address space that corresponds
46475 + * to @vaddr. This can be used to perform a munmap(), followed by an
46476 + * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
46477 + * the caller. The number of pages that were allocated at the same time as
46478 + * @vaddr is returned in @count.
46479 + *
46480 + * N.B. Where more than one page has been mapped into a contiguous range, the
46481 + * supplied @vaddr must correspond to the start of the range; otherwise
46482 + * an error will result. It is only possible to munmap() the entire
46483 + * contiguously-allocated range at once, and not any subrange thereof.
46484 + */
46485 +#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
46486 +_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
46487 +struct ioctl_gntdev_get_offset_for_vaddr {
46488 + /* IN parameters */
46489 + /* The virtual address of the first mapped page in a range. */
46490 + uint64_t vaddr;
46491 + /* OUT parameters */
46492 + /* The offset that was used in the initial mmap() operation. */
46493 + uint64_t offset;
46494 + /* The number of pages mapped in the VM area that begins at @vaddr. */
46495 + uint32_t count;
46496 + uint32_t pad;
46497 +};
46498 +
46499 +/*
46500 + * Sets the maximum number of grants that may be mapped at once by this gntdev
46501 + * instance.
46502 + *
46503 + * N.B. This must be called before any other ioctl is performed on the device.
46504 + */
46505 +#define IOCTL_GNTDEV_SET_MAX_GRANTS \
46506 +_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
46507 +struct ioctl_gntdev_set_max_grants {
46508 + /* IN parameter */
46509 + /* The maximum number of grants that may be mapped at once. */
46510 + uint32_t count;
46511 +};
46512 +
46513 +#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
46514 Index: head-2008-11-25/include/xen/public/privcmd.h
46515 ===================================================================
46516 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46517 +++ head-2008-11-25/include/xen/public/privcmd.h 2007-06-12 13:14:19.000000000 +0200
46518 @@ -0,0 +1,79 @@
46519 +/******************************************************************************
46520 + * privcmd.h
46521 + *
46522 + * Interface to /proc/xen/privcmd.
46523 + *
46524 + * Copyright (c) 2003-2005, K A Fraser
46525 + *
46526 + * This program is free software; you can redistribute it and/or
46527 + * modify it under the terms of the GNU General Public License version 2
46528 + * as published by the Free Software Foundation; or, when distributed
46529 + * separately from the Linux kernel or incorporated into other
46530 + * software packages, subject to the following license:
46531 + *
46532 + * Permission is hereby granted, free of charge, to any person obtaining a copy
46533 + * of this source file (the "Software"), to deal in the Software without
46534 + * restriction, including without limitation the rights to use, copy, modify,
46535 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
46536 + * and to permit persons to whom the Software is furnished to do so, subject to
46537 + * the following conditions:
46538 + *
46539 + * The above copyright notice and this permission notice shall be included in
46540 + * all copies or substantial portions of the Software.
46541 + *
46542 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46543 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46544 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
46545 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46546 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
46547 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
46548 + * IN THE SOFTWARE.
46549 + */
46550 +
46551 +#ifndef __LINUX_PUBLIC_PRIVCMD_H__
46552 +#define __LINUX_PUBLIC_PRIVCMD_H__
46553 +
46554 +#include <linux/types.h>
46555 +
46556 +#ifndef __user
46557 +#define __user
46558 +#endif
46559 +
46560 +typedef struct privcmd_hypercall
46561 +{
46562 + __u64 op;
46563 + __u64 arg[5];
46564 +} privcmd_hypercall_t;
46565 +
46566 +typedef struct privcmd_mmap_entry {
46567 + __u64 va;
46568 + __u64 mfn;
46569 + __u64 npages;
46570 +} privcmd_mmap_entry_t;
46571 +
46572 +typedef struct privcmd_mmap {
46573 + int num;
46574 + domid_t dom; /* target domain */
46575 + privcmd_mmap_entry_t __user *entry;
46576 +} privcmd_mmap_t;
46577 +
46578 +typedef struct privcmd_mmapbatch {
46579 + int num; /* number of pages to populate */
46580 + domid_t dom; /* target domain */
46581 + __u64 addr; /* virtual address */
46582 + xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
46583 +} privcmd_mmapbatch_t;
46584 +
46585 +/*
46586 + * @cmd: IOCTL_PRIVCMD_HYPERCALL
46587 + * @arg: &privcmd_hypercall_t
46588 + * Return: Value returned from execution of the specified hypercall.
46589 + */
46590 +#define IOCTL_PRIVCMD_HYPERCALL \
46591 + _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t))
46592 +#define IOCTL_PRIVCMD_MMAP \
46593 + _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t))
46594 +#define IOCTL_PRIVCMD_MMAPBATCH \
46595 + _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t))
46596 +
46597 +#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
46598 Index: head-2008-11-25/include/xen/xen_proc.h
46599 ===================================================================
46600 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46601 +++ head-2008-11-25/include/xen/xen_proc.h 2007-06-12 13:14:19.000000000 +0200
46602 @@ -0,0 +1,12 @@
46603 +
46604 +#ifndef __ASM_XEN_PROC_H__
46605 +#define __ASM_XEN_PROC_H__
46606 +
46607 +#include <linux/proc_fs.h>
46608 +
46609 +extern struct proc_dir_entry *create_xen_proc_entry(
46610 + const char *name, mode_t mode);
46611 +extern void remove_xen_proc_entry(
46612 + const char *name);
46613 +
46614 +#endif /* __ASM_XEN_PROC_H__ */
46615 Index: head-2008-11-25/include/xen/xencons.h
46616 ===================================================================
46617 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46618 +++ head-2008-11-25/include/xen/xencons.h 2007-10-15 09:39:38.000000000 +0200
46619 @@ -0,0 +1,17 @@
46620 +#ifndef __ASM_XENCONS_H__
46621 +#define __ASM_XENCONS_H__
46622 +
46623 +struct dom0_vga_console_info;
46624 +void dom0_init_screen_info(const struct dom0_vga_console_info *, size_t);
46625 +
46626 +void xencons_force_flush(void);
46627 +void xencons_resume(void);
46628 +
46629 +/* Interrupt work hooks. Receive data, or kick data out. */
46630 +void xencons_rx(char *buf, unsigned len, struct pt_regs *regs);
46631 +void xencons_tx(void);
46632 +
46633 +int xencons_ring_init(void);
46634 +int xencons_ring_send(const char *data, unsigned len);
46635 +
46636 +#endif /* __ASM_XENCONS_H__ */
46637 Index: head-2008-11-25/include/xen/xenoprof.h
46638 ===================================================================
46639 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46640 +++ head-2008-11-25/include/xen/xenoprof.h 2007-06-12 13:14:19.000000000 +0200
46641 @@ -0,0 +1,42 @@
46642 +/******************************************************************************
46643 + * xen/xenoprof.h
46644 + *
46645 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
46646 + * VA Linux Systems Japan K.K.
46647 + *
46648 + * This program is free software; you can redistribute it and/or modify
46649 + * it under the terms of the GNU General Public License as published by
46650 + * the Free Software Foundation; either version 2 of the License, or
46651 + * (at your option) any later version.
46652 + *
46653 + * This program is distributed in the hope that it will be useful,
46654 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
46655 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
46656 + * GNU General Public License for more details.
46657 + *
46658 + * You should have received a copy of the GNU General Public License
46659 + * along with this program; if not, write to the Free Software
46660 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
46661 + *
46662 + */
46663 +
46664 +#ifndef __XEN_XENOPROF_H__
46665 +#define __XEN_XENOPROF_H__
46666 +#ifdef CONFIG_XEN
46667 +
46668 +#include <asm/xenoprof.h>
46669 +
46670 +struct oprofile_operations;
46671 +int xenoprofile_init(struct oprofile_operations * ops);
46672 +void xenoprofile_exit(void);
46673 +
46674 +struct xenoprof_shared_buffer {
46675 + char *buffer;
46676 + struct xenoprof_arch_shared_buffer arch;
46677 +};
46678 +#else
46679 +#define xenoprofile_init(ops) (-ENOSYS)
46680 +#define xenoprofile_exit() do { } while (0)
46681 +
46682 +#endif /* CONFIG_XEN */
46683 +#endif /* __XEN_XENOPROF_H__ */
46684 Index: head-2008-11-25/lib/swiotlb-xen.c
46685 ===================================================================
46686 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
46687 +++ head-2008-11-25/lib/swiotlb-xen.c 2008-09-15 13:40:15.000000000 +0200
46688 @@ -0,0 +1,739 @@
46689 +/*
46690 + * Dynamic DMA mapping support.
46691 + *
46692 + * This implementation is a fallback for platforms that do not support
46693 + * I/O TLBs (aka DMA address translation hardware).
46694 + * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
46695 + * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
46696 + * Copyright (C) 2000, 2003 Hewlett-Packard Co
46697 + * David Mosberger-Tang <davidm@hpl.hp.com>
46698 + * Copyright (C) 2005 Keir Fraser <keir@xensource.com>
46699 + */
46700 +
46701 +#include <linux/cache.h>
46702 +#include <linux/mm.h>
46703 +#include <linux/module.h>
46704 +#include <linux/pci.h>
46705 +#include <linux/spinlock.h>
46706 +#include <linux/string.h>
46707 +#include <linux/types.h>
46708 +#include <linux/ctype.h>
46709 +#include <linux/init.h>
46710 +#include <linux/bootmem.h>
46711 +#include <linux/highmem.h>
46712 +#include <asm/io.h>
46713 +#include <asm/pci.h>
46714 +#include <asm/dma.h>
46715 +#include <asm/uaccess.h>
46716 +#include <xen/gnttab.h>
46717 +#include <xen/interface/memory.h>
46718 +#include <asm-i386/mach-xen/asm/gnttab_dma.h>
46719 +
46720 +int swiotlb;
46721 +EXPORT_SYMBOL(swiotlb);
46722 +
46723 +#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
46724 +
46725 +/*
46726 + * Maximum allowable number of contiguous slabs to map,
46727 + * must be a power of 2. What is the appropriate value?
46728 + * The complexity of {map,unmap}_single is linearly dependent on this value.
46729 + */
46730 +#define IO_TLB_SEGSIZE 128
46731 +
46732 +/*
46733 + * log of the size of each IO TLB slab. The number of slabs is command line
46734 + * controllable.
46735 + */
46736 +#define IO_TLB_SHIFT 11
46737 +
46738 +int swiotlb_force;
46739 +
46740 +static char *iotlb_virt_start;
46741 +static unsigned long iotlb_nslabs;
46742 +
46743 +/*
46744 + * Used to do a quick range check in swiotlb_unmap_single and
46745 + * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
46746 + * API.
46747 + */
46748 +static unsigned long iotlb_pfn_start, iotlb_pfn_end;
46749 +
46750 +/* Does the given dma address reside within the swiotlb aperture? */
46751 +static inline int in_swiotlb_aperture(dma_addr_t dev_addr)
46752 +{
46753 + unsigned long pfn = mfn_to_local_pfn(dev_addr >> PAGE_SHIFT);
46754 + return (pfn_valid(pfn)
46755 + && (pfn >= iotlb_pfn_start)
46756 + && (pfn < iotlb_pfn_end));
46757 +}
46758 +
46759 +/*
46760 + * When the IOMMU overflows we return a fallback buffer. This sets the size.
46761 + */
46762 +static unsigned long io_tlb_overflow = 32*1024;
46763 +
46764 +void *io_tlb_overflow_buffer;
46765 +
46766 +/*
46767 + * This is a free list describing the number of free entries available from
46768 + * each index
46769 + */
46770 +static unsigned int *io_tlb_list;
46771 +static unsigned int io_tlb_index;
46772 +
46773 +/*
46774 + * We need to save away the original address corresponding to a mapped entry
46775 + * for the sync operations.
46776 + */
46777 +static struct phys_addr {
46778 + struct page *page;
46779 + unsigned int offset;
46780 +} *io_tlb_orig_addr;
46781 +
46782 +/*
46783 + * Protect the above data structures in the map and unmap calls
46784 + */
46785 +static DEFINE_SPINLOCK(io_tlb_lock);
46786 +
46787 +static unsigned int dma_bits;
46788 +static unsigned int __initdata max_dma_bits = 32;
46789 +static int __init
46790 +setup_dma_bits(char *str)
46791 +{
46792 + max_dma_bits = simple_strtoul(str, NULL, 0);
46793 + return 0;
46794 +}
46795 +__setup("dma_bits=", setup_dma_bits);
46796 +
46797 +static int __init
46798 +setup_io_tlb_npages(char *str)
46799 +{
46800 + /* Unlike ia64, the size is the aperture size in megabytes, not 'slabs'! */
46801 + if (isdigit(*str)) {
46802 + iotlb_nslabs = simple_strtoul(str, &str, 0) <<
46803 + (20 - IO_TLB_SHIFT);
46804 + iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
46805 + }
46806 + if (*str == ',')
46807 + ++str;
46808 + /*
46809 + * NB. 'force' enables the swiotlb, but doesn't force its use for
46810 + * every DMA like it does on native Linux. 'off' forcibly disables
46811 + * use of the swiotlb.
46812 + */
46813 + if (!strcmp(str, "force"))
46814 + swiotlb_force = 1;
46815 + else if (!strcmp(str, "off"))
46816 + swiotlb_force = -1;
46817 + return 1;
46818 +}
46819 +__setup("swiotlb=", setup_io_tlb_npages);
46820 +/* make io_tlb_overflow tunable too? */
46821 +
46822 +/*
46823 + * Statically reserve bounce buffer space and initialize bounce buffer data
46824 + * structures for the software IO TLB used to implement the PCI DMA API.
46825 + */
46826 +void
46827 +swiotlb_init_with_default_size (size_t default_size)
46828 +{
46829 + unsigned long i, bytes;
46830 + int rc;
46831 +
46832 + if (!iotlb_nslabs) {
46833 + iotlb_nslabs = (default_size >> IO_TLB_SHIFT);
46834 + iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
46835 + }
46836 +
46837 + bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT);
46838 +
46839 + /*
46840 + * Get IO TLB memory from the low pages
46841 + */
46842 + iotlb_virt_start = alloc_bootmem_low_pages(bytes);
46843 + if (!iotlb_virt_start)
46844 + panic("Cannot allocate SWIOTLB buffer!\n");
46845 +
46846 + dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
46847 + for (i = 0; i < iotlb_nslabs; i += IO_TLB_SEGSIZE) {
46848 + do {
46849 + rc = xen_create_contiguous_region(
46850 + (unsigned long)iotlb_virt_start + (i << IO_TLB_SHIFT),
46851 + get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT),
46852 + dma_bits);
46853 + } while (rc && dma_bits++ < max_dma_bits);
46854 + if (rc) {
46855 + if (i == 0)
46856 + panic("No suitable physical memory available for SWIOTLB buffer!\n"
46857 + "Use dom0_mem Xen boot parameter to reserve\n"
46858 + "some DMA memory (e.g., dom0_mem=-128M).\n");
46859 + iotlb_nslabs = i;
46860 + i <<= IO_TLB_SHIFT;
46861 + free_bootmem(__pa(iotlb_virt_start + i), bytes - i);
46862 + bytes = i;
46863 + for (dma_bits = 0; i > 0; i -= IO_TLB_SEGSIZE << IO_TLB_SHIFT) {
46864 + unsigned int bits = fls64(virt_to_bus(iotlb_virt_start + i - 1));
46865 +
46866 + if (bits > dma_bits)
46867 + dma_bits = bits;
46868 + }
46869 + break;
46870 + }
46871 + }
46872 +
46873 + /*
46874 + * Allocate and initialize the free list array. This array is used
46875 + * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE.
46876 + */
46877 + io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int));
46878 + for (i = 0; i < iotlb_nslabs; i++)
46879 + io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
46880 + io_tlb_index = 0;
46881 + io_tlb_orig_addr = alloc_bootmem(
46882 + iotlb_nslabs * sizeof(*io_tlb_orig_addr));
46883 +
46884 + /*
46885 + * Get the overflow emergency buffer
46886 + */
46887 + io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
46888 + if (!io_tlb_overflow_buffer)
46889 + panic("Cannot allocate SWIOTLB overflow buffer!\n");
46890 +
46891 + do {
46892 + rc = xen_create_contiguous_region(
46893 + (unsigned long)io_tlb_overflow_buffer,
46894 + get_order(io_tlb_overflow),
46895 + dma_bits);
46896 + } while (rc && dma_bits++ < max_dma_bits);
46897 + if (rc)
46898 + panic("No suitable physical memory available for SWIOTLB overflow buffer!\n");
46899 +
46900 + iotlb_pfn_start = __pa(iotlb_virt_start) >> PAGE_SHIFT;
46901 + iotlb_pfn_end = iotlb_pfn_start + (bytes >> PAGE_SHIFT);
46902 +
46903 + printk(KERN_INFO "Software IO TLB enabled: \n"
46904 + " Aperture: %lu megabytes\n"
46905 + " Kernel range: %p - %p\n"
46906 + " Address size: %u bits\n",
46907 + bytes >> 20,
46908 + iotlb_virt_start, iotlb_virt_start + bytes,
46909 + dma_bits);
46910 +}
46911 +
46912 +void
46913 +swiotlb_init(void)
46914 +{
46915 + long ram_end;
46916 + size_t defsz = 64 * (1 << 20); /* 64MB default size */
46917 +
46918 + if (swiotlb_force == 1) {
46919 + swiotlb = 1;
46920 + } else if ((swiotlb_force != -1) &&
46921 + is_running_on_xen() &&
46922 + is_initial_xendomain()) {
46923 + /* Domain 0 always has a swiotlb. */
46924 + ram_end = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
46925 + if (ram_end <= 0x7ffff)
46926 + defsz = 2 * (1 << 20); /* 2MB on systems with <2GB RAM. */
46927 + swiotlb = 1;
46928 + }
46929 +
46930 + if (swiotlb)
46931 + swiotlb_init_with_default_size(defsz);
46932 + else
46933 + printk(KERN_INFO "Software IO TLB disabled\n");
46934 +}
46935 +
46936 +/*
46937 + * We use __copy_to_user_inatomic to transfer to the host buffer because the
46938 + * buffer may be mapped read-only (e.g., in blkback driver) but lower-level
46939 + * drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an
46940 + * unnecessary copy from the aperture to the host buffer, and a page fault.
46941 + */
46942 +static void
46943 +__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir)
46944 +{
46945 + if (PageHighMem(buffer.page)) {
46946 + size_t len, bytes;
46947 + char *dev, *host, *kmp;
46948 + len = size;
46949 + while (len != 0) {
46950 + unsigned long flags;
46951 +
46952 + if (((bytes = len) + buffer.offset) > PAGE_SIZE)
46953 + bytes = PAGE_SIZE - buffer.offset;
46954 + local_irq_save(flags); /* protects KM_BOUNCE_READ */
46955 + kmp = kmap_atomic(buffer.page, KM_BOUNCE_READ);
46956 + dev = dma_addr + size - len;
46957 + host = kmp + buffer.offset;
46958 + if (dir == DMA_FROM_DEVICE) {
46959 + if (__copy_to_user_inatomic(host, dev, bytes))
46960 + /* inaccessible */;
46961 + } else
46962 + memcpy(dev, host, bytes);
46963 + kunmap_atomic(kmp, KM_BOUNCE_READ);
46964 + local_irq_restore(flags);
46965 + len -= bytes;
46966 + buffer.page++;
46967 + buffer.offset = 0;
46968 + }
46969 + } else {
46970 + char *host = (char *)phys_to_virt(
46971 + page_to_pseudophys(buffer.page)) + buffer.offset;
46972 + if (dir == DMA_FROM_DEVICE) {
46973 + if (__copy_to_user_inatomic(host, dma_addr, size))
46974 + /* inaccessible */;
46975 + } else if (dir == DMA_TO_DEVICE)
46976 + memcpy(dma_addr, host, size);
46977 + }
46978 +}
46979 +
46980 +/*
46981 + * Allocates bounce buffer and returns its kernel virtual address.
46982 + */
46983 +static void *
46984 +map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir)
46985 +{
46986 + unsigned long flags;
46987 + char *dma_addr;
46988 + unsigned int nslots, stride, index, wrap;
46989 + struct phys_addr slot_buf;
46990 + int i;
46991 +
46992 + /*
46993 + * For mappings greater than a page, we limit the stride (and
46994 + * hence alignment) to a page size.
46995 + */
46996 + nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
46997 + if (size > PAGE_SIZE)
46998 + stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
46999 + else
47000 + stride = 1;
47001 +
47002 + BUG_ON(!nslots);
47003 +
47004 + /*
47005 +	 * Find a run of IO TLB slots large enough to fit this request and
47006 +	 * allocate the bounce buffer from that part of the IO TLB pool.
47007 + */
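+	/*
+	 * io_tlb_list[i] holds the number of contiguous free slots starting
+	 * at slot i (0 means the slot is in use).  Free runs never cross an
+	 * IO_TLB_SEGSIZE boundary, which is why the merge loops here and in
+	 * unmap_single() stop at segment edges.
+	 */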
47008 + spin_lock_irqsave(&io_tlb_lock, flags);
47009 + {
47010 + wrap = index = ALIGN(io_tlb_index, stride);
47011 +
47012 + if (index >= iotlb_nslabs)
47013 + wrap = index = 0;
47014 +
47015 + do {
47016 + /*
47017 +			 * If this slot indicates that 'nslots' contiguous
47018 +			 * buffers are available, allocate the buffers from
47019 +			 * this slot onwards and mark the entries as '0',
47020 +			 * i.e. unavailable.
47021 + */
47022 + if (io_tlb_list[index] >= nslots) {
47023 + int count = 0;
47024 +
47025 + for (i = index; i < (int)(index + nslots); i++)
47026 + io_tlb_list[i] = 0;
47027 + for (i = index - 1;
47028 + (OFFSET(i, IO_TLB_SEGSIZE) !=
47029 + IO_TLB_SEGSIZE -1) && io_tlb_list[i];
47030 + i--)
47031 + io_tlb_list[i] = ++count;
47032 + dma_addr = iotlb_virt_start +
47033 + (index << IO_TLB_SHIFT);
47034 +
47035 + /*
47036 + * Update the indices to avoid searching in
47037 + * the next round.
47038 + */
47039 + io_tlb_index =
47040 + ((index + nslots) < iotlb_nslabs
47041 + ? (index + nslots) : 0);
47042 +
47043 + goto found;
47044 + }
47045 + index += stride;
47046 + if (index >= iotlb_nslabs)
47047 + index = 0;
47048 + } while (index != wrap);
47049 +
47050 + spin_unlock_irqrestore(&io_tlb_lock, flags);
47051 + return NULL;
47052 + }
47053 + found:
47054 + spin_unlock_irqrestore(&io_tlb_lock, flags);
47055 +
47056 + /*
47057 + * Save away the mapping from the original address to the DMA address.
47058 + * This is needed when we sync the memory. Then we sync the buffer if
47059 + * needed.
47060 + */
47061 + slot_buf = buffer;
47062 + for (i = 0; i < nslots; i++) {
47063 + slot_buf.page += slot_buf.offset >> PAGE_SHIFT;
47064 + slot_buf.offset &= PAGE_SIZE - 1;
47065 + io_tlb_orig_addr[index+i] = slot_buf;
47066 + slot_buf.offset += 1 << IO_TLB_SHIFT;
47067 + }
47068 + if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL))
47069 + __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
47070 +
47071 + return dma_addr;
47072 +}
47073 +
47074 +static struct phys_addr dma_addr_to_phys_addr(char *dma_addr)
47075 +{
47076 + int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
47077 + struct phys_addr buffer = io_tlb_orig_addr[index];
47078 + buffer.offset += (long)dma_addr & ((1 << IO_TLB_SHIFT) - 1);
47079 + buffer.page += buffer.offset >> PAGE_SHIFT;
47080 + buffer.offset &= PAGE_SIZE - 1;
47081 + return buffer;
47082 +}
47083 +
47084 +/*
47085 + * dma_addr is the kernel virtual address of the bounce buffer to unmap.
47086 + */
47087 +static void
47088 +unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
47089 +{
47090 + unsigned long flags;
47091 + int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
47092 + int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
47093 + struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr);
47094 +
47095 + /*
47096 + * First, sync the memory before unmapping the entry
47097 + */
47098 + if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))
47099 + __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
47100 +
47101 + /*
47102 + * Return the buffer to the free list by setting the corresponding
47103 +	 * entries to indicate the number of contiguous entries available.
47104 + * While returning the entries to the free list, we merge the entries
47105 + * with slots below and above the pool being returned.
47106 + */
47107 + spin_lock_irqsave(&io_tlb_lock, flags);
47108 + {
47109 + count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
47110 + io_tlb_list[index + nslots] : 0);
47111 + /*
47112 + * Step 1: return the slots to the free list, merging the
47113 +		 * slots with succeeding slots
47114 + */
47115 + for (i = index + nslots - 1; i >= index; i--)
47116 + io_tlb_list[i] = ++count;
47117 + /*
47118 + * Step 2: merge the returned slots with the preceding slots,
47119 +		 * if available (non-zero)
47120 + */
47121 + for (i = index - 1;
47122 + (OFFSET(i, IO_TLB_SEGSIZE) !=
47123 + IO_TLB_SEGSIZE -1) && io_tlb_list[i];
47124 + i--)
47125 + io_tlb_list[i] = ++count;
47126 + }
47127 + spin_unlock_irqrestore(&io_tlb_lock, flags);
47128 +}
47129 +
47130 +static void
47131 +sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
47132 +{
47133 + struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr);
47134 + BUG_ON((dir != DMA_FROM_DEVICE) && (dir != DMA_TO_DEVICE));
47135 + __sync_single(buffer, dma_addr, size, dir);
47136 +}
47137 +
47138 +static void
47139 +swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
47140 +{
47141 + /*
47142 + * Ran out of IOMMU space for this operation. This is very bad.
47143 +	 * Unfortunately the drivers cannot handle this operation properly
47144 +	 * unless they check for pci_dma_mapping_error() (most don't).
47145 +	 * When the mapping is small enough, return a static buffer to limit
47146 + * the damage, or panic when the transfer is too big.
47147 + */
47148 + printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at "
47149 + "device %s\n", (unsigned long)size, dev ? dev->bus_id : "?");
47150 +
47151 + if (size > io_tlb_overflow && do_panic) {
47152 + if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
47153 + panic("PCI-DMA: Memory would be corrupted\n");
47154 + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
47155 + panic("PCI-DMA: Random memory would be DMAed\n");
47156 + }
47157 +}
47158 +
47159 +/*
47160 + * Map a single buffer of the indicated size for DMA in streaming mode. The
47161 + * PCI address to use is returned.
47162 + *
47163 + * Once the device is given the dma address, the device owns this memory until
47164 + * either swiotlb_unmap_single() or swiotlb_sync_single_for_cpu() is performed.
47165 + */
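+/*
+ * Hypothetical caller sketch; start_device_dma() is a made-up placeholder
+ * for the device-specific part, the swiotlb_* calls are the ones defined
+ * in this file:
+ *
+ *	dma_addr_t bus = swiotlb_map_single(dev, buf, len, DMA_TO_DEVICE);
+ *	if (swiotlb_dma_mapping_error(bus))
+ *		return -ENOMEM;
+ *	start_device_dma(bus, len);
+ *	swiotlb_unmap_single(dev, bus, len, DMA_TO_DEVICE);
+ */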
47166 +dma_addr_t
47167 +swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
47168 +{
47169 + dma_addr_t dev_addr = gnttab_dma_map_page(virt_to_page(ptr)) +
47170 + offset_in_page(ptr);
47171 + void *map;
47172 + struct phys_addr buffer;
47173 +
47174 + BUG_ON(dir == DMA_NONE);
47175 +
47176 + /*
47177 + * If the pointer passed in happens to be in the device's DMA window,
47178 + * we can safely return the device addr and not worry about bounce
47179 + * buffering it.
47180 + */
47181 + if (!range_straddles_page_boundary(__pa(ptr), size) &&
47182 + !address_needs_mapping(hwdev, dev_addr))
47183 + return dev_addr;
47184 +
47185 + /*
47186 + * Oh well, have to allocate and map a bounce buffer.
47187 + */
47188 + gnttab_dma_unmap_page(dev_addr);
47189 + buffer.page = virt_to_page(ptr);
47190 + buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
47191 + map = map_single(hwdev, buffer, size, dir);
47192 + if (!map) {
47193 + swiotlb_full(hwdev, size, dir, 1);
47194 + map = io_tlb_overflow_buffer;
47195 + }
47196 +
47197 + dev_addr = virt_to_bus(map);
47198 + return dev_addr;
47199 +}
47200 +
47201 +/*
47202 + * Unmap a single streaming mode DMA translation. The dma_addr and size must
47203 + * match what was provided for in a previous swiotlb_map_single call. All
47204 + * other usages are undefined.
47205 + *
47206 + * After this call, reads by the cpu to the buffer are guaranteed to see
47207 + * whatever the device wrote there.
47208 + */
47209 +void
47210 +swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
47211 + int dir)
47212 +{
47213 + BUG_ON(dir == DMA_NONE);
47214 + if (in_swiotlb_aperture(dev_addr))
47215 + unmap_single(hwdev, bus_to_virt(dev_addr), size, dir);
47216 + else
47217 + gnttab_dma_unmap_page(dev_addr);
47218 +}
47219 +
47220 +/*
47221 + * Make physical memory consistent for a single streaming mode DMA translation
47222 + * after a transfer.
47223 + *
47224 + * If you perform a swiotlb_map_single() but wish to interrogate the buffer
47225 + * using the cpu, yet do not wish to tear down the PCI dma mapping, you must
47226 + * call this function before doing so. At the next point you give the PCI dma
47227 + * address back to the card, you must first perform a
47228 + * swiotlb_sync_single_for_device(), and then the device again owns the buffer.
47229 + */
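+/*
+ * Illustrative sequence for the rule above ('bus' was obtained earlier from
+ * swiotlb_map_single(); the CPU-side step is pseudocode):
+ *
+ *	swiotlb_sync_single_for_cpu(dev, bus, len, DMA_FROM_DEVICE);
+ *	... the CPU examines the buffer contents ...
+ *	swiotlb_sync_single_for_device(dev, bus, len, DMA_FROM_DEVICE);
+ *	... the device may now DMA into the buffer again ...
+ */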
47230 +void
47231 +swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
47232 + size_t size, int dir)
47233 +{
47234 + BUG_ON(dir == DMA_NONE);
47235 + if (in_swiotlb_aperture(dev_addr))
47236 + sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
47237 +}
47238 +
47239 +void
47240 +swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
47241 + size_t size, int dir)
47242 +{
47243 + BUG_ON(dir == DMA_NONE);
47244 + if (in_swiotlb_aperture(dev_addr))
47245 + sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
47246 +}
47247 +
47248 +/*
47249 + * Map a set of buffers described by scatterlist in streaming mode for DMA.
47250 + * This is the scatter-gather version of the above swiotlb_map_single
47251 + * interface. Here the scatter gather list elements are each tagged with the
47252 + * appropriate dma address and length. They are obtained via
47253 + * sg_dma_{address,length}(SG).
47254 + *
47255 + * NOTE: An implementation may be able to use a smaller number of
47256 + * DMA address/length pairs than there are SG table elements.
47257 + * (for example via virtual mapping capabilities)
47258 + * The routine returns the number of addr/length pairs actually
47259 + * used, at most nents.
47260 + *
47261 + * Device ownership issues as mentioned above for swiotlb_map_single are the
47262 + * same here.
47263 + */
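+/*
+ * Hypothetical caller sketch; program_descriptor() is a made-up,
+ * device-specific helper:
+ *
+ *	int i, n = swiotlb_map_sg(dev, sglist, nents, DMA_TO_DEVICE);
+ *	if (!n)
+ *		return -ENOMEM;
+ *	for (i = 0; i < n; i++)
+ *		program_descriptor(sg_dma_address(&sglist[i]),
+ *				   sg_dma_length(&sglist[i]));
+ *	...
+ *	swiotlb_unmap_sg(dev, sglist, nents, DMA_TO_DEVICE);
+ */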
47264 +int
47265 +swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
47266 + int dir)
47267 +{
47268 + struct phys_addr buffer;
47269 + dma_addr_t dev_addr;
47270 + char *map;
47271 + int i;
47272 +
47273 + BUG_ON(dir == DMA_NONE);
47274 +
47275 + for (i = 0; i < nelems; i++, sg++) {
47276 + dev_addr = gnttab_dma_map_page(sg->page) + sg->offset;
47277 +
47278 + if (range_straddles_page_boundary(page_to_pseudophys(sg->page)
47279 + + sg->offset, sg->length)
47280 + || address_needs_mapping(hwdev, dev_addr)) {
47281 + gnttab_dma_unmap_page(dev_addr);
47282 + buffer.page = sg->page;
47283 + buffer.offset = sg->offset;
47284 + map = map_single(hwdev, buffer, sg->length, dir);
47285 + if (!map) {
47286 + /* Don't panic here, we expect map_sg users
47287 + to do proper error handling. */
47288 + swiotlb_full(hwdev, sg->length, dir, 0);
47289 + swiotlb_unmap_sg(hwdev, sg - i, i, dir);
47290 + sg[0].dma_length = 0;
47291 + return 0;
47292 + }
47293 + sg->dma_address = (dma_addr_t)virt_to_bus(map);
47294 + } else
47295 + sg->dma_address = dev_addr;
47296 + sg->dma_length = sg->length;
47297 + }
47298 + return nelems;
47299 +}
47300 +
47301 +/*
47302 + * Unmap a set of streaming mode DMA translations. Again, cpu read rules
47303 + * concerning calls here are the same as for swiotlb_unmap_single() above.
47304 + */
47305 +void
47306 +swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
47307 + int dir)
47308 +{
47309 + int i;
47310 +
47311 + BUG_ON(dir == DMA_NONE);
47312 +
47313 + for (i = 0; i < nelems; i++, sg++)
47314 + if (in_swiotlb_aperture(sg->dma_address))
47315 + unmap_single(hwdev,
47316 + (void *)bus_to_virt(sg->dma_address),
47317 + sg->dma_length, dir);
47318 + else
47319 + gnttab_dma_unmap_page(sg->dma_address);
47320 +}
47321 +
47322 +/*
47323 + * Make physical memory consistent for a set of streaming mode DMA translations
47324 + * after a transfer.
47325 + *
47326 + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
47327 + * and usage.
47328 + */
47329 +void
47330 +swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
47331 + int nelems, int dir)
47332 +{
47333 + int i;
47334 +
47335 + BUG_ON(dir == DMA_NONE);
47336 +
47337 + for (i = 0; i < nelems; i++, sg++)
47338 + if (in_swiotlb_aperture(sg->dma_address))
47339 + sync_single(hwdev,
47340 + (void *)bus_to_virt(sg->dma_address),
47341 + sg->dma_length, dir);
47342 +}
47343 +
47344 +void
47345 +swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
47346 + int nelems, int dir)
47347 +{
47348 + int i;
47349 +
47350 + BUG_ON(dir == DMA_NONE);
47351 +
47352 + for (i = 0; i < nelems; i++, sg++)
47353 + if (in_swiotlb_aperture(sg->dma_address))
47354 + sync_single(hwdev,
47355 + (void *)bus_to_virt(sg->dma_address),
47356 + sg->dma_length, dir);
47357 +}
47358 +
47359 +#ifdef CONFIG_HIGHMEM
47360 +
47361 +dma_addr_t
47362 +swiotlb_map_page(struct device *hwdev, struct page *page,
47363 + unsigned long offset, size_t size,
47364 + enum dma_data_direction direction)
47365 +{
47366 + struct phys_addr buffer;
47367 + dma_addr_t dev_addr;
47368 + char *map;
47369 +
47370 + dev_addr = gnttab_dma_map_page(page) + offset;
47371 + if (address_needs_mapping(hwdev, dev_addr)) {
47372 + gnttab_dma_unmap_page(dev_addr);
47373 + buffer.page = page;
47374 + buffer.offset = offset;
47375 + map = map_single(hwdev, buffer, size, direction);
47376 + if (!map) {
47377 + swiotlb_full(hwdev, size, direction, 1);
47378 + map = io_tlb_overflow_buffer;
47379 + }
47380 + dev_addr = (dma_addr_t)virt_to_bus(map);
47381 + }
47382 +
47383 + return dev_addr;
47384 +}
47385 +
47386 +void
47387 +swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
47388 + size_t size, enum dma_data_direction direction)
47389 +{
47390 + BUG_ON(direction == DMA_NONE);
47391 + if (in_swiotlb_aperture(dma_address))
47392 + unmap_single(hwdev, bus_to_virt(dma_address), size, direction);
47393 + else
47394 + gnttab_dma_unmap_page(dma_address);
47395 +}
47396 +
47397 +#endif
47398 +
47399 +int
47400 +swiotlb_dma_mapping_error(dma_addr_t dma_addr)
47401 +{
47402 + return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
47403 +}
47404 +
47405 +/*
47406 + * Return whether the given PCI device DMA address mask can be supported
47407 + * properly. For example, if your device can only drive the low 24-bits
47408 + * properly. For example, if your device can only drive the low 24 bits
47409 + * this function.
47410 + */
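+/*
+ * For example, if dma_bits ended up as 30 in swiotlb_init_with_default_size()
+ * above, the check below accepts DMA_32BIT_MASK (0xffffffff) but rejects a
+ * 24-bit ISA-style mask of 0x00ffffff, since (1UL << 30) - 1 == 0x3fffffff.
+ */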
47411 +int
47412 +swiotlb_dma_supported(struct device *hwdev, u64 mask)
47413 +{
47414 + return (mask >= ((1UL << dma_bits) - 1));
47415 +}
47416 +
47417 +EXPORT_SYMBOL(swiotlb_init);
47418 +EXPORT_SYMBOL(swiotlb_map_single);
47419 +EXPORT_SYMBOL(swiotlb_unmap_single);
47420 +EXPORT_SYMBOL(swiotlb_map_sg);
47421 +EXPORT_SYMBOL(swiotlb_unmap_sg);
47422 +EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
47423 +EXPORT_SYMBOL(swiotlb_sync_single_for_device);
47424 +EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
47425 +EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
47426 +EXPORT_SYMBOL(swiotlb_dma_mapping_error);
47427 +EXPORT_SYMBOL(swiotlb_dma_supported);
47428 Index: head-2008-11-25/scripts/Makefile.xen.awk
47429 ===================================================================
47430 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
47431 +++ head-2008-11-25/scripts/Makefile.xen.awk 2007-08-06 15:10:49.000000000 +0200
47432 @@ -0,0 +1,34 @@
47433 +BEGIN {
47434 + is_rule = 0
47435 +}
47436 +
47437 +/^[[:space:]]*#/ {
47438 + next
47439 +}
47440 +
47441 +/^[[:space:]]*$/ {
47442 + if (is_rule)
47443 + print("")
47444 + is_rule = 0
47445 + next
47446 +}
47447 +
47448 +/:[[:space:]]*%\.[cS][[:space:]]/ {
47449 + line = gensub(/%.([cS])/, "%-xen.\\1", "g", $0)
47450 + line = gensub(/(single-used-m)/, "xen-\\1", "g", line)
47451 + print line
47452 + is_rule = 1
47453 + next
47454 +}
47455 +
47456 +/^[^\t]$/ {
47457 + if (is_rule)
47458 + print("")
47459 + is_rule = 0
47460 + next
47461 +}
47462 +
47463 +is_rule {
47464 + print $0
47465 + next
47466 +}